title_grabber 0.3.1 → 0.3.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/Gemfile.lock +1 -1
- data/README.md +16 -0
- data/exe/title-grabber +26 -3
- data/lib/http_helper.rb +2 -7
- data/lib/title_grabber/version.rb +1 -1
- data/lib/title_grabber.rb +25 -7
- metadata +1 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 8c1d79385bd87e3116d10dc56c14b69929dccb1eb2597e99ed2166c683146f59
|
4
|
+
data.tar.gz: 1ea1fc865f6570c0ecb0f0f39c023585a9fac0c11dca43efbfe23a1d0aa1f075
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 3b2fe97ff3c4a8723b6bfa634f3bf00f8096375a9066c217359e6cae0e5ba657560093ebf647a4c042f82b482030a842c0896a94ceb71fd943143bd0f9fa03b3
|
7
|
+
data.tar.gz: 936b43e66159e54246a57f59570e5dca550d4afbf87f7fae4bf1be8166af8fe0b1cab8d000409eb68293d4b5046ef40791574567010347e00ed20c359b9f5593
|
data/Gemfile.lock
CHANGED
data/README.md
CHANGED
@@ -33,6 +33,22 @@ Data is either recorded to out.csv in the CWD or the file specified using the
|
|
33
33
|
title-grabber -o ~/output.csv /abs/path/2/file1.txt rel/path/2/file2.txt
|
34
34
|
```
|
35
35
|
|
36
|
+
### Environment Variables
|
37
|
+
|
38
|
+
```
|
39
|
+
DEBUG - when set, logs go to STDOUT instead of the default target, title_grabber.log
|
40
|
+
```
|
41
|
+
|
42
|
+
```
|
43
|
+
MAX_THREADS - max. # of threads to use; defaults to the # of logical processors in the machine
|
44
|
+
```
|
45
|
+
|
46
|
+
```
|
47
|
+
CONNECT_TIMEOUT (in seconds) - defaults to 15
|
48
|
+
READ_TIMEOUT (in seconds) - defaults to 15
|
49
|
+
WRITE_TIMEOUT (in seconds) - defaults to 15
|
50
|
+
```
|
51
|
+
|
36
52
|
## Development
|
37
53
|
|
38
54
|
After checking out the repo, run `bin/setup` to install dependencies. Then, run `rake test` to run the tests. You can also run `bin/console` for an interactive prompt that will allow you to experiment.
|
data/exe/title-grabber
CHANGED
@@ -6,15 +6,38 @@ require "pathname"
|
|
6
6
|
require_relative '../lib/title_grabber'
|
7
7
|
|
8
8
|
script_path = Pathname(__FILE__)
|
9
|
-
def_out_path = Pathname('out.csv')
|
10
9
|
|
11
|
-
arguments = {
|
10
|
+
arguments = {}
|
12
11
|
OptionParser.new do |args|
|
13
12
|
args.banner = "Usage: #{script_path.basename} [options]"
|
14
13
|
|
15
|
-
args.on("-o", "--output FILE", "Output file
|
14
|
+
args.on("-o", "--output FILE", "Output file. Defaults to #{TitleGrabber::DEF_OUT_PATH.basename}") do |out|
|
16
15
|
arguments[:output] = Pathname(out)
|
17
16
|
end
|
17
|
+
|
18
|
+
args.on("--connect-timeout TIMEOUT", Integer, "HTTP Connect timeout. Defaults to the value of the CONNECT_TIMEOUT env var or #{TitleGrabber::CONNECT_TO}") do |timeout|
|
19
|
+
arguments[:connect_to] = timeout
|
20
|
+
end
|
21
|
+
|
22
|
+
args.on("--read-timeout TIMEOUT", Integer, "HTTP Read timeout. Defaults to the value of the READ_TIMEOUT env var or #{TitleGrabber::READ_TO}") do |timeout|
|
23
|
+
arguments[:read_to] = timeout
|
24
|
+
end
|
25
|
+
|
26
|
+
args.on("--write-timeout TIMEOUT", Integer, "HTTP Write timeout. Defaults to the value of the WRITE_TIMEOUT env var or #{TitleGrabber::WRITE_TO}") do |timeout|
|
27
|
+
arguments[:write_to] = timeout
|
28
|
+
end
|
29
|
+
|
30
|
+
args.on("-r", "--max-retries RETRIES", Integer, "Max. # of times to retry failed HTTP reqs. Defaults to the value of the MAX_RETRIES env var or #{TitleGrabber::MAX_RETRIES}") do |retries|
|
31
|
+
arguments[:max_retries] = retries
|
32
|
+
end
|
33
|
+
|
34
|
+
args.on("-t", "--max-threads THREADS", Integer, "Max. # of threads to use. Defaults to the value of the MAX_THREADS env var or the # of logical processors in the system") do |threads|
|
35
|
+
arguments[:max_threads] = threads
|
36
|
+
end
|
37
|
+
|
38
|
+
args.on("-d", "--debug", "Log to STDOUT instead of to a file in the CWD") do |debug|
|
39
|
+
arguments[:debug] = true
|
40
|
+
end
|
18
41
|
end.parse!
|
19
42
|
|
20
43
|
if ARGV.empty?
|
data/lib/http_helper.rb
CHANGED
@@ -3,19 +3,14 @@ require "http"
|
|
3
3
|
require_relative "text_helper"
|
4
4
|
|
5
5
|
module HTTPHelper
|
6
|
-
WRITE_TO = Integer(ENV.fetch("WRITE_TIMEOUT", 15))
|
7
|
-
CONN_TO = Integer(ENV.fetch("CONNECT_TIMEOUT", 15))
|
8
|
-
READ_TO = Integer(ENV.fetch("READ_TIMEOUT", 15))
|
9
6
|
MAX_HOPS = 5
|
10
|
-
MAX_RETRIES = 3
|
11
7
|
INVALID_BYTE_SEQ = "invalid byte sequence".freeze
|
12
8
|
CONNECTION_ERRORS = ["SSL_connect", "Connection reset"].freeze
|
13
9
|
REST_INTERVAL = 0.5..1
|
14
10
|
|
15
11
|
include TextHelper
|
16
12
|
|
17
|
-
def open_w_timeout(url, write_to
|
18
|
-
read_to: READ_TO)
|
13
|
+
def open_w_timeout(url, write_to:, connect_to:, read_to:, max_retries:)
|
19
14
|
logger.info "[Thread: ##{Thread.current[:id]}] GET #{url}"
|
20
15
|
retries = 0
|
21
16
|
|
@@ -33,7 +28,7 @@ module HTTPHelper
|
|
33
28
|
CONNECTION_ERRORS.any? { |e| msg.start_with?(e) }
|
34
29
|
retries += 1
|
35
30
|
|
36
|
-
if retries <=
|
31
|
+
if retries <= max_retries
|
37
32
|
rest_time = rand(REST_INTERVAL)
|
38
33
|
logger.warn "[Thread: ##{Thread.current[:id]}] URL: #{url} [#{msg}] - Going to sleep for #{rest_time.round(1)} secs - Retry ##{retries}"
|
39
34
|
sleep(rest_time)
|
data/lib/title_grabber.rb
CHANGED
@@ -1,5 +1,5 @@
|
|
1
|
-
require "csv"
|
2
1
|
require "etc"
|
2
|
+
require "csv"
|
3
3
|
require "fileutils"
|
4
4
|
require "logger"
|
5
5
|
require "pathname"
|
@@ -12,6 +12,12 @@ require_relative "http_helper"
|
|
12
12
|
require_relative "text_helper"
|
13
13
|
|
14
14
|
module TitleGrabber
|
15
|
+
DEF_OUT_PATH = Pathname('out.csv')
|
16
|
+
CONNECT_TO = 15
|
17
|
+
READ_TO = 15
|
18
|
+
WRITE_TO = 15
|
19
|
+
MAX_RETRIES = 5
|
20
|
+
MAX_THREADS = Etc.nprocessors
|
15
21
|
URL_RE = %r(https?://\S+)i
|
16
22
|
URL_HEADER = -"url"
|
17
23
|
PAGE_TIT_HEAD = -"page_title"
|
@@ -26,14 +32,22 @@ module TitleGrabber
|
|
26
32
|
include HTTPHelper
|
27
33
|
include TextHelper
|
28
34
|
|
29
|
-
attr_reader :lines, :out_path, :tmp_path, :
|
35
|
+
attr_reader :lines, :out_path, :tmp_path, :connect_to, :read_to, :write_to,
|
36
|
+
:max_retries, :max_threads, :logger
|
30
37
|
|
31
38
|
def initialize(lines, options)
|
32
39
|
@lines = lines
|
33
|
-
|
40
|
+
|
41
|
+
@out_path = options.fetch(:output, DEF_OUT_PATH)
|
34
42
|
@tmp_path = @out_path.sub_ext(".tmp#{@out_path.extname}")
|
35
43
|
|
36
|
-
|
44
|
+
@connect_to = options.fetch(:connect_to, CONNECT_TO)
|
45
|
+
@read_to = options.fetch(:read_to, READ_TO)
|
46
|
+
@write_to = options.fetch(:write_to, WRITE_TO)
|
47
|
+
@max_retries = options.fetch(:max_retries, MAX_RETRIES)
|
48
|
+
# The CLI stores the -t/--max-threads value under :max_threads; fall back to the module default when absent.
@max_threads = options.fetch(:max_threads, MAX_THREADS)
|
49
|
+
|
50
|
+
logging_target = if options[:debug]
|
37
51
|
STDOUT
|
38
52
|
else
|
39
53
|
log_file = Pathname(__FILE__).sub_ext(".log").
|
@@ -64,8 +78,7 @@ module TitleGrabber
|
|
64
78
|
end
|
65
79
|
lines = nil
|
66
80
|
|
67
|
-
thr_cnt = [
|
68
|
-
queue.size].min
|
81
|
+
thr_cnt = [max_threads, queue.size].min
|
69
82
|
threads = 1.upto(thr_cnt).map.with_index { |_, i|
|
70
83
|
Thread.new(i) do |j|
|
71
84
|
Thread.current[:id] = i + 1
|
@@ -75,7 +88,7 @@ module TitleGrabber
|
|
75
88
|
rescue ThreadError; end
|
76
89
|
|
77
90
|
while url
|
78
|
-
if (html = open_w_timeout(url)) && !html.empty?
|
91
|
+
if (html = open_w_timeout(url, **http_opts)) && !html.empty?
|
79
92
|
doc = begin
|
80
93
|
Oga.parse_html(html)
|
81
94
|
rescue ArgumentError, LL::ParserError => err
|
@@ -134,5 +147,10 @@ module TitleGrabber
|
|
134
147
|
urls
|
135
148
|
end
|
136
149
|
end
|
150
|
+
|
151
|
+
def http_opts
|
152
|
+
@http_opts ||= { connect_to: connect_to, read_to: read_to,
|
153
|
+
write_to: write_to, max_retries: max_retries }
|
154
|
+
end
|
137
155
|
end
|
138
156
|
end
|