title_grabber 0.3.1 → 0.3.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +1 -1
- data/README.md +16 -0
- data/exe/title-grabber +26 -3
- data/lib/http_helper.rb +2 -7
- data/lib/title_grabber/version.rb +1 -1
- data/lib/title_grabber.rb +25 -7
- metadata +1 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 8c1d79385bd87e3116d10dc56c14b69929dccb1eb2597e99ed2166c683146f59
|
4
|
+
data.tar.gz: 1ea1fc865f6570c0ecb0f0f39c023585a9fac0c11dca43efbfe23a1d0aa1f075
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 3b2fe97ff3c4a8723b6bfa634f3bf00f8096375a9066c217359e6cae0e5ba657560093ebf647a4c042f82b482030a842c0896a94ceb71fd943143bd0f9fa03b3
|
7
|
+
data.tar.gz: 936b43e66159e54246a57f59570e5dca550d4afbf87f7fae4bf1be8166af8fe0b1cab8d000409eb68293d4b5046ef40791574567010347e00ed20c359b9f5593
|
data/Gemfile.lock
CHANGED
data/README.md
CHANGED
@@ -33,6 +33,22 @@ Data is either recorded to out.csv in the CWD or the file specified using the
|
|
33
33
|
title-grabber -o ~/output.csv /abs/path/2/file1.txt rel/path/2/file2.txt
|
34
34
|
```
|
35
35
|
|
36
|
+
### Environment Variables
|
37
|
+
|
38
|
+
```
|
39
|
+
DEBUG - when set, logs to STDOUT instead of the default target, title_grabber.log
|
40
|
+
```
|
41
|
+
|
42
|
+
```
|
43
|
+
MAX_THREADS - max. # of threads to use, defaults to the # of CPU cores in the machine
|
44
|
+
```
|
45
|
+
|
46
|
+
```
|
47
|
+
CONNECT_TIMEOUT (in seconds) - defaults to 15
|
48
|
+
READ_TIMEOUT (in seconds) - defaults to 15
|
49
|
+
WRITE_TIMEOUT (in seconds) - defaults to 15
|
50
|
+
```
|
51
|
+
|
36
52
|
## Development
|
37
53
|
|
38
54
|
After checking out the repo, run `bin/setup` to install dependencies. Then, run `rake test` to run the tests. You can also run `bin/console` for an interactive prompt that will allow you to experiment.
|
data/exe/title-grabber
CHANGED
@@ -6,15 +6,38 @@ require "pathname"
|
|
6
6
|
require_relative '../lib/title_grabber'
|
7
7
|
|
8
8
|
script_path = Pathname(__FILE__)
|
9
|
-
def_out_path = Pathname('out.csv')
|
10
9
|
|
11
|
-
arguments = {
|
10
|
+
arguments = {}
|
12
11
|
OptionParser.new do |args|
|
13
12
|
args.banner = "Usage: #{script_path.basename} [options]"
|
14
13
|
|
15
|
-
args.on("-o", "--output FILE", "Output file
|
14
|
+
args.on("-o", "--output FILE", "Output file. Defaults to #{TitleGrabber::DEF_OUT_PATH.basename}") do |out|
|
16
15
|
arguments[:output] = Pathname(out)
|
17
16
|
end
|
17
|
+
|
18
|
+
args.on("--connect-timeout TIMEOUT", Integer, "HTTP Connect timeout. Defaults to the value of the CONNECT_TIMEOUT env var or #{TitleGrabber::CONNECT_TO}") do |timeout|
|
19
|
+
arguments[:connect_to] = timeout
|
20
|
+
end
|
21
|
+
|
22
|
+
args.on("--read-timeout TIMEOUT", Integer, "HTTP Read timeout. Defaults to the value of the READ_TIMEOUT env var or #{TitleGrabber::READ_TO}") do |timeout|
|
23
|
+
arguments[:read_to] = timeout
|
24
|
+
end
|
25
|
+
|
26
|
+
args.on("--write-timeout TIMEOUT", Integer, "HTTP Write timeout. Defaults to the value of the WRITE_TIMEOUT env var or #{TitleGrabber::WRITE_TO}") do |timeout|
|
27
|
+
arguments[:write_to] = timeout
|
28
|
+
end
|
29
|
+
|
30
|
+
args.on("-r", "--max-retries RETRIES", Integer, "Max. # of times to retry failed HTTP reqs. Defaults to the value of the MAX_RETRIES env var or #{TitleGrabber::MAX_RETRIES}") do |retries|
|
31
|
+
arguments[:max_retries] = retries
|
32
|
+
end
|
33
|
+
|
34
|
+
args.on("-t", "--max-threads THREADS", Integer, "Max. # of threads to use. Defaults to the value of the MAX_THREADS env var or the # of logical processors in the system") do |threads|
|
35
|
+
arguments[:max_threads] = threads
|
36
|
+
end
|
37
|
+
|
38
|
+
args.on("-d", "--debug", "Log to STDOUT instead of to a file in the CWD") do |debug|
|
39
|
+
arguments[:debug] = true
|
40
|
+
end
|
18
41
|
end.parse!
|
19
42
|
|
20
43
|
if ARGV.empty?
|
data/lib/http_helper.rb
CHANGED
@@ -3,19 +3,14 @@ require "http"
|
|
3
3
|
require_relative "text_helper"
|
4
4
|
|
5
5
|
module HTTPHelper
|
6
|
-
WRITE_TO = Integer(ENV.fetch("WRITE_TIMEOUT", 15))
|
7
|
-
CONN_TO = Integer(ENV.fetch("CONNECT_TIMEOUT", 15))
|
8
|
-
READ_TO = Integer(ENV.fetch("READ_TIMEOUT", 15))
|
9
6
|
MAX_HOPS = 5
|
10
|
-
MAX_RETRIES = 3
|
11
7
|
INVALID_BYTE_SEQ = "invalid byte sequence".freeze
|
12
8
|
CONNECTION_ERRORS = ["SSL_connect", "Connection reset"].freeze
|
13
9
|
REST_INTERVAL = 0.5..1
|
14
10
|
|
15
11
|
include TextHelper
|
16
12
|
|
17
|
-
def open_w_timeout(url, write_to
|
18
|
-
read_to: READ_TO)
|
13
|
+
def open_w_timeout(url, write_to:, connect_to:, read_to:, max_retries:)
|
19
14
|
logger.info "[Thread: ##{Thread.current[:id]}] GET #{url}"
|
20
15
|
retries = 0
|
21
16
|
|
@@ -33,7 +28,7 @@ module HTTPHelper
|
|
33
28
|
CONNECTION_ERRORS.any? { |e| msg.start_with?(e) }
|
34
29
|
retries += 1
|
35
30
|
|
36
|
-
if retries <=
|
31
|
+
if retries <= max_retries
|
37
32
|
rest_time = rand(REST_INTERVAL)
|
38
33
|
logger.warn "[Thread: ##{Thread.current[:id]}] URL: #{url} [#{msg}] - Going to sleep for #{rest_time.round(1)} secs - Retry ##{retries}"
|
39
34
|
sleep(rest_time)
|
data/lib/title_grabber.rb
CHANGED
@@ -1,5 +1,5 @@
|
|
1
|
-
require "csv"
|
2
1
|
require "etc"
|
2
|
+
require "csv"
|
3
3
|
require "fileutils"
|
4
4
|
require "logger"
|
5
5
|
require "pathname"
|
@@ -12,6 +12,12 @@ require_relative "http_helper"
|
|
12
12
|
require_relative "text_helper"
|
13
13
|
|
14
14
|
module TitleGrabber
|
15
|
+
DEF_OUT_PATH = Pathname('out.csv')
|
16
|
+
CONNECT_TO = 15
|
17
|
+
READ_TO = 15
|
18
|
+
WRITE_TO = 15
|
19
|
+
MAX_RETRIES = 5
|
20
|
+
MAX_THREADS = Etc.nprocessors
|
15
21
|
URL_RE = %r(https?://\S+)i
|
16
22
|
URL_HEADER = -"url"
|
17
23
|
PAGE_TIT_HEAD = -"page_title"
|
@@ -26,14 +32,22 @@ module TitleGrabber
|
|
26
32
|
include HTTPHelper
|
27
33
|
include TextHelper
|
28
34
|
|
29
|
-
attr_reader :lines, :out_path, :tmp_path, :
|
35
|
+
attr_reader :lines, :out_path, :tmp_path, :connect_to, :read_to, :write_to,
|
36
|
+
:max_retries, :max_threads, :logger
|
30
37
|
|
31
38
|
def initialize(lines, options)
|
32
39
|
@lines = lines
|
33
|
-
|
40
|
+
|
41
|
+
@out_path = options.fetch(:output, DEF_OUT_PATH)
|
34
42
|
@tmp_path = @out_path.sub_ext(".tmp#{@out_path.extname}")
|
35
43
|
|
36
|
-
|
44
|
+
# Precedence: CLI option, then the CONNECT_TIMEOUT env var, then the built-in default.
@connect_to = options.fetch(:connect_to) { Integer(ENV.fetch("CONNECT_TIMEOUT", CONNECT_TO)) }
|
45
|
+
# Precedence: CLI option, then the READ_TIMEOUT env var, then the built-in default.
@read_to = options.fetch(:read_to) { Integer(ENV.fetch("READ_TIMEOUT", READ_TO)) }
|
46
|
+
# Precedence: CLI option, then the WRITE_TIMEOUT env var, then the built-in default.
@write_to = options.fetch(:write_to) { Integer(ENV.fetch("WRITE_TIMEOUT", WRITE_TO)) }
|
47
|
+
# Precedence: CLI option, then the MAX_RETRIES env var, then the built-in default.
@max_retries = options.fetch(:max_retries) { Integer(ENV.fetch("MAX_RETRIES", MAX_RETRIES)) }
|
48
|
+
# The CLI stores this option under :max_threads (not :max_th), so look it up
# by that key; fall back to the MAX_THREADS env var, then the MAX_THREADS constant.
@max_threads = options.fetch(:max_threads) { Integer(ENV.fetch("MAX_THREADS", MAX_THREADS)) }
|
49
|
+
|
50
|
+
logging_target = if options[:debug]
|
37
51
|
STDOUT
|
38
52
|
else
|
39
53
|
log_file = Pathname(__FILE__).sub_ext(".log").
|
@@ -64,8 +78,7 @@ module TitleGrabber
|
|
64
78
|
end
|
65
79
|
lines = nil
|
66
80
|
|
67
|
-
thr_cnt = [
|
68
|
-
queue.size].min
|
81
|
+
thr_cnt = [max_threads, queue.size].min
|
69
82
|
threads = 1.upto(thr_cnt).map.with_index { |_, i|
|
70
83
|
Thread.new(i) do |j|
|
71
84
|
Thread.current[:id] = i + 1
|
@@ -75,7 +88,7 @@ module TitleGrabber
|
|
75
88
|
rescue ThreadError; end
|
76
89
|
|
77
90
|
while url
|
78
|
-
if (html = open_w_timeout(url)) && !html.empty?
|
91
|
+
if (html = open_w_timeout(url, **http_opts)) && !html.empty?
|
79
92
|
doc = begin
|
80
93
|
Oga.parse_html(html)
|
81
94
|
rescue ArgumentError, LL::ParserError => err
|
@@ -134,5 +147,10 @@ module TitleGrabber
|
|
134
147
|
urls
|
135
148
|
end
|
136
149
|
end
|
150
|
+
|
151
|
+
def http_opts
|
152
|
+
@http_opts ||= { connect_to: connect_to, read_to: read_to,
|
153
|
+
write_to: write_to, max_retries: max_retries }
|
154
|
+
end
|
137
155
|
end
|
138
156
|
end
|