title_grabber 0.4.1 → 0.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +1 -19
- data/README.md +12 -0
- data/exe/title-grabber +1 -9
- data/lib/http_helper.rb +27 -25
- data/lib/title_grabber/version.rb +1 -1
- data/lib/title_grabber.rb +8 -12
- data/title_grabber.gemspec +0 -1
- metadata +1 -15
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: '028ecf2bea074495354ac08494517b7f254e8a27d782e2a3ff1eec7bd313474b'
|
4
|
+
data.tar.gz: 20adeb673a80980e1c3e8ee04285451eb0aa4d8e98c4c17cad5b72f9549f04ee
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 2a6268347f464325956f4281f0132b9f6c03d945eefab10754fb93387b0bd0db384cac05899238532f42c62869f58e6282e1e8eadf326bde9026b3d65ba97261
|
7
|
+
data.tar.gz: e1388a778ea78d29f9c791db0e6eaaa0fa379ba912f6869a0b90f087c1fb7e366c61b08f47dce25f003abbc8f6909dde3b3c99a1636fdcf1777eb2fcbb61c72e
|
data/Gemfile.lock
CHANGED
@@ -1,42 +1,24 @@
|
|
1
1
|
PATH
|
2
2
|
remote: .
|
3
3
|
specs:
|
4
|
-
title_grabber (0.4.
|
5
|
-
http (~> 4.1)
|
4
|
+
title_grabber (0.4.1)
|
6
5
|
oga (~> 2.15)
|
7
6
|
|
8
7
|
GEM
|
9
8
|
remote: https://rubygems.org/
|
10
9
|
specs:
|
11
|
-
addressable (2.6.0)
|
12
|
-
public_suffix (>= 2.0.2, < 4.0)
|
13
10
|
ansi (1.5.0)
|
14
11
|
ast (2.4.0)
|
15
|
-
domain_name (0.5.20180417)
|
16
|
-
unf (>= 0.0.5, < 1.0.0)
|
17
|
-
http (4.1.1)
|
18
|
-
addressable (~> 2.3)
|
19
|
-
http-cookie (~> 1.0)
|
20
|
-
http-form_data (~> 2.0)
|
21
|
-
http_parser.rb (~> 0.6.0)
|
22
|
-
http-cookie (1.0.3)
|
23
|
-
domain_name (~> 0.5)
|
24
|
-
http-form_data (2.1.1)
|
25
|
-
http_parser.rb (0.6.0)
|
26
12
|
minitest (5.11.3)
|
27
13
|
minitest-line (0.6.5)
|
28
14
|
minitest (~> 5.0)
|
29
15
|
oga (2.15)
|
30
16
|
ast
|
31
17
|
ruby-ll (~> 2.1)
|
32
|
-
public_suffix (3.0.3)
|
33
18
|
rake (10.5.0)
|
34
19
|
ruby-ll (2.1.2)
|
35
20
|
ansi
|
36
21
|
ast
|
37
|
-
unf (0.1.4)
|
38
|
-
unf_ext
|
39
|
-
unf_ext (0.0.7.5)
|
40
22
|
|
41
23
|
PLATFORMS
|
42
24
|
ruby
|
data/README.md
CHANGED
@@ -39,6 +39,18 @@ See all available CLI switches and env vars
|
|
39
39
|
title-grabber -h
|
40
40
|
```
|
41
41
|
|
42
|
+
Usage: title-grabber [options]
|
43
|
+
-V, --version Print program version and exit
|
44
|
+
-f, --files /f1.txt,f2.txt 1 or more comma-separated paths to text files containing 1 URL per line
|
45
|
+
-o, --output FILE Output file. Defaults to out.csv
|
46
|
+
--connect-timeout TIMEOUT HTTP Connect timeout. Defaults to the value of the CONNECT_TIMEOUT env var or 15
|
47
|
+
--read-timeout TIMEOUT HTTP Read timeout. Defaults to the value of the READ_TIMEOUT env var or 15
|
48
|
+
--write-timeout TIMEOUT HTTP Write timeout. Defaults to the value of the WRITE_TIMEOUT env var or 15
|
49
|
+
--max-redirects REDIRECTS Max. # of HTTP redirects to follow. Defaults to the value of the MAX_REDIRECTS env var or 5
|
50
|
+
-r, --max-retries RETRIES Max. # of times to retry failed HTTP reqs. Defaults to the value of the MAX_RETRIES env var or 5
|
51
|
+
-t, --max-threads THREADS Max. # of threads to use. Defaults to the value of the MAX_THREADS env var or the # of logical processors in the system
|
52
|
+
-d, --debug Log to STDOUT instead of to a file in the CWD
|
53
|
+
|
42
54
|
## Development
|
43
55
|
|
44
56
|
After checking out the repo, run `bin/setup` to install dependencies. Then, run `rake test` to run the tests. You can also run `bin/console` for an interactive prompt that will allow you to experiment.
|
data/exe/title-grabber
CHANGED
@@ -16,7 +16,7 @@ OptionParser.new do |args|
|
|
16
16
|
exit
|
17
17
|
end
|
18
18
|
|
19
|
-
args.on("-f", "--files /
|
19
|
+
args.on("-f", "--files /f1.txt,f2.txt", Array, "1 or more comma-separated paths to text files containing 1 URL per line ") do |files|
|
20
20
|
arguments[:file_paths] = files.map { |f| Pathname(f).expand_path }.
|
21
21
|
select { |f| f.file? && f.exist? }
|
22
22
|
end
|
@@ -33,14 +33,6 @@ OptionParser.new do |args|
|
|
33
33
|
arguments[:read_to] = timeout
|
34
34
|
end
|
35
35
|
|
36
|
-
args.on("--write-timeout TIMEOUT", Integer, "HTTP Write timeout. Defaults to the value of the WRITE_TIMEOUT env var or #{TitleGrabber::WRITE_TO}") do |timeout|
|
37
|
-
arguments[:write_to] = timeout
|
38
|
-
end
|
39
|
-
|
40
|
-
args.on("--max-redirects REDIRECTS", Integer, "Max. # of HTTP redirects to follow. Defaults to the value of the MAX_REDIRECTS env var or #{TitleGrabber::MAX_REDIRECTS}") do |redirects|
|
41
|
-
arguments[:max_redirects] = redirects
|
42
|
-
end
|
43
|
-
|
44
36
|
args.on("-r", "--max-retries RETRIES", Integer, "Max. # of times to retry failed HTTP reqs. Defaults to the value of the MAX_RETRIES env var or #{TitleGrabber::MAX_RETRIES}") do |retries|
|
45
37
|
arguments[:max_retries] = retries
|
46
38
|
end
|
data/lib/http_helper.rb
CHANGED
@@ -1,35 +1,47 @@
|
|
1
|
-
require "http"
|
1
|
+
require "openssl"
|
2
|
+
require "open-uri"
|
3
|
+
require "timeout"
|
2
4
|
|
3
5
|
require_relative "text_helper"
|
4
6
|
|
5
7
|
module HTTPHelper
|
6
8
|
INVALID_BYTE_SEQ = "invalid byte sequence".freeze
|
7
|
-
CONNECTION_ERRORS = ["SSL_connect", "Connection reset"
|
9
|
+
CONNECTION_ERRORS = ["SSL_connect", "Connection reset",
|
10
|
+
"execution expired"].freeze
|
8
11
|
REST_INTERVAL = 0.5..1
|
12
|
+
# redirection forbidden: https://t.co/blui5zKJjD -> http://bit.ly/2HXRYGw (RuntimeError)
|
13
|
+
REDIR_FORBIDDEN = /redirection forbidden/
|
14
|
+
HTTP_REDIR = %r(-> (http://\S+))
|
15
|
+
SRV_UNAVAILABLE = 503
|
9
16
|
|
10
17
|
include TextHelper
|
11
18
|
|
12
|
-
def open_w_timeout(url,
|
19
|
+
def open_w_timeout(url, connect_to:, read_to:, max_retries:)
|
13
20
|
logger.info "[#{Thread.current.name}] GET #{url}"
|
14
21
|
retries = 0
|
15
22
|
|
16
23
|
begin
|
17
24
|
res = Timeout.timeout(read_to) {
|
18
|
-
|
19
|
-
|
20
|
-
get(url, ssl_context: ssl_ctx)
|
25
|
+
open(url, ssl_verify_mode: OpenSSL::SSL::VERIFY_NONE,
|
26
|
+
open_timeout: connect_to, read_timeout: read_to)
|
21
27
|
}
|
22
|
-
rescue HTTP::Redirector::TooManyRedirectsError
|
23
|
-
logger.warn "[#{Thread.current.name}] GET #{url} resulted in more than #{max_redirects} redirect#{'s' unless max_redirects == 1}"
|
24
|
-
nil
|
25
28
|
rescue => err
|
26
29
|
msg = err.message
|
30
|
+
if msg =~ REDIR_FORBIDDEN
|
31
|
+
url = msg[HTTP_REDIR, 1]
|
32
|
+
url ? retry : return
|
33
|
+
end
|
27
34
|
|
28
|
-
if err.
|
35
|
+
if err.is_a?(OpenURI::HTTPError) || err.is_a?(Timeout::Error) ||
|
29
36
|
CONNECTION_ERRORS.any? { |e| msg.start_with?(e) }
|
30
37
|
retries += 1
|
31
38
|
|
32
|
-
|
39
|
+
will_retry = retries <= max_retries
|
40
|
+
if will_retry && err.is_a?(OpenURI::HTTPError)
|
41
|
+
will_retry = Integer(err.io.status.first) == SRV_UNAVAILABLE
|
42
|
+
end
|
43
|
+
|
44
|
+
if will_retry
|
33
45
|
rest_time = rand(REST_INTERVAL)
|
34
46
|
logger.warn "[#{Thread.current.name}] URL: #{url} [#{msg}] - Going to sleep for #{rest_time.round(1)} secs - Retry ##{retries}"
|
35
47
|
sleep(rest_time)
|
@@ -45,20 +57,10 @@ module HTTPHelper
|
|
45
57
|
end
|
46
58
|
end
|
47
59
|
|
48
|
-
def read_w_timeout(url,
|
49
|
-
if res = open_w_timeout(url,
|
50
|
-
|
51
|
-
[res.
|
60
|
+
def read_w_timeout(url, connect_to:, read_to:, max_retries:)
|
61
|
+
if res = open_w_timeout(url, connect_to: connect_to, read_to: read_to,
|
62
|
+
max_retries: max_retries)
|
63
|
+
[res.base_uri.to_s, utf8_encode(res.read)]
|
52
64
|
end
|
53
65
|
end
|
54
|
-
|
55
|
-
private
|
56
|
-
|
57
|
-
def ssl_ctx
|
58
|
-
@ssl_ctx ||= begin
|
59
|
-
ctx = OpenSSL::SSL::SSLContext.new
|
60
|
-
ctx.verify_mode = OpenSSL::SSL::VERIFY_NONE
|
61
|
-
ctx
|
62
|
-
end
|
63
|
-
end
|
64
66
|
end
|
data/lib/title_grabber.rb
CHANGED
@@ -13,11 +13,9 @@ require_relative "text_helper"
|
|
13
13
|
|
14
14
|
module TitleGrabber
|
15
15
|
DEF_OUT_PATH = Pathname('out.csv')
|
16
|
-
CONNECT_TO =
|
17
|
-
READ_TO =
|
18
|
-
|
19
|
-
MAX_REDIRECTS = 5
|
20
|
-
MAX_RETRIES = 5
|
16
|
+
CONNECT_TO = 30
|
17
|
+
READ_TO = 30
|
18
|
+
MAX_RETRIES = 3
|
21
19
|
MAX_THREADS = Etc.nprocessors
|
22
20
|
URL_RE = %r(https?://\S+)i
|
23
21
|
URL_HEADER = -"url"
|
@@ -41,8 +39,8 @@ module TitleGrabber
|
|
41
39
|
include HTTPHelper
|
42
40
|
include TextHelper
|
43
41
|
|
44
|
-
attr_reader :file_paths, :out_path, :tmp_path, :connect_to, :read_to,
|
45
|
-
:
|
42
|
+
attr_reader :file_paths, :out_path, :tmp_path, :connect_to, :read_to,
|
43
|
+
:max_retries, :max_threads, :logger
|
46
44
|
|
47
45
|
def initialize(options)
|
48
46
|
@file_paths = options[:file_paths]
|
@@ -52,10 +50,8 @@ module TitleGrabber
|
|
52
50
|
|
53
51
|
@connect_to = options.fetch(:connect_to, CONNECT_TO)
|
54
52
|
@read_to = options.fetch(:read_to, READ_TO)
|
55
|
-
@write_to = options.fetch(:write_to, WRITE_TO)
|
56
|
-
@max_redirects = options.fetch(:max_redirects, MAX_REDIRECTS)
|
57
53
|
@max_retries = options.fetch(:max_retries, MAX_RETRIES)
|
58
|
-
@max_threads = options.fetch(:
|
54
|
+
@max_threads = options.fetch(:max_threads, Etc.nprocessors)
|
59
55
|
|
60
56
|
logging_target = if options[:debug]
|
61
57
|
STDOUT
|
@@ -118,7 +114,7 @@ module TitleGrabber
|
|
118
114
|
tweet_urls.uniq!
|
119
115
|
tweet_urls.map! do |url|
|
120
116
|
if url.match?(URL_RE) && (res = open_w_timeout(url, **http_opts))
|
121
|
-
uri = res.
|
117
|
+
uri = res.base_uri
|
122
118
|
uri.host == TWITTER_HOST && !uri.to_s.match?(TWITTER_STATUS_RE) ? nil : uri.to_s
|
123
119
|
else
|
124
120
|
url
|
@@ -192,7 +188,7 @@ module TitleGrabber
|
|
192
188
|
|
193
189
|
def http_opts
|
194
190
|
@http_opts ||= { connect_to: connect_to, read_to: read_to,
|
195
|
-
|
191
|
+
max_retries: max_retries }
|
196
192
|
end
|
197
193
|
end
|
198
194
|
end
|
data/title_grabber.gemspec
CHANGED
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: title_grabber
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.4.1
|
4
|
+
version: 0.5.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Cristian Rasch
|
@@ -10,20 +10,6 @@ bindir: exe
|
|
10
10
|
cert_chain: []
|
11
11
|
date: 2019-04-18 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
|
-
- !ruby/object:Gem::Dependency
|
14
|
-
name: http
|
15
|
-
requirement: !ruby/object:Gem::Requirement
|
16
|
-
requirements:
|
17
|
-
- - "~>"
|
18
|
-
- !ruby/object:Gem::Version
|
19
|
-
version: '4.1'
|
20
|
-
type: :runtime
|
21
|
-
prerelease: false
|
22
|
-
version_requirements: !ruby/object:Gem::Requirement
|
23
|
-
requirements:
|
24
|
-
- - "~>"
|
25
|
-
- !ruby/object:Gem::Version
|
26
|
-
version: '4.1'
|
27
13
|
- !ruby/object:Gem::Dependency
|
28
14
|
name: oga
|
29
15
|
requirement: !ruby/object:Gem::Requirement
|