title_grabber 0.4.1 → 0.5.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/Gemfile.lock +1 -19
- data/README.md +12 -0
- data/exe/title-grabber +1 -9
- data/lib/http_helper.rb +27 -25
- data/lib/title_grabber/version.rb +1 -1
- data/lib/title_grabber.rb +8 -12
- data/title_grabber.gemspec +0 -1
- metadata +1 -15
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: '028ecf2bea074495354ac08494517b7f254e8a27d782e2a3ff1eec7bd313474b'
|
4
|
+
data.tar.gz: 20adeb673a80980e1c3e8ee04285451eb0aa4d8e98c4c17cad5b72f9549f04ee
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 2a6268347f464325956f4281f0132b9f6c03d945eefab10754fb93387b0bd0db384cac05899238532f42c62869f58e6282e1e8eadf326bde9026b3d65ba97261
|
7
|
+
data.tar.gz: e1388a778ea78d29f9c791db0e6eaaa0fa379ba912f6869a0b90f087c1fb7e366c61b08f47dce25f003abbc8f6909dde3b3c99a1636fdcf1777eb2fcbb61c72e
|
data/Gemfile.lock
CHANGED
@@ -1,42 +1,24 @@
|
|
1
1
|
PATH
|
2
2
|
remote: .
|
3
3
|
specs:
|
4
|
-
title_grabber (0.4.
|
5
|
-
http (~> 4.1)
|
4
|
+
title_grabber (0.4.1)
|
6
5
|
oga (~> 2.15)
|
7
6
|
|
8
7
|
GEM
|
9
8
|
remote: https://rubygems.org/
|
10
9
|
specs:
|
11
|
-
addressable (2.6.0)
|
12
|
-
public_suffix (>= 2.0.2, < 4.0)
|
13
10
|
ansi (1.5.0)
|
14
11
|
ast (2.4.0)
|
15
|
-
domain_name (0.5.20180417)
|
16
|
-
unf (>= 0.0.5, < 1.0.0)
|
17
|
-
http (4.1.1)
|
18
|
-
addressable (~> 2.3)
|
19
|
-
http-cookie (~> 1.0)
|
20
|
-
http-form_data (~> 2.0)
|
21
|
-
http_parser.rb (~> 0.6.0)
|
22
|
-
http-cookie (1.0.3)
|
23
|
-
domain_name (~> 0.5)
|
24
|
-
http-form_data (2.1.1)
|
25
|
-
http_parser.rb (0.6.0)
|
26
12
|
minitest (5.11.3)
|
27
13
|
minitest-line (0.6.5)
|
28
14
|
minitest (~> 5.0)
|
29
15
|
oga (2.15)
|
30
16
|
ast
|
31
17
|
ruby-ll (~> 2.1)
|
32
|
-
public_suffix (3.0.3)
|
33
18
|
rake (10.5.0)
|
34
19
|
ruby-ll (2.1.2)
|
35
20
|
ansi
|
36
21
|
ast
|
37
|
-
unf (0.1.4)
|
38
|
-
unf_ext
|
39
|
-
unf_ext (0.0.7.5)
|
40
22
|
|
41
23
|
PLATFORMS
|
42
24
|
ruby
|
data/README.md
CHANGED
@@ -39,6 +39,18 @@ See all available CLI switches and env vars
|
|
39
39
|
title-grabber -h
|
40
40
|
```
|
41
41
|
|
42
|
+
Usage: title-grabber [options]
|
43
|
+
-V, --version Print program version and exit
|
44
|
+
-f, --files /f1.txt,f2.txt 1 or more comma-separated paths to text files containing 1 URL per line
|
45
|
+
-o, --output FILE Output file. Defaults to out.csv
|
46
|
+
--connect-timeout TIMEOUT HTTP Connect timeout. Defaults to the value of the CONNECT_TIMEOUT env var or 30
|
47
|
+
--read-timeout TIMEOUT HTTP Read timeout. Defaults to the value of the READ_TIMEOUT env var or 30
|
48
|
+
--write-timeout TIMEOUT HTTP Write timeout. Defaults to the value of the WRITE_TIMEOUT env var or 15
|
49
|
+
--max-redirects REDIRECTS Max. # of HTTP redirects to follow. Defaults to the value of the MAX_REDIRECTS env var or 5
|
50
|
+
-r, --max-retries RETRIES Max. # of times to retry failed HTTP reqs. Defaults to the value of the MAX_RETRIES env var or 3
|
51
|
+
-t, --max-threads THREADS Max. # of threads to use. Defaults to the value of the MAX_THREADS env var or the # of logical processors in the system
|
52
|
+
-d, --debug Log to STDOUT instead of to a file in the CWD
|
53
|
+
|
42
54
|
## Development
|
43
55
|
|
44
56
|
After checking out the repo, run `bin/setup` to install dependencies. Then, run `rake test` to run the tests. You can also run `bin/console` for an interactive prompt that will allow you to experiment.
|
data/exe/title-grabber
CHANGED
@@ -16,7 +16,7 @@ OptionParser.new do |args|
|
|
16
16
|
exit
|
17
17
|
end
|
18
18
|
|
19
|
-
args.on("-f", "--files /
|
19
|
+
args.on("-f", "--files /f1.txt,f2.txt", Array, "1 or more comma-separated paths to text files containing 1 URL per line ") do |files|
|
20
20
|
arguments[:file_paths] = files.map { |f| Pathname(f).expand_path }.
|
21
21
|
select { |f| f.file? && f.exist? }
|
22
22
|
end
|
@@ -33,14 +33,6 @@ OptionParser.new do |args|
|
|
33
33
|
arguments[:read_to] = timeout
|
34
34
|
end
|
35
35
|
|
36
|
-
args.on("--write-timeout TIMEOUT", Integer, "HTTP Write timeout. Defaults to the value of the WRITE_TIMEOUT env var or #{TitleGrabber::WRITE_TO}") do |timeout|
|
37
|
-
arguments[:write_to] = timeout
|
38
|
-
end
|
39
|
-
|
40
|
-
args.on("--max-redirects REDIRECTS", Integer, "Max. # of HTTP redirects to follow. Defaults to the value of the MAX_REDIRECTS env var or #{TitleGrabber::MAX_REDIRECTS}") do |redirects|
|
41
|
-
arguments[:max_redirects] = redirects
|
42
|
-
end
|
43
|
-
|
44
36
|
args.on("-r", "--max-retries RETRIES", Integer, "Max. # of times to retry failed HTTP reqs. Defaults to the value of the MAX_RETRIES env var or #{TitleGrabber::MAX_RETRIES}") do |retries|
|
45
37
|
arguments[:max_retries] = retries
|
46
38
|
end
|
data/lib/http_helper.rb
CHANGED
@@ -1,35 +1,47 @@
|
|
1
|
-
require "http"
|
1
|
+
require "openssl"
|
2
|
+
require "open-uri"
|
3
|
+
require "timeout"
|
2
4
|
|
3
5
|
require_relative "text_helper"
|
4
6
|
|
5
7
|
module HTTPHelper
|
6
8
|
INVALID_BYTE_SEQ = "invalid byte sequence".freeze
|
7
|
-
CONNECTION_ERRORS = ["SSL_connect", "Connection reset"].freeze
|
9
|
+
CONNECTION_ERRORS = ["SSL_connect", "Connection reset",
|
10
|
+
"execution expired"].freeze
|
8
11
|
REST_INTERVAL = 0.5..1
|
12
|
+
# redirection forbidden: https://t.co/blui5zKJjD -> http://bit.ly/2HXRYGw (RuntimeError)
|
13
|
+
REDIR_FORBIDDEN = /redirection forbidden/
|
14
|
+
HTTP_REDIR = %r(-> (http://\S+))
|
15
|
+
SRV_UNAVAILABLE = 503
|
9
16
|
|
10
17
|
include TextHelper
|
11
18
|
|
12
|
-
def open_w_timeout(url,
|
19
|
+
def open_w_timeout(url, connect_to:, read_to:, max_retries:)
|
13
20
|
logger.info "[#{Thread.current.name}] GET #{url}"
|
14
21
|
retries = 0
|
15
22
|
|
16
23
|
begin
|
17
24
|
res = Timeout.timeout(read_to) {
|
18
|
-
|
19
|
-
|
20
|
-
get(url, ssl_context: ssl_ctx)
|
25
|
+
open(url, ssl_verify_mode: OpenSSL::SSL::VERIFY_NONE,
|
26
|
+
open_timeout: connect_to, read_timeout: read_to)
|
21
27
|
}
|
22
|
-
rescue HTTP::Redirector::TooManyRedirectsError
|
23
|
-
logger.warn "[#{Thread.current.name}] GET #{url} resulted in more than #{max_redirects} redirect#{'s' unless max_redirects == 1}"
|
24
|
-
nil
|
25
28
|
rescue => err
|
26
29
|
msg = err.message
|
30
|
+
if msg =~ REDIR_FORBIDDEN
|
31
|
+
url = msg[HTTP_REDIR, 1]
|
32
|
+
url ? retry : return
|
33
|
+
end
|
27
34
|
|
28
|
-
if err.
|
35
|
+
if err.is_a?(OpenURI::HTTPError) || err.is_a?(Timeout::Error) ||
|
29
36
|
CONNECTION_ERRORS.any? { |e| msg.start_with?(e) }
|
30
37
|
retries += 1
|
31
38
|
|
32
|
-
|
39
|
+
will_retry = retries <= max_retries
|
40
|
+
if will_retry && err.is_a?(OpenURI::HTTPError)
|
41
|
+
will_retry = Integer(err.io.status.first) == SRV_UNAVAILABLE
|
42
|
+
end
|
43
|
+
|
44
|
+
if will_retry
|
33
45
|
rest_time = rand(REST_INTERVAL)
|
34
46
|
logger.warn "[#{Thread.current.name}] URL: #{url} [#{msg}] - Going to sleep for #{rest_time.round(1)} secs - Retry ##{retries}"
|
35
47
|
sleep(rest_time)
|
@@ -45,20 +57,10 @@ module HTTPHelper
|
|
45
57
|
end
|
46
58
|
end
|
47
59
|
|
48
|
-
def read_w_timeout(url,
|
49
|
-
if res = open_w_timeout(url,
|
50
|
-
|
51
|
-
[res.
|
60
|
+
def read_w_timeout(url, connect_to:, read_to:, max_retries:)
|
61
|
+
if res = open_w_timeout(url, connect_to: connect_to, read_to: read_to,
|
62
|
+
max_retries: max_retries)
|
63
|
+
[res.base_uri.to_s, utf8_encode(res.read)]
|
52
64
|
end
|
53
65
|
end
|
54
|
-
|
55
|
-
private
|
56
|
-
|
57
|
-
def ssl_ctx
|
58
|
-
@ssl_ctx ||= begin
|
59
|
-
ctx = OpenSSL::SSL::SSLContext.new
|
60
|
-
ctx.verify_mode = OpenSSL::SSL::VERIFY_NONE
|
61
|
-
ctx
|
62
|
-
end
|
63
|
-
end
|
64
66
|
end
|
data/lib/title_grabber.rb
CHANGED
@@ -13,11 +13,9 @@ require_relative "text_helper"
|
|
13
13
|
|
14
14
|
module TitleGrabber
|
15
15
|
DEF_OUT_PATH = Pathname('out.csv')
|
16
|
-
CONNECT_TO =
|
17
|
-
READ_TO =
|
18
|
-
|
19
|
-
MAX_REDIRECTS = 5
|
20
|
-
MAX_RETRIES = 5
|
16
|
+
CONNECT_TO = 30
|
17
|
+
READ_TO = 30
|
18
|
+
MAX_RETRIES = 3
|
21
19
|
MAX_THREADS = Etc.nprocessors
|
22
20
|
URL_RE = %r(https?://\S+)i
|
23
21
|
URL_HEADER = -"url"
|
@@ -41,8 +39,8 @@ module TitleGrabber
|
|
41
39
|
include HTTPHelper
|
42
40
|
include TextHelper
|
43
41
|
|
44
|
-
attr_reader :file_paths, :out_path, :tmp_path, :connect_to, :read_to,
|
45
|
-
:
|
42
|
+
attr_reader :file_paths, :out_path, :tmp_path, :connect_to, :read_to,
|
43
|
+
:max_retries, :max_threads, :logger
|
46
44
|
|
47
45
|
def initialize(options)
|
48
46
|
@file_paths = options[:file_paths]
|
@@ -52,10 +50,8 @@ module TitleGrabber
|
|
52
50
|
|
53
51
|
@connect_to = options.fetch(:connect_to, CONNECT_TO)
|
54
52
|
@read_to = options.fetch(:read_to, READ_TO)
|
55
|
-
@write_to = options.fetch(:write_to, WRITE_TO)
|
56
|
-
@max_redirects = options.fetch(:max_redirects, MAX_REDIRECTS)
|
57
53
|
@max_retries = options.fetch(:max_retries, MAX_RETRIES)
|
58
|
-
@max_threads = options.fetch(:
|
54
|
+
@max_threads = options.fetch(:max_threads, Etc.nprocessors)
|
59
55
|
|
60
56
|
logging_target = if options[:debug]
|
61
57
|
STDOUT
|
@@ -118,7 +114,7 @@ module TitleGrabber
|
|
118
114
|
tweet_urls.uniq!
|
119
115
|
tweet_urls.map! do |url|
|
120
116
|
if url.match?(URL_RE) && (res = open_w_timeout(url, **http_opts))
|
121
|
-
uri = res.
|
117
|
+
uri = res.base_uri
|
122
118
|
uri.host == TWITTER_HOST && !uri.to_s.match?(TWITTER_STATUS_RE) ? nil : uri.to_s
|
123
119
|
else
|
124
120
|
url
|
@@ -192,7 +188,7 @@ module TitleGrabber
|
|
192
188
|
|
193
189
|
def http_opts
|
194
190
|
@http_opts ||= { connect_to: connect_to, read_to: read_to,
|
195
|
-
|
191
|
+
max_retries: max_retries }
|
196
192
|
end
|
197
193
|
end
|
198
194
|
end
|
data/title_grabber.gemspec
CHANGED
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: title_grabber
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.4.1
|
4
|
+
version: 0.5.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Cristian Rasch
|
@@ -10,20 +10,6 @@ bindir: exe
|
|
10
10
|
cert_chain: []
|
11
11
|
date: 2019-04-18 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
|
-
- !ruby/object:Gem::Dependency
|
14
|
-
name: http
|
15
|
-
requirement: !ruby/object:Gem::Requirement
|
16
|
-
requirements:
|
17
|
-
- - "~>"
|
18
|
-
- !ruby/object:Gem::Version
|
19
|
-
version: '4.1'
|
20
|
-
type: :runtime
|
21
|
-
prerelease: false
|
22
|
-
version_requirements: !ruby/object:Gem::Requirement
|
23
|
-
requirements:
|
24
|
-
- - "~>"
|
25
|
-
- !ruby/object:Gem::Version
|
26
|
-
version: '4.1'
|
27
13
|
- !ruby/object:Gem::Dependency
|
28
14
|
name: oga
|
29
15
|
requirement: !ruby/object:Gem::Requirement
|