title_grabber 0.4.1 → 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 58ceec17ac17673c12eee2cdd5fbef23cdc561b89b5820f652e592a91047a399
4
- data.tar.gz: 3f4be7dbff89b096c51fb28839d0bd2f4a85873507ee6600b7d2e481c16ea766
3
+ metadata.gz: '028ecf2bea074495354ac08494517b7f254e8a27d782e2a3ff1eec7bd313474b'
4
+ data.tar.gz: 20adeb673a80980e1c3e8ee04285451eb0aa4d8e98c4c17cad5b72f9549f04ee
5
5
  SHA512:
6
- metadata.gz: 2e87da24fb755d6869b68af2fd201a19e7f9ea6754b1dd26263f58124055847a9594cd186a88c39b2b91f59507f969a35b47bbf4924ca0c35c8f2776e4533836
7
- data.tar.gz: '0957bf0dc17b6f02dff00b185ae0fd9ba163ce9610655063dab0ce8e16bd8c761db1a7cd8d283a685a408ed2470bef1ac7bcb42167f995e2857feeeb94e7647c'
6
+ metadata.gz: 2a6268347f464325956f4281f0132b9f6c03d945eefab10754fb93387b0bd0db384cac05899238532f42c62869f58e6282e1e8eadf326bde9026b3d65ba97261
7
+ data.tar.gz: e1388a778ea78d29f9c791db0e6eaaa0fa379ba912f6869a0b90f087c1fb7e366c61b08f47dce25f003abbc8f6909dde3b3c99a1636fdcf1777eb2fcbb61c72e
data/Gemfile.lock CHANGED
@@ -1,42 +1,24 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- title_grabber (0.4.0)
5
- http (~> 4.1)
4
+ title_grabber (0.4.1)
6
5
  oga (~> 2.15)
7
6
 
8
7
  GEM
9
8
  remote: https://rubygems.org/
10
9
  specs:
11
- addressable (2.6.0)
12
- public_suffix (>= 2.0.2, < 4.0)
13
10
  ansi (1.5.0)
14
11
  ast (2.4.0)
15
- domain_name (0.5.20180417)
16
- unf (>= 0.0.5, < 1.0.0)
17
- http (4.1.1)
18
- addressable (~> 2.3)
19
- http-cookie (~> 1.0)
20
- http-form_data (~> 2.0)
21
- http_parser.rb (~> 0.6.0)
22
- http-cookie (1.0.3)
23
- domain_name (~> 0.5)
24
- http-form_data (2.1.1)
25
- http_parser.rb (0.6.0)
26
12
  minitest (5.11.3)
27
13
  minitest-line (0.6.5)
28
14
  minitest (~> 5.0)
29
15
  oga (2.15)
30
16
  ast
31
17
  ruby-ll (~> 2.1)
32
- public_suffix (3.0.3)
33
18
  rake (10.5.0)
34
19
  ruby-ll (2.1.2)
35
20
  ansi
36
21
  ast
37
- unf (0.1.4)
38
- unf_ext
39
- unf_ext (0.0.7.5)
40
22
 
41
23
  PLATFORMS
42
24
  ruby
data/README.md CHANGED
@@ -39,6 +39,18 @@ See all available CLI switches and env vars
39
39
  title-grabber -h
40
40
  ```
41
41
 
42
+ Usage: title-grabber [options]
43
+ -V, --version Print program version and exit
44
+ -f, --files /f1.txt,f2.txt 1 or more comma-separated paths to text files containing 1 URL per line
45
+ -o, --output FILE Output file. Defaults to out.csv
46
+ --connect-timeout TIMEOUT HTTP Connect timeout. Defaults to the value of the CONNECT_TIMEOUT env var or 15
47
+ --read-timeout TIMEOUT HTTP Read timeout. Defaults to the value of the READ_TIMEOUT env var or 15
48
+ --write-timeout TIMEOUT HTTP Write timeout. Defaults to the value of the WRITE_TIMEOUT env var or 15
49
+ --max-redirects REDIRECTS Max. # of HTTP redirects to follow. Defaults to the value of the MAX_REDIRECTS env var or 5
50
+ -r, --max-retries RETRIES Max. # of times to retry failed HTTP reqs. Defaults to the value of the MAX_RETRIES env var or 5
51
+ -t, --max-threads THREADS Max. # of threads to use. Defaults to the value of the MAX_THREADS env var or the # of logical processors in the system
52
+ -d, --debug Log to STDOUT instead of to a file in the CWD
53
+
42
54
  ## Development
43
55
 
44
56
  After checking out the repo, run `bin/setup` to install dependencies. Then, run `rake test` to run the tests. You can also run `bin/console` for an interactive prompt that will allow you to experiment.
data/exe/title-grabber CHANGED
@@ -16,7 +16,7 @@ OptionParser.new do |args|
16
16
  exit
17
17
  end
18
18
 
19
- args.on("-f", "--files /abs/f1,rel/f2", Array, "1 or more comma-separated paths to text files containing 1 URL per line ") do |files|
19
+ args.on("-f", "--files /f1.txt,f2.txt", Array, "1 or more comma-separated paths to text files containing 1 URL per line ") do |files|
20
20
  arguments[:file_paths] = files.map { |f| Pathname(f).expand_path }.
21
21
  select { |f| f.file? && f.exist? }
22
22
  end
@@ -33,14 +33,6 @@ OptionParser.new do |args|
33
33
  arguments[:read_to] = timeout
34
34
  end
35
35
 
36
- args.on("--write-timeout TIMEOUT", Integer, "HTTP Write timeout. Defaults to the value of the WRITE_TIMEOUT env var or #{TitleGrabber::WRITE_TO}") do |timeout|
37
- arguments[:write_to] = timeout
38
- end
39
-
40
- args.on("--max-redirects REDIRECTS", Integer, "Max. # of HTTP redirects to follow. Defaults to the value of the MAX_REDIRECTS env var or #{TitleGrabber::MAX_REDIRECTS}") do |redirects|
41
- arguments[:max_redirects] = redirects
42
- end
43
-
44
36
  args.on("-r", "--max-retries RETRIES", Integer, "Max. # of times to retry failed HTTP reqs. Defaults to the value of the MAX_RETRIES env var or #{TitleGrabber::MAX_RETRIES}") do |retries|
45
37
  arguments[:max_retries] = retries
46
38
  end
data/lib/http_helper.rb CHANGED
@@ -1,35 +1,47 @@
1
- require "http"
1
+ require "openssl"
2
+ require "open-uri"
3
+ require "timeout"
2
4
 
3
5
  require_relative "text_helper"
4
6
 
5
7
  module HTTPHelper
6
8
  INVALID_BYTE_SEQ = "invalid byte sequence".freeze
7
- CONNECTION_ERRORS = ["SSL_connect", "Connection reset"].freeze
9
+ CONNECTION_ERRORS = ["SSL_connect", "Connection reset",
10
+ "execution expired"].freeze
8
11
  REST_INTERVAL = 0.5..1
12
+ # redirection forbidden: https://t.co/blui5zKJjD -> http://bit.ly/2HXRYGw (RuntimeError)
13
+ REDIR_FORBIDDEN = /redirection forbidden/
14
+ HTTP_REDIR = %r(-> (http://\S+))
15
+ SRV_UNAVAILABLE = 503
9
16
 
10
17
  include TextHelper
11
18
 
12
- def open_w_timeout(url, write_to:, connect_to:, read_to:, max_retries:)
19
+ def open_w_timeout(url, connect_to:, read_to:, max_retries:)
13
20
  logger.info "[#{Thread.current.name}] GET #{url}"
14
21
  retries = 0
15
22
 
16
23
  begin
17
24
  res = Timeout.timeout(read_to) {
18
- HTTP.timeout(write: write_to, connect: connect_to, read: read_to).
19
- follow(max_hops: max_redirects).
20
- get(url, ssl_context: ssl_ctx)
25
+ open(url, ssl_verify_mode: OpenSSL::SSL::VERIFY_NONE,
26
+ open_timeout: connect_to, read_timeout: read_to)
21
27
  }
22
- rescue HTTP::Redirector::TooManyRedirectsError
23
- logger.warn "[#{Thread.current.name}] GET #{url} resulted in more than #{max_redirects} redirect#{'s' unless max_redirects == 1}"
24
- nil
25
28
  rescue => err
26
29
  msg = err.message
30
+ if msg =~ REDIR_FORBIDDEN
31
+ url = msg[HTTP_REDIR, 1]
32
+ url ? retry : return
33
+ end
27
34
 
28
- if err.kind_of?(HTTP::Error) || err.kind_of?(Timeout::Error) ||
35
+ if err.is_a?(OpenURI::HTTPError) || err.is_a?(Timeout::Error) ||
29
36
  CONNECTION_ERRORS.any? { |e| msg.start_with?(e) }
30
37
  retries += 1
31
38
 
32
- if retries <= max_retries
39
+ will_retry = retries <= max_retries
40
+ if will_retry && err.is_a?(OpenURI::HTTPError)
41
+ will_retry = Integer(err.io.status.first) == SRV_UNAVAILABLE
42
+ end
43
+
44
+ if will_retry
33
45
  rest_time = rand(REST_INTERVAL)
34
46
  logger.warn "[#{Thread.current.name}] URL: #{url} [#{msg}] - Going to sleep for #{rest_time.round(1)} secs - Retry ##{retries}"
35
47
  sleep(rest_time)
@@ -45,20 +57,10 @@ module HTTPHelper
45
57
  end
46
58
  end
47
59
 
48
- def read_w_timeout(url, write_to:, connect_to:, read_to:, max_retries:)
49
- if res = open_w_timeout(url, write_to: write_to, connect_to: connect_to,
50
- read_to: read_to, max_retries: max_retries)
51
- [res.uri.to_s, utf8_encode(res.to_s)]
60
+ def read_w_timeout(url, connect_to:, read_to:, max_retries:)
61
+ if res = open_w_timeout(url, connect_to: connect_to, read_to: read_to,
62
+ max_retries: max_retries)
63
+ [res.base_uri.to_s, utf8_encode(res.read)]
52
64
  end
53
65
  end
54
-
55
- private
56
-
57
- def ssl_ctx
58
- @ssl_ctx ||= begin
59
- ctx = OpenSSL::SSL::SSLContext.new
60
- ctx.verify_mode = OpenSSL::SSL::VERIFY_NONE
61
- ctx
62
- end
63
- end
64
66
  end
@@ -1,3 +1,3 @@
1
1
  module TitleGrabber
2
- VERSION = "0.4.1"
2
+ VERSION = "0.5.0"
3
3
  end
data/lib/title_grabber.rb CHANGED
@@ -13,11 +13,9 @@ require_relative "text_helper"
13
13
 
14
14
  module TitleGrabber
15
15
  DEF_OUT_PATH = Pathname('out.csv')
16
- CONNECT_TO = 15
17
- READ_TO = 15
18
- WRITE_TO = 15
19
- MAX_REDIRECTS = 5
20
- MAX_RETRIES = 5
16
+ CONNECT_TO = 30
17
+ READ_TO = 30
18
+ MAX_RETRIES = 3
21
19
  MAX_THREADS = Etc.nprocessors
22
20
  URL_RE = %r(https?://\S+)i
23
21
  URL_HEADER = -"url"
@@ -41,8 +39,8 @@ module TitleGrabber
41
39
  include HTTPHelper
42
40
  include TextHelper
43
41
 
44
- attr_reader :file_paths, :out_path, :tmp_path, :connect_to, :read_to, :write_to,
45
- :max_redirects, :max_retries, :max_threads, :logger
42
+ attr_reader :file_paths, :out_path, :tmp_path, :connect_to, :read_to,
43
+ :max_retries, :max_threads, :logger
46
44
 
47
45
  def initialize(options)
48
46
  @file_paths = options[:file_paths]
@@ -52,10 +50,8 @@ module TitleGrabber
52
50
 
53
51
  @connect_to = options.fetch(:connect_to, CONNECT_TO)
54
52
  @read_to = options.fetch(:read_to, READ_TO)
55
- @write_to = options.fetch(:write_to, WRITE_TO)
56
- @max_redirects = options.fetch(:max_redirects, MAX_REDIRECTS)
57
53
  @max_retries = options.fetch(:max_retries, MAX_RETRIES)
58
- @max_threads = options.fetch(:max_th, Etc.nprocessors)
54
+ @max_threads = options.fetch(:max_threads, Etc.nprocessors)
59
55
 
60
56
  logging_target = if options[:debug]
61
57
  STDOUT
@@ -118,7 +114,7 @@ module TitleGrabber
118
114
  tweet_urls.uniq!
119
115
  tweet_urls.map! do |url|
120
116
  if url.match?(URL_RE) && (res = open_w_timeout(url, **http_opts))
121
- uri = res.uri
117
+ uri = res.base_uri
122
118
  uri.host == TWITTER_HOST && !uri.to_s.match?(TWITTER_STATUS_RE) ? nil : uri.to_s
123
119
  else
124
120
  url
@@ -192,7 +188,7 @@ module TitleGrabber
192
188
 
193
189
  def http_opts
194
190
  @http_opts ||= { connect_to: connect_to, read_to: read_to,
195
- write_to: write_to, max_retries: max_retries }
191
+ max_retries: max_retries }
196
192
  end
197
193
  end
198
194
  end
@@ -38,7 +38,6 @@ Gem::Specification.new do |spec|
38
38
 
39
39
  spec.required_ruby_version = "~> 2.3"
40
40
 
41
- spec.add_runtime_dependency "http", "~> 4.1"
42
41
  spec.add_runtime_dependency "oga", "~> 2.15"
43
42
 
44
43
  spec.add_development_dependency "bundler", "~> 1.17"
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: title_grabber
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.4.1
4
+ version: 0.5.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Cristian Rasch
@@ -10,20 +10,6 @@ bindir: exe
10
10
  cert_chain: []
11
11
  date: 2019-04-18 00:00:00.000000000 Z
12
12
  dependencies:
13
- - !ruby/object:Gem::Dependency
14
- name: http
15
- requirement: !ruby/object:Gem::Requirement
16
- requirements:
17
- - - "~>"
18
- - !ruby/object:Gem::Version
19
- version: '4.1'
20
- type: :runtime
21
- prerelease: false
22
- version_requirements: !ruby/object:Gem::Requirement
23
- requirements:
24
- - - "~>"
25
- - !ruby/object:Gem::Version
26
- version: '4.1'
27
13
  - !ruby/object:Gem::Dependency
28
14
  name: oga
29
15
  requirement: !ruby/object:Gem::Requirement