title_grabber 0.4.1 → 0.5.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 58ceec17ac17673c12eee2cdd5fbef23cdc561b89b5820f652e592a91047a399
4
- data.tar.gz: 3f4be7dbff89b096c51fb28839d0bd2f4a85873507ee6600b7d2e481c16ea766
3
+ metadata.gz: '028ecf2bea074495354ac08494517b7f254e8a27d782e2a3ff1eec7bd313474b'
4
+ data.tar.gz: 20adeb673a80980e1c3e8ee04285451eb0aa4d8e98c4c17cad5b72f9549f04ee
5
5
  SHA512:
6
- metadata.gz: 2e87da24fb755d6869b68af2fd201a19e7f9ea6754b1dd26263f58124055847a9594cd186a88c39b2b91f59507f969a35b47bbf4924ca0c35c8f2776e4533836
7
- data.tar.gz: '0957bf0dc17b6f02dff00b185ae0fd9ba163ce9610655063dab0ce8e16bd8c761db1a7cd8d283a685a408ed2470bef1ac7bcb42167f995e2857feeeb94e7647c'
6
+ metadata.gz: 2a6268347f464325956f4281f0132b9f6c03d945eefab10754fb93387b0bd0db384cac05899238532f42c62869f58e6282e1e8eadf326bde9026b3d65ba97261
7
+ data.tar.gz: e1388a778ea78d29f9c791db0e6eaaa0fa379ba912f6869a0b90f087c1fb7e366c61b08f47dce25f003abbc8f6909dde3b3c99a1636fdcf1777eb2fcbb61c72e
data/Gemfile.lock CHANGED
@@ -1,42 +1,24 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- title_grabber (0.4.0)
5
- http (~> 4.1)
4
+ title_grabber (0.4.1)
6
5
  oga (~> 2.15)
7
6
 
8
7
  GEM
9
8
  remote: https://rubygems.org/
10
9
  specs:
11
- addressable (2.6.0)
12
- public_suffix (>= 2.0.2, < 4.0)
13
10
  ansi (1.5.0)
14
11
  ast (2.4.0)
15
- domain_name (0.5.20180417)
16
- unf (>= 0.0.5, < 1.0.0)
17
- http (4.1.1)
18
- addressable (~> 2.3)
19
- http-cookie (~> 1.0)
20
- http-form_data (~> 2.0)
21
- http_parser.rb (~> 0.6.0)
22
- http-cookie (1.0.3)
23
- domain_name (~> 0.5)
24
- http-form_data (2.1.1)
25
- http_parser.rb (0.6.0)
26
12
  minitest (5.11.3)
27
13
  minitest-line (0.6.5)
28
14
  minitest (~> 5.0)
29
15
  oga (2.15)
30
16
  ast
31
17
  ruby-ll (~> 2.1)
32
- public_suffix (3.0.3)
33
18
  rake (10.5.0)
34
19
  ruby-ll (2.1.2)
35
20
  ansi
36
21
  ast
37
- unf (0.1.4)
38
- unf_ext
39
- unf_ext (0.0.7.5)
40
22
 
41
23
  PLATFORMS
42
24
  ruby
data/README.md CHANGED
@@ -39,6 +39,18 @@ See all available CLI switches and env vars
39
39
  title-grabber -h
40
40
  ```
41
41
 
42
+ Usage: title-grabber [options]
43
+ -V, --version Print program version and exit
44
+ -f, --files /f1.txt,f2.txt 1 or more comma-separated paths to text files containing 1 URL per line
45
+ -o, --output FILE Output file. Defaults to out.csv
46
+ --connect-timeout TIMEOUT HTTP Connect timeout. Defaults to the value of the CONNECT_TIMEOUT env var or 30
47
+ --read-timeout TIMEOUT HTTP Read timeout. Defaults to the value of the READ_TIMEOUT env var or 30
48
+ --write-timeout TIMEOUT HTTP Write timeout. Defaults to the value of the WRITE_TIMEOUT env var or 15
49
+ --max-redirects REDIRECTS Max. # of HTTP redirects to follow. Defaults to the value of the MAX_REDIRECTS env var or 5
50
+ -r, --max-retries RETRIES Max. # of times to retry failed HTTP reqs. Defaults to the value of the MAX_RETRIES env var or 3
51
+ -t, --max-threads THREADS Max. # of threads to use. Defaults to the value of the MAX_THREADS env var or the # of logical processors in the system
52
+ -d, --debug Log to STDOUT instead of to a file in the CWD
53
+
42
54
  ## Development
43
55
 
44
56
  After checking out the repo, run `bin/setup` to install dependencies. Then, run `rake test` to run the tests. You can also run `bin/console` for an interactive prompt that will allow you to experiment.
data/exe/title-grabber CHANGED
@@ -16,7 +16,7 @@ OptionParser.new do |args|
16
16
  exit
17
17
  end
18
18
 
19
- args.on("-f", "--files /abs/f1,rel/f2", Array, "1 or more comma-separated paths to text files containing 1 URL per line ") do |files|
19
+ args.on("-f", "--files /f1.txt,f2.txt", Array, "1 or more comma-separated paths to text files containing 1 URL per line ") do |files|
20
20
  arguments[:file_paths] = files.map { |f| Pathname(f).expand_path }.
21
21
  select { |f| f.file? && f.exist? }
22
22
  end
@@ -33,14 +33,6 @@ OptionParser.new do |args|
33
33
  arguments[:read_to] = timeout
34
34
  end
35
35
 
36
- args.on("--write-timeout TIMEOUT", Integer, "HTTP Write timeout. Defaults to the value of the WRITE_TIMEOUT env var or #{TitleGrabber::WRITE_TO}") do |timeout|
37
- arguments[:write_to] = timeout
38
- end
39
-
40
- args.on("--max-redirects REDIRECTS", Integer, "Max. # of HTTP redirects to follow. Defaults to the value of the MAX_REDIRECTS env var or #{TitleGrabber::MAX_REDIRECTS}") do |redirects|
41
- arguments[:max_redirects] = redirects
42
- end
43
-
44
36
  args.on("-r", "--max-retries RETRIES", Integer, "Max. # of times to retry failed HTTP reqs. Defaults to the value of the MAX_RETRIES env var or #{TitleGrabber::MAX_RETRIES}") do |retries|
45
37
  arguments[:max_retries] = retries
46
38
  end
data/lib/http_helper.rb CHANGED
@@ -1,35 +1,47 @@
1
- require "http"
1
+ require "openssl"
2
+ require "open-uri"
3
+ require "timeout"
2
4
 
3
5
  require_relative "text_helper"
4
6
 
5
7
  module HTTPHelper
6
8
  INVALID_BYTE_SEQ = "invalid byte sequence".freeze
7
- CONNECTION_ERRORS = ["SSL_connect", "Connection reset"].freeze
9
+ CONNECTION_ERRORS = ["SSL_connect", "Connection reset",
10
+ "execution expired"].freeze
8
11
  REST_INTERVAL = 0.5..1
12
+ # redirection forbidden: https://t.co/blui5zKJjD -> http://bit.ly/2HXRYGw (RuntimeError)
13
+ REDIR_FORBIDDEN = /redirection forbidden/
14
+ HTTP_REDIR = %r(-> (http://\S+))
15
+ SRV_UNAVAILABLE = 503
9
16
 
10
17
  include TextHelper
11
18
 
12
- def open_w_timeout(url, write_to:, connect_to:, read_to:, max_retries:)
19
+ def open_w_timeout(url, connect_to:, read_to:, max_retries:)
13
20
  logger.info "[#{Thread.current.name}] GET #{url}"
14
21
  retries = 0
15
22
 
16
23
  begin
17
24
  res = Timeout.timeout(read_to) {
18
- HTTP.timeout(write: write_to, connect: connect_to, read: read_to).
19
- follow(max_hops: max_redirects).
20
- get(url, ssl_context: ssl_ctx)
25
+ open(url, ssl_verify_mode: OpenSSL::SSL::VERIFY_NONE,
26
+ open_timeout: connect_to, read_timeout: read_to)
21
27
  }
22
- rescue HTTP::Redirector::TooManyRedirectsError
23
- logger.warn "[#{Thread.current.name}] GET #{url} resulted in more than #{max_redirects} redirect#{'s' unless max_redirects == 1}"
24
- nil
25
28
  rescue => err
26
29
  msg = err.message
30
+ if msg =~ REDIR_FORBIDDEN
31
+ url = msg[HTTP_REDIR, 1]
32
+ url ? retry : return
33
+ end
27
34
 
28
- if err.kind_of?(HTTP::Error) || err.kind_of?(Timeout::Error) ||
35
+ if err.is_a?(OpenURI::HTTPError) || err.is_a?(Timeout::Error) ||
29
36
  CONNECTION_ERRORS.any? { |e| msg.start_with?(e) }
30
37
  retries += 1
31
38
 
32
- if retries <= max_retries
39
+ will_retry = retries <= max_retries
40
+ if will_retry && err.is_a?(OpenURI::HTTPError)
41
+ will_retry = Integer(err.io.status.first) == SRV_UNAVAILABLE
42
+ end
43
+
44
+ if will_retry
33
45
  rest_time = rand(REST_INTERVAL)
34
46
  logger.warn "[#{Thread.current.name}] URL: #{url} [#{msg}] - Going to sleep for #{rest_time.round(1)} secs - Retry ##{retries}"
35
47
  sleep(rest_time)
@@ -45,20 +57,10 @@ module HTTPHelper
45
57
  end
46
58
  end
47
59
 
48
- def read_w_timeout(url, write_to:, connect_to:, read_to:, max_retries:)
49
- if res = open_w_timeout(url, write_to: write_to, connect_to: connect_to,
50
- read_to: read_to, max_retries: max_retries)
51
- [res.uri.to_s, utf8_encode(res.to_s)]
60
+ def read_w_timeout(url, connect_to:, read_to:, max_retries:)
61
+ if res = open_w_timeout(url, connect_to: connect_to, read_to: read_to,
62
+ max_retries: max_retries)
63
+ [res.base_uri.to_s, utf8_encode(res.read)]
52
64
  end
53
65
  end
54
-
55
- private
56
-
57
- def ssl_ctx
58
- @ssl_ctx ||= begin
59
- ctx = OpenSSL::SSL::SSLContext.new
60
- ctx.verify_mode = OpenSSL::SSL::VERIFY_NONE
61
- ctx
62
- end
63
- end
64
66
  end
@@ -1,3 +1,3 @@
1
1
  module TitleGrabber
2
- VERSION = "0.4.1"
2
+ VERSION = "0.5.0"
3
3
  end
data/lib/title_grabber.rb CHANGED
@@ -13,11 +13,9 @@ require_relative "text_helper"
13
13
 
14
14
  module TitleGrabber
15
15
  DEF_OUT_PATH = Pathname('out.csv')
16
- CONNECT_TO = 15
17
- READ_TO = 15
18
- WRITE_TO = 15
19
- MAX_REDIRECTS = 5
20
- MAX_RETRIES = 5
16
+ CONNECT_TO = 30
17
+ READ_TO = 30
18
+ MAX_RETRIES = 3
21
19
  MAX_THREADS = Etc.nprocessors
22
20
  URL_RE = %r(https?://\S+)i
23
21
  URL_HEADER = -"url"
@@ -41,8 +39,8 @@ module TitleGrabber
41
39
  include HTTPHelper
42
40
  include TextHelper
43
41
 
44
- attr_reader :file_paths, :out_path, :tmp_path, :connect_to, :read_to, :write_to,
45
- :max_redirects, :max_retries, :max_threads, :logger
42
+ attr_reader :file_paths, :out_path, :tmp_path, :connect_to, :read_to,
43
+ :max_retries, :max_threads, :logger
46
44
 
47
45
  def initialize(options)
48
46
  @file_paths = options[:file_paths]
@@ -52,10 +50,8 @@ module TitleGrabber
52
50
 
53
51
  @connect_to = options.fetch(:connect_to, CONNECT_TO)
54
52
  @read_to = options.fetch(:read_to, READ_TO)
55
- @write_to = options.fetch(:write_to, WRITE_TO)
56
- @max_redirects = options.fetch(:max_redirects, MAX_REDIRECTS)
57
53
  @max_retries = options.fetch(:max_retries, MAX_RETRIES)
58
- @max_threads = options.fetch(:max_th, Etc.nprocessors)
54
+ @max_threads = options.fetch(:max_threads, Etc.nprocessors)
59
55
 
60
56
  logging_target = if options[:debug]
61
57
  STDOUT
@@ -118,7 +114,7 @@ module TitleGrabber
118
114
  tweet_urls.uniq!
119
115
  tweet_urls.map! do |url|
120
116
  if url.match?(URL_RE) && (res = open_w_timeout(url, **http_opts))
121
- uri = res.uri
117
+ uri = res.base_uri
122
118
  uri.host == TWITTER_HOST && !uri.to_s.match?(TWITTER_STATUS_RE) ? nil : uri.to_s
123
119
  else
124
120
  url
@@ -192,7 +188,7 @@ module TitleGrabber
192
188
 
193
189
  def http_opts
194
190
  @http_opts ||= { connect_to: connect_to, read_to: read_to,
195
- write_to: write_to, max_retries: max_retries }
191
+ max_retries: max_retries }
196
192
  end
197
193
  end
198
194
  end
@@ -38,7 +38,6 @@ Gem::Specification.new do |spec|
38
38
 
39
39
  spec.required_ruby_version = "~> 2.3"
40
40
 
41
- spec.add_runtime_dependency "http", "~> 4.1"
42
41
  spec.add_runtime_dependency "oga", "~> 2.15"
43
42
 
44
43
  spec.add_development_dependency "bundler", "~> 1.17"
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: title_grabber
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.4.1
4
+ version: 0.5.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Cristian Rasch
@@ -10,20 +10,6 @@ bindir: exe
10
10
  cert_chain: []
11
11
  date: 2019-04-18 00:00:00.000000000 Z
12
12
  dependencies:
13
- - !ruby/object:Gem::Dependency
14
- name: http
15
- requirement: !ruby/object:Gem::Requirement
16
- requirements:
17
- - - "~>"
18
- - !ruby/object:Gem::Version
19
- version: '4.1'
20
- type: :runtime
21
- prerelease: false
22
- version_requirements: !ruby/object:Gem::Requirement
23
- requirements:
24
- - - "~>"
25
- - !ruby/object:Gem::Version
26
- version: '4.1'
27
13
  - !ruby/object:Gem::Dependency
28
14
  name: oga
29
15
  requirement: !ruby/object:Gem::Requirement