title_grabber 0.3.1 → 0.3.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: f341d404fbe5f16ef1eb904c75fbdd9331a76de7cb08c969d59526dbae9d95d0
4
- data.tar.gz: 586a09fdca43a48db702b69e4e6e5d123e123d5aa6631a8df8c7b9eb46dd8820
3
+ metadata.gz: 8c1d79385bd87e3116d10dc56c14b69929dccb1eb2597e99ed2166c683146f59
4
+ data.tar.gz: 1ea1fc865f6570c0ecb0f0f39c023585a9fac0c11dca43efbfe23a1d0aa1f075
5
5
  SHA512:
6
- metadata.gz: 14ceff1f56aea4cd1b9185b46e99af361ae243f809d50571933a96de2a757aa0589b4ecda34dfdeb9ff647c42412fed95acd1a1ee03fb875ed05229f53a71505
7
- data.tar.gz: fc2924e719dab7d72b8adf119fe11d29909433a84732a19b878a2faf437dfb5e4e595c696e63d525864133499816bb9722c76af375dea8000851e455908705ae
6
+ metadata.gz: 3b2fe97ff3c4a8723b6bfa634f3bf00f8096375a9066c217359e6cae0e5ba657560093ebf647a4c042f82b482030a842c0896a94ceb71fd943143bd0f9fa03b3
7
+ data.tar.gz: 936b43e66159e54246a57f59570e5dca550d4afbf87f7fae4bf1be8166af8fe0b1cab8d000409eb68293d4b5046ef40791574567010347e00ed20c359b9f5593
data/Gemfile.lock CHANGED
@@ -1,7 +1,7 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- title_grabber (0.3.1)
4
+ title_grabber (0.3.2)
5
5
  http (~> 4.1)
6
6
  oga (~> 2.15)
7
7
 
data/README.md CHANGED
@@ -33,6 +33,22 @@ Data is either recorded to out.csv in the CWD or the file specified using the
33
33
  title-grabber -o ~/output.csv /abs/path/2/file1.txt rel/path/2/file2.txt
34
34
  ```
35
35
 
36
+ ### Environment Variables
37
+
38
+ ```
39
+ DEBUG - when set it logs to STDOUT instead of to its default target, title_grabber.log
40
+ ```
41
+
42
+ ```
43
+ MAX_THREADS - max. # of threads to use, defaults to the # of CPU cores in the machine
44
+ ```
45
+
46
+ ```
47
+ CONNECT_TIMEOUT (in seconds) - defaults to 15
48
+ READ_TIMEOUT (in seconds) - defaults to 15
49
+ WRITE_TIMEOUT (in seconds) - defaults to 15
50
+ ```
51
+
36
52
  ## Development
37
53
 
38
54
  After checking out the repo, run `bin/setup` to install dependencies. Then, run `rake test` to run the tests. You can also run `bin/console` for an interactive prompt that will allow you to experiment.
data/exe/title-grabber CHANGED
@@ -6,15 +6,38 @@ require "pathname"
6
6
  require_relative '../lib/title_grabber'
7
7
 
8
8
  script_path = Pathname(__FILE__)
9
- def_out_path = Pathname('out.csv')
10
9
 
11
- arguments = { output: def_out_path }
10
+ arguments = {}
12
11
  OptionParser.new do |args|
13
12
  args.banner = "Usage: #{script_path.basename} [options]"
14
13
 
15
- args.on("-o", "--output FILE", "Output file (defaults to #{def_out_path.basename})") do |out|
14
+ args.on("-o", "--output FILE", "Output file. Defaults to #{TitleGrabber::DEF_OUT_PATH.basename}") do |out|
16
15
  arguments[:output] = Pathname(out)
17
16
  end
17
+
18
+ args.on("--connect-timeout TIMEOUT", Integer, "HTTP Connect timeout. Defaults to the value of the CONNECT_TIMEOUT env var or #{TitleGrabber::CONNECT_TO}") do |timeout|
19
+ arguments[:connect_to] = timeout
20
+ end
21
+
22
+ args.on("--read-timeout TIMEOUT", Integer, "HTTP Read timeout. Defaults to the value of the READ_TIMEOUT env var or #{TitleGrabber::READ_TO}") do |timeout|
23
+ arguments[:read_to] = timeout
24
+ end
25
+
26
+ args.on("--write-timeout TIMEOUT", Integer, "HTTP Write timeout. Defaults to the value of the WRITE_TIMEOUT env var or #{TitleGrabber::WRITE_TO}") do |timeout|
27
+ arguments[:write_to] = timeout
28
+ end
29
+
30
+ args.on("-r", "--max-retries RETRIES", Integer, "Max. # of times to retry failed HTTP reqs. Defaults to the value of the MAX_RETRIES env var or #{TitleGrabber::MAX_RETRIES}") do |retries|
31
+ arguments[:max_retries] = retries
32
+ end
33
+
34
+ args.on("-t", "--max-threads THREADS", Integer, "Max. # of threads to use. Defaults to the value of the MAX_THREADS env var or the # of logical processors in the system") do |threads|
35
+ arguments[:max_threads] = threads
36
+ end
37
+
38
+ args.on("-d", "--debug", "Log to STDOUT instead of to a file in the CWD") do |debug|
39
+ arguments[:debug] = true
40
+ end
18
41
  end.parse!
19
42
 
20
43
  if ARGV.empty?
data/lib/http_helper.rb CHANGED
@@ -3,19 +3,14 @@ require "http"
3
3
  require_relative "text_helper"
4
4
 
5
5
  module HTTPHelper
6
- WRITE_TO = Integer(ENV.fetch("WRITE_TIMEOUT", 15))
7
- CONN_TO = Integer(ENV.fetch("CONNECT_TIMEOUT", 15))
8
- READ_TO = Integer(ENV.fetch("READ_TIMEOUT", 15))
9
6
  MAX_HOPS = 5
10
- MAX_RETRIES = 3
11
7
  INVALID_BYTE_SEQ = "invalid byte sequence".freeze
12
8
  CONNECTION_ERRORS = ["SSL_connect", "Connection reset"].freeze
13
9
  REST_INTERVAL = 0.5..1
14
10
 
15
11
  include TextHelper
16
12
 
17
- def open_w_timeout(url, write_to: WRITE_TO, connect_to: CONN_TO,
18
- read_to: READ_TO)
13
+ def open_w_timeout(url, write_to:, connect_to:, read_to:, max_retries:)
19
14
  logger.info "[Thread: ##{Thread.current[:id]}] GET #{url}"
20
15
  retries = 0
21
16
 
@@ -33,7 +28,7 @@ module HTTPHelper
33
28
  CONNECTION_ERRORS.any? { |e| msg.start_with?(e) }
34
29
  retries += 1
35
30
 
36
- if retries <= MAX_RETRIES
31
+ if retries <= max_retries
37
32
  rest_time = rand(REST_INTERVAL)
38
33
  logger.warn "[Thread: ##{Thread.current[:id]}] URL: #{url} [#{msg}] - Going to sleep for #{rest_time.round(1)} secs - Retry ##{retries}"
39
34
  sleep(rest_time)
@@ -1,3 +1,3 @@
1
1
  module TitleGrabber
2
- VERSION = "0.3.1"
2
+ VERSION = "0.3.2"
3
3
  end
data/lib/title_grabber.rb CHANGED
@@ -1,5 +1,5 @@
1
- require "csv"
2
1
  require "etc"
2
+ require "csv"
3
3
  require "fileutils"
4
4
  require "logger"
5
5
  require "pathname"
@@ -12,6 +12,12 @@ require_relative "http_helper"
12
12
  require_relative "text_helper"
13
13
 
14
14
  module TitleGrabber
15
+ DEF_OUT_PATH = Pathname('out.csv')
16
+ CONNECT_TO = 15
17
+ READ_TO = 15
18
+ WRITE_TO = 15
19
+ MAX_RETRIES = 5
20
+ MAX_THREADS = Etc.nprocessors
15
21
  URL_RE = %r(https?://\S+)i
16
22
  URL_HEADER = -"url"
17
23
  PAGE_TIT_HEAD = -"page_title"
@@ -26,14 +32,22 @@ module TitleGrabber
26
32
  include HTTPHelper
27
33
  include TextHelper
28
34
 
29
- attr_reader :lines, :out_path, :tmp_path, :logger
35
+ attr_reader :lines, :out_path, :tmp_path, :connect_to, :read_to, :write_to,
36
+ :max_retries, :max_threads, :logger
30
37
 
31
38
  def initialize(lines, options)
32
39
  @lines = lines
33
- @out_path = options[:output]
40
+
41
+ @out_path = options.fetch(:output, DEF_OUT_PATH)
34
42
  @tmp_path = @out_path.sub_ext(".tmp#{@out_path.extname}")
35
43
 
36
- logging_target = if ENV["DEBUG"]
44
+ @connect_to = options.fetch(:connect_to, CONNECT_TO)
45
+ @read_to = options.fetch(:read_to, READ_TO)
46
+ @write_to = options.fetch(:write_to, WRITE_TO)
47
+ @max_retries = options.fetch(:max_retries, MAX_RETRIES)
48
+ @max_threads = options.fetch(:max_threads, MAX_THREADS) # key must match the CLI's arguments[:max_threads]
49
+
50
+ logging_target = if options[:debug]
37
51
  STDOUT
38
52
  else
39
53
  log_file = Pathname(__FILE__).sub_ext(".log").
@@ -64,8 +78,7 @@ module TitleGrabber
64
78
  end
65
79
  lines = nil
66
80
 
67
- thr_cnt = [Integer(ENV.fetch("MAX_THREADS", Etc.nprocessors)),
68
- queue.size].min
81
+ thr_cnt = [max_threads, queue.size].min
69
82
  threads = 1.upto(thr_cnt).map.with_index { |_, i|
70
83
  Thread.new(i) do |j|
71
84
  Thread.current[:id] = i + 1
@@ -75,7 +88,7 @@ module TitleGrabber
75
88
  rescue ThreadError; end
76
89
 
77
90
  while url
78
- if (html = open_w_timeout(url)) && !html.empty?
91
+ if (html = open_w_timeout(url, **http_opts)) && !html.empty?
79
92
  doc = begin
80
93
  Oga.parse_html(html)
81
94
  rescue ArgumentError, LL::ParserError => err
@@ -134,5 +147,10 @@ module TitleGrabber
134
147
  urls
135
148
  end
136
149
  end
150
+
151
+ def http_opts
152
+ @http_opts ||= { connect_to: connect_to, read_to: read_to,
153
+ write_to: write_to, max_retries: max_retries }
154
+ end
137
155
  end
138
156
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: title_grabber
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.3.1
4
+ version: 0.3.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - Cristian Rasch