title_grabber 0.3.1 → 0.3.2

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: f341d404fbe5f16ef1eb904c75fbdd9331a76de7cb08c969d59526dbae9d95d0
4
- data.tar.gz: 586a09fdca43a48db702b69e4e6e5d123e123d5aa6631a8df8c7b9eb46dd8820
3
+ metadata.gz: 8c1d79385bd87e3116d10dc56c14b69929dccb1eb2597e99ed2166c683146f59
4
+ data.tar.gz: 1ea1fc865f6570c0ecb0f0f39c023585a9fac0c11dca43efbfe23a1d0aa1f075
5
5
  SHA512:
6
- metadata.gz: 14ceff1f56aea4cd1b9185b46e99af361ae243f809d50571933a96de2a757aa0589b4ecda34dfdeb9ff647c42412fed95acd1a1ee03fb875ed05229f53a71505
7
- data.tar.gz: fc2924e719dab7d72b8adf119fe11d29909433a84732a19b878a2faf437dfb5e4e595c696e63d525864133499816bb9722c76af375dea8000851e455908705ae
6
+ metadata.gz: 3b2fe97ff3c4a8723b6bfa634f3bf00f8096375a9066c217359e6cae0e5ba657560093ebf647a4c042f82b482030a842c0896a94ceb71fd943143bd0f9fa03b3
7
+ data.tar.gz: 936b43e66159e54246a57f59570e5dca550d4afbf87f7fae4bf1be8166af8fe0b1cab8d000409eb68293d4b5046ef40791574567010347e00ed20c359b9f5593
data/Gemfile.lock CHANGED
@@ -1,7 +1,7 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- title_grabber (0.3.1)
4
+ title_grabber (0.3.2)
5
5
  http (~> 4.1)
6
6
  oga (~> 2.15)
7
7
 
data/README.md CHANGED
@@ -33,6 +33,22 @@ Data is either recorded to out.csv in the CWD or the file specified using the
33
33
  title-grabber -o ~/output.csv /abs/path/2/file1.txt rel/path/2/file2.txt
34
34
  ```
35
35
 
36
+ ### Environment Variables
37
+
38
+ ```
39
+ DEBUG - when set, logs to STDOUT instead of the default target, title_grabber.log
40
+ ```
41
+
42
+ ```
43
+ MAX_THREADS - max. # of threads to use; defaults to the # of CPU cores in the machine
44
+ ```
45
+
46
+ ```
47
+ CONNECT_TIMEOUT (in seconds) - defaults to 15
48
+ READ_TIMEOUT (in seconds) - defaults to 15
49
+ WRITE_TIMEOUT (in seconds) - defaults to 15
50
+ ```
51
+
36
52
  ## Development
37
53
 
38
54
  After checking out the repo, run `bin/setup` to install dependencies. Then, run `rake test` to run the tests. You can also run `bin/console` for an interactive prompt that will allow you to experiment.
data/exe/title-grabber CHANGED
@@ -6,15 +6,38 @@ require "pathname"
6
6
  require_relative '../lib/title_grabber'
7
7
 
8
8
  script_path = Pathname(__FILE__)
9
- def_out_path = Pathname('out.csv')
10
9
 
11
- arguments = { output: def_out_path }
10
+ arguments = {}
12
11
  OptionParser.new do |args|
13
12
  args.banner = "Usage: #{script_path.basename} [options]"
14
13
 
15
- args.on("-o", "--output FILE", "Output file (defaults to #{def_out_path.basename})") do |out|
14
+ args.on("-o", "--output FILE", "Output file. Defaults to #{TitleGrabber::DEF_OUT_PATH.basename}") do |out|
16
15
  arguments[:output] = Pathname(out)
17
16
  end
17
+
18
+ args.on("--connect-timeout TIMEOUT", Integer, "HTTP Connect timeout. Defaults to the value of the CONNECT_TIMEOUT env var or #{TitleGrabber::CONNECT_TO}") do |timeout|
19
+ arguments[:connect_to] = timeout
20
+ end
21
+
22
+ args.on("--read-timeout TIMEOUT", Integer, "HTTP Read timeout. Defaults to the value of the READ_TIMEOUT env var or #{TitleGrabber::READ_TO}") do |timeout|
23
+ arguments[:read_to] = timeout
24
+ end
25
+
26
+ args.on("--write-timeout TIMEOUT", Integer, "HTTP Write timeout. Defaults to the value of the WRITE_TIMEOUT env var or #{TitleGrabber::WRITE_TO}") do |timeout|
27
+ arguments[:write_to] = timeout
28
+ end
29
+
30
+ args.on("-r", "--max-retries RETRIES", Integer, "Max. # of times to retry failed HTTP reqs. Defaults to the value of the MAX_RETRIES env var or #{TitleGrabber::MAX_RETRIES}") do |retries|
31
+ arguments[:max_retries] = retries
32
+ end
33
+
34
+ args.on("-t", "--max-threads THREADS", Integer, "Max. # of threads to use. Defaults to the value of the MAX_THREADS env var or the # of logical processors in the system") do |threads|
35
+ arguments[:max_threads] = threads
36
+ end
37
+
38
+ args.on("-d", "--debug", "Log to STDOUT instead of to a file in the CWD") do |debug|
39
+ arguments[:debug] = true
40
+ end
18
41
  end.parse!
19
42
 
20
43
  if ARGV.empty?
data/lib/http_helper.rb CHANGED
@@ -3,19 +3,14 @@ require "http"
3
3
  require_relative "text_helper"
4
4
 
5
5
  module HTTPHelper
6
- WRITE_TO = Integer(ENV.fetch("WRITE_TIMEOUT", 15))
7
- CONN_TO = Integer(ENV.fetch("CONNECT_TIMEOUT", 15))
8
- READ_TO = Integer(ENV.fetch("READ_TIMEOUT", 15))
9
6
  MAX_HOPS = 5
10
- MAX_RETRIES = 3
11
7
  INVALID_BYTE_SEQ = "invalid byte sequence".freeze
12
8
  CONNECTION_ERRORS = ["SSL_connect", "Connection reset"].freeze
13
9
  REST_INTERVAL = 0.5..1
14
10
 
15
11
  include TextHelper
16
12
 
17
- def open_w_timeout(url, write_to: WRITE_TO, connect_to: CONN_TO,
18
- read_to: READ_TO)
13
+ def open_w_timeout(url, write_to:, connect_to:, read_to:, max_retries:)
19
14
  logger.info "[Thread: ##{Thread.current[:id]}] GET #{url}"
20
15
  retries = 0
21
16
 
@@ -33,7 +28,7 @@ module HTTPHelper
33
28
  CONNECTION_ERRORS.any? { |e| msg.start_with?(e) }
34
29
  retries += 1
35
30
 
36
- if retries <= MAX_RETRIES
31
+ if retries <= max_retries
37
32
  rest_time = rand(REST_INTERVAL)
38
33
  logger.warn "[Thread: ##{Thread.current[:id]}] URL: #{url} [#{msg}] - Going to sleep for #{rest_time.round(1)} secs - Retry ##{retries}"
39
34
  sleep(rest_time)
@@ -1,3 +1,3 @@
1
1
  module TitleGrabber
2
- VERSION = "0.3.1"
2
+ VERSION = "0.3.2"
3
3
  end
data/lib/title_grabber.rb CHANGED
@@ -1,5 +1,5 @@
1
- require "csv"
2
1
  require "etc"
2
+ require "csv"
3
3
  require "fileutils"
4
4
  require "logger"
5
5
  require "pathname"
@@ -12,6 +12,12 @@ require_relative "http_helper"
12
12
  require_relative "text_helper"
13
13
 
14
14
  module TitleGrabber
15
+ DEF_OUT_PATH = Pathname('out.csv')
16
+ CONNECT_TO = 15
17
+ READ_TO = 15
18
+ WRITE_TO = 15
19
+ MAX_RETRIES = 5
20
+ MAX_THREADS = Etc.nprocessors
15
21
  URL_RE = %r(https?://\S+)i
16
22
  URL_HEADER = -"url"
17
23
  PAGE_TIT_HEAD = -"page_title"
@@ -26,14 +32,22 @@ module TitleGrabber
26
32
  include HTTPHelper
27
33
  include TextHelper
28
34
 
29
- attr_reader :lines, :out_path, :tmp_path, :logger
35
+ attr_reader :lines, :out_path, :tmp_path, :connect_to, :read_to, :write_to,
36
+ :max_retries, :max_threads, :logger
30
37
 
31
38
  def initialize(lines, options)
32
39
  @lines = lines
33
- @out_path = options[:output]
40
+
41
+ @out_path = options.fetch(:output, DEF_OUT_PATH)
34
42
  @tmp_path = @out_path.sub_ext(".tmp#{@out_path.extname}")
35
43
 
36
- logging_target = if ENV["DEBUG"]
44
+ @connect_to = options.fetch(:connect_to, CONNECT_TO)
45
+ @read_to = options.fetch(:read_to, READ_TO)
46
+ @write_to = options.fetch(:write_to, WRITE_TO)
47
+ @max_retries = options.fetch(:max_retries, MAX_RETRIES)
48
+ @max_threads = options.fetch(:max_threads, MAX_THREADS) # key must match the :max_threads entry set by the -t/--max-threads CLI option
49
+
50
+ logging_target = if options[:debug]
37
51
  STDOUT
38
52
  else
39
53
  log_file = Pathname(__FILE__).sub_ext(".log").
@@ -64,8 +78,7 @@ module TitleGrabber
64
78
  end
65
79
  lines = nil
66
80
 
67
- thr_cnt = [Integer(ENV.fetch("MAX_THREADS", Etc.nprocessors)),
68
- queue.size].min
81
+ thr_cnt = [max_threads, queue.size].min
69
82
  threads = 1.upto(thr_cnt).map.with_index { |_, i|
70
83
  Thread.new(i) do |j|
71
84
  Thread.current[:id] = i + 1
@@ -75,7 +88,7 @@ module TitleGrabber
75
88
  rescue ThreadError; end
76
89
 
77
90
  while url
78
- if (html = open_w_timeout(url)) && !html.empty?
91
+ if (html = open_w_timeout(url, **http_opts)) && !html.empty?
79
92
  doc = begin
80
93
  Oga.parse_html(html)
81
94
  rescue ArgumentError, LL::ParserError => err
@@ -134,5 +147,10 @@ module TitleGrabber
134
147
  urls
135
148
  end
136
149
  end
150
+
151
+ def http_opts
152
+ @http_opts ||= { connect_to: connect_to, read_to: read_to,
153
+ write_to: write_to, max_retries: max_retries }
154
+ end
137
155
  end
138
156
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: title_grabber
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.3.1
4
+ version: 0.3.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - Cristian Rasch