title_grabber 0.2.4 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 7532ff11d949765bb08b4ae1deeefc1aea1c70e8b794139cf747cf10c948e433
4
- data.tar.gz: f59192b55720a85dab8767b03346fc335e825d5696d20f7ac46e0880a081b96d
3
+ metadata.gz: 6a541f189b34294558e51d72b88dc447e55e7b1d2fb802463ccd9bf27c2b4e19
4
+ data.tar.gz: a1ab49973e029e4bdcbcae37fefbafe6ae2e5042dc653aa3b3e7dbff42c2d804
5
5
  SHA512:
6
- metadata.gz: 1102126705a068b7e7e92721c93cc283bf9b4bef28a49c7799aeb3f03825548751ebb108861e73952ba79de57620d4e9390a6a95df32b9a04cc2e7a75df704c5
7
- data.tar.gz: 24f10e575e1f5b706877e952e7fefdcba17a589a3a0f0282c697d6b1882e79f40066bac01509e5d3ed43d50f2102e68ff3f2baae5c5bd9c3eecd295e86f5af03
6
+ metadata.gz: f6c4ecdb7a780af13b926146a76d337fd433def78e4326ee7e674fb8617a3902d5942ef61c3a14c3d7a3522920daf7a66c2fdc2e85a402c81f076a2589692eab
7
+ data.tar.gz: 2d921af22310ede97c1035c858abb1c87da7125df09587124b1a6d36a1aab6fa0cc745d560a83cce74b9aca2a63ae96244c74a05429cd124ec6342fdb64598ac
data/Gemfile.lock CHANGED
@@ -1,7 +1,7 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- title_grabber (0.2.4)
4
+ title_grabber (0.3.0)
5
5
  http (~> 4.1)
6
6
  oga (~> 2.15)
7
7
 
data/lib/http_helper.rb CHANGED
@@ -5,19 +5,20 @@ require "http"
5
5
  require_relative "text_helper"
6
6
 
7
7
  module HTTPHelper
8
- WRITE_TO = Integer(ENV.fetch("WRITE_TIMEOUT", 5))
9
- CONN_TO = Integer(ENV.fetch("CONNECT_TIMEOUT", 10))
8
+ WRITE_TO = Integer(ENV.fetch("WRITE_TIMEOUT", 15))
9
+ CONN_TO = Integer(ENV.fetch("CONNECT_TIMEOUT", 15))
10
10
  READ_TO = Integer(ENV.fetch("READ_TIMEOUT", 15))
11
11
  MAX_HOPS = 5
12
12
  MAX_RETRIES = 3
13
13
  INVALID_BYTE_SEQ = "invalid byte sequence".freeze
14
14
  CONNECTION_ERRORS = ["SSL_connect", "Connection reset"].freeze
15
+ REST_INTERVAL = 0.5..1
15
16
 
16
17
  include TextHelper
17
18
 
18
19
  def open_w_timeout(url, write_to: WRITE_TO, connect_to: CONN_TO,
19
20
  read_to: READ_TO)
20
- logger.info "GET #{url}"
21
+ logger.info "[Thread: ##{Thread.current[:id]}] GET #{url}"
21
22
  retries = 0
22
23
 
23
24
  begin
@@ -35,27 +36,29 @@ module HTTPHelper
35
36
  retries += 1
36
37
 
37
38
  if retries <= MAX_RETRIES
38
- logger.warn "URL: #{url} [#{msg}] - Retry ##{retries}"
39
+ rest_time = rand(REST_INTERVAL)
40
+ logger.warn "[Thread: ##{Thread.current[:id]}] URL: #{url} [#{msg}] - Going to sleep for #{rest_time} secs - Retry ##{retries}"
41
+ sleep(rest_time)
39
42
  retry
40
43
  else
41
- logger.error "URL: #{url} [#{msg}]"
44
+ logger.error "[Thread: ##{Thread.current[:id]}] URL: #{url} [#{msg}]"
42
45
  nil
43
46
  end
44
47
  end
45
48
  rescue => err
46
- logger.error "URL: #{url} [#{err.message}]"
49
+ logger.error "[Thread: ##{Thread.current[:id]}] URL: #{url} [#{err.message}]"
47
50
  nil
48
51
  else
49
52
  utf8_encode(body)
50
53
  end
51
54
  end
52
55
 
53
- private
54
-
55
56
  def logger
56
57
  @logger ||= Logger.new(STDOUT)
57
58
  end
58
59
 
60
+ private
61
+
59
62
  def ssl_ctx
60
63
  @ssl_ctx ||= begin
61
64
  ctx = OpenSSL::SSL::SSLContext.new
data/lib/text_helper.rb CHANGED
@@ -1,4 +1,6 @@
1
1
  module TextHelper
2
+ SINGLE_SPACE = -" "
3
+
2
4
  def utf8_encode(text = nil)
3
5
  begin
4
6
  String(text).encode(-"UTF-8", invalid: :replace, undef: :replace,
@@ -11,8 +13,9 @@ module TextHelper
11
13
  # document.querySelector('title').textContent.trim().replace(/\n/g, ' ').replace(/\s{2,}/g, ' ')
12
14
  def clean_up_whitespace(text)
13
15
  text.strip!
14
- text.gsub!("\n", " ")
15
- text.gsub(/\s{2,}/, ' ')
16
+ text.gsub!("\n", SINGLE_SPACE)
17
+ text.gsub(/\s{2,}/, SINGLE_SPACE)
18
+ text.delete!(-"\u0000") # get rid of nasty null bytes
16
19
  text
17
20
  end
18
21
  end
@@ -1,3 +1,3 @@
1
1
  module TitleGrabber
2
- VERSION = "0.2.4"
2
+ VERSION = "0.3.0"
3
3
  end
data/lib/title_grabber.rb CHANGED
@@ -51,23 +51,34 @@ module TitleGrabber
51
51
  end
52
52
  lines = nil
53
53
 
54
- thr_cnt = [Etc.nprocessors * 2, queue.size].min
55
- threads = 1.upto(thr_cnt).map {
56
- Thread.new do
54
+ thr_cnt = [Integer(ENV.fetch("MAX_THREADS", Etc.nprocessors)),
55
+ queue.size].min
56
+ threads = 1.upto(thr_cnt).map.with_index { |_, i|
57
+ Thread.new(i) do |j|
58
+ Thread.current[:id] = i + 1
59
+
57
60
  url = begin
58
61
  queue.pop(true)
59
62
  rescue ThreadError; end
60
63
 
61
64
  while url
62
65
  if html = open_w_timeout(url)
63
- doc = Oga.parse_html(html)
64
- page_title = doc.at_css('title')&.text || -""
65
- clean_up_whitespace(page_title) unless page_title.empty?
66
- article_title = doc.at_css('article h1')&.text
67
- article_title ||= doc.at_css('h1')&.text || -""
68
- clean_up_whitespace(article_title) unless article_title.empty?
69
-
70
- csv << [url, page_title, article_title]
66
+ doc = begin
67
+ Oga.parse_html(html)
68
+ rescue LL::ParserError => err
69
+ logger.error "[Thread: ##{Thread.current[:id]}] Unable to parse HTML from URL '#{url}' - #{err.message}"
70
+ nil
71
+ end
72
+
73
+ if doc
74
+ page_title = doc.at_css('title')&.text || -""
75
+ clean_up_whitespace(page_title) unless page_title.empty?
76
+ article_title = doc.at_css('article h1')&.text
77
+ article_title ||= doc.at_css('h1')&.text || -""
78
+ clean_up_whitespace(article_title) unless article_title.empty?
79
+
80
+ csv << [url, page_title, article_title]
81
+ end
71
82
  end
72
83
 
73
84
  url = begin
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: title_grabber
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.4
4
+ version: 0.3.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Cristian Rasch
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2019-04-03 00:00:00.000000000 Z
11
+ date: 2019-04-06 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: http