title_grabber 0.2.4 → 0.3.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 7532ff11d949765bb08b4ae1deeefc1aea1c70e8b794139cf747cf10c948e433
4
- data.tar.gz: f59192b55720a85dab8767b03346fc335e825d5696d20f7ac46e0880a081b96d
3
+ metadata.gz: 6a541f189b34294558e51d72b88dc447e55e7b1d2fb802463ccd9bf27c2b4e19
4
+ data.tar.gz: a1ab49973e029e4bdcbcae37fefbafe6ae2e5042dc653aa3b3e7dbff42c2d804
5
5
  SHA512:
6
- metadata.gz: 1102126705a068b7e7e92721c93cc283bf9b4bef28a49c7799aeb3f03825548751ebb108861e73952ba79de57620d4e9390a6a95df32b9a04cc2e7a75df704c5
7
- data.tar.gz: 24f10e575e1f5b706877e952e7fefdcba17a589a3a0f0282c697d6b1882e79f40066bac01509e5d3ed43d50f2102e68ff3f2baae5c5bd9c3eecd295e86f5af03
6
+ metadata.gz: f6c4ecdb7a780af13b926146a76d337fd433def78e4326ee7e674fb8617a3902d5942ef61c3a14c3d7a3522920daf7a66c2fdc2e85a402c81f076a2589692eab
7
+ data.tar.gz: 2d921af22310ede97c1035c858abb1c87da7125df09587124b1a6d36a1aab6fa0cc745d560a83cce74b9aca2a63ae96244c74a05429cd124ec6342fdb64598ac
data/Gemfile.lock CHANGED
@@ -1,7 +1,7 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- title_grabber (0.2.4)
4
+ title_grabber (0.3.0)
5
5
  http (~> 4.1)
6
6
  oga (~> 2.15)
7
7
 
data/lib/http_helper.rb CHANGED
@@ -5,19 +5,20 @@ require "http"
5
5
  require_relative "text_helper"
6
6
 
7
7
  module HTTPHelper
8
- WRITE_TO = Integer(ENV.fetch("WRITE_TIMEOUT", 5))
9
- CONN_TO = Integer(ENV.fetch("CONNECT_TIMEOUT", 10))
8
+ WRITE_TO = Integer(ENV.fetch("WRITE_TIMEOUT", 15))
9
+ CONN_TO = Integer(ENV.fetch("CONNECT_TIMEOUT", 15))
10
10
  READ_TO = Integer(ENV.fetch("READ_TIMEOUT", 15))
11
11
  MAX_HOPS = 5
12
12
  MAX_RETRIES = 3
13
13
  INVALID_BYTE_SEQ = "invalid byte sequence".freeze
14
14
  CONNECTION_ERRORS = ["SSL_connect", "Connection reset"].freeze
15
+ REST_INTERVAL = 0.5..1
15
16
 
16
17
  include TextHelper
17
18
 
18
19
  def open_w_timeout(url, write_to: WRITE_TO, connect_to: CONN_TO,
19
20
  read_to: READ_TO)
20
- logger.info "GET #{url}"
21
+ logger.info "[Thread: ##{Thread.current[:id]}] GET #{url}"
21
22
  retries = 0
22
23
 
23
24
  begin
@@ -35,27 +36,29 @@ module HTTPHelper
35
36
  retries += 1
36
37
 
37
38
  if retries <= MAX_RETRIES
38
- logger.warn "URL: #{url} [#{msg}] - Retry ##{retries}"
39
+ rest_time = rand(REST_INTERVAL)
40
+ logger.warn "[Thread: ##{Thread.current[:id]}] URL: #{url} [#{msg}] - Going to sleep for #{rest_time} secs - Retry ##{retries}"
41
+ sleep(rest_time)
39
42
  retry
40
43
  else
41
- logger.error "URL: #{url} [#{msg}]"
44
+ logger.error "[Thread: ##{Thread.current[:id]}] URL: #{url} [#{msg}]"
42
45
  nil
43
46
  end
44
47
  end
45
48
  rescue => err
46
- logger.error "URL: #{url} [#{err.message}]"
49
+ logger.error "[Thread: ##{Thread.current[:id]}] URL: #{url} [#{err.message}]"
47
50
  nil
48
51
  else
49
52
  utf8_encode(body)
50
53
  end
51
54
  end
52
55
 
53
- private
54
-
55
56
  def logger
56
57
  @logger ||= Logger.new(STDOUT)
57
58
  end
58
59
 
60
+ private
61
+
59
62
  def ssl_ctx
60
63
  @ssl_ctx ||= begin
61
64
  ctx = OpenSSL::SSL::SSLContext.new
data/lib/text_helper.rb CHANGED
@@ -1,4 +1,6 @@
1
1
  module TextHelper
2
+ SINGLE_SPACE = -" "
3
+
2
4
  def utf8_encode(text = nil)
3
5
  begin
4
6
  String(text).encode(-"UTF-8", invalid: :replace, undef: :replace,
@@ -11,8 +13,9 @@ module TextHelper
11
13
  # document.querySelector('title').textContent.trim().replace(/\n/g, ' ').replace(/\s{2,}/g, ' ')
12
14
  def clean_up_whitespace(text)
13
15
  text.strip!
14
- text.gsub!("\n", " ")
15
- text.gsub(/\s{2,}/, ' ')
16
+ text.gsub!("\n", SINGLE_SPACE)
17
+ text.gsub(/\s{2,}/, SINGLE_SPACE)
18
+ text.delete!(-"\u0000") # get rid of nasty null bytes
16
19
  text
17
20
  end
18
21
  end
data/lib/title_grabber/version.rb CHANGED
@@ -1,3 +1,3 @@
1
1
  module TitleGrabber
2
- VERSION = "0.2.4"
2
+ VERSION = "0.3.0"
3
3
  end
data/lib/title_grabber.rb CHANGED
@@ -51,23 +51,34 @@ module TitleGrabber
51
51
  end
52
52
  lines = nil
53
53
 
54
- thr_cnt = [Etc.nprocessors * 2, queue.size].min
55
- threads = 1.upto(thr_cnt).map {
56
- Thread.new do
54
+ thr_cnt = [Integer(ENV.fetch("MAX_THREADS", Etc.nprocessors)),
55
+ queue.size].min
56
+ threads = 1.upto(thr_cnt).map.with_index { |_, i|
57
+ Thread.new(i) do |j|
58
+ Thread.current[:id] = i + 1
59
+
57
60
  url = begin
58
61
  queue.pop(true)
59
62
  rescue ThreadError; end
60
63
 
61
64
  while url
62
65
  if html = open_w_timeout(url)
63
- doc = Oga.parse_html(html)
64
- page_title = doc.at_css('title')&.text || -""
65
- clean_up_whitespace(page_title) unless page_title.empty?
66
- article_title = doc.at_css('article h1')&.text
67
- article_title ||= doc.at_css('h1')&.text || -""
68
- clean_up_whitespace(article_title) unless article_title.empty?
69
-
70
- csv << [url, page_title, article_title]
66
+ doc = begin
67
+ Oga.parse_html(html)
68
+ rescue LL::ParserError => err
69
+ logger.error "[Thread: ##{Thread.current[:id]}] Unable to parse HTML from URL '#{url}' - #{err.message}"
70
+ nil
71
+ end
72
+
73
+ if doc
74
+ page_title = doc.at_css('title')&.text || -""
75
+ clean_up_whitespace(page_title) unless page_title.empty?
76
+ article_title = doc.at_css('article h1')&.text
77
+ article_title ||= doc.at_css('h1')&.text || -""
78
+ clean_up_whitespace(article_title) unless article_title.empty?
79
+
80
+ csv << [url, page_title, article_title]
81
+ end
71
82
  end
72
83
 
73
84
  url = begin
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: title_grabber
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.4
4
+ version: 0.3.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Cristian Rasch
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2019-04-03 00:00:00.000000000 Z
11
+ date: 2019-04-06 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: http