title_grabber 0.2.4 → 0.3.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/Gemfile.lock +1 -1
- data/lib/http_helper.rb +11 -8
- data/lib/text_helper.rb +5 -2
- data/lib/title_grabber/version.rb +1 -1
- data/lib/title_grabber.rb +22 -11
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 6a541f189b34294558e51d72b88dc447e55e7b1d2fb802463ccd9bf27c2b4e19
|
4
|
+
data.tar.gz: a1ab49973e029e4bdcbcae37fefbafe6ae2e5042dc653aa3b3e7dbff42c2d804
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: f6c4ecdb7a780af13b926146a76d337fd433def78e4326ee7e674fb8617a3902d5942ef61c3a14c3d7a3522920daf7a66c2fdc2e85a402c81f076a2589692eab
|
7
|
+
data.tar.gz: 2d921af22310ede97c1035c858abb1c87da7125df09587124b1a6d36a1aab6fa0cc745d560a83cce74b9aca2a63ae96244c74a05429cd124ec6342fdb64598ac
|
data/Gemfile.lock
CHANGED
data/lib/http_helper.rb
CHANGED
@@ -5,19 +5,20 @@ require "http"
|
|
5
5
|
require_relative "text_helper"
|
6
6
|
|
7
7
|
module HTTPHelper
|
8
|
-
WRITE_TO = Integer(ENV.fetch("WRITE_TIMEOUT",
|
9
|
-
CONN_TO = Integer(ENV.fetch("CONNECT_TIMEOUT",
|
8
|
+
WRITE_TO = Integer(ENV.fetch("WRITE_TIMEOUT", 15))
|
9
|
+
CONN_TO = Integer(ENV.fetch("CONNECT_TIMEOUT", 15))
|
10
10
|
READ_TO = Integer(ENV.fetch("READ_TIMEOUT", 15))
|
11
11
|
MAX_HOPS = 5
|
12
12
|
MAX_RETRIES = 3
|
13
13
|
INVALID_BYTE_SEQ = "invalid byte sequence".freeze
|
14
14
|
CONNECTION_ERRORS = ["SSL_connect", "Connection reset"].freeze
|
15
|
+
REST_INTERVAL = 0.5..1
|
15
16
|
|
16
17
|
include TextHelper
|
17
18
|
|
18
19
|
def open_w_timeout(url, write_to: WRITE_TO, connect_to: CONN_TO,
|
19
20
|
read_to: READ_TO)
|
20
|
-
logger.info "GET #{url}"
|
21
|
+
logger.info "[Thread: ##{Thread.current[:id]}] GET #{url}"
|
21
22
|
retries = 0
|
22
23
|
|
23
24
|
begin
|
@@ -35,27 +36,29 @@ module HTTPHelper
|
|
35
36
|
retries += 1
|
36
37
|
|
37
38
|
if retries <= MAX_RETRIES
|
38
|
-
|
39
|
+
rest_time = rand(REST_INTERVAL)
|
40
|
+
logger.warn "[Thread: ##{Thread.current[:id]}] URL: #{url} [#{msg}] - Going to sleep for #{rest_time} secs - Retry ##{retries}"
|
41
|
+
sleep(rest_time)
|
39
42
|
retry
|
40
43
|
else
|
41
|
-
logger.error "URL: #{url} [#{msg}]"
|
44
|
+
logger.error "[Thread: ##{Thread.current[:id]}] URL: #{url} [#{msg}]"
|
42
45
|
nil
|
43
46
|
end
|
44
47
|
end
|
45
48
|
rescue => err
|
46
|
-
logger.error "URL: #{url} [#{err.message}]"
|
49
|
+
logger.error "[Thread: ##{Thread.current[:id]}] URL: #{url} [#{err.message}]"
|
47
50
|
nil
|
48
51
|
else
|
49
52
|
utf8_encode(body)
|
50
53
|
end
|
51
54
|
end
|
52
55
|
|
53
|
-
private
|
54
|
-
|
55
56
|
def logger
|
56
57
|
@logger ||= Logger.new(STDOUT)
|
57
58
|
end
|
58
59
|
|
60
|
+
private
|
61
|
+
|
59
62
|
def ssl_ctx
|
60
63
|
@ssl_ctx ||= begin
|
61
64
|
ctx = OpenSSL::SSL::SSLContext.new
|
data/lib/text_helper.rb
CHANGED
@@ -1,4 +1,6 @@
|
|
1
1
|
module TextHelper
|
2
|
+
SINGLE_SPACE = -" "
|
3
|
+
|
2
4
|
def utf8_encode(text = nil)
|
3
5
|
begin
|
4
6
|
String(text).encode(-"UTF-8", invalid: :replace, undef: :replace,
|
@@ -11,8 +13,9 @@ module TextHelper
|
|
11
13
|
# document.querySelector('title').textContent.trim().replace(/\n/g, ' ').replace(/\s{2,}/g, ' ')
|
12
14
|
def clean_up_whitespace(text)
|
13
15
|
text.strip!
|
14
|
-
text.gsub!("\n",
|
15
|
-
text.gsub(/\s{2,}/,
|
16
|
+
text.gsub!("\n", SINGLE_SPACE)
|
17
|
+
text.gsub(/\s{2,}/, SINGLE_SPACE)
|
18
|
+
text.delete!(-"\u0000") # get rid of nasty null bytes
|
16
19
|
text
|
17
20
|
end
|
18
21
|
end
|
data/lib/title_grabber.rb
CHANGED
@@ -51,23 +51,34 @@ module TitleGrabber
|
|
51
51
|
end
|
52
52
|
lines = nil
|
53
53
|
|
54
|
-
thr_cnt = [Etc.nprocessors
|
55
|
-
|
56
|
-
|
54
|
+
thr_cnt = [Integer(ENV.fetch("MAX_THREADS", Etc.nprocessors)),
|
55
|
+
queue.size].min
|
56
|
+
threads = 1.upto(thr_cnt).map.with_index { |_, i|
|
57
|
+
Thread.new(i) do |j|
|
58
|
+
Thread.current[:id] = i + 1
|
59
|
+
|
57
60
|
url = begin
|
58
61
|
queue.pop(true)
|
59
62
|
rescue ThreadError; end
|
60
63
|
|
61
64
|
while url
|
62
65
|
if html = open_w_timeout(url)
|
63
|
-
doc =
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
66
|
+
doc = begin
|
67
|
+
Oga.parse_html(html)
|
68
|
+
rescue LL::ParserError => err
|
69
|
+
logger.error "[Thread: ##{Thread.current[:id]}] Unable to parse HTML from URL '#{url}' - #{err.message}"
|
70
|
+
nil
|
71
|
+
end
|
72
|
+
|
73
|
+
if doc
|
74
|
+
page_title = doc.at_css('title')&.text || -""
|
75
|
+
clean_up_whitespace(page_title) unless page_title.empty?
|
76
|
+
article_title = doc.at_css('article h1')&.text
|
77
|
+
article_title ||= doc.at_css('h1')&.text || -""
|
78
|
+
clean_up_whitespace(article_title) unless article_title.empty?
|
79
|
+
|
80
|
+
csv << [url, page_title, article_title]
|
81
|
+
end
|
71
82
|
end
|
72
83
|
|
73
84
|
url = begin
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: title_grabber
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.4
|
4
|
+
version: 0.3.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Cristian Rasch
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2019-04-
|
11
|
+
date: 2019-04-06 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: http
|