title_grabber 0.3.3 → 0.3.4
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/Gemfile.lock +1 -1
- data/lib/http_helper.rb +4 -4
- data/lib/title_grabber.rb +8 -14
- data/lib/title_grabber/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 8a8b57f801507ef8c856ebd741b247c30e08472b5df3be45d30e6fd7fa3c0092
|
4
|
+
data.tar.gz: 75ff595060fe8dd71a669ebadcf38f040d5c47492a5c6ea770e893832493ddff
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 661dd87e4e60dcfd4a66168799f3b07babf5c21a890b2e61afb99b527b1e15cb7ebd875a60f39bd12cc2f06ef286b603efb89c4908f461606fa3c60a09ad5db6
|
7
|
+
data.tar.gz: b6b1152856c43702082a788a1a056ccc2cef93101faada8890eda27b949d424cf3b9a83991fd91e07733f2c860003b26644510ae9635ea5c6336a1990f997445
|
data/Gemfile.lock
CHANGED
data/lib/http_helper.rb
CHANGED
@@ -11,7 +11,7 @@ module HTTPHelper
|
|
11
11
|
include TextHelper
|
12
12
|
|
13
13
|
def open_w_timeout(url, write_to:, connect_to:, read_to:, max_retries:)
|
14
|
-
logger.info "[
|
14
|
+
logger.info "[#{Thread.current.name}] GET #{url}"
|
15
15
|
retries = 0
|
16
16
|
|
17
17
|
begin
|
@@ -30,16 +30,16 @@ module HTTPHelper
|
|
30
30
|
|
31
31
|
if retries <= max_retries
|
32
32
|
rest_time = rand(REST_INTERVAL)
|
33
|
-
logger.warn "[
|
33
|
+
logger.warn "[#{Thread.current.name}] URL: #{url} [#{msg}] - Going to sleep for #{rest_time.round(1)} secs - Retry ##{retries}"
|
34
34
|
sleep(rest_time)
|
35
35
|
retry
|
36
36
|
else
|
37
|
-
logger.error "[
|
37
|
+
logger.error "[#{Thread.current.name}] URL: #{url} [#{msg}]"
|
38
38
|
nil
|
39
39
|
end
|
40
40
|
end
|
41
41
|
rescue => err
|
42
|
-
logger.error "[
|
42
|
+
logger.error "[#{Thread.current.name}] URL: #{url} [#{err.message}]"
|
43
43
|
nil
|
44
44
|
else
|
45
45
|
utf8_encode(body)
|
data/lib/title_grabber.rb
CHANGED
@@ -58,8 +58,6 @@ module TitleGrabber
|
|
58
58
|
end
|
59
59
|
|
60
60
|
def call
|
61
|
-
install_at_exit_handler
|
62
|
-
|
63
61
|
queue = Queue.new
|
64
62
|
CSV.open(tmp_path, "w", force_quotes: true) do |csv|
|
65
63
|
csv << HEADERS
|
@@ -81,7 +79,7 @@ module TitleGrabber
|
|
81
79
|
thr_cnt = [max_threads, queue.size].min
|
82
80
|
threads = 1.upto(thr_cnt).map.with_index { |_, i|
|
83
81
|
Thread.new(i) do |j|
|
84
|
-
Thread.current
|
82
|
+
Thread.current.name = "Thread ##{i + 1}"
|
85
83
|
|
86
84
|
url = begin
|
87
85
|
queue.pop(true)
|
@@ -92,7 +90,7 @@ module TitleGrabber
|
|
92
90
|
doc = begin
|
93
91
|
Oga.parse_html(html)
|
94
92
|
rescue ArgumentError, LL::ParserError => err
|
95
|
-
logger.error "[
|
93
|
+
logger.error "[#{Thread.current.name}] Unable to parse HTML from URL '#{url}' - #{err.message}"
|
96
94
|
nil
|
97
95
|
end
|
98
96
|
|
@@ -114,20 +112,16 @@ module TitleGrabber
|
|
114
112
|
end
|
115
113
|
}.each(&:join)
|
116
114
|
end
|
115
|
+
ensure
|
116
|
+
if tmp_path.size?
|
117
|
+
FileUtils.mv(tmp_path, out_path)
|
118
|
+
else
|
119
|
+
tmp_path.unlink if tmp_path.exist?
|
120
|
+
end
|
117
121
|
end
|
118
122
|
|
119
123
|
private
|
120
124
|
|
121
|
-
def install_at_exit_handler
|
122
|
-
at_exit do
|
123
|
-
if tmp_path.size?
|
124
|
-
FileUtils.mv(tmp_path, out_path)
|
125
|
-
else
|
126
|
-
tmp_path.unlink if tmp_path.exist?
|
127
|
-
end
|
128
|
-
end
|
129
|
-
end
|
130
|
-
|
131
125
|
def processed_urls
|
132
126
|
@processed_urls ||= begin
|
133
127
|
urls = {}
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: title_grabber
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.3.3
|
4
|
+
version: 0.3.4
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Cristian Rasch
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2019-04-
|
11
|
+
date: 2019-04-07 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: http
|