title_grabber 0.3.3 → 0.3.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +1 -1
- data/lib/http_helper.rb +4 -4
- data/lib/title_grabber.rb +8 -14
- data/lib/title_grabber/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 8a8b57f801507ef8c856ebd741b247c30e08472b5df3be45d30e6fd7fa3c0092
|
4
|
+
data.tar.gz: 75ff595060fe8dd71a669ebadcf38f040d5c47492a5c6ea770e893832493ddff
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 661dd87e4e60dcfd4a66168799f3b07babf5c21a890b2e61afb99b527b1e15cb7ebd875a60f39bd12cc2f06ef286b603efb89c4908f461606fa3c60a09ad5db6
|
7
|
+
data.tar.gz: b6b1152856c43702082a788a1a056ccc2cef93101faada8890eda27b949d424cf3b9a83991fd91e07733f2c860003b26644510ae9635ea5c6336a1990f997445
|
data/Gemfile.lock
CHANGED
data/lib/http_helper.rb
CHANGED
@@ -11,7 +11,7 @@ module HTTPHelper
|
|
11
11
|
include TextHelper
|
12
12
|
|
13
13
|
def open_w_timeout(url, write_to:, connect_to:, read_to:, max_retries:)
|
14
|
-
logger.info "[
|
14
|
+
logger.info "[#{Thread.current.name}] GET #{url}"
|
15
15
|
retries = 0
|
16
16
|
|
17
17
|
begin
|
@@ -30,16 +30,16 @@ module HTTPHelper
|
|
30
30
|
|
31
31
|
if retries <= max_retries
|
32
32
|
rest_time = rand(REST_INTERVAL)
|
33
|
-
logger.warn "[
|
33
|
+
logger.warn "[#{Thread.current.name}] URL: #{url} [#{msg}] - Going to sleep for #{rest_time.round(1)} secs - Retry ##{retries}"
|
34
34
|
sleep(rest_time)
|
35
35
|
retry
|
36
36
|
else
|
37
|
-
logger.error "[
|
37
|
+
logger.error "[#{Thread.current.name}] URL: #{url} [#{msg}]"
|
38
38
|
nil
|
39
39
|
end
|
40
40
|
end
|
41
41
|
rescue => err
|
42
|
-
logger.error "[
|
42
|
+
logger.error "[#{Thread.current.name}] URL: #{url} [#{err.message}]"
|
43
43
|
nil
|
44
44
|
else
|
45
45
|
utf8_encode(body)
|
data/lib/title_grabber.rb
CHANGED
@@ -58,8 +58,6 @@ module TitleGrabber
|
|
58
58
|
end
|
59
59
|
|
60
60
|
def call
|
61
|
-
install_at_exit_handler
|
62
|
-
|
63
61
|
queue = Queue.new
|
64
62
|
CSV.open(tmp_path, "w", force_quotes: true) do |csv|
|
65
63
|
csv << HEADERS
|
@@ -81,7 +79,7 @@ module TitleGrabber
|
|
81
79
|
thr_cnt = [max_threads, queue.size].min
|
82
80
|
threads = 1.upto(thr_cnt).map.with_index { |_, i|
|
83
81
|
Thread.new(i) do |j|
|
84
|
-
Thread.current
|
82
|
+
Thread.current.name = "Thread ##{i + 1}"
|
85
83
|
|
86
84
|
url = begin
|
87
85
|
queue.pop(true)
|
@@ -92,7 +90,7 @@ module TitleGrabber
|
|
92
90
|
doc = begin
|
93
91
|
Oga.parse_html(html)
|
94
92
|
rescue ArgumentError, LL::ParserError => err
|
95
|
-
logger.error "[
|
93
|
+
logger.error "[#{Thread.current.name}] Unable to parse HTML from URL '#{url}' - #{err.message}"
|
96
94
|
nil
|
97
95
|
end
|
98
96
|
|
@@ -114,20 +112,16 @@ module TitleGrabber
|
|
114
112
|
end
|
115
113
|
}.each(&:join)
|
116
114
|
end
|
115
|
+
ensure
|
116
|
+
if tmp_path.size?
|
117
|
+
FileUtils.mv(tmp_path, out_path)
|
118
|
+
else
|
119
|
+
tmp_path.unlink if tmp_path.exist?
|
120
|
+
end
|
117
121
|
end
|
118
122
|
|
119
123
|
private
|
120
124
|
|
121
|
-
def install_at_exit_handler
|
122
|
-
at_exit do
|
123
|
-
if tmp_path.size?
|
124
|
-
FileUtils.mv(tmp_path, out_path)
|
125
|
-
else
|
126
|
-
tmp_path.unlink if tmp_path.exist?
|
127
|
-
end
|
128
|
-
end
|
129
|
-
end
|
130
|
-
|
131
125
|
def processed_urls
|
132
126
|
@processed_urls ||= begin
|
133
127
|
urls = {}
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: title_grabber
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.3.3
|
4
|
+
version: 0.3.4
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Cristian Rasch
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2019-04-
|
11
|
+
date: 2019-04-07 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: http
|