title_grabber 0.3.0 → 0.3.1
This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the contents of those package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +1 -1
- data/lib/http_helper.rb +1 -3
- data/lib/text_helper.rb +6 -2
- data/lib/title_grabber/version.rb +1 -1
- data/lib/title_grabber.rb +27 -6
- metadata +1 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: f341d404fbe5f16ef1eb904c75fbdd9331a76de7cb08c969d59526dbae9d95d0
|
4
|
+
data.tar.gz: 586a09fdca43a48db702b69e4e6e5d123e123d5aa6631a8df8c7b9eb46dd8820
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 14ceff1f56aea4cd1b9185b46e99af361ae243f809d50571933a96de2a757aa0589b4ecda34dfdeb9ff647c42412fed95acd1a1ee03fb875ed05229f53a71505
|
7
|
+
data.tar.gz: fc2924e719dab7d72b8adf119fe11d29909433a84732a19b878a2faf437dfb5e4e595c696e63d525864133499816bb9722c76af375dea8000851e455908705ae
|
data/Gemfile.lock
CHANGED
data/lib/http_helper.rb
CHANGED
@@ -1,5 +1,3 @@
|
|
1
|
-
require "logger"
|
2
|
-
|
3
1
|
require "http"
|
4
2
|
|
5
3
|
require_relative "text_helper"
|
@@ -37,7 +35,7 @@ module HTTPHelper
|
|
37
35
|
|
38
36
|
if retries <= MAX_RETRIES
|
39
37
|
rest_time = rand(REST_INTERVAL)
|
40
|
-
logger.warn "[Thread: ##{Thread.current[:id]}] URL: #{url} [#{msg}] - Going to sleep for #{rest_time} secs - Retry ##{retries}"
|
38
|
+
logger.warn "[Thread: ##{Thread.current[:id]}] URL: #{url} [#{msg}] - Going to sleep for #{rest_time.round(1)} secs - Retry ##{retries}"
|
41
39
|
sleep(rest_time)
|
42
40
|
retry
|
43
41
|
else
|
data/lib/text_helper.rb
CHANGED
@@ -2,11 +2,16 @@ module TextHelper
|
|
2
2
|
SINGLE_SPACE = -" "
|
3
3
|
|
4
4
|
def utf8_encode(text = nil)
|
5
|
+
txt = String(text)
|
6
|
+
|
5
7
|
begin
|
6
|
-
|
8
|
+
txt.encode!(-"UTF-8", invalid: :replace, undef: :replace,
|
7
9
|
replace: -"")
|
8
10
|
rescue EncodingError
|
9
11
|
-""
|
12
|
+
else
|
13
|
+
txt.delete!(-"\u0000") # get rid of nasty null bytes
|
14
|
+
txt
|
10
15
|
end
|
11
16
|
end
|
12
17
|
|
@@ -15,7 +20,6 @@ module TextHelper
|
|
15
20
|
text.strip!
|
16
21
|
text.gsub!("\n", SINGLE_SPACE)
|
17
22
|
text.gsub(/\s{2,}/, SINGLE_SPACE)
|
18
|
-
text.delete!(-"\u0000") # get rid of nasty null bytes
|
19
23
|
text
|
20
24
|
end
|
21
25
|
end
|
data/lib/title_grabber.rb
CHANGED
@@ -1,6 +1,8 @@
|
|
1
1
|
require "csv"
|
2
2
|
require "etc"
|
3
3
|
require "fileutils"
|
4
|
+
require "logger"
|
5
|
+
require "pathname"
|
4
6
|
|
5
7
|
require "bundler/setup"
|
6
8
|
require "oga"
|
@@ -24,16 +26,27 @@ module TitleGrabber
|
|
24
26
|
include HTTPHelper
|
25
27
|
include TextHelper
|
26
28
|
|
27
|
-
attr_reader :lines, :out_path
|
29
|
+
attr_reader :lines, :out_path, :tmp_path, :logger
|
28
30
|
|
29
31
|
def initialize(lines, options)
|
30
32
|
@lines = lines
|
31
33
|
@out_path = options[:output]
|
34
|
+
@tmp_path = @out_path.sub_ext(".tmp#{@out_path.extname}")
|
35
|
+
|
36
|
+
logging_target = if ENV["DEBUG"]
|
37
|
+
STDOUT
|
38
|
+
else
|
39
|
+
log_file = Pathname(__FILE__).sub_ext(".log").
|
40
|
+
basename.
|
41
|
+
open("w")
|
42
|
+
end
|
43
|
+
@logger = Logger.new(logging_target)
|
32
44
|
end
|
33
45
|
|
34
46
|
def call
|
47
|
+
install_at_exit_handler
|
48
|
+
|
35
49
|
queue = Queue.new
|
36
|
-
tmp_path = out_path.sub_ext(".tmp#{out_path.extname}")
|
37
50
|
CSV.open(tmp_path, "w", force_quotes: true) do |csv|
|
38
51
|
csv << HEADERS
|
39
52
|
|
@@ -62,10 +75,10 @@ module TitleGrabber
|
|
62
75
|
rescue ThreadError; end
|
63
76
|
|
64
77
|
while url
|
65
|
-
if html = open_w_timeout(url)
|
78
|
+
if (html = open_w_timeout(url)) && !html.empty?
|
66
79
|
doc = begin
|
67
80
|
Oga.parse_html(html)
|
68
|
-
rescue LL::ParserError => err
|
81
|
+
rescue ArgumentError, LL::ParserError => err
|
69
82
|
logger.error "[Thread: ##{Thread.current[:id]}] Unable to parse HTML from URL '#{url}' - #{err.message}"
|
70
83
|
nil
|
71
84
|
end
|
@@ -88,12 +101,20 @@ module TitleGrabber
|
|
88
101
|
end
|
89
102
|
}.each(&:join)
|
90
103
|
end
|
91
|
-
|
92
|
-
FileUtils.mv(tmp_path, out_path)
|
93
104
|
end
|
94
105
|
|
95
106
|
private
|
96
107
|
|
108
|
+
def install_at_exit_handler
|
109
|
+
at_exit do
|
110
|
+
if tmp_path.size?
|
111
|
+
FileUtils.mv(tmp_path, out_path)
|
112
|
+
else
|
113
|
+
tmp_path.unlink if tmp_path.exist?
|
114
|
+
end
|
115
|
+
end
|
116
|
+
end
|
117
|
+
|
97
118
|
def processed_urls
|
98
119
|
@processed_urls ||= begin
|
99
120
|
urls = {}
|