title_grabber 0.3.0 → 0.3.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/Gemfile.lock +1 -1
- data/lib/http_helper.rb +1 -3
- data/lib/text_helper.rb +6 -2
- data/lib/title_grabber/version.rb +1 -1
- data/lib/title_grabber.rb +27 -6
- metadata +1 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: f341d404fbe5f16ef1eb904c75fbdd9331a76de7cb08c969d59526dbae9d95d0
|
4
|
+
data.tar.gz: 586a09fdca43a48db702b69e4e6e5d123e123d5aa6631a8df8c7b9eb46dd8820
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 14ceff1f56aea4cd1b9185b46e99af361ae243f809d50571933a96de2a757aa0589b4ecda34dfdeb9ff647c42412fed95acd1a1ee03fb875ed05229f53a71505
|
7
|
+
data.tar.gz: fc2924e719dab7d72b8adf119fe11d29909433a84732a19b878a2faf437dfb5e4e595c696e63d525864133499816bb9722c76af375dea8000851e455908705ae
|
data/Gemfile.lock
CHANGED
data/lib/http_helper.rb
CHANGED
@@ -1,5 +1,3 @@
|
|
1
|
-
require "logger"
|
2
|
-
|
3
1
|
require "http"
|
4
2
|
|
5
3
|
require_relative "text_helper"
|
@@ -37,7 +35,7 @@ module HTTPHelper
|
|
37
35
|
|
38
36
|
if retries <= MAX_RETRIES
|
39
37
|
rest_time = rand(REST_INTERVAL)
|
40
|
-
logger.warn "[Thread: ##{Thread.current[:id]}] URL: #{url} [#{msg}] - Going to sleep for #{rest_time} secs - Retry ##{retries}"
|
38
|
+
logger.warn "[Thread: ##{Thread.current[:id]}] URL: #{url} [#{msg}] - Going to sleep for #{rest_time.round(1)} secs - Retry ##{retries}"
|
41
39
|
sleep(rest_time)
|
42
40
|
retry
|
43
41
|
else
|
data/lib/text_helper.rb
CHANGED
@@ -2,11 +2,16 @@ module TextHelper
|
|
2
2
|
SINGLE_SPACE = -" "
|
3
3
|
|
4
4
|
def utf8_encode(text = nil)
|
5
|
+
txt = String(text)
|
6
|
+
|
5
7
|
begin
|
6
|
-
|
8
|
+
txt.encode!(-"UTF-8", invalid: :replace, undef: :replace,
|
7
9
|
replace: -"")
|
8
10
|
rescue EncodingError
|
9
11
|
-""
|
12
|
+
else
|
13
|
+
txt.delete!(-"\u0000") # get rid of nasty null bytes
|
14
|
+
txt
|
10
15
|
end
|
11
16
|
end
|
12
17
|
|
@@ -15,7 +20,6 @@ module TextHelper
|
|
15
20
|
text.strip!
|
16
21
|
text.gsub!("\n", SINGLE_SPACE)
|
17
22
|
text.gsub(/\s{2,}/, SINGLE_SPACE)
|
18
|
-
text.delete!(-"\u0000") # get rid of nasty null bytes
|
19
23
|
text
|
20
24
|
end
|
21
25
|
end
|
data/lib/title_grabber.rb
CHANGED
@@ -1,6 +1,8 @@
|
|
1
1
|
require "csv"
|
2
2
|
require "etc"
|
3
3
|
require "fileutils"
|
4
|
+
require "logger"
|
5
|
+
require "pathname"
|
4
6
|
|
5
7
|
require "bundler/setup"
|
6
8
|
require "oga"
|
@@ -24,16 +26,27 @@ module TitleGrabber
|
|
24
26
|
include HTTPHelper
|
25
27
|
include TextHelper
|
26
28
|
|
27
|
-
attr_reader :lines, :out_path
|
29
|
+
attr_reader :lines, :out_path, :tmp_path, :logger
|
28
30
|
|
29
31
|
def initialize(lines, options)
|
30
32
|
@lines = lines
|
31
33
|
@out_path = options[:output]
|
34
|
+
@tmp_path = @out_path.sub_ext(".tmp#{@out_path.extname}")
|
35
|
+
|
36
|
+
logging_target = if ENV["DEBUG"]
|
37
|
+
STDOUT
|
38
|
+
else
|
39
|
+
log_file = Pathname(__FILE__).sub_ext(".log").
|
40
|
+
basename.
|
41
|
+
open("w")
|
42
|
+
end
|
43
|
+
@logger = Logger.new(logging_target)
|
32
44
|
end
|
33
45
|
|
34
46
|
def call
|
47
|
+
install_at_exit_handler
|
48
|
+
|
35
49
|
queue = Queue.new
|
36
|
-
tmp_path = out_path.sub_ext(".tmp#{out_path.extname}")
|
37
50
|
CSV.open(tmp_path, "w", force_quotes: true) do |csv|
|
38
51
|
csv << HEADERS
|
39
52
|
|
@@ -62,10 +75,10 @@ module TitleGrabber
|
|
62
75
|
rescue ThreadError; end
|
63
76
|
|
64
77
|
while url
|
65
|
-
if html = open_w_timeout(url)
|
78
|
+
if (html = open_w_timeout(url)) && !html.empty?
|
66
79
|
doc = begin
|
67
80
|
Oga.parse_html(html)
|
68
|
-
rescue LL::ParserError => err
|
81
|
+
rescue ArgumentError, LL::ParserError => err
|
69
82
|
logger.error "[Thread: ##{Thread.current[:id]}] Unable to parse HTML from URL '#{url}' - #{err.message}"
|
70
83
|
nil
|
71
84
|
end
|
@@ -88,12 +101,20 @@ module TitleGrabber
|
|
88
101
|
end
|
89
102
|
}.each(&:join)
|
90
103
|
end
|
91
|
-
|
92
|
-
FileUtils.mv(tmp_path, out_path)
|
93
104
|
end
|
94
105
|
|
95
106
|
private
|
96
107
|
|
108
|
+
def install_at_exit_handler
|
109
|
+
at_exit do
|
110
|
+
if tmp_path.size?
|
111
|
+
FileUtils.mv(tmp_path, out_path)
|
112
|
+
else
|
113
|
+
tmp_path.unlink if tmp_path.exist?
|
114
|
+
end
|
115
|
+
end
|
116
|
+
end
|
117
|
+
|
97
118
|
def processed_urls
|
98
119
|
@processed_urls ||= begin
|
99
120
|
urls = {}
|