title_grabber 0.3.4 → 0.3.5
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/Gemfile.lock +1 -1
- data/lib/http_helper.rb +6 -7
- data/lib/title_grabber.rb +16 -6
- data/lib/title_grabber/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: efadff19dbc0622e9188c0349f80ff8d7e5a3040c91db075f3c51529efb39b03
|
4
|
+
data.tar.gz: 3230ec05cd35e8d3b797b9caa69eca8bf30a41598fb98b312b59522b9c348353
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 02adc0c9b125a64a11fce2fc6be7ee1ab3d3798045536823f6c2763c7c004c950bfbacafdc443634e43c2f742adb42ba6213a5c1197575648b3ff23c5a2ab1e2
|
7
|
+
data.tar.gz: d487cbd2f084db0f076a42f03fe14912d614bacbf8a7ae33d3eed93f2e6125da69c5ef4433d70e94fbbe4103341bd86f07bd02c725d3721ce6f6816c00227e7e
|
data/Gemfile.lock
CHANGED
data/lib/http_helper.rb
CHANGED
@@ -15,12 +15,11 @@ module HTTPHelper
|
|
15
15
|
retries = 0
|
16
16
|
|
17
17
|
begin
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
}
|
18
|
+
res = Timeout.timeout(read_to) {
|
19
|
+
HTTP.timeout(write: write_to, connect: connect_to, read: read_to).
|
20
|
+
follow(max_hops: MAX_HOPS).
|
21
|
+
get(url, ssl_context: ssl_ctx)
|
22
|
+
}
|
24
23
|
rescue HTTP::Error, Timeout::Error => err
|
25
24
|
msg = err.message
|
26
25
|
|
@@ -42,7 +41,7 @@ module HTTPHelper
|
|
42
41
|
logger.error "[#{Thread.current.name}] URL: #{url} [#{err.message}]"
|
43
42
|
nil
|
44
43
|
else
|
45
|
-
utf8_encode(
|
44
|
+
[res.uri.to_s, utf8_encode(res.to_s)]
|
46
45
|
end
|
47
46
|
end
|
48
47
|
|
data/lib/title_grabber.rb
CHANGED
@@ -20,9 +20,11 @@ module TitleGrabber
|
|
20
20
|
MAX_THREADS = Etc.nprocessors
|
21
21
|
URL_RE = %r(https?://\S+)i
|
22
22
|
URL_HEADER = -"url"
|
23
|
+
END_URL_HEAD = -"end_url"
|
23
24
|
PAGE_TIT_HEAD = -"page_title"
|
24
25
|
ART_TIT_HEAD = -"article_title"
|
25
|
-
HEADERS = [URL_HEADER, PAGE_TIT_HEAD, ART_TIT_HEAD].freeze
|
26
|
+
HEADERS = [URL_HEADER, END_URL_HEAD, PAGE_TIT_HEAD, ART_TIT_HEAD].freeze
|
27
|
+
ART_TIT_SEL = ["article h1", "h1"].freeze
|
26
28
|
|
27
29
|
def self.call(lines, options)
|
28
30
|
MultiThreadedGrabber.new(lines, options).call
|
@@ -86,7 +88,8 @@ module TitleGrabber
|
|
86
88
|
rescue ThreadError; end
|
87
89
|
|
88
90
|
while url
|
89
|
-
|
91
|
+
end_url, html = open_w_timeout(url, **http_opts)
|
92
|
+
if html && !html&.empty?
|
90
93
|
doc = begin
|
91
94
|
Oga.parse_html(html)
|
92
95
|
rescue ArgumentError, LL::ParserError => err
|
@@ -97,11 +100,16 @@ module TitleGrabber
|
|
97
100
|
if doc
|
98
101
|
page_title = doc.at_css('title')&.text || -""
|
99
102
|
clean_up_whitespace(page_title) unless page_title.empty?
|
100
|
-
|
101
|
-
article_title
|
103
|
+
|
104
|
+
article_title = nil
|
105
|
+
ART_TIT_SEL.each do |selector|
|
106
|
+
article_title = doc.at_css(selector)&.text
|
107
|
+
break if article_title && !article_title.empty?
|
108
|
+
end
|
109
|
+
article_title ||= -""
|
102
110
|
clean_up_whitespace(article_title) unless article_title.empty?
|
103
111
|
|
104
|
-
csv << [url, page_title, article_title]
|
112
|
+
csv << [url, end_url, page_title, article_title]
|
105
113
|
end
|
106
114
|
end
|
107
115
|
|
@@ -128,11 +136,13 @@ module TitleGrabber
|
|
128
136
|
|
129
137
|
if out_path.exist?
|
130
138
|
CSV.foreach(out_path, headers: true) do |r|
|
139
|
+
end_url = r[END_URL_HEAD]
|
131
140
|
page_tit = r[PAGE_TIT_HEAD]
|
132
141
|
art_tit = r[ART_TIT_HEAD]
|
133
142
|
|
134
143
|
unless page_tit.empty? && art_tit.empty?
|
135
|
-
urls[r[URL_HEADER]] = {
|
144
|
+
urls[r[URL_HEADER]] = { END_URL_HEAD => end_url,
|
145
|
+
PAGE_TIT_HEAD => page_tit,
|
136
146
|
ART_TIT_HEAD => art_tit }
|
137
147
|
end
|
138
148
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: title_grabber
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.3.4
|
4
|
+
version: 0.3.5
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Cristian Rasch
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2019-04-
|
11
|
+
date: 2019-04-08 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: http
|