title_grabber 0.3.4 → 0.3.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +1 -1
- data/lib/http_helper.rb +6 -7
- data/lib/title_grabber.rb +16 -6
- data/lib/title_grabber/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: efadff19dbc0622e9188c0349f80ff8d7e5a3040c91db075f3c51529efb39b03
|
4
|
+
data.tar.gz: 3230ec05cd35e8d3b797b9caa69eca8bf30a41598fb98b312b59522b9c348353
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 02adc0c9b125a64a11fce2fc6be7ee1ab3d3798045536823f6c2763c7c004c950bfbacafdc443634e43c2f742adb42ba6213a5c1197575648b3ff23c5a2ab1e2
|
7
|
+
data.tar.gz: d487cbd2f084db0f076a42f03fe14912d614bacbf8a7ae33d3eed93f2e6125da69c5ef4433d70e94fbbe4103341bd86f07bd02c725d3721ce6f6816c00227e7e
|
data/Gemfile.lock
CHANGED
data/lib/http_helper.rb
CHANGED
@@ -15,12 +15,11 @@ module HTTPHelper
|
|
15
15
|
retries = 0
|
16
16
|
|
17
17
|
begin
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
}
|
18
|
+
res = Timeout.timeout(read_to) {
|
19
|
+
HTTP.timeout(write: write_to, connect: connect_to, read: read_to).
|
20
|
+
follow(max_hops: MAX_HOPS).
|
21
|
+
get(url, ssl_context: ssl_ctx)
|
22
|
+
}
|
24
23
|
rescue HTTP::Error, Timeout::Error => err
|
25
24
|
msg = err.message
|
26
25
|
|
@@ -42,7 +41,7 @@ module HTTPHelper
|
|
42
41
|
logger.error "[#{Thread.current.name}] URL: #{url} [#{err.message}]"
|
43
42
|
nil
|
44
43
|
else
|
45
|
-
utf8_encode(
|
44
|
+
[res.uri.to_s, utf8_encode(res.to_s)]
|
46
45
|
end
|
47
46
|
end
|
48
47
|
|
data/lib/title_grabber.rb
CHANGED
@@ -20,9 +20,11 @@ module TitleGrabber
|
|
20
20
|
MAX_THREADS = Etc.nprocessors
|
21
21
|
URL_RE = %r(https?://\S+)i
|
22
22
|
URL_HEADER = -"url"
|
23
|
+
END_URL_HEAD = -"end_url"
|
23
24
|
PAGE_TIT_HEAD = -"page_title"
|
24
25
|
ART_TIT_HEAD = -"article_title"
|
25
|
-
HEADERS = [URL_HEADER, PAGE_TIT_HEAD, ART_TIT_HEAD].freeze
|
26
|
+
HEADERS = [URL_HEADER, END_URL_HEAD, PAGE_TIT_HEAD, ART_TIT_HEAD].freeze
|
27
|
+
ART_TIT_SEL = ["article h1", "h1"].freeze
|
26
28
|
|
27
29
|
def self.call(lines, options)
|
28
30
|
MultiThreadedGrabber.new(lines, options).call
|
@@ -86,7 +88,8 @@ module TitleGrabber
|
|
86
88
|
rescue ThreadError; end
|
87
89
|
|
88
90
|
while url
|
89
|
-
|
91
|
+
end_url, html = open_w_timeout(url, **http_opts)
|
92
|
+
if html && !html&.empty?
|
90
93
|
doc = begin
|
91
94
|
Oga.parse_html(html)
|
92
95
|
rescue ArgumentError, LL::ParserError => err
|
@@ -97,11 +100,16 @@ module TitleGrabber
|
|
97
100
|
if doc
|
98
101
|
page_title = doc.at_css('title')&.text || -""
|
99
102
|
clean_up_whitespace(page_title) unless page_title.empty?
|
100
|
-
|
101
|
-
article_title
|
103
|
+
|
104
|
+
article_title = nil
|
105
|
+
ART_TIT_SEL.each do |selector|
|
106
|
+
article_title = doc.at_css(selector)&.text
|
107
|
+
break if article_title && !article_title.empty?
|
108
|
+
end
|
109
|
+
article_title ||= -""
|
102
110
|
clean_up_whitespace(article_title) unless article_title.empty?
|
103
111
|
|
104
|
-
csv << [url, page_title, article_title]
|
112
|
+
csv << [url, end_url, page_title, article_title]
|
105
113
|
end
|
106
114
|
end
|
107
115
|
|
@@ -128,11 +136,13 @@ module TitleGrabber
|
|
128
136
|
|
129
137
|
if out_path.exist?
|
130
138
|
CSV.foreach(out_path, headers: true) do |r|
|
139
|
+
end_url = r[END_URL_HEAD]
|
131
140
|
page_tit = r[PAGE_TIT_HEAD]
|
132
141
|
art_tit = r[ART_TIT_HEAD]
|
133
142
|
|
134
143
|
unless page_tit.empty? && art_tit.empty?
|
135
|
-
urls[r[URL_HEADER]] = { PAGE_TIT_HEAD => page_tit,
|
144
|
+
urls[r[URL_HEADER]] = { END_URL_HEAD => end_url,
|
145
|
+
PAGE_TIT_HEAD => page_tit,
|
136
146
|
ART_TIT_HEAD => art_tit }
|
137
147
|
end
|
138
148
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: title_grabber
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.3.4
|
4
|
+
version: 0.3.5
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Cristian Rasch
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2019-04-
|
11
|
+
date: 2019-04-08 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: http
|