title_grabber 0.5.0 → 0.5.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +1 -1
- data/lib/http_helper.rb +4 -4
- data/lib/title_grabber.rb +40 -30
- data/lib/title_grabber/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 4e56496e45f91fb324125076d07c4fe3ec57fc9200246f78c78f9a13f3ca3b44
|
4
|
+
data.tar.gz: b1d99e4a53edc27a27b95fd1326159861a1c548af103a5992a60ce7aa83c0d83
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: d132efc517ca28ab5f6c857d5d9267005d1d74f3cbff29032c88b1a40ff60e6a49827f7479b486e8b8a2922cabab447e4370d70fbaa9db963b9a5a00fb1c91a1
|
7
|
+
data.tar.gz: a6d9d08caca7314a1a2c595039a065a8e22ca8b37aecf60303973db4b8939baa0fb62011ae2be201c01b9247ac7c31b7486c6a37343c86836a4c30167d565dd0
|
data/Gemfile.lock
CHANGED
data/lib/http_helper.rb
CHANGED
@@ -21,10 +21,10 @@ module HTTPHelper
|
|
21
21
|
retries = 0
|
22
22
|
|
23
23
|
begin
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
24
|
+
Timeout.timeout(read_to) {
|
25
|
+
open(url, ssl_verify_mode: OpenSSL::SSL::VERIFY_NONE,
|
26
|
+
open_timeout: connect_to, read_timeout: read_to)
|
27
|
+
}
|
28
28
|
rescue => err
|
29
29
|
msg = err.message
|
30
30
|
if msg =~ REDIR_FORBIDDEN
|
data/lib/title_grabber.rb
CHANGED
@@ -84,7 +84,7 @@ module TitleGrabber
|
|
84
84
|
end
|
85
85
|
|
86
86
|
thr_cnt = [max_threads, queue.size].min
|
87
|
-
|
87
|
+
1.upto(thr_cnt).map.with_index { |_, i|
|
88
88
|
Thread.new(i) do |j|
|
89
89
|
Thread.current.name = "Thread ##{i + 1}"
|
90
90
|
|
@@ -104,33 +104,9 @@ module TitleGrabber
|
|
104
104
|
end
|
105
105
|
|
106
106
|
if doc
|
107
|
-
|
108
|
-
|
109
|
-
tweet_urls.concat(doc.css("#{TWEET_PERMA_LINK_SEL} #{tweet_txt_sel} a").
|
110
|
-
map { |a| a[-"href"] })
|
111
|
-
|
112
|
-
end
|
113
|
-
tweet_urls.compact!
|
114
|
-
tweet_urls.uniq!
|
115
|
-
tweet_urls.map! do |url|
|
116
|
-
if url.match?(URL_RE) && (res = open_w_timeout(url, **http_opts))
|
117
|
-
uri = res.base_uri
|
118
|
-
uri.host == TWITTER_HOST && !uri.to_s.match?(TWITTER_STATUS_RE) ? nil : uri.to_s
|
119
|
-
else
|
120
|
-
url
|
121
|
-
end
|
122
|
-
end
|
123
|
-
tweet_urls.compact!
|
124
|
-
tweet_urls.map! do |url|
|
125
|
-
url.start_with?("/") ? URI.join(TWITTER_URL_PREFIX, url).to_s : url
|
107
|
+
if e_url = parse_end_url_from(doc)
|
108
|
+
end_url = e_url
|
126
109
|
end
|
127
|
-
tweet_urls.delete_if { |url|
|
128
|
-
uri = URI(url)
|
129
|
-
uri.host == TWITTER_HOST && uri.path.count("/") > 1 &&
|
130
|
-
!uri.to_s.match?(TWITTER_STATUS_RE)
|
131
|
-
}
|
132
|
-
tweet_urls.sort!
|
133
|
-
end_url = tweet_urls.join(CSV_FIELD_SEP) unless tweet_urls.empty?
|
134
110
|
|
135
111
|
page_title = doc.at_css('title')&.text || -""
|
136
112
|
clean_up_whitespace(page_title) unless page_title.empty?
|
@@ -164,6 +140,11 @@ module TitleGrabber
|
|
164
140
|
|
165
141
|
private
|
166
142
|
|
143
|
+
def http_opts
|
144
|
+
@http_opts ||= { connect_to: connect_to, read_to: read_to,
|
145
|
+
max_retries: max_retries }
|
146
|
+
end
|
147
|
+
|
167
148
|
def processed_urls
|
168
149
|
@processed_urls ||= begin
|
169
150
|
urls = {}
|
@@ -186,9 +167,38 @@ module TitleGrabber
|
|
186
167
|
end
|
187
168
|
end
|
188
169
|
|
189
|
-
def
|
190
|
-
|
191
|
-
|
170
|
+
def parse_end_url_from(doc)
|
171
|
+
tweet_urls = []
|
172
|
+
TWEET_TXT_SELS.each do |tweet_txt_sel|
|
173
|
+
tweet_urls.concat(doc.css("#{TWEET_PERMA_LINK_SEL} #{tweet_txt_sel} a").
|
174
|
+
map { |a| a[-"href"] })
|
175
|
+
|
176
|
+
end
|
177
|
+
tweet_urls.compact!
|
178
|
+
tweet_urls.uniq!
|
179
|
+
|
180
|
+
tweet_urls.map! do |url|
|
181
|
+
if url.match?(URL_RE) && (res = open_w_timeout(url, **http_opts))
|
182
|
+
uri = res.base_uri
|
183
|
+
uri.host == TWITTER_HOST && !uri.to_s.match?(TWITTER_STATUS_RE) ? nil : uri.to_s
|
184
|
+
else
|
185
|
+
url
|
186
|
+
end
|
187
|
+
end
|
188
|
+
tweet_urls.compact!
|
189
|
+
|
190
|
+
tweet_urls.map! do |url|
|
191
|
+
url.start_with?("/") ? URI.join(TWITTER_URL_PREFIX, url).to_s : url
|
192
|
+
end
|
193
|
+
|
194
|
+
tweet_urls.delete_if { |url|
|
195
|
+
uri = URI(url)
|
196
|
+
uri.host == TWITTER_HOST && uri.path.count("/") > 1 &&
|
197
|
+
!uri.to_s.match?(TWITTER_STATUS_RE)
|
198
|
+
}
|
199
|
+
tweet_urls.sort!
|
200
|
+
|
201
|
+
tweet_urls.join(CSV_FIELD_SEP) unless tweet_urls.empty?
|
192
202
|
end
|
193
203
|
end
|
194
204
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: title_grabber
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.5.0
|
4
|
+
version: 0.5.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Cristian Rasch
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2019-04-
|
11
|
+
date: 2019-04-19 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: oga
|