title_grabber 0.5.0 → 0.5.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/Gemfile.lock +1 -1
- data/lib/http_helper.rb +4 -4
- data/lib/title_grabber.rb +40 -30
- data/lib/title_grabber/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 4e56496e45f91fb324125076d07c4fe3ec57fc9200246f78c78f9a13f3ca3b44
|
4
|
+
data.tar.gz: b1d99e4a53edc27a27b95fd1326159861a1c548af103a5992a60ce7aa83c0d83
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: d132efc517ca28ab5f6c857d5d9267005d1d74f3cbff29032c88b1a40ff60e6a49827f7479b486e8b8a2922cabab447e4370d70fbaa9db963b9a5a00fb1c91a1
|
7
|
+
data.tar.gz: a6d9d08caca7314a1a2c595039a065a8e22ca8b37aecf60303973db4b8939baa0fb62011ae2be201c01b9247ac7c31b7486c6a37343c86836a4c30167d565dd0
|
data/Gemfile.lock
CHANGED
data/lib/http_helper.rb
CHANGED
@@ -21,10 +21,10 @@ module HTTPHelper
|
|
21
21
|
retries = 0
|
22
22
|
|
23
23
|
begin
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
24
|
+
Timeout.timeout(read_to) {
|
25
|
+
open(url, ssl_verify_mode: OpenSSL::SSL::VERIFY_NONE,
|
26
|
+
open_timeout: connect_to, read_timeout: read_to)
|
27
|
+
}
|
28
28
|
rescue => err
|
29
29
|
msg = err.message
|
30
30
|
if msg =~ REDIR_FORBIDDEN
|
data/lib/title_grabber.rb
CHANGED
@@ -84,7 +84,7 @@ module TitleGrabber
|
|
84
84
|
end
|
85
85
|
|
86
86
|
thr_cnt = [max_threads, queue.size].min
|
87
|
-
|
87
|
+
1.upto(thr_cnt).map.with_index { |_, i|
|
88
88
|
Thread.new(i) do |j|
|
89
89
|
Thread.current.name = "Thread ##{i + 1}"
|
90
90
|
|
@@ -104,33 +104,9 @@ module TitleGrabber
|
|
104
104
|
end
|
105
105
|
|
106
106
|
if doc
|
107
|
-
|
108
|
-
|
109
|
-
tweet_urls.concat(doc.css("#{TWEET_PERMA_LINK_SEL} #{tweet_txt_sel} a").
|
110
|
-
map { |a| a[-"href"] })
|
111
|
-
|
112
|
-
end
|
113
|
-
tweet_urls.compact!
|
114
|
-
tweet_urls.uniq!
|
115
|
-
tweet_urls.map! do |url|
|
116
|
-
if url.match?(URL_RE) && (res = open_w_timeout(url, **http_opts))
|
117
|
-
uri = res.base_uri
|
118
|
-
uri.host == TWITTER_HOST && !uri.to_s.match?(TWITTER_STATUS_RE) ? nil : uri.to_s
|
119
|
-
else
|
120
|
-
url
|
121
|
-
end
|
122
|
-
end
|
123
|
-
tweet_urls.compact!
|
124
|
-
tweet_urls.map! do |url|
|
125
|
-
url.start_with?("/") ? URI.join(TWITTER_URL_PREFIX, url).to_s : url
|
107
|
+
if e_url = parse_end_url_from(doc)
|
108
|
+
end_url = e_url
|
126
109
|
end
|
127
|
-
tweet_urls.delete_if { |url|
|
128
|
-
uri = URI(url)
|
129
|
-
uri.host == TWITTER_HOST && uri.path.count("/") > 1 &&
|
130
|
-
!uri.to_s.match?(TWITTER_STATUS_RE)
|
131
|
-
}
|
132
|
-
tweet_urls.sort!
|
133
|
-
end_url = tweet_urls.join(CSV_FIELD_SEP) unless tweet_urls.empty?
|
134
110
|
|
135
111
|
page_title = doc.at_css('title')&.text || -""
|
136
112
|
clean_up_whitespace(page_title) unless page_title.empty?
|
@@ -164,6 +140,11 @@ module TitleGrabber
|
|
164
140
|
|
165
141
|
private
|
166
142
|
|
143
|
+
def http_opts
|
144
|
+
@http_opts ||= { connect_to: connect_to, read_to: read_to,
|
145
|
+
max_retries: max_retries }
|
146
|
+
end
|
147
|
+
|
167
148
|
def processed_urls
|
168
149
|
@processed_urls ||= begin
|
169
150
|
urls = {}
|
@@ -186,9 +167,38 @@ module TitleGrabber
|
|
186
167
|
end
|
187
168
|
end
|
188
169
|
|
189
|
-
def
|
190
|
-
|
191
|
-
|
170
|
+
def parse_end_url_from(doc)
|
171
|
+
tweet_urls = []
|
172
|
+
TWEET_TXT_SELS.each do |tweet_txt_sel|
|
173
|
+
tweet_urls.concat(doc.css("#{TWEET_PERMA_LINK_SEL} #{tweet_txt_sel} a").
|
174
|
+
map { |a| a[-"href"] })
|
175
|
+
|
176
|
+
end
|
177
|
+
tweet_urls.compact!
|
178
|
+
tweet_urls.uniq!
|
179
|
+
|
180
|
+
tweet_urls.map! do |url|
|
181
|
+
if url.match?(URL_RE) && (res = open_w_timeout(url, **http_opts))
|
182
|
+
uri = res.base_uri
|
183
|
+
uri.host == TWITTER_HOST && !uri.to_s.match?(TWITTER_STATUS_RE) ? nil : uri.to_s
|
184
|
+
else
|
185
|
+
url
|
186
|
+
end
|
187
|
+
end
|
188
|
+
tweet_urls.compact!
|
189
|
+
|
190
|
+
tweet_urls.map! do |url|
|
191
|
+
url.start_with?("/") ? URI.join(TWITTER_URL_PREFIX, url).to_s : url
|
192
|
+
end
|
193
|
+
|
194
|
+
tweet_urls.delete_if { |url|
|
195
|
+
uri = URI(url)
|
196
|
+
uri.host == TWITTER_HOST && uri.path.count("/") > 1 &&
|
197
|
+
!uri.to_s.match?(TWITTER_STATUS_RE)
|
198
|
+
}
|
199
|
+
tweet_urls.sort!
|
200
|
+
|
201
|
+
tweet_urls.join(CSV_FIELD_SEP) unless tweet_urls.empty?
|
192
202
|
end
|
193
203
|
end
|
194
204
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: title_grabber
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.5.0
|
4
|
+
version: 0.5.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Cristian Rasch
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2019-04-
|
11
|
+
date: 2019-04-19 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: oga
|