title_grabber 0.5.0 → 0.5.1

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: '028ecf2bea074495354ac08494517b7f254e8a27d782e2a3ff1eec7bd313474b'
4
- data.tar.gz: 20adeb673a80980e1c3e8ee04285451eb0aa4d8e98c4c17cad5b72f9549f04ee
3
+ metadata.gz: 4e56496e45f91fb324125076d07c4fe3ec57fc9200246f78c78f9a13f3ca3b44
4
+ data.tar.gz: b1d99e4a53edc27a27b95fd1326159861a1c548af103a5992a60ce7aa83c0d83
5
5
  SHA512:
6
- metadata.gz: 2a6268347f464325956f4281f0132b9f6c03d945eefab10754fb93387b0bd0db384cac05899238532f42c62869f58e6282e1e8eadf326bde9026b3d65ba97261
7
- data.tar.gz: e1388a778ea78d29f9c791db0e6eaaa0fa379ba912f6869a0b90f087c1fb7e366c61b08f47dce25f003abbc8f6909dde3b3c99a1636fdcf1777eb2fcbb61c72e
6
+ metadata.gz: d132efc517ca28ab5f6c857d5d9267005d1d74f3cbff29032c88b1a40ff60e6a49827f7479b486e8b8a2922cabab447e4370d70fbaa9db963b9a5a00fb1c91a1
7
+ data.tar.gz: a6d9d08caca7314a1a2c595039a065a8e22ca8b37aecf60303973db4b8939baa0fb62011ae2be201c01b9247ac7c31b7486c6a37343c86836a4c30167d565dd0
@@ -1,7 +1,7 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- title_grabber (0.4.1)
4
+ title_grabber (0.5.0)
5
5
  oga (~> 2.15)
6
6
 
7
7
  GEM
@@ -21,10 +21,10 @@ module HTTPHelper
21
21
  retries = 0
22
22
 
23
23
  begin
24
- res = Timeout.timeout(read_to) {
25
- open(url, ssl_verify_mode: OpenSSL::SSL::VERIFY_NONE,
26
- open_timeout: connect_to, read_timeout: read_to)
27
- }
24
+ Timeout.timeout(read_to) {
25
+ open(url, ssl_verify_mode: OpenSSL::SSL::VERIFY_NONE,
26
+ open_timeout: connect_to, read_timeout: read_to)
27
+ }
28
28
  rescue => err
29
29
  msg = err.message
30
30
  if msg =~ REDIR_FORBIDDEN
@@ -84,7 +84,7 @@ module TitleGrabber
84
84
  end
85
85
 
86
86
  thr_cnt = [max_threads, queue.size].min
87
- threads = 1.upto(thr_cnt).map.with_index { |_, i|
87
+ 1.upto(thr_cnt).map.with_index { |_, i|
88
88
  Thread.new(i) do |j|
89
89
  Thread.current.name = "Thread ##{i + 1}"
90
90
 
@@ -104,33 +104,9 @@ module TitleGrabber
104
104
  end
105
105
 
106
106
  if doc
107
- tweet_urls = []
108
- TWEET_TXT_SELS.each do |tweet_txt_sel|
109
- tweet_urls.concat(doc.css("#{TWEET_PERMA_LINK_SEL} #{tweet_txt_sel} a").
110
- map { |a| a[-"href"] })
111
-
112
- end
113
- tweet_urls.compact!
114
- tweet_urls.uniq!
115
- tweet_urls.map! do |url|
116
- if url.match?(URL_RE) && (res = open_w_timeout(url, **http_opts))
117
- uri = res.base_uri
118
- uri.host == TWITTER_HOST && !uri.to_s.match?(TWITTER_STATUS_RE) ? nil : uri.to_s
119
- else
120
- url
121
- end
122
- end
123
- tweet_urls.compact!
124
- tweet_urls.map! do |url|
125
- url.start_with?("/") ? URI.join(TWITTER_URL_PREFIX, url).to_s : url
107
+ if e_url = parse_end_url_from(doc)
108
+ end_url = e_url
126
109
  end
127
- tweet_urls.delete_if { |url|
128
- uri = URI(url)
129
- uri.host == TWITTER_HOST && uri.path.count("/") > 1 &&
130
- !uri.to_s.match?(TWITTER_STATUS_RE)
131
- }
132
- tweet_urls.sort!
133
- end_url = tweet_urls.join(CSV_FIELD_SEP) unless tweet_urls.empty?
134
110
 
135
111
  page_title = doc.at_css('title')&.text || -""
136
112
  clean_up_whitespace(page_title) unless page_title.empty?
@@ -164,6 +140,11 @@ module TitleGrabber
164
140
 
165
141
  private
166
142
 
143
+ def http_opts
144
+ @http_opts ||= { connect_to: connect_to, read_to: read_to,
145
+ max_retries: max_retries }
146
+ end
147
+
167
148
  def processed_urls
168
149
  @processed_urls ||= begin
169
150
  urls = {}
@@ -186,9 +167,38 @@ module TitleGrabber
186
167
  end
187
168
  end
188
169
 
189
- def http_opts
190
- @http_opts ||= { connect_to: connect_to, read_to: read_to,
191
- max_retries: max_retries }
170
+ def parse_end_url_from(doc)
171
+ tweet_urls = []
172
+ TWEET_TXT_SELS.each do |tweet_txt_sel|
173
+ tweet_urls.concat(doc.css("#{TWEET_PERMA_LINK_SEL} #{tweet_txt_sel} a").
174
+ map { |a| a[-"href"] })
175
+
176
+ end
177
+ tweet_urls.compact!
178
+ tweet_urls.uniq!
179
+
180
+ tweet_urls.map! do |url|
181
+ if url.match?(URL_RE) && (res = open_w_timeout(url, **http_opts))
182
+ uri = res.base_uri
183
+ uri.host == TWITTER_HOST && !uri.to_s.match?(TWITTER_STATUS_RE) ? nil : uri.to_s
184
+ else
185
+ url
186
+ end
187
+ end
188
+ tweet_urls.compact!
189
+
190
+ tweet_urls.map! do |url|
191
+ url.start_with?("/") ? URI.join(TWITTER_URL_PREFIX, url).to_s : url
192
+ end
193
+
194
+ tweet_urls.delete_if { |url|
195
+ uri = URI(url)
196
+ uri.host == TWITTER_HOST && uri.path.count("/") > 1 &&
197
+ !uri.to_s.match?(TWITTER_STATUS_RE)
198
+ }
199
+ tweet_urls.sort!
200
+
201
+ tweet_urls.join(CSV_FIELD_SEP) unless tweet_urls.empty?
192
202
  end
193
203
  end
194
204
  end
@@ -1,3 +1,3 @@
1
1
  module TitleGrabber
2
- VERSION = "0.5.0"
2
+ VERSION = "0.5.1"
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: title_grabber
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.5.0
4
+ version: 0.5.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Cristian Rasch
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2019-04-18 00:00:00.000000000 Z
11
+ date: 2019-04-19 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: oga