title_grabber 0.5.0 → 0.5.1

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects the differences between the package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: '028ecf2bea074495354ac08494517b7f254e8a27d782e2a3ff1eec7bd313474b'
4
- data.tar.gz: 20adeb673a80980e1c3e8ee04285451eb0aa4d8e98c4c17cad5b72f9549f04ee
3
+ metadata.gz: 4e56496e45f91fb324125076d07c4fe3ec57fc9200246f78c78f9a13f3ca3b44
4
+ data.tar.gz: b1d99e4a53edc27a27b95fd1326159861a1c548af103a5992a60ce7aa83c0d83
5
5
  SHA512:
6
- metadata.gz: 2a6268347f464325956f4281f0132b9f6c03d945eefab10754fb93387b0bd0db384cac05899238532f42c62869f58e6282e1e8eadf326bde9026b3d65ba97261
7
- data.tar.gz: e1388a778ea78d29f9c791db0e6eaaa0fa379ba912f6869a0b90f087c1fb7e366c61b08f47dce25f003abbc8f6909dde3b3c99a1636fdcf1777eb2fcbb61c72e
6
+ metadata.gz: d132efc517ca28ab5f6c857d5d9267005d1d74f3cbff29032c88b1a40ff60e6a49827f7479b486e8b8a2922cabab447e4370d70fbaa9db963b9a5a00fb1c91a1
7
+ data.tar.gz: a6d9d08caca7314a1a2c595039a065a8e22ca8b37aecf60303973db4b8939baa0fb62011ae2be201c01b9247ac7c31b7486c6a37343c86836a4c30167d565dd0
@@ -1,7 +1,7 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- title_grabber (0.4.1)
4
+ title_grabber (0.5.0)
5
5
  oga (~> 2.15)
6
6
 
7
7
  GEM
@@ -21,10 +21,10 @@ module HTTPHelper
21
21
  retries = 0
22
22
 
23
23
  begin
24
- res = Timeout.timeout(read_to) {
25
- open(url, ssl_verify_mode: OpenSSL::SSL::VERIFY_NONE,
26
- open_timeout: connect_to, read_timeout: read_to)
27
- }
24
+ Timeout.timeout(read_to) {
25
+ open(url, ssl_verify_mode: OpenSSL::SSL::VERIFY_NONE,
26
+ open_timeout: connect_to, read_timeout: read_to)
27
+ }
28
28
  rescue => err
29
29
  msg = err.message
30
30
  if msg =~ REDIR_FORBIDDEN
@@ -84,7 +84,7 @@ module TitleGrabber
84
84
  end
85
85
 
86
86
  thr_cnt = [max_threads, queue.size].min
87
- threads = 1.upto(thr_cnt).map.with_index { |_, i|
87
+ 1.upto(thr_cnt).map.with_index { |_, i|
88
88
  Thread.new(i) do |j|
89
89
  Thread.current.name = "Thread ##{i + 1}"
90
90
 
@@ -104,33 +104,9 @@ module TitleGrabber
104
104
  end
105
105
 
106
106
  if doc
107
- tweet_urls = []
108
- TWEET_TXT_SELS.each do |tweet_txt_sel|
109
- tweet_urls.concat(doc.css("#{TWEET_PERMA_LINK_SEL} #{tweet_txt_sel} a").
110
- map { |a| a[-"href"] })
111
-
112
- end
113
- tweet_urls.compact!
114
- tweet_urls.uniq!
115
- tweet_urls.map! do |url|
116
- if url.match?(URL_RE) && (res = open_w_timeout(url, **http_opts))
117
- uri = res.base_uri
118
- uri.host == TWITTER_HOST && !uri.to_s.match?(TWITTER_STATUS_RE) ? nil : uri.to_s
119
- else
120
- url
121
- end
122
- end
123
- tweet_urls.compact!
124
- tweet_urls.map! do |url|
125
- url.start_with?("/") ? URI.join(TWITTER_URL_PREFIX, url).to_s : url
107
+ if e_url = parse_end_url_from(doc)
108
+ end_url = e_url
126
109
  end
127
- tweet_urls.delete_if { |url|
128
- uri = URI(url)
129
- uri.host == TWITTER_HOST && uri.path.count("/") > 1 &&
130
- !uri.to_s.match?(TWITTER_STATUS_RE)
131
- }
132
- tweet_urls.sort!
133
- end_url = tweet_urls.join(CSV_FIELD_SEP) unless tweet_urls.empty?
134
110
 
135
111
  page_title = doc.at_css('title')&.text || -""
136
112
  clean_up_whitespace(page_title) unless page_title.empty?
@@ -164,6 +140,11 @@ module TitleGrabber
164
140
 
165
141
  private
166
142
 
143
+ def http_opts
144
+ @http_opts ||= { connect_to: connect_to, read_to: read_to,
145
+ max_retries: max_retries }
146
+ end
147
+
167
148
  def processed_urls
168
149
  @processed_urls ||= begin
169
150
  urls = {}
@@ -186,9 +167,38 @@ module TitleGrabber
186
167
  end
187
168
  end
188
169
 
189
- def http_opts
190
- @http_opts ||= { connect_to: connect_to, read_to: read_to,
191
- max_retries: max_retries }
170
+ def parse_end_url_from(doc)
171
+ tweet_urls = []
172
+ TWEET_TXT_SELS.each do |tweet_txt_sel|
173
+ tweet_urls.concat(doc.css("#{TWEET_PERMA_LINK_SEL} #{tweet_txt_sel} a").
174
+ map { |a| a[-"href"] })
175
+
176
+ end
177
+ tweet_urls.compact!
178
+ tweet_urls.uniq!
179
+
180
+ tweet_urls.map! do |url|
181
+ if url.match?(URL_RE) && (res = open_w_timeout(url, **http_opts))
182
+ uri = res.base_uri
183
+ uri.host == TWITTER_HOST && !uri.to_s.match?(TWITTER_STATUS_RE) ? nil : uri.to_s
184
+ else
185
+ url
186
+ end
187
+ end
188
+ tweet_urls.compact!
189
+
190
+ tweet_urls.map! do |url|
191
+ url.start_with?("/") ? URI.join(TWITTER_URL_PREFIX, url).to_s : url
192
+ end
193
+
194
+ tweet_urls.delete_if { |url|
195
+ uri = URI(url)
196
+ uri.host == TWITTER_HOST && uri.path.count("/") > 1 &&
197
+ !uri.to_s.match?(TWITTER_STATUS_RE)
198
+ }
199
+ tweet_urls.sort!
200
+
201
+ tweet_urls.join(CSV_FIELD_SEP) unless tweet_urls.empty?
192
202
  end
193
203
  end
194
204
  end
@@ -1,3 +1,3 @@
1
1
  module TitleGrabber
2
- VERSION = "0.5.0"
2
+ VERSION = "0.5.1"
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: title_grabber
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.5.0
4
+ version: 0.5.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Cristian Rasch
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2019-04-18 00:00:00.000000000 Z
11
+ date: 2019-04-19 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: oga