GDNewsScraper 3.0.7 → 3.0.9

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 0ab7ee6a0e7e30c64e6ece2a156b9c94756c00a6
4
- data.tar.gz: 30887530f6a43130209a98402fb0c32b647a5f77
3
+ metadata.gz: 4930eeeb0c78d881acc7a9fad32bac5e55d3743e
4
+ data.tar.gz: 11be82b0c24cd044485a71b863088892a1cb99ad
5
5
  SHA512:
6
- metadata.gz: cefe85f767614e30d79caa71c50c43eb466e6b52359755ab0dd84491682c42fbff25ac362c181eb394ddb0590e9717fdf4b7b6d611f273caaf25595f584894a1
7
- data.tar.gz: 35fec37b2669653852e1dd3dcd39d95bb0d5233b3d4bee92bb5004731bf9062a2ac3285ca35529350f3694cfad50c4c2502c470622ee4c5321779f4f6b5db7ae
6
+ metadata.gz: 8d1617126dfdbc603d5328c4027b45a63fe76a22618188a966f873b5e3566cf4894c8b2987d89880665e8e969c21eda2bf4c0daf3a4635894350241801801f7e
7
+ data.tar.gz: 1361f266a1816e9c588c2c68f8230d29476c7a41ef3820732eec23e8b91dfa625f884937184090297c7e7f61622e9053c449c0a7e31394644729c5f0e9400711
@@ -1,3 +1,4 @@
1
+ require 'pry'
1
2
  require 'base64'
2
3
  require 'json'
3
4
 
@@ -47,10 +48,6 @@ module GDNewsScraper::Scrapers
47
48
  end
48
49
  end
49
50
 
50
- def refresh(article_url)
51
- parse_article_body(article_url)
52
- end
53
-
54
51
  def parse(article)
55
52
  pulse = Hash.new
56
53
 
@@ -65,7 +62,8 @@ module GDNewsScraper::Scrapers
65
62
  begin
66
63
  article_page = Nokogiri::HTML(open(article, GDNewsScraper::Scrapers::PolygonCOM::HEADERS))
67
64
 
68
- is_a_video = article_page.at('.c-video-embed').nil?
65
+ first_element = article_page.at('.l-col__main').elements.first
66
+ is_a_video = first_element.attr('class') == 'c-video-embed' || first_element.attr('class') == 'c-video-embed--media'
69
67
 
70
68
  key = article_page.at('span[data-content-admin-id]').attr('data-content-admin-id').to_i
71
69
  url = article
@@ -92,14 +90,20 @@ module GDNewsScraper::Scrapers
92
90
  raise ArgumentError.new('Invalid URL')
93
91
  end
94
92
  elsif article.is_a?(Nokogiri::XML::Element)
95
- is_a_video = !article.at('.c-entry-box--compact--video').nil?
96
-
97
- key = article.at('.c-entry-box--compact--article').attr('data-chorus-optimize-id').to_i
98
- url = article.at('.c-entry-box--compact__title').at('> a').attr('href')
99
- title = strip(article.at('.c-entry-box--compact__title'))
100
- cover = (article.at('.c-entry-box--compact__image').at('noscript').at('img').attr('src') rescue nil)
101
- author = strip(article.at('.c-byline').css('.c-byline__item > a').children[0])
102
- date = JSON.parse(article.at('.c-byline').attr('data-cdata'))['timestamp'].to_i
93
+ article_container = article.at('.c-entry-box--compact--article')
94
+
95
+ if article_container.nil?
96
+ raise StandardError.new('Not an Article, skipping..')
97
+ else
98
+ key = article.at('.c-entry-box--compact--article').attr('data-chorus-optimize-id').to_i
99
+ url = article.at('.c-entry-box--compact__title').at('> a').attr('href')
100
+ title = strip(article.at('.c-entry-box--compact__title'))
101
+ cover = (article.at('.c-entry-box--compact__image').at('noscript').at('img').attr('src') rescue nil)
102
+ author = strip(article.at('.c-byline').css('.c-byline__item > a').children[0])
103
+ date = JSON.parse(article.at('.c-byline').attr('data-cdata'))['timestamp'].to_i
104
+
105
+ article_page = url
106
+ end
103
107
  else
104
108
  raise ArgumentError.new("Make sure the 'article' argument is either a Hash containing the article's initial metadata or a String which is the article's URL")
105
109
  end
@@ -111,22 +115,29 @@ module GDNewsScraper::Scrapers
111
115
  pulse[:title] = title
112
116
  pulse[:author] = author
113
117
  pulse[:date] = date
114
- pulse[:content] = parse_article_body(url, is_a_video)
118
+ pulse[:content] = parse_article_body(article_page)
115
119
  pulse[:tags] = title.downcase.split
116
120
 
117
121
  return pulse
118
122
  rescue => e
119
- "There was a problem while parsing this Article: #{ e }"
123
+ {
124
+ success: false,
125
+ message: "There was a problem while parsing this Article: #{ e }"
126
+ }
120
127
  end
121
128
 
122
- def parse_article_body(article_url, is_a_video = false)
123
- article_page = Nokogiri::HTML(open(article_url, GDNewsScraper::Scrapers::PolygonCOM::HEADERS))
129
+ def parse_article_body(article)
130
+ if article.is_a?(String)
131
+ article_page = Nokogiri::HTML(open(article, GDNewsScraper::Scrapers::PolygonCOM::HEADERS))
132
+ else
133
+ article_page = article
134
+ end
135
+
124
136
  article_container = article_page.at('.c-entry-content')
125
137
 
126
138
  article_body = {
127
139
  galleries: { },
128
140
  videos: { },
129
- images: { },
130
141
 
131
142
  anchors: { },
132
143
  figures: { },
@@ -134,14 +145,28 @@ module GDNewsScraper::Scrapers
134
145
  body: [ ]
135
146
  }
136
147
 
148
+ # Check here as well since an Article CAN have an embedded video instead
149
+ # of a Cover and still show as a non-video article on the News page from
150
+ # where we initially took the 'is_a_video' check
151
+ #
152
+ first_element = article_page.at('.l-col__main').elements.first
153
+ is_a_video = first_element.attr('class') == 'c-video-embed' || first_element.attr('class') == 'c-video-embed--media'
154
+
137
155
  if is_a_video
138
- iframe = article_page.at('.c-video-embed--media').at('iframe')
139
- iframe_id = unique_id
156
+ id = unique_id(first_element)
140
157
 
141
- article_body[:videos][iframe_id] = {}
142
- article_body[:videos][iframe_id][:url] = iframe.attr('src')
158
+ is_polygon_video = !first_element.attributes['data-volume-uuid'].nil?
143
159
 
144
- article_body[:body] << iframe.replace("{{video:#{ iframe_id }}}").to_html
160
+ if is_polygon_video
161
+ article_body[:videos][id] = {}
162
+ article_body[:videos][id][:label] = first_element.attr('data-analytics-label')&.split('|')&.first&.strip
163
+ article_body[:videos][id][:url] = "https://volume.vox-cdn.com/embed/#{ first_element.attr('data-volume-uuid') }"
164
+ else
165
+ article_body[:videos][id] = {}
166
+ article_body[:videos][id][:url] = first_element.at('iframe').attr('src')
167
+ end
168
+
169
+ article_body[:body] << first_element.replace("{{video:#{ id }}}").to_html
145
170
  end
146
171
 
147
172
  article_container.children.each do |node|
@@ -161,12 +186,12 @@ module GDNewsScraper::Scrapers
161
186
  iframe = node.at('iframe')
162
187
 
163
188
  if iframe # YouTube videos
164
- iframe_id = unique_id
189
+ id = unique_id(iframe)
165
190
 
166
- article_body[:videos][iframe_id] = {}
167
- article_body[:videos][iframe_id][:url] = iframe.attr('src')
191
+ article_body[:videos][id] = {}
192
+ article_body[:videos][id][:url] = iframe.attr('src')
168
193
 
169
- article_body[:body] << iframe.replace("{{video:#{ iframe_id }}}").to_html
194
+ article_body[:body] << iframe.replace("{{video:#{ id }}}").to_html
170
195
  end
171
196
 
172
197
  # Check to see if the Article has a video by Polygon, which is
@@ -175,7 +200,7 @@ module GDNewsScraper::Scrapers
175
200
  polygon_video = node.attributes['data-volume-uuid']
176
201
 
177
202
  unless polygon_video.nil?
178
- id = unique_id
203
+ id = unique_id(polygon_video)
179
204
 
180
205
  article_body[:videos][id] = {}
181
206
  article_body[:videos][id][:label] = node.attr('data-analytics-label').split('|').first.strip
@@ -191,18 +216,18 @@ module GDNewsScraper::Scrapers
191
216
  if gallery
192
217
  gallery_container = gallery.at('.c-image-gallery__thumbs-viewport')
193
218
 
194
- gallery_id = unique_id
195
- article_body[:galleries][gallery_id] = []
219
+ id = unique_id(gallery)
220
+ article_body[:galleries][id] = []
196
221
 
197
222
  gallery_container.children.children.each do |image_container|
198
223
  image = image_container.at('a')
199
224
 
200
225
  if image
201
- article_body[:galleries][gallery_id] << image.attr('href')
226
+ article_body[:galleries][id] << image.attr('href')
202
227
  end
203
228
  end
204
229
 
205
- article_body[:body] << gallery.replace("{{gallery:#{ gallery_id }}}").to_html
230
+ article_body[:body] << gallery.replace("{{gallery:#{ id }}}").to_html
206
231
  end
207
232
 
208
233
  twitdget = node.at('.twitter-tweet')
@@ -230,32 +255,23 @@ module GDNewsScraper::Scrapers
230
255
  if figure
231
256
  node.css('.e-image__image').each do |image|
232
257
  image_url = image.attr('data-original')
258
+
259
+ id = unique_id(node)
260
+
261
+ article_body[:figures][id] = { }
262
+ article_body[:figures][id][:image] = image_url
233
263
 
234
- if image_url.split('.').last == 'gif'
235
- id = unique_id
236
-
237
- article_body[:images][id] = { }
238
- article_body[:images][id][:url] = image_url
239
-
240
- article_body[:body] << node.replace("{{image:#{ id }}}").to_html
241
- else
242
- id = unique_id
243
-
244
- article_body[:figures][id] = { }
245
-
246
- article_body[:figures][id][:image] = image_url
247
- article_body[:figures][id][:title] = image.at('img').attr('title')
248
- article_body[:figures][id][:alt] = image.at('img').attr('alt')
264
+ article_body[:figures][id][:title] = image.at('img')&.attr('title')
265
+ article_body[:figures][id][:alt] = image.at('img')&.attr('alt')
249
266
 
250
- image_meta = node.at('.e-image__meta')
267
+ image_meta = node.at('.e-image__meta')
251
268
 
252
- unless image_meta.nil?
253
- article_body[:figures][id][:caption] = strip(image_meta.at('figcaption'))
254
- article_body[:figures][id][:cite] = strip(image_meta.at('cite'))
255
- end
256
-
257
- article_body[:body] << node.replace("{{figure:#{ id }}}").to_html
269
+ unless image_meta.nil?
270
+ article_body[:figures][id][:caption] = strip(image_meta.at('figcaption'))
271
+ article_body[:figures][id][:cite] = strip(image_meta.at('cite'))
258
272
  end
273
+
274
+ article_body[:body] << node.replace("{{figure:#{ id }}}").to_html
259
275
  end
260
276
 
261
277
  node.traverse { |children| children.remove }
@@ -275,7 +291,7 @@ module GDNewsScraper::Scrapers
275
291
  node.children.each do |inner_node|
276
292
  case inner_node.name
277
293
  when 'a'
278
- id = unique_id
294
+ id = unique_id(inner_node)
279
295
 
280
296
  article_body[:anchors][id] = {
281
297
  text: inner_node.children.text,
@@ -312,11 +328,7 @@ module GDNewsScraper::Scrapers
312
328
 
313
329
  return article_body
314
330
  rescue => e
315
- "There was a problem while parsing this Article: #{ e }"
316
- end
317
-
318
- def figure(article_body, id, node, image, image_url)
319
-
331
+ "There was a problem while parsing this Article's body: #{ e }"
320
332
  end
321
333
 
322
334
  private
@@ -329,8 +341,12 @@ module GDNewsScraper::Scrapers
329
341
  string&.text&.strip
330
342
  end
331
343
 
332
- def unique_id
333
- (0...50).map { (65 + rand(25)).chr }.join.to_sym
344
+ def unique_id(node)
345
+ Base64.strict_encode64(node.to_s)
346
+ .reverse
347
+ .gsub(/[^0-9A-Za-z]/, '')[0..100]
348
+ .downcase
349
+ .to_sym
334
350
  end
335
351
  end # News
336
352
  end # PolygonCOM
@@ -1,5 +1,5 @@
1
1
  module GDNewsScraper
2
- VERSION ||= '3.0.7'
2
+ VERSION ||= '3.0.9'
3
3
 
4
4
  # => major: A new Source has been added or removed
5
5
  # => minor: A Source code has changed drastically to a point where it's not
@@ -37,5 +37,27 @@ module GDNewsScraper
37
37
  # about 10% on average! :)
38
38
  # v3.0.7 - Changed the way figures are added to the articles which takes in
39
39
  # consideration deeply nested figures as well
40
- #
40
+ # v3.0.8 - Removed the refresh method since you can parse an article by
41
+ # passing its URL
42
+ # v3.0.9
43
+ #
44
+ # - Generate truly unique strings using Base64.strict_encode64. This stops Rails
45
+ # thinking the Article has changed even though it's the same. Previous
46
+ # unique_id method would regenerate the id every time the Article is
47
+ # requested
48
+ # - Identify whether or not the Article is a video when indexing the Article
49
+ # page rather than doing it when scraping the Articles page
50
+ # - Only account for the Video that's inside the Article, not any other video
51
+ # that might be on the page using 'node.at()' instead of 'node.css()' which
52
+ # returns only the first match
53
+ # - Remove the 'is_a_video' argument when parsing the body since we're doing
54
+ # the check there from now on.
55
+ # Some articles that are of type video don't necessarily show as such when
56
+ # viewing them on the Articles page
57
+ # - Change the paramater that is passed to 'parse_article_body'. When we're
58
+ # parsing an article from a URL we don't need to re-request the page with
59
+ # Nokogiri
60
+ # - Increased the size of the unique_id to 100.
61
+ # Images have the same url prefix, where only the image name is different, as
62
+ # such, the unique_id was not that unique anymore..
41
63
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: GDNewsScraper
3
3
  version: !ruby/object:Gem::Version
4
- version: 3.0.7
4
+ version: 3.0.9
5
5
  platform: ruby
6
6
  authors:
7
7
  - Vlad Radulescu