GDNewsScraper 3.0.7 → 3.0.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 0ab7ee6a0e7e30c64e6ece2a156b9c94756c00a6
4
- data.tar.gz: 30887530f6a43130209a98402fb0c32b647a5f77
3
+ metadata.gz: 4930eeeb0c78d881acc7a9fad32bac5e55d3743e
4
+ data.tar.gz: 11be82b0c24cd044485a71b863088892a1cb99ad
5
5
  SHA512:
6
- metadata.gz: cefe85f767614e30d79caa71c50c43eb466e6b52359755ab0dd84491682c42fbff25ac362c181eb394ddb0590e9717fdf4b7b6d611f273caaf25595f584894a1
7
- data.tar.gz: 35fec37b2669653852e1dd3dcd39d95bb0d5233b3d4bee92bb5004731bf9062a2ac3285ca35529350f3694cfad50c4c2502c470622ee4c5321779f4f6b5db7ae
6
+ metadata.gz: 8d1617126dfdbc603d5328c4027b45a63fe76a22618188a966f873b5e3566cf4894c8b2987d89880665e8e969c21eda2bf4c0daf3a4635894350241801801f7e
7
+ data.tar.gz: 1361f266a1816e9c588c2c68f8230d29476c7a41ef3820732eec23e8b91dfa625f884937184090297c7e7f61622e9053c449c0a7e31394644729c5f0e9400711
@@ -1,3 +1,4 @@
1
+ require 'pry'
1
2
  require 'base64'
2
3
  require 'json'
3
4
 
@@ -47,10 +48,6 @@ module GDNewsScraper::Scrapers
47
48
  end
48
49
  end
49
50
 
50
- def refresh(article_url)
51
- parse_article_body(article_url)
52
- end
53
-
54
51
  def parse(article)
55
52
  pulse = Hash.new
56
53
 
@@ -65,7 +62,8 @@ module GDNewsScraper::Scrapers
65
62
  begin
66
63
  article_page = Nokogiri::HTML(open(article, GDNewsScraper::Scrapers::PolygonCOM::HEADERS))
67
64
 
68
- is_a_video = article_page.at('.c-video-embed').nil?
65
+ first_element = article_page.at('.l-col__main').elements.first
66
+ is_a_video = first_element.attr('class') == 'c-video-embed' || first_element.attr('class') == 'c-video-embed--media'
69
67
 
70
68
  key = article_page.at('span[data-content-admin-id]').attr('data-content-admin-id').to_i
71
69
  url = article
@@ -92,14 +90,20 @@ module GDNewsScraper::Scrapers
92
90
  raise ArgumentError.new('Invalid URL')
93
91
  end
94
92
  elsif article.is_a?(Nokogiri::XML::Element)
95
- is_a_video = !article.at('.c-entry-box--compact--video').nil?
96
-
97
- key = article.at('.c-entry-box--compact--article').attr('data-chorus-optimize-id').to_i
98
- url = article.at('.c-entry-box--compact__title').at('> a').attr('href')
99
- title = strip(article.at('.c-entry-box--compact__title'))
100
- cover = (article.at('.c-entry-box--compact__image').at('noscript').at('img').attr('src') rescue nil)
101
- author = strip(article.at('.c-byline').css('.c-byline__item > a').children[0])
102
- date = JSON.parse(article.at('.c-byline').attr('data-cdata'))['timestamp'].to_i
93
+ article_container = article.at('.c-entry-box--compact--article')
94
+
95
+ if article_container.nil?
96
+ raise StandardError.new('Not an Article, skipping..')
97
+ else
98
+ key = article.at('.c-entry-box--compact--article').attr('data-chorus-optimize-id').to_i
99
+ url = article.at('.c-entry-box--compact__title').at('> a').attr('href')
100
+ title = strip(article.at('.c-entry-box--compact__title'))
101
+ cover = (article.at('.c-entry-box--compact__image').at('noscript').at('img').attr('src') rescue nil)
102
+ author = strip(article.at('.c-byline').css('.c-byline__item > a').children[0])
103
+ date = JSON.parse(article.at('.c-byline').attr('data-cdata'))['timestamp'].to_i
104
+
105
+ article_page = url
106
+ end
103
107
  else
104
108
  raise ArgumentError.new("Make sure the 'article' argument is either a Hash containing the article's initial metadata or a String which is the article's URL")
105
109
  end
@@ -111,22 +115,29 @@ module GDNewsScraper::Scrapers
111
115
  pulse[:title] = title
112
116
  pulse[:author] = author
113
117
  pulse[:date] = date
114
- pulse[:content] = parse_article_body(url, is_a_video)
118
+ pulse[:content] = parse_article_body(article_page)
115
119
  pulse[:tags] = title.downcase.split
116
120
 
117
121
  return pulse
118
122
  rescue => e
119
- "There was a problem while parsing this Article: #{ e }"
123
+ {
124
+ success: false,
125
+ message: "There was a problem while parsing this Article: #{ e }"
126
+ }
120
127
  end
121
128
 
122
- def parse_article_body(article_url, is_a_video = false)
123
- article_page = Nokogiri::HTML(open(article_url, GDNewsScraper::Scrapers::PolygonCOM::HEADERS))
129
+ def parse_article_body(article)
130
+ if article.is_a?(String)
131
+ article_page = Nokogiri::HTML(open(article, GDNewsScraper::Scrapers::PolygonCOM::HEADERS))
132
+ else
133
+ article_page = article
134
+ end
135
+
124
136
  article_container = article_page.at('.c-entry-content')
125
137
 
126
138
  article_body = {
127
139
  galleries: { },
128
140
  videos: { },
129
- images: { },
130
141
 
131
142
  anchors: { },
132
143
  figures: { },
@@ -134,14 +145,28 @@ module GDNewsScraper::Scrapers
134
145
  body: [ ]
135
146
  }
136
147
 
148
+ # Check here as well since an Article CAN have an embedded video instead
149
+ # of a Cover and still show as a non-video article on the News page from
150
+ # where we initially took the 'is_a_video' check
151
+ #
152
+ first_element = article_page.at('.l-col__main').elements.first
153
+ is_a_video = first_element.attr('class') == 'c-video-embed' || first_element.attr('class') == 'c-video-embed--media'
154
+
137
155
  if is_a_video
138
- iframe = article_page.at('.c-video-embed--media').at('iframe')
139
- iframe_id = unique_id
156
+ id = unique_id(first_element)
140
157
 
141
- article_body[:videos][iframe_id] = {}
142
- article_body[:videos][iframe_id][:url] = iframe.attr('src')
158
+ is_polygon_video = !first_element.attributes['data-volume-uuid'].nil?
143
159
 
144
- article_body[:body] << iframe.replace("{{video:#{ iframe_id }}}").to_html
160
+ if is_polygon_video
161
+ article_body[:videos][id] = {}
162
+ article_body[:videos][id][:label] = first_element.attr('data-analytics-label')&.split('|')&.first&.strip
163
+ article_body[:videos][id][:url] = "https://volume.vox-cdn.com/embed/#{ first_element.attr('data-volume-uuid') }"
164
+ else
165
+ article_body[:videos][id] = {}
166
+ article_body[:videos][id][:url] = first_element.at('iframe').attr('src')
167
+ end
168
+
169
+ article_body[:body] << first_element.replace("{{video:#{ id }}}").to_html
145
170
  end
146
171
 
147
172
  article_container.children.each do |node|
@@ -161,12 +186,12 @@ module GDNewsScraper::Scrapers
161
186
  iframe = node.at('iframe')
162
187
 
163
188
  if iframe # YouTube videos
164
- iframe_id = unique_id
189
+ id = unique_id(iframe)
165
190
 
166
- article_body[:videos][iframe_id] = {}
167
- article_body[:videos][iframe_id][:url] = iframe.attr('src')
191
+ article_body[:videos][id] = {}
192
+ article_body[:videos][id][:url] = iframe.attr('src')
168
193
 
169
- article_body[:body] << iframe.replace("{{video:#{ iframe_id }}}").to_html
194
+ article_body[:body] << iframe.replace("{{video:#{ id }}}").to_html
170
195
  end
171
196
 
172
197
  # Check to see if the Article has a video by Polygon, which is
@@ -175,7 +200,7 @@ module GDNewsScraper::Scrapers
175
200
  polygon_video = node.attributes['data-volume-uuid']
176
201
 
177
202
  unless polygon_video.nil?
178
- id = unique_id
203
+ id = unique_id(polygon_video)
179
204
 
180
205
  article_body[:videos][id] = {}
181
206
  article_body[:videos][id][:label] = node.attr('data-analytics-label').split('|').first.strip
@@ -191,18 +216,18 @@ module GDNewsScraper::Scrapers
191
216
  if gallery
192
217
  gallery_container = gallery.at('.c-image-gallery__thumbs-viewport')
193
218
 
194
- gallery_id = unique_id
195
- article_body[:galleries][gallery_id] = []
219
+ id = unique_id(gallery)
220
+ article_body[:galleries][id] = []
196
221
 
197
222
  gallery_container.children.children.each do |image_container|
198
223
  image = image_container.at('a')
199
224
 
200
225
  if image
201
- article_body[:galleries][gallery_id] << image.attr('href')
226
+ article_body[:galleries][id] << image.attr('href')
202
227
  end
203
228
  end
204
229
 
205
- article_body[:body] << gallery.replace("{{gallery:#{ gallery_id }}}").to_html
230
+ article_body[:body] << gallery.replace("{{gallery:#{ id }}}").to_html
206
231
  end
207
232
 
208
233
  twitdget = node.at('.twitter-tweet')
@@ -230,32 +255,23 @@ module GDNewsScraper::Scrapers
230
255
  if figure
231
256
  node.css('.e-image__image').each do |image|
232
257
  image_url = image.attr('data-original')
258
+
259
+ id = unique_id(node)
260
+
261
+ article_body[:figures][id] = { }
262
+ article_body[:figures][id][:image] = image_url
233
263
 
234
- if image_url.split('.').last == 'gif'
235
- id = unique_id
236
-
237
- article_body[:images][id] = { }
238
- article_body[:images][id][:url] = image_url
239
-
240
- article_body[:body] << node.replace("{{image:#{ id }}}").to_html
241
- else
242
- id = unique_id
243
-
244
- article_body[:figures][id] = { }
245
-
246
- article_body[:figures][id][:image] = image_url
247
- article_body[:figures][id][:title] = image.at('img').attr('title')
248
- article_body[:figures][id][:alt] = image.at('img').attr('alt')
264
+ article_body[:figures][id][:title] = image.at('img')&.attr('title')
265
+ article_body[:figures][id][:alt] = image.at('img')&.attr('alt')
249
266
 
250
- image_meta = node.at('.e-image__meta')
267
+ image_meta = node.at('.e-image__meta')
251
268
 
252
- unless image_meta.nil?
253
- article_body[:figures][id][:caption] = strip(image_meta.at('figcaption'))
254
- article_body[:figures][id][:cite] = strip(image_meta.at('cite'))
255
- end
256
-
257
- article_body[:body] << node.replace("{{figure:#{ id }}}").to_html
269
+ unless image_meta.nil?
270
+ article_body[:figures][id][:caption] = strip(image_meta.at('figcaption'))
271
+ article_body[:figures][id][:cite] = strip(image_meta.at('cite'))
258
272
  end
273
+
274
+ article_body[:body] << node.replace("{{figure:#{ id }}}").to_html
259
275
  end
260
276
 
261
277
  node.traverse { |children| children.remove }
@@ -275,7 +291,7 @@ module GDNewsScraper::Scrapers
275
291
  node.children.each do |inner_node|
276
292
  case inner_node.name
277
293
  when 'a'
278
- id = unique_id
294
+ id = unique_id(inner_node)
279
295
 
280
296
  article_body[:anchors][id] = {
281
297
  text: inner_node.children.text,
@@ -312,11 +328,7 @@ module GDNewsScraper::Scrapers
312
328
 
313
329
  return article_body
314
330
  rescue => e
315
- "There was a problem while parsing this Article: #{ e }"
316
- end
317
-
318
- def figure(article_body, id, node, image, image_url)
319
-
331
+ "There was a problem while parsing this Article's body: #{ e }"
320
332
  end
321
333
 
322
334
  private
@@ -329,8 +341,12 @@ module GDNewsScraper::Scrapers
329
341
  string&.text&.strip
330
342
  end
331
343
 
332
- def unique_id
333
- (0...50).map { (65 + rand(25)).chr }.join.to_sym
344
+ def unique_id(node)
345
+ Base64.strict_encode64(node.to_s)
346
+ .reverse
347
+ .gsub(/[^0-9A-Za-z]/, '')[0..100]
348
+ .downcase
349
+ .to_sym
334
350
  end
335
351
  end # News
336
352
  end # PolygonCOM
@@ -1,5 +1,5 @@
1
1
  module GDNewsScraper
2
- VERSION ||= '3.0.7'
2
+ VERSION ||= '3.0.9'
3
3
 
4
4
  # => major: A new Source has been added or removed
5
5
  # => minor: A Source code has changed drastically to a point where it's not
@@ -37,5 +37,27 @@ module GDNewsScraper
37
37
  # about 10% on average! :)
38
38
  # v3.0.7 - Changed the way figures are added to the articles which takes in
39
39
  # consideration deeply nested figures as well
40
- #
40
+ # v3.0.8 - Removed the refresh method since you can parse an article by
41
+ # passing its URL
42
+ # v3.0.9
43
+ #
44
+ # - Generate truly unique strings using Base64.strict_encode64. This stops Rails
45
+ # thinking the Article has been changed even though it's the same. Previous
46
+ # unique_id method would regenerate the id every time the Article is
47
+ # requested
48
+ # - Identify whether or not the Article is a video when indexing the Article
49
+ # page rather than doing it when scraping the Articles page
50
+ # - Only account for the Video that's inside the Article, not any other video
51
+ # that might be on the page using 'node.at()' instead of 'node.css()' which
52
+ # returns only the first match
53
+ # - Remove the 'is_a_video' argument when parsing the body since we're doing
54
+ # the check there from now on.
55
+ # Some articles that are of type video don't necessarily show as such when
56
+ # viewing them on the Articles page
57
+ # - Change the parameter that is passed to 'parse_article_body'. When we're
58
+ # parsing an article from a URL we don't need to re-request the page with
59
+ # Nokogiri
60
+ # - Increased the size of the unique_id to 100.
61
+ # Images have the same url prefix, where only the image name is different, as
62
+ # such, the unique_id was not that unique anymore..
41
63
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: GDNewsScraper
3
3
  version: !ruby/object:Gem::Version
4
- version: 3.0.7
4
+ version: 3.0.9
5
5
  platform: ruby
6
6
  authors:
7
7
  - Vlad Radulescu