GDNewsScraper 3.0.7 → 3.0.9
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/GDNewsScraper/scrapers/polygon_com/news.rb +78 -62
- data/lib/GDNewsScraper/version.rb +24 -2
- metadata +1 -1
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA1:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 4930eeeb0c78d881acc7a9fad32bac5e55d3743e
|
|
4
|
+
data.tar.gz: 11be82b0c24cd044485a71b863088892a1cb99ad
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 8d1617126dfdbc603d5328c4027b45a63fe76a22618188a966f873b5e3566cf4894c8b2987d89880665e8e969c21eda2bf4c0daf3a4635894350241801801f7e
|
|
7
|
+
data.tar.gz: 1361f266a1816e9c588c2c68f8230d29476c7a41ef3820732eec23e8b91dfa625f884937184090297c7e7f61622e9053c449c0a7e31394644729c5f0e9400711
|
|
@@ -1,3 +1,4 @@
|
|
|
1
|
+
require 'pry'
|
|
1
2
|
require 'base64'
|
|
2
3
|
require 'json'
|
|
3
4
|
|
|
@@ -47,10 +48,6 @@ module GDNewsScraper::Scrapers
|
|
|
47
48
|
end
|
|
48
49
|
end
|
|
49
50
|
|
|
50
|
-
def refresh(article_url)
|
|
51
|
-
parse_article_body(article_url)
|
|
52
|
-
end
|
|
53
|
-
|
|
54
51
|
def parse(article)
|
|
55
52
|
pulse = Hash.new
|
|
56
53
|
|
|
@@ -65,7 +62,8 @@ module GDNewsScraper::Scrapers
|
|
|
65
62
|
begin
|
|
66
63
|
article_page = Nokogiri::HTML(open(article, GDNewsScraper::Scrapers::PolygonCOM::HEADERS))
|
|
67
64
|
|
|
68
|
-
|
|
65
|
+
first_element = article_page.at('.l-col__main').elements.first
|
|
66
|
+
is_a_video = first_element.attr('class') == 'c-video-embed' || first_element.attr('class') == 'c-video-embed--media'
|
|
69
67
|
|
|
70
68
|
key = article_page.at('span[data-content-admin-id]').attr('data-content-admin-id').to_i
|
|
71
69
|
url = article
|
|
@@ -92,14 +90,20 @@ module GDNewsScraper::Scrapers
|
|
|
92
90
|
raise ArgumentError.new('Invalid URL')
|
|
93
91
|
end
|
|
94
92
|
elsif article.is_a?(Nokogiri::XML::Element)
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
93
|
+
article_container = article.at('.c-entry-box--compact--article')
|
|
94
|
+
|
|
95
|
+
if article_container.nil?
|
|
96
|
+
raise StandardError.new('Not an Article, skipping..')
|
|
97
|
+
else
|
|
98
|
+
key = article.at('.c-entry-box--compact--article').attr('data-chorus-optimize-id').to_i
|
|
99
|
+
url = article.at('.c-entry-box--compact__title').at('> a').attr('href')
|
|
100
|
+
title = strip(article.at('.c-entry-box--compact__title'))
|
|
101
|
+
cover = (article.at('.c-entry-box--compact__image').at('noscript').at('img').attr('src') rescue nil)
|
|
102
|
+
author = strip(article.at('.c-byline').css('.c-byline__item > a').children[0])
|
|
103
|
+
date = JSON.parse(article.at('.c-byline').attr('data-cdata'))['timestamp'].to_i
|
|
104
|
+
|
|
105
|
+
article_page = url
|
|
106
|
+
end
|
|
103
107
|
else
|
|
104
108
|
raise ArgumentError.new("Make sure the 'article' argument is either a Hash containing the article's initial metadata or a String which is the article's URL")
|
|
105
109
|
end
|
|
@@ -111,22 +115,29 @@ module GDNewsScraper::Scrapers
|
|
|
111
115
|
pulse[:title] = title
|
|
112
116
|
pulse[:author] = author
|
|
113
117
|
pulse[:date] = date
|
|
114
|
-
pulse[:content] = parse_article_body(
|
|
118
|
+
pulse[:content] = parse_article_body(article_page)
|
|
115
119
|
pulse[:tags] = title.downcase.split
|
|
116
120
|
|
|
117
121
|
return pulse
|
|
118
122
|
rescue => e
|
|
119
|
-
|
|
123
|
+
{
|
|
124
|
+
success: false,
|
|
125
|
+
message: "There was a problem while parsing this Article: #{ e }"
|
|
126
|
+
}
|
|
120
127
|
end
|
|
121
128
|
|
|
122
|
-
def parse_article_body(
|
|
123
|
-
|
|
129
|
+
def parse_article_body(article)
|
|
130
|
+
if article.is_a?(String)
|
|
131
|
+
article_page = Nokogiri::HTML(open(article, GDNewsScraper::Scrapers::PolygonCOM::HEADERS))
|
|
132
|
+
else
|
|
133
|
+
article_page = article
|
|
134
|
+
end
|
|
135
|
+
|
|
124
136
|
article_container = article_page.at('.c-entry-content')
|
|
125
137
|
|
|
126
138
|
article_body = {
|
|
127
139
|
galleries: { },
|
|
128
140
|
videos: { },
|
|
129
|
-
images: { },
|
|
130
141
|
|
|
131
142
|
anchors: { },
|
|
132
143
|
figures: { },
|
|
@@ -134,14 +145,28 @@ module GDNewsScraper::Scrapers
|
|
|
134
145
|
body: [ ]
|
|
135
146
|
}
|
|
136
147
|
|
|
148
|
+
# Check here as well since an Article CAN have an embedded video instead
|
|
149
|
+
# of a Cover and still show as a non-video article on the News page from
|
|
150
|
+
# where we initially took the 'is_a_video' check
|
|
151
|
+
#
|
|
152
|
+
first_element = article_page.at('.l-col__main').elements.first
|
|
153
|
+
is_a_video = first_element.attr('class') == 'c-video-embed' || first_element.attr('class') == 'c-video-embed--media'
|
|
154
|
+
|
|
137
155
|
if is_a_video
|
|
138
|
-
|
|
139
|
-
iframe_id = unique_id
|
|
156
|
+
id = unique_id(first_element)
|
|
140
157
|
|
|
141
|
-
|
|
142
|
-
article_body[:videos][iframe_id][:url] = iframe.attr('src')
|
|
158
|
+
is_polygon_video = !first_element.attributes['data-volume-uuid'].nil?
|
|
143
159
|
|
|
144
|
-
|
|
160
|
+
if is_polygon_video
|
|
161
|
+
article_body[:videos][id] = {}
|
|
162
|
+
article_body[:videos][id][:label] = first_element.attr('data-analytics-label')&.split('|')&.first&.strip
|
|
163
|
+
article_body[:videos][id][:url] = "https://volume.vox-cdn.com/embed/#{ first_element.attr('data-volume-uuid') }"
|
|
164
|
+
else
|
|
165
|
+
article_body[:videos][id] = {}
|
|
166
|
+
article_body[:videos][id][:url] = first_element.at('iframe').attr('src')
|
|
167
|
+
end
|
|
168
|
+
|
|
169
|
+
article_body[:body] << first_element.replace("{{video:#{ id }}}").to_html
|
|
145
170
|
end
|
|
146
171
|
|
|
147
172
|
article_container.children.each do |node|
|
|
@@ -161,12 +186,12 @@ module GDNewsScraper::Scrapers
|
|
|
161
186
|
iframe = node.at('iframe')
|
|
162
187
|
|
|
163
188
|
if iframe # YouTube videos
|
|
164
|
-
|
|
189
|
+
id = unique_id(iframe)
|
|
165
190
|
|
|
166
|
-
article_body[:videos][
|
|
167
|
-
article_body[:videos][
|
|
191
|
+
article_body[:videos][id] = {}
|
|
192
|
+
article_body[:videos][id][:url] = iframe.attr('src')
|
|
168
193
|
|
|
169
|
-
article_body[:body] << iframe.replace("{{video:#{
|
|
194
|
+
article_body[:body] << iframe.replace("{{video:#{ id }}}").to_html
|
|
170
195
|
end
|
|
171
196
|
|
|
172
197
|
# Check to see if the Article has a video by Polygon, which is
|
|
@@ -175,7 +200,7 @@ module GDNewsScraper::Scrapers
|
|
|
175
200
|
polygon_video = node.attributes['data-volume-uuid']
|
|
176
201
|
|
|
177
202
|
unless polygon_video.nil?
|
|
178
|
-
id = unique_id
|
|
203
|
+
id = unique_id(polygon_video)
|
|
179
204
|
|
|
180
205
|
article_body[:videos][id] = {}
|
|
181
206
|
article_body[:videos][id][:label] = node.attr('data-analytics-label').split('|').first.strip
|
|
@@ -191,18 +216,18 @@ module GDNewsScraper::Scrapers
|
|
|
191
216
|
if gallery
|
|
192
217
|
gallery_container = gallery.at('.c-image-gallery__thumbs-viewport')
|
|
193
218
|
|
|
194
|
-
|
|
195
|
-
article_body[:galleries][
|
|
219
|
+
id = unique_id(gallery)
|
|
220
|
+
article_body[:galleries][id] = []
|
|
196
221
|
|
|
197
222
|
gallery_container.children.children.each do |image_container|
|
|
198
223
|
image = image_container.at('a')
|
|
199
224
|
|
|
200
225
|
if image
|
|
201
|
-
article_body[:galleries][
|
|
226
|
+
article_body[:galleries][id] << image.attr('href')
|
|
202
227
|
end
|
|
203
228
|
end
|
|
204
229
|
|
|
205
|
-
article_body[:body] << gallery.replace("{{gallery:#{
|
|
230
|
+
article_body[:body] << gallery.replace("{{gallery:#{ id }}}").to_html
|
|
206
231
|
end
|
|
207
232
|
|
|
208
233
|
twitdget = node.at('.twitter-tweet')
|
|
@@ -230,32 +255,23 @@ module GDNewsScraper::Scrapers
|
|
|
230
255
|
if figure
|
|
231
256
|
node.css('.e-image__image').each do |image|
|
|
232
257
|
image_url = image.attr('data-original')
|
|
258
|
+
|
|
259
|
+
id = unique_id(node)
|
|
260
|
+
|
|
261
|
+
article_body[:figures][id] = { }
|
|
262
|
+
article_body[:figures][id][:image] = image_url
|
|
233
263
|
|
|
234
|
-
|
|
235
|
-
|
|
236
|
-
|
|
237
|
-
article_body[:images][id] = { }
|
|
238
|
-
article_body[:images][id][:url] = image_url
|
|
239
|
-
|
|
240
|
-
article_body[:body] << node.replace("{{image:#{ id }}}").to_html
|
|
241
|
-
else
|
|
242
|
-
id = unique_id
|
|
243
|
-
|
|
244
|
-
article_body[:figures][id] = { }
|
|
245
|
-
|
|
246
|
-
article_body[:figures][id][:image] = image_url
|
|
247
|
-
article_body[:figures][id][:title] = image.at('img').attr('title')
|
|
248
|
-
article_body[:figures][id][:alt] = image.at('img').attr('alt')
|
|
264
|
+
article_body[:figures][id][:title] = image.at('img')&.attr('title')
|
|
265
|
+
article_body[:figures][id][:alt] = image.at('img')&.attr('alt')
|
|
249
266
|
|
|
250
|
-
|
|
267
|
+
image_meta = node.at('.e-image__meta')
|
|
251
268
|
|
|
252
|
-
|
|
253
|
-
|
|
254
|
-
|
|
255
|
-
end
|
|
256
|
-
|
|
257
|
-
article_body[:body] << node.replace("{{figure:#{ id }}}").to_html
|
|
269
|
+
unless image_meta.nil?
|
|
270
|
+
article_body[:figures][id][:caption] = strip(image_meta.at('figcaption'))
|
|
271
|
+
article_body[:figures][id][:cite] = strip(image_meta.at('cite'))
|
|
258
272
|
end
|
|
273
|
+
|
|
274
|
+
article_body[:body] << node.replace("{{figure:#{ id }}}").to_html
|
|
259
275
|
end
|
|
260
276
|
|
|
261
277
|
node.traverse { |children| children.remove }
|
|
@@ -275,7 +291,7 @@ module GDNewsScraper::Scrapers
|
|
|
275
291
|
node.children.each do |inner_node|
|
|
276
292
|
case inner_node.name
|
|
277
293
|
when 'a'
|
|
278
|
-
id = unique_id
|
|
294
|
+
id = unique_id(inner_node)
|
|
279
295
|
|
|
280
296
|
article_body[:anchors][id] = {
|
|
281
297
|
text: inner_node.children.text,
|
|
@@ -312,11 +328,7 @@ module GDNewsScraper::Scrapers
|
|
|
312
328
|
|
|
313
329
|
return article_body
|
|
314
330
|
rescue => e
|
|
315
|
-
"There was a problem while parsing this Article: #{ e }"
|
|
316
|
-
end
|
|
317
|
-
|
|
318
|
-
def figure(article_body, id, node, image, image_url)
|
|
319
|
-
|
|
331
|
+
"There was a problem while parsing this Article's body: #{ e }"
|
|
320
332
|
end
|
|
321
333
|
|
|
322
334
|
private
|
|
@@ -329,8 +341,12 @@ module GDNewsScraper::Scrapers
|
|
|
329
341
|
string&.text&.strip
|
|
330
342
|
end
|
|
331
343
|
|
|
332
|
-
def unique_id
|
|
333
|
-
(
|
|
344
|
+
def unique_id(node)
|
|
345
|
+
Base64.strict_encode64(node.to_s)
|
|
346
|
+
.reverse
|
|
347
|
+
.gsub(/[^0-9A-Za-z]/, '')[0..100]
|
|
348
|
+
.downcase
|
|
349
|
+
.to_sym
|
|
334
350
|
end
|
|
335
351
|
end # News
|
|
336
352
|
end # PolygonCOM
|
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
module GDNewsScraper
|
|
2
|
-
VERSION ||= '3.0.
|
|
2
|
+
VERSION ||= '3.0.9'
|
|
3
3
|
|
|
4
4
|
# => major: A new Source has been added or removed
|
|
5
5
|
# => minor: A Source code has changed drastically to a point where it's not
|
|
@@ -37,5 +37,27 @@ module GDNewsScraper
|
|
|
37
37
|
# about 10% on average! :)
|
|
38
38
|
# v3.0.7 - Changed the way figures are added to the articles which takes in
|
|
39
39
|
# consideration deeply nested figures as well
|
|
40
|
-
#
|
|
40
|
+
# v3.0.8 - Removed the refresh method since you can parse an article by
|
|
41
|
+
# passing its URL
|
|
42
|
+
# v3.0.9
|
|
43
|
+
#
|
|
44
|
+
# - Generate truly unique strings using Base64.strict_encode64. This stops Rails
|
|
45
|
+
# thinking the Article has been changed even though it's the same. Previous
|
|
46
|
+
# unique_id method would regenerate the id every time the Article is
|
|
47
|
+
# requested
|
|
48
|
+
# - Identify whether or not the Article is a video when indexing the Article
|
|
49
|
+
# page rather than doing it when scraping the Articles page
|
|
50
|
+
# - Only account for the Video that's inside the Article, not any other video
|
|
51
|
+
# that might be on the page using 'node.at()' instead of 'node.css()' which
|
|
52
|
+
# returns only the first match
|
|
53
|
+
# - Remove the 'is_a_video' argument when parsing the body since we're doing
|
|
54
|
+
# the check there from now on.
|
|
55
|
+
# Some articles that are of type video don't necessarily show as such when
|
|
56
|
+
# viewing them on the Articles page
|
|
57
|
+
# - Change the parameter that is passed to 'parse_article_body'. When we're
|
|
58
|
+
# parsing an article from a URL we don't need to re-request the page with
|
|
59
|
+
# Nokogiri
|
|
60
|
+
# - Increased the size of the unique_id to 100.
|
|
61
|
+
# Images have the same url prefix, where only the image name is different, as
|
|
62
|
+
# such, the unique_id was not that unique anymore..
|
|
41
63
|
end
|