GDNewsScraper 3.0.7 → 3.0.9
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/GDNewsScraper/scrapers/polygon_com/news.rb +78 -62
- data/lib/GDNewsScraper/version.rb +24 -2
- metadata +1 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 4930eeeb0c78d881acc7a9fad32bac5e55d3743e
|
4
|
+
data.tar.gz: 11be82b0c24cd044485a71b863088892a1cb99ad
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 8d1617126dfdbc603d5328c4027b45a63fe76a22618188a966f873b5e3566cf4894c8b2987d89880665e8e969c21eda2bf4c0daf3a4635894350241801801f7e
|
7
|
+
data.tar.gz: 1361f266a1816e9c588c2c68f8230d29476c7a41ef3820732eec23e8b91dfa625f884937184090297c7e7f61622e9053c449c0a7e31394644729c5f0e9400711
|
@@ -1,3 +1,4 @@
|
|
1
|
+
require 'pry'
|
1
2
|
require 'base64'
|
2
3
|
require 'json'
|
3
4
|
|
@@ -47,10 +48,6 @@ module GDNewsScraper::Scrapers
|
|
47
48
|
end
|
48
49
|
end
|
49
50
|
|
50
|
-
def refresh(article_url)
|
51
|
-
parse_article_body(article_url)
|
52
|
-
end
|
53
|
-
|
54
51
|
def parse(article)
|
55
52
|
pulse = Hash.new
|
56
53
|
|
@@ -65,7 +62,8 @@ module GDNewsScraper::Scrapers
|
|
65
62
|
begin
|
66
63
|
article_page = Nokogiri::HTML(open(article, GDNewsScraper::Scrapers::PolygonCOM::HEADERS))
|
67
64
|
|
68
|
-
|
65
|
+
first_element = article_page.at('.l-col__main').elements.first
|
66
|
+
is_a_video = first_element.attr('class') == 'c-video-embed' || first_element.attr('class') == 'c-video-embed--media'
|
69
67
|
|
70
68
|
key = article_page.at('span[data-content-admin-id]').attr('data-content-admin-id').to_i
|
71
69
|
url = article
|
@@ -92,14 +90,20 @@ module GDNewsScraper::Scrapers
|
|
92
90
|
raise ArgumentError.new('Invalid URL')
|
93
91
|
end
|
94
92
|
elsif article.is_a?(Nokogiri::XML::Element)
|
95
|
-
|
96
|
-
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
|
93
|
+
article_container = article.at('.c-entry-box--compact--article')
|
94
|
+
|
95
|
+
if article_container.nil?
|
96
|
+
raise StandardError.new('Not an Article, skipping..')
|
97
|
+
else
|
98
|
+
key = article.at('.c-entry-box--compact--article').attr('data-chorus-optimize-id').to_i
|
99
|
+
url = article.at('.c-entry-box--compact__title').at('> a').attr('href')
|
100
|
+
title = strip(article.at('.c-entry-box--compact__title'))
|
101
|
+
cover = (article.at('.c-entry-box--compact__image').at('noscript').at('img').attr('src') rescue nil)
|
102
|
+
author = strip(article.at('.c-byline').css('.c-byline__item > a').children[0])
|
103
|
+
date = JSON.parse(article.at('.c-byline').attr('data-cdata'))['timestamp'].to_i
|
104
|
+
|
105
|
+
article_page = url
|
106
|
+
end
|
103
107
|
else
|
104
108
|
raise ArgumentError.new("Make sure the 'article' argument is either a Hash containing the article's initial metadata or a String which is the article's URL")
|
105
109
|
end
|
@@ -111,22 +115,29 @@ module GDNewsScraper::Scrapers
|
|
111
115
|
pulse[:title] = title
|
112
116
|
pulse[:author] = author
|
113
117
|
pulse[:date] = date
|
114
|
-
pulse[:content] = parse_article_body(
|
118
|
+
pulse[:content] = parse_article_body(article_page)
|
115
119
|
pulse[:tags] = title.downcase.split
|
116
120
|
|
117
121
|
return pulse
|
118
122
|
rescue => e
|
119
|
-
|
123
|
+
{
|
124
|
+
success: false,
|
125
|
+
message: "There was a problem while parsing this Article: #{ e }"
|
126
|
+
}
|
120
127
|
end
|
121
128
|
|
122
|
-
def parse_article_body(
|
123
|
-
|
129
|
+
def parse_article_body(article)
|
130
|
+
if article.is_a?(String)
|
131
|
+
article_page = Nokogiri::HTML(open(article, GDNewsScraper::Scrapers::PolygonCOM::HEADERS))
|
132
|
+
else
|
133
|
+
article_page = article
|
134
|
+
end
|
135
|
+
|
124
136
|
article_container = article_page.at('.c-entry-content')
|
125
137
|
|
126
138
|
article_body = {
|
127
139
|
galleries: { },
|
128
140
|
videos: { },
|
129
|
-
images: { },
|
130
141
|
|
131
142
|
anchors: { },
|
132
143
|
figures: { },
|
@@ -134,14 +145,28 @@ module GDNewsScraper::Scrapers
|
|
134
145
|
body: [ ]
|
135
146
|
}
|
136
147
|
|
148
|
+
# Check here as well since an Article CAN have an embedded video instead
|
149
|
+
# of a Cover and still show as a non-video article on the News page from
|
150
|
+
# where we initially took the 'is_a_video' check
|
151
|
+
#
|
152
|
+
first_element = article_page.at('.l-col__main').elements.first
|
153
|
+
is_a_video = first_element.attr('class') == 'c-video-embed' || first_element.attr('class') == 'c-video-embed--media'
|
154
|
+
|
137
155
|
if is_a_video
|
138
|
-
|
139
|
-
iframe_id = unique_id
|
156
|
+
id = unique_id(first_element)
|
140
157
|
|
141
|
-
|
142
|
-
article_body[:videos][iframe_id][:url] = iframe.attr('src')
|
158
|
+
is_polygon_video = !first_element.attributes['data-volume-uuid'].nil?
|
143
159
|
|
144
|
-
|
160
|
+
if is_polygon_video
|
161
|
+
article_body[:videos][id] = {}
|
162
|
+
article_body[:videos][id][:label] = first_element.attr('data-analytics-label')&.split('|')&.first&.strip
|
163
|
+
article_body[:videos][id][:url] = "https://volume.vox-cdn.com/embed/#{ first_element.attr('data-volume-uuid') }"
|
164
|
+
else
|
165
|
+
article_body[:videos][id] = {}
|
166
|
+
article_body[:videos][id][:url] = first_element.at('iframe').attr('src')
|
167
|
+
end
|
168
|
+
|
169
|
+
article_body[:body] << first_element.replace("{{video:#{ id }}}").to_html
|
145
170
|
end
|
146
171
|
|
147
172
|
article_container.children.each do |node|
|
@@ -161,12 +186,12 @@ module GDNewsScraper::Scrapers
|
|
161
186
|
iframe = node.at('iframe')
|
162
187
|
|
163
188
|
if iframe # YouTube videos
|
164
|
-
|
189
|
+
id = unique_id(iframe)
|
165
190
|
|
166
|
-
article_body[:videos][
|
167
|
-
article_body[:videos][
|
191
|
+
article_body[:videos][id] = {}
|
192
|
+
article_body[:videos][id][:url] = iframe.attr('src')
|
168
193
|
|
169
|
-
article_body[:body] << iframe.replace("{{video:#{
|
194
|
+
article_body[:body] << iframe.replace("{{video:#{ id }}}").to_html
|
170
195
|
end
|
171
196
|
|
172
197
|
# Check to see if the Article has a video by Polygon, which is
|
@@ -175,7 +200,7 @@ module GDNewsScraper::Scrapers
|
|
175
200
|
polygon_video = node.attributes['data-volume-uuid']
|
176
201
|
|
177
202
|
unless polygon_video.nil?
|
178
|
-
id = unique_id
|
203
|
+
id = unique_id(polygon_video)
|
179
204
|
|
180
205
|
article_body[:videos][id] = {}
|
181
206
|
article_body[:videos][id][:label] = node.attr('data-analytics-label').split('|').first.strip
|
@@ -191,18 +216,18 @@ module GDNewsScraper::Scrapers
|
|
191
216
|
if gallery
|
192
217
|
gallery_container = gallery.at('.c-image-gallery__thumbs-viewport')
|
193
218
|
|
194
|
-
|
195
|
-
article_body[:galleries][
|
219
|
+
id = unique_id(gallery)
|
220
|
+
article_body[:galleries][id] = []
|
196
221
|
|
197
222
|
gallery_container.children.children.each do |image_container|
|
198
223
|
image = image_container.at('a')
|
199
224
|
|
200
225
|
if image
|
201
|
-
article_body[:galleries][
|
226
|
+
article_body[:galleries][id] << image.attr('href')
|
202
227
|
end
|
203
228
|
end
|
204
229
|
|
205
|
-
article_body[:body] << gallery.replace("{{gallery:#{
|
230
|
+
article_body[:body] << gallery.replace("{{gallery:#{ id }}}").to_html
|
206
231
|
end
|
207
232
|
|
208
233
|
twitdget = node.at('.twitter-tweet')
|
@@ -230,32 +255,23 @@ module GDNewsScraper::Scrapers
|
|
230
255
|
if figure
|
231
256
|
node.css('.e-image__image').each do |image|
|
232
257
|
image_url = image.attr('data-original')
|
258
|
+
|
259
|
+
id = unique_id(node)
|
260
|
+
|
261
|
+
article_body[:figures][id] = { }
|
262
|
+
article_body[:figures][id][:image] = image_url
|
233
263
|
|
234
|
-
|
235
|
-
|
236
|
-
|
237
|
-
article_body[:images][id] = { }
|
238
|
-
article_body[:images][id][:url] = image_url
|
239
|
-
|
240
|
-
article_body[:body] << node.replace("{{image:#{ id }}}").to_html
|
241
|
-
else
|
242
|
-
id = unique_id
|
243
|
-
|
244
|
-
article_body[:figures][id] = { }
|
245
|
-
|
246
|
-
article_body[:figures][id][:image] = image_url
|
247
|
-
article_body[:figures][id][:title] = image.at('img').attr('title')
|
248
|
-
article_body[:figures][id][:alt] = image.at('img').attr('alt')
|
264
|
+
article_body[:figures][id][:title] = image.at('img')&.attr('title')
|
265
|
+
article_body[:figures][id][:alt] = image.at('img')&.attr('alt')
|
249
266
|
|
250
|
-
|
267
|
+
image_meta = node.at('.e-image__meta')
|
251
268
|
|
252
|
-
|
253
|
-
|
254
|
-
|
255
|
-
end
|
256
|
-
|
257
|
-
article_body[:body] << node.replace("{{figure:#{ id }}}").to_html
|
269
|
+
unless image_meta.nil?
|
270
|
+
article_body[:figures][id][:caption] = strip(image_meta.at('figcaption'))
|
271
|
+
article_body[:figures][id][:cite] = strip(image_meta.at('cite'))
|
258
272
|
end
|
273
|
+
|
274
|
+
article_body[:body] << node.replace("{{figure:#{ id }}}").to_html
|
259
275
|
end
|
260
276
|
|
261
277
|
node.traverse { |children| children.remove }
|
@@ -275,7 +291,7 @@ module GDNewsScraper::Scrapers
|
|
275
291
|
node.children.each do |inner_node|
|
276
292
|
case inner_node.name
|
277
293
|
when 'a'
|
278
|
-
id = unique_id
|
294
|
+
id = unique_id(inner_node)
|
279
295
|
|
280
296
|
article_body[:anchors][id] = {
|
281
297
|
text: inner_node.children.text,
|
@@ -312,11 +328,7 @@ module GDNewsScraper::Scrapers
|
|
312
328
|
|
313
329
|
return article_body
|
314
330
|
rescue => e
|
315
|
-
"There was a problem while parsing this Article: #{ e }"
|
316
|
-
end
|
317
|
-
|
318
|
-
def figure(article_body, id, node, image, image_url)
|
319
|
-
|
331
|
+
"There was a problem while parsing this Article's body: #{ e }"
|
320
332
|
end
|
321
333
|
|
322
334
|
private
|
@@ -329,8 +341,12 @@ module GDNewsScraper::Scrapers
|
|
329
341
|
string&.text&.strip
|
330
342
|
end
|
331
343
|
|
332
|
-
def unique_id
|
333
|
-
(
|
344
|
+
def unique_id(node)
|
345
|
+
Base64.strict_encode64(node.to_s)
|
346
|
+
.reverse
|
347
|
+
.gsub(/[^0-9A-Za-z]/, '')[0..100]
|
348
|
+
.downcase
|
349
|
+
.to_sym
|
334
350
|
end
|
335
351
|
end # News
|
336
352
|
end # PolygonCOM
|
@@ -1,5 +1,5 @@
|
|
1
1
|
module GDNewsScraper
|
2
|
-
VERSION ||= '3.0.
|
2
|
+
VERSION ||= '3.0.9'
|
3
3
|
|
4
4
|
# => major: A new Source has been added or removed
|
5
5
|
# => minor: A Source code has changed drastically to a point where it's not
|
@@ -37,5 +37,27 @@ module GDNewsScraper
|
|
37
37
|
# about 10% on average! :)
|
38
38
|
# v3.0.7 - Changed the way figures are added to the articles which takes in
|
39
39
|
# consideration deeply nested figures as well
|
40
|
-
#
|
40
|
+
# v3.0.8 - Removed the refresh method since you can parse an article by
|
41
|
+
# passing its URL
|
42
|
+
# v3.0.9
|
43
|
+
#
|
44
|
+
# - Generate truly unique strings using Base64.strict_encode64. This stops Rails
|
45
|
+
# thinking the Article has been changed even though it's the same. Previous
|
46
|
+
# unique_id method would regenerate the id every time the Article is
|
47
|
+
# requested
|
48
|
+
# - Identify whether or not the Article is a video when indexing the Article
|
49
|
+
# page rather than doing it when scraping the Articles page
|
50
|
+
# - Only account for the Video that's inside the Article, not any other video
|
51
|
+
# that might be on the page using 'node.at()' instead of 'node.css()' which
|
52
|
+
# returns only the first match
|
53
|
+
# - Remove the 'is_a_video' argument when parsing the body since we're doing
|
54
|
+
# the check there from now on.
|
55
|
+
# Some articles that are of type video don't necessarily show as such when
|
56
|
+
# viewing them on the Articles page
|
57
|
+
# - Change the parameter that is passed to 'parse_article_body'. When we're
|
58
|
+
# parsing an article from a URL we don't need to re-request the page with
|
59
|
+
# Nokogiri
|
60
|
+
# - Increased the size of the unique_id to 100.
|
61
|
+
# Images have the same url prefix, where only the image name is different, as
|
62
|
+
# such, the unique_id was not that unique anymore..
|
41
63
|
end
|