GDNewsScraper 3.0.4 → 3.0.6
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/GDNewsScraper/scrapers/polygon_com/news.rb +201 -142
- data/lib/GDNewsScraper/string.rb +5 -0
- data/lib/GDNewsScraper/version.rb +6 -2
- metadata +3 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: c37e3bbf8420be9d2bf182d091eb5b8713e46679
|
4
|
+
data.tar.gz: c20ce1f2ee2b57757bd0d9d87b3bdc34fd751cf7
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 74639942afca8966f6602c642a89cb31979e896a4656c8ed6dbf20d5e8764a642ab62f6a05566265b75531f685c542d1ff0daaa8ce0f474e3b882355a1f9afc3
|
7
|
+
data.tar.gz: 1783aa15b5339131027ecf0aa0d2776d4959664157cee1305eca2acf206e43551f9079e4a60a322fe7a99f95f1513c302d8e13cf9529c95a97aa6e2476215b49
|
@@ -7,49 +7,31 @@ module GDNewsScraper::Scrapers
|
|
7
7
|
"User-Agent" => "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36"
|
8
8
|
}
|
9
9
|
|
10
|
-
|
10
|
+
URL ||= 'https://www.polygon.com'
|
11
11
|
|
12
12
|
WHITELIST ||= {
|
13
13
|
default: ['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'figure', 'blockquote', 'ul', 'ol'],
|
14
14
|
inner: ['strong', 'em', 'li']
|
15
15
|
}
|
16
16
|
|
17
|
-
DOM ||= {
|
18
|
-
article: {
|
19
|
-
wrapper: '.c-compact-river',
|
20
|
-
container: '.c-compact-river__entry',
|
21
|
-
inner_container: '.c-entry-box--compact',
|
22
|
-
inner_container_video: '.c-entry-box--compact--video',
|
23
|
-
title: '.c-entry-box--compact__title',
|
24
|
-
cover: '.c-entry-box--compact__image',
|
25
|
-
meta: '.c-byline'
|
26
|
-
},
|
27
|
-
|
28
|
-
pagination: {
|
29
|
-
previous: '.c-pagination__prev',
|
30
|
-
info: '.c-pagination__text',
|
31
|
-
next: '.c-pagination__next'
|
32
|
-
}
|
33
|
-
}
|
34
|
-
|
35
17
|
class News
|
36
18
|
attr_accessor :stream
|
37
19
|
|
38
|
-
def initialize(offset =
|
20
|
+
def initialize(offset = nil)
|
39
21
|
unless offset.nil?
|
40
|
-
uri = "#{ GDNewsScraper::Scrapers::PolygonCOM::
|
22
|
+
uri = "#{ GDNewsScraper::Scrapers::PolygonCOM::URL }/news/archives/#{ offset }"
|
41
23
|
|
42
24
|
@page = Nokogiri::HTML(open(uri, GDNewsScraper::Scrapers::PolygonCOM::HEADERS))
|
43
25
|
@stream = Hash.new
|
44
26
|
|
45
27
|
stream[:stream] = Hash.new
|
46
|
-
stream[:stream][:size] = @page.
|
47
|
-
stream[:stream][:pages] = @page.
|
48
|
-
stream[:stream][:prev] = @page.
|
49
|
-
stream[:stream][:next] = @page.
|
28
|
+
stream[:stream][:size] = @page.at('.c-pagination__text').text.split.first.to_num
|
29
|
+
stream[:stream][:pages] = @page.at('.c-pagination__text').text.split.last.to_num
|
30
|
+
stream[:stream][:prev] = @page.at('.c-pagination__prev')&.attr('href')&.split('/')&.last.to_i
|
31
|
+
stream[:stream][:next] = @page.at('.c-pagination__next')&.attr('href')&.split('/')&.last.to_i
|
50
32
|
|
51
33
|
stream[:feed] = Hash.new
|
52
|
-
stream[:feed][:url] = GDNewsScraper::Scrapers::PolygonCOM::
|
34
|
+
stream[:feed][:url] = GDNewsScraper::Scrapers::PolygonCOM::URL
|
53
35
|
stream[:feed][:source] = 'polygon'
|
54
36
|
stream[:feed][:label] = 'Polygon'
|
55
37
|
|
@@ -60,8 +42,8 @@ module GDNewsScraper::Scrapers
|
|
60
42
|
end
|
61
43
|
|
62
44
|
def perform
|
63
|
-
@page.css(
|
64
|
-
stream[:articles]
|
45
|
+
@page.css('.c-compact-river__entry').first(2).each do |article|
|
46
|
+
stream[:articles].push(parse(article))
|
65
47
|
end
|
66
48
|
end
|
67
49
|
|
@@ -72,38 +54,74 @@ module GDNewsScraper::Scrapers
|
|
72
54
|
def parse(article)
|
73
55
|
pulse = Hash.new
|
74
56
|
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
|
57
|
+
# This allows the Parser to get its data from the Index page, when the
|
58
|
+
# article is a Nokogiri::XML or from the Article page when the article
|
59
|
+
# is a URL.
|
60
|
+
#
|
61
|
+
# Passing a URL is mainly for debugging in case an Article fails to
|
62
|
+
# parse and should only be used as such..
|
63
|
+
#
|
64
|
+
if article.is_a?(String)
|
65
|
+
begin
|
66
|
+
article_page = Nokogiri::HTML(open(article, GDNewsScraper::Scrapers::PolygonCOM::HEADERS))
|
67
|
+
|
68
|
+
is_a_video = article_page.at('.c-video-embed').nil?
|
69
|
+
|
70
|
+
key = article_page.at('span[data-content-admin-id]').attr('data-content-admin-id').to_i
|
71
|
+
url = article
|
72
|
+
title = strip(article_page.css('.c-entry-hero').at('.c-page-title'))
|
73
|
+
cover = (is_a_video ? nil : article_page.css('.l-col__main').at('.e-image__image').attr('data-original'))
|
74
|
+
author = strip(article_page.css('.c-entry-hero').at('.c-byline').css('.c-byline__item > a').children[0])
|
75
|
+
|
76
|
+
begin
|
77
|
+
article_date = strip(article_page.css('.c-entry-hero').at('.c-byline').css('time.c-byline__item'))
|
78
|
+
parsed_date = DateTime.parse(article_date)
|
79
|
+
|
80
|
+
date = parsed_date.to_time.to_i
|
81
|
+
|
82
|
+
# Never failed so not entirely sure what to rescue from, but with
|
83
|
+
# dates it's always risky not to rescue
|
84
|
+
#
|
85
|
+
# TODO: When it fails, find out why and rescue from that instead
|
86
|
+
# of rescuing from 'everything' ..
|
87
|
+
#
|
88
|
+
rescue
|
89
|
+
date = nil
|
90
|
+
end
|
91
|
+
rescue TypeError
|
92
|
+
raise ArgumentError.new('Invalid URL')
|
93
|
+
end
|
94
|
+
elsif article.is_a?(Nokogiri::XML::Element)
|
95
|
+
is_a_video = !article.at('.c-entry-box--compact--video').nil?
|
96
|
+
|
97
|
+
key = article.at('.c-entry-box--compact--article').attr('data-chorus-optimize-id').to_i
|
98
|
+
url = article.at('.c-entry-box--compact__title').at('> a').attr('href')
|
99
|
+
title = strip(article.at('.c-entry-box--compact__title'))
|
100
|
+
cover = (article.at('.c-entry-box--compact__image').at('noscript').at('img').attr('src') rescue nil)
|
101
|
+
author = strip(article.at('.c-byline').css('.c-byline__item > a').children[0])
|
102
|
+
date = JSON.parse(article.at('.c-byline').attr('data-cdata'))['timestamp'].to_i
|
103
|
+
else
|
104
|
+
raise ArgumentError.new("Make sure the 'article' argument is either a Hash containing the article's initial metadata or a String which is the article's URL")
|
88
105
|
end
|
89
|
-
|
106
|
+
|
107
|
+
pulse[:id] = key
|
108
|
+
pulse[:hash] = ::Base64.encode64("#{ title } - #{ key }")
|
109
|
+
pulse[:cover] = cover
|
90
110
|
pulse[:url] = url
|
91
111
|
pulse[:title] = title
|
92
|
-
pulse[:author] =
|
93
|
-
pulse[:date] =
|
112
|
+
pulse[:author] = author
|
113
|
+
pulse[:date] = date
|
94
114
|
pulse[:content] = parse_article_body(url, is_a_video)
|
95
115
|
pulse[:tags] = title.downcase.split
|
96
116
|
|
97
117
|
return pulse
|
98
118
|
rescue => e
|
99
|
-
"There was a problem while parsing Article
|
119
|
+
"There was a problem while parsing this Article: #{ e }"
|
100
120
|
end
|
101
121
|
|
102
|
-
private
|
103
|
-
|
104
122
|
def parse_article_body(article_url, is_a_video = false)
|
105
123
|
article_page = Nokogiri::HTML(open(article_url, GDNewsScraper::Scrapers::PolygonCOM::HEADERS))
|
106
|
-
article_container = article_page.
|
124
|
+
article_container = article_page.at('.c-entry-content')
|
107
125
|
|
108
126
|
article_body = {
|
109
127
|
galleries: { },
|
@@ -118,7 +136,7 @@ module GDNewsScraper::Scrapers
|
|
118
136
|
|
119
137
|
if is_a_video
|
120
138
|
iframe = article_page.at('.c-video-embed--media').at('iframe')
|
121
|
-
iframe_id =
|
139
|
+
iframe_id = unique_id
|
122
140
|
|
123
141
|
article_body[:videos][iframe_id] = {}
|
124
142
|
article_body[:videos][iframe_id][:url] = iframe.attr('src')
|
@@ -127,148 +145,189 @@ module GDNewsScraper::Scrapers
|
|
127
145
|
end
|
128
146
|
|
129
147
|
article_container.children.each do |node|
|
130
|
-
|
148
|
+
content = node.content.strip.empty?
|
149
|
+
text = node.text.strip.empty?
|
150
|
+
attributes = node.attributes.empty?
|
151
|
+
children = node.children.empty?
|
131
152
|
|
132
|
-
|
133
|
-
|
134
|
-
|
153
|
+
if content && text && attributes && children
|
154
|
+
node.remove
|
155
|
+
else
|
156
|
+
if node.name == 'div'
|
135
157
|
|
136
|
-
|
137
|
-
|
158
|
+
# Check to see if the div contains an embedded video
|
159
|
+
#
|
160
|
+
iframe = node.at('iframe')
|
138
161
|
|
139
|
-
|
140
|
-
|
162
|
+
if iframe # YouTube videos
|
163
|
+
iframe_id = unique_id
|
141
164
|
|
142
|
-
|
143
|
-
|
165
|
+
article_body[:videos][iframe_id] = {}
|
166
|
+
article_body[:videos][iframe_id][:url] = iframe.attr('src')
|
144
167
|
|
145
|
-
|
146
|
-
|
147
|
-
gallery = node.at('.c-image-gallery')
|
168
|
+
article_body[:body] << iframe.replace("{{video:#{ iframe_id }}}").to_html
|
169
|
+
end
|
148
170
|
|
149
|
-
|
150
|
-
|
171
|
+
# Check to see if the Article has a video by Polygon, which is
|
172
|
+
# embedded differently than a YouTube video..
|
173
|
+
#
|
174
|
+
polygon_video = node.attributes['data-volume-uuid']
|
175
|
+
|
176
|
+
unless polygon_video.nil?
|
177
|
+
id = unique_id
|
178
|
+
|
179
|
+
article_body[:videos][id] = {}
|
180
|
+
article_body[:videos][id][:label] = node.attr('data-analytics-label').split('|').first.strip
|
181
|
+
article_body[:videos][id][:url] = "https://volume.vox-cdn.com/embed/#{ node.attr('data-volume-uuid') }"
|
182
|
+
|
183
|
+
article_body[:body] << node.replace("{{video:#{ id }}}").to_html
|
184
|
+
end
|
185
|
+
|
186
|
+
# Check to see if the div contains a gallery
|
187
|
+
#
|
188
|
+
gallery = node.at('.c-image-gallery')
|
189
|
+
|
190
|
+
if gallery
|
191
|
+
gallery_container = gallery.at('.c-image-gallery__thumbs-viewport')
|
151
192
|
|
152
|
-
|
153
|
-
|
193
|
+
gallery_id = unique_id
|
194
|
+
article_body[:galleries][gallery_id] = []
|
154
195
|
|
155
|
-
|
156
|
-
|
196
|
+
gallery_container.children.children.each do |image_container|
|
197
|
+
image = image_container.at('a')
|
157
198
|
|
158
|
-
|
159
|
-
|
199
|
+
if image
|
200
|
+
article_body[:galleries][gallery_id] << image.attr('href')
|
201
|
+
end
|
160
202
|
end
|
203
|
+
|
204
|
+
article_body[:body] << gallery.replace("{{gallery:#{ gallery_id }}}").to_html
|
161
205
|
end
|
162
206
|
|
163
|
-
|
164
|
-
end
|
207
|
+
twitdget = node.at('.twitter-tweet')
|
165
208
|
|
166
|
-
|
209
|
+
if twitdget
|
210
|
+
article_body[:body] << twitdget.to_html
|
211
|
+
end
|
167
212
|
|
168
|
-
|
169
|
-
|
213
|
+
redditget = node.at('.reddit-card')
|
214
|
+
|
215
|
+
if redditget
|
216
|
+
article_body[:body] << redditget.to_html
|
217
|
+
end
|
170
218
|
end
|
171
219
|
|
172
|
-
|
220
|
+
# First ensure the node is an actual element. This removes random HTML elements
|
221
|
+
#
|
222
|
+
# => node.element?
|
223
|
+
#
|
224
|
+
# Secondly, ensure the node is what we actually want. We don't want <div>'s
|
225
|
+
# which are usually used for placing inline advertisements or content specific
|
226
|
+
# only to that website
|
227
|
+
#
|
228
|
+
# => WHITELIST[:default].include?(node.name)
|
229
|
+
#
|
230
|
+
if node.element? && GDNewsScraper::Scrapers::PolygonCOM::WHITELIST[:default].include?(node.name)
|
231
|
+
case node.name
|
232
|
+
when 'figure'
|
173
233
|
|
174
|
-
|
175
|
-
|
176
|
-
end
|
177
|
-
end
|
234
|
+
image = node.at('.e-image__image')
|
235
|
+
image_url = image.attr('data-original')
|
178
236
|
|
179
|
-
# First ensure the node is an actual element. This removes random HTML elements
|
180
|
-
#
|
181
|
-
# => node.element?
|
182
|
-
#
|
183
|
-
# Secondly, ensure the node is what we actual want. We don't want <div>'s
|
184
|
-
# which are usualy used for placing inline advertisments or content specific
|
185
|
-
# only to that website
|
186
|
-
#
|
187
|
-
# => WHITELIST[:default].include?(node.name)
|
188
|
-
#
|
189
|
-
if node.element? && GDNewsScraper::Scrapers::PolygonCOM::WHITELIST[:default].include?(node.name)
|
190
|
-
case node.name
|
191
|
-
when 'figure'
|
192
|
-
|
193
|
-
image = node.css('.e-image__image').first
|
194
|
-
image_url = image.attr('data-original')
|
195
|
-
|
196
|
-
begin
|
197
237
|
if image_url.split('.').last == 'gif'
|
198
|
-
|
238
|
+
id = unique_id
|
199
239
|
|
200
|
-
article_body[:images][
|
201
|
-
article_body[:images][
|
240
|
+
article_body[:images][id] = { }
|
241
|
+
article_body[:images][id][:url] = image_url
|
202
242
|
|
203
|
-
article_body[:body] << node.replace("{{image:#{
|
243
|
+
article_body[:body] << node.replace("{{image:#{ id }}}").to_html
|
204
244
|
else
|
205
|
-
|
206
|
-
|
245
|
+
id = unique_id
|
246
|
+
|
247
|
+
figure(article_body, id, node, image, image_url)
|
248
|
+
|
249
|
+
article_body[:body] << node.replace("{{figure:#{ id }}}").to_html
|
250
|
+
end
|
251
|
+
|
252
|
+
else
|
207
253
|
|
208
|
-
|
254
|
+
node.children.each do |inner_node|
|
255
|
+
case inner_node.name
|
256
|
+
when 'a'
|
257
|
+
id = unique_id
|
258
|
+
|
259
|
+
article_body[:anchors][id] = {
|
260
|
+
text: inner_node.children.text,
|
261
|
+
url: inner_node.attr('href')
|
262
|
+
}
|
209
263
|
|
210
|
-
|
264
|
+
inner_node.replace("{{anchor:#{ id }}}")
|
265
|
+
when 'figure'
|
266
|
+
id = unique_id
|
211
267
|
|
212
|
-
|
268
|
+
image = node.at('.e-image__image')
|
269
|
+
image_url = image.attr('data-original')
|
213
270
|
|
214
|
-
|
215
|
-
article_body[:figures][figure_id][:title] = image_title
|
216
|
-
article_body[:figures][figure_id][:alt] = image_alt
|
271
|
+
figure(article_body, id, node, image, image_url)
|
217
272
|
|
218
|
-
|
219
|
-
article_body[:
|
220
|
-
article_body[:figures][figure_id][:cite] = image_meta.first.at('cite')&.text
|
273
|
+
node = node.replace("{{figure:#{ id }}}").to_html
|
274
|
+
article_body[:body] << node
|
221
275
|
end
|
222
|
-
|
223
|
-
article_body[:body] << node.replace("{{figure:#{ figure_id }}}").to_html
|
224
276
|
end
|
225
|
-
rescue
|
226
|
-
raise 'Unknown format, please review.'
|
227
|
-
end
|
228
|
-
else
|
229
277
|
|
230
|
-
node.children.each do |url|
|
231
278
|
begin
|
232
|
-
if url.name == 'a'
|
233
|
-
url_id = random_string
|
234
|
-
|
235
|
-
article_body[:anchors][url_id] = {
|
236
|
-
text: url.children.text,
|
237
|
-
url: url.attributes['href'].value
|
238
|
-
}
|
239
279
|
|
240
|
-
|
241
|
-
|
280
|
+
# Remove all attributes
|
281
|
+
#
|
282
|
+
parsed_node = node.xpath('.//@*').remove
|
283
|
+
|
284
|
+
# Return clean HTML, including HTML elements and text
|
285
|
+
#
|
286
|
+
parsed_node = node.to_html
|
287
|
+
|
242
288
|
rescue
|
243
|
-
|
289
|
+
|
244
290
|
end
|
245
291
|
end
|
246
292
|
|
247
|
-
|
248
|
-
#
|
249
|
-
parsed_node = node.xpath('.//@*').remove
|
250
|
-
|
251
|
-
# Return clean HTML, including HTML elements and text
|
252
|
-
#
|
253
|
-
parsed_node = node.to_html
|
293
|
+
article_body[:body] << parsed_node unless parsed_node.nil?
|
254
294
|
end
|
255
|
-
|
256
|
-
article_body[:body] << parsed_node
|
257
295
|
end
|
258
296
|
end
|
259
297
|
|
260
298
|
return article_body
|
261
299
|
rescue => e
|
262
|
-
"There was a problem while parsing
|
300
|
+
"There was a problem while parsing this Article: #{ e }"
|
263
301
|
end
|
264
302
|
|
303
|
+
def figure(article_body, id, node, image, image_url)
|
304
|
+
article_body[:figures][id] = { }
|
305
|
+
|
306
|
+
article_body[:figures][id][:image] = image_url
|
307
|
+
article_body[:figures][id][:title] = image.at('img').attr('title')
|
308
|
+
article_body[:figures][id][:alt] = image.at('img').attr('alt')
|
309
|
+
|
310
|
+
image_meta = node.at('.e-image__meta')
|
311
|
+
|
312
|
+
unless image_meta.nil?
|
313
|
+
article_body[:figures][id][:caption] = strip(image_meta.at('figcaption'))
|
314
|
+
article_body[:figures][id][:cite] = strip(image_meta.at('cite'))
|
315
|
+
end
|
316
|
+
end
|
317
|
+
|
318
|
+
private
|
319
|
+
|
265
320
|
def attr(attribute)
|
266
321
|
attributes&.fetch(attribute, nil)&.value
|
267
322
|
end
|
268
323
|
|
269
|
-
def
|
324
|
+
def strip(string)
|
325
|
+
string&.text&.strip
|
326
|
+
end
|
327
|
+
|
328
|
+
def unique_id
|
270
329
|
(0...50).map { (65 + rand(25)).chr }.join.to_sym
|
271
330
|
end
|
272
331
|
end # News
|
273
332
|
end # PolygonCOM
|
274
|
-
end # GDNewsScraper::Scrapers
|
333
|
+
end # GDNewsScraper::Scrapers
|
@@ -1,5 +1,5 @@
|
|
1
1
|
module GDNewsScraper
|
2
|
-
VERSION ||= '3.0.
|
2
|
+
VERSION ||= '3.0.6'
|
3
3
|
|
4
4
|
# => major: A new Source has been added or removed
|
5
5
|
# => minor: A Source code has changed drastically to a point where it's not
|
@@ -31,5 +31,9 @@ module GDNewsScraper
|
|
31
31
|
# v3.0.3 - Added a new method which will refresh the content of an Article
|
32
32
|
# v3.0.4 - Fixed an issue caused by Featured Articles which have a different
|
33
33
|
# DOM structure
|
34
|
-
#
|
34
|
+
# v3.0.5 - Adds the possibility to parse an article from its URL rather than
|
35
|
+
# having to go through the index page to get its metadata
|
36
|
+
# v3.0.6 - Small refactor of the code which also improved parsing speed by
|
37
|
+
# about 10% on average! :)
|
38
|
+
#
|
35
39
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: GDNewsScraper
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 3.0.
|
4
|
+
version: 3.0.6
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Vlad Radulescu
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2017-11-
|
11
|
+
date: 2017-11-30 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: nokogiri
|
@@ -74,6 +74,7 @@ files:
|
|
74
74
|
- lib/GDNewsScraper.rb
|
75
75
|
- lib/GDNewsScraper/scrapers/polygon_com/news.rb
|
76
76
|
- lib/GDNewsScraper/scrapers/polygon_com/reviews.rb
|
77
|
+
- lib/GDNewsScraper/string.rb
|
77
78
|
- lib/GDNewsScraper/version.rb
|
78
79
|
homepage: https://github.com/games-directory/scraper
|
79
80
|
licenses:
|