GDNewsScraper 3.0.6 → 3.0.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/GDNewsScraper.gemspec +1 -0
- data/lib/GDNewsScraper/scrapers/polygon_com/news.rb +63 -59
- data/lib/GDNewsScraper/version.rb +3 -1
- metadata +16 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 0ab7ee6a0e7e30c64e6ece2a156b9c94756c00a6
|
4
|
+
data.tar.gz: 30887530f6a43130209a98402fb0c32b647a5f77
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: cefe85f767614e30d79caa71c50c43eb466e6b52359755ab0dd84491682c42fbff25ac362c181eb394ddb0590e9717fdf4b7b6d611f273caaf25595f584894a1
|
7
|
+
data.tar.gz: 35fec37b2669653852e1dd3dcd39d95bb0d5233b3d4bee92bb5004731bf9062a2ac3285ca35529350f3694cfad50c4c2502c470622ee4c5321779f4f6b5db7ae
|
data/GDNewsScraper.gemspec
CHANGED
data/lib/GDNewsScraper/scrapers/polygon_com/news.rb
CHANGED
@@ -10,7 +10,7 @@ module GDNewsScraper::Scrapers
|
|
10
10
|
URL ||= 'https://www.polygon.com'
|
11
11
|
|
12
12
|
WHITELIST ||= {
|
13
|
-
default: ['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', '
|
13
|
+
default: ['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'blockquote', 'ul', 'ol'],
|
14
14
|
inner: ['strong', 'em', 'li']
|
15
15
|
}
|
16
16
|
|
@@ -42,7 +42,7 @@ module GDNewsScraper::Scrapers
|
|
42
42
|
end
|
43
43
|
|
44
44
|
def perform
|
45
|
-
@page.css('.c-compact-river__entry').
|
45
|
+
@page.css('.c-compact-river__entry').each do |article|
|
46
46
|
stream[:articles].push(parse(article))
|
47
47
|
end
|
48
48
|
end
|
@@ -153,8 +153,9 @@ module GDNewsScraper::Scrapers
|
|
153
153
|
if content && text && attributes && children
|
154
154
|
node.remove
|
155
155
|
else
|
156
|
-
if node.name == 'div'
|
157
156
|
|
157
|
+
if node.name == 'div'
|
158
|
+
|
158
159
|
# Check to see if the div contains a embeded video
|
159
160
|
#
|
160
161
|
iframe = node.at('iframe')
|
@@ -217,21 +218,17 @@ module GDNewsScraper::Scrapers
|
|
217
218
|
end
|
218
219
|
end
|
219
220
|
|
220
|
-
#
|
221
|
-
#
|
222
|
-
#
|
223
|
-
#
|
224
|
-
# Secondly, ensure the node is what we actual want. We don't want <div>'s
|
225
|
-
# which are usualy used for placing inline advertisments or content specific
|
226
|
-
# only to that website
|
221
|
+
# Extract 'figure' outside the node check because in many cases it's
|
222
|
+
# nested within other HTML elements and it makes it harder to
|
223
|
+
# extract without being too specific
|
227
224
|
#
|
228
|
-
#
|
225
|
+
# Do a double check because if the current node is in fact a figure,
|
226
|
+
# it will return false
|
229
227
|
#
|
230
|
-
|
231
|
-
case node.name
|
232
|
-
when 'figure'
|
228
|
+
figure = (node.name == 'figure' || node.at('figure.e-image'))
|
233
229
|
|
234
|
-
|
230
|
+
if figure
|
231
|
+
node.css('.e-image__image').each do |image|
|
235
232
|
image_url = image.attr('data-original')
|
236
233
|
|
237
234
|
if image_url.split('.').last == 'gif'
|
@@ -244,54 +241,72 @@ module GDNewsScraper::Scrapers
|
|
244
241
|
else
|
245
242
|
id = unique_id
|
246
243
|
|
247
|
-
|
244
|
+
article_body[:figures][id] = { }
|
248
245
|
|
249
|
-
article_body[:
|
250
|
-
|
246
|
+
article_body[:figures][id][:image] = image_url
|
247
|
+
article_body[:figures][id][:title] = image.at('img').attr('title')
|
248
|
+
article_body[:figures][id][:alt] = image.at('img').attr('alt')
|
251
249
|
|
252
|
-
|
250
|
+
image_meta = node.at('.e-image__meta')
|
253
251
|
|
254
|
-
|
255
|
-
|
256
|
-
|
257
|
-
|
258
|
-
|
259
|
-
article_body[:anchors][id] = {
|
260
|
-
text: inner_node.children.text,
|
261
|
-
url: inner_node.attr('href')
|
262
|
-
}
|
252
|
+
unless image_meta.nil?
|
253
|
+
article_body[:figures][id][:caption] = strip(image_meta.at('figcaption'))
|
254
|
+
article_body[:figures][id][:cite] = strip(image_meta.at('cite'))
|
255
|
+
end
|
263
256
|
|
264
|
-
|
265
|
-
|
266
|
-
|
257
|
+
article_body[:body] << node.replace("{{figure:#{ id }}}").to_html
|
258
|
+
end
|
259
|
+
end
|
267
260
|
|
268
|
-
|
269
|
-
|
261
|
+
node.traverse { |children| children.remove }
|
262
|
+
end
|
270
263
|
|
271
|
-
|
264
|
+
# First ensure the node is an actual element. This removes random HTML elements
|
265
|
+
#
|
266
|
+
# => node.element?
|
267
|
+
#
|
268
|
+
# Secondly, ensure the node is what we actual want. We don't want <div>'s
|
269
|
+
# which are usualy used for placing inline advertisments or content specific
|
270
|
+
# only to that website
|
271
|
+
#
|
272
|
+
# => WHITELIST[:default].include?(node.name)
|
273
|
+
#
|
274
|
+
if node.element? && GDNewsScraper::Scrapers::PolygonCOM::WHITELIST[:default].include?(node.name)
|
275
|
+
node.children.each do |inner_node|
|
276
|
+
case inner_node.name
|
277
|
+
when 'a'
|
278
|
+
id = unique_id
|
279
|
+
|
280
|
+
article_body[:anchors][id] = {
|
281
|
+
text: inner_node.children.text,
|
282
|
+
url: inner_node.attr('href')
|
283
|
+
}
|
272
284
|
|
273
|
-
|
274
|
-
article_body[:body] << node
|
275
|
-
end
|
285
|
+
inner_node.replace("{{anchor:#{ id }}}")
|
276
286
|
end
|
287
|
+
end
|
277
288
|
|
278
|
-
|
289
|
+
begin
|
279
290
|
|
280
|
-
|
281
|
-
|
282
|
-
|
291
|
+
# Remove all attributes
|
292
|
+
#
|
293
|
+
parsed_node = node.xpath('.//@*').remove
|
283
294
|
|
284
|
-
|
285
|
-
|
286
|
-
|
295
|
+
# Check the integrity of the node before parsing it into html
|
296
|
+
# since 'content' is a Nokogiri feature
|
297
|
+
#
|
298
|
+
omit_node = node.content.empty?
|
287
299
|
|
288
|
-
|
300
|
+
# Return clean HTML, including HTML elements and text
|
301
|
+
#
|
302
|
+
parsed_node = node.to_html
|
289
303
|
|
290
|
-
|
291
|
-
end
|
304
|
+
rescue
|
292
305
|
|
293
|
-
|
306
|
+
end
|
294
307
|
end
|
308
|
+
|
309
|
+
article_body[:body] << parsed_node unless parsed_node.nil? || omit_node
|
295
310
|
end
|
296
311
|
end
|
297
312
|
|
@@ -301,18 +316,7 @@ module GDNewsScraper::Scrapers
|
|
301
316
|
end
|
302
317
|
|
303
318
|
def figure(article_body, id, node, image, image_url)
|
304
|
-
article_body[:figures][id] = { }
|
305
|
-
|
306
|
-
article_body[:figures][id][:image] = image_url
|
307
|
-
article_body[:figures][id][:title] = image.at('img').attr('title')
|
308
|
-
article_body[:figures][id][:alt] = image.at('img').attr('alt')
|
309
319
|
|
310
|
-
image_meta = node.at('.e-image__meta')
|
311
|
-
|
312
|
-
unless image_meta.nil?
|
313
|
-
article_body[:figures][id][:caption] = strip(image_meta.at('figcaption'))
|
314
|
-
article_body[:figures][id][:cite] = strip(image_meta.at('cite'))
|
315
|
-
end
|
316
320
|
end
|
317
321
|
|
318
322
|
private
|
data/lib/GDNewsScraper/version.rb
CHANGED
@@ -1,5 +1,5 @@
|
|
1
1
|
module GDNewsScraper
|
2
|
-
VERSION ||= '3.0.6'
|
2
|
+
VERSION ||= '3.0.7'
|
3
3
|
|
4
4
|
# => major: A new Source has been added or removed
|
5
5
|
# => minor: A Source code has changed drastically to a point where it's not
|
@@ -35,5 +35,7 @@ module GDNewsScraper
|
|
35
35
|
# having to go through the index page to get its metadata
|
36
36
|
# v3.0.6 - Small refactor of the code which also improved parsing speed by
|
37
37
|
# about 10% on average! :)
|
38
|
+
# v3.0.7 - Changed the way figures are added to the articles which takes in
|
39
|
+
# consideration deeply nested figures as well
|
38
40
|
#
|
39
41
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: GDNewsScraper
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 3.0.6
|
4
|
+
version: 3.0.7
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Vlad Radulescu
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2017-
|
11
|
+
date: 2017-12-01 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: nokogiri
|
@@ -52,6 +52,20 @@ dependencies:
|
|
52
52
|
- - "~>"
|
53
53
|
- !ruby/object:Gem::Version
|
54
54
|
version: '10.0'
|
55
|
+
- !ruby/object:Gem::Dependency
|
56
|
+
name: pry
|
57
|
+
requirement: !ruby/object:Gem::Requirement
|
58
|
+
requirements:
|
59
|
+
- - ">="
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: '0'
|
62
|
+
type: :development
|
63
|
+
prerelease: false
|
64
|
+
version_requirements: !ruby/object:Gem::Requirement
|
65
|
+
requirements:
|
66
|
+
- - ">="
|
67
|
+
- !ruby/object:Gem::Version
|
68
|
+
version: '0'
|
55
69
|
description: A Ruby Scraper created for games.directory to crawl the web for gaming
|
56
70
|
News and Reviews.
|
57
71
|
email:
|