GDNewsScraper 3.0.6 → 3.0.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: c37e3bbf8420be9d2bf182d091eb5b8713e46679
4
- data.tar.gz: c20ce1f2ee2b57757bd0d9d87b3bdc34fd751cf7
3
+ metadata.gz: 0ab7ee6a0e7e30c64e6ece2a156b9c94756c00a6
4
+ data.tar.gz: 30887530f6a43130209a98402fb0c32b647a5f77
5
5
  SHA512:
6
- metadata.gz: 74639942afca8966f6602c642a89cb31979e896a4656c8ed6dbf20d5e8764a642ab62f6a05566265b75531f685c542d1ff0daaa8ce0f474e3b882355a1f9afc3
7
- data.tar.gz: 1783aa15b5339131027ecf0aa0d2776d4959664157cee1305eca2acf206e43551f9079e4a60a322fe7a99f95f1513c302d8e13cf9529c95a97aa6e2476215b49
6
+ metadata.gz: cefe85f767614e30d79caa71c50c43eb466e6b52359755ab0dd84491682c42fbff25ac362c181eb394ddb0590e9717fdf4b7b6d611f273caaf25595f584894a1
7
+ data.tar.gz: 35fec37b2669653852e1dd3dcd39d95bb0d5233b3d4bee92bb5004731bf9062a2ac3285ca35529350f3694cfad50c4c2502c470622ee4c5321779f4f6b5db7ae
@@ -23,4 +23,5 @@ Gem::Specification.new do |spec|
23
23
 
24
24
  spec.add_development_dependency 'bundler', '~> 1.12'
25
25
  spec.add_development_dependency 'rake', '~> 10.0'
26
+ spec.add_development_dependency 'pry'
26
27
  end
@@ -10,7 +10,7 @@ module GDNewsScraper::Scrapers
10
10
  URL ||= 'https://www.polygon.com'
11
11
 
12
12
  WHITELIST ||= {
13
- default: ['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'figure', 'blockquote', 'ul', 'ol'],
13
+ default: ['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'blockquote', 'ul', 'ol'],
14
14
  inner: ['strong', 'em', 'li']
15
15
  }
16
16
 
@@ -42,7 +42,7 @@ module GDNewsScraper::Scrapers
42
42
  end
43
43
 
44
44
  def perform
45
- @page.css('.c-compact-river__entry').first(2).each do |article|
45
+ @page.css('.c-compact-river__entry').each do |article|
46
46
  stream[:articles].push(parse(article))
47
47
  end
48
48
  end
@@ -153,8 +153,9 @@ module GDNewsScraper::Scrapers
153
153
  if content && text && attributes && children
154
154
  node.remove
155
155
  else
156
- if node.name == 'div'
157
156
 
157
+ if node.name == 'div'
158
+
158
159
  # Check to see if the div contains an embedded video
159
160
  #
160
161
  iframe = node.at('iframe')
@@ -217,21 +218,17 @@ module GDNewsScraper::Scrapers
217
218
  end
218
219
  end
219
220
 
220
- # First ensure the node is an actual element. This removes random HTML elements
221
- #
222
- # => node.element?
223
- #
224
- # Secondly, ensure the node is what we actually want. We don't want <div>'s
225
- # which are usually used for placing inline advertisements or content specific
226
- # only to that website
221
+ # Extract 'figure' outside the node check because in many cases it's
222
+ # nested within other HTML elements and it makes it harder to
223
+ # extract without being too specific
227
224
  #
228
- # => WHITELIST[:default].include?(node.name)
225
+ # Do a double check because if the current node is in fact a figure,
226
+ # it will return false
229
227
  #
230
- if node.element? && GDNewsScraper::Scrapers::PolygonCOM::WHITELIST[:default].include?(node.name)
231
- case node.name
232
- when 'figure'
228
+ figure = (node.name == 'figure' || node.at('figure.e-image'))
233
229
 
234
- image = node.at('.e-image__image')
230
+ if figure
231
+ node.css('.e-image__image').each do |image|
235
232
  image_url = image.attr('data-original')
236
233
 
237
234
  if image_url.split('.').last == 'gif'
@@ -244,54 +241,72 @@ module GDNewsScraper::Scrapers
244
241
  else
245
242
  id = unique_id
246
243
 
247
- figure(article_body, id, node, image, image_url)
244
+ article_body[:figures][id] = { }
248
245
 
249
- article_body[:body] << node.replace("{{figure:#{ id }}}").to_html
250
- end
246
+ article_body[:figures][id][:image] = image_url
247
+ article_body[:figures][id][:title] = image.at('img').attr('title')
248
+ article_body[:figures][id][:alt] = image.at('img').attr('alt')
251
249
 
252
- else
250
+ image_meta = node.at('.e-image__meta')
253
251
 
254
- node.children.each do |inner_node|
255
- case inner_node.name
256
- when 'a'
257
- id = unique_id
258
-
259
- article_body[:anchors][id] = {
260
- text: inner_node.children.text,
261
- url: inner_node.attr('href')
262
- }
252
+ unless image_meta.nil?
253
+ article_body[:figures][id][:caption] = strip(image_meta.at('figcaption'))
254
+ article_body[:figures][id][:cite] = strip(image_meta.at('cite'))
255
+ end
263
256
 
264
- inner_node.replace("{{anchor:#{ id }}}")
265
- when 'figure'
266
- id = unique_id
257
+ article_body[:body] << node.replace("{{figure:#{ id }}}").to_html
258
+ end
259
+ end
267
260
 
268
- image = node.at('.e-image__image')
269
- image_url = image.attr('data-original')
261
+ node.traverse { |children| children.remove }
262
+ end
270
263
 
271
- figure(article_body, id, node, image, image_url)
264
+ # First ensure the node is an actual element. This removes random HTML elements
265
+ #
266
+ # => node.element?
267
+ #
268
+ # Secondly, ensure the node is what we actually want. We don't want <div>'s
269
+ # which are usually used for placing inline advertisements or content specific
270
+ # only to that website
271
+ #
272
+ # => WHITELIST[:default].include?(node.name)
273
+ #
274
+ if node.element? && GDNewsScraper::Scrapers::PolygonCOM::WHITELIST[:default].include?(node.name)
275
+ node.children.each do |inner_node|
276
+ case inner_node.name
277
+ when 'a'
278
+ id = unique_id
279
+
280
+ article_body[:anchors][id] = {
281
+ text: inner_node.children.text,
282
+ url: inner_node.attr('href')
283
+ }
272
284
 
273
- node = node.replace("{{figure:#{ id }}}").to_html
274
- article_body[:body] << node
275
- end
285
+ inner_node.replace("{{anchor:#{ id }}}")
276
286
  end
287
+ end
277
288
 
278
- begin
289
+ begin
279
290
 
280
- # Remove all attributes
281
- #
282
- parsed_node = node.xpath('.//@*').remove
291
+ # Remove all attributes
292
+ #
293
+ parsed_node = node.xpath('.//@*').remove
283
294
 
284
- # Return clean HTML, including HTML elements and text
285
- #
286
- parsed_node = node.to_html
295
+ # Check the integrity of the node before parsing it into html
296
+ # since 'content' is a Nokogiri feature
297
+ #
298
+ omit_node = node.content.empty?
287
299
 
288
- rescue
300
+ # Return clean HTML, including HTML elements and text
301
+ #
302
+ parsed_node = node.to_html
289
303
 
290
- end
291
- end
304
+ rescue
292
305
 
293
- article_body[:body] << parsed_node unless parsed_node.nil?
306
+ end
294
307
  end
308
+
309
+ article_body[:body] << parsed_node unless parsed_node.nil? || omit_node
295
310
  end
296
311
  end
297
312
 
@@ -301,18 +316,7 @@ module GDNewsScraper::Scrapers
301
316
  end
302
317
 
303
318
  def figure(article_body, id, node, image, image_url)
304
- article_body[:figures][id] = { }
305
-
306
- article_body[:figures][id][:image] = image_url
307
- article_body[:figures][id][:title] = image.at('img').attr('title')
308
- article_body[:figures][id][:alt] = image.at('img').attr('alt')
309
319
 
310
- image_meta = node.at('.e-image__meta')
311
-
312
- unless image_meta.nil?
313
- article_body[:figures][id][:caption] = strip(image_meta.at('figcaption'))
314
- article_body[:figures][id][:cite] = strip(image_meta.at('cite'))
315
- end
316
320
  end
317
321
 
318
322
  private
@@ -1,5 +1,5 @@
1
1
  module GDNewsScraper
2
- VERSION ||= '3.0.6'
2
+ VERSION ||= '3.0.7'
3
3
 
4
4
  # => major: A new Source has been added or removed
5
5
  # => minor: A Source code has changed drastically to a point where it's not
@@ -35,5 +35,7 @@ module GDNewsScraper
35
35
  # having to go through the index page to get its metadata
36
36
  # v3.0.6 - Small refactor of the code which also improved parsing speed by
37
37
  # about 10% on average! :)
38
+ # v3.0.7 - Changed the way figures are added to the articles which takes into
39
+ # consideration deeply nested figures as well
38
40
  #
39
41
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: GDNewsScraper
3
3
  version: !ruby/object:Gem::Version
4
- version: 3.0.6
4
+ version: 3.0.7
5
5
  platform: ruby
6
6
  authors:
7
7
  - Vlad Radulescu
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2017-11-30 00:00:00.000000000 Z
11
+ date: 2017-12-01 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: nokogiri
@@ -52,6 +52,20 @@ dependencies:
52
52
  - - "~>"
53
53
  - !ruby/object:Gem::Version
54
54
  version: '10.0'
55
+ - !ruby/object:Gem::Dependency
56
+ name: pry
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - ">="
60
+ - !ruby/object:Gem::Version
61
+ version: '0'
62
+ type: :development
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - ">="
67
+ - !ruby/object:Gem::Version
68
+ version: '0'
55
69
  description: A Ruby Scraper created for games.directory to crawl the web for gaming
56
70
  News and Reviews.
57
71
  email: