RubyGems - GDNewsScraper - Versions diffs - 3.0.6 → 3.0.7 - Mend

GDNewsScraper 3.0.6 → 3.0.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (5)

checksums.yaml +4 -4
data/GDNewsScraper.gemspec +1 -0
data/lib/GDNewsScraper/scrapers/polygon_com/news.rb +63 -59
data/lib/GDNewsScraper/version.rb +3 -1
metadata +16 -2

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: c37e3bbf8420be9d2bf182d091eb5b8713e46679
-  data.tar.gz: c20ce1f2ee2b57757bd0d9d87b3bdc34fd751cf7
+  metadata.gz: 0ab7ee6a0e7e30c64e6ece2a156b9c94756c00a6
+  data.tar.gz: 30887530f6a43130209a98402fb0c32b647a5f77
 SHA512:
-  metadata.gz: 74639942afca8966f6602c642a89cb31979e896a4656c8ed6dbf20d5e8764a642ab62f6a05566265b75531f685c542d1ff0daaa8ce0f474e3b882355a1f9afc3
-  data.tar.gz: 1783aa15b5339131027ecf0aa0d2776d4959664157cee1305eca2acf206e43551f9079e4a60a322fe7a99f95f1513c302d8e13cf9529c95a97aa6e2476215b49
+  metadata.gz: cefe85f767614e30d79caa71c50c43eb466e6b52359755ab0dd84491682c42fbff25ac362c181eb394ddb0590e9717fdf4b7b6d611f273caaf25595f584894a1
+  data.tar.gz: 35fec37b2669653852e1dd3dcd39d95bb0d5233b3d4bee92bb5004731bf9062a2ac3285ca35529350f3694cfad50c4c2502c470622ee4c5321779f4f6b5db7ae

data/GDNewsScraper.gemspec CHANGED Viewed

@@ -23,4 +23,5 @@ Gem::Specification.new do |spec|
   spec.add_development_dependency 'bundler', '~> 1.12'
   spec.add_development_dependency 'rake', '~> 10.0'
+  spec.add_development_dependency 'pry'
 end

data/lib/GDNewsScraper/scrapers/polygon_com/news.rb CHANGED Viewed

@@ -10,7 +10,7 @@ module GDNewsScraper::Scrapers
     URL ||= 'https://www.polygon.com'
     WHITELIST ||= {
-      default: ['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'figure', 'blockquote', 'ul', 'ol'],
+      default: ['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'blockquote', 'ul', 'ol'],
       inner:   ['strong', 'em', 'li']
     }
@@ -42,7 +42,7 @@ module GDNewsScraper::Scrapers
       end
       def perform
-        @page.css('.c-compact-river__entry').first(2).each do |article|
+        @page.css('.c-compact-river__entry').each do |article|
           stream[:articles].push(parse(article))
         end
       end
@@ -153,8 +153,9 @@ module GDNewsScraper::Scrapers
           if content && text && attributes && children
             node.remove
           else
-            if node.name == 'div'
+            if node.name == 'div'
               # Check to see if the div contains a embeded video
               #
               iframe = node.at('iframe')
@@ -217,21 +218,17 @@ module GDNewsScraper::Scrapers
               end
             end
-            # First ensure the node is an actual element. This removes random HTML elements
-            #
-            # => node.element?
-            #
-            # Secondly, ensure the node is what we actual want. We don't want <div>'s
-            # which are usualy used for placing inline advertisments or content specific
-            # only to that website
+            # Extract 'figure' outside the node check because in many cases it's
+            # nested within other HTML elements and it makes it harder to
+            # extract without being too specific
             #
-            # => WHITELIST[:default].include?(node.name)
+            # Do a double check because if the current node is in fact a figure,
+            # it will return false
             #
-            if node.element? && GDNewsScraper::Scrapers::PolygonCOM::WHITELIST[:default].include?(node.name)
-              case node.name
-              when 'figure'
+            figure = (node.name == 'figure' || node.at('figure.e-image'))
-                image = node.at('.e-image__image')
+            if figure
+              node.css('.e-image__image').each do |image|
                 image_url = image.attr('data-original')
                 if image_url.split('.').last == 'gif'
@@ -244,54 +241,72 @@ module GDNewsScraper::Scrapers
                 else
                   id = unique_id
-                  figure(article_body, id, node, image, image_url)
+                  article_body[:figures][id] = { }
-                  article_body[:body] << node.replace("{{figure:#{ id }}}").to_html
-                end
+                  article_body[:figures][id][:image] = image_url
+                  article_body[:figures][id][:title] = image.at('img').attr('title')
+                  article_body[:figures][id][:alt]   = image.at('img').attr('alt')
-              else
+                  image_meta = node.at('.e-image__meta')
-                node.children.each do |inner_node|
-                  case inner_node.name
-                  when 'a'
-                    id = unique_id
-                    article_body[:anchors][id] = {
-                      text: inner_node.children.text,
-                      url: inner_node.attr('href')
-                    }
+                  unless image_meta.nil?
+                    article_body[:figures][id][:caption] = strip(image_meta.at('figcaption'))
+                    article_body[:figures][id][:cite]    = strip(image_meta.at('cite'))
+                  end
-                    inner_node.replace("{{anchor:#{ id }}}")
-                  when 'figure'
-                    id = unique_id
+                  article_body[:body] << node.replace("{{figure:#{ id }}}").to_html
+                end
+              end
-                    image = node.at('.e-image__image')
-                    image_url = image.attr('data-original')
+              node.traverse { |children| children.remove }
+            end
-                    figure(article_body, id, node, image, image_url)
+            # First ensure the node is an actual element. This removes random HTML elements
+            #
+            # => node.element?
+            #
+            # Secondly, ensure the node is what we actual want. We don't want <div>'s
+            # which are usualy used for placing inline advertisments or content specific
+            # only to that website
+            #
+            # => WHITELIST[:default].include?(node.name)
+            #
+            if node.element? && GDNewsScraper::Scrapers::PolygonCOM::WHITELIST[:default].include?(node.name)
+              node.children.each do |inner_node|
+                case inner_node.name
+                when 'a'
+                  id = unique_id
+                  article_body[:anchors][id] = {
+                    text: inner_node.children.text,
+                    url: inner_node.attr('href')
+                  }
-                    node = node.replace("{{figure:#{ id }}}").to_html
-                    article_body[:body] << node
-                  end
+                  inner_node.replace("{{anchor:#{ id }}}")
                 end
+              end
-                begin
+              begin
-                  # Remove all attributes
-                  #
-                  parsed_node = node.xpath('.//@*').remove
+                # Remove all attributes
+                #
+                parsed_node = node.xpath('.//@*').remove
-                  # Return clean HTML, including HTML elements and text
-                  #
-                  parsed_node = node.to_html
+                # Check the integrity of the node before parsing it into html
+                # since 'content' is a Nokogiri feature
+                #
+                omit_node = node.content.empty?
-                rescue
+                # Return clean HTML, including HTML elements and text
+                #
+                parsed_node = node.to_html
-                end
-              end
+              rescue
-              article_body[:body] << parsed_node unless parsed_node.nil?
+              end
             end
+            article_body[:body] << parsed_node unless parsed_node.nil? || omit_node
           end
         end
@@ -301,18 +316,7 @@ module GDNewsScraper::Scrapers
       end
       def figure(article_body, id, node, image, image_url)
-        article_body[:figures][id] = { }
-        article_body[:figures][id][:image] = image_url
-        article_body[:figures][id][:title] = image.at('img').attr('title')
-        article_body[:figures][id][:alt]   = image.at('img').attr('alt')
-        image_meta = node.at('.e-image__meta')
-        unless image_meta.nil?
-          article_body[:figures][id][:caption] = strip(image_meta.at('figcaption'))
-          article_body[:figures][id][:cite]    = strip(image_meta.at('cite'))
-        end
       end
     private

data/lib/GDNewsScraper/version.rb CHANGED Viewed

@@ -1,5 +1,5 @@
 module GDNewsScraper
-  VERSION ||= '3.0.6'
+  VERSION ||= '3.0.7'
   # => major: A new Source has been added or removed
   # => minor: A Source code has changed drastically to a point where it's not
@@ -35,5 +35,7 @@ module GDNewsScraper
   #          having to go through the index page to get its metadata
   # v3.0.6 - Small refactor of the code which also improved parsing speed by
   #          about 10% on average! :)
+  # v3.0.7 - Changed the way figures are added to the articles which takes in
+  #          consideration deeply nested figures as well
   #
 end

metadata CHANGED Viewed

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: GDNewsScraper
 version: !ruby/object:Gem::Version
-  version: 3.0.6
+  version: 3.0.7
 platform: ruby
 authors:
 - Vlad Radulescu
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2017-11-30 00:00:00.000000000 Z
+date: 2017-12-01 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: nokogiri
@@ -52,6 +52,20 @@ dependencies:
     - - "~>"
       - !ruby/object:Gem::Version
         version: '10.0'
+- !ruby/object:Gem::Dependency
+  name: pry
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
 description: A Ruby Scraper created for games.directory to crawl the web for gaming
   News and Reviews.
 email: