GDNewsScraper 3.0.4 → 3.0.6

This diff shows the changes between publicly released versions of this package, as they appear in their respective public registries. It is provided for informational purposes only.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: 692d674f129613f9b5fa1b379abf86c588d7f3f1
-  data.tar.gz: 3653b5992703ae9e7027e75b07ef0ed7818313a9
+  metadata.gz: c37e3bbf8420be9d2bf182d091eb5b8713e46679
+  data.tar.gz: c20ce1f2ee2b57757bd0d9d87b3bdc34fd751cf7
 SHA512:
-  metadata.gz: 580d4967034bed31b74e80b72993fda91614f6b8bb91c2c3f924590448e6f1c22c490ee6292722528434b619ed9f8a8c0625eb24c8dcaa767c4176adebebe184
-  data.tar.gz: fe10ea48908f0e012a14a78de2b340bb6ac21956475d7efdb4235a6d3bf0fc7e39e7a8a812c9cbb0e4844242d3b8ae63cff358b989e4a4c2fa6811e19e0105c6
+  metadata.gz: 74639942afca8966f6602c642a89cb31979e896a4656c8ed6dbf20d5e8764a642ab62f6a05566265b75531f685c542d1ff0daaa8ce0f474e3b882355a1f9afc3
+  data.tar.gz: 1783aa15b5339131027ecf0aa0d2776d4959664157cee1305eca2acf206e43551f9079e4a60a322fe7a99f95f1513c302d8e13cf9529c95a97aa6e2476215b49
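
To verify a downloaded copy of the gem against the new checksums above, the two hashed files can be pulled out of the .gem archive (it is a plain tar) and re-digested. A minimal Ruby sketch, assuming GDNewsScraper-3.0.6.gem has already been unpacked into the current directory:

require 'digest'

# metadata.gz and data.tar.gz are the standard members of a .gem archive;
# the output should match the SHA512 entries in checksums.yaml above.
%w[metadata.gz data.tar.gz].each do |file|
  puts "#{ file }: #{ Digest::SHA512.file(file).hexdigest }"
end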
lib/GDNewsScraper/scrapers/polygon_com/news.rb CHANGED
@@ -7,49 +7,31 @@ module GDNewsScraper::Scrapers
       "User-Agent" => "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36"
     }
 
-    STREAM_URI ||= 'https://www.polygon.com'
+    URL ||= 'https://www.polygon.com'
 
     WHITELIST ||= {
       default: ['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'figure', 'blockquote', 'ul', 'ol'],
       inner: ['strong', 'em', 'li']
     }
 
-    DOM ||= {
-      article: {
-        wrapper: '.c-compact-river',
-        container: '.c-compact-river__entry',
-        inner_container: '.c-entry-box--compact',
-        inner_container_video: '.c-entry-box--compact--video',
-        title: '.c-entry-box--compact__title',
-        cover: '.c-entry-box--compact__image',
-        meta: '.c-byline'
-      },
-
-      pagination: {
-        previous: '.c-pagination__prev',
-        info: '.c-pagination__text',
-        next: '.c-pagination__next'
-      }
-    }
-
     class News
       attr_accessor :stream
 
-      def initialize(offset = 0)
+      def initialize(offset = nil)
        unless offset.nil?
-          uri = "#{ GDNewsScraper::Scrapers::PolygonCOM::STREAM_URI }/news/archives/#{ offset }"
+          uri = "#{ GDNewsScraper::Scrapers::PolygonCOM::URL }/news/archives/#{ offset }"
 
          @page = Nokogiri::HTML(open(uri, GDNewsScraper::Scrapers::PolygonCOM::HEADERS))
          @stream = Hash.new
 
          stream[:stream] = Hash.new
-          stream[:stream][:size] = @page.css(GDNewsScraper::Scrapers::PolygonCOM::DOM[:pagination][:info]).text.split[0].gsub(/\D/, '').to_i
-          stream[:stream][:pages] = @page.css(GDNewsScraper::Scrapers::PolygonCOM::DOM[:pagination][:info]).text.split[6].gsub(/\D/, '').to_i
-          stream[:stream][:prev] = @page.css(GDNewsScraper::Scrapers::PolygonCOM::DOM[:pagination][:previous])&.first&.attr('href')&.split('/')&.last.to_i
-          stream[:stream][:next] = @page.css(GDNewsScraper::Scrapers::PolygonCOM::DOM[:pagination][:next])&.first&.attr('href')&.split('/')&.last.to_i
+          stream[:stream][:size] = @page.at('.c-pagination__text').text.split.first.to_num
+          stream[:stream][:pages] = @page.at('.c-pagination__text').text.split.last.to_num
+          stream[:stream][:prev] = @page.at('.c-pagination__prev')&.attr('href')&.split('/')&.last.to_i
+          stream[:stream][:next] = @page.at('.c-pagination__next')&.attr('href')&.split('/')&.last.to_i
 
          stream[:feed] = Hash.new
-          stream[:feed][:url] = GDNewsScraper::Scrapers::PolygonCOM::STREAM_URI
+          stream[:feed][:url] = GDNewsScraper::Scrapers::PolygonCOM::URL
          stream[:feed][:source] = 'polygon'
          stream[:feed][:label] = 'Polygon'
 
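As a rough usage sketch of the rewritten constructor (the pagination numbers come from whatever Polygon serves at the time, so the values shown are illustrative only):

require 'GDNewsScraper'

scraper = GDNewsScraper::Scrapers::PolygonCOM::News.new(1)

# offset = nil skips the network call entirely; with an offset, the stream
# hash is populated from the archive page:
scraper.stream[:stream] # => { size: 35797, pages: 894, prev: 0, next: 2 } (illustrative)
scraper.stream[:feed]   # => { url: 'https://www.polygon.com', source: 'polygon', label: 'Polygon' }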
@@ -60,8 +42,8 @@ module GDNewsScraper::Scrapers
       end
 
       def perform
-        @page.css(GDNewsScraper::Scrapers::PolygonCOM::DOM[:article][:container]).each do |article|
-          stream[:articles] << parse(article)
+        @page.css('.c-compact-river__entry').first(2).each do |article|
+          stream[:articles].push(parse(article))
        end
      end
 
@@ -72,38 +54,74 @@ module GDNewsScraper::Scrapers
       def parse(article)
        pulse = Hash.new
 
-        is_a_video = !article.at(GDNewsScraper::Scrapers::PolygonCOM::DOM[:article][:inner_container_video]).nil?
-
-        key = article.css(GDNewsScraper::Scrapers::PolygonCOM::DOM[:article][:inner_container]).first.attr('data-chorus-optimize-id').to_i
-        url = article.css(GDNewsScraper::Scrapers::PolygonCOM::DOM[:article][:title]).children.first.attr('href')
-        title = article.css(GDNewsScraper::Scrapers::PolygonCOM::DOM[:article][:title]).children.first.text
-
-        pulse[:id] = key
-        pulse[:hash] = ::Base64.encode64("#{ title } - #{ key }")
-
-        begin
-          pulse[:cover] = article.children.css(GDNewsScraper::Scrapers::PolygonCOM::DOM[:article][:cover]).children.children.first.attr('src')
-        rescue
-          pulse[:cover] = nil
+        # This allows the Parser to get its data from the Index page, when the
+        # article is a Nokogiri::XML::Element, or from the Article page when
+        # the article is a URL.
+        #
+        # Passing a URL is mainly for debugging in case an Article fails to
+        # parse, and it should only be used as such.
+        #
+        if article.is_a?(String)
+          begin
+            article_page = Nokogiri::HTML(open(article, GDNewsScraper::Scrapers::PolygonCOM::HEADERS))
+
+            is_a_video = article_page.at('.c-video-embed').nil?
+
+            key = article_page.at('span[data-content-admin-id]').attr('data-content-admin-id').to_i
+            url = article
+            title = strip(article_page.css('.c-entry-hero').at('.c-page-title'))
+            cover = (is_a_video ? nil : article_page.css('.l-col__main').at('.e-image__image').attr('data-original'))
+            author = strip(article_page.css('.c-entry-hero').at('.c-byline').css('.c-byline__item > a').children[0])
+
+            begin
+              article_date = strip(article_page.css('.c-entry-hero').at('.c-byline').css('time.c-byline__item'))
+              parsed_date = DateTime.parse(article_date)
+
+              date = parsed_date.to_time.to_i
+
+            # This has never failed so far, so it is not entirely clear what
+            # to rescue from, but with dates it is always risky not to rescue.
+            #
+            # TODO: When it fails, find out why and rescue from that instead
+            #       of rescuing from 'everything'.
+            #
+            rescue
+              date = nil
+            end
+          rescue TypeError
+            raise ArgumentError.new('Invalid URL')
+          end
+        elsif article.is_a?(Nokogiri::XML::Element)
+          is_a_video = !article.at('.c-entry-box--compact--video').nil?
+
+          key = article.at('.c-entry-box--compact--article').attr('data-chorus-optimize-id').to_i
+          url = article.at('.c-entry-box--compact__title').at('> a').attr('href')
+          title = strip(article.at('.c-entry-box--compact__title'))
+          cover = (article.at('.c-entry-box--compact__image').at('noscript').at('img').attr('src') rescue nil)
+          author = strip(article.at('.c-byline').css('.c-byline__item > a').children[0])
+          date = JSON.parse(article.at('.c-byline').attr('data-cdata'))['timestamp'].to_i
+        else
+          raise ArgumentError.new("Make sure the 'article' argument is either a Nokogiri::XML::Element containing the article's initial metadata or a String which is the article's URL")
        end
-
+
+        pulse[:id] = key
+        pulse[:hash] = ::Base64.encode64("#{ title } - #{ key }")
+        pulse[:cover] = cover
        pulse[:url] = url
        pulse[:title] = title
-        pulse[:author] = article.css(GDNewsScraper::Scrapers::PolygonCOM::DOM[:article][:meta]).first.children[1].children[1]&.text
-        pulse[:date] = JSON.parse(article.css(GDNewsScraper::Scrapers::PolygonCOM::DOM[:article][:meta]).first.attr('data-cdata'))['timestamp'].to_i
+        pulse[:author] = author
+        pulse[:date] = date
        pulse[:content] = parse_article_body(url, is_a_video)
        pulse[:tags] = title.downcase.split
 
        return pulse
      rescue => e
-        "There was a problem while parsing Article for '#{ title }' => #{ e }"
+        "There was a problem while parsing this Article: #{ e }"
      end
 
-      private
-
      def parse_article_body(article_url, is_a_video = false)
        article_page = Nokogiri::HTML(open(article_url, GDNewsScraper::Scrapers::PolygonCOM::HEADERS))
-        article_container = article_page.css('.c-entry-content')
+        article_container = article_page.at('.c-entry-content')
 
        article_body = {
          galleries: { },
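
The comment block in the hunk above describes the two inputs parse now accepts; a short sketch of the URL-based debugging path (the article URL is a placeholder):

scraper = GDNewsScraper::Scrapers::PolygonCOM::News.new(1)

# Normal path: perform feeds Nokogiri::XML::Element nodes from the index
# page into parse.
scraper.perform

# Debugging path: hand parse an article URL directly when an entry fails
# to parse from the index (placeholder URL).
scraper.parse('https://www.polygon.com/news/example-article')
# => a Hash with :id, :title, :content, etc., or an error String when
#    parsing failed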
@@ -118,7 +136,7 @@ module GDNewsScraper::Scrapers
 
        if is_a_video
          iframe = article_page.at('.c-video-embed--media').at('iframe')
-          iframe_id = random_string
+          iframe_id = unique_id
 
          article_body[:videos][iframe_id] = {}
          article_body[:videos][iframe_id][:url] = iframe.attr('src')
@@ -127,148 +145,189 @@ module GDNewsScraper::Scrapers
        end
 
        article_container.children.each do |node|
-          if node.name == 'div'
+          content = node.content.strip.empty?
+          text = node.text.strip.empty?
+          attributes = node.attributes.empty?
+          children = node.children.empty?
 
-            # Check to see if the div contains a embeded video
-            #
-            iframe = node.at('iframe')
+          if content && text && attributes && children
+            node.remove
+          else
+            if node.name == 'div'
 
-            if iframe # YouTube videos
-              iframe_id = random_string
+              # Check to see if the div contains an embedded video
+              #
+              iframe = node.at('iframe')
 
-              article_body[:videos][iframe_id] = {}
-              article_body[:videos][iframe_id][:url] = iframe.attr('src')
+              if iframe # YouTube videos
+                iframe_id = unique_id
 
-              article_body[:body] << iframe.replace("{{video:#{ iframe_id }}}").to_html
-            end
+                article_body[:videos][iframe_id] = {}
+                article_body[:videos][iframe_id][:url] = iframe.attr('src')
 
-            # Check to see if the div contains a gallery
-            #
-            gallery = node.at('.c-image-gallery')
+                article_body[:body] << iframe.replace("{{video:#{ iframe_id }}}").to_html
+              end
 
-            if gallery
-              gallery_container = gallery.at('.c-image-gallery__thumbs-viewport')
+              # Check to see if the Article has a video by Polygon, which is
+              # embedded differently from a YouTube video.
+              #
+              polygon_video = node.attributes['data-volume-uuid']
+
+              unless polygon_video.nil?
+                id = unique_id
+
+                article_body[:videos][id] = {}
+                article_body[:videos][id][:label] = node.attr('data-analytics-label').split('|').first.strip
+                article_body[:videos][id][:url] = "https://volume.vox-cdn.com/embed/#{ node.attr('data-volume-uuid') }"
+
+                article_body[:body] << node.replace("{{video:#{ id }}}").to_html
+              end
+
+              # Check to see if the div contains a gallery
+              #
+              gallery = node.at('.c-image-gallery')
+
+              if gallery
+                gallery_container = gallery.at('.c-image-gallery__thumbs-viewport')
 
-              gallery_id = random_string
-              article_body[:galleries][gallery_id] = []
+                gallery_id = unique_id
+                article_body[:galleries][gallery_id] = []
 
-              gallery_container.children.children.each do |image_container|
-                image = image_container.at('a')
+                gallery_container.children.children.each do |image_container|
+                  image = image_container.at('a')
 
-                if image
-                  article_body[:galleries][gallery_id] << image.attr('href')
+                  if image
+                    article_body[:galleries][gallery_id] << image.attr('href')
+                  end
                end
+
+                article_body[:body] << gallery.replace("{{gallery:#{ gallery_id }}}").to_html
              end
 
-              article_body[:body] << gallery.replace("{{gallery:#{ gallery_id }}}").to_html
-            end
+              twitdget = node.at('.twitter-tweet')
 
-            twitdget = node.at('.twitter-tweet')
+              if twitdget
+                article_body[:body] << twitdget.to_html
+              end
 
-            if twitdget
-              article_body[:body] << twitdget.to_html
+              redditget = node.at('.reddit-card')
+
+              if redditget
+                article_body[:body] << redditget.to_html
+              end
            end
 
-            redditget = node.at('.reddit-card')
+            # First ensure the node is an actual element. This removes random HTML elements
+            #
+            # => node.element?
+            #
+            # Secondly, ensure the node is what we actually want. We don't want <div>s,
+            # which are usually used for placing inline advertisements or content specific
+            # only to that website.
+            #
+            # => WHITELIST[:default].include?(node.name)
+            #
+            if node.element? && GDNewsScraper::Scrapers::PolygonCOM::WHITELIST[:default].include?(node.name)
+              case node.name
+              when 'figure'
 
-            if redditget
-              article_body[:body] << redditget.to_html
-            end
-          end
+                image = node.at('.e-image__image')
+                image_url = image.attr('data-original')
 
-          # First ensure the node is an actual element. This removes random HTML elements
-          #
-          # => node.element?
-          #
-          # Secondly, ensure the node is what we actual want. We don't want <div>'s
-          # which are usualy used for placing inline advertisments or content specific
-          # only to that website
-          #
-          # => WHITELIST[:default].include?(node.name)
-          #
-          if node.element? && GDNewsScraper::Scrapers::PolygonCOM::WHITELIST[:default].include?(node.name)
-            case node.name
-            when 'figure'
-
-              image = node.css('.e-image__image').first
-              image_url = image.attr('data-original')
-
-              begin
                if image_url.split('.').last == 'gif'
-                  image_id = random_string
+                  id = unique_id
 
-                  article_body[:images][image_id] = {}
-                  article_body[:images][image_id][:url] = image_url
+                  article_body[:images][id] = { }
+                  article_body[:images][id][:url] = image_url
 
-                  article_body[:body] << node.replace("{{image:#{ image_id }}}").to_html
+                  article_body[:body] << node.replace("{{image:#{ id }}}").to_html
                else
-                  image_alt = image.children.at('img').attr('alt')
-                  image_title = image.children.at('img').attr('title')
+                  id = unique_id
+
+                  figure(article_body, id, node, image, image_url)
+
+                  article_body[:body] << node.replace("{{figure:#{ id }}}").to_html
+                end
+
+              else
 
-                  image_meta = node.css('.e-image__meta')
+                node.children.each do |inner_node|
+                  case inner_node.name
+                  when 'a'
+                    id = unique_id
+
+                    article_body[:anchors][id] = {
+                      text: inner_node.children.text,
+                      url: inner_node.attr('href')
+                    }
 
-                  figure_id = random_string
+                    inner_node.replace("{{anchor:#{ id }}}")
+                  when 'figure'
+                    id = unique_id
 
-                  article_body[:figures][figure_id] = {}
+                    image = node.at('.e-image__image')
+                    image_url = image.attr('data-original')
 
-                  article_body[:figures][figure_id][:image] = image_url
-                  article_body[:figures][figure_id][:title] = image_title
-                  article_body[:figures][figure_id][:alt] = image_alt
+                    figure(article_body, id, node, image, image_url)
 
-                  unless image_meta.empty?
-                    article_body[:figures][figure_id][:caption] = image_meta.first.at('figcaption')&.text
-                    article_body[:figures][figure_id][:cite] = image_meta.first.at('cite')&.text
+                    node = node.replace("{{figure:#{ id }}}").to_html
+                    article_body[:body] << node
                  end
-
-                  article_body[:body] << node.replace("{{figure:#{ figure_id }}}").to_html
                end
-              rescue
-                raise 'Unknown format, please review.'
-              end
-            else
 
-              node.children.each do |url|
                begin
-                  if url.name == 'a'
-                    url_id = random_string
-
-                    article_body[:anchors][url_id] = {
-                      text: url.children.text,
-                      url: url.attributes['href'].value
-                    }
 
-                    url.replace("{{anchor:#{ url_id }}}")
-                  end
+                  # Remove all attributes
+                  #
+                  parsed_node = node.xpath('.//@*').remove
+
+                  # Return clean HTML, including HTML elements and text
+                  #
+                  parsed_node = node.to_html
+
                rescue
-                  raise 'Unknown format, please review.'
+
                end
              end
 
-              # Remove all attributes
-              #
-              parsed_node = node.xpath('.//@*').remove
-
-              # Return clean HTML, including HTML elements and text
-              #
-              parsed_node = node.to_html
+              article_body[:body] << parsed_node unless parsed_node.nil?
            end
-
-            article_body[:body] << parsed_node
          end
        end
 
        return article_body
      rescue => e
-        "There was a problem while parsing the Article body for '#{ title }' => #{ e }"
+        "There was a problem while parsing this Article: #{ e }"
      end
 
+      def figure(article_body, id, node, image, image_url)
+        article_body[:figures][id] = { }
+
+        article_body[:figures][id][:image] = image_url
+        article_body[:figures][id][:title] = image.at('img').attr('title')
+        article_body[:figures][id][:alt] = image.at('img').attr('alt')
+
+        image_meta = node.at('.e-image__meta')
+
+        unless image_meta.nil?
+          article_body[:figures][id][:caption] = strip(image_meta.at('figcaption'))
+          article_body[:figures][id][:cite] = strip(image_meta.at('cite'))
+        end
+      end
+
+      private
+
      def attr(attribute)
        attributes&.fetch(attribute, nil)&.value
      end
 
-      def random_string
+      def strip(string)
+        string&.text&.strip
+      end
+
+      def unique_id
        (0...50).map { (65 + rand(25)).chr }.join.to_sym
      end
    end # News
  end # PolygonCOM
-end # GDNewsScraper::Scrapers
+end # GDNewsScraper::Scrapers
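
Throughout parse_article_body, embeds are lifted out of the HTML and replaced with {{video:ID}}, {{gallery:ID}}, {{figure:ID}} and {{anchor:ID}} tokens whose data lands in the side tables of article_body. A minimal consumer-side sketch of re-hydrating those tokens (hydrate and the render_* helpers are hypothetical, not part of the gem):

def hydrate(article_body)
  article_body[:body].map do |fragment|
    # IDs generated by unique_id are Symbol keys, hence .to_sym here.
    fragment.gsub(/\{\{(\w+):(\w+)\}\}/) do
      kind = Regexp.last_match(1)
      id   = Regexp.last_match(2).to_sym

      case kind
      when 'figure'  then render_figure(article_body[:figures][id])
      when 'video'   then render_video(article_body[:videos][id])
      when 'gallery' then render_gallery(article_body[:galleries][id])
      when 'anchor'  then article_body[:anchors][id][:text]
      end
    end
  end.join
end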
lib/GDNewsScraper/string.rb ADDED
@@ -0,0 +1,5 @@
+class String
+  def to_num
+    gsub(/\D/, '').to_i
+  end
+end
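
The new String#to_num core extension backs the pagination parsing in news.rb; roughly:

require 'GDNewsScraper/string'

'35,797 stories'.split.first.to_num # => 35797
'894'.to_num                        # => 894
'no digits'.to_num                  # => 0 (gsub leaves '', and ''.to_i is 0)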
lib/GDNewsScraper/version.rb CHANGED
@@ -1,5 +1,5 @@
 module GDNewsScraper
-  VERSION ||= '3.0.4'
+  VERSION ||= '3.0.6'
 
  # => major: A new Source has been added or removed
  # => minor: A Source code has changed drastically to a point where it's not
@@ -31,5 +31,9 @@ module GDNewsScraper
  # v3.0.3 - Added a new method which will refresh the content of an Article
  # v3.0.4 - Fixed an issue caused by Featured Articles which have a different
  #          DOM structure
-  #
+  # v3.0.5 - Adds the possibility to parse an article from its URL rather than
+  #          having to go through the index page to get its metadata
+  # v3.0.6 - Small refactor of the code which also improved parsing speed by
+  #          about 10% on average! :)
+  #
 end
metadata CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: GDNewsScraper
 version: !ruby/object:Gem::Version
-  version: 3.0.4
+  version: 3.0.6
 platform: ruby
 authors:
 - Vlad Radulescu
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2017-11-28 00:00:00.000000000 Z
+date: 2017-11-30 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: nokogiri
@@ -74,6 +74,7 @@ files:
 - lib/GDNewsScraper.rb
 - lib/GDNewsScraper/scrapers/polygon_com/news.rb
 - lib/GDNewsScraper/scrapers/polygon_com/reviews.rb
+- lib/GDNewsScraper/string.rb
 - lib/GDNewsScraper/version.rb
 homepage: https://github.com/games-directory/scraper
 licenses: