RubyGems - GDNewsScraper - Versions diffs - 3.0.4 → 3.0.6 - Mend

GDNewsScraper 3.0.4 → 3.0.6

Files changed (5) hide show

checksums.yaml +4 -4
data/lib/GDNewsScraper/scrapers/polygon_com/news.rb +201 -142
data/lib/GDNewsScraper/string.rb +5 -0
data/lib/GDNewsScraper/version.rb +6 -2
metadata +3 -2

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: 692d674f129613f9b5fa1b379abf86c588d7f3f1
-  data.tar.gz: 3653b5992703ae9e7027e75b07ef0ed7818313a9
+  metadata.gz: c37e3bbf8420be9d2bf182d091eb5b8713e46679
+  data.tar.gz: c20ce1f2ee2b57757bd0d9d87b3bdc34fd751cf7
 SHA512:
-  metadata.gz: 580d4967034bed31b74e80b72993fda91614f6b8bb91c2c3f924590448e6f1c22c490ee6292722528434b619ed9f8a8c0625eb24c8dcaa767c4176adebebe184
-  data.tar.gz: fe10ea48908f0e012a14a78de2b340bb6ac21956475d7efdb4235a6d3bf0fc7e39e7a8a812c9cbb0e4844242d3b8ae63cff358b989e4a4c2fa6811e19e0105c6
+  metadata.gz: 74639942afca8966f6602c642a89cb31979e896a4656c8ed6dbf20d5e8764a642ab62f6a05566265b75531f685c542d1ff0daaa8ce0f474e3b882355a1f9afc3
+  data.tar.gz: 1783aa15b5339131027ecf0aa0d2776d4959664157cee1305eca2acf206e43551f9079e4a60a322fe7a99f95f1513c302d8e13cf9529c95a97aa6e2476215b49

data/lib/GDNewsScraper/scrapers/polygon_com/news.rb CHANGED Viewed

@@ -7,49 +7,31 @@ module GDNewsScraper::Scrapers
       "User-Agent" => "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36"
     }
-    STREAM_URI ||= 'https://www.polygon.com'
+    URL ||= 'https://www.polygon.com'
     WHITELIST ||= {
       default: ['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'figure', 'blockquote', 'ul', 'ol'],
       inner:   ['strong', 'em', 'li']
     }
-    DOM ||= {
-      article: {
-        wrapper:               '.c-compact-river',
-        container:             '.c-compact-river__entry',
-        inner_container:       '.c-entry-box--compact',
-        inner_container_video: '.c-entry-box--compact--video',
-        title:                 '.c-entry-box--compact__title',
-        cover:                 '.c-entry-box--compact__image',
-        meta:                  '.c-byline'
-      },
-      pagination: {
-        previous: '.c-pagination__prev',
-        info:     '.c-pagination__text',
-        next:     '.c-pagination__next'
-      }
-    }
     class News
       attr_accessor :stream
-      def initialize(offset = 0)
+      def initialize(offset = nil)
         unless offset.nil?
-          uri = "#{ GDNewsScraper::Scrapers::PolygonCOM::STREAM_URI }/news/archives/#{ offset }"
+          uri = "#{ GDNewsScraper::Scrapers::PolygonCOM::URL }/news/archives/#{ offset }"
           @page   = Nokogiri::HTML(open(uri, GDNewsScraper::Scrapers::PolygonCOM::HEADERS))
           @stream = Hash.new
           stream[:stream] = Hash.new
-          stream[:stream][:size]  = @page.css(GDNewsScraper::Scrapers::PolygonCOM::DOM[:pagination][:info]).text.split[0].gsub(/\D/, '').to_i
-          stream[:stream][:pages] = @page.css(GDNewsScraper::Scrapers::PolygonCOM::DOM[:pagination][:info]).text.split[6].gsub(/\D/, '').to_i
-          stream[:stream][:prev]  = @page.css(GDNewsScraper::Scrapers::PolygonCOM::DOM[:pagination][:previous])&.first&.attr('href')&.split('/')&.last.to_i
-          stream[:stream][:next]  = @page.css(GDNewsScraper::Scrapers::PolygonCOM::DOM[:pagination][:next])&.first&.attr('href')&.split('/')&.last.to_i
+          stream[:stream][:size]  = @page.at('.c-pagination__text').text.split.first.to_num
+          stream[:stream][:pages] = @page.at('.c-pagination__text').text.split.last.to_num
+          stream[:stream][:prev]  = @page.at('.c-pagination__prev')&.attr('href')&.split('/')&.last.to_i
+          stream[:stream][:next]  = @page.at('.c-pagination__next')&.attr('href')&.split('/')&.last.to_i
           stream[:feed] = Hash.new
-          stream[:feed][:url] = GDNewsScraper::Scrapers::PolygonCOM::STREAM_URI
+          stream[:feed][:url] = GDNewsScraper::Scrapers::PolygonCOM::URL
           stream[:feed][:source] = 'polygon'
           stream[:feed][:label] = 'Polygon'
@@ -60,8 +42,8 @@ module GDNewsScraper::Scrapers
       end
       def perform
-        @page.css(GDNewsScraper::Scrapers::PolygonCOM::DOM[:article][:container]).each do |article|
-          stream[:articles] << parse(article)
+        @page.css('.c-compact-river__entry').first(2).each do |article|
+          stream[:articles].push(parse(article))
         end
       end
@@ -72,38 +54,74 @@ module GDNewsScraper::Scrapers
       def parse(article)
         pulse = Hash.new
-        is_a_video = !article.at(GDNewsScraper::Scrapers::PolygonCOM::DOM[:article][:inner_container_video]).nil?
-        key   = article.css(GDNewsScraper::Scrapers::PolygonCOM::DOM[:article][:inner_container]).first.attr('data-chorus-optimize-id').to_i
-        url   = article.css(GDNewsScraper::Scrapers::PolygonCOM::DOM[:article][:title]).children.first.attr('href')
-        title = article.css(GDNewsScraper::Scrapers::PolygonCOM::DOM[:article][:title]).children.first.text
-        pulse[:id]   = key
-        pulse[:hash] = ::Base64.encode64("#{ title } - #{ key }")
-        begin
-          pulse[:cover] = article.children.css(GDNewsScraper::Scrapers::PolygonCOM::DOM[:article][:cover]).children.children.first.attr('src')
-        rescue
-          pulse[:cover] = nil
+        # This allows the Parser to get its data from the Index page, when the
+        # article is a Nokogiri::XML or from the Article page when the article
+        # is a URL.
+        #
+        # Passing a URL is mainly for debugging in case an Article fails to
+        # parse and should only be used as such.
+        #
+        if article.is_a?(String)
+          begin
+            article_page = Nokogiri::HTML(open(article, GDNewsScraper::Scrapers::PolygonCOM::HEADERS))
+            is_a_video = article_page.at('.c-video-embed').nil?
+            key    = article_page.at('span[data-content-admin-id]').attr('data-content-admin-id').to_i
+            url    = article
+            title  = strip(article_page.css('.c-entry-hero').at('.c-page-title'))
+            cover  = (is_a_video ? nil : article_page.css('.l-col__main').at('.e-image__image').attr('data-original'))
+            author = strip(article_page.css('.c-entry-hero').at('.c-byline').css('.c-byline__item > a').children[0])
+            begin
+              article_date = strip(article_page.css('.c-entry-hero').at('.c-byline').css('time.c-byline__item'))
+              parsed_date  = DateTime.parse(article_date)
+              date = parsed_date.to_time.to_i
+              # Never failed so not entirely sure what to rescue from, but with
+              # dates it's always risky not to rescue
+              #
+              # TODO: When it fails, find out why and rescue from that instead
+              #       of rescuing from 'everything' ..
+              #
+            rescue
+              date = nil
+            end
+          rescue TypeError
+            raise ArgumentError.new('Invalid URL')
+          end
+        elsif article.is_a?(Nokogiri::XML::Element)
+          is_a_video = !article.at('.c-entry-box--compact--video').nil?
+          key    = article.at('.c-entry-box--compact--article').attr('data-chorus-optimize-id').to_i
+          url    = article.at('.c-entry-box--compact__title').at('> a').attr('href')
+          title  = strip(article.at('.c-entry-box--compact__title'))
+          cover  = (article.at('.c-entry-box--compact__image').at('noscript').at('img').attr('src') rescue nil)
+          author = strip(article.at('.c-byline').css('.c-byline__item > a').children[0])
+          date   = JSON.parse(article.at('.c-byline').attr('data-cdata'))['timestamp'].to_i
+        else
+          raise ArgumentError.new("Make sure the 'article' argument is either a Hash containing the article's initial metadata or a String which is the article's URL")
         end
+        pulse[:id]      = key
+        pulse[:hash]    = ::Base64.encode64("#{ title } - #{ key }")
+        pulse[:cover]   = cover
         pulse[:url]     = url
         pulse[:title]   = title
-        pulse[:author]  = article.css(GDNewsScraper::Scrapers::PolygonCOM::DOM[:article][:meta]).first.children[1].children[1]&.text
-        pulse[:date]    = JSON.parse(article.css(GDNewsScraper::Scrapers::PolygonCOM::DOM[:article][:meta]).first.attr('data-cdata'))['timestamp'].to_i
+        pulse[:author]  = author
+        pulse[:date]    = date
         pulse[:content] = parse_article_body(url, is_a_video)
         pulse[:tags]    = title.downcase.split
         return pulse
       rescue => e
-        "There was a problem while parsing Article for '#{ title }' => #{ e }"
+        "There was a problem while parsing this Article: #{ e }"
       end
-    private
       def parse_article_body(article_url, is_a_video = false)
         article_page = Nokogiri::HTML(open(article_url, GDNewsScraper::Scrapers::PolygonCOM::HEADERS))
-        article_container = article_page.css('.c-entry-content')
+        article_container = article_page.at('.c-entry-content')
         article_body = {
           galleries: { },
@@ -118,7 +136,7 @@ module GDNewsScraper::Scrapers
         if is_a_video
           iframe = article_page.at('.c-video-embed--media').at('iframe')
-          iframe_id = random_string
+          iframe_id = unique_id
           article_body[:videos][iframe_id] = {}
           article_body[:videos][iframe_id][:url] = iframe.attr('src')
@@ -127,148 +145,189 @@ module GDNewsScraper::Scrapers
         end
         article_container.children.each do |node|
-          if node.name == 'div'
+          content    = node.content.strip.empty?
+          text       = node.text.strip.empty?
+          attributes = node.attributes.empty?
+          children   = node.children.empty?
-            # Check to see if the div contains a embeded video
-            #
-            iframe = node.at('iframe')
+          if content && text && attributes && children
+            node.remove
+          else
+            if node.name == 'div'
-            if iframe # YouTube videos
-              iframe_id = random_string
+              # Check to see if the div contains an embedded video
+              #
+              iframe = node.at('iframe')
-              article_body[:videos][iframe_id] = {}
-              article_body[:videos][iframe_id][:url] = iframe.attr('src')
+              if iframe # YouTube videos
+                iframe_id = unique_id
-              article_body[:body] << iframe.replace("{{video:#{ iframe_id }}}").to_html
-            end
+                article_body[:videos][iframe_id] = {}
+                article_body[:videos][iframe_id][:url] = iframe.attr('src')
-            # Check to see if the div contains a gallery
-            #
-            gallery = node.at('.c-image-gallery')
+                article_body[:body] << iframe.replace("{{video:#{ iframe_id }}}").to_html
+              end
-            if gallery
-              gallery_container = gallery.at('.c-image-gallery__thumbs-viewport')
+              # Check to see if the Article has a video by Polygon, which is
+              # embedded differently than a YouTube video.
+              #
+              polygon_video = node.attributes['data-volume-uuid']
+              unless polygon_video.nil?
+                id = unique_id
+                article_body[:videos][id] = {}
+                article_body[:videos][id][:label] = node.attr('data-analytics-label').split('|').first.strip
+                article_body[:videos][id][:url] = "https://volume.vox-cdn.com/embed/#{ node.attr('data-volume-uuid') }"
+                article_body[:body] << node.replace("{{video:#{ id }}}").to_html
+              end
+              # Check to see if the div contains a gallery
+              #
+              gallery = node.at('.c-image-gallery')
+              if gallery
+                gallery_container = gallery.at('.c-image-gallery__thumbs-viewport')
-              gallery_id = random_string
-              article_body[:galleries][gallery_id] = []
+                gallery_id = unique_id
+                article_body[:galleries][gallery_id] = []
-              gallery_container.children.children.each do |image_container|
-                image = image_container.at('a')
+                gallery_container.children.children.each do |image_container|
+                  image = image_container.at('a')
-                if image
-                  article_body[:galleries][gallery_id] << image.attr('href')
+                  if image
+                    article_body[:galleries][gallery_id] << image.attr('href')
+                  end
                 end
+                article_body[:body] << gallery.replace("{{gallery:#{ gallery_id }}}").to_html
               end
-              article_body[:body] << gallery.replace("{{gallery:#{ gallery_id }}}").to_html
-            end
+              twitdget = node.at('.twitter-tweet')
-            twitdget = node.at('.twitter-tweet')
+              if twitdget
+                article_body[:body] << twitdget.to_html
+              end
-            if twitdget
-              article_body[:body] << twitdget.to_html
+              redditget = node.at('.reddit-card')
+              if redditget
+                article_body[:body] << redditget.to_html
+              end
             end
-            redditget = node.at('.reddit-card')
+            # First ensure the node is an actual element. This removes random HTML elements
+            #
+            # => node.element?
+            #
+            # Secondly, ensure the node is what we actually want. We don't want <div>s,
+            # which are usually used for placing inline advertisements or content specific
+            # only to that website
+            #
+            # => WHITELIST[:default].include?(node.name)
+            #
+            if node.element? && GDNewsScraper::Scrapers::PolygonCOM::WHITELIST[:default].include?(node.name)
+              case node.name
+              when 'figure'
-            if redditget
-              article_body[:body] << redditget.to_html
-            end
-          end
+                image = node.at('.e-image__image')
+                image_url = image.attr('data-original')
-          # First ensure the node is an actual element. This removes random HTML elements
-          #
-          # => node.element?
-          #
-          # Secondly, ensure the node is what we actual want. We don't want <div>'s
-          # which are usualy used for placing inline advertisments or content specific
-          # only to that website
-          #
-          # => WHITELIST[:default].include?(node.name)
-          #
-          if node.element? && GDNewsScraper::Scrapers::PolygonCOM::WHITELIST[:default].include?(node.name)
-            case node.name
-            when 'figure'
-              image = node.css('.e-image__image').first
-              image_url = image.attr('data-original')
-              begin
                 if image_url.split('.').last == 'gif'
-                  image_id = random_string
+                  id = unique_id
-                  article_body[:images][image_id] = {}
-                  article_body[:images][image_id][:url] = image_url
+                  article_body[:images][id] = { }
+                  article_body[:images][id][:url] = image_url
-                  article_body[:body] << node.replace("{{image:#{ image_id }}}").to_html
+                  article_body[:body] << node.replace("{{image:#{ id }}}").to_html
                 else
-                  image_alt = image.children.at('img').attr('alt')
-                  image_title = image.children.at('img').attr('title')
+                  id = unique_id
+                  figure(article_body, id, node, image, image_url)
+                  article_body[:body] << node.replace("{{figure:#{ id }}}").to_html
+                end
+              else
-                  image_meta = node.css('.e-image__meta')
+                node.children.each do |inner_node|
+                  case inner_node.name
+                  when 'a'
+                    id = unique_id
+                    article_body[:anchors][id] = {
+                      text: inner_node.children.text,
+                      url: inner_node.attr('href')
+                    }
-                  figure_id = random_string
+                    inner_node.replace("{{anchor:#{ id }}}")
+                  when 'figure'
+                    id = unique_id
-                  article_body[:figures][figure_id] = {}
+                    image = node.at('.e-image__image')
+                    image_url = image.attr('data-original')
-                  article_body[:figures][figure_id][:image] = image_url
-                  article_body[:figures][figure_id][:title] = image_title
-                  article_body[:figures][figure_id][:alt]   = image_alt
+                    figure(article_body, id, node, image, image_url)
-                  unless image_meta.empty?
-                    article_body[:figures][figure_id][:caption] = image_meta.first.at('figcaption')&.text
-                    article_body[:figures][figure_id][:cite]    = image_meta.first.at('cite')&.text
+                    node = node.replace("{{figure:#{ id }}}").to_html
+                    article_body[:body] << node
                   end
-                  article_body[:body] << node.replace("{{figure:#{ figure_id }}}").to_html
                 end
-              rescue
-                raise 'Unknown format, please review.'
-              end
-            else
-              node.children.each do |url|
                 begin
-                  if url.name == 'a'
-                    url_id = random_string
-                    article_body[:anchors][url_id] = {
-                      text: url.children.text,
-                      url: url.attributes['href'].value
-                    }
-                    url.replace("{{anchor:#{ url_id }}}")
-                  end
+                  # Remove all attributes
+                  #
+                  parsed_node = node.xpath('.//@*').remove
+                  # Return clean HTML, including HTML elements and text
+                  #
+                  parsed_node = node.to_html
                 rescue
-                  raise 'Unknown format, please review.'
                 end
               end
-              # Remove all attributes
-              #
-              parsed_node = node.xpath('.//@*').remove
-              # Return clean HTML, including HTML elements and text
-              #
-              parsed_node = node.to_html
+              article_body[:body] << parsed_node unless parsed_node.nil?
             end
-            article_body[:body] << parsed_node
           end
         end
         return article_body
       rescue => e
-        "There was a problem while parsing the Article body for '#{ title }' => #{ e }"
+        "There was a problem while parsing this Article: #{ e }"
       end
+      def figure(article_body, id, node, image, image_url)
+        article_body[:figures][id] = { }
+        article_body[:figures][id][:image] = image_url
+        article_body[:figures][id][:title] = image.at('img').attr('title')
+        article_body[:figures][id][:alt]   = image.at('img').attr('alt')
+        image_meta = node.at('.e-image__meta')
+        unless image_meta.nil?
+          article_body[:figures][id][:caption] = strip(image_meta.at('figcaption'))
+          article_body[:figures][id][:cite]    = strip(image_meta.at('cite'))
+        end
+      end
+    private
       def attr(attribute)
         attributes&.fetch(attribute, nil)&.value
       end
-      def random_string
+      def strip(string)
+        string&.text&.strip
+      end
+      def unique_id
         (0...50).map { (65 + rand(25)).chr }.join.to_sym
       end
     end # News
   end # PolygonCOM
-end # GDNewsScraper::Scrapers
+end # GDNewsScraper::Scrapers

data/lib/GDNewsScraper/string.rb ADDED Viewed

@@ -0,0 +1,5 @@
+class String
+  def to_num
+    gsub(/\D/, '').to_i
+  end
+end

data/lib/GDNewsScraper/version.rb CHANGED Viewed

@@ -1,5 +1,5 @@
 module GDNewsScraper
-  VERSION ||= '3.0.4'
+  VERSION ||= '3.0.6'
   # => major: A new Source has been added or removed
   # => minor: A Source code has changed drastically to a point where it's not
@@ -31,5 +31,9 @@ module GDNewsScraper
   # v3.0.3 - Added a new method which will refresh the content of an Article
   # v3.0.4 - Fixed an issue caused by Featured Articles which have a different
   #          DOM structure
-  #
+  # v3.0.5 - Adds the possibility to parse an article from its URL rather than
+  #          having to go through the index page to get its metadata
+  # v3.0.6 - Small refactor of the code which also improved parsing speed by
+  #          about 10% on average! :)
+  #
 end

metadata CHANGED Viewed

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: GDNewsScraper
 version: !ruby/object:Gem::Version
-  version: 3.0.4
+  version: 3.0.6
 platform: ruby
 authors:
 - Vlad Radulescu
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2017-11-28 00:00:00.000000000 Z
+date: 2017-11-30 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: nokogiri
@@ -74,6 +74,7 @@ files:
 - lib/GDNewsScraper.rb
 - lib/GDNewsScraper/scrapers/polygon_com/news.rb
 - lib/GDNewsScraper/scrapers/polygon_com/reviews.rb
+- lib/GDNewsScraper/string.rb
 - lib/GDNewsScraper/version.rb
 homepage: https://github.com/games-directory/scraper
 licenses: