RubyGems - GDNewsScraper - Versions diffs - 3.0.7 → 3.0.9 - Mend

GDNewsScraper 3.0.7 → 3.0.9

Files changed (4) hide show

checksums.yaml +4 -4
data/lib/GDNewsScraper/scrapers/polygon_com/news.rb +78 -62
data/lib/GDNewsScraper/version.rb +24 -2
metadata +1 -1

checksums.yaml CHANGED

@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: 0ab7ee6a0e7e30c64e6ece2a156b9c94756c00a6
-  data.tar.gz: 30887530f6a43130209a98402fb0c32b647a5f77
+  metadata.gz: 4930eeeb0c78d881acc7a9fad32bac5e55d3743e
+  data.tar.gz: 11be82b0c24cd044485a71b863088892a1cb99ad
 SHA512:
-  metadata.gz: cefe85f767614e30d79caa71c50c43eb466e6b52359755ab0dd84491682c42fbff25ac362c181eb394ddb0590e9717fdf4b7b6d611f273caaf25595f584894a1
-  data.tar.gz: 35fec37b2669653852e1dd3dcd39d95bb0d5233b3d4bee92bb5004731bf9062a2ac3285ca35529350f3694cfad50c4c2502c470622ee4c5321779f4f6b5db7ae
+  metadata.gz: 8d1617126dfdbc603d5328c4027b45a63fe76a22618188a966f873b5e3566cf4894c8b2987d89880665e8e969c21eda2bf4c0daf3a4635894350241801801f7e
+  data.tar.gz: 1361f266a1816e9c588c2c68f8230d29476c7a41ef3820732eec23e8b91dfa625f884937184090297c7e7f61622e9053c449c0a7e31394644729c5f0e9400711

data/lib/GDNewsScraper/scrapers/polygon_com/news.rb CHANGED

@@ -1,3 +1,4 @@
+require 'pry'
 require 'base64'
 require 'json'
@@ -47,10 +48,6 @@ module GDNewsScraper::Scrapers
         end
       end
-      def refresh(article_url)
-        parse_article_body(article_url)
-      end
       def parse(article)
         pulse = Hash.new
@@ -65,7 +62,8 @@ module GDNewsScraper::Scrapers
           begin
             article_page = Nokogiri::HTML(open(article, GDNewsScraper::Scrapers::PolygonCOM::HEADERS))
-            is_a_video = article_page.at('.c-video-embed').nil?
+            first_element = article_page.at('.l-col__main').elements.first
+            is_a_video    = first_element.attr('class') == 'c-video-embed' || first_element.attr('class') == 'c-video-embed--media'
             key    = article_page.at('span[data-content-admin-id]').attr('data-content-admin-id').to_i
             url    = article
@@ -92,14 +90,20 @@ module GDNewsScraper::Scrapers
             raise ArgumentError.new('Invalid URL')
           end
         elsif article.is_a?(Nokogiri::XML::Element)
-          is_a_video = !article.at('.c-entry-box--compact--video').nil?
-          key    = article.at('.c-entry-box--compact--article').attr('data-chorus-optimize-id').to_i
-          url    = article.at('.c-entry-box--compact__title').at('> a').attr('href')
-          title  = strip(article.at('.c-entry-box--compact__title'))
-          cover  = (article.at('.c-entry-box--compact__image').at('noscript').at('img').attr('src') rescue nil)
-          author = strip(article.at('.c-byline').css('.c-byline__item > a').children[0])
-          date   = JSON.parse(article.at('.c-byline').attr('data-cdata'))['timestamp'].to_i
+          article_container = article.at('.c-entry-box--compact--article')
+          if article_container.nil?
+            raise StandardError.new('Not an Article, skipping..')
+          else
+            key    = article.at('.c-entry-box--compact--article').attr('data-chorus-optimize-id').to_i
+            url    = article.at('.c-entry-box--compact__title').at('> a').attr('href')
+            title  = strip(article.at('.c-entry-box--compact__title'))
+            cover  = (article.at('.c-entry-box--compact__image').at('noscript').at('img').attr('src') rescue nil)
+            author = strip(article.at('.c-byline').css('.c-byline__item > a').children[0])
+            date   = JSON.parse(article.at('.c-byline').attr('data-cdata'))['timestamp'].to_i
+            article_page = url
+          end
         else
           raise ArgumentError.new("Make sure the 'article' argument is either a Hash containing the article's initial metadata or a String which is the article's URL")
         end
@@ -111,22 +115,29 @@ module GDNewsScraper::Scrapers
         pulse[:title]   = title
         pulse[:author]  = author
         pulse[:date]    = date
-        pulse[:content] = parse_article_body(url, is_a_video)
+        pulse[:content] = parse_article_body(article_page)
         pulse[:tags]    = title.downcase.split
         return pulse
       rescue => e
-        "There was a problem while parsing this Article: #{ e }"
+        {
+          success: false,
+          message: "There was a problem while parsing this Article: #{ e }"
+        }
       end
-      def parse_article_body(article_url, is_a_video = false)
-        article_page = Nokogiri::HTML(open(article_url, GDNewsScraper::Scrapers::PolygonCOM::HEADERS))
+      def parse_article_body(article)
+        if article.is_a?(String)
+          article_page = Nokogiri::HTML(open(article, GDNewsScraper::Scrapers::PolygonCOM::HEADERS))
+        else
+          article_page = article
+        end
         article_container = article_page.at('.c-entry-content')
         article_body = {
           galleries: { },
           videos: { },
-          images: { },
           anchors: { },
           figures: { },
@@ -134,14 +145,28 @@ module GDNewsScraper::Scrapers
           body: [ ]
         }
+        # Check here as well since an Article CAN have an embedded video instead
+        # of a Cover and still show as a non-video article on the News page from
+        # where we initially took the 'is_a_video' check
+        #
+        first_element = article_page.at('.l-col__main').elements.first
+        is_a_video    = first_element.attr('class') == 'c-video-embed' || first_element.attr('class') == 'c-video-embed--media'
         if is_a_video
-          iframe = article_page.at('.c-video-embed--media').at('iframe')
-          iframe_id = unique_id
+          id = unique_id(first_element)
-          article_body[:videos][iframe_id] = {}
-          article_body[:videos][iframe_id][:url] = iframe.attr('src')
+          is_polygon_video = !first_element.attributes['data-volume-uuid'].nil?
-          article_body[:body] << iframe.replace("{{video:#{ iframe_id }}}").to_html
+          if is_polygon_video
+            article_body[:videos][id] = {}
+            article_body[:videos][id][:label] = first_element.attr('data-analytics-label')&.split('|')&.first&.strip
+            article_body[:videos][id][:url] = "https://volume.vox-cdn.com/embed/#{ first_element.attr('data-volume-uuid') }"
+          else
+            article_body[:videos][id] = {}
+            article_body[:videos][id][:url] = first_element.at('iframe').attr('src')
+          end
+          article_body[:body] << first_element.replace("{{video:#{ id }}}").to_html
         end
         article_container.children.each do |node|
@@ -161,12 +186,12 @@ module GDNewsScraper::Scrapers
               iframe = node.at('iframe')
               if iframe # YouTube videos
-                iframe_id = unique_id
+                id = unique_id(iframe)
-                article_body[:videos][iframe_id] = {}
-                article_body[:videos][iframe_id][:url] = iframe.attr('src')
+                article_body[:videos][id] = {}
+                article_body[:videos][id][:url] = iframe.attr('src')
-                article_body[:body] << iframe.replace("{{video:#{ iframe_id }}}").to_html
+                article_body[:body] << iframe.replace("{{video:#{ id }}}").to_html
               end
               # Check to see if the Article has a video by Polygon, which is
@@ -175,7 +200,7 @@ module GDNewsScraper::Scrapers
               polygon_video = node.attributes['data-volume-uuid']
               unless polygon_video.nil?
-                id = unique_id
+                id = unique_id(polygon_video)
                 article_body[:videos][id] = {}
                 article_body[:videos][id][:label] = node.attr('data-analytics-label').split('|').first.strip
@@ -191,18 +216,18 @@ module GDNewsScraper::Scrapers
               if gallery
                 gallery_container = gallery.at('.c-image-gallery__thumbs-viewport')
-                gallery_id = unique_id
-                article_body[:galleries][gallery_id] = []
+                id = unique_id(gallery)
+                article_body[:galleries][id] = []
                 gallery_container.children.children.each do |image_container|
                   image = image_container.at('a')
                   if image
-                    article_body[:galleries][gallery_id] << image.attr('href')
+                    article_body[:galleries][id] << image.attr('href')
                   end
                 end
-                article_body[:body] << gallery.replace("{{gallery:#{ gallery_id }}}").to_html
+                article_body[:body] << gallery.replace("{{gallery:#{ id }}}").to_html
               end
               twitdget = node.at('.twitter-tweet')
@@ -230,32 +255,23 @@ module GDNewsScraper::Scrapers
             if figure
               node.css('.e-image__image').each do |image|
                 image_url = image.attr('data-original')
+                id = unique_id(node)
+                article_body[:figures][id] = { }
+                article_body[:figures][id][:image] = image_url
-                if image_url.split('.').last == 'gif'
-                  id = unique_id
-                  article_body[:images][id] = { }
-                  article_body[:images][id][:url] = image_url
-                  article_body[:body] << node.replace("{{image:#{ id }}}").to_html
-                else
-                  id = unique_id
-                  article_body[:figures][id] = { }
-                  article_body[:figures][id][:image] = image_url
-                  article_body[:figures][id][:title] = image.at('img').attr('title')
-                  article_body[:figures][id][:alt]   = image.at('img').attr('alt')
+                article_body[:figures][id][:title] = image.at('img')&.attr('title')
+                article_body[:figures][id][:alt]   = image.at('img')&.attr('alt')
-                  image_meta = node.at('.e-image__meta')
+                image_meta = node.at('.e-image__meta')
-                  unless image_meta.nil?
-                    article_body[:figures][id][:caption] = strip(image_meta.at('figcaption'))
-                    article_body[:figures][id][:cite]    = strip(image_meta.at('cite'))
-                  end
-                  article_body[:body] << node.replace("{{figure:#{ id }}}").to_html
+                unless image_meta.nil?
+                  article_body[:figures][id][:caption] = strip(image_meta.at('figcaption'))
+                  article_body[:figures][id][:cite]    = strip(image_meta.at('cite'))
                 end
+                article_body[:body] << node.replace("{{figure:#{ id }}}").to_html
               end
               node.traverse { |children| children.remove }
@@ -275,7 +291,7 @@ module GDNewsScraper::Scrapers
               node.children.each do |inner_node|
                 case inner_node.name
                 when 'a'
-                  id = unique_id
+                  id = unique_id(inner_node)
                   article_body[:anchors][id] = {
                     text: inner_node.children.text,
@@ -312,11 +328,7 @@ module GDNewsScraper::Scrapers
         return article_body
       rescue => e
-        "There was a problem while parsing this Article: #{ e }"
-      end
-      def figure(article_body, id, node, image, image_url)
+        "There was a problem while parsing this Article's body: #{ e }"
       end
     private
@@ -329,8 +341,12 @@ module GDNewsScraper::Scrapers
         string&.text&.strip
       end
-      def unique_id
-        (0...50).map { (65 + rand(25)).chr }.join.to_sym
+      def unique_id(node)
+        Base64.strict_encode64(node.to_s)
+          .reverse
+          .gsub(/[^0-9A-Za-z]/, '')[0..100]
+          .downcase
+          .to_sym
       end
     end # News
   end # PolygonCOM

data/lib/GDNewsScraper/version.rb CHANGED

@@ -1,5 +1,5 @@
 module GDNewsScraper
-  VERSION ||= '3.0.7'
+  VERSION ||= '3.0.9'
   # => major: A new Source has been added or removed
   # => minor: A Source code has changed drastically to a point where it's not
@@ -37,5 +37,27 @@ module GDNewsScraper
   #          about 10% on average! :)
   # v3.0.7 - Changed the way figures are added to the articles which takes in
   #          consideration deeply nested figures as well
-  #
+  # v3.0.8 - Removed the refresh method since you can parse an article by
+  #          passing its URL
+  # v3.0.9
+  #
+  # - Generate truly unique strings using Base64.strict_encode64. This stops
+  # Rails thinking the Article has been changed even though it's the same. The
+  # previous unique_id method would regenerate the id every time the Article is
+  # requested
+  # - Identify whether or not the Article is a video when indexing the Article
+  # page rather than doing it when scraping the Articles page
+  # - Only account for the Video that's inside the Article, not any other video
+  # that might be on the page, by using 'node.at()', which returns only the
+  # first match, instead of 'node.css()'
+  # - Remove the 'is_a_video' argument when parsing the body since we're doing
+  # the check there from now on.
+  # Some articles that are of type video don't necessarily show as such when
+  # viewing them on the Articles page
+  # - Change the parameter that is passed to 'parse_article_body'. When we're
+  # parsing an article from a URL we don't need to re-request the page with
+  # Nokogiri
+  # - Increased the size of the unique_id to 100.
+  # Images have the same url prefix, where only the image name is different, as
+  # such, the unique_id was not that unique anymore..
 end

metadata CHANGED

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: GDNewsScraper
 version: !ruby/object:Gem::Version
-  version: 3.0.7
+  version: 3.0.9
 platform: ruby
 authors:
 - Vlad Radulescu