RubyGems - pageflow-chart - Versions diffs - 0.2.1 → 0.2.2 - Mend

pageflow-chart 0.2.1 → 0.2.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (8) hide show

checksums.yaml +4 -4
data/CHANGELOG.md +9 -0
data/app/jobs/pageflow/chart/scrape_site_job.rb +3 -2
data/lib/pageflow/chart/refresh_tag_following_downloader.rb +62 -0
data/lib/pageflow/chart/version.rb +1 -1
data/spec/jobs/pageflow/chart/scrape_site_job_spec.rb +3 -3
data/spec/pageflow/chart/refresh_tag_following_downloader_spec.rb +178 -0
metadata +4 -1

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: 11c6001c705a34c2d17a065441d9686e344d7599
-  data.tar.gz: 46f602eb70655ddd464c18eda8de6d609361d6b3
+  metadata.gz: 1d165bb8184706fe6b0de27586f90d10ea710380
+  data.tar.gz: 5c330166481ba0d5ebf61256dc0d0ca81c47e97c
 SHA512:
-  metadata.gz: 4bd30df75eb0b9604b04f4d7a7a4aabd713b622a171453f58107318acb2dc4fe1a86421c6ccabe958f4d9b04531c262e0a3839975934d43a98823ac4d904aba6
-  data.tar.gz: ae52e80fa81f6cbd3e94102874c6902a151b6c77b4b55bd0422db8c0b7595e91b993ad18b5164c467e3c5b455d2c3d722c27635c113535bd5db1d03a18ff6b1e
+  metadata.gz: 031a5819cda5782c92e801f1cc229ccf73be853df6c7cb0a5843f9abe6cdc1bcc36b7a4da10d8464ad6251a43228cb661dab057eb0f1f9eff568a51cab3755d3
+  data.tar.gz: 5aadd7313e897e65b0d834dc3e47f130724141612c3bf22af7862388e336337e9532f6e416b0b260fd9c7b3acfb3a2c50e948e296b41e4b9373dddfd705c18d5

data/CHANGELOG.md CHANGED Viewed

@@ -1,5 +1,14 @@
 # CHANGELOG
+### Version 0.2.2
+2017-07-12
+[Compare changes](https://github.com/codevise/pageflow-chart/compare/v0.2.1...v0.2.2)
+- Follow redirects from refresh meta tags
+  ([#37](https://github.com/codevise/pageflow-chart/pull/37))
 ### Version 0.2.1
 2017-07-12

data/app/jobs/pageflow/chart/scrape_site_job.rb CHANGED Viewed

@@ -11,7 +11,7 @@ module Pageflow
       end
       def perform(scraped_site)
-        downloader.load(scraped_site.url) do |file|
+        downloader.load_following_refresh_tags(scraped_site.url) do |file|
           scraper = Scraper.new(file.read, Chart.config.scraper_options)
           scraped_site.html_file = StringIOWithContentType.new(
             scraper.html,
@@ -42,7 +42,8 @@ module Pageflow
       def self.perform_with_result(scraped_site, options = {})
         # This is where the downloader passed to `initialize` is created.
-        new(Downloader.new(base_url: scraped_site.url)).perform(scraped_site)
+        new(RefreshTagFollowingDownloader.new(Downloader.new(base_url: scraped_site.url)))
+          .perform(scraped_site)
       end
       def begin_try_catch

data/lib/pageflow/chart/refresh_tag_following_downloader.rb ADDED Viewed

@@ -0,0 +1,62 @@
+require 'nokogiri'
+require 'uri'
+module Pageflow
+  module Chart
+    class RefreshTagFollowingDownloader < SimpleDelegator
+      MAX_REDIRECT_COUNT = 3
+      class TooManyRedirects < StandardError; end
+      class NoUrlInRefreshMetaTag < StandardError; end
+      def load_following_refresh_tags(url, redirect_count = 0, &block)
+        load(url) do |file|
+          if (redirect_url = find_refresh_meta_tag_url(file.read))
+            if redirect_count >= MAX_REDIRECT_COUNT
+              raise TooManyRedirects, 'Too many redirects via refresh meta tags.'
+            end
+            redirect_url = ensure_absolute(redirect_url, url)
+            return load_following_refresh_tags(redirect_url, redirect_count + 1, &block)
+          end
+          file.rewind
+          yield file if block_given?
+        end
+      end
+      private
+      def find_refresh_meta_tag_url(html)
+        tag = find_refresh_meta_tag(html)
+        extract_redirect_url(tag) if tag
+      end
+      def find_refresh_meta_tag(html)
+        document = Nokogiri::HTML(html)
+        document.at_css('head meta[http-equiv="REFRESH"]')
+      end
+      def extract_redirect_url(tag)
+        if tag[:content] && tag[:content] =~ /url=/
+          tag[:content].split('url=').last
+        else
+          raise NoUrlInRefreshMetaTag, "Could not extract url from #{tag}."
+        end
+      end
+      def ensure_absolute(url, context_url)
+        uri = URI(url)
+        context_uri = URI(context_url)
+        [
+          uri.scheme || context_uri.scheme,
+          '://',
+          uri.host || context_uri.host,
+          uri.path
+        ].join('')
+      end
+    end
+  end
+end

data/lib/pageflow/chart/version.rb CHANGED Viewed

@@ -1,5 +1,5 @@
 module Pageflow
   module Chart
-    VERSION = '0.2.1'.freeze
+    VERSION = '0.2.2'.freeze
   end
 end

data/spec/jobs/pageflow/chart/scrape_site_job_spec.rb CHANGED Viewed

@@ -5,14 +5,14 @@ module Pageflow
     describe ScrapeSiteJob do
       describe '#perform' do
         it 'scrapes html' do
-          scraper = double("Scraper", html: '<html>rewritten</html>')
-          downloader = double("Downloader", load: '<html>original</html>')
+          scraper = double('Scraper', html: '<html>rewritten</html>')
+          downloader = double('Downloader', load: '<html>original</html>')
           job = ScrapeSiteJob.new(downloader)
           scraped_site = create(:scraped_site, url: 'http://example.com')
           allow(Scraper).to receive(:new).and_return(scraper)
-          expect(downloader).to receive(:load).with('http://example.com')
+          expect(downloader).to receive(:load_following_refresh_tags).with('http://example.com')
           job.perform(scraped_site)
         end

data/spec/pageflow/chart/refresh_tag_following_downloader_spec.rb ADDED Viewed

@@ -0,0 +1,178 @@
+require 'spec_helper'
+module Pageflow
+  module Chart
+    describe RefreshTagFollowingDownloader do
+      describe '#load_following_refresh_tags' do
+        it 'delegates to downloader if no refresh meta tag is found' do
+          downloader = double(Downloader)
+          refresh_tag_following_downloader = RefreshTagFollowingDownloader.new(downloader)
+          original_url = 'http://datawrapper.dwcdn.net/HPKfl/2/'
+          chart_html = <<-HTML
+            <html><head><title>A chart</title></head></html>
+          HTML
+          result = ''
+          allow(downloader).to receive(:load)
+            .with(original_url)
+            .and_yield(StringIO.new(chart_html))
+          refresh_tag_following_downloader.load_following_refresh_tags(original_url) do |file|
+            result = file.read
+          end
+          expect(result).to eq(chart_html)
+        end
+        it 'looks for refresh meta tags and loads their url instead' do
+          downloader = double(Downloader)
+          refresh_tag_following_downloader = RefreshTagFollowingDownloader.new(downloader)
+          original_url = 'http://datawrapper.dwcdn.net/HPKfl/2/'
+          target_url = 'http://other.dwcdn.net/HPKfl/5/'
+          redirect_html = <<-HTML
+            <html><head><meta http-equiv="REFRESH" content="0; url=http://other.dwcdn.net/HPKfl/5/"></head></html>
+          HTML
+          chart_html = <<-HTML
+            <html><head><title>A chart</title></head></html>
+          HTML
+          result = ''
+          allow(downloader).to receive(:load)
+            .with(original_url)
+            .and_yield(StringIO.new(redirect_html))
+          allow(downloader).to receive(:load)
+            .with(target_url)
+            .and_yield(StringIO.new(chart_html))
+          refresh_tag_following_downloader.load_following_refresh_tags(original_url) do |file|
+            result = file.read
+          end
+          expect(result).to eq(chart_html)
+        end
+        it 'supports schema relative urls' do
+          downloader = double(Downloader)
+          refresh_tag_following_downloader = RefreshTagFollowingDownloader.new(downloader)
+          original_url = 'http://datawrapper.dwcdn.net/HPKfl/2/'
+          target_url = 'http://other.dwcdn.net/HPKfl/5/'
+          redirect_html = <<-HTML
+            <html><head><meta http-equiv="REFRESH" content="0; url=//other.dwcdn.net/HPKfl/5/"></head></html>
+          HTML
+          chart_html = <<-HTML
+            <html><head><title>A chart</title></head></html>
+          HTML
+          result = ''
+          allow(downloader).to receive(:load)
+            .with(original_url)
+            .and_yield(StringIO.new(redirect_html))
+          allow(downloader).to receive(:load)
+            .with(target_url)
+            .and_yield(StringIO.new(chart_html))
+          refresh_tag_following_downloader.load_following_refresh_tags(original_url) do |file|
+            result = file.read
+          end
+          expect(result).to eq(chart_html)
+        end
+        it 'supports relative urls' do
+          downloader = double(Downloader)
+          refresh_tag_following_downloader = RefreshTagFollowingDownloader.new(downloader)
+          original_url = 'http://datawrapper.dwcdn.net/HPKfl/2/'
+          target_url = 'http://datawrapper.dwcdn.net/HPKfl/5/'
+          redirect_html = <<-HTML
+            <html><head><meta http-equiv="REFRESH" content="0; url=/HPKfl/5/"></head></html>
+          HTML
+          chart_html = <<-HTML
+            <html><head><title>A chart</title></head></html>
+          HTML
+          result = ''
+          allow(downloader).to receive(:load)
+            .with(original_url)
+            .and_yield(StringIO.new(redirect_html))
+          allow(downloader).to receive(:load)
+            .with(target_url)
+            .and_yield(StringIO.new(chart_html))
+          refresh_tag_following_downloader.load_following_refresh_tags(original_url) do |file|
+            result = file.read
+          end
+          expect(result).to eq(chart_html)
+        end
+        it 'fails on too many redirects' do
+          downloader = double(Downloader)
+          refresh_tag_following_downloader = RefreshTagFollowingDownloader.new(downloader)
+          original_url = 'http://datawrapper.dwcdn.net/HPKfl/2/'
+          redirect_html = <<-HTML
+            <html><head><meta http-equiv="REFRESH" content="0; url=#{original_url}"></head></html>
+          HTML
+          allow(downloader).to receive(:load).with(original_url) do |&block|
+            block.call(StringIO.new(redirect_html))
+          end
+          expect {
+            refresh_tag_following_downloader.load_following_refresh_tags(original_url)
+          }.to raise_error(RefreshTagFollowingDownloader::TooManyRedirects)
+        end
+        it 'fails on invalid refresh meta tag' do
+          downloader = double(Downloader)
+          refresh_tag_following_downloader = RefreshTagFollowingDownloader.new(downloader)
+          original_url = 'http://datawrapper.dwcdn.net/HPKfl/2/'
+          redirect_html = <<-HTML
+            <html><head><meta http-equiv="REFRESH" content="something strange"></head></html>
+          HTML
+          allow(downloader).to receive(:load).with(original_url).and_yield(StringIO.new(redirect_html))
+          expect {
+            refresh_tag_following_downloader.load_following_refresh_tags(original_url)
+          }.to raise_error(RefreshTagFollowingDownloader::NoUrlInRefreshMetaTag)
+        end
+        it 'fails on refresh meta tag without content attribute' do
+          downloader = double(Downloader)
+          refresh_tag_following_downloader = RefreshTagFollowingDownloader.new(downloader)
+          original_url = 'http://datawrapper.dwcdn.net/HPKfl/2/'
+          redirect_html = <<-HTML
+            <html><head><meta http-equiv="REFRESH"></head></html>
+          HTML
+          allow(downloader).to receive(:load).with(original_url).and_yield(StringIO.new(redirect_html))
+          expect {
+            refresh_tag_following_downloader.load_following_refresh_tags(original_url)
+          }.to raise_error(RefreshTagFollowingDownloader::NoUrlInRefreshMetaTag)
+        end
+      end
+    end
+  end
+end

metadata CHANGED Viewed

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: pageflow-chart
 version: !ruby/object:Gem::Version
-  version: 0.2.1
+  version: 0.2.2
 platform: ruby
 authors:
 - Tim Fischbach
@@ -269,6 +269,7 @@ files:
 - lib/pageflow/chart/downloader.rb
 - lib/pageflow/chart/engine.rb
 - lib/pageflow/chart/page_type.rb
+- lib/pageflow/chart/refresh_tag_following_downloader.rb
 - lib/pageflow/chart/scraper.rb
 - lib/pageflow/chart/version.rb
 - spec/controllers/pageflow/chart/scraped_sites_controller_spec.rb
@@ -316,6 +317,7 @@ files:
 - spec/models/pageflow/chart/scraped_site_spec.rb
 - spec/pageflow/chart/configuration_spec.rb
 - spec/pageflow/chart/downloader_spec.rb
+- spec/pageflow/chart/refresh_tag_following_downloader_spec.rb
 - spec/pageflow/chart/scraper_spec.rb
 - spec/requests/scraping_site_spec.rb
 - spec/spec_helper.rb
@@ -394,6 +396,7 @@ test_files:
 - spec/models/pageflow/chart/scraped_site_spec.rb
 - spec/pageflow/chart/configuration_spec.rb
 - spec/pageflow/chart/downloader_spec.rb
+- spec/pageflow/chart/refresh_tag_following_downloader_spec.rb
 - spec/pageflow/chart/scraper_spec.rb
 - spec/requests/scraping_site_spec.rb
 - spec/spec_helper.rb