RubyGems - pageflow-chart - Versions diffs - 2.2.0 → 2.3.0 - Mend

pageflow-chart 2.2.0 → 2.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (13) hide show

checksums.yaml +4 -4
data/CHANGELOG.md +6 -8
data/app/jobs/pageflow/chart/scrape_site_job.rb +10 -3
data/app/models/pageflow/chart/scraped_site.rb +3 -1
data/db/migrate/20200507141608_add_javascript_body_attachment_to_scraped_site.rb +5 -0
data/lib/pageflow/chart/configuration.rb +2 -0
data/lib/pageflow/chart/scraper.rb +25 -13
data/lib/pageflow/chart/version.rb +1 -1
data/spec/factories/scraped_sites.rb +1 -0
data/spec/fixtures/all_body.js +1 -0
data/spec/models/pageflow/chart/scraped_site_spec.rb +1 -1
data/spec/pageflow/chart/scraper_spec.rb +201 -63
metadata +5 -2

checksums.yaml CHANGED

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: c3834ff2c73f8f33f70e43de722ed33e5a763ebe078ac4345074b8c7e49ebf3d
-  data.tar.gz: ab502c12ff710d698df3418fd572f3153d6981b0526cd9a635adc2ab1f8cbe64
+  metadata.gz: cf67574bc3fc0e11ee66f37634eeab5b73ff6105ef6c281b022b24fad99fd854
+  data.tar.gz: 21cad9381bcc4c3cc312726b8b4b3b2e8c8a3f4b4f8ef54255848d348ed31a90
 SHA512:
-  metadata.gz: e326231888b882249d84824451b0449d598aca5b9c78f6842b20111ffe4c9c0b7079f3966cd0a654c4552cf59eb1c57442556c0cfe5370b194e46f1be1b12525
-  data.tar.gz: e14b7bacc6d778ca93764d48fc2f8fd5e484569cab89edf395848e478699fe0ab91ea6e0944315550aceae6e226428aa9bab678b0e9a9bb8887dfa5687b7c93d
+  metadata.gz: 0f778a73ae92747c4a43e80e8443edd74dc0ecd5056278195b623182108c905b3abef2f48833fb64e7b49d58b9f64fffffe7eaf854c9fb61a896bf57c37c4205
+  data.tar.gz: 15ec876b421dc8ba87d71ec9dc88d007cff39ec3658ce44e8c2bf702d75be6f1130506b8daec5d4ed0ebad453bf3726088d7b7aa2b8ea9ce645f6dc809206dc8

data/CHANGELOG.md CHANGED

@@ -1,16 +1,14 @@
 # CHANGELOG
-### Version 2.2.0
+### Version 2.3.0
-2019-11-04
+2020-05-11
-[Compare changes](https://github.com/codevise/pageflow-chart/compare/2-1-stable...v2.2.0)
+[Compare changes](https://github.com/codevise/pageflow-chart/compare/2-2-stable...v2.3.0)
-#### Manual Update Step
-- Turn scraped site into file type. Install migrations.
-  ([#55](https://github.com/codevise/pageflow-chart/pull/55))
+- Support Datawrapper charts with script tags in body
+  ([#56](https://github.com/codevise/pageflow-chart/pull/56))
 See
-[2-1-stable branch](https://github.com/codevise/pageflow-chart/blob/2-1-stable/CHANGELOG.md)
+[2-2-stable branch](https://github.com/codevise/pageflow-chart/blob/2-2-stable/CHANGELOG.md)
 for previous changes.

data/app/jobs/pageflow/chart/scrape_site_job.rb CHANGED

@@ -17,11 +17,18 @@ module Pageflow
             content_type: 'text/html'
           )
-          downloader.load_all(scraper.javascript_urls,
+          downloader.load_all(scraper.javascript_urls_in_head,
                               extension: '.js',
                               before_each: begin_try_catch,
-                              after_each: end_try_catch) do |javascript_file|
-            scraped_site.javascript_file = javascript_file
+                              after_each: end_try_catch) do |javascript_head_file|
+            scraped_site.javascript_file = javascript_head_file
+          end
+          downloader.load_all(scraper.javascript_urls_in_body,
+                              extension: '.js',
+                              before_each: begin_try_catch,
+                              after_each: end_try_catch) do |javascript_body_file|
+            scraped_site.javascript_body_file = javascript_body_file
           end
           downloader.load_all(scraper.stylesheet_urls,

data/app/models/pageflow/chart/scraped_site.rb CHANGED

@@ -4,11 +4,13 @@ module Pageflow
       include Pageflow::ReusableFile
       has_attached_file :javascript_file, Chart.config.paperclip_options(extension: 'js')
+      has_attached_file :javascript_body_file, Chart.config.paperclip_options(basename: 'all_body', extension: 'js')
       has_attached_file :stylesheet_file, Chart.config.paperclip_options(extension: 'css')
       has_attached_file :html_file, Chart.config.paperclip_options(extension: 'html')
       has_attached_file :csv_file, Chart.config.paperclip_options(basename: 'data', extension: 'csv')
       do_not_validate_attachment_file_type(:javascript_file)
+      do_not_validate_attachment_file_type(:javascript_body_file)
       do_not_validate_attachment_file_type(:stylesheet_file)
       do_not_validate_attachment_file_type(:html_file)
       do_not_validate_attachment_file_type(:csv_file)
@@ -85,7 +87,7 @@ module Pageflow
       end
       def attachments_for_export
-        [javascript_file, stylesheet_file, html_file, csv_file]
+        [javascript_file, javascript_body_file, stylesheet_file, html_file, csv_file]
       end
     end
   end

data/db/migrate/20200507141608_add_javascript_body_attachment_to_scraped_site.rb ADDED

@@ -0,0 +1,5 @@
+class AddJavascriptBodyAttachmentToScrapedSite < ActiveRecord::Migration[5.2]
+  def change
+    add_attachment :pageflow_chart_scraped_sites, :javascript_body_file
+  end
+end

data/lib/pageflow/chart/configuration.rb CHANGED

@@ -22,6 +22,7 @@ module Pageflow
       #
       # @param [Hash] opts
       # @option opts [Array<Regexp>] :head_script_blacklist Script tags in page head are ignored if they match any of this list of regexes.
+      # @option opts [Array<Regexp>] :body_script_blacklist Script tags in page body are ignored if they match any of this list of regexes.
       # @option opts [Array<Regexp>] :inline_script_blacklist Inline script tags are ignored if they match any of this list of regexes.
       # @option opts [Array<String>] :selector_blacklist HTML-elements matched by selectors in this list will not be scraped.
       # @return [Hash]
@@ -55,6 +56,7 @@ module Pageflow
       def initialize
         @scraper_options = {
           head_script_blacklist: [/piwik/],
+          body_script_blacklist: [/piwik/],
           inline_script_blacklist: [/piwik/],
           selector_blacklist: ['body .noscript']
         }

data/lib/pageflow/chart/scraper.rb CHANGED

@@ -3,7 +3,11 @@ require 'nokogiri'
 module Pageflow
   module Chart
     class Scraper
-      attr_reader :document, :options, :javascript_urls, :stylesheet_urls
+      attr_reader :document,
+                  :options,
+                  :javascript_urls_in_head,
+                  :javascript_urls_in_body,
+                  :stylesheet_urls
       def initialize(html, options = {})
         @document = Nokogiri::HTML(html)
@@ -23,14 +27,21 @@ module Pageflow
       private
       def parse
-        parse_javascript_urls
+        parse_javascript_urls(:head)
+        parse_javascript_urls(:body)
         parse_stylesheet_urls
       end
-      def parse_javascript_urls
-        @javascript_urls = filtered_script_tags_in_head.map do |tag|
+      def parse_javascript_urls(container)
+        script_tags = filtered_script_tags_in(container).map do |tag|
           tag[:src]
         end
+        if container.eql?(:head)
+          @javascript_urls_in_head = script_tags
+        else
+          @javascript_urls_in_body = script_tags
+        end
       end
       def parse_stylesheet_urls
@@ -42,7 +53,8 @@ module Pageflow
       def rewrite
         filter_inline_scripts
         filter_by_selectors
-        combine_script_tags_in_head
+        combine_script_tags_in(:head)
+        combine_script_tags_in(:body)
         combine_css_link_tags
       end
@@ -66,12 +78,12 @@ module Pageflow
         end
       end
-      def combine_script_tags_in_head
-        script_tags_to_remove = script_src_tags_in_head
+      def combine_script_tags_in(container)
+        script_tags_to_remove = script_src_tags_in(container)
         return if script_tags_to_remove.empty?
         all_script_src_tag = Nokogiri::XML::Node.new('script', document)
-        all_script_src_tag[:src] = 'all.js'
+        all_script_src_tag[:src] = container.eql?(:head) ? 'all.js' : 'all_body.js'
         all_script_src_tag[:type] = 'text/javascript'
         script_tags_to_remove
@@ -91,16 +103,16 @@ module Pageflow
         document.at_css('head') << all_css_link_tag
       end
-      def filtered_script_tags_in_head
-        script_src_tags_in_head.reject do |tag|
-          options.fetch(:head_script_blacklist, []).any? do |regexp|
+      def filtered_script_tags_in(container)
+        script_src_tags_in(container).reject do |tag|
+          options.fetch("#{container}_script_blacklist".to_sym, []).any? do |regexp|
             tag[:src] =~ regexp
           end
         end
       end
-      def script_src_tags_in_head
-        document.css('head script[src]')
+      def script_src_tags_in(container)
+        document.css("#{container} script[src]")
       end
       def css_link_tags

data/lib/pageflow/chart/version.rb CHANGED

@@ -1,5 +1,5 @@
 module Pageflow
   module Chart
-    VERSION = '2.2.0'.freeze
+    VERSION = '2.3.0'.freeze
   end
 end

data/spec/factories/scraped_sites.rb CHANGED

@@ -8,6 +8,7 @@ module Pageflow
           state { 'processed' }
           javascript_file { File.open(Engine.root.join('spec', 'fixtures', 'all.js')) }
+          javascript_body_file { File.open(Engine.root.join('spec', 'fixtures', 'all_body.js')) }
           stylesheet_file { File.open(Engine.root.join('spec', 'fixtures', 'all.css')) }
           html_file { File.open(Engine.root.join('spec', 'fixtures', 'index.html')) }
           csv_file { File.open(Engine.root.join('spec', 'fixtures', 'data.csv')) }

data/spec/fixtures/all_body.js ADDED

@@ -0,0 +1 @@
+var chart_body = {};

data/spec/models/pageflow/chart/scraped_site_spec.rb CHANGED

@@ -31,7 +31,7 @@ module Pageflow::Chart
       scraped_site = ScrapedSite.new(url: 'http://example.com/foo/index.html')
       expect(scraped_site.attachments_for_export.map(&:name))
-        .to eq(%i[javascript_file stylesheet_file html_file csv_file])
+        .to eq(%i[javascript_file javascript_body_file stylesheet_file html_file csv_file])
     end
     describe '#publish!' do

data/spec/pageflow/chart/scraper_spec.rb CHANGED

@@ -19,27 +19,67 @@ module Pageflow
           expect(scraper.html).to include('contents')
         end
-        it 'combines script tags in head' do
+        it 'filters blacklisted selectors' do
           html = <<-HTML
             <!DOCTYPE html>
             <html>
               <head>
-                <script type="text/javascript" src="/some.js"></script>
-                <script type="text/javascript" src="/other.js"></script>
+                <meta http-equiv="Content-Type" content="text/html; charset=utf-8">
               </head>
               <body>
+                <div id="bad" class="noscript"></div>
+                <div id="good"></div>
               </body>
             </html>
           HTML
-          scraper = Scraper.new(html)
+          scraper = Scraper.new(html, selector_blacklist: ['body .noscript'])
-          expect(HtmlFragment.new(scraper.html)).not_to have_tag('head script[src="/some.js"]')
-          expect(HtmlFragment.new(scraper.html)).to have_tag('head script[src="all.js"]')
+          expect(HtmlFragment.new(scraper.html)).to have_tag('body #good')
+          expect(HtmlFragment.new(scraper.html)).not_to have_tag('body #bad')
         end
-        it 'inserts script tag at position of first script src tag to keep position' \
+        describe 'stylesheets in head' do
+          it 'combines link tags in head' do
+            html = <<-HTML
+            <!DOCTYPE html>
+            <html>
+              <head>
+                <link rel="stylesheet" type="text/css" href="/some.css">
+                <link rel="stylesheet" type="text/css" href="/other.css">
+              </head>
+              <body>
+              </body>
+            </html>
+            HTML
+            scraper = Scraper.new(html)
+            expect(HtmlFragment.new(scraper.html)).not_to have_tag('head link[href="/some.css"]')
+            expect(HtmlFragment.new(scraper.html)).to have_tag('head link[href="all.css"]')
+          end
+        end
+        describe 'scripts in head' do
+          it 'combines script tags in head' do
+            html = <<-HTML
+            <!DOCTYPE html>
+            <html>
+              <head>
+                <script type="text/javascript" src="/some.js"></script>
+                <script type="text/javascript" src="/other.js"></script>
+              </head>
+              <body>
+              </body>
+            </html>
+            HTML
+            scraper = Scraper.new(html)
+            expect(HtmlFragment.new(scraper.html)).not_to have_tag('head script[src="/some.js"]')
+            expect(HtmlFragment.new(scraper.html)).to have_tag('head script[src="all.js"]')
+          end
+          it 'inserts script tag at position of first script src tag to keep position ' \
            'between inline scripts' do
-          html = <<-HTML
+            html = <<-HTML
             <!DOCTYPE html>
             <html>
               <head>
@@ -55,122 +95,220 @@ module Pageflow
               <body>
               </body>
             </html>
-          HTML
-          scraper = Scraper.new(html)
+            HTML
+            scraper = Scraper.new(html)
-          fragment = HtmlFragment.new(scraper.html)
+            fragment = HtmlFragment.new(scraper.html)
-          expect(fragment).to have_tags_in_order('head script#setup',
-                                                 'head script[src="all.js"]',
-                                                 'head script#usage')
+            expect(fragment).to have_tags_in_order('head script#setup',
+                                                   'head script[src="all.js"]',
+                                                   'head script#usage')
+          end
         end
-        it 'combines link tags in head' do
-          html = <<-HTML
+        describe 'scripts in body' do
+          it 'combines script tags in body' do
+            html = <<-HTML
             <!DOCTYPE html>
             <html>
               <head>
-                <link rel="stylesheet" type="text/css" href="/some.css">
-                <link rel="stylesheet" type="text/css" href="/other.css">
               </head>
               <body>
+                <script type="text/javascript" src="/some.js"></script>
+                <script type="text/javascript" src="/other.js"></script>
               </body>
             </html>
-          HTML
-          scraper = Scraper.new(html)
+            HTML
+            scraper = Scraper.new(html)
-          expect(HtmlFragment.new(scraper.html)).not_to have_tag('head link[href="/some.css"]')
-          expect(HtmlFragment.new(scraper.html)).to have_tag('head link[href="all.css"]')
-        end
+            expect(HtmlFragment.new(scraper.html)).not_to have_tag('body script[src="/some.js"]')
+            expect(HtmlFragment.new(scraper.html)).to have_tag('body script[src="all_body.js"]')
+          end
-        it 'filters blacklisted inline scripts' do
-          html = <<-HTML
+          it 'inserts script tag at position of first script src tag to keep position ' \
+           'between inline scripts' do
+            html = <<-HTML
             <!DOCTYPE html>
             <html>
               <head>
-                <meta http-equiv="Content-Type" content="text/html; charset=utf-8">
               </head>
               <body>
-                <script id="good">window.ok = true;</script>
-                <script id="bad">alert();</script>
+                <script id="setup">
+                  // Some setup required for scripts below to execute
+                </script>
+                <script type="text/javascript" src="/some.js"></script>
+                <script type="text/javascript" src="/other.js"></script>
+                <script id="usage">
+                  // Some script using stuff loading above
+                </script>
               </body>
             </html>
-          HTML
-          scraper = Scraper.new(html, inline_script_blacklist: [/alert/])
+            HTML
+            scraper = Scraper.new(html)
+            fragment = HtmlFragment.new(scraper.html)
+            expect(fragment).to have_tags_in_order('body script#setup',
+                                                   'body script[src="all_body.js"]',
+                                                   'body script#usage')
+          end
-          expect(HtmlFragment.new(scraper.html)).to have_tag('body script#good')
-          expect(HtmlFragment.new(scraper.html)).not_to have_tag('body script#bad')
+          it 'filters blacklisted inline scripts' do
+            html = <<-HTML
+              <!DOCTYPE html>
+              <html>
+                <head>
+                  <meta http-equiv="Content-Type" content="text/html; charset=utf-8">
+                </head>
+                <body>
+                  <script id="good">window.ok = true;</script>
+                  <script id="bad">alert();</script>
+                </body>
+              </html>
+            HTML
+            scraper = Scraper.new(html, inline_script_blacklist: [/alert/])
+            expect(HtmlFragment.new(scraper.html)).to have_tag('body script#good')
+            expect(HtmlFragment.new(scraper.html)).not_to have_tag('body script#bad')
+          end
         end
+      end
-        it 'filters blacklisted selectors' do
-          html = <<-HTML
+      describe '#javascript_urls' do
+        describe 'scripts in head' do
+          it 'returns list of urls to javascript files' do
+            html = <<-HTML
             <!DOCTYPE html>
             <html>
               <head>
-                <meta http-equiv="Content-Type" content="text/html; charset=utf-8">
+                <script type="text/javascript" src="/some.js"></script>
+                <script type="text/javascript" src="/other.js"></script>
               </head>
               <body>
-                <div id="bad" class="noscript"></div>
-                <div id="good"></div>
               </body>
             </html>
-          HTML
-          scraper = Scraper.new(html, selector_blacklist: ['body .noscript'])
+            HTML
+            scraper = Scraper.new(html)
-          expect(HtmlFragment.new(scraper.html)).to have_tag('body #good')
-          expect(HtmlFragment.new(scraper.html)).not_to have_tag('body #bad')
-        end
-      end
+            expect(scraper.javascript_urls_in_head).to eq(['/some.js', '/other.js'])
+          end
-      describe '#javascript_urls' do
-        it 'returns list of urls to javascript files' do
-          html = <<-HTML
+          it 'filters by blacklist' do
+            html = <<-HTML
             <!DOCTYPE html>
             <html>
               <head>
                 <script type="text/javascript" src="/some.js"></script>
-                <script type="text/javascript" src="/other.js"></script>
+                <script type="text/javascript" src="http://example.com/piwik.js"></script>
               </head>
               <body>
               </body>
             </html>
-          HTML
-          scraper = Scraper.new(html)
+            HTML
+            scraper = Scraper.new(html, head_script_blacklist: [/piwik/])
+            expect(scraper.javascript_urls_in_head).to eq(['/some.js'])
+          end
-          expect(scraper.javascript_urls).to eq(['/some.js', '/other.js'])
+          it 'ignores inline scripts in head' do
+            html = <<-HTML
+            <!DOCTYPE html>
+            <html>
+              <head>
+                <script type="text/javascript"></script>
+              </head>
+              <body>
+              </body>
+            </html>
+            HTML
+            scraper = Scraper.new(html)
+            expect(scraper.javascript_urls_in_head).to eq([])
+          end
+          it 'ignores scripts in body' do
+            html = <<-HTML
+            <!DOCTYPE html>
+            <html>
+              <head>
+              </head>
+              <body>
+                <script type="text/javascript" src="/some.js"></script>
+              </body>
+            </html>
+            HTML
+            scraper = Scraper.new(html)
+            expect(scraper.javascript_urls_in_head).to eq([])
+          end
         end
-        it 'filters by blacklist' do
-          html = <<-HTML
+        describe 'scripts in body' do
+          it 'ignores scripts in head' do
+            html = <<-HTML
             <!DOCTYPE html>
             <html>
               <head>
                 <script type="text/javascript" src="/some.js"></script>
-                <script type="text/javascript" src="http://example.com/piwik.js"></script>
               </head>
               <body>
               </body>
             </html>
-          HTML
-          scraper = Scraper.new(html, head_script_blacklist: [/piwik/])
+            HTML
+            scraper = Scraper.new(html)
-          expect(scraper.javascript_urls).to eq(['/some.js'])
-        end
+            expect(scraper.javascript_urls_in_body).to eq([])
+          end
-        it 'ignores inline scripts in head' do
-          html = <<-HTML
+          it 'returns list of urls to javascript files' do
+            html = <<-HTML
             <!DOCTYPE html>
             <html>
               <head>
-                <script type="text/javascript"></script>
               </head>
               <body>
+                <script type="text/javascript" src="/some.js"></script>
+                <script type="text/javascript" src="/other.js"></script>
               </body>
             </html>
-          HTML
-          scraper = Scraper.new(html)
+            HTML
+            scraper = Scraper.new(html)
+            expect(scraper.javascript_urls_in_body).to eq(['/some.js', '/other.js'])
+          end
+          it 'filters by blacklist' do
+            html = <<-HTML
+            <!DOCTYPE html>
+            <html>
+              <head>
+              </head>
+              <body>
+                <script type="text/javascript" src="/some.js"></script>
+                <script type="text/javascript" src="http://example.com/piwik.js"></script>
+              </body>
+            </html>
+            HTML
+            scraper = Scraper.new(html, body_script_blacklist: [/piwik/])
+            expect(scraper.javascript_urls_in_body).to eq(['/some.js'])
+          end
+          it 'ignores inline scripts in body' do
+            html = <<-HTML
+            <!DOCTYPE html>
+            <html>
+              <head>
+              </head>
+              <body>
+                <script type="text/javascript"></script>
+              </body>
+            </html>
+            HTML
+            scraper = Scraper.new(html)
-          expect(scraper.javascript_urls).to eq([])
+            expect(scraper.javascript_urls_in_body).to eq([])
+          end
         end
       end

metadata CHANGED

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: pageflow-chart
 version: !ruby/object:Gem::Version
-  version: 2.2.0
+  version: 2.3.0
 platform: ruby
 authors:
 - Codevise Solutions Ltd
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2019-11-04 00:00:00.000000000 Z
+date: 2020-05-11 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: pageflow
@@ -209,6 +209,7 @@ files:
 - db/migrate/20160211085234_add_use_custom_theme_to_scraped_sites.rb
 - db/migrate/20190531141820_add_file_attributes_to_scraped_sites.rb
 - db/migrate/20190531145431_insert_file_usages_for_scraped_sites.rb
+- db/migrate/20200507141608_add_javascript_body_attachment_to_scraped_site.rb
 - lib/generators/pageflow_chart/install/install_generator.rb
 - lib/pageflow/chart.rb
 - lib/pageflow/chart/configuration.rb
@@ -221,6 +222,7 @@ files:
 - spec/factories/scraped_sites.rb
 - spec/fixtures/all.css
 - spec/fixtures/all.js
+- spec/fixtures/all_body.js
 - spec/fixtures/data.csv
 - spec/fixtures/datawrapper.html
 - spec/fixtures/index.html
@@ -266,6 +268,7 @@ test_files:
 - spec/factories/scraped_sites.rb
 - spec/fixtures/all.css
 - spec/fixtures/all.js
+- spec/fixtures/all_body.js
 - spec/fixtures/data.csv
 - spec/fixtures/datawrapper.html
 - spec/fixtures/index.html