RubyGems - sitediff - Versions diffs - 0.0.1 → 1.0.0 - Mend

sitediff 0.0.1 → 1.0.0

Files changed (31) hide show

checksums.yaml +5 -5
data/bin/sitediff +10 -4
data/lib/sitediff.rb +179 -91
data/lib/sitediff/cache.rb +106 -0
data/lib/sitediff/cli.rb +391 -60
data/lib/sitediff/config.rb +383 -37
data/lib/sitediff/config/creator.rb +114 -0
data/lib/sitediff/config/preset.rb +75 -0
data/lib/sitediff/crawler.rb +131 -0
data/lib/sitediff/diff.rb +57 -12
data/lib/sitediff/exception.rb +5 -0
data/lib/sitediff/fetch.rb +76 -0
data/lib/sitediff/files/diff.html.erb +20 -2
data/lib/sitediff/files/jquery.min.js +2 -0
data/lib/sitediff/files/normalize.css +349 -0
data/lib/sitediff/files/report.html.erb +144 -0
data/lib/sitediff/files/sidebyside.html.erb +16 -0
data/lib/sitediff/files/sitediff.css +236 -29
data/lib/sitediff/files/sitediff.js +176 -0
data/lib/sitediff/report.rb +238 -0
data/lib/sitediff/result.rb +63 -26
data/lib/sitediff/sanitize.rb +160 -141
data/lib/sitediff/sanitize/dom_transform.rb +130 -0
data/lib/sitediff/sanitize/regexp.rb +82 -0
data/lib/sitediff/uriwrapper.rb +114 -35
data/lib/sitediff/webserver.rb +94 -0
data/lib/sitediff/webserver/resultserver.rb +134 -0
metadata +103 -43
data/lib/sitediff/files/html_report.html.erb +0 -47
data/lib/sitediff/util/cache.rb +0 -32
data/lib/sitediff/util/webserver.rb +0 -77

@@ -0,0 +1,238 @@
+# frozen_string_literal: true
+require 'fileutils'
+require 'json'
+require 'minitar'
+require 'sitediff'
+require 'sitediff/config'
+require 'zlib'
+class SiteDiff
+  ##
+  # SiteDiff Report Helper.
+  #
+  # Writes the artifacts of a comparison run into a report directory: the
+  # per-page diff files, the list of paths with diffs, an HTML or JSON
+  # summary, the settings file and, when exporting, a gzipped tar archive.
+  class Report
+    attr_reader :results, :cache
+    ##
+    # Directory where diffs will be generated.
+    DIFFS_DIR = 'diffs'
+    ##
+    # Name of file containing a list of pages with diffs.
+    FAILURES_FILE = 'failures.txt'
+    ##
+    # Name of file containing HTML report of diffs.
+    REPORT_FILE_HTML = 'report.html'
+    ##
+    # Name of file containing JSON report of diffs.
+    REPORT_FILE_JSON = 'report.json'
+    ##
+    # Name of file containing exported file archive.
+    REPORT_FILE_TAR = 'report.tgz'
+    ##
+    # Name of directory in which to build the portable report.
+    REPORT_BUILD_DIR = '_tmp_report'
+    ##
+    # Name of the portable report directory.
+    REPORT_DIR = 'report'
+    ##
+    # Path to settings used for report.
+    SETTINGS_FILE = 'settings.yaml'
+    ##
+    # Creates a Reporter object.
+    #
+    # @param [Config] config
+    #   Configuration; this class reads its before_url, after_url and export.
+    # @param [Cache] cache
+    #   Cache object passed on to Diff.generate_html.
+    # @param [Array] results
+    #   Result objects; this class uses their path, status, error, success?
+    #   and dump members.
+    def initialize(config, cache, results)
+      @config = config
+      @cache = cache
+      @results = results
+    end
+    ##
+    # Generates an HTML report.
+    #
+    # Also writes the diff files, the failures list and the settings file.
+    # When the configuration has export enabled, the report is packaged into
+    # an archive instead of being left in place.
+    #
+    # @param [String] dir
+    #   The directory in which the report is to be generated.
+    # @param [String] report_before
+    #   Value reported for the "before" site. Defaults to the configured
+    #   before URL.
+    # @param [String] report_after
+    #   Value reported for the "after" site. Defaults to the configured
+    #   after URL.
+    def generate_html(
+      dir,
+      report_before = nil,
+      report_after = nil
+    )
+      report_before ||= @config.before_url
+      report_after ||= @config.after_url
+      dir = SiteDiff.ensure_dir dir
+      write_diffs dir
+      write_failures dir
+      # Prepare report.
+      report = Diff.generate_html(
+        @results,
+        report_before,
+        report_after,
+        @cache,
+        @config.export
+      )
+      # Write report.
+      report_file = dir + REPORT_FILE_HTML
+      report_file.unlink if report_file.file?
+      report_file.open('w') { |f| f.write(report) }
+      write_settings dir, report_before, report_after
+      if @config.export
+        package_report(dir)
+      else
+        SiteDiff.log "Report generated to #{report_file.expand_path}"
+      end
+    end
+    ##
+    # Generates a JSON report.
+    #
+    # The report holds the number of compared paths, the number of paths
+    # that are not a success, and one entry (path, status, message) per path.
+    #
+    # @param dir
+    #   The directory in which the report is to be generated.
+    def generate_json(dir)
+      dir = SiteDiff.ensure_dir dir
+      write_diffs dir
+      write_failures dir
+      # Prepare report.
+      paths = @results.each_with_object({}) do |item, memo|
+        memo[item.path] = {
+          path: item.path,
+          status: item.status,
+          message: item.error
+        }
+      end
+      report = {
+        paths_compared: @results.length,
+        paths_diffs: @results.count { |item| !item.success? },
+        paths: paths
+      }
+      # Write report.
+      report_file = dir + REPORT_FILE_JSON
+      report_file.unlink if report_file.file?
+      report_file.open('w') { |f| f.write(JSON.generate(report)) }
+      # No before/after values are passed, so the settings store them as nil.
+      write_settings dir
+      SiteDiff.log "Report generated to #{report_file.expand_path}"
+    end
+    ##
+    # Package report for export.
+    #
+    # Moves the HTML report and the diffs directory into a temporary build
+    # directory, archives that as a gzipped tarball, moves the archive into
+    # the report directory and removes the build directory.
+    #
+    # @param [Pathname] dir
+    #   The directory containing the generated report.
+    def package_report(dir)
+      # Create temporary report directories.
+      temp_path = dir + REPORT_BUILD_DIR
+      temp_path.rmtree if temp_path.directory?
+      temp_path.mkpath
+      report_path = temp_path + REPORT_DIR
+      report_path.mkpath
+      files_path = report_path + 'files'
+      files_path.mkpath
+      diffs_path = dir + DIFFS_DIR
+      # Move files to place.
+      FileUtils.move(dir + REPORT_FILE_HTML, report_path)
+      FileUtils.move(diffs_path, files_path) if diffs_path.directory?
+      # Make tar file. Runs inside the build directory so that the archive
+      # is created there and REPORT_DIR is resolved relative to it.
+      # NOTE(review): the gzip stream is not closed here; presumably
+      # Minitar.pack closes the stream it is given - confirm.
+      Dir.chdir(temp_path) do
+        Minitar.pack(
+          REPORT_DIR,
+          Zlib::GzipWriter.new(File.open(REPORT_FILE_TAR, 'wb'))
+        )
+      end
+      FileUtils.move(temp_path + REPORT_FILE_TAR, dir)
+      temp_path.rmtree
+      SiteDiff.log "Archived report generated to #{dir.join(REPORT_FILE_TAR)}"
+    end
+    ##
+    # Creates diff files in a directory named "diffs".
+    #
+    # If "dir" is /foo/bar, then diffs will be placed in /foo/bar/diffs.
+    # Only results whose status is Result::STATUS_FAILURE are dumped.
+    #
+    # @param [Pathname] dir
+    #   The directory in which a "diffs" directory is to be generated.
+    # @raise [ArgumentError] if dir is not a Pathname.
+    def write_diffs(dir)
+      assert_pathname dir
+      # Delete existing "diffs" dir, if exists.
+      diff_dir = dir + DIFFS_DIR
+      diff_dir.rmtree if diff_dir.exist?
+      # Write diffs to the diff directory.
+      @results.each do |r|
+        r.dump(dir, @config.export) if r.status == Result::STATUS_FAILURE
+      end
+      SiteDiff.log "All diff files written to #{diff_dir.expand_path}" unless @config.export
+    end
+    ##
+    # Writes paths with diffs into a file.
+    #
+    # Every result that is not a success contributes one line.
+    #
+    # @param [Pathname] dir
+    #   The directory in which the report is to be generated.
+    # @raise [ArgumentError] if dir is not a Pathname.
+    def write_failures(dir)
+      assert_pathname dir
+      failures = dir + FAILURES_FILE
+      failures.open('w') do |f|
+        @results.each { |r| f.puts r.path unless r.success? }
+      end
+      # Log only once the file has actually been written.
+      SiteDiff.log "All failures written to #{failures.expand_path}"
+    end
+    ##
+    # Creates report settings.yaml file.
+    #
+    # TODO: Find a way to avoid having to create this file.
+    #
+    # @param [Pathname] dir
+    #   The directory in which the report is to be generated.
+    # @param report_before
+    #   Value stored under the "before" key.
+    # @param report_after
+    #   Value stored under the "after" key.
+    # @raise [ArgumentError] if dir is not a Pathname.
+    def write_settings(dir, report_before = nil, report_after = nil)
+      assert_pathname dir
+      settings = {
+        'before' => report_before,
+        'after' => report_after,
+        'cached' => %w[before after]
+      }
+      (dir + SETTINGS_FILE).open('w') { |f| YAML.dump(settings, f) }
+    end
+    ##
+    # Returns CSS for HTML report.
+    #
+    # Concatenates normalize.css and sitediff.css, in that order.
+    def self.css
+      %w[normalize.css sitediff.css]
+        .map { |name| File.read(File.join(SiteDiff::FILES_DIR, name)) }
+        .join
+    end
+    ##
+    # Returns JS for HTML report.
+    #
+    # Concatenates jquery.min.js and sitediff.js, in that order.
+    def self.js
+      %w[jquery.min.js sitediff.js]
+        .map { |name| File.read(File.join(SiteDiff::FILES_DIR, name)) }
+        .join
+    end
+    private
+    ##
+    # Guards the write_* methods against a non-Pathname directory.
+    #
+    # The previous inline check, "raise Exception 'dir must be a Pathname'",
+    # lacked a comma and therefore failed with NoMethodError.
+    #
+    # @raise [ArgumentError] if dir is not a Pathname.
+    def assert_pathname(dir)
+      raise ArgumentError, 'dir must be a Pathname' unless dir.is_a? Pathname
+    end
+  end
+end

data/lib/sitediff/result.rb CHANGED

@@ -1,73 +1,110 @@
-require 'fileutils'
+# frozen_string_literal: true
+require 'sitediff'
+require 'sitediff/diff'
+require 'sitediff/report'
 require 'digest/sha1'
+require 'fileutils'
 class SiteDiff
-  class Result < Struct.new(:path, :before, :after, :error)
+  # SiteDiff Result Object.
+  class Result < Struct.new(
+    :path,
+    :before,
+    :after,
+    :before_encoding,
+    :after_encoding,
+    :error,
+    :verbose
+  )
     STATUS_SUCCESS  = 0   # Identical before and after
     STATUS_FAILURE  = 1   # Different before and after
     STATUS_ERROR    = 2   # Couldn't fetch page
-    STATUS_TEXT = %w[success failure error]
+    STATUS_TEXT = %w[unchanged changed error].freeze
     attr_reader :status, :diff
+    ##
+    # Creates a Result.
     def initialize(*args)
       super
       if error
         @status = STATUS_ERROR
       else
-        @diff = Diff::html_diffy(before, after)
+        if !before_encoding || !after_encoding
+          @diff = Diff.binary_diffy(
+            before,
+            after,
+            before_encoding,
+            after_encoding
+          )
+        else
+          @diff = Diff.html_diffy(before, after)
+        end
         @status = @diff ? STATUS_FAILURE : STATUS_SUCCESS
       end
     end
+    ##
+    # Whether the result has no diff.
+    #
+    # If there is a diff, it is not a success.
+    #
+    # TODO: Change "Success" to unchanged.
     def success?
       status == STATUS_SUCCESS
     end
+    ##
+    # Whether the result has an error.
+    def error?
+      status == STATUS_ERROR
+    end
     # Textual representation of the status
     def status_text
-      return STATUS_TEXT[status]
+      STATUS_TEXT[status]
     end
     # Printable URL
-    def url(prefix)
-      prefix.to_s + path
+    def url(tag, prefix, cache)
+      base = cache.read_tags.include?(tag) ? "/cache/#{tag}" : prefix
+      base.to_s + path
     end
     # Filename to store diff
     def filename
-      File.join(SiteDiff::DIFFS_DIR, Digest::SHA1.hexdigest(self.path) + '.html')
+      File.join(Report::DIFFS_DIR, Digest::SHA1.hexdigest(path) + '.html')
     end
-    # Text of the link in the HTML report
-    def link
-      case status
-      when STATUS_ERROR then error
-      when STATUS_SUCCESS then status_text
-      when STATUS_FAILURE then "<a href='#{filename}'>DIFF</a>"
-      end
+    # Returns a URL to the result diff.
+    #
+    # Returns nil if the result has no diffs.
+    def diff_url(relative = false)
+      prefix = relative ? 'files/' : '/files/'
+      return prefix + filename if status == STATUS_FAILURE
     end
     # Log the result to the terminal
-    def log
+    def log(verbose = true)
       case status
-      when STATUS_SUCCESS then
-        SiteDiff::log path, :success, 'SUCCESS'
-      when STATUS_ERROR then
-        SiteDiff::log path, :error, "ERROR (#{error})"
-      when STATUS_FAILURE then
-        SiteDiff::log path, :failure, "FAILURE"
-        puts Diff::terminal_diffy(before, after)
+      when STATUS_SUCCESS
+        SiteDiff.log path, :success, 'UNCHANGED'
+      when STATUS_ERROR
+        SiteDiff.log path + " (#{error})", :warning, 'ERROR'
+      when STATUS_FAILURE
+        SiteDiff.log path, :error, 'CHANGED'
+        puts Diff.terminal_diffy(before, after) if verbose
       end
     end
     # Dump the result to a file
-    def dump(dir)
+    def dump(dir, relative = false)
       dump_path = File.join(dir, filename)
       base = File.dirname(dump_path)
-      FileUtils::mkdir_p(base) unless File.exists?(base)
+      FileUtils.mkdir_p(base) unless File.exist?(base)
       File.open(dump_path, 'w') do |f|
-        f.write(Diff::generate_diff_output(self))
+        f.write(Diff.generate_diff_output(self, relative))
       end
     end
   end

data/lib/sitediff/sanitize.rb CHANGED

@@ -1,104 +1,152 @@
+# frozen_string_literal: true
+require 'sitediff'
+require 'sitediff/exception'
+require 'sitediff/sanitize/dom_transform'
+require 'sitediff/sanitize/regexp'
 require 'nokogiri'
 require 'set'
 class SiteDiff
-  module Sanitize
-    class InvalidSanitization < Exception; end
+  # SiteDiff Sanitizer.
+  class Sanitizer
+    class InvalidSanitization < SiteDiffException; end
     TOOLS = {
-      :array => %w[dom_transform sanitization],
-      :scalar => %w[selector remove_spacing],
-    }
-    DOM_TRANSFORMS = Set.new(%w[remove unwrap_root unwrap remove_class])
-    module_function
-    # Performs dom transformations.
-    #
-    # Currently supported transforms:
-    #
-    #  * { :type => "unwrap_root" }
-    #  * { :type => "unwrap", :selector => "div.field-item" }
-    #  * { :type => "remove", :selector => "div.extra-stuff" }
-    #
-    #  @arg node - Nokogiri document or Node
-    #  @arg rules - array of dom_transform rules
-    #  @return - transformed Nokogiri document node
-    def perform_dom_transforms(node, rules)
-      rules.each do |rule|
-        type = rule['type'] or
-          raise InvalidSanitization, "DOM transform needs a type"
-        DOM_TRANSFORMS.include?(type) or
-          raise InvalidSanitization, "No DOM transform named #{type}"
-        meth = 'transform_' + type
-        if sels = rule['selector']
-          sels = [sels].flatten # Either array or scalar is fine
-          # Call method for each node the selectors find
-          sels.each do |sel|
-            node.css(sel).each { |e| send(meth, rule, e) }
-          end
-        else
-          send(meth, rule, node)
-        end
-      end
+      array: %w[dom_transform sanitization],
+      scalar: %w[selector remove_spacing ignore_whitespace]
+    }.freeze
+    DOM_TRANSFORMS = Set.new(%w[remove strip unwrap_root unwrap remove_class])
+    ##
+    # Creates a Sanitizer.
+    def initialize(html, config, opts = {})
+      @html = html
+      @config = config
+      @opts = opts
     end
-    def transform_remove(rule, el)
-      el.remove
-    end
-    def transform_unwrap(rule, el)
-      el.add_next_sibling(el.children)
-      el.remove
+    ##
+    # Performs sanitization.
+    def sanitize
+      return '' if @html == '' # Quick return on empty input
+      @node = Sanitizer.domify(@html)
+      @html = nil
+      remove_spacing
+      selector
+      dom_transforms
+      regexps
+      @html || Sanitizer.prettify(@node)
     end
-    def transform_remove_class(rule, el)
-      # Must call remove_class on a NodeSet!
-      ns = Nokogiri::XML::NodeSet.new(el.document, [el])
-      [rule['class']].flatten.each do |class_name|
-        ns.remove_class(class_name)
+    # Return whether or not we want to keep a rule
+    def want_rule(rule)
+      return false unless rule
+      return false if rule['disabled']
+      # Filter out if path regexp doesn't match
+      if (pathre = rule['path']) && (path = @opts[:path])
+        return ::Regexp.new(pathre).match(path)
       end
-    end
-    def transform_unwrap_root(rule, node)
-      node.children.size == 1 or
-        raise InvalidSanitization, "Multiple root elements in unwrap_root"
-      node.children = node.children[0].children
+      true
     end
-    def parse(str, force_doc = false, log_errors = false)
-      if force_doc || /<!DOCTYPE/.match(str[0, 512])
-        doc = Nokogiri::HTML(str)
-        doc
+    # Canonicalize a simple rule, eg: 'remove_spacing' or 'selector'.
+    # It may be a simple value, or a hash, or an array of hashes.
+    # Turn it into an array of hashes.
+    def canonicalize_rule(name)
+      (rules = @config[name]) || (return nil)
+      # Already an array? Do nothing.
+      if rules[0]&.respond_to?('each') && rules[0]&.fetch('value')
+      # If it is a hash, put it in an array.
+      elsif rules['value']
+        rules = [rules]
+      # If it is a scalar value, put it in an array.
       else
-        doc = Nokogiri::HTML.fragment(str)
+        rules = [{ 'value' => rules }]
       end
-      if log_errors
-        doc.errors.each do |e|
-          SiteDiff::log "Error in parsing HTML document: #{e}", :error
-        end
+      want = rules.select { |r| want_rule(r) }
+      return nil if want.empty?
+      raise "Too many matching rules of type #{name}" if want.size > 1
+      want.first
+    end
+    # Perform 'remove_spacing' action
+    def remove_spacing
+      (rule = canonicalize_rule('remove_spacing')) || return
+      Sanitizer.remove_node_spacing(@node) if rule['value']
+    end
+    # Perform 'selector' action, to choose a new root
+    def selector
+      (rule = canonicalize_rule('selector')) || return
+      @node = Sanitizer.select_fragments(@node, rule['value'])
+    end
+    # Applies regexps. Also converts the DOM node into an HTML string.
+    def regexps
+      (rules = @config['sanitization']) || return
+      rules = rules.select { |r| want_rule(r) }
+      rules.map! { |r| Regexp.create(r) }
+      selector, global = rules.partition(&:selector?)
+      selector.each { |r| r.apply(@node) }
+      @html = Sanitizer.prettify(@node)
+      @node = nil
+      # Prevent potential UTF-8 encoding errors by removing bytes
+      # Not the only solution. An alternative is to return the
+      # string unmodified.
+      @html = @html.encode(
+        'UTF-8',
+        'binary',
+        invalid: :replace,
+        undef: :replace,
+        replace: ''
+      )
+      global.each { |r| r.apply(@html) }
+    end
+    # Perform DOM transforms
+    def dom_transforms
+      (rules = @config['dom_transform']) || return
+      rules = rules.select { |r| want_rule(r) }
+      rules.each do |rule|
+        transform = DomTransform.create(rule)
+        transform.apply(@node)
       end
-      doc
     end
-    # Force this object to be a document, so we can apply a stylesheet
-    def to_document(obj)
-      if Nokogiri::XML::Document === obj
-        return obj
-      elsif Nokogiri::XML::Node === obj # or fragment
-        return parse(obj.to_s, true)
-        # This ought to work, and would be faster,
-        # but seems to segfault Nokogiri
-        # doc = Nokogiri::HTML('<html><body>')
-        # doc.at('body').children = obj.children
-        # return doc
-      else
-        return to_document(parse(obj))
+    ##### Implementations of actions #####
+    # Remove double-spacing inside text nodes
+    def self.remove_node_spacing(node)
+      # remove double spacing, but only inside text nodes (eg not attributes)
+      node.xpath('//text()').each do |el|
+        el.content = el.content.gsub(/  +/, ' ')
       end
     end
-    # Pretty-print the HTML
-    def prettify(obj)
+    # Get a fragment consisting of the elements matching the selector(s)
+    def self.select_fragments(node, sel)
+      # When we choose a new root, we always become a DocumentFragment,
+      # and lose any DOCTYPE and such.
+      ns = node.css(sel)
+      node = Nokogiri::HTML.fragment('') unless node.fragment?
+      node.children = ns
+      node
+    end
+    # Pretty-print some HTML
+    def self.prettify(obj)
       @stylesheet ||= begin
         stylesheet_path = File.join(SiteDiff::FILES_DIR, 'pretty_print.xsl')
         Nokogiri::XSLT(File.read(stylesheet_path))
@@ -109,10 +157,22 @@ class SiteDiff
       # but that tends to segfault Nokogiri
       str = @stylesheet.apply_to(to_document(obj))
+      # There's a lot of cruft left over that we don't want.
+      # Prevent potential UTF-8 encoding errors by removing invalid bytes.
+      # Not the only solution.
+      # An alternative is to return the string unmodified.
+      str = str.encode(
+        'UTF-8',
+        'binary',
+        invalid: :replace,
+        undef: :replace,
+        replace: ''
+      )
       # Remove xml declaration and <html> tags
       str.sub!(/\A<\?xml.*$\n/, '')
       str.sub!(/\A^<html>$\n/, '')
-      str.sub!(%r[</html>\n\Z], '')
+      str.sub!(%r{</html>\n\Z}, '')
       # Remove top-level indentation
       indent = /\A(\s*)/.match(str)[1].size
@@ -121,73 +181,32 @@ class SiteDiff
       # Remove blank lines
       str.gsub!(/^\s*$\n/, '')
-      return str
-    end
+      # Remove DOS newlines
+      str.gsub!(/\x0D$/, '')
+      str.gsub!(/&#13;$/, '')
-    def remove_spacing(doc)
-      # remove double spacing, but only inside text nodes (eg not attributes)
-      doc.xpath('//text()').each do |node|
-        node.content = node.content.gsub(/  +/, ' ')
-      end
-    end
-    # Do one regexp transformation on a string
-    def substitute(str, rule)
-      #FIXME escape forward slashes, right now we are escaping them in YAML!
-      str.gsub!(/#{rule['pattern']}/, rule['substitute'] || '' )
       str
     end
-    # Do all regexp sanitization rules
-    def perform_regexps(node, rules)
-      rules ||= []
-      # First do rules with a selector
-      rules.each do |rule|
-        if sel = rule['selector']
-          node.css(sel).each do |e|
-            e.replace(substitute(e.to_html, rule))
-          end
-        end
-      end
-      # If needed, do rules without a selector. We'd rather not convert to
-      # a string unless necessary.
-      global_rules = rules.reject { |r| r['selector'] }
-      return node if global_rules.empty?
-      str = node.to_html # Convert to string
-      global_rules.each { |r| substitute(str, r) }
-      return str
-    end
-    def select_root(node, sel)
-      return node unless sel
-      # When we choose a new root, we always become a DocumentFragment,
-      # and lose any DOCTYPE and such.
-      ns = node.css(sel)
-      unless node.fragment?
-        node = Nokogiri::HTML.fragment('')
+    # Parse HTML into a node
+    def self.domify(str, force_doc = false)
+      if force_doc || /<!DOCTYPE/.match(str[0, 512])
+        Nokogiri::HTML(str)
+      else
+        Nokogiri::HTML.fragment(str)
       end
-      node.children = ns
-      return node
     end
-    def sanitize(str, config)
-      return '' if str == ''
-      node = parse(str)
-      remove_spacing(node) if config['remove_spacing']
-      node = select_root(node, config['selector'])
-      if transform = config['dom_transform']
-        perform_dom_transforms(node, transform)
+    # Force this object to be a document, so we can apply a stylesheet
+    def self.to_document(obj)
+      if Nokogiri::XML::Document == obj.class || Nokogiri::HTML::Document == obj.class
+        obj
+      # node or fragment
+      elsif Nokogiri::XML::Node == obj.class || Nokogiri::HTML::DocumentFragment == obj.class
+        domify(obj.to_s, true)
+      else
+        to_document(domify(obj, false))
       end
-      obj = perform_regexps(node, config['sanitization'])
-      return prettify(obj)
     end
   end
 end