RubyGems - sitediff - Versions diffs - 0.0.2 → 1.1.1 - Mend

sitediff 0.0.2 → 1.1.1

Files changed (32) hide show

checksums.yaml +7 -0
data/bin/sitediff +9 -3
data/lib/sitediff.rb +153 -79
data/lib/sitediff/api.rb +265 -0
data/lib/sitediff/cache.rb +110 -47
data/lib/sitediff/cli.rb +219 -165
data/lib/sitediff/config.rb +439 -58
data/lib/sitediff/config/creator.rb +93 -99
data/lib/sitediff/config/preset.rb +75 -0
data/lib/sitediff/crawler.rb +108 -72
data/lib/sitediff/diff.rb +60 -12
data/lib/sitediff/exception.rb +3 -1
data/lib/sitediff/fetch.rb +62 -41
data/lib/sitediff/files/diff.html.erb +20 -2
data/lib/sitediff/files/jquery.min.js +2 -0
data/lib/sitediff/files/normalize.css +349 -0
data/lib/sitediff/files/report.html.erb +171 -0
data/lib/sitediff/files/sidebyside.html.erb +5 -2
data/lib/sitediff/files/sitediff.css +303 -30
data/lib/sitediff/files/sitediff.js +367 -0
data/lib/sitediff/report.rb +254 -0
data/lib/sitediff/result.rb +59 -23
data/lib/sitediff/sanitize.rb +222 -150
data/lib/sitediff/sanitize/dom_transform.rb +111 -73
data/lib/sitediff/sanitize/regexp.rb +69 -43
data/lib/sitediff/uriwrapper.rb +104 -34
data/lib/sitediff/webserver.rb +89 -77
data/lib/sitediff/webserver/resultserver.rb +113 -77
metadata +92 -76
data/lib/sitediff/files/html_report.html.erb +0 -63
data/lib/sitediff/files/rules/drupal.yaml +0 -33
data/lib/sitediff/rules.rb +0 -65

@@ -1,76 +1,112 @@
+# frozen_string_literal: true
 require 'sitediff'
 require 'sitediff/diff'
+require 'sitediff/report'
 require 'digest/sha1'
 require 'fileutils'
 class SiteDiff
-  class Result < Struct.new(:path, :before, :after, :error, :verbose)
+  # SiteDiff Result Object.
+  class Result < Struct.new(
+    :path,
+    :before,
+    :after,
+    :before_encoding,
+    :after_encoding,
+    :error,
+    :verbose
+  )
     STATUS_SUCCESS  = 0   # Identical before and after
     STATUS_FAILURE  = 1   # Different before and after
     STATUS_ERROR    = 2   # Couldn't fetch page
-    STATUS_TEXT = %w[success failure error]
+    STATUS_TEXT = %w[unchanged changed error].freeze
     attr_reader :status, :diff
+    ##
+    # Creates a Result.
     def initialize(*args)
       super
       if error
         @status = STATUS_ERROR
       else
-        @diff = Diff::html_diffy(before, after)
+        if !before_encoding || !after_encoding
+          @diff = Diff.binary_diffy(
+            before,
+            after,
+            before_encoding,
+            after_encoding
+          )
+        else
+          @diff = Diff.html_diffy(before, after)
+        end
         @status = @diff ? STATUS_FAILURE : STATUS_SUCCESS
       end
     end
+    ##
+    # Whether the result has no diff.
+    #
+    # If there is a diff, it is not a success.
+    #
+    # TODO: Change "Success" to unchanged.
     def success?
       status == STATUS_SUCCESS
     end
+    ##
+    # Whether the result has an error.
+    def error?
+      status == STATUS_ERROR
+    end
     # Textual representation of the status
     def status_text
-      return STATUS_TEXT[status]
+      STATUS_TEXT[status]
     end
     # Printable URL
     def url(tag, prefix, cache)
+      return unless prefix
       base = cache.read_tags.include?(tag) ? "/cache/#{tag}" : prefix
       base.to_s + path
     end
     # Filename to store diff
     def filename
-      File.join(SiteDiff::DIFFS_DIR, Digest::SHA1.hexdigest(self.path) + '.html')
+      File.join(Report::DIFFS_DIR, Digest::SHA1.hexdigest(path) + '.html')
     end
-    # Text of the link in the HTML report
-    def link
-      case status
-      when STATUS_ERROR then error
-      when STATUS_SUCCESS then status_text
-      when STATUS_FAILURE then "<a href='#{filename}'>DIFF</a>"
-      end
+    # Returns a URL to the result diff.
+    #
+    # Returns nil if the result has no diffs.
+    def diff_url(relative = false)
+      prefix = relative ? 'files/' : '/files/'
+      return prefix + filename if status == STATUS_FAILURE
     end
     # Log the result to the terminal
-    def log(verbose=true)
+    def log(verbose = true)
       case status
-      when STATUS_SUCCESS then
-        SiteDiff::log path, :diff_success, 'SUCCESS'
-      when STATUS_ERROR then
-        SiteDiff::log path, :warn, "ERROR (#{error})"
-      when STATUS_FAILURE then
-        SiteDiff::log path, :diff_failure, "FAILURE"
-        puts Diff::terminal_diffy(before, after) if verbose
+      when STATUS_SUCCESS
+        SiteDiff.log path, :success, 'UNCHANGED'
+      when STATUS_ERROR
+        SiteDiff.log path + " (#{error})", :warning, 'ERROR'
+      when STATUS_FAILURE
+        SiteDiff.log path, :error, 'CHANGED'
+        puts Diff.terminal_diffy(before, after) if verbose
       end
     end
     # Dump the result to a file
-    def dump(dir)
+    def dump(dir, relative = false)
       dump_path = File.join(dir, filename)
       base = File.dirname(dump_path)
-      FileUtils::mkdir_p(base) unless File.exists?(base)
+      FileUtils.mkdir_p(base) unless File.exist?(base)
       File.open(dump_path, 'w') do |f|
-        f.write(Diff::generate_diff_output(self))
+        f.write(Diff.generate_diff_output(self, relative))
       end
     end
   end

data/lib/sitediff/sanitize.rb CHANGED

@@ -1,3 +1,5 @@
+# frozen_string_literal: true
 require 'sitediff'
 require 'sitediff/exception'
 require 'sitediff/sanitize/dom_transform'
@@ -6,183 +8,253 @@ require 'nokogiri'
 require 'set'
 class SiteDiff
-class Sanitizer
-class InvalidSanitization < SiteDiffException; end
-TOOLS = {
-  :array => %w[dom_transform sanitization],
-  :scalar => %w[selector remove_spacing],
-}
-DOM_TRANSFORMS = Set.new(%w[remove unwrap_root unwrap remove_class])
-def initialize(html, config, opts = {})
-  @html = html
-  @config = config
-  @opts = opts
-end
+  # SiteDiff Sanitizer.
+  class Sanitizer
+    class InvalidSanitization < SiteDiffException; end
+    TOOLS = {
+      array: %w[dom_transform sanitization],
+      scalar: %w[selector remove_spacing ignore_whitespace]
+    }.freeze
+    DOM_TRANSFORMS = Set.new(%w[remove strip unwrap_root unwrap remove_class])
+    ##
+    # Creates a Sanitizer.
+    def initialize(html, config, opts = {})
+      @html = html
+      @config = config
+      @opts = opts
+    end
-def sanitize
-  return '' if @html == '' # Quick return on empty input
+    ##
+    # Performs sanitization.
+    def sanitize
+      return '' if @html == '' # Quick return on empty input
-  @node, @html = Sanitizer.domify(@html), nil
+      @node = Sanitizer.domify(@html)
+      @html = nil
-  remove_spacing
-  selector
-  dom_transforms
-  regexps
+      remove_spacing
+      regions || selector
+      dom_transforms
+      regexps
-  return @html || Sanitizer.prettify(@node)
-end
+      @html || Sanitizer.prettify(@node)
+    end
-# Return whether or not we want to keep a rule
-def want_rule(rule)
-  return false unless rule
-  return false if rule['disabled']
+    # Return whether or not we want to keep a rule
+    def want_rule(rule)
+      return false unless rule
+      return false if rule['disabled']
-  # Filter out if path regexp doesn't match
-  if (pathre = rule['path']) and (path = @opts[:path])
-    return ::Regexp.new(pathre).match(path)
-  end
+      # Filter out if path regexp doesn't match
+      if (pathre = rule['path']) && (path = @opts[:path])
+        return ::Regexp.new(pathre).match(path)
+      end
-  return true
-end
+      true
+    end
-# Canonicalize a simple rule, eg: 'remove_spacing' or 'selector'.
-# It may be a simple value, or a hash, or an array of hashes.
-# Turn it into an array of hashes.
-def canonicalize_rule(name)
-  rules = @config[name] or return nil
-  if rules[0] && rules[0].respond_to?(:[]) && rules[0]['value']
-    # Already an array
-  elsif rules['value']
-    # Hash, put it in an array
-    rules = [rules]
-  else
-    # Scalar, put it in a hash
-    rules = [{ 'value' => rules }]
-  end
+    # Canonicalize a simple rule, eg: 'remove_spacing' or 'selector'.
+    # It may be a simple value, or a hash, or an array of hashes.
+    # Turn it into an array of hashes.
+    def canonicalize_rule(name)
+      (rules = @config[name]) || (return nil)
+      # Already an array? Do nothing.
+      if rules[0]&.respond_to?('each') && rules[0]&.fetch('value')
+      # If it is a hash, put it in an array.
+      elsif rules['value']
+        rules = [rules]
+      # If it is a scalar value, wrap it in a hash inside an array.
+      else
+        rules = [{ 'value' => rules }]
+      end
+      want = rules.select { |r| want_rule(r) }
+      return nil if want.empty?
+      raise "Too many matching rules of type #{name}" if want.size > 1
+      want.first
+    end
-  want = rules.select { |r| want_rule(r) }
-  return nil if want.empty?
-  raise "Too many matching rules of type #{name}" if want.size > 1
-  return want.first
-end
+    # Perform 'remove_spacing' action
+    def remove_spacing
+      (rule = canonicalize_rule('remove_spacing')) || return
+      Sanitizer.remove_node_spacing(@node) if rule['value']
+    end
-# Perform 'remove_spacing' action
-def remove_spacing
-  rule = canonicalize_rule('remove_spacing') or return
-  Sanitizer.remove_node_spacing(@node) if rule['value']
-end
+    # Perform 'regions' action, don't perform 'selector' if regions exist.
+    def regions
+      return unless validate_regions
-# Perform 'selector' action, to choose a new root
-def selector
-  rule = canonicalize_rule('selector') or return
-  @node = Sanitizer.select_fragments(@node, rule['value'])
-end
+      @node = select_regions(@node, @config['regions'], @opts[:output])
+    end
-# Applies regexps. Also
-def regexps
-  rules = @config['sanitization'] or return
-  rules = rules.select { |r| want_rule(r) }
+    # Perform 'selector' action, to choose a new root
+    def selector
+      (rule = canonicalize_rule('selector')) || return
+      @node = Sanitizer.select_fragments(@node, rule['value'])
+    end
-  rules.map! { |r| Regexp.create(r) }
-  selector, global = rules.partition { |r| r.selector? }
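+    # Applies regexps. Selector-scoped rules are applied to the DOM node
+    # first; the node is then prettified into HTML text and the remaining
+    # (global) rules are applied to that text.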
+    def regexps
+      (rules = @config['sanitization']) || return
+      rules = rules.select { |r| want_rule(r) }
+      rules.map! { |r| Regexp.create(r) }
+      selector, global = rules.partition(&:selector?)
+      selector.each { |r| r.apply(@node) }
+      @html = Sanitizer.prettify(@node)
+      @node = nil
+      # Prevent potential UTF-8 encoding errors by removing bytes that
+      # cannot be converted.
+      # Not the only solution. An alternative is to return the
+      # string unmodified.
+      @html = @html.encode(
+        'UTF-8',
+        'binary',
+        invalid: :replace,
+        undef: :replace,
+        replace: ''
+      )
+      global.each { |r| r.apply(@html) }
+    end
-  selector.each { |r| r.apply(@node) }
-  @html, @node = Sanitizer.prettify(@node), nil
-  global.each { |r| r.apply(@html) }
-end
+    # Perform DOM transforms
+    def dom_transforms
+      (rules = @config['dom_transform']) || return
+      rules = rules.select { |r| want_rule(r) }
-# Perform DOM transforms
-def dom_transforms
-  rules = @config['dom_transform'] or return
-  rules = rules.select { |r| want_rule(r) }
+      rules.each do |rule|
+        transform = DomTransform.create(rule)
+        transform.apply(@node)
+      end
+    end
-  rules.each do |rule|
-    transform = DomTransform.create(rule)
-    transform.apply(@node)
-  end
-end
+    ##### Implementations of actions #####
-##### Implementations of actions #####
+    # Remove double-spacing inside text nodes
+    def self.remove_node_spacing(node)
+      # remove double spacing, but only inside text nodes (eg not attributes)
+      node.xpath('//text()').each do |el|
+        el.content = el.content.gsub(/  +/, ' ')
+      end
+    end
-# Remove double-spacing inside text nodes
-def self.remove_node_spacing(node)
-  # remove double spacing, but only inside text nodes (eg not attributes)
-  node.xpath('//text()').each do |el|
-    el.content = el.content.gsub(/  +/, ' ')
-  end
-end
+    # Restructure the node into regions.
+    def select_regions(node, regions, output)
+      regions = output.map do |name|
+        selector = get_named_region(regions, name)['selector']
+        region = Nokogiri::XML.fragment('<region id="' + name + '"></region>').at_css('region')
+        matching = node.css(selector)
+        matching.each { |m| region.add_child m }
+        region
+      end
+      node = Nokogiri::HTML.fragment('')
+      regions.each { |r| node.add_child r }
+      node
+    end
-# Get a fragment consisting of the elements matching the selector(s)
-def self.select_fragments(node, sel)
-  # When we choose a new root, we always become a DocumentFragment,
-  # and lose any DOCTYPE and such.
-  ns = node.css(sel)
-  unless node.fragment?
-    node = Nokogiri::HTML.fragment('')
-  end
-  node.children = ns
-  return node
-end
+    # Get a fragment consisting of the elements matching the selector(s)
+    def self.select_fragments(node, sel)
+      # When we choose a new root, we always become a DocumentFragment,
+      # and lose any DOCTYPE and such.
+      ns = node.css(sel)
+      node = Nokogiri::HTML.fragment('') unless node.fragment?
+      node.children = ns
+      node
+    end
-# Pretty-print some HTML
-def self.prettify(obj)
-  @stylesheet ||= begin
-    stylesheet_path = File.join(SiteDiff::FILES_DIR, 'pretty_print.xsl')
-    Nokogiri::XSLT(File.read(stylesheet_path))
-  end
+    # Pretty-print some HTML
+    def self.prettify(obj)
+      @stylesheet ||= begin
+        stylesheet_path = File.join(SiteDiff::FILES_DIR, 'pretty_print.xsl')
+        Nokogiri::XSLT(File.read(stylesheet_path))
+      end
+      # Pull out the html element's children
+      # The obvious way to do this is to iterate over pretty.css('html'),
+      # but that tends to segfault Nokogiri
+      str = @stylesheet.apply_to(to_document(obj))
+      # There's a lot of cruft left over that we don't want.
+      # Prevent potential UTF-8 encoding errors by removing invalid bytes.
+      # Not the only solution.
+      # An alternative is to return the string unmodified.
+      str = str.encode(
+        'UTF-8',
+        'binary',
+        invalid: :replace,
+        undef: :replace,
+        replace: ''
+      )
+      # Remove xml declaration and <html> tags
+      str.sub!(/\A<\?xml.*$\n/, '')
+      str.sub!(/\A^<html>$\n/, '')
+      str.sub!(%r{</html>\n\Z}, '')
+      # Remove top-level indentation
+      indent = /\A(\s*)/.match(str)[1].size
+      str.gsub!(/^\s{,#{indent}}/, '')
+      # Remove blank lines
+      str.gsub!(/^\s*$\n/, '')
+      # Remove DOS newlines
+      str.gsub!(/\x0D$/, '')
+      str.gsub!(/&#13;$/, '')
+      str
+    end
+    # Parse HTML into a node
+    def self.domify(str, force_doc = false)
+      if force_doc || /<!DOCTYPE/.match(str[0, 512])
+        Nokogiri::HTML(str)
+      else
+        Nokogiri::HTML.fragment(str)
+      end
+    end
-  # Pull out the html element's children
-  # The obvious way to do this is to iterate over pretty.css('html'),
-  # but that tends to segfault Nokogiri
-  str = @stylesheet.apply_to(to_document(obj))
+    # Force this object to be a document, so we can apply a stylesheet
+    def self.to_document(obj)
+      if Nokogiri::XML::Document == obj.class || Nokogiri::HTML::Document == obj.class
+        obj
+      # node or fragment
+      elsif Nokogiri::XML::Node == obj.class || Nokogiri::HTML::DocumentFragment == obj.class
+        domify(obj.to_s, true)
+      else
+        to_document(domify(obj, false))
+      end
+    end
-  # There's a lot of cruft left over,that we don't want
+    private
-  # Remove xml declaration and <html> tags
-  str.sub!(/\A<\?xml.*$\n/, '')
-  str.sub!(/\A^<html>$\n/, '')
-  str.sub!(%r[</html>\n\Z], '')
+    # Validate `regions` from config and `output` from options.
+    def validate_regions
+      return false unless @config['regions'].is_a?(Array)
-  # Remove top-level indentation
-  indent = /\A(\s*)/.match(str)[1].size
-  str.gsub!(/^\s{,#{indent}}/, '')
+      return false unless @opts[:output].is_a?(Array)
-  # Remove blank lines
-  str.gsub!(/^\s*$\n/, '')
+      regions = @config['regions']
+      output = @opts[:output]
+      regions.each do |region|
+        return false unless region.key?('name') && region.key?('selector')
+      end
-  return str
-end
+      # Check that each named output has an associated region.
+      output.each do |name|
+        return false unless get_named_region(regions, name)
+      end
-# Parse HTML into a node
-def self.domify(str, force_doc = false)
-  if force_doc || /<!DOCTYPE/.match(str[0, 512])
-    return Nokogiri::HTML(str)
-  else
-    return Nokogiri::HTML.fragment(str)
-  end
-end
+      true
+    end
-# Force this object to be a document, so we can apply a stylesheet
-def self.to_document(obj)
-  if Nokogiri::XML::Document === obj
-    return obj
-  elsif Nokogiri::XML::Node === obj # node or fragment
-    return domify(obj.to_s, true)
-    # This ought to work, and would be faster,
-    # but seems to segfault Nokogiri
-    if false
-      doc = Nokogiri::HTML('<html><body>')
-      doc.at('body').children = obj.children
-      return doc
-    end
-  else
-    return to_document(domify(obj))
+    # Return the region whose name matches, or nil if there is none.
+    def get_named_region(regions, name)
+      regions.find { |region| region['name'] == name }
+    end
   end
 end
-end
-end