RubyGems - sitediff - Versions diffs - 0.0.1 → 0.0.2 - Mend

sitediff 0.0.1 → 0.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (26)

data/bin/sitediff +1 -1
data/lib/sitediff.rb +79 -63
data/lib/sitediff/cache.rb +61 -0
data/lib/sitediff/cli.rb +144 -23
data/lib/sitediff/config.rb +46 -9
data/lib/sitediff/config/creator.rb +122 -0
data/lib/sitediff/crawler.rb +95 -0
data/lib/sitediff/diff.rb +2 -1
data/lib/sitediff/exception.rb +3 -0
data/lib/sitediff/fetch.rb +55 -0
data/lib/sitediff/files/html_report.html.erb +20 -4
data/lib/sitediff/files/rules/drupal.yaml +33 -0
data/lib/sitediff/files/sidebyside.html.erb +13 -0
data/lib/sitediff/files/sitediff.css +11 -0
data/lib/sitediff/result.rb +12 -9
data/lib/sitediff/rules.rb +65 -0
data/lib/sitediff/sanitize.rb +163 -168
data/lib/sitediff/sanitize/dom_transform.rb +92 -0
data/lib/sitediff/sanitize/regexp.rb +56 -0
data/lib/sitediff/uriwrapper.rb +19 -7
data/lib/sitediff/webserver.rb +82 -0
data/lib/sitediff/webserver/resultserver.rb +98 -0
metadata +70 -25
checksums.yaml +0 -7
data/lib/sitediff/util/cache.rb +0 -32
data/lib/sitediff/util/webserver.rb +0 -77

data/lib/sitediff/files/rules/drupal.yaml ADDED

@@ -0,0 +1,33 @@
+sanitization:
+- title: Strip Drupal.settings
+  selector: script
+  pattern: '^(<script>)?jQuery.extend\(Drupal.settings.*$'
+- title: Strip form build ID
+  selector: input
+  pattern: 'name="form_build_id" value="form-[-\w]{43}"'
+  substitution: 'name="form_build_id" value="form-DRUPAL_FORM_BUILD_ID"'
+- title: Strip view DOM ID
+  pattern: '(class="view .*) view-dom-id-[a-f0-9]{32}"'
+  substitution: '\1 view-dom-id-DRUPAL_VIEW_DOM_ID"'
+- title: Strip CSS aggregation filenames
+  selector: link[rel=stylesheet]
+  pattern: '(href="[^"]*/files/css/css_)[-\w]{43}\.css"'
+  substitution: '\1DRUPAL_AGGREGATED_CSS.css"'
+- title: Strip JS aggregation filenames
+  selector: script
+  pattern: '(src="[^"]*/files/js/js_)[-\w]{43}\.js"'
+  substitution: '\1DRUPAL_AGGREGATED_JS.js"'
+- title: Strip CSS/JS cache IDs
+  selector: style, script
+  pattern: '("[^"]*\.(js|css))\?[a-z0-9]{6}"'
+  substitution: '\1'
+- title: Strip IE CSS/JS cache IDs
+  pattern: '("[^"]*ie\d?\.(js|css))\?[a-z0-9]{6}"'
+  substitution: '\1'
+- title: Strip Drupal JS version tags
+  selector: script
+  pattern: '(src="[^"]*/misc/\w+\.js)?v=\d+\.\d+"'
+  substitution: '\1'
+- title: Strip domain names from absolute URLs
+  pattern: 'http:\/\/[a-zA-Z0-9.:-]+'
+  substitute: '__domain__'

data/lib/sitediff/files/sidebyside.html.erb ADDED

@@ -0,0 +1,13 @@
+<html>
+  <head>
+    <title>Comparison for <%= path %></title>
+    <style>
+      <%= SiteDiff::Diff.css %>
+    </style>
+    <meta charset="utf-8" />
+  </head>
+  <body id="sidebyside">
+    <iframe src="<%= before %>"></iframe>
+    <iframe src="<%= after %>"></iframe>
+  </body>
+</html>

data/lib/sitediff/files/sitediff.css CHANGED

@@ -33,6 +33,7 @@
   background-color: salmon;
 }
 .sitediff .before-col,
+.sitediff .both-col,
 .sitediff .after-col,
 .sitediff .diff-stat-col {
   width: 10%;
@@ -40,3 +41,13 @@
 .sitediff .path-col {
   width: 55%;
 }
+#sidebyside {
+  margin: 0;
+}
+#sidebyside iframe {
+  float: left;
+  height: 100%;
+  width: 50%;
+  border: 0;
+}

data/lib/sitediff/result.rb CHANGED

@@ -1,8 +1,10 @@
-require 'fileutils'
+require 'sitediff'
+require 'sitediff/diff'
 require 'digest/sha1'
+require 'fileutils'
 class SiteDiff
-  class Result < Struct.new(:path, :before, :after, :error)
+  class Result < Struct.new(:path, :before, :after, :error, :verbose)
     STATUS_SUCCESS  = 0   # Identical before and after
     STATUS_FAILURE  = 1   # Different before and after
     STATUS_ERROR    = 2   # Couldn't fetch page
@@ -30,8 +32,9 @@ class SiteDiff
     end
     # Printable URL
-    def url(prefix)
-      prefix.to_s + path
+    def url(tag, prefix, cache)
+      base = cache.read_tags.include?(tag) ? "/cache/#{tag}" : prefix
+      base.to_s + path
     end
     # Filename to store diff
@@ -49,15 +52,15 @@ class SiteDiff
     end
     # Log the result to the terminal
-    def log
+    def log(verbose=true)
       case status
       when STATUS_SUCCESS then
-        SiteDiff::log path, :success, 'SUCCESS'
+        SiteDiff::log path, :diff_success, 'SUCCESS'
       when STATUS_ERROR then
-        SiteDiff::log path, :error, "ERROR (#{error})"
+        SiteDiff::log path, :warn, "ERROR (#{error})"
       when STATUS_FAILURE then
-        SiteDiff::log path, :failure, "FAILURE"
-        puts Diff::terminal_diffy(before, after)
+        SiteDiff::log path, :diff_failure, "FAILURE"
+        puts Diff::terminal_diffy(before, after) if verbose
       end
     end

data/lib/sitediff/rules.rb ADDED

@@ -0,0 +1,65 @@
+require 'sitediff/sanitize/regexp'
+require 'pathname'
+require 'set'
+class SiteDiff
+# Find appropriate rules for a given site
+class Rules
+  def initialize(config, disabled = false)
+    @disabled = disabled
+    @config = config
+    find_sanitization_candidates
+    @rules = Hash.new { |h, k| h[k] = Set.new }
+  end
+  def find_sanitization_candidates
+    @candidates = Set.new
+    rules_dir = Pathname.new(__FILE__).dirname + 'files' + 'rules'
+    rules_dir.children.each do |f|
+      next unless f.file? && f.extname == '.yaml'
+      conf = YAML.load_file(f)
+      @candidates.merge(conf['sanitization'])
+    end
+  end
+  def handle_page(tag, html, doc)
+    found = find_rules(html, doc)
+    @rules[tag].merge(found)
+  end
+  # Yield a set of rules that seem reasonable for this HTML
+  # assumption: the YAML file is a list of regexp rules only
+  def find_rules(html, doc)
+    rules = []
+    return @candidates.select do |rule|
+      re = SiteDiff::Sanitizer::Regexp.create(rule)
+      re.applies?(html, doc)
+    end
+  end
+  # Find all rules from all rulesets that apply for all pages
+  def add_config
+    have_both = @rules.include?(:before)
+    r1, r2 = *@rules.values_at(:before, :after)
+    if have_both
+      add_section('before', r1 - r2)
+      add_section('after', r2 - r1)
+      add_section(nil, r1 & r2)
+    else
+      add_section(nil, r2)
+    end
+  end
+  def add_section(name, rules)
+    return if rules.empty?
+    conf = name ? @config[name] : @config
+    if @disabled
+      rules.each { |r| r['disabled'] = true }
+    end
+    conf['sanitization'] = rules.to_a.sort_by { |r| r['title'] }
+  end
+end
+end

data/lib/sitediff/sanitize.rb CHANGED

@@ -1,193 +1,188 @@
+require 'sitediff'
+require 'sitediff/exception'
+require 'sitediff/sanitize/dom_transform'
+require 'sitediff/sanitize/regexp'
 require 'nokogiri'
 require 'set'
 class SiteDiff
-  module Sanitize
-    class InvalidSanitization < Exception; end
-    TOOLS = {
-      :array => %w[dom_transform sanitization],
-      :scalar => %w[selector remove_spacing],
-    }
-    DOM_TRANSFORMS = Set.new(%w[remove unwrap_root unwrap remove_class])
-    module_function
-    # Performs dom transformations.
-    #
-    # Currently supported transforms:
-    #
-    #  * { :type => "unwrap_root" }
-    #  * { :type => "unwrap", :selector => "div.field-item" }
-    #  * { :type => "remove", :selector => "div.extra-stuff" }
-    #
-    #  @arg node - Nokogiri document or Node
-    #  @arg rules - array of dom_transform rules
-    #  @return - transformed Nokogiri document node
-    def perform_dom_transforms(node, rules)
-      rules.each do |rule|
-        type = rule['type'] or
-          raise InvalidSanitization, "DOM transform needs a type"
-        DOM_TRANSFORMS.include?(type) or
-          raise InvalidSanitization, "No DOM transform named #{type}"
-        meth = 'transform_' + type
-        if sels = rule['selector']
-          sels = [sels].flatten # Either array or scalar is fine
-          # Call method for each node the selectors find
-          sels.each do |sel|
-            node.css(sel).each { |e| send(meth, rule, e) }
-          end
-        else
-          send(meth, rule, node)
-        end
-      end
-    end
+class Sanitizer
+class InvalidSanitization < SiteDiffException; end
+TOOLS = {
+  :array => %w[dom_transform sanitization],
+  :scalar => %w[selector remove_spacing],
+}
+DOM_TRANSFORMS = Set.new(%w[remove unwrap_root unwrap remove_class])
+def initialize(html, config, opts = {})
+  @html = html
+  @config = config
+  @opts = opts
+end
-    def transform_remove(rule, el)
-      el.remove
-    end
-    def transform_unwrap(rule, el)
-      el.add_next_sibling(el.children)
-      el.remove
-    end
-    def transform_remove_class(rule, el)
-      # Must call remove_class on a NodeSet!
-      ns = Nokogiri::XML::NodeSet.new(el.document, [el])
-      [rule['class']].flatten.each do |class_name|
-        ns.remove_class(class_name)
-      end
-    end
-    def transform_unwrap_root(rule, node)
-      node.children.size == 1 or
-        raise InvalidSanitization, "Multiple root elements in unwrap_root"
-      node.children = node.children[0].children
-    end
+def sanitize
+  return '' if @html == '' # Quick return on empty input
-    def parse(str, force_doc = false, log_errors = false)
-      if force_doc || /<!DOCTYPE/.match(str[0, 512])
-        doc = Nokogiri::HTML(str)
-        doc
-      else
-        doc = Nokogiri::HTML.fragment(str)
-      end
-      if log_errors
-        doc.errors.each do |e|
-          SiteDiff::log "Error in parsing HTML document: #{e}", :error
-        end
-      end
-      doc
-    end
+  @node, @html = Sanitizer.domify(@html), nil
-    # Force this object to be a document, so we can apply a stylesheet
-    def to_document(obj)
-      if Nokogiri::XML::Document === obj
-        return obj
-      elsif Nokogiri::XML::Node === obj # or fragment
-        return parse(obj.to_s, true)
-        # This ought to work, and would be faster,
-        # but seems to segfault Nokogiri
-        # doc = Nokogiri::HTML('<html><body>')
-        # doc.at('body').children = obj.children
-        # return doc
-      else
-        return to_document(parse(obj))
-      end
-    end
+  remove_spacing
+  selector
+  dom_transforms
+  regexps
-    # Pretty-print the HTML
-    def prettify(obj)
-      @stylesheet ||= begin
-        stylesheet_path = File.join(SiteDiff::FILES_DIR, 'pretty_print.xsl')
-        Nokogiri::XSLT(File.read(stylesheet_path))
-      end
+  return @html || Sanitizer.prettify(@node)
+end
-      # Pull out the html element's children
-      # The obvious way to do this is to iterate over pretty.css('html'),
-      # but that tends to segfault Nokogiri
-      str = @stylesheet.apply_to(to_document(obj))
+# Return whether or not we want to keep a rule
+def want_rule(rule)
+  return false unless rule
+  return false if rule['disabled']
+  # Filter out if path regexp doesn't match
+  if (pathre = rule['path']) and (path = @opts[:path])
+    return ::Regexp.new(pathre).match(path)
+  end
-      # Remove xml declaration and <html> tags
-      str.sub!(/\A<\?xml.*$\n/, '')
-      str.sub!(/\A^<html>$\n/, '')
-      str.sub!(%r[</html>\n\Z], '')
+  return true
+end
-      # Remove top-level indentation
-      indent = /\A(\s*)/.match(str)[1].size
-      str.gsub!(/^\s{,#{indent}}/, '')
+# Canonicalize a simple rule, eg: 'remove_spacing' or 'selector'.
+# It may be a simple value, or a hash, or an array of hashes.
+# Turn it into an array of hashes.
+def canonicalize_rule(name)
+  rules = @config[name] or return nil
+  if rules[0] && rules[0].respond_to?(:[]) && rules[0]['value']
+    # Already an array
+  elsif rules['value']
+    # Hash, put it in an array
+    rules = [rules]
+  else
+    # Scalar, put it in a hash
+    rules = [{ 'value' => rules }]
+  end
-      # Remove blank lines
-      str.gsub!(/^\s*$\n/, '')
+  want = rules.select { |r| want_rule(r) }
+  return nil if want.empty?
+  raise "Too many matching rules of type #{name}" if want.size > 1
+  return want.first
+end
-      return str
-    end
+# Perform 'remove_spacing' action
+def remove_spacing
+  rule = canonicalize_rule('remove_spacing') or return
+  Sanitizer.remove_node_spacing(@node) if rule['value']
+end
-    def remove_spacing(doc)
-      # remove double spacing, but only inside text nodes (eg not attributes)
-      doc.xpath('//text()').each do |node|
-        node.content = node.content.gsub(/  +/, ' ')
-      end
-    end
+# Perform 'selector' action, to choose a new root
+def selector
+  rule = canonicalize_rule('selector') or return
+  @node = Sanitizer.select_fragments(@node, rule['value'])
+end
-    # Do one regexp transformation on a string
-    def substitute(str, rule)
-      #FIXME escape forward slashes, right now we are escaping them in YAML!
-      str.gsub!(/#{rule['pattern']}/, rule['substitute'] || '' )
-      str
-    end
+# Applies regexps. Also
+def regexps
+  rules = @config['sanitization'] or return
+  rules = rules.select { |r| want_rule(r) }
-    # Do all regexp sanitization rules
-    def perform_regexps(node, rules)
-      rules ||= []
-      # First do rules with a selector
-      rules.each do |rule|
-        if sel = rule['selector']
-          node.css(sel).each do |e|
-            e.replace(substitute(e.to_html, rule))
-          end
-        end
-      end
-      # If needed, do rules without a selector. We'd rather not convert to
-      # a string unless necessary.
-      global_rules = rules.reject { |r| r['selector'] }
-      return node if global_rules.empty?
-      str = node.to_html # Convert to string
-      global_rules.each { |r| substitute(str, r) }
-      return str
-    end
+  rules.map! { |r| Regexp.create(r) }
+  selector, global = rules.partition { |r| r.selector? }
-    def select_root(node, sel)
-      return node unless sel
-      # When we choose a new root, we always become a DocumentFragment,
-      # and lose any DOCTYPE and such.
-      ns = node.css(sel)
-      unless node.fragment?
-        node = Nokogiri::HTML.fragment('')
-      end
-      node.children = ns
-      return node
-    end
+  selector.each { |r| r.apply(@node) }
+  @html, @node = Sanitizer.prettify(@node), nil
+  global.each { |r| r.apply(@html) }
+end
-    def sanitize(str, config)
-      return '' if str == ''
+# Perform DOM transforms
+def dom_transforms
+  rules = @config['dom_transform'] or return
+  rules = rules.select { |r| want_rule(r) }
-      node = parse(str)
+  rules.each do |rule|
+    transform = DomTransform.create(rule)
+    transform.apply(@node)
+  end
+end
+##### Implementations of actions #####
+# Remove double-spacing inside text nodes
+def self.remove_node_spacing(node)
+  # remove double spacing, but only inside text nodes (eg not attributes)
+  node.xpath('//text()').each do |el|
+    el.content = el.content.gsub(/  +/, ' ')
+  end
+end
+# Get a fragment consisting of the elements matching the selector(s)
+def self.select_fragments(node, sel)
+  # When we choose a new root, we always become a DocumentFragment,
+  # and lose any DOCTYPE and such.
+  ns = node.css(sel)
+  unless node.fragment?
+    node = Nokogiri::HTML.fragment('')
+  end
+  node.children = ns
+  return node
+end
+# Pretty-print some HTML
+def self.prettify(obj)
+  @stylesheet ||= begin
+    stylesheet_path = File.join(SiteDiff::FILES_DIR, 'pretty_print.xsl')
+    Nokogiri::XSLT(File.read(stylesheet_path))
+  end
-      remove_spacing(node) if config['remove_spacing']
-      node = select_root(node, config['selector'])
-      if transform = config['dom_transform']
-        perform_dom_transforms(node, transform)
-      end
+  # Pull out the html element's children
+  # The obvious way to do this is to iterate over pretty.css('html'),
+  # but that tends to segfault Nokogiri
+  str = @stylesheet.apply_to(to_document(obj))
-      obj = perform_regexps(node, config['sanitization'])
+  # There's a lot of cruft left over,that we don't want
-      return prettify(obj)
+  # Remove xml declaration and <html> tags
+  str.sub!(/\A<\?xml.*$\n/, '')
+  str.sub!(/\A^<html>$\n/, '')
+  str.sub!(%r[</html>\n\Z], '')
+  # Remove top-level indentation
+  indent = /\A(\s*)/.match(str)[1].size
+  str.gsub!(/^\s{,#{indent}}/, '')
+  # Remove blank lines
+  str.gsub!(/^\s*$\n/, '')
+  return str
+end
+# Parse HTML into a node
+def self.domify(str, force_doc = false)
+  if force_doc || /<!DOCTYPE/.match(str[0, 512])
+    return Nokogiri::HTML(str)
+  else
+    return Nokogiri::HTML.fragment(str)
+  end
+end
+# Force this object to be a document, so we can apply a stylesheet
+def self.to_document(obj)
+  if Nokogiri::XML::Document === obj
+    return obj
+  elsif Nokogiri::XML::Node === obj # node or fragment
+    return domify(obj.to_s, true)
+    # This ought to work, and would be faster,
+    # but seems to segfault Nokogiri
+    if false
+      doc = Nokogiri::HTML('<html><body>')
+      doc.at('body').children = obj.children
+      return doc
     end
+  else
+    return to_document(domify(obj))
   end
 end
+end
+end