RubyGems - sitediff - Versions diffs - 0.0.3 → 0.0.5 - Mend

sitediff 0.0.3 → 0.0.5

Files changed (22)

checksums.yaml +4 -4
data/bin/sitediff +2 -3
data/lib/sitediff.rb +35 -24
data/lib/sitediff/cache.rb +53 -47
data/lib/sitediff/cli.rb +127 -114
data/lib/sitediff/config.rb +35 -59
data/lib/sitediff/config/creator.rb +95 -90
data/lib/sitediff/crawler.rb +83 -72
data/lib/sitediff/diff.rb +7 -5
data/lib/sitediff/exception.rb +3 -1
data/lib/sitediff/fetch.rb +47 -41
data/lib/sitediff/files/html_report.html.erb +3 -0
data/lib/sitediff/files/rules/drupal.yaml +36 -6
data/lib/sitediff/result.rb +13 -11
data/lib/sitediff/rules.rb +47 -47
data/lib/sitediff/sanitize.rb +145 -150
data/lib/sitediff/sanitize/dom_transform.rb +73 -74
data/lib/sitediff/sanitize/regexp.rb +55 -52
data/lib/sitediff/uriwrapper.rb +37 -26
data/lib/sitediff/webserver.rb +80 -77
data/lib/sitediff/webserver/resultserver.rb +117 -76
metadata +32 -44

data/lib/sitediff/diff.rb CHANGED Viewed

@@ -1,3 +1,5 @@
+# frozen_string_literal: true
 require 'sitediff'
 require 'diffy'
 require 'erb'
@@ -9,26 +11,26 @@ class SiteDiff
     def html_diffy(before_html, after_html)
       diff = Diffy::Diff.new(before_html, after_html)
-      diff.first ?  # Is it non-empty?
+      diff.first ? # Is it non-empty?
         diff.to_s(:html) : nil
     end
     def terminal_diffy(before_html, after_html)
       args = []
       args << :color if Rainbow.enabled
-      return Diffy::Diff.new(before_html, after_html, :context => 3).
-        to_s(*args)
+      Diffy::Diff.new(before_html, after_html, context: 3)
+                 .to_s(*args)
     end
     def generate_html_report(results, before, after, cache)
       erb_path = File.join(SiteDiff::FILES_DIR, 'html_report.html.erb')
       report_html = ERB.new(File.read(erb_path)).result(binding)
-      return report_html
+      report_html
     end
     def generate_diff_output(result)
       erb_path = File.join(SiteDiff::FILES_DIR, 'diff.html.erb')
-      return ERB.new(File.read(erb_path)).result(binding)
+      ERB.new(File.read(erb_path)).result(binding)
     end
     def css

data/lib/sitediff/exception.rb CHANGED Viewed

@@ -1,3 +1,5 @@
+# frozen_string_literal: true
 class SiteDiff
-  class SiteDiffException < Exception; end
+  class SiteDiffException < RuntimeError; end
 end

data/lib/sitediff/fetch.rb CHANGED Viewed

@@ -1,55 +1,61 @@
+# frozen_string_literal: true
 require 'sitediff/uriwrapper'
 require 'typhoeus'
 class SiteDiff
-class Fetch
-  # Cache is a cache object, see sitediff/cache
-  # Paths is a list of sub-paths
-  # Tags is a hash of tag names => base URLs.
-  def initialize(cache, paths, tags)
-    @cache = cache
-    @paths = paths
-    @tags = tags
-  end
+  class Fetch
+    # Cache is a cache object, see sitediff/cache
+    # Paths is a list of sub-paths
+    # Tags is a hash of tag names => base URLs.
+    def initialize(cache, paths, concurrency = 3, curl_opts = nil, **tags)
+      @cache = cache
+      @paths = paths
+      @tags = tags
+      @curl_opts = curl_opts || UriWrapper::DEFAULT_CURL_OPTS
+      @concurrency = concurrency
+    end
-  # Fetch all the paths, once per tag.
-  # When a path has been fetched for every tag, block will be called with the
-  # path, and a hash of tag => UriWrapper::ReadResult objects.
-  def run(&block)
-    @callback = block
-    @hydra = Typhoeus::Hydra.new(max_concurrency: 3)
-    @paths.each { |path| queue_path(path) }
-    @hydra.run
-  end
+    # Fetch all the paths, once per tag.
+    # When a path has been fetched for every tag, block will be called with the
+    # path, and a hash of tag => UriWrapper::ReadResult objects.
+    def run(&block)
+      @callback = block
+      @hydra = Typhoeus::Hydra.new(max_concurrency: @concurrency)
+      @paths.each { |path| queue_path(path) }
+      @hydra.run
+    end
+    private
-private
-  # Queue a path for fetching
-  def queue_path(path)
-    results = {}
-    @tags.each do |tag, base|
-      if res = @cache.get(tag, path)
-        results[tag] = res
-        process_results(path, results)
-      elsif !base
-        # We only have the cache, but this item isn't cached!
-        results[tag] = UriWrapper::ReadResult.error("Not cached")
-        process_results(path, results)
-      else
-        uri = UriWrapper.new(base + path)
-        uri.queue(@hydra) do |res|
-          @cache.set(tag, path, res)
+    # Queue a path for fetching
+    def queue_path(path)
+      results = {}
+      @tags.each do |tag, base|
+        if (res = @cache.get(tag, path))
           results[tag] = res
           process_results(path, results)
+        elsif !base
+          # We only have the cache, but this item isn't cached!
+          results[tag] = UriWrapper::ReadResult.error('Not cached')
+          process_results(path, results)
+        else
+          uri = UriWrapper.new(base + path, @curl_opts)
+          uri.queue(@hydra) do |resl|
+            @cache.set(tag, path, resl)
+            results[tag] = resl
+            process_results(path, results)
+          end
         end
       end
     end
-  end
-  # Process fetch results
-  def process_results(path, results)
-    return unless results.size == @tags.size
-    @callback[path, results]
+    # Process fetch results
+    def process_results(path, results)
+      return unless results.size == @tags.size
+      @callback[path, results]
+    end
   end
 end
-end

data/lib/sitediff/files/html_report.html.erb CHANGED Viewed

@@ -21,6 +21,9 @@
                <a href="<%= eval(tag) %>"><%= eval(tag) %></a>
         <% end %>
       </div>
+      <div class="run">
+        <a href="../run/diff">Rerun diff</a>
+      </div>
       <table class="results">
         <colgroup>

data/lib/sitediff/files/rules/drupal.yaml CHANGED Viewed

@@ -2,28 +2,28 @@ sanitization:
 - title: Strip Drupal.settings
   selector: script
   pattern: '^(<script>)?jQuery.extend\(Drupal.settings.*$'
+- title: Strip IE CSS/JS cache IDs
+  pattern: '("[^"]*ie\d?\.(js|css))\?[a-z0-9]{6}"'
+  substitute: '\1'
 - title: Strip form build ID
   selector: input
-  pattern: 'name="form_build_id" value="form-[-\w]{43}"'
+  pattern: 'name="form_build_id" value="form-[-\w]{40,43}"'
   substitute: 'name="form_build_id" value="form-DRUPAL_FORM_BUILD_ID"'
 - title: Strip view DOM ID
   pattern: '(class="view .*) view-dom-id-[a-f0-9]{32}"'
   substitute: '\1 view-dom-id-DRUPAL_VIEW_DOM_ID"'
 - title: Strip CSS aggregation filenames
   selector: link[rel=stylesheet]
-  pattern: '(href="[^"]*/files/css/css_)[-\w]{43}\.css"'
+  pattern: '(href="[^"]*/files/css/css_)[-\w]{40,43}\.css"'
   substitute: '\1DRUPAL_AGGREGATED_CSS.css"'
 - title: Strip JS aggregation filenames
   selector: script
-  pattern: '(src="[^"]*/files/js/js_)[-\w]{43}\.js"'
+  pattern: '(src="[^"]*/files/js/js_)[-\w]{40,43}\.js"'
   substitute: '\1DRUPAL_AGGREGATED_JS.js"'
 - title: Strip CSS/JS cache IDs
   selector: style, script
   pattern: '("[^"]*\.(js|css))\?[a-z0-9]{6}"'
   substitute: '\1'
-- title: Strip IE CSS/JS cache IDs
-  pattern: '("[^"]*ie\d?\.(js|css))\?[a-z0-9]{6}"'
-  substitute: '\1'
 - title: Strip Drupal JS version tags
   selector: script
   pattern: '(src="[^"]*/misc/\w+\.js)?v=\d+\.\d+"'
@@ -31,3 +31,33 @@ sanitization:
 - title: Strip domain names from absolute URLs
   pattern: 'http:\/\/[a-zA-Z0-9.:-]+'
   substitute: '__domain__'
+- title: Strip form build ID
+  selector: input
+  pattern: 'autocomplete="off" data-drupal-selector="form-[-\w]{40,43}"'
+  substitute: 'autocomplete="off" data-drupal-selector="form-DRUPAL_FORM_BUILD_ID"'
+- title: Strip form build ID 2
+  selector: input
+  pattern: 'name="form_build_id" value="form-[-\w]{40,43}"'
+  substitute: 'name="form_build_id" value="form-DRUPAL_FORM_BUILD_ID"'
+- title: Strip Drupal CSS link queries
+  selector: link
+  pattern: '\.css\?(\w*)'
+  substitute: '\.css'
+- title: Strip Drupal JS link queries
+  selector: script
+  pattern: '\.js\?(\w*)'
+  substitute: '\.js'
+- title: Strip Drupal View-DOM ID
+  pattern: 'view-dom-id-\w*'
+  substitute: 'view-dom-id-_ID_'
+- title: Strip Drupal View-DOM ID 2
+  pattern: '(views?_dom_id"?:"?)\w*'
+  substitute: '\1_ID_'
+- title: Ignore Drupal CSS file names
+  selector: link
+  pattern: 'css_[-\w]{40,43}(\\|%5C)?\.css'
+  substitute: 'css__ID__.css'
+- title: Ignore Drupal JS file names
+  selector: script
+  pattern: 'js_[-\w]{40,43}\\?\.js'
+  substitute: 'js__ID__.js'

data/lib/sitediff/result.rb CHANGED Viewed

@@ -1,3 +1,5 @@
+# frozen_string_literal: true
 require 'sitediff'
 require 'sitediff/diff'
 require 'digest/sha1'
@@ -8,7 +10,7 @@ class SiteDiff
     STATUS_SUCCESS  = 0   # Identical before and after
     STATUS_FAILURE  = 1   # Different before and after
     STATUS_ERROR    = 2   # Couldn't fetch page
-    STATUS_TEXT = %w[success failure error]
+    STATUS_TEXT = %w[success failure error].freeze
     attr_reader :status, :diff
@@ -17,7 +19,7 @@ class SiteDiff
       if error
         @status = STATUS_ERROR
       else
-        @diff = Diff::html_diffy(before, after)
+        @diff = Diff.html_diffy(before, after)
         @status = @diff ? STATUS_FAILURE : STATUS_SUCCESS
       end
     end
@@ -28,7 +30,7 @@ class SiteDiff
     # Textual representation of the status
     def status_text
-      return STATUS_TEXT[status]
+      STATUS_TEXT[status]
     end
     # Printable URL
@@ -39,7 +41,7 @@ class SiteDiff
     # Filename to store diff
     def filename
-      File.join(SiteDiff::DIFFS_DIR, Digest::SHA1.hexdigest(self.path) + '.html')
+      File.join(SiteDiff::DIFFS_DIR, Digest::SHA1.hexdigest(path) + '.html')
     end
     # Text of the link in the HTML report
@@ -52,15 +54,15 @@ class SiteDiff
     end
     # Log the result to the terminal
-    def log(verbose=true)
+    def log(verbose = true)
       case status
       when STATUS_SUCCESS then
-        SiteDiff::log path, :diff_success, 'SUCCESS'
+        SiteDiff.log path, :diff_success, 'UNCHANGED'
       when STATUS_ERROR then
-        SiteDiff::log path, :warn, "ERROR (#{error})"
+        SiteDiff.log path, :warn, "ERROR (#{error})"
       when STATUS_FAILURE then
-        SiteDiff::log path, :diff_failure, "FAILURE"
-        puts Diff::terminal_diffy(before, after) if verbose
+        SiteDiff.log path, :diff_failure, 'CHANGED'
+        puts Diff.terminal_diffy(before, after) if verbose
       end
     end
@@ -68,9 +70,9 @@ class SiteDiff
     def dump(dir)
       dump_path = File.join(dir, filename)
       base = File.dirname(dump_path)
-      FileUtils::mkdir_p(base) unless File.exists?(base)
+      FileUtils.mkdir_p(base) unless File.exist?(base)
       File.open(dump_path, 'w') do |f|
-        f.write(Diff::generate_diff_output(self))
+        f.write(Diff.generate_diff_output(self))
       end
     end
   end

data/lib/sitediff/rules.rb CHANGED Viewed

@@ -1,65 +1,65 @@
+# frozen_string_literal: true
 require 'sitediff/sanitize/regexp'
 require 'pathname'
 require 'set'
 class SiteDiff
-# Find appropriate rules for a given site
-class Rules
-  def initialize(config, disabled = false)
-    @disabled = disabled
-    @config = config
-    find_sanitization_candidates
-    @rules = Hash.new { |h, k| h[k] = Set.new }
-  end
+  # Find appropriate rules for a given site
+  class Rules
+    def initialize(config, disabled = false)
+      @disabled = disabled
+      @config = config
+      find_sanitization_candidates
+      @rules = Hash.new { |h, k| h[k] = Set.new }
+    end
-  def find_sanitization_candidates
-    @candidates = Set.new
+    def find_sanitization_candidates
+      @candidates = Set.new
-    rules_dir = Pathname.new(__FILE__).dirname + 'files' + 'rules'
-    rules_dir.children.each do |f|
-      next unless f.file? && f.extname == '.yaml'
-      conf = YAML.load_file(f)
-      @candidates.merge(conf['sanitization'])
-    end
-  end
+      rules_dir = Pathname.new(__FILE__).dirname + 'files' + 'rules'
+      rules_dir.children.each do |f|
+        next unless f.file? && f.extname == '.yaml'
-  def handle_page(tag, html, doc)
-    found = find_rules(html, doc)
-    @rules[tag].merge(found)
-  end
+        conf = YAML.load_file(f)
+        @candidates.merge(conf['sanitization'])
+      end
+    end
-  # Yield a set of rules that seem reasonable for this HTML
-  # assumption: the YAML file is a list of regexp rules only
-  def find_rules(html, doc)
-    rules = []
+    def handle_page(tag, html, doc)
+      found = find_rules(html, doc)
+      @rules[tag].merge(found)
+    end
-    return @candidates.select do |rule|
-      re = SiteDiff::Sanitizer::Regexp.create(rule)
-      re.applies?(html, doc)
+    # Yield a set of rules that seem reasonable for this HTML
+    # assumption: the YAML file is a list of regexp rules only
+    def find_rules(html, doc)
+      @candidates.select do |rule|
+        re = SiteDiff::Sanitizer::Regexp.create(rule)
+        re.applies?(html, doc)
+      end
     end
-  end
-  # Find all rules from all rulesets that apply for all pages
-  def add_config
-    have_both = @rules.include?(:before)
+    # Find all rules from all rulesets that apply for all pages
+    def add_config
+      have_both = @rules.include?(:before)
-    r1, r2 = *@rules.values_at(:before, :after)
-    if have_both
-      add_section('before', r1 - r2)
-      add_section('after', r2 - r1)
-      add_section(nil, r1 & r2)
-    else
-      add_section(nil, r2)
+      r1, r2 = *@rules.values_at(:before, :after)
+      if have_both
+        add_section('before', r1 - r2)
+        add_section('after', r2 - r1)
+        add_section(nil, r1 & r2)
+      else
+        add_section(nil, r2)
+      end
     end
-  end
-  def add_section(name, rules)
-    return if rules.empty?
-    conf = name ? @config[name] : @config
-    if @disabled
-      rules.each { |r| r['disabled'] = true }
+    def add_section(name, rules)
+      return if rules.empty?
+      conf = name ? @config[name] : @config
+      rules.each { |r| r['disabled'] = true } if @disabled
+      conf['sanitization'] = rules.to_a.sort_by { |r| r['title'] }
     end
-    conf['sanitization'] = rules.to_a.sort_by { |r| r['title'] }
   end
 end
-end

data/lib/sitediff/sanitize.rb CHANGED Viewed

@@ -1,3 +1,5 @@
+# frozen_string_literal: true
 require 'sitediff'
 require 'sitediff/exception'
 require 'sitediff/sanitize/dom_transform'
@@ -6,183 +8,176 @@ require 'nokogiri'
 require 'set'
 class SiteDiff
-class Sanitizer
-class InvalidSanitization < SiteDiffException; end
-TOOLS = {
-  :array => %w[dom_transform sanitization],
-  :scalar => %w[selector remove_spacing],
-}
-DOM_TRANSFORMS = Set.new(%w[remove unwrap_root unwrap remove_class])
-def initialize(html, config, opts = {})
-  @html = html
-  @config = config
-  @opts = opts
-end
-def sanitize
-  return '' if @html == '' # Quick return on empty input
+  class Sanitizer
+    class InvalidSanitization < SiteDiffException; end
+    TOOLS = {
+      array: %w[dom_transform sanitization],
+      scalar: %w[selector remove_spacing]
+    }.freeze
+    DOM_TRANSFORMS = Set.new(%w[remove unwrap_root unwrap remove_class])
+    def initialize(html, config, opts = {})
+      @html = html
+      @config = config
+      @opts = opts
+    end
-  @node, @html = Sanitizer.domify(@html), nil
+    def sanitize
+      return '' if @html == '' # Quick return on empty input
-  remove_spacing
-  selector
-  dom_transforms
-  regexps
+      @node = Sanitizer.domify(@html)
+      @html = nil
-  return @html || Sanitizer.prettify(@node)
-end
+      remove_spacing
+      selector
+      dom_transforms
+      regexps
-# Return whether or not we want to keep a rule
-def want_rule(rule)
-  return false unless rule
-  return false if rule['disabled']
+      @html || Sanitizer.prettify(@node)
+    end
-  # Filter out if path regexp doesn't match
-  if (pathre = rule['path']) and (path = @opts[:path])
-    return ::Regexp.new(pathre).match(path)
-  end
+    # Return whether or not we want to keep a rule
+    def want_rule(rule)
+      return false unless rule
+      return false if rule['disabled']
-  return true
-end
+      # Filter out if path regexp doesn't match
+      if (pathre = rule['path']) && (path = @opts[:path])
+        return ::Regexp.new(pathre).match(path)
+      end
-# Canonicalize a simple rule, eg: 'remove_spacing' or 'selector'.
-# It may be a simple value, or a hash, or an array of hashes.
-# Turn it into an array of hashes.
-def canonicalize_rule(name)
-  rules = @config[name] or return nil
-  if rules[0] && rules[0].respond_to?(:[]) && rules[0]['value']
-    # Already an array
-  elsif rules['value']
-    # Hash, put it in an array
-    rules = [rules]
-  else
-    # Scalar, put it in a hash
-    rules = [{ 'value' => rules }]
-  end
+      true
+    end
-  want = rules.select { |r| want_rule(r) }
-  return nil if want.empty?
-  raise "Too many matching rules of type #{name}" if want.size > 1
-  return want.first
-end
+    # Canonicalize a simple rule, eg: 'remove_spacing' or 'selector'.
+    # It may be a simple value, or a hash, or an array of hashes.
+    # Turn it into an array of hashes.
+    def canonicalize_rule(name)
+      (rules = @config[name]) || (return nil)
+      if rules[0]&.respond_to?(:[]) && rules[0]['value']
+        # Already an array
+      elsif rules['value']
+        # Hash, put it in an array
+        rules = [rules]
+      else
+        # Scalar, put it in a hash
+        rules = [{ 'value' => rules }]
+      end
+      want = rules.select { |r| want_rule(r) }
+      return nil if want.empty?
+      raise "Too many matching rules of type #{name}" if want.size > 1
+      want.first
+    end
-# Perform 'remove_spacing' action
-def remove_spacing
-  rule = canonicalize_rule('remove_spacing') or return
-  Sanitizer.remove_node_spacing(@node) if rule['value']
-end
+    # Perform 'remove_spacing' action
+    def remove_spacing
+      (rule = canonicalize_rule('remove_spacing')) || return
+      Sanitizer.remove_node_spacing(@node) if rule['value']
+    end
-# Perform 'selector' action, to choose a new root
-def selector
-  rule = canonicalize_rule('selector') or return
-  @node = Sanitizer.select_fragments(@node, rule['value'])
-end
+    # Perform 'selector' action, to choose a new root
+    def selector
+      (rule = canonicalize_rule('selector')) || return
+      @node = Sanitizer.select_fragments(@node, rule['value'])
+    end
-# Applies regexps. Also
-def regexps
-  rules = @config['sanitization'] or return
-  rules = rules.select { |r| want_rule(r) }
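+    # Applies regexps. Also converts the DOM node into prettified HTML, so that
+    # selector rules run on the node and global rules run on the resulting text.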
+    def regexps
+      (rules = @config['sanitization']) || return
+      rules = rules.select { |r| want_rule(r) }
-  rules.map! { |r| Regexp.create(r) }
-  selector, global = rules.partition { |r| r.selector? }
+      rules.map! { |r| Regexp.create(r) }
+      selector, global = rules.partition(&:selector?)
-  selector.each { |r| r.apply(@node) }
-  @html, @node = Sanitizer.prettify(@node), nil
-  global.each { |r| r.apply(@html) }
-end
+      selector.each { |r| r.apply(@node) }
+      @html = Sanitizer.prettify(@node)
+      @node = nil
+      global.each { |r| r.apply(@html) }
+    end
-# Perform DOM transforms
-def dom_transforms
-  rules = @config['dom_transform'] or return
-  rules = rules.select { |r| want_rule(r) }
+    # Perform DOM transforms
+    def dom_transforms
+      (rules = @config['dom_transform']) || return
+      rules = rules.select { |r| want_rule(r) }
-  rules.each do |rule|
-    transform = DomTransform.create(rule)
-    transform.apply(@node)
-  end
-end
+      rules.each do |rule|
+        transform = DomTransform.create(rule)
+        transform.apply(@node)
+      end
+    end
-##### Implementations of actions #####
+    ##### Implementations of actions #####
-# Remove double-spacing inside text nodes
-def self.remove_node_spacing(node)
-  # remove double spacing, but only inside text nodes (eg not attributes)
-  node.xpath('//text()').each do |el|
-    el.content = el.content.gsub(/  +/, ' ')
-  end
-end
+    # Remove double-spacing inside text nodes
+    def self.remove_node_spacing(node)
+      # remove double spacing, but only inside text nodes (eg not attributes)
+      node.xpath('//text()').each do |el|
+        el.content = el.content.gsub(/  +/, ' ')
+      end
+    end
-# Get a fragment consisting of the elements matching the selector(s)
-def self.select_fragments(node, sel)
-  # When we choose a new root, we always become a DocumentFragment,
-  # and lose any DOCTYPE and such.
-  ns = node.css(sel)
-  unless node.fragment?
-    node = Nokogiri::HTML.fragment('')
-  end
-  node.children = ns
-  return node
-end
+    # Get a fragment consisting of the elements matching the selector(s)
+    def self.select_fragments(node, sel)
+      # When we choose a new root, we always become a DocumentFragment,
+      # and lose any DOCTYPE and such.
+      ns = node.css(sel)
+      node = Nokogiri::HTML.fragment('') unless node.fragment?
+      node.children = ns
+      node
+    end
-# Pretty-print some HTML
-def self.prettify(obj)
-  @stylesheet ||= begin
-    stylesheet_path = File.join(SiteDiff::FILES_DIR, 'pretty_print.xsl')
-    Nokogiri::XSLT(File.read(stylesheet_path))
-  end
+    # Pretty-print some HTML
+    def self.prettify(obj)
+      @stylesheet ||= begin
+        stylesheet_path = File.join(SiteDiff::FILES_DIR, 'pretty_print.xsl')
+        Nokogiri::XSLT(File.read(stylesheet_path))
+      end
-  # Pull out the html element's children
-  # The obvious way to do this is to iterate over pretty.css('html'),
-  # but that tends to segfault Nokogiri
-  str = @stylesheet.apply_to(to_document(obj))
+      # Pull out the html element's children
+      # The obvious way to do this is to iterate over pretty.css('html'),
+      # but that tends to segfault Nokogiri
+      str = @stylesheet.apply_to(to_document(obj))
-  # There's a lot of cruft left over,that we don't want
+      # There's a lot of cruft left over, that we don't want
-  # Remove xml declaration and <html> tags
-  str.sub!(/\A<\?xml.*$\n/, '')
-  str.sub!(/\A^<html>$\n/, '')
-  str.sub!(%r[</html>\n\Z], '')
+      # Remove xml declaration and <html> tags
+      str.sub!(/\A<\?xml.*$\n/, '')
+      str.sub!(/\A^<html>$\n/, '')
+      str.sub!(%r{</html>\n\Z}, '')
-  # Remove top-level indentation
-  indent = /\A(\s*)/.match(str)[1].size
-  str.gsub!(/^\s{,#{indent}}/, '')
+      # Remove top-level indentation
+      indent = /\A(\s*)/.match(str)[1].size
+      str.gsub!(/^\s{,#{indent}}/, '')
-  # Remove blank lines
-  str.gsub!(/^\s*$\n/, '')
+      # Remove blank lines
+      str.gsub!(/^\s*$\n/, '')
-  return str
-end
+      str
+    end
-# Parse HTML into a node
-def self.domify(str, force_doc = false)
-  if force_doc || /<!DOCTYPE/.match(str[0, 512])
-    return Nokogiri::HTML(str)
-  else
-    return Nokogiri::HTML.fragment(str)
-  end
-end
+    # Parse HTML into a node
+    def self.domify(str, force_doc = false)
+      if force_doc || /<!DOCTYPE/.match(str[0, 512])
+        Nokogiri::HTML(str)
+      else
+        Nokogiri::HTML.fragment(str)
+      end
+    end
-# Force this object to be a document, so we can apply a stylesheet
-def self.to_document(obj)
-  if Nokogiri::XML::Document === obj
-    return obj
-  elsif Nokogiri::XML::Node === obj # node or fragment
-    return domify(obj.to_s, true)
-    # This ought to work, and would be faster,
-    # but seems to segfault Nokogiri
-    if false
-      doc = Nokogiri::HTML('<html><body>')
-      doc.at('body').children = obj.children
-      return doc
+    # Force this object to be a document, so we can apply a stylesheet
+    def self.to_document(obj)
+      if Nokogiri::XML::Document == obj.class || Nokogiri::HTML::Document == obj.class
+        obj
+      # node or fragment
+      elsif Nokogiri::XML::Node == obj.class || Nokogiri::HTML::DocumentFragment == obj.class
+        domify(obj.to_s, true)
+      else
+        to_document(domify(obj, false))
+      end
     end
-  else
-    return to_document(domify(obj))
   end
 end
-end
-end