RubyGems - sitediff - Versions diffs - 0.0.1 → 1.0.0 - Mend

sitediff 0.0.1 → 1.0.0

Files changed (31) hide show

checksums.yaml +5 -5
data/bin/sitediff +10 -4
data/lib/sitediff.rb +179 -91
data/lib/sitediff/cache.rb +106 -0
data/lib/sitediff/cli.rb +391 -60
data/lib/sitediff/config.rb +383 -37
data/lib/sitediff/config/creator.rb +114 -0
data/lib/sitediff/config/preset.rb +75 -0
data/lib/sitediff/crawler.rb +131 -0
data/lib/sitediff/diff.rb +57 -12
data/lib/sitediff/exception.rb +5 -0
data/lib/sitediff/fetch.rb +76 -0
data/lib/sitediff/files/diff.html.erb +20 -2
data/lib/sitediff/files/jquery.min.js +2 -0
data/lib/sitediff/files/normalize.css +349 -0
data/lib/sitediff/files/report.html.erb +144 -0
data/lib/sitediff/files/sidebyside.html.erb +16 -0
data/lib/sitediff/files/sitediff.css +236 -29
data/lib/sitediff/files/sitediff.js +176 -0
data/lib/sitediff/report.rb +238 -0
data/lib/sitediff/result.rb +63 -26
data/lib/sitediff/sanitize.rb +160 -141
data/lib/sitediff/sanitize/dom_transform.rb +130 -0
data/lib/sitediff/sanitize/regexp.rb +82 -0
data/lib/sitediff/uriwrapper.rb +114 -35
data/lib/sitediff/webserver.rb +94 -0
data/lib/sitediff/webserver/resultserver.rb +134 -0
metadata +103 -43
data/lib/sitediff/files/html_report.html.erb +0 -47
data/lib/sitediff/util/cache.rb +0 -32
data/lib/sitediff/util/webserver.rb +0 -77

data/lib/sitediff/sanitize/dom_transform.rb ADDED

@@ -0,0 +1,130 @@
+# frozen_string_literal: true
+require 'sitediff/sanitize'
+require 'nokogiri'
+class SiteDiff
+  class Sanitizer
+    # Currently supported transforms (the code reads string keys, e.g. rule['type']):
+    #
+    #  * { :type => "unwrap_root" }
+    #  * { :type => "unwrap", :selector => "div.field-item" }
+    #  * { :type => "remove", :selector => "div.extra-stuff" }
+    #  * { :type => "remove_class", :selector => "div", :class => 'class1' }
+    #  * { :type => "strip", :selector => 'h1' }
+    class DomTransform
+      # Registry mapping a transform type name to its DomTransform subclass; filled by .register.
+      TRANSFORMS = {}
+      ##
+      # Creates a DOM Transform holding the given rule (a hash read with string keys).
+      def initialize(rule)
+        @rule = rule
+      end
+      ##
+      # Rule values may be a scalar or an array; returns either as a flat array.
+      def to_array(val)
+        [val].flatten
+      end
+      ##
+      # Yields every node under +node+ matching any CSS selector in the rule's 'selector' value (one selector or an array).
+      def targets(node)
+        selectors = to_array(@rule['selector'])
+        selectors.each do |sel|
+          node.css(sel).each { |n| yield n }
+        end
+      end
+      ##
+      # Applies the transformation: calls #process (defined by subclasses) on each target under +node+.
+      def apply(node)
+        targets(node) { |t| process(t) }
+      end
+      ##
+      # Registers the receiving class in TRANSFORMS under the given type name.
+      def self.register(name)
+        TRANSFORMS[name] = self
+      end
+      ##
+      # Instantiates the transform registered for rule['type']; raises InvalidSanitization if the type is missing or unknown.
+      def self.create(rule)
+        (type = rule['type']) ||
+          raise(InvalidSanitization, 'DOM transform needs a type')
+        (transform = TRANSFORMS[type]) ||
+          raise(InvalidSanitization, "No DOM transform named #{type}")
+        transform.new(rule)
+      end
+      ##
+      # Remove elements matching 'selector'.
+      class Remove < DomTransform
+        register 'remove'
+        ##
+        # Removes the matched node.
+        def process(node)
+          node.remove
+        end
+      end
+      # Strip leading and trailing whitespace from the content of elements matching 'selector'.
+      class Strip < DomTransform
+        register 'strip'
+        ##
+        # Replaces the node's content with that content stripped of surrounding whitespace.
+        def process(node)
+          node.content = node.content.strip
+        end
+      end
+      # Unwrap elements matching 'selector'.
+      class Unwrap < DomTransform
+        register 'unwrap'
+        ##
+        # Inserts the node's children right after it as siblings, then removes the node itself.
+        def process(node)
+          node.add_next_sibling(node.children)
+          node.remove
+        end
+      end
+      ##
+      # Remove classes from elements matching selector
+      class RemoveClass < DomTransform
+        register 'remove_class'
+        ##
+        # Removes each class named in the rule's 'class' value (one name or an array) from the node.
+        def process(node)
+          classes = to_array(@rule['class'])
+          # Must call remove_class on a NodeSet!
+          ns = Nokogiri::XML::NodeSet.new(node.document, [node])
+          classes.each do |class_name|
+            ns.remove_class(class_name)
+          end
+        end
+      end
+      ##
+      # Unwrap the root element.
+      class UnwrapRoot < DomTransform
+        register 'unwrap_root'
+        ##
+        # Replaces node's only child with that child's children; raises InvalidSanitization unless node has exactly one child.
+        def apply(node)
+          (node.children.size == 1) ||
+            raise(InvalidSanitization, 'Multiple root elements in unwrap_root')
+          node.children = node.children[0].children
+        end
+      end
+    end
+  end
+end

data/lib/sitediff/sanitize/regexp.rb ADDED

@@ -0,0 +1,82 @@
+# frozen_string_literal: true
+class SiteDiff
+  class Sanitizer
+    # Regexp sanitization rule: replaces matches of rule['pattern'] with rule['substitute'] (empty string if unset).
+    class Regexp
+      ##
+      # Creates a RegExp object from a rule hash (read with string keys).
+      def initialize(rule)
+        @rule = rule
+      end
+      ##
+      # Whether the RegExp has a selector; always false for the base class.
+      def selector?
+        false
+      end
+      ##
+      # Whether applying the rule would change the given HTML string; the node argument is ignored here.
+      def applies?(html, _node)
+        applies_to_string?(html)
+      end
+      ##
+      # Applies the substitution to the HTML string in place and returns that string.
+      def apply(html)
+        gsub!(html)
+      end
+      ##
+      # Builds a WithSelector when the rule has a 'selector', otherwise a plain Regexp.
+      def self.create(rule)
+        rule['selector'] ? WithSelector.new(rule) : new(rule)
+      end
+      ##
+      # A RegExp with selector.
+      class WithSelector < Regexp
+        ##
+        # Whether the RegExp has a selector; always true for this subclass.
+        def selector?
+          true
+        end
+        ##
+        # Yields each element under +node+ that matches the rule's 'selector' CSS.
+        def contexts(node)
+          selectors = @rule['selector']
+          node.css(selectors).each { |e| yield(e) }
+        end
+        ##
+        # Whether the substitution would change the HTML of any element matching the selector; the html argument is ignored.
+        def applies?(_html, node)
+          enum_for(:contexts, node).any? { |e| applies_to_string?(e.to_html) }
+        end
+        ##
+        # Replaces each element matching the selector with the result of running the substitution on its HTML.
+        def apply(node)
+          contexts(node) { |e| e.replace(gsub!(e.to_html)) }
+        end
+      end
+      protected
+      def gsub!(str)
+        re = ::Regexp.new(@rule['pattern'])
+        sub = @rule['substitute'] || ''
+        # Expecting a mutation here. Do not reassign the variable str
+        # for the purpose of removing UTF-8 encoding errors.
+        str.gsub!(re, sub)
+        str
+      end
+      def applies_to_string?(str)
+        gsub!(str.dup) != str
+      end
+    end
+  end
+end

data/lib/sitediff/uriwrapper.rb CHANGED

@@ -1,55 +1,97 @@
+# frozen_string_literal: true
+require 'sitediff/exception'
 require 'typhoeus'
+require 'addressable/uri'
 class SiteDiff
-  class SiteDiffReadFailure < Exception; end
+  class SiteDiffReadFailure < SiteDiffException; end
+  # SiteDiff URI Wrapper.
   class UriWrapper
+    # TODO: Move these CURL OPTS to Config.DEFAULT_CONFIG.
+    DEFAULT_CURL_OPTS = {
+      # Don't hang on servers that don't exist.
+      connecttimeout: 3,
+      # Follow HTTP redirects (code 301 and 302).
+      followlocation: true,
+      headers: {
+        'User-Agent' => 'Sitediff - https://github.com/evolvingweb/sitediff'
+      }
+    }.freeze
     # This lets us treat errors or content as one object
-    class ReadResult < Struct.new(:content, :error)
-      def initialize(cont, err = nil)
-        super(cont, err)
+    class ReadResult
+      attr_accessor :encoding, :content, :error_code, :error
+      ##
+      # Creates a ReadResult.
+      def initialize(content = nil, encoding = 'utf-8')
+        @content = content
+        @encoding = encoding
+        @error = nil
+        @error_code = nil
+      end
+      ##
+      # Creates a ReadResult with an error.
+      def self.error(message, code = nil)
+        res = new
+        res.error_code = code
+        res.error = message
+        res
       end
-      def self.error(err); new(nil, err); end
     end
-    def initialize(uri)
-      @uri = uri.respond_to?(:scheme) ? uri : URI.parse(uri)
+    ##
+    # Creates a UriWrapper.
+    def initialize(uri, curl_opts = DEFAULT_CURL_OPTS, debug = true)
+      @uri = uri.respond_to?(:scheme) ? uri : Addressable::URI.parse(uri)
       # remove trailing '/'s from local URIs
-      @uri.path.gsub!(/\/*$/, '') if local?
+      @uri.path.gsub!(%r{/*$}, '') if local?
+      @curl_opts = curl_opts
+      @debug = debug
     end
+    ##
+    # Returns the "user" part of the URI.
     def user
       @uri.user
     end
+    ##
+    # Returns the "password" part of the URI.
     def password
       @uri.password
     end
+    ##
+    # Converts the URI to a string.
     def to_s
       uri = @uri.dup
       uri.user = nil
       uri.password = nil
-      return uri.to_s
+      uri.to_s
     end
+    ##
     # Is this a local filesystem path?
     def local?
-      @uri.scheme == nil
+      @uri.scheme.nil?
     end
-    # FIXME this is not used anymore
-    def +(path)
+    ## Returns a new UriWrapper for this URI with the given path appended.
+    # FIXME: this is not used anymore
+    def +(other)
       # 'path' for SiteDiff includes (parts of) path, query, and fragment.
       sep = ''
-      if local? || @uri.path.empty?
-        sep = '/'
-      end
-      self.class.new(@uri.to_s + sep + path)
+      sep = '/' if local? || @uri.path.empty?
+      self.class.new(@uri.to_s + sep + other)
     end
+    ##
     # Reads a file and yields to the completion handler, see .queue()
-    def read_file(&handler)
+    def read_file
       File.open(@uri.to_s, 'r:UTF-8') { |f| yield ReadResult.new(f.read) }
     rescue Errno::ENOENT, Errno::ENOTDIR, Errno::EACCES, Errno::EISDIR => e
       yield ReadResult.error(e.message)
@@ -57,10 +99,10 @@ class SiteDiff
     # Returns the encoding of an HTTP response from headers, nil if not
     # specified.
-    def http_encoding(http_headers)
-      if content_type = http_headers['Content-Type']
-        if md = /;\s*charset=([-\w]*)/.match(content_type)
-          return md[1]
+    def charset_encoding(http_headers)
+      if (content_type = http_headers['Content-Type'])
+        if (md = /;\s*charset=([-\w]*)/.match(content_type))
+          md[1]
         end
       end
     end
@@ -69,33 +111,58 @@ class SiteDiff
     #
     # Completion callbacks of the request wrap the given handler which is
     # assumed to accept a single ReadResult argument.
-    def typhoeus_request(&handler)
-      params = {
-        :connecttimeout => 3,     # Don't hang on servers that don't exist
-        :followlocation => true,  # Follow HTTP redirects (code 301 and 302)
-        :headers => {
-          "User-Agent" => "Sitediff - https://github.com/evolvingweb/sitediff"
-        }
-      }
+    def typhoeus_request
+      params = @curl_opts.dup
       # Allow basic auth
       params[:userpwd] = @uri.user + ':' + @uri.password if @uri.user
-      req = Typhoeus::Request.new(self.to_s, params)
+      req = Typhoeus::Request.new(to_s, params)
       req.on_success do |resp|
         body = resp.body
         # Typhoeus does not respect HTTP headers when setting the encoding of
         # resp.body; coerce if possible.
-        if encoding = http_encoding(resp.headers)
+        if (encoding = charset_encoding(resp.headers))
           body.force_encoding(encoding)
         end
-        yield ReadResult.new(body)
+        # Should be wrapped with rescue I guess? Maybe this entire function?
+        # Should at least be an option in the Cli to disable this.
+        # "stop on first error"
+        begin
+          yield ReadResult.new(body, encoding)
+        rescue ArgumentError => e
+          raise if @debug
+          yield ReadResult.error(
+            "Parsing error for #{@uri}: #{e.message}"
+          )
+        rescue StandardError => e
+          raise if @debug
+          yield ReadResult.error(
+            "Unknown parsing error for #{@uri}: #{e.message}"
+          )
+        end
       end
       req.on_failure do |resp|
-        msg = 'Unknown Error'
-        msg = resp.status_message if resp and resp.status_message
-        yield ReadResult.error("HTTP error #{@uri}: #{msg}")
+        if resp&.status_message
+          msg = resp.status_message
+          yield ReadResult.error(
+            "HTTP error when loading #{@uri}: #{msg}",
+            resp.response_code
+          )
+        elsif (msg = resp.options[:return_code])
+          yield ReadResult.error(
+            "Connection error when loading #{@uri}: #{msg}",
+            resp.response_code
+          )
+        else
+          yield ReadResult.error(
+            "Unknown error when loading #{@uri}: #{msg}",
+            resp.response_code
+          )
+        end
       end
       req
@@ -114,5 +181,17 @@ class SiteDiff
         hydra.queue(typhoeus_request(&handler))
       end
     end
+    ##
+    # Canonicalize a path.
+    #
+    # @param [String] path
+    #   A base relative path. Example: /foo/bar
+    def self.canonicalize(path)
+      # Ignore trailing slashes for all paths except "/" (front page).
+      path = path.chomp('/') unless path == '/'
+      # If the path is empty, assume that it's the front page.
+      path.empty? ? '/' : path
+    end
   end
 end

data/lib/sitediff/webserver.rb ADDED

@@ -0,0 +1,94 @@
+# frozen_string_literal: true
+require 'webrick'
+class SiteDiff
+  # Simple web server for testing purposes: serves each directory with WEBrick on its own port and thread.
+  class Webserver
+    # First port used when no start port is given.
+    DEFAULT_PORT = 13_080
+    attr_accessor :ports
+    ##
+    # Serves dirs on consecutive ports from start_port (DEFAULT_PORT if nil); with a block, yields self, then kills the servers.
+    def initialize(start_port, dirs, opts = {})
+      start_port ||= DEFAULT_PORT
+      @ports = (start_port...(start_port + dirs.size)).to_a
+      @dirs = dirs
+      @opts = opts
+      setup
+      start_servers
+      if block_given?
+        yield self
+        kill
+      end
+    end
+    ##
+    # Kills all server threads.
+    def kill
+      @threads.each(&:kill)
+    end
+    ##
+    # Blocks until all server threads have finished.
+    def wait
+      @threads.each(&:join)
+    end
+    ##
+    # Returns one "http://localhost:<port>" URI string per port, in the same order as the served directories.
+    def uris
+      ports.map { |p| "http://localhost:#{p}" }
+    end
+    protected
+    def setup
+      @server_opts = {}
+      if @opts[:quiet]
+        @server_opts[:Logger] = WEBrick::Log.new(IO::NULL)
+        @server_opts[:AccessLog] = []
+      end
+    end
+    def server(opts)
+      WEBrick::HTTPServer.new(opts)
+    end
+    def start_servers
+      @threads = []
+      @dirs.each_with_index do |dir, idx|
+        @server_opts[:Port] = @ports[idx]
+        @server_opts[:DocumentRoot] = dir
+        srv = server(@server_opts)
+        @threads << Thread.new { srv.start }
+      end
+    end
+    public
+    # SiteDiff Fixture Server: a quiet Webserver for the NAMES directories under BASE; #before/#after are the first/last URIs.
+    class FixtureServer < Webserver
+      PORT = DEFAULT_PORT + 1
+      BASE = 'spec/sites/ruby-doc.org'
+      NAMES = %w[core-1.9.3 core-2.0].freeze
+      def initialize(port = PORT, base = BASE, names = NAMES)
+        dirs = names.map { |n| File.join(base, n) }
+        super(port, dirs, quiet: true)
+      end
+      def before
+        uris.first
+      end
+      def after
+        uris.last
+      end
+    end
+  end
+end