sitediff 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 1dc3a624b91cd4b7ef1c926116630cd795532024
4
+ data.tar.gz: e49f227ae303f574b704ffe3a226f79a120ae30f
5
+ SHA512:
6
+ metadata.gz: 90ca5508b834d32ac7c96aa6a94a6aa8488921e978e76890e142b1249da20bc620ddcfa237f3defc1e6928d83dd0a22583c9dded150855c320f94140e1bffdf1
7
+ data.tar.gz: 24bf7969b6f17c269bb407d1ff1684f6556318d0cfa7c6c92a8327ddd0d86ee4f153778affa4d1ef115e47a3b69b31b4dac5f01ed8b7f464a05fd98f9f98212b
@@ -0,0 +1,10 @@
1
+ #!/usr/bin/env ruby
2
+
3
# When run as an installed gem, $0 is the generated wrapper (for example
# /usr/local/bin/sitediff) rather than this file, so the bundled lib directory
# is only prepended to the load path when this script is executed directly.
$LOAD_PATH.unshift File.expand_path('../../lib', __FILE__) if $0 == __FILE__

require 'sitediff'

# Hand control over to the Thor-based command line interface.
SiteDiff::Cli.start
@@ -0,0 +1,130 @@
1
+ #!/bin/env ruby
2
+ require 'sitediff/cli.rb'
3
+ require 'sitediff/config.rb'
4
+ require 'sitediff/result.rb'
5
+ require 'sitediff/uriwrapper'
6
+ require 'sitediff/util/cache'
7
+ require 'typhoeus'
8
+ require 'rainbow'
9
+
10
+ class SiteDiff
11
+ # path to misc. static files (e.g. erb, css files)
12
+ FILES_DIR = File.join(File.dirname(__FILE__), 'sitediff', 'files')
13
+
14
+ # subdirectory containing all failing diffs
15
+ DIFFS_DIR = 'diffs'
16
+
17
# Prints a message to stdout, prefixed with a "[sitediff]" label.
#
# Only the label is colorized; str is printed as-is.
#
# str   - message text appended after the label
# type  - selects the label colors: :success, :failure or :error; any other
#         value leaves the label uncolored
# label - optional extra text appended to the "[sitediff]" prefix
def self.log(str, type=nil, label=nil)
  # type => [background, foreground]; a nil entry means "leave unchanged"
  palette = {
    :success => [:green, :black],
    :failure => [:red, nil],
    :error => [:yellow, :black],
  }
  bg, fg = palette[type]

  prefix = label ? "[sitediff] #{label}" : '[sitediff]'
  painted = Rainbow(prefix)
  painted = painted.bg(bg) if bg
  painted = painted.fg(fg) if fg
  puts painted + ' ' + str
end
37
+
38
+ attr_reader :config, :results
39
+ def before
40
+ @config.before['url']
41
+ end
42
+ def after
43
+ @config.after['url']
44
+ end
45
+
46
# Installs a request cache backed by the given file.
#
# Does nothing when file is nil. The cache is set on Typhoeus::Config, so it
# is shared by every request in the process.
# FIXME: Non-global cache would be nice
def cache=(file)
  return unless file

  installed = Gem::Version.new(Typhoeus::VERSION)
  if installed < Gem::Version.new('0.6.4')
    # Bug, see: https://github.com/typhoeus/typhoeus/pull/296
    SiteDiff::log("Cache unsupported on Typhoeus version < 0.6.4", :failure)
    return
  end

  Typhoeus::Config.cache = SiteDiff::Util::Cache.new(file)
end
56
+
57
+ def initialize(config, cache)
58
+ config.validate
59
+ @config = config
60
+ self.cache = cache
61
+ end
62
+
63
+ # Sanitize an HTML string based on configuration for either before or after
64
+ def sanitize(html, pos)
65
+ Sanitize::sanitize(html, @config.send(pos))
66
+ end
67
+
68
# Queues fetching of the before and after versions of path on the given
# Typhoeus::Hydra instance.
#
# Once both reads have completed, the resulting diff is logged and stored in
# @results under path.
def queue_read(hydra, path)
  # ( :before | :after ) => ReadResult object
  reads = {}
  [:before, :after].each do |pos|
    target = UriWrapper.new(send(pos) + path)

    target.queue(hydra) do |res|
      reads[pos] = res

      # Only the callback that completes the pair computes the diff.
      if reads.size == 2
        error = reads[:before].error || reads[:after].error
        diff =
          if error
            Result.new(path, nil, nil, error)
          else
            Result.new(path, sanitize(reads[:before].content, :before),
                       sanitize(reads[:after].content, :after), nil)
          end
        diff.log
        @results[path] = diff
      end
    end
  end
end
94
+
95
# Performs the comparison for every configured path.
#
# Afterwards @results holds one Result per configured path, in the order the
# paths appear in the configuration.
def run
  # path => Result; filled in by the callbacks that queue_read registers
  @results = {}

  hydra = Typhoeus::Hydra.new(max_concurrency: 3)
  @config.paths.each { |path| queue_read(hydra, path) }
  hydra.run

  # Completion order is arbitrary; restore the original path order.
  @results = @results.values_at(*@config.paths)
end
107
+
108
# Writes the results to disk.
#
# dir           - output directory (created if missing)
# report_before - base URL shown in the report for "before"; defaults to the
#                 configured before URL
# report_after  - same, for "after"
# failing_paths - file that receives one non-successful path per line
def dump(dir, report_before, report_after, failing_paths)
  report_before ||= before
  report_after ||= after
  FileUtils.mkdir_p(dir)

  # Wipe diffs from any previous run, then store one file per failing case.
  FileUtils.rm_rf(File.join(dir, DIFFS_DIR))
  results.each do |r|
    next unless r.status == Result::STATUS_FAILURE
    r.dump(dir)
  end
  SiteDiff::log "All diff files were dumped inside #{dir}"

  # Record every path that did not succeed (failures and errors alike).
  SiteDiff::log "Writing failures to #{failing_paths}"
  File.open(failing_paths, 'w') do |f|
    results.reject { |r| r.success? }.each { |r| f.puts r.path }
  end

  # Summary report covering all results.
  report = Diff::generate_html_report(results, report_before, report_after)
  File.open(File.join(dir, "/report.html"), 'w') { |f| f.write(report) }
end
130
+ end
@@ -0,0 +1,90 @@
1
+ require 'thor'
2
+ require 'sitediff/diff'
3
+ require 'sitediff/sanitize'
4
+ require 'sitediff/util/webserver'
5
+ require 'open-uri'
6
+ require 'uri'
7
+
8
+ class SiteDiff
9
+ class Cli < Thor
10
+ # Thor, by default, exits with 0 no matter what!
11
+ def self.exit_on_failure?
12
+ true
13
+ end
14
+
15
+ # Thor, by default, does not raise an error for use of unknown options.
16
+ def self.check_unknown_options?(config)
17
+ true
18
+ end
19
+
20
+ option 'dump-dir',
21
+ :type => :string,
22
+ :default => File.join('.', 'output'),
23
+ :desc => "Location to write the output to."
24
+ option 'paths',
25
+ :type => :string,
26
+ :desc => 'Paths are read (one at a line) from PATHS: ' +
27
+ 'useful for iterating over sanitization rules',
28
+ :aliases => '--paths-from-file'
29
+ option 'before',
30
+ :type => :string,
31
+ :desc => "URL used to fetch the before HTML. Acts as a prefix to specified paths",
32
+ :aliases => '--before-url'
33
+ option 'after',
34
+ :type => :string,
35
+ :desc => "URL used to fetch the after HTML. Acts as a prefix to specified paths.",
36
+ :aliases => '--after-url'
37
+ option 'before-report',
38
+ :type => :string,
39
+ :desc => "Before URL to use for reporting purposes. Useful if port forwarding.",
40
+ :aliases => '--before-url-report'
41
+ option 'after-report',
42
+ :type => :string,
43
+ :desc => "After URL to use for reporting purposes. Useful if port forwarding.",
44
+ :aliases => '--after-url-report'
45
+ option 'cache',
46
+ :type => :string,
47
+ :desc => "Filename to use for caching requests.",
48
+ :lazy_default => 'cache.db'
49
+ desc "diff [OPTIONS] [CONFIGFILES]", "Perform systematic diff on given URLs"
50
# Runs the diff described by the given configuration files, applying any
# command line overrides, and dumps the results into the dump directory.
#
# An invalid configuration is reported through SiteDiff.log rather than
# propagated.
def diff(*config_files)
  config = SiteDiff::Config.new(config_files)

  # override config based on options
  if paths_file = options['paths']
    # File.exists? was deprecated and then removed in Ruby 3.2.
    unless File.exist? paths_file
      raise Config::InvalidConfig,
        "Paths file '#{paths_file}' not found!"
    end
    SiteDiff::log "Reading paths from: #{paths_file}"
    config.paths = File.readlines(paths_file)
  end
  config.before['url'] = options['before'] if options['before']
  config.after['url'] = options['after'] if options['after']

  sitediff = SiteDiff.new(config, options['cache'])
  sitediff.run

  failing_paths = File.join(options['dump-dir'], 'failures.txt')
  sitediff.dump(options['dump-dir'], options['before-report'],
    options['after-report'], failing_paths)
rescue Config::InvalidConfig => e
  SiteDiff.log "Invalid configuration: #{e.message}", :failure
end
74
+
75
+ option :port,
76
+ :type => :numeric,
77
+ :default => SiteDiff::Util::Webserver::DEFAULT_PORT,
78
+ :desc => 'The port to serve on'
79
+ option :directory,
80
+ :type => :string,
81
+ :default => 'output',
82
+ :desc => 'The directory to serve',
83
+ :aliases => '--dump-dir'
84
+ desc "serve [OPTIONS]", "Serve the sitediff output directory over HTTP"
85
+ def serve
86
+ SiteDiff::Util::Webserver.serve(options[:port], options[:directory],
87
+ :announce => true).wait
88
+ end
89
+ end
90
+ end
@@ -0,0 +1,154 @@
1
+ require 'yaml'
2
+
3
+ class SiteDiff
4
+ class Config
5
+
6
+ # keys allowed in configuration files
7
+ CONF_KEYS = Sanitize::TOOLS.values.flatten(1) +
8
+ %w[paths before after before_url after_url includes]
9
+
10
# Raised when a configuration file or the merged configuration is unusable.
# Derives from StandardError (not Exception) so that a plain `rescue` treats
# it like any other application error.
class InvalidConfig < StandardError; end
11
+
12
+ # Takes a Hash and normalizes it to the following form by merging globals
13
+ # into before and after. A normalized config Hash looks like this:
14
+ #
15
+ # paths:
16
+ # - /about
17
+ #
18
+ # before:
19
+ # url: http://before
20
+ # selector: body
21
+ # dom_transform:
22
+ # - type: remove
23
+ # selector: script
24
+ #
25
+ # after:
26
+ # url: http://after
27
+ # selector: body
28
+ #
29
# Folds the global settings of conf into its 'before' and 'after' sections
# and returns a Hash containing only the 'before', 'after' and 'paths' keys.
# Note that conf itself is modified along the way.
def self.normalize(conf)
  array_keys = Sanitize::TOOLS[:array]
  scalar_keys = Sanitize::TOOLS[:scalar]

  # merge globals
  %w[before after].each do |pos|
    side = (conf[pos] ||= {})

    # array-valued tools: side-specific entries first, then the globals
    array_keys.each do |key|
      side[key] ||= []
      side[key] += conf[key] if conf[key]
    end

    # scalar tools: the global value only fills in an unset side value
    scalar_keys.each { |key| side[key] ||= conf[key] }

    side['url'] ||= conf[pos + '_url']
  end

  # normalize paths
  conf['paths'] = Config::normalize_paths(conf['paths'])

  conf.select { |key, _value| %w[before after paths].include? key }
end
47
+
48
+ # Merges two normalized Hashes according to the following rules:
49
+ # 1 paths are merged as arrays.
50
+ # 2 before and after: for each subhash H (e.g. ['before']['dom_transform']):
51
+ # a) if first[H] and second[H] are expected to be arrays, their values
52
+ # are merged as such,
53
+ # b) if first[H] and second[H] are expected to be scalars, the value for
54
+ # second[H] is kept if and only if first[H] is nil.
55
+ #
56
+ # For example, merge(h1, h2) results in h3:
57
+ #
58
+ # (h1) before: {selector: foo, sanitization: [pattern: foo]}
59
+ # (h2) before: {selector: bar, sanitization: [pattern: bar]}
60
+ # (h3) before: {selector: foo, sanitization: [pattern: foo, pattern: bar]}
61
# Returns a new Hash combining two normalized configuration Hashes.
#
# Neither argument is modified, and a 'before'/'after' section missing from
# either side is treated as empty.
def self.merge(first, second)
  result = {}
  result['paths'] = (first['paths'] || []) + (second['paths'] || []) # rule 1

  %w[before after].each do |pos|
    ours = first[pos] || {}
    theirs = second[pos] || {}

    # Non-destructive merge; the block only runs for keys present on both sides.
    result[pos] = ours.merge(theirs) do |key, a, b|
      if Sanitize::TOOLS[:array].include? key # rule 2a
        (a || []) + (b || [])
      else
        a || b # rule 2b
      end
    end
  end
  result
end
79
+
80
+ def initialize(files)
81
+ @config = {'paths' => [], 'before' => {}, 'after' => {} }
82
+ files.each do |file|
83
+ @config = Config::merge(@config, Config::load_conf(file))
84
+ end
85
+ end
86
+
87
+ def before
88
+ @config['before']
89
+ end
90
+ def after
91
+ @config['after']
92
+ end
93
+
94
+ def paths
95
+ @config['paths']
96
+ end
97
+ def paths=(paths)
98
+ @config['paths'] = Config::normalize_paths(paths)
99
+ end
100
+
101
+ # Checks if the configuration is usable for diff-ing.
102
+ def validate
103
+ raise InvalidConfig, "Undefined 'before' base URL." unless before['url']
104
+ raise InvalidConfig, "Undefined 'after' base URL." unless after['url']
105
+ raise InvalidConfig, "Undefined 'paths'." unless (paths and !paths.empty?)
106
+ end
107
+
108
+ private
109
+
110
# Returns the given paths with a leading '/' ensured and any trailing line
# terminator stripped. A nil argument yields an empty Array.
def self.normalize_paths(paths)
  (paths || []).map do |path|
    rooted = path[0] == '/' ? path : "/#{path}"
    rooted.chomp
  end
end
114
+
115
# Reads a YAML configuration file and returns its content as a Hash (an empty
# file yields {}).
#
# Raises InvalidConfig if the file is not syntactically valid YAML, if its
# top-level value is not a Hash, or if it contains a key outside CONF_KEYS.
def self.load_raw_yaml(file)
  SiteDiff::log "Reading config file: #{file}"
  begin
    conf = YAML.load_file(file) || {}
  rescue Psych::SyntaxError => e
    # Surface YAML syntax problems as a configuration error, not a raw
    # parser exception.
    raise InvalidConfig, "Invalid configuration file: '#{file}' (#{e.message})"
  end
  unless conf.is_a? Hash
    raise InvalidConfig, "Invalid configuration file: '#{file}'"
  end
  conf.each do |k,v|
    unless CONF_KEYS.include? k
      raise InvalidConfig, "Unknown configuration key (#{file}): '#{k}'"
    end
  end
  conf
end
129
+
130
# Loads a single YAML configuration file, merges all of its 'includes' into
# it and returns a normalized Hash.
#
# file    - path of the configuration file
# visited - real paths of the files loaded so far, used to detect cycles
#
# Raises InvalidConfig if the file does not exist or was already visited.
def self.load_conf(file, visited=[])
  # don't get fooled by a/../a/ or symlinks
  begin
    file = File.realpath(file)
  rescue Errno::ENOENT
    # Report a missing file as a configuration problem so callers that
    # handle InvalidConfig can print a readable message.
    raise InvalidConfig, "Configuration file not found: '#{file}'"
  end
  if visited.include? file
    raise InvalidConfig, "Circular dependency: #{file}"
  end

  conf = load_raw_yaml(file) # not normalized yet
  visited << file

  # 'includes' must be read before normalizing, which drops that key
  includes = conf['includes'] || []
  conf = Config::normalize(conf)
  includes.each do |dep|
    # include paths are relative to the including file.
    dep = File.join(File.dirname(file), dep)
    conf = Config::merge(conf, load_conf(dep, visited))
  end
  conf
end
152
+
153
+ end
154
+ end
@@ -0,0 +1,37 @@
1
+ require 'diffy'
2
+ require 'erb'
3
+ require 'rainbow'
4
+
5
class SiteDiff
  # Helpers that compute diffs and render the ERB templates found in
  # SiteDiff::FILES_DIR.
  module Diff
    module_function

    # Returns the HTML rendering of the diff between the two strings, or nil
    # when the diff is empty.
    def html_diffy(before_html, after_html)
      diff = Diffy::Diff.new(before_html, after_html)
      return nil unless diff.first # empty diff
      diff.to_s(:html)
    end

    # Returns the diff as text with three lines of context, colorized when
    # Rainbow is enabled.
    def terminal_diffy(before_html, after_html)
      format_args = Rainbow.enabled ? [:color] : []
      Diffy::Diff.new(before_html, after_html, :context => 3).to_s(*format_args)
    end

    # Renders html_report.html.erb. The template is evaluated with this
    # method's binding, so it can see results, before and after.
    def generate_html_report(results, before, after)
      template = File.read(File.join(SiteDiff::FILES_DIR, 'html_report.html.erb'))
      ERB.new(template).result(binding)
    end

    # Renders diff.html.erb with result visible to the template.
    def generate_diff_output(result)
      template = File.read(File.join(SiteDiff::FILES_DIR, 'diff.html.erb'))
      ERB.new(template).result(binding)
    end

    # Returns the content of the bundled sitediff.css stylesheet.
    def css
      stylesheet = File.join(SiteDiff::FILES_DIR, 'sitediff.css')
      File.read(stylesheet)
    end
  end
end
@@ -0,0 +1,11 @@
1
+ <html>
2
+ <head>
3
+ <meta charset="utf-8" />
4
+ <style>
5
+ <%= Diffy::CSS %>
6
+ </style>
7
+ </head>
8
+ <body>
9
+ <%= result.diff %>
10
+ </body>
11
+ </html>
@@ -0,0 +1,47 @@
1
+ <!DOCTYPE html>
2
+ <html>
3
+ <head>
4
+ <!-- important: otherwise chrome will choke on non-ascii characters -->
5
+ <meta charset="utf-8" />
6
+ <style>
7
+ <%= SiteDiff::Diff.css %>
8
+ </style>
9
+ <title> SiteDiff Report </title>
10
+ </head>
11
+ <body>
12
+ <div class="sitediff">
13
+ <div class="legend">
14
+ <strong>before</strong> (base url): <a href="<%=before%>"><%=before%></a> |
15
+ <strong>after </strong> (base url): <a href="<%=after%>" ><%=after %></a>
16
+ </div>
17
+ <table class="results">
18
+
19
+ <colgroup>
20
+ <col class="before-col">
21
+ <col class="after-col">
22
+ <col class="path-col">
23
+ <col class="diff-stat-col">
24
+ </colgroup>
25
+
26
+ <thead>
27
+ <tr>
28
+ <th> Before </th>
29
+ <th> After </th>
30
+ <th> Path </th>
31
+ <th> Status </th>
32
+ </tr>
33
+ </thead>
34
+
35
+ <% results.each do |result| %>
36
+ <tr class="<%= result.status_text %>">
37
+ <td class="before"><a href="<%= result.url(before) %>">[before]</a></td>
38
+ <td class="after"><a href="<%= result.url(after) %>">[after]</a></td>
39
+ <td class="path"><%= result.path %></td>
40
+ <td class="status"><%= result.link %></td>
41
+ </tr>
42
+ <% end %>
43
+
44
+ </table>
45
+ </div>
46
+ </body>
47
+ </html>
@@ -0,0 +1,9 @@
1
+ <xsl:stylesheet version="1.0" xmlns:xsl="http://www.w3.org/1999/XSL/Transform">
2
+ <xsl:output method="xml" encoding="UTF-8" indent="yes"/>
3
+ <xsl:param name="indent-increment" select="' '"/>
4
+ <xsl:strip-space elements="*"/>
5
+
6
+ <xsl:template match="/">
7
+ <xsl:copy-of select="."/>
8
+ </xsl:template>
9
+ </xsl:stylesheet>
@@ -0,0 +1,42 @@
1
+ .sitediff {
2
+ font-family: monospace;
3
+ font-size: 1.2em;
4
+ }
5
+ .sitediff .legend {
6
+ width: 95%;
7
+ margin: 1em auto;
8
+ text-align: center;
9
+ }
10
+ .sitediff .results thead {
11
+ background: black;
12
+ color: white;
13
+ }
14
+ .sitediff .results td {
15
+ text-align: center;
16
+ }
17
+ .sitediff .results td.path {
18
+ text-align: left;
19
+ padding-left: 1em;
20
+ }
21
+ .sitediff .results {
22
+ padding: 1em;
23
+ width: 95%;
24
+ margin: 1em auto;
25
+ font-size: 1em;
26
+ }
27
+ .sitediff tr.error > td.status,
28
+ .sitediff tr.error > td.path {
29
+ background-color: khaki;
30
+ }
31
+ .sitediff tr.failure > td.status,
32
+ .sitediff tr.failure > td.path {
33
+ background-color: salmon;
34
+ }
35
+ .sitediff .before-col,
36
+ .sitediff .after-col,
37
+ .sitediff .diff-stat-col {
38
+ width: 10%;
39
+ }
40
+ .sitediff .path-col {
41
+ width: 55%;
42
+ }
@@ -0,0 +1,74 @@
1
+ require 'fileutils'
2
+ require 'digest/sha1'
3
+
4
class SiteDiff
  # Outcome of comparing one path: the sanitized before/after content or the
  # error that prevented the comparison.
  class Result < Struct.new(:path, :before, :after, :error)
    STATUS_SUCCESS = 0 # Identical before and after
    STATUS_FAILURE = 1 # Different before and after
    STATUS_ERROR = 2 # Couldn't fetch page
    STATUS_TEXT = %w[success failure error]

    attr_reader :status, :diff

    # Accepts (path, before, after, error). When error is set the status is
    # STATUS_ERROR; otherwise the HTML diff is computed and the status
    # reflects whether it is empty.
    def initialize(*args)
      super
      if error
        @status = STATUS_ERROR
      else
        @diff = Diff::html_diffy(before, after)
        @status = @diff ? STATUS_FAILURE : STATUS_SUCCESS
      end
    end

    # True when before and after were identical.
    def success?
      status == STATUS_SUCCESS
    end

    # Textual representation of the status
    def status_text
      return STATUS_TEXT[status]
    end

    # Printable URL
    def url(prefix)
      prefix.to_s + path
    end

    # Filename to store diff, relative to the dump directory. The path is
    # hashed so that any path maps to a flat file name.
    def filename
      File.join(SiteDiff::DIFFS_DIR, Digest::SHA1.hexdigest(self.path) + '.html')
    end

    # Text of the link in the HTML report
    def link
      case status
      when STATUS_ERROR then error
      when STATUS_SUCCESS then status_text
      when STATUS_FAILURE then "<a href='#{filename}'>DIFF</a>"
      end
    end

    # Log the result to the terminal
    def log
      case status
      when STATUS_SUCCESS then
        SiteDiff::log path, :success, 'SUCCESS'
      when STATUS_ERROR then
        SiteDiff::log path, :error, "ERROR (#{error})"
      when STATUS_FAILURE then
        SiteDiff::log path, :failure, "FAILURE"
        puts Diff::terminal_diffy(before, after)
      end
    end

    # Dump the result to a file below dir, creating parent directories.
    def dump(dir)
      dump_path = File.join(dir, filename)
      # mkdir_p is a no-op for existing directories, so no existence check is
      # needed (File.exists? was removed in Ruby 3.2).
      FileUtils::mkdir_p(File.dirname(dump_path))
      File.open(dump_path, 'w') do |f|
        f.write(Diff::generate_diff_output(self))
      end
    end
  end
end
@@ -0,0 +1,193 @@
1
+ require 'nokogiri'
2
+ require 'set'
3
+
4
+ class SiteDiff
5
+ module Sanitize
6
+ class InvalidSanitization < Exception; end
7
+
8
+ TOOLS = {
9
+ :array => %w[dom_transform sanitization],
10
+ :scalar => %w[selector remove_spacing],
11
+ }
12
+ DOM_TRANSFORMS = Set.new(%w[remove unwrap_root unwrap remove_class])
13
+
14
+ module_function
15
+
16
+ # Performs dom transformations.
17
+ #
18
+ # Currently supported transforms:
19
+ #
20
+ # * { :type => "unwrap_root" }
21
+ # * { :type => "unwrap", :selector => "div.field-item" }
22
+ # * { :type => "remove", :selector => "div.extra-stuff" }
23
+ #
24
+ # @arg node - Nokogiri document or Node
25
+ # @arg rules - array of dom_transform rules
26
+ # @return - transformed Nokogiri document node
27
+ def perform_dom_transforms(node, rules)
28
+ rules.each do |rule|
29
+ type = rule['type'] or
30
+ raise InvalidSanitization, "DOM transform needs a type"
31
+ DOM_TRANSFORMS.include?(type) or
32
+ raise InvalidSanitization, "No DOM transform named #{type}"
33
+
34
+ meth = 'transform_' + type
35
+
36
+ if sels = rule['selector']
37
+ sels = [sels].flatten # Either array or scalar is fine
38
+ # Call method for each node the selectors find
39
+ sels.each do |sel|
40
+ node.css(sel).each { |e| send(meth, rule, e) }
41
+ end
42
+ else
43
+ send(meth, rule, node)
44
+ end
45
+ end
46
+ end
47
+
48
+ def transform_remove(rule, el)
49
+ el.remove
50
+ end
51
+ def transform_unwrap(rule, el)
52
+ el.add_next_sibling(el.children)
53
+ el.remove
54
+ end
55
+ def transform_remove_class(rule, el)
56
+ # Must call remove_class on a NodeSet!
57
+ ns = Nokogiri::XML::NodeSet.new(el.document, [el])
58
+ [rule['class']].flatten.each do |class_name|
59
+ ns.remove_class(class_name)
60
+ end
61
+ end
62
+ def transform_unwrap_root(rule, node)
63
+ node.children.size == 1 or
64
+ raise InvalidSanitization, "Multiple root elements in unwrap_root"
65
+ node.children = node.children[0].children
66
+ end
67
+
68
# Parses an HTML string with Nokogiri.
#
# str        - HTML source
# force_doc  - when true, always parse as a complete document
# log_errors - when true, every parse error is logged through SiteDiff::log
#
# The string is parsed as a full document when force_doc is set or when
# "<!DOCTYPE" occurs within its first 512 characters, and as a fragment
# otherwise. Returns the parsed object.
def parse(str, force_doc = false, log_errors = false)
  whole_document = force_doc || /<!DOCTYPE/.match(str[0, 512])
  doc = whole_document ? Nokogiri::HTML(str) : Nokogiri::HTML.fragment(str)

  if log_errors
    doc.errors.each do |e|
      SiteDiff::log "Error in parsing HTML document: #{e}", :error
    end
  end

  doc
end
82
+
83
+ # Force this object to be a document, so we can apply a stylesheet
84
+ def to_document(obj)
85
+ if Nokogiri::XML::Document === obj
86
+ return obj
87
+ elsif Nokogiri::XML::Node === obj # or fragment
88
+ return parse(obj.to_s, true)
89
+
90
+ # This ought to work, and would be faster,
91
+ # but seems to segfault Nokogiri
92
+ # doc = Nokogiri::HTML('<html><body>')
93
+ # doc.at('body').children = obj.children
94
+ # return doc
95
+ else
96
+ return to_document(parse(obj))
97
+ end
98
+ end
99
+
100
+ # Pretty-print the HTML
101
+ def prettify(obj)
102
+ @stylesheet ||= begin
103
+ stylesheet_path = File.join(SiteDiff::FILES_DIR, 'pretty_print.xsl')
104
+ Nokogiri::XSLT(File.read(stylesheet_path))
105
+ end
106
+
107
+ # Pull out the html element's children
108
+ # The obvious way to do this is to iterate over pretty.css('html'),
109
+ # but that tends to segfault Nokogiri
110
+ str = @stylesheet.apply_to(to_document(obj))
111
+
112
+ # Remove xml declaration and <html> tags
113
+ str.sub!(/\A<\?xml.*$\n/, '')
114
+ str.sub!(/\A^<html>$\n/, '')
115
+ str.sub!(%r[</html>\n\Z], '')
116
+
117
+ # Remove top-level indentation
118
+ indent = /\A(\s*)/.match(str)[1].size
119
+ str.gsub!(/^\s{,#{indent}}/, '')
120
+
121
+ # Remove blank lines
122
+ str.gsub!(/^\s*$\n/, '')
123
+
124
+ return str
125
+ end
126
+
127
+ def remove_spacing(doc)
128
+ # remove double spacing, but only inside text nodes (eg not attributes)
129
+ doc.xpath('//text()').each do |node|
130
+ node.content = node.content.gsub(/ +/, ' ')
131
+ end
132
+ end
133
+
134
# Applies one regexp rule to str IN PLACE and returns str.
#
# rule['pattern'] is interpolated into a regular expression and every match
# is replaced by rule['substitute'] (an empty string when absent).
# FIXME escape forward slashes, right now we are escaping them in YAML!
def substitute(str, rule)
  replacement = rule['substitute'] || ''
  str.gsub!(/#{rule['pattern']}/, replacement)
  str
end
140
+
141
+ # Do all regexp sanitization rules
142
+ def perform_regexps(node, rules)
143
+ rules ||= []
144
+
145
+ # First do rules with a selector
146
+ rules.each do |rule|
147
+ if sel = rule['selector']
148
+ node.css(sel).each do |e|
149
+ e.replace(substitute(e.to_html, rule))
150
+ end
151
+ end
152
+ end
153
+
154
+ # If needed, do rules without a selector. We'd rather not convert to
155
+ # a string unless necessary.
156
+ global_rules = rules.reject { |r| r['selector'] }
157
+ return node if global_rules.empty?
158
+
159
+ str = node.to_html # Convert to string
160
+ global_rules.each { |r| substitute(str, r) }
161
+ return str
162
+ end
163
+
164
+ def select_root(node, sel)
165
+ return node unless sel
166
+
167
+ # When we choose a new root, we always become a DocumentFragment,
168
+ # and lose any DOCTYPE and such.
169
+ ns = node.css(sel)
170
+ unless node.fragment?
171
+ node = Nokogiri::HTML.fragment('')
172
+ end
173
+ node.children = ns
174
+ return node
175
+ end
176
+
177
+ def sanitize(str, config)
178
+ return '' if str == ''
179
+
180
+ node = parse(str)
181
+
182
+ remove_spacing(node) if config['remove_spacing']
183
+ node = select_root(node, config['selector'])
184
+ if transform = config['dom_transform']
185
+ perform_dom_transforms(node, transform)
186
+ end
187
+
188
+ obj = perform_regexps(node, config['sanitization'])
189
+
190
+ return prettify(obj)
191
+ end
192
+ end
193
+ end
@@ -0,0 +1,118 @@
1
+ require 'typhoeus'
2
+
3
+ class SiteDiff
4
+ class SiteDiffReadFailure < Exception; end
5
+
6
+ class UriWrapper
7
+ # This lets us treat errors or content as one object
8
+ class ReadResult < Struct.new(:content, :error)
9
+ def initialize(cont, err = nil)
10
+ super(cont, err)
11
+ end
12
+ def self.error(err); new(nil, err); end
13
+ end
14
+
15
+ def initialize(uri)
16
+ @uri = uri.respond_to?(:scheme) ? uri : URI.parse(uri)
17
+ # remove trailing '/'s from local URIs
18
+ @uri.path.gsub!(/\/*$/, '') if local?
19
+ end
20
+
21
+ def user
22
+ @uri.user
23
+ end
24
+
25
+ def password
26
+ @uri.password
27
+ end
28
+
29
+ def to_s
30
+ uri = @uri.dup
31
+ uri.user = nil
32
+ uri.password = nil
33
+ return uri.to_s
34
+ end
35
+
36
+ # Is this a local filesystem path?
37
+ def local?
38
+ @uri.scheme == nil
39
+ end
40
+
41
+ # FIXME this is not used anymore
42
+ def +(path)
43
+ # 'path' for SiteDiff includes (parts of) path, query, and fragment.
44
+ sep = ''
45
+ if local? || @uri.path.empty?
46
+ sep = '/'
47
+ end
48
+ self.class.new(@uri.to_s + sep + path)
49
+ end
50
+
51
# Reads the local file named by @uri as UTF-8 and yields a ReadResult to the
# completion handler, see .queue()
#
# A file that is missing, unreadable or a directory yields an error
# ReadResult. The handler runs outside the rescue, so exceptions it raises
# propagate and it is never invoked a second time.
def read_file(&handler)
  content = File.open(@uri.to_s, 'r:UTF-8') { |f| f.read }
rescue Errno::ENOENT, Errno::ENOTDIR, Errno::EACCES, Errno::EISDIR => e
  yield ReadResult.error(e.message)
else
  yield ReadResult.new(content)
end
57
+
58
# Returns the charset named in the Content-Type entry of http_headers, or nil
# if there are no headers, no Content-Type, or no non-empty charset.
#
# The "charset" parameter name is matched case-insensitively and the value
# may be wrapped in double quotes.
def http_encoding(http_headers)
  return nil unless http_headers

  content_type = http_headers['Content-Type']
  return nil unless content_type

  # Require at least one character so "charset=" never yields an empty name.
  md = /;\s*charset\s*=\s*"?([-\w]+)"?/i.match(content_type)
  md ? md[1] : nil
end
67
+
68
# Returns a Typhoeus::Request to fetch @uri
#
# Completion callbacks of the request wrap the given handler which is
# assumed to accept a single ReadResult argument: the response body on
# success, an error message on failure.
def typhoeus_request(&handler)
  params = {
    :connecttimeout => 3, # Don't hang on servers that don't exist
    :followlocation => true, # Follow HTTP redirects (code 301 and 302)
    :headers => {
      "User-Agent" => "Sitediff - https://github.com/evolvingweb/sitediff"
    }
  }
  # Allow basic auth. Interpolation copes with a user given without a
  # password (String + nil would raise TypeError).
  params[:userpwd] = "#{@uri.user}:#{@uri.password}" if @uri.user

  req = Typhoeus::Request.new(self.to_s, params)

  req.on_success do |resp|
    body = resp.body
    # Typhoeus does not respect HTTP headers when setting the encoding
    # resp.body; coerce if possible.
    if encoding = http_encoding(resp.headers)
      begin
        body.force_encoding(encoding)
      rescue ArgumentError
        # Charset name unknown to Ruby: keep the body's current encoding
        # instead of aborting the callback.
      end
    end
    yield ReadResult.new(body)
  end

  req.on_failure do |resp|
    msg = 'Unknown Error'
    msg = resp.status_message if resp and resp.status_message
    yield ReadResult.error("HTTP error #{@uri}: #{msg}")
  end

  req
end
103
+
104
+ # Queue reading this URL, with a completion handler to run after.
105
+ #
106
+ # The handler should be callable as handler[ReadResult].
107
+ #
108
+ # This method may choose not to queue the request at all, but simply
109
+ # execute right away.
110
+ def queue(hydra, &handler)
111
+ if local?
112
+ read_file(&handler)
113
+ else
114
+ hydra.queue(typhoeus_request(&handler))
115
+ end
116
+ end
117
+ end
118
+ end
@@ -0,0 +1,32 @@
1
class SiteDiff
  module Util
    # A typhoeus cache, backed by DBM. Responses are stored Marshal-serialized
    # under the request's cache key.
    class Cache
      # Opens (or creates) the database file.
      # Default to GDBM, if we have it, we don't want pag/dir files
      def initialize(file)
        require 'gdbm'
        @dbm = GDBM.new(file)
      rescue LoadError
        require 'dbm'
        @dbm = DBM.new(file)
      end

      # Returns the key under which req is stored.
      # Older Typhoeus doesn't have cache_key, so one is derived from the
      # request's base URL and options.
      def cache_key(req)
        if req.respond_to?(:cache_key)
          req.cache_key
        else
          Marshal.dump([req.base_url, req.options])
        end
      end

      # Returns the cached response for req, or nil when there is none.
      def get(req)
        serialized = @dbm[cache_key(req)]
        serialized ? Marshal.load(serialized) : nil
      end

      # Stores resp as the cached response for req.
      def set(req, resp)
        key = cache_key(req)
        @dbm[key] = Marshal.dump(resp)
      end
    end
  end
end
@@ -0,0 +1,77 @@
1
+ require 'webrick'
2
+
3
class SiteDiff
  module Util
    # Simple webserver for testing purposes
    class Webserver
      DEFAULT_PORT = 13080

      attr_accessor :ports

      # Serve a list of directories, one WEBrick server per directory, on
      # consecutive ports beginning at start_port (DEFAULT_PORT when nil).
      #
      # params[:announce] - print the served URIs
      # params[:quiet]    - silence WEBrick logging
      #
      # When a block is given it is yielded this instance, and the server
      # threads are killed once the block finishes, even if it raises.
      def initialize(start_port, dirs, params = {})
        start_port ||= DEFAULT_PORT
        @ports = (start_port...(start_port + dirs.size)).to_a

        if params[:announce]
          puts "Serving at #{uris.join(", ")}"
        end

        opts = {}
        if params[:quiet]
          opts[:Logger] = WEBrick::Log.new(IO::NULL)
          opts[:AccessLog] = []
        end

        @threads = []
        dirs.each_with_index do |dir, idx|
          opts[:Port] = @ports[idx]
          opts[:DocumentRoot] = dir
          server = WEBrick::HTTPServer.new(opts)
          @threads << Thread.new { server.start }
        end

        if block_given?
          begin
            yield self
          ensure
            # Never leave server threads running after a failing block.
            kill
          end
        end
      end

      # Kills every server thread.
      def kill
        @threads.each { |t| t.kill }
      end

      # Blocks until every server thread has finished.
      def wait
        @threads.each { |t| t.join }
      end

      # Base URIs of the served directories, in the order they were given.
      def uris
        ports.map { |p| "http://localhost:#{p}" }
      end

      # Helper to serve one dir. An optional block is forwarded to new.
      def self.serve(port, dir, params = {}, &block)
        new(port, [dir], params, &block)
      end
    end

    # Quiet Webserver over the fixture directories BASE/NAMES; the first
    # directory is served as "before" and the last one as "after".
    class FixtureServer < Webserver
      PORT = DEFAULT_PORT + 1
      BASE = 'spec/fixtures/ruby-doc.org'
      NAMES = %w[core-1.9.3 core-2.0]

      def initialize(port = PORT, base = BASE, names = NAMES)
        dirs = names.map { |n| File.join(base, n) }
        super(port, dirs, :quiet => true)
      end

      # URI serving the first fixture directory.
      def before
        uris.first
      end

      # URI serving the last fixture directory.
      def after
        uris.last
      end
    end
  end
end
+ end
metadata ADDED
@@ -0,0 +1,131 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: sitediff
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ platform: ruby
6
+ authors:
7
+ - Alex Dergachev
8
+ - Amir Kadivar
9
+ - Dave Vasilevsky
10
+ autorequire:
11
+ bindir: bin
12
+ cert_chain: []
13
+ date: 2015-04-21 00:00:00.000000000 Z
14
+ dependencies:
15
+ - !ruby/object:Gem::Dependency
16
+ name: thor
17
+ requirement: !ruby/object:Gem::Requirement
18
+ requirements:
19
+ - - '>='
20
+ - !ruby/object:Gem::Version
21
+ version: '0'
22
+ type: :runtime
23
+ prerelease: false
24
+ version_requirements: !ruby/object:Gem::Requirement
25
+ requirements:
26
+ - - '>='
27
+ - !ruby/object:Gem::Version
28
+ version: '0'
29
+ - !ruby/object:Gem::Dependency
30
+ name: nokogiri
31
+ requirement: !ruby/object:Gem::Requirement
32
+ requirements:
33
+ - - '>='
34
+ - !ruby/object:Gem::Version
35
+ version: '0'
36
+ type: :runtime
37
+ prerelease: false
38
+ version_requirements: !ruby/object:Gem::Requirement
39
+ requirements:
40
+ - - '>='
41
+ - !ruby/object:Gem::Version
42
+ version: '0'
43
+ - !ruby/object:Gem::Dependency
44
+ name: diffy
45
+ requirement: !ruby/object:Gem::Requirement
46
+ requirements:
47
+ - - '>='
48
+ - !ruby/object:Gem::Version
49
+ version: '0'
50
+ type: :runtime
51
+ prerelease: false
52
+ version_requirements: !ruby/object:Gem::Requirement
53
+ requirements:
54
+ - - '>='
55
+ - !ruby/object:Gem::Version
56
+ version: '0'
57
+ - !ruby/object:Gem::Dependency
58
+ name: typhoeus
59
+ requirement: !ruby/object:Gem::Requirement
60
+ requirements:
61
+ - - '>='
62
+ - !ruby/object:Gem::Version
63
+ version: '0'
64
+ type: :runtime
65
+ prerelease: false
66
+ version_requirements: !ruby/object:Gem::Requirement
67
+ requirements:
68
+ - - '>='
69
+ - !ruby/object:Gem::Version
70
+ version: '0'
71
+ - !ruby/object:Gem::Dependency
72
+ name: rainbow
73
+ requirement: !ruby/object:Gem::Requirement
74
+ requirements:
75
+ - - '>='
76
+ - !ruby/object:Gem::Version
77
+ version: '0'
78
+ type: :runtime
79
+ prerelease: false
80
+ version_requirements: !ruby/object:Gem::Requirement
81
+ requirements:
82
+ - - '>='
83
+ - !ruby/object:Gem::Version
84
+ version: '0'
85
+ description: |
86
+ SiteDiff makes it easy to see differences between two versions of a website. It accepts a set of paths to compare two versions of the site together with potential normalization/sanitization rules. From the provided paths and configuration SiteDiff generates an HTML report of all the status of HTML comparison between the given paths together with a readable diff-like HTML for each specified path containing the differences between the two versions of the site. It is useful tool for QAing re-deployments, site upgrades, etc.
87
+ email: alex@evolvingweb.ca
88
+ executables:
89
+ - sitediff
90
+ extensions: []
91
+ extra_rdoc_files: []
92
+ files:
93
+ - lib/sitediff/cli.rb
94
+ - lib/sitediff/config.rb
95
+ - lib/sitediff/diff.rb
96
+ - lib/sitediff/result.rb
97
+ - lib/sitediff/sanitize.rb
98
+ - lib/sitediff/uriwrapper.rb
99
+ - lib/sitediff/util/cache.rb
100
+ - lib/sitediff/util/webserver.rb
101
+ - lib/sitediff.rb
102
+ - lib/sitediff/files/diff.html.erb
103
+ - lib/sitediff/files/html_report.html.erb
104
+ - lib/sitediff/files/pretty_print.xsl
105
+ - lib/sitediff/files/sitediff.css
106
+ - bin/sitediff
107
+ homepage: https://github.com/evolvingweb/sitediff/
108
+ licenses:
109
+ - GPL-2
110
+ metadata: {}
111
+ post_install_message:
112
+ rdoc_options: []
113
+ require_paths:
114
+ - lib
115
+ required_ruby_version: !ruby/object:Gem::Requirement
116
+ requirements:
117
+ - - '>='
118
+ - !ruby/object:Gem::Version
119
+ version: 1.9.3
120
+ required_rubygems_version: !ruby/object:Gem::Requirement
121
+ requirements:
122
+ - - '>='
123
+ - !ruby/object:Gem::Version
124
+ version: '0'
125
+ requirements: []
126
+ rubyforge_project:
127
+ rubygems_version: 2.0.14
128
+ signing_key:
129
+ specification_version: 4
130
+ summary: Compare two versions of a site with ease!
131
+ test_files: []