sitediff 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 1dc3a624b91cd4b7ef1c926116630cd795532024
4
+ data.tar.gz: e49f227ae303f574b704ffe3a226f79a120ae30f
5
+ SHA512:
6
+ metadata.gz: 90ca5508b834d32ac7c96aa6a94a6aa8488921e978e76890e142b1249da20bc620ddcfa237f3defc1e6928d83dd0a22583c9dded150855c320f94140e1bffdf1
7
+ data.tar.gz: 24bf7969b6f17c269bb407d1ff1684f6556318d0cfa7c6c92a8327ddd0d86ee4f153778affa4d1ef115e47a3b69b31b4dac5f01ed8b7f464a05fd98f9f98212b
@@ -0,0 +1,10 @@
1
#!/usr/bin/env ruby

# Launcher for the sitediff command-line tool.
#
# When installed as a gem, $0 is the generated wrapper (e.g.
# /usr/local/bin/sitediff) rather than this file, and the gem's lib directory
# is expected to already be loadable. Only when this file is executed directly
# is the sibling lib directory put at the front of the load path.
$LOAD_PATH.unshift File.expand_path('../../lib', __FILE__) if $0 == __FILE__

require 'sitediff'

# Hand control over to the Thor-based CLI.
SiteDiff::Cli.start
@@ -0,0 +1,130 @@
1
+ #!/bin/env ruby
2
+ require 'sitediff/cli.rb'
3
+ require 'sitediff/config.rb'
4
+ require 'sitediff/result.rb'
5
+ require 'sitediff/uriwrapper'
6
+ require 'sitediff/util/cache'
7
+ require 'typhoeus'
8
+ require 'rainbow'
9
+
10
# Compares a "before" and an "after" version of a site, path by path: both
# versions of every configured path are fetched, sanitized and diffed, and the
# outcome is kept as a Result object.
class SiteDiff
  # path to misc. static files (e.g. erb, css files)
  FILES_DIR = File.join(File.dirname(__FILE__), 'sitediff', 'files')

  # subdirectory containing all failing diffs
  DIFFS_DIR = 'diffs'

  # Label colors for each log type as [background, foreground]; a nil entry
  # leaves that color untouched.
  LOG_COLORS = {
    :success => [:green, :black],
    :failure => [:red, nil],
    :error   => [:yellow, :black],
  }.freeze

  # Prints a message to stdout, prefixed with a "[sitediff]" label.
  #
  # The label will be colorized and str will not be.
  #
  # @param str   message to print (interpolated, so any object is accepted)
  # @param type  dictates the color: :success, :error or :failure; any other
  #              value leaves the label uncolored
  # @param label optional text appended to the "[sitediff]" prefix
  def self.log(str, type=nil, label=nil)
    label = label ? "[sitediff] #{label}" : '[sitediff]'
    bg, fg = LOG_COLORS[type]
    label = Rainbow(label)
    label = label.bg(bg) if bg
    label = label.fg(fg) if fg
    puts "#{label} #{str}"
  end

  attr_reader :config, :results

  # Base URL of the "before" site, as configured.
  def before
    @config.before['url']
  end

  # Base URL of the "after" site, as configured.
  def after
    @config.after['url']
  end

  # Enables request caching backed by the given file; a nil file is a no-op.
  # The cache is installed on Typhoeus globally.
  def cache=(file)
    # FIXME: Non-global cache would be nice
    return unless file
    if Gem::Version.new(Typhoeus::VERSION) >= Gem::Version.new('0.6.4')
      Typhoeus::Config.cache = SiteDiff::Util::Cache.new(file)
    else
      # Bug, see: https://github.com/typhoeus/typhoeus/pull/296
      SiteDiff::log("Cache unsupported on Typhoeus version < 0.6.4", :failure)
    end
  end

  # @param config configuration object; it is validated immediately, so an
  #               unusable configuration raises here
  # @param cache  cache filename, or nil to disable caching
  def initialize(config, cache)
    config.validate
    @config = config
    # Start with an empty result list so that #results and #dump are usable
    # (and simply empty) before #run has been called.
    @results = []
    self.cache = cache
  end

  # Sanitize an HTML string based on configuration for either before or after
  #
  # @param html raw HTML string
  # @param pos  :before or :after, selecting which configuration to apply
  def sanitize(html, pos)
    Sanitize::sanitize(html, @config.send(pos))
  end

  # Queues fetching before and after URLs with a Typhoeus::Hydra instance
  #
  # Upon completion of both before and after, prints and saves the diff to
  # @results.
  def queue_read(hydra, path)
    # ( :before | :after ) => ReadResult object
    reads = {}
    [:before, :after].each do |pos|
      uri = UriWrapper.new(send(pos) + path)

      uri.queue(hydra) do |res|
        reads[pos] = res
        # Wait until the other side has been read as well.
        next unless reads.size == 2

        # we have read both before and after; calculate diff
        error = reads[:before].error || reads[:after].error
        diff =
          if error
            Result.new(path, nil, nil, error)
          else
            Result.new(path, sanitize(reads[:before].content, :before),
                       sanitize(reads[:after].content, :after), nil)
          end
        diff.log
        @results[path] = diff
      end
    end
  end

  # Perform the comparison
  #
  # Afterwards #results holds one entry per configured path, in the order the
  # paths were configured.
  def run
    # Map of path -> Result object, queue_read sets callbacks to populate this
    @results = {}

    hydra = Typhoeus::Hydra.new(max_concurrency: 3)
    @config.paths.each { |path| queue_read(hydra, path) }
    hydra.run

    # Order by original path order
    @results = @config.paths.map { |p| @results[p] }
  end

  # Dump results to disk
  #
  # @param dir           output directory (created if needed)
  # @param report_before base URL shown in the report for "before"; falls back
  #                      to the configured URL when nil
  # @param report_after  same, for "after"
  # @param failing_paths file that receives one non-successful path per line
  def dump(dir, report_before, report_after, failing_paths)
    report_before ||= before
    report_after ||= after
    FileUtils.mkdir_p(dir)

    # store diffs of each failing case, first wipe out existing diffs
    diff_dir = File.join(dir, DIFFS_DIR)
    FileUtils.rm_rf(diff_dir)
    results.each { |r| r.dump(dir) if r.status == Result::STATUS_FAILURE }
    SiteDiff::log "All diff files were dumped inside #{dir}"

    # store failing paths
    SiteDiff::log "Writing failures to #{failing_paths}"
    File.open(failing_paths, 'w') do |f|
      results.each { |r| f.puts r.path unless r.success? }
    end

    # create report of results
    report = Diff::generate_html_report(results, report_before, report_after)
    File.open(File.join(dir, "/report.html") , 'w') { |f| f.write(report) }
  end
end
@@ -0,0 +1,90 @@
1
+ require 'thor'
2
+ require 'sitediff/diff'
3
+ require 'sitediff/sanitize'
4
+ require 'sitediff/util/webserver'
5
+ require 'open-uri'
6
+ require 'uri'
7
+
8
class SiteDiff
  # Thor-based command-line interface exposing the "diff" and "serve"
  # commands.
  class Cli < Thor
    # Thor, by default, exits with 0 no matter what!
    def self.exit_on_failure?
      true
    end

    # Thor, by default, does not raise an error for use of unknown options.
    def self.check_unknown_options?(config)
      true
    end

    option 'dump-dir',
      :type => :string,
      :default => File.join('.', 'output'),
      :desc => "Location to write the output to."
    option 'paths',
      :type => :string,
      :desc => 'Paths are read (one at a line) from PATHS: ' +
        'useful for iterating over sanitization rules',
      :aliases => '--paths-from-file'
    option 'before',
      :type => :string,
      :desc => "URL used to fetch the before HTML. Acts as a prefix to specified paths",
      :aliases => '--before-url'
    option 'after',
      :type => :string,
      :desc => "URL used to fetch the after HTML. Acts as a prefix to specified paths.",
      :aliases => '--after-url'
    option 'before-report',
      :type => :string,
      :desc => "Before URL to use for reporting purposes. Useful if port forwarding.",
      :aliases => '--before-url-report'
    option 'after-report',
      :type => :string,
      :desc => "After URL to use for reporting purposes. Useful if port forwarding.",
      :aliases => '--after-url-report'
    option 'cache',
      :type => :string,
      :desc => "Filename to use for caching requests.",
      :lazy_default => 'cache.db'
    desc "diff [OPTIONS] [CONFIGFILES]", "Perform systematic diff on given URLs"
    # Loads the given config files, applies command-line overrides, runs the
    # comparison and writes the report plus a failures.txt into the dump
    # directory. An invalid configuration is logged rather than raised.
    def diff(*config_files)
      config = SiteDiff::Config.new(config_files)

      # override config based on options
      if paths_file = options['paths']
        # File.exists? was removed in Ruby 3.2; File.exist? works everywhere.
        unless File.exist? paths_file
          raise Config::InvalidConfig,
            "Paths file '#{paths_file}' not found!"
        end
        SiteDiff::log "Reading paths from: #{paths_file}"
        config.paths = File.readlines(paths_file)
      end
      config.before['url'] = options['before'] if options['before']
      config.after['url'] = options['after'] if options['after']

      sitediff = SiteDiff.new(config, options['cache'])
      sitediff.run

      failing_paths = File.join(options['dump-dir'], 'failures.txt')
      sitediff.dump(options['dump-dir'], options['before-report'],
        options['after-report'], failing_paths)
    rescue Config::InvalidConfig => e
      SiteDiff.log "Invalid configuration: #{e.message}", :failure
    end

    option :port,
      :type => :numeric,
      :default => SiteDiff::Util::Webserver::DEFAULT_PORT,
      :desc => 'The port to serve on'
    option :directory,
      :type => :string,
      :default => 'output',
      :desc => 'The directory to serve',
      :aliases => '--dump-dir'
    desc "serve [OPTIONS]", "Serve the sitediff output directory over HTTP"
    # Serves the output directory and blocks until the server thread ends.
    def serve
      SiteDiff::Util::Webserver.serve(options[:port], options[:directory],
        :announce => true).wait
    end
  end
end
@@ -0,0 +1,154 @@
1
+ require 'yaml'
2
+
3
class SiteDiff
  # Loads one or more YAML configuration files (following their 'includes'),
  # normalizes them and merges them into a single configuration.
  class Config

    # keys allowed in configuration files
    CONF_KEYS = Sanitize::TOOLS.values.flatten(1) +
                %w[paths before after before_url after_url includes]

    # Raised for any unusable configuration. Derives from StandardError so
    # that a plain `rescue` catches it, as is conventional for library errors.
    class InvalidConfig < StandardError; end

    # Takes a Hash and normalizes it to the following form by merging globals
    # into before and after. A normalized config Hash looks like this:
    #
    #     paths:
    #     - /about
    #
    #     before:
    #       url: http://before
    #       selector: body
    #       dom_transform:
    #       - type: remove
    #         selector: script
    #
    #     after:
    #       url: http://after
    #       selector: body
    #
    # Note: the 'before' and 'after' entries of conf are filled in in place;
    # the returned Hash contains only the before, after and paths keys.
    def self.normalize(conf)
      tools = Sanitize::TOOLS

      # merge globals
      %w[before after].each do |pos|
        conf[pos] ||= {}
        tools[:array].each do |key|
          conf[pos][key] ||= []
          conf[pos][key] += conf[key] if conf[key]
        end
        tools[:scalar].each { |key| conf[pos][key] ||= conf[key] }
        conf[pos]['url'] ||= conf[pos + '_url']
      end
      # normalize paths
      conf['paths'] = Config::normalize_paths(conf['paths'])

      conf.select { |k, v| %w[before after paths].include? k }
    end

    # Merges two normalized Hashes according to the following rules:
    # 1 paths are merged as arrays.
    # 2 before and after: for each subhash H (e.g. ['before']['dom_transform']):
    #   a) if first[H] and second[H] are expected to be arrays, their values
    #      are merged as such,
    #   b) if first[H] and second[H] are expected to be scalars, the value for
    #      second[H] is kept if and only if first[H] is nil.
    #
    # For example, merge(h1, h2) results in h3:
    #
    # (h1) before: {selector: foo, sanitization: [pattern: foo]}
    # (h2) before: {selector: bar, sanitization: [pattern: bar]}
    # (h3) before: {selector: foo, sanitization: [pattern: foo, pattern: bar]}
    #
    # Neither argument is modified, and a missing 'before'/'after' section on
    # either side is treated as empty.
    def self.merge(first, second)
      result = {}
      result['paths'] = (first['paths'] || []) + (second['paths'] || []) # rule 1
      %w[before after].each do |pos|
        unless first[pos]
          result[pos] = second[pos] || {}
          next
        end
        # The block only runs for keys present on both sides.
        result[pos] = first[pos].merge(second[pos] || {}) do |key, a, b|
          if Sanitize::TOOLS[:array].include? key
            (a || []) + (b || []) # rule 2a
          else
            a || b # rule 2b
          end
        end
      end
      result
    end

    # @param files list of configuration file names; they are loaded and
    #              merged in order
    def initialize(files)
      @config = { 'paths' => [], 'before' => {}, 'after' => {} }
      files.each do |file|
        @config = Config::merge(@config, Config::load_conf(file))
      end
    end

    # Settings Hash for the "before" site.
    def before
      @config['before']
    end

    # Settings Hash for the "after" site.
    def after
      @config['after']
    end

    # List of paths to compare.
    def paths
      @config['paths']
    end

    # Replaces the list of paths; entries are normalized first.
    def paths=(paths)
      @config['paths'] = Config::normalize_paths(paths)
    end

    # Checks if the configuration is usable for diff-ing.
    #
    # @raise [InvalidConfig] if a base URL or the path list is missing
    def validate
      raise InvalidConfig, "Undefined 'before' base URL." unless before['url']
      raise InvalidConfig, "Undefined 'after' base URL." unless after['url']
      raise InvalidConfig, "Undefined 'paths'." if paths.nil? || paths.empty?
    end

    # The helpers below are internal. They are deliberately left as public
    # class methods: a bare `private` does not affect `def self.` methods, and
    # they are invoked with an explicit `Config::` receiver within this class.

    # Prefixes each path with '/' where missing and strips a trailing line
    # break; nil is treated as an empty list.
    def self.normalize_paths(paths)
      paths ||= []
      paths.map { |p| (p[0] == '/' ? p : "/#{p}").chomp }
    end

    # reads a YAML file and raises an InvalidConfig if the file is not valid.
    def self.load_raw_yaml(file)
      SiteDiff::log "Reading config file: #{file}"
      begin
        # NOTE: config files are assumed to be trusted, local input.
        conf = YAML.load_file(file) || {}
      rescue Psych::SyntaxError => e
        raise InvalidConfig, "Invalid configuration file: '#{file}' (#{e.message})"
      end
      unless conf.is_a? Hash
        raise InvalidConfig, "Invalid configuration file: '#{file}'"
      end
      conf.each do |k, v|
        unless CONF_KEYS.include? k
          raise InvalidConfig, "Unknown configuration key (#{file}): '#{k}'"
        end
      end
      conf
    end

    # loads a single YAML configuration file, merges all its 'included' files
    # and returns a normalized Hash.
    #
    # @param visited real paths of files already loaded, used to detect
    #                circular includes
    # @raise [InvalidConfig] on a missing file, invalid content or a circular
    #                        include
    def self.load_conf(file, visited=[])
      # don't get fooled by a/../a/ or symlinks
      begin
        file = File.realpath(file)
      rescue Errno::ENOENT
        raise InvalidConfig, "Missing config file: '#{file}'"
      end
      if visited.include? file
        raise InvalidConfig, "Circular dependency: #{file}"
      end

      conf = load_raw_yaml(file) # not normalized yet
      visited << file

      # normalize and merge includes
      includes = conf['includes'] || []
      conf = Config::normalize(conf)
      includes.each do |dep|
        # include paths are relative to the including file.
        dep = File.join(File.dirname(file), dep)
        conf = Config::merge(conf, load_conf(dep, visited))
      end
      conf
    end

  end
end
@@ -0,0 +1,37 @@
1
+ require 'diffy'
2
+ require 'erb'
3
+ require 'rainbow'
4
+
5
class SiteDiff
  # Helpers that turn a before/after pair into diff output and render the
  # ERB templates shipped in SiteDiff::FILES_DIR.
  module Diff
    module_function

    # HTML rendering of the diff between the two strings, or nil when the
    # diff is empty.
    def html_diffy(before_html, after_html)
      diff = Diffy::Diff.new(before_html, after_html)
      # An empty diff has no first element.
      return nil unless diff.first
      diff.to_s(:html)
    end

    # Diff for terminal output with 3 lines of context; colorized only when
    # Rainbow reports that it is enabled.
    def terminal_diffy(before_html, after_html)
      format = Rainbow.enabled ? [:color] : []
      Diffy::Diff.new(before_html, after_html, :context => 3).to_s(*format)
    end

    # Renders the overall report template. The template is evaluated against
    # this method's binding, so it can refer to results, before and after.
    def generate_html_report(results, before, after)
      template = File.read(File.join(SiteDiff::FILES_DIR, 'html_report.html.erb'))
      ERB.new(template).result(binding)
    end

    # Renders the per-path diff page. The template is evaluated against this
    # method's binding, so it can refer to result.
    def generate_diff_output(result)
      template = File.read(File.join(SiteDiff::FILES_DIR, 'diff.html.erb'))
      ERB.new(template).result(binding)
    end

    # Contents of the stylesheet shipped with sitediff.
    def css
      css_path = File.join(SiteDiff::FILES_DIR, 'sitediff.css')
      File.read(css_path)
    end
  end
end
@@ -0,0 +1,11 @@
1
+ <html>
2
+ <head>
3
+ <meta charset="utf-8" />
4
+ <style>
5
+ <%= Diffy::CSS %>
6
+ </style>
7
+ </head>
8
+ <body>
9
+ <%= result.diff %>
10
+ </body>
11
+ </html>
@@ -0,0 +1,47 @@
1
+ <!DOCTYPE html>
2
+ <html>
3
+ <head>
4
+ <!-- important: otherwise chrome will choke on non-ascii characters -->
5
+ <meta charset="utf-8" />
6
+ <style>
7
+ <%= SiteDiff::Diff.css %>
8
+ </style>
9
+ <title> SiteDiff Report </title>
10
+ </head>
11
+ <body>
12
+ <div class="sitediff">
13
+ <div class="legend">
14
+ <strong>before</strong> (base url): <a href="<%=before%>"><%=before%></a> |
15
+ <strong>after </strong> (base url): <a href="<%=after%>" ><%=after %></a>
16
+ </div>
17
+ <table class="results">
18
+
19
+ <colgroup>
20
+ <col class="before-col">
21
+ <col class="after-col">
22
+ <col class="path-col">
23
+ <col class="diff-stat-col">
24
+ </colgroup>
25
+
26
+ <thead>
27
+ <tr>
28
+ <th> Before </th>
29
+ <th> After </th>
30
+ <th> Path </th>
31
+ <th> Status </th>
32
+ </tr>
33
+ </thead>
34
+
35
+ <% results.each do |result| %>
36
+ <tr class="<%= result.status_text %>">
37
+ <td class="before"><a href="<%= result.url(before) %>">[before]</a></td>
38
+ <td class="after"><a href="<%= result.url(after) %>">[after]</a></td>
39
+ <td class="path"><%= result.path %></td>
40
+ <td class="status"><%= result.link %></td>
41
+ </tr>
42
+ <% end %>
43
+
44
+ </table>
45
+ </div>
46
+ </body>
47
+ </html>
@@ -0,0 +1,9 @@
1
+ <xsl:stylesheet version="1.0" xmlns:xsl="http://www.w3.org/1999/XSL/Transform">
2
+ <xsl:output method="xml" encoding="UTF-8" indent="yes"/>
3
+ <xsl:param name="indent-increment" select="' '"/>
4
+ <xsl:strip-space elements="*"/>
5
+
6
+ <xsl:template match="/">
7
+ <xsl:copy-of select="."/>
8
+ </xsl:template>
9
+ </xsl:stylesheet>
@@ -0,0 +1,42 @@
1
+ .sitediff {
2
+ font-family: monospace;
3
+ font-size: 1.2em;
4
+ }
5
+ .sitediff .legend {
6
+ width: 95%;
7
+ margin: 1em auto;
8
+ text-align: center;
9
+ }
10
+ .sitediff .results thead {
11
+ background: black;
12
+ color: white;
13
+ }
14
+ .sitediff .results td {
15
+ text-align: center;
16
+ }
17
+ .sitediff .results td.path {
18
+ text-align: left;
19
+ padding-left: 1em;
20
+ }
21
+ .sitediff .results {
22
+ padding: 1em;
23
+ width: 95%;
24
+ margin: 1em auto;
25
+ font-size: 1em;
26
+ }
27
+ .sitediff tr.error > td.status,
28
+ .sitediff tr.error > td.path {
29
+ background-color: khaki;
30
+ }
31
+ .sitediff tr.failure > td.status,
32
+ .sitediff tr.failure > td.path {
33
+ background-color: salmon;
34
+ }
35
+ .sitediff .before-col,
36
+ .sitediff .after-col,
37
+ .sitediff .diff-stat-col {
38
+ width: 10%;
39
+ }
40
+ .sitediff .path-col {
41
+ width: 55%;
42
+ }
@@ -0,0 +1,74 @@
1
+ require 'fileutils'
2
+ require 'digest/sha1'
3
+
4
class SiteDiff
  # Outcome of comparing a single path: the sanitized before/after content
  # (or a fetch error), plus the derived status and HTML diff.
  class Result < Struct.new(:path, :before, :after, :error)
    STATUS_SUCCESS  = 0   # Identical before and after
    STATUS_FAILURE  = 1   # Different before and after
    STATUS_ERROR    = 2   # Couldn't fetch page
    STATUS_TEXT = %w[success failure error]

    attr_reader :status, :diff

    # Takes (path, before, after, error). When error is set the contents are
    # not compared; otherwise the HTML diff is computed right away.
    def initialize(*args)
      super
      if error
        @status = STATUS_ERROR
      else
        @diff = Diff::html_diffy(before, after)
        # html_diffy returns nil when there is no difference.
        @status = @diff ? STATUS_FAILURE : STATUS_SUCCESS
      end
    end

    # True only when before and after were identical.
    def success?
      status == STATUS_SUCCESS
    end

    # Textual representation of the status
    def status_text
      STATUS_TEXT[status]
    end

    # Printable URL
    #
    # @param prefix base URL to put in front of the path (nil counts as '')
    def url(prefix)
      prefix.to_s + path
    end

    # Filename to store diff
    #
    # Relative to the dump directory; derived from a SHA1 of the path so any
    # path maps to a flat file name.
    def filename
      File.join(SiteDiff::DIFFS_DIR, Digest::SHA1.hexdigest(self.path) + '.html')
    end

    # Text of the link in the HTML report
    def link
      case status
      when STATUS_ERROR then error
      when STATUS_SUCCESS then status_text
      when STATUS_FAILURE then "<a href='#{filename}'>DIFF</a>"
      end
    end

    # Log the result to the terminal
    #
    # Failures additionally print the terminal diff.
    def log
      case status
      when STATUS_SUCCESS
        SiteDiff::log path, :success, 'SUCCESS'
      when STATUS_ERROR
        SiteDiff::log path, :error, "ERROR (#{error})"
      when STATUS_FAILURE
        SiteDiff::log path, :failure, "FAILURE"
        puts Diff::terminal_diffy(before, after)
      end
    end

    # Dump the result to a file
    #
    # @param dir dump directory; the diff is written to dir/filename
    def dump(dir)
      dump_path = File.join(dir, filename)
      # mkdir_p is a no-op for existing directories, so no existence check is
      # needed (File.exists? was removed in Ruby 3.2).
      FileUtils::mkdir_p(File.dirname(dump_path))
      File.open(dump_path, 'w') do |f|
        f.write(Diff::generate_diff_output(self))
      end
    end
  end
end
@@ -0,0 +1,193 @@
1
+ require 'nokogiri'
2
+ require 'set'
3
+
4
class SiteDiff
  # Normalizes fetched HTML before diffing: root selection, DOM transforms,
  # regexp substitutions and pretty-printing.
  module Sanitize
    # Raised for unusable sanitization rules. Derives from StandardError so
    # that a plain `rescue` catches it.
    class InvalidSanitization < StandardError; end

    # Configuration keys understood by the sanitizer, grouped by value kind.
    TOOLS = {
      :array => %w[dom_transform sanitization],
      :scalar => %w[selector remove_spacing],
    }
    # Names accepted as a dom_transform 'type'; each maps to a transform_*
    # method below.
    DOM_TRANSFORMS = Set.new(%w[remove unwrap_root unwrap remove_class])

    module_function

    # Performs dom transformations, modifying the node in place.
    #
    # Supported transforms (rules use string keys):
    #
    #  * { 'type' => 'unwrap_root' }
    #  * { 'type' => 'unwrap', 'selector' => 'div.field-item' }
    #  * { 'type' => 'remove', 'selector' => 'div.extra-stuff' }
    #  * { 'type' => 'remove_class', 'selector' => ..., 'class' => ... }
    #
    # 'selector' may be a single selector or an array of them; without one
    # the transform is applied to the node itself.
    #
    #  @arg node - Nokogiri document or Node
    #  @arg rules - array of dom_transform rules
    #  @raise InvalidSanitization for a missing or unknown type
    def perform_dom_transforms(node, rules)
      rules.each do |rule|
        type = rule['type']
        raise InvalidSanitization, "DOM transform needs a type" unless type
        unless DOM_TRANSFORMS.include?(type)
          raise InvalidSanitization, "No DOM transform named #{type}"
        end

        # Safe to dispatch dynamically: type was checked against the whitelist.
        meth = 'transform_' + type

        if sels = rule['selector']
          sels = [sels].flatten # Either array or scalar is fine
          # Call method for each node the selectors find
          sels.each do |sel|
            node.css(sel).each { |e| send(meth, rule, e) }
          end
        else
          send(meth, rule, node)
        end
      end
    end

    # Removes the element from its document.
    def transform_remove(rule, el)
      el.remove
    end

    # Replaces the element with its children.
    def transform_unwrap(rule, el)
      el.add_next_sibling(el.children)
      el.remove
    end

    # Removes the class(es) named in rule['class'] from the element.
    def transform_remove_class(rule, el)
      # Must call remove_class on a NodeSet!
      ns = Nokogiri::XML::NodeSet.new(el.document, [el])
      [rule['class']].flatten.each do |class_name|
        ns.remove_class(class_name)
      end
    end

    # Replaces the node's single child with that child's children.
    #
    # @raise InvalidSanitization unless the node has exactly one child
    def transform_unwrap_root(rule, node)
      unless node.children.size == 1
        raise InvalidSanitization, "Multiple root elements in unwrap_root"
      end
      node.children = node.children[0].children
    end

    # Parses an HTML string. It is parsed as a full document when force_doc
    # is set or a DOCTYPE appears within the first 512 characters, and as a
    # fragment otherwise. With log_errors, parser errors are logged.
    def parse(str, force_doc = false, log_errors = false)
      if force_doc || /<!DOCTYPE/.match(str[0, 512])
        doc = Nokogiri::HTML(str)
      else
        doc = Nokogiri::HTML.fragment(str)
      end
      if log_errors
        doc.errors.each do |e|
          SiteDiff::log "Error in parsing HTML document: #{e}", :error
        end
      end
      doc
    end

    # Force this object to be a document, so we can apply a stylesheet
    #
    # Accepts a document, a node/fragment or anything parse accepts.
    def to_document(obj)
      if Nokogiri::XML::Document === obj
        return obj
      elsif Nokogiri::XML::Node === obj # or fragment
        return parse(obj.to_s, true)

        # This ought to work, and would be faster,
        # but seems to segfault Nokogiri
        # doc = Nokogiri::HTML('<html><body>')
        # doc.at('body').children = obj.children
        # return doc
      else
        return to_document(parse(obj))
      end
    end

    # Pretty-print the HTML
    #
    # @return String with the xml declaration, the wrapping <html> tags,
    #         top-level indentation and blank lines removed
    def prettify(obj)
      # The stylesheet is loaded once and reused.
      @stylesheet ||= begin
        stylesheet_path = File.join(SiteDiff::FILES_DIR, 'pretty_print.xsl')
        Nokogiri::XSLT(File.read(stylesheet_path))
      end

      # Pull out the html element's children
      # The obvious way to do this is to iterate over pretty.css('html'),
      # but that tends to segfault Nokogiri
      str = @stylesheet.apply_to(to_document(obj))

      # Remove xml declaration and <html> tags
      str.sub!(/\A<\?xml.*$\n/, '')
      str.sub!(/\A^<html>$\n/, '')
      str.sub!(%r[</html>\n\Z], '')

      # Remove top-level indentation
      indent = /\A(\s*)/.match(str)[1].size
      str.gsub!(/^\s{,#{indent}}/, '')

      # Remove blank lines
      str.gsub!(/^\s*$\n/, '')

      str
    end

    # Collapses runs of spaces into a single space, modifying doc in place.
    def remove_spacing(doc)
      # remove double spacing, but only inside text nodes (eg not attributes)
      doc.xpath('//text()').each do |node|
        node.content = node.content.gsub(/ {2,}/, ' ')
      end
    end

    # Do one regexp transformation on a string
    #
    # The string is modified IN PLACE and also returned; perform_regexps
    # relies on the in-place modification. A rule without 'substitute'
    # deletes the matches.
    def substitute(str, rule)
      #FIXME escape forward slashes, right now we are escaping them in YAML!
      str.gsub!(/#{rule['pattern']}/, rule['substitute'] || '' )
      str
    end

    # Do all regexp sanitization rules
    #
    # @return the node when only selector-scoped rules (or none) exist,
    #         otherwise the substituted HTML String
    def perform_regexps(node, rules)
      rules ||= []

      # First do rules with a selector
      rules.each do |rule|
        if sel = rule['selector']
          node.css(sel).each do |e|
            e.replace(substitute(e.to_html, rule))
          end
        end
      end

      # If needed, do rules without a selector. We'd rather not convert to
      # a string unless necessary.
      global_rules = rules.reject { |r| r['selector'] }
      return node if global_rules.empty?

      str = node.to_html # Convert to string
      global_rules.each { |r| substitute(str, r) }
      str
    end

    # Narrows the node to the elements matching sel; a nil sel returns the
    # node unchanged.
    def select_root(node, sel)
      return node unless sel

      # When we choose a new root, we always become a DocumentFragment,
      # and lose any DOCTYPE and such.
      ns = node.css(sel)
      unless node.fragment?
        node = Nokogiri::HTML.fragment('')
      end
      node.children = ns
      node
    end

    # Runs the full sanitization pipeline on an HTML string.
    #
    # @param str    raw HTML; an empty string is returned as is
    # @param config Hash that may hold 'remove_spacing', 'selector',
    #               'dom_transform' and 'sanitization'
    # @return pretty-printed, sanitized HTML String
    def sanitize(str, config)
      return '' if str == ''

      node = parse(str)

      remove_spacing(node) if config['remove_spacing']
      node = select_root(node, config['selector'])
      if transform = config['dom_transform']
        perform_dom_transforms(node, transform)
      end

      obj = perform_regexps(node, config['sanitization'])

      prettify(obj)
    end
  end
end
@@ -0,0 +1,118 @@
1
+ require 'typhoeus'
2
+
3
class SiteDiff
  # Derives from StandardError so that a plain `rescue` catches it.
  class SiteDiffReadFailure < StandardError; end

  # Wraps a URI that is either an HTTP(S) URL or a local filesystem path and
  # reads it, reporting the outcome to a completion handler.
  class UriWrapper
    # This lets us treat errors or content as one object
    class ReadResult < Struct.new(:content, :error)
      def initialize(cont, err = nil)
        super(cont, err)
      end

      # Builds a result carrying only an error message.
      def self.error(err); new(nil, err); end
    end

    # @param uri a String, or an object responding to #scheme (e.g. a URI)
    def initialize(uri)
      @uri = uri.respond_to?(:scheme) ? uri : URI.parse(uri)
      # remove trailing '/'s from local URIs
      @uri.path.gsub!(/\/*$/, '') if local?
    end

    # User name embedded in the URI, if any.
    def user
      @uri.user
    end

    # Password embedded in the URI, if any.
    def password
      @uri.password
    end

    # String form of the URI with any credentials stripped.
    def to_s
      uri = @uri.dup
      uri.user = nil
      uri.password = nil
      uri.to_s
    end

    # Is this a local filesystem path?
    def local?
      @uri.scheme.nil?
    end

    # FIXME this is not used anymore
    #
    # Returns a new wrapper for this URI with path appended; a '/' separator
    # is inserted for local paths and for URLs with an empty path.
    def +(path)
      # 'path' for SiteDiff includes (parts of) path, query, and fragment.
      sep = (local? || @uri.path.empty?) ? '/' : ''
      self.class.new(@uri.to_s + sep + path)
    end

    # Reads a file and yields to the completion handler, see .queue()
    #
    # The handler is invoked exactly once. It runs outside the rescued
    # region, so an Errno raised by the handler itself is not mistaken for a
    # read failure.
    def read_file(&handler)
      content = File.open(@uri.to_s, 'r:UTF-8') { |f| f.read }
    rescue Errno::ENOENT, Errno::ENOTDIR, Errno::EACCES, Errno::EISDIR => e
      yield ReadResult.error(e.message)
    else
      yield ReadResult.new(content)
    end

    # Returns the encoding of an HTTP response from headers , nil if not
    # specified (or if there are no headers at all).
    def http_encoding(http_headers)
      return nil unless http_headers
      content_type = http_headers['Content-Type']
      return nil unless content_type
      md = /;\s*charset=([-\w]*)/.match(content_type)
      md && md[1]
    end

    # Returns a Typhoeus::Request to fetch @uri
    #
    # Completion callbacks of the request wrap the given handler which is
    # assumed to accept a single ReadResult argument.
    def typhoeus_request(&handler)
      params = {
        :connecttimeout => 3,     # Don't hang on servers that don't exist
        :followlocation => true,  # Follow HTTP redirects (code 301 and 302)
        :headers => {
          "User-Agent" => "Sitediff - https://github.com/evolvingweb/sitediff"
        }
      }
      # Allow basic auth; interpolation tolerates a URI with a user but no
      # password (which then sends an empty password).
      params[:userpwd] = "#{@uri.user}:#{@uri.password}" if @uri.user

      req = Typhoeus::Request.new(self.to_s, params)

      req.on_success do |resp|
        body = resp.body
        # Typhoeus does not respect HTTP headers when setting the encoding
        # resp.body; coerce if possible.
        if encoding = http_encoding(resp.headers)
          begin
            body.force_encoding(encoding)
          rescue ArgumentError
            # Unknown or empty charset name: keep the body's encoding as is.
          end
        end
        yield ReadResult.new(body)
      end

      req.on_failure do |resp|
        msg = 'Unknown Error'
        msg = resp.status_message if resp and resp.status_message
        yield ReadResult.error("HTTP error #{@uri}: #{msg}")
      end

      req
    end

    # Queue reading this URL, with a completion handler to run after.
    #
    # The handler should be callable as handler[ReadResult].
    #
    # This method may choose not to queue the request at all, but simply
    # execute right away.
    def queue(hydra, &handler)
      if local?
        read_file(&handler)
      else
        hydra.queue(typhoeus_request(&handler))
      end
    end
  end
end
@@ -0,0 +1,32 @@
1
class SiteDiff
  module Util
    # A typhoeus cache, backed by DBM
    #
    # Implements the get/set pair that is installed as Typhoeus::Config.cache.
    class Cache
      # @param file name of the database file
      def initialize(file)
        # Default to GDBM, if we have it, we don't want pag/dir files
        begin
          require 'gdbm'
          @dbm = GDBM.new(file)
        rescue LoadError
          require 'dbm'
          @dbm = DBM.new(file)
        end
      end

      # Older Typhoeus doesn't have cache_key
      #
      # Falls back to a marshalled [base_url, options] pair in that case.
      def cache_key(req)
        return req.cache_key if req.respond_to?(:cache_key)
        Marshal.dump([req.base_url, req.options])
      end

      # Returns the cached response for the request, or nil on a miss.
      #
      # An entry that cannot be unmarshalled (corrupt, or written by an
      # incompatible version) is treated as a miss instead of aborting the
      # run.
      def get(req)
        raw = @dbm[cache_key(req)]
        return nil unless raw
        # NOTE(review): Marshal.load is only safe on trusted data; the cache
        # file is assumed to be written exclusively by #set.
        Marshal.load(raw)
      rescue TypeError, ArgumentError
        nil
      end

      # Stores the response for the request.
      def set(req, resp)
        @dbm[cache_key(req)] = Marshal.dump(resp)
      end
    end
  end
end
@@ -0,0 +1,77 @@
1
+ require 'webrick'
2
+
3
class SiteDiff
  module Util
    # Simple webserver for testing purposes
    #
    # Serves each given directory on its own consecutive port, one WEBrick
    # server thread per directory.
    class Webserver
      DEFAULT_PORT = 13080

      attr_accessor :ports

      # Serve a list of directories
      #
      # @param start_port first port to use (DEFAULT_PORT when nil); directory
      #                   number i is served on start_port + i
      # @param dirs       directories to serve
      # @param params     :announce prints the URLs being served, :quiet
      #                   silences WEBrick's logging
      #
      # When a block is given it receives the server object and the servers
      # are killed once the block finishes, even if it raises.
      def initialize(start_port, dirs, params = {})
        start_port ||= DEFAULT_PORT
        @ports = (start_port...(start_port + dirs.size)).to_a

        if params[:announce]
          puts "Serving at #{uris.join(", ")}"
        end

        opts = {}
        if params[:quiet]
          opts[:Logger] = WEBrick::Log.new(IO::NULL)
          opts[:AccessLog] = []
        end

        @threads = []
        dirs.each_with_index do |dir, idx|
          # Each server gets its own options Hash rather than a shared one
          # that is modified on every iteration.
          server_opts = opts.merge(:Port => @ports[idx], :DocumentRoot => dir)
          server = WEBrick::HTTPServer.new(server_opts)
          @threads << Thread.new { server.start }
        end

        if block_given?
          begin
            yield self
          ensure
            kill
          end
        end
      end

      # Kills all server threads.
      def kill
        @threads.each { |t| t.kill }
      end

      # Blocks until all server threads have finished.
      def wait
        @threads.each { |t| t.join }
      end

      # Base URLs of the served directories, in directory order.
      def uris
        ports.map { |p| "http://localhost:#{p}" }
      end


      # Helper to serve one dir
      def self.serve(port, dir, params = {})
        new(port, [dir], params)
      end
    end

    # Quiet webserver for the spec fixtures: one directory per name below
    # base, the first acting as "before" and the last as "after".
    class FixtureServer < Webserver
      PORT = DEFAULT_PORT + 1
      BASE = 'spec/fixtures/ruby-doc.org'
      NAMES = %w[core-1.9.3 core-2.0]

      def initialize(port = PORT, base = BASE, names = NAMES)
        dirs = names.map { |n| File.join(base, n) }
        super(port, dirs, :quiet => true)
      end

      # URL of the first served directory.
      def before
        uris.first
      end

      # URL of the last served directory.
      def after
        uris.last
      end
    end
  end
end
metadata ADDED
@@ -0,0 +1,131 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: sitediff
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ platform: ruby
6
+ authors:
7
+ - Alex Dergachev
8
+ - Amir Kadivar
9
+ - Dave Vasilevsky
10
+ autorequire:
11
+ bindir: bin
12
+ cert_chain: []
13
+ date: 2015-04-21 00:00:00.000000000 Z
14
+ dependencies:
15
+ - !ruby/object:Gem::Dependency
16
+ name: thor
17
+ requirement: !ruby/object:Gem::Requirement
18
+ requirements:
19
+ - - '>='
20
+ - !ruby/object:Gem::Version
21
+ version: '0'
22
+ type: :runtime
23
+ prerelease: false
24
+ version_requirements: !ruby/object:Gem::Requirement
25
+ requirements:
26
+ - - '>='
27
+ - !ruby/object:Gem::Version
28
+ version: '0'
29
+ - !ruby/object:Gem::Dependency
30
+ name: nokogiri
31
+ requirement: !ruby/object:Gem::Requirement
32
+ requirements:
33
+ - - '>='
34
+ - !ruby/object:Gem::Version
35
+ version: '0'
36
+ type: :runtime
37
+ prerelease: false
38
+ version_requirements: !ruby/object:Gem::Requirement
39
+ requirements:
40
+ - - '>='
41
+ - !ruby/object:Gem::Version
42
+ version: '0'
43
+ - !ruby/object:Gem::Dependency
44
+ name: diffy
45
+ requirement: !ruby/object:Gem::Requirement
46
+ requirements:
47
+ - - '>='
48
+ - !ruby/object:Gem::Version
49
+ version: '0'
50
+ type: :runtime
51
+ prerelease: false
52
+ version_requirements: !ruby/object:Gem::Requirement
53
+ requirements:
54
+ - - '>='
55
+ - !ruby/object:Gem::Version
56
+ version: '0'
57
+ - !ruby/object:Gem::Dependency
58
+ name: typhoeus
59
+ requirement: !ruby/object:Gem::Requirement
60
+ requirements:
61
+ - - '>='
62
+ - !ruby/object:Gem::Version
63
+ version: '0'
64
+ type: :runtime
65
+ prerelease: false
66
+ version_requirements: !ruby/object:Gem::Requirement
67
+ requirements:
68
+ - - '>='
69
+ - !ruby/object:Gem::Version
70
+ version: '0'
71
+ - !ruby/object:Gem::Dependency
72
+ name: rainbow
73
+ requirement: !ruby/object:Gem::Requirement
74
+ requirements:
75
+ - - '>='
76
+ - !ruby/object:Gem::Version
77
+ version: '0'
78
+ type: :runtime
79
+ prerelease: false
80
+ version_requirements: !ruby/object:Gem::Requirement
81
+ requirements:
82
+ - - '>='
83
+ - !ruby/object:Gem::Version
84
+ version: '0'
85
+ description: |
86
+ SiteDiff makes it easy to see differences between two versions of a website. It accepts a set of paths to compare between two versions of the site, together with optional normalization/sanitization rules. From the provided paths and configuration, SiteDiff generates an HTML report of the status of the HTML comparison for the given paths, together with a readable diff-like HTML page for each specified path containing the differences between the two versions of the site. It is a useful tool for QAing re-deployments, site upgrades, etc.
87
+ email: alex@evolvingweb.ca
88
+ executables:
89
+ - sitediff
90
+ extensions: []
91
+ extra_rdoc_files: []
92
+ files:
93
+ - lib/sitediff/cli.rb
94
+ - lib/sitediff/config.rb
95
+ - lib/sitediff/diff.rb
96
+ - lib/sitediff/result.rb
97
+ - lib/sitediff/sanitize.rb
98
+ - lib/sitediff/uriwrapper.rb
99
+ - lib/sitediff/util/cache.rb
100
+ - lib/sitediff/util/webserver.rb
101
+ - lib/sitediff.rb
102
+ - lib/sitediff/files/diff.html.erb
103
+ - lib/sitediff/files/html_report.html.erb
104
+ - lib/sitediff/files/pretty_print.xsl
105
+ - lib/sitediff/files/sitediff.css
106
+ - bin/sitediff
107
+ homepage: https://github.com/evolvingweb/sitediff/
108
+ licenses:
109
+ - GPL-2
110
+ metadata: {}
111
+ post_install_message:
112
+ rdoc_options: []
113
+ require_paths:
114
+ - lib
115
+ required_ruby_version: !ruby/object:Gem::Requirement
116
+ requirements:
117
+ - - '>='
118
+ - !ruby/object:Gem::Version
119
+ version: 1.9.3
120
+ required_rubygems_version: !ruby/object:Gem::Requirement
121
+ requirements:
122
+ - - '>='
123
+ - !ruby/object:Gem::Version
124
+ version: '0'
125
+ requirements: []
126
+ rubyforge_project:
127
+ rubygems_version: 2.0.14
128
+ signing_key:
129
+ specification_version: 4
130
+ summary: Compare two versions of a site with ease!
131
+ test_files: []