sitediff 0.0.1 → 0.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/bin/sitediff +1 -1
- data/lib/sitediff.rb +79 -63
- data/lib/sitediff/cache.rb +61 -0
- data/lib/sitediff/cli.rb +144 -23
- data/lib/sitediff/config.rb +46 -9
- data/lib/sitediff/config/creator.rb +122 -0
- data/lib/sitediff/crawler.rb +95 -0
- data/lib/sitediff/diff.rb +2 -1
- data/lib/sitediff/exception.rb +3 -0
- data/lib/sitediff/fetch.rb +55 -0
- data/lib/sitediff/files/html_report.html.erb +20 -4
- data/lib/sitediff/files/rules/drupal.yaml +33 -0
- data/lib/sitediff/files/sidebyside.html.erb +13 -0
- data/lib/sitediff/files/sitediff.css +11 -0
- data/lib/sitediff/result.rb +12 -9
- data/lib/sitediff/rules.rb +65 -0
- data/lib/sitediff/sanitize.rb +163 -168
- data/lib/sitediff/sanitize/dom_transform.rb +92 -0
- data/lib/sitediff/sanitize/regexp.rb +56 -0
- data/lib/sitediff/uriwrapper.rb +19 -7
- data/lib/sitediff/webserver.rb +82 -0
- data/lib/sitediff/webserver/resultserver.rb +98 -0
- metadata +70 -25
- checksums.yaml +0 -7
- data/lib/sitediff/util/cache.rb +0 -32
- data/lib/sitediff/util/webserver.rb +0 -77
data/bin/sitediff
CHANGED
data/lib/sitediff.rb
CHANGED
@@ -1,11 +1,10 @@
|
|
1
1
|
#!/bin/env ruby
|
2
|
-
require 'sitediff/
|
3
|
-
require 'sitediff/
|
4
|
-
require 'sitediff/result
|
5
|
-
require '
|
6
|
-
require 'sitediff/util/cache'
|
7
|
-
require 'typhoeus'
|
2
|
+
require 'sitediff/config'
|
3
|
+
require 'sitediff/fetch'
|
4
|
+
require 'sitediff/result'
|
5
|
+
require 'pathname'
|
8
6
|
require 'rainbow'
|
7
|
+
require 'yaml'
|
9
8
|
|
10
9
|
class SiteDiff
|
11
10
|
# path to misc. static files (e.g. erb, css files)
|
@@ -14,20 +13,28 @@ class SiteDiff
|
|
14
13
|
# subdirectory containing all failing diffs
|
15
14
|
DIFFS_DIR = 'diffs'
|
16
15
|
|
16
|
+
# files in output
|
17
|
+
FAILURES_FILE = 'failures.txt'
|
18
|
+
REPORT_FILE = 'report.html'
|
19
|
+
SETTINGS_FILE = 'settings.yaml'
|
20
|
+
|
17
21
|
# label will be colorized and str will not be.
|
18
22
|
# type dictates the color: can be :success, :error, or :failure
|
19
|
-
def self.log(str, type
|
23
|
+
def self.log(str, type=:info, label=nil)
|
20
24
|
label = label ? "[sitediff] #{label}" : '[sitediff]'
|
21
25
|
bg = fg = nil
|
22
26
|
case type
|
23
|
-
when :
|
27
|
+
when :info
|
28
|
+
when :diff_success
|
24
29
|
bg = :green
|
25
30
|
fg = :black
|
26
|
-
when :
|
31
|
+
when :diff_failure
|
27
32
|
bg = :red
|
28
|
-
when :
|
33
|
+
when :warn
|
29
34
|
bg = :yellow
|
30
35
|
fg = :black
|
36
|
+
when :error
|
37
|
+
bg = :red
|
31
38
|
end
|
32
39
|
label = Rainbow(label)
|
33
40
|
label = label.bg(bg) if bg
|
@@ -43,88 +50,97 @@ class SiteDiff
|
|
43
50
|
@config.after['url']
|
44
51
|
end
|
45
52
|
|
46
|
-
def cache=
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
53
|
+
def initialize(config, cache, verbose=true)
|
54
|
+
@cache = cache
|
55
|
+
@verbose = verbose
|
56
|
+
|
57
|
+
# Check for single-site mode
|
58
|
+
validate_opts = {}
|
59
|
+
if !config.before['url'] && @cache.tag?(:before)
|
60
|
+
raise SiteDiffException,
|
61
|
+
"A cached 'before' is required for single-site mode" \
|
62
|
+
unless @cache.read_tags.include?(:before)
|
63
|
+
validate_opts[:need_before] = false
|
54
64
|
end
|
55
|
-
|
65
|
+
config.validate(validate_opts)
|
56
66
|
|
57
|
-
def initialize(config, cache)
|
58
|
-
config.validate
|
59
67
|
@config = config
|
60
|
-
self.cache = cache
|
61
68
|
end
|
62
69
|
|
63
|
-
# Sanitize
|
64
|
-
def sanitize(
|
65
|
-
|
70
|
+
# Sanitize HTML
|
71
|
+
def sanitize(path, read_results)
|
72
|
+
[:before, :after].map do |tag|
|
73
|
+
html = read_results[tag].content
|
74
|
+
config = @config.send(tag)
|
75
|
+
Sanitizer.new(html, config, :path => path).sanitize
|
76
|
+
end
|
66
77
|
end
|
67
78
|
|
68
|
-
#
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
[
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
# we have read both before and after; calculate diff
|
83
|
-
if error = reads[:before].error || reads[:after].error
|
84
|
-
diff = Result.new(path, nil, nil, error)
|
85
|
-
else
|
86
|
-
diff = Result.new(path, sanitize(reads[:before].content, :before),
|
87
|
-
sanitize(reads[:after].content,:after), nil)
|
88
|
-
end
|
89
|
-
diff.log
|
90
|
-
@results[path] = diff
|
91
|
-
end
|
79
|
+
# Process a set of read results
|
80
|
+
def process_results(path, read_results)
|
81
|
+
if error = read_results[:before].error || read_results[:after].error
|
82
|
+
diff = Result.new(path, nil, nil, error)
|
83
|
+
else
|
84
|
+
diff = Result.new(path, *sanitize(path, read_results), nil)
|
85
|
+
end
|
86
|
+
@results[path] = diff
|
87
|
+
|
88
|
+
# Print results in order!
|
89
|
+
while next_diff = @results[@ordered.first]
|
90
|
+
next_diff.log(@verbose)
|
91
|
+
@ordered.shift
|
92
92
|
end
|
93
93
|
end
|
94
94
|
|
95
|
-
# Perform the comparison
|
95
|
+
# Perform the comparison, populate @results and return the number of failing
|
96
|
+
# paths (paths with non-zero diff).
|
96
97
|
def run
|
97
|
-
# Map of path -> Result object,
|
98
|
+
# Map of path -> Result object, populated by process_results
|
98
99
|
@results = {}
|
100
|
+
@ordered = @config.paths.dup
|
101
|
+
|
102
|
+
unless @cache.read_tags.empty?
|
103
|
+
SiteDiff.log("Using sites from cache: " +
|
104
|
+
@cache.read_tags.sort.join(', '))
|
105
|
+
end
|
99
106
|
|
100
|
-
|
101
|
-
|
102
|
-
|
107
|
+
fetcher = Fetch.new(@cache, @config.paths,
|
108
|
+
:before => before, :after => after)
|
109
|
+
fetcher.run(&self.method(:process_results))
|
103
110
|
|
104
111
|
# Order by original path order
|
105
112
|
@results = @config.paths.map { |p| @results[p] }
|
113
|
+
return results.map{ |r| r unless r.success? }.compact.length
|
106
114
|
end
|
107
115
|
|
108
116
|
# Dump results to disk
|
109
|
-
def dump(dir, report_before, report_after
|
117
|
+
def dump(dir, report_before, report_after)
|
110
118
|
report_before ||= before
|
111
119
|
report_after ||= after
|
112
|
-
|
120
|
+
dir = Pathname.new(dir)
|
121
|
+
dir.mkpath unless dir.directory?
|
113
122
|
|
114
123
|
# store diffs of each failing case, first wipe out existing diffs
|
115
|
-
diff_dir =
|
116
|
-
|
124
|
+
diff_dir = dir + DIFFS_DIR
|
125
|
+
diff_dir.rmtree if diff_dir.exist?
|
117
126
|
results.each { |r| r.dump(dir) if r.status == Result::STATUS_FAILURE }
|
118
|
-
SiteDiff::log "All diff files were dumped inside #{dir}"
|
127
|
+
SiteDiff::log "All diff files were dumped inside #{dir.expand_path}"
|
119
128
|
|
120
129
|
# store failing paths
|
121
|
-
|
122
|
-
|
130
|
+
failures = dir + FAILURES_FILE
|
131
|
+
SiteDiff::log "Writing failures to #{failures.expand_path}"
|
132
|
+
failures.open('w') do |f|
|
123
133
|
results.each { |r| f.puts r.path unless r.success? }
|
124
134
|
end
|
125
135
|
|
126
136
|
# create report of results
|
127
|
-
report = Diff::generate_html_report(results, report_before, report_after
|
128
|
-
|
137
|
+
report = Diff::generate_html_report(results, report_before, report_after,
|
138
|
+
@cache)
|
139
|
+
dir.+(REPORT_FILE).open('w') { |f| f.write(report) }
|
140
|
+
|
141
|
+
# serve some settings
|
142
|
+
settings = { 'before' => report_before, 'after' => report_after,
|
143
|
+
'cached' => @cache.read_tags.map { |t| t.to_s } }
|
144
|
+
dir.+(SETTINGS_FILE).open('w') { |f| YAML.dump(settings, f) }
|
129
145
|
end
|
130
146
|
end
|
@@ -0,0 +1,61 @@
|
|
1
|
+
require 'set'
|
2
|
+
|
3
|
+
class SiteDiff
|
4
|
+
class Cache
|
5
|
+
DEFAULT_FILENAME = 'cache.db'
|
6
|
+
|
7
|
+
attr_accessor :read_tags, :write_tags
|
8
|
+
|
9
|
+
def initialize(opts = {})
|
10
|
+
@file = opts[:file] || DEFAULT_FILENAME
|
11
|
+
@create = opts[:create]
|
12
|
+
@read_tags = Set.new
|
13
|
+
@write_tags = Set.new
|
14
|
+
end
|
15
|
+
|
16
|
+
def close; @dbm.close if defined? @dbm; end
|
17
|
+
|
18
|
+
# Is a tag cached?
|
19
|
+
def tag?(tag)
|
20
|
+
open
|
21
|
+
@dbm[tag.to_s]
|
22
|
+
end
|
23
|
+
|
24
|
+
def get(tag, path)
|
25
|
+
return nil unless @read_tags.include? tag
|
26
|
+
open or return nil
|
27
|
+
val = @dbm[key(tag, path)]
|
28
|
+
return val && Marshal.load(val)
|
29
|
+
end
|
30
|
+
|
31
|
+
def set(tag, path, result)
|
32
|
+
return unless @write_tags.include? tag
|
33
|
+
open or return
|
34
|
+
@dbm[tag.to_s] = 'TRUE'
|
35
|
+
@dbm[key(tag, path)] = Marshal.dump(result)
|
36
|
+
end
|
37
|
+
|
38
|
+
private
|
39
|
+
def key(tag, path)
|
40
|
+
# Ensure encoding stays the same!
|
41
|
+
Marshal.dump([tag, path.encode('UTF-8')])
|
42
|
+
end
|
43
|
+
|
44
|
+
# Ensure the DB is open
|
45
|
+
def open
|
46
|
+
# DBM adds an extra .db, ugh
|
47
|
+
return false unless @create || File.exist?(@file) ||
|
48
|
+
File.exist?(@file + '.db')
|
49
|
+
return true if defined? @dbm
|
50
|
+
|
51
|
+
begin
|
52
|
+
require 'gdbm'
|
53
|
+
@dbm = GDBM.new(@file)
|
54
|
+
rescue LoadError
|
55
|
+
require 'dbm'
|
56
|
+
@dbm = DBM.new(@file)
|
57
|
+
end
|
58
|
+
return true
|
59
|
+
end
|
60
|
+
end
|
61
|
+
end
|
data/lib/sitediff/cli.rb
CHANGED
@@ -1,12 +1,18 @@
|
|
1
1
|
require 'thor'
|
2
|
-
require 'sitediff
|
3
|
-
require 'sitediff/
|
4
|
-
require 'sitediff/
|
5
|
-
require '
|
6
|
-
require '
|
2
|
+
require 'sitediff'
|
3
|
+
require 'sitediff/cache'
|
4
|
+
require 'sitediff/config'
|
5
|
+
require 'sitediff/config/creator'
|
6
|
+
require 'sitediff/fetch'
|
7
|
+
require 'sitediff/webserver/resultserver'
|
7
8
|
|
8
9
|
class SiteDiff
|
9
10
|
class Cli < Thor
|
11
|
+
class_option 'directory',
|
12
|
+
:type => :string,
|
13
|
+
:aliases => '-C',
|
14
|
+
:desc => "Go to a given directory before running."
|
15
|
+
|
10
16
|
# Thor, by default, exits with 0 no matter what!
|
11
17
|
def self.exit_on_failure?
|
12
18
|
true
|
@@ -21,11 +27,15 @@ class SiteDiff
|
|
21
27
|
:type => :string,
|
22
28
|
:default => File.join('.', 'output'),
|
23
29
|
:desc => "Location to write the output to."
|
24
|
-
option 'paths',
|
30
|
+
option 'paths-file',
|
25
31
|
:type => :string,
|
26
32
|
:desc => 'Paths are read (one at a line) from PATHS: ' +
|
27
33
|
'useful for iterating over sanitization rules',
|
28
34
|
:aliases => '--paths-from-file'
|
35
|
+
option 'paths',
|
36
|
+
:type => :array,
|
37
|
+
:aliases => '-p',
|
38
|
+
:desc => "Fetch only these specific paths"
|
29
39
|
option 'before',
|
30
40
|
:type => :string,
|
31
41
|
:desc => "URL used to fetch the before HTML. Acts as a prefix to specified paths",
|
@@ -42,16 +52,29 @@ class SiteDiff
|
|
42
52
|
:type => :string,
|
43
53
|
:desc => "After URL to use for reporting purposes. Useful if port forwarding.",
|
44
54
|
:aliases => '--after-url-report'
|
45
|
-
option '
|
55
|
+
option 'cached',
|
46
56
|
:type => :string,
|
47
|
-
:
|
48
|
-
:
|
57
|
+
:enum => %w[none all before after],
|
58
|
+
:default => 'before',
|
59
|
+
:desc => "Use the cached version of these sites, if available."
|
60
|
+
option 'quiet',
|
61
|
+
:type => :boolean,
|
62
|
+
:aliases => '-q',
|
63
|
+
:default => false,
|
64
|
+
:desc => "Show the difference between versions for each page"
|
49
65
|
desc "diff [OPTIONS] [CONFIGFILES]", "Perform systematic diff on given URLs"
|
50
66
|
def diff(*config_files)
|
51
|
-
config =
|
67
|
+
config = chdir(config_files)
|
52
68
|
|
53
69
|
# override config based on options
|
54
|
-
|
70
|
+
paths = options['paths']
|
71
|
+
if paths_file = options['paths-file']
|
72
|
+
if paths then
|
73
|
+
SiteDiff::log "Can't have both --paths-file and --paths", :error
|
74
|
+
exit -1
|
75
|
+
end
|
76
|
+
|
77
|
+
paths_file = Pathname.new(paths_file).expand_path
|
55
78
|
unless File.exists? paths_file
|
56
79
|
raise Config::InvalidConfig,
|
57
80
|
"Paths file '#{paths_file}' not found!"
|
@@ -59,32 +82,130 @@ class SiteDiff
|
|
59
82
|
SiteDiff::log "Reading paths from: #{paths_file}"
|
60
83
|
config.paths = File.readlines(paths_file)
|
61
84
|
end
|
85
|
+
config.paths = paths if paths
|
86
|
+
|
62
87
|
config.before['url'] = options['before'] if options['before']
|
63
88
|
config.after['url'] = options['after'] if options['after']
|
64
89
|
|
65
|
-
|
66
|
-
|
90
|
+
# Setup cache
|
91
|
+
cache = SiteDiff::Cache.new(:create => options['cached'] != 'none')
|
92
|
+
cache.read_tags << :before if %w[before all].include?(options['cached'])
|
93
|
+
cache.read_tags << :after if %w[after all].include?(options['cached'])
|
94
|
+
cache.write_tags << :before << :after
|
95
|
+
|
96
|
+
sitediff = SiteDiff.new(config, cache, !options['quiet'])
|
97
|
+
num_failing = sitediff.run
|
98
|
+
exit_code = (num_failing > 0) ? 2 : 0;
|
67
99
|
|
68
|
-
failing_paths = File.join(options['dump-dir'], 'failures.txt')
|
69
100
|
sitediff.dump(options['dump-dir'], options['before-report'],
|
70
|
-
options['after-report']
|
101
|
+
options['after-report'])
|
71
102
|
rescue Config::InvalidConfig => e
|
72
|
-
SiteDiff.log "Invalid configuration: #{e.message}", :
|
103
|
+
SiteDiff.log "Invalid configuration: #{e.message}", :error
|
104
|
+
rescue SiteDiffException => e
|
105
|
+
SiteDiff.log e.message, :error
|
106
|
+
else # no exception was raised
|
107
|
+
# Thor::Error --> exit(1), guaranteed by exit_on_failure?
|
108
|
+
# Failing diff --> exit(2), populated above
|
109
|
+
exit(exit_code)
|
73
110
|
end
|
74
111
|
|
75
112
|
option :port,
|
76
113
|
:type => :numeric,
|
77
|
-
:default => SiteDiff::
|
114
|
+
:default => SiteDiff::Webserver::DEFAULT_PORT,
|
78
115
|
:desc => 'The port to serve on'
|
79
|
-
option
|
116
|
+
option 'dump-dir',
|
80
117
|
:type => :string,
|
81
118
|
:default => 'output',
|
82
|
-
:desc => 'The directory to serve'
|
83
|
-
|
119
|
+
:desc => 'The directory to serve'
|
120
|
+
option :browse,
|
121
|
+
:type => :boolean,
|
122
|
+
:default => true,
|
123
|
+
:desc => "Whether to open the served content in your browser"
|
84
124
|
desc "serve [OPTIONS]", "Serve the sitediff output directory over HTTP"
|
85
|
-
def serve
|
86
|
-
|
87
|
-
|
125
|
+
def serve(*config_files)
|
126
|
+
config = chdir(config_files, :config => false)
|
127
|
+
|
128
|
+
cache = Cache.new
|
129
|
+
cache.read_tags << :before << :after
|
130
|
+
|
131
|
+
SiteDiff::Webserver::ResultServer.new(
|
132
|
+
options[:port],
|
133
|
+
options['dump-dir'],
|
134
|
+
:browse => options[:browse],
|
135
|
+
:cache => cache,
|
136
|
+
:config => config,
|
137
|
+
).wait
|
138
|
+
end
|
139
|
+
|
140
|
+
option :output,
|
141
|
+
:type => :string,
|
142
|
+
:default => 'sitediff',
|
143
|
+
:desc => 'Where to place the configuration',
|
144
|
+
:aliases => ['-o']
|
145
|
+
option :depth,
|
146
|
+
:type => :numeric,
|
147
|
+
:default => 3,
|
148
|
+
:desc => 'How deeply to crawl the given site'
|
149
|
+
option :rules,
|
150
|
+
:type => :string,
|
151
|
+
:enum => %w[yes no disabled],
|
152
|
+
:default => 'disabled',
|
153
|
+
:desc => 'Whether rules for the site should be auto-created'
|
154
|
+
desc "init URL [URL]", "Create a sitediff configuration"
|
155
|
+
def init(*urls)
|
156
|
+
unless (1..2).include? urls.size
|
157
|
+
SiteDiff.log "sitediff init requires one or two URLs", :error
|
158
|
+
exit 2
|
159
|
+
end
|
160
|
+
|
161
|
+
chdir([], :search => false)
|
162
|
+
creator = SiteDiff::Config::Creator.new(*urls)
|
163
|
+
creator.create(
|
164
|
+
:depth => options[:depth],
|
165
|
+
:directory => options[:output],
|
166
|
+
:rules => options[:rules] != 'no',
|
167
|
+
:rules_disabled => (options[:rules] == 'disabled'),
|
168
|
+
) do |tag, info|
|
169
|
+
SiteDiff.log "Visited #{info.uri}, cached"
|
170
|
+
end
|
171
|
+
|
172
|
+
SiteDiff.log "Created #{creator.config_file.expand_path}", :success
|
173
|
+
SiteDiff.log "You can now run 'sitediff diff'", :success
|
174
|
+
end
|
175
|
+
|
176
|
+
option :url,
|
177
|
+
:type => :string,
|
178
|
+
:desc => 'A custom base URL to fetch from'
|
179
|
+
desc "store [CONFIGFILES]",
|
180
|
+
"Cache the current contents of a site for later comparison"
|
181
|
+
def store(*config_files)
|
182
|
+
config = chdir(config_files)
|
183
|
+
config.validate(:need_before => false)
|
184
|
+
|
185
|
+
cache = SiteDiff::Cache.new(:create => true)
|
186
|
+
cache.write_tags << :before
|
187
|
+
|
188
|
+
base = options[:url] || config.after['url']
|
189
|
+
fetcher = SiteDiff::Fetch.new(cache, config.paths, :before => base)
|
190
|
+
fetcher.run do |path, res|
|
191
|
+
SiteDiff.log "Visited #{path}, cached"
|
192
|
+
end
|
193
|
+
end
|
194
|
+
|
195
|
+
private
|
196
|
+
def chdir(files, opts = {})
|
197
|
+
opts = { :config => true, :search => true }.merge(opts)
|
198
|
+
|
199
|
+
dir = options['directory']
|
200
|
+
Dir.chdir(dir) if dir
|
201
|
+
|
202
|
+
return unless opts[:search]
|
203
|
+
begin
|
204
|
+
SiteDiff::Config.new(files, :search => !dir)
|
205
|
+
rescue SiteDiff::Config::ConfigNotFound => e
|
206
|
+
raise if opts[:config]
|
207
|
+
# If no config required, allow it to pass
|
208
|
+
end
|
88
209
|
end
|
89
210
|
end
|
90
211
|
end
|