sitediff 0.0.1 → 0.0.2
Sign up to get free protection for your applications and to get access to all the features.
- data/bin/sitediff +1 -1
- data/lib/sitediff.rb +79 -63
- data/lib/sitediff/cache.rb +61 -0
- data/lib/sitediff/cli.rb +144 -23
- data/lib/sitediff/config.rb +46 -9
- data/lib/sitediff/config/creator.rb +122 -0
- data/lib/sitediff/crawler.rb +95 -0
- data/lib/sitediff/diff.rb +2 -1
- data/lib/sitediff/exception.rb +3 -0
- data/lib/sitediff/fetch.rb +55 -0
- data/lib/sitediff/files/html_report.html.erb +20 -4
- data/lib/sitediff/files/rules/drupal.yaml +33 -0
- data/lib/sitediff/files/sidebyside.html.erb +13 -0
- data/lib/sitediff/files/sitediff.css +11 -0
- data/lib/sitediff/result.rb +12 -9
- data/lib/sitediff/rules.rb +65 -0
- data/lib/sitediff/sanitize.rb +163 -168
- data/lib/sitediff/sanitize/dom_transform.rb +92 -0
- data/lib/sitediff/sanitize/regexp.rb +56 -0
- data/lib/sitediff/uriwrapper.rb +19 -7
- data/lib/sitediff/webserver.rb +82 -0
- data/lib/sitediff/webserver/resultserver.rb +98 -0
- metadata +70 -25
- checksums.yaml +0 -7
- data/lib/sitediff/util/cache.rb +0 -32
- data/lib/sitediff/util/webserver.rb +0 -77
data/bin/sitediff
CHANGED
data/lib/sitediff.rb
CHANGED
@@ -1,11 +1,10 @@
|
|
1
1
|
#!/bin/env ruby
|
2
|
-
require 'sitediff/
|
3
|
-
require 'sitediff/
|
4
|
-
require 'sitediff/result
|
5
|
-
require '
|
6
|
-
require 'sitediff/util/cache'
|
7
|
-
require 'typhoeus'
|
2
|
+
require 'sitediff/config'
|
3
|
+
require 'sitediff/fetch'
|
4
|
+
require 'sitediff/result'
|
5
|
+
require 'pathname'
|
8
6
|
require 'rainbow'
|
7
|
+
require 'yaml'
|
9
8
|
|
10
9
|
class SiteDiff
|
11
10
|
# path to misc. static files (e.g. erb, css files)
|
@@ -14,20 +13,28 @@ class SiteDiff
|
|
14
13
|
# subdirectory containing all failing diffs
|
15
14
|
DIFFS_DIR = 'diffs'
|
16
15
|
|
16
|
+
# files in output
|
17
|
+
FAILURES_FILE = 'failures.txt'
|
18
|
+
REPORT_FILE = 'report.html'
|
19
|
+
SETTINGS_FILE = 'settings.yaml'
|
20
|
+
|
17
21
|
# label will be colorized and str will not be.
|
18
22
|
# type dictates the color: can be :success, :error, or :failure
|
19
|
-
def self.log(str, type
|
23
|
+
def self.log(str, type=:info, label=nil)
|
20
24
|
label = label ? "[sitediff] #{label}" : '[sitediff]'
|
21
25
|
bg = fg = nil
|
22
26
|
case type
|
23
|
-
when :
|
27
|
+
when :info
|
28
|
+
when :diff_success
|
24
29
|
bg = :green
|
25
30
|
fg = :black
|
26
|
-
when :
|
31
|
+
when :diff_failure
|
27
32
|
bg = :red
|
28
|
-
when :
|
33
|
+
when :warn
|
29
34
|
bg = :yellow
|
30
35
|
fg = :black
|
36
|
+
when :error
|
37
|
+
bg = :red
|
31
38
|
end
|
32
39
|
label = Rainbow(label)
|
33
40
|
label = label.bg(bg) if bg
|
@@ -43,88 +50,97 @@ class SiteDiff
|
|
43
50
|
@config.after['url']
|
44
51
|
end
|
45
52
|
|
46
|
-
def cache=
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
53
|
+
# Set up a comparison from a configuration and a cache.
#
# config  - configuration object; `before['url']` is inspected and
#           `validate` is called on it before it is stored in @config.
# cache   - cache object; must respond to `tag?` and `read_tags`.
# verbose - stored in @verbose, which process_results hands to each
#           result's `log`.
#
# Raises SiteDiffException when no 'before' URL is configured, the cache
# holds a 'before' entry, but :before is not among the cache's read tags.
def initialize(config, cache, verbose=true)
  @cache = cache
  @verbose = verbose

  # Single-site mode: there is no 'before' URL, but the cache already has
  # a 'before' entry, so validation is told not to require a 'before' URL.
  validation = {}
  single_site = !config.before['url'] && @cache.tag?(:before)
  if single_site
    unless @cache.read_tags.include?(:before)
      raise SiteDiffException,
        "A cached 'before' is required for single-site mode"
    end
    validation[:need_before] = false
  end
  config.validate(validation)

  @config = config
end
|
62
69
|
|
63
|
-
# Sanitize
|
64
|
-
def sanitize(
|
65
|
-
|
70
|
+
# Sanitize HTML
|
71
|
+
# Sanitize the HTML of both sides of a single path.
#
# path         - the path being compared; passed to the Sanitizer as :path.
# read_results - indexable by :before and :after; each entry must respond
#                to `content`.
#
# Returns a two-element array: the sanitized 'before' followed by the
# sanitized 'after'.
def sanitize(path, read_results)
  [:before, :after].map { |side|
    Sanitizer.new(read_results[side].content, @config.send(side),
      :path => path).sanitize
  }
end
|
67
78
|
|
68
|
-
#
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
[
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
# we have read both before and after; calculate diff
|
83
|
-
if error = reads[:before].error || reads[:after].error
|
84
|
-
diff = Result.new(path, nil, nil, error)
|
85
|
-
else
|
86
|
-
diff = Result.new(path, sanitize(reads[:before].content, :before),
|
87
|
-
sanitize(reads[:after].content,:after), nil)
|
88
|
-
end
|
89
|
-
diff.log
|
90
|
-
@results[path] = diff
|
91
|
-
end
|
79
|
+
# Process a set of read results
|
80
|
+
# Turn the raw reads for one path into a Result, store it in @results, and
# log whatever results are now ready.
#
# path         - the path that was read.
# read_results - indexable by :before and :after; each entry must respond
#                to `error` (and to `content`, used by sanitize).
def process_results(path, read_results)
  error = read_results[:before].error || read_results[:after].error
  @results[path] =
    if error
      # Either side failed to load: record the error, no content to diff.
      Result.new(path, nil, nil, error)
    else
      Result.new(path, *sanitize(path, read_results), nil)
    end

  # Print results in order! @ordered lists the paths not yet logged; keep
  # logging from its front for as long as that path already has a result.
  while (ready = @results[@ordered.first])
    ready.log(@verbose)
    @ordered.shift
  end
end
|
94
94
|
|
95
|
-
# Perform the comparison
|
95
|
+
# Perform the comparison, populate @results and return the number of failing
|
96
|
+
# paths (paths with non-zero diff).
|
96
97
|
# Fetch every configured path, build a Result for each, and leave the
# results in @results ordered like @config.paths.
#
# Returns the number of results for which `success?` is false.
def run
  # Map of path -> Result object, populated by process_results
  @results = {}
  # Paths that have not been logged yet; consumed by process_results
  @ordered = @config.paths.dup

  cached_tags = @cache.read_tags
  unless cached_tags.empty?
    SiteDiff.log("Using sites from cache: #{cached_tags.sort.join(', ')}")
  end

  fetcher = Fetch.new(@cache, @config.paths,
    :before => before, :after => after)
  fetcher.run(&method(:process_results))

  # Order by original path order
  @results = @config.paths.map { |p| @results[p] }
  results.count { |r| !r.success? }
end
|
107
115
|
|
108
116
|
# Dump results to disk
|
109
|
-
def dump(dir, report_before, report_after
|
117
|
+
# Write the outcome of a run to disk.
#
# dir           - output directory; created if it does not exist.
# report_before - 'before' URL to show in the report; falls back to `before`.
# report_after  - 'after' URL to show in the report; falls back to `after`.
#
# Writes the per-path diffs of failing results, FAILURES_FILE, REPORT_FILE
# and SETTINGS_FILE inside dir.
def dump(dir, report_before, report_after)
  report_before ||= before
  report_after ||= after
  out = Pathname.new(dir)
  out.mkpath unless out.directory?

  # Remove diffs left over from an earlier run, then store the diff of
  # every failing case.
  stale_diffs = out + DIFFS_DIR
  stale_diffs.rmtree if stale_diffs.exist?
  results.each do |result|
    next unless result.status == Result::STATUS_FAILURE
    result.dump(out)
  end
  SiteDiff.log "All diff files were dumped inside #{out.expand_path}"

  # List the paths that did not succeed, one per line.
  failures_path = out + FAILURES_FILE
  SiteDiff.log "Writing failures to #{failures_path.expand_path}"
  failures_path.open('w') do |f|
    results.each do |result|
      f.puts result.path unless result.success?
    end
  end

  # Render the HTML report.
  report = Diff.generate_html_report(results, report_before, report_after,
    @cache)
  (out + REPORT_FILE).open('w') { |f| f.write(report) }

  # Record the settings used for this run.
  settings = {
    'before' => report_before,
    'after' => report_after,
    'cached' => @cache.read_tags.map { |t| t.to_s }
  }
  (out + SETTINGS_FILE).open('w') { |f| YAML.dump(settings, f) }
end
|
130
146
|
end
|
@@ -0,0 +1,61 @@
|
|
1
|
+
require 'set'
|
2
|
+
|
3
|
+
class SiteDiff
  # Persistent cache of fetch results, kept in a GDBM database (or DBM when
  # the gdbm library cannot be loaded).
  #
  # Entries are stored per (tag, path) pair. A tag is only read when it is
  # in `read_tags` and only written when it is in `write_tags`.
  class Cache
    DEFAULT_FILENAME = 'cache.db'

    attr_accessor :read_tags, :write_tags

    # opts - :file   => database file name (defaults to DEFAULT_FILENAME)
    #        :create => when truthy, the database may be created on first
    #                   use; otherwise it is only opened if it already exists
    def initialize(opts = {})
      @file = opts[:file] || DEFAULT_FILENAME
      @create = opts[:create]
      @read_tags = Set.new
      @write_tags = Set.new
      @dbm = nil
    end

    # Close the database if it was opened. The handle is dropped so that a
    # later call reopens the database instead of using a closed handle.
    def close
      return unless @dbm
      @dbm.close
      @dbm = nil
    end

    # Is a tag cached?
    #
    # Returns false when the database is unavailable (missing file and
    # :create not set) instead of failing on a nil handle.
    def tag?(tag)
      return false unless open
      !@dbm[tag.to_s].nil?
    end

    # Fetch the cached result for (tag, path).
    #
    # Returns nil when the tag is not in read_tags, the database is
    # unavailable, or nothing is stored under that key.
    def get(tag, path)
      return nil unless @read_tags.include? tag
      return nil unless open
      val = @dbm[key(tag, path)]
      # NOTE(review): Marshal.load executes whatever the cache file encodes;
      # only point this at cache files written by this class.
      val && Marshal.load(val)
    end

    # Store a result under (tag, path) and mark the tag as cached.
    # Does nothing when the tag is not in write_tags or the database is
    # unavailable.
    def set(tag, path, result)
      return unless @write_tags.include? tag
      return unless open
      @dbm[tag.to_s] = 'TRUE'
      @dbm[key(tag, path)] = Marshal.dump(result)
    end

    private

    # Database key for a (tag, path) pair.
    def key(tag, path)
      # Ensure encoding stays the same!
      Marshal.dump([tag, path.encode('UTF-8')])
    end

    # Ensure the DB is open.
    #
    # Returns true when the database is usable, false when it does not
    # exist and may not be created.
    def open
      # DBM adds an extra .db, ugh
      return false unless @create || File.exist?(@file) ||
        File.exist?(@file + '.db')
      return true if @dbm

      # Loaded lazily so the fallback to DBM only happens when needed.
      begin
        require 'gdbm'
        @dbm = GDBM.new(@file)
      rescue LoadError
        require 'dbm'
        @dbm = DBM.new(@file)
      end
      true
    end
  end
end
|
data/lib/sitediff/cli.rb
CHANGED
@@ -1,12 +1,18 @@
|
|
1
1
|
require 'thor'
|
2
|
-
require 'sitediff
|
3
|
-
require 'sitediff/
|
4
|
-
require 'sitediff/
|
5
|
-
require '
|
6
|
-
require '
|
2
|
+
require 'sitediff'
|
3
|
+
require 'sitediff/cache'
|
4
|
+
require 'sitediff/config'
|
5
|
+
require 'sitediff/config/creator'
|
6
|
+
require 'sitediff/fetch'
|
7
|
+
require 'sitediff/webserver/resultserver'
|
7
8
|
|
8
9
|
class SiteDiff
|
9
10
|
class Cli < Thor
|
11
|
+
class_option 'directory',
|
12
|
+
:type => :string,
|
13
|
+
:aliases => '-C',
|
14
|
+
:desc => "Go to a given directory before running."
|
15
|
+
|
10
16
|
# Thor, by default, exits with 0 no matter what!
|
11
17
|
def self.exit_on_failure?
|
12
18
|
true
|
@@ -21,11 +27,15 @@ class SiteDiff
|
|
21
27
|
:type => :string,
|
22
28
|
:default => File.join('.', 'output'),
|
23
29
|
:desc => "Location to write the output to."
|
24
|
-
option 'paths',
|
30
|
+
option 'paths-file',
|
25
31
|
:type => :string,
|
26
32
|
:desc => 'Paths are read (one at a line) from PATHS: ' +
|
27
33
|
'useful for iterating over sanitization rules',
|
28
34
|
:aliases => '--paths-from-file'
|
35
|
+
option 'paths',
|
36
|
+
:type => :array,
|
37
|
+
:aliases => '-p',
|
38
|
+
:desc => "Fetch only these specific paths"
|
29
39
|
option 'before',
|
30
40
|
:type => :string,
|
31
41
|
:desc => "URL used to fetch the before HTML. Acts as a prefix to specified paths",
|
@@ -42,16 +52,29 @@ class SiteDiff
|
|
42
52
|
:type => :string,
|
43
53
|
:desc => "After URL to use for reporting purposes. Useful if port forwarding.",
|
44
54
|
:aliases => '--after-url-report'
|
45
|
-
option '
|
55
|
+
option 'cached',
|
46
56
|
:type => :string,
|
47
|
-
:
|
48
|
-
:
|
57
|
+
:enum => %w[none all before after],
|
58
|
+
:default => 'before',
|
59
|
+
:desc => "Use the cached version of these sites, if available."
|
60
|
+
# --quiet / -q: the diff command passes !options['quiet'] to SiteDiff.new
# as its verbose flag, so setting this turns the per-page diff output off.
option 'quiet',
  :type => :boolean,
  :aliases => '-q',
  :default => false,
  :desc => "Do not show the difference between versions for each page"
|
49
65
|
desc "diff [OPTIONS] [CONFIGFILES]", "Perform systematic diff on given URLs"
|
50
66
|
def diff(*config_files)
|
51
|
-
config =
|
67
|
+
config = chdir(config_files)
|
52
68
|
|
53
69
|
# override config based on options
|
54
|
-
|
70
|
+
paths = options['paths']
|
71
|
+
if paths_file = options['paths-file']
|
72
|
+
if paths then
|
73
|
+
SiteDiff::log "Can't have both --paths-file and --paths", :error
|
74
|
+
exit -1
|
75
|
+
end
|
76
|
+
|
77
|
+
paths_file = Pathname.new(paths_file).expand_path
|
55
78
|
unless File.exists? paths_file
|
56
79
|
raise Config::InvalidConfig,
|
57
80
|
"Paths file '#{paths_file}' not found!"
|
@@ -59,32 +82,130 @@ class SiteDiff
|
|
59
82
|
SiteDiff::log "Reading paths from: #{paths_file}"
|
60
83
|
config.paths = File.readlines(paths_file)
|
61
84
|
end
|
85
|
+
config.paths = paths if paths
|
86
|
+
|
62
87
|
config.before['url'] = options['before'] if options['before']
|
63
88
|
config.after['url'] = options['after'] if options['after']
|
64
89
|
|
65
|
-
|
66
|
-
|
90
|
+
# Setup cache
|
91
|
+
cache = SiteDiff::Cache.new(:create => options['cached'] != 'none')
|
92
|
+
cache.read_tags << :before if %w[before all].include?(options['cached'])
|
93
|
+
cache.read_tags << :after if %w[after all].include?(options['cached'])
|
94
|
+
cache.write_tags << :before << :after
|
95
|
+
|
96
|
+
sitediff = SiteDiff.new(config, cache, !options['quiet'])
|
97
|
+
num_failing = sitediff.run
|
98
|
+
exit_code = (num_failing > 0) ? 2 : 0;
|
67
99
|
|
68
|
-
failing_paths = File.join(options['dump-dir'], 'failures.txt')
|
69
100
|
sitediff.dump(options['dump-dir'], options['before-report'],
|
70
|
-
options['after-report']
|
101
|
+
options['after-report'])
|
71
102
|
rescue Config::InvalidConfig => e
|
72
|
-
SiteDiff.log "Invalid configuration: #{e.message}", :
|
103
|
+
SiteDiff.log "Invalid configuration: #{e.message}", :error
|
104
|
+
rescue SiteDiffException => e
|
105
|
+
SiteDiff.log e.message, :error
|
106
|
+
else # no exception was raised
|
107
|
+
# Thor::Error --> exit(1), guaranteed by exit_on_failure?
|
108
|
+
# Failing diff --> exit(2), populated above
|
109
|
+
exit(exit_code)
|
73
110
|
end
|
74
111
|
|
75
112
|
option :port,
|
76
113
|
:type => :numeric,
|
77
|
-
:default => SiteDiff::
|
114
|
+
:default => SiteDiff::Webserver::DEFAULT_PORT,
|
78
115
|
:desc => 'The port to serve on'
|
79
|
-
option
|
116
|
+
option 'dump-dir',
|
80
117
|
:type => :string,
|
81
118
|
:default => 'output',
|
82
|
-
:desc => 'The directory to serve'
|
83
|
-
|
119
|
+
:desc => 'The directory to serve'
|
120
|
+
option :browse,
|
121
|
+
:type => :boolean,
|
122
|
+
:default => true,
|
123
|
+
:desc => "Whether to open the served content in your browser"
|
84
124
|
desc "serve [OPTIONS]", "Serve the sitediff output directory over HTTP"
|
85
|
-
def serve
|
86
|
-
|
87
|
-
|
125
|
+
# Serve the dump directory over HTTP and block until the server finishes.
#
# config_files - optional configuration files; a missing configuration is
#                tolerated here (chdir is called with :config => false).
def serve(*config_files)
  config = chdir(config_files, :config => false)

  # Both sides may be read from the cache while serving.
  cache = Cache.new
  [:before, :after].each { |tag| cache.read_tags << tag }

  server = SiteDiff::Webserver::ResultServer.new(
    options[:port],
    options['dump-dir'],
    :browse => options[:browse],
    :cache => cache,
    :config => config
  )
  server.wait
end
|
139
|
+
|
140
|
+
option :output,
|
141
|
+
:type => :string,
|
142
|
+
:default => 'sitediff',
|
143
|
+
:desc => 'Where to place the configuration',
|
144
|
+
:aliases => ['-o']
|
145
|
+
option :depth,
|
146
|
+
:type => :numeric,
|
147
|
+
:default => 3,
|
148
|
+
:desc => 'How deeply to crawl the given site'
|
149
|
+
option :rules,
|
150
|
+
:type => :string,
|
151
|
+
:enum => %w[yes no disabled],
|
152
|
+
:default => 'disabled',
|
153
|
+
:desc => 'Whether rules for the site should be auto-created'
|
154
|
+
desc "init URL [URL]", "Create a sitediff configuration"
|
155
|
+
# Create a sitediff configuration from one or two URLs.
#
# urls - one or two URLs; any other count logs an error and exits with
#        status 2.
def init(*urls)
  unless urls.size.between?(1, 2)
    SiteDiff.log "sitediff init requires one or two URLs", :error
    exit 2
  end

  # Only change directory; no existing configuration is looked up.
  chdir([], :search => false)
  creator = SiteDiff::Config::Creator.new(*urls)
  creator.create(
    :depth => options[:depth],
    :directory => options[:output],
    :rules => options[:rules] != 'no',
    :rules_disabled => (options[:rules] == 'disabled')
  ) { |_tag, info| SiteDiff.log "Visited #{info.uri}, cached" }

  SiteDiff.log "Created #{creator.config_file.expand_path}", :success
  SiteDiff.log "You can now run 'sitediff diff'", :success
end
|
175
|
+
|
176
|
+
option :url,
|
177
|
+
:type => :string,
|
178
|
+
:desc => 'A custom base URL to fetch from'
|
179
|
+
desc "store [CONFIGFILES]",
|
180
|
+
"Cache the current contents of a site for later comparison"
|
181
|
+
# Fetch the configured paths and write them to the cache under the :before
# tag, for later comparison.
#
# config_files - configuration files handed to chdir.
def store(*config_files)
  config = chdir(config_files)
  # A 'before' URL is not needed: the pages are fetched from --url or from
  # the configured 'after' URL.
  config.validate(:need_before => false)

  cache = SiteDiff::Cache.new(:create => true)
  cache.write_tags << :before

  source_url = options[:url] || config.after['url']
  SiteDiff::Fetch.new(cache, config.paths, :before => source_url).run do |path, _res|
    SiteDiff.log "Visited #{path}, cached"
  end
end
|
194
|
+
|
195
|
+
private
|
196
|
+
# Change to the --directory option (when given) and load the configuration.
#
# files - configuration files passed to SiteDiff::Config.new.
# opts  - :search => false skips loading a configuration altogether;
#         :config => false tolerates a missing configuration.
#         Both default to true.
#
# Returns the SiteDiff::Config, or nil when loading was skipped or the
# configuration was not found and is not required.
def chdir(files, opts = {})
  settings = { :config => true, :search => true }.merge(opts)

  target = options['directory']
  Dir.chdir(target) if target

  return unless settings[:search]
  begin
    # :search is passed as false when an explicit directory was given.
    SiteDiff::Config.new(files, :search => !target)
  rescue SiteDiff::Config::ConfigNotFound
    # If no config required, allow it to pass
    raise if settings[:config]
    nil
  end
end
|
89
210
|
end
|
90
211
|
end
|