sitediff 0.0.6 → 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +5 -5
- data/bin/sitediff +9 -2
- data/lib/sitediff.rb +126 -81
- data/lib/sitediff/cache.rb +35 -6
- data/lib/sitediff/cli.rb +254 -119
- data/lib/sitediff/config.rb +362 -29
- data/lib/sitediff/config/creator.rb +53 -71
- data/lib/sitediff/config/preset.rb +75 -0
- data/lib/sitediff/crawler.rb +11 -15
- data/lib/sitediff/diff.rb +28 -9
- data/lib/sitediff/fetch.rb +9 -2
- data/lib/sitediff/files/diff.html.erb +20 -2
- data/lib/sitediff/files/jquery.min.js +2 -0
- data/lib/sitediff/files/normalize.css +349 -0
- data/lib/sitediff/files/report.html.erb +144 -0
- data/lib/sitediff/files/sidebyside.html.erb +5 -2
- data/lib/sitediff/files/sitediff.css +226 -30
- data/lib/sitediff/files/sitediff.js +176 -0
- data/lib/sitediff/report.rb +238 -0
- data/lib/sitediff/result.rb +47 -19
- data/lib/sitediff/sanitize.rb +29 -8
- data/lib/sitediff/sanitize/dom_transform.rb +45 -6
- data/lib/sitediff/sanitize/regexp.rb +23 -2
- data/lib/sitediff/uriwrapper.rb +56 -15
- data/lib/sitediff/webserver.rb +12 -3
- data/lib/sitediff/webserver/resultserver.rb +28 -33
- metadata +33 -16
- data/lib/sitediff/files/html_report.html.erb +0 -66
- data/lib/sitediff/files/rules/drupal.yaml +0 -63
- data/lib/sitediff/rules.rb +0 -65
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
|
-
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: 1b7854497b5e81f48d810acec8106cbc66e33492d046e032e5516f76db26f142
|
4
|
+
data.tar.gz: a9349a79953237dd017600d49d38b8e734afc561f0ce09a1f8732e0e933530c9
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 0e91f665f3c59b8a65f16c6942ec49b9cc9ee7fd12b0a777eadb844a0b9819ab1fd9485495bf2c757ca7342a6198dcccb5ae546c4ddf2682f234d015b64309b2
|
7
|
+
data.tar.gz: 64b7980bbbade8710b6069af19a67083678c2bd5fa99674df3360c1c6a3ddf8a15de7c5be4e8349ec298fc1c0dc27535b816089cd4f8852b8c8633861d72a178
|
data/bin/sitediff
CHANGED
@@ -2,8 +2,15 @@
|
|
2
2
|
# frozen_string_literal: true
|
3
3
|
|
4
4
|
# when run as gem, $0 is /usr/local/bin/sitediff not this file
|
5
|
-
|
5
|
+
if $PROGRAM_NAME == __FILE__
|
6
|
+
$LOAD_PATH.unshift File.expand_path('../lib', __dir__)
|
7
|
+
end
|
6
8
|
|
7
9
|
require 'sitediff/cli'
|
8
10
|
|
9
|
-
|
11
|
+
begin
|
12
|
+
SiteDiff::Cli.start
|
13
|
+
rescue Interrupt
|
14
|
+
puts("\n")
|
15
|
+
SiteDiff.log('Stopping. Interrupted by user.')
|
16
|
+
end
|
data/lib/sitediff.rb
CHANGED
@@ -2,63 +2,85 @@
|
|
2
2
|
# frozen_string_literal: true
|
3
3
|
|
4
4
|
require 'sitediff/config'
|
5
|
+
require 'sitediff/diff'
|
5
6
|
require 'sitediff/fetch'
|
6
7
|
require 'sitediff/result'
|
8
|
+
require 'sitediff/report'
|
7
9
|
require 'pathname'
|
8
10
|
require 'rainbow'
|
11
|
+
require 'rubygems'
|
9
12
|
require 'yaml'
|
10
13
|
|
14
|
+
# SiteDiff Object.
|
11
15
|
class SiteDiff
|
12
|
-
|
16
|
+
attr_reader :config, :results
|
17
|
+
|
18
|
+
# SiteDiff installation directory.
|
19
|
+
ROOT_DIR = File.dirname(File.dirname(__FILE__))
|
20
|
+
|
21
|
+
# Path to misc files. Ex: *.erb, *.css.
|
13
22
|
FILES_DIR = File.join(File.dirname(__FILE__), 'sitediff', 'files')
|
14
23
|
|
15
|
-
#
|
16
|
-
|
17
|
-
|
18
|
-
#
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
bg = fg = nil
|
28
|
-
case type
|
29
|
-
when :info
|
30
|
-
bg = fg = nil
|
31
|
-
when :diff_success
|
32
|
-
bg = :green
|
24
|
+
# Logs a message.
|
25
|
+
#
|
26
|
+
# Label will be colorized and message will not.
|
27
|
+
# Type dictates the color: can be :success, :error, or :failure.
|
28
|
+
#
|
29
|
+
# TODO: Only print :debug messages in debug mode.
|
30
|
+
def self.log(message, type = :info, label = nil)
|
31
|
+
# Prepare label.
|
32
|
+
label ||= type unless type == :info
|
33
|
+
label = label.to_s
|
34
|
+
unless label.empty?
|
35
|
+
# Colorize label.
|
33
36
|
fg = :black
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
37
|
+
bg = :blue
|
38
|
+
|
39
|
+
case type
|
40
|
+
when :info
|
41
|
+
bg = :cyan
|
42
|
+
when :success
|
43
|
+
bg = :green
|
44
|
+
when :error
|
45
|
+
bg = :red
|
46
|
+
when :warning
|
47
|
+
bg = :yellow
|
48
|
+
end
|
49
|
+
|
50
|
+
label = '[' + label.to_s + ']'
|
51
|
+
label = Rainbow(label)
|
52
|
+
label = label.bg(bg) if bg
|
53
|
+
label = label.fg(fg) if fg
|
54
|
+
|
55
|
+
# Add a space after the label.
|
56
|
+
label += ' '
|
41
57
|
end
|
42
|
-
|
43
|
-
|
44
|
-
label = label.fg(fg) if fg
|
45
|
-
puts label + ' ' + str
|
58
|
+
|
59
|
+
puts label + message
|
46
60
|
end
|
47
61
|
|
48
|
-
|
62
|
+
##
|
63
|
+
# Returns the "before" site's URL.
|
64
|
+
#
|
65
|
+
# TODO: Remove in favor of config.before_url.
|
49
66
|
def before
|
50
67
|
@config.before['url']
|
51
68
|
end
|
52
69
|
|
70
|
+
##
|
71
|
+
# Returns the "after" site's URL.
|
72
|
+
#
|
73
|
+
# TODO: Remove in favor of config.after_url.
|
53
74
|
def after
|
54
75
|
@config.after['url']
|
55
76
|
end
|
56
77
|
|
57
|
-
|
78
|
+
# Initialize SiteDiff.
|
79
|
+
def initialize(config, cache, verbose = true, debug = false)
|
58
80
|
@cache = cache
|
59
81
|
@verbose = verbose
|
60
82
|
@debug = debug
|
61
|
-
|
83
|
+
|
62
84
|
# Check for single-site mode
|
63
85
|
validate_opts = {}
|
64
86
|
if !config.before['url'] && @cache.tag?(:before)
|
@@ -69,37 +91,48 @@ class SiteDiff
|
|
69
91
|
validate_opts[:need_before] = false
|
70
92
|
end
|
71
93
|
config.validate(validate_opts)
|
72
|
-
|
73
|
-
|
94
|
+
# Configure diff.
|
95
|
+
Diff.diff_config(config)
|
74
96
|
@config = config
|
75
97
|
end
|
76
98
|
|
77
|
-
# Sanitize HTML
|
99
|
+
# Sanitize HTML.
|
78
100
|
def sanitize(path, read_results)
|
79
101
|
%i[before after].map do |tag|
|
80
102
|
html = read_results[tag].content
|
103
|
+
# TODO: See why encoding is empty while running tests.
|
104
|
+
#
|
105
|
+
# The presence of an "encoding" value used to be used to determine
|
106
|
+
# if the sanitizer would be called. However, encoding turns up blank
|
107
|
+
# during rspec tests for some reason.
|
81
108
|
encoding = read_results[tag].encoding
|
82
|
-
if encoding
|
83
|
-
|
84
|
-
Sanitizer.new(html,
|
109
|
+
if encoding || html.length.positive?
|
110
|
+
section = @config.send(tag, true)
|
111
|
+
Sanitizer.new(html, section, path: path).sanitize
|
85
112
|
else
|
86
113
|
html
|
87
114
|
end
|
88
115
|
end
|
89
116
|
end
|
90
117
|
|
91
|
-
|
118
|
+
##
|
119
|
+
# Process a set of read results.
|
120
|
+
#
|
121
|
+
# This is the callback that processes items fetched by the Fetcher.
|
92
122
|
def process_results(path, read_results)
|
93
|
-
|
123
|
+
error = (read_results[:before].error || read_results[:after].error)
|
124
|
+
if error
|
94
125
|
diff = Result.new(path, nil, nil, nil, nil, error)
|
95
126
|
else
|
96
127
|
begin
|
97
|
-
diff = Result.new(
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
|
128
|
+
diff = Result.new(
|
129
|
+
path,
|
130
|
+
*sanitize(path, read_results),
|
131
|
+
read_results[:before].encoding,
|
132
|
+
read_results[:after].encoding,
|
133
|
+
nil
|
134
|
+
)
|
135
|
+
rescue StandardError => e
|
103
136
|
raise if @debug
|
104
137
|
|
105
138
|
Result.new(path, nil, nil, nil, nil, "Sanitization error: #{e}")
|
@@ -114,60 +147,72 @@ class SiteDiff
|
|
114
147
|
end
|
115
148
|
end
|
116
149
|
|
117
|
-
|
118
|
-
#
|
119
|
-
|
150
|
+
##
|
151
|
+
# Compute diff as per config.
|
152
|
+
#
|
153
|
+
# @return [Integer]
|
154
|
+
# Number of paths which have diffs.
|
155
|
+
def run
|
120
156
|
# Map of path -> Result object, populated by process_results
|
121
157
|
@results = {}
|
122
158
|
@ordered = @config.paths.dup
|
123
159
|
|
124
160
|
unless @cache.read_tags.empty?
|
125
|
-
SiteDiff.log('Using sites from cache: ' +
|
126
|
-
@cache.read_tags.sort.join(', '))
|
161
|
+
SiteDiff.log('Using sites from cache: ' + @cache.read_tags.sort.join(', '))
|
127
162
|
end
|
128
163
|
|
129
164
|
# TODO: Fix this after config merge refactor!
|
130
165
|
# Not quite right. We are not passing @config.before or @config.after
|
131
166
|
# so passing this instead but @config.after['curl_opts'] is ignored.
|
167
|
+
curl_opts = @config.setting :curl_opts
|
132
168
|
config_curl_opts = @config.before['curl_opts']
|
133
169
|
curl_opts = config_curl_opts.clone.merge(curl_opts) if config_curl_opts
|
134
|
-
fetcher = Fetch.new(
|
135
|
-
|
170
|
+
fetcher = Fetch.new(
|
171
|
+
@cache,
|
172
|
+
@config.paths,
|
173
|
+
@config.setting(:interval),
|
174
|
+
@config.setting(:concurrency),
|
175
|
+
curl_opts,
|
176
|
+
@debug,
|
177
|
+
before: @config.before_url,
|
178
|
+
after: @config.after_url
|
179
|
+
)
|
180
|
+
|
181
|
+
# Run the Fetcher with "process results" as a callback.
|
136
182
|
fetcher.run(&method(:process_results))
|
137
183
|
|
138
184
|
# Order by original path order
|
139
|
-
@results = @config.paths.map { |
|
185
|
+
@results = @config.paths.map { |path| @results[path] }
|
140
186
|
results.map { |r| r unless r.success? }.compact.length
|
141
187
|
end
|
142
188
|
|
143
|
-
|
144
|
-
|
145
|
-
|
146
|
-
|
147
|
-
|
148
|
-
|
149
|
-
|
150
|
-
# store diffs of each failing case, first wipe out existing diffs
|
151
|
-
diff_dir = dir + DIFFS_DIR
|
152
|
-
diff_dir.rmtree if diff_dir.exist?
|
153
|
-
results.each { |r| r.dump(dir) if r.status == Result::STATUS_FAILURE }
|
154
|
-
SiteDiff.log "All diff files were dumped inside #{dir.expand_path}"
|
155
|
-
|
156
|
-
# store failing paths
|
157
|
-
failures = dir + FAILURES_FILE
|
158
|
-
SiteDiff.log "Writing failures to #{failures.expand_path}"
|
159
|
-
failures.open('w') do |f|
|
160
|
-
results.each { |r| f.puts r.path unless r.success? }
|
189
|
+
##
|
190
|
+
# Get a reporter object to help with report generation.
|
191
|
+
def report
|
192
|
+
if @results.nil?
|
193
|
+
raise SiteDiffException(
|
194
|
+
'No results detected. Run SiteDiff.run before SiteDiff.report.'
|
195
|
+
)
|
161
196
|
end
|
162
197
|
|
163
|
-
|
164
|
-
|
165
|
-
|
166
|
-
|
198
|
+
Report.new(@config, @cache, @results)
|
199
|
+
end
|
200
|
+
|
201
|
+
##
|
202
|
+
# Get SiteDiff gemspec.
|
203
|
+
def self.gemspec
|
204
|
+
file = ROOT_DIR + '/sitediff.gemspec'
|
205
|
+
Gem::Specification.load(file)
|
206
|
+
end
|
167
207
|
|
168
|
-
|
169
|
-
|
170
|
-
|
171
|
-
|
208
|
+
##
|
209
|
+
# Ensures that a directory exists and returns a Pathname for it.
|
210
|
+
#
|
211
|
+
# @param [String] dir
|
212
|
+
# path/to/directory
|
213
|
+
def self.ensure_dir(dir)
|
214
|
+
dir = Pathname.new(dir) unless dir.is_a? Pathname
|
215
|
+
dir.mkpath unless dir.directory?
|
216
|
+
dir
|
172
217
|
end
|
173
218
|
end
|
data/lib/sitediff/cache.rb
CHANGED
@@ -4,28 +4,42 @@ require 'set'
|
|
4
4
|
require 'fileutils'
|
5
5
|
|
6
6
|
class SiteDiff
|
7
|
+
# SiteDiff Cache Handler.
|
7
8
|
class Cache
|
8
9
|
attr_accessor :read_tags, :write_tags
|
9
10
|
|
11
|
+
##
|
12
|
+
# Creates a Cache object.
|
10
13
|
def initialize(opts = {})
|
11
14
|
@create = opts[:create]
|
12
15
|
|
13
|
-
# Read and Write tags are sets that can contain :before and :after
|
14
|
-
# They indicate whether we should use the cache for reading or writing
|
16
|
+
# Read and Write tags are sets that can contain :before and :after.
|
17
|
+
# They indicate whether we should use the cache for reading or writing.
|
15
18
|
@read_tags = Set.new
|
16
19
|
@write_tags = Set.new
|
20
|
+
|
21
|
+
# The directory used by the cache for storage.
|
17
22
|
@dir = opts[:directory] || '.'
|
18
23
|
end
|
19
24
|
|
25
|
+
##
|
20
26
|
# Is a tag cached?
|
27
|
+
# TODO: Rename it to is_cached? as it makes more sense.
|
21
28
|
def tag?(tag)
|
22
29
|
File.directory?(File.join(@dir, 'snapshot', tag.to_s))
|
23
30
|
end
|
24
31
|
|
32
|
+
##
|
33
|
+
# Get data from cache.
|
25
34
|
def get(tag, path)
|
26
35
|
return nil unless @read_tags.include? tag
|
27
36
|
|
28
|
-
filename = File.join(
|
37
|
+
filename = File.join(
|
38
|
+
@dir,
|
39
|
+
'snapshot',
|
40
|
+
tag.to_s,
|
41
|
+
*path.split(File::SEPARATOR)
|
42
|
+
)
|
29
43
|
|
30
44
|
filename = File.join(filename, 'index.html') if File.directory?(filename)
|
31
45
|
return nil unless File.file? filename
|
@@ -33,10 +47,17 @@ class SiteDiff
|
|
33
47
|
Marshal.load(File.read(filename))
|
34
48
|
end
|
35
49
|
|
50
|
+
##
|
51
|
+
# Set data to cache.
|
36
52
|
def set(tag, path, result)
|
37
53
|
return unless @write_tags.include? tag
|
38
54
|
|
39
|
-
filename = File.join(
|
55
|
+
filename = File.join(
|
56
|
+
@dir,
|
57
|
+
'snapshot',
|
58
|
+
tag.to_s,
|
59
|
+
*path.split(File::SEPARATOR)
|
60
|
+
)
|
40
61
|
|
41
62
|
filename = File.join(filename, 'index.html') if File.directory?(filename)
|
42
63
|
filepath = Pathname.new(filename)
|
@@ -50,23 +71,31 @@ class SiteDiff
|
|
50
71
|
# May cause problems if action is not atomic!
|
51
72
|
# Move existing file to dir/index.html first
|
52
73
|
# Not robust! Should generate an UUID or something.
|
53
|
-
|
74
|
+
if File.exist?(tempname)
|
75
|
+
SiteDiff.log "Overwriting file #{tempname}", :warning
|
76
|
+
end
|
54
77
|
curdir.rename(tempname)
|
55
78
|
filepath.dirname.mkpath
|
56
79
|
# Should only happen in strange situations such as when the path
|
57
80
|
# is foo/index.html/bar (i.e., index.html is a directory)
|
58
|
-
|
81
|
+
if (curdir + 'index.html').exist?
|
82
|
+
SiteDiff.log "Overwriting file #{tempname}", :warning
|
83
|
+
end
|
59
84
|
tempname.rename(curdir + 'index.html')
|
60
85
|
end
|
61
86
|
end
|
62
87
|
File.open(filename, 'w') { |file| file.write(Marshal.dump(result)) }
|
63
88
|
end
|
64
89
|
|
90
|
+
##
|
91
|
+
# TODO: Document this or remove it if unused.
|
65
92
|
def key(tag, path)
|
66
93
|
# Ensure encoding stays the same!
|
67
94
|
Marshal.dump([tag, path.encode('UTF-8')])
|
68
95
|
end
|
69
96
|
|
97
|
+
##
|
98
|
+
# Ensures that a directory exists.
|
70
99
|
def get_dir(directory)
|
71
100
|
# Create the dir. Must go before cache initialization!
|
72
101
|
@dir = Pathname.new(directory || '.')
|
data/lib/sitediff/cli.rb
CHANGED
@@ -5,32 +5,32 @@ require 'sitediff'
|
|
5
5
|
require 'sitediff/cache'
|
6
6
|
require 'sitediff/config'
|
7
7
|
require 'sitediff/config/creator'
|
8
|
+
require 'sitediff/config/preset'
|
8
9
|
require 'sitediff/fetch'
|
9
10
|
require 'sitediff/webserver/resultserver'
|
10
11
|
|
11
12
|
class SiteDiff
|
13
|
+
# SiteDiff CLI.
|
14
|
+
# TODO: Use config.defaults to feed default values for sitediff.yaml params?
|
12
15
|
class Cli < Thor
|
13
16
|
class_option 'directory',
|
14
17
|
type: :string,
|
15
18
|
aliases: '-C',
|
16
19
|
default: 'sitediff',
|
17
20
|
desc: 'Configuration directory'
|
18
|
-
class_option :
|
19
|
-
type: :hash,
|
20
|
-
default: {},
|
21
|
-
desc: 'Options to be passed to curl'
|
22
|
-
class_option :insecure,
|
21
|
+
class_option :debug,
|
23
22
|
type: :boolean,
|
23
|
+
aliases: '-d',
|
24
24
|
default: false,
|
25
|
-
desc: '
|
26
|
-
class_option
|
25
|
+
desc: 'Stop on certain errors and produce error trace backs.'
|
26
|
+
class_option 'verbose',
|
27
27
|
type: :boolean,
|
28
|
+
aliases: '-v',
|
28
29
|
default: false,
|
29
|
-
desc: '
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
desc: 'Crawling delay - interval in milliseconds'
|
30
|
+
desc: 'Show verbose output in terminal'
|
31
|
+
|
32
|
+
# Command aliases.
|
33
|
+
map recrawl: :crawl
|
34
34
|
|
35
35
|
# Thor, by default, exits with 0 no matter what!
|
36
36
|
def self.exit_on_failure?
|
@@ -42,6 +42,20 @@ class SiteDiff
|
|
42
42
|
true
|
43
43
|
end
|
44
44
|
|
45
|
+
desc 'version', 'Show version information'
|
46
|
+
##
|
47
|
+
# Show version information.
|
48
|
+
def version
|
49
|
+
gemspec = SiteDiff.gemspec
|
50
|
+
output = []
|
51
|
+
output.push("Sitediff CLI #{gemspec.version}")
|
52
|
+
if options[:verbose]
|
53
|
+
output.push('Website: ' + gemspec.homepage)
|
54
|
+
output.push('GitHub: ' + gemspec.metadata['source_code_uri'])
|
55
|
+
end
|
56
|
+
puts output.join("\n")
|
57
|
+
end
|
58
|
+
|
45
59
|
option 'paths-file',
|
46
60
|
type: :string,
|
47
61
|
desc: 'Paths are read (one at a line) from PATHS: ' \
|
@@ -53,79 +67,118 @@ class SiteDiff
|
|
53
67
|
desc: 'Specific path or paths to fetch'
|
54
68
|
option 'before',
|
55
69
|
type: :string,
|
56
|
-
desc: 'URL
|
70
|
+
desc: 'URL to the "before" site, prefixed to all paths.',
|
57
71
|
aliases: '--before-url'
|
58
72
|
option 'after',
|
59
73
|
type: :string,
|
60
|
-
desc: 'URL
|
74
|
+
desc: 'URL to the "after" site, prefixed to all paths.',
|
61
75
|
aliases: '--after-url'
|
76
|
+
option 'report-format',
|
77
|
+
type: :string,
|
78
|
+
enum: %w[html json],
|
79
|
+
default: 'html',
|
80
|
+
desc: 'The format in which a report should be generated.'
|
81
|
+
# TODO: Deprecate the parameters before-report / after-report?
|
62
82
|
option 'before-report',
|
63
83
|
type: :string,
|
64
|
-
desc: '
|
84
|
+
desc: 'URL to use in reports. Useful if port forwarding.',
|
65
85
|
aliases: '--before-url-report'
|
66
86
|
option 'after-report',
|
67
87
|
type: :string,
|
68
|
-
desc: '
|
88
|
+
desc: 'URL to use in reports. Useful if port forwarding.',
|
69
89
|
aliases: '--after-url-report'
|
70
90
|
option 'cached',
|
71
91
|
type: :string,
|
72
92
|
enum: %w[none all before after],
|
73
93
|
default: 'before',
|
74
94
|
desc: 'Use the cached version of these sites, if available.'
|
75
|
-
option '
|
95
|
+
option 'ignore-whitespace',
|
76
96
|
type: :boolean,
|
77
|
-
aliases: '-v',
|
78
97
|
default: false,
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
|
98
|
+
aliases: '-w',
|
99
|
+
desc: 'Ignore changes in whitespace.'
|
100
|
+
option 'export',
|
101
|
+
type: :boolean,
|
102
|
+
default: false,
|
103
|
+
aliases: '-e',
|
104
|
+
desc: 'Export report to files. This option forces HTML format.'
|
105
|
+
desc 'diff [OPTIONS] [CONFIG-FILE]',
|
106
|
+
'Compute diffs on configured URLs.'
|
107
|
+
##
|
108
|
+
# Computes diffs.
|
109
|
+
def diff(config_file = nil)
|
88
110
|
@dir = get_dir(options['directory'])
|
89
|
-
config = SiteDiff::Config.new(
|
90
|
-
|
91
|
-
# override config based on options
|
92
|
-
paths = options['paths']
|
93
|
-
if (paths_file = options['paths-file'])
|
94
|
-
if paths
|
95
|
-
SiteDiff.log "Can't have both --paths-file and --paths", :error
|
96
|
-
exit(-1)
|
97
|
-
end
|
111
|
+
config = SiteDiff::Config.new(config_file, @dir)
|
98
112
|
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
end
|
104
|
-
SiteDiff.log "Reading paths from: #{paths_file}"
|
105
|
-
config.paths = File.readlines(paths_file)
|
113
|
+
# Determine "paths" override based on options.
|
114
|
+
if options['paths'] && options['paths-file']
|
115
|
+
SiteDiff.log "Can't specify both --paths-file and --paths.", :error
|
116
|
+
exit(-1)
|
106
117
|
end
|
107
|
-
config.paths = paths if paths
|
108
118
|
|
119
|
+
# Ignore whitespace option.
|
120
|
+
config.ignore_whitespace = options['ignore-whitespace'] if options['ignore-whitespace']
|
121
|
+
|
122
|
+
# Export report option.
|
123
|
+
config.export = options['export']
|
124
|
+
|
125
|
+
# Apply "paths" override, if any.
|
126
|
+
config.paths = options['paths'] if options['paths']
|
127
|
+
|
128
|
+
# Determine and apply "paths-file", if "paths" is not specified.
|
129
|
+
unless options['paths']
|
130
|
+
paths_file = options['paths-file']
|
131
|
+
paths_file ||= File.join(@dir, Config::DEFAULT_PATHS_FILENAME)
|
132
|
+
paths_file = File.expand_path(paths_file)
|
133
|
+
|
134
|
+
paths_count = config.paths_file_read(paths_file)
|
135
|
+
SiteDiff.log "Read #{paths_count} paths from: #{paths_file}"
|
136
|
+
end
|
137
|
+
|
138
|
+
# TODO: Why do we allow before and after override during diff?
|
109
139
|
config.before['url'] = options['before'] if options['before']
|
110
140
|
config.after['url'] = options['after'] if options['after']
|
111
141
|
|
112
|
-
#
|
113
|
-
cache = SiteDiff::Cache.new(
|
114
|
-
|
142
|
+
# Prepare cache.
|
143
|
+
cache = SiteDiff::Cache.new(
|
144
|
+
create: options['cached'] != 'none',
|
145
|
+
directory: @dir
|
146
|
+
)
|
115
147
|
cache.read_tags << :before if %w[before all].include?(options['cached'])
|
116
148
|
cache.read_tags << :after if %w[after all].include?(options['cached'])
|
117
149
|
cache.write_tags << :before << :after
|
118
150
|
|
119
|
-
|
120
|
-
|
121
|
-
|
122
|
-
|
151
|
+
# Run sitediff.
|
152
|
+
sitediff = SiteDiff.new(
|
153
|
+
config,
|
154
|
+
cache,
|
155
|
+
options['verbose'],
|
156
|
+
options[:debug]
|
157
|
+
)
|
158
|
+
num_failing = sitediff.run
|
159
|
+
exit_code = num_failing.positive? ? 2 : 0
|
160
|
+
|
161
|
+
# Generate HTML report.
|
162
|
+
if options['report-format'] == 'html' || config.export
|
163
|
+
sitediff.report.generate_html(
|
164
|
+
@dir,
|
165
|
+
options['before-report'],
|
166
|
+
options['after-report']
|
167
|
+
)
|
168
|
+
end
|
123
169
|
|
124
|
-
|
125
|
-
|
170
|
+
# Generate JSON report.
|
171
|
+
if options['report-format'] == 'json' && config.export == false
|
172
|
+
sitediff.report.generate_json @dir
|
173
|
+
end
|
174
|
+
|
175
|
+
SiteDiff.log 'Run "sitediff serve" to see a report.' unless options['export']
|
126
176
|
rescue Config::InvalidConfig => e
|
127
177
|
SiteDiff.log "Invalid configuration: #{e.message}", :error
|
128
|
-
SiteDiff.log
|
178
|
+
SiteDiff.log e.backtrace, :error if options[:verbose]
|
179
|
+
rescue Config::ConfigNotFound => e
|
180
|
+
SiteDiff.log "Invalid configuration: #{e.message}", :error
|
181
|
+
SiteDiff.log e.backtrace, :error if options[:verbose]
|
129
182
|
else # no exception was raised
|
130
183
|
# Thor::Error --> exit(1), guaranteed by exit_on_failure?
|
131
184
|
# Failing diff --> exit(2), populated above
|
@@ -140,11 +193,14 @@ class SiteDiff
|
|
140
193
|
type: :boolean,
|
141
194
|
default: true,
|
142
195
|
desc: 'Whether to open the served content in your browser'
|
143
|
-
desc 'serve [OPTIONS]',
|
144
|
-
|
145
|
-
|
146
|
-
|
196
|
+
desc 'serve [OPTIONS] [CONFIG-FILE]',
|
197
|
+
'Serve SiteDiff report directory over HTTP.'
|
198
|
+
##
|
199
|
+
# Serves SiteDiff report for accessing in the browser.
|
200
|
+
def serve(config_file = nil)
|
147
201
|
@dir = get_dir(options['directory'])
|
202
|
+
config = SiteDiff::Config.new(config_file, @dir)
|
203
|
+
|
148
204
|
cache = Cache.new(directory: @dir)
|
149
205
|
cache.read_tags << :before << :after
|
150
206
|
|
@@ -157,85 +213,102 @@ class SiteDiff
|
|
157
213
|
).wait
|
158
214
|
rescue SiteDiffException => e
|
159
215
|
SiteDiff.log e.message, :error
|
160
|
-
SiteDiff.log e.backtrace, :error
|
216
|
+
SiteDiff.log e.backtrace, :error if options[:verbose]
|
161
217
|
end
|
162
218
|
|
163
219
|
option :depth,
|
164
220
|
type: :numeric,
|
165
|
-
default:
|
221
|
+
default: Config::DEFAULT_CONFIG['settings']['depth'],
|
166
222
|
desc: 'How deeply to crawl the given site'
|
167
|
-
option :
|
223
|
+
option :crawl,
|
224
|
+
type: :boolean,
|
225
|
+
default: true,
|
226
|
+
desc: 'Run "sitediff crawl" to discover paths.'
|
227
|
+
option :preset,
|
168
228
|
type: :string,
|
169
|
-
enum:
|
170
|
-
|
171
|
-
desc: 'Whether rules for the site should be auto-created'
|
229
|
+
enum: Config::Preset.all,
|
230
|
+
desc: 'Framework-specific presets to apply.'
|
172
231
|
option :concurrency,
|
173
232
|
type: :numeric,
|
174
|
-
default:
|
175
|
-
desc: 'Max number of concurrent connections made'
|
233
|
+
default: Config::DEFAULT_CONFIG['settings']['concurrency'],
|
234
|
+
desc: 'Max number of concurrent connections made.'
|
235
|
+
option :interval,
|
236
|
+
type: :numeric,
|
237
|
+
default: Config::DEFAULT_CONFIG['settings']['interval'],
|
238
|
+
desc: 'Crawling delay - interval in milliseconds.'
|
176
239
|
option :whitelist,
|
177
240
|
type: :string,
|
178
|
-
default: '',
|
179
|
-
desc: 'Optional whitelist for crawling'
|
241
|
+
default: Config::DEFAULT_CONFIG['settings']['whitelist'],
|
242
|
+
desc: 'Optional whitelist for crawling.'
|
180
243
|
option :blacklist,
|
181
244
|
type: :string,
|
182
|
-
default: '',
|
183
|
-
desc: 'Optional blacklist for crawling'
|
184
|
-
|
245
|
+
default: Config::DEFAULT_CONFIG['settings']['blacklist'],
|
246
|
+
desc: 'Optional blacklist for crawling.'
|
247
|
+
# TODO: Remove this option. Always ignore SSL errors.
|
248
|
+
option :insecure,
|
249
|
+
type: :boolean,
|
250
|
+
default: false,
|
251
|
+
desc: 'Ignore many HTTPS/SSL errors'
|
252
|
+
option :curl_options,
|
253
|
+
type: :hash,
|
254
|
+
default: {},
|
255
|
+
desc: 'Options to be passed to curl'
|
256
|
+
desc 'init URL [URL]', 'Create a sitediff configuration.'
|
257
|
+
##
|
258
|
+
# Initializes a sitediff (yaml) configuration file.
|
185
259
|
def init(*urls)
|
186
260
|
unless (1..2).cover? urls.size
|
187
261
|
SiteDiff.log 'sitediff init requires one or two URLs', :error
|
188
262
|
exit(2)
|
189
263
|
end
|
190
264
|
|
191
|
-
|
192
|
-
check_interval(@interval)
|
265
|
+
# Prepare a config object and write it to the file system.
|
193
266
|
@dir = get_dir(options['directory'])
|
194
|
-
|
195
|
-
@whitelist = create_regexp(options['whitelist'])
|
196
|
-
@blacklist = create_regexp(options['blacklist'])
|
197
|
-
creator = SiteDiff::Config::Creator.new(options[:concurrency],
|
198
|
-
options['interval'],
|
199
|
-
@whitelist,
|
200
|
-
@blacklist,
|
201
|
-
curl_opts,
|
202
|
-
options[:debug],
|
203
|
-
*urls)
|
267
|
+
creator = SiteDiff::Config::Creator.new(options[:debug], *urls)
|
204
268
|
creator.create(
|
205
269
|
depth: options[:depth],
|
206
270
|
directory: @dir,
|
207
|
-
|
208
|
-
|
209
|
-
|
210
|
-
|
211
|
-
|
212
|
-
|
271
|
+
concurrency: options[:concurrency],
|
272
|
+
interval: options[:interval],
|
273
|
+
whitelist: Config.create_regexp(options['whitelist']),
|
274
|
+
blacklist: Config.create_regexp(options['blacklist']),
|
275
|
+
preset: options[:preset],
|
276
|
+
curl_opts: get_curl_opts(options)
|
277
|
+
)
|
213
278
|
SiteDiff.log "Created #{creator.config_file.expand_path}", :success
|
214
|
-
|
279
|
+
|
280
|
+
# Discover paths, if enabled.
|
281
|
+
if options[:crawl]
|
282
|
+
crawl(creator.config_file)
|
283
|
+
SiteDiff.log 'You can now run "sitediff diff".', :success
|
284
|
+
else
|
285
|
+
SiteDiff.log 'Run "sitediff crawl" to discover paths. You should then be able to run "sitediff diff".', :info
|
286
|
+
end
|
215
287
|
end
|
216
288
|
|
217
289
|
option :url,
|
218
290
|
type: :string,
|
219
291
|
desc: 'A custom base URL to fetch from'
|
220
|
-
|
221
|
-
|
222
|
-
|
223
|
-
|
224
|
-
|
225
|
-
'Cache the current contents of a site for later comparison'
|
226
|
-
def store(*config_files)
|
292
|
+
desc 'store [CONFIG-FILE]',
|
293
|
+
'Cache the current contents of a site for later comparison.'
|
294
|
+
##
|
295
|
+
# Caches the current version of the site.
|
296
|
+
def store(config_file = nil)
|
227
297
|
@dir = get_dir(options['directory'])
|
228
|
-
config = SiteDiff::Config.new(
|
298
|
+
config = SiteDiff::Config.new(config_file, @dir)
|
299
|
+
# TODO: Figure out how to remove this config.validate call.
|
229
300
|
config.validate(need_before: false)
|
301
|
+
config.paths_file_read
|
302
|
+
|
230
303
|
cache = SiteDiff::Cache.new(directory: @dir, create: true)
|
231
304
|
cache.write_tags << :before
|
232
305
|
|
233
306
|
base = options[:url] || config.after['url']
|
234
307
|
fetcher = SiteDiff::Fetch.new(cache,
|
235
308
|
config.paths,
|
236
|
-
|
237
|
-
|
238
|
-
get_curl_opts(
|
309
|
+
config.setting(:interval),
|
310
|
+
config.setting(:concurrency),
|
311
|
+
get_curl_opts(config.settings),
|
239
312
|
options[:debug],
|
240
313
|
before: base)
|
241
314
|
fetcher.run do |path, _res|
|
@@ -243,11 +316,70 @@ class SiteDiff
|
|
243
316
|
end
|
244
317
|
end
|
245
318
|
|
319
|
+
desc 'crawl [CONFIG-FILE]',
|
320
|
+
'Crawl the "before" site to discover paths.'
|
321
|
+
##
|
322
|
+
# Crawls the "before" site to determine "paths".
|
323
|
+
#
|
324
|
+
# TODO: Move actual crawling to sitediff.crawl(config).
|
325
|
+
# TODO: Switch to paths = sitediff.crawl().
|
326
|
+
def crawl(config_file = nil)
|
327
|
+
# Prepare configuration.
|
328
|
+
@dir = get_dir(options['directory'])
|
329
|
+
@config = SiteDiff::Config.new(config_file, @dir)
|
330
|
+
|
331
|
+
# Prepare cache.
|
332
|
+
@cache = SiteDiff::Cache.new(
|
333
|
+
create: options['cached'] != 'none',
|
334
|
+
directory: @dir
|
335
|
+
)
|
336
|
+
@cache.write_tags << :before << :after
|
337
|
+
|
338
|
+
# Crawl with Hydra to discover paths.
|
339
|
+
hydra = Typhoeus::Hydra.new(
|
340
|
+
max_concurrency: @config.setting(:concurrency)
|
341
|
+
)
|
342
|
+
@paths = {}
|
343
|
+
@config.roots.each do |tag, url|
|
344
|
+
Crawler.new(
|
345
|
+
hydra,
|
346
|
+
url,
|
347
|
+
@config.setting(:interval),
|
348
|
+
@config.setting(:whitelist),
|
349
|
+
@config.setting(:blacklist),
|
350
|
+
@config.setting(:depth),
|
351
|
+
get_curl_opts(@config.settings),
|
352
|
+
@debug
|
353
|
+
) do |info|
|
354
|
+
SiteDiff.log "Visited #{info.uri}, cached."
|
355
|
+
after_crawl(tag, info)
|
356
|
+
end
|
357
|
+
end
|
358
|
+
hydra.run
|
359
|
+
|
360
|
+
# Write paths to a file.
|
361
|
+
@paths = @paths.values.reduce(&:|).to_a.sort
|
362
|
+
@config.paths_file_write(@paths)
|
363
|
+
|
364
|
+
# Log output.
|
365
|
+
file = Pathname.new(@dir) + Config::DEFAULT_PATHS_FILENAME
|
366
|
+
SiteDiff.log ''
|
367
|
+
SiteDiff.log "#{@paths.length} page(s) found."
|
368
|
+
SiteDiff.log "Created #{file.expand_path}.", :success, 'done'
|
369
|
+
end
|
370
|
+
|
246
371
|
no_commands do
|
372
|
+
# Generates CURL options.
|
373
|
+
#
|
374
|
+
# TODO: This should be in the config class instead.
|
375
|
+
# TODO: Make all requests insecure and avoid custom curl-opts.
|
247
376
|
def get_curl_opts(options)
|
248
377
|
# We do want string keys here
|
249
378
|
bool_hash = { 'true' => true, 'false' => false }
|
250
|
-
curl_opts = UriWrapper::DEFAULT_CURL_OPTS
|
379
|
+
curl_opts = UriWrapper::DEFAULT_CURL_OPTS
|
380
|
+
.clone
|
381
|
+
.merge(options['curl_options'] || {})
|
382
|
+
.merge(options['curl_opts'] || {})
|
251
383
|
curl_opts.each { |k, v| curl_opts[k] = bool_hash.fetch(v, v) }
|
252
384
|
if options[:insecure]
|
253
385
|
curl_opts[:ssl_verifypeer] = false
|
@@ -256,13 +388,8 @@ class SiteDiff
|
|
256
388
|
curl_opts
|
257
389
|
end
|
258
390
|
|
259
|
-
|
260
|
-
|
261
|
-
SiteDiff.log '--concurrency must be set to 1 in order to enable the interval feature'
|
262
|
-
exit(2)
|
263
|
-
end
|
264
|
-
end
|
265
|
-
|
391
|
+
##
|
392
|
+
# Ensures that the given directory exists.
|
266
393
|
def get_dir(directory)
|
267
394
|
# Create the dir. Must go before cache initialization!
|
268
395
|
@dir = Pathname.new(directory || '.')
|
@@ -270,16 +397,24 @@ class SiteDiff
|
|
270
397
|
@dir.to_s
|
271
398
|
end
|
272
399
|
|
273
|
-
|
274
|
-
|
275
|
-
|
276
|
-
|
277
|
-
|
278
|
-
|
279
|
-
|
280
|
-
|
281
|
-
|
282
|
-
|
400
|
+
##
|
401
|
+
# Processes a crawled path.
|
402
|
+
def after_crawl(tag, info)
|
403
|
+
path = UriWrapper.canonicalize(info.relative)
|
404
|
+
|
405
|
+
# Register the path.
|
406
|
+
@paths[tag] = [] unless @paths[tag]
|
407
|
+
@paths[tag] << path
|
408
|
+
|
409
|
+
result = info.read_result
|
410
|
+
|
411
|
+
# Write result to applicable cache.
|
412
|
+
@cache.set(tag, path, result)
|
413
|
+
# If single-site, cache "after" as "before".
|
414
|
+
@cache.set(:before, path, result) unless @config.roots[:before]
|
415
|
+
|
416
|
+
# TODO: Restore application of rules.
|
417
|
+
# @rules.handle_page(tag, res.content, info.document) if @rules && !res.error
|
283
418
|
end
|
284
419
|
end
|
285
420
|
end
|