sitediff 0.0.1 → 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +5 -5
- data/bin/sitediff +10 -4
- data/lib/sitediff.rb +179 -91
- data/lib/sitediff/cache.rb +106 -0
- data/lib/sitediff/cli.rb +391 -60
- data/lib/sitediff/config.rb +383 -37
- data/lib/sitediff/config/creator.rb +114 -0
- data/lib/sitediff/config/preset.rb +75 -0
- data/lib/sitediff/crawler.rb +131 -0
- data/lib/sitediff/diff.rb +57 -12
- data/lib/sitediff/exception.rb +5 -0
- data/lib/sitediff/fetch.rb +76 -0
- data/lib/sitediff/files/diff.html.erb +20 -2
- data/lib/sitediff/files/jquery.min.js +2 -0
- data/lib/sitediff/files/normalize.css +349 -0
- data/lib/sitediff/files/report.html.erb +144 -0
- data/lib/sitediff/files/sidebyside.html.erb +16 -0
- data/lib/sitediff/files/sitediff.css +236 -29
- data/lib/sitediff/files/sitediff.js +176 -0
- data/lib/sitediff/report.rb +238 -0
- data/lib/sitediff/result.rb +63 -26
- data/lib/sitediff/sanitize.rb +160 -141
- data/lib/sitediff/sanitize/dom_transform.rb +130 -0
- data/lib/sitediff/sanitize/regexp.rb +82 -0
- data/lib/sitediff/uriwrapper.rb +114 -35
- data/lib/sitediff/webserver.rb +94 -0
- data/lib/sitediff/webserver/resultserver.rb +134 -0
- metadata +103 -43
- data/lib/sitediff/files/html_report.html.erb +0 -47
- data/lib/sitediff/util/cache.rb +0 -32
- data/lib/sitediff/util/webserver.rb +0 -77
data/lib/sitediff/cli.rb
CHANGED
@@ -1,90 +1,421 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
require 'thor'
|
2
|
-
require 'sitediff
|
3
|
-
require 'sitediff/
|
4
|
-
require 'sitediff/
|
5
|
-
require '
|
6
|
-
require '
|
4
|
+
require 'sitediff'
|
5
|
+
require 'sitediff/cache'
|
6
|
+
require 'sitediff/config'
|
7
|
+
require 'sitediff/config/creator'
|
8
|
+
require 'sitediff/config/preset'
|
9
|
+
require 'sitediff/fetch'
|
10
|
+
require 'sitediff/webserver/resultserver'
|
7
11
|
|
8
12
|
class SiteDiff
|
13
|
+
# SiteDiff CLI.
|
14
|
+
# TODO: Use config.defaults to feed default values for sitediff.yaml params?
|
9
15
|
class Cli < Thor
|
16
|
+
class_option 'directory',
|
17
|
+
type: :string,
|
18
|
+
aliases: '-C',
|
19
|
+
default: 'sitediff',
|
20
|
+
desc: 'Configuration directory'
|
21
|
+
class_option :debug,
|
22
|
+
type: :boolean,
|
23
|
+
aliases: '-d',
|
24
|
+
default: false,
|
25
|
+
desc: 'Stop on certain errors and produce error trace backs.'
|
26
|
+
class_option 'verbose',
|
27
|
+
type: :boolean,
|
28
|
+
aliases: '-v',
|
29
|
+
default: false,
|
30
|
+
desc: 'Show verbose output in terminal'
|
31
|
+
|
32
|
+
# Command aliases.
|
33
|
+
map recrawl: :crawl
|
34
|
+
|
10
35
|
# Thor, by default, exits with 0 no matter what!
|
11
36
|
# Always returns true: tells Thor to exit with a failure status when a
# command fails instead of its default of exiting with 0.
def self.exit_on_failure?
|
12
37
|
true
|
13
38
|
end
|
14
39
|
|
15
40
|
# Thor, by default, does not raise an error for use of unknown options.
|
16
|
-
def self.check_unknown_options?(
|
41
|
+
# Always returns true, ignoring _config: makes Thor raise an error when an
# unknown option is used.
def self.check_unknown_options?(_config)
|
17
42
|
true
|
18
43
|
end
|
19
44
|
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
45
|
+
desc 'version', 'Show version information'
|
46
|
+
##
# Show version information.
#
# Prints "Sitediff CLI <version>" using the version from SiteDiff.gemspec.
# With --verbose, the gemspec homepage and its 'source_code_uri' metadata
# entry are printed on two further lines.
def version
  gemspec = SiteDiff.gemspec
  output = ["Sitediff CLI #{gemspec.version}"]
  if options[:verbose]
    # Interpolate instead of concatenating: String#+ raises TypeError when
    # the homepage or the 'source_code_uri' metadata entry is nil.
    output.push("Website: #{gemspec.homepage}")
    output.push("GitHub: #{gemspec.metadata['source_code_uri']}")
  end
  puts output.join("\n")
end
|
58
|
+
|
59
|
+
option 'paths-file',
|
60
|
+
type: :string,
|
61
|
+
desc: 'Paths are read (one at a line) from PATHS: ' \
|
62
|
+
'useful for iterating over sanitization rules',
|
63
|
+
aliases: '--paths-from-file'
|
24
64
|
option 'paths',
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
:aliases => '--paths-from-file'
|
65
|
+
type: :array,
|
66
|
+
aliases: '-p',
|
67
|
+
desc: 'Specific path or paths to fetch'
|
29
68
|
option 'before',
|
30
|
-
|
31
|
-
|
32
|
-
|
69
|
+
type: :string,
|
70
|
+
desc: 'URL to the "before" site, prefixed to all paths.',
|
71
|
+
aliases: '--before-url'
|
33
72
|
option 'after',
|
34
|
-
|
35
|
-
|
36
|
-
|
73
|
+
type: :string,
|
74
|
+
desc: 'URL to the "after" site, prefixed to all paths.',
|
75
|
+
aliases: '--after-url'
|
76
|
+
option 'report-format',
|
77
|
+
type: :string,
|
78
|
+
enum: %w[html json],
|
79
|
+
default: 'html',
|
80
|
+
desc: 'The format in which a report should be generated.'
|
81
|
+
# TODO: Deprecate the parameters before-report / after-report?
|
37
82
|
option 'before-report',
|
38
|
-
|
39
|
-
|
40
|
-
|
83
|
+
type: :string,
|
84
|
+
desc: 'URL to use in reports. Useful if port forwarding.',
|
85
|
+
aliases: '--before-url-report'
|
41
86
|
option 'after-report',
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
option '
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
87
|
+
type: :string,
|
88
|
+
desc: 'URL to use in reports. Useful if port forwarding.',
|
89
|
+
aliases: '--after-url-report'
|
90
|
+
option 'cached',
|
91
|
+
type: :string,
|
92
|
+
enum: %w[none all before after],
|
93
|
+
default: 'before',
|
94
|
+
desc: 'Use the cached version of these sites, if available.'
|
95
|
+
option 'ignore-whitespace',
|
96
|
+
type: :boolean,
|
97
|
+
default: false,
|
98
|
+
aliases: '-w',
|
99
|
+
desc: 'Ignore changes in whitespace.'
|
100
|
+
option 'export',
|
101
|
+
type: :boolean,
|
102
|
+
default: false,
|
103
|
+
aliases: '-e',
|
104
|
+
desc: 'Export report to files. This option forces HTML format.'
|
105
|
+
desc 'diff [OPTIONS] [CONFIG-FILE]',
|
106
|
+
'Compute diffs on configured URLs.'
|
107
|
+
##
# Computes diffs.
#
# Loads the configuration from config_file (or the configuration
# directory), applies the command-line overrides, runs SiteDiff and writes
# the requested report into the configuration directory.
#
# Exit status: 2 when sitediff.run reports at least one failing path,
# 0 otherwise; -1 when both --paths and --paths-file are given.
# NOTE(review): when the configuration is invalid or missing the error is
# only logged and the method returns normally - presumably the process then
# exits with 0; confirm whether that is intended.
def diff(config_file = nil)
  @dir = get_dir(options['directory'])
  config = SiteDiff::Config.new(config_file, @dir)

  # --paths and --paths-file are mutually exclusive.
  requested_paths = options['paths']
  if requested_paths && options['paths-file']
    SiteDiff.log "Can't specify both --paths-file and --paths.", :error
    exit(-1)
  end

  # Only override the configured whitespace handling when the flag is set.
  config.ignore_whitespace = options['ignore-whitespace'] if options['ignore-whitespace']
  config.export = options['export']

  if requested_paths
    # Explicit paths win over any paths file.
    config.paths = requested_paths
  else
    # Fall back to the default paths file inside the config directory.
    paths_file = File.expand_path(
      options['paths-file'] || File.join(@dir, Config::DEFAULT_PATHS_FILENAME)
    )
    paths_count = config.paths_file_read(paths_file)
    SiteDiff.log "Read #{paths_count} paths from: #{paths_file}"
  end

  # TODO: Why do we allow before and after override during diff?
  before_url = options['before']
  after_url = options['after']
  config.before['url'] = before_url if before_url
  config.after['url'] = after_url if after_url

  # Prepare cache: read from the sides selected by --cached, write both.
  cached = options['cached']
  cache = SiteDiff::Cache.new(create: cached != 'none', directory: @dir)
  cache.read_tags << :before if %w[before all].include?(cached)
  cache.read_tags << :after if %w[after all].include?(cached)
  cache.write_tags << :before << :after

  # Run sitediff.
  sitediff = SiteDiff.new(config, cache, options['verbose'], options[:debug])
  exit_code = sitediff.run.positive? ? 2 : 0

  # Exporting forces the HTML report regardless of --report-format.
  report_format = options['report-format']
  if report_format == 'html' || config.export
    sitediff.report.generate_html(@dir, options['before-report'], options['after-report'])
  end
  sitediff.report.generate_json @dir if report_format == 'json' && config.export == false

  SiteDiff.log 'Run "sitediff serve" to see a report.' unless options['export']
rescue Config::InvalidConfig, Config::ConfigNotFound => e
  SiteDiff.log "Invalid configuration: #{e.message}", :error
  SiteDiff.log e.backtrace, :error if options[:verbose]
else # no exception was raised
  # Thor::Error  --> exit(1), guaranteed by exit_on_failure?
  # Failing diff --> exit(2), populated above
  exit(exit_code)
end
|
74
187
|
|
75
188
|
option :port,
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
option :
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
|
189
|
+
type: :numeric,
|
190
|
+
default: SiteDiff::Webserver::DEFAULT_PORT,
|
191
|
+
desc: 'The port to serve on'
|
192
|
+
option :browse,
|
193
|
+
type: :boolean,
|
194
|
+
default: true,
|
195
|
+
desc: 'Whether to open the served content in your browser'
|
196
|
+
desc 'serve [OPTIONS] [CONFIG-FILE]',
|
197
|
+
'Serve SiteDiff report directory over HTTP.'
|
198
|
+
##
# Serves SiteDiff report for accessing in the browser.
#
# Builds a cache that reads both the :before and :after tags, starts a
# ResultServer on the requested port and waits on it. SiteDiffException
# errors are logged; the backtrace is added when --verbose is set.
def serve(config_file = nil)
  @dir = get_dir(options['directory'])
  config = SiteDiff::Config.new(config_file, @dir)

  cache = Cache.new(directory: @dir)
  cache.read_tags << :before << :after

  server = SiteDiff::Webserver::ResultServer.new(
    options[:port],
    options['directory'],
    browse: options[:browse], cache: cache, config: config
  )
  server.wait
rescue SiteDiffException => e
  SiteDiff.log e.message, :error
  SiteDiff.log e.backtrace, :error if options[:verbose]
end
|
218
|
+
|
219
|
+
option :depth,
|
220
|
+
type: :numeric,
|
221
|
+
default: Config::DEFAULT_CONFIG['settings']['depth'],
|
222
|
+
desc: 'How deeply to crawl the given site'
|
223
|
+
option :crawl,
|
224
|
+
type: :boolean,
|
225
|
+
default: true,
|
226
|
+
desc: 'Run "sitediff crawl" to discover paths.'
|
227
|
+
option :preset,
|
228
|
+
type: :string,
|
229
|
+
enum: Config::Preset.all,
|
230
|
+
desc: 'Framework-specific presets to apply.'
|
231
|
+
option :concurrency,
|
232
|
+
type: :numeric,
|
233
|
+
default: Config::DEFAULT_CONFIG['settings']['concurrency'],
|
234
|
+
desc: 'Max number of concurrent connections made.'
|
235
|
+
option :interval,
|
236
|
+
type: :numeric,
|
237
|
+
default: Config::DEFAULT_CONFIG['settings']['interval'],
|
238
|
+
desc: 'Crawling delay - interval in milliseconds.'
|
239
|
+
option :whitelist,
|
240
|
+
type: :string,
|
241
|
+
default: Config::DEFAULT_CONFIG['settings']['whitelist'],
|
242
|
+
desc: 'Optional whitelist for crawling.'
|
243
|
+
option :blacklist,
|
244
|
+
type: :string,
|
245
|
+
default: Config::DEFAULT_CONFIG['settings']['blacklist'],
|
246
|
+
desc: 'Optional blacklist for crawling.'
|
247
|
+
# TODO: Remove this option. Always ignore SSL errors.
|
248
|
+
option :insecure,
|
249
|
+
type: :boolean,
|
250
|
+
default: false,
|
251
|
+
desc: 'Ignore many HTTPS/SSL errors'
|
252
|
+
option :curl_options,
|
253
|
+
type: :hash,
|
254
|
+
default: {},
|
255
|
+
desc: 'Options to be passed to curl'
|
256
|
+
desc 'init URL [URL]', 'Create a sitediff configuration.'
|
257
|
+
##
# Initializes a sitediff (yaml) configuration file.
#
# Accepts one or two URLs; any other count logs an error and exits with
# status 2. The configuration is written via Config::Creator and, unless
# crawling is disabled, the crawl command is run right afterwards.
def init(*urls)
  unless urls.size.between?(1, 2)
    SiteDiff.log 'sitediff init requires one or two URLs', :error
    exit(2)
  end

  # Prepare a config object and write it to the file system.
  @dir = get_dir(options['directory'])
  creator = SiteDiff::Config::Creator.new(options[:debug], *urls)
  settings = {
    depth: options[:depth],
    directory: @dir,
    concurrency: options[:concurrency],
    interval: options[:interval],
    whitelist: Config.create_regexp(options['whitelist']),
    blacklist: Config.create_regexp(options['blacklist']),
    preset: options[:preset],
    curl_opts: get_curl_opts(options)
  }
  creator.create(**settings)
  SiteDiff.log "Created #{creator.config_file.expand_path}", :success

  # Discover paths, if enabled.
  if options[:crawl]
    crawl(creator.config_file)
    SiteDiff.log 'You can now run "sitediff diff".', :success
  else
    SiteDiff.log 'Run "sitediff crawl" to discover paths. You should then be able to run "sitediff diff".', :info
  end
end
|
288
|
+
|
289
|
+
option :url,
|
290
|
+
type: :string,
|
291
|
+
desc: 'A custom base URL to fetch from'
|
292
|
+
desc 'store [CONFIG-FILE]',
|
293
|
+
'Cache the current contents of a site for later comparison.'
|
294
|
+
##
# Caches the current version of the site.
#
# Fetches every configured path from --url (or, when that is not given,
# from the configured "after" URL) and writes the responses to the cache
# under the :before tag.
def store(config_file = nil)
  @dir = get_dir(options['directory'])
  config = SiteDiff::Config.new(config_file, @dir)
  # TODO: Figure out how to remove this config.validate call.
  config.validate(need_before: false)
  config.paths_file_read

  cache = SiteDiff::Cache.new(create: true, directory: @dir)
  cache.write_tags << :before

  # The fetched site is stored as "before", whatever URL it came from.
  base = options[:url] || config.after['url']
  fetcher = SiteDiff::Fetch.new(
    cache,
    config.paths,
    config.setting(:interval),
    config.setting(:concurrency),
    get_curl_opts(config.settings),
    options[:debug],
    before: base
  )
  fetcher.run { |path, _res| SiteDiff.log "Visited #{path}, cached" }
end
|
318
|
+
|
319
|
+
desc 'crawl [CONFIG-FILE]',
|
320
|
+
'Crawl the "before" site to discover paths.'
|
321
|
+
##
# Crawls the "before" site to determine "paths".
#
# Queues one Crawler per configured root on a shared Typhoeus::Hydra, runs
# the hydra, then writes the sorted, de-duplicated list of discovered paths
# to the paths file via the config object. Each visited page is handed to
# after_crawl, which records the path and caches the response.
#
# TODO: Move actual crawling to sitediff.crawl(config).
# TODO: Switch to paths = sitediff.crawl().
def crawl(config_file = nil)
  # Prepare configuration.
  @dir = get_dir(options['directory'])
  @config = SiteDiff::Config.new(config_file, @dir)

  # Prepare cache.
  @cache = SiteDiff::Cache.new(
    create: options['cached'] != 'none',
    directory: @dir
  )
  @cache.write_tags << :before << :after

  # Crawl with Hydra to discover paths.
  hydra = Typhoeus::Hydra.new(
    max_concurrency: @config.setting(:concurrency)
  )
  @paths = {}
  @config.roots.each do |tag, url|
    Crawler.new(
      hydra,
      url,
      @config.setting(:interval),
      @config.setting(:whitelist),
      @config.setting(:blacklist),
      @config.setting(:depth),
      get_curl_opts(@config.settings),
      # Take the flag from the CLI options: @debug is never assigned in
      # this class, so passing it meant --debug was ignored while crawling.
      options[:debug]
    ) do |info|
      SiteDiff.log "Visited #{info.uri}, cached."
      after_crawl(tag, info)
    end
  end
  hydra.run

  # Write paths to a file. flatten.uniq removes duplicates even when only
  # one root was crawled (reduce(&:|) returns a lone list untouched).
  @paths = @paths.values.flatten.uniq.sort
  @config.paths_file_write(@paths)

  # Log output.
  file = Pathname.new(@dir) + Config::DEFAULT_PATHS_FILENAME
  SiteDiff.log ''
  SiteDiff.log "#{@paths.length} page(s) found."
  SiteDiff.log "Created #{file.expand_path}.", :success, 'done'
end
|
370
|
+
|
371
|
+
no_commands do
|
372
|
+
# Generates CURL options.
#
# Starts from UriWrapper::DEFAULT_CURL_OPTS and layers the 'curl_options'
# and 'curl_opts' entries of +options+ on top (the later one wins). The
# string values 'true' and 'false' are converted to booleans. When the
# insecure flag is set, SSL peer and host verification are switched off.
#
# +options+ is either the Thor options hash or the config settings hash.
# Returns a new hash; the defaults are not modified.
#
# TODO: This should be in the config class instead.
# TODO: Make all requests insecure and avoid custom curl-opts.
def get_curl_opts(options)
  # We do want string keys here
  bool_hash = { 'true' => true, 'false' => false }
  # Hash#merge is non-destructive, so no defensive copy is needed.
  curl_opts = UriWrapper::DEFAULT_CURL_OPTS
              .merge(options['curl_options'] || {})
              .merge(options['curl_opts'] || {})
  curl_opts.each { |k, v| curl_opts[k] = bool_hash.fetch(v, v) }
  # Accept both key forms: Thor options answer to :insecure, but the config
  # settings hash passed by store/crawl appears to be string-keyed.
  if options[:insecure] || options['insecure']
    curl_opts[:ssl_verifypeer] = false
    curl_opts[:ssl_verifyhost] = 0
  end
  curl_opts
end
|
390
|
+
|
391
|
+
##
# Ensures that the given directory exists.
#
# Creates +directory+ (and any missing parents), falling back to '.' when
# it is nil. Stores the path in @dir and returns it as a String.
def get_dir(directory)
  # Create the dir. Must go before cache initialization!
  path = Pathname.new(directory || '.')
  # mkpath is a no-op for a directory that already exists.
  path.mkpath
  # Keep @dir a String, matching the return value callers assign to it.
  @dir = path.to_s
end
|
399
|
+
|
400
|
+
##
# Processes a crawled path.
#
# Canonicalizes the crawled page's relative path, records it under +tag+
# in @paths and stores the fetched result in the cache for that tag.
def after_crawl(tag, info)
  path = UriWrapper.canonicalize(info.relative)

  # Register the path, creating the per-tag list on first use.
  @paths[tag] ||= []
  @paths[tag].push(path)

  result = info.read_result

  # Write result to applicable cache.
  @cache.set(tag, path, result)
  # If single-site, cache "after" as "before".
  @cache.set(:before, path, result) unless @config.roots[:before]

  # TODO: Restore application of rules.
  # @rules.handle_page(tag, res.content, info.document) if @rules && !res.error
end
|
88
419
|
end
|
89
420
|
end
|
90
421
|
end
|