sitediff 0.0.1 → 1.0.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +5 -5
- data/bin/sitediff +10 -4
- data/lib/sitediff.rb +179 -91
- data/lib/sitediff/cache.rb +106 -0
- data/lib/sitediff/cli.rb +391 -60
- data/lib/sitediff/config.rb +383 -37
- data/lib/sitediff/config/creator.rb +114 -0
- data/lib/sitediff/config/preset.rb +75 -0
- data/lib/sitediff/crawler.rb +131 -0
- data/lib/sitediff/diff.rb +57 -12
- data/lib/sitediff/exception.rb +5 -0
- data/lib/sitediff/fetch.rb +76 -0
- data/lib/sitediff/files/diff.html.erb +20 -2
- data/lib/sitediff/files/jquery.min.js +2 -0
- data/lib/sitediff/files/normalize.css +349 -0
- data/lib/sitediff/files/report.html.erb +144 -0
- data/lib/sitediff/files/sidebyside.html.erb +16 -0
- data/lib/sitediff/files/sitediff.css +236 -29
- data/lib/sitediff/files/sitediff.js +176 -0
- data/lib/sitediff/report.rb +238 -0
- data/lib/sitediff/result.rb +63 -26
- data/lib/sitediff/sanitize.rb +160 -141
- data/lib/sitediff/sanitize/dom_transform.rb +130 -0
- data/lib/sitediff/sanitize/regexp.rb +82 -0
- data/lib/sitediff/uriwrapper.rb +114 -35
- data/lib/sitediff/webserver.rb +94 -0
- data/lib/sitediff/webserver/resultserver.rb +134 -0
- metadata +103 -43
- data/lib/sitediff/files/html_report.html.erb +0 -47
- data/lib/sitediff/util/cache.rb +0 -32
- data/lib/sitediff/util/webserver.rb +0 -77
data/lib/sitediff/cli.rb
CHANGED
@@ -1,90 +1,421 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
require 'thor'
|
2
|
-
require 'sitediff
|
3
|
-
require 'sitediff/
|
4
|
-
require 'sitediff/
|
5
|
-
require '
|
6
|
-
require '
|
4
|
+
require 'sitediff'
|
5
|
+
require 'sitediff/cache'
|
6
|
+
require 'sitediff/config'
|
7
|
+
require 'sitediff/config/creator'
|
8
|
+
require 'sitediff/config/preset'
|
9
|
+
require 'sitediff/fetch'
|
10
|
+
require 'sitediff/webserver/resultserver'
|
7
11
|
|
8
12
|
class SiteDiff
|
13
|
+
# SiteDiff CLI.
|
14
|
+
# TODO: Use config.defaults to feed default values for sitediff.yaml params?
|
9
15
|
class Cli < Thor
|
16
|
+
class_option 'directory',
|
17
|
+
type: :string,
|
18
|
+
aliases: '-C',
|
19
|
+
default: 'sitediff',
|
20
|
+
desc: 'Configuration directory'
|
21
|
+
class_option :debug,
|
22
|
+
type: :boolean,
|
23
|
+
aliases: '-d',
|
24
|
+
default: false,
|
25
|
+
desc: 'Stop on certain errors and produce error trace backs.'
|
26
|
+
class_option 'verbose',
|
27
|
+
type: :boolean,
|
28
|
+
aliases: '-v',
|
29
|
+
default: false,
|
30
|
+
desc: 'Show verbose output in terminal'
|
31
|
+
|
32
|
+
# Command aliases.
|
33
|
+
map recrawl: :crawl
|
34
|
+
|
10
35
|
# Thor, by default, exits with 0 no matter what!
|
11
36
|
def self.exit_on_failure?
|
12
37
|
true
|
13
38
|
end
|
14
39
|
|
15
40
|
# Thor, by default, does not raise an error for use of unknown options.
|
16
|
-
def self.check_unknown_options?(
|
41
|
+
def self.check_unknown_options?(_config)
|
17
42
|
true
|
18
43
|
end
|
19
44
|
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
45
|
+
desc 'version', 'Show version information'
|
46
|
+
##
# Prints the SiteDiff CLI version.
#
# With the verbose option set, the gemspec's homepage and its
# "source_code_uri" metadata entry are printed as well, each on its own
# line. A field that the gemspec does not declare is skipped.
def version
  gemspec = SiteDiff.gemspec
  lines = ["Sitediff CLI #{gemspec.version}"]
  if options[:verbose]
    homepage = gemspec.homepage
    source_uri = (gemspec.metadata || {})['source_code_uri']
    # Interpolate instead of concatenating: `String#+` raises TypeError
    # when the right-hand side is nil.
    lines << "Website: #{homepage}" if homepage
    lines << "GitHub: #{source_uri}" if source_uri
  end
  puts lines.join("\n")
end
|
58
|
+
|
59
|
+
option 'paths-file',
|
60
|
+
type: :string,
|
61
|
+
desc: 'Paths are read (one at a line) from PATHS: ' \
|
62
|
+
'useful for iterating over sanitization rules',
|
63
|
+
aliases: '--paths-from-file'
|
24
64
|
option 'paths',
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
:aliases => '--paths-from-file'
|
65
|
+
type: :array,
|
66
|
+
aliases: '-p',
|
67
|
+
desc: 'Specific path or paths to fetch'
|
29
68
|
option 'before',
|
30
|
-
|
31
|
-
|
32
|
-
|
69
|
+
type: :string,
|
70
|
+
desc: 'URL to the "before" site, prefixed to all paths.',
|
71
|
+
aliases: '--before-url'
|
33
72
|
option 'after',
|
34
|
-
|
35
|
-
|
36
|
-
|
73
|
+
type: :string,
|
74
|
+
desc: 'URL to the "after" site, prefixed to all paths.',
|
75
|
+
aliases: '--after-url'
|
76
|
+
option 'report-format',
|
77
|
+
type: :string,
|
78
|
+
enum: %w[html json],
|
79
|
+
default: 'html',
|
80
|
+
desc: 'The format in which a report should be generated.'
|
81
|
+
# TODO: Deprecate the parameters before-report / after-report?
|
37
82
|
option 'before-report',
|
38
|
-
|
39
|
-
|
40
|
-
|
83
|
+
type: :string,
|
84
|
+
desc: 'URL to use in reports. Useful if port forwarding.',
|
85
|
+
aliases: '--before-url-report'
|
41
86
|
option 'after-report',
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
option '
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
87
|
+
type: :string,
|
88
|
+
desc: 'URL to use in reports. Useful if port forwarding.',
|
89
|
+
aliases: '--after-url-report'
|
90
|
+
option 'cached',
|
91
|
+
type: :string,
|
92
|
+
enum: %w[none all before after],
|
93
|
+
default: 'before',
|
94
|
+
desc: 'Use the cached version of these sites, if available.'
|
95
|
+
option 'ignore-whitespace',
|
96
|
+
type: :boolean,
|
97
|
+
default: false,
|
98
|
+
aliases: '-w',
|
99
|
+
desc: 'Ignore changes in whitespace.'
|
100
|
+
option 'export',
|
101
|
+
type: :boolean,
|
102
|
+
default: false,
|
103
|
+
aliases: '-e',
|
104
|
+
desc: 'Export report to files. This option forces HTML format.'
|
105
|
+
desc 'diff [OPTIONS] [CONFIG-FILE]',
|
106
|
+
'Compute diffs on configured URLs.'
|
107
|
+
##
# Computes diffs.
#
# Loads the configuration (optionally from +config_file+) out of the
# --directory, applies the command-line overrides, runs SiteDiff and
# generates the requested report.
#
# Exit status: -1 when both --paths and --paths-file are given; otherwise
# 2 when +sitediff.run+ reports a positive number of failures and 0 when it
# does not. Configuration errors are logged and the method returns without
# calling +exit+.
def diff(config_file = nil)
  @dir = get_dir(options['directory'])
  config = SiteDiff::Config.new(config_file, @dir)

  explicit_paths = options['paths']

  # --paths and --paths-file are mutually exclusive.
  if explicit_paths && options['paths-file']
    SiteDiff.log "Can't specify both --paths-file and --paths.", :error
    exit(-1)
  end

  # The whitespace flag is only applied when it is set on the command line.
  config.ignore_whitespace = options['ignore-whitespace'] if options['ignore-whitespace']
  config.export = options['export']

  if explicit_paths
    # Paths given on the command line override everything else.
    config.paths = explicit_paths
  else
    # Otherwise read them from --paths-file, falling back to the default
    # paths file inside the configuration directory.
    paths_file = File.expand_path(
      options['paths-file'] || File.join(@dir, Config::DEFAULT_PATHS_FILENAME)
    )
    paths_count = config.paths_file_read(paths_file)
    SiteDiff.log "Read #{paths_count} paths from: #{paths_file}"
  end

  # TODO: Why do we allow before and after override during diff?
  config.before['url'] = options['before'] if options['before']
  config.after['url'] = options['after'] if options['after']

  # Prepare cache.
  cached = options['cached']
  cache = SiteDiff::Cache.new(create: cached != 'none', directory: @dir)
  cache.read_tags << :before if %w[before all].include?(cached)
  cache.read_tags << :after if %w[after all].include?(cached)
  cache.write_tags << :before << :after

  # Run sitediff.
  sitediff = SiteDiff.new(config, cache, options['verbose'], options[:debug])
  num_failing = sitediff.run
  exit_code = num_failing.positive? ? 2 : 0

  # Generate HTML report (also whenever exporting).
  report_format = options['report-format']
  if report_format == 'html' || config.export
    sitediff.report.generate_html(
      @dir,
      options['before-report'],
      options['after-report']
    )
  end

  # Generate JSON report.
  sitediff.report.generate_json(@dir) if report_format == 'json' && config.export == false

  SiteDiff.log 'Run "sitediff serve" to see a report.' unless options['export']
rescue Config::InvalidConfig, Config::ConfigNotFound => e
  SiteDiff.log "Invalid configuration: #{e.message}", :error
  SiteDiff.log e.backtrace, :error if options[:verbose]
else # no exception was raised
  # Thor::Error  --> exit(1), guaranteed by exit_on_failure?
  # Failing diff --> exit(2), populated above
  exit(exit_code)
end
|
74
187
|
|
75
188
|
option :port,
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
option :
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
|
189
|
+
type: :numeric,
|
190
|
+
default: SiteDiff::Webserver::DEFAULT_PORT,
|
191
|
+
desc: 'The port to serve on'
|
192
|
+
option :browse,
|
193
|
+
type: :boolean,
|
194
|
+
default: true,
|
195
|
+
desc: 'Whether to open the served content in your browser'
|
196
|
+
desc 'serve [OPTIONS] [CONFIG-FILE]',
|
197
|
+
'Serve SiteDiff report directory over HTTP.'
|
198
|
+
##
# Serves SiteDiff report for accessing in the browser.
#
# Builds a result server for the --directory on the requested port and
# waits on it. A SiteDiffException is logged (with its backtrace when the
# verbose option is set) instead of being raised.
def serve(config_file = nil)
  @dir = get_dir(options['directory'])
  config = SiteDiff::Config.new(config_file, @dir)

  # Add both :before and :after to the cache's read tags.
  cache = Cache.new(directory: @dir)
  cache.read_tags << :before << :after

  server = SiteDiff::Webserver::ResultServer.new(
    options[:port],
    options['directory'],
    browse: options[:browse],
    cache: cache,
    config: config
  )
  server.wait
rescue SiteDiffException => e
  SiteDiff.log e.message, :error
  SiteDiff.log e.backtrace, :error if options[:verbose]
end
|
218
|
+
|
219
|
+
option :depth,
|
220
|
+
type: :numeric,
|
221
|
+
default: Config::DEFAULT_CONFIG['settings']['depth'],
|
222
|
+
desc: 'How deeply to crawl the given site'
|
223
|
+
option :crawl,
|
224
|
+
type: :boolean,
|
225
|
+
default: true,
|
226
|
+
desc: 'Run "sitediff crawl" to discover paths.'
|
227
|
+
option :preset,
|
228
|
+
type: :string,
|
229
|
+
enum: Config::Preset.all,
|
230
|
+
desc: 'Framework-specific presets to apply.'
|
231
|
+
option :concurrency,
|
232
|
+
type: :numeric,
|
233
|
+
default: Config::DEFAULT_CONFIG['settings']['concurrency'],
|
234
|
+
desc: 'Max number of concurrent connections made.'
|
235
|
+
option :interval,
|
236
|
+
type: :numeric,
|
237
|
+
default: Config::DEFAULT_CONFIG['settings']['interval'],
|
238
|
+
desc: 'Crawling delay - interval in milliseconds.'
|
239
|
+
option :whitelist,
|
240
|
+
type: :string,
|
241
|
+
default: Config::DEFAULT_CONFIG['settings']['whitelist'],
|
242
|
+
desc: 'Optional whitelist for crawling.'
|
243
|
+
option :blacklist,
|
244
|
+
type: :string,
|
245
|
+
default: Config::DEFAULT_CONFIG['settings']['blacklist'],
|
246
|
+
desc: 'Optional blacklist for crawling.'
|
247
|
+
# TODO: Remove this option. Always ignore SSL errors.
|
248
|
+
option :insecure,
|
249
|
+
type: :boolean,
|
250
|
+
default: false,
|
251
|
+
desc: 'Ignore many HTTPS/SSL errors'
|
252
|
+
option :curl_options,
|
253
|
+
type: :hash,
|
254
|
+
default: {},
|
255
|
+
desc: 'Options to be passed to curl'
|
256
|
+
desc 'init URL [URL]', 'Create a sitediff configuration.'
|
257
|
+
##
# Initializes a sitediff (yaml) configuration file.
#
# Expects one or two URLs; any other count is logged as an error and the
# process exits with status 2. After the configuration is created, paths
# are discovered through +crawl+ when the crawl option is enabled.
def init(*urls)
  if urls.empty? || urls.size > 2
    SiteDiff.log 'sitediff init requires one or two URLs', :error
    exit(2)
  end

  # Prepare a config object and write it to the file system.
  @dir = get_dir(options['directory'])
  creator = SiteDiff::Config::Creator.new(options[:debug], *urls)
  settings = {
    depth: options[:depth],
    directory: @dir,
    concurrency: options[:concurrency],
    interval: options[:interval],
    whitelist: Config.create_regexp(options['whitelist']),
    blacklist: Config.create_regexp(options['blacklist']),
    preset: options[:preset],
    curl_opts: get_curl_opts(options)
  }
  creator.create(**settings)
  SiteDiff.log "Created #{creator.config_file.expand_path}", :success

  # Without crawling, just tell the user what to do next.
  unless options[:crawl]
    return SiteDiff.log(
      'Run "sitediff crawl" to discover paths. You should then be able to run "sitediff diff".',
      :info
    )
  end

  # Discover paths.
  crawl(creator.config_file)
  SiteDiff.log 'You can now run "sitediff diff".', :success
end
|
288
|
+
|
289
|
+
option :url,
|
290
|
+
type: :string,
|
291
|
+
desc: 'A custom base URL to fetch from'
|
292
|
+
desc 'store [CONFIG-FILE]',
|
293
|
+
'Cache the current contents of a site for later comparison.'
|
294
|
+
##
# Caches the current version of the site.
#
# Fetches every configured path from the --url option (or, when it is not
# given, from the configured "after" URL) and writes the responses to the
# cache under the :before tag.
def store(config_file = nil)
  @dir = get_dir(options['directory'])
  config = SiteDiff::Config.new(config_file, @dir)
  # TODO: Figure out how to remove this config.validate call.
  config.validate(need_before: false)
  config.paths_file_read

  cache = SiteDiff::Cache.new(directory: @dir, create: true)
  cache.write_tags << :before

  source_url = options[:url] || config.after['url']
  fetcher = SiteDiff::Fetch.new(
    cache,
    config.paths,
    config.setting(:interval),
    config.setting(:concurrency),
    get_curl_opts(config.settings),
    options[:debug],
    before: source_url
  )
  fetcher.run { |path, _res| SiteDiff.log "Visited #{path}, cached" }
end
|
318
|
+
|
319
|
+
desc 'crawl [CONFIG-FILE]',
|
320
|
+
'Crawl the "before" site to discover paths.'
|
321
|
+
##
# Crawls each configured root URL to determine "paths".
#
# Every visited page is handed to +after_crawl+, which records its path and
# caches the response. The collected paths are then de-duplicated, sorted
# and written to the paths file through the config object.
#
# TODO: Move actual crawling to sitediff.crawl(config).
# TODO: Switch to paths = sitediff.crawl().
def crawl(config_file = nil)
  # Prepare configuration.
  @dir = get_dir(options['directory'])
  @config = SiteDiff::Config.new(config_file, @dir)

  # Prepare cache.
  @cache = SiteDiff::Cache.new(
    create: options['cached'] != 'none',
    directory: @dir
  )
  @cache.write_tags << :before << :after

  # Crawl with Hydra to discover paths.
  hydra = Typhoeus::Hydra.new(
    max_concurrency: @config.setting(:concurrency)
  )
  @paths = {}
  @config.roots.each do |tag, url|
    Crawler.new(
      hydra,
      url,
      @config.setting(:interval),
      @config.setting(:whitelist),
      @config.setting(:blacklist),
      @config.setting(:depth),
      get_curl_opts(@config.settings),
      # Forward the --debug flag. This used to pass @debug, which is never
      # assigned in this class and therefore was always nil.
      options[:debug]
    ) do |info|
      SiteDiff.log "Visited #{info.uri}, cached."
      after_crawl(tag, info)
    end
  end
  hydra.run

  # Write paths to a file. Seeding the union with [] de-duplicates even
  # when only one root was crawled and also covers the "nothing found" case.
  @paths = @paths.values.reduce([], &:|).sort
  @config.paths_file_write(@paths)

  # Log output.
  file = Pathname.new(@dir) + Config::DEFAULT_PATHS_FILENAME
  SiteDiff.log ''
  SiteDiff.log "#{@paths.length} page(s) found."
  SiteDiff.log "Created #{file.expand_path}.", :success, 'done'
end
|
370
|
+
|
371
|
+
no_commands do
|
372
|
+
# Generates CURL options.
#
# Starts from UriWrapper::DEFAULT_CURL_OPTS, layers options['curl_options']
# and then options['curl_opts'] on top, replaces the string values 'true'
# and 'false' with real booleans and, when options[:insecure] is set, turns
# off SSL peer and host verification.
#
# TODO: This should be in the config class instead.
# TODO: Make all requests insecure and avoid custom curl-opts.
def get_curl_opts(options)
  # We do want string keys here
  bool_hash = { 'true' => true, 'false' => false }

  # Hash#merge returns a new hash, so the defaults are left untouched.
  overrides = options['curl_options'] || {}
  more_overrides = options['curl_opts'] || {}
  curl_opts = UriWrapper::DEFAULT_CURL_OPTS.clone.merge(overrides).merge(more_overrides)

  # Only existing keys are reassigned, which is allowed during iteration.
  curl_opts.each_pair do |name, raw|
    curl_opts[name] = bool_hash.fetch(raw) { raw }
  end

  if options[:insecure]
    curl_opts[:ssl_verifypeer] = false
    curl_opts[:ssl_verifyhost] = 0
  end
  curl_opts
end
|
390
|
+
|
391
|
+
##
# Ensures that the given directory exists.
#
# Falls back to the current directory ('.') when +directory+ is nil, and
# creates the directory (including missing parents) when it does not exist.
#
# Returns the directory path as a String. The same String is stored in
# @dir, so the instance variable always holds the type callers assign to it.
def get_dir(directory)
  # Create the dir. Must go before cache initialization!
  path = Pathname.new(directory || '.')
  path.mkpath unless path.directory?
  @dir = path.to_s
end
|
399
|
+
|
400
|
+
##
# Processes a crawled path.
#
# Records the canonicalized path under +tag+ in @paths and stores the
# page's result in the cache for that tag.
def after_crawl(tag, info)
  path = UriWrapper.canonicalize(info.relative)

  # Register the path, creating the per-tag list on first use.
  (@paths[tag] ||= []) << path

  result = info.read_result

  # Write result to applicable cache.
  @cache.set(tag, path, result)
  # If single-site, cache "after" as "before".
  @cache.set(:before, path, result) unless @config.roots[:before]

  # TODO: Restore application of rules.
  # @rules.handle_page(tag, res.content, info.document) if @rules && !res.error
end
|
88
419
|
end
|
89
420
|
end
|
90
421
|
end
|