sitediff 0.0.1 → 1.0.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,90 +1,421 @@
1
+ # frozen_string_literal: true
2
+
1
3
  require 'thor'
2
- require 'sitediff/diff'
3
- require 'sitediff/sanitize'
4
- require 'sitediff/util/webserver'
5
- require 'open-uri'
6
- require 'uri'
4
+ require 'sitediff'
5
+ require 'sitediff/cache'
6
+ require 'sitediff/config'
7
+ require 'sitediff/config/creator'
8
+ require 'sitediff/config/preset'
9
+ require 'sitediff/fetch'
10
+ require 'sitediff/webserver/resultserver'
7
11
 
8
12
  class SiteDiff
13
+ # SiteDiff CLI.
14
+ # TODO: Use config.defaults to feed default values for sitediff.yaml params?
9
15
  class Cli < Thor
16
+ class_option 'directory',
17
+ type: :string,
18
+ aliases: '-C',
19
+ default: 'sitediff',
20
+ desc: 'Configuration directory'
21
+ class_option :debug,
22
+ type: :boolean,
23
+ aliases: '-d',
24
+ default: false,
25
+ desc: 'Stop on certain errors and produce error trace backs.'
26
+ class_option 'verbose',
27
+ type: :boolean,
28
+ aliases: '-v',
29
+ default: false,
30
+ desc: 'Show verbose output in terminal'
31
+
32
+ # Command aliases.
33
+ map recrawl: :crawl
34
+
10
35
  # Thor, by default, exits with 0 no matter what!
11
36
  def self.exit_on_failure?
12
37
  true
13
38
  end
14
39
 
15
40
  # Thor, by default, does not raise an error for use of unknown options.
16
- def self.check_unknown_options?(config)
41
# Opts in to Thor raising an error when an unknown option is used
# (by default Thor does not).
#
# @param _config ignored.
# @return [Boolean] always true.
def self.check_unknown_options?(_config)
  true
end
19
44
 
20
- option 'dump-dir',
21
- :type => :string,
22
- :default => File.join('.', 'output'),
23
- :desc => "Location to write the output to."
45
+ desc 'version', 'Show version information'
46
##
# Show version information.
#
# Prints "Sitediff CLI <version>" to standard output. With the "verbose"
# option, the gemspec homepage and its "source_code_uri" metadata entry are
# printed on two further lines.
#
# Values are interpolated rather than concatenated with String#+, so a
# gemspec without a homepage or without "source_code_uri" prints an empty
# value instead of raising TypeError.
def version
  gemspec = SiteDiff.gemspec
  lines = ["Sitediff CLI #{gemspec.version}"]
  if options[:verbose]
    lines << "Website: #{gemspec.homepage}"
    lines << "GitHub: #{gemspec.metadata['source_code_uri']}"
  end
  puts lines.join("\n")
end
58
+
59
+ option 'paths-file',
60
+ type: :string,
61
+ desc: 'Paths are read (one at a line) from PATHS: ' \
62
+ 'useful for iterating over sanitization rules',
63
+ aliases: '--paths-from-file'
24
64
  option 'paths',
25
- :type => :string,
26
- :desc => 'Paths are read (one at a line) from PATHS: ' +
27
- 'useful for iterating over sanitization rules',
28
- :aliases => '--paths-from-file'
65
+ type: :array,
66
+ aliases: '-p',
67
+ desc: 'Specific path or paths to fetch'
29
68
  option 'before',
30
- :type => :string,
31
- :desc => "URL used to fetch the before HTML. Acts as a prefix to specified paths",
32
- :aliases => '--before-url'
69
+ type: :string,
70
+ desc: 'URL to the "before" site, prefixed to all paths.',
71
+ aliases: '--before-url'
33
72
  option 'after',
34
- :type => :string,
35
- :desc => "URL used to fetch the after HTML. Acts as a prefix to specified paths.",
36
- :aliases => '--after-url'
73
+ type: :string,
74
+ desc: 'URL to the "after" site, prefixed to all paths.',
75
+ aliases: '--after-url'
76
+ option 'report-format',
77
+ type: :string,
78
+ enum: %w[html json],
79
+ default: 'html',
80
+ desc: 'The format in which a report should be generated.'
81
+ # TODO: Deprecate the parameters before-report / after-report?
37
82
  option 'before-report',
38
- :type => :string,
39
- :desc => "Before URL to use for reporting purposes. Useful if port forwarding.",
40
- :aliases => '--before-url-report'
83
+ type: :string,
84
+ desc: 'URL to use in reports. Useful if port forwarding.',
85
+ aliases: '--before-url-report'
41
86
  option 'after-report',
42
- :type => :string,
43
- :desc => "After URL to use for reporting purposes. Useful if port forwarding.",
44
- :aliases => '--after-url-report'
45
- option 'cache',
46
- :type => :string,
47
- :desc => "Filename to use for caching requests.",
48
- :lazy_default => 'cache.db'
49
- desc "diff [OPTIONS] [CONFIGFILES]", "Perform systematic diff on given URLs"
50
- def diff(*config_files)
51
- config = SiteDiff::Config.new(config_files)
52
-
53
- # override config based on options
54
- if paths_file = options['paths']
55
- unless File.exists? paths_file
56
- raise Config::InvalidConfig,
57
- "Paths file '#{paths_file}' not found!"
58
- end
59
- SiteDiff::log "Reading paths from: #{paths_file}"
60
- config.paths = File.readlines(paths_file)
87
+ type: :string,
88
+ desc: 'URL to use in reports. Useful if port forwarding.',
89
+ aliases: '--after-url-report'
90
+ option 'cached',
91
+ type: :string,
92
+ enum: %w[none all before after],
93
+ default: 'before',
94
+ desc: 'Use the cached version of these sites, if available.'
95
+ option 'ignore-whitespace',
96
+ type: :boolean,
97
+ default: false,
98
+ aliases: '-w',
99
+ desc: 'Ignore changes in whitespace.'
100
+ option 'export',
101
+ type: :boolean,
102
+ default: false,
103
+ aliases: '-e',
104
+ desc: 'Export report to files. This option forces HTML format.'
105
+ desc 'diff [OPTIONS] [CONFIG-FILE]',
106
+ 'Compute diffs on configured URLs.'
107
##
# Computes diffs.
#
# Loads the configuration, applies the command-line overrides (paths,
# whitespace handling, export, before/after URLs), prepares the cache, runs
# SiteDiff and generates an HTML or JSON report in the working directory.
#
# Exit behaviour:
#   exit(0)  - sitediff.run reported no failures
#   exit(2)  - sitediff.run reported at least one failure
#   exit(1)  - the configuration was invalid or could not be found
#   exit(-1) - both --paths and --paths-file were given
#
# @param config_file [String, nil] passed to SiteDiff::Config.new together
#   with the working directory.
def diff(config_file = nil)
  @dir = get_dir(options['directory'])
  config = SiteDiff::Config.new(config_file, @dir)

  # Determine "paths" override based on options.
  if options['paths'] && options['paths-file']
    SiteDiff.log "Can't specify both --paths-file and --paths.", :error
    exit(-1)
  end

  # Ignore whitespace option.
  config.ignore_whitespace = options['ignore-whitespace'] if options['ignore-whitespace']

  # Export report option.
  config.export = options['export']

  if options['paths']
    # Apply "paths" override.
    config.paths = options['paths']
  else
    # No "paths" given: read them from "paths-file", falling back to the
    # default paths file inside the working directory.
    paths_file = options['paths-file']
    paths_file ||= File.join(@dir, Config::DEFAULT_PATHS_FILENAME)
    paths_file = File.expand_path(paths_file)

    paths_count = config.paths_file_read(paths_file)
    SiteDiff.log "Read #{paths_count} paths from: #{paths_file}"
  end

  # TODO: Why do we allow before and after override during diff?
  config.before['url'] = options['before'] if options['before']
  config.after['url'] = options['after'] if options['after']

  # Prepare cache.
  cache = SiteDiff::Cache.new(
    create: options['cached'] != 'none',
    directory: @dir
  )
  cache.read_tags << :before if %w[before all].include?(options['cached'])
  cache.read_tags << :after if %w[after all].include?(options['cached'])
  cache.write_tags << :before << :after

  # Run sitediff.
  sitediff = SiteDiff.new(
    config,
    cache,
    options['verbose'],
    options[:debug]
  )
  num_failing = sitediff.run
  exit_code = num_failing.positive? ? 2 : 0

  # Generate HTML report.
  if options['report-format'] == 'html' || config.export
    sitediff.report.generate_html(
      @dir,
      options['before-report'],
      options['after-report']
    )
  end

  # Generate JSON report.
  if options['report-format'] == 'json' && config.export == false
    sitediff.report.generate_json @dir
  end

  SiteDiff.log 'Run "sitediff serve" to see a report.' unless options['export']
rescue Config::InvalidConfig, Config::ConfigNotFound => e
  SiteDiff.log "Invalid configuration: #{e.message}", :error
  SiteDiff.log e.backtrace, :error if options[:verbose]
  # A configuration error is a failure: do not fall through to an implicit
  # exit status of 0.
  exit(1)
else # no exception was raised
  # Thor::Error  --> exit(1), guaranteed by exit_on_failure?
  # Failing diff --> exit(2), populated above
  exit(exit_code)
end
74
187
 
75
188
  option :port,
76
- :type => :numeric,
77
- :default => SiteDiff::Util::Webserver::DEFAULT_PORT,
78
- :desc => 'The port to serve on'
79
- option :directory,
80
- :type => :string,
81
- :default => 'output',
82
- :desc => 'The directory to serve',
83
- :aliases => '--dump-dir'
84
- desc "serve [OPTIONS]", "Serve the sitediff output directory over HTTP"
85
- def serve
86
- SiteDiff::Util::Webserver.serve(options[:port], options[:directory],
87
- :announce => true).wait
189
+ type: :numeric,
190
+ default: SiteDiff::Webserver::DEFAULT_PORT,
191
+ desc: 'The port to serve on'
192
+ option :browse,
193
+ type: :boolean,
194
+ default: true,
195
+ desc: 'Whether to open the served content in your browser'
196
+ desc 'serve [OPTIONS] [CONFIG-FILE]',
197
+ 'Serve SiteDiff report directory over HTTP.'
198
##
# Serves the SiteDiff report over HTTP so it can be viewed in a browser.
#
# Builds a cache that reads both the :before and :after tags, starts a
# ResultServer on the configured port and waits on it. A SiteDiffException
# is logged as an error (with its backtrace when "verbose" is set).
#
# @param config_file [String, nil] passed to SiteDiff::Config.new together
#   with the working directory.
def serve(config_file = nil)
  @dir = get_dir(options['directory'])
  config = SiteDiff::Config.new(config_file, @dir)

  cache = Cache.new(directory: @dir)
  cache.read_tags << :before << :after

  server = SiteDiff::Webserver::ResultServer.new(
    options[:port],
    options['directory'],
    browse: options[:browse],
    cache: cache,
    config: config
  )
  server.wait
rescue SiteDiffException => e
  SiteDiff.log e.message, :error
  SiteDiff.log e.backtrace, :error if options[:verbose]
end
218
+
219
+ option :depth,
220
+ type: :numeric,
221
+ default: Config::DEFAULT_CONFIG['settings']['depth'],
222
+ desc: 'How deeply to crawl the given site'
223
+ option :crawl,
224
+ type: :boolean,
225
+ default: true,
226
+ desc: 'Run "sitediff crawl" to discover paths.'
227
+ option :preset,
228
+ type: :string,
229
+ enum: Config::Preset.all,
230
+ desc: 'Framework-specific presets to apply.'
231
+ option :concurrency,
232
+ type: :numeric,
233
+ default: Config::DEFAULT_CONFIG['settings']['concurrency'],
234
+ desc: 'Max number of concurrent connections made.'
235
+ option :interval,
236
+ type: :numeric,
237
+ default: Config::DEFAULT_CONFIG['settings']['interval'],
238
+ desc: 'Crawling delay - interval in milliseconds.'
239
+ option :whitelist,
240
+ type: :string,
241
+ default: Config::DEFAULT_CONFIG['settings']['whitelist'],
242
+ desc: 'Optional whitelist for crawling.'
243
+ option :blacklist,
244
+ type: :string,
245
+ default: Config::DEFAULT_CONFIG['settings']['blacklist'],
246
+ desc: 'Optional blacklist for crawling.'
247
+ # TODO: Remove this option. Always ignore SSL errors.
248
+ option :insecure,
249
+ type: :boolean,
250
+ default: false,
251
+ desc: 'Ignore many HTTPS/SSL errors'
252
+ option :curl_options,
253
+ type: :hash,
254
+ default: {},
255
+ desc: 'Options to be passed to curl'
256
+ desc 'init URL [URL]', 'Create a sitediff configuration.'
257
##
# Initializes a sitediff (yaml) configuration file.
#
# Requires one or two URLs; any other count logs an error and calls exit(2).
# After the configuration is written, paths are discovered through #crawl
# unless the "crawl" option is disabled.
#
# @param urls [Array<String>] one or two URLs handed to Config::Creator.
def init(*urls)
  unless urls.size.between?(1, 2)
    SiteDiff.log 'sitediff init requires one or two URLs', :error
    exit(2)
  end

  # Prepare a config object and write it to the file system.
  @dir = get_dir(options['directory'])
  creator = SiteDiff::Config::Creator.new(options[:debug], *urls)
  creator.create(
    depth: options[:depth],
    directory: @dir,
    concurrency: options[:concurrency],
    interval: options[:interval],
    whitelist: Config.create_regexp(options['whitelist']),
    blacklist: Config.create_regexp(options['blacklist']),
    preset: options[:preset],
    curl_opts: get_curl_opts(options)
  )
  SiteDiff.log "Created #{creator.config_file.expand_path}", :success

  # Crawling disabled: just tell the user what to do next.
  unless options[:crawl]
    return SiteDiff.log(
      'Run "sitediff crawl" to discover paths. You should then be able to run "sitediff diff".',
      :info
    )
  end

  # Discover paths.
  crawl(creator.config_file)
  SiteDiff.log 'You can now run "sitediff diff".', :success
end
288
+
289
+ option :url,
290
+ type: :string,
291
+ desc: 'A custom base URL to fetch from'
292
+ desc 'store [CONFIG-FILE]',
293
+ 'Cache the current contents of a site for later comparison.'
294
##
# Caches the current version of the site.
#
# Fetches every configured path from a base URL (the "url" option, or the
# configured "after" URL when the option is absent) and writes the results
# to the cache under the :before tag.
#
# @param config_file [String, nil] passed to SiteDiff::Config.new together
#   with the working directory.
def store(config_file = nil)
  @dir = get_dir(options['directory'])
  config = SiteDiff::Config.new(config_file, @dir)
  # TODO: Figure out how to remove this config.validate call.
  config.validate(need_before: false)
  config.paths_file_read

  cache = SiteDiff::Cache.new(create: true, directory: @dir)
  cache.write_tags << :before

  base_url = options[:url] || config.after['url']
  paths = config.paths
  interval = config.setting(:interval)
  concurrency = config.setting(:concurrency)
  curl_opts = get_curl_opts(config.settings)

  fetcher = SiteDiff::Fetch.new(
    cache, paths, interval, concurrency, curl_opts, options[:debug],
    before: base_url
  )
  fetcher.run { |path, _res| SiteDiff.log "Visited #{path}, cached" }
end
318
+
319
+ desc 'crawl [CONFIG-FILE]',
320
+ 'Crawl the "before" site to discover paths.'
321
##
# Crawls every root URL in the configuration to determine "paths".
#
# Each visited page is handed to #after_crawl, which records its path and
# caches the result. When the crawl finishes, the sorted union of the paths
# found for all roots is written with Config#paths_file_write.
#
# The debug flag passed to each Crawler comes from the "debug" option, as in
# the other commands (the @debug instance variable is never assigned in this
# class, so it was always nil).
#
# TODO: Move actual crawling to sitediff.crawl(config).
# TODO: Switch to paths = sitediff.crawl().
#
# @param config_file [String, nil] passed to SiteDiff::Config.new together
#   with the working directory.
def crawl(config_file = nil)
  # Prepare configuration.
  @dir = get_dir(options['directory'])
  @config = SiteDiff::Config.new(config_file, @dir)

  # Prepare cache.
  @cache = SiteDiff::Cache.new(
    create: options['cached'] != 'none',
    directory: @dir
  )
  @cache.write_tags << :before << :after

  # Crawl with Hydra to discover paths.
  hydra = Typhoeus::Hydra.new(
    max_concurrency: @config.setting(:concurrency)
  )
  @paths = {}
  @config.roots.each do |tag, url|
    Crawler.new(
      hydra,
      url,
      @config.setting(:interval),
      @config.setting(:whitelist),
      @config.setting(:blacklist),
      @config.setting(:depth),
      get_curl_opts(@config.settings),
      options[:debug]
    ) do |info|
      SiteDiff.log "Visited #{info.uri}, cached."
      after_crawl(tag, info)
    end
  end
  hydra.run

  # Write paths to a file. reduce returns nil when nothing was crawled;
  # to_a turns that into an empty list.
  @paths = @paths.values.reduce(&:|).to_a.sort
  @config.paths_file_write(@paths)

  # Log output.
  file = Pathname.new(@dir) + Config::DEFAULT_PATHS_FILENAME
  SiteDiff.log ''
  SiteDiff.log "#{@paths.length} page(s) found."
  SiteDiff.log "Created #{file.expand_path}.", :success, 'done'
end
370
+
371
+ no_commands do
372
# Generates CURL options.
#
# Starts from a copy of UriWrapper::DEFAULT_CURL_OPTS, merges in
# options['curl_options'] and then options['curl_opts'], converts the string
# values 'true' / 'false' to booleans, and disables SSL peer/host
# verification when options[:insecure] is set.
#
# NOTE(review): callers pass both Thor options and config.settings here;
# whether the latter answers to the symbol key :insecure is not visible in
# this file - confirm.
#
# TODO: This should be in the config class instead.
# TODO: Make all requests insecure and avoid custom curl-opts.
#
# @param options [Hash] option source read with the keys named above.
# @return [Hash] the resulting curl options.
def get_curl_opts(options)
  # We do want string keys here
  string_booleans = { 'true' => true, 'false' => false }

  overrides = [options['curl_options'], options['curl_opts']]
  merged = overrides.reduce(UriWrapper::DEFAULT_CURL_OPTS.clone) do |acc, extra|
    acc.merge(extra || {})
  end

  # Only existing keys are reassigned, which is safe while iterating.
  merged.each_key do |key|
    value = merged[key]
    merged[key] = string_booleans.fetch(value, value)
  end

  return merged unless options[:insecure]

  merged[:ssl_verifypeer] = false
  merged[:ssl_verifyhost] = 0
  merged
end
390
+
391
##
# Ensures that the given directory exists.
#
# The path is kept in a local variable: every caller assigns the returned
# String to @dir itself, so this helper no longer writes a Pathname into
# @dir as a side effect.
#
# @param directory [String, nil] directory path; nil means '.'.
# @return [String] the directory path.
def get_dir(directory)
  # Create the dir. Must go before cache initialization!
  dir = Pathname.new(directory || '.')
  dir.mkpath unless dir.directory?
  dir.to_s
end
399
+
400
##
# Processes a crawled path.
#
# Records the canonicalized path under the given tag in @paths and stores
# the page result in @cache for that tag. When the configuration has no
# :before root, the result is stored under :before as well.
#
# @param tag the root tag being crawled (a key of @config.roots).
# @param info crawl info; its #relative and #read_result are used.
def after_crawl(tag, info)
  path = UriWrapper.canonicalize(info.relative)

  # Register the path.
  (@paths[tag] ||= []) << path

  result = info.read_result

  # Write result to applicable cache.
  @cache.set(tag, path, result)
  # If single-site, cache "after" as "before".
  @cache.set(:before, path, result) unless @config.roots[:before]

  # TODO: Restore application of rules.
  # @rules.handle_page(tag, res.content, info.document) if @rules && !res.error
end
88
419
  end
89
420
  end
90
421
  end