sitediff 0.0.1 → 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,90 +1,421 @@
1
+ # frozen_string_literal: true
2
+
1
3
  require 'thor'
2
- require 'sitediff/diff'
3
- require 'sitediff/sanitize'
4
- require 'sitediff/util/webserver'
5
- require 'open-uri'
6
- require 'uri'
4
+ require 'sitediff'
5
+ require 'sitediff/cache'
6
+ require 'sitediff/config'
7
+ require 'sitediff/config/creator'
8
+ require 'sitediff/config/preset'
9
+ require 'sitediff/fetch'
10
+ require 'sitediff/webserver/resultserver'
7
11
 
8
12
  class SiteDiff
13
+ # SiteDiff CLI.
14
+ # TODO: Use config.defaults to feed default values for sitediff.yaml params?
9
15
  class Cli < Thor
16
# Options accepted by every sitediff command.

# Working directory: the commands below pass it to get_dir and then hand the
# result to the configuration, cache and report code.
class_option 'directory',
             type: :string,
             aliases: '-C',
             default: 'sitediff',
             desc: 'Configuration directory'
# Forwarded by the commands to the objects that do the fetching/diffing.
class_option :debug,
             type: :boolean,
             aliases: '-d',
             default: false,
             desc: 'Stop on certain errors and produce error trace backs.'
# Besides extra terminal output, commands also print error backtraces
# when this flag is set.
class_option 'verbose',
             type: :boolean,
             aliases: '-v',
             default: false,
             desc: 'Show verbose output in terminal'

# Command aliases: "sitediff recrawl" runs the crawl command.
map recrawl: :crawl
34
+
10
35
  # Thor, by default, exits with 0 no matter what!
11
36
  def self.exit_on_failure?
12
37
  true
13
38
  end
14
39
 
15
40
  # Thor, by default, does not raise an error for use of unknown options.
  # Returning true turns that check on. The argument supplied by Thor is
  # not needed, hence the underscore-prefixed name.
  def self.check_unknown_options?(_config)
    true
  end
19
44
 
20
- option 'dump-dir',
21
- :type => :string,
22
- :default => File.join('.', 'output'),
23
- :desc => "Location to write the output to."
45
+ desc 'version', 'Show version information'
46
##
# Show version information.
#
# Prints "Sitediff CLI <version>" using the version from SiteDiff.gemspec.
# With --verbose the project website and source repository are listed as
# well. An entry is left out when the gemspec does not define it, instead of
# raising a TypeError while building the line.
def version
  gemspec = SiteDiff.gemspec
  output = ["Sitediff CLI #{gemspec.version}"]
  if options[:verbose]
    website = gemspec.homepage
    source = gemspec.metadata['source_code_uri']
    output << "Website: #{website}" if website
    output << "GitHub: #{source}" if source
  end
  puts output.join("\n")
end
58
+
59
option 'paths-file',
       type: :string,
       desc: 'Paths are read (one at a line) from PATHS: ' \
             'useful for iterating over sanitization rules',
       aliases: '--paths-from-file'
option 'paths',
       type: :array,
       aliases: '-p',
       desc: 'Specific path or paths to fetch'
option 'before',
       type: :string,
       desc: 'URL to the "before" site, prefixed to all paths.',
       aliases: '--before-url'
option 'after',
       type: :string,
       desc: 'URL to the "after" site, prefixed to all paths.',
       aliases: '--after-url'
option 'report-format',
       type: :string,
       enum: %w[html json],
       default: 'html',
       desc: 'The format in which a report should be generated.'
# TODO: Deprecate the parameters before-report / after-report?
option 'before-report',
       type: :string,
       desc: 'URL to use in reports. Useful if port forwarding.',
       aliases: '--before-url-report'
option 'after-report',
       type: :string,
       desc: 'URL to use in reports. Useful if port forwarding.',
       aliases: '--after-url-report'
option 'cached',
       type: :string,
       enum: %w[none all before after],
       default: 'before',
       desc: 'Use the cached version of these sites, if available.'
option 'ignore-whitespace',
       type: :boolean,
       default: false,
       aliases: '-w',
       desc: 'Ignore changes in whitespace.'
option 'export',
       type: :boolean,
       default: false,
       aliases: '-e',
       desc: 'Export report to files. This option forces HTML format.'
desc 'diff [OPTIONS] [CONFIG-FILE]',
     'Compute diffs on configured URLs.'
##
# Computes diffs.
#
# Loads the configuration, applies the command-line overrides, runs the
# comparison and writes an HTML or JSON report into the working directory.
#
# config_file - Optional path handed to SiteDiff::Config.new.
#
# Exit statuses:
#   0  - the run reported no failures.
#   2  - the run reported at least one failure (the value returned by
#        SiteDiff#run is treated as a failure count).
#   1  - the configuration was invalid or could not be found.
#   -1 - both --paths and --paths-file were given.
def diff(config_file = nil)
  @dir = get_dir(options['directory'])
  config = SiteDiff::Config.new(config_file, @dir)

  # Determine "paths" override based on options.
  if options['paths'] && options['paths-file']
    SiteDiff.log "Can't specify both --paths-file and --paths.", :error
    exit(-1)
  end

  # Ignore whitespace option. Only applied when the flag is set, so that a
  # value coming from the configuration file is not reset to false.
  config.ignore_whitespace = options['ignore-whitespace'] if options['ignore-whitespace']

  # Export report option.
  config.export = options['export']

  if options['paths']
    # Apply "paths" override.
    config.paths = options['paths']
  else
    # Otherwise read "paths-file", falling back to the default paths file
    # inside the working directory.
    paths_file = options['paths-file']
    paths_file ||= File.join(@dir, Config::DEFAULT_PATHS_FILENAME)
    paths_file = File.expand_path(paths_file)

    paths_count = config.paths_file_read(paths_file)
    SiteDiff.log "Read #{paths_count} paths from: #{paths_file}"
  end

  # TODO: Why do we allow before and after override during diff?
  config.before['url'] = options['before'] if options['before']
  config.after['url'] = options['after'] if options['after']

  # Prepare cache.
  cache = SiteDiff::Cache.new(
    create: options['cached'] != 'none',
    directory: @dir
  )
  cache.read_tags << :before if %w[before all].include?(options['cached'])
  cache.read_tags << :after if %w[after all].include?(options['cached'])
  cache.write_tags << :before << :after

  # Run sitediff.
  sitediff = SiteDiff.new(
    config,
    cache,
    options['verbose'],
    options[:debug]
  )
  num_failing = sitediff.run
  exit_code = num_failing.positive? ? 2 : 0

  # Generate HTML report. Exporting forces HTML regardless of the format.
  if options['report-format'] == 'html' || config.export
    sitediff.report.generate_html(
      @dir,
      options['before-report'],
      options['after-report']
    )
  end

  # Generate JSON report.
  if options['report-format'] == 'json' && config.export == false
    sitediff.report.generate_json @dir
  end

  SiteDiff.log 'Run "sitediff serve" to see a report.' unless options['export']
rescue Config::InvalidConfig, Config::ConfigNotFound => e
  SiteDiff.log "Invalid configuration: #{e.message}", :error
  SiteDiff.log e.backtrace, :error if options[:verbose]
  # A broken configuration must not be reported to the shell as success.
  exit(1)
else # no exception was raised
  # Thor::Error --> exit(1), guaranteed by exit_on_failure?
  # Failing diff --> exit(2), populated above
  exit(exit_code)
end
74
187
 
75
188
  option :port,
76
- :type => :numeric,
77
- :default => SiteDiff::Util::Webserver::DEFAULT_PORT,
78
- :desc => 'The port to serve on'
79
- option :directory,
80
- :type => :string,
81
- :default => 'output',
82
- :desc => 'The directory to serve',
83
- :aliases => '--dump-dir'
84
- desc "serve [OPTIONS]", "Serve the sitediff output directory over HTTP"
85
- def serve
86
- SiteDiff::Util::Webserver.serve(options[:port], options[:directory],
87
- :announce => true).wait
189
+ type: :numeric,
190
+ default: SiteDiff::Webserver::DEFAULT_PORT,
191
+ desc: 'The port to serve on'
192
+ option :browse,
193
+ type: :boolean,
194
+ default: true,
195
+ desc: 'Whether to open the served content in your browser'
196
+ desc 'serve [OPTIONS] [CONFIG-FILE]',
197
+ 'Serve SiteDiff report directory over HTTP.'
198
+ ##
199
+ # Serves SiteDiff report for accessing in the browser.
200
+ def serve(config_file = nil)
201
+ @dir = get_dir(options['directory'])
202
+ config = SiteDiff::Config.new(config_file, @dir)
203
+
204
+ cache = Cache.new(directory: @dir)
205
+ cache.read_tags << :before << :after
206
+
207
+ SiteDiff::Webserver::ResultServer.new(
208
+ options[:port],
209
+ options['directory'],
210
+ browse: options[:browse],
211
+ cache: cache,
212
+ config: config
213
+ ).wait
214
+ rescue SiteDiffException => e
215
+ SiteDiff.log e.message, :error
216
+ SiteDiff.log e.backtrace, :error if options[:verbose]
217
+ end
218
+
219
option :depth,
       type: :numeric,
       default: Config::DEFAULT_CONFIG['settings']['depth'],
       desc: 'How deeply to crawl the given site'
option :crawl,
       type: :boolean,
       default: true,
       desc: 'Run "sitediff crawl" to discover paths.'
option :preset,
       type: :string,
       enum: Config::Preset.all,
       desc: 'Framework-specific presets to apply.'
option :concurrency,
       type: :numeric,
       default: Config::DEFAULT_CONFIG['settings']['concurrency'],
       desc: 'Max number of concurrent connections made.'
option :interval,
       type: :numeric,
       default: Config::DEFAULT_CONFIG['settings']['interval'],
       desc: 'Crawling delay - interval in milliseconds.'
option :whitelist,
       type: :string,
       default: Config::DEFAULT_CONFIG['settings']['whitelist'],
       desc: 'Optional whitelist for crawling.'
option :blacklist,
       type: :string,
       default: Config::DEFAULT_CONFIG['settings']['blacklist'],
       desc: 'Optional blacklist for crawling.'
# TODO: Remove this option. Always ignore SSL errors.
option :insecure,
       type: :boolean,
       default: false,
       desc: 'Ignore many HTTPS/SSL errors'
option :curl_options,
       type: :hash,
       default: {},
       desc: 'Options to be passed to curl'
desc 'init URL [URL]', 'Create a sitediff configuration.'
##
# Initializes a sitediff (yaml) configuration file.
#
# urls - One or two site URLs; any other count logs an error and exits
#        with status 2.
#
# After the configuration has been written, the crawl command is run on it
# unless crawling was disabled with --no-crawl.
def init(*urls)
  unless urls.size.between?(1, 2)
    SiteDiff.log 'sitediff init requires one or two URLs', :error
    exit(2)
  end

  # Prepare a config object and write it to the file system.
  @dir = get_dir(options['directory'])
  creator = SiteDiff::Config::Creator.new(options[:debug], *urls)
  creator.create(
    depth: options[:depth],
    directory: @dir,
    concurrency: options[:concurrency],
    interval: options[:interval],
    whitelist: Config.create_regexp(options['whitelist']),
    blacklist: Config.create_regexp(options['blacklist']),
    preset: options[:preset],
    curl_opts: get_curl_opts(options)
  )
  created_path = creator.config_file.expand_path
  SiteDiff.log "Created #{created_path}", :success

  # Discover paths, if enabled.
  if options[:crawl]
    crawl(creator.config_file)
    SiteDiff.log 'You can now run "sitediff diff".', :success
  else
    SiteDiff.log 'Run "sitediff crawl" to discover paths. ' \
                 'You should then be able to run "sitediff diff".', :info
  end
end
288
+
289
option :url,
       type: :string,
       desc: 'A custom base URL to fetch from'
desc 'store [CONFIG-FILE]',
     'Cache the current contents of a site for later comparison.'
##
# Caches the current version of the site.
#
# Every configured path is fetched from the --url option or, when that is
# not given, from the configured "after" URL. The fetcher is handed that
# base under the "before" key and the cache is opened for writing the
# "before" tag.
#
# config_file - Optional path handed to SiteDiff::Config.new.
def store(config_file = nil)
  @dir = get_dir(options['directory'])
  config = SiteDiff::Config.new(config_file, @dir)
  # TODO: Figure out how to remove this config.validate call.
  config.validate(need_before: false)
  config.paths_file_read

  cache = SiteDiff::Cache.new(directory: @dir, create: true)
  cache.write_tags << :before

  source_url = options[:url] || config.after['url']
  fetcher = SiteDiff::Fetch.new(
    cache,
    config.paths,
    config.setting(:interval),
    config.setting(:concurrency),
    get_curl_opts(config.settings),
    options[:debug],
    before: source_url
  )
  fetcher.run { |path, _res| SiteDiff.log "Visited #{path}, cached" }
end
318
+
319
desc 'crawl [CONFIG-FILE]',
     'Crawl the "before" site to discover paths.'
##
# Crawls the configured site roots to determine "paths".
#
# A crawler is queued for each entry of the configuration's roots. Every
# visited page is cached through after_crawl, and the sorted, de-duplicated
# list of discovered paths is written to the paths file.
#
# config_file - Optional path handed to SiteDiff::Config.new.
#
# TODO: Move actual crawling to sitediff.crawl(config).
# TODO: Switch to paths = sitediff.crawl().
def crawl(config_file = nil)
  # Prepare configuration.
  @dir = get_dir(options['directory'])
  @config = SiteDiff::Config.new(config_file, @dir)

  # Prepare cache.
  @cache = SiteDiff::Cache.new(
    create: options['cached'] != 'none',
    directory: @dir
  )
  @cache.write_tags << :before << :after

  # Crawl with Hydra to discover paths.
  hydra = Typhoeus::Hydra.new(
    max_concurrency: @config.setting(:concurrency)
  )
  @paths = {}
  @config.roots.each do |tag, url|
    Crawler.new(
      hydra,
      url,
      @config.setting(:interval),
      @config.setting(:whitelist),
      @config.setting(:blacklist),
      @config.setting(:depth),
      get_curl_opts(@config.settings),
      # Forward the --debug flag, as init and store do; @debug is never
      # assigned in this class and was therefore always nil.
      options[:debug]
    ) do |info|
      SiteDiff.log "Visited #{info.uri}, cached."
      after_crawl(tag, info)
    end
  end
  hydra.run

  # Write paths to a file. flatten + uniq removes duplicates no matter how
  # many roots were crawled (a union only de-duplicates with two or more).
  @paths = @paths.values.flatten.uniq.sort
  @config.paths_file_write(@paths)

  # Log output.
  file = Pathname.new(@dir) + Config::DEFAULT_PATHS_FILENAME
  SiteDiff.log ''
  SiteDiff.log "#{@paths.length} page(s) found."
  SiteDiff.log "Created #{file.expand_path}.", :success, 'done'
end
370
+
371
+ no_commands do
372
# Generates CURL options.
#
# Starts from UriWrapper::DEFAULT_CURL_OPTS, layers the 'curl_options' and
# then the 'curl_opts' entries of +options+ on top, and turns the strings
# 'true' / 'false' into real booleans. When the insecure flag is set, SSL
# peer and host verification are switched off.
#
# options - Hash-like object. It is called both with Thor's options and
#           with config.settings, so the insecure flag is accepted under
#           either :insecure or 'insecure'.
#
# Returns a new Hash; the defaults constant is not modified.
#
# TODO: This should be in the config class instead.
# TODO: Make all requests insecure and avoid custom curl-opts.
def get_curl_opts(options)
  # We do want string keys here
  bool_hash = { 'true' => true, 'false' => false }
  # Hash#merge returns a fresh hash, so no explicit copy is needed.
  curl_opts = UriWrapper::DEFAULT_CURL_OPTS
              .merge(options['curl_options'] || {})
              .merge(options['curl_opts'] || {})
  curl_opts.each { |k, v| curl_opts[k] = bool_hash.fetch(v, v) }
  if options[:insecure] || options['insecure']
    curl_opts[:ssl_verifypeer] = false
    curl_opts[:ssl_verifyhost] = 0
  end
  curl_opts
end
390
+
391
##
# Ensures that the given directory exists.
#
# directory - Path of the directory; the current directory ('.') is used
#             when nil.
#
# Creates the directory (with any missing parents) when it is absent and
# returns its path as a String.
def get_dir(directory)
  # Create the dir. Must go before cache initialization!
  target = directory || '.'
  @dir = Pathname.new(target)
  @dir.directory? || @dir.mkpath
  @dir.to_s
end
399
+
400
##
# Processes a crawled path.
#
# tag  - Key of the root the page was crawled from.
# info - Crawl result; its relative path and read_result are used.
#
# Records the canonical path under the tag and stores the page in the
# cache for that tag.
def after_crawl(tag, info)
  canonical = UriWrapper.canonicalize(info.relative)

  # Register the path.
  (@paths[tag] ||= []) << canonical

  page = info.read_result

  # Write result to applicable cache.
  @cache.set(tag, canonical, page)
  # If single-site, cache "after" as "before".
  single_site = !@config.roots[:before]
  @cache.set(:before, canonical, page) if single_site

  # TODO: Restore application of rules.
  # @rules.handle_page(tag, res.content, info.document) if @rules && !res.error
end
88
419
  end
89
420
  end
90
421
  end