sitediff 0.0.6 → 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
- SHA1:
3
- metadata.gz: 1377b6bafe658b4a8a8f50ef0f54e577e99f1a87
4
- data.tar.gz: 9a80e20a89b7f2f60506bbcccdc2b9f7037320f8
2
+ SHA256:
3
+ metadata.gz: 1b7854497b5e81f48d810acec8106cbc66e33492d046e032e5516f76db26f142
4
+ data.tar.gz: a9349a79953237dd017600d49d38b8e734afc561f0ce09a1f8732e0e933530c9
5
5
  SHA512:
6
- metadata.gz: 7fe3ce1b2e7bc1762d5e8f4a1bfd4ab9280963732d033d4b403087f71a4d6caa394669eadb9d82064ad963dd62918f95cae7e0b0495dd92979f105be1bfe6f5e
7
- data.tar.gz: fb98c439544172ae40c0ba347272ec1287a1dc9042ab238e4abd8f720d52307ffa8a4b3fec7a70df68c7fdf845b324fe69c99693c5bbbc369f9e2c22fbe8c404
6
+ metadata.gz: 0e91f665f3c59b8a65f16c6942ec49b9cc9ee7fd12b0a777eadb844a0b9819ab1fd9485495bf2c757ca7342a6198dcccb5ae546c4ddf2682f234d015b64309b2
7
+ data.tar.gz: 64b7980bbbade8710b6069af19a67083678c2bd5fa99674df3360c1c6a3ddf8a15de7c5be4e8349ec298fc1c0dc27535b816089cd4f8852b8c8633861d72a178
@@ -2,8 +2,15 @@
2
2
  # frozen_string_literal: true
3
3
 
4
4
  # when run as gem, $0 is /usr/local/bin/sitediff not this file
5
- $LOAD_PATH.unshift File.expand_path('../lib', __dir__) if $PROGRAM_NAME == __FILE__
5
+ if $PROGRAM_NAME == __FILE__
6
+ $LOAD_PATH.unshift File.expand_path('../lib', __dir__)
7
+ end
6
8
 
7
9
  require 'sitediff/cli'
8
10
 
9
- SiteDiff::Cli.start
11
+ begin
12
+ SiteDiff::Cli.start
13
+ rescue Interrupt
14
+ puts("\n")
15
+ SiteDiff.log('Stopping. Interrupted by user.')
16
+ end
@@ -2,63 +2,85 @@
2
2
  # frozen_string_literal: true
3
3
 
4
4
  require 'sitediff/config'
5
+ require 'sitediff/diff'
5
6
  require 'sitediff/fetch'
6
7
  require 'sitediff/result'
8
+ require 'sitediff/report'
7
9
  require 'pathname'
8
10
  require 'rainbow'
11
+ require 'rubygems'
9
12
  require 'yaml'
10
13
 
14
+ # SiteDiff Object.
11
15
  class SiteDiff
12
- # path to misc. static files (e.g. erb, css files)
16
+ attr_reader :config, :results
17
+
18
+ # SiteDiff installation directory.
19
+ ROOT_DIR = File.dirname(File.dirname(__FILE__))
20
+
21
+ # Path to misc files. Ex: *.erb, *.css.
13
22
  FILES_DIR = File.join(File.dirname(__FILE__), 'sitediff', 'files')
14
23
 
15
- # subdirectory containing all failing diffs
16
- DIFFS_DIR = 'diffs'
17
-
18
- # files in output
19
- FAILURES_FILE = 'failures.txt'
20
- REPORT_FILE = 'report.html'
21
- SETTINGS_FILE = 'settings.yaml'
22
-
23
- # label will be colorized and str will not be.
24
- # type dictates the color: can be :success, :error, or :failure
25
- def self.log(str, type = :info, label = nil)
26
- label = label ? "[sitediff] #{label}" : '[sitediff]'
27
- bg = fg = nil
28
- case type
29
- when :info
30
- bg = fg = nil
31
- when :diff_success
32
- bg = :green
24
+ # Logs a message.
25
+ #
26
+ # Label will be colorized and message will not.
27
+ # Type dictates the color: can be :success, :error, or :failure.
28
+ #
29
+ # TODO: Only print :debug messages in debug mode.
30
+ def self.log(message, type = :info, label = nil)
31
+ # Prepare label.
32
+ label ||= type unless type == :info
33
+ label = label.to_s
34
+ unless label.empty?
35
+ # Colorize label.
33
36
  fg = :black
34
- when :diff_failure
35
- bg = :red
36
- when :warn
37
- bg = :yellow
38
- fg = :black
39
- when :error
40
- bg = :red
37
+ bg = :blue
38
+
39
+ case type
40
+ when :info
41
+ bg = :cyan
42
+ when :success
43
+ bg = :green
44
+ when :error
45
+ bg = :red
46
+ when :warning
47
+ bg = :yellow
48
+ end
49
+
50
+ label = '[' + label.to_s + ']'
51
+ label = Rainbow(label)
52
+ label = label.bg(bg) if bg
53
+ label = label.fg(fg) if fg
54
+
55
+ # Add a space after the label.
56
+ label += ' '
41
57
  end
42
- label = Rainbow(label)
43
- label = label.bg(bg) if bg
44
- label = label.fg(fg) if fg
45
- puts label + ' ' + str
58
+
59
+ puts label + message
46
60
  end
47
61
 
48
- attr_reader :config, :results
62
+ ##
63
+ # Returns the "before" site's URL.
64
+ #
65
+ # TODO: Remove in favor of config.before_url.
49
66
  def before
50
67
  @config.before['url']
51
68
  end
52
69
 
70
+ ##
71
+ # Returns the "after" site's URL.
72
+ #
73
+ # TODO: Remove in favor of config.after_url.
53
74
  def after
54
75
  @config.after['url']
55
76
  end
56
77
 
57
- def initialize(config, cache, concurrency, interval, verbose = true, debug = false)
78
+ # Initialize SiteDiff.
79
+ def initialize(config, cache, verbose = true, debug = false)
58
80
  @cache = cache
59
81
  @verbose = verbose
60
82
  @debug = debug
61
- @interval = interval
83
+
62
84
  # Check for single-site mode
63
85
  validate_opts = {}
64
86
  if !config.before['url'] && @cache.tag?(:before)
@@ -69,37 +91,48 @@ class SiteDiff
69
91
  validate_opts[:need_before] = false
70
92
  end
71
93
  config.validate(validate_opts)
72
-
73
- @concurrency = concurrency
94
+ # Configure diff.
95
+ Diff.diff_config(config)
74
96
  @config = config
75
97
  end
76
98
 
77
- # Sanitize HTML
99
+ # Sanitize HTML.
78
100
  def sanitize(path, read_results)
79
101
  %i[before after].map do |tag|
80
102
  html = read_results[tag].content
103
+ # TODO: See why encoding is empty while running tests.
104
+ #
105
+ # The presence of an "encoding" value used to be used to determine
106
+ # if the sanitizer would be called. However, encoding turns up blank
107
+ # during rspec tests for some reason.
81
108
  encoding = read_results[tag].encoding
82
- if encoding
83
- config = @config.send(tag)
84
- Sanitizer.new(html, config, path: path).sanitize
109
+ if encoding || html.length.positive?
110
+ section = @config.send(tag, true)
111
+ Sanitizer.new(html, section, path: path).sanitize
85
112
  else
86
113
  html
87
114
  end
88
115
  end
89
116
  end
90
117
 
91
- # Process a set of read results
118
+ ##
119
+ # Process a set of read results.
120
+ #
121
+ # This is the callback that processes items fetched by the Fetcher.
92
122
  def process_results(path, read_results)
93
- if (error = (read_results[:before].error || read_results[:after].error))
123
+ error = (read_results[:before].error || read_results[:after].error)
124
+ if error
94
125
  diff = Result.new(path, nil, nil, nil, nil, error)
95
126
  else
96
127
  begin
97
- diff = Result.new(path,
98
- *sanitize(path, read_results),
99
- read_results[:before].encoding,
100
- read_results[:after].encoding,
101
- nil)
102
- rescue => e
128
+ diff = Result.new(
129
+ path,
130
+ *sanitize(path, read_results),
131
+ read_results[:before].encoding,
132
+ read_results[:after].encoding,
133
+ nil
134
+ )
135
+ rescue StandardError => e
103
136
  raise if @debug
104
137
 
105
138
  Result.new(path, nil, nil, nil, nil, "Sanitization error: #{e}")
@@ -114,60 +147,72 @@ class SiteDiff
114
147
  end
115
148
  end
116
149
 
117
- # Perform the comparison, populate @results and return the number of failing
118
- # paths (paths with non-zero diff).
119
- def run(curl_opts = {}, debug = true)
150
+ ##
151
+ # Compute diff as per config.
152
+ #
153
+ # @return [Integer]
154
+ # Number of paths which have diffs.
155
+ def run
120
156
  # Map of path -> Result object, populated by process_results
121
157
  @results = {}
122
158
  @ordered = @config.paths.dup
123
159
 
124
160
  unless @cache.read_tags.empty?
125
- SiteDiff.log('Using sites from cache: ' +
126
- @cache.read_tags.sort.join(', '))
161
+ SiteDiff.log('Using sites from cache: ' + @cache.read_tags.sort.join(', '))
127
162
  end
128
163
 
129
164
  # TODO: Fix this after config merge refactor!
130
165
  # Not quite right. We are not passing @config.before or @config.after
131
166
  # so passing this instead but @config.after['curl_opts'] is ignored.
167
+ curl_opts = @config.setting :curl_opts
132
168
  config_curl_opts = @config.before['curl_opts']
133
169
  curl_opts = config_curl_opts.clone.merge(curl_opts) if config_curl_opts
134
- fetcher = Fetch.new(@cache, @config.paths, @interval, @concurrency, curl_opts, debug,
135
- before: before, after: after)
170
+ fetcher = Fetch.new(
171
+ @cache,
172
+ @config.paths,
173
+ @config.setting(:interval),
174
+ @config.setting(:concurrency),
175
+ curl_opts,
176
+ @debug,
177
+ before: @config.before_url,
178
+ after: @config.after_url
179
+ )
180
+
181
+ # Run the Fetcher with "process results" as a callback.
136
182
  fetcher.run(&method(:process_results))
137
183
 
138
184
  # Order by original path order
139
- @results = @config.paths.map { |p| @results[p] }
185
+ @results = @config.paths.map { |path| @results[path] }
140
186
  results.map { |r| r unless r.success? }.compact.length
141
187
  end
142
188
 
143
- # Dump results to disk
144
- def dump(dir, report_before, report_after)
145
- report_before ||= before
146
- report_after ||= after
147
- dir = Pathname.new(dir)
148
- dir.mkpath unless dir.directory?
149
-
150
- # store diffs of each failing case, first wipe out existing diffs
151
- diff_dir = dir + DIFFS_DIR
152
- diff_dir.rmtree if diff_dir.exist?
153
- results.each { |r| r.dump(dir) if r.status == Result::STATUS_FAILURE }
154
- SiteDiff.log "All diff files were dumped inside #{dir.expand_path}"
155
-
156
- # store failing paths
157
- failures = dir + FAILURES_FILE
158
- SiteDiff.log "Writing failures to #{failures.expand_path}"
159
- failures.open('w') do |f|
160
- results.each { |r| f.puts r.path unless r.success? }
189
+ ##
190
+ # Get a reporter object to help with report generation.
191
+ def report
192
+ if @results.nil?
193
+ raise SiteDiffException(
194
+ 'No results detected. Run SiteDiff.run before SiteDiff.report.'
195
+ )
161
196
  end
162
197
 
163
- # create report of results
164
- report = Diff.generate_html_report(results, report_before, report_after,
165
- @cache)
166
- dir.+(REPORT_FILE).open('w') { |f| f.write(report) }
198
+ Report.new(@config, @cache, @results)
199
+ end
200
+
201
+ ##
202
+ # Get SiteDiff gemspec.
203
+ def self.gemspec
204
+ file = ROOT_DIR + '/sitediff.gemspec'
205
+ Gem::Specification.load(file)
206
+ end
167
207
 
168
- # serve some settings
169
- settings = { 'before' => report_before, 'after' => report_after,
170
- 'cached' => %w[before after] }
171
- dir.+(SETTINGS_FILE).open('w') { |f| YAML.dump(settings, f) }
208
+ ##
209
+ # Ensures that a directory exists and returns a Pathname for it.
210
+ #
211
+ # @param [String] dir
212
+ # path/to/directory
213
+ def self.ensure_dir(dir)
214
+ dir = Pathname.new(dir) unless dir.is_a? Pathname
215
+ dir.mkpath unless dir.directory?
216
+ dir
172
217
  end
173
218
  end
@@ -4,28 +4,42 @@ require 'set'
4
4
  require 'fileutils'
5
5
 
6
6
  class SiteDiff
7
+ # SiteDiff Cache Handler.
7
8
  class Cache
8
9
  attr_accessor :read_tags, :write_tags
9
10
 
11
+ ##
12
+ # Creates a Cache object.
10
13
  def initialize(opts = {})
11
14
  @create = opts[:create]
12
15
 
13
- # Read and Write tags are sets that can contain :before and :after
14
- # They indicate whether we should use the cache for reading or writing
16
+ # Read and Write tags are sets that can contain :before and :after.
17
+ # They indicate whether we should use the cache for reading or writing.
15
18
  @read_tags = Set.new
16
19
  @write_tags = Set.new
20
+
21
+ # The directory used by the cache for storage.
17
22
  @dir = opts[:directory] || '.'
18
23
  end
19
24
 
25
+ ##
20
26
  # Is a tag cached?
27
+ # TODO: Rename it to is_cached? as it makes more sense.
21
28
  def tag?(tag)
22
29
  File.directory?(File.join(@dir, 'snapshot', tag.to_s))
23
30
  end
24
31
 
32
+ ##
33
+ # Get data from cache.
25
34
  def get(tag, path)
26
35
  return nil unless @read_tags.include? tag
27
36
 
28
- filename = File.join(@dir, 'snapshot', tag.to_s, *path.split(File::SEPARATOR))
37
+ filename = File.join(
38
+ @dir,
39
+ 'snapshot',
40
+ tag.to_s,
41
+ *path.split(File::SEPARATOR)
42
+ )
29
43
 
30
44
  filename = File.join(filename, 'index.html') if File.directory?(filename)
31
45
  return nil unless File.file? filename
@@ -33,10 +47,17 @@ class SiteDiff
33
47
  Marshal.load(File.read(filename))
34
48
  end
35
49
 
50
+ ##
51
+ # Set data to cache.
36
52
  def set(tag, path, result)
37
53
  return unless @write_tags.include? tag
38
54
 
39
- filename = File.join(@dir, 'snapshot', tag.to_s, *path.split(File::SEPARATOR))
55
+ filename = File.join(
56
+ @dir,
57
+ 'snapshot',
58
+ tag.to_s,
59
+ *path.split(File::SEPARATOR)
60
+ )
40
61
 
41
62
  filename = File.join(filename, 'index.html') if File.directory?(filename)
42
63
  filepath = Pathname.new(filename)
@@ -50,23 +71,31 @@ class SiteDiff
50
71
  # May cause problems if action is not atomic!
51
72
  # Move existing file to dir/index.html first
52
73
  # Not robust! Should generate an UUID or something.
53
- SiteDiff.log "Overwriting file #{tempname}", :warn if File.exist?(tempname)
74
+ if File.exist?(tempname)
75
+ SiteDiff.log "Overwriting file #{tempname}", :warning
76
+ end
54
77
  curdir.rename(tempname)
55
78
  filepath.dirname.mkpath
56
79
  # Should only happen in strange situations such as when the path
57
80
  # is foo/index.html/bar (i.e., index.html is a directory)
58
- SiteDiff.log "Overwriting file #{tempname}", :warn if (curdir + 'index.html').exist?
81
+ if (curdir + 'index.html').exist?
82
+ SiteDiff.log "Overwriting file #{tempname}", :warning
83
+ end
59
84
  tempname.rename(curdir + 'index.html')
60
85
  end
61
86
  end
62
87
  File.open(filename, 'w') { |file| file.write(Marshal.dump(result)) }
63
88
  end
64
89
 
90
+ ##
91
+ # TODO: Document this or remove it if unused.
65
92
  def key(tag, path)
66
93
  # Ensure encoding stays the same!
67
94
  Marshal.dump([tag, path.encode('UTF-8')])
68
95
  end
69
96
 
97
+ ##
98
+ # Ensures that a directory exists.
70
99
  def get_dir(directory)
71
100
  # Create the dir. Must go before cache initialization!
72
101
  @dir = Pathname.new(directory || '.')
@@ -5,32 +5,32 @@ require 'sitediff'
5
5
  require 'sitediff/cache'
6
6
  require 'sitediff/config'
7
7
  require 'sitediff/config/creator'
8
+ require 'sitediff/config/preset'
8
9
  require 'sitediff/fetch'
9
10
  require 'sitediff/webserver/resultserver'
10
11
 
11
12
  class SiteDiff
13
+ # SiteDiff CLI.
14
+ # TODO: Use config.defaults to feed default values for sitediff.yaml params?
12
15
  class Cli < Thor
13
16
  class_option 'directory',
14
17
  type: :string,
15
18
  aliases: '-C',
16
19
  default: 'sitediff',
17
20
  desc: 'Configuration directory'
18
- class_option :curl_options,
19
- type: :hash,
20
- default: {},
21
- desc: 'Options to be passed to curl'
22
- class_option :insecure,
21
+ class_option :debug,
23
22
  type: :boolean,
23
+ aliases: '-d',
24
24
  default: false,
25
- desc: 'Ignore many HTTPS/SSL errors'
26
- class_option :debug,
25
+ desc: 'Stop on certain errors and produce error trace backs.'
26
+ class_option 'verbose',
27
27
  type: :boolean,
28
+ aliases: '-v',
28
29
  default: false,
29
- desc: 'Debug mode. Stop on certain errors and produce a traceback.'
30
- class_option :interval,
31
- type: :numeric,
32
- default: 0,
33
- desc: 'Crawling delay - interval in milliseconds'
30
+ desc: 'Show verbose output in terminal'
31
+
32
+ # Command aliases.
33
+ map recrawl: :crawl
34
34
 
35
35
  # Thor, by default, exits with 0 no matter what!
36
36
  def self.exit_on_failure?
@@ -42,6 +42,20 @@ class SiteDiff
42
42
  true
43
43
  end
44
44
 
45
+ desc 'version', 'Show version information'
46
+ ##
47
+ # Show version information.
48
+ def version
49
+ gemspec = SiteDiff.gemspec
50
+ output = []
51
+ output.push("Sitediff CLI #{gemspec.version}")
52
+ if options[:verbose]
53
+ output.push('Website: ' + gemspec.homepage)
54
+ output.push('GitHub: ' + gemspec.metadata['source_code_uri'])
55
+ end
56
+ puts output.join("\n")
57
+ end
58
+
45
59
  option 'paths-file',
46
60
  type: :string,
47
61
  desc: 'Paths are read (one at a line) from PATHS: ' \
@@ -53,79 +67,118 @@ class SiteDiff
53
67
  desc: 'Specific path or paths to fetch'
54
68
  option 'before',
55
69
  type: :string,
56
- desc: 'URL used to fetch the before HTML. Acts as a prefix to specified paths',
70
+ desc: 'URL to the "before" site, prefixed to all paths.',
57
71
  aliases: '--before-url'
58
72
  option 'after',
59
73
  type: :string,
60
- desc: 'URL used to fetch the after HTML. Acts as a prefix to specified paths.',
74
+ desc: 'URL to the "after" site, prefixed to all paths.',
61
75
  aliases: '--after-url'
76
+ option 'report-format',
77
+ type: :string,
78
+ enum: %w[html json],
79
+ default: 'html',
80
+ desc: 'The format in which a report should be generated.'
81
+ # TODO: Deprecate the parameters before-report / after-report?
62
82
  option 'before-report',
63
83
  type: :string,
64
- desc: 'Before URL to use for reporting purposes. Useful if port forwarding.',
84
+ desc: 'URL to use in reports. Useful if port forwarding.',
65
85
  aliases: '--before-url-report'
66
86
  option 'after-report',
67
87
  type: :string,
68
- desc: 'After URL to use for reporting purposes. Useful if port forwarding.',
88
+ desc: 'URL to use in reports. Useful if port forwarding.',
69
89
  aliases: '--after-url-report'
70
90
  option 'cached',
71
91
  type: :string,
72
92
  enum: %w[none all before after],
73
93
  default: 'before',
74
94
  desc: 'Use the cached version of these sites, if available.'
75
- option 'verbose',
95
+ option 'ignore-whitespace',
76
96
  type: :boolean,
77
- aliases: '-v',
78
97
  default: false,
79
- desc: 'Show differences between versions for each page in terminal'
80
- option :concurrency,
81
- type: :numeric,
82
- default: 3,
83
- desc: 'Max number of concurrent connections made'
84
- desc 'diff [OPTIONS] [CONFIGFILES]', 'Perform systematic diff on given URLs'
85
- def diff(*config_files)
86
- @interval = options['interval']
87
- check_interval(@interval)
98
+ aliases: '-w',
99
+ desc: 'Ignore changes in whitespace.'
100
+ option 'export',
101
+ type: :boolean,
102
+ default: false,
103
+ aliases: '-e',
104
+ desc: 'Export report to files. This option forces HTML format.'
105
+ desc 'diff [OPTIONS] [CONFIG-FILE]',
106
+ 'Compute diffs on configured URLs.'
107
+ ##
108
+ # Computes diffs.
109
+ def diff(config_file = nil)
88
110
  @dir = get_dir(options['directory'])
89
- config = SiteDiff::Config.new(config_files, @dir)
90
-
91
- # override config based on options
92
- paths = options['paths']
93
- if (paths_file = options['paths-file'])
94
- if paths
95
- SiteDiff.log "Can't have both --paths-file and --paths", :error
96
- exit(-1)
97
- end
111
+ config = SiteDiff::Config.new(config_file, @dir)
98
112
 
99
- paths_file = Pathname.new(paths_file).expand_path
100
- unless File.exist? paths_file
101
- raise Config::InvalidConfig,
102
- "Paths file '#{paths_file}' not found!"
103
- end
104
- SiteDiff.log "Reading paths from: #{paths_file}"
105
- config.paths = File.readlines(paths_file)
113
+ # Determine "paths" override based on options.
114
+ if options['paths'] && options['paths-file']
115
+ SiteDiff.log "Can't specify both --paths-file and --paths.", :error
116
+ exit(-1)
106
117
  end
107
- config.paths = paths if paths
108
118
 
119
+ # Ignore whitespace option.
120
+ config.ignore_whitespace = options['ignore-whitespace'] if options['ignore-whitespace']
121
+
122
+ # Export report option.
123
+ config.export = options['export']
124
+
125
+ # Apply "paths" override, if any.
126
+ config.paths = options['paths'] if options['paths']
127
+
128
+ # Determine and apply "paths-file", if "paths" is not specified.
129
+ unless options['paths']
130
+ paths_file = options['paths-file']
131
+ paths_file ||= File.join(@dir, Config::DEFAULT_PATHS_FILENAME)
132
+ paths_file = File.expand_path(paths_file)
133
+
134
+ paths_count = config.paths_file_read(paths_file)
135
+ SiteDiff.log "Read #{paths_count} paths from: #{paths_file}"
136
+ end
137
+
138
+ # TODO: Why do we allow before and after override during diff?
109
139
  config.before['url'] = options['before'] if options['before']
110
140
  config.after['url'] = options['after'] if options['after']
111
141
 
112
- # Setup cache
113
- cache = SiteDiff::Cache.new(create: options['cached'] != 'none',
114
- directory: @dir)
142
+ # Prepare cache.
143
+ cache = SiteDiff::Cache.new(
144
+ create: options['cached'] != 'none',
145
+ directory: @dir
146
+ )
115
147
  cache.read_tags << :before if %w[before all].include?(options['cached'])
116
148
  cache.read_tags << :after if %w[after all].include?(options['cached'])
117
149
  cache.write_tags << :before << :after
118
150
 
119
- sitediff = SiteDiff.new(config, cache, options[:concurrency], @interval,
120
- options['verbose'], options[:debug])
121
- num_failing = sitediff.run(get_curl_opts(options), options[:debug])
122
- exit_code = num_failing > 0 ? 2 : 0
151
+ # Run sitediff.
152
+ sitediff = SiteDiff.new(
153
+ config,
154
+ cache,
155
+ options['verbose'],
156
+ options[:debug]
157
+ )
158
+ num_failing = sitediff.run
159
+ exit_code = num_failing.positive? ? 2 : 0
160
+
161
+ # Generate HTML report.
162
+ if options['report-format'] == 'html' || config.export
163
+ sitediff.report.generate_html(
164
+ @dir,
165
+ options['before-report'],
166
+ options['after-report']
167
+ )
168
+ end
123
169
 
124
- sitediff.dump(@dir, options['before-report'],
125
- options['after-report'])
170
+ # Generate JSON report.
171
+ if options['report-format'] == 'json' && config.export == false
172
+ sitediff.report.generate_json @dir
173
+ end
174
+
175
+ SiteDiff.log 'Run "sitediff serve" to see a report.' unless options['export']
126
176
  rescue Config::InvalidConfig => e
127
177
  SiteDiff.log "Invalid configuration: #{e.message}", :error
128
- SiteDiff.log "at #{e.backtrace}", :error
178
+ SiteDiff.log e.backtrace, :error if options[:verbose]
179
+ rescue Config::ConfigNotFound => e
180
+ SiteDiff.log "Invalid configuration: #{e.message}", :error
181
+ SiteDiff.log e.backtrace, :error if options[:verbose]
129
182
  else # no exception was raised
130
183
  # Thor::Error --> exit(1), guaranteed by exit_on_failure?
131
184
  # Failing diff --> exit(2), populated above
@@ -140,11 +193,14 @@ class SiteDiff
140
193
  type: :boolean,
141
194
  default: true,
142
195
  desc: 'Whether to open the served content in your browser'
143
- desc 'serve [OPTIONS]', 'Serve the sitediff output directory over HTTP'
144
- def serve(*config_files)
145
- config = SiteDiff::Config.new(config_files, options['directory'])
146
- # Could check non-empty config here but currently errors are already raised.
196
+ desc 'serve [OPTIONS] [CONFIG-FILE]',
197
+ 'Serve SiteDiff report directory over HTTP.'
198
+ ##
199
+ # Serves SiteDiff report for accessing in the browser.
200
+ def serve(config_file = nil)
147
201
  @dir = get_dir(options['directory'])
202
+ config = SiteDiff::Config.new(config_file, @dir)
203
+
148
204
  cache = Cache.new(directory: @dir)
149
205
  cache.read_tags << :before << :after
150
206
 
@@ -157,85 +213,102 @@ class SiteDiff
157
213
  ).wait
158
214
  rescue SiteDiffException => e
159
215
  SiteDiff.log e.message, :error
160
- SiteDiff.log e.backtrace, :error
216
+ SiteDiff.log e.backtrace, :error if options[:verbose]
161
217
  end
162
218
 
163
219
  option :depth,
164
220
  type: :numeric,
165
- default: 3,
221
+ default: Config::DEFAULT_CONFIG['settings']['depth'],
166
222
  desc: 'How deeply to crawl the given site'
167
- option :rules,
223
+ option :crawl,
224
+ type: :boolean,
225
+ default: true,
226
+ desc: 'Run "sitediff crawl" to discover paths.'
227
+ option :preset,
168
228
  type: :string,
169
- enum: %w[yes no disabled],
170
- default: 'disabled',
171
- desc: 'Whether rules for the site should be auto-created'
229
+ enum: Config::Preset.all,
230
+ desc: 'Framework-specific presets to apply.'
172
231
  option :concurrency,
173
232
  type: :numeric,
174
- default: 3,
175
- desc: 'Max number of concurrent connections made'
233
+ default: Config::DEFAULT_CONFIG['settings']['concurrency'],
234
+ desc: 'Max number of concurrent connections made.'
235
+ option :interval,
236
+ type: :numeric,
237
+ default: Config::DEFAULT_CONFIG['settings']['interval'],
238
+ desc: 'Crawling delay - interval in milliseconds.'
176
239
  option :whitelist,
177
240
  type: :string,
178
- default: '',
179
- desc: 'Optional whitelist for crawling'
241
+ default: Config::DEFAULT_CONFIG['settings']['whitelist'],
242
+ desc: 'Optional whitelist for crawling.'
180
243
  option :blacklist,
181
244
  type: :string,
182
- default: '',
183
- desc: 'Optional blacklist for crawling'
184
- desc 'init URL [URL]', 'Create a sitediff configuration'
245
+ default: Config::DEFAULT_CONFIG['settings']['blacklist'],
246
+ desc: 'Optional blacklist for crawling.'
247
+ # TODO: Remove this option. Always ignore SSL errors.
248
+ option :insecure,
249
+ type: :boolean,
250
+ default: false,
251
+ desc: 'Ignore many HTTPS/SSL errors'
252
+ option :curl_options,
253
+ type: :hash,
254
+ default: {},
255
+ desc: 'Options to be passed to curl'
256
+ desc 'init URL [URL]', 'Create a sitediff configuration.'
257
+ ##
258
+ # Initializes a sitediff (yaml) configuration file.
185
259
  def init(*urls)
186
260
  unless (1..2).cover? urls.size
187
261
  SiteDiff.log 'sitediff init requires one or two URLs', :error
188
262
  exit(2)
189
263
  end
190
264
 
191
- @interval = options['interval']
192
- check_interval(@interval)
265
+ # Prepare a config object and write it to the file system.
193
266
  @dir = get_dir(options['directory'])
194
- curl_opts = get_curl_opts(options)
195
- @whitelist = create_regexp(options['whitelist'])
196
- @blacklist = create_regexp(options['blacklist'])
197
- creator = SiteDiff::Config::Creator.new(options[:concurrency],
198
- options['interval'],
199
- @whitelist,
200
- @blacklist,
201
- curl_opts,
202
- options[:debug],
203
- *urls)
267
+ creator = SiteDiff::Config::Creator.new(options[:debug], *urls)
204
268
  creator.create(
205
269
  depth: options[:depth],
206
270
  directory: @dir,
207
- rules: options[:rules] != 'no',
208
- rules_disabled: (options[:rules] == 'disabled')
209
- ) do |_tag, info|
210
- SiteDiff.log "Visited #{info.uri}, cached"
211
- end
212
-
271
+ concurrency: options[:concurrency],
272
+ interval: options[:interval],
273
+ whitelist: Config.create_regexp(options['whitelist']),
274
+ blacklist: Config.create_regexp(options['blacklist']),
275
+ preset: options[:preset],
276
+ curl_opts: get_curl_opts(options)
277
+ )
213
278
  SiteDiff.log "Created #{creator.config_file.expand_path}", :success
214
- SiteDiff.log "You can now run 'sitediff diff'", :success
279
+
280
+ # Discover paths, if enabled.
281
+ if options[:crawl]
282
+ crawl(creator.config_file)
283
+ SiteDiff.log 'You can now run "sitediff diff".', :success
284
+ else
285
+ SiteDiff.log 'Run "sitediff crawl" to discover paths. You should then be able to run "sitediff diff".', :info
286
+ end
215
287
  end
216
288
 
217
289
  option :url,
218
290
  type: :string,
219
291
  desc: 'A custom base URL to fetch from'
220
- option :concurrency,
221
- type: :numeric,
222
- default: 3,
223
- desc: 'Max number of concurrent connections made'
224
- desc 'store [CONFIGFILES]',
225
- 'Cache the current contents of a site for later comparison'
226
- def store(*config_files)
292
+ desc 'store [CONFIG-FILE]',
293
+ 'Cache the current contents of a site for later comparison.'
294
+ ##
295
+ # Caches the current version of the site.
296
+ def store(config_file = nil)
227
297
  @dir = get_dir(options['directory'])
228
- config = SiteDiff::Config.new(config_files, @dir)
298
+ config = SiteDiff::Config.new(config_file, @dir)
299
+ # TODO: Figure out how to remove this config.validate call.
229
300
  config.validate(need_before: false)
301
+ config.paths_file_read
302
+
230
303
  cache = SiteDiff::Cache.new(directory: @dir, create: true)
231
304
  cache.write_tags << :before
232
305
 
233
306
  base = options[:url] || config.after['url']
234
307
  fetcher = SiteDiff::Fetch.new(cache,
235
308
  config.paths,
236
- options[:interval],
237
- options[:concurrency],
238
- get_curl_opts(options),
309
+ config.setting(:interval),
310
+ config.setting(:concurrency),
311
+ get_curl_opts(config.settings),
239
312
  options[:debug],
240
313
  before: base)
241
314
  fetcher.run do |path, _res|
@@ -243,11 +316,70 @@ class SiteDiff
243
316
  end
244
317
  end
245
318
 
319
+ desc 'crawl [CONFIG-FILE]',
320
+ 'Crawl the "before" site to discover paths.'
321
+ ##
322
+ # Crawls the "before" site to determine "paths".
323
+ #
324
+ # TODO: Move actual crawling to sitediff.crawl(config).
325
+ # TODO: Switch to paths = sitediff.crawl().
326
+ def crawl(config_file = nil)
327
+ # Prepare configuration.
328
+ @dir = get_dir(options['directory'])
329
+ @config = SiteDiff::Config.new(config_file, @dir)
330
+
331
+ # Prepare cache.
332
+ @cache = SiteDiff::Cache.new(
333
+ create: options['cached'] != 'none',
334
+ directory: @dir
335
+ )
336
+ @cache.write_tags << :before << :after
337
+
338
+ # Crawl with Hydra to discover paths.
339
+ hydra = Typhoeus::Hydra.new(
340
+ max_concurrency: @config.setting(:concurrency)
341
+ )
342
+ @paths = {}
343
+ @config.roots.each do |tag, url|
344
+ Crawler.new(
345
+ hydra,
346
+ url,
347
+ @config.setting(:interval),
348
+ @config.setting(:whitelist),
349
+ @config.setting(:blacklist),
350
+ @config.setting(:depth),
351
+ get_curl_opts(@config.settings),
352
+ @debug
353
+ ) do |info|
354
+ SiteDiff.log "Visited #{info.uri}, cached."
355
+ after_crawl(tag, info)
356
+ end
357
+ end
358
+ hydra.run
359
+
360
+ # Write paths to a file.
361
+ @paths = @paths.values.reduce(&:|).to_a.sort
362
+ @config.paths_file_write(@paths)
363
+
364
+ # Log output.
365
+ file = Pathname.new(@dir) + Config::DEFAULT_PATHS_FILENAME
366
+ SiteDiff.log ''
367
+ SiteDiff.log "#{@paths.length} page(s) found."
368
+ SiteDiff.log "Created #{file.expand_path}.", :success, 'done'
369
+ end
370
+
246
371
  no_commands do
372
+ # Generates CURL options.
373
+ #
374
+ # TODO: This should be in the config class instead.
375
+ # TODO: Make all requests insecure and avoid custom curl-opts.
247
376
  def get_curl_opts(options)
248
377
  # We do want string keys here
249
378
  bool_hash = { 'true' => true, 'false' => false }
250
- curl_opts = UriWrapper::DEFAULT_CURL_OPTS.clone.merge(options[:curl_options])
379
+ curl_opts = UriWrapper::DEFAULT_CURL_OPTS
380
+ .clone
381
+ .merge(options['curl_options'] || {})
382
+ .merge(options['curl_opts'] || {})
251
383
  curl_opts.each { |k, v| curl_opts[k] = bool_hash.fetch(v, v) }
252
384
  if options[:insecure]
253
385
  curl_opts[:ssl_verifypeer] = false
@@ -256,13 +388,8 @@ class SiteDiff
256
388
  curl_opts
257
389
  end
258
390
 
259
- def check_interval(interval)
260
- if interval != 0 && options[:concurrency] != 1
261
- SiteDiff.log '--concurrency must be set to 1 in order to enable the interval feature'
262
- exit(2)
263
- end
264
- end
265
-
391
+ ##
392
+ # Ensures that the given directory exists.
266
393
  def get_dir(directory)
267
394
  # Create the dir. Must go before cache initialization!
268
395
  @dir = Pathname.new(directory || '.')
@@ -270,16 +397,24 @@ class SiteDiff
270
397
  @dir.to_s
271
398
  end
272
399
 
273
- def create_regexp(string_param)
274
- begin
275
- @return_value = string_param == '' ? nil : Regexp.new(string_param)
276
- rescue SiteDiffException => e
277
- @return_value = nil
278
- SiteDiff.log 'whitelist and blacklist parameters must be valid regular expressions', :error
279
- SiteDiff.log e.message, :error
280
- SiteDiff.log e.backtrace, :error
281
- end
282
- return @return_value
400
+ ##
401
+ # Processes a crawled path.
402
+ def after_crawl(tag, info)
403
+ path = UriWrapper.canonicalize(info.relative)
404
+
405
+ # Register the path.
406
+ @paths[tag] = [] unless @paths[tag]
407
+ @paths[tag] << path
408
+
409
+ result = info.read_result
410
+
411
+ # Write result to applicable cache.
412
+ @cache.set(tag, path, result)
413
+ # If single-site, cache "after" as "before".
414
+ @cache.set(:before, path, result) unless @config.roots[:before]
415
+
416
+ # TODO: Restore application of rules.
417
+ # @rules.handle_page(tag, res.content, info.document) if @rules && !res.error
283
418
  end
284
419
  end
285
420
  end