sitediff 0.0.6 → 1.0.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
- SHA1:
3
- metadata.gz: 1377b6bafe658b4a8a8f50ef0f54e577e99f1a87
4
- data.tar.gz: 9a80e20a89b7f2f60506bbcccdc2b9f7037320f8
2
+ SHA256:
3
+ metadata.gz: 1b7854497b5e81f48d810acec8106cbc66e33492d046e032e5516f76db26f142
4
+ data.tar.gz: a9349a79953237dd017600d49d38b8e734afc561f0ce09a1f8732e0e933530c9
5
5
  SHA512:
6
- metadata.gz: 7fe3ce1b2e7bc1762d5e8f4a1bfd4ab9280963732d033d4b403087f71a4d6caa394669eadb9d82064ad963dd62918f95cae7e0b0495dd92979f105be1bfe6f5e
7
- data.tar.gz: fb98c439544172ae40c0ba347272ec1287a1dc9042ab238e4abd8f720d52307ffa8a4b3fec7a70df68c7fdf845b324fe69c99693c5bbbc369f9e2c22fbe8c404
6
+ metadata.gz: 0e91f665f3c59b8a65f16c6942ec49b9cc9ee7fd12b0a777eadb844a0b9819ab1fd9485495bf2c757ca7342a6198dcccb5ae546c4ddf2682f234d015b64309b2
7
+ data.tar.gz: 64b7980bbbade8710b6069af19a67083678c2bd5fa99674df3360c1c6a3ddf8a15de7c5be4e8349ec298fc1c0dc27535b816089cd4f8852b8c8633861d72a178
@@ -2,8 +2,15 @@
2
2
  # frozen_string_literal: true
3
3
 
4
4
  # when run as gem, $0 is /usr/local/bin/sitediff not this file
5
- $LOAD_PATH.unshift File.expand_path('../lib', __dir__) if $PROGRAM_NAME == __FILE__
5
+ if $PROGRAM_NAME == __FILE__
6
+ $LOAD_PATH.unshift File.expand_path('../lib', __dir__)
7
+ end
6
8
 
7
9
  require 'sitediff/cli'
8
10
 
9
- SiteDiff::Cli.start
11
+ begin
12
+ SiteDiff::Cli.start
13
+ rescue Interrupt
14
+ puts("\n")
15
+ SiteDiff.log('Stopping. Interrupted by user.')
16
+ end
@@ -2,63 +2,85 @@
2
2
  # frozen_string_literal: true
3
3
 
4
4
  require 'sitediff/config'
5
+ require 'sitediff/diff'
5
6
  require 'sitediff/fetch'
6
7
  require 'sitediff/result'
8
+ require 'sitediff/report'
7
9
  require 'pathname'
8
10
  require 'rainbow'
11
+ require 'rubygems'
9
12
  require 'yaml'
10
13
 
14
+ # SiteDiff Object.
11
15
  class SiteDiff
12
- # path to misc. static files (e.g. erb, css files)
16
+ attr_reader :config, :results
17
+
18
+ # SiteDiff installation directory.
19
+ ROOT_DIR = File.dirname(File.dirname(__FILE__))
20
+
21
+ # Path to misc files. Ex: *.erb, *.css.
13
22
  FILES_DIR = File.join(File.dirname(__FILE__), 'sitediff', 'files')
14
23
 
15
- # subdirectory containing all failing diffs
16
- DIFFS_DIR = 'diffs'
17
-
18
- # files in output
19
- FAILURES_FILE = 'failures.txt'
20
- REPORT_FILE = 'report.html'
21
- SETTINGS_FILE = 'settings.yaml'
22
-
23
- # label will be colorized and str will not be.
24
- # type dictates the color: can be :success, :error, or :failure
25
- def self.log(str, type = :info, label = nil)
26
- label = label ? "[sitediff] #{label}" : '[sitediff]'
27
- bg = fg = nil
28
- case type
29
- when :info
30
- bg = fg = nil
31
- when :diff_success
32
- bg = :green
24
+ # Logs a message.
25
+ #
26
+ # Label will be colorized and message will not.
27
+ # Type dictates the color: can be :success, :error, or :failure.
28
+ #
29
+ # TODO: Only print :debug messages in debug mode.
30
+ def self.log(message, type = :info, label = nil)
31
+ # Prepare label.
32
+ label ||= type unless type == :info
33
+ label = label.to_s
34
+ unless label.empty?
35
+ # Colorize label.
33
36
  fg = :black
34
- when :diff_failure
35
- bg = :red
36
- when :warn
37
- bg = :yellow
38
- fg = :black
39
- when :error
40
- bg = :red
37
+ bg = :blue
38
+
39
+ case type
40
+ when :info
41
+ bg = :cyan
42
+ when :success
43
+ bg = :green
44
+ when :error
45
+ bg = :red
46
+ when :warning
47
+ bg = :yellow
48
+ end
49
+
50
+ label = '[' + label.to_s + ']'
51
+ label = Rainbow(label)
52
+ label = label.bg(bg) if bg
53
+ label = label.fg(fg) if fg
54
+
55
+ # Add a space after the label.
56
+ label += ' '
41
57
  end
42
- label = Rainbow(label)
43
- label = label.bg(bg) if bg
44
- label = label.fg(fg) if fg
45
- puts label + ' ' + str
58
+
59
+ puts label + message
46
60
  end
47
61
 
48
- attr_reader :config, :results
62
+ ##
63
+ # Returns the "before" site's URL.
64
+ #
65
+ # TODO: Remove in favor of config.before_url.
49
66
  def before
50
67
  @config.before['url']
51
68
  end
52
69
 
70
+ ##
71
+ # Returns the "after" site's URL.
72
+ #
73
+ # TODO: Remove in favor of config.after_url.
53
74
  def after
54
75
  @config.after['url']
55
76
  end
56
77
 
57
- def initialize(config, cache, concurrency, interval, verbose = true, debug = false)
78
+ # Initialize SiteDiff.
79
+ def initialize(config, cache, verbose = true, debug = false)
58
80
  @cache = cache
59
81
  @verbose = verbose
60
82
  @debug = debug
61
- @interval = interval
83
+
62
84
  # Check for single-site mode
63
85
  validate_opts = {}
64
86
  if !config.before['url'] && @cache.tag?(:before)
@@ -69,37 +91,48 @@ class SiteDiff
69
91
  validate_opts[:need_before] = false
70
92
  end
71
93
  config.validate(validate_opts)
72
-
73
- @concurrency = concurrency
94
+ # Configure diff.
95
+ Diff.diff_config(config)
74
96
  @config = config
75
97
  end
76
98
 
77
- # Sanitize HTML
99
+ # Sanitize HTML.
78
100
  def sanitize(path, read_results)
79
101
  %i[before after].map do |tag|
80
102
  html = read_results[tag].content
103
+ # TODO: See why encoding is empty while running tests.
104
+ #
105
+ # The presence of an "encoding" value used to be used to determine
106
+ # if the sanitizer would be called. However, encoding turns up blank
107
+ # during rspec tests for some reason.
81
108
  encoding = read_results[tag].encoding
82
- if encoding
83
- config = @config.send(tag)
84
- Sanitizer.new(html, config, path: path).sanitize
109
+ if encoding || html.length.positive?
110
+ section = @config.send(tag, true)
111
+ Sanitizer.new(html, section, path: path).sanitize
85
112
  else
86
113
  html
87
114
  end
88
115
  end
89
116
  end
90
117
 
91
- # Process a set of read results
118
+ ##
119
+ # Process a set of read results.
120
+ #
121
+ # This is the callback that processes items fetched by the Fetcher.
92
122
  def process_results(path, read_results)
93
- if (error = (read_results[:before].error || read_results[:after].error))
123
+ error = (read_results[:before].error || read_results[:after].error)
124
+ if error
94
125
  diff = Result.new(path, nil, nil, nil, nil, error)
95
126
  else
96
127
  begin
97
- diff = Result.new(path,
98
- *sanitize(path, read_results),
99
- read_results[:before].encoding,
100
- read_results[:after].encoding,
101
- nil)
102
- rescue => e
128
+ diff = Result.new(
129
+ path,
130
+ *sanitize(path, read_results),
131
+ read_results[:before].encoding,
132
+ read_results[:after].encoding,
133
+ nil
134
+ )
135
+ rescue StandardError => e
103
136
  raise if @debug
104
137
 
105
138
  Result.new(path, nil, nil, nil, nil, "Sanitization error: #{e}")
@@ -114,60 +147,72 @@ class SiteDiff
114
147
  end
115
148
  end
116
149
 
117
- # Perform the comparison, populate @results and return the number of failing
118
- # paths (paths with non-zero diff).
119
- def run(curl_opts = {}, debug = true)
150
+ ##
151
+ # Compute diff as per config.
152
+ #
153
+ # @return [Integer]
154
+ # Number of paths which have diffs.
155
+ def run
120
156
  # Map of path -> Result object, populated by process_results
121
157
  @results = {}
122
158
  @ordered = @config.paths.dup
123
159
 
124
160
  unless @cache.read_tags.empty?
125
- SiteDiff.log('Using sites from cache: ' +
126
- @cache.read_tags.sort.join(', '))
161
+ SiteDiff.log('Using sites from cache: ' + @cache.read_tags.sort.join(', '))
127
162
  end
128
163
 
129
164
  # TODO: Fix this after config merge refactor!
130
165
  # Not quite right. We are not passing @config.before or @config.after
131
166
  # so passing this instead but @config.after['curl_opts'] is ignored.
167
+ curl_opts = @config.setting :curl_opts
132
168
  config_curl_opts = @config.before['curl_opts']
133
169
  curl_opts = config_curl_opts.clone.merge(curl_opts) if config_curl_opts
134
- fetcher = Fetch.new(@cache, @config.paths, @interval, @concurrency, curl_opts, debug,
135
- before: before, after: after)
170
+ fetcher = Fetch.new(
171
+ @cache,
172
+ @config.paths,
173
+ @config.setting(:interval),
174
+ @config.setting(:concurrency),
175
+ curl_opts,
176
+ @debug,
177
+ before: @config.before_url,
178
+ after: @config.after_url
179
+ )
180
+
181
+ # Run the Fetcher with "process results" as a callback.
136
182
  fetcher.run(&method(:process_results))
137
183
 
138
184
  # Order by original path order
139
- @results = @config.paths.map { |p| @results[p] }
185
+ @results = @config.paths.map { |path| @results[path] }
140
186
  results.map { |r| r unless r.success? }.compact.length
141
187
  end
142
188
 
143
- # Dump results to disk
144
- def dump(dir, report_before, report_after)
145
- report_before ||= before
146
- report_after ||= after
147
- dir = Pathname.new(dir)
148
- dir.mkpath unless dir.directory?
149
-
150
- # store diffs of each failing case, first wipe out existing diffs
151
- diff_dir = dir + DIFFS_DIR
152
- diff_dir.rmtree if diff_dir.exist?
153
- results.each { |r| r.dump(dir) if r.status == Result::STATUS_FAILURE }
154
- SiteDiff.log "All diff files were dumped inside #{dir.expand_path}"
155
-
156
- # store failing paths
157
- failures = dir + FAILURES_FILE
158
- SiteDiff.log "Writing failures to #{failures.expand_path}"
159
- failures.open('w') do |f|
160
- results.each { |r| f.puts r.path unless r.success? }
189
+ ##
190
+ # Get a reporter object to help with report generation.
191
+ def report
192
+ if @results.nil?
193
+ raise SiteDiffException(
194
+ 'No results detected. Run SiteDiff.run before SiteDiff.report.'
195
+ )
161
196
  end
162
197
 
163
- # create report of results
164
- report = Diff.generate_html_report(results, report_before, report_after,
165
- @cache)
166
- dir.+(REPORT_FILE).open('w') { |f| f.write(report) }
198
+ Report.new(@config, @cache, @results)
199
+ end
200
+
201
+ ##
202
+ # Get SiteDiff gemspec.
203
+ def self.gemspec
204
+ file = ROOT_DIR + '/sitediff.gemspec'
205
+ Gem::Specification.load(file)
206
+ end
167
207
 
168
- # serve some settings
169
- settings = { 'before' => report_before, 'after' => report_after,
170
- 'cached' => %w[before after] }
171
- dir.+(SETTINGS_FILE).open('w') { |f| YAML.dump(settings, f) }
208
+ ##
209
+ # Ensures that a directory exists and returns a Pathname for it.
210
+ #
211
+ # @param [String] dir
212
+ # path/to/directory
213
+ def self.ensure_dir(dir)
214
+ dir = Pathname.new(dir) unless dir.is_a? Pathname
215
+ dir.mkpath unless dir.directory?
216
+ dir
172
217
  end
173
218
  end
@@ -4,28 +4,42 @@ require 'set'
4
4
  require 'fileutils'
5
5
 
6
6
  class SiteDiff
7
+ # SiteDiff Cache Handler.
7
8
  class Cache
8
9
  attr_accessor :read_tags, :write_tags
9
10
 
11
+ ##
12
+ # Creates a Cache object.
10
13
  def initialize(opts = {})
11
14
  @create = opts[:create]
12
15
 
13
- # Read and Write tags are sets that can contain :before and :after
14
- # They indicate whether we should use the cache for reading or writing
16
+ # Read and Write tags are sets that can contain :before and :after.
17
+ # They indicate whether we should use the cache for reading or writing.
15
18
  @read_tags = Set.new
16
19
  @write_tags = Set.new
20
+
21
+ # The directory used by the cache for storage.
17
22
  @dir = opts[:directory] || '.'
18
23
  end
19
24
 
25
+ ##
20
26
  # Is a tag cached?
27
+ # TODO: Rename it to is_cached? as it makes more sense.
21
28
  def tag?(tag)
22
29
  File.directory?(File.join(@dir, 'snapshot', tag.to_s))
23
30
  end
24
31
 
32
+ ##
33
+ # Get data from cache.
25
34
  def get(tag, path)
26
35
  return nil unless @read_tags.include? tag
27
36
 
28
- filename = File.join(@dir, 'snapshot', tag.to_s, *path.split(File::SEPARATOR))
37
+ filename = File.join(
38
+ @dir,
39
+ 'snapshot',
40
+ tag.to_s,
41
+ *path.split(File::SEPARATOR)
42
+ )
29
43
 
30
44
  filename = File.join(filename, 'index.html') if File.directory?(filename)
31
45
  return nil unless File.file? filename
@@ -33,10 +47,17 @@ class SiteDiff
33
47
  Marshal.load(File.read(filename))
34
48
  end
35
49
 
50
+ ##
51
+ # Set data to cache.
36
52
  def set(tag, path, result)
37
53
  return unless @write_tags.include? tag
38
54
 
39
- filename = File.join(@dir, 'snapshot', tag.to_s, *path.split(File::SEPARATOR))
55
+ filename = File.join(
56
+ @dir,
57
+ 'snapshot',
58
+ tag.to_s,
59
+ *path.split(File::SEPARATOR)
60
+ )
40
61
 
41
62
  filename = File.join(filename, 'index.html') if File.directory?(filename)
42
63
  filepath = Pathname.new(filename)
@@ -50,23 +71,31 @@ class SiteDiff
50
71
  # May cause problems if action is not atomic!
51
72
  # Move existing file to dir/index.html first
52
73
  # Not robust! Should generate an UUID or something.
53
- SiteDiff.log "Overwriting file #{tempname}", :warn if File.exist?(tempname)
74
+ if File.exist?(tempname)
75
+ SiteDiff.log "Overwriting file #{tempname}", :warning
76
+ end
54
77
  curdir.rename(tempname)
55
78
  filepath.dirname.mkpath
56
79
  # Should only happen in strange situations such as when the path
57
80
  # is foo/index.html/bar (i.e., index.html is a directory)
58
- SiteDiff.log "Overwriting file #{tempname}", :warn if (curdir + 'index.html').exist?
81
+ if (curdir + 'index.html').exist?
82
+ SiteDiff.log "Overwriting file #{tempname}", :warning
83
+ end
59
84
  tempname.rename(curdir + 'index.html')
60
85
  end
61
86
  end
62
87
  File.open(filename, 'w') { |file| file.write(Marshal.dump(result)) }
63
88
  end
64
89
 
90
+ ##
91
+ # TODO: Document this or remove it if unused.
65
92
  def key(tag, path)
66
93
  # Ensure encoding stays the same!
67
94
  Marshal.dump([tag, path.encode('UTF-8')])
68
95
  end
69
96
 
97
+ ##
98
+ # Ensures that a directory exists.
70
99
  def get_dir(directory)
71
100
  # Create the dir. Must go before cache initialization!
72
101
  @dir = Pathname.new(directory || '.')
@@ -5,32 +5,32 @@ require 'sitediff'
5
5
  require 'sitediff/cache'
6
6
  require 'sitediff/config'
7
7
  require 'sitediff/config/creator'
8
+ require 'sitediff/config/preset'
8
9
  require 'sitediff/fetch'
9
10
  require 'sitediff/webserver/resultserver'
10
11
 
11
12
  class SiteDiff
13
+ # SiteDiff CLI.
14
+ # TODO: Use config.defaults to feed default values for sitediff.yaml params?
12
15
  class Cli < Thor
13
16
  class_option 'directory',
14
17
  type: :string,
15
18
  aliases: '-C',
16
19
  default: 'sitediff',
17
20
  desc: 'Configuration directory'
18
- class_option :curl_options,
19
- type: :hash,
20
- default: {},
21
- desc: 'Options to be passed to curl'
22
- class_option :insecure,
21
+ class_option :debug,
23
22
  type: :boolean,
23
+ aliases: '-d',
24
24
  default: false,
25
- desc: 'Ignore many HTTPS/SSL errors'
26
- class_option :debug,
25
+ desc: 'Stop on certain errors and produce error trace backs.'
26
+ class_option 'verbose',
27
27
  type: :boolean,
28
+ aliases: '-v',
28
29
  default: false,
29
- desc: 'Debug mode. Stop on certain errors and produce a traceback.'
30
- class_option :interval,
31
- type: :numeric,
32
- default: 0,
33
- desc: 'Crawling delay - interval in milliseconds'
30
+ desc: 'Show verbose output in terminal'
31
+
32
+ # Command aliases.
33
+ map recrawl: :crawl
34
34
 
35
35
  # Thor, by default, exits with 0 no matter what!
36
36
  def self.exit_on_failure?
@@ -42,6 +42,20 @@ class SiteDiff
42
42
  true
43
43
  end
44
44
 
45
+ desc 'version', 'Show version information'
46
+ ##
47
+ # Show version information.
48
+ def version
49
+ gemspec = SiteDiff.gemspec
50
+ output = []
51
+ output.push("Sitediff CLI #{gemspec.version}")
52
+ if options[:verbose]
53
+ output.push('Website: ' + gemspec.homepage)
54
+ output.push('GitHub: ' + gemspec.metadata['source_code_uri'])
55
+ end
56
+ puts output.join("\n")
57
+ end
58
+
45
59
  option 'paths-file',
46
60
  type: :string,
47
61
  desc: 'Paths are read (one at a line) from PATHS: ' \
@@ -53,79 +67,118 @@ class SiteDiff
53
67
  desc: 'Specific path or paths to fetch'
54
68
  option 'before',
55
69
  type: :string,
56
- desc: 'URL used to fetch the before HTML. Acts as a prefix to specified paths',
70
+ desc: 'URL to the "before" site, prefixed to all paths.',
57
71
  aliases: '--before-url'
58
72
  option 'after',
59
73
  type: :string,
60
- desc: 'URL used to fetch the after HTML. Acts as a prefix to specified paths.',
74
+ desc: 'URL to the "after" site, prefixed to all paths.',
61
75
  aliases: '--after-url'
76
+ option 'report-format',
77
+ type: :string,
78
+ enum: %w[html json],
79
+ default: 'html',
80
+ desc: 'The format in which a report should be generated.'
81
+ # TODO: Deprecate the parameters before-report / after-report?
62
82
  option 'before-report',
63
83
  type: :string,
64
- desc: 'Before URL to use for reporting purposes. Useful if port forwarding.',
84
+ desc: 'URL to use in reports. Useful if port forwarding.',
65
85
  aliases: '--before-url-report'
66
86
  option 'after-report',
67
87
  type: :string,
68
- desc: 'After URL to use for reporting purposes. Useful if port forwarding.',
88
+ desc: 'URL to use in reports. Useful if port forwarding.',
69
89
  aliases: '--after-url-report'
70
90
  option 'cached',
71
91
  type: :string,
72
92
  enum: %w[none all before after],
73
93
  default: 'before',
74
94
  desc: 'Use the cached version of these sites, if available.'
75
- option 'verbose',
95
+ option 'ignore-whitespace',
76
96
  type: :boolean,
77
- aliases: '-v',
78
97
  default: false,
79
- desc: 'Show differences between versions for each page in terminal'
80
- option :concurrency,
81
- type: :numeric,
82
- default: 3,
83
- desc: 'Max number of concurrent connections made'
84
- desc 'diff [OPTIONS] [CONFIGFILES]', 'Perform systematic diff on given URLs'
85
- def diff(*config_files)
86
- @interval = options['interval']
87
- check_interval(@interval)
98
+ aliases: '-w',
99
+ desc: 'Ignore changes in whitespace.'
100
+ option 'export',
101
+ type: :boolean,
102
+ default: false,
103
+ aliases: '-e',
104
+ desc: 'Export report to files. This option forces HTML format.'
105
+ desc 'diff [OPTIONS] [CONFIG-FILE]',
106
+ 'Compute diffs on configured URLs.'
107
+ ##
108
+ # Computes diffs.
109
+ def diff(config_file = nil)
88
110
  @dir = get_dir(options['directory'])
89
- config = SiteDiff::Config.new(config_files, @dir)
90
-
91
- # override config based on options
92
- paths = options['paths']
93
- if (paths_file = options['paths-file'])
94
- if paths
95
- SiteDiff.log "Can't have both --paths-file and --paths", :error
96
- exit(-1)
97
- end
111
+ config = SiteDiff::Config.new(config_file, @dir)
98
112
 
99
- paths_file = Pathname.new(paths_file).expand_path
100
- unless File.exist? paths_file
101
- raise Config::InvalidConfig,
102
- "Paths file '#{paths_file}' not found!"
103
- end
104
- SiteDiff.log "Reading paths from: #{paths_file}"
105
- config.paths = File.readlines(paths_file)
113
+ # Determine "paths" override based on options.
114
+ if options['paths'] && options['paths-file']
115
+ SiteDiff.log "Can't specify both --paths-file and --paths.", :error
116
+ exit(-1)
106
117
  end
107
- config.paths = paths if paths
108
118
 
119
+ # Ignore whitespace option.
120
+ config.ignore_whitespace = options['ignore-whitespace'] if options['ignore-whitespace']
121
+
122
+ # Export report option.
123
+ config.export = options['export']
124
+
125
+ # Apply "paths" override, if any.
126
+ config.paths = options['paths'] if options['paths']
127
+
128
+ # Determine and apply "paths-file", if "paths" is not specified.
129
+ unless options['paths']
130
+ paths_file = options['paths-file']
131
+ paths_file ||= File.join(@dir, Config::DEFAULT_PATHS_FILENAME)
132
+ paths_file = File.expand_path(paths_file)
133
+
134
+ paths_count = config.paths_file_read(paths_file)
135
+ SiteDiff.log "Read #{paths_count} paths from: #{paths_file}"
136
+ end
137
+
138
+ # TODO: Why do we allow before and after override during diff?
109
139
  config.before['url'] = options['before'] if options['before']
110
140
  config.after['url'] = options['after'] if options['after']
111
141
 
112
- # Setup cache
113
- cache = SiteDiff::Cache.new(create: options['cached'] != 'none',
114
- directory: @dir)
142
+ # Prepare cache.
143
+ cache = SiteDiff::Cache.new(
144
+ create: options['cached'] != 'none',
145
+ directory: @dir
146
+ )
115
147
  cache.read_tags << :before if %w[before all].include?(options['cached'])
116
148
  cache.read_tags << :after if %w[after all].include?(options['cached'])
117
149
  cache.write_tags << :before << :after
118
150
 
119
- sitediff = SiteDiff.new(config, cache, options[:concurrency], @interval,
120
- options['verbose'], options[:debug])
121
- num_failing = sitediff.run(get_curl_opts(options), options[:debug])
122
- exit_code = num_failing > 0 ? 2 : 0
151
+ # Run sitediff.
152
+ sitediff = SiteDiff.new(
153
+ config,
154
+ cache,
155
+ options['verbose'],
156
+ options[:debug]
157
+ )
158
+ num_failing = sitediff.run
159
+ exit_code = num_failing.positive? ? 2 : 0
160
+
161
+ # Generate HTML report.
162
+ if options['report-format'] == 'html' || config.export
163
+ sitediff.report.generate_html(
164
+ @dir,
165
+ options['before-report'],
166
+ options['after-report']
167
+ )
168
+ end
123
169
 
124
- sitediff.dump(@dir, options['before-report'],
125
- options['after-report'])
170
+ # Generate JSON report.
171
+ if options['report-format'] == 'json' && config.export == false
172
+ sitediff.report.generate_json @dir
173
+ end
174
+
175
+ SiteDiff.log 'Run "sitediff serve" to see a report.' unless options['export']
126
176
  rescue Config::InvalidConfig => e
127
177
  SiteDiff.log "Invalid configuration: #{e.message}", :error
128
- SiteDiff.log "at #{e.backtrace}", :error
178
+ SiteDiff.log e.backtrace, :error if options[:verbose]
179
+ rescue Config::ConfigNotFound => e
180
+ SiteDiff.log "Invalid configuration: #{e.message}", :error
181
+ SiteDiff.log e.backtrace, :error if options[:verbose]
129
182
  else # no exception was raised
130
183
  # Thor::Error --> exit(1), guaranteed by exit_on_failure?
131
184
  # Failing diff --> exit(2), populated above
@@ -140,11 +193,14 @@ class SiteDiff
140
193
  type: :boolean,
141
194
  default: true,
142
195
  desc: 'Whether to open the served content in your browser'
143
- desc 'serve [OPTIONS]', 'Serve the sitediff output directory over HTTP'
144
- def serve(*config_files)
145
- config = SiteDiff::Config.new(config_files, options['directory'])
146
- # Could check non-empty config here but currently errors are already raised.
196
+ desc 'serve [OPTIONS] [CONFIG-FILE]',
197
+ 'Serve SiteDiff report directory over HTTP.'
198
+ ##
199
+ # Serves SiteDiff report for accessing in the browser.
200
+ def serve(config_file = nil)
147
201
  @dir = get_dir(options['directory'])
202
+ config = SiteDiff::Config.new(config_file, @dir)
203
+
148
204
  cache = Cache.new(directory: @dir)
149
205
  cache.read_tags << :before << :after
150
206
 
@@ -157,85 +213,102 @@ class SiteDiff
157
213
  ).wait
158
214
  rescue SiteDiffException => e
159
215
  SiteDiff.log e.message, :error
160
- SiteDiff.log e.backtrace, :error
216
+ SiteDiff.log e.backtrace, :error if options[:verbose]
161
217
  end
162
218
 
163
219
  option :depth,
164
220
  type: :numeric,
165
- default: 3,
221
+ default: Config::DEFAULT_CONFIG['settings']['depth'],
166
222
  desc: 'How deeply to crawl the given site'
167
- option :rules,
223
+ option :crawl,
224
+ type: :boolean,
225
+ default: true,
226
+ desc: 'Run "sitediff crawl" to discover paths.'
227
+ option :preset,
168
228
  type: :string,
169
- enum: %w[yes no disabled],
170
- default: 'disabled',
171
- desc: 'Whether rules for the site should be auto-created'
229
+ enum: Config::Preset.all,
230
+ desc: 'Framework-specific presets to apply.'
172
231
  option :concurrency,
173
232
  type: :numeric,
174
- default: 3,
175
- desc: 'Max number of concurrent connections made'
233
+ default: Config::DEFAULT_CONFIG['settings']['concurrency'],
234
+ desc: 'Max number of concurrent connections made.'
235
+ option :interval,
236
+ type: :numeric,
237
+ default: Config::DEFAULT_CONFIG['settings']['interval'],
238
+ desc: 'Crawling delay - interval in milliseconds.'
176
239
  option :whitelist,
177
240
  type: :string,
178
- default: '',
179
- desc: 'Optional whitelist for crawling'
241
+ default: Config::DEFAULT_CONFIG['settings']['whitelist'],
242
+ desc: 'Optional whitelist for crawling.'
180
243
  option :blacklist,
181
244
  type: :string,
182
- default: '',
183
- desc: 'Optional blacklist for crawling'
184
- desc 'init URL [URL]', 'Create a sitediff configuration'
245
+ default: Config::DEFAULT_CONFIG['settings']['blacklist'],
246
+ desc: 'Optional blacklist for crawling.'
247
+ # TODO: Remove this option. Always ignore SSL errors.
248
+ option :insecure,
249
+ type: :boolean,
250
+ default: false,
251
+ desc: 'Ignore many HTTPS/SSL errors'
252
+ option :curl_options,
253
+ type: :hash,
254
+ default: {},
255
+ desc: 'Options to be passed to curl'
256
+ desc 'init URL [URL]', 'Create a sitediff configuration.'
257
+ ##
258
+ # Initializes a sitediff (yaml) configuration file.
185
259
  def init(*urls)
186
260
  unless (1..2).cover? urls.size
187
261
  SiteDiff.log 'sitediff init requires one or two URLs', :error
188
262
  exit(2)
189
263
  end
190
264
 
191
- @interval = options['interval']
192
- check_interval(@interval)
265
+ # Prepare a config object and write it to the file system.
193
266
  @dir = get_dir(options['directory'])
194
- curl_opts = get_curl_opts(options)
195
- @whitelist = create_regexp(options['whitelist'])
196
- @blacklist = create_regexp(options['blacklist'])
197
- creator = SiteDiff::Config::Creator.new(options[:concurrency],
198
- options['interval'],
199
- @whitelist,
200
- @blacklist,
201
- curl_opts,
202
- options[:debug],
203
- *urls)
267
+ creator = SiteDiff::Config::Creator.new(options[:debug], *urls)
204
268
  creator.create(
205
269
  depth: options[:depth],
206
270
  directory: @dir,
207
- rules: options[:rules] != 'no',
208
- rules_disabled: (options[:rules] == 'disabled')
209
- ) do |_tag, info|
210
- SiteDiff.log "Visited #{info.uri}, cached"
211
- end
212
-
271
+ concurrency: options[:concurrency],
272
+ interval: options[:interval],
273
+ whitelist: Config.create_regexp(options['whitelist']),
274
+ blacklist: Config.create_regexp(options['blacklist']),
275
+ preset: options[:preset],
276
+ curl_opts: get_curl_opts(options)
277
+ )
213
278
  SiteDiff.log "Created #{creator.config_file.expand_path}", :success
214
- SiteDiff.log "You can now run 'sitediff diff'", :success
279
+
280
+ # Discover paths, if enabled.
281
+ if options[:crawl]
282
+ crawl(creator.config_file)
283
+ SiteDiff.log 'You can now run "sitediff diff".', :success
284
+ else
285
+ SiteDiff.log 'Run "sitediff crawl" to discover paths. You should then be able to run "sitediff diff".', :info
286
+ end
215
287
  end
216
288
 
217
289
  option :url,
218
290
  type: :string,
219
291
  desc: 'A custom base URL to fetch from'
220
- option :concurrency,
221
- type: :numeric,
222
- default: 3,
223
- desc: 'Max number of concurrent connections made'
224
- desc 'store [CONFIGFILES]',
225
- 'Cache the current contents of a site for later comparison'
226
- def store(*config_files)
292
+ desc 'store [CONFIG-FILE]',
293
+ 'Cache the current contents of a site for later comparison.'
294
+ ##
295
+ # Caches the current version of the site.
296
+ def store(config_file = nil)
227
297
  @dir = get_dir(options['directory'])
228
- config = SiteDiff::Config.new(config_files, @dir)
298
+ config = SiteDiff::Config.new(config_file, @dir)
299
+ # TODO: Figure out how to remove this config.validate call.
229
300
  config.validate(need_before: false)
301
+ config.paths_file_read
302
+
230
303
  cache = SiteDiff::Cache.new(directory: @dir, create: true)
231
304
  cache.write_tags << :before
232
305
 
233
306
  base = options[:url] || config.after['url']
234
307
  fetcher = SiteDiff::Fetch.new(cache,
235
308
  config.paths,
236
- options[:interval],
237
- options[:concurrency],
238
- get_curl_opts(options),
309
+ config.setting(:interval),
310
+ config.setting(:concurrency),
311
+ get_curl_opts(config.settings),
239
312
  options[:debug],
240
313
  before: base)
241
314
  fetcher.run do |path, _res|
@@ -243,11 +316,70 @@ class SiteDiff
243
316
  end
244
317
  end
245
318
 
319
+ desc 'crawl [CONFIG-FILE]',
320
+ 'Crawl the "before" site to discover paths.'
321
+ ##
322
+ # Crawls the "before" site to determine "paths".
323
+ #
324
+ # TODO: Move actual crawling to sitediff.crawl(config).
325
+ # TODO: Switch to paths = sitediff.crawl().
326
+ def crawl(config_file = nil)
327
+ # Prepare configuration.
328
+ @dir = get_dir(options['directory'])
329
+ @config = SiteDiff::Config.new(config_file, @dir)
330
+
331
+ # Prepare cache.
332
+ @cache = SiteDiff::Cache.new(
333
+ create: options['cached'] != 'none',
334
+ directory: @dir
335
+ )
336
+ @cache.write_tags << :before << :after
337
+
338
+ # Crawl with Hydra to discover paths.
339
+ hydra = Typhoeus::Hydra.new(
340
+ max_concurrency: @config.setting(:concurrency)
341
+ )
342
+ @paths = {}
343
+ @config.roots.each do |tag, url|
344
+ Crawler.new(
345
+ hydra,
346
+ url,
347
+ @config.setting(:interval),
348
+ @config.setting(:whitelist),
349
+ @config.setting(:blacklist),
350
+ @config.setting(:depth),
351
+ get_curl_opts(@config.settings),
352
+ @debug
353
+ ) do |info|
354
+ SiteDiff.log "Visited #{info.uri}, cached."
355
+ after_crawl(tag, info)
356
+ end
357
+ end
358
+ hydra.run
359
+
360
+ # Write paths to a file.
361
+ @paths = @paths.values.reduce(&:|).to_a.sort
362
+ @config.paths_file_write(@paths)
363
+
364
+ # Log output.
365
+ file = Pathname.new(@dir) + Config::DEFAULT_PATHS_FILENAME
366
+ SiteDiff.log ''
367
+ SiteDiff.log "#{@paths.length} page(s) found."
368
+ SiteDiff.log "Created #{file.expand_path}.", :success, 'done'
369
+ end
370
+
246
371
  no_commands do
372
+ # Generates CURL options.
373
+ #
374
+ # TODO: This should be in the config class instead.
375
+ # TODO: Make all requests insecure and avoid custom curl-opts.
247
376
  def get_curl_opts(options)
248
377
  # We do want string keys here
249
378
  bool_hash = { 'true' => true, 'false' => false }
250
- curl_opts = UriWrapper::DEFAULT_CURL_OPTS.clone.merge(options[:curl_options])
379
+ curl_opts = UriWrapper::DEFAULT_CURL_OPTS
380
+ .clone
381
+ .merge(options['curl_options'] || {})
382
+ .merge(options['curl_opts'] || {})
251
383
  curl_opts.each { |k, v| curl_opts[k] = bool_hash.fetch(v, v) }
252
384
  if options[:insecure]
253
385
  curl_opts[:ssl_verifypeer] = false
@@ -256,13 +388,8 @@ class SiteDiff
256
388
  curl_opts
257
389
  end
258
390
 
259
- def check_interval(interval)
260
- if interval != 0 && options[:concurrency] != 1
261
- SiteDiff.log '--concurrency must be set to 1 in order to enable the interval feature'
262
- exit(2)
263
- end
264
- end
265
-
391
+ ##
392
+ # Ensures that the given directory exists.
266
393
  def get_dir(directory)
267
394
  # Create the dir. Must go before cache initialization!
268
395
  @dir = Pathname.new(directory || '.')
@@ -270,16 +397,24 @@ class SiteDiff
270
397
  @dir.to_s
271
398
  end
272
399
 
273
- def create_regexp(string_param)
274
- begin
275
- @return_value = string_param == '' ? nil : Regexp.new(string_param)
276
- rescue SiteDiffException => e
277
- @return_value = nil
278
- SiteDiff.log 'whitelist and blacklist parameters must be valid regular expressions', :error
279
- SiteDiff.log e.message, :error
280
- SiteDiff.log e.backtrace, :error
281
- end
282
- return @return_value
400
+ ##
401
+ # Processes a crawled path.
402
+ def after_crawl(tag, info)
403
+ path = UriWrapper.canonicalize(info.relative)
404
+
405
+ # Register the path.
406
+ @paths[tag] = [] unless @paths[tag]
407
+ @paths[tag] << path
408
+
409
+ result = info.read_result
410
+
411
+ # Write result to applicable cache.
412
+ @cache.set(tag, path, result)
413
+ # If single-site, cache "after" as "before".
414
+ @cache.set(:before, path, result) unless @config.roots[:before]
415
+
416
+ # TODO: Restore application of rules.
417
+ # @rules.handle_page(tag, res.content, info.document) if @rules && !res.error
283
418
  end
284
419
  end
285
420
  end