sitediff 0.0.2 → 1.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,61 +1,124 @@
1
+ # frozen_string_literal: true
2
+
1
3
  require 'set'
4
+ require 'fileutils'
2
5
 
3
6
  class SiteDiff
4
- class Cache
5
- DEFAULT_FILENAME = 'cache.db'
7
+ # SiteDiff Cache Handler.
8
+ class Cache
9
+ TIMESTAMP_FILE = 'timestamp'
6
10
 
7
- attr_accessor :read_tags, :write_tags
11
+ attr_accessor :read_tags, :write_tags
8
12
 
9
- def initialize(opts = {})
10
- @file = opts[:file] || DEFAULT_FILENAME
11
- @create = opts[:create]
12
- @read_tags = Set.new
13
- @write_tags = Set.new
14
- end
13
+ ##
14
+ # Creates a Cache object.
15
+ def initialize(opts = {})
16
+ @create = opts[:create]
15
17
 
16
- def close; @dbm.close if defined? @dbm; end
18
+ # Read and Write tags are sets that can contain :before and :after.
19
+ # They indicate whether we should use the cache for reading or writing.
20
+ @read_tags = Set.new
21
+ @write_tags = Set.new
22
+ @timestamp_flag = { before: false, after: false }
17
23
 
18
- # Is a tag cached?
19
- def tag?(tag)
20
- open
21
- @dbm[tag.to_s]
22
- end
24
+ # The directory used by the cache for storage.
25
+ @dir = opts[:directory] || '.'
26
+ end
23
27
 
24
- def get(tag, path)
25
- return nil unless @read_tags.include? tag
26
- open or return nil
27
- val = @dbm[key(tag, path)]
28
- return val && Marshal.load(val)
29
- end
28
+ ##
29
+ # Is a tag cached?
30
+ # TODO: Rename it to is_cached? as it makes more sense.
31
+ def tag?(tag)
32
+ File.directory?(File.join(@dir, 'snapshot', tag.to_s))
33
+ end
30
34
 
31
- def set(tag, path, result)
32
- return unless @write_tags.include? tag
33
- open or return
34
- @dbm[tag.to_s] = 'TRUE'
35
- @dbm[key(tag, path)] = Marshal.dump(result)
36
- end
35
+ ##
36
+ # Get data from cache.
37
+ def get(tag, path)
38
+ return nil unless @read_tags.include? tag
37
39
 
38
- private
39
- def key(tag, path)
40
- # Ensure encoding stays the same!
41
- Marshal.dump([tag, path.encode('UTF-8')])
42
- end
40
+ filename = File.join(
41
+ @dir,
42
+ 'snapshot',
43
+ tag.to_s,
44
+ *path.split(File::SEPARATOR)
45
+ )
46
+
47
+ filename = File.join(filename, 'index.html') if File.directory?(filename)
48
+ return nil unless File.file? filename
49
+
50
+ Marshal.load(File.read(filename))
51
+ end
43
52
 
44
- # Ensure the DB is open
45
- def open
46
- # DBM adds an extra .db, ugh
47
- return false unless @create || File.exist?(@file) ||
48
- File.exist?(@file + '.db')
49
- return true if defined? @dbm
50
-
51
- begin
52
- require 'gdbm'
53
- @dbm = GDBM.new(@file)
54
- rescue LoadError
55
- require 'dbm'
56
- @dbm = DBM.new(@file)
53
+ ##
54
+ # Set data to cache.
55
+ def set(tag, path, result)
56
+ return unless @write_tags.include? tag
57
+
58
+ save_timestamp(tag)
59
+ filename = File.join(
60
+ @dir,
61
+ 'snapshot',
62
+ tag.to_s,
63
+ *path.split(File::SEPARATOR)
64
+ )
65
+
66
+ filename = File.join(filename, 'index.html') if File.directory?(filename)
67
+ filepath = Pathname.new(filename)
68
+ unless filepath.dirname.directory?
69
+ begin
70
+ filepath.dirname.mkpath
71
+ rescue Errno::EEXIST
72
+ curdir = filepath
73
+ curdir = curdir.parent until curdir.exist?
74
+ tempname = curdir.dirname + (curdir.basename.to_s + '.temporary')
75
+ # May cause problems if action is not atomic!
76
+ # Move existing file to dir/index.html first
77
+ # Not robust! Should generate an UUID or something.
78
+ if File.exist?(tempname)
79
+ SiteDiff.log "Overwriting file #{tempname}", :warning
80
+ end
81
+ curdir.rename(tempname)
82
+ filepath.dirname.mkpath
83
+ # Should only happen in strange situations such as when the path
84
+ # is foo/index.html/bar (i.e., index.html is a directory)
85
+ if (curdir + 'index.html').exist?
86
+ SiteDiff.log "Overwriting file #{tempname}", :warning
87
+ end
88
+ tempname.rename(curdir + 'index.html')
89
+ end
90
+ end
91
+ File.open(filename, 'w') { |file| file.write(Marshal.dump(result)) }
92
+ end
93
+
94
+ ##
95
+ # TODO: Document this or remove it if unused.
96
+ def key(tag, path)
97
+ # Ensure encoding stays the same!
98
+ Marshal.dump([tag, path.encode('UTF-8')])
99
+ end
100
+
101
+ ##
102
+ # Ensures that a directory exists.
103
+ def get_dir(directory)
104
+ # Create the dir. Must go before cache initialization!
105
+ @dir = Pathname.new(directory || '.')
106
+ @dir.mkpath unless @dir.directory?
107
+ @dir.to_s
108
+ end
109
+
110
+ private
111
+
112
+ def save_timestamp(tag)
113
+ # run once
114
+ return if @timestamp_flag[tag]
115
+
116
+ @timestamp_flag[tag] = true
117
+ cache_dir = File.join(@dir, 'snapshot', tag.to_s)
118
+ if File.exist? cache_dir
119
+ file = File.join(cache_dir, TIMESTAMP_FILE)
120
+ FileUtils.touch(file)
121
+ end
57
122
  end
58
- return true
59
123
  end
60
124
  end
61
- end
@@ -1,17 +1,30 @@
1
+ # frozen_string_literal: true
2
+
1
3
  require 'thor'
2
4
  require 'sitediff'
3
- require 'sitediff/cache'
4
- require 'sitediff/config'
5
- require 'sitediff/config/creator'
6
- require 'sitediff/fetch'
7
- require 'sitediff/webserver/resultserver'
5
+ require 'sitediff/api'
8
6
 
9
7
  class SiteDiff
8
+ # SiteDiff CLI.
10
9
  class Cli < Thor
11
10
  class_option 'directory',
12
- :type => :string,
13
- :aliases => '-C',
14
- :desc => "Go to a given directory before running."
11
+ type: :string,
12
+ aliases: '-C',
13
+ default: 'sitediff',
14
+ desc: 'Configuration directory'
15
+ class_option :debug,
16
+ type: :boolean,
17
+ aliases: '-d',
18
+ default: false,
19
+ desc: 'Stop on certain errors and produce error trace backs.'
20
+ class_option 'verbose',
21
+ type: :boolean,
22
+ aliases: '-v',
23
+ default: false,
24
+ desc: 'Show verbose output in terminal'
25
+
26
+ # Command aliases.
27
+ map recrawl: :crawl
15
28
 
16
29
  # Thor, by default, exits with 0 no matter what!
17
30
  def self.exit_on_failure?
@@ -19,192 +32,233 @@ class SiteDiff
19
32
  end
20
33
 
21
34
  # Thor, by default, does not raise an error for use of unknown options.
22
- def self.check_unknown_options?(config)
35
+ def self.check_unknown_options?(_config)
23
36
  true
24
37
  end
25
38
 
26
- option 'dump-dir',
27
- :type => :string,
28
- :default => File.join('.', 'output'),
29
- :desc => "Location to write the output to."
39
+ desc 'version', 'Show version information'
40
+ ##
41
+ # Show version information.
42
+ def version
43
+ gemspec = SiteDiff.gemspec
44
+ output = []
45
+ output.push("Sitediff CLI #{gemspec.version}")
46
+ if options[:verbose]
47
+ output.push('Website: ' + gemspec.homepage)
48
+ output.push('GitHub: ' + gemspec.metadata['source_code_uri'])
49
+ end
50
+ puts output.join("\n")
51
+ end
52
+
30
53
  option 'paths-file',
31
- :type => :string,
32
- :desc => 'Paths are read (one at a line) from PATHS: ' +
33
- 'useful for iterating over sanitization rules',
34
- :aliases => '--paths-from-file'
54
+ type: :string,
55
+ desc: 'Paths are read (one at a line) from PATHS: ' \
56
+ 'useful for iterating over sanitization rules',
57
+ aliases: '--paths-from-file'
35
58
  option 'paths',
36
- :type => :array,
37
- :aliases => '-p',
38
- :desc => "Fetch only these specific paths"
59
+ type: :array,
60
+ aliases: '-p',
61
+ desc: 'Specific path or paths to fetch'
39
62
  option 'before',
40
- :type => :string,
41
- :desc => "URL used to fetch the before HTML. Acts as a prefix to specified paths",
42
- :aliases => '--before-url'
63
+ type: :string,
64
+ desc: 'URL to the "before" site, prefixed to all paths.',
65
+ aliases: '--before-url'
43
66
  option 'after',
44
- :type => :string,
45
- :desc => "URL used to fetch the after HTML. Acts as a prefix to specified paths.",
46
- :aliases => '--after-url'
67
+ type: :string,
68
+ desc: 'URL to the "after" site, prefixed to all paths.',
69
+ aliases: '--after-url'
70
+ option 'report-format',
71
+ type: :string,
72
+ enum: %w[html json],
73
+ default: 'html',
74
+ desc: 'The format in which a report should be generated.'
47
75
  option 'before-report',
48
- :type => :string,
49
- :desc => "Before URL to use for reporting purposes. Useful if port forwarding.",
50
- :aliases => '--before-url-report'
76
+ type: :string,
77
+ desc: 'URL to use in reports. Useful if port forwarding.',
78
+ aliases: '--before-url-report'
51
79
  option 'after-report',
52
- :type => :string,
53
- :desc => "After URL to use for reporting purposes. Useful if port forwarding.",
54
- :aliases => '--after-url-report'
80
+ type: :string,
81
+ desc: 'URL to use in reports. Useful if port forwarding.',
82
+ aliases: '--after-url-report'
55
83
  option 'cached',
56
- :type => :string,
57
- :enum => %w[none all before after],
58
- :default => 'before',
59
- :desc => "Use the cached version of these sites, if available."
60
- option 'quiet',
61
- :type => :boolean,
62
- :aliases => '-q',
63
- :default => false,
64
- :desc => "Show the difference between versions for each page"
65
- desc "diff [OPTIONS] [CONFIGFILES]", "Perform systematic diff on given URLs"
66
- def diff(*config_files)
67
- config = chdir(config_files)
68
-
69
- # override config based on options
70
- paths = options['paths']
71
- if paths_file = options['paths-file']
72
- if paths then
73
- SiteDiff::log "Can't have both --paths-file and --paths", :error
74
- exit -1
75
- end
76
-
77
- paths_file = Pathname.new(paths_file).expand_path
78
- unless File.exists? paths_file
79
- raise Config::InvalidConfig,
80
- "Paths file '#{paths_file}' not found!"
81
- end
82
- SiteDiff::log "Reading paths from: #{paths_file}"
83
- config.paths = File.readlines(paths_file)
84
+ type: :string,
85
+ enum: %w[none all before after],
86
+ default: 'before',
87
+ desc: 'Use the cached version of these sites, if available.'
88
+ option 'ignore-whitespace',
89
+ type: :boolean,
90
+ default: false,
91
+ aliases: '-w',
92
+ desc: 'Ignore changes in whitespace.'
93
+ option 'export',
94
+ type: :boolean,
95
+ default: false,
96
+ aliases: '-e',
97
+ desc: 'Export report to files. This option forces HTML format.'
98
+ desc 'diff [OPTIONS] [CONFIG-FILE]',
99
+ 'Compute diffs on configured URLs.'
100
+ ##
101
+ # Computes diffs.
102
+ def diff(config_file = nil)
103
+ # Determine "paths" override based on options.
104
+ if options['paths'] && options['paths-file']
105
+ SiteDiff.log "Can't specify both --paths-file and --paths.", :error
106
+ exit(-1)
84
107
  end
85
- config.paths = paths if paths
86
-
87
- config.before['url'] = options['before'] if options['before']
88
- config.after['url'] = options['after'] if options['after']
89
-
90
- # Setup cache
91
- cache = SiteDiff::Cache.new(:create => options['cached'] != 'none')
92
- cache.read_tags << :before if %w[before all].include?(options['cached'])
93
- cache.read_tags << :after if %w[after all].include?(options['cached'])
94
- cache.write_tags << :before << :after
95
-
96
- sitediff = SiteDiff.new(config, cache, !options['quiet'])
97
- num_failing = sitediff.run
98
- exit_code = (num_failing > 0) ? 2 : 0;
99
108
 
100
- sitediff.dump(options['dump-dir'], options['before-report'],
101
- options['after-report'])
102
- rescue Config::InvalidConfig => e
103
- SiteDiff.log "Invalid configuration: #{e.message}", :error
104
- rescue SiteDiffException => e
105
- SiteDiff.log e.message, :error
106
- else # no exception was raised
107
- # Thor::Error --> exit(1), guaranteed by exit_on_failure?
108
- # Failing diff --> exit(2), populated above
109
- exit(exit_code)
109
+ api = Api.new(options['directory'], config_file)
110
+ api_options =
111
+ clean_keys(
112
+ options,
113
+ :paths,
114
+ :paths_file,
115
+ :ignore_whitespace,
116
+ :export,
117
+ :before,
118
+ :after,
119
+ :cached,
120
+ :verbose,
121
+ :debug,
122
+ :report_format,
123
+ :before_report,
124
+ :after_report
125
+ )
126
+ api_options[:cli_mode] = true
127
+ api.diff(api_options)
110
128
  end
111
129
 
112
130
  option :port,
113
- :type => :numeric,
114
- :default => SiteDiff::Webserver::DEFAULT_PORT,
115
- :desc => 'The port to serve on'
116
- option 'dump-dir',
117
- :type => :string,
118
- :default => 'output',
119
- :desc => 'The directory to serve'
131
+ type: :numeric,
132
+ default: SiteDiff::Webserver::DEFAULT_PORT,
133
+ desc: 'The port to serve on'
120
134
  option :browse,
121
- :type => :boolean,
122
- :default => true,
123
- :desc => "Whether to open the served content in your browser"
124
- desc "serve [OPTIONS]", "Serve the sitediff output directory over HTTP"
125
- def serve(*config_files)
126
- config = chdir(config_files, :config => false)
127
-
128
- cache = Cache.new
129
- cache.read_tags << :before << :after
130
-
131
- SiteDiff::Webserver::ResultServer.new(
132
- options[:port],
133
- options['dump-dir'],
134
- :browse => options[:browse],
135
- :cache => cache,
136
- :config => config,
137
- ).wait
135
+ type: :boolean,
136
+ default: true,
137
+ desc: 'Whether to open the served content in your browser'
138
+ desc 'serve [OPTIONS] [CONFIG-FILE]',
139
+ 'Serve SiteDiff report directory over HTTP.'
140
+ ##
141
+ # Serves SiteDiff report for accessing in the browser.
142
+ def serve(config_file = nil)
143
+ api = Api.new(options['directory'], config_file)
144
+ api_options = clean_keys(options, :browse, :port)
145
+ api.serve(api_options)
138
146
  end
139
147
 
140
- option :output,
141
- :type => :string,
142
- :default => 'sitediff',
143
- :desc => 'Where to place the configuration',
144
- :aliases => ['-o']
145
148
  option :depth,
146
- :type => :numeric,
147
- :default => 3,
148
- :desc => 'How deeply to crawl the given site'
149
- option :rules,
150
- :type => :string,
151
- :enum => %w[yes no disabled],
152
- :default => 'disabled',
153
- :desc => 'Whether rules for the site should be auto-created'
154
- desc "init URL [URL]", "Create a sitediff configuration"
149
+ type: :numeric,
150
+ default: Config::DEFAULT_CONFIG['settings']['depth'],
151
+ desc: 'How deeply to crawl the given site'
152
+ option :crawl,
153
+ type: :boolean,
154
+ default: true,
155
+ desc: 'Run "sitediff crawl" to discover paths.'
156
+ option :preset,
157
+ type: :string,
158
+ enum: Config::Preset.all,
159
+ desc: 'Framework-specific presets to apply.'
160
+ option :concurrency,
161
+ type: :numeric,
162
+ default: Config::DEFAULT_CONFIG['settings']['concurrency'],
163
+ desc: 'Max number of concurrent connections made.'
164
+ option :interval,
165
+ type: :numeric,
166
+ default: Config::DEFAULT_CONFIG['settings']['interval'],
167
+ desc: 'Crawling delay - interval in milliseconds.'
168
+ option :include,
169
+ type: :string,
170
+ default: Config::DEFAULT_CONFIG['settings']['include'],
171
+ desc: 'Optional URL include regex for crawling.'
172
+ option :exclude,
173
+ type: :string,
174
+ default: Config::DEFAULT_CONFIG['settings']['exclude'],
175
+ desc: 'Optional URL exclude regex for crawling.'
176
+ option :curl_options,
177
+ type: :hash,
178
+ default: {},
179
+ desc: 'Options to be passed to curl'
180
+ desc 'init URL [URL]', 'Create a sitediff configuration.'
181
+ ##
182
+ # Initializes a sitediff (yaml) configuration file.
155
183
  def init(*urls)
156
- unless (1..2).include? urls.size
157
- SiteDiff.log "sitediff init requires one or two URLs", :error
158
- exit 2
184
+ unless (1..2).cover? urls.size
185
+ SiteDiff.log 'sitediff init requires one or two URLs', :error
186
+ exit(2)
159
187
  end
160
-
161
- chdir([], :search => false)
162
- creator = SiteDiff::Config::Creator.new(*urls)
163
- creator.create(
164
- :depth => options[:depth],
165
- :directory => options[:output],
166
- :rules => options[:rules] != 'no',
167
- :rules_disabled => (options[:rules] == 'disabled'),
168
- ) do |tag, info|
169
- SiteDiff.log "Visited #{info.uri}, cached"
170
- end
171
-
172
- SiteDiff.log "Created #{creator.config_file.expand_path}", :success
173
- SiteDiff.log "You can now run 'sitediff diff'", :success
188
+ api_options =
189
+ clean_keys(
190
+ options,
191
+ :depth,
192
+ :concurrency,
193
+ :interval,
194
+ :include,
195
+ :exclude,
196
+ :preset,
197
+ :crawl
198
+ )
199
+ .merge(
200
+ {
201
+ after_url: urls.pop,
202
+ before_url: urls.pop, # may be nil
203
+ directory: get_dir(options['directory']),
204
+ curl_opts: get_curl_opts(options)
205
+ }
206
+ )
207
+ Api.init(api_options)
174
208
  end
175
209
 
176
210
  option :url,
177
- :type => :string,
178
- :desc => 'A custom base URL to fetch from'
179
- desc "store [CONFIGFILES]",
180
- "Cache the current contents of a site for later comparison"
181
- def store(*config_files)
182
- config = chdir(config_files)
183
- config.validate(:need_before => false)
184
-
185
- cache = SiteDiff::Cache.new(:create => true)
186
- cache.write_tags << :before
211
+ type: :string,
212
+ desc: 'A custom base URL to fetch from'
213
+ desc 'store [CONFIG-FILE]',
214
+ 'Cache the current contents of a site for later comparison.'
215
+ ##
216
+ # Caches the current version of the site.
217
+ def store(config_file = nil)
218
+ api = Api.new(options['directory'], config_file)
219
+ api_options = clean_keys(options, :url, :debug)
220
+ api.store(api_options)
221
+ end
187
222
 
188
- base = options[:url] || config.after['url']
189
- fetcher = SiteDiff::Fetch.new(cache, config.paths, :before => base)
190
- fetcher.run do |path, res|
191
- SiteDiff.log "Visited #{path}, cached"
192
- end
223
+ desc 'crawl [CONFIG-FILE]',
224
+ 'Crawl the "before" site to discover paths.'
225
+ ##
226
+ # Crawls the "before" site to determine "paths".
227
+ #
228
+ def crawl(config_file = nil)
229
+ api = Api.new(options['directory'], config_file)
230
+ api.crawl
193
231
  end
194
232
 
195
- private
196
- def chdir(files, opts = {})
197
- opts = { :config => true, :search => true }.merge(opts)
233
+ no_commands do
234
+ # Generates CURL options.
235
+ #
236
+ # TODO: Possibly move to API class.
237
+ def get_curl_opts(options)
238
+ # We do want string keys here
239
+ bool_hash = { 'true' => true, 'false' => false }
240
+ curl_opts = UriWrapper::DEFAULT_CURL_OPTS
241
+ .clone
242
+ .merge(options['curl_options'] || {})
243
+ .merge(options['curl_opts'] || {})
244
+ curl_opts.each { |k, v| curl_opts[k] = bool_hash.fetch(v, v) }
245
+ curl_opts
246
+ end
198
247
 
199
- dir = options['directory']
200
- Dir.chdir(dir) if dir
248
+ ##
249
+ # Ensures that the given directory exists.
250
+ def get_dir(directory)
251
+ # Create the dir. Must go before cache initialization!
252
+ @dir = Pathname.new(directory || '.')
253
+ @dir.mkpath unless @dir.directory?
254
+ @dir.to_s
255
+ end
201
256
 
202
- return unless opts[:search]
203
- begin
204
- SiteDiff::Config.new(files, :search => !dir)
205
- rescue SiteDiff::Config::ConfigNotFound => e
206
- raise if opts[:config]
207
- # If no config required, allow it to pass
257
+ ##
258
+ # Clean keys - return a subset of a hash with keys as symbols.
259
+ def clean_keys(hash, *keys)
260
+ new_hash = hash.transform_keys { |k| k.tr('-', '_').to_sym }
261
+ new_hash.slice(*keys)
208
262
  end
209
263
  end
210
264
  end