sitediff 0.0.2 → 1.1.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,61 +1,124 @@
1
+ # frozen_string_literal: true
2
+
1
3
  require 'set'
4
+ require 'fileutils'
2
5
 
3
6
  class SiteDiff
4
- class Cache
5
- DEFAULT_FILENAME = 'cache.db'
7
+ # SiteDiff Cache Handler.
8
+ class Cache
9
+ TIMESTAMP_FILE = 'timestamp'
6
10
 
7
- attr_accessor :read_tags, :write_tags
11
+ attr_accessor :read_tags, :write_tags
8
12
 
9
- def initialize(opts = {})
10
- @file = opts[:file] || DEFAULT_FILENAME
11
- @create = opts[:create]
12
- @read_tags = Set.new
13
- @write_tags = Set.new
14
- end
13
+ ##
14
+ # Creates a Cache object.
15
+ def initialize(opts = {})
16
+ @create = opts[:create]
15
17
 
16
- def close; @dbm.close if defined? @dbm; end
18
+ # Read and Write tags are sets that can contain :before and :after.
19
+ # They indicate whether we should use the cache for reading or writing.
20
+ @read_tags = Set.new
21
+ @write_tags = Set.new
22
+ @timestamp_flag = { before: false, after: false }
17
23
 
18
- # Is a tag cached?
19
- def tag?(tag)
20
- open
21
- @dbm[tag.to_s]
22
- end
24
+ # The directory used by the cache for storage.
25
+ @dir = opts[:directory] || '.'
26
+ end
23
27
 
24
- def get(tag, path)
25
- return nil unless @read_tags.include? tag
26
- open or return nil
27
- val = @dbm[key(tag, path)]
28
- return val && Marshal.load(val)
29
- end
28
+ ##
29
+ # Is a tag cached?
30
+ # TODO: Rename it to is_cached? as it makes more sense.
31
+ def tag?(tag)
32
+ File.directory?(File.join(@dir, 'snapshot', tag.to_s))
33
+ end
30
34
 
31
- def set(tag, path, result)
32
- return unless @write_tags.include? tag
33
- open or return
34
- @dbm[tag.to_s] = 'TRUE'
35
- @dbm[key(tag, path)] = Marshal.dump(result)
36
- end
35
+ ##
36
+ # Get data from cache.
37
+ def get(tag, path)
38
+ return nil unless @read_tags.include? tag
37
39
 
38
- private
39
- def key(tag, path)
40
- # Ensure encoding stays the same!
41
- Marshal.dump([tag, path.encode('UTF-8')])
42
- end
40
+ filename = File.join(
41
+ @dir,
42
+ 'snapshot',
43
+ tag.to_s,
44
+ *path.split(File::SEPARATOR)
45
+ )
46
+
47
+ filename = File.join(filename, 'index.html') if File.directory?(filename)
48
+ return nil unless File.file? filename
49
+
50
+ Marshal.load(File.read(filename))
51
+ end
43
52
 
44
- # Ensure the DB is open
45
- def open
46
- # DBM adds an extra .db, ugh
47
- return false unless @create || File.exist?(@file) ||
48
- File.exist?(@file + '.db')
49
- return true if defined? @dbm
50
-
51
- begin
52
- require 'gdbm'
53
- @dbm = GDBM.new(@file)
54
- rescue LoadError
55
- require 'dbm'
56
- @dbm = DBM.new(@file)
53
+ ##
54
+ # Set data to cache.
55
+ def set(tag, path, result)
56
+ return unless @write_tags.include? tag
57
+
58
+ save_timestamp(tag)
59
+ filename = File.join(
60
+ @dir,
61
+ 'snapshot',
62
+ tag.to_s,
63
+ *path.split(File::SEPARATOR)
64
+ )
65
+
66
+ filename = File.join(filename, 'index.html') if File.directory?(filename)
67
+ filepath = Pathname.new(filename)
68
+ unless filepath.dirname.directory?
69
+ begin
70
+ filepath.dirname.mkpath
71
+ rescue Errno::EEXIST
72
+ curdir = filepath
73
+ curdir = curdir.parent until curdir.exist?
74
+ tempname = curdir.dirname + (curdir.basename.to_s + '.temporary')
75
+ # May cause problems if action is not atomic!
76
+ # Move existing file to dir/index.html first
77
+ # Not robust! Should generate an UUID or something.
78
+ if File.exist?(tempname)
79
+ SiteDiff.log "Overwriting file #{tempname}", :warning
80
+ end
81
+ curdir.rename(tempname)
82
+ filepath.dirname.mkpath
83
+ # Should only happen in strange situations such as when the path
84
+ # is foo/index.html/bar (i.e., index.html is a directory)
85
+ if (curdir + 'index.html').exist?
86
+ SiteDiff.log "Overwriting file #{tempname}", :warning
87
+ end
88
+ tempname.rename(curdir + 'index.html')
89
+ end
90
+ end
91
+ File.open(filename, 'w') { |file| file.write(Marshal.dump(result)) }
92
+ end
93
+
94
+ ##
95
+ # TODO: Document this or remove it if unused.
96
+ def key(tag, path)
97
+ # Ensure encoding stays the same!
98
+ Marshal.dump([tag, path.encode('UTF-8')])
99
+ end
100
+
101
+ ##
102
+ # Ensures that a directory exists.
103
+ def get_dir(directory)
104
+ # Create the dir. Must go before cache initialization!
105
+ @dir = Pathname.new(directory || '.')
106
+ @dir.mkpath unless @dir.directory?
107
+ @dir.to_s
108
+ end
109
+
110
+ private
111
+
112
+ def save_timestamp(tag)
113
+ # run once
114
+ return if @timestamp_flag[tag]
115
+
116
+ @timestamp_flag[tag] = true
117
+ cache_dir = File.join(@dir, 'snapshot', tag.to_s)
118
+ if File.exist? cache_dir
119
+ file = File.join(cache_dir, TIMESTAMP_FILE)
120
+ FileUtils.touch(file)
121
+ end
57
122
  end
58
- return true
59
123
  end
60
124
  end
61
- end
@@ -1,17 +1,30 @@
1
+ # frozen_string_literal: true
2
+
1
3
  require 'thor'
2
4
  require 'sitediff'
3
- require 'sitediff/cache'
4
- require 'sitediff/config'
5
- require 'sitediff/config/creator'
6
- require 'sitediff/fetch'
7
- require 'sitediff/webserver/resultserver'
5
+ require 'sitediff/api'
8
6
 
9
7
  class SiteDiff
8
+ # SiteDiff CLI.
10
9
  class Cli < Thor
11
10
  class_option 'directory',
12
- :type => :string,
13
- :aliases => '-C',
14
- :desc => "Go to a given directory before running."
11
+ type: :string,
12
+ aliases: '-C',
13
+ default: 'sitediff',
14
+ desc: 'Configuration directory'
15
+ class_option :debug,
16
+ type: :boolean,
17
+ aliases: '-d',
18
+ default: false,
19
+ desc: 'Stop on certain errors and produce error trace backs.'
20
+ class_option 'verbose',
21
+ type: :boolean,
22
+ aliases: '-v',
23
+ default: false,
24
+ desc: 'Show verbose output in terminal'
25
+
26
+ # Command aliases.
27
+ map recrawl: :crawl
15
28
 
16
29
  # Thor, by default, exits with 0 no matter what!
17
30
  def self.exit_on_failure?
@@ -19,192 +32,233 @@ class SiteDiff
19
32
  end
20
33
 
21
34
  # Thor, by default, does not raise an error for use of unknown options.
22
- def self.check_unknown_options?(config)
35
+ def self.check_unknown_options?(_config)
23
36
  true
24
37
  end
25
38
 
26
- option 'dump-dir',
27
- :type => :string,
28
- :default => File.join('.', 'output'),
29
- :desc => "Location to write the output to."
39
+ desc 'version', 'Show version information'
40
+ ##
41
+ # Show version information.
42
+ def version
43
+ gemspec = SiteDiff.gemspec
44
+ output = []
45
+ output.push("Sitediff CLI #{gemspec.version}")
46
+ if options[:verbose]
47
+ output.push('Website: ' + gemspec.homepage)
48
+ output.push('GitHub: ' + gemspec.metadata['source_code_uri'])
49
+ end
50
+ puts output.join("\n")
51
+ end
52
+
30
53
  option 'paths-file',
31
- :type => :string,
32
- :desc => 'Paths are read (one at a line) from PATHS: ' +
33
- 'useful for iterating over sanitization rules',
34
- :aliases => '--paths-from-file'
54
+ type: :string,
55
+ desc: 'Paths are read (one at a line) from PATHS: ' \
56
+ 'useful for iterating over sanitization rules',
57
+ aliases: '--paths-from-file'
35
58
  option 'paths',
36
- :type => :array,
37
- :aliases => '-p',
38
- :desc => "Fetch only these specific paths"
59
+ type: :array,
60
+ aliases: '-p',
61
+ desc: 'Specific path or paths to fetch'
39
62
  option 'before',
40
- :type => :string,
41
- :desc => "URL used to fetch the before HTML. Acts as a prefix to specified paths",
42
- :aliases => '--before-url'
63
+ type: :string,
64
+ desc: 'URL to the "before" site, prefixed to all paths.',
65
+ aliases: '--before-url'
43
66
  option 'after',
44
- :type => :string,
45
- :desc => "URL used to fetch the after HTML. Acts as a prefix to specified paths.",
46
- :aliases => '--after-url'
67
+ type: :string,
68
+ desc: 'URL to the "after" site, prefixed to all paths.',
69
+ aliases: '--after-url'
70
+ option 'report-format',
71
+ type: :string,
72
+ enum: %w[html json],
73
+ default: 'html',
74
+ desc: 'The format in which a report should be generated.'
47
75
  option 'before-report',
48
- :type => :string,
49
- :desc => "Before URL to use for reporting purposes. Useful if port forwarding.",
50
- :aliases => '--before-url-report'
76
+ type: :string,
77
+ desc: 'URL to use in reports. Useful if port forwarding.',
78
+ aliases: '--before-url-report'
51
79
  option 'after-report',
52
- :type => :string,
53
- :desc => "After URL to use for reporting purposes. Useful if port forwarding.",
54
- :aliases => '--after-url-report'
80
+ type: :string,
81
+ desc: 'URL to use in reports. Useful if port forwarding.',
82
+ aliases: '--after-url-report'
55
83
  option 'cached',
56
- :type => :string,
57
- :enum => %w[none all before after],
58
- :default => 'before',
59
- :desc => "Use the cached version of these sites, if available."
60
- option 'quiet',
61
- :type => :boolean,
62
- :aliases => '-q',
63
- :default => false,
64
- :desc => "Show the difference between versions for each page"
65
- desc "diff [OPTIONS] [CONFIGFILES]", "Perform systematic diff on given URLs"
66
- def diff(*config_files)
67
- config = chdir(config_files)
68
-
69
- # override config based on options
70
- paths = options['paths']
71
- if paths_file = options['paths-file']
72
- if paths then
73
- SiteDiff::log "Can't have both --paths-file and --paths", :error
74
- exit -1
75
- end
76
-
77
- paths_file = Pathname.new(paths_file).expand_path
78
- unless File.exists? paths_file
79
- raise Config::InvalidConfig,
80
- "Paths file '#{paths_file}' not found!"
81
- end
82
- SiteDiff::log "Reading paths from: #{paths_file}"
83
- config.paths = File.readlines(paths_file)
84
+ type: :string,
85
+ enum: %w[none all before after],
86
+ default: 'before',
87
+ desc: 'Use the cached version of these sites, if available.'
88
+ option 'ignore-whitespace',
89
+ type: :boolean,
90
+ default: false,
91
+ aliases: '-w',
92
+ desc: 'Ignore changes in whitespace.'
93
+ option 'export',
94
+ type: :boolean,
95
+ default: false,
96
+ aliases: '-e',
97
+ desc: 'Export report to files. This option forces HTML format.'
98
+ desc 'diff [OPTIONS] [CONFIG-FILE]',
99
+ 'Compute diffs on configured URLs.'
100
+ ##
101
+ # Computes diffs.
102
+ def diff(config_file = nil)
103
+ # Determine "paths" override based on options.
104
+ if options['paths'] && options['paths-file']
105
+ SiteDiff.log "Can't specify both --paths-file and --paths.", :error
106
+ exit(-1)
84
107
  end
85
- config.paths = paths if paths
86
-
87
- config.before['url'] = options['before'] if options['before']
88
- config.after['url'] = options['after'] if options['after']
89
-
90
- # Setup cache
91
- cache = SiteDiff::Cache.new(:create => options['cached'] != 'none')
92
- cache.read_tags << :before if %w[before all].include?(options['cached'])
93
- cache.read_tags << :after if %w[after all].include?(options['cached'])
94
- cache.write_tags << :before << :after
95
-
96
- sitediff = SiteDiff.new(config, cache, !options['quiet'])
97
- num_failing = sitediff.run
98
- exit_code = (num_failing > 0) ? 2 : 0;
99
108
 
100
- sitediff.dump(options['dump-dir'], options['before-report'],
101
- options['after-report'])
102
- rescue Config::InvalidConfig => e
103
- SiteDiff.log "Invalid configuration: #{e.message}", :error
104
- rescue SiteDiffException => e
105
- SiteDiff.log e.message, :error
106
- else # no exception was raised
107
- # Thor::Error --> exit(1), guaranteed by exit_on_failure?
108
- # Failing diff --> exit(2), populated above
109
- exit(exit_code)
109
+ api = Api.new(options['directory'], config_file)
110
+ api_options =
111
+ clean_keys(
112
+ options,
113
+ :paths,
114
+ :paths_file,
115
+ :ignore_whitespace,
116
+ :export,
117
+ :before,
118
+ :after,
119
+ :cached,
120
+ :verbose,
121
+ :debug,
122
+ :report_format,
123
+ :before_report,
124
+ :after_report
125
+ )
126
+ api_options[:cli_mode] = true
127
+ api.diff(api_options)
110
128
  end
111
129
 
112
130
  option :port,
113
- :type => :numeric,
114
- :default => SiteDiff::Webserver::DEFAULT_PORT,
115
- :desc => 'The port to serve on'
116
- option 'dump-dir',
117
- :type => :string,
118
- :default => 'output',
119
- :desc => 'The directory to serve'
131
+ type: :numeric,
132
+ default: SiteDiff::Webserver::DEFAULT_PORT,
133
+ desc: 'The port to serve on'
120
134
  option :browse,
121
- :type => :boolean,
122
- :default => true,
123
- :desc => "Whether to open the served content in your browser"
124
- desc "serve [OPTIONS]", "Serve the sitediff output directory over HTTP"
125
- def serve(*config_files)
126
- config = chdir(config_files, :config => false)
127
-
128
- cache = Cache.new
129
- cache.read_tags << :before << :after
130
-
131
- SiteDiff::Webserver::ResultServer.new(
132
- options[:port],
133
- options['dump-dir'],
134
- :browse => options[:browse],
135
- :cache => cache,
136
- :config => config,
137
- ).wait
135
+ type: :boolean,
136
+ default: true,
137
+ desc: 'Whether to open the served content in your browser'
138
+ desc 'serve [OPTIONS] [CONFIG-FILE]',
139
+ 'Serve SiteDiff report directory over HTTP.'
140
+ ##
141
+ # Serves SiteDiff report for accessing in the browser.
142
+ def serve(config_file = nil)
143
+ api = Api.new(options['directory'], config_file)
144
+ api_options = clean_keys(options, :browse, :port)
145
+ api.serve(api_options)
138
146
  end
139
147
 
140
- option :output,
141
- :type => :string,
142
- :default => 'sitediff',
143
- :desc => 'Where to place the configuration',
144
- :aliases => ['-o']
145
148
  option :depth,
146
- :type => :numeric,
147
- :default => 3,
148
- :desc => 'How deeply to crawl the given site'
149
- option :rules,
150
- :type => :string,
151
- :enum => %w[yes no disabled],
152
- :default => 'disabled',
153
- :desc => 'Whether rules for the site should be auto-created'
154
- desc "init URL [URL]", "Create a sitediff configuration"
149
+ type: :numeric,
150
+ default: Config::DEFAULT_CONFIG['settings']['depth'],
151
+ desc: 'How deeply to crawl the given site'
152
+ option :crawl,
153
+ type: :boolean,
154
+ default: true,
155
+ desc: 'Run "sitediff crawl" to discover paths.'
156
+ option :preset,
157
+ type: :string,
158
+ enum: Config::Preset.all,
159
+ desc: 'Framework-specific presets to apply.'
160
+ option :concurrency,
161
+ type: :numeric,
162
+ default: Config::DEFAULT_CONFIG['settings']['concurrency'],
163
+ desc: 'Max number of concurrent connections made.'
164
+ option :interval,
165
+ type: :numeric,
166
+ default: Config::DEFAULT_CONFIG['settings']['interval'],
167
+ desc: 'Crawling delay - interval in milliseconds.'
168
+ option :include,
169
+ type: :string,
170
+ default: Config::DEFAULT_CONFIG['settings']['include'],
171
+ desc: 'Optional URL include regex for crawling.'
172
+ option :exclude,
173
+ type: :string,
174
+ default: Config::DEFAULT_CONFIG['settings']['exclude'],
175
+ desc: 'Optional URL exclude regex for crawling.'
176
+ option :curl_options,
177
+ type: :hash,
178
+ default: {},
179
+ desc: 'Options to be passed to curl'
180
+ desc 'init URL [URL]', 'Create a sitediff configuration.'
181
+ ##
182
+ # Initializes a sitediff (yaml) configuration file.
155
183
  def init(*urls)
156
- unless (1..2).include? urls.size
157
- SiteDiff.log "sitediff init requires one or two URLs", :error
158
- exit 2
184
+ unless (1..2).cover? urls.size
185
+ SiteDiff.log 'sitediff init requires one or two URLs', :error
186
+ exit(2)
159
187
  end
160
-
161
- chdir([], :search => false)
162
- creator = SiteDiff::Config::Creator.new(*urls)
163
- creator.create(
164
- :depth => options[:depth],
165
- :directory => options[:output],
166
- :rules => options[:rules] != 'no',
167
- :rules_disabled => (options[:rules] == 'disabled'),
168
- ) do |tag, info|
169
- SiteDiff.log "Visited #{info.uri}, cached"
170
- end
171
-
172
- SiteDiff.log "Created #{creator.config_file.expand_path}", :success
173
- SiteDiff.log "You can now run 'sitediff diff'", :success
188
+ api_options =
189
+ clean_keys(
190
+ options,
191
+ :depth,
192
+ :concurrency,
193
+ :interval,
194
+ :include,
195
+ :exclude,
196
+ :preset,
197
+ :crawl
198
+ )
199
+ .merge(
200
+ {
201
+ after_url: urls.pop,
202
+ before_url: urls.pop, # may be nil
203
+ directory: get_dir(options['directory']),
204
+ curl_opts: get_curl_opts(options)
205
+ }
206
+ )
207
+ Api.init(api_options)
174
208
  end
175
209
 
176
210
  option :url,
177
- :type => :string,
178
- :desc => 'A custom base URL to fetch from'
179
- desc "store [CONFIGFILES]",
180
- "Cache the current contents of a site for later comparison"
181
- def store(*config_files)
182
- config = chdir(config_files)
183
- config.validate(:need_before => false)
184
-
185
- cache = SiteDiff::Cache.new(:create => true)
186
- cache.write_tags << :before
211
+ type: :string,
212
+ desc: 'A custom base URL to fetch from'
213
+ desc 'store [CONFIG-FILE]',
214
+ 'Cache the current contents of a site for later comparison.'
215
+ ##
216
+ # Caches the current version of the site.
217
+ def store(config_file = nil)
218
+ api = Api.new(options['directory'], config_file)
219
+ api_options = clean_keys(options, :url, :debug)
220
+ api.store(api_options)
221
+ end
187
222
 
188
- base = options[:url] || config.after['url']
189
- fetcher = SiteDiff::Fetch.new(cache, config.paths, :before => base)
190
- fetcher.run do |path, res|
191
- SiteDiff.log "Visited #{path}, cached"
192
- end
223
+ desc 'crawl [CONFIG-FILE]',
224
+ 'Crawl the "before" site to discover paths.'
225
+ ##
226
+ # Crawls the "before" site to determine "paths".
227
+ #
228
+ def crawl(config_file = nil)
229
+ api = Api.new(options['directory'], config_file)
230
+ api.crawl
193
231
  end
194
232
 
195
- private
196
- def chdir(files, opts = {})
197
- opts = { :config => true, :search => true }.merge(opts)
233
+ no_commands do
234
+ # Generates CURL options.
235
+ #
236
+ # TODO: Possibly move to API class.
237
+ def get_curl_opts(options)
238
+ # We do want string keys here
239
+ bool_hash = { 'true' => true, 'false' => false }
240
+ curl_opts = UriWrapper::DEFAULT_CURL_OPTS
241
+ .clone
242
+ .merge(options['curl_options'] || {})
243
+ .merge(options['curl_opts'] || {})
244
+ curl_opts.each { |k, v| curl_opts[k] = bool_hash.fetch(v, v) }
245
+ curl_opts
246
+ end
198
247
 
199
- dir = options['directory']
200
- Dir.chdir(dir) if dir
248
+ ##
249
+ # Ensures that the given directory exists.
250
+ def get_dir(directory)
251
+ # Create the dir. Must go before cache initialization!
252
+ @dir = Pathname.new(directory || '.')
253
+ @dir.mkpath unless @dir.directory?
254
+ @dir.to_s
255
+ end
201
256
 
202
- return unless opts[:search]
203
- begin
204
- SiteDiff::Config.new(files, :search => !dir)
205
- rescue SiteDiff::Config::ConfigNotFound => e
206
- raise if opts[:config]
207
- # If no config required, allow it to pass
257
+ ##
258
+ # Clean keys - return a subset of a hash with keys as symbols.
259
+ def clean_keys(hash, *keys)
260
+ new_hash = hash.transform_keys { |k| k.tr('-', '_').to_sym }
261
+ new_hash.slice(*keys)
208
262
  end
209
263
  end
210
264
  end