sitediff 0.0.2 → 1.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,19 +1,71 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'sitediff/config/preset'
1
4
  require 'sitediff/exception'
2
5
  require 'sitediff/sanitize'
3
6
  require 'pathname'
4
7
  require 'yaml'
5
8
 
6
9
  class SiteDiff
10
+ # SiteDiff Configuration.
7
11
  class Config
12
+ # Default config file.
8
13
  DEFAULT_FILENAME = 'sitediff.yaml'
9
14
 
10
- # keys allowed in configuration files
11
- CONF_KEYS = Sanitizer::TOOLS.values.flatten(1) +
12
- %w[paths before after before_url after_url includes]
15
+ # Default paths file.
16
+ DEFAULT_PATHS_FILENAME = 'paths.txt'
17
+
18
+ # Default SiteDiff config.
19
+ DEFAULT_CONFIG = {
20
+ 'settings' => {
21
+ 'depth' => 3,
22
+ 'interval' => 0,
23
+ 'include' => '',
24
+ 'exclude' => '',
25
+ 'concurrency' => 3,
26
+ 'preset' => nil
27
+ },
28
+ 'before' => {},
29
+ 'after' => {},
30
+ 'paths' => []
31
+ }.freeze
32
+
33
# Keys allowed at the top level of a config file: every sanitizer tool key
# plus the structural keys listed below.
#
# Frozen so the whitelist cannot be modified at runtime, matching
# DEFAULT_CONFIG and ALLOWED_SETTINGS_KEYS.
#
# TODO: Deprecate repeated params before_url and after_url.
# TODO: Create a method self.supports
# TODO: Deprecate in favor of self.supports key, subkey, subkey...
ALLOWED_CONFIG_KEYS = (
  Sanitizer::TOOLS.values.flatten(1) + %w[
    includes
    settings
    before
    after
    before_url
    after_url
    ignore_whitespace
    export
    output
    report
  ]
).freeze
49
+
50
+ ##
51
+ # Keys allowed in the "settings" key.
52
+ # TODO: Create a method self.supports
53
+ # TODO: Deprecate in favor of self.supports key, subkey, subkey...
54
+ ALLOWED_SETTINGS_KEYS = %w[
55
+ preset
56
+ depth
57
+ include
58
+ exclude
59
+ concurrency
60
+ interval
61
+ curl_opts
62
+ ].freeze
13
63
 
14
64
  class InvalidConfig < SiteDiffException; end
15
65
  class ConfigNotFound < SiteDiffException; end
16
66
 
67
+ attr_reader :directory
68
+
17
69
  # Takes a Hash and normalizes it to the following form by merging globals
18
70
  # into before and after. A normalized config Hash looks like this:
19
71
  #
@@ -23,6 +75,12 @@ class SiteDiff
23
75
  # before:
24
76
  # url: http://before
25
77
  # selector: body
78
+ # ## Note: use either `selector` or `regions`, but not both
79
+ # regions:
80
+ # - name: title
81
+ # selector: .field-name-title h2
82
+ # - name: body
83
+ # selector: .field-name-field-news-description .field-item
26
84
  # dom_transform:
27
85
  # - type: remove
28
86
  # selector: script
@@ -31,23 +89,32 @@ class SiteDiff
31
89
  # url: http://after
32
90
  # selector: body
33
91
  #
92
+ # ## Note: use `output` only with `regions`
93
+ # output:
94
+ # - title
95
+ # - author
96
+ # - source
97
+ # - body
98
+ #
34
99
  def self.normalize(conf)
35
100
  tools = Sanitizer::TOOLS
36
101
 
37
- # merge globals
102
+ # Merge globals
38
103
  %w[before after].each do |pos|
39
104
  conf[pos] ||= {}
40
105
  tools[:array].each do |key|
41
106
  conf[pos][key] ||= []
42
107
  conf[pos][key] += conf[key] if conf[key]
43
108
  end
44
- tools[:scalar].each {|key| conf[pos][key] ||= conf[key]}
109
+ tools[:scalar].each { |key| conf[pos][key] ||= conf[key] }
45
110
  conf[pos]['url'] ||= conf[pos + '_url']
111
+ conf[pos]['curl_opts'] = conf['curl_opts']
46
112
  end
47
- # normalize paths
48
- conf['paths'] = Config::normalize_paths(conf['paths'])
49
113
 
50
- conf.select {|k,v| %w[before after paths].include? k}
114
+ # Normalize paths.
115
+ conf['paths'] = Config.normalize_paths(conf['paths'])
116
+
117
+ conf.select { |k, _v| ALLOWED_CONFIG_KEYS.include? k }
51
118
  end
52
119
 
53
120
  # Merges two normalized Hashes according to the following rules:
@@ -64,110 +131,424 @@ class SiteDiff
64
131
  # (h2) before: {selector: bar, sanitization: [pattern: bar]}
65
132
  # (h3) before: {selector: foo, sanitization: [pattern: foo, pattern: bar]}
66
133
  def self.merge(first, second)
67
- result = { 'paths' => {}, 'before' => {}, 'after' => {} }
68
- result['paths'] = (first['paths'] || []) + (second['paths'] || []) # rule 1
134
+ result = {
135
+ 'before' => {},
136
+ 'after' => {},
137
+ 'output' => [],
138
+ 'settings' => {}
139
+ }
140
+
141
+ # Merge sanitization rules.
142
+ Sanitizer::TOOLS.values.flatten(1).each do |key|
143
+ result[key] = second[key] || first[key]
144
+ result.delete(key) unless result[key]
145
+ end
146
+
147
+ # Rule 1.
69
148
  %w[before after].each do |pos|
149
+ first[pos] ||= {}
150
+ second[pos] ||= {}
151
+
152
+ # If only the second hash has the value.
70
153
  unless first[pos]
71
154
  result[pos] = second[pos] || {}
72
155
  next
73
156
  end
157
+
74
158
  result[pos] = first[pos].merge!(second[pos]) do |key, a, b|
75
- if Sanitizer::TOOLS[:array].include? key # rule 2a
76
- result[pos][key] = (a || []) + (b|| [])
77
- else
78
- result[pos][key] = a || b # rule 2b
79
- end
159
+ # Rule 2a.
160
+ result[pos][key] = if Sanitizer::TOOLS[:array].include? key
161
+ (a || []) + (b || [])
162
+ elsif key == 'settings'
163
+ b
164
+ else
165
+ a || b # Rule 2b.
166
+ end
80
167
  end
81
168
  end
169
+
170
+ # Merge output array.
171
+ result['output'] += (first['output'] || []) + (second['output'] || [])
172
+
173
+ # Merge url_report keys.
174
+ %w[before_url_report after_url_report].each do |pos|
175
+ result[pos] = first[pos] || second[pos]
176
+ end
177
+
178
+ # Merge settings.
179
+ result['settings'] = merge_deep(
180
+ first['settings'] || {},
181
+ second['settings'] || {}
182
+ )
183
+
184
+ # Merge report labels.
185
+ result['report'] = merge_deep(
186
+ first['report'] || {},
187
+ second['report'] || {}
188
+ )
189
+
82
190
  result
83
191
  end
84
192
 
85
- # Search for a config file. If found, change to the containing directory,
86
- # and return an array of config files found.
87
- def self.search
88
- subdirs = %w[. sitediff]
89
- root_indicators = %w[.git .svn]
90
-
91
- Pathname.pwd.ascend do |dir|
92
- subdirs.each do |subdir|
93
- d = dir + subdir + DEFAULT_FILENAME
94
- if d.exist?
95
- Dir.chdir(dir.+(subdir).to_s)
96
- return [DEFAULT_FILENAME]
97
- end
193
##
# Merges two Hashes recursively and returns a new Hash.
#
# For keys present in both Hashes:
# - Hash values are merged recursively,
# - Array values are concatenated (first, then second),
# - a nil second value keeps the first Hash/Array,
# - in every other case (including mismatched types) the second value wins.
#
# @param [Hash] first
#   Base data.
# @param [Hash] second
#   Data merged on top of +first+.
#
# @return [Hash]
#   The merged data.
def self.merge_deep(first, second)
  first.merge(second) do |_key, val1, val2|
    # Match on the value itself: `case val1.class` would compare
    # Class objects with Hash/Array via === and never match.
    case val1
    when Hash
      if val2.nil? || val2.is_a?(Hash)
        merge_deep(val1, val2 || {})
      else
        val2
      end
    when Array
      if val2.nil? || val2.is_a?(Array)
        val1 + (val2 || [])
      else
        val2
      end
    else
      val2
    end
  end
end
102
207
 
103
- return []
208
##
# Returns the loaded configuration with default values stripped.
#
# The stored config is deep-copied through a Marshal round trip first, so
# the clean-up does not touch the data held by this object.
#
# @return [Hash]
#   Config data.
def all
  snapshot = Marshal.load(Marshal.dump(@config))
  self.class.remove_defaults(snapshot)
end
105
217
 
106
- def initialize(files, opts = {})
107
- @config = {'paths' => [], 'before' => {}, 'after' => {} }
218
##
# Removes default parameters from a config hash.
#
# NOTE: this works in place - +data+ itself is modified and returned; no
# copy is made. Callers that need the original untouched must pass a copy.
#
# Settings equal to the DEFAULT_CONFIG value, or with a nil/false value, are
# dropped. Curl options equal to UriWrapper::DEFAULT_CURL_OPTS are dropped,
# and "curl_opts" is removed entirely when nothing is left in it. Data
# without a "settings" Hash is returned unchanged.
#
# @param [Hash] data
#   Config data.
#
# @return [Hash]
#   The same Hash, without default values.
def self.remove_defaults(data)
  result = data

  settings = result['settings']
  return result unless settings.is_a?(Hash)

  # Exclude default settings.
  settings.delete_if do |key, value|
    value == DEFAULT_CONFIG['settings'][key] || !value
  end

  # Exclude default curl opts. A nil/false "curl_opts" was already removed
  # by the step above.
  curl_opts = settings['curl_opts']
  if curl_opts.is_a?(Hash)
    curl_opts.delete_if do |key, value|
      value == UriWrapper::DEFAULT_CURL_OPTS[key.to_sym]
    end

    # Delete curl opts if empty.
    settings.delete('curl_opts') if curl_opts.empty?
  end

  result
end
121
245
 
122
- def before
123
- @config['before']
246
# Creates a SiteDiff Config object.
#
# @param [String, nil] file
#   Path to a config file. When nil, DEFAULT_FILENAME inside +directory+
#   is used.
# @param [String] directory
#   SiteDiff working directory.
#
# @raise [InvalidConfig]
#   If the config file does not exist.
def initialize(file, directory)
  # Fallback to default config filename, if none is specified.
  file = File.join(directory, DEFAULT_FILENAME) if file.nil?

  raise InvalidConfig, "Missing config file #{File.expand_path(file)}." unless File.exist?(file)

  loaded = Config.load_conf(file)
  @config = Config.merge(DEFAULT_CONFIG, loaded)
  @file = file
  @directory = directory

  # Validate configurations.
  validate
end
261
+
262
# Returns the "before" site configuration.
#
# @param [Boolean] apply_preset
#   Passed on to #section as its with_preset flag.
def before(apply_preset = false)
  section(:before, apply_preset)
end
125
- def after
126
- @config['after']
266
+
267
# Returns the URL of the "before" site, or nil when #before returns nothing.
def before_url
  site = before
  site && site['url']
end
272
+
273
# Returns the "after" site configuration.
#
# @param [Boolean] apply_preset
#   Passed on to #section as its with_preset flag.
def after(apply_preset = false)
  section(:after, apply_preset)
end
128
277
 
278
# Returns the URL of the "after" site, or nil when #after returns nothing.
def after_url
  site = after
  site && site['url']
end
283
+
284
# Returns the value stored under the "paths" key of the config.
def paths
  @config['paths']
end
288
+
289
# Stores the given paths after passing them through Config.normalize_paths.
#
# @param [Array] paths
#   Paths to store.
#
# @raise [RuntimeError]
#   If +paths+ is not an Array.
def paths=(paths)
  raise 'Paths must be an Array' unless paths.is_a?(Array)

  normalized = Config.normalize_paths(paths)
  @config['paths'] = normalized
end
295
+
296
# Returns the value stored under the "ignore_whitespace" config key.
def ignore_whitespace
  @config['ignore_whitespace']
end

# Stores a value under the "ignore_whitespace" config key.
def ignore_whitespace=(ignore_whitespace)
  @config['ignore_whitespace'] = ignore_whitespace
end
305
+
306
# Returns the value stored under the "export" config key.
def export
  @config['export']
end

# Stores a value under the "export" config key.
def export=(export)
  @config['export'] = export
end
315
+
316
# Returns the value stored under the "output" config key.
def output
  @config['output']
end

# Stores a value under the "output" config key.
#
# @raise [RuntimeError]
#   If +output+ is not an Array.
def output=(output)
  raise 'Output must be an Array' unless output.is_a?(Array)

  @config['output'] = output
end
327
+
328
# Returns the value stored under the "report" config key
# (report display settings).
def report
  @config['report']
end
332
+
333
# Records the crawl time for 'before' under report.before_time.
def before_time=(time)
  @config['report']['before_time'] = time
end

# Records the crawl time for 'after' under report.after_time.
def after_time=(time)
  @config['report']['after_time'] = time
end
342
+
343
##
# Writes an array of paths to a file, one entry per line.
#
# @param [Array] paths
#   A non-empty array of paths.
# @param [String] file
#   Optional path to a file. Defaults to DEFAULT_PATHS_FILENAME inside
#   the config directory.
#
# @raise [SiteDiffException]
#   If +paths+ is not an Array or is empty.
def paths_file_write(paths, file = nil)
  writable = paths.is_a?(Array) && paths.length.positive?
  raise SiteDiffException, 'Write failed. Invalid paths.' unless writable

  file ||= File.join(@directory, DEFAULT_PATHS_FILENAME)
  File.open(file, 'w+') { |handle| handle.puts(paths) }
end
358
+
359
##
# Reads a collection of paths from a file and stores them through #paths=.
#
# @param [String] file
#   A file containing one path per line. Defaults to
#   DEFAULT_PATHS_FILENAME inside the config directory.
#
# @return [Integer]
#   Number of paths read.
#
# @raise [Config::InvalidConfig]
#   If the file does not exist.
def paths_file_read(file = nil)
  file ||= File.join(@directory, DEFAULT_PATHS_FILENAME)
  raise Config::InvalidConfig, "File not found: #{file}" unless File.exist?(file)

  lines = File.readlines(file)
  self.paths = lines

  # Return the number of paths.
  paths.length
end
379
+
380
##
# Get roots.
#
# Builds (and stores in @roots) a Hash mapping section names to base URLs.
# "after" is always present; "before" is added only when #before returns
# a value.
#
# Example: { 'after' => 'http://after', 'before' => 'http://before' }
#
# @return [Hash]
#   Section name => URL.
def roots
  @roots = { 'after' => after_url }
  @roots.store('before', before_url) if before
  @roots
end
390
+
391
##
# Gets a setting.
#
# @param [String, Symbol] key
#   Setting name; Symbols are converted to Strings before the lookup.
#
# @return [*]
#   The stored value, or nil when the setting is not defined.
def setting(key)
  name = key.is_a?(Symbol) ? key.to_s : key
  store = @config['settings']
  store.key?(name) ? store[name] : nil
end
403
+
404
##
# Gets all settings.
#
# The Hash stored in the config is returned as is (not a copy).
#
# TODO: Make sure the settings are not writable.
#
# @return [Hash]
#   All settings.
def settings
  @config['settings']
end
135
414
 
136
415
  # Checks if the configuration is usable for diff-ing.
416
+ # TODO: Do we actually need the opts argument?
137
417
  def validate(opts = {})
138
- opts = { :need_before => true }.merge(opts)
418
+ opts = { need_before: true }.merge(opts)
419
+
420
+ if opts[:need_before] && !before['url']
421
+ raise InvalidConfig, "Undefined 'before' base URL."
422
+ end
139
423
 
140
- raise InvalidConfig, "Undefined 'before' base URL." if \
141
- opts[:need_before] && !before['url']
142
424
  raise InvalidConfig, "Undefined 'after' base URL." unless after['url']
143
- raise InvalidConfig, "Undefined 'paths'." unless (paths and !paths.empty?)
425
+
426
+ # Validate interval and concurrency.
427
+ interval = setting(:interval)
428
+ concurrency = setting(:concurrency)
429
+ if interval.to_i != 0 && concurrency != 1
430
+ raise InvalidConfig, 'Concurrency must be 1 when an interval is set.'
431
+ end
432
+
433
+ # Validate preset.
434
+ Preset.exist? setting(:preset), true if setting(:preset)
435
+ end
436
+
437
##
# Returns a copy of the object in which Symbol keys are replaced by Strings.
#
# Values are processed recursively. Anything that does not respond to
# each_key is returned untouched.
# TODO: Make this method available globally, if required.
def self.stringify_keys(object)
  if object.respond_to?('each_key')
    converted = {}
    object.each_key do |key|
      name = key.is_a?(Symbol) ? key.to_s : key
      converted[name] = stringify_keys(object[key])
    end
    converted
  else
    object
  end
end
454
+
455
##
# Creates a RegExp from a string.
#
# @param [String] string_param
#   Regular expression source.
#
# @return [Regexp, nil]
#   The compiled expression; nil for a nil or empty string, or when the
#   pattern cannot be compiled (the failure is logged through SiteDiff.log).
def self.create_regexp(string_param)
  return nil if string_param.nil? || string_param == ''

  Regexp.new(string_param)
rescue RegexpError, TypeError => e
  # Regexp.new signals a bad pattern with RegexpError (TypeError for
  # unsupported argument types), not with a SiteDiffException.
  SiteDiff.log "Invalid RegExp: #{string_param}", :error
  SiteDiff.log e.message, :error
  # TODO: Use SiteDiff.log type :debug
  # SiteDiff.log e.backtrace, :error if options[:verbose]
  nil
end
469
+
470
##
# Return merged CURL options.
#
# UriWrapper::DEFAULT_CURL_OPTS overlaid with the "curl_opts" setting.
# The string values 'true' and 'false' are replaced by the corresponding
# booleans; all other values are kept.
def curl_opts
  # We do want string keys here
  booleans = { 'true' => true, 'false' => false }
  overrides = settings['curl_opts'] || {}
  merged = UriWrapper::DEFAULT_CURL_OPTS.clone.merge(overrides)
  merged.each_key do |name|
    current = merged[name]
    merged[name] = booleans.fetch(current, current)
  end
  merged
end
145
481
 
146
482
  private
147
483
 
484
##
# Returns one of the "before" or "after" sections.
#
# Without a preset the stored section Hash itself is returned. With a
# preset, a shallow copy is returned in which the preset's array-type
# rules are appended to the section's own rules; the stored config is
# left untouched so repeated calls do not pile up duplicate preset rules.
#
# @param [String|Symbol] name
#   Section name. Example: before, after.
# @param [Boolean] with_preset
#   Whether to merge with preset config (if any).
#
# @return [Hash|Nil]
#   Section data or Nil.
#
# @raise [SiteDiffException]
#   If +name+ is neither "before" nor "after".
def section(name, with_preset = false)
  name = name.to_s if name.is_a? Symbol

  # Validate section.
  unless %w[before after].include? name
    raise SiteDiffException, '"name" must be one of "before" or "after".'
  end

  # Return nil if section is not defined.
  result = @config[name]
  return nil unless result

  # Merge preset rules, if required.
  preset = setting(:preset)
  return result unless with_preset && !preset.nil?

  preset_config = Preset.read preset

  # Work on a copy: assigning the merged arrays to the stored Hash would
  # grow the config on every call.
  result = result.dup

  # Merge plugins with array values.
  # TODO: This won't be required after plugin declarations are improved.
  # See https://rm.ewdev.ca/issues/18301
  Sanitizer::TOOLS[:array].each do |key|
    next unless preset_config[key]

    result[key] = (result[key] || []) + preset_config[key]
  end

  result
end
524
+
148
525
  def self.normalize_paths(paths)
149
526
  paths ||= []
150
- return paths.map { |p| (p[0] == '/' ? p : "/#{p}").chomp }
527
+ paths.map { |p| (p[0] == '/' ? p : "/#{p}").chomp }
151
528
  end
152
529
 
153
530
  # reads a YAML file and raises an InvalidConfig if the file is not valid.
154
531
  def self.load_raw_yaml(file)
155
- SiteDiff::log "Reading config file: #{Pathname.new(file).expand_path}"
532
+ # TODO: Only show this in verbose mode.
533
+ SiteDiff.log "Reading config file: #{Pathname.new(file).expand_path}"
156
534
  conf = YAML.load_file(file) || {}
535
+
157
536
  unless conf.is_a? Hash
158
537
  raise InvalidConfig, "Invalid configuration file: '#{file}'"
159
538
  end
160
- conf.each do |k,v|
161
- unless CONF_KEYS.include? k
539
+
540
+ conf.each_key do |k, _v|
541
+ unless ALLOWED_CONFIG_KEYS.include? k
162
542
  raise InvalidConfig, "Unknown configuration key (#{file}): '#{k}'"
163
543
  end
164
544
  end
545
+
165
546
  conf
166
547
  end
167
548
 
168
549
  # loads a single YAML configuration file, merges all its 'included' files
169
550
  # and returns a normalized Hash.
170
- def self.load_conf(file, visited=[])
551
+ def self.load_conf(file, visited = [])
171
552
  # don't get fooled by a/../a/ or symlinks
172
553
  file = File.realpath(file)
173
554
  if visited.include? file
@@ -179,11 +560,11 @@ class SiteDiff
179
560
 
180
561
  # normalize and merge includes
181
562
  includes = conf['includes'] || []
182
- conf = Config::normalize(conf)
563
+ conf = Config.normalize(conf)
183
564
  includes.each do |dep|
184
565
  # include paths are relative to the including file.
185
566
  dep = File.join(File.dirname(file), dep)
186
- conf = Config::merge(conf, load_conf(dep, visited))
567
+ conf = Config.merge(conf, load_conf(dep, visited))
187
568
  end
188
569
  conf
189
570
  end