sitediff 0.0.2 → 1.1.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,19 +1,71 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'sitediff/config/preset'
1
4
  require 'sitediff/exception'
2
5
  require 'sitediff/sanitize'
3
6
  require 'pathname'
4
7
  require 'yaml'
5
8
 
6
9
  class SiteDiff
10
+ # SiteDiff Configuration.
7
11
  class Config
12
+ # Default config file.
8
13
  DEFAULT_FILENAME = 'sitediff.yaml'
9
14
 
10
- # keys allowed in configuration files
11
- CONF_KEYS = Sanitizer::TOOLS.values.flatten(1) +
12
- %w[paths before after before_url after_url includes]
15
+ # Default paths file.
16
+ DEFAULT_PATHS_FILENAME = 'paths.txt'
17
+
18
+ # Default SiteDiff config.
19
+ DEFAULT_CONFIG = {
20
+ 'settings' => {
21
+ 'depth' => 3,
22
+ 'interval' => 0,
23
+ 'include' => '',
24
+ 'exclude' => '',
25
+ 'concurrency' => 3,
26
+ 'preset' => nil
27
+ },
28
+ 'before' => {},
29
+ 'after' => {},
30
+ 'paths' => []
31
+ }.freeze
32
+
33
# Keys allowed at the top level of a config file: every sanitizer tool name
# plus the entries listed below. The list is frozen so it cannot be modified
# at runtime.
# TODO: Deprecate repeated params before_url and after_url.
# TODO: Create a method self.supports
# TODO: Deprecate in favor of self.supports key, subkey, subkey...
ALLOWED_CONFIG_KEYS = (Sanitizer::TOOLS.values.flatten(1) + %w[
  includes
  settings
  before
  after
  before_url
  after_url
  ignore_whitespace
  export
  output
  report
]).freeze
49
+
50
+ ##
51
+ # Keys allowed in the "settings" key.
52
+ # TODO: Create a method self.supports
53
+ # TODO: Deprecate in favor of self.supports key, subkey, subkey...
54
+ ALLOWED_SETTINGS_KEYS = %w[
55
+ preset
56
+ depth
57
+ include
58
+ exclude
59
+ concurrency
60
+ interval
61
+ curl_opts
62
+ ].freeze
13
63
 
14
64
  class InvalidConfig < SiteDiffException; end
15
65
  class ConfigNotFound < SiteDiffException; end
16
66
 
67
+ attr_reader :directory
68
+
17
69
  # Takes a Hash and normalizes it to the following form by merging globals
18
70
  # into before and after. A normalized config Hash looks like this:
19
71
  #
@@ -23,6 +75,12 @@ class SiteDiff
23
75
  # before:
24
76
  # url: http://before
25
77
  # selector: body
78
+ # ## Note: use either `selector` or `regions`, but not both
79
+ # regions:
80
+ # - name: title
81
+ # selector: .field-name-title h2
82
+ # - name: body
83
+ # selector: .field-name-field-news-description .field-item
26
84
  # dom_transform:
27
85
  # - type: remove
28
86
  # selector: script
@@ -31,23 +89,32 @@ class SiteDiff
31
89
  # url: http://after
32
90
  # selector: body
33
91
  #
92
+ # ## Note: use `output` only with `regions`
93
+ # output:
94
+ # - title
95
+ # - author
96
+ # - source
97
+ # - body
98
+ #
34
99
  def self.normalize(conf)
35
100
  tools = Sanitizer::TOOLS
36
101
 
37
- # merge globals
102
+ # Merge globals
38
103
  %w[before after].each do |pos|
39
104
  conf[pos] ||= {}
40
105
  tools[:array].each do |key|
41
106
  conf[pos][key] ||= []
42
107
  conf[pos][key] += conf[key] if conf[key]
43
108
  end
44
- tools[:scalar].each {|key| conf[pos][key] ||= conf[key]}
109
+ tools[:scalar].each { |key| conf[pos][key] ||= conf[key] }
45
110
  conf[pos]['url'] ||= conf[pos + '_url']
111
+ conf[pos]['curl_opts'] = conf['curl_opts']
46
112
  end
47
- # normalize paths
48
- conf['paths'] = Config::normalize_paths(conf['paths'])
49
113
 
50
- conf.select {|k,v| %w[before after paths].include? k}
114
+ # Normalize paths.
115
+ conf['paths'] = Config.normalize_paths(conf['paths'])
116
+
117
+ conf.select { |k, _v| ALLOWED_CONFIG_KEYS.include? k }
51
118
  end
52
119
 
53
120
  # Merges two normalized Hashes according to the following rules:
@@ -64,110 +131,424 @@ class SiteDiff
64
131
  # (h2) before: {selector: bar, sanitization: [pattern: bar]}
65
132
  # (h3) before: {selector: foo, sanitization: [pattern: foo, pattern: bar]}
66
133
  def self.merge(first, second)
67
- result = { 'paths' => {}, 'before' => {}, 'after' => {} }
68
- result['paths'] = (first['paths'] || []) + (second['paths'] || []) # rule 1
134
+ result = {
135
+ 'before' => {},
136
+ 'after' => {},
137
+ 'output' => [],
138
+ 'settings' => {}
139
+ }
140
+
141
+ # Merge sanitization rules.
142
+ Sanitizer::TOOLS.values.flatten(1).each do |key|
143
+ result[key] = second[key] || first[key]
144
+ result.delete(key) unless result[key]
145
+ end
146
+
147
+ # Rule 1.
69
148
  %w[before after].each do |pos|
149
+ first[pos] ||= {}
150
+ second[pos] ||= {}
151
+
152
+ # If only the second hash has the value.
70
153
  unless first[pos]
71
154
  result[pos] = second[pos] || {}
72
155
  next
73
156
  end
157
+
74
158
  result[pos] = first[pos].merge!(second[pos]) do |key, a, b|
75
- if Sanitizer::TOOLS[:array].include? key # rule 2a
76
- result[pos][key] = (a || []) + (b|| [])
77
- else
78
- result[pos][key] = a || b # rule 2b
79
- end
159
+ # Rule 2a.
160
+ result[pos][key] = if Sanitizer::TOOLS[:array].include? key
161
+ (a || []) + (b || [])
162
+ elsif key == 'settings'
163
+ b
164
+ else
165
+ a || b # Rule 2b.
166
+ end
80
167
  end
81
168
  end
169
+
170
+ # Merge output array.
171
+ result['output'] += (first['output'] || []) + (second['output'] || [])
172
+
173
+ # Merge url_report keys.
174
+ %w[before_url_report after_url_report].each do |pos|
175
+ result[pos] = first[pos] || second[pos]
176
+ end
177
+
178
+ # Merge settings.
179
+ result['settings'] = merge_deep(
180
+ first['settings'] || {},
181
+ second['settings'] || {}
182
+ )
183
+
184
+ # Merge report labels.
185
+ result['report'] = merge_deep(
186
+ first['report'] || {},
187
+ second['report'] || {}
188
+ )
189
+
82
190
  result
83
191
  end
84
192
 
85
- # Search for a config file. If found, change to the containing directory,
86
- # and return an array of config files found.
87
- def self.search
88
- subdirs = %w[. sitediff]
89
- root_indicators = %w[.git .svn]
90
-
91
- Pathname.pwd.ascend do |dir|
92
- subdirs.each do |subdir|
93
- d = dir + subdir + DEFAULT_FILENAME
94
- if d.exist?
95
- Dir.chdir(dir.+(subdir).to_s)
96
- return [DEFAULT_FILENAME]
97
- end
193
##
# Merges two Hashes deeply without modifying either of them.
#
# For a key present on both sides:
# - Hash values are merged recursively;
# - Array values are concatenated (first, then second);
# - any other value is replaced by the value from the second Hash.
# A nil on the second side keeps the Hash/Array from the first side. A value
# of a different type on the second side replaces the first one.
#
# @param [Hash] first
#   Base values.
# @param [Hash] second
#   Overriding values.
#
# @return [Hash]
#   A new, merged Hash.
def self.merge_deep(first, second)
  first.merge(second) do |_key, val1, val2|
    # Match on the value itself: `case val1.class` compares a Class object
    # against Hash/Array and therefore never matches.
    case val1
    when Hash
      val2.nil? || val2.is_a?(Hash) ? merge_deep(val1, val2 || {}) : val2
    when Array
      val2.nil? || val2.is_a?(Array) ? val1 + (val2 || []) : val2
    else
      val2
    end
  end
end
102
207
 
103
- return []
208
##
# Gets all loaded configuration except defaults.
#
# Works on a deep copy (a Marshal round trip) of the loaded config, so the
# data held by this object is left untouched.
#
# @return [Hash]
#   Config data.
def all
  snapshot = Marshal.load(Marshal.dump(@config))
  self.class.remove_defaults(snapshot)
end
105
217
 
106
- def initialize(files, opts = {})
107
- @config = {'paths' => [], 'before' => {}, 'after' => {} }
218
##
# Removes default parameters from a config hash.
#
# The given hash is modified IN PLACE and returned; no copy is made here.
# Pass a copy if the original must stay intact.
#
# From data['settings'] it removes every entry whose value is nil/false or
# equal to the matching entry of DEFAULT_CONFIG['settings']. From
# data['settings']['curl_opts'] it removes every entry equal to the matching
# UriWrapper::DEFAULT_CURL_OPTS entry (looked up by symbolized key), and it
# drops 'curl_opts' altogether when nothing is left.
#
# I know this is weird, but it'll be fixed. The config management needs to
# be streamlined further.
#
# @param [Hash] data
#   Config data.
#
# @return [Hash]
#   The same hash, without default values.
def self.remove_defaults(data)
  settings = data['settings']

  # Nothing to strip when there is no settings section.
  return data unless settings

  # Exclude default settings.
  settings.delete_if do |key, value|
    value == DEFAULT_CONFIG['settings'][key] || !value
  end

  # Exclude default curl opts, then drop the section if it ends up empty.
  curl_opts = settings['curl_opts']
  if curl_opts
    curl_opts.delete_if do |key, value|
      value == UriWrapper::DEFAULT_CURL_OPTS[key.to_sym]
    end
    settings.delete('curl_opts') if curl_opts.empty?
  end

  data
end
121
245
 
122
- def before
123
- @config['before']
246
# Creates a SiteDiff Config object.
#
# @param [String, nil] file
#   Path to a config file. When nil, DEFAULT_FILENAME inside +directory+ is
#   used.
# @param [String] directory
#   SiteDiff working directory; exposed through the +directory+ reader.
#
# @raise [InvalidConfig]
#   If the config file does not exist, or if validation fails.
def initialize(file, directory)
  # Fallback to default config filename, if none is specified.
  file = File.join(directory, DEFAULT_FILENAME) if file.nil?
  unless File.exist?(file)
    path = File.expand_path(file)
    raise InvalidConfig, "Missing config file #{path}."
  end

  # DEFAULT_CONFIG is only frozen at the top level. Merge from a deep copy so
  # that merging can never write into the nested hashes of the shared
  # constant and leak one config into the next Config instance.
  defaults = Marshal.load(Marshal.dump(DEFAULT_CONFIG))
  @config = Config.merge(defaults, Config.load_conf(file))
  @file = file
  @directory = directory

  # Validate configurations.
  validate
end
261
+
262
# Get "before" site configuration.
#
# @param [Boolean] apply_preset
#   Passed on to +section+ as its preset flag.
def before(apply_preset = false)
  section(:before, apply_preset)
end
125
- def after
126
- @config['after']
266
+
267
# Get "before" site URL.
#
# @return [Object, nil]
#   The 'url' entry of the "before" section, or nil when there is no section.
def before_url
  config = before
  return unless config

  config['url']
end
272
+
273
# Get "after" site configuration.
#
# @param [Boolean] apply_preset
#   Passed on to +section+ as its preset flag.
def after(apply_preset = false)
  section(:after, apply_preset)
end
128
277
 
278
# Get "after" site URL.
#
# @return [Object, nil]
#   The 'url' entry of the "after" section, or nil when there is no section.
def after_url
  config = after
  return unless config

  config['url']
end
283
+
284
# Get paths.
#
# @return [Object]
#   Whatever is stored under the 'paths' key of the config (nil if unset).
def paths
  @config['paths']
end
288
+
289
# Set paths.
#
# The paths are passed through Config.normalize_paths before being stored.
#
# @param [Array] paths
#   Paths to store.
#
# @raise [RuntimeError]
#   If +paths+ is not an Array.
def paths=(paths)
  raise 'Paths must be an Array' unless paths.is_a?(Array)

  @config['paths'] = Config.normalize_paths(paths)
end
295
+
296
# Get ignore_whitespace option.
#
# @return [Object]
#   The value stored under 'ignore_whitespace' (nil if unset).
def ignore_whitespace
  @config['ignore_whitespace']
end
300
+
301
# Set ignore_whitespace option.
#
# @param [Object] ignore_whitespace
#   Value to store under 'ignore_whitespace'; no validation is done.
def ignore_whitespace=(ignore_whitespace)
  @config['ignore_whitespace'] = ignore_whitespace
end
305
+
306
# Get export option.
#
# @return [Object]
#   The value stored under 'export' (nil if unset).
def export
  @config['export']
end
310
+
311
# Set export option.
#
# @param [Object] export
#   Value to store under 'export'; no validation is done.
def export=(export)
  @config['export'] = export
end
315
+
316
# Get output option.
#
# @return [Object]
#   The value stored under 'output' (nil if unset).
def output
  @config['output']
end
320
+
321
# Set output option.
#
# @param [Array] output
#   Value to store under 'output'.
#
# @raise [RuntimeError]
#   If +output+ is not an Array.
def output=(output)
  raise 'Output must be an Array' unless output.is_a?(Array)

  @config['output'] = output
end
327
+
328
# Return report display settings.
#
# @return [Object]
#   The value stored under 'report' (nil if unset).
def report
  @config['report']
end
332
+
333
# Set crawl time for 'before'.
#
# Stored as 'before_time' inside the 'report' section, which must already
# exist.
def before_time=(time)
  @config['report']['before_time'] = time
end
337
+
338
# Set crawl time for 'after'.
#
# Stored as 'after_time' inside the 'report' section, which must already
# exist.
def after_time=(time)
  @config['report']['after_time'] = time
end
342
+
343
##
# Writes an array of paths to a file, one entry per line.
#
# @param [Array] paths
#   A non-empty array of paths.
# @param [String] file
#   Optional path to a file. Defaults to DEFAULT_PATHS_FILENAME inside the
#   config directory.
#
# @raise [SiteDiffException]
#   If +paths+ is not an Array or is empty.
def paths_file_write(paths, file = nil)
  writable = paths.is_a?(Array) && paths.length.positive?
  raise SiteDiffException, 'Write failed. Invalid paths.' unless writable

  target = file || File.join(@directory, DEFAULT_PATHS_FILENAME)
  File.open(target, 'w+') { |handle| handle.puts(paths) }
end
358
+
359
##
# Reads a collection of paths from a file and stores them via +paths=+.
#
# @param [String] file
#   A file containing one path per line. Defaults to DEFAULT_PATHS_FILENAME
#   inside the config directory.
#
# @return [Integer]
#   Number of paths read.
#
# @raise [Config::InvalidConfig]
#   If the file does not exist.
def paths_file_read(file = nil)
  source = file || File.join(@directory, DEFAULT_PATHS_FILENAME)
  raise Config::InvalidConfig, "File not found: #{source}" unless File.exist?(source)

  self.paths = File.readlines(source)

  # Return the number of paths.
  paths.length
end
379
+
380
##
# Get roots.
#
# Builds a Hash mapping section names to their URLs. 'after' is always
# present; 'before' is added only when a "before" section exists.
# Example: { 'after' => <after URL>, 'before' => <before URL> }.
# The Hash is rebuilt, and stored in @roots, on every call.
def roots
  found = { 'after' => after_url }
  found['before'] = before_url if before
  @roots = found
end
390
+
391
##
# Gets a setting.
#
# @param [String, Symbol] key
#   A key. Symbols are converted to strings before the lookup.
#
# @return [*]
#   The value from the 'settings' section, or nil if the key is not there.
def setting(key)
  name = key.is_a?(Symbol) ? key.to_s : key
  known = @config['settings']
  known.key?(name) ? known[name] : nil
end
403
+
404
##
# Gets all settings.
#
# The Hash stored in the config is returned as is (not a copy), so changes
# made by the caller are visible here.
#
# TODO: Make sure the settings are not writable.
#
# @return [Hash]
#   All settings.
def settings
  @config['settings']
end
135
414
 
136
415
  # Checks if the configuration is usable for diff-ing.
416
+ # TODO: Do we actually need the opts argument?
137
417
  def validate(opts = {})
138
- opts = { :need_before => true }.merge(opts)
418
+ opts = { need_before: true }.merge(opts)
419
+
420
+ if opts[:need_before] && !before['url']
421
+ raise InvalidConfig, "Undefined 'before' base URL."
422
+ end
139
423
 
140
- raise InvalidConfig, "Undefined 'before' base URL." if \
141
- opts[:need_before] && !before['url']
142
424
  raise InvalidConfig, "Undefined 'after' base URL." unless after['url']
143
- raise InvalidConfig, "Undefined 'paths'." unless (paths and !paths.empty?)
425
+
426
+ # Validate interval and concurrency.
427
+ interval = setting(:interval)
428
+ concurrency = setting(:concurrency)
429
+ if interval.to_i != 0 && concurrency != 1
430
+ raise InvalidConfig, 'Concurrency must be 1 when an interval is set.'
431
+ end
432
+
433
+ # Validate preset.
434
+ Preset.exist? setting(:preset), true if setting(:preset)
435
+ end
436
+
437
##
# Returns object clone with stringified keys.
#
# Anything that does not respond to +each_key+ is returned unchanged.
# Otherwise a new Hash is built in which Symbol keys become Strings (other
# keys are kept as they are) and every value is processed the same way.
# TODO: Make this method available globally, if required.
def self.stringify_keys(object)
  # Leave anything that is not hash-like alone.
  return object unless object.respond_to?('each_key')

  converted = {}
  object.each_key do |key|
    name = key.is_a?(Symbol) ? key.to_s : key
    converted[name] = stringify_keys(object[key])
  end
  converted
end
454
+
455
##
# Creates a RegExp from a string.
#
# @param [String] string_param
#   Regular expression source.
#
# @return [Regexp, nil]
#   The compiled expression; nil for an empty string or for a pattern that
#   does not compile (the problem is reported through SiteDiff.log).
def self.create_regexp(string_param)
  return nil if string_param == ''

  Regexp.new(string_param)
rescue RegexpError => e
  # Regexp.new reports a malformed pattern with RegexpError, so that is the
  # error that has to be rescued for the message below to ever be logged.
  SiteDiff.log "Invalid RegExp: #{string_param}", :error
  SiteDiff.log e.message, :error
  # TODO: Use SiteDiff.log type :debug
  # SiteDiff.log e.backtrace, :error if options[:verbose]
  nil
end
469
+
470
##
# Return merged CURL options.
#
# Starts from UriWrapper::DEFAULT_CURL_OPTS, lets the 'curl_opts' setting
# override or extend it, and turns the strings 'true' / 'false' into real
# booleans. Keys are kept exactly as they come from both sources.
def curl_opts
  # We do want string keys here
  booleans = { 'true' => true, 'false' => false }
  merged = UriWrapper::DEFAULT_CURL_OPTS
           .clone
           .merge(settings['curl_opts'] || {})

  merged.each_with_object({}) do |(name, value), opts|
    opts[name] = booleans.fetch(value, value)
  end
end
145
481
 
146
482
  private
147
483
 
484
##
# Returns one of the "before" or "after" sections.
#
# Without a preset the stored section Hash itself is returned. With a preset
# the result is a copy carrying the preset's array-valued rules after the
# section's own, so the stored config is never changed by this call.
#
# @param [String|Symbol] name
#   Section name. Example: before, after.
# @param [Boolean] with_preset
#   Whether to merge with preset config (if any).
#
# @return [Hash|Nil]
#   Section data or Nil.
#
# @raise [SiteDiffException]
#   If the section name is neither "before" nor "after".
def section(name, with_preset = false)
  name = name.to_s if name.is_a? Symbol

  # Validate section.
  unless %w[before after].include? name
    raise SiteDiffException, '"name" must be one of "before" or "after".'
  end

  # Return nil if section is not defined.
  result = @config[name]
  return nil unless result

  # Merge preset rules, if required.
  preset = setting(:preset)
  return result unless with_preset && !preset.nil?

  preset_config = Preset.read preset

  # Work on a shallow copy: writing the combined rules back into @config
  # would append the preset again on every call and expose it to callers
  # that did not ask for it.
  result = result.dup

  # Merge plugins with array values.
  # TODO: This won't be required after plugin declarations are improved.
  # See https://rm.ewdev.ca/issues/18301
  Sanitizer::TOOLS[:array].each do |key|
    next unless preset_config[key]

    result[key] = (result[key] || []) + preset_config[key]
  end

  result
end
524
+
148
525
  def self.normalize_paths(paths)
149
526
  paths ||= []
150
- return paths.map { |p| (p[0] == '/' ? p : "/#{p}").chomp }
527
+ paths.map { |p| (p[0] == '/' ? p : "/#{p}").chomp }
151
528
  end
152
529
 
153
530
  # reads a YAML file and raises an InvalidConfig if the file is not valid.
154
531
  def self.load_raw_yaml(file)
155
- SiteDiff::log "Reading config file: #{Pathname.new(file).expand_path}"
532
+ # TODO: Only show this in verbose mode.
533
+ SiteDiff.log "Reading config file: #{Pathname.new(file).expand_path}"
156
534
  conf = YAML.load_file(file) || {}
535
+
157
536
  unless conf.is_a? Hash
158
537
  raise InvalidConfig, "Invalid configuration file: '#{file}'"
159
538
  end
160
- conf.each do |k,v|
161
- unless CONF_KEYS.include? k
539
+
540
+ conf.each_key do |k, _v|
541
+ unless ALLOWED_CONFIG_KEYS.include? k
162
542
  raise InvalidConfig, "Unknown configuration key (#{file}): '#{k}'"
163
543
  end
164
544
  end
545
+
165
546
  conf
166
547
  end
167
548
 
168
549
  # loads a single YAML configuration file, merges all its 'included' files
169
550
  # and returns a normalized Hash.
170
- def self.load_conf(file, visited=[])
551
+ def self.load_conf(file, visited = [])
171
552
  # don't get fooled by a/../a/ or symlinks
172
553
  file = File.realpath(file)
173
554
  if visited.include? file
@@ -179,11 +560,11 @@ class SiteDiff
179
560
 
180
561
  # normalize and merge includes
181
562
  includes = conf['includes'] || []
182
- conf = Config::normalize(conf)
563
+ conf = Config.normalize(conf)
183
564
  includes.each do |dep|
184
565
  # include paths are relative to the including file.
185
566
  dep = File.join(File.dirname(file), dep)
186
- conf = Config::merge(conf, load_conf(dep, visited))
567
+ conf = Config.merge(conf, load_conf(dep, visited))
187
568
  end
188
569
  conf
189
570
  end