sitediff 0.0.6 → 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,238 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'fileutils'
4
+ require 'json'
5
+ require 'minitar'
6
+ require 'sitediff'
7
+ require 'sitediff/config'
8
+ require 'zlib'
9
+
10
class SiteDiff
  ##
  # SiteDiff Report Helper.
  #
  # Writes the artifacts of a comparison run into a report directory: the
  # per-page diff files, the list of failing paths, an HTML or JSON report,
  # the settings file, and (when exporting) a packaged archive.
  class Report
    attr_reader :results, :cache

    ##
    # Directory where diffs will be generated.
    DIFFS_DIR = 'diffs'

    ##
    # Name of file containing a list of pages with diffs.
    FAILURES_FILE = 'failures.txt'

    ##
    # Name of file containing HTML report of diffs.
    REPORT_FILE_HTML = 'report.html'

    ##
    # Name of file containing JSON report of diffs.
    REPORT_FILE_JSON = 'report.json'

    ##
    # Name of file containing exported file archive.
    REPORT_FILE_TAR = 'report.tgz'

    ##
    # Name of directory in which to build the portable report.
    REPORT_BUILD_DIR = '_tmp_report'

    ##
    # Name of the portable report directory.
    REPORT_DIR = 'report'

    ##
    # Path to settings used for report.
    SETTINGS_FILE = 'settings.yaml'

    ##
    # Creates a Report object.
    #
    # @param [Config] config
    #   Configuration; this class reads before_url, after_url and export.
    # @param [Cache] cache
    #   Cache handed through to Diff.generate_html.
    # @param [Array] results
    #   Result objects for the compared paths.
    def initialize(config, cache, results)
      @config = config
      @cache = cache
      @results = results
    end

    ##
    # Generates an HTML report.
    #
    # @param [String] dir
    #   The directory in which the report is to be generated.
    # @param [String, nil] report_before
    #   "Before" URL shown in the report; defaults to the configured one.
    # @param [String, nil] report_after
    #   "After" URL shown in the report; defaults to the configured one.
    def generate_html(
      dir,
      report_before = nil,
      report_after = nil
    )
      report_before ||= @config.before_url
      report_after ||= @config.after_url

      dir = SiteDiff.ensure_dir dir

      write_diffs dir
      write_failures dir

      # Prepare report.
      report = Diff.generate_html(
        @results,
        report_before,
        report_after,
        @cache,
        @config.export
      )

      # Write report, replacing any previous one.
      report_file = dir + REPORT_FILE_HTML
      report_file.unlink if report_file.file?
      report_file.open('w') { |f| f.write(report) }

      write_settings dir, report_before, report_after

      if @config.export
        package_report(dir)
      else
        SiteDiff.log 'Report generated to ' + report_file.expand_path.to_s
      end
    end

    ##
    # Generates a JSON report.
    #
    # The report holds the number of compared paths, the number of paths
    # that are not a success, and one entry (path, status, message) per path.
    #
    # @param dir
    #   The directory in which the report is to be generated.
    def generate_json(dir)
      dir = SiteDiff.ensure_dir dir
      write_diffs dir
      write_failures dir

      # Prepare report.
      report = {
        paths_compared: @results.length,
        paths_diffs: 0,
        paths: {}
      }
      @results.each do |item|
        report[:paths_diffs] += 1 unless item.success?

        report[:paths][item.path] = {
          path: item.path,
          status: item.status,
          message: item.error
        }
      end
      report = JSON.generate(report)

      # Write report, replacing any previous one.
      report_file = dir + REPORT_FILE_JSON
      report_file.unlink if report_file.file?
      report_file.open('w') { |f| f.write(report) }

      # No before/after URLs are passed here, so both are written as nil.
      write_settings dir

      SiteDiff.log 'Report generated to ' + report_file.expand_path.to_s
    end

    ##
    # Package report for export.
    #
    # Moves the HTML report and the diffs directory into a temporary build
    # directory, archives it as REPORT_FILE_TAR inside "dir" and removes the
    # build directory again.
    #
    # @param [Pathname] dir
    #   The directory containing the generated report.
    def package_report(dir)
      # Create temporary report directories.
      temp_path = dir + REPORT_BUILD_DIR
      temp_path.rmtree if temp_path.directory?
      temp_path.mkpath
      report_path = temp_path + REPORT_DIR
      report_path.mkpath
      files_path = report_path + 'files'
      files_path.mkpath
      diffs_path = dir + DIFFS_DIR

      # Move files to place.
      FileUtils.move(dir + REPORT_FILE_HTML, report_path)
      FileUtils.move(diffs_path, files_path) if diffs_path.directory?

      # Make tar file. Paths are relative to the build directory so the
      # archive contains a top-level REPORT_DIR entry.
      # NOTE(review): relies on Minitar.pack closing the gzip writer (and
      # with it the file handle) when it finishes; confirm against minitar.
      Dir.chdir(temp_path) do
        Minitar.pack(
          REPORT_DIR,
          Zlib::GzipWriter.new(File.open(REPORT_FILE_TAR, 'wb'))
        )
      end
      FileUtils.move(temp_path + REPORT_FILE_TAR, dir)
      temp_path.rmtree
      SiteDiff.log 'Archived report generated to ' + dir.join(REPORT_FILE_TAR).to_s
    end

    ##
    # Creates diff files in a directory named "diffs".
    #
    # If "dir" is /foo/bar, then diffs will be placed in /foo/bar/diffs.
    #
    # @param [Pathname] dir
    #   The directory in which a "diffs" directory is to be generated.
    # @raise [ArgumentError] if dir is not a Pathname.
    def write_diffs(dir)
      assert_pathname dir

      # Delete existing "diffs" dir, if exists.
      diff_dir = dir + DIFFS_DIR
      diff_dir.rmtree if diff_dir.exist?

      # Write diffs to the diff directory; only failed results are dumped.
      @results.each { |r| r.dump(dir, @config.export) if r.status == Result::STATUS_FAILURE }
      SiteDiff.log "All diff files written to #{diff_dir.expand_path}" unless @config.export
    end

    ##
    # Writes paths with diffs into a file.
    #
    # @param [Pathname] dir
    #   The directory in which the report is to be generated.
    # @raise [ArgumentError] if dir is not a Pathname.
    def write_failures(dir)
      assert_pathname dir

      failures = dir + FAILURES_FILE
      SiteDiff.log "All failures written to #{failures.expand_path}"
      failures.open('w') do |f|
        @results.each { |r| f.puts r.path unless r.success? }
      end
    end

    ##
    # Creates report settings.yaml file.
    #
    # TODO: Find a way to avoid having to create this file.
    #
    # @param [Pathname] dir
    #   The directory in which the report is to be generated.
    # @param [String, nil] report_before
    #   Value stored under the "before" key.
    # @param [String, nil] report_after
    #   Value stored under the "after" key.
    # @raise [ArgumentError] if dir is not a Pathname.
    def write_settings(dir, report_before = nil, report_after = nil)
      assert_pathname dir

      settings = {
        'before' => report_before,
        'after' => report_after,
        'cached' => %w[before after]
      }
      # NOTE(review): YAML is not required by this file; presumably it is
      # loaded through sitediff/config. Confirm.
      dir.+(SETTINGS_FILE).open('w') { |f| YAML.dump(settings, f) }
    end

    ##
    # Returns CSS for HTML report.
    def self.css
      %w[normalize.css sitediff.css]
        .map { |name| File.read(File.join(SiteDiff::FILES_DIR, name)) }
        .join
    end

    ##
    # Returns JS for HTML report.
    def self.js
      %w[jquery.min.js sitediff.js]
        .map { |name| File.read(File.join(SiteDiff::FILES_DIR, name)) }
        .join
    end

    private

    ##
    # Ensures that "dir" is a Pathname.
    #
    # The writer methods use Pathname#+ and Pathname#open, so anything else
    # is rejected up front with a clear message.
    #
    # @raise [ArgumentError] if dir is not a Pathname.
    def assert_pathname(dir)
      raise ArgumentError, 'dir must be a Pathname' unless dir.is_a? Pathname
    end
  end
end
@@ -2,25 +2,42 @@
2
2
 
3
3
  require 'sitediff'
4
4
  require 'sitediff/diff'
5
+ require 'sitediff/report'
5
6
  require 'digest/sha1'
6
7
  require 'fileutils'
7
8
 
8
9
  class SiteDiff
9
- class Result < Struct.new(:path, :before, :after, :before_encoding, :after_encoding, :error, :verbose)
10
+ # SiteDiff Result Object.
11
+ class Result < Struct.new(
12
+ :path,
13
+ :before,
14
+ :after,
15
+ :before_encoding,
16
+ :after_encoding,
17
+ :error,
18
+ :verbose
19
+ )
10
20
  STATUS_SUCCESS = 0 # Identical before and after
11
21
  STATUS_FAILURE = 1 # Different before and after
12
22
  STATUS_ERROR = 2 # Couldn't fetch page
13
- STATUS_TEXT = %w[success failure error].freeze
23
+ STATUS_TEXT = %w[unchanged changed error].freeze
14
24
 
15
25
  attr_reader :status, :diff
16
26
 
27
+ ##
28
+ # Creates a Result.
17
29
  def initialize(*args)
18
30
  super
19
31
  if error
20
32
  @status = STATUS_ERROR
21
33
  else
22
34
  if !before_encoding || !after_encoding
23
- @diff = Diff.binary_diffy(before, after, before_encoding, after_encoding)
35
+ @diff = Diff.binary_diffy(
36
+ before,
37
+ after,
38
+ before_encoding,
39
+ after_encoding
40
+ )
24
41
  else
25
42
  @diff = Diff.html_diffy(before, after)
26
43
  end
@@ -28,10 +45,22 @@ class SiteDiff
28
45
  end
29
46
  end
30
47
 
48
+ ##
49
+ # Whether the result has no diff.
50
+ #
51
+ # If there is a diff, it is not a success.
52
+ #
53
+ # TODO: Change "Success" to unchanged.
31
54
  def success?
32
55
  status == STATUS_SUCCESS
33
56
  end
34
57
 
58
+ ##
59
+ # Whether the result has an error.
60
+ def error?
61
+ status == STATUS_ERROR
62
+ end
63
+
35
64
  # Textual representation of the status
36
65
  def status_text
37
66
  STATUS_TEXT[status]
@@ -45,38 +74,37 @@ class SiteDiff
45
74
 
46
75
  # Filename to store diff
47
76
  def filename
48
- File.join(SiteDiff::DIFFS_DIR, Digest::SHA1.hexdigest(path) + '.html')
77
+ File.join(Report::DIFFS_DIR, Digest::SHA1.hexdigest(path) + '.html')
49
78
  end
50
79
 
51
- # Text of the link in the HTML report
52
- def link
53
- case status
54
- when STATUS_ERROR then error
55
- when STATUS_SUCCESS then status_text
56
- when STATUS_FAILURE then "<a href='#{filename}'>DIFF</a>"
57
- end
80
+ # Returns a URL to the result diff.
81
+ #
82
+ # Returns nil if the result has no diffs.
83
def diff_url(relative = false)
  # Only results with a diff (failures) have a diff file to link to.
  return unless status == STATUS_FAILURE

  # Relative URLs omit the leading slash.
  (relative ? 'files/' : '/files/') + filename
end
59
87
 
60
88
  # Log the result to the terminal
61
89
def log(verbose = true)
  current = status
  if current == STATUS_SUCCESS
    SiteDiff.log path, :success, 'UNCHANGED'
  elsif current == STATUS_ERROR
    # The error text is appended to the path itself.
    SiteDiff.log path + " (#{error})", :warning, 'ERROR'
  elsif current == STATUS_FAILURE
    SiteDiff.log path, :error, 'CHANGED'
    # The terminal diff is printed only when verbose output is requested.
    puts Diff.terminal_diffy(before, after) if verbose
  end
end
72
100
 
73
101
  # Dump the result to a file
74
def dump(dir, relative = false)
  target = File.join(dir, filename)
  parent = File.dirname(target)
  # Create the containing directory on first use.
  FileUtils.mkdir_p(parent) unless File.exist?(parent)
  # The diff output is generated after the file has been opened for writing.
  File.open(target, 'w') { |io| io.write(Diff.generate_diff_output(self, relative)) }
end
82
110
  end
@@ -8,21 +8,26 @@ require 'nokogiri'
8
8
  require 'set'
9
9
 
10
10
  class SiteDiff
11
+ # SiteDiff Sanitizer.
11
12
  class Sanitizer
12
13
  class InvalidSanitization < SiteDiffException; end
13
14
 
14
15
  TOOLS = {
15
16
  array: %w[dom_transform sanitization],
16
- scalar: %w[selector remove_spacing]
17
+ scalar: %w[selector remove_spacing ignore_whitespace]
17
18
  }.freeze
18
- DOM_TRANSFORMS = Set.new(%w[remove unwrap_root unwrap remove_class])
19
+ DOM_TRANSFORMS = Set.new(%w[remove strip unwrap_root unwrap remove_class])
19
20
 
21
+ ##
22
+ # Creates a Sanitizer.
20
23
  def initialize(html, config, opts = {})
21
24
  @html = html
22
25
  @config = config
23
26
  @opts = opts
24
27
  end
25
28
 
29
+ ##
30
+ # Performs sanitization.
26
31
  def sanitize
27
32
  return '' if @html == '' # Quick return on empty input
28
33
 
@@ -56,13 +61,13 @@ class SiteDiff
56
61
  def canonicalize_rule(name)
57
62
  (rules = @config[name]) || (return nil)
58
63
 
59
- if rules[0]&.respond_to?(:[]) && rules[0]['value']
60
- # Already an array
64
+ # Already an array? Do nothing.
65
+ if rules[0]&.respond_to?('each') && rules[0]&.fetch('value')
66
+ # If it is a hash, put it in an array.
61
67
  elsif rules['value']
62
- # Hash, put it in an array
63
68
  rules = [rules]
69
+ # If it is a scalar value, put it in an array.
64
70
  else
65
- # Scalar, put it in a hash
66
71
  rules = [{ 'value' => rules }]
67
72
  end
68
73
 
@@ -99,7 +104,13 @@ class SiteDiff
99
104
  # Prevent potential UTF-8 encoding errors by removing bytes
100
105
  # Not the only solution. An alternative is to return the
101
106
  # string unmodified.
102
- @html = @html.encode('UTF-8', 'binary', invalid: :replace, undef: :replace, replace: '')
107
+ @html = @html.encode(
108
+ 'UTF-8',
109
+ 'binary',
110
+ invalid: :replace,
111
+ undef: :replace,
112
+ replace: ''
113
+ )
103
114
  global.each { |r| r.apply(@html) }
104
115
  end
105
116
 
@@ -151,7 +162,13 @@ class SiteDiff
151
162
  # Prevent potential UTF-8 encoding errors by removing invalid bytes.
152
163
  # Not the only solution.
153
164
  # An alternative is to return the string unmodified.
154
- str = str.encode('UTF-8', 'binary', invalid: :replace, undef: :replace, replace: '')
165
+ str = str.encode(
166
+ 'UTF-8',
167
+ 'binary',
168
+ invalid: :replace,
169
+ undef: :replace,
170
+ replace: ''
171
+ )
155
172
  # Remove xml declaration and <html> tags
156
173
  str.sub!(/\A<\?xml.*$\n/, '')
157
174
  str.sub!(/\A^<html>$\n/, '')
@@ -164,6 +181,10 @@ class SiteDiff
164
181
  # Remove blank lines
165
182
  str.gsub!(/^\s*$\n/, '')
166
183
 
184
+ # Remove DOS newlines
185
+ str.gsub!(/\x0D$/, '')
186
+ str.gsub!(/&#13;$/, '')
187
+
167
188
  str
168
189
  end
169
190
 
@@ -11,18 +11,25 @@ class SiteDiff
11
11
  # * { :type => "unwrap", :selector => "div.field-item" }
12
12
  # * { :type => "remove", :selector => "div.extra-stuff" }
13
13
  # * { :type => "remove_class", :class => 'class1' }
14
+ # * { :type => "strip", :selector => 'h1' }
14
15
  class DomTransform
15
- Transforms = {}
16
+ # Supported dom_transform types.
17
+ TRANSFORMS = {}
16
18
 
19
+ ##
20
+ # Creates a DOM Transform.
17
21
  def initialize(rule)
18
22
  @rule = rule
19
23
  end
20
24
 
25
+ ##
21
26
  # Often an array or scalar are both ok values. Turn either into an array.
22
27
  def to_array(val)
23
28
  [val].flatten
24
29
  end
25
30
 
31
+ ##
32
+ # TODO: Document what this method does.
26
33
  def targets(node)
27
34
  selectors = to_array(@rule['selector'])
28
35
  selectors.each do |sel|
@@ -30,42 +37,70 @@ class SiteDiff
30
37
  end
31
38
  end
32
39
 
40
+ ##
41
+ # Applies the transformation to a DOM node.
33
42
  def apply(node)
34
43
  targets(node) { |t| process(t) }
35
44
  end
36
45
 
46
+ ##
47
+ # Registers a DOM Transform plugin.
37
48
  def self.register(name)
38
- Transforms[name] = self
49
+ TRANSFORMS[name] = self
39
50
  end
40
51
 
52
+ ##
53
+ # Creates a DOM Transform as per rule.
41
54
def self.create(rule)
  # A rule without a type cannot be mapped to a transform class.
  type = rule['type']
  raise(InvalidSanitization, 'DOM transform needs a type') unless type

  # Look up the class registered under that type name.
  transform = TRANSFORMS[type]
  raise(InvalidSanitization, "No DOM transform named #{type}") unless transform

  transform.new(rule)
end
48
61
 
49
- # Remove elements matching 'selector'
62
+ ##
63
+ # Remove elements matching 'selector'.
50
64
  class Remove < DomTransform
51
65
  register 'remove'
66
+
67
+ ##
68
+ # Processes a node.
52
69
  def process(node)
53
70
  node.remove
54
71
  end
55
72
  end
56
73
 
57
- # Unwrap elements matching 'selector'
74
+ # Strip leading and trailing whitespace from the content of elements matching 'selector'.
75
+ class Strip < DomTransform
76
+ register 'strip'
77
+
78
+ ##
79
+ # Processes a node.
80
+ def process(node)
81
+ node.content = node.content.strip
82
+ end
83
+ end
84
+
85
+ # Unwrap elements matching 'selector'.
58
86
  class Unwrap < DomTransform
59
87
  register 'unwrap'
88
+
89
+ ##
90
+ # Processes a node.
60
91
  def process(node)
61
92
  node.add_next_sibling(node.children)
62
93
  node.remove
63
94
  end
64
95
  end
65
96
 
97
+ ##
66
98
  # Remove classes from elements matching selector
67
99
  class RemoveClass < DomTransform
68
100
  register 'remove_class'
101
+
102
+ ##
103
+ # Processes a node.
69
104
  def process(node)
70
105
  classes = to_array(@rule['class'])
71
106
 
@@ -77,9 +112,13 @@ class SiteDiff
77
112
  end
78
113
  end
79
114
 
80
- # Unwrap the root element
115
+ ##
116
+ # Unwrap the root element.
81
117
  class UnwrapRoot < DomTransform
82
118
  register 'unwrap_root'
119
+
120
+ ##
121
+ # Applies the transformation to a DOM node.
83
122
  def apply(node)
84
123
  (node.children.size == 1) ||
85
124
  raise(InvalidSanitization, 'Multiple root elements in unwrap_root')