sitediff 0.0.2 → 1.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,76 +1,112 @@
1
+ # frozen_string_literal: true
2
+
1
3
  require 'sitediff'
2
4
  require 'sitediff/diff'
5
+ require 'sitediff/report'
3
6
  require 'digest/sha1'
4
7
  require 'fileutils'
5
8
 
6
9
  class SiteDiff
7
- class Result < Struct.new(:path, :before, :after, :error, :verbose)
10
+ # SiteDiff Result Object.
11
+ class Result < Struct.new(
12
+ :path,
13
+ :before,
14
+ :after,
15
+ :before_encoding,
16
+ :after_encoding,
17
+ :error,
18
+ :verbose
19
+ )
8
20
  STATUS_SUCCESS = 0 # Identical before and after
9
21
  STATUS_FAILURE = 1 # Different before and after
10
22
  STATUS_ERROR = 2 # Couldn't fetch page
11
- STATUS_TEXT = %w[success failure error]
23
+ STATUS_TEXT = %w[unchanged changed error].freeze
12
24
 
13
25
  attr_reader :status, :diff
14
26
 
27
+ ##
28
+ # Creates a Result.
15
29
  def initialize(*args)
16
30
  super
17
31
  if error
18
32
  @status = STATUS_ERROR
19
33
  else
20
- @diff = Diff::html_diffy(before, after)
34
+ if !before_encoding || !after_encoding
35
+ @diff = Diff.binary_diffy(
36
+ before,
37
+ after,
38
+ before_encoding,
39
+ after_encoding
40
+ )
41
+ else
42
+ @diff = Diff.html_diffy(before, after)
43
+ end
21
44
  @status = @diff ? STATUS_FAILURE : STATUS_SUCCESS
22
45
  end
23
46
  end
24
47
 
48
+ ##
49
+ # Whether the result has no diff.
50
+ #
51
+ # If there is a diff, it is not a success.
52
+ #
53
+ # TODO: Change "Success" to unchanged.
25
54
  def success?
26
55
  status == STATUS_SUCCESS
27
56
  end
28
57
 
58
+ ##
59
+ # Whether the result has an error.
60
+ def error?
61
+ status == STATUS_ERROR
62
+ end
63
+
29
64
  # Textual representation of the status
30
65
  def status_text
31
- return STATUS_TEXT[status]
66
+ STATUS_TEXT[status]
32
67
  end
33
68
 
34
69
  # Printable URL
35
70
  def url(tag, prefix, cache)
71
+ return unless prefix
72
+
36
73
  base = cache.read_tags.include?(tag) ? "/cache/#{tag}" : prefix
37
74
  base.to_s + path
38
75
  end
39
76
 
40
77
  # Filename to store diff
41
78
  def filename
42
- File.join(SiteDiff::DIFFS_DIR, Digest::SHA1.hexdigest(self.path) + '.html')
79
+ File.join(Report::DIFFS_DIR, Digest::SHA1.hexdigest(path) + '.html')
43
80
  end
44
81
 
45
- # Text of the link in the HTML report
46
- def link
47
- case status
48
- when STATUS_ERROR then error
49
- when STATUS_SUCCESS then status_text
50
- when STATUS_FAILURE then "<a href='#{filename}'>DIFF</a>"
51
- end
82
+ # Returns a URL to the result diff.
83
+ #
84
+ # Returns nil if the result has no diffs.
85
+ def diff_url(relative = false)
86
+ prefix = relative ? 'files/' : '/files/'
87
+ return prefix + filename if status == STATUS_FAILURE
52
88
  end
53
89
 
54
90
  # Log the result to the terminal
55
- def log(verbose=true)
91
+ def log(verbose = true)
56
92
  case status
57
- when STATUS_SUCCESS then
58
- SiteDiff::log path, :diff_success, 'SUCCESS'
59
- when STATUS_ERROR then
60
- SiteDiff::log path, :warn, "ERROR (#{error})"
61
- when STATUS_FAILURE then
62
- SiteDiff::log path, :diff_failure, "FAILURE"
63
- puts Diff::terminal_diffy(before, after) if verbose
93
+ when STATUS_SUCCESS
94
+ SiteDiff.log path, :success, 'UNCHANGED'
95
+ when STATUS_ERROR
96
+ SiteDiff.log path + " (#{error})", :warning, 'ERROR'
97
+ when STATUS_FAILURE
98
+ SiteDiff.log path, :error, 'CHANGED'
99
+ puts Diff.terminal_diffy(before, after) if verbose
64
100
  end
65
101
  end
66
102
 
67
103
  # Dump the result to a file
68
- def dump(dir)
104
+ def dump(dir, relative = false)
69
105
  dump_path = File.join(dir, filename)
70
106
  base = File.dirname(dump_path)
71
- FileUtils::mkdir_p(base) unless File.exists?(base)
107
+ FileUtils.mkdir_p(base) unless File.exist?(base)
72
108
  File.open(dump_path, 'w') do |f|
73
- f.write(Diff::generate_diff_output(self))
109
+ f.write(Diff.generate_diff_output(self, relative))
74
110
  end
75
111
  end
76
112
  end
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  require 'sitediff'
2
4
  require 'sitediff/exception'
3
5
  require 'sitediff/sanitize/dom_transform'
@@ -6,183 +8,253 @@ require 'nokogiri'
6
8
  require 'set'
7
9
 
8
10
  class SiteDiff
9
- class Sanitizer
10
- class InvalidSanitization < SiteDiffException; end
11
-
12
- TOOLS = {
13
- :array => %w[dom_transform sanitization],
14
- :scalar => %w[selector remove_spacing],
15
- }
16
- DOM_TRANSFORMS = Set.new(%w[remove unwrap_root unwrap remove_class])
17
-
18
- def initialize(html, config, opts = {})
19
- @html = html
20
- @config = config
21
- @opts = opts
22
- end
11
+ # SiteDiff Sanitizer.
12
+ class Sanitizer
13
+ class InvalidSanitization < SiteDiffException; end
14
+
15
+ TOOLS = {
16
+ array: %w[dom_transform sanitization],
17
+ scalar: %w[selector remove_spacing ignore_whitespace]
18
+ }.freeze
19
+ DOM_TRANSFORMS = Set.new(%w[remove strip unwrap_root unwrap remove_class])
20
+
21
+ ##
22
+ # Creates a Sanitizer.
23
+ def initialize(html, config, opts = {})
24
+ @html = html
25
+ @config = config
26
+ @opts = opts
27
+ end
23
28
 
24
- def sanitize
25
- return '' if @html == '' # Quick return on empty input
29
+ ##
30
+ # Performs sanitization.
31
+ def sanitize
32
+ return '' if @html == '' # Quick return on empty input
26
33
 
27
- @node, @html = Sanitizer.domify(@html), nil
34
+ @node = Sanitizer.domify(@html)
35
+ @html = nil
28
36
 
29
- remove_spacing
30
- selector
31
- dom_transforms
32
- regexps
37
+ remove_spacing
38
+ regions || selector
39
+ dom_transforms
40
+ regexps
33
41
 
34
- return @html || Sanitizer.prettify(@node)
35
- end
42
+ @html || Sanitizer.prettify(@node)
43
+ end
36
44
 
37
- # Return whether or not we want to keep a rule
38
- def want_rule(rule)
39
- return false unless rule
40
- return false if rule['disabled']
45
+ # Return whether or not we want to keep a rule
46
+ def want_rule(rule)
47
+ return false unless rule
48
+ return false if rule['disabled']
41
49
 
42
- # Filter out if path regexp doesn't match
43
- if (pathre = rule['path']) and (path = @opts[:path])
44
- return ::Regexp.new(pathre).match(path)
45
- end
50
+ # Filter out if path regexp doesn't match
51
+ if (pathre = rule['path']) && (path = @opts[:path])
52
+ return ::Regexp.new(pathre).match(path)
53
+ end
46
54
 
47
- return true
48
- end
55
+ true
56
+ end
49
57
 
50
- # Canonicalize a simple rule, eg: 'remove_spacing' or 'selector'.
51
- # It may be a simple value, or a hash, or an array of hashes.
52
- # Turn it into an array of hashes.
53
- def canonicalize_rule(name)
54
- rules = @config[name] or return nil
55
-
56
- if rules[0] && rules[0].respond_to?(:[]) && rules[0]['value']
57
- # Already an array
58
- elsif rules['value']
59
- # Hash, put it in an array
60
- rules = [rules]
61
- else
62
- # Scalar, put it in a hash
63
- rules = [{ 'value' => rules }]
64
- end
58
+ # Canonicalize a simple rule, eg: 'remove_spacing' or 'selector'.
59
+ # It may be a simple value, or a hash, or an array of hashes.
60
+ # Turn it into an array of hashes.
61
+ def canonicalize_rule(name)
62
+ (rules = @config[name]) || (return nil)
63
+
64
+ # Already an array? Do nothing.
65
+ if rules[0]&.respond_to?('each') && rules[0]&.fetch('value')
66
+ # If it is a hash, put it in an array.
67
+ elsif rules['value']
68
+ rules = [rules]
69
+ # If it is a scalar value, put it in an array.
70
+ else
71
+ rules = [{ 'value' => rules }]
72
+ end
73
+
74
+ want = rules.select { |r| want_rule(r) }
75
+ return nil if want.empty?
76
+ raise "Too many matching rules of type #{name}" if want.size > 1
77
+
78
+ want.first
79
+ end
65
80
 
66
- want = rules.select { |r| want_rule(r) }
67
- return nil if want.empty?
68
- raise "Too many matching rules of type #{name}" if want.size > 1
69
- return want.first
70
- end
81
+ # Perform 'remove_spacing' action
82
+ def remove_spacing
83
+ (rule = canonicalize_rule('remove_spacing')) || return
84
+ Sanitizer.remove_node_spacing(@node) if rule['value']
85
+ end
71
86
 
72
- # Perform 'remove_spacing' action
73
- def remove_spacing
74
- rule = canonicalize_rule('remove_spacing') or return
75
- Sanitizer.remove_node_spacing(@node) if rule['value']
76
- end
87
+ # Perform 'regions' action, don't perform 'selector' if regions exist.
88
+ def regions
89
+ return unless validate_regions
77
90
 
78
- # Perform 'selector' action, to choose a new root
79
- def selector
80
- rule = canonicalize_rule('selector') or return
81
- @node = Sanitizer.select_fragments(@node, rule['value'])
82
- end
91
+ @node = select_regions(@node, @config['regions'], @opts[:output])
92
+ end
83
93
 
84
- # Applies regexps. Also
85
- def regexps
86
- rules = @config['sanitization'] or return
87
- rules = rules.select { |r| want_rule(r) }
94
+ # Perform 'selector' action, to choose a new root
95
+ def selector
96
+ (rule = canonicalize_rule('selector')) || return
97
+ @node = Sanitizer.select_fragments(@node, rule['value'])
98
+ end
88
99
 
89
- rules.map! { |r| Regexp.create(r) }
90
- selector, global = rules.partition { |r| r.selector? }
100
+ # Applies regexps. Also
101
+ def regexps
102
+ (rules = @config['sanitization']) || return
103
+ rules = rules.select { |r| want_rule(r) }
104
+
105
+ rules.map! { |r| Regexp.create(r) }
106
+ selector, global = rules.partition(&:selector?)
107
+
108
+ selector.each { |r| r.apply(@node) }
109
+ @html = Sanitizer.prettify(@node)
110
+ @node = nil
111
+ # Prevent potential UTF-8 encoding errors by removing bytes
112
+ # Not the only solution. An alternative is to return the
113
+ # string unmodified.
114
+ @html = @html.encode(
115
+ 'UTF-8',
116
+ 'binary',
117
+ invalid: :replace,
118
+ undef: :replace,
119
+ replace: ''
120
+ )
121
+ global.each { |r| r.apply(@html) }
122
+ end
91
123
 
92
- selector.each { |r| r.apply(@node) }
93
- @html, @node = Sanitizer.prettify(@node), nil
94
- global.each { |r| r.apply(@html) }
95
- end
124
+ # Perform DOM transforms
125
+ def dom_transforms
126
+ (rules = @config['dom_transform']) || return
127
+ rules = rules.select { |r| want_rule(r) }
96
128
 
97
- # Perform DOM transforms
98
- def dom_transforms
99
- rules = @config['dom_transform'] or return
100
- rules = rules.select { |r| want_rule(r) }
129
+ rules.each do |rule|
130
+ transform = DomTransform.create(rule)
131
+ transform.apply(@node)
132
+ end
133
+ end
101
134
 
102
- rules.each do |rule|
103
- transform = DomTransform.create(rule)
104
- transform.apply(@node)
105
- end
106
- end
135
+ ##### Implementations of actions #####
107
136
 
108
- ##### Implementations of actions #####
137
+ # Remove double-spacing inside text nodes
138
+ def self.remove_node_spacing(node)
139
+ # remove double spacing, but only inside text nodes (eg not attributes)
140
+ node.xpath('//text()').each do |el|
141
+ el.content = el.content.gsub(/ +/, ' ')
142
+ end
143
+ end
109
144
 
110
- # Remove double-spacing inside text nodes
111
- def self.remove_node_spacing(node)
112
- # remove double spacing, but only inside text nodes (eg not attributes)
113
- node.xpath('//text()').each do |el|
114
- el.content = el.content.gsub(/ +/, ' ')
115
- end
116
- end
145
+ # Restructure the node into regions.
146
+ def select_regions(node, regions, output)
147
+ regions = output.map do |name|
148
+ selector = get_named_region(regions, name)['selector']
149
+ region = Nokogiri::XML.fragment('<region id="' + name + '"></region>').at_css('region')
150
+ matching = node.css(selector)
151
+ matching.each { |m| region.add_child m }
152
+ region
153
+ end
154
+ node = Nokogiri::HTML.fragment('')
155
+ regions.each { |r| node.add_child r }
156
+ node
157
+ end
117
158
 
118
- # Get a fragment consisting of the elements matching the selector(s)
119
- def self.select_fragments(node, sel)
120
- # When we choose a new root, we always become a DocumentFragment,
121
- # and lose any DOCTYPE and such.
122
- ns = node.css(sel)
123
- unless node.fragment?
124
- node = Nokogiri::HTML.fragment('')
125
- end
126
- node.children = ns
127
- return node
128
- end
159
+ # Get a fragment consisting of the elements matching the selector(s)
160
+ def self.select_fragments(node, sel)
161
+ # When we choose a new root, we always become a DocumentFragment,
162
+ # and lose any DOCTYPE and such.
163
+ ns = node.css(sel)
164
+ node = Nokogiri::HTML.fragment('') unless node.fragment?
165
+ node.children = ns
166
+ node
167
+ end
129
168
 
130
- # Pretty-print some HTML
131
- def self.prettify(obj)
132
- @stylesheet ||= begin
133
- stylesheet_path = File.join(SiteDiff::FILES_DIR, 'pretty_print.xsl')
134
- Nokogiri::XSLT(File.read(stylesheet_path))
135
- end
169
+ # Pretty-print some HTML
170
+ def self.prettify(obj)
171
+ @stylesheet ||= begin
172
+ stylesheet_path = File.join(SiteDiff::FILES_DIR, 'pretty_print.xsl')
173
+ Nokogiri::XSLT(File.read(stylesheet_path))
174
+ end
175
+
176
+ # Pull out the html element's children
177
+ # The obvious way to do this is to iterate over pretty.css('html'),
178
+ # but that tends to segfault Nokogiri
179
+ str = @stylesheet.apply_to(to_document(obj))
180
+
181
+ # There's a lot of cruft left over,that we don't want
182
+
183
+ # Prevent potential UTF-8 encoding errors by removing invalid bytes.
184
+ # Not the only solution.
185
+ # An alternative is to return the string unmodified.
186
+ str = str.encode(
187
+ 'UTF-8',
188
+ 'binary',
189
+ invalid: :replace,
190
+ undef: :replace,
191
+ replace: ''
192
+ )
193
+ # Remove xml declaration and <html> tags
194
+ str.sub!(/\A<\?xml.*$\n/, '')
195
+ str.sub!(/\A^<html>$\n/, '')
196
+ str.sub!(%r{</html>\n\Z}, '')
197
+
198
+ # Remove top-level indentation
199
+ indent = /\A(\s*)/.match(str)[1].size
200
+ str.gsub!(/^\s{,#{indent}}/, '')
201
+
202
+ # Remove blank lines
203
+ str.gsub!(/^\s*$\n/, '')
204
+
205
+ # Remove DOS newlines
206
+ str.gsub!(/\x0D$/, '')
207
+ str.gsub!(/&#13;$/, '')
208
+
209
+ str
210
+ end
211
+
212
+ # Parse HTML into a node
213
+ def self.domify(str, force_doc = false)
214
+ if force_doc || /<!DOCTYPE/.match(str[0, 512])
215
+ Nokogiri::HTML(str)
216
+ else
217
+ Nokogiri::HTML.fragment(str)
218
+ end
219
+ end
136
220
 
137
- # Pull out the html element's children
138
- # The obvious way to do this is to iterate over pretty.css('html'),
139
- # but that tends to segfault Nokogiri
140
- str = @stylesheet.apply_to(to_document(obj))
221
+ # Force this object to be a document, so we can apply a stylesheet
222
+ def self.to_document(obj)
223
+ if Nokogiri::XML::Document == obj.class || Nokogiri::HTML::Document == obj.class
224
+ obj
225
+ # node or fragment
226
+ elsif Nokogiri::XML::Node == obj.class || Nokogiri::HTML::DocumentFragment == obj.class
227
+ domify(obj.to_s, true)
228
+ else
229
+ to_document(domify(obj, false))
230
+ end
231
+ end
141
232
 
142
- # There's a lot of cruft left over,that we don't want
233
+ private
143
234
 
144
- # Remove xml declaration and <html> tags
145
- str.sub!(/\A<\?xml.*$\n/, '')
146
- str.sub!(/\A^<html>$\n/, '')
147
- str.sub!(%r[</html>\n\Z], '')
235
+ # Validate `regions` and `output` from config.
236
+ def validate_regions
237
+ return false unless @config['regions'].is_a?(Array)
148
238
 
149
- # Remove top-level indentation
150
- indent = /\A(\s*)/.match(str)[1].size
151
- str.gsub!(/^\s{,#{indent}}/, '')
239
+ return false unless @opts[:output].is_a?(Array)
152
240
 
153
- # Remove blank lines
154
- str.gsub!(/^\s*$\n/, '')
241
+ regions = @config['regions']
242
+ output = @opts[:output]
243
+ regions.each do |region|
244
+ return false unless region.key?('name') && region.key?('selector')
245
+ end
155
246
 
156
- return str
157
- end
247
+ # Check that each named output has an associated region.
248
+ output.each do |name|
249
+ return false unless get_named_region(regions, name)
250
+ end
158
251
 
159
- # Parse HTML into a node
160
- def self.domify(str, force_doc = false)
161
- if force_doc || /<!DOCTYPE/.match(str[0, 512])
162
- return Nokogiri::HTML(str)
163
- else
164
- return Nokogiri::HTML.fragment(str)
165
- end
166
- end
252
+ true
253
+ end
167
254
 
168
- # Force this object to be a document, so we can apply a stylesheet
169
- def self.to_document(obj)
170
- if Nokogiri::XML::Document === obj
171
- return obj
172
- elsif Nokogiri::XML::Node === obj # node or fragment
173
- return domify(obj.to_s, true)
174
-
175
- # This ought to work, and would be faster,
176
- # but seems to segfault Nokogiri
177
- if false
178
- doc = Nokogiri::HTML('<html><body>')
179
- doc.at('body').children = obj.children
180
- return doc
181
- end
182
- else
183
- return to_document(domify(obj))
255
+ # Return the selector from a named region.
256
+ def get_named_region(regions, name)
257
+ regions.find { |region| region['name'] == name }
258
+ end
184
259
  end
185
260
  end
186
-
187
- end
188
- end