sitediff 0.0.2 → 1.1.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,76 +1,112 @@
1
+ # frozen_string_literal: true
2
+
1
3
  require 'sitediff'
2
4
  require 'sitediff/diff'
5
+ require 'sitediff/report'
3
6
  require 'digest/sha1'
4
7
  require 'fileutils'
5
8
 
6
9
  class SiteDiff
7
- class Result < Struct.new(:path, :before, :after, :error, :verbose)
10
+ # SiteDiff Result Object.
11
+ class Result < Struct.new(
12
+ :path,
13
+ :before,
14
+ :after,
15
+ :before_encoding,
16
+ :after_encoding,
17
+ :error,
18
+ :verbose
19
+ )
8
20
  STATUS_SUCCESS = 0 # Identical before and after
9
21
  STATUS_FAILURE = 1 # Different before and after
10
22
  STATUS_ERROR = 2 # Couldn't fetch page
11
- STATUS_TEXT = %w[success failure error]
23
+ STATUS_TEXT = %w[unchanged changed error].freeze
12
24
 
13
25
  attr_reader :status, :diff
14
26
 
27
+ ##
28
+ # Creates a Result.
15
29
  def initialize(*args)
16
30
  super
17
31
  if error
18
32
  @status = STATUS_ERROR
19
33
  else
20
- @diff = Diff::html_diffy(before, after)
34
+ if !before_encoding || !after_encoding
35
+ @diff = Diff.binary_diffy(
36
+ before,
37
+ after,
38
+ before_encoding,
39
+ after_encoding
40
+ )
41
+ else
42
+ @diff = Diff.html_diffy(before, after)
43
+ end
21
44
  @status = @diff ? STATUS_FAILURE : STATUS_SUCCESS
22
45
  end
23
46
  end
24
47
 
48
+ ##
49
+ # Whether the result has no diff.
50
+ #
51
+ # If there is a diff, it is not a success.
52
+ #
53
+ # TODO: Change "Success" to unchanged.
25
54
  def success?
26
55
  status == STATUS_SUCCESS
27
56
  end
28
57
 
58
+ ##
59
+ # Whether the result has an error.
60
+ def error?
61
+ status == STATUS_ERROR
62
+ end
63
+
29
64
  # Textual representation of the status
30
65
  def status_text
31
- return STATUS_TEXT[status]
66
+ STATUS_TEXT[status]
32
67
  end
33
68
 
34
69
# Printable URL for this result's path.
#
# Returns nil when no prefix is given. Otherwise the path is appended to
# "/cache/<tag>" if the cache reports that tag, or to the prefix itself.
def url(tag, prefix, cache)
  return unless prefix

  cached = cache.read_tags.include?(tag)
  root = cached ? "/cache/#{tag}" : prefix.to_s
  root + path
end
39
76
 
40
77
  # Filename to store diff
41
78
  def filename
42
- File.join(SiteDiff::DIFFS_DIR, Digest::SHA1.hexdigest(self.path) + '.html')
79
+ File.join(Report::DIFFS_DIR, Digest::SHA1.hexdigest(path) + '.html')
43
80
  end
44
81
 
45
- # Text of the link in the HTML report
46
- def link
47
- case status
48
- when STATUS_ERROR then error
49
- when STATUS_SUCCESS then status_text
50
- when STATUS_FAILURE then "<a href='#{filename}'>DIFF</a>"
51
- end
82
# Returns a URL to the result diff.
#
# The URL is the diff filename under "files/" (relative) or "/files/"
# (absolute). Returns nil unless the status is STATUS_FAILURE.
def diff_url(relative = false)
  return unless status == STATUS_FAILURE

  (relative ? 'files/' : '/files/') + filename
end
53
89
 
54
90
  # Log the result to the terminal
55
- def log(verbose=true)
91
+ def log(verbose = true)
56
92
  case status
57
- when STATUS_SUCCESS then
58
- SiteDiff::log path, :diff_success, 'SUCCESS'
59
- when STATUS_ERROR then
60
- SiteDiff::log path, :warn, "ERROR (#{error})"
61
- when STATUS_FAILURE then
62
- SiteDiff::log path, :diff_failure, "FAILURE"
63
- puts Diff::terminal_diffy(before, after) if verbose
93
+ when STATUS_SUCCESS
94
+ SiteDiff.log path, :success, 'UNCHANGED'
95
+ when STATUS_ERROR
96
+ SiteDiff.log path + " (#{error})", :warning, 'ERROR'
97
+ when STATUS_FAILURE
98
+ SiteDiff.log path, :error, 'CHANGED'
99
+ puts Diff.terminal_diffy(before, after) if verbose
64
100
  end
65
101
  end
66
102
 
67
103
  # Dump the result to a file
68
- def dump(dir)
104
+ def dump(dir, relative = false)
69
105
  dump_path = File.join(dir, filename)
70
106
  base = File.dirname(dump_path)
71
- FileUtils::mkdir_p(base) unless File.exists?(base)
107
+ FileUtils.mkdir_p(base) unless File.exist?(base)
72
108
  File.open(dump_path, 'w') do |f|
73
- f.write(Diff::generate_diff_output(self))
109
+ f.write(Diff.generate_diff_output(self, relative))
74
110
  end
75
111
  end
76
112
  end
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  require 'sitediff'
2
4
  require 'sitediff/exception'
3
5
  require 'sitediff/sanitize/dom_transform'
@@ -6,183 +8,253 @@ require 'nokogiri'
6
8
  require 'set'
7
9
 
8
10
  class SiteDiff
9
- class Sanitizer
10
- class InvalidSanitization < SiteDiffException; end
11
-
12
- TOOLS = {
13
- :array => %w[dom_transform sanitization],
14
- :scalar => %w[selector remove_spacing],
15
- }
16
- DOM_TRANSFORMS = Set.new(%w[remove unwrap_root unwrap remove_class])
17
-
18
- def initialize(html, config, opts = {})
19
- @html = html
20
- @config = config
21
- @opts = opts
22
- end
11
+ # SiteDiff Sanitizer.
12
+ class Sanitizer
13
+ class InvalidSanitization < SiteDiffException; end
14
+
15
+ TOOLS = {
16
+ array: %w[dom_transform sanitization],
17
+ scalar: %w[selector remove_spacing ignore_whitespace]
18
+ }.freeze
19
+ DOM_TRANSFORMS = Set.new(%w[remove strip unwrap_root unwrap remove_class])
20
+
21
##
# Creates a Sanitizer.
#
# html   - the markup to sanitize
# config - hash of sanitization rules, looked up by string key
# opts   - optional hash of per-run options (read via :path and :output)
def initialize(html, config, opts = {})
  @opts = opts
  @config = config
  @html = html
end
23
28
 
24
- def sanitize
25
- return '' if @html == '' # Quick return on empty input
29
+ ##
30
+ # Performs sanitization.
31
+ def sanitize
32
+ return '' if @html == '' # Quick return on empty input
26
33
 
27
- @node, @html = Sanitizer.domify(@html), nil
34
+ @node = Sanitizer.domify(@html)
35
+ @html = nil
28
36
 
29
- remove_spacing
30
- selector
31
- dom_transforms
32
- regexps
37
+ remove_spacing
38
+ regions || selector
39
+ dom_transforms
40
+ regexps
33
41
 
34
- return @html || Sanitizer.prettify(@node)
35
- end
42
+ @html || Sanitizer.prettify(@node)
43
+ end
36
44
 
37
- # Return whether or not we want to keep a rule
38
- def want_rule(rule)
39
- return false unless rule
40
- return false if rule['disabled']
45
+ # Return whether or not we want to keep a rule
46
+ def want_rule(rule)
47
+ return false unless rule
48
+ return false if rule['disabled']
41
49
 
42
- # Filter out if path regexp doesn't match
43
- if (pathre = rule['path']) and (path = @opts[:path])
44
- return ::Regexp.new(pathre).match(path)
45
- end
50
+ # Filter out if path regexp doesn't match
51
+ if (pathre = rule['path']) && (path = @opts[:path])
52
+ return ::Regexp.new(pathre).match(path)
53
+ end
46
54
 
47
- return true
48
- end
55
+ true
56
+ end
49
57
 
50
- # Canonicalize a simple rule, eg: 'remove_spacing' or 'selector'.
51
- # It may be a simple value, or a hash, or an array of hashes.
52
- # Turn it into an array of hashes.
53
- def canonicalize_rule(name)
54
- rules = @config[name] or return nil
55
-
56
- if rules[0] && rules[0].respond_to?(:[]) && rules[0]['value']
57
- # Already an array
58
- elsif rules['value']
59
- # Hash, put it in an array
60
- rules = [rules]
61
- else
62
- # Scalar, put it in a hash
63
- rules = [{ 'value' => rules }]
64
- end
58
# Canonicalize a simple rule, eg: 'remove_spacing' or 'selector'.
#
# The configured value may be a scalar, a hash, or an array of hashes.
# It is normalized to an array of hashes, filtered through want_rule, and
# the single surviving rule is returned.
#
# Returns nil when the rule is not configured or no rule applies.
# Raises RuntimeError when more than one rule applies.
def canonicalize_rule(name)
  rules = @config[name]
  return nil unless rules

  # Decide by type rather than by indexing the value: `true[0]` raises,
  # String#[] does a substring search, and a hash carrying
  # 'value' => false must still be treated as a hash.
  rules =
    if rules.is_a?(Array) && rules.all? { |r| r.is_a?(Hash) }
      # Already an array of hashes, use it as is.
      rules
    elsif rules.is_a?(Hash) && rules.key?('value')
      # A single rule hash, put it in an array.
      [rules]
    else
      # Anything else is the bare value, wrap it in a hash in an array.
      [{ 'value' => rules }]
    end

  want = rules.select { |r| want_rule(r) }
  return nil if want.empty?
  raise "Too many matching rules of type #{name}" if want.size > 1

  want.first
end
65
80
 
66
- want = rules.select { |r| want_rule(r) }
67
- return nil if want.empty?
68
- raise "Too many matching rules of type #{name}" if want.size > 1
69
- return want.first
70
- end
81
# Perform 'remove_spacing' action.
#
# Does nothing unless an applicable rule exists and its value is truthy.
def remove_spacing
  rule = canonicalize_rule('remove_spacing')
  return unless rule && rule['value']

  Sanitizer.remove_node_spacing(@node)
end
71
86
 
72
- # Perform 'remove_spacing' action
73
- def remove_spacing
74
- rule = canonicalize_rule('remove_spacing') or return
75
- Sanitizer.remove_node_spacing(@node) if rule['value']
76
- end
87
+ # Perform the 'regions' action; 'selector' is skipped when regions are applied.
88
+ def regions
89
+ return unless validate_regions
77
90
 
78
- # Perform 'selector' action, to choose a new root
79
- def selector
80
- rule = canonicalize_rule('selector') or return
81
- @node = Sanitizer.select_fragments(@node, rule['value'])
82
- end
91
+ @node = select_regions(@node, @config['regions'], @opts[:output])
92
+ end
83
93
 
84
- # Applies regexps. Also
85
- def regexps
86
- rules = @config['sanitization'] or return
87
- rules = rules.select { |r| want_rule(r) }
94
# Perform 'selector' action, to choose a new root.
#
# Leaves @node untouched when no applicable 'selector' rule exists.
def selector
  rule = canonicalize_rule('selector')
  return unless rule

  @node = Sanitizer.select_fragments(@node, rule['value'])
end
88
99
 
89
- rules.map! { |r| Regexp.create(r) }
90
- selector, global = rules.partition { |r| r.selector? }
100
# Applies the 'sanitization' regexp rules.
#
# Rules that report selector? are applied to the DOM node first. The node
# is then serialized into @html (and @node is cleared), and the remaining
# global rules are applied to that string.
#
# Returns nil when no 'sanitization' rules are configured.
def regexps
  rules = @config['sanitization']
  return unless rules

  rules = rules.select { |r| want_rule(r) }.map { |r| Regexp.create(r) }
  scoped, global = rules.partition(&:selector?)

  scoped.each { |r| r.apply(@node) }

  # Only drop byte sequences that are invalid in the string's encoding.
  # Re-encoding from 'binary' (as was done before) treats every byte above
  # 0x7F as undefined and silently deletes all non-ASCII text.
  @html = Sanitizer.prettify(@node).scrub('')
  @node = nil

  # NOTE(review): apply's return value is discarded, so global rules
  # presumably edit @html in place - confirm against the Regexp rule class.
  global.each { |r| r.apply(@html) }
end
91
123
 
92
- selector.each { |r| r.apply(@node) }
93
- @html, @node = Sanitizer.prettify(@node), nil
94
- global.each { |r| r.apply(@html) }
95
- end
124
+ # Perform DOM transforms
125
+ def dom_transforms
126
+ (rules = @config['dom_transform']) || return
127
+ rules = rules.select { |r| want_rule(r) }
96
128
 
97
- # Perform DOM transforms
98
- def dom_transforms
99
- rules = @config['dom_transform'] or return
100
- rules = rules.select { |r| want_rule(r) }
129
+ rules.each do |rule|
130
+ transform = DomTransform.create(rule)
131
+ transform.apply(@node)
132
+ end
133
+ end
101
134
 
102
- rules.each do |rule|
103
- transform = DomTransform.create(rule)
104
- transform.apply(@node)
105
- end
106
- end
135
+ ##### Implementations of actions #####
107
136
 
108
- ##### Implementations of actions #####
137
# Remove double-spacing inside text nodes.
#
# Runs of consecutive spaces are collapsed to a single space, but only in
# the text nodes matched by '//text()' (eg not in attributes).
def self.remove_node_spacing(node)
  node.xpath('//text()').each do |text_node|
    text_node.content = text_node.content.squeeze(' ')
  end
end
109
144
 
110
- # Remove double-spacing inside text nodes
111
- def self.remove_node_spacing(node)
112
- # remove double spacing, but only inside text nodes (eg not attributes)
113
- node.xpath('//text()').each do |el|
114
- el.content = el.content.gsub(/ +/, ' ')
115
- end
116
- end
145
# Restructure the node into regions.
#
# For each name in output, the elements of node matching that region's
# selector are moved under a new <region id="NAME"> element. The regions
# are returned, in output order, inside a fresh HTML fragment.
#
# node    - the parsed page
# regions - array of region hashes with 'name' and 'selector'
# output  - array of region names to emit
def select_regions(node, regions, output)
  wrappers = output.map do |name|
    selector = get_named_region(regions, name)['selector']

    # Set the id through the attribute API instead of concatenating it
    # into markup, so names containing quotes or '<' cannot break or
    # inject XML, and non-String names do not raise.
    wrapper = Nokogiri::XML.fragment('<region></region>').at_css('region')
    wrapper['id'] = name.to_s

    node.css(selector).each { |match| wrapper.add_child(match) }
    wrapper
  end

  Nokogiri::HTML.fragment('').tap do |root|
    wrappers.each { |wrapper| root.add_child(wrapper) }
  end
end
117
158
 
118
- # Get a fragment consisting of the elements matching the selector(s)
119
- def self.select_fragments(node, sel)
120
- # When we choose a new root, we always become a DocumentFragment,
121
- # and lose any DOCTYPE and such.
122
- ns = node.css(sel)
123
- unless node.fragment?
124
- node = Nokogiri::HTML.fragment('')
125
- end
126
- node.children = ns
127
- return node
128
- end
159
# Get a fragment consisting of the elements matching the selector(s).
#
# When we choose a new root, we always become a DocumentFragment,
# and lose any DOCTYPE and such: a node that is already a fragment is
# reused, anything else is swapped for an empty fragment.
def self.select_fragments(node, sel)
  matches = node.css(sel)
  root = node.fragment? ? node : Nokogiri::HTML.fragment('')
  root.children = matches
  root
end
129
168
 
130
- # Pretty-print some HTML
131
- def self.prettify(obj)
132
- @stylesheet ||= begin
133
- stylesheet_path = File.join(SiteDiff::FILES_DIR, 'pretty_print.xsl')
134
- Nokogiri::XSLT(File.read(stylesheet_path))
135
- end
169
# Pretty-print some HTML.
#
# Runs obj through the pretty_print.xsl stylesheet (loaded once and
# memoized), then strips the XML declaration, the wrapping <html> tags,
# the top-level indentation, blank lines and DOS newlines.
#
# Returns the cleaned-up markup as a valid UTF-8 string.
def self.prettify(obj)
  @stylesheet ||= begin
    stylesheet_path = File.join(SiteDiff::FILES_DIR, 'pretty_print.xsl')
    Nokogiri::XSLT(File.read(stylesheet_path))
  end

  # Pull out the html element's children
  # The obvious way to do this is to iterate over pretty.css('html'),
  # but that tends to segfault Nokogiri
  str = @stylesheet.apply_to(to_document(obj))

  # There's a lot of cruft left over that we don't want.

  # Prevent UTF-8 encoding errors by removing invalid bytes only.
  # Transcoding from 'binary' (as was done before) treats every byte above
  # 0x7F as undefined and deletes all non-ASCII text, hiding real content
  # from the diff. Binary-tagged output is assumed to hold UTF-8 bytes.
  str = str.dup.force_encoding(Encoding::UTF_8) if str.encoding == Encoding::BINARY
  str = str.encode(
    Encoding::UTF_8,
    invalid: :replace,
    undef: :replace,
    replace: ''
  ).scrub('')

  # Remove xml declaration and <html> tags
  str.sub!(/\A<\?xml.*$\n/, '')
  str.sub!(/\A^<html>$\n/, '')
  str.sub!(%r{</html>\n\Z}, '')

  # Remove top-level indentation
  indent = /\A(\s*)/.match(str)[1].size
  str.gsub!(/^\s{,#{indent}}/, '')

  # Remove blank lines
  str.gsub!(/^\s*$\n/, '')

  # Remove DOS newlines
  str.gsub!(/\x0D$/, '')
  str.gsub!(/&#13;$/, '')

  str
end
211
+
212
# Parse HTML into a node.
#
# The input is parsed as a full document when force_doc is set or when a
# DOCTYPE declaration appears within the first 512 characters; otherwise
# it is parsed as a fragment.
def self.domify(str, force_doc = false)
  # DOCTYPE is case-insensitive in HTML, so "<!doctype html>" must also
  # be recognized as a full document.
  if force_doc || /<!DOCTYPE/i.match(str[0, 512])
    Nokogiri::HTML(str)
  else
    Nokogiri::HTML.fragment(str)
  end
end
136
220
 
137
- # Pull out the html element's children
138
- # The obvious way to do this is to iterate over pretty.css('html'),
139
- # but that tends to segfault Nokogiri
140
- str = @stylesheet.apply_to(to_document(obj))
221
# Force this object to be a document, so we can apply a stylesheet.
#
# Documents are returned unchanged. Any other node or fragment is
# serialized and re-parsed as a full document. Everything else is parsed
# with domify and converted again.
def self.to_document(obj)
  # Match by class hierarchy (case uses ===) rather than exact class
  # equality, so Node subclasses such as elements are re-parsed instead
  # of being handed to domify as if they were strings. Document must be
  # tested first because it is itself a Node subclass.
  case obj
  when Nokogiri::XML::Document
    obj
  when Nokogiri::XML::Node
    # node or fragment
    domify(obj.to_s, true)
  else
    to_document(domify(obj, false))
  end
end
141
232
 
142
- # There's a lot of cruft left over,that we don't want
233
+ private
143
234
 
144
- # Remove xml declaration and <html> tags
145
- str.sub!(/\A<\?xml.*$\n/, '')
146
- str.sub!(/\A^<html>$\n/, '')
147
- str.sub!(%r[</html>\n\Z], '')
235
+ # Validate `regions` from config and `output` from the options.
236
+ def validate_regions
237
+ return false unless @config['regions'].is_a?(Array)
148
238
 
149
- # Remove top-level indentation
150
- indent = /\A(\s*)/.match(str)[1].size
151
- str.gsub!(/^\s{,#{indent}}/, '')
239
+ return false unless @opts[:output].is_a?(Array)
152
240
 
153
- # Remove blank lines
154
- str.gsub!(/^\s*$\n/, '')
241
+ regions = @config['regions']
242
+ output = @opts[:output]
243
+ regions.each do |region|
244
+ return false unless region.key?('name') && region.key?('selector')
245
+ end
155
246
 
156
- return str
157
- end
247
+ # Check that each named output has an associated region.
248
+ output.each do |name|
249
+ return false unless get_named_region(regions, name)
250
+ end
158
251
 
159
- # Parse HTML into a node
160
- def self.domify(str, force_doc = false)
161
- if force_doc || /<!DOCTYPE/.match(str[0, 512])
162
- return Nokogiri::HTML(str)
163
- else
164
- return Nokogiri::HTML.fragment(str)
165
- end
166
- end
252
+ true
253
+ end
167
254
 
168
- # Force this object to be a document, so we can apply a stylesheet
169
- def self.to_document(obj)
170
- if Nokogiri::XML::Document === obj
171
- return obj
172
- elsif Nokogiri::XML::Node === obj # node or fragment
173
- return domify(obj.to_s, true)
174
-
175
- # This ought to work, and would be faster,
176
- # but seems to segfault Nokogiri
177
- if false
178
- doc = Nokogiri::HTML('<html><body>')
179
- doc.at('body').children = obj.children
180
- return doc
181
- end
182
- else
183
- return to_document(domify(obj))
255
# Return the region definition whose 'name' equals the given name.
#
# The whole region hash is returned (callers read its 'selector'), or nil
# when no region has that name.
def get_named_region(regions, name)
  regions.find { |candidate| candidate['name'] == name }
end
184
259
  end
185
260
  end
186
-
187
- end
188
- end