sitediff 0.0.2 → 1.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/bin/sitediff +9 -3
- data/lib/sitediff.rb +153 -79
- data/lib/sitediff/api.rb +265 -0
- data/lib/sitediff/cache.rb +110 -47
- data/lib/sitediff/cli.rb +219 -165
- data/lib/sitediff/config.rb +439 -58
- data/lib/sitediff/config/creator.rb +93 -99
- data/lib/sitediff/config/preset.rb +75 -0
- data/lib/sitediff/crawler.rb +108 -72
- data/lib/sitediff/diff.rb +60 -12
- data/lib/sitediff/exception.rb +3 -1
- data/lib/sitediff/fetch.rb +62 -41
- data/lib/sitediff/files/diff.html.erb +20 -2
- data/lib/sitediff/files/jquery.min.js +2 -0
- data/lib/sitediff/files/normalize.css +349 -0
- data/lib/sitediff/files/report.html.erb +171 -0
- data/lib/sitediff/files/sidebyside.html.erb +5 -2
- data/lib/sitediff/files/sitediff.css +303 -30
- data/lib/sitediff/files/sitediff.js +367 -0
- data/lib/sitediff/report.rb +254 -0
- data/lib/sitediff/result.rb +59 -23
- data/lib/sitediff/sanitize.rb +222 -150
- data/lib/sitediff/sanitize/dom_transform.rb +111 -73
- data/lib/sitediff/sanitize/regexp.rb +69 -43
- data/lib/sitediff/uriwrapper.rb +104 -34
- data/lib/sitediff/webserver.rb +89 -77
- data/lib/sitediff/webserver/resultserver.rb +113 -77
- metadata +92 -76
- data/lib/sitediff/files/html_report.html.erb +0 -63
- data/lib/sitediff/files/rules/drupal.yaml +0 -33
- data/lib/sitediff/rules.rb +0 -65
data/lib/sitediff/result.rb
CHANGED
@@ -1,76 +1,112 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
require 'sitediff'
|
2
4
|
require 'sitediff/diff'
|
5
|
+
require 'sitediff/report'
|
3
6
|
require 'digest/sha1'
|
4
7
|
require 'fileutils'
|
5
8
|
|
6
9
|
class SiteDiff
|
7
|
-
|
10
|
+
# SiteDiff Result Object.
|
11
|
+
class Result < Struct.new(
|
12
|
+
:path,
|
13
|
+
:before,
|
14
|
+
:after,
|
15
|
+
:before_encoding,
|
16
|
+
:after_encoding,
|
17
|
+
:error,
|
18
|
+
:verbose
|
19
|
+
)
|
8
20
|
STATUS_SUCCESS = 0 # Identical before and after
|
9
21
|
STATUS_FAILURE = 1 # Different before and after
|
10
22
|
STATUS_ERROR = 2 # Couldn't fetch page
|
11
|
-
STATUS_TEXT = %w[
|
23
|
+
STATUS_TEXT = %w[unchanged changed error].freeze
|
12
24
|
|
13
25
|
attr_reader :status, :diff
|
14
26
|
|
27
|
+
##
|
28
|
+
# Creates a Result.
|
15
29
|
def initialize(*args)
|
16
30
|
super
|
17
31
|
if error
|
18
32
|
@status = STATUS_ERROR
|
19
33
|
else
|
20
|
-
|
34
|
+
if !before_encoding || !after_encoding
|
35
|
+
@diff = Diff.binary_diffy(
|
36
|
+
before,
|
37
|
+
after,
|
38
|
+
before_encoding,
|
39
|
+
after_encoding
|
40
|
+
)
|
41
|
+
else
|
42
|
+
@diff = Diff.html_diffy(before, after)
|
43
|
+
end
|
21
44
|
@status = @diff ? STATUS_FAILURE : STATUS_SUCCESS
|
22
45
|
end
|
23
46
|
end
|
24
47
|
|
48
|
+
##
|
49
|
+
# Whether the result has no diff.
|
50
|
+
#
|
51
|
+
# If there is a diff, it is not a success.
|
52
|
+
#
|
53
|
+
# TODO: Change "Success" to unchanged.
|
25
54
|
def success?
|
26
55
|
status == STATUS_SUCCESS
|
27
56
|
end
|
28
57
|
|
58
|
+
##
|
59
|
+
# Whether the result has an error.
|
60
|
+
def error?
|
61
|
+
status == STATUS_ERROR
|
62
|
+
end
|
63
|
+
|
29
64
|
# Textual representation of the status
|
30
65
|
def status_text
|
31
|
-
|
66
|
+
STATUS_TEXT[status]
|
32
67
|
end
|
33
68
|
|
34
69
|
# Printable URL
|
35
70
|
def url(tag, prefix, cache)
|
71
|
+
return unless prefix
|
72
|
+
|
36
73
|
base = cache.read_tags.include?(tag) ? "/cache/#{tag}" : prefix
|
37
74
|
base.to_s + path
|
38
75
|
end
|
39
76
|
|
40
77
|
# Filename to store diff
|
41
78
|
def filename
|
42
|
-
File.join(
|
79
|
+
File.join(Report::DIFFS_DIR, Digest::SHA1.hexdigest(path) + '.html')
|
43
80
|
end
|
44
81
|
|
45
|
-
#
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
end
|
82
|
+
# Returns a URL to the result diff.
|
83
|
+
#
|
84
|
+
# Returns nil if the result has no diffs.
|
85
|
+
def diff_url(relative = false)
|
86
|
+
prefix = relative ? 'files/' : '/files/'
|
87
|
+
return prefix + filename if status == STATUS_FAILURE
|
52
88
|
end
|
53
89
|
|
54
90
|
# Log the result to the terminal
|
55
|
-
def log(verbose=true)
|
91
|
+
def log(verbose = true)
|
56
92
|
case status
|
57
|
-
when STATUS_SUCCESS
|
58
|
-
SiteDiff
|
59
|
-
when STATUS_ERROR
|
60
|
-
SiteDiff
|
61
|
-
when STATUS_FAILURE
|
62
|
-
SiteDiff
|
63
|
-
puts Diff
|
93
|
+
when STATUS_SUCCESS
|
94
|
+
SiteDiff.log path, :success, 'UNCHANGED'
|
95
|
+
when STATUS_ERROR
|
96
|
+
SiteDiff.log path + " (#{error})", :warning, 'ERROR'
|
97
|
+
when STATUS_FAILURE
|
98
|
+
SiteDiff.log path, :error, 'CHANGED'
|
99
|
+
puts Diff.terminal_diffy(before, after) if verbose
|
64
100
|
end
|
65
101
|
end
|
66
102
|
|
67
103
|
# Dump the result to a file
|
68
|
-
def dump(dir)
|
104
|
+
def dump(dir, relative = false)
|
69
105
|
dump_path = File.join(dir, filename)
|
70
106
|
base = File.dirname(dump_path)
|
71
|
-
FileUtils
|
107
|
+
FileUtils.mkdir_p(base) unless File.exist?(base)
|
72
108
|
File.open(dump_path, 'w') do |f|
|
73
|
-
f.write(Diff
|
109
|
+
f.write(Diff.generate_diff_output(self, relative))
|
74
110
|
end
|
75
111
|
end
|
76
112
|
end
|
data/lib/sitediff/sanitize.rb
CHANGED
@@ -1,3 +1,5 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
require 'sitediff'
|
2
4
|
require 'sitediff/exception'
|
3
5
|
require 'sitediff/sanitize/dom_transform'
|
@@ -6,183 +8,253 @@ require 'nokogiri'
|
|
6
8
|
require 'set'
|
7
9
|
|
8
10
|
class SiteDiff
|
9
|
-
|
10
|
-
class
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
11
|
+
# SiteDiff Sanitizer.
|
12
|
+
class Sanitizer
|
13
|
+
class InvalidSanitization < SiteDiffException; end
|
14
|
+
|
15
|
+
TOOLS = {
|
16
|
+
array: %w[dom_transform sanitization],
|
17
|
+
scalar: %w[selector remove_spacing ignore_whitespace]
|
18
|
+
}.freeze
|
19
|
+
DOM_TRANSFORMS = Set.new(%w[remove strip unwrap_root unwrap remove_class])
|
20
|
+
|
21
|
+
##
|
22
|
+
# Creates a Sanitizer.
|
23
|
+
def initialize(html, config, opts = {})
|
24
|
+
@html = html
|
25
|
+
@config = config
|
26
|
+
@opts = opts
|
27
|
+
end
|
23
28
|
|
24
|
-
|
25
|
-
|
29
|
+
##
|
30
|
+
# Performs sanitization.
|
31
|
+
def sanitize
|
32
|
+
return '' if @html == '' # Quick return on empty input
|
26
33
|
|
27
|
-
|
34
|
+
@node = Sanitizer.domify(@html)
|
35
|
+
@html = nil
|
28
36
|
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
37
|
+
remove_spacing
|
38
|
+
regions || selector
|
39
|
+
dom_transforms
|
40
|
+
regexps
|
33
41
|
|
34
|
-
|
35
|
-
end
|
42
|
+
@html || Sanitizer.prettify(@node)
|
43
|
+
end
|
36
44
|
|
37
|
-
# Return whether or not we want to keep a rule
|
38
|
-
def want_rule(rule)
|
39
|
-
|
40
|
-
|
45
|
+
# Return whether or not we want to keep a rule
|
46
|
+
def want_rule(rule)
|
47
|
+
return false unless rule
|
48
|
+
return false if rule['disabled']
|
41
49
|
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
50
|
+
# Filter out if path regexp doesn't match
|
51
|
+
if (pathre = rule['path']) && (path = @opts[:path])
|
52
|
+
return ::Regexp.new(pathre).match(path)
|
53
|
+
end
|
46
54
|
|
47
|
-
|
48
|
-
end
|
55
|
+
true
|
56
|
+
end
|
49
57
|
|
50
|
-
# Canonicalize a simple rule, eg: 'remove_spacing' or 'selector'.
|
51
|
-
# It may be a simple value, or a hash, or an array of hashes.
|
52
|
-
# Turn it into an array of hashes.
|
53
|
-
def canonicalize_rule(name)
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
58
|
+
# Canonicalize a simple rule, eg: 'remove_spacing' or 'selector'.
|
59
|
+
# It may be a simple value, or a hash, or an array of hashes.
|
60
|
+
# Turn it into an array of hashes.
|
61
|
+
def canonicalize_rule(name)
|
62
|
+
(rules = @config[name]) || (return nil)
|
63
|
+
|
64
|
+
# Already an array? Do nothing.
|
65
|
+
if rules[0]&.respond_to?('each') && rules[0]&.fetch('value')
|
66
|
+
# If it is a hash, put it in an array.
|
67
|
+
elsif rules['value']
|
68
|
+
rules = [rules]
|
69
|
+
# If it is a scalar value, put it in an array.
|
70
|
+
else
|
71
|
+
rules = [{ 'value' => rules }]
|
72
|
+
end
|
73
|
+
|
74
|
+
want = rules.select { |r| want_rule(r) }
|
75
|
+
return nil if want.empty?
|
76
|
+
raise "Too many matching rules of type #{name}" if want.size > 1
|
77
|
+
|
78
|
+
want.first
|
79
|
+
end
|
65
80
|
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
end
|
81
|
+
# Perform 'remove_spacing' action
|
82
|
+
def remove_spacing
|
83
|
+
(rule = canonicalize_rule('remove_spacing')) || return
|
84
|
+
Sanitizer.remove_node_spacing(@node) if rule['value']
|
85
|
+
end
|
71
86
|
|
72
|
-
# Perform '
|
73
|
-
def
|
74
|
-
|
75
|
-
Sanitizer.remove_node_spacing(@node) if rule['value']
|
76
|
-
end
|
87
|
+
# Perform 'regions' action, don't perform 'selector' if regions exist.
|
88
|
+
def regions
|
89
|
+
return unless validate_regions
|
77
90
|
|
78
|
-
|
79
|
-
|
80
|
-
rule = canonicalize_rule('selector') or return
|
81
|
-
@node = Sanitizer.select_fragments(@node, rule['value'])
|
82
|
-
end
|
91
|
+
@node = select_regions(@node, @config['regions'], @opts[:output])
|
92
|
+
end
|
83
93
|
|
84
|
-
#
|
85
|
-
def
|
86
|
-
|
87
|
-
|
94
|
+
# Perform 'selector' action, to choose a new root
|
95
|
+
def selector
|
96
|
+
(rule = canonicalize_rule('selector')) || return
|
97
|
+
@node = Sanitizer.select_fragments(@node, rule['value'])
|
98
|
+
end
|
88
99
|
|
89
|
-
|
90
|
-
|
100
|
+
# Applies regexps. Selector-scoped rules run on the DOM first; global rules then run on the prettified HTML.
|
101
|
+
def regexps
|
102
|
+
(rules = @config['sanitization']) || return
|
103
|
+
rules = rules.select { |r| want_rule(r) }
|
104
|
+
|
105
|
+
rules.map! { |r| Regexp.create(r) }
|
106
|
+
selector, global = rules.partition(&:selector?)
|
107
|
+
|
108
|
+
selector.each { |r| r.apply(@node) }
|
109
|
+
@html = Sanitizer.prettify(@node)
|
110
|
+
@node = nil
|
111
|
+
# Prevent potential UTF-8 encoding errors by removing invalid bytes.
|
112
|
+
# Not the only solution. An alternative is to return the
|
113
|
+
# string unmodified.
|
114
|
+
@html = @html.encode(
|
115
|
+
'UTF-8',
|
116
|
+
'binary',
|
117
|
+
invalid: :replace,
|
118
|
+
undef: :replace,
|
119
|
+
replace: ''
|
120
|
+
)
|
121
|
+
global.each { |r| r.apply(@html) }
|
122
|
+
end
|
91
123
|
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
124
|
+
# Perform DOM transforms
|
125
|
+
def dom_transforms
|
126
|
+
(rules = @config['dom_transform']) || return
|
127
|
+
rules = rules.select { |r| want_rule(r) }
|
96
128
|
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
|
129
|
+
rules.each do |rule|
|
130
|
+
transform = DomTransform.create(rule)
|
131
|
+
transform.apply(@node)
|
132
|
+
end
|
133
|
+
end
|
101
134
|
|
102
|
-
|
103
|
-
transform = DomTransform.create(rule)
|
104
|
-
transform.apply(@node)
|
105
|
-
end
|
106
|
-
end
|
135
|
+
##### Implementations of actions #####
|
107
136
|
|
108
|
-
|
137
|
+
# Remove double-spacing inside text nodes
|
138
|
+
def self.remove_node_spacing(node)
|
139
|
+
# remove double spacing, but only inside text nodes (eg not attributes)
|
140
|
+
node.xpath('//text()').each do |el|
|
141
|
+
el.content = el.content.gsub(/ +/, ' ')
|
142
|
+
end
|
143
|
+
end
|
109
144
|
|
110
|
-
#
|
111
|
-
def
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
145
|
+
# Restructure the node into regions.
|
146
|
+
def select_regions(node, regions, output)
|
147
|
+
regions = output.map do |name|
|
148
|
+
selector = get_named_region(regions, name)['selector']
|
149
|
+
region = Nokogiri::XML.fragment('<region id="' + name + '"></region>').at_css('region')
|
150
|
+
matching = node.css(selector)
|
151
|
+
matching.each { |m| region.add_child m }
|
152
|
+
region
|
153
|
+
end
|
154
|
+
node = Nokogiri::HTML.fragment('')
|
155
|
+
regions.each { |r| node.add_child r }
|
156
|
+
node
|
157
|
+
end
|
117
158
|
|
118
|
-
# Get a fragment consisting of the elements matching the selector(s)
|
119
|
-
def self.select_fragments(node, sel)
|
120
|
-
|
121
|
-
|
122
|
-
|
123
|
-
|
124
|
-
|
125
|
-
|
126
|
-
|
127
|
-
return node
|
128
|
-
end
|
159
|
+
# Get a fragment consisting of the elements matching the selector(s)
|
160
|
+
def self.select_fragments(node, sel)
|
161
|
+
# When we choose a new root, we always become a DocumentFragment,
|
162
|
+
# and lose any DOCTYPE and such.
|
163
|
+
ns = node.css(sel)
|
164
|
+
node = Nokogiri::HTML.fragment('') unless node.fragment?
|
165
|
+
node.children = ns
|
166
|
+
node
|
167
|
+
end
|
129
168
|
|
130
|
-
# Pretty-print some HTML
|
131
|
-
def self.prettify(obj)
|
132
|
-
|
133
|
-
|
134
|
-
|
135
|
-
|
169
|
+
# Pretty-print some HTML
|
170
|
+
def self.prettify(obj)
|
171
|
+
@stylesheet ||= begin
|
172
|
+
stylesheet_path = File.join(SiteDiff::FILES_DIR, 'pretty_print.xsl')
|
173
|
+
Nokogiri::XSLT(File.read(stylesheet_path))
|
174
|
+
end
|
175
|
+
|
176
|
+
# Pull out the html element's children
|
177
|
+
# The obvious way to do this is to iterate over pretty.css('html'),
|
178
|
+
# but that tends to segfault Nokogiri
|
179
|
+
str = @stylesheet.apply_to(to_document(obj))
|
180
|
+
|
181
|
+
# There's a lot of cruft left over that we don't want
|
182
|
+
|
183
|
+
# Prevent potential UTF-8 encoding errors by removing invalid bytes.
|
184
|
+
# Not the only solution.
|
185
|
+
# An alternative is to return the string unmodified.
|
186
|
+
str = str.encode(
|
187
|
+
'UTF-8',
|
188
|
+
'binary',
|
189
|
+
invalid: :replace,
|
190
|
+
undef: :replace,
|
191
|
+
replace: ''
|
192
|
+
)
|
193
|
+
# Remove xml declaration and <html> tags
|
194
|
+
str.sub!(/\A<\?xml.*$\n/, '')
|
195
|
+
str.sub!(/\A^<html>$\n/, '')
|
196
|
+
str.sub!(%r{</html>\n\Z}, '')
|
197
|
+
|
198
|
+
# Remove top-level indentation
|
199
|
+
indent = /\A(\s*)/.match(str)[1].size
|
200
|
+
str.gsub!(/^\s{,#{indent}}/, '')
|
201
|
+
|
202
|
+
# Remove blank lines
|
203
|
+
str.gsub!(/^\s*$\n/, '')
|
204
|
+
|
205
|
+
# Remove DOS newlines
|
206
|
+
str.gsub!(/\x0D$/, '')
|
207
|
+
str.gsub!(/ $/, '')
|
208
|
+
|
209
|
+
str
|
210
|
+
end
|
211
|
+
|
212
|
+
# Parse HTML into a node
|
213
|
+
def self.domify(str, force_doc = false)
|
214
|
+
if force_doc || /<!DOCTYPE/.match(str[0, 512])
|
215
|
+
Nokogiri::HTML(str)
|
216
|
+
else
|
217
|
+
Nokogiri::HTML.fragment(str)
|
218
|
+
end
|
219
|
+
end
|
136
220
|
|
137
|
-
|
138
|
-
|
139
|
-
|
140
|
-
|
221
|
+
# Force this object to be a document, so we can apply a stylesheet
|
222
|
+
def self.to_document(obj)
|
223
|
+
if Nokogiri::XML::Document == obj.class || Nokogiri::HTML::Document == obj.class
|
224
|
+
obj
|
225
|
+
# node or fragment
|
226
|
+
elsif Nokogiri::XML::Node == obj.class || Nokogiri::HTML::DocumentFragment == obj.class
|
227
|
+
domify(obj.to_s, true)
|
228
|
+
else
|
229
|
+
to_document(domify(obj, false))
|
230
|
+
end
|
231
|
+
end
|
141
232
|
|
142
|
-
|
233
|
+
private
|
143
234
|
|
144
|
-
|
145
|
-
|
146
|
-
|
147
|
-
str.sub!(%r[</html>\n\Z], '')
|
235
|
+
# Validate `regions` and `output` from config.
|
236
|
+
def validate_regions
|
237
|
+
return false unless @config['regions'].is_a?(Array)
|
148
238
|
|
149
|
-
|
150
|
-
indent = /\A(\s*)/.match(str)[1].size
|
151
|
-
str.gsub!(/^\s{,#{indent}}/, '')
|
239
|
+
return false unless @opts[:output].is_a?(Array)
|
152
240
|
|
153
|
-
|
154
|
-
|
241
|
+
regions = @config['regions']
|
242
|
+
output = @opts[:output]
|
243
|
+
regions.each do |region|
|
244
|
+
return false unless region.key?('name') && region.key?('selector')
|
245
|
+
end
|
155
246
|
|
156
|
-
|
157
|
-
|
247
|
+
# Check that each named output has an associated region.
|
248
|
+
output.each do |name|
|
249
|
+
return false unless get_named_region(regions, name)
|
250
|
+
end
|
158
251
|
|
159
|
-
|
160
|
-
|
161
|
-
if force_doc || /<!DOCTYPE/.match(str[0, 512])
|
162
|
-
return Nokogiri::HTML(str)
|
163
|
-
else
|
164
|
-
return Nokogiri::HTML.fragment(str)
|
165
|
-
end
|
166
|
-
end
|
252
|
+
true
|
253
|
+
end
|
167
254
|
|
168
|
-
#
|
169
|
-
def
|
170
|
-
|
171
|
-
|
172
|
-
elsif Nokogiri::XML::Node === obj # node or fragment
|
173
|
-
return domify(obj.to_s, true)
|
174
|
-
|
175
|
-
# This ought to work, and would be faster,
|
176
|
-
# but seems to segfault Nokogiri
|
177
|
-
if false
|
178
|
-
doc = Nokogiri::HTML('<html><body>')
|
179
|
-
doc.at('body').children = obj.children
|
180
|
-
return doc
|
181
|
-
end
|
182
|
-
else
|
183
|
-
return to_document(domify(obj))
|
255
|
+
# Return the region whose 'name' matches, or nil if there is none.
|
256
|
+
def get_named_region(regions, name)
|
257
|
+
regions.find { |region| region['name'] == name }
|
258
|
+
end
|
184
259
|
end
|
185
260
|
end
|
186
|
-
|
187
|
-
end
|
188
|
-
end
|