sitediff 0.0.1 → 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +5 -5
- data/bin/sitediff +10 -4
- data/lib/sitediff.rb +179 -91
- data/lib/sitediff/cache.rb +106 -0
- data/lib/sitediff/cli.rb +391 -60
- data/lib/sitediff/config.rb +383 -37
- data/lib/sitediff/config/creator.rb +114 -0
- data/lib/sitediff/config/preset.rb +75 -0
- data/lib/sitediff/crawler.rb +131 -0
- data/lib/sitediff/diff.rb +57 -12
- data/lib/sitediff/exception.rb +5 -0
- data/lib/sitediff/fetch.rb +76 -0
- data/lib/sitediff/files/diff.html.erb +20 -2
- data/lib/sitediff/files/jquery.min.js +2 -0
- data/lib/sitediff/files/normalize.css +349 -0
- data/lib/sitediff/files/report.html.erb +144 -0
- data/lib/sitediff/files/sidebyside.html.erb +16 -0
- data/lib/sitediff/files/sitediff.css +236 -29
- data/lib/sitediff/files/sitediff.js +176 -0
- data/lib/sitediff/report.rb +238 -0
- data/lib/sitediff/result.rb +63 -26
- data/lib/sitediff/sanitize.rb +160 -141
- data/lib/sitediff/sanitize/dom_transform.rb +130 -0
- data/lib/sitediff/sanitize/regexp.rb +82 -0
- data/lib/sitediff/uriwrapper.rb +114 -35
- data/lib/sitediff/webserver.rb +94 -0
- data/lib/sitediff/webserver/resultserver.rb +134 -0
- metadata +103 -43
- data/lib/sitediff/files/html_report.html.erb +0 -47
- data/lib/sitediff/util/cache.rb +0 -32
- data/lib/sitediff/util/webserver.rb +0 -77
@@ -0,0 +1,238 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'fileutils'
|
4
|
+
require 'json'
|
5
|
+
require 'minitar'
|
6
|
+
require 'sitediff'
|
7
|
+
require 'sitediff/config'
|
8
|
+
require 'zlib'
|
9
|
+
|
10
|
+
class SiteDiff
|
11
|
+
##
|
12
|
+
# SiteDiff Report Helper.
|
13
|
+
class Report
|
14
|
+
attr_reader :results, :cache
|
15
|
+
|
16
|
+
##
|
17
|
+
# Directory where diffs will be generated.
|
18
|
+
DIFFS_DIR = 'diffs'
|
19
|
+
|
20
|
+
##
|
21
|
+
# Name of file containing a list of pages with diffs.
|
22
|
+
FAILURES_FILE = 'failures.txt'
|
23
|
+
|
24
|
+
##
|
25
|
+
# Name of file containing HTML report of diffs.
|
26
|
+
REPORT_FILE_HTML = 'report.html'
|
27
|
+
|
28
|
+
##
|
29
|
+
# Name of file containing JSON report of diffs.
|
30
|
+
REPORT_FILE_JSON = 'report.json'
|
31
|
+
|
32
|
+
##
|
33
|
+
# Name of file containing exported file archive.
|
34
|
+
REPORT_FILE_TAR = 'report.tgz'
|
35
|
+
|
36
|
+
##
|
37
|
+
# Name of directory in which to build the portable report.
|
38
|
+
REPORT_BUILD_DIR = '_tmp_report'
|
39
|
+
|
40
|
+
##
|
41
|
+
# Name of the portable report directory.
|
42
|
+
REPORT_DIR = 'report'
|
43
|
+
|
44
|
+
##
|
45
|
+
# Path to settings used for report.
|
46
|
+
SETTINGS_FILE = 'settings.yaml'
|
47
|
+
|
48
|
+
##
# Builds a report helper for one finished comparison run.
#
# @param [Config] config
#   Configuration; read later for +before_url+, +after_url+ and +export+.
# @param [Cache] cache
#   Cache object, handed through to the HTML renderer.
# @param [Array] results
#   Per-path results that the report methods iterate over.
def initialize(config, cache, results)
  @config, @cache, @results = config, cache, results
end
|
59
|
+
|
60
|
+
##
# Generates an HTML report.
#
# Writes the per-page diff files and the failures list, renders the
# report through Diff.generate_html, stores it as REPORT_FILE_HTML and
# records the labels that were used in the settings file. When the
# config requests an export the output is packaged into an archive,
# otherwise the location of the report is logged.
#
# @param [String] dir
#   The directory in which the report is to be generated.
# @param report_before
#   Label/URL for the "before" side; falls back to @config.before_url.
# @param report_after
#   Label/URL for the "after" side; falls back to @config.after_url.
def generate_html(dir, report_before = nil, report_after = nil)
  report_before = @config.before_url unless report_before
  report_after = @config.after_url unless report_after

  # ensure_dir's result is used as a Pathname from here on
  # (write_diffs and friends insist on one).
  dir = SiteDiff.ensure_dir(dir)
  write_diffs(dir)
  write_failures(dir)

  html = Diff.generate_html(
    @results, report_before, report_after, @cache, @config.export
  )

  # Replace any report left over from an earlier run.
  target = dir + REPORT_FILE_HTML
  target.unlink if target.file?
  target.open('w') { |io| io.write(html) }

  write_settings(dir, report_before, report_after)

  return package_report(dir) if @config.export

  SiteDiff.log "Report generated to #{target.expand_path}"
end
|
100
|
+
|
101
|
+
##
# Generates a JSON report.
#
# The document holds the number of compared paths, the number of paths
# whose result is not a success, and one entry per path with its
# path, status and error message.
#
# @param dir
#   The directory in which the report is to be generated.
def generate_json(dir)
  dir = SiteDiff.ensure_dir(dir)
  write_diffs(dir)
  write_failures(dir)

  # One entry per compared path, keyed by the path itself.
  entries = @results.each_with_object({}) do |result, acc|
    acc[result.path] = {
      path: result.path,
      status: result.status,
      message: result.error
    }
  end
  summary = {
    paths_compared: @results.length,
    paths_diffs: @results.count { |result| !result.success? },
    paths: entries
  }
  payload = JSON.generate(summary)

  # Replace any report left over from an earlier run.
  target = dir + REPORT_FILE_JSON
  target.unlink if target.file?
  target.open('w') { |io| io.write(payload) }

  write_settings(dir)

  SiteDiff.log "Report generated to #{target.expand_path}"
end
|
138
|
+
|
139
|
+
##
# Package report for export.
#
# Moves the HTML report and the diffs directory into a temporary
# build tree, packs that tree into REPORT_FILE_TAR, places the archive
# in +dir+ and removes the build tree again.
#
# @param [Pathname] dir
#   Directory that already contains the generated HTML report.
def package_report(dir)
  # Create temporary report directories; start from a clean build tree.
  build_root = dir + REPORT_BUILD_DIR
  build_root.rmtree if build_root.directory?
  bundle = build_root + REPORT_DIR
  assets = bundle + 'files'
  # mkpath creates the missing parents (build_root, bundle) as well.
  assets.mkpath
  diffs = dir + DIFFS_DIR

  # Move files into place.
  FileUtils.move(dir + REPORT_FILE_HTML, bundle)
  FileUtils.move(diffs, assets) if diffs.directory?

  # Make the tar file. Packing from inside the build tree keeps the
  # archived paths relative to REPORT_DIR.
  # NOTE(review): the gzip stream is never closed explicitly here;
  # presumably Minitar.pack finalises it - confirm.
  Dir.chdir(build_root) do
    Minitar.pack(
      REPORT_DIR,
      Zlib::GzipWriter.new(File.open(REPORT_FILE_TAR, 'wb'))
    )
  end
  FileUtils.move(build_root + REPORT_FILE_TAR, dir)
  build_root.rmtree
  SiteDiff.log "Archived report generated to #{dir.join(REPORT_FILE_TAR)}"
end
|
167
|
+
|
168
|
+
##
# Creates diff files in a directory named "diffs".
#
# If "dir" is /foo/bar, then diffs will be placed in /foo/bar/diffs.
# An existing "diffs" directory is removed first, so the output only
# ever reflects the current run. Only results whose status is
# Result::STATUS_FAILURE are dumped.
#
# @param [Pathname] dir
#   The directory in which a "diffs" directory is to be generated.
# @raise [ArgumentError]
#   When +dir+ is not a Pathname.
def write_diffs(dir)
  # `raise Exception 'msg'` (no comma) is parsed as a call to an
  # undefined method Exception(), which surfaced as NoMethodError.
  raise ArgumentError, 'dir must be a Pathname' unless dir.is_a? Pathname

  # Delete existing "diffs" dir, if exists.
  diff_dir = dir + DIFFS_DIR
  diff_dir.rmtree if diff_dir.exist?

  # Write diffs to the diff directory. The export flag is forwarded as
  # the second argument of Result#dump.
  @results.each do |result|
    result.dump(dir, @config.export) if result.status == Result::STATUS_FAILURE
  end
  SiteDiff.log "All diff files written to #{diff_dir.expand_path}" unless @config.export
end
|
186
|
+
|
187
|
+
##
# Writes paths with diffs into a file.
#
# One path per line is written to FAILURES_FILE inside +dir+ for every
# result whose +success?+ is false.
#
# @param [Pathname] dir
#   The directory in which the report is to be generated.
# @raise [ArgumentError]
#   When +dir+ is not a Pathname.
def write_failures(dir)
  # `raise Exception 'msg'` (no comma) is parsed as a call to an
  # undefined method Exception(), which surfaced as NoMethodError.
  raise ArgumentError, 'dir must be a Pathname' unless dir.is_a? Pathname

  failures = dir + FAILURES_FILE
  SiteDiff.log "All failures written to #{failures.expand_path}"
  failures.open('w') do |f|
    @results.each { |r| f.puts r.path unless r.success? }
  end
end
|
201
|
+
|
202
|
+
##
# Creates report settings.yaml file.
#
# The file records the before/after labels used for the report and
# marks both sides as cached.
#
# TODO: Find a way to avoid having to create this file.
#
# @param [Pathname] dir
#   The directory in which the report is to be generated.
# @param report_before
#   Value stored under the 'before' key (nil when not given).
# @param report_after
#   Value stored under the 'after' key (nil when not given).
# @raise [ArgumentError]
#   When +dir+ is not a Pathname.
def write_settings(dir, report_before = nil, report_after = nil)
  # `raise Exception 'msg'` (no comma) is parsed as a call to an
  # undefined method Exception(), which surfaced as NoMethodError.
  raise ArgumentError, 'dir must be a Pathname' unless dir.is_a? Pathname

  settings = {
    'before' => report_before,
    'after' => report_after,
    'cached' => %w[before after]
  }
  (dir + SETTINGS_FILE).open('w') { |f| YAML.dump(settings, f) }
end
|
219
|
+
|
220
|
+
##
# Returns CSS for HTML report.
#
# @return [String]
#   Contents of normalize.css followed by sitediff.css, both read from
#   SiteDiff::FILES_DIR.
def self.css
  %w[normalize.css sitediff.css]
    .map { |name| File.read(File.join(SiteDiff::FILES_DIR, name)) }
    .join
end
|
228
|
+
|
229
|
+
##
# Returns JS for HTML report.
#
# @return [String]
#   Contents of jquery.min.js followed by sitediff.js, both read from
#   SiteDiff::FILES_DIR.
def self.js
  %w[jquery.min.js sitediff.js]
    .map { |name| File.read(File.join(SiteDiff::FILES_DIR, name)) }
    .join
end
|
237
|
+
end
|
238
|
+
end
|
data/lib/sitediff/result.rb
CHANGED
@@ -1,73 +1,110 @@
|
|
1
|
-
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'sitediff'
|
4
|
+
require 'sitediff/diff'
|
5
|
+
require 'sitediff/report'
|
2
6
|
require 'digest/sha1'
|
7
|
+
require 'fileutils'
|
3
8
|
|
4
9
|
class SiteDiff
|
5
|
-
|
10
|
+
# SiteDiff Result Object.
|
11
|
+
class Result < Struct.new(
|
12
|
+
:path,
|
13
|
+
:before,
|
14
|
+
:after,
|
15
|
+
:before_encoding,
|
16
|
+
:after_encoding,
|
17
|
+
:error,
|
18
|
+
:verbose
|
19
|
+
)
|
6
20
|
STATUS_SUCCESS = 0 # Identical before and after
|
7
21
|
STATUS_FAILURE = 1 # Different before and after
|
8
22
|
STATUS_ERROR = 2 # Couldn't fetch page
|
9
|
-
STATUS_TEXT = %w[
|
23
|
+
STATUS_TEXT = %w[unchanged changed error].freeze
|
10
24
|
|
11
25
|
attr_reader :status, :diff
|
12
26
|
|
27
|
+
##
# Creates a Result.
#
# Accepts the struct members positionally. When an error is present
# the status becomes STATUS_ERROR and no diff is computed. Otherwise
# the diff is computed (binary diff when either encoding is missing,
# HTML diff when both are present) and the status is derived from
# whether a diff was produced.
def initialize(*args)
  super
  if error
    @status = STATUS_ERROR
    return
  end

  @diff =
    if before_encoding && after_encoding
      Diff.html_diffy(before, after)
    else
      # A missing encoding on either side routes to the binary diff.
      Diff.binary_diffy(before, after, before_encoding, after_encoding)
    end
  @status = @diff ? STATUS_FAILURE : STATUS_SUCCESS
end
|
22
47
|
|
48
|
+
##
# Whether the result has no diff.
#
# True only when the status equals STATUS_SUCCESS; a result with a diff
# or with an error is not a success.
#
# TODO: Change "Success" to unchanged.
#
# @return [Boolean]
def success?
  status == STATUS_SUCCESS
end
|
26
57
|
|
58
|
+
##
# Whether the result has an error.
#
# @return [Boolean]
#   True only when the status equals STATUS_ERROR.
def error?
  status == STATUS_ERROR
end
|
63
|
+
|
27
64
|
# Textual representation of the status.
#
# @return [String]
#   The STATUS_TEXT entry at the index given by the numeric status.
def status_text
  STATUS_TEXT[status]
end
|
31
68
|
|
32
69
|
# Printable URL.
#
# @param tag
#   Side identifier that is looked up in +cache.read_tags+.
# @param prefix
#   Base used when +tag+ is not among the cache's read tags.
# @param cache
#   Object responding to +read_tags+.
# @return [String]
#   "/cache/<tag>" or the stringified prefix, followed by the path.
def url(tag, prefix, cache)
  served_from_cache = cache.read_tags.include?(tag)
  root = served_from_cache ? "/cache/#{tag}" : prefix
  root.to_s + path
end
|
36
74
|
|
37
75
|
# Filename to store diff.
#
# @return [String]
#   Path inside Report::DIFFS_DIR; the file name is the SHA1 hex digest
#   of the result path with an ".html" extension.
def filename
  digest = Digest::SHA1.hexdigest(path)
  File.join(Report::DIFFS_DIR, digest + '.html')
end
|
41
79
|
|
42
|
-
#
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
end
|
80
|
+
# Returns a URL to the result diff.
#
# Returns nil if the result has no diffs.
#
# @param [Boolean] relative
#   When true the URL starts with "files/", otherwise with "/files/".
# @return [String, nil]
def diff_url(relative = false)
  return unless status == STATUS_FAILURE

  (relative ? 'files/' : '/files/') + filename
end
|
50
87
|
|
51
88
|
# Log the result to the terminal.
#
# Unchanged results are logged as :success, errors as :warning (with
# the error message appended to the path) and changed results as
# :error.
#
# @param [Boolean] verbose
#   When true, a changed result also prints its terminal diff.
def log(verbose = true)
  if status == STATUS_SUCCESS
    SiteDiff.log path, :success, 'UNCHANGED'
  elsif status == STATUS_ERROR
    SiteDiff.log path + " (#{error})", :warning, 'ERROR'
  elsif status == STATUS_FAILURE
    SiteDiff.log path, :error, 'CHANGED'
    puts Diff.terminal_diffy(before, after) if verbose
  end
end
|
63
100
|
|
64
101
|
# Dump the result to a file.
#
# The target is +filename+ joined onto +dir+; its parent directory is
# created when it does not exist yet.
#
# @param dir
#   Base directory of the report.
# @param [Boolean] relative
#   Forwarded to Diff.generate_diff_output.
def dump(dir, relative = false)
  target = File.join(dir, filename)
  parent = File.dirname(target)
  FileUtils.mkdir_p(parent) unless File.exist?(parent)
  File.open(target, 'w') do |io|
    io.write(Diff.generate_diff_output(self, relative))
  end
end
|
73
110
|
end
|
data/lib/sitediff/sanitize.rb
CHANGED
@@ -1,104 +1,152 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'sitediff'
|
4
|
+
require 'sitediff/exception'
|
5
|
+
require 'sitediff/sanitize/dom_transform'
|
6
|
+
require 'sitediff/sanitize/regexp'
|
1
7
|
require 'nokogiri'
|
2
8
|
require 'set'
|
3
9
|
|
4
10
|
class SiteDiff
|
5
|
-
|
6
|
-
|
11
|
+
# SiteDiff Sanitizer.
|
12
|
+
class Sanitizer
|
13
|
+
class InvalidSanitization < SiteDiffException; end
|
7
14
|
|
8
15
|
TOOLS = {
|
9
|
-
:
|
10
|
-
:
|
11
|
-
}
|
12
|
-
DOM_TRANSFORMS = Set.new(%w[remove unwrap_root unwrap remove_class])
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
# * { :type => "unwrap_root" }
|
21
|
-
# * { :type => "unwrap", :selector => "div.field-item" }
|
22
|
-
# * { :type => "remove", :selector => "div.extra-stuff" }
|
23
|
-
#
|
24
|
-
# @arg node - Nokogiri document or Node
|
25
|
-
# @arg rules - array of dom_transform rules
|
26
|
-
# @return - transformed Nokogiri document node
|
27
|
-
def perform_dom_transforms(node, rules)
|
28
|
-
rules.each do |rule|
|
29
|
-
type = rule['type'] or
|
30
|
-
raise InvalidSanitization, "DOM transform needs a type"
|
31
|
-
DOM_TRANSFORMS.include?(type) or
|
32
|
-
raise InvalidSanitization, "No DOM transform named #{type}"
|
33
|
-
|
34
|
-
meth = 'transform_' + type
|
35
|
-
|
36
|
-
if sels = rule['selector']
|
37
|
-
sels = [sels].flatten # Either array or scalar is fine
|
38
|
-
# Call method for each node the selectors find
|
39
|
-
sels.each do |sel|
|
40
|
-
node.css(sel).each { |e| send(meth, rule, e) }
|
41
|
-
end
|
42
|
-
else
|
43
|
-
send(meth, rule, node)
|
44
|
-
end
|
45
|
-
end
|
16
|
+
array: %w[dom_transform sanitization],
|
17
|
+
scalar: %w[selector remove_spacing ignore_whitespace]
|
18
|
+
}.freeze
|
19
|
+
DOM_TRANSFORMS = Set.new(%w[remove strip unwrap_root unwrap remove_class])
|
20
|
+
|
21
|
+
##
# Creates a Sanitizer.
#
# @param html
#   Markup to sanitize.
# @param config
#   Rule configuration, looked up by rule name (e.g. 'selector').
# @param [Hash] opts
#   Extra options; +:path+ is consulted when filtering rules by path.
def initialize(html, config, opts = {})
  @html, @config, @opts = html, config, opts
end
|
47
28
|
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
29
|
+
##
# Performs sanitization.
#
# Parses the HTML into a node, runs the sanitization steps in a fixed
# order and returns the resulting markup as a string.
#
# @return [String]
def sanitize
  return '' if @html == '' # Quick return on empty input

  @node = Sanitizer.domify(@html)
  @html = nil

  # Order matters: DOM-level steps first, regexps last.
  %i[remove_spacing selector dom_transforms regexps].each { |step| send(step) }

  # The regexps step may already have rendered the node into @html;
  # when it did not run, render the node here.
  @html || Sanitizer.prettify(@node)
end
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
44
|
+
|
45
|
+
# Return whether or not we want to keep a rule.
#
# A missing or disabled rule is rejected. A rule carrying a 'path'
# regexp is kept only when that regexp matches the :path option; the
# path filter is skipped when either side is absent.
#
# @param [Hash, nil] rule
# @return
#   false, true, or the result of Regexp#match (MatchData or nil) when
#   the path filter is evaluated.
def want_rule(rule)
  return false if !rule || rule['disabled']

  pattern = rule['path']
  path = pattern && @opts[:path]
  # Filter out if path regexp doesn't match. The leading :: pins the
  # core Regexp class.
  return ::Regexp.new(pattern).match(path) if pattern && path

  true
end
|
67
57
|
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
58
|
+
# Canonicalize a simple rule, eg: 'remove_spacing' or 'selector'.
# It may be a simple value, or a hash, or an array of hashes.
# Turn it into an array of hashes, keep the rules accepted by
# want_rule, and return the single remaining one.
#
# The shape is detected by type instead of by indexing into the value:
# indexing raises for scalars without [] (true/false), and a string
# that merely contains the text "value" would be mistaken for a hash.
# A hash is recognised by the presence of the 'value' key, so an
# explicit 'value' => false is honoured rather than wrapped again.
#
# @param [String] name
#   Key of the rule in the config.
# @return [Hash, nil]
#   The one applicable rule, or nil when nothing is configured or no
#   rule is wanted.
# @raise [RuntimeError]
#   When more than one rule of this type applies.
def canonicalize_rule(name)
  rules = @config[name]
  return nil unless rules

  candidates =
    if rules.is_a?(Array)
      # Already an array (expected to hold rule hashes): use as is.
      rules
    elsif rules.is_a?(Hash) && rules.key?('value')
      # A single rule hash: put it in an array.
      [rules]
    else
      # A scalar value: wrap it in a rule hash.
      [{ 'value' => rules }]
    end

  want = candidates.select { |r| want_rule(r) }
  return nil if want.empty?
  raise "Too many matching rules of type #{name}" if want.size > 1

  want.first
end
|
80
|
+
|
81
|
+
# Perform 'remove_spacing' action.
#
# Does nothing unless a 'remove_spacing' rule applies and its value is
# truthy; otherwise collapses spacing inside the current node.
def remove_spacing
  rule = canonicalize_rule('remove_spacing')
  return unless rule

  Sanitizer.remove_node_spacing(@node) if rule['value']
end
|
86
|
+
|
87
|
+
# Perform 'selector' action, to choose a new root.
#
# When a 'selector' rule applies, the current node is replaced by the
# fragment built from the elements matching the rule's value.
def selector
  rule = canonicalize_rule('selector')
  return unless rule

  @node = Sanitizer.select_fragments(@node, rule['value'])
end
|
92
|
+
|
93
|
+
# Applies regexps. Also renders the DOM node into the HTML string.
#
# Rules that carry a selector are applied to the node first. The node
# is then rendered to a string (and dropped), and the remaining global
# rules are applied to that string.
def regexps
  configured = @config['sanitization']
  return unless configured

  compiled = configured.select { |r| want_rule(r) }
  compiled.map! { |r| Regexp.create(r) }
  scoped, global = compiled.partition(&:selector?)

  scoped.each { |re| re.apply(@node) }
  @html = Sanitizer.prettify(@node)
  @node = nil
  # Prevent potential UTF-8 encoding errors by removing bytes.
  # Not the only solution. An alternative is to return the
  # string unmodified.
  # NOTE(review): because the source encoding is declared as 'binary',
  # this presumably drops every non-ASCII byte - confirm that is wanted.
  @html = @html.encode(
    'UTF-8',
    'binary',
    invalid: :replace,
    undef: :replace,
    replace: ''
  )
  # NOTE(review): the return values of apply are discarded, so global
  # rules presumably modify @html in place - confirm.
  global.each { |re| re.apply(@html) }
end
|
116
|
+
|
117
|
+
# Perform DOM transforms.
#
# Every wanted 'dom_transform' rule is turned into a transform object
# and applied to the current node, in configuration order.
def dom_transforms
  configured = @config['dom_transform']
  return unless configured

  configured.select { |r| want_rule(r) }.each do |rule|
    DomTransform.create(rule).apply(@node)
  end
end
|
82
127
|
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
# This ought to work, and would be faster,
|
91
|
-
# but seems to segfault Nokogiri
|
92
|
-
# doc = Nokogiri::HTML('<html><body>')
|
93
|
-
# doc.at('body').children = obj.children
|
94
|
-
# return doc
|
95
|
-
else
|
96
|
-
return to_document(parse(obj))
|
128
|
+
##### Implementations of actions #####
|
129
|
+
|
130
|
+
# Remove double-spacing inside text nodes.
#
# Only text nodes are rewritten (eg not attributes): each run of
# spaces in a text node's content becomes a single space.
#
# @param node
#   Node whose text nodes are selected with the XPath '//text()'.
def self.remove_node_spacing(node)
  node.xpath('//text()').each { |text| text.content = text.content.gsub(/ +/, ' ') }
end
|
99
137
|
|
100
|
-
#
|
101
|
-
def
|
138
|
+
# Get a fragment consisting of the elements matching the selector(s).
#
# When we choose a new root, we always become a DocumentFragment,
# and lose any DOCTYPE and such.
#
# @param node
#   Node to search; reused as the container when it already is a
#   fragment.
# @param sel
#   CSS selector(s) passed to +node.css+.
# @return
#   A fragment whose children are the matched elements.
def self.select_fragments(node, sel)
  matches = node.css(sel)
  root = node.fragment? ? node : Nokogiri::HTML.fragment('')
  root.children = matches
  root
end
|
147
|
+
|
148
|
+
# Pretty-print some HTML
|
149
|
+
def self.prettify(obj)
|
102
150
|
@stylesheet ||= begin
|
103
151
|
stylesheet_path = File.join(SiteDiff::FILES_DIR, 'pretty_print.xsl')
|
104
152
|
Nokogiri::XSLT(File.read(stylesheet_path))
|
@@ -109,10 +157,22 @@ class SiteDiff
|
|
109
157
|
# but that tends to segfault Nokogiri
|
110
158
|
str = @stylesheet.apply_to(to_document(obj))
|
111
159
|
|
160
|
+
# There's a lot of cruft left over, that we don't want
|
161
|
+
|
162
|
+
# Prevent potential UTF-8 encoding errors by removing invalid bytes.
|
163
|
+
# Not the only solution.
|
164
|
+
# An alternative is to return the string unmodified.
|
165
|
+
str = str.encode(
|
166
|
+
'UTF-8',
|
167
|
+
'binary',
|
168
|
+
invalid: :replace,
|
169
|
+
undef: :replace,
|
170
|
+
replace: ''
|
171
|
+
)
|
112
172
|
# Remove xml declaration and <html> tags
|
113
173
|
str.sub!(/\A<\?xml.*$\n/, '')
|
114
174
|
str.sub!(/\A^<html>$\n/, '')
|
115
|
-
str.sub!(%r
|
175
|
+
str.sub!(%r{</html>\n\Z}, '')
|
116
176
|
|
117
177
|
# Remove top-level indentation
|
118
178
|
indent = /\A(\s*)/.match(str)[1].size
|
@@ -121,73 +181,32 @@ class SiteDiff
|
|
121
181
|
# Remove blank lines
|
122
182
|
str.gsub!(/^\s*$\n/, '')
|
123
183
|
|
124
|
-
|
125
|
-
|
184
|
+
# Remove DOS newlines
|
185
|
+
str.gsub!(/\x0D$/, '')
|
186
|
+
str.gsub!(/ $/, '')
|
126
187
|
|
127
|
-
def remove_spacing(doc)
|
128
|
-
# remove double spacing, but only inside text nodes (eg not attributes)
|
129
|
-
doc.xpath('//text()').each do |node|
|
130
|
-
node.content = node.content.gsub(/ +/, ' ')
|
131
|
-
end
|
132
|
-
end
|
133
|
-
|
134
|
-
# Do one regexp transformation on a string
|
135
|
-
def substitute(str, rule)
|
136
|
-
#FIXME escape forward slashes, right now we are escaping them in YAML!
|
137
|
-
str.gsub!(/#{rule['pattern']}/, rule['substitute'] || '' )
|
138
188
|
str
|
139
189
|
end
|
140
190
|
|
141
|
-
#
|
142
|
-
def
|
143
|
-
|
144
|
-
|
145
|
-
|
146
|
-
|
147
|
-
if sel = rule['selector']
|
148
|
-
node.css(sel).each do |e|
|
149
|
-
e.replace(substitute(e.to_html, rule))
|
150
|
-
end
|
151
|
-
end
|
152
|
-
end
|
153
|
-
|
154
|
-
# If needed, do rules without a selector. We'd rather not convert to
|
155
|
-
# a string unless necessary.
|
156
|
-
global_rules = rules.reject { |r| r['selector'] }
|
157
|
-
return node if global_rules.empty?
|
158
|
-
|
159
|
-
str = node.to_html # Convert to string
|
160
|
-
global_rules.each { |r| substitute(str, r) }
|
161
|
-
return str
|
162
|
-
end
|
163
|
-
|
164
|
-
def select_root(node, sel)
|
165
|
-
return node unless sel
|
166
|
-
|
167
|
-
# When we choose a new root, we always become a DocumentFragment,
|
168
|
-
# and lose any DOCTYPE and such.
|
169
|
-
ns = node.css(sel)
|
170
|
-
unless node.fragment?
|
171
|
-
node = Nokogiri::HTML.fragment('')
|
191
|
+
# Parse HTML into a node.
#
# @param [String] str
#   Markup to parse.
# @param [Boolean] force_doc
#   When true a full document is always produced.
# @return
#   A document when forced or when "<!DOCTYPE" occurs within the first
#   512 characters; a fragment otherwise.
def self.domify(str, force_doc = false)
  wants_document = force_doc || /<!DOCTYPE/.match(str[0, 512])
  return Nokogiri::HTML(str) if wants_document

  Nokogiri::HTML.fragment(str)
end
|
176
199
|
|
177
|
-
|
178
|
-
|
179
|
-
|
180
|
-
|
181
|
-
|
182
|
-
|
183
|
-
|
184
|
-
|
185
|
-
|
200
|
+
# Force this object to be a document, so we can apply a stylesheet.
#
# The checks compare the exact class of +obj+ (not its ancestry).
#
# @param obj
#   A document, a node or fragment, or anything else that +domify+
#   accepts as input.
# @return
#   +obj+ itself when it already is a document; otherwise a document
#   obtained by re-parsing.
def self.to_document(obj)
  klass = obj.class
  if [Nokogiri::XML::Document, Nokogiri::HTML::Document].include?(klass)
    obj
  elsif [Nokogiri::XML::Node, Nokogiri::HTML::DocumentFragment].include?(klass)
    # node or fragment: serialize and re-parse as a full document
    domify(obj.to_s, true)
  else
    # Anything else is parsed first, then converted via the branches above.
    to_document(domify(obj, false))
  end
end
|
192
211
|
end
|
193
212
|
end
|