sitediff 0.0.6 → 1.2.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +5 -5
- data/.eslintignore +1 -0
- data/.eslintrc.json +28 -0
- data/.project +11 -0
- data/.rubocop.yml +179 -0
- data/.rubocop_todo.yml +51 -0
- data/CHANGELOG.md +28 -0
- data/Dockerfile +33 -0
- data/Gemfile +11 -0
- data/Gemfile.lock +85 -0
- data/INSTALLATION.md +146 -0
- data/LICENSE +339 -0
- data/README.md +810 -0
- data/Rakefile +12 -0
- data/Thorfile +135 -0
- data/bin/sitediff +9 -2
- data/config/.gitkeep +0 -0
- data/config/sanitize_domains.example.yaml +8 -0
- data/config/sitediff.example.yaml +81 -0
- data/docker-compose.test.yml +3 -0
- data/lib/sitediff/api.rb +276 -0
- data/lib/sitediff/cache.rb +57 -8
- data/lib/sitediff/cli.rb +156 -176
- data/lib/sitediff/config/creator.rb +61 -77
- data/lib/sitediff/config/preset.rb +75 -0
- data/lib/sitediff/config.rb +436 -31
- data/lib/sitediff/crawler.rb +27 -21
- data/lib/sitediff/diff.rb +32 -9
- data/lib/sitediff/fetch.rb +10 -3
- data/lib/sitediff/files/diff.html.erb +20 -2
- data/lib/sitediff/files/jquery.min.js +2 -0
- data/lib/sitediff/files/normalize.css +349 -0
- data/lib/sitediff/files/report.html.erb +171 -0
- data/lib/sitediff/files/sidebyside.html.erb +5 -2
- data/lib/sitediff/files/sitediff.css +303 -30
- data/lib/sitediff/files/sitediff.js +367 -0
- data/lib/sitediff/presets/drupal.yaml +63 -0
- data/lib/sitediff/report.rb +254 -0
- data/lib/sitediff/result.rb +50 -20
- data/lib/sitediff/sanitize/dom_transform.rb +47 -8
- data/lib/sitediff/sanitize/regexp.rb +24 -3
- data/lib/sitediff/sanitize.rb +81 -12
- data/lib/sitediff/uriwrapper.rb +65 -23
- data/lib/sitediff/webserver/resultserver.rb +30 -33
- data/lib/sitediff/webserver.rb +15 -3
- data/lib/sitediff.rb +130 -83
- data/misc/sitediff - overview report.png +0 -0
- data/misc/sitediff - page report.png +0 -0
- data/package-lock.json +878 -0
- data/package.json +25 -0
- data/sitediff.gemspec +51 -0
- metadata +91 -29
- data/lib/sitediff/files/html_report.html.erb +0 -66
- data/lib/sitediff/files/rules/drupal.yaml +0 -63
- data/lib/sitediff/rules.rb +0 -65
@@ -0,0 +1,63 @@
|
|
1
|
+
sanitization:
|
2
|
+
- title: Strip Drupal.settings
|
3
|
+
selector: script
|
4
|
+
pattern: '^(<script>)?jQuery.extend\(Drupal.settings.*$'
|
5
|
+
- title: Strip IE CSS/JS cache IDs
|
6
|
+
pattern: '("[^"]*ie\d?\.(js|css))\?[a-z0-9]{6}"'
|
7
|
+
substitute: '\1'
|
8
|
+
- title: Strip form build ID
|
9
|
+
selector: input
|
10
|
+
pattern: 'name="form_build_id" value="form-[-\w]{40,43}"'
|
11
|
+
substitute: 'name="form_build_id" value="form-DRUPAL_FORM_BUILD_ID"'
|
12
|
+
- title: Strip view DOM ID
|
13
|
+
pattern: '(class="view .*) view-dom-id-[a-f0-9]{32}"'
|
14
|
+
substitute: '\1 view-dom-id-DRUPAL_VIEW_DOM_ID"'
|
15
|
+
- title: Strip CSS aggregation filenames
|
16
|
+
selector: link[rel=stylesheet]
|
17
|
+
pattern: '(href="[^"]*/files/css/css_)[-\w]{40,43}\.css"'
|
18
|
+
substitute: '\1DRUPAL_AGGREGATED_CSS.css"'
|
19
|
+
- title: Strip JS aggregation filenames
|
20
|
+
selector: script
|
21
|
+
pattern: '(src="[^"]*/files/js/js_)[-\w]{40,43}\.js"'
|
22
|
+
substitute: '\1DRUPAL_AGGREGATED_JS.js"'
|
23
|
+
- title: Strip CSS/JS cache IDs
|
24
|
+
selector: style, script
|
25
|
+
pattern: '("[^"]*\.(js|css))\?[a-z0-9]{6}"'
|
26
|
+
substitute: '\1'
|
27
|
+
- title: Strip Drupal JS version tags
|
28
|
+
selector: script
|
29
|
+
pattern: '(src="[^"]*/misc/\w+\.js)?v=\d+\.\d+"'
|
30
|
+
substitute: '\1'
|
31
|
+
- title: Strip domain names from absolute URLs
|
32
|
+
pattern: 'http:\/\/[a-zA-Z0-9.:-]+'
|
33
|
+
substitute: '__domain__'
|
34
|
+
- title: Strip form build ID
|
35
|
+
selector: input
|
36
|
+
pattern: 'autocomplete="off" data-drupal-selector="form-[-\w]{40,43}"'
|
37
|
+
substitute: 'autocomplete="off" data-drupal-selector="form-DRUPAL_FORM_BUILD_ID"'
|
38
|
+
- title: Strip form build ID 2
|
39
|
+
selector: input
|
40
|
+
pattern: 'name="form_build_id" value="form-[-\w]{40,43}"'
|
41
|
+
substitute: 'name="form_build_id" value="form-DRUPAL_FORM_BUILD_ID"'
|
42
|
+
- title: Strip Drupal CSS link queries
|
43
|
+
selector: link
|
44
|
+
pattern: '\.css\?(\w*)'
|
45
|
+
substitute: '\.css'
|
46
|
+
- title: Strip Drupal JS link queries
|
47
|
+
selector: script
|
48
|
+
pattern: '\.js\?(\w*)'
|
49
|
+
substitute: '\.js'
|
50
|
+
- title: Strip Drupal View-DOM ID
|
51
|
+
pattern: 'view-dom-id-\w*'
|
52
|
+
substitute: 'view-dom-id-_ID_'
|
53
|
+
- title: Strip Drupal View-DOM ID 2
|
54
|
+
pattern: '(views?_dom_id"?:"?)\w*'
|
55
|
+
substitute: '\1_ID_'
|
56
|
+
- title: Ignore Drupal CSS file names
|
57
|
+
selector: link
|
58
|
+
pattern: 'css_[-\w]{40,43}(\\|%5C)?\.css'
|
59
|
+
substitute: 'css__ID__.css'
|
60
|
+
- title: Ignore Drupal JS file names
|
61
|
+
selector: script
|
62
|
+
pattern: 'js_[-\w]{40,43}\\?\.js'
|
63
|
+
substitute: 'js__ID__.js'
|
@@ -0,0 +1,254 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'fileutils'
|
4
|
+
require 'json'
|
5
|
+
require 'minitar'
|
6
|
+
require 'sitediff'
|
7
|
+
require 'sitediff/config'
|
8
|
+
require 'zlib'
|
9
|
+
|
10
|
+
class SiteDiff
  ##
  # SiteDiff Report Helper.
  #
  # Turns a list of comparison results into report artifacts inside a target
  # directory: per-page diff files, a list of failing paths, an HTML or JSON
  # report, a settings file and, optionally, a packaged archive.
  class Report
    attr_reader :results, :cache

    ##
    # Directory where diffs will be generated.
    DIFFS_DIR = 'diffs'

    ##
    # Name of file containing a list of pages with diffs.
    FAILURES_FILE = 'failures.txt'

    ##
    # Name of file containing HTML report of diffs.
    REPORT_FILE_HTML = 'report.html'

    ##
    # Name of file containing JSON report of diffs.
    REPORT_FILE_JSON = 'report.json'

    ##
    # Name of file containing exported file archive.
    REPORT_FILE_TAR = 'report.tgz'

    ##
    # Name of directory in which to build the portable report.
    REPORT_BUILD_DIR = '_tmp_report'

    ##
    # Name of the portable report directory.
    REPORT_DIR = 'report'

    ##
    # Path to settings used for report.
    SETTINGS_FILE = 'settings.yaml'

    ##
    # Creates a Report object.
    #
    # @param [Config] config
    #   Configuration; this class reads before_url, after_url, export and
    #   directory from it and writes before_time / after_time to it.
    # @param [Cache] cache
    #   Cache handed through to the HTML report generator.
    # @param [Array] results
    #   Results to report on. Each item is expected to respond to path,
    #   status, error, success? and dump.
    def initialize(config, cache, results)
      @config = config
      @cache = cache
      @results = results
    end

    ##
    # Generates an HTML report.
    #
    # Writes the diff files, the failures file, the HTML report and the
    # settings file. When the config has export enabled, the report is then
    # packaged into an archive instead of being left in place.
    #
    # @param [String] dir
    #   The directory in which the report is to be generated.
    # @param [String] report_before
    #   URL to show for the "before" site. Defaults to the configured one.
    # @param [String] report_after
    #   URL to show for the "after" site. Defaults to the configured one.
    def generate_html(
      dir,
      report_before = nil,
      report_after = nil
    )
      report_before ||= @config.before_url
      report_after ||= @config.after_url
      @config.before_time = get_timestamp(:before)
      @config.after_time = get_timestamp(:after)

      # NOTE(review): ensure_dir is assumed to return a Pathname, since the
      # writers below reject anything else - confirm against SiteDiff.ensure_dir.
      dir = SiteDiff.ensure_dir dir

      write_diffs dir
      write_failures dir

      # Prepare report.
      report = Diff.generate_html(
        @results,
        report_before,
        report_after,
        @cache,
        @config
      )

      # Write report.
      report_file = dir + REPORT_FILE_HTML
      report_file.unlink if report_file.file?
      report_file.open('w') { |f| f.write(report) }

      write_settings dir, report_before, report_after

      if @config.export
        package_report(dir)
      else
        SiteDiff.log "Report generated to #{report_file.expand_path}"
      end
    end

    ##
    # Generates a JSON report.
    #
    # The report holds the number of compared paths, the number of paths that
    # were not successful, and a path => { path, status, message } map.
    #
    # @param dir
    #   The directory in which the report is to be generated.
    def generate_json(dir)
      dir = SiteDiff.ensure_dir dir
      write_diffs dir
      write_failures dir

      # Prepare report.
      report = {
        paths_compared: @results.length,
        paths_diffs: 0,
        paths: {}
      }
      @results.each do |item|
        report[:paths_diffs] += 1 unless item.success?

        report[:paths][item.path] = {
          path: item.path,
          status: item.status,
          message: item.error
        }
      end
      json = JSON.generate(report)

      # Write report.
      report_file = dir + REPORT_FILE_JSON
      report_file.unlink if report_file.file?
      report_file.open('w') { |f| f.write(json) }

      # No before/after URLs are passed here, so both are stored as nil.
      write_settings dir

      SiteDiff.log "Report generated to #{report_file.expand_path}"
    end

    ##
    # Package report for export.
    #
    # Moves the HTML report and the diffs directory into a temporary
    # "report" tree, archives that tree as a gzipped tar file placed in
    # +dir+, and removes the temporary tree.
    #
    # @param [Pathname] dir
    #   The directory that holds the generated report.
    def package_report(dir)
      # Create temporary report directories.
      temp_path = dir + REPORT_BUILD_DIR
      temp_path.rmtree if temp_path.directory?
      temp_path.mkpath
      report_path = temp_path + REPORT_DIR
      report_path.mkpath
      # Must stay a Pathname: a plain interpolated String has no #mkpath and
      # would also lose the separator between "report" and "files".
      files_path = report_path + 'files'
      files_path.mkpath
      diffs_path = dir + DIFFS_DIR

      # Move files to place.
      FileUtils.move(dir + REPORT_FILE_HTML, report_path)
      FileUtils.move(diffs_path, files_path) if diffs_path.directory?

      # Make tar file. Working from inside the build directory keeps the
      # archive entries relative to REPORT_DIR.
      Dir.chdir(temp_path) do
        Minitar.pack(
          REPORT_DIR,
          Zlib::GzipWriter.new(File.open(REPORT_FILE_TAR, 'wb'))
        )
      end
      FileUtils.move(temp_path + REPORT_FILE_TAR, dir)
      temp_path.rmtree
      SiteDiff.log "Archived report generated to #{dir.join(REPORT_FILE_TAR)}"
    end

    ##
    # Creates diff files in a directory named "diffs".
    #
    # If "dir" is /foo/bar, then diffs will be placed in /foo/bar/diffs.
    # Any existing "diffs" directory is removed first, and only results whose
    # status is Result::STATUS_FAILURE are dumped.
    #
    # @param [Pathname] dir
    #   The directory in which a "diffs" directory is to be generated.
    # @raise [ArgumentError] if dir is not a Pathname.
    def write_diffs(dir)
      raise ArgumentError, 'dir must be a Pathname' unless dir.is_a? Pathname

      # Delete existing "diffs" dir, if exists.
      diff_dir = dir + DIFFS_DIR
      diff_dir.rmtree if diff_dir.exist?

      # Write diffs to the diff directory.
      @results.each do |result|
        result.dump(dir, relative: @config.export) if result.status == Result::STATUS_FAILURE
      end
      SiteDiff.log "All diff files written to #{diff_dir.expand_path}" unless @config.export
    end

    ##
    # Writes paths with diffs into a file.
    #
    # One path per line, for every result that is not a success.
    #
    # @param [Pathname] dir
    #   The directory in which the report is to be generated.
    # @raise [ArgumentError] if dir is not a Pathname.
    def write_failures(dir)
      raise ArgumentError, 'dir must be a Pathname' unless dir.is_a? Pathname

      failures = dir + FAILURES_FILE
      failures.open('w') do |f|
        @results.each { |r| f.puts r.path unless r.success? }
      end
      # Log only once the file has actually been written.
      SiteDiff.log "All failures written to #{failures.expand_path}"
    end

    ##
    # Creates report settings.yaml file.
    #
    # TODO: Find a way to avoid having to create this file.
    #
    # @param [Pathname] dir
    #   The directory in which the report is to be generated.
    # @param [String] report_before
    #   Value stored under the "before" key (may be nil).
    # @param [String] report_after
    #   Value stored under the "after" key (may be nil).
    # @raise [ArgumentError] if dir is not a Pathname.
    def write_settings(dir, report_before = nil, report_after = nil)
      raise ArgumentError, 'dir must be a Pathname' unless dir.is_a? Pathname

      settings = {
        'before' => report_before,
        'after' => report_after,
        'cached' => %w[before after]
      }
      (dir + SETTINGS_FILE).open('w') { |f| YAML.dump(settings, f) }
    end

    ##
    # Returns CSS for HTML report.
    #
    # @return [String] normalize.css followed by sitediff.css.
    def self.css
      %w[normalize.css sitediff.css]
        .map { |name| File.read(File.join(SiteDiff::FILES_DIR, name)) }
        .join
    end

    ##
    # Returns JS for HTML report.
    #
    # @return [String] jquery.min.js followed by sitediff.js.
    def self.js
      %w[jquery.min.js sitediff.js]
        .map { |name| File.read(File.join(SiteDiff::FILES_DIR, name)) }
        .join
    end

    private

    ##
    # Get crawl timestamps.
    #
    # @param [Symbol] tag
    #   Snapshot tag; generate_html calls this with :before and :after.
    # @return [String]
    #   Modification time of the snapshot's timestamp file formatted as
    #   "YYYY-MM-DD HH:MM", or "unknown" when that file does not exist.
    def get_timestamp(tag)
      timestamp_file = File.join(
        @config.directory,
        'snapshot',
        tag.to_s,
        SiteDiff::Cache::TIMESTAMP_FILE
      )
      return 'unknown' unless File.exist? timestamp_file

      File.mtime(timestamp_file).strftime('%Y-%m-%d %H:%M')
    end
  end
end
|
data/lib/sitediff/result.rb
CHANGED
@@ -2,25 +2,42 @@
|
|
2
2
|
|
3
3
|
require 'sitediff'
|
4
4
|
require 'sitediff/diff'
|
5
|
+
require 'sitediff/report'
|
5
6
|
require 'digest/sha1'
|
6
7
|
require 'fileutils'
|
7
8
|
|
8
9
|
class SiteDiff
|
9
|
-
|
10
|
+
# SiteDiff Result Object.
|
11
|
+
class Result < Struct.new(
|
12
|
+
:path,
|
13
|
+
:before,
|
14
|
+
:after,
|
15
|
+
:before_encoding,
|
16
|
+
:after_encoding,
|
17
|
+
:error,
|
18
|
+
:verbose
|
19
|
+
)
|
10
20
|
STATUS_SUCCESS = 0 # Identical before and after
|
11
21
|
STATUS_FAILURE = 1 # Different before and after
|
12
22
|
STATUS_ERROR = 2 # Couldn't fetch page
|
13
|
-
STATUS_TEXT = %w[
|
23
|
+
STATUS_TEXT = %w[unchanged changed error].freeze
|
14
24
|
|
15
25
|
attr_reader :status, :diff
|
16
26
|
|
27
|
+
##
|
28
|
+
# Creates a Result.
|
17
29
|
def initialize(*args)
|
18
30
|
super
|
19
31
|
if error
|
20
32
|
@status = STATUS_ERROR
|
21
33
|
else
|
22
34
|
if !before_encoding || !after_encoding
|
23
|
-
@diff = Diff.binary_diffy(
|
35
|
+
@diff = Diff.binary_diffy(
|
36
|
+
before,
|
37
|
+
after,
|
38
|
+
before_encoding,
|
39
|
+
after_encoding
|
40
|
+
)
|
24
41
|
else
|
25
42
|
@diff = Diff.html_diffy(before, after)
|
26
43
|
end
|
@@ -28,10 +45,22 @@ class SiteDiff
|
|
28
45
|
end
|
29
46
|
end
|
30
47
|
|
48
|
+
##
|
49
|
+
# Whether the result has no diff.
|
50
|
+
#
|
51
|
+
# If there is a diff, it is not a success.
|
52
|
+
#
|
53
|
+
# TODO: Change "Success" to unchanged.
|
31
54
|
def success?
|
32
55
|
status == STATUS_SUCCESS
|
33
56
|
end
|
34
57
|
|
58
|
+
##
|
59
|
+
# Whether the result has an error.
|
60
|
+
def error?
|
61
|
+
status == STATUS_ERROR
|
62
|
+
end
|
63
|
+
|
35
64
|
# Textual representation of the status
|
36
65
|
def status_text
|
37
66
|
STATUS_TEXT[status]
|
@@ -39,44 +68,45 @@ class SiteDiff
|
|
39
68
|
|
40
69
|
# Printable URL
|
41
70
|
def url(tag, prefix, cache)
|
71
|
+
return unless prefix
|
72
|
+
|
42
73
|
base = cache.read_tags.include?(tag) ? "/cache/#{tag}" : prefix
|
43
74
|
base.to_s + path
|
44
75
|
end
|
45
76
|
|
46
77
|
# Filename to store diff
|
47
78
|
def filename
|
48
|
-
File.join(
|
79
|
+
File.join(Report::DIFFS_DIR, "#{Digest::SHA1.hexdigest(path)}.html")
|
49
80
|
end
|
50
81
|
|
51
|
-
#
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
end
|
82
|
+
# Returns a URL to the result diff.
|
83
|
+
#
|
84
|
+
# Returns nil if the result has no diffs.
|
85
|
+
def diff_url(relative: false)
|
86
|
+
prefix = relative ? 'files/' : '/files/'
|
87
|
+
return prefix + filename if status == STATUS_FAILURE
|
58
88
|
end
|
59
89
|
|
60
90
|
# Log the result to the terminal
|
61
|
-
def log(verbose
|
91
|
+
def log(verbose: true)
|
62
92
|
case status
|
63
|
-
when STATUS_SUCCESS
|
64
|
-
SiteDiff.log path, :
|
65
|
-
when STATUS_ERROR
|
66
|
-
SiteDiff.log path
|
67
|
-
when STATUS_FAILURE
|
68
|
-
SiteDiff.log path, :
|
93
|
+
when STATUS_SUCCESS
|
94
|
+
SiteDiff.log path, :success, 'UNCHANGED'
|
95
|
+
when STATUS_ERROR
|
96
|
+
SiteDiff.log path + " (#{error})", :warning, 'ERROR'
|
97
|
+
when STATUS_FAILURE
|
98
|
+
SiteDiff.log path, :error, 'CHANGED'
|
69
99
|
puts Diff.terminal_diffy(before, after) if verbose
|
70
100
|
end
|
71
101
|
end
|
72
102
|
|
73
103
|
# Dump the result to a file
|
74
|
-
def dump(dir)
|
104
|
+
def dump(dir, relative: false)
|
75
105
|
dump_path = File.join(dir, filename)
|
76
106
|
base = File.dirname(dump_path)
|
77
107
|
FileUtils.mkdir_p(base) unless File.exist?(base)
|
78
108
|
File.open(dump_path, 'w') do |f|
|
79
|
-
f.write(Diff.generate_diff_output(self))
|
109
|
+
f.write(Diff.generate_diff_output(self, relative:))
|
80
110
|
end
|
81
111
|
end
|
82
112
|
end
|
@@ -11,61 +11,96 @@ class SiteDiff
|
|
11
11
|
# * { :type => "unwrap", :selector => "div.field-item" }
|
12
12
|
# * { :type => "remove", :selector => "div.extra-stuff" }
|
13
13
|
# * { :type => "remove_class", :class => 'class1' }
|
14
|
+
# * { :type => "strip", :selector => 'h1' }
|
14
15
|
class DomTransform
|
15
|
-
|
16
|
+
# Supported dom_transform types.
|
17
|
+
TRANSFORMS = {}
|
16
18
|
|
19
|
+
##
|
20
|
+
# Creates a DOM Transform.
|
17
21
|
def initialize(rule)
|
18
22
|
@rule = rule
|
19
23
|
end
|
20
24
|
|
25
|
+
##
|
21
26
|
# Often an array or scalar are both ok values. Turn either into an array.
|
22
27
|
def to_array(val)
|
23
28
|
[val].flatten
|
24
29
|
end
|
25
30
|
|
26
|
-
|
31
|
+
##
|
32
|
+
# Yields each node under the given node that matches the rule's selector(s).
|
33
|
+
def targets(node, &block)
|
27
34
|
selectors = to_array(@rule['selector'])
|
28
35
|
selectors.each do |sel|
|
29
|
-
node.css(sel).each
|
36
|
+
node.css(sel).each(&block)
|
30
37
|
end
|
31
38
|
end
|
32
39
|
|
40
|
+
##
|
41
|
+
# Applies the transformation to a DOM node.
|
33
42
|
def apply(node)
|
34
43
|
targets(node) { |t| process(t) }
|
35
44
|
end
|
36
45
|
|
46
|
+
##
|
47
|
+
# Registers a DOM Transform plugin.
|
37
48
|
def self.register(name)
|
38
|
-
|
49
|
+
TRANSFORMS[name] = self
|
39
50
|
end
|
40
51
|
|
52
|
+
##
|
53
|
+
# Creates a DOM Transform as per rule.
|
41
54
|
def self.create(rule)
|
42
55
|
(type = rule['type']) ||
|
43
56
|
raise(InvalidSanitization, 'DOM transform needs a type')
|
44
|
-
(transform =
|
57
|
+
(transform = TRANSFORMS[type]) ||
|
45
58
|
raise(InvalidSanitization, "No DOM transform named #{type}")
|
46
59
|
transform.new(rule)
|
47
60
|
end
|
48
61
|
|
49
|
-
|
62
|
+
##
|
63
|
+
# Remove elements matching 'selector'.
|
50
64
|
class Remove < DomTransform
|
51
65
|
register 'remove'
|
66
|
+
|
67
|
+
##
|
68
|
+
# Processes a node.
|
52
69
|
def process(node)
|
53
70
|
node.remove
|
54
71
|
end
|
55
72
|
end
|
56
73
|
|
57
|
-
#
|
74
|
+
# Strip leading and trailing whitespace from the content of elements matching 'selector'.
|
75
|
+
class Strip < DomTransform
|
76
|
+
register 'strip'
|
77
|
+
|
78
|
+
##
|
79
|
+
# Processes a node.
|
80
|
+
def process(node)
|
81
|
+
node.content = node.content.strip
|
82
|
+
end
|
83
|
+
end
|
84
|
+
|
85
|
+
# Unwrap elements matching 'selector'.
|
58
86
|
class Unwrap < DomTransform
|
59
87
|
register 'unwrap'
|
88
|
+
|
89
|
+
##
|
90
|
+
# Processes a node.
|
60
91
|
def process(node)
|
61
92
|
node.add_next_sibling(node.children)
|
62
93
|
node.remove
|
63
94
|
end
|
64
95
|
end
|
65
96
|
|
97
|
+
##
|
66
98
|
# Remove classes from elements matching selector
|
67
99
|
class RemoveClass < DomTransform
|
68
100
|
register 'remove_class'
|
101
|
+
|
102
|
+
##
|
103
|
+
# Processes a node.
|
69
104
|
def process(node)
|
70
105
|
classes = to_array(@rule['class'])
|
71
106
|
|
@@ -77,9 +112,13 @@ class SiteDiff
|
|
77
112
|
end
|
78
113
|
end
|
79
114
|
|
80
|
-
|
115
|
+
##
|
116
|
+
# Unwrap the root element.
|
81
117
|
class UnwrapRoot < DomTransform
|
82
118
|
register 'unwrap_root'
|
119
|
+
|
120
|
+
##
|
121
|
+
# Applies the transformation to a DOM node.
|
83
122
|
def apply(node)
|
84
123
|
(node.children.size == 1) ||
|
85
124
|
raise(InvalidSanitization, 'Multiple root elements in unwrap_root')
|
@@ -2,41 +2,62 @@
|
|
2
2
|
|
3
3
|
class SiteDiff
|
4
4
|
class Sanitizer
|
5
|
+
# Regular Expression Object.
|
5
6
|
class Regexp
|
7
|
+
##
|
8
|
+
# Creates a RegExp object.
|
6
9
|
def initialize(rule)
|
7
10
|
@rule = rule
|
8
11
|
end
|
9
12
|
|
13
|
+
##
|
14
|
+
# Whether the RegExp has a selector.
|
10
15
|
def selector?
|
11
16
|
false
|
12
17
|
end
|
13
18
|
|
19
|
+
##
|
20
|
+
# Whether the RegExp applies to the given markup.
|
14
21
|
def applies?(html, _node)
|
15
22
|
applies_to_string?(html)
|
16
23
|
end
|
17
24
|
|
25
|
+
##
|
26
|
+
# Applies the RegExp to the markup.
|
18
27
|
def apply(html)
|
19
28
|
gsub!(html)
|
20
29
|
end
|
21
30
|
|
31
|
+
##
|
32
|
+
# Creates a RegExp object as per rule.
|
22
33
|
def self.create(rule)
|
23
34
|
rule['selector'] ? WithSelector.new(rule) : new(rule)
|
24
35
|
end
|
25
36
|
|
37
|
+
##
|
38
|
+
# A RegExp with selector.
|
26
39
|
class WithSelector < Regexp
|
40
|
+
##
|
41
|
+
# Whether the RegExp has a selector.
|
27
42
|
def selector?
|
28
43
|
true
|
29
44
|
end
|
30
45
|
|
31
|
-
|
32
|
-
|
33
|
-
|
46
|
+
##
|
47
|
+
# Yields each element under the given node that matches the rule's selector.
|
48
|
+
def contexts(node, &block)
|
49
|
+
selectors = @rule['selector']
|
50
|
+
node.css(selectors).each(&block)
|
34
51
|
end
|
35
52
|
|
53
|
+
##
|
54
|
+
# Whether the RegExp applies to the given markup.
|
36
55
|
def applies?(_html, node)
|
37
56
|
enum_for(:contexts, node).any? { |e| applies_to_string?(e.to_html) }
|
38
57
|
end
|
39
58
|
|
59
|
+
##
|
60
|
+
# Applies the RegExp to the markup.
|
40
61
|
def apply(node)
|
41
62
|
contexts(node) { |e| e.replace(gsub!(e.to_html)) }
|
42
63
|
end
|