sitediff 0.0.3 → 0.0.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/bin/sitediff +2 -3
- data/lib/sitediff.rb +35 -24
- data/lib/sitediff/cache.rb +53 -47
- data/lib/sitediff/cli.rb +127 -114
- data/lib/sitediff/config.rb +35 -59
- data/lib/sitediff/config/creator.rb +95 -90
- data/lib/sitediff/crawler.rb +83 -72
- data/lib/sitediff/diff.rb +7 -5
- data/lib/sitediff/exception.rb +3 -1
- data/lib/sitediff/fetch.rb +47 -41
- data/lib/sitediff/files/html_report.html.erb +3 -0
- data/lib/sitediff/files/rules/drupal.yaml +36 -6
- data/lib/sitediff/result.rb +13 -11
- data/lib/sitediff/rules.rb +47 -47
- data/lib/sitediff/sanitize.rb +145 -150
- data/lib/sitediff/sanitize/dom_transform.rb +73 -74
- data/lib/sitediff/sanitize/regexp.rb +55 -52
- data/lib/sitediff/uriwrapper.rb +37 -26
- data/lib/sitediff/webserver.rb +80 -77
- data/lib/sitediff/webserver/resultserver.rb +117 -76
- metadata +32 -44
data/lib/sitediff/diff.rb
CHANGED
@@ -1,3 +1,5 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
require 'sitediff'
|
2
4
|
require 'diffy'
|
3
5
|
require 'erb'
|
@@ -9,26 +11,26 @@ class SiteDiff
|
|
9
11
|
|
10
12
|
def html_diffy(before_html, after_html)
|
11
13
|
diff = Diffy::Diff.new(before_html, after_html)
|
12
|
-
diff.first ?
|
14
|
+
diff.first ? # Is it non-empty?
|
13
15
|
diff.to_s(:html) : nil
|
14
16
|
end
|
15
17
|
|
16
18
|
def terminal_diffy(before_html, after_html)
|
17
19
|
args = []
|
18
20
|
args << :color if Rainbow.enabled
|
19
|
-
|
20
|
-
|
21
|
+
Diffy::Diff.new(before_html, after_html, context: 3)
|
22
|
+
.to_s(*args)
|
21
23
|
end
|
22
24
|
|
23
25
|
def generate_html_report(results, before, after, cache)
|
24
26
|
erb_path = File.join(SiteDiff::FILES_DIR, 'html_report.html.erb')
|
25
27
|
report_html = ERB.new(File.read(erb_path)).result(binding)
|
26
|
-
|
28
|
+
report_html
|
27
29
|
end
|
28
30
|
|
29
31
|
def generate_diff_output(result)
|
30
32
|
erb_path = File.join(SiteDiff::FILES_DIR, 'diff.html.erb')
|
31
|
-
|
33
|
+
ERB.new(File.read(erb_path)).result(binding)
|
32
34
|
end
|
33
35
|
|
34
36
|
def css
|
data/lib/sitediff/exception.rb
CHANGED
data/lib/sitediff/fetch.rb
CHANGED
@@ -1,55 +1,61 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
require 'sitediff/uriwrapper'
|
2
4
|
require 'typhoeus'
|
3
5
|
|
4
6
|
class SiteDiff
|
5
|
-
class Fetch
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
7
|
+
class Fetch
|
8
|
+
# Cache is a cache object, see sitediff/cache
|
9
|
+
# Paths is a list of sub-paths
|
10
|
+
# Tags is a hash of tag names => base URLs.
|
11
|
+
def initialize(cache, paths, concurrency = 3, curl_opts = nil, **tags)
|
12
|
+
@cache = cache
|
13
|
+
@paths = paths
|
14
|
+
@tags = tags
|
15
|
+
@curl_opts = curl_opts || UriWrapper::DEFAULT_CURL_OPTS
|
16
|
+
@concurrency = concurrency
|
17
|
+
end
|
14
18
|
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
19
|
+
# Fetch all the paths, once per tag.
|
20
|
+
# When a path has been fetched for every tag, block will be called with the
|
21
|
+
# path, and a hash of tag => UriWrapper::ReadResult objects.
|
22
|
+
def run(&block)
|
23
|
+
@callback = block
|
24
|
+
@hydra = Typhoeus::Hydra.new(max_concurrency: @concurrency)
|
25
|
+
@paths.each { |path| queue_path(path) }
|
26
|
+
@hydra.run
|
27
|
+
end
|
28
|
+
|
29
|
+
private
|
24
30
|
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
if res = @cache.get(tag, path)
|
32
|
-
results[tag] = res
|
33
|
-
process_results(path, results)
|
34
|
-
elsif !base
|
35
|
-
# We only have the cache, but this item isn't cached!
|
36
|
-
results[tag] = UriWrapper::ReadResult.error("Not cached")
|
37
|
-
process_results(path, results)
|
38
|
-
else
|
39
|
-
uri = UriWrapper.new(base + path)
|
40
|
-
uri.queue(@hydra) do |res|
|
41
|
-
@cache.set(tag, path, res)
|
31
|
+
# Queue a path for fetching
|
32
|
+
def queue_path(path)
|
33
|
+
results = {}
|
34
|
+
|
35
|
+
@tags.each do |tag, base|
|
36
|
+
if (res = @cache.get(tag, path))
|
42
37
|
results[tag] = res
|
43
38
|
process_results(path, results)
|
39
|
+
elsif !base
|
40
|
+
# We only have the cache, but this item isn't cached!
|
41
|
+
results[tag] = UriWrapper::ReadResult.error('Not cached')
|
42
|
+
process_results(path, results)
|
43
|
+
else
|
44
|
+
uri = UriWrapper.new(base + path, @curl_opts)
|
45
|
+
uri.queue(@hydra) do |resl|
|
46
|
+
@cache.set(tag, path, resl)
|
47
|
+
results[tag] = resl
|
48
|
+
process_results(path, results)
|
49
|
+
end
|
44
50
|
end
|
45
51
|
end
|
46
52
|
end
|
47
|
-
end
|
48
53
|
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
54
|
+
# Process fetch results
|
55
|
+
def process_results(path, results)
|
56
|
+
return unless results.size == @tags.size
|
57
|
+
|
58
|
+
@callback[path, results]
|
59
|
+
end
|
53
60
|
end
|
54
61
|
end
|
55
|
-
end
|
data/lib/sitediff/files/rules/drupal.yaml
CHANGED
@@ -2,28 +2,28 @@ sanitization:
|
|
2
2
|
- title: Strip Drupal.settings
|
3
3
|
selector: script
|
4
4
|
pattern: '^(<script>)?jQuery.extend\(Drupal.settings.*$'
|
5
|
+
- title: Strip IE CSS/JS cache IDs
|
6
|
+
pattern: '("[^"]*ie\d?\.(js|css))\?[a-z0-9]{6}"'
|
7
|
+
substitute: '\1'
|
5
8
|
- title: Strip form build ID
|
6
9
|
selector: input
|
7
|
-
pattern: 'name="form_build_id" value="form-[-\w]{43}"'
|
10
|
+
pattern: 'name="form_build_id" value="form-[-\w]{40,43}"'
|
8
11
|
substitute: 'name="form_build_id" value="form-DRUPAL_FORM_BUILD_ID"'
|
9
12
|
- title: Strip view DOM ID
|
10
13
|
pattern: '(class="view .*) view-dom-id-[a-f0-9]{32}"'
|
11
14
|
substitute: '\1 view-dom-id-DRUPAL_VIEW_DOM_ID"'
|
12
15
|
- title: Strip CSS aggregation filenames
|
13
16
|
selector: link[rel=stylesheet]
|
14
|
-
pattern: '(href="[^"]*/files/css/css_)[-\w]{43}\.css"'
|
17
|
+
pattern: '(href="[^"]*/files/css/css_)[-\w]{40,43}\.css"'
|
15
18
|
substitute: '\1DRUPAL_AGGREGATED_CSS.css"'
|
16
19
|
- title: Strip JS aggregation filenames
|
17
20
|
selector: script
|
18
|
-
pattern: '(src="[^"]*/files/js/js_)[-\w]{43}\.js"'
|
21
|
+
pattern: '(src="[^"]*/files/js/js_)[-\w]{40,43}\.js"'
|
19
22
|
substitute: '\1DRUPAL_AGGREGATED_JS.js"'
|
20
23
|
- title: Strip CSS/JS cache IDs
|
21
24
|
selector: style, script
|
22
25
|
pattern: '("[^"]*\.(js|css))\?[a-z0-9]{6}"'
|
23
26
|
substitute: '\1'
|
24
|
-
- title: Strip IE CSS/JS cache IDs
|
25
|
-
pattern: '("[^"]*ie\d?\.(js|css))\?[a-z0-9]{6}"'
|
26
|
-
substitute: '\1'
|
27
27
|
- title: Strip Drupal JS version tags
|
28
28
|
selector: script
|
29
29
|
pattern: '(src="[^"]*/misc/\w+\.js)?v=\d+\.\d+"'
|
@@ -31,3 +31,33 @@ sanitization:
|
|
31
31
|
- title: Strip domain names from absolute URLs
|
32
32
|
pattern: 'http:\/\/[a-zA-Z0-9.:-]+'
|
33
33
|
substitute: '__domain__'
|
34
|
+
- title: Strip form build ID
|
35
|
+
selector: input
|
36
|
+
pattern: 'autocomplete="off" data-drupal-selector="form-[-\w]{40,43}"'
|
37
|
+
substitute: 'autocomplete="off" data-drupal-selector="form-DRUPAL_FORM_BUILD_ID"'
|
38
|
+
- title: Strip form build ID 2
|
39
|
+
selector: input
|
40
|
+
pattern: 'name="form_build_id" value="form-[-\w]{40,43}"'
|
41
|
+
substitute: 'name="form_build_id" value="form-DRUPAL_FORM_BUILD_ID"'
|
42
|
+
- title: Strip Drupal CSS link queries
|
43
|
+
selector: link
|
44
|
+
pattern: '\.css\?(\w*)'
|
45
|
+
substitute: '\.css'
|
46
|
+
- title: Strip Drupal JS link queries
|
47
|
+
selector: script
|
48
|
+
pattern: '\.js\?(\w*)'
|
49
|
+
substitute: '\.js'
|
50
|
+
- title: Strip Drupal View-DOM ID
|
51
|
+
pattern: 'view-dom-id-\w*'
|
52
|
+
substitute: 'view-dom-id-_ID_'
|
53
|
+
- title: Strip Drupal View-DOM ID 2
|
54
|
+
pattern: '(views?_dom_id"?:"?)\w*'
|
55
|
+
substitute: '\1_ID_'
|
56
|
+
- title: Ignore Drupal CSS file names
|
57
|
+
selector: link
|
58
|
+
pattern: 'css_[-\w]{40,43}(\\|%5C)?\.css'
|
59
|
+
substitute: 'css__ID__.css'
|
60
|
+
- title: Ignore Drupal JS file names
|
61
|
+
selector: script
|
62
|
+
pattern: 'js_[-\w]{40,43}\\?\.js'
|
63
|
+
substitute: 'js__ID__.js'
|
data/lib/sitediff/result.rb
CHANGED
@@ -1,3 +1,5 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
require 'sitediff'
|
2
4
|
require 'sitediff/diff'
|
3
5
|
require 'digest/sha1'
|
@@ -8,7 +10,7 @@ class SiteDiff
|
|
8
10
|
STATUS_SUCCESS = 0 # Identical before and after
|
9
11
|
STATUS_FAILURE = 1 # Different before and after
|
10
12
|
STATUS_ERROR = 2 # Couldn't fetch page
|
11
|
-
STATUS_TEXT = %w[success failure error]
|
13
|
+
STATUS_TEXT = %w[success failure error].freeze
|
12
14
|
|
13
15
|
attr_reader :status, :diff
|
14
16
|
|
@@ -17,7 +19,7 @@ class SiteDiff
|
|
17
19
|
if error
|
18
20
|
@status = STATUS_ERROR
|
19
21
|
else
|
20
|
-
@diff = Diff
|
22
|
+
@diff = Diff.html_diffy(before, after)
|
21
23
|
@status = @diff ? STATUS_FAILURE : STATUS_SUCCESS
|
22
24
|
end
|
23
25
|
end
|
@@ -28,7 +30,7 @@ class SiteDiff
|
|
28
30
|
|
29
31
|
# Textual representation of the status
|
30
32
|
def status_text
|
31
|
-
|
33
|
+
STATUS_TEXT[status]
|
32
34
|
end
|
33
35
|
|
34
36
|
# Printable URL
|
@@ -39,7 +41,7 @@ class SiteDiff
|
|
39
41
|
|
40
42
|
# Filename to store diff
|
41
43
|
def filename
|
42
|
-
File.join(SiteDiff::DIFFS_DIR, Digest::SHA1.hexdigest(
|
44
|
+
File.join(SiteDiff::DIFFS_DIR, Digest::SHA1.hexdigest(path) + '.html')
|
43
45
|
end
|
44
46
|
|
45
47
|
# Text of the link in the HTML report
|
@@ -52,15 +54,15 @@ class SiteDiff
|
|
52
54
|
end
|
53
55
|
|
54
56
|
# Log the result to the terminal
|
55
|
-
def log(verbose=true)
|
57
|
+
def log(verbose = true)
|
56
58
|
case status
|
57
59
|
when STATUS_SUCCESS then
|
58
|
-
SiteDiff
|
60
|
+
SiteDiff.log path, :diff_success, 'UNCHANGED'
|
59
61
|
when STATUS_ERROR then
|
60
|
-
SiteDiff
|
62
|
+
SiteDiff.log path, :warn, "ERROR (#{error})"
|
61
63
|
when STATUS_FAILURE then
|
62
|
-
SiteDiff
|
63
|
-
puts Diff
|
64
|
+
SiteDiff.log path, :diff_failure, 'CHANGED'
|
65
|
+
puts Diff.terminal_diffy(before, after) if verbose
|
64
66
|
end
|
65
67
|
end
|
66
68
|
|
@@ -68,9 +70,9 @@ class SiteDiff
|
|
68
70
|
def dump(dir)
|
69
71
|
dump_path = File.join(dir, filename)
|
70
72
|
base = File.dirname(dump_path)
|
71
|
-
FileUtils
|
73
|
+
FileUtils.mkdir_p(base) unless File.exist?(base)
|
72
74
|
File.open(dump_path, 'w') do |f|
|
73
|
-
f.write(Diff
|
75
|
+
f.write(Diff.generate_diff_output(self))
|
74
76
|
end
|
75
77
|
end
|
76
78
|
end
|
data/lib/sitediff/rules.rb
CHANGED
@@ -1,65 +1,65 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
require 'sitediff/sanitize/regexp'
|
2
4
|
require 'pathname'
|
3
5
|
require 'set'
|
4
6
|
|
5
7
|
class SiteDiff
|
6
|
-
# Find appropriate rules for a given site
|
7
|
-
class Rules
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
8
|
+
# Find appropriate rules for a given site
|
9
|
+
class Rules
|
10
|
+
def initialize(config, disabled = false)
|
11
|
+
@disabled = disabled
|
12
|
+
@config = config
|
13
|
+
find_sanitization_candidates
|
14
|
+
@rules = Hash.new { |h, k| h[k] = Set.new }
|
15
|
+
end
|
14
16
|
|
15
|
-
|
16
|
-
|
17
|
+
def find_sanitization_candidates
|
18
|
+
@candidates = Set.new
|
17
19
|
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
conf = YAML.load_file(f)
|
22
|
-
@candidates.merge(conf['sanitization'])
|
23
|
-
end
|
24
|
-
end
|
20
|
+
rules_dir = Pathname.new(__FILE__).dirname + 'files' + 'rules'
|
21
|
+
rules_dir.children.each do |f|
|
22
|
+
next unless f.file? && f.extname == '.yaml'
|
25
23
|
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
24
|
+
conf = YAML.load_file(f)
|
25
|
+
@candidates.merge(conf['sanitization'])
|
26
|
+
end
|
27
|
+
end
|
30
28
|
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
29
|
+
def handle_page(tag, html, doc)
|
30
|
+
found = find_rules(html, doc)
|
31
|
+
@rules[tag].merge(found)
|
32
|
+
end
|
35
33
|
|
36
|
-
|
37
|
-
|
38
|
-
|
34
|
+
# Yield a set of rules that seem reasonable for this HTML
|
35
|
+
# assumption: the YAML file is a list of regexp rules only
|
36
|
+
def find_rules(html, doc)
|
37
|
+
@candidates.select do |rule|
|
38
|
+
re = SiteDiff::Sanitizer::Regexp.create(rule)
|
39
|
+
re.applies?(html, doc)
|
40
|
+
end
|
39
41
|
end
|
40
|
-
end
|
41
42
|
|
42
|
-
|
43
|
-
|
44
|
-
|
43
|
+
# Find all rules from all rulesets that apply for all pages
|
44
|
+
def add_config
|
45
|
+
have_both = @rules.include?(:before)
|
45
46
|
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
47
|
+
r1, r2 = *@rules.values_at(:before, :after)
|
48
|
+
if have_both
|
49
|
+
add_section('before', r1 - r2)
|
50
|
+
add_section('after', r2 - r1)
|
51
|
+
add_section(nil, r1 & r2)
|
52
|
+
else
|
53
|
+
add_section(nil, r2)
|
54
|
+
end
|
53
55
|
end
|
54
|
-
end
|
55
56
|
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
rules.each { |r| r['disabled'] = true }
|
57
|
+
def add_section(name, rules)
|
58
|
+
return if rules.empty?
|
59
|
+
|
60
|
+
conf = name ? @config[name] : @config
|
61
|
+
rules.each { |r| r['disabled'] = true } if @disabled
|
62
|
+
conf['sanitization'] = rules.to_a.sort_by { |r| r['title'] }
|
61
63
|
end
|
62
|
-
conf['sanitization'] = rules.to_a.sort_by { |r| r['title'] }
|
63
64
|
end
|
64
65
|
end
|
65
|
-
end
|
data/lib/sitediff/sanitize.rb
CHANGED
@@ -1,3 +1,5 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
require 'sitediff'
|
2
4
|
require 'sitediff/exception'
|
3
5
|
require 'sitediff/sanitize/dom_transform'
|
@@ -6,183 +8,176 @@ require 'nokogiri'
|
|
6
8
|
require 'set'
|
7
9
|
|
8
10
|
class SiteDiff
|
9
|
-
class Sanitizer
|
10
|
-
class InvalidSanitization < SiteDiffException; end
|
11
|
-
|
12
|
-
TOOLS = {
|
13
|
-
|
14
|
-
|
15
|
-
}
|
16
|
-
DOM_TRANSFORMS = Set.new(%w[remove unwrap_root unwrap remove_class])
|
17
|
-
|
18
|
-
def initialize(html, config, opts = {})
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
end
|
23
|
-
|
24
|
-
def sanitize
|
25
|
-
return '' if @html == '' # Quick return on empty input
|
11
|
+
class Sanitizer
|
12
|
+
class InvalidSanitization < SiteDiffException; end
|
13
|
+
|
14
|
+
TOOLS = {
|
15
|
+
array: %w[dom_transform sanitization],
|
16
|
+
scalar: %w[selector remove_spacing]
|
17
|
+
}.freeze
|
18
|
+
DOM_TRANSFORMS = Set.new(%w[remove unwrap_root unwrap remove_class])
|
19
|
+
|
20
|
+
def initialize(html, config, opts = {})
|
21
|
+
@html = html
|
22
|
+
@config = config
|
23
|
+
@opts = opts
|
24
|
+
end
|
26
25
|
|
27
|
-
|
26
|
+
def sanitize
|
27
|
+
return '' if @html == '' # Quick return on empty input
|
28
28
|
|
29
|
-
|
30
|
-
|
31
|
-
dom_transforms
|
32
|
-
regexps
|
29
|
+
@node = Sanitizer.domify(@html)
|
30
|
+
@html = nil
|
33
31
|
|
34
|
-
|
35
|
-
|
32
|
+
remove_spacing
|
33
|
+
selector
|
34
|
+
dom_transforms
|
35
|
+
regexps
|
36
36
|
|
37
|
-
|
38
|
-
|
39
|
-
return false unless rule
|
40
|
-
return false if rule['disabled']
|
37
|
+
@html || Sanitizer.prettify(@node)
|
38
|
+
end
|
41
39
|
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
40
|
+
# Return whether or not we want to keep a rule
|
41
|
+
def want_rule(rule)
|
42
|
+
return false unless rule
|
43
|
+
return false if rule['disabled']
|
46
44
|
|
47
|
-
|
48
|
-
|
45
|
+
# Filter out if path regexp doesn't match
|
46
|
+
if (pathre = rule['path']) && (path = @opts[:path])
|
47
|
+
return ::Regexp.new(pathre).match(path)
|
48
|
+
end
|
49
49
|
|
50
|
-
|
51
|
-
|
52
|
-
# Turn it into an array of hashes.
|
53
|
-
def canonicalize_rule(name)
|
54
|
-
rules = @config[name] or return nil
|
55
|
-
|
56
|
-
if rules[0] && rules[0].respond_to?(:[]) && rules[0]['value']
|
57
|
-
# Already an array
|
58
|
-
elsif rules['value']
|
59
|
-
# Hash, put it in an array
|
60
|
-
rules = [rules]
|
61
|
-
else
|
62
|
-
# Scalar, put it in a hash
|
63
|
-
rules = [{ 'value' => rules }]
|
64
|
-
end
|
50
|
+
true
|
51
|
+
end
|
65
52
|
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
53
|
+
# Canonicalize a simple rule, eg: 'remove_spacing' or 'selector'.
|
54
|
+
# It may be a simple value, or a hash, or an array of hashes.
|
55
|
+
# Turn it into an array of hashes.
|
56
|
+
def canonicalize_rule(name)
|
57
|
+
(rules = @config[name]) || (return nil)
|
58
|
+
|
59
|
+
if rules[0]&.respond_to?(:[]) && rules[0]['value']
|
60
|
+
# Already an array
|
61
|
+
elsif rules['value']
|
62
|
+
# Hash, put it in an array
|
63
|
+
rules = [rules]
|
64
|
+
else
|
65
|
+
# Scalar, put it in a hash
|
66
|
+
rules = [{ 'value' => rules }]
|
67
|
+
end
|
68
|
+
|
69
|
+
want = rules.select { |r| want_rule(r) }
|
70
|
+
return nil if want.empty?
|
71
|
+
raise "Too many matching rules of type #{name}" if want.size > 1
|
72
|
+
|
73
|
+
want.first
|
74
|
+
end
|
71
75
|
|
72
|
-
# Perform 'remove_spacing' action
|
73
|
-
def remove_spacing
|
74
|
-
|
75
|
-
|
76
|
-
end
|
76
|
+
# Perform 'remove_spacing' action
|
77
|
+
def remove_spacing
|
78
|
+
(rule = canonicalize_rule('remove_spacing')) || return
|
79
|
+
Sanitizer.remove_node_spacing(@node) if rule['value']
|
80
|
+
end
|
77
81
|
|
78
|
-
# Perform 'selector' action, to choose a new root
|
79
|
-
def selector
|
80
|
-
|
81
|
-
|
82
|
-
end
|
82
|
+
# Perform 'selector' action, to choose a new root
|
83
|
+
def selector
|
84
|
+
(rule = canonicalize_rule('selector')) || return
|
85
|
+
@node = Sanitizer.select_fragments(@node, rule['value'])
|
86
|
+
end
|
83
87
|
|
84
|
-
# Applies regexps. Also
|
85
|
-
def regexps
|
86
|
-
|
87
|
-
|
88
|
+
# Applies regexps. Also
|
89
|
+
def regexps
|
90
|
+
(rules = @config['sanitization']) || return
|
91
|
+
rules = rules.select { |r| want_rule(r) }
|
88
92
|
|
89
|
-
|
90
|
-
|
93
|
+
rules.map! { |r| Regexp.create(r) }
|
94
|
+
selector, global = rules.partition(&:selector?)
|
91
95
|
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
+
selector.each { |r| r.apply(@node) }
|
97
|
+
@html = Sanitizer.prettify(@node)
|
98
|
+
@node = nil
|
99
|
+
global.each { |r| r.apply(@html) }
|
100
|
+
end
|
96
101
|
|
97
|
-
# Perform DOM transforms
|
98
|
-
def dom_transforms
|
99
|
-
|
100
|
-
|
102
|
+
# Perform DOM transforms
|
103
|
+
def dom_transforms
|
104
|
+
(rules = @config['dom_transform']) || return
|
105
|
+
rules = rules.select { |r| want_rule(r) }
|
101
106
|
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
end
|
107
|
+
rules.each do |rule|
|
108
|
+
transform = DomTransform.create(rule)
|
109
|
+
transform.apply(@node)
|
110
|
+
end
|
111
|
+
end
|
107
112
|
|
108
|
-
##### Implementations of actions #####
|
113
|
+
##### Implementations of actions #####
|
109
114
|
|
110
|
-
# Remove double-spacing inside text nodes
|
111
|
-
def self.remove_node_spacing(node)
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
end
|
115
|
+
# Remove double-spacing inside text nodes
|
116
|
+
def self.remove_node_spacing(node)
|
117
|
+
# remove double spacing, but only inside text nodes (eg not attributes)
|
118
|
+
node.xpath('//text()').each do |el|
|
119
|
+
el.content = el.content.gsub(/ +/, ' ')
|
120
|
+
end
|
121
|
+
end
|
117
122
|
|
118
|
-
# Get a fragment consisting of the elements matching the selector(s)
|
119
|
-
def self.select_fragments(node, sel)
|
120
|
-
|
121
|
-
|
122
|
-
|
123
|
-
|
124
|
-
|
125
|
-
|
126
|
-
|
127
|
-
return node
|
128
|
-
end
|
123
|
+
# Get a fragment consisting of the elements matching the selector(s)
|
124
|
+
def self.select_fragments(node, sel)
|
125
|
+
# When we choose a new root, we always become a DocumentFragment,
|
126
|
+
# and lose any DOCTYPE and such.
|
127
|
+
ns = node.css(sel)
|
128
|
+
node = Nokogiri::HTML.fragment('') unless node.fragment?
|
129
|
+
node.children = ns
|
130
|
+
node
|
131
|
+
end
|
129
132
|
|
130
|
-
# Pretty-print some HTML
|
131
|
-
def self.prettify(obj)
|
132
|
-
|
133
|
-
|
134
|
-
|
135
|
-
|
133
|
+
# Pretty-print some HTML
|
134
|
+
def self.prettify(obj)
|
135
|
+
@stylesheet ||= begin
|
136
|
+
stylesheet_path = File.join(SiteDiff::FILES_DIR, 'pretty_print.xsl')
|
137
|
+
Nokogiri::XSLT(File.read(stylesheet_path))
|
138
|
+
end
|
136
139
|
|
137
|
-
|
138
|
-
|
139
|
-
|
140
|
-
|
140
|
+
# Pull out the html element's children
|
141
|
+
# The obvious way to do this is to iterate over pretty.css('html'),
|
142
|
+
# but that tends to segfault Nokogiri
|
143
|
+
str = @stylesheet.apply_to(to_document(obj))
|
141
144
|
|
142
|
-
|
145
|
+
# There's a lot of cruft left over,that we don't want
|
143
146
|
|
144
|
-
|
145
|
-
|
146
|
-
|
147
|
-
|
147
|
+
# Remove xml declaration and <html> tags
|
148
|
+
str.sub!(/\A<\?xml.*$\n/, '')
|
149
|
+
str.sub!(/\A^<html>$\n/, '')
|
150
|
+
str.sub!(%r{</html>\n\Z}, '')
|
148
151
|
|
149
|
-
|
150
|
-
|
151
|
-
|
152
|
+
# Remove top-level indentation
|
153
|
+
indent = /\A(\s*)/.match(str)[1].size
|
154
|
+
str.gsub!(/^\s{,#{indent}}/, '')
|
152
155
|
|
153
|
-
|
154
|
-
|
156
|
+
# Remove blank lines
|
157
|
+
str.gsub!(/^\s*$\n/, '')
|
155
158
|
|
156
|
-
|
157
|
-
end
|
159
|
+
str
|
160
|
+
end
|
158
161
|
|
159
|
-
# Parse HTML into a node
|
160
|
-
def self.domify(str, force_doc = false)
|
161
|
-
|
162
|
-
|
163
|
-
|
164
|
-
|
165
|
-
|
166
|
-
end
|
162
|
+
# Parse HTML into a node
|
163
|
+
def self.domify(str, force_doc = false)
|
164
|
+
if force_doc || /<!DOCTYPE/.match(str[0, 512])
|
165
|
+
Nokogiri::HTML(str)
|
166
|
+
else
|
167
|
+
Nokogiri::HTML.fragment(str)
|
168
|
+
end
|
169
|
+
end
|
167
170
|
|
168
|
-
# Force this object to be a document, so we can apply a stylesheet
|
169
|
-
def self.to_document(obj)
|
170
|
-
|
171
|
-
|
172
|
-
|
173
|
-
|
174
|
-
|
175
|
-
|
176
|
-
|
177
|
-
|
178
|
-
doc = Nokogiri::HTML('<html><body>')
|
179
|
-
doc.at('body').children = obj.children
|
180
|
-
return doc
|
171
|
+
# Force this object to be a document, so we can apply a stylesheet
|
172
|
+
def self.to_document(obj)
|
173
|
+
if Nokogiri::XML::Document == obj.class || Nokogiri::HTML::Document == obj.class
|
174
|
+
obj
|
175
|
+
# node or fragment
|
176
|
+
elsif Nokogiri::XML::Node == obj.class || Nokogiri::HTML::DocumentFragment == obj.class
|
177
|
+
domify(obj.to_s, true)
|
178
|
+
else
|
179
|
+
to_document(domify(obj, false))
|
180
|
+
end
|
181
181
|
end
|
182
|
-
else
|
183
|
-
return to_document(domify(obj))
|
184
182
|
end
|
185
183
|
end
|
186
|
-
|
187
|
-
end
|
188
|
-
end
|