sitediff 0.0.1 → 0.0.2

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,33 @@
# Candidate sanitization rules for Drupal sites.
#
# Each rule is a regular expression (`pattern`) with an optional replacement
# (`substitute`) and an optional CSS `selector` naming the elements it targets.
# A rule without a replacement has no `substitute` key at all.
#
# NOTE(review): the replacement key is spelled `substitute` in every rule,
# which is the key the regexp sanitizer has been reading (rule['substitute']);
# confirm against sitediff/sanitize/regexp.
# NOTE(review): the three rules whose replacement is just '\1' match a
# trailing double quote but do not put it back - confirm this is intended.
sanitization:
  - title: Strip Drupal.settings
    selector: script
    pattern: '^(<script>)?jQuery.extend\(Drupal.settings.*$'
  - title: Strip form build ID
    selector: input
    pattern: 'name="form_build_id" value="form-[-\w]{43}"'
    substitute: 'name="form_build_id" value="form-DRUPAL_FORM_BUILD_ID"'
  - title: Strip view DOM ID
    pattern: '(class="view .*) view-dom-id-[a-f0-9]{32}"'
    substitute: '\1 view-dom-id-DRUPAL_VIEW_DOM_ID"'
  - title: Strip CSS aggregation filenames
    selector: link[rel=stylesheet]
    pattern: '(href="[^"]*/files/css/css_)[-\w]{43}\.css"'
    substitute: '\1DRUPAL_AGGREGATED_CSS.css"'
  - title: Strip JS aggregation filenames
    selector: script
    pattern: '(src="[^"]*/files/js/js_)[-\w]{43}\.js"'
    substitute: '\1DRUPAL_AGGREGATED_JS.js"'
  - title: Strip CSS/JS cache IDs
    selector: style, script
    pattern: '("[^"]*\.(js|css))\?[a-z0-9]{6}"'
    substitute: '\1'
  - title: Strip IE CSS/JS cache IDs
    pattern: '("[^"]*ie\d?\.(js|css))\?[a-z0-9]{6}"'
    substitute: '\1'
  - title: Strip Drupal JS version tags
    selector: script
    # The '?' that starts the query string must be escaped; unescaped it
    # makes the whole capture group optional.
    pattern: '(src="[^"]*/misc/\w+\.js)\?v=\d+\.\d+"'
    substitute: '\1'
  - title: Strip domain names from absolute URLs
    pattern: 'http:\/\/[a-zA-Z0-9.:-]+'
    substitute: '__domain__'
@@ -0,0 +1,13 @@
<%# Side-by-side comparison page: shows the `before` and `after` URLs for `path` in two iframes. %>
<%# NOTE(review): there is no DOCTYPE; the `height: 100%` iframe rule for #sidebyside appears to rely on quirks-mode rendering - confirm before adding one. %>
<%# NOTE(review): `path`, `before` and `after` are inserted without HTML escaping - confirm the caller passes safe values. %>
<html>
  <head>
    <%# The encoding declaration comes first so it is not pushed down by the inlined stylesheet. %>
    <meta charset="utf-8" />
    <title>Comparison for <%= path %></title>
    <style>
      <%= SiteDiff::Diff.css %>
    </style>
  </head>
  <body id="sidebyside">
    <iframe src="<%= before %>"></iframe>
    <iframe src="<%= after %>"></iframe>
  </body>
</html>
@@ -33,6 +33,7 @@
33
33
  background-color: salmon;
34
34
  }
35
35
  .sitediff .before-col,
36
+ .sitediff .both-col,
36
37
  .sitediff .after-col,
37
38
  .sitediff .diff-stat-col {
38
39
  width: 10%;
@@ -40,3 +41,13 @@
40
41
  .sitediff .path-col {
41
42
  width: 55%;
42
43
  }
44
+
45
+ #sidebyside {
46
+ margin: 0;
47
+ }
48
+ #sidebyside iframe {
49
+ float: left;
50
+ height: 100%;
51
+ width: 50%;
52
+ border: 0;
53
+ }
@@ -1,8 +1,10 @@
1
- require 'fileutils'
1
+ require 'sitediff'
2
+ require 'sitediff/diff'
2
3
  require 'digest/sha1'
4
+ require 'fileutils'
3
5
 
4
6
  class SiteDiff
5
- class Result < Struct.new(:path, :before, :after, :error)
7
+ class Result < Struct.new(:path, :before, :after, :error, :verbose)
6
8
  STATUS_SUCCESS = 0 # Identical before and after
7
9
  STATUS_FAILURE = 1 # Different before and after
8
10
  STATUS_ERROR = 2 # Couldn't fetch page
@@ -30,8 +32,9 @@ class SiteDiff
30
32
  end
31
33
 
32
34
  # Printable URL
33
- def url(prefix)
34
- prefix.to_s + path
35
+ def url(tag, prefix, cache)
36
+ base = cache.read_tags.include?(tag) ? "/cache/#{tag}" : prefix
37
+ base.to_s + path
35
38
  end
36
39
 
37
40
  # Filename to store diff
@@ -49,15 +52,15 @@ class SiteDiff
49
52
  end
50
53
 
51
54
  # Log the result to the terminal
52
- def log
55
+ def log(verbose=true)
53
56
  case status
54
57
  when STATUS_SUCCESS then
55
- SiteDiff::log path, :success, 'SUCCESS'
58
+ SiteDiff::log path, :diff_success, 'SUCCESS'
56
59
  when STATUS_ERROR then
57
- SiteDiff::log path, :error, "ERROR (#{error})"
60
+ SiteDiff::log path, :warn, "ERROR (#{error})"
58
61
  when STATUS_FAILURE then
59
- SiteDiff::log path, :failure, "FAILURE"
60
- puts Diff::terminal_diffy(before, after)
62
+ SiteDiff::log path, :diff_failure, "FAILURE"
63
+ puts Diff::terminal_diffy(before, after) if verbose
61
64
  end
62
65
  end
63
66
 
@@ -0,0 +1,65 @@
require 'sitediff/sanitize/regexp'
require 'pathname'
require 'set'
require 'yaml'
4
+
class SiteDiff
  # Find appropriate rules for a given site.
  #
  # Candidate rules are loaded from the YAML files in the files/rules
  # directory next to this source file. Pages are fed in through #handle_page,
  # which records the candidates that apply under a tag; #add_config then
  # writes the collected rules into the configuration hash.
  class Rules
    # @param config [Hash] configuration that #add_config writes into
    # @param disabled [Boolean] when true, rules are written out with
    #   'disabled' => true
    def initialize(config, disabled = false)
      @disabled = disabled
      @config = config
      find_sanitization_candidates
      # tag => Set of rule hashes found for that tag
      @rules = Hash.new { |h, k| h[k] = Set.new }
    end

    # Load the 'sanitization' entries of every *.yaml file in the rules
    # directory into @candidates.
    def find_sanitization_candidates
      @candidates = Set.new

      rules_dir = Pathname.new(__FILE__).dirname + 'files' + 'rules'
      rules_dir.children.each do |f|
        next unless f.file? && f.extname == '.yaml'

        conf = YAML.load_file(f)
        # An empty file, or one without a 'sanitization' key, adds nothing
        found = conf.is_a?(Hash) ? conf['sanitization'] : nil
        @candidates.merge(found) if found
      end
    end

    # Record the candidate rules that apply to one page under +tag+.
    #
    # @param tag the key to file the rules under; #add_config reads the
    #   :before and :after keys
    # @param html [String] passed through to #find_rules
    # @param doc passed through to #find_rules
    def handle_page(tag, html, doc)
      found = find_rules(html, doc)
      @rules[tag].merge(found)
    end

    # Return the candidate rules that seem reasonable for this HTML.
    # Assumption: the YAML files contain regexp rules only.
    def find_rules(html, doc)
      @candidates.select do |rule|
        re = SiteDiff::Sanitizer::Regexp.create(rule)
        re.applies?(html, doc)
      end
    end

    # Write the collected rules into the configuration.
    #
    # When rules were collected for :before, rules seen on one side only go
    # into that side's section and rules seen on both sides go to the top
    # level. Otherwise all :after rules go to the top level.
    def add_config
      after = @rules[:after]

      # Use key? so that asking does not create an empty :before entry,
      # which would make a later call take the two-sided branch.
      if @rules.key?(:before)
        before = @rules[:before]
        add_section('before', before - after)
        add_section('after', after - before)
        add_section(nil, before & after)
      else
        add_section(nil, after)
      end
    end

    # Store +rules+, sorted by title, as the 'sanitization' list of a section.
    #
    # @param name [String, nil] section name, or nil for the top level
    # @param rules [Enumerable<Hash>] rules to store; nothing is written
    #   when this is empty
    def add_section(name, rules)
      return if rules.empty?

      # Create the section if the configuration does not have it yet
      conf = name ? (@config[name] ||= {}) : @config

      # Work on copies: the rule hashes are shared with @candidates and are
      # members of Sets, so they must not be modified in place.
      rules = rules.map { |r| r.merge('disabled' => true) } if @disabled

      conf['sanitization'] = rules.to_a.sort_by { |r| r['title'].to_s }
    end
  end
end
@@ -1,193 +1,188 @@
1
+ require 'sitediff'
2
+ require 'sitediff/exception'
3
+ require 'sitediff/sanitize/dom_transform'
4
+ require 'sitediff/sanitize/regexp'
1
5
  require 'nokogiri'
2
6
  require 'set'
3
7
 
class SiteDiff
  # Cleans up the HTML of one page so that it can be compared.
  #
  # A Sanitizer is built from the raw HTML, a configuration hash and an
  # options hash. #sanitize applies, in order: 'remove_spacing', 'selector',
  # the 'dom_transform' rules and the 'sanitization' regexp rules, and returns
  # pretty-printed HTML.
  class Sanitizer
    class InvalidSanitization < SiteDiffException; end

    TOOLS = {
      :array => %w[dom_transform sanitization],
      :scalar => %w[selector remove_spacing],
    }
    DOM_TRANSFORMS = Set.new(%w[remove unwrap_root unwrap remove_class])

    # @param html [String] the HTML to sanitize
    # @param config [Hash] rules, keyed by 'remove_spacing', 'selector',
    #   'dom_transform' and 'sanitization'
    # @param opts [Hash] options; :path is matched against a rule's 'path'
    def initialize(html, config, opts = {})
      @html = html
      @config = config
      @opts = opts
    end

    # Run every configured action and return the sanitized HTML.
    #
    # @return [String] pretty-printed HTML, or '' for empty input
    def sanitize
      return '' if @html == '' # Quick return on empty input

      # From here on the page is held either as a DOM (@node) or as a
      # string (@html), never both.
      @node, @html = Sanitizer.domify(@html), nil

      remove_spacing
      selector
      dom_transforms
      regexps

      # regexps leaves a string in @html when it ran; otherwise the page is
      # still a DOM and has to be pretty-printed here.
      return @html || Sanitizer.prettify(@node)
    end

    # Return whether or not we want to keep a rule.
    #
    # A rule is dropped when it is nil or has 'disabled' set. When both the
    # rule's 'path' and the :path option are present, the result is that of
    # matching the path against the rule's 'path' regexp.
    def want_rule(rule)
      return false unless rule
      return false if rule['disabled']

      # Filter out if path regexp doesn't match
      if (pathre = rule['path']) && (path = @opts[:path])
        return ::Regexp.new(pathre).match(path)
      end

      return true
    end

    # Canonicalize a simple rule, eg: 'remove_spacing' or 'selector'.
    # It may be a simple value, or a hash, or an array of hashes.
    #
    # @param name [String] configuration key of the rule
    # @return [Hash, nil] the single wanted rule as a hash with a 'value'
    #   key, or nil when none is configured or wanted
    # @raise [RuntimeError] when more than one rule of this type is wanted
    def canonicalize_rule(name)
      rules = @config[name]
      return nil unless rules

      # Decide by type instead of probing with #[]: `true[0]` raises, and a
      # String answers str['value'] with a substring match.
      rule_hash = lambda { |obj| obj.is_a?(Hash) && obj.key?('value') }

      if rules.is_a?(Array) && rule_hash.call(rules.first)
        # Already an array of hashes
      elsif rule_hash.call(rules)
        # Hash, put it in an array
        rules = [rules]
      else
        # Scalar, put it in a hash
        rules = [{ 'value' => rules }]
      end

      want = rules.select { |r| want_rule(r) }
      return nil if want.empty?
      raise "Too many matching rules of type #{name}" if want.size > 1
      return want.first
    end

    # Perform 'remove_spacing' action
    def remove_spacing
      rule = canonicalize_rule('remove_spacing') or return
      Sanitizer.remove_node_spacing(@node) if rule['value']
    end

    # Perform 'selector' action, to choose a new root
    def selector
      rule = canonicalize_rule('selector') or return
      @node = Sanitizer.select_fragments(@node, rule['value'])
    end

    # Apply the 'sanitization' regexp rules.
    #
    # Rules with a selector are applied to the DOM first. The DOM is then
    # pretty-printed into @html (and @node is dropped), and the remaining
    # rules are applied to that string.
    def regexps
      rules = @config['sanitization'] or return
      rules = rules.select { |r| want_rule(r) }

      # Regexp here is looked up inside Sanitizer first, not as ::Regexp
      rules.map! { |r| Regexp.create(r) }
      selector, global = rules.partition { |r| r.selector? }

      selector.each { |r| r.apply(@node) }
      @html, @node = Sanitizer.prettify(@node), nil
      global.each { |r| r.apply(@html) }
    end

    # Perform DOM transforms
    def dom_transforms
      rules = @config['dom_transform'] or return
      rules = rules.select { |r| want_rule(r) }

      rules.each do |rule|
        transform = DomTransform.create(rule)
        transform.apply(@node)
      end
    end

    ##### Implementations of actions #####

    # Remove double-spacing inside text nodes
    def self.remove_node_spacing(node)
      # remove double spacing, but only inside text nodes (eg not attributes)
      # NOTE(review): '//text()' is an absolute XPath; confirm that it reaches
      # the text nodes when +node+ is a fragment rather than a document.
      node.xpath('//text()').each do |el|
        el.content = el.content.gsub(/  +/, ' ')
      end
    end

    # Get a fragment consisting of the elements matching the selector(s)
    def self.select_fragments(node, sel)
      # When we choose a new root, we always become a DocumentFragment,
      # and lose any DOCTYPE and such.
      ns = node.css(sel)
      unless node.fragment?
        node = Nokogiri::HTML.fragment('')
      end
      node.children = ns
      return node
    end

    # Pretty-print some HTML
    #
    # @param obj a document, node, fragment or HTML string
    # @return [String] the pretty-printed HTML
    def self.prettify(obj)
      # The stylesheet is loaded once and kept on the class
      @stylesheet ||= begin
        stylesheet_path = File.join(SiteDiff::FILES_DIR, 'pretty_print.xsl')
        Nokogiri::XSLT(File.read(stylesheet_path))
      end

      # Pull out the html element's children
      # The obvious way to do this is to iterate over pretty.css('html'),
      # but that tends to segfault Nokogiri
      str = @stylesheet.apply_to(to_document(obj))

      # There's a lot of cruft left over, that we don't want

      # Remove xml declaration and <html> tags
      str.sub!(/\A<\?xml.*$\n/, '')
      str.sub!(/\A^<html>$\n/, '')
      str.sub!(%r[</html>\n\Z], '')

      # Remove top-level indentation
      indent = /\A(\s*)/.match(str)[1].size
      str.gsub!(/^\s{,#{indent}}/, '')

      # Remove blank lines
      str.gsub!(/^\s*$\n/, '')

      return str
    end

    # Parse HTML into a node
    #
    # The string is parsed as a full document when +force_doc+ is set or a
    # DOCTYPE appears in its first 512 characters, and as a fragment
    # otherwise.
    def self.domify(str, force_doc = false)
      if force_doc || /<!DOCTYPE/.match(str[0, 512])
        return Nokogiri::HTML(str)
      else
        return Nokogiri::HTML.fragment(str)
      end
    end

    # Force this object to be a document, so we can apply a stylesheet
    def self.to_document(obj)
      if Nokogiri::XML::Document === obj
        return obj
      elsif Nokogiri::XML::Node === obj # node or fragment
        # Serialize and re-parse. Moving obj.children into a fresh
        # '<html><body>' document ought to work, and would be faster,
        # but seems to segfault Nokogiri.
        return domify(obj.to_s, true)
      else
        return to_document(domify(obj))
      end
    end

  end
end