sitediff 0.0.1 → 0.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,33 @@
1
+ sanitization:
2
+ - title: Strip Drupal.settings
3
+ selector: script
4
+ pattern: '^(<script>)?jQuery.extend\(Drupal.settings.*$'
5
+ - title: Strip form build ID
6
+ selector: input
7
+ pattern: 'name="form_build_id" value="form-[-\w]{43}"'
8
+ substitution: 'name="form_build_id" value="form-DRUPAL_FORM_BUILD_ID"'
9
+ - title: Strip view DOM ID
10
+ pattern: '(class="view .*) view-dom-id-[a-f0-9]{32}"'
11
+ substitution: '\1 view-dom-id-DRUPAL_VIEW_DOM_ID"'
12
+ - title: Strip CSS aggregation filenames
13
+ selector: link[rel=stylesheet]
14
+ pattern: '(href="[^"]*/files/css/css_)[-\w]{43}\.css"'
15
+ substitution: '\1DRUPAL_AGGREGATED_CSS.css"'
16
+ - title: Strip JS aggregation filenames
17
+ selector: script
18
+ pattern: '(src="[^"]*/files/js/js_)[-\w]{43}\.js"'
19
+ substitution: '\1DRUPAL_AGGREGATED_JS.js"'
20
# Drops the "?abc123" cache-busting suffix from quoted .js/.css URLs.
# The pattern consumes the closing quote, so the substitution restores it.
- title: Strip CSS/JS cache IDs
  selector: style, script
  pattern: '("[^"]*\.(js|css))\?[a-z0-9]{6}"'
  substitution: '\1"'
24
# Same as the CSS/JS cache ID rule, for IE-specific files (ie.css, ie7.js, ...).
# The pattern consumes the closing quote, so the substitution restores it.
- title: Strip IE CSS/JS cache IDs
  pattern: '("[^"]*ie\d?\.(js|css))\?[a-z0-9]{6}"'
  substitution: '\1"'
27
# Drops the "?v=1.2" version tag from /misc/*.js script URLs.
# The "?" is escaped so that it matches a literal question mark rather than
# making the capture group optional; the closing quote consumed by the
# pattern is restored by the substitution.
- title: Strip Drupal JS version tags
  selector: script
  pattern: '(src="[^"]*/misc/\w+\.js)\?v=\d+\.\d+"'
  substitution: '\1"'
31
+ - title: Strip domain names from absolute URLs
32
+ pattern: 'http:\/\/[a-zA-Z0-9.:-]+'
33
+ substitute: '__domain__'
@@ -0,0 +1,13 @@
1
+ <html>
2
+ <head>
3
+ <title>Comparison for <%= path %></title>
4
+ <style>
5
+ <%= SiteDiff::Diff.css %>
6
+ </style>
7
+ <meta charset="utf-8" />
8
+ </head>
9
+ <body id="sidebyside">
10
+ <iframe src="<%= before %>"></iframe>
11
+ <iframe src="<%= after %>"></iframe>
12
+ </body>
13
+ </html>
@@ -33,6 +33,7 @@
33
33
  background-color: salmon;
34
34
  }
35
35
  .sitediff .before-col,
36
+ .sitediff .both-col,
36
37
  .sitediff .after-col,
37
38
  .sitediff .diff-stat-col {
38
39
  width: 10%;
@@ -40,3 +41,13 @@
40
41
  .sitediff .path-col {
41
42
  width: 55%;
42
43
  }
44
+
45
+ #sidebyside {
46
+ margin: 0;
47
+ }
48
+ #sidebyside iframe {
49
+ float: left;
50
+ height: 100%;
51
+ width: 50%;
52
+ border: 0;
53
+ }
@@ -1,8 +1,10 @@
1
- require 'fileutils'
1
+ require 'sitediff'
2
+ require 'sitediff/diff'
2
3
  require 'digest/sha1'
4
+ require 'fileutils'
3
5
 
4
6
  class SiteDiff
5
- class Result < Struct.new(:path, :before, :after, :error)
7
+ class Result < Struct.new(:path, :before, :after, :error, :verbose)
6
8
  STATUS_SUCCESS = 0 # Identical before and after
7
9
  STATUS_FAILURE = 1 # Different before and after
8
10
  STATUS_ERROR = 2 # Couldn't fetch page
@@ -30,8 +32,9 @@ class SiteDiff
30
32
  end
31
33
 
32
34
  # Printable URL
33
# Printable URL for this result's path.
# Uses the "/cache/<tag>" base when the cache reports that tag, otherwise the
# given prefix.
def url(tag, prefix, cache)
  if cache.read_tags.include?(tag)
    "/cache/#{tag}" + path
  else
    prefix.to_s + path
  end
end
36
39
 
37
40
  # Filename to store diff
@@ -49,15 +52,15 @@ class SiteDiff
49
52
  end
50
53
 
51
54
  # Log the result to the terminal
52
# Log the result to the terminal.
# On failure the terminal diff is also printed, unless verbose is false.
def log(verbose=true)
  outcome = status
  if STATUS_SUCCESS == outcome
    SiteDiff.log(path, :diff_success, 'SUCCESS')
  elsif STATUS_ERROR == outcome
    SiteDiff.log(path, :warn, "ERROR (#{error})")
  elsif STATUS_FAILURE == outcome
    SiteDiff.log(path, :diff_failure, "FAILURE")
    puts Diff.terminal_diffy(before, after) if verbose
  end
end
63
66
 
@@ -0,0 +1,65 @@
1
+ require 'sitediff/sanitize/regexp'
2
+ require 'pathname'
3
+ require 'set'
4
+
5
class SiteDiff
  # Find appropriate rules for a given site
  class Rules
    # config   - Hash that add_config writes the chosen rules into
    # disabled - when true, the rules written to config carry 'disabled' => true
    def initialize(config, disabled = false)
      @disabled = disabled
      @config = config
      find_sanitization_candidates
      # tag (add_config reads :before and :after) => Set of rules seen for it
      @rules = Hash.new { |h, k| h[k] = Set.new }
    end

    # Load every candidate rule from the *.yaml files in files/rules, next to
    # this source file.
    def find_sanitization_candidates
      @candidates = Set.new

      rules_dir = Pathname.new(__FILE__).dirname + 'files' + 'rules'
      rules_dir.children.each do |f|
        next unless f.file? && f.extname == '.yaml'
        conf = YAML.load_file(f)
        # An empty file, or one without a 'sanitization' list, adds nothing
        found = conf.is_a?(Hash) ? conf['sanitization'] : nil
        @candidates.merge(found) if found
      end
    end

    # Record, under the given tag, the candidate rules that apply to a page
    def handle_page(tag, html, doc)
      found = find_rules(html, doc)
      @rules[tag].merge(found)
    end

    # Yield a set of rules that seem reasonable for this HTML
    # assumption: the YAML file is a list of regexp rules only
    def find_rules(html, doc)
      @candidates.select do |rule|
        re = SiteDiff::Sanitizer::Regexp.create(rule)
        re.applies?(html, doc)
      end
    end

    # Find all rules from all rulesets that apply for all pages.
    # When both sides were seen, rules that apply to only one side go into the
    # 'before' / 'after' config sections and shared rules go to the top level.
    def add_config
      have_both = @rules.include?(:before)

      r1, r2 = *@rules.values_at(:before, :after)
      if have_both
        # Work out all three groups before any section is written
        only_before = r1 - r2
        only_after = r2 - r1
        shared = r1 & r2
        add_section('before', only_before)
        add_section('after', only_after)
        add_section(nil, shared)
      else
        add_section(nil, r2)
      end
    end

    # Store the rules, sorted by title, as the 'sanitization' list of the
    # named config section (or of the top-level config when name is nil).
    # Does nothing when there are no rules.
    def add_section(name, rules)
      return if rules.empty?
      # Create the section on demand rather than failing on a missing key
      conf = name ? (@config[name] ||= {}) : @config
      if @disabled
        # Flag copies: the originals are Set members, and mutating a Hash
        # that is a Set member changes its hash code behind the Set's back.
        rules = rules.map { |r| r.merge('disabled' => true) }
      end
      # to_s keeps the sort from raising on a rule without a title
      conf['sanitization'] = rules.to_a.sort_by { |r| r['title'].to_s }
    end
  end
end
@@ -1,193 +1,188 @@
1
+ require 'sitediff'
2
+ require 'sitediff/exception'
3
+ require 'sitediff/sanitize/dom_transform'
4
+ require 'sitediff/sanitize/regexp'
1
5
  require 'nokogiri'
2
6
  require 'set'
3
7
 
4
8
  class SiteDiff
5
- module Sanitize
6
- class InvalidSanitization < Exception; end
7
-
8
- TOOLS = {
9
- :array => %w[dom_transform sanitization],
10
- :scalar => %w[selector remove_spacing],
11
- }
12
- DOM_TRANSFORMS = Set.new(%w[remove unwrap_root unwrap remove_class])
13
-
14
- module_function
15
-
16
- # Performs dom transformations.
17
- #
18
- # Currently supported transforms:
19
- #
20
- # * { :type => "unwrap_root" }
21
- # * { :type => "unwrap", :selector => "div.field-item" }
22
- # * { :type => "remove", :selector => "div.extra-stuff" }
23
- #
24
- # @arg node - Nokogiri document or Node
25
- # @arg rules - array of dom_transform rules
26
- # @return - transformed Nokogiri document node
27
- def perform_dom_transforms(node, rules)
28
- rules.each do |rule|
29
- type = rule['type'] or
30
- raise InvalidSanitization, "DOM transform needs a type"
31
- DOM_TRANSFORMS.include?(type) or
32
- raise InvalidSanitization, "No DOM transform named #{type}"
33
-
34
- meth = 'transform_' + type
35
-
36
- if sels = rule['selector']
37
- sels = [sels].flatten # Either array or scalar is fine
38
- # Call method for each node the selectors find
39
- sels.each do |sel|
40
- node.css(sel).each { |e| send(meth, rule, e) }
41
- end
42
- else
43
- send(meth, rule, node)
44
- end
45
- end
46
- end
9
+ class Sanitizer
10
+ class InvalidSanitization < SiteDiffException; end
11
+
12
+ TOOLS = {
13
+ :array => %w[dom_transform sanitization],
14
+ :scalar => %w[selector remove_spacing],
15
+ }
16
+ DOM_TRANSFORMS = Set.new(%w[remove unwrap_root unwrap remove_class])
17
+
18
# Keep hold of the raw HTML, the sanitization config and the options
# (want_rule reads opts[:path] to filter rules by path).
def initialize(html, config, opts = {})
  @html, @config, @opts = html, config, opts
end
47
23
 
48
- def transform_remove(rule, el)
49
- el.remove
50
- end
51
- def transform_unwrap(rule, el)
52
- el.add_next_sibling(el.children)
53
- el.remove
54
- end
55
- def transform_remove_class(rule, el)
56
- # Must call remove_class on a NodeSet!
57
- ns = Nokogiri::XML::NodeSet.new(el.document, [el])
58
- [rule['class']].flatten.each do |class_name|
59
- ns.remove_class(class_name)
60
- end
61
- end
62
- def transform_unwrap_root(rule, node)
63
- node.children.size == 1 or
64
- raise InvalidSanitization, "Multiple root elements in unwrap_root"
65
- node.children = node.children[0].children
66
- end
24
# Run every configured sanitization step over the HTML and return the
# pretty-printed result.
def sanitize
  # Quick return on empty input
  return '' if @html == ''

  # From here on the steps work on the parsed node, not the raw string
  @node = Sanitizer.domify(@html)
  @html = nil

  remove_spacing
  selector
  dom_transforms
  regexps

  # regexps stores its output in @html when sanitization rules are
  # configured; otherwise the node still has to be pretty-printed
  @html || Sanitizer.prettify(@node)
end
106
36
 
107
- # Pull out the html element's children
108
- # The obvious way to do this is to iterate over pretty.css('html'),
109
- # but that tends to segfault Nokogiri
110
- str = @stylesheet.apply_to(to_document(obj))
37
# Return whether or not we want to keep a rule.
# Missing and disabled rules are dropped. A rule with a 'path' regexp is kept
# only if it matches the :path option; without that option the path filter
# is skipped.
def want_rule(rule)
  return false if !rule || rule['disabled']

  pathre = rule['path']
  path = pathre && @opts[:path]
  return true unless path

  ::Regexp.new(pathre).match(path)
end
116
49
 
117
- # Remove top-level indentation
118
- indent = /\A(\s*)/.match(str)[1].size
119
- str.gsub!(/^\s{,#{indent}}/, '')
50
# Canonicalize a simple rule, eg: 'remove_spacing' or 'selector'.
# The configured value may be a simple value, a hash with a 'value' key, or an
# array of such hashes. It is turned into an array of hashes, filtered with
# want_rule, and the single remaining rule is returned.
#
# Returns nil when nothing is configured or no rule is wanted.
# Raises RuntimeError when more than one rule is wanted.
def canonicalize_rule(name)
  rules = @config[name] or return nil

  # Decide by type rather than by indexing: `true[0]` raises NoMethodError,
  # String#[] with 'value' is a substring search, and Array#[] rejects a
  # String index.
  if rules.is_a?(Array) && rules.first.is_a?(Hash) && rules.first.key?('value')
    # Already an array of hashes
  elsif rules.is_a?(Hash) && rules.key?('value')
    # Hash, put it in an array (key? keeps an explicit 'value' => false)
    rules = [rules]
  else
    # Scalar, put it in a hash
    rules = [{ 'value' => rules }]
  end

  want = rules.select { |r| want_rule(r) }
  return nil if want.empty?
  raise "Too many matching rules of type #{name}" if want.size > 1
  want.first
end
123
71
 
124
- return str
125
- end
72
# Perform 'remove_spacing' action: collapse repeated spaces in the node's
# text when the matching rule has a truthy value.
def remove_spacing
  rule = canonicalize_rule('remove_spacing')
  return unless rule && rule['value']
  Sanitizer.remove_node_spacing(@node)
end
126
77
 
127
- def remove_spacing(doc)
128
- # remove double spacing, but only inside text nodes (eg not attributes)
129
- doc.xpath('//text()').each do |node|
130
- node.content = node.content.gsub(/ +/, ' ')
131
- end
132
- end
78
# Perform 'selector' action, to choose a new root: the node is replaced by
# the fragment of elements matching the rule's value.
def selector
  rule = canonicalize_rule('selector')
  @node = Sanitizer.select_fragments(@node, rule['value']) if rule
end
133
83
 
134
- # Do one regexp transformation on a string
135
- def substitute(str, rule)
136
- #FIXME escape forward slashes, right now we are escaping them in YAML!
137
- str.gsub!(/#{rule['pattern']}/, rule['substitute'] || '' )
138
- str
139
- end
84
# Apply the regexp rules from the 'sanitization' config section.
# Rules that have a selector are applied to the DOM node first. The node is
# then pretty-printed into @html (and dropped), and the remaining rules are
# applied to that string.
def regexps
  configured = @config['sanitization']
  return unless configured

  wanted = configured.select { |r| want_rule(r) }.map { |r| Regexp.create(r) }
  with_selector, global = wanted.partition(&:selector?)

  with_selector.each { |r| r.apply(@node) }

  @html = Sanitizer.prettify(@node)
  @node = nil
  global.each { |r| r.apply(@html) }
end
176
96
 
177
- def sanitize(str, config)
178
- return '' if str == ''
97
# Perform DOM transforms: build a transform for each wanted rule in the
# 'dom_transform' config section and apply it to the node, in order.
def dom_transforms
  configured = @config['dom_transform']
  return unless configured

  wanted = configured.select { |r| want_rule(r) }
  wanted.each { |rule| DomTransform.create(rule).apply(@node) }
end
107
+
108
+ ##### Implementations of actions #####
109
+
110
# Remove double-spacing inside text nodes.
# Only text nodes are rewritten, so attribute values keep their spacing.
def self.remove_node_spacing(node)
  node.xpath('//text()').each { |text| text.content = text.content.squeeze(' ') }
end
117
+
118
# Get a fragment consisting of the elements matching the selector(s).
# Choosing a new root always yields a DocumentFragment, so any DOCTYPE and
# such is lost. A node that already is a fragment is reused.
def self.select_fragments(node, sel)
  matched = node.css(sel)
  root = node.fragment? ? node : Nokogiri::HTML.fragment('')
  root.children = matched
  root
end
129
+
130
# Pretty-print some HTML.
# The object is forced into a document, run through the pretty_print.xsl
# stylesheet, and the result is stripped of the wrapper the stylesheet adds.
def self.prettify(obj)
  # The stylesheet is compiled on first use and then reused
  @stylesheet ||= Nokogiri::XSLT(
    File.read(File.join(SiteDiff::FILES_DIR, 'pretty_print.xsl'))
  )

  # Pull out the html element's children
  # The obvious way to do this is to iterate over pretty.css('html'),
  # but that tends to segfault Nokogiri
  str = @stylesheet.apply_to(to_document(obj))

  # There's a lot of cruft left over that we don't want:
  # the xml declaration and the <html> tags
  [/\A<\?xml.*$\n/, /\A^<html>$\n/, %r[</html>\n\Z]].each do |cruft|
    str.sub!(cruft, '')
  end

  # Remove top-level indentation
  indent = str[/\A\s*/].size
  str.gsub!(/^\s{,#{indent}}/, '')

  # Remove blank lines
  str.gsub!(/^\s*$\n/, '')

  str
end
158
+
159
# Parse HTML into a node.
# The string is parsed as a full document when force_doc is set or a DOCTYPE
# appears within its first 512 characters, and as a fragment otherwise.
def self.domify(str, force_doc = false)
  full_document = force_doc || /<!DOCTYPE/.match(str[0, 512])
  full_document ? Nokogiri::HTML(str) : Nokogiri::HTML.fragment(str)
end
167
+
168
# Force this object to be a document, so we can apply a stylesheet.
# A document is returned unchanged, a node or fragment is serialized and
# re-parsed as a document, and anything else is parsed with domify first.
def self.to_document(obj)
  case obj
  when Nokogiri::XML::Document
    obj
  when Nokogiri::XML::Node # node or fragment
    # Building an empty document and assigning obj.children to its body
    # ought to work, and would be faster, but seems to segfault Nokogiri.
    # The unreachable `if false` copy of that code has been dropped.
    domify(obj.to_s, true)
  else
    to_document(domify(obj))
  end
end
186
+
187
+ end
188
+ end