sitediff 0.0.3 → 0.0.5

Sign up to get free protection for your applications and to get access to all the features.
data/lib/sitediff/diff.rb CHANGED
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  require 'sitediff'
2
4
  require 'diffy'
3
5
  require 'erb'
@@ -9,26 +11,26 @@ class SiteDiff
9
11
 
10
12
  def html_diffy(before_html, after_html)
11
13
  diff = Diffy::Diff.new(before_html, after_html)
12
- diff.first ? # Is it non-empty?
14
+ diff.first ? # Is it non-empty?
13
15
  diff.to_s(:html) : nil
14
16
  end
15
17
 
16
18
  def terminal_diffy(before_html, after_html)
17
19
  args = []
18
20
  args << :color if Rainbow.enabled
19
- return Diffy::Diff.new(before_html, after_html, :context => 3).
20
- to_s(*args)
21
+ Diffy::Diff.new(before_html, after_html, context: 3)
22
+ .to_s(*args)
21
23
  end
22
24
 
23
25
  def generate_html_report(results, before, after, cache)
24
26
  erb_path = File.join(SiteDiff::FILES_DIR, 'html_report.html.erb')
25
27
  report_html = ERB.new(File.read(erb_path)).result(binding)
26
- return report_html
28
+ report_html
27
29
  end
28
30
 
29
31
  def generate_diff_output(result)
30
32
  erb_path = File.join(SiteDiff::FILES_DIR, 'diff.html.erb')
31
- return ERB.new(File.read(erb_path)).result(binding)
33
+ ERB.new(File.read(erb_path)).result(binding)
32
34
  end
33
35
 
34
36
  def css
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  class SiteDiff
2
- class SiteDiffException < Exception; end
4
+ class SiteDiffException < RuntimeError; end
3
5
  end
@@ -1,55 +1,61 @@
1
+ # frozen_string_literal: true
2
+
1
3
  require 'sitediff/uriwrapper'
2
4
  require 'typhoeus'
3
5
 
4
6
  class SiteDiff
5
- class Fetch
6
- # Cache is a cache object, see sitediff/cache
7
- # Paths is a list of sub-paths
8
- # Tags is a hash of tag names => base URLs.
9
- def initialize(cache, paths, tags)
10
- @cache = cache
11
- @paths = paths
12
- @tags = tags
13
- end
7
+ class Fetch
8
+ # Cache is a cache object, see sitediff/cache
9
+ # Paths is a list of sub-paths
10
+ # Tags is a hash of tag names => base URLs.
11
+ def initialize(cache, paths, concurrency = 3, curl_opts = nil, **tags)
12
+ @cache = cache
13
+ @paths = paths
14
+ @tags = tags
15
+ @curl_opts = curl_opts || UriWrapper::DEFAULT_CURL_OPTS
16
+ @concurrency = concurrency
17
+ end
14
18
 
15
- # Fetch all the paths, once per tag.
16
- # When a path has been fetched for every tag, block will be called with the
17
- # path, and a hash of tag => UriWrapper::ReadResult objects.
18
- def run(&block)
19
- @callback = block
20
- @hydra = Typhoeus::Hydra.new(max_concurrency: 3)
21
- @paths.each { |path| queue_path(path) }
22
- @hydra.run
23
- end
19
+ # Fetch all the paths, once per tag.
20
+ # When a path has been fetched for every tag, block will be called with the
21
+ # path, and a hash of tag => UriWrapper::ReadResult objects.
22
+ def run(&block)
23
+ @callback = block
24
+ @hydra = Typhoeus::Hydra.new(max_concurrency: @concurrency)
25
+ @paths.each { |path| queue_path(path) }
26
+ @hydra.run
27
+ end
28
+
29
+ private
24
30
 
25
- private
26
- # Queue a path for fetching
27
- def queue_path(path)
28
- results = {}
29
-
30
- @tags.each do |tag, base|
31
- if res = @cache.get(tag, path)
32
- results[tag] = res
33
- process_results(path, results)
34
- elsif !base
35
- # We only have the cache, but this item isn't cached!
36
- results[tag] = UriWrapper::ReadResult.error("Not cached")
37
- process_results(path, results)
38
- else
39
- uri = UriWrapper.new(base + path)
40
- uri.queue(@hydra) do |res|
41
- @cache.set(tag, path, res)
31
+ # Queue a path for fetching
32
+ def queue_path(path)
33
+ results = {}
34
+
35
+ @tags.each do |tag, base|
36
+ if (res = @cache.get(tag, path))
42
37
  results[tag] = res
43
38
  process_results(path, results)
39
+ elsif !base
40
+ # We only have the cache, but this item isn't cached!
41
+ results[tag] = UriWrapper::ReadResult.error('Not cached')
42
+ process_results(path, results)
43
+ else
44
+ uri = UriWrapper.new(base + path, @curl_opts)
45
+ uri.queue(@hydra) do |resl|
46
+ @cache.set(tag, path, resl)
47
+ results[tag] = resl
48
+ process_results(path, results)
49
+ end
44
50
  end
45
51
  end
46
52
  end
47
- end
48
53
 
49
- # Process fetch results
50
- def process_results(path, results)
51
- return unless results.size == @tags.size
52
- @callback[path, results]
54
+ # Process fetch results
55
+ def process_results(path, results)
56
+ return unless results.size == @tags.size
57
+
58
+ @callback[path, results]
59
+ end
53
60
  end
54
61
  end
55
- end
@@ -21,6 +21,9 @@
21
21
  <a href="<%= eval(tag) %>"><%= eval(tag) %></a>
22
22
  <% end %>
23
23
  </div>
24
+ <div class="run">
25
+ <a href="../run/diff">Rerun diff</a>
26
+ </div>
24
27
  <table class="results">
25
28
 
26
29
  <colgroup>
@@ -2,28 +2,28 @@ sanitization:
2
2
  - title: Strip Drupal.settings
3
3
  selector: script
4
4
  pattern: '^(<script>)?jQuery.extend\(Drupal.settings.*$'
5
+ - title: Strip IE CSS/JS cache IDs
6
+ pattern: '("[^"]*ie\d?\.(js|css))\?[a-z0-9]{6}"'
7
+ substitute: '\1'
5
8
  - title: Strip form build ID
6
9
  selector: input
7
- pattern: 'name="form_build_id" value="form-[-\w]{43}"'
10
+ pattern: 'name="form_build_id" value="form-[-\w]{40,43}"'
8
11
  substitute: 'name="form_build_id" value="form-DRUPAL_FORM_BUILD_ID"'
9
12
  - title: Strip view DOM ID
10
13
  pattern: '(class="view .*) view-dom-id-[a-f0-9]{32}"'
11
14
  substitute: '\1 view-dom-id-DRUPAL_VIEW_DOM_ID"'
12
15
  - title: Strip CSS aggregation filenames
13
16
  selector: link[rel=stylesheet]
14
- pattern: '(href="[^"]*/files/css/css_)[-\w]{43}\.css"'
17
+ pattern: '(href="[^"]*/files/css/css_)[-\w]{40,43}\.css"'
15
18
  substitute: '\1DRUPAL_AGGREGATED_CSS.css"'
16
19
  - title: Strip JS aggregation filenames
17
20
  selector: script
18
- pattern: '(src="[^"]*/files/js/js_)[-\w]{43}\.js"'
21
+ pattern: '(src="[^"]*/files/js/js_)[-\w]{40,43}\.js"'
19
22
  substitute: '\1DRUPAL_AGGREGATED_JS.js"'
20
23
  - title: Strip CSS/JS cache IDs
21
24
  selector: style, script
22
25
  pattern: '("[^"]*\.(js|css))\?[a-z0-9]{6}"'
23
26
  substitute: '\1'
24
- - title: Strip IE CSS/JS cache IDs
25
- pattern: '("[^"]*ie\d?\.(js|css))\?[a-z0-9]{6}"'
26
- substitute: '\1'
27
27
  - title: Strip Drupal JS version tags
28
28
  selector: script
29
29
  pattern: '(src="[^"]*/misc/\w+\.js)?v=\d+\.\d+"'
@@ -31,3 +31,33 @@ sanitization:
31
31
  - title: Strip domain names from absolute URLs
32
32
  pattern: 'http:\/\/[a-zA-Z0-9.:-]+'
33
33
  substitute: '__domain__'
34
+ - title: Strip form build ID
35
+ selector: input
36
+ pattern: 'autocomplete="off" data-drupal-selector="form-[-\w]{40,43}"'
37
+ substitute: 'autocomplete="off" data-drupal-selector="form-DRUPAL_FORM_BUILD_ID"'
38
+ - title: Strip form build ID 2
39
+ selector: input
40
+ pattern: 'name="form_build_id" value="form-[-\w]{40,43}"'
41
+ substitute: 'name="form_build_id" value="form-DRUPAL_FORM_BUILD_ID"'
42
+ - title: Strip Drupal CSS link queries
43
+ selector: link
44
+ pattern: '\.css\?(\w*)'
45
+ substitute: '\.css'
46
+ - title: Strip Drupal JS link queries
47
+ selector: script
48
+ pattern: '\.js\?(\w*)'
49
+ substitute: '\.js'
50
+ - title: Strip Drupal View-DOM ID
51
+ pattern: 'view-dom-id-\w*'
52
+ substitute: 'view-dom-id-_ID_'
53
+ - title: Strip Drupal View-DOM ID 2
54
+ pattern: '(views?_dom_id"?:"?)\w*'
55
+ substitute: '\1_ID_'
56
+ - title: Ignore Drupal CSS file names
57
+ selector: link
58
+ pattern: 'css_[-\w]{40,43}(\\|%5C)?\.css'
59
+ substitute: 'css__ID__.css'
60
+ - title: Ignore Drupal JS file names
61
+ selector: script
62
+ pattern: 'js_[-\w]{40,43}\\?\.js'
63
+ substitute: 'js__ID__.js'
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  require 'sitediff'
2
4
  require 'sitediff/diff'
3
5
  require 'digest/sha1'
@@ -8,7 +10,7 @@ class SiteDiff
8
10
  STATUS_SUCCESS = 0 # Identical before and after
9
11
  STATUS_FAILURE = 1 # Different before and after
10
12
  STATUS_ERROR = 2 # Couldn't fetch page
11
- STATUS_TEXT = %w[success failure error]
13
+ STATUS_TEXT = %w[success failure error].freeze
12
14
 
13
15
  attr_reader :status, :diff
14
16
 
@@ -17,7 +19,7 @@ class SiteDiff
17
19
  if error
18
20
  @status = STATUS_ERROR
19
21
  else
20
- @diff = Diff::html_diffy(before, after)
22
+ @diff = Diff.html_diffy(before, after)
21
23
  @status = @diff ? STATUS_FAILURE : STATUS_SUCCESS
22
24
  end
23
25
  end
@@ -28,7 +30,7 @@ class SiteDiff
28
30
 
29
31
  # Textual representation of the status
30
32
  def status_text
31
- return STATUS_TEXT[status]
33
+ STATUS_TEXT[status]
32
34
  end
33
35
 
34
36
  # Printable URL
@@ -39,7 +41,7 @@ class SiteDiff
39
41
 
40
42
  # Filename to store diff
41
43
  def filename
42
- File.join(SiteDiff::DIFFS_DIR, Digest::SHA1.hexdigest(self.path) + '.html')
44
+ File.join(SiteDiff::DIFFS_DIR, Digest::SHA1.hexdigest(path) + '.html')
43
45
  end
44
46
 
45
47
  # Text of the link in the HTML report
@@ -52,15 +54,15 @@ class SiteDiff
52
54
  end
53
55
 
54
56
  # Log the result to the terminal
55
- def log(verbose=true)
57
+ def log(verbose = true)
56
58
  case status
57
59
  when STATUS_SUCCESS then
58
- SiteDiff::log path, :diff_success, 'SUCCESS'
60
+ SiteDiff.log path, :diff_success, 'UNCHANGED'
59
61
  when STATUS_ERROR then
60
- SiteDiff::log path, :warn, "ERROR (#{error})"
62
+ SiteDiff.log path, :warn, "ERROR (#{error})"
61
63
  when STATUS_FAILURE then
62
- SiteDiff::log path, :diff_failure, "FAILURE"
63
- puts Diff::terminal_diffy(before, after) if verbose
64
+ SiteDiff.log path, :diff_failure, 'CHANGED'
65
+ puts Diff.terminal_diffy(before, after) if verbose
64
66
  end
65
67
  end
66
68
 
@@ -68,9 +70,9 @@ class SiteDiff
68
70
  def dump(dir)
69
71
  dump_path = File.join(dir, filename)
70
72
  base = File.dirname(dump_path)
71
- FileUtils::mkdir_p(base) unless File.exists?(base)
73
+ FileUtils.mkdir_p(base) unless File.exist?(base)
72
74
  File.open(dump_path, 'w') do |f|
73
- f.write(Diff::generate_diff_output(self))
75
+ f.write(Diff.generate_diff_output(self))
74
76
  end
75
77
  end
76
78
  end
@@ -1,65 +1,65 @@
1
+ # frozen_string_literal: true
2
+
1
3
  require 'sitediff/sanitize/regexp'
2
4
  require 'pathname'
3
5
  require 'set'
4
6
 
5
7
  class SiteDiff
6
- # Find appropriate rules for a given site
7
- class Rules
8
- def initialize(config, disabled = false)
9
- @disabled = disabled
10
- @config = config
11
- find_sanitization_candidates
12
- @rules = Hash.new { |h, k| h[k] = Set.new }
13
- end
8
+ # Find appropriate rules for a given site
9
+ class Rules
10
+ def initialize(config, disabled = false)
11
+ @disabled = disabled
12
+ @config = config
13
+ find_sanitization_candidates
14
+ @rules = Hash.new { |h, k| h[k] = Set.new }
15
+ end
14
16
 
15
- def find_sanitization_candidates
16
- @candidates = Set.new
17
+ def find_sanitization_candidates
18
+ @candidates = Set.new
17
19
 
18
- rules_dir = Pathname.new(__FILE__).dirname + 'files' + 'rules'
19
- rules_dir.children.each do |f|
20
- next unless f.file? && f.extname == '.yaml'
21
- conf = YAML.load_file(f)
22
- @candidates.merge(conf['sanitization'])
23
- end
24
- end
20
+ rules_dir = Pathname.new(__FILE__).dirname + 'files' + 'rules'
21
+ rules_dir.children.each do |f|
22
+ next unless f.file? && f.extname == '.yaml'
25
23
 
26
- def handle_page(tag, html, doc)
27
- found = find_rules(html, doc)
28
- @rules[tag].merge(found)
29
- end
24
+ conf = YAML.load_file(f)
25
+ @candidates.merge(conf['sanitization'])
26
+ end
27
+ end
30
28
 
31
- # Yield a set of rules that seem reasonable for this HTML
32
- # assumption: the YAML file is a list of regexp rules only
33
- def find_rules(html, doc)
34
- rules = []
29
+ def handle_page(tag, html, doc)
30
+ found = find_rules(html, doc)
31
+ @rules[tag].merge(found)
32
+ end
35
33
 
36
- return @candidates.select do |rule|
37
- re = SiteDiff::Sanitizer::Regexp.create(rule)
38
- re.applies?(html, doc)
34
+ # Yield a set of rules that seem reasonable for this HTML
35
+ # assumption: the YAML file is a list of regexp rules only
36
+ def find_rules(html, doc)
37
+ @candidates.select do |rule|
38
+ re = SiteDiff::Sanitizer::Regexp.create(rule)
39
+ re.applies?(html, doc)
40
+ end
39
41
  end
40
- end
41
42
 
42
- # Find all rules from all rulesets that apply for all pages
43
- def add_config
44
- have_both = @rules.include?(:before)
43
+ # Find all rules from all rulesets that apply for all pages
44
+ def add_config
45
+ have_both = @rules.include?(:before)
45
46
 
46
- r1, r2 = *@rules.values_at(:before, :after)
47
- if have_both
48
- add_section('before', r1 - r2)
49
- add_section('after', r2 - r1)
50
- add_section(nil, r1 & r2)
51
- else
52
- add_section(nil, r2)
47
+ r1, r2 = *@rules.values_at(:before, :after)
48
+ if have_both
49
+ add_section('before', r1 - r2)
50
+ add_section('after', r2 - r1)
51
+ add_section(nil, r1 & r2)
52
+ else
53
+ add_section(nil, r2)
54
+ end
53
55
  end
54
- end
55
56
 
56
- def add_section(name, rules)
57
- return if rules.empty?
58
- conf = name ? @config[name] : @config
59
- if @disabled
60
- rules.each { |r| r['disabled'] = true }
57
+ def add_section(name, rules)
58
+ return if rules.empty?
59
+
60
+ conf = name ? @config[name] : @config
61
+ rules.each { |r| r['disabled'] = true } if @disabled
62
+ conf['sanitization'] = rules.to_a.sort_by { |r| r['title'] }
61
63
  end
62
- conf['sanitization'] = rules.to_a.sort_by { |r| r['title'] }
63
64
  end
64
65
  end
65
- end
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  require 'sitediff'
2
4
  require 'sitediff/exception'
3
5
  require 'sitediff/sanitize/dom_transform'
@@ -6,183 +8,176 @@ require 'nokogiri'
6
8
  require 'set'
7
9
 
8
10
  class SiteDiff
9
- class Sanitizer
10
- class InvalidSanitization < SiteDiffException; end
11
-
12
- TOOLS = {
13
- :array => %w[dom_transform sanitization],
14
- :scalar => %w[selector remove_spacing],
15
- }
16
- DOM_TRANSFORMS = Set.new(%w[remove unwrap_root unwrap remove_class])
17
-
18
- def initialize(html, config, opts = {})
19
- @html = html
20
- @config = config
21
- @opts = opts
22
- end
23
-
24
- def sanitize
25
- return '' if @html == '' # Quick return on empty input
11
+ class Sanitizer
12
+ class InvalidSanitization < SiteDiffException; end
13
+
14
+ TOOLS = {
15
+ array: %w[dom_transform sanitization],
16
+ scalar: %w[selector remove_spacing]
17
+ }.freeze
18
+ DOM_TRANSFORMS = Set.new(%w[remove unwrap_root unwrap remove_class])
19
+
20
+ def initialize(html, config, opts = {})
21
+ @html = html
22
+ @config = config
23
+ @opts = opts
24
+ end
26
25
 
27
- @node, @html = Sanitizer.domify(@html), nil
26
+ def sanitize
27
+ return '' if @html == '' # Quick return on empty input
28
28
 
29
- remove_spacing
30
- selector
31
- dom_transforms
32
- regexps
29
+ @node = Sanitizer.domify(@html)
30
+ @html = nil
33
31
 
34
- return @html || Sanitizer.prettify(@node)
35
- end
32
+ remove_spacing
33
+ selector
34
+ dom_transforms
35
+ regexps
36
36
 
37
- # Return whether or not we want to keep a rule
38
- def want_rule(rule)
39
- return false unless rule
40
- return false if rule['disabled']
37
+ @html || Sanitizer.prettify(@node)
38
+ end
41
39
 
42
- # Filter out if path regexp doesn't match
43
- if (pathre = rule['path']) and (path = @opts[:path])
44
- return ::Regexp.new(pathre).match(path)
45
- end
40
+ # Return whether or not we want to keep a rule
41
+ def want_rule(rule)
42
+ return false unless rule
43
+ return false if rule['disabled']
46
44
 
47
- return true
48
- end
45
+ # Filter out if path regexp doesn't match
46
+ if (pathre = rule['path']) && (path = @opts[:path])
47
+ return ::Regexp.new(pathre).match(path)
48
+ end
49
49
 
50
- # Canonicalize a simple rule, eg: 'remove_spacing' or 'selector'.
51
- # It may be a simple value, or a hash, or an array of hashes.
52
- # Turn it into an array of hashes.
53
- def canonicalize_rule(name)
54
- rules = @config[name] or return nil
55
-
56
- if rules[0] && rules[0].respond_to?(:[]) && rules[0]['value']
57
- # Already an array
58
- elsif rules['value']
59
- # Hash, put it in an array
60
- rules = [rules]
61
- else
62
- # Scalar, put it in a hash
63
- rules = [{ 'value' => rules }]
64
- end
50
+ true
51
+ end
65
52
 
66
- want = rules.select { |r| want_rule(r) }
67
- return nil if want.empty?
68
- raise "Too many matching rules of type #{name}" if want.size > 1
69
- return want.first
70
- end
53
+ # Canonicalize a simple rule, eg: 'remove_spacing' or 'selector'.
54
+ # It may be a simple value, or a hash, or an array of hashes.
55
+ # Turn it into an array of hashes.
56
+ def canonicalize_rule(name)
57
+ (rules = @config[name]) || (return nil)
58
+
59
+ if rules[0]&.respond_to?(:[]) && rules[0]['value']
60
+ # Already an array
61
+ elsif rules['value']
62
+ # Hash, put it in an array
63
+ rules = [rules]
64
+ else
65
+ # Scalar, put it in a hash
66
+ rules = [{ 'value' => rules }]
67
+ end
68
+
69
+ want = rules.select { |r| want_rule(r) }
70
+ return nil if want.empty?
71
+ raise "Too many matching rules of type #{name}" if want.size > 1
72
+
73
+ want.first
74
+ end
71
75
 
72
- # Perform 'remove_spacing' action
73
- def remove_spacing
74
- rule = canonicalize_rule('remove_spacing') or return
75
- Sanitizer.remove_node_spacing(@node) if rule['value']
76
- end
76
+ # Perform 'remove_spacing' action
77
+ def remove_spacing
78
+ (rule = canonicalize_rule('remove_spacing')) || return
79
+ Sanitizer.remove_node_spacing(@node) if rule['value']
80
+ end
77
81
 
78
- # Perform 'selector' action, to choose a new root
79
- def selector
80
- rule = canonicalize_rule('selector') or return
81
- @node = Sanitizer.select_fragments(@node, rule['value'])
82
- end
82
+ # Perform 'selector' action, to choose a new root
83
+ def selector
84
+ (rule = canonicalize_rule('selector')) || return
85
+ @node = Sanitizer.select_fragments(@node, rule['value'])
86
+ end
83
87
 
84
- # Applies regexps. Also
85
- def regexps
86
- rules = @config['sanitization'] or return
87
- rules = rules.select { |r| want_rule(r) }
88
+ # Applies regexps. Selector-scoped rules run on the DOM node first; global rules then run on the prettified HTML.
89
+ def regexps
90
+ (rules = @config['sanitization']) || return
91
+ rules = rules.select { |r| want_rule(r) }
88
92
 
89
- rules.map! { |r| Regexp.create(r) }
90
- selector, global = rules.partition { |r| r.selector? }
93
+ rules.map! { |r| Regexp.create(r) }
94
+ selector, global = rules.partition(&:selector?)
91
95
 
92
- selector.each { |r| r.apply(@node) }
93
- @html, @node = Sanitizer.prettify(@node), nil
94
- global.each { |r| r.apply(@html) }
95
- end
96
+ selector.each { |r| r.apply(@node) }
97
+ @html = Sanitizer.prettify(@node)
98
+ @node = nil
99
+ global.each { |r| r.apply(@html) }
100
+ end
96
101
 
97
- # Perform DOM transforms
98
- def dom_transforms
99
- rules = @config['dom_transform'] or return
100
- rules = rules.select { |r| want_rule(r) }
102
+ # Perform DOM transforms
103
+ def dom_transforms
104
+ (rules = @config['dom_transform']) || return
105
+ rules = rules.select { |r| want_rule(r) }
101
106
 
102
- rules.each do |rule|
103
- transform = DomTransform.create(rule)
104
- transform.apply(@node)
105
- end
106
- end
107
+ rules.each do |rule|
108
+ transform = DomTransform.create(rule)
109
+ transform.apply(@node)
110
+ end
111
+ end
107
112
 
108
- ##### Implementations of actions #####
113
+ ##### Implementations of actions #####
109
114
 
110
- # Remove double-spacing inside text nodes
111
- def self.remove_node_spacing(node)
112
- # remove double spacing, but only inside text nodes (eg not attributes)
113
- node.xpath('//text()').each do |el|
114
- el.content = el.content.gsub(/ +/, ' ')
115
- end
116
- end
115
+ # Remove double-spacing inside text nodes
116
+ def self.remove_node_spacing(node)
117
+ # remove double spacing, but only inside text nodes (eg not attributes)
118
+ node.xpath('//text()').each do |el|
119
+ el.content = el.content.gsub(/ +/, ' ')
120
+ end
121
+ end
117
122
 
118
- # Get a fragment consisting of the elements matching the selector(s)
119
- def self.select_fragments(node, sel)
120
- # When we choose a new root, we always become a DocumentFragment,
121
- # and lose any DOCTYPE and such.
122
- ns = node.css(sel)
123
- unless node.fragment?
124
- node = Nokogiri::HTML.fragment('')
125
- end
126
- node.children = ns
127
- return node
128
- end
123
+ # Get a fragment consisting of the elements matching the selector(s)
124
+ def self.select_fragments(node, sel)
125
+ # When we choose a new root, we always become a DocumentFragment,
126
+ # and lose any DOCTYPE and such.
127
+ ns = node.css(sel)
128
+ node = Nokogiri::HTML.fragment('') unless node.fragment?
129
+ node.children = ns
130
+ node
131
+ end
129
132
 
130
- # Pretty-print some HTML
131
- def self.prettify(obj)
132
- @stylesheet ||= begin
133
- stylesheet_path = File.join(SiteDiff::FILES_DIR, 'pretty_print.xsl')
134
- Nokogiri::XSLT(File.read(stylesheet_path))
135
- end
133
+ # Pretty-print some HTML
134
+ def self.prettify(obj)
135
+ @stylesheet ||= begin
136
+ stylesheet_path = File.join(SiteDiff::FILES_DIR, 'pretty_print.xsl')
137
+ Nokogiri::XSLT(File.read(stylesheet_path))
138
+ end
136
139
 
137
- # Pull out the html element's children
138
- # The obvious way to do this is to iterate over pretty.css('html'),
139
- # but that tends to segfault Nokogiri
140
- str = @stylesheet.apply_to(to_document(obj))
140
+ # Pull out the html element's children
141
+ # The obvious way to do this is to iterate over pretty.css('html'),
142
+ # but that tends to segfault Nokogiri
143
+ str = @stylesheet.apply_to(to_document(obj))
141
144
 
142
- # There's a lot of cruft left over,that we don't want
145
+ # There's a lot of cruft left over, that we don't want
143
146
 
144
- # Remove xml declaration and <html> tags
145
- str.sub!(/\A<\?xml.*$\n/, '')
146
- str.sub!(/\A^<html>$\n/, '')
147
- str.sub!(%r[</html>\n\Z], '')
147
+ # Remove xml declaration and <html> tags
148
+ str.sub!(/\A<\?xml.*$\n/, '')
149
+ str.sub!(/\A^<html>$\n/, '')
150
+ str.sub!(%r{</html>\n\Z}, '')
148
151
 
149
- # Remove top-level indentation
150
- indent = /\A(\s*)/.match(str)[1].size
151
- str.gsub!(/^\s{,#{indent}}/, '')
152
+ # Remove top-level indentation
153
+ indent = /\A(\s*)/.match(str)[1].size
154
+ str.gsub!(/^\s{,#{indent}}/, '')
152
155
 
153
- # Remove blank lines
154
- str.gsub!(/^\s*$\n/, '')
156
+ # Remove blank lines
157
+ str.gsub!(/^\s*$\n/, '')
155
158
 
156
- return str
157
- end
159
+ str
160
+ end
158
161
 
159
- # Parse HTML into a node
160
- def self.domify(str, force_doc = false)
161
- if force_doc || /<!DOCTYPE/.match(str[0, 512])
162
- return Nokogiri::HTML(str)
163
- else
164
- return Nokogiri::HTML.fragment(str)
165
- end
166
- end
162
+ # Parse HTML into a node
163
+ def self.domify(str, force_doc = false)
164
+ if force_doc || /<!DOCTYPE/.match(str[0, 512])
165
+ Nokogiri::HTML(str)
166
+ else
167
+ Nokogiri::HTML.fragment(str)
168
+ end
169
+ end
167
170
 
168
- # Force this object to be a document, so we can apply a stylesheet
169
- def self.to_document(obj)
170
- if Nokogiri::XML::Document === obj
171
- return obj
172
- elsif Nokogiri::XML::Node === obj # node or fragment
173
- return domify(obj.to_s, true)
174
-
175
- # This ought to work, and would be faster,
176
- # but seems to segfault Nokogiri
177
- if false
178
- doc = Nokogiri::HTML('<html><body>')
179
- doc.at('body').children = obj.children
180
- return doc
171
+ # Force this object to be a document, so we can apply a stylesheet
172
+ def self.to_document(obj)
173
+ if Nokogiri::XML::Document == obj.class || Nokogiri::HTML::Document == obj.class
174
+ obj
175
+ # node or fragment
176
+ elsif Nokogiri::XML::Node == obj.class || Nokogiri::HTML::DocumentFragment == obj.class
177
+ domify(obj.to_s, true)
178
+ else
179
+ to_document(domify(obj, false))
180
+ end
181
181
  end
182
- else
183
- return to_document(domify(obj))
184
182
  end
185
183
  end
186
-
187
- end
188
- end