sitediff 0.0.3 → 0.0.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,92 +1,91 @@
1
+ # frozen_string_literal: true
2
+
1
3
  require 'sitediff/sanitize'
2
4
  require 'nokogiri'
3
5
 
4
6
  class SiteDiff
5
- class Sanitizer
7
+ class Sanitizer
8
+ # Currently supported transforms:
9
+ #
10
+ # * { :type => "unwrap_root" }
11
+ # * { :type => "unwrap", :selector => "div.field-item" }
12
+ # * { :type => "remove", :selector => "div.extra-stuff" }
13
+ # * { :type => "remove_class", :class => 'class1' }
14
+ class DomTransform
15
+ Transforms = {}
6
16
 
7
- # Currently supported transforms:
8
- #
9
- # * { :type => "unwrap_root" }
10
- # * { :type => "unwrap", :selector => "div.field-item" }
11
- # * { :type => "remove", :selector => "div.extra-stuff" }
12
- # * { :type => "remove_class", :class => 'class1' }
13
- class DomTransform
17
+ def initialize(rule)
18
+ @rule = rule
19
+ end
14
20
 
15
- Transforms = {}
21
+ # Often an array or scalar are both ok values. Turn either into an array.
22
+ def to_array(val)
23
+ [val].flatten
24
+ end
16
25
 
17
- def initialize(rule)
18
- @rule = rule
19
- end
26
+ def targets(node)
27
+ selectors = to_array(@rule['selector'])
28
+ selectors.each do |sel|
29
+ node.css(sel).each { |n| yield n }
30
+ end
31
+ end
20
32
 
21
- # Often an array or scalar are both ok values. Turn either into an array.
22
- def to_array(v)
23
- return [v].flatten
24
- end
25
-
26
- def targets(node)
27
- selectors = to_array(@rule['selector'])
28
- selectors.each do |sel|
29
- node.css(sel).each { |n| yield n }
30
- end
31
- end
33
+ def apply(node)
34
+ targets(node) { |t| process(t) }
35
+ end
32
36
 
33
- def apply(node)
34
- targets(node) { |t| process(t) }
35
- end
37
+ def self.register(name)
38
+ Transforms[name] = self
39
+ end
36
40
 
37
- def self.register(name)
38
- Transforms[name] = self
39
- end
41
+ def self.create(rule)
42
+ (type = rule['type']) ||
43
+ raise(InvalidSanitization, 'DOM transform needs a type')
44
+ (transform = Transforms[type]) ||
45
+ raise(InvalidSanitization, "No DOM transform named #{type}")
46
+ transform.new(rule)
47
+ end
40
48
 
41
- def self.create(rule)
42
- type = rule['type'] or
43
- raise InvalidSanitization, "DOM transform needs a type"
44
- transform = Transforms[type] or
45
- raise InvalidSanitization, "No DOM transform named #{type}"
46
- return transform.new(rule)
47
- end
49
+ # Remove elements matching 'selector'
50
+ class Remove < DomTransform
51
+ register 'remove'
52
+ def process(node)
53
+ node.remove
54
+ end
55
+ end
48
56
 
49
- # Remove elements matching 'selector'
50
- class Remove < DomTransform
51
- register "remove"
52
- def process(node)
53
- node.remove
54
- end
55
- end
57
+ # Unwrap elements matching 'selector'
58
+ class Unwrap < DomTransform
59
+ register 'unwrap'
60
+ def process(node)
61
+ node.add_next_sibling(node.children)
62
+ node.remove
63
+ end
64
+ end
56
65
 
57
- # Unwrap elements matching 'selector'
58
- class Unwrap < DomTransform
59
- register "unwrap"
60
- def process(node)
61
- node.add_next_sibling(node.children)
62
- node.remove
63
- end
64
- end
66
+ # Remove classes from elements matching selector
67
+ class RemoveClass < DomTransform
68
+ register 'remove_class'
69
+ def process(node)
70
+ classes = to_array(@rule['class'])
65
71
 
66
- # Remove classes from elements matching selector
67
- class RemoveClass < DomTransform
68
- register "remove_class"
69
- def process(node)
70
- classes = to_array(@rule['class'])
72
+ # Must call remove_class on a NodeSet!
73
+ ns = Nokogiri::XML::NodeSet.new(node.document, [node])
74
+ classes.each do |class_name|
75
+ ns.remove_class(class_name)
76
+ end
77
+ end
78
+ end
71
79
 
72
- # Must call remove_class on a NodeSet!
73
- ns = Nokogiri::XML::NodeSet.new(node.document, [node])
74
- classes.each do |class_name|
75
- ns.remove_class(class_name)
80
+ # Unwrap the root element
81
+ class UnwrapRoot < DomTransform
82
+ register 'unwrap_root'
83
+ def apply(node)
84
+ (node.children.size == 1) ||
85
+ raise(InvalidSanitization, 'Multiple root elements in unwrap_root')
86
+ node.children = node.children[0].children
87
+ end
88
+ end
76
89
  end
77
90
  end
78
91
  end
79
-
80
- # Unwrap the root element
81
- class UnwrapRoot < DomTransform
82
- register "unwrap_root"
83
- def apply(node)
84
- node.children.size == 1 or
85
- raise InvalidSanitization, "Multiple root elements in unwrap_root"
86
- node.children = node.children[0].children
87
- end
88
- end
89
-
90
- end
91
- end
92
- end
@@ -1,56 +1,59 @@
1
- class SiteDiff
2
- class Sanitizer
3
- class Regexp
4
- def initialize(rule)
5
- @rule = rule
6
- end
7
-
8
- def selector?
9
- false
10
- end
11
-
12
- def applies?(html, node)
13
- applies_to_string?(html)
14
- end
15
-
16
- def apply(html)
17
- gsub!(html)
18
- end
19
-
20
- def self.create(rule)
21
- rule['selector'] ? WithSelector.new(rule) : new(rule)
22
- end
23
-
24
- class WithSelector < Regexp
25
- def selector?
26
- true
27
- end
1
+ # frozen_string_literal: true
28
2
 
29
- def contexts(node)
30
- sels = @rule['selector']
31
- node.css(sels).each { |e| yield(e) }
32
- end
33
-
34
- def applies?(html, node)
35
- enum_for(:contexts, node).any? { |e| applies_to_string?(e.to_html) }
36
- end
37
-
38
- def apply(node)
39
- contexts(node) { |e| e.replace(gsub!(e.to_html)) }
3
+ class SiteDiff
4
+ class Sanitizer
5
+ class Regexp
6
+ def initialize(rule)
7
+ @rule = rule
8
+ end
9
+
10
+ def selector?
11
+ false
12
+ end
13
+
14
+ def applies?(html, _node)
15
+ applies_to_string?(html)
16
+ end
17
+
18
+ def apply(html)
19
+ gsub!(html)
20
+ end
21
+
22
+ def self.create(rule)
23
+ rule['selector'] ? WithSelector.new(rule) : new(rule)
24
+ end
25
+
26
+ class WithSelector < Regexp
27
+ def selector?
28
+ true
29
+ end
30
+
31
+ def contexts(node)
32
+ sels = @rule['selector']
33
+ node.css(sels).each { |e| yield(e) }
34
+ end
35
+
36
+ def applies?(_html, node)
37
+ enum_for(:contexts, node).any? { |e| applies_to_string?(e.to_html) }
38
+ end
39
+
40
+ def apply(node)
41
+ contexts(node) { |e| e.replace(gsub!(e.to_html)) }
42
+ end
43
+ end
44
+
45
+ protected
46
+
47
+ def gsub!(str)
48
+ re = ::Regexp.new(@rule['pattern'])
49
+ sub = @rule['substitute'] || ''
50
+ str.gsub!(re, sub)
51
+ str
52
+ end
53
+
54
+ def applies_to_string?(str)
55
+ gsub!(str.dup) != str
56
+ end
40
57
  end
41
58
  end
42
-
43
- protected
44
- def gsub!(str)
45
- re = ::Regexp.new(@rule['pattern'])
46
- sub = @rule['substitute'] || ''
47
- str.gsub!(re, sub)
48
- str
49
- end
50
-
51
- def applies_to_string?(str)
52
- gsub!(str.dup) != str
53
- end
54
- end
55
- end
56
59
  end
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  require 'sitediff/exception'
2
4
  require 'typhoeus'
3
5
  require 'addressable/uri'
@@ -6,6 +8,14 @@ class SiteDiff
6
8
  class SiteDiffReadFailure < SiteDiffException; end
7
9
 
8
10
  class UriWrapper
11
+ DEFAULT_CURL_OPTS = {
12
+ connecttimeout: 3, # Don't hang on servers that don't exist
13
+ followlocation: true, # Follow HTTP redirects (code 301 and 302)
14
+ headers: {
15
+ 'User-Agent' => 'Sitediff - https://github.com/evolvingweb/sitediff'
16
+ }
17
+ }.freeze
18
+
9
19
  # This lets us treat errors or content as one object
10
20
  class ReadResult
11
21
  attr_accessor :content, :error_code, :error
@@ -20,14 +30,15 @@ class SiteDiff
20
30
  res = new
21
31
  res.error_code = code
22
32
  res.error = err
23
- return res
33
+ res
24
34
  end
25
35
  end
26
36
 
27
- def initialize(uri)
37
+ def initialize(uri, curl_opts = DEFAULT_CURL_OPTS)
28
38
  @uri = uri.respond_to?(:scheme) ? uri : Addressable::URI.parse(uri)
29
39
  # remove trailing '/'s from local URIs
30
- @uri.path.gsub!(/\/*$/, '') if local?
40
+ @uri.path.gsub!(%r{/*$}, '') if local?
41
+ @curl_opts = curl_opts
31
42
  end
32
43
 
33
44
  def user
@@ -42,26 +53,24 @@ class SiteDiff
42
53
  uri = @uri.dup
43
54
  uri.user = nil
44
55
  uri.password = nil
45
- return uri.to_s
56
+ uri.to_s
46
57
  end
47
58
 
48
59
  # Is this a local filesystem path?
49
60
  def local?
50
- @uri.scheme == nil
61
+ @uri.scheme.nil?
51
62
  end
52
63
 
53
- # FIXME this is not used anymore
64
+ # FIXME: this is not used anymore
54
65
  def +(path)
55
66
  # 'path' for SiteDiff includes (parts of) path, query, and fragment.
56
67
  sep = ''
57
- if local? || @uri.path.empty?
58
- sep = '/'
59
- end
68
+ sep = '/' if local? || @uri.path.empty?
60
69
  self.class.new(@uri.to_s + sep + path)
61
70
  end
62
71
 
63
72
  # Reads a file and yields to the completion handler, see .queue()
64
- def read_file(&handler)
73
+ def read_file
65
74
  File.open(@uri.to_s, 'r:UTF-8') { |f| yield ReadResult.new(f.read) }
66
75
  rescue Errno::ENOENT, Errno::ENOTDIR, Errno::EACCES, Errno::EISDIR => e
67
76
  yield ReadResult.error(e.message)
@@ -70,9 +79,9 @@ class SiteDiff
70
79
  # Returns the encoding of an HTTP response from headers , nil if not
71
80
  # specified.
72
81
  def http_encoding(http_headers)
73
- if content_type = http_headers['Content-Type']
74
- if md = /;\s*charset=([-\w]*)/.match(content_type)
75
- return md[1]
82
+ if (content_type = http_headers['Content-Type'])
83
+ if (md = /;\s*charset=([-\w]*)/.match(content_type))
84
+ md[1]
76
85
  end
77
86
  end
78
87
  end
@@ -81,33 +90,35 @@ class SiteDiff
81
90
  #
82
91
  # Completion callbacks of the request wrap the given handler which is
83
92
  # assumed to accept a single ReadResult argument.
84
- def typhoeus_request(&handler)
85
- params = {
86
- :connecttimeout => 3, # Don't hang on servers that don't exist
87
- :followlocation => true, # Follow HTTP redirects (code 301 and 302)
88
- :headers => {
89
- "User-Agent" => "Sitediff - https://github.com/evolvingweb/sitediff"
90
- }
91
- }
93
+ def typhoeus_request
94
+ params = @curl_opts.dup
92
95
  # Allow basic auth
93
96
  params[:userpwd] = @uri.user + ':' + @uri.password if @uri.user
94
97
 
95
- req = Typhoeus::Request.new(self.to_s, params)
98
+ req = Typhoeus::Request.new(to_s, params)
96
99
 
97
100
  req.on_success do |resp|
98
101
  body = resp.body
99
102
  # Typhoeus does not respect HTTP headers when setting the encoding
100
103
  # resp.body; coerce if possible.
101
- if encoding = http_encoding(resp.headers)
104
+ if (encoding = http_encoding(resp.headers))
102
105
  body.force_encoding(encoding)
103
106
  end
104
107
  yield ReadResult.new(body)
105
108
  end
106
109
 
107
110
  req.on_failure do |resp|
108
- msg = 'Unknown Error'
109
- msg = resp.status_message if resp and resp.status_message
110
- yield ReadResult.error("HTTP error #{@uri}: #{msg}", resp.response_code)
111
+ if resp&.status_message
112
+ msg = resp.status_message
113
+ yield ReadResult.error("HTTP error when loading #{@uri}: #{msg}",
114
+ resp.response_code)
115
+ elsif (msg = resp.options[:return_code])
116
+ yield ReadResult.error("Connection error when loading #{@uri}: #{msg}",
117
+ resp.response_code)
118
+ else
119
+ yield ReadResult.error("Unknown error when loading #{@uri}: #{msg}",
120
+ resp.response_code)
121
+ end
111
122
  end
112
123
 
113
124
  req
@@ -1,82 +1,85 @@
1
+ # frozen_string_literal: true
2
+
1
3
  require 'webrick'
2
4
 
3
5
  class SiteDiff
4
- class Webserver
5
- # Simple webserver for testing purposes
6
- DEFAULT_PORT = 13080
7
-
8
- attr_accessor :ports
9
-
10
- # Serve a list of directories
11
- def initialize(start_port, dirs, opts = {})
12
- start_port ||= DEFAULT_PORT
13
- @ports = (start_port...(start_port + dirs.size)).to_a
14
- @dirs = dirs
15
- @opts = opts
16
-
17
- setup
18
- start_servers
19
-
20
- if block_given?
21
- yield self
22
- kill
23
- end
24
- end
25
-
26
- def kill
27
- @threads.each { |t| t.kill }
28
- end
29
-
30
- def wait
31
- @threads.each { |t| t.join }
32
- end
33
-
34
- def uris
35
- ports.map { |p| "http://localhost:#{p}" }
36
- end
37
-
38
- protected
39
- def setup
40
- @server_opts = {}
41
- if @opts[:quiet]
42
- @server_opts[:Logger] = WEBrick::Log.new(IO::NULL)
43
- @server_opts[:AccessLog] = []
44
- end
45
- end
46
-
47
- def server(opts)
48
- WEBrick::HTTPServer.new(opts)
49
- end
50
-
51
- def start_servers
52
- @threads = []
53
- @dirs.each_with_index do |dir, idx|
54
- @server_opts[:Port] = @ports[idx]
55
- @server_opts[:DocumentRoot] = dir
56
- srv = server(@server_opts)
57
- @threads << Thread.new { srv.start }
6
+ class Webserver
7
+ # Simple webserver for testing purposes
8
+ DEFAULT_PORT = 13_080
9
+
10
+ attr_accessor :ports
11
+
12
+ # Serve a list of directories
13
+ def initialize(start_port, dirs, opts = {})
14
+ start_port ||= DEFAULT_PORT
15
+ @ports = (start_port...(start_port + dirs.size)).to_a
16
+ @dirs = dirs
17
+ @opts = opts
18
+
19
+ setup
20
+ start_servers
21
+
22
+ if block_given?
23
+ yield self
24
+ kill
25
+ end
26
+ end
27
+
28
+ def kill
29
+ @threads.each(&:kill)
30
+ end
31
+
32
+ def wait
33
+ @threads.each(&:join)
34
+ end
35
+
36
+ def uris
37
+ ports.map { |p| "http://localhost:#{p}" }
38
+ end
39
+
40
+ protected
41
+
42
+ def setup
43
+ @server_opts = {}
44
+ if @opts[:quiet]
45
+ @server_opts[:Logger] = WEBrick::Log.new(IO::NULL)
46
+ @server_opts[:AccessLog] = []
47
+ end
48
+ end
49
+
50
+ def server(opts)
51
+ WEBrick::HTTPServer.new(opts)
52
+ end
53
+
54
+ def start_servers
55
+ @threads = []
56
+ @dirs.each_with_index do |dir, idx|
57
+ @server_opts[:Port] = @ports[idx]
58
+ @server_opts[:DocumentRoot] = dir
59
+ srv = server(@server_opts)
60
+ @threads << Thread.new { srv.start }
61
+ end
62
+ end
63
+
64
+ public
65
+
66
+ class FixtureServer < Webserver
67
+ PORT = DEFAULT_PORT + 1
68
+ BASE = 'spec/fixtures/ruby-doc.org'
69
+ NAMES = %w[core-1.9.3 core-2.0].freeze
70
+
71
+ def initialize(port = PORT, base = BASE, names = NAMES)
72
+ dirs = names.map { |n| File.join(base, n) }
73
+ super(port, dirs, quiet: true)
74
+ end
75
+
76
+ def before
77
+ uris.first
78
+ end
79
+
80
+ def after
81
+ uris.last
82
+ end
83
+ end
58
84
  end
59
85
  end
60
-
61
- public
62
-
63
- class FixtureServer < Webserver
64
- PORT = DEFAULT_PORT + 1
65
- BASE = 'spec/fixtures/ruby-doc.org'
66
- NAMES = %w[core-1.9.3 core-2.0]
67
-
68
- def initialize(port = PORT, base = BASE, names = NAMES)
69
- dirs = names.map { |n| File.join(base, n) }
70
- super(port, dirs, :quiet => true)
71
- end
72
-
73
- def before
74
- uris.first
75
- end
76
- def after
77
- uris.last
78
- end
79
- end
80
-
81
- end
82
- end