sitediff 0.0.3 → 0.0.5

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,92 +1,91 @@
1
+ # frozen_string_literal: true
2
+
1
3
  require 'sitediff/sanitize'
2
4
  require 'nokogiri'
3
5
 
4
6
  class SiteDiff
5
- class Sanitizer
7
+ class Sanitizer
8
+ # Currently supported transforms:
9
+ #
10
+ # * { :type => "unwrap_root" }
11
+ # * { :type => "unwrap", :selector => "div.field-item" }
12
+ # * { :type => "remove", :selector => "div.extra-stuff" }
13
+ # * { :type => "remove_class", :class => 'class1' }
14
+ class DomTransform
15
+ Transforms = {}
6
16
 
7
- # Currently supported transforms:
8
- #
9
- # * { :type => "unwrap_root" }
10
- # * { :type => "unwrap", :selector => "div.field-item" }
11
- # * { :type => "remove", :selector => "div.extra-stuff" }
12
- # * { :type => "remove_class", :class => 'class1' }
13
- class DomTransform
17
+ def initialize(rule)
18
+ @rule = rule
19
+ end
14
20
 
15
- Transforms = {}
21
+ # Often an array or scalar are both ok values. Turn either into an array.
22
+ def to_array(val)
23
+ [val].flatten
24
+ end
16
25
 
17
- def initialize(rule)
18
- @rule = rule
19
- end
26
+ def targets(node)
27
+ selectors = to_array(@rule['selector'])
28
+ selectors.each do |sel|
29
+ node.css(sel).each { |n| yield n }
30
+ end
31
+ end
20
32
 
21
- # Often an array or scalar are both ok values. Turn either into an array.
22
- def to_array(v)
23
- return [v].flatten
24
- end
25
-
26
- def targets(node)
27
- selectors = to_array(@rule['selector'])
28
- selectors.each do |sel|
29
- node.css(sel).each { |n| yield n }
30
- end
31
- end
33
+ def apply(node)
34
+ targets(node) { |t| process(t) }
35
+ end
32
36
 
33
- def apply(node)
34
- targets(node) { |t| process(t) }
35
- end
37
+ def self.register(name)
38
+ Transforms[name] = self
39
+ end
36
40
 
37
- def self.register(name)
38
- Transforms[name] = self
39
- end
41
+ def self.create(rule)
42
+ (type = rule['type']) ||
43
+ raise(InvalidSanitization, 'DOM transform needs a type')
44
+ (transform = Transforms[type]) ||
45
+ raise(InvalidSanitization, "No DOM transform named #{type}")
46
+ transform.new(rule)
47
+ end
40
48
 
41
- def self.create(rule)
42
- type = rule['type'] or
43
- raise InvalidSanitization, "DOM transform needs a type"
44
- transform = Transforms[type] or
45
- raise InvalidSanitization, "No DOM transform named #{type}"
46
- return transform.new(rule)
47
- end
49
+ # Remove elements matching 'selector'
50
+ class Remove < DomTransform
51
+ register 'remove'
52
+ def process(node)
53
+ node.remove
54
+ end
55
+ end
48
56
 
49
- # Remove elements matching 'selector'
50
- class Remove < DomTransform
51
- register "remove"
52
- def process(node)
53
- node.remove
54
- end
55
- end
57
+ # Unwrap elements matching 'selector'
58
+ class Unwrap < DomTransform
59
+ register 'unwrap'
60
+ def process(node)
61
+ node.add_next_sibling(node.children)
62
+ node.remove
63
+ end
64
+ end
56
65
 
57
- # Unwrap elements matching 'selector'
58
- class Unwrap < DomTransform
59
- register "unwrap"
60
- def process(node)
61
- node.add_next_sibling(node.children)
62
- node.remove
63
- end
64
- end
66
+ # Remove classes from elements matching selector
67
+ class RemoveClass < DomTransform
68
+ register 'remove_class'
69
+ def process(node)
70
+ classes = to_array(@rule['class'])
65
71
 
66
- # Remove classes from elements matching selector
67
- class RemoveClass < DomTransform
68
- register "remove_class"
69
- def process(node)
70
- classes = to_array(@rule['class'])
72
+ # Must call remove_class on a NodeSet!
73
+ ns = Nokogiri::XML::NodeSet.new(node.document, [node])
74
+ classes.each do |class_name|
75
+ ns.remove_class(class_name)
76
+ end
77
+ end
78
+ end
71
79
 
72
- # Must call remove_class on a NodeSet!
73
- ns = Nokogiri::XML::NodeSet.new(node.document, [node])
74
- classes.each do |class_name|
75
- ns.remove_class(class_name)
80
+ # Unwrap the root element
81
+ class UnwrapRoot < DomTransform
82
+ register 'unwrap_root'
83
+ def apply(node)
84
+ (node.children.size == 1) ||
85
+ raise(InvalidSanitization, 'Multiple root elements in unwrap_root')
86
+ node.children = node.children[0].children
87
+ end
88
+ end
76
89
  end
77
90
  end
78
91
  end
79
-
80
- # Unwrap the root element
81
- class UnwrapRoot < DomTransform
82
- register "unwrap_root"
83
- def apply(node)
84
- node.children.size == 1 or
85
- raise InvalidSanitization, "Multiple root elements in unwrap_root"
86
- node.children = node.children[0].children
87
- end
88
- end
89
-
90
- end
91
- end
92
- end
@@ -1,56 +1,59 @@
1
- class SiteDiff
2
- class Sanitizer
3
- class Regexp
4
- def initialize(rule)
5
- @rule = rule
6
- end
7
-
8
- def selector?
9
- false
10
- end
11
-
12
- def applies?(html, node)
13
- applies_to_string?(html)
14
- end
15
-
16
- def apply(html)
17
- gsub!(html)
18
- end
19
-
20
- def self.create(rule)
21
- rule['selector'] ? WithSelector.new(rule) : new(rule)
22
- end
23
-
24
- class WithSelector < Regexp
25
- def selector?
26
- true
27
- end
1
+ # frozen_string_literal: true
28
2
 
29
- def contexts(node)
30
- sels = @rule['selector']
31
- node.css(sels).each { |e| yield(e) }
32
- end
33
-
34
- def applies?(html, node)
35
- enum_for(:contexts, node).any? { |e| applies_to_string?(e.to_html) }
36
- end
37
-
38
- def apply(node)
39
- contexts(node) { |e| e.replace(gsub!(e.to_html)) }
3
+ class SiteDiff
4
+ class Sanitizer
5
+ class Regexp
6
+ def initialize(rule)
7
+ @rule = rule
8
+ end
9
+
10
+ def selector?
11
+ false
12
+ end
13
+
14
+ def applies?(html, _node)
15
+ applies_to_string?(html)
16
+ end
17
+
18
+ def apply(html)
19
+ gsub!(html)
20
+ end
21
+
22
+ def self.create(rule)
23
+ rule['selector'] ? WithSelector.new(rule) : new(rule)
24
+ end
25
+
26
+ class WithSelector < Regexp
27
+ def selector?
28
+ true
29
+ end
30
+
31
+ def contexts(node)
32
+ sels = @rule['selector']
33
+ node.css(sels).each { |e| yield(e) }
34
+ end
35
+
36
+ def applies?(_html, node)
37
+ enum_for(:contexts, node).any? { |e| applies_to_string?(e.to_html) }
38
+ end
39
+
40
+ def apply(node)
41
+ contexts(node) { |e| e.replace(gsub!(e.to_html)) }
42
+ end
43
+ end
44
+
45
+ protected
46
+
47
+ def gsub!(str)
48
+ re = ::Regexp.new(@rule['pattern'])
49
+ sub = @rule['substitute'] || ''
50
+ str.gsub!(re, sub)
51
+ str
52
+ end
53
+
54
+ def applies_to_string?(str)
55
+ gsub!(str.dup) != str
56
+ end
40
57
  end
41
58
  end
42
-
43
- protected
44
- def gsub!(str)
45
- re = ::Regexp.new(@rule['pattern'])
46
- sub = @rule['substitute'] || ''
47
- str.gsub!(re, sub)
48
- str
49
- end
50
-
51
- def applies_to_string?(str)
52
- gsub!(str.dup) != str
53
- end
54
- end
55
- end
56
59
  end
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  require 'sitediff/exception'
2
4
  require 'typhoeus'
3
5
  require 'addressable/uri'
@@ -6,6 +8,14 @@ class SiteDiff
6
8
  class SiteDiffReadFailure < SiteDiffException; end
7
9
 
8
10
  class UriWrapper
11
+ DEFAULT_CURL_OPTS = {
12
+ connecttimeout: 3, # Don't hang on servers that don't exist
13
+ followlocation: true, # Follow HTTP redirects (code 301 and 302)
14
+ headers: {
15
+ 'User-Agent' => 'Sitediff - https://github.com/evolvingweb/sitediff'
16
+ }
17
+ }.freeze
18
+
9
19
  # This lets us treat errors or content as one object
10
20
  class ReadResult
11
21
  attr_accessor :content, :error_code, :error
@@ -20,14 +30,15 @@ class SiteDiff
20
30
  res = new
21
31
  res.error_code = code
22
32
  res.error = err
23
- return res
33
+ res
24
34
  end
25
35
  end
26
36
 
27
- def initialize(uri)
37
+ def initialize(uri, curl_opts = DEFAULT_CURL_OPTS)
28
38
  @uri = uri.respond_to?(:scheme) ? uri : Addressable::URI.parse(uri)
29
39
  # remove trailing '/'s from local URIs
30
- @uri.path.gsub!(/\/*$/, '') if local?
40
+ @uri.path.gsub!(%r{/*$}, '') if local?
41
+ @curl_opts = curl_opts
31
42
  end
32
43
 
33
44
  def user
@@ -42,26 +53,24 @@ class SiteDiff
42
53
  uri = @uri.dup
43
54
  uri.user = nil
44
55
  uri.password = nil
45
- return uri.to_s
56
+ uri.to_s
46
57
  end
47
58
 
48
59
  # Is this a local filesystem path?
49
60
  def local?
50
- @uri.scheme == nil
61
+ @uri.scheme.nil?
51
62
  end
52
63
 
53
- # FIXME this is not used anymore
64
+ # FIXME: this is not used anymore
54
65
  def +(path)
55
66
  # 'path' for SiteDiff includes (parts of) path, query, and fragment.
56
67
  sep = ''
57
- if local? || @uri.path.empty?
58
- sep = '/'
59
- end
68
+ sep = '/' if local? || @uri.path.empty?
60
69
  self.class.new(@uri.to_s + sep + path)
61
70
  end
62
71
 
63
72
  # Reads a file and yields to the completion handler, see .queue()
64
- def read_file(&handler)
73
+ def read_file
65
74
  File.open(@uri.to_s, 'r:UTF-8') { |f| yield ReadResult.new(f.read) }
66
75
  rescue Errno::ENOENT, Errno::ENOTDIR, Errno::EACCES, Errno::EISDIR => e
67
76
  yield ReadResult.error(e.message)
@@ -70,9 +79,9 @@ class SiteDiff
70
79
  # Returns the encoding of an HTTP response from headers , nil if not
71
80
  # specified.
72
81
  def http_encoding(http_headers)
73
- if content_type = http_headers['Content-Type']
74
- if md = /;\s*charset=([-\w]*)/.match(content_type)
75
- return md[1]
82
+ if (content_type = http_headers['Content-Type'])
83
+ if (md = /;\s*charset=([-\w]*)/.match(content_type))
84
+ md[1]
76
85
  end
77
86
  end
78
87
  end
@@ -81,33 +90,35 @@ class SiteDiff
81
90
  #
82
91
  # Completion callbacks of the request wrap the given handler which is
83
92
  # assumed to accept a single ReadResult argument.
84
- def typhoeus_request(&handler)
85
- params = {
86
- :connecttimeout => 3, # Don't hang on servers that don't exist
87
- :followlocation => true, # Follow HTTP redirects (code 301 and 302)
88
- :headers => {
89
- "User-Agent" => "Sitediff - https://github.com/evolvingweb/sitediff"
90
- }
91
- }
93
+ def typhoeus_request
94
+ params = @curl_opts.dup
92
95
  # Allow basic auth
93
96
  params[:userpwd] = @uri.user + ':' + @uri.password if @uri.user
94
97
 
95
- req = Typhoeus::Request.new(self.to_s, params)
98
+ req = Typhoeus::Request.new(to_s, params)
96
99
 
97
100
  req.on_success do |resp|
98
101
  body = resp.body
99
102
  # Typhoeus does not respect HTTP headers when setting the encoding
100
103
  # resp.body; coerce if possible.
101
- if encoding = http_encoding(resp.headers)
104
+ if (encoding = http_encoding(resp.headers))
102
105
  body.force_encoding(encoding)
103
106
  end
104
107
  yield ReadResult.new(body)
105
108
  end
106
109
 
107
110
  req.on_failure do |resp|
108
- msg = 'Unknown Error'
109
- msg = resp.status_message if resp and resp.status_message
110
- yield ReadResult.error("HTTP error #{@uri}: #{msg}", resp.response_code)
111
+ if resp&.status_message
112
+ msg = resp.status_message
113
+ yield ReadResult.error("HTTP error when loading #{@uri}: #{msg}",
114
+ resp.response_code)
115
+ elsif (msg = resp.options[:return_code])
116
+ yield ReadResult.error("Connection error when loading #{@uri}: #{msg}",
117
+ resp.response_code)
118
+ else
119
+ yield ReadResult.error("Unknown error when loading #{@uri}: #{msg}",
120
+ resp.response_code)
121
+ end
111
122
  end
112
123
 
113
124
  req
@@ -1,82 +1,85 @@
1
+ # frozen_string_literal: true
2
+
1
3
  require 'webrick'
2
4
 
3
5
  class SiteDiff
4
- class Webserver
5
- # Simple webserver for testing purposes
6
- DEFAULT_PORT = 13080
7
-
8
- attr_accessor :ports
9
-
10
- # Serve a list of directories
11
- def initialize(start_port, dirs, opts = {})
12
- start_port ||= DEFAULT_PORT
13
- @ports = (start_port...(start_port + dirs.size)).to_a
14
- @dirs = dirs
15
- @opts = opts
16
-
17
- setup
18
- start_servers
19
-
20
- if block_given?
21
- yield self
22
- kill
23
- end
24
- end
25
-
26
- def kill
27
- @threads.each { |t| t.kill }
28
- end
29
-
30
- def wait
31
- @threads.each { |t| t.join }
32
- end
33
-
34
- def uris
35
- ports.map { |p| "http://localhost:#{p}" }
36
- end
37
-
38
- protected
39
- def setup
40
- @server_opts = {}
41
- if @opts[:quiet]
42
- @server_opts[:Logger] = WEBrick::Log.new(IO::NULL)
43
- @server_opts[:AccessLog] = []
44
- end
45
- end
46
-
47
- def server(opts)
48
- WEBrick::HTTPServer.new(opts)
49
- end
50
-
51
- def start_servers
52
- @threads = []
53
- @dirs.each_with_index do |dir, idx|
54
- @server_opts[:Port] = @ports[idx]
55
- @server_opts[:DocumentRoot] = dir
56
- srv = server(@server_opts)
57
- @threads << Thread.new { srv.start }
6
+ class Webserver
7
+ # Simple webserver for testing purposes
8
+ DEFAULT_PORT = 13_080
9
+
10
+ attr_accessor :ports
11
+
12
+ # Serve a list of directories
13
+ def initialize(start_port, dirs, opts = {})
14
+ start_port ||= DEFAULT_PORT
15
+ @ports = (start_port...(start_port + dirs.size)).to_a
16
+ @dirs = dirs
17
+ @opts = opts
18
+
19
+ setup
20
+ start_servers
21
+
22
+ if block_given?
23
+ yield self
24
+ kill
25
+ end
26
+ end
27
+
28
+ def kill
29
+ @threads.each(&:kill)
30
+ end
31
+
32
+ def wait
33
+ @threads.each(&:join)
34
+ end
35
+
36
+ def uris
37
+ ports.map { |p| "http://localhost:#{p}" }
38
+ end
39
+
40
+ protected
41
+
42
+ def setup
43
+ @server_opts = {}
44
+ if @opts[:quiet]
45
+ @server_opts[:Logger] = WEBrick::Log.new(IO::NULL)
46
+ @server_opts[:AccessLog] = []
47
+ end
48
+ end
49
+
50
+ def server(opts)
51
+ WEBrick::HTTPServer.new(opts)
52
+ end
53
+
54
+ def start_servers
55
+ @threads = []
56
+ @dirs.each_with_index do |dir, idx|
57
+ @server_opts[:Port] = @ports[idx]
58
+ @server_opts[:DocumentRoot] = dir
59
+ srv = server(@server_opts)
60
+ @threads << Thread.new { srv.start }
61
+ end
62
+ end
63
+
64
+ public
65
+
66
+ class FixtureServer < Webserver
67
+ PORT = DEFAULT_PORT + 1
68
+ BASE = 'spec/fixtures/ruby-doc.org'
69
+ NAMES = %w[core-1.9.3 core-2.0].freeze
70
+
71
+ def initialize(port = PORT, base = BASE, names = NAMES)
72
+ dirs = names.map { |n| File.join(base, n) }
73
+ super(port, dirs, quiet: true)
74
+ end
75
+
76
+ def before
77
+ uris.first
78
+ end
79
+
80
+ def after
81
+ uris.last
82
+ end
83
+ end
58
84
  end
59
85
  end
60
-
61
- public
62
-
63
- class FixtureServer < Webserver
64
- PORT = DEFAULT_PORT + 1
65
- BASE = 'spec/fixtures/ruby-doc.org'
66
- NAMES = %w[core-1.9.3 core-2.0]
67
-
68
- def initialize(port = PORT, base = BASE, names = NAMES)
69
- dirs = names.map { |n| File.join(base, n) }
70
- super(port, dirs, :quiet => true)
71
- end
72
-
73
- def before
74
- uris.first
75
- end
76
- def after
77
- uris.last
78
- end
79
- end
80
-
81
- end
82
- end