sitediff 0.0.2 → 1.1.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,92 +1,130 @@
1
+ # frozen_string_literal: true
2
+
1
3
  require 'sitediff/sanitize'
2
4
  require 'nokogiri'
3
5
 
4
6
  class SiteDiff
5
- class Sanitizer
7
+ class Sanitizer
8
+ # Currently supported transforms:
9
+ #
10
+ # * { :type => "unwrap_root" }
11
+ # * { :type => "unwrap", :selector => "div.field-item" }
12
+ # * { :type => "remove", :selector => "div.extra-stuff" }
13
+ # * { :type => "remove_class", :class => 'class1' }
14
+ # * { :type => "strip", :selector => 'h1' }
15
+ class DomTransform
16
+ # Supported dom_transform types.
17
+ TRANSFORMS = {}
6
18
 
7
- # Currently supported transforms:
8
- #
9
- # * { :type => "unwrap_root" }
10
- # * { :type => "unwrap", :selector => "div.field-item" }
11
- # * { :type => "remove", :selector => "div.extra-stuff" }
12
- # * { :type => "remove_class", :class => 'class1' }
13
- class DomTransform
19
+ ##
20
+ # Creates a DOM Transform.
21
+ def initialize(rule)
22
+ @rule = rule
23
+ end
14
24
 
15
- Transforms = {}
25
+ ##
26
+ # Often an array or scalar are both ok values. Turn either into an array.
27
+ def to_array(val)
28
+ [val].flatten
29
+ end
16
30
 
17
- def initialize(rule)
18
- @rule = rule
19
- end
31
+ ##
32
+ # Yields each node found under the given node by any of the rule's selectors.
33
+ def targets(node)
34
+ selectors = to_array(@rule['selector'])
35
+ selectors.each do |sel|
36
+ node.css(sel).each { |n| yield n }
37
+ end
38
+ end
20
39
 
21
- # Often an array or scalar are both ok values. Turn either into an array.
22
- def to_array(v)
23
- return [v].flatten
24
- end
40
+ ##
41
+ # Applies the transformation to a DOM node.
42
+ def apply(node)
43
+ targets(node) { |t| process(t) }
44
+ end
25
45
 
26
- def targets(node)
27
- selectors = to_array(@rule['selector'])
28
- selectors.each do |sel|
29
- node.css(sel).each { |n| yield n }
30
- end
31
- end
46
+ ##
47
+ # Registers a DOM Transform plugin.
48
+ def self.register(name)
49
+ TRANSFORMS[name] = self
50
+ end
32
51
 
33
- def apply(node)
34
- targets(node) { |t| process(t) }
35
- end
52
+ ##
53
+ # Creates a DOM Transform as per rule.
54
+ def self.create(rule)
55
+ (type = rule['type']) ||
56
+ raise(InvalidSanitization, 'DOM transform needs a type')
57
+ (transform = TRANSFORMS[type]) ||
58
+ raise(InvalidSanitization, "No DOM transform named #{type}")
59
+ transform.new(rule)
60
+ end
36
61
 
37
- def self.register(name)
38
- Transforms[name] = self
39
- end
62
+ ##
63
+ # Remove elements matching 'selector'.
64
+ class Remove < DomTransform
65
+ register 'remove'
40
66
 
41
- def self.create(rule)
42
- type = rule['type'] or
43
- raise InvalidSanitization, "DOM transform needs a type"
44
- transform = Transforms[type] or
45
- raise InvalidSanitization, "No DOM transform named #{type}"
46
- return transform.new(rule)
47
- end
67
+ ##
68
+ # Processes a node.
69
+ def process(node)
70
+ node.remove
71
+ end
72
+ end
48
73
 
49
- # Remove elements matching 'selector'
50
- class Remove < DomTransform
51
- register "remove"
52
- def process(node)
53
- node.remove
54
- end
55
- end
74
+ # Squeeze whitespace from a tag matching 'selector'.
75
+ class Strip < DomTransform
76
+ register 'strip'
56
77
 
57
- # Unwrap elements matching 'selector'
58
- class Unwrap < DomTransform
59
- register "unwrap"
60
- def process(node)
61
- node.add_next_sibling(node.children)
62
- node.remove
63
- end
64
- end
78
+ ##
79
+ # Processes a node.
80
+ def process(node)
81
+ node.content = node.content.strip
82
+ end
83
+ end
65
84
 
66
- # Remove classes from elements matching selector
67
- class RemoveClass < DomTransform
68
- register "remove_class"
69
- def process(node)
70
- classes = to_array(@rule['class'])
85
+ # Unwrap elements matching 'selector'.
86
+ class Unwrap < DomTransform
87
+ register 'unwrap'
71
88
 
72
- # Must call remove_class on a NodeSet!
73
- ns = Nokogiri::XML::NodeSet.new(node.document, [node])
74
- classes.each do |class_name|
75
- ns.remove_class(class_name)
76
- end
77
- end
78
- end
89
+ ##
90
+ # Processes a node.
91
+ def process(node)
92
+ node.add_next_sibling(node.children)
93
+ node.remove
94
+ end
95
+ end
79
96
 
80
- # Unwrap the root element
81
- class UnwrapRoot < DomTransform
82
- register "unwrap_root"
83
- def apply(node)
84
- node.children.size == 1 or
85
- raise InvalidSanitization, "Multiple root elements in unwrap_root"
86
- node.children = node.children[0].children
87
- end
88
- end
97
+ ##
98
+ # Remove classes from elements matching selector
99
+ class RemoveClass < DomTransform
100
+ register 'remove_class'
89
101
 
90
- end
91
- end
102
+ ##
103
+ # Processes a node.
104
+ def process(node)
105
+ classes = to_array(@rule['class'])
106
+
107
+ # Must call remove_class on a NodeSet!
108
+ ns = Nokogiri::XML::NodeSet.new(node.document, [node])
109
+ classes.each do |class_name|
110
+ ns.remove_class(class_name)
111
+ end
112
+ end
113
+ end
114
+
115
+ ##
116
+ # Unwrap the root element.
117
+ class UnwrapRoot < DomTransform
118
+ register 'unwrap_root'
119
+
120
+ ##
121
+ # Applies the transformation to a DOM node.
122
+ def apply(node)
123
+ (node.children.size == 1) ||
124
+ raise(InvalidSanitization, 'Multiple root elements in unwrap_root')
125
+ node.children = node.children[0].children
126
+ end
127
+ end
128
+ end
129
+ end
92
130
  end
@@ -1,56 +1,82 @@
1
+ # frozen_string_literal: true
2
+
1
3
  class SiteDiff
2
- class Sanitizer
3
- class Regexp
4
- def initialize(rule)
5
- @rule = rule
6
- end
4
+ class Sanitizer
5
+ # Regular Expression Object.
6
+ class Regexp
7
+ ##
8
+ # Creates a RegExp object.
9
+ def initialize(rule)
10
+ @rule = rule
11
+ end
7
12
 
8
- def selector?
9
- false
10
- end
13
+ ##
14
+ # Whether the RegExp has a selector.
15
+ def selector?
16
+ false
17
+ end
11
18
 
12
- def applies?(html, node)
13
- applies_to_string?(html)
14
- end
19
+ ##
20
+ # Whether the RegExp applies to the given markup.
21
+ def applies?(html, _node)
22
+ applies_to_string?(html)
23
+ end
15
24
 
16
- def apply(html)
17
- gsub!(html)
18
- end
25
+ ##
26
+ # Applies the RegExp to the markup.
27
+ def apply(html)
28
+ gsub!(html)
29
+ end
19
30
 
20
- def self.create(rule)
21
- rule['selector'] ? WithSelector.new(rule) : new(rule)
22
- end
31
+ ##
32
+ # Creates a RegExp object as per rule.
33
+ def self.create(rule)
34
+ rule['selector'] ? WithSelector.new(rule) : new(rule)
35
+ end
23
36
 
24
- class WithSelector < Regexp
25
- def selector?
26
- true
27
- end
37
+ ##
38
+ # A RegExp with selector.
39
+ class WithSelector < Regexp
40
+ ##
41
+ # Whether the RegExp has a selector.
42
+ def selector?
43
+ true
44
+ end
28
45
 
29
- def contexts(node)
30
- sels = @rule['selector']
31
- node.css(sels).each { |e| yield(e) }
32
- end
46
+ ##
47
+ # Yields each element found under the given node by the rule's selector.
48
+ def contexts(node)
49
+ selectors = @rule['selector']
50
+ node.css(selectors).each { |e| yield(e) }
51
+ end
33
52
 
34
- def applies?(html, node)
35
- enum_for(:contexts, node).any? { |e| applies_to_string?(e.to_html) }
36
- end
53
+ ##
54
+ # Whether the RegExp applies to the given markup.
55
+ def applies?(_html, node)
56
+ enum_for(:contexts, node).any? { |e| applies_to_string?(e.to_html) }
57
+ end
37
58
 
38
- def apply(node)
39
- contexts(node) { |e| e.replace(gsub!(e.to_html)) }
40
- end
41
- end
59
+ ##
60
+ # Applies the RegExp to the markup.
61
+ def apply(node)
62
+ contexts(node) { |e| e.replace(gsub!(e.to_html)) }
63
+ end
64
+ end
42
65
 
43
- protected
44
- def gsub!(str)
45
- re = ::Regexp.new(@rule['pattern'])
46
- sub = @rule['substitute'] || ''
47
- str.gsub!(re, sub)
48
- str
49
- end
66
+ protected
50
67
 
51
- def applies_to_string?(str)
52
- gsub!(str.dup) != str
68
+ def gsub!(str)
69
+ re = ::Regexp.new(@rule['pattern'])
70
+ sub = @rule['substitute'] || ''
71
+ # Expecting a mutation here. Do not reassign the variable str
72
+ # for the purpose of removing UTF-8 encoding errors.
73
+ str.gsub!(re, sub)
74
+ str
75
+ end
76
+
77
+ def applies_to_string?(str)
78
+ gsub!(str.dup) != str
79
+ end
80
+ end
53
81
  end
54
82
  end
55
- end
56
- end
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  require 'sitediff/exception'
2
4
  require 'typhoeus'
3
5
  require 'addressable/uri'
@@ -5,63 +7,94 @@ require 'addressable/uri'
5
7
  class SiteDiff
6
8
  class SiteDiffReadFailure < SiteDiffException; end
7
9
 
10
+ # SiteDiff URI Wrapper.
8
11
  class UriWrapper
12
+ # TODO: Move these CURL OPTS to Config.DEFAULT_CONFIG.
13
+ DEFAULT_CURL_OPTS = {
14
+ # Don't hang on servers that don't exist.
15
+ connecttimeout: 3,
16
+ # Follow HTTP redirects (code 301 and 302).
17
+ followlocation: true,
18
+ headers: {
19
+ 'User-Agent' => 'Sitediff - https://github.com/evolvingweb/sitediff'
20
+ },
21
+ # always accept SSL certs
22
+ ssl_verifypeer: false,
23
+ ssl_verifyhost: 0
24
+ }.freeze
25
+
9
26
  # This lets us treat errors or content as one object
10
27
  class ReadResult
11
- attr_accessor :content, :error_code, :error
28
+ attr_accessor :encoding, :content, :error_code, :error
12
29
 
13
- def initialize(content = nil)
30
+ ##
31
+ # Creates a ReadResult.
32
+ def initialize(content = nil, encoding = 'utf-8')
14
33
  @content = content
34
+ @encoding = encoding
15
35
  @error = nil
16
36
  @error_code = nil
17
37
  end
18
38
 
19
- def self.error(err, code = nil)
39
+ ##
40
+ # Creates a ReadResult with an error.
41
+ def self.error(message, code = nil)
20
42
  res = new
21
43
  res.error_code = code
22
- res.error = err
23
- return res
44
+ res.error = message
45
+ res
24
46
  end
25
47
  end
26
48
 
27
- def initialize(uri)
49
+ ##
50
+ # Creates a UriWrapper.
51
+ def initialize(uri, curl_opts = DEFAULT_CURL_OPTS, debug = true)
28
52
  @uri = uri.respond_to?(:scheme) ? uri : Addressable::URI.parse(uri)
29
53
  # remove trailing '/'s from local URIs
30
- @uri.path.gsub!(/\/*$/, '') if local?
54
+ @uri.path.gsub!(%r{/*$}, '') if local?
55
+ @curl_opts = curl_opts
56
+ @debug = debug
31
57
  end
32
58
 
59
+ ##
60
+ # Returns the "user" part of the URI.
33
61
  def user
34
62
  @uri.user
35
63
  end
36
64
 
65
+ ##
66
+ # Returns the "password" part of the URI.
37
67
  def password
38
68
  @uri.password
39
69
  end
40
70
 
71
+ ##
72
+ # Converts the URI to a string.
41
73
  def to_s
42
74
  uri = @uri.dup
43
75
  uri.user = nil
44
76
  uri.password = nil
45
- return uri.to_s
77
+ uri.to_s
46
78
  end
47
79
 
80
+ ##
48
81
  # Is this a local filesystem path?
49
82
  def local?
50
- @uri.scheme == nil
83
+ @uri.scheme.nil?
51
84
  end
52
85
 
53
- # FIXME this is not used anymore
54
- def +(path)
86
+ ## Appends a path to this URI and returns a new wrapper for the result.
87
+ # FIXME: this is not used anymore
88
+ def +(other)
55
89
  # 'path' for SiteDiff includes (parts of) path, query, and fragment.
56
90
  sep = ''
57
- if local? || @uri.path.empty?
58
- sep = '/'
59
- end
60
- self.class.new(@uri.to_s + sep + path)
91
+ sep = '/' if local? || @uri.path.empty?
92
+ self.class.new(@uri.to_s + sep + other)
61
93
  end
62
94
 
95
+ ##
63
96
  # Reads a file and yields to the completion handler, see .queue()
64
- def read_file(&handler)
97
+ def read_file
65
98
  File.open(@uri.to_s, 'r:UTF-8') { |f| yield ReadResult.new(f.read) }
66
99
  rescue Errno::ENOENT, Errno::ENOTDIR, Errno::EACCES, Errno::EISDIR => e
67
100
  yield ReadResult.error(e.message)
@@ -69,10 +102,10 @@ class SiteDiff
69
102
 
70
103
  # Returns the encoding of an HTTP response from its headers, or nil if not
71
104
  # specified.
72
- def http_encoding(http_headers)
73
- if content_type = http_headers['Content-Type']
74
- if md = /;\s*charset=([-\w]*)/.match(content_type)
75
- return md[1]
105
+ def charset_encoding(http_headers)
106
+ if (content_type = http_headers['Content-Type'])
107
+ if (md = /;\s*charset=([-\w]*)/.match(content_type))
108
+ md[1]
76
109
  end
77
110
  end
78
111
  end
@@ -81,33 +114,58 @@ class SiteDiff
81
114
  #
82
115
  # Completion callbacks of the request wrap the given handler which is
83
116
  # assumed to accept a single ReadResult argument.
84
- def typhoeus_request(&handler)
85
- params = {
86
- :connecttimeout => 3, # Don't hang on servers that don't exist
87
- :followlocation => true, # Follow HTTP redirects (code 301 and 302)
88
- :headers => {
89
- "User-Agent" => "Sitediff - https://github.com/evolvingweb/sitediff"
90
- }
91
- }
117
+ def typhoeus_request
118
+ params = @curl_opts.dup
92
119
  # Allow basic auth
93
120
  params[:userpwd] = @uri.user + ':' + @uri.password if @uri.user
94
121
 
95
- req = Typhoeus::Request.new(self.to_s, params)
122
+ req = Typhoeus::Request.new(to_s, params)
96
123
 
97
124
  req.on_success do |resp|
98
125
  body = resp.body
99
126
  # Typhoeus does not respect HTTP headers when setting the encoding
100
127
  # of resp.body; coerce if possible.
101
- if encoding = http_encoding(resp.headers)
128
+ if (encoding = charset_encoding(resp.headers))
102
129
  body.force_encoding(encoding)
103
130
  end
104
- yield ReadResult.new(body)
131
+ # Should be wrapped with rescue I guess? Maybe this entire function?
132
+ # Should at least be an option in the Cli to disable this.
133
+ # "stop on first error"
134
+ begin
135
+ yield ReadResult.new(body, encoding)
136
+ rescue ArgumentError => e
137
+ raise if @debug
138
+
139
+ yield ReadResult.error(
140
+ "Parsing error for #{@uri}: #{e.message}"
141
+ )
142
+ rescue StandardError => e
143
+ raise if @debug
144
+
145
+ yield ReadResult.error(
146
+ "Unknown parsing error for #{@uri}: #{e.message}"
147
+ )
148
+ end
105
149
  end
106
150
 
107
151
  req.on_failure do |resp|
108
- msg = 'Unknown Error'
109
- msg = resp.status_message if resp and resp.status_message
110
- yield ReadResult.error("HTTP error #{@uri}: #{msg}", resp.response_code)
152
+ if resp&.status_message
153
+ msg = resp.status_message
154
+ yield ReadResult.error(
155
+ "HTTP error when loading #{@uri}: #{msg}",
156
+ resp.response_code
157
+ )
158
+ elsif (msg = resp.options[:return_code])
159
+ yield ReadResult.error(
160
+ "Connection error when loading #{@uri}: #{msg}",
161
+ resp.response_code
162
+ )
163
+ else
164
+ yield ReadResult.error(
165
+ "Unknown error when loading #{@uri}: #{msg}",
166
+ resp.response_code
167
+ )
168
+ end
111
169
  end
112
170
 
113
171
  req
@@ -126,5 +184,17 @@ class SiteDiff
126
184
  hydra.queue(typhoeus_request(&handler))
127
185
  end
128
186
  end
187
+
188
+ ##
189
+ # Canonicalize a path.
190
+ #
191
+ # @param [String] path
192
+ # A base relative path. Example: /foo/bar
193
+ def self.canonicalize(path)
194
+ # Ignore trailing slashes for all paths except "/" (front page).
195
+ path = path.chomp('/') unless path == '/'
196
+ # If the path is empty, assume that it's the front page.
197
+ path.empty? ? '/' : path
198
+ end
129
199
  end
130
200
  end