sitediff 0.0.2 → 1.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,92 +1,130 @@
1
+ # frozen_string_literal: true
2
+
1
3
  require 'sitediff/sanitize'
2
4
  require 'nokogiri'
3
5
 
4
6
  class SiteDiff
5
- class Sanitizer
7
+ class Sanitizer
8
+ # Currently supported transforms:
9
+ #
10
+ # * { :type => "unwrap_root" }
11
+ # * { :type => "unwrap", :selector => "div.field-item" }
12
+ # * { :type => "remove", :selector => "div.extra-stuff" }
13
+ # * { :type => "remove_class", :class => 'class1' }
14
+ # * { :type => "strip", :selector => 'h1' }
15
+ class DomTransform
16
+ # Supported dom_transform types.
17
+ TRANSFORMS = {}
6
18
 
7
- # Currently supported transforms:
8
- #
9
- # * { :type => "unwrap_root" }
10
- # * { :type => "unwrap", :selector => "div.field-item" }
11
- # * { :type => "remove", :selector => "div.extra-stuff" }
12
- # * { :type => "remove_class", :class => 'class1' }
13
- class DomTransform
19
+ ##
20
+ # Creates a DOM Transform.
21
+ def initialize(rule)
22
+ @rule = rule
23
+ end
14
24
 
15
- Transforms = {}
25
+ ##
26
+ # Often an array or scalar are both ok values. Turn either into an array.
27
+ def to_array(val)
28
+ [val].flatten
29
+ end
16
30
 
17
- def initialize(rule)
18
- @rule = rule
19
- end
31
+ ##
32
+ # TODO: Document what this method does.
33
+ def targets(node)
34
+ selectors = to_array(@rule['selector'])
35
+ selectors.each do |sel|
36
+ node.css(sel).each { |n| yield n }
37
+ end
38
+ end
20
39
 
21
- # Often an array or scalar are both ok values. Turn either into an array.
22
- def to_array(v)
23
- return [v].flatten
24
- end
40
+ ##
41
+ # Applies the transformation to a DOM node.
42
+ def apply(node)
43
+ targets(node) { |t| process(t) }
44
+ end
25
45
 
26
- def targets(node)
27
- selectors = to_array(@rule['selector'])
28
- selectors.each do |sel|
29
- node.css(sel).each { |n| yield n }
30
- end
31
- end
46
+ ##
47
+ # Registers a DOM Transform plugin.
48
+ def self.register(name)
49
+ TRANSFORMS[name] = self
50
+ end
32
51
 
33
- def apply(node)
34
- targets(node) { |t| process(t) }
35
- end
52
+ ##
53
+ # Creates a DOM Transform as per rule.
54
+ def self.create(rule)
55
+ (type = rule['type']) ||
56
+ raise(InvalidSanitization, 'DOM transform needs a type')
57
+ (transform = TRANSFORMS[type]) ||
58
+ raise(InvalidSanitization, "No DOM transform named #{type}")
59
+ transform.new(rule)
60
+ end
36
61
 
37
- def self.register(name)
38
- Transforms[name] = self
39
- end
62
+ ##
63
+ # Remove elements matching 'selector'.
64
+ class Remove < DomTransform
65
+ register 'remove'
40
66
 
41
- def self.create(rule)
42
- type = rule['type'] or
43
- raise InvalidSanitization, "DOM transform needs a type"
44
- transform = Transforms[type] or
45
- raise InvalidSanitization, "No DOM transform named #{type}"
46
- return transform.new(rule)
47
- end
67
+ ##
68
+ # Processes a node.
69
+ def process(node)
70
+ node.remove
71
+ end
72
+ end
48
73
 
49
- # Remove elements matching 'selector'
50
- class Remove < DomTransform
51
- register "remove"
52
- def process(node)
53
- node.remove
54
- end
55
- end
74
+ # Squeeze whitespace from a tag matching 'selector'.
75
+ class Strip < DomTransform
76
+ register 'strip'
56
77
 
57
- # Unwrap elements matching 'selector'
58
- class Unwrap < DomTransform
59
- register "unwrap"
60
- def process(node)
61
- node.add_next_sibling(node.children)
62
- node.remove
63
- end
64
- end
78
+ ##
79
+ # Processes a node.
80
+ def process(node)
81
+ node.content = node.content.strip
82
+ end
83
+ end
65
84
 
66
- # Remove classes from elements matching selector
67
- class RemoveClass < DomTransform
68
- register "remove_class"
69
- def process(node)
70
- classes = to_array(@rule['class'])
85
+ # Unwrap elements matching 'selector'.
86
+ class Unwrap < DomTransform
87
+ register 'unwrap'
71
88
 
72
- # Must call remove_class on a NodeSet!
73
- ns = Nokogiri::XML::NodeSet.new(node.document, [node])
74
- classes.each do |class_name|
75
- ns.remove_class(class_name)
76
- end
77
- end
78
- end
89
+ ##
90
+ # Processes a node.
91
+ def process(node)
92
+ node.add_next_sibling(node.children)
93
+ node.remove
94
+ end
95
+ end
79
96
 
80
- # Unwrap the root element
81
- class UnwrapRoot < DomTransform
82
- register "unwrap_root"
83
- def apply(node)
84
- node.children.size == 1 or
85
- raise InvalidSanitization, "Multiple root elements in unwrap_root"
86
- node.children = node.children[0].children
87
- end
88
- end
97
+ ##
98
+ # Remove classes from elements matching selector
99
+ class RemoveClass < DomTransform
100
+ register 'remove_class'
89
101
 
90
- end
91
- end
102
+ ##
103
+ # Processes a node.
104
+ def process(node)
105
+ classes = to_array(@rule['class'])
106
+
107
+ # Must call remove_class on a NodeSet!
108
+ ns = Nokogiri::XML::NodeSet.new(node.document, [node])
109
+ classes.each do |class_name|
110
+ ns.remove_class(class_name)
111
+ end
112
+ end
113
+ end
114
+
115
+ ##
116
+ # Unwrap the root element.
117
+ class UnwrapRoot < DomTransform
118
+ register 'unwrap_root'
119
+
120
+ ##
121
+ # Applies the transformation to a DOM node.
122
+ def apply(node)
123
+ (node.children.size == 1) ||
124
+ raise(InvalidSanitization, 'Multiple root elements in unwrap_root')
125
+ node.children = node.children[0].children
126
+ end
127
+ end
128
+ end
129
+ end
92
130
  end
@@ -1,56 +1,82 @@
1
+ # frozen_string_literal: true
2
+
1
3
  class SiteDiff
2
- class Sanitizer
3
- class Regexp
4
- def initialize(rule)
5
- @rule = rule
6
- end
4
+ class Sanitizer
5
+ # Regular Expression Object.
6
+ class Regexp
7
+ ##
8
+ # Creates a RegExp object.
9
+ def initialize(rule)
10
+ @rule = rule
11
+ end
7
12
 
8
- def selector?
9
- false
10
- end
13
+ ##
14
+ # Whether the RegExp has a selector.
15
+ def selector?
16
+ false
17
+ end
11
18
 
12
- def applies?(html, node)
13
- applies_to_string?(html)
14
- end
19
+ ##
20
+ # Whether the RegExp applies to the given markup.
21
+ def applies?(html, _node)
22
+ applies_to_string?(html)
23
+ end
15
24
 
16
- def apply(html)
17
- gsub!(html)
18
- end
25
+ ##
26
+ # Applies the RegExp to the markup.
27
+ def apply(html)
28
+ gsub!(html)
29
+ end
19
30
 
20
- def self.create(rule)
21
- rule['selector'] ? WithSelector.new(rule) : new(rule)
22
- end
31
+ ##
32
+ # Creates a RegExp object as per rule.
33
+ def self.create(rule)
34
+ rule['selector'] ? WithSelector.new(rule) : new(rule)
35
+ end
23
36
 
24
- class WithSelector < Regexp
25
- def selector?
26
- true
27
- end
37
+ ##
38
+ # A RegExp with selector.
39
+ class WithSelector < Regexp
40
+ ##
41
+ # Whether the RegExp has a selector.
42
+ def selector?
43
+ true
44
+ end
28
45
 
29
- def contexts(node)
30
- sels = @rule['selector']
31
- node.css(sels).each { |e| yield(e) }
32
- end
46
+ ##
47
+ # TODO: Document what this method does.
48
+ def contexts(node)
49
+ selectors = @rule['selector']
50
+ node.css(selectors).each { |e| yield(e) }
51
+ end
33
52
 
34
- def applies?(html, node)
35
- enum_for(:contexts, node).any? { |e| applies_to_string?(e.to_html) }
36
- end
53
+ ##
54
+ # Whether the RegExp applies to the given markup.
55
+ def applies?(_html, node)
56
+ enum_for(:contexts, node).any? { |e| applies_to_string?(e.to_html) }
57
+ end
37
58
 
38
- def apply(node)
39
- contexts(node) { |e| e.replace(gsub!(e.to_html)) }
40
- end
41
- end
59
+ ##
60
+ # Applies the RegExp to the markup.
61
+ def apply(node)
62
+ contexts(node) { |e| e.replace(gsub!(e.to_html)) }
63
+ end
64
+ end
42
65
 
43
- protected
44
- def gsub!(str)
45
- re = ::Regexp.new(@rule['pattern'])
46
- sub = @rule['substitute'] || ''
47
- str.gsub!(re, sub)
48
- str
49
- end
66
+ protected
50
67
 
51
- def applies_to_string?(str)
52
- gsub!(str.dup) != str
68
+ def gsub!(str)
69
+ re = ::Regexp.new(@rule['pattern'])
70
+ sub = @rule['substitute'] || ''
71
+ # Expecting a mutation here. Do not reassign the variable str
72
+ # for the purpose of removing UTF-8 encoding errors.
73
+ str.gsub!(re, sub)
74
+ str
75
+ end
76
+
77
+ def applies_to_string?(str)
78
+ gsub!(str.dup) != str
79
+ end
80
+ end
53
81
  end
54
82
  end
55
- end
56
- end
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  require 'sitediff/exception'
2
4
  require 'typhoeus'
3
5
  require 'addressable/uri'
@@ -5,63 +7,94 @@ require 'addressable/uri'
5
7
  class SiteDiff
6
8
  class SiteDiffReadFailure < SiteDiffException; end
7
9
 
10
+ # SiteDiff URI Wrapper.
8
11
  class UriWrapper
12
+ # TODO: Move these CURL OPTS to Config.DEFAULT_CONFIG.
13
+ DEFAULT_CURL_OPTS = {
14
+ # Don't hang on servers that don't exist.
15
+ connecttimeout: 3,
16
+ # Follow HTTP redirects (code 301 and 302).
17
+ followlocation: true,
18
+ headers: {
19
+ 'User-Agent' => 'Sitediff - https://github.com/evolvingweb/sitediff'
20
+ },
21
+ # always accept SSL certs
22
+ ssl_verifypeer: false,
23
+ ssl_verifyhost: 0
24
+ }.freeze
25
+
9
26
  # This lets us treat errors or content as one object
10
27
  class ReadResult
11
- attr_accessor :content, :error_code, :error
28
+ attr_accessor :encoding, :content, :error_code, :error
12
29
 
13
- def initialize(content = nil)
30
+ ##
31
+ # Creates a ReadResult.
32
+ def initialize(content = nil, encoding = 'utf-8')
14
33
  @content = content
34
+ @encoding = encoding
15
35
  @error = nil
16
36
  @error_code = nil
17
37
  end
18
38
 
19
- def self.error(err, code = nil)
39
+ ##
40
+ # Creates a ReadResult with an error.
41
+ def self.error(message, code = nil)
20
42
  res = new
21
43
  res.error_code = code
22
- res.error = err
23
- return res
44
+ res.error = message
45
+ res
24
46
  end
25
47
  end
26
48
 
27
- def initialize(uri)
49
+ ##
50
+ # Creates a UriWrapper.
51
+ def initialize(uri, curl_opts = DEFAULT_CURL_OPTS, debug = true)
28
52
  @uri = uri.respond_to?(:scheme) ? uri : Addressable::URI.parse(uri)
29
53
  # remove trailing '/'s from local URIs
30
- @uri.path.gsub!(/\/*$/, '') if local?
54
+ @uri.path.gsub!(%r{/*$}, '') if local?
55
+ @curl_opts = curl_opts
56
+ @debug = debug
31
57
  end
32
58
 
59
+ ##
60
+ # Returns the "user" part of the URI.
33
61
  def user
34
62
  @uri.user
35
63
  end
36
64
 
65
+ ##
66
+ # Returns the "password" part of the URI.
37
67
  def password
38
68
  @uri.password
39
69
  end
40
70
 
71
+ ##
72
+ # Converts the URI to a string.
41
73
  def to_s
42
74
  uri = @uri.dup
43
75
  uri.user = nil
44
76
  uri.password = nil
45
- return uri.to_s
77
+ uri.to_s
46
78
  end
47
79
 
80
+ ##
48
81
  # Is this a local filesystem path?
49
82
  def local?
50
- @uri.scheme == nil
83
+ @uri.scheme.nil?
51
84
  end
52
85
 
53
- # FIXME this is not used anymore
54
- def +(path)
86
+ ## What does this one do?
87
+ # FIXME: this is not used anymore
88
+ def +(other)
55
89
  # 'path' for SiteDiff includes (parts of) path, query, and fragment.
56
90
  sep = ''
57
- if local? || @uri.path.empty?
58
- sep = '/'
59
- end
60
- self.class.new(@uri.to_s + sep + path)
91
+ sep = '/' if local? || @uri.path.empty?
92
+ self.class.new(@uri.to_s + sep + other)
61
93
  end
62
94
 
95
+ ##
63
96
  # Reads a file and yields to the completion handler, see .queue()
64
- def read_file(&handler)
97
+ def read_file
65
98
  File.open(@uri.to_s, 'r:UTF-8') { |f| yield ReadResult.new(f.read) }
66
99
  rescue Errno::ENOENT, Errno::ENOTDIR, Errno::EACCES, Errno::EISDIR => e
67
100
  yield ReadResult.error(e.message)
@@ -69,10 +102,10 @@ class SiteDiff
69
102
 
70
103
  # Returns the encoding of an HTTP response from headers , nil if not
71
104
  # specified.
72
- def http_encoding(http_headers)
73
- if content_type = http_headers['Content-Type']
74
- if md = /;\s*charset=([-\w]*)/.match(content_type)
75
- return md[1]
105
+ def charset_encoding(http_headers)
106
+ if (content_type = http_headers['Content-Type'])
107
+ if (md = /;\s*charset=([-\w]*)/.match(content_type))
108
+ md[1]
76
109
  end
77
110
  end
78
111
  end
@@ -81,33 +114,58 @@ class SiteDiff
81
114
  #
82
115
  # Completion callbacks of the request wrap the given handler which is
83
116
  # assumed to accept a single ReadResult argument.
84
- def typhoeus_request(&handler)
85
- params = {
86
- :connecttimeout => 3, # Don't hang on servers that don't exist
87
- :followlocation => true, # Follow HTTP redirects (code 301 and 302)
88
- :headers => {
89
- "User-Agent" => "Sitediff - https://github.com/evolvingweb/sitediff"
90
- }
91
- }
117
+ def typhoeus_request
118
+ params = @curl_opts.dup
92
119
  # Allow basic auth
93
120
  params[:userpwd] = @uri.user + ':' + @uri.password if @uri.user
94
121
 
95
- req = Typhoeus::Request.new(self.to_s, params)
122
+ req = Typhoeus::Request.new(to_s, params)
96
123
 
97
124
  req.on_success do |resp|
98
125
  body = resp.body
99
126
  # Typhoeus does not respect HTTP headers when setting the encoding
100
127
  # resp.body; coerce if possible.
101
- if encoding = http_encoding(resp.headers)
128
+ if (encoding = charset_encoding(resp.headers))
102
129
  body.force_encoding(encoding)
103
130
  end
104
- yield ReadResult.new(body)
131
+ # Should be wrapped with rescue I guess? Maybe this entire function?
132
+ # Should at least be an option in the Cli to disable this.
133
+ # "stop on first error"
134
+ begin
135
+ yield ReadResult.new(body, encoding)
136
+ rescue ArgumentError => e
137
+ raise if @debug
138
+
139
+ yield ReadResult.error(
140
+ "Parsing error for #{@uri}: #{e.message}"
141
+ )
142
+ rescue StandardError => e
143
+ raise if @debug
144
+
145
+ yield ReadResult.error(
146
+ "Unknown parsing error for #{@uri}: #{e.message}"
147
+ )
148
+ end
105
149
  end
106
150
 
107
151
  req.on_failure do |resp|
108
- msg = 'Unknown Error'
109
- msg = resp.status_message if resp and resp.status_message
110
- yield ReadResult.error("HTTP error #{@uri}: #{msg}", resp.response_code)
152
+ if resp&.status_message
153
+ msg = resp.status_message
154
+ yield ReadResult.error(
155
+ "HTTP error when loading #{@uri}: #{msg}",
156
+ resp.response_code
157
+ )
158
+ elsif (msg = resp.options[:return_code])
159
+ yield ReadResult.error(
160
+ "Connection error when loading #{@uri}: #{msg}",
161
+ resp.response_code
162
+ )
163
+ else
164
+ yield ReadResult.error(
165
+ "Unknown error when loading #{@uri}: #{msg}",
166
+ resp.response_code
167
+ )
168
+ end
111
169
  end
112
170
 
113
171
  req
@@ -126,5 +184,17 @@ class SiteDiff
126
184
  hydra.queue(typhoeus_request(&handler))
127
185
  end
128
186
  end
187
+
188
+ ##
189
+ # Canonicalize a path.
190
+ #
191
+ # @param [String] path
192
+ # A base relative path. Example: /foo/bar
193
+ def self.canonicalize(path)
194
+ # Ignore trailing slashes for all paths except "/" (front page).
195
+ path = path.chomp('/') unless path == '/'
196
+ # If the path is empty, assume that it's the front page.
197
+ path.empty? ? '/' : path
198
+ end
129
199
  end
130
200
  end