sitediff 0.0.1 → 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,130 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'sitediff/sanitize'
4
+ require 'nokogiri'
5
+
6
+ class SiteDiff
7
+ class Sanitizer
8
# Structural sanitization rules applied to a parsed DOM.
#
# A rule is a hash read through String keys ('type', 'selector', 'class').
# 'selector' and 'class' accept either a single value or an array.
#
# Currently supported transforms:
#
#  * { 'type' => 'unwrap_root' }
#  * { 'type' => 'unwrap', 'selector' => 'div.field-item' }
#  * { 'type' => 'remove', 'selector' => 'div.extra-stuff' }
#  * { 'type' => 'remove_class', 'selector' => 'div', 'class' => 'class1' }
#  * { 'type' => 'strip', 'selector' => 'h1' }
class DomTransform
  # Registry of supported dom_transform types: type name => transform class.
  # Left unfrozen on purpose; subclasses add themselves through .register.
  TRANSFORMS = {}

  ##
  # Creates a DOM Transform for the given rule hash.
  def initialize(rule)
    @rule = rule
  end

  ##
  # Often an array or scalar are both ok values. Turn either into a flat
  # array.
  def to_array(val)
    [val].flatten
  end

  ##
  # Yields every node returned by node.css(selector), for each selector
  # listed under the rule's 'selector' key, in selector order.
  def targets(node)
    to_array(@rule['selector']).each do |sel|
      node.css(sel).each { |n| yield n }
    end
  end

  ##
  # Applies the transformation to a DOM node by calling #process (defined by
  # each subclass) on every target of the rule.
  def apply(node)
    targets(node) { |t| process(t) }
  end

  ##
  # Registers the receiving class as the DOM Transform plugin for +name+.
  def self.register(name)
    TRANSFORMS[name] = self
  end

  ##
  # Creates a DOM Transform as per rule.
  #
  # Raises InvalidSanitization when the rule has no 'type' or when no
  # transform is registered under that type.
  def self.create(rule)
    type = rule['type']
    raise InvalidSanitization, 'DOM transform needs a type' unless type

    transform = TRANSFORMS[type]
    raise InvalidSanitization, "No DOM transform named #{type}" unless transform

    transform.new(rule)
  end

  ##
  # Remove elements matching 'selector'.
  class Remove < DomTransform
    register 'remove'

    ##
    # Removes the node.
    def process(node)
      node.remove
    end
  end

  ##
  # Strip leading and trailing whitespace from the content of elements
  # matching 'selector'.
  class Strip < DomTransform
    register 'strip'

    ##
    # Re-assigns the node's content to its stripped form.
    def process(node)
      node.content = node.content.strip
    end
  end

  ##
  # Unwrap elements matching 'selector'.
  class Unwrap < DomTransform
    register 'unwrap'

    ##
    # Inserts the node's children right after the node, then removes the
    # node itself.
    def process(node)
      node.add_next_sibling(node.children)
      node.remove
    end
  end

  ##
  # Remove classes from elements matching selector
  class RemoveClass < DomTransform
    register 'remove_class'

    ##
    # Removes every class listed under the rule's 'class' key from the node.
    def process(node)
      classes = to_array(@rule['class'])

      # Must call remove_class on a NodeSet!
      ns = Nokogiri::XML::NodeSet.new(node.document, [node])
      classes.each do |class_name|
        ns.remove_class(class_name)
      end
    end
  end

  ##
  # Unwrap the root element.
  class UnwrapRoot < DomTransform
    register 'unwrap_root'

    ##
    # Replaces the children of +node+ with the children of its only child.
    #
    # Ignores any 'selector'. Raises InvalidSanitization unless +node+ has
    # exactly one child; the message distinguishes "none" from "several".
    def apply(node)
      roots = node.children
      unless roots.size == 1
        reason = roots.empty? ? 'No root element' : 'Multiple root elements'
        raise InvalidSanitization, "#{reason} in unwrap_root"
      end
      node.children = roots[0].children
    end
  end
end
129
+ end
130
+ end
@@ -0,0 +1,82 @@
1
+ # frozen_string_literal: true
2
+
3
class SiteDiff
  class Sanitizer
    # A regular-expression sanitization rule.
    #
    # The rule hash is read through the String keys 'pattern', 'substitute'
    # (optional; an empty string is used when it is missing) and 'selector'
    # (optional; its presence selects the WithSelector variant).
    class Regexp
      ##
      # Creates a RegExp object as per rule: a WithSelector when the rule has
      # a 'selector', otherwise a plain instance of the receiver.
      def self.create(rule)
        return WithSelector.new(rule) if rule['selector']

        new(rule)
      end

      ##
      # Creates a RegExp object wrapping the given rule hash.
      def initialize(rule)
        @rule = rule
      end

      ##
      # Whether the RegExp has a selector. Always false for the plain variant.
      def selector?
        false
      end

      ##
      # Whether the RegExp applies to the given markup, i.e. whether running
      # the substitution on a copy of +html+ would change it.
      def applies?(html, _node)
        applies_to_string?(html)
      end

      ##
      # Applies the RegExp to the markup. Mutates +html+ in place and returns
      # it.
      def apply(html)
        gsub!(html)
      end

      ##
      # A RegExp restricted to the elements matched by the rule's 'selector'.
      class WithSelector < Regexp
        ##
        # Whether the RegExp has a selector. Always true for this variant.
        def selector?
          true
        end

        ##
        # Yields each element returned by node.css for the rule's 'selector'.
        def contexts(node)
          node.css(@rule['selector']).each do |element|
            yield element
          end
        end

        ##
        # Whether the substitution would change the HTML of at least one
        # selected element.
        def applies?(_html, node)
          enum_for(:contexts, node).any? do |element|
            applies_to_string?(element.to_html)
          end
        end

        ##
        # Replaces each selected element with its substituted HTML.
        def apply(node)
          contexts(node) do |element|
            element.replace(gsub!(element.to_html))
          end
        end
      end

      protected

      # Runs the rule's substitution on +str+ and returns +str+ itself.
      def gsub!(str)
        pattern = ::Regexp.new(@rule['pattern'])
        replacement = @rule['substitute'] || ''
        # Expecting a mutation here. Do not reassign the variable str
        # for the purpose of removing UTF-8 encoding errors.
        str.gsub!(pattern, replacement)
        str
      end

      # True when the substitution changes a copy of +str+; +str+ itself is
      # left untouched.
      def applies_to_string?(str)
        rewritten = gsub!(str.dup)
        rewritten != str
      end
    end
  end
end
@@ -1,55 +1,97 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'sitediff/exception'
1
4
  require 'typhoeus'
5
+ require 'addressable/uri'
2
6
 
3
7
  class SiteDiff
4
- class SiteDiffReadFailure < Exception; end
8
+ class SiteDiffReadFailure < SiteDiffException; end
5
9
 
10
+ # SiteDiff URI Wrapper.
6
11
  class UriWrapper
12
+ # TODO: Move these CURL OPTS to Config.DEFAULT_CONFIG.
13
+ DEFAULT_CURL_OPTS = {
14
+ # Don't hang on servers that don't exist.
15
+ connecttimeout: 3,
16
+ # Follow HTTP redirects (code 301 and 302).
17
+ followlocation: true,
18
+ headers: {
19
+ 'User-Agent' => 'Sitediff - https://github.com/evolvingweb/sitediff'
20
+ }
21
+ }.freeze
22
+
7
23
  # This lets us treat errors or content as one object
8
- class ReadResult < Struct.new(:content, :error)
9
- def initialize(cont, err = nil)
10
- super(cont, err)
24
class ReadResult
  attr_accessor :encoding, :content, :error_code, :error

  ##
  # Creates a successful ReadResult holding +content+ and its +encoding+;
  # both error fields start out as nil.
  def initialize(content = nil, encoding = 'utf-8')
    @content, @encoding = content, encoding
    @error = @error_code = nil
  end

  ##
  # Creates a ReadResult with an error: no content, the default encoding,
  # and the given message and optional code.
  def self.error(message, code = nil)
    new.tap do |result|
      result.error = message
      result.error_code = code
    end
  end
end
14
45
 
15
- def initialize(uri)
16
- @uri = uri.respond_to?(:scheme) ? uri : URI.parse(uri)
46
+ ##
47
+ # Creates a UriWrapper.
48
def initialize(uri, curl_opts = DEFAULT_CURL_OPTS, debug = true)
  @curl_opts = curl_opts
  @debug = debug
  # Anything that already responds to #scheme is used as-is; everything else
  # is parsed as a URI string.
  @uri = if uri.respond_to?(:scheme)
           uri
         else
           Addressable::URI.parse(uri)
         end
  return unless local?

  # remove trailing '/'s from local URIs (mutates the path string in place)
  @uri.path.gsub!(%r{/*$}, '')
end
20
55
 
56
+ ##
57
+ # Returns the "user" part of the URI.
21
58
  def user
22
59
  @uri.user
23
60
  end
24
61
 
62
+ ##
63
+ # Returns the "password" part of the URI.
25
64
  def password
26
65
  @uri.password
27
66
  end
28
67
 
68
+ ##
69
+ # Converts the URI to a string.
29
70
  def to_s
30
71
  uri = @uri.dup
31
72
  uri.user = nil
32
73
  uri.password = nil
33
- return uri.to_s
74
+ uri.to_s
34
75
  end
35
76
 
77
+ ##
36
78
  # Is this a local filesystem path?
37
79
def local?
  # A URI without any scheme is treated as a filesystem path.
  scheme = @uri.scheme
  scheme.nil?
end
40
82
 
41
- # FIXME this is not used anymore
42
- def +(path)
83
+ ## What does this one do?
84
+ # FIXME: this is not used anymore
85
def +(other)
  # 'other' is a SiteDiff path: it may include (parts of) path, query, and
  # fragment, so it is appended as text to the string form of the wrapped URI.
  # A '/' is inserted for local URIs and for URIs whose path is empty.
  needs_slash = local? || @uri.path.empty?
  separator = needs_slash ? '/' : ''
  self.class.new(@uri.to_s + separator + other)
end
50
91
 
92
+ ##
51
93
  # Reads a file and yields to the completion handler, see .queue()
52
- def read_file(&handler)
94
+ def read_file
53
95
  File.open(@uri.to_s, 'r:UTF-8') { |f| yield ReadResult.new(f.read) }
54
96
  rescue Errno::ENOENT, Errno::ENOTDIR, Errno::EACCES, Errno::EISDIR => e
55
97
  yield ReadResult.error(e.message)
@@ -57,10 +99,10 @@ class SiteDiff
57
99
 
58
100
  # Returns the encoding of an HTTP response from headers , nil if not
59
101
  # specified.
60
- def http_encoding(http_headers)
61
- if content_type = http_headers['Content-Type']
62
- if md = /;\s*charset=([-\w]*)/.match(content_type)
63
- return md[1]
102
##
# Extracts the charset parameter from the 'Content-Type' entry of
# +http_headers+.
#
# The parameter name is matched case-insensitively and the value may be
# wrapped in double quotes. Returns the charset name as a String, or nil
# when there is no Content-Type, no charset parameter, or the charset value
# is empty (an empty name would make String#force_encoding raise).
def charset_encoding(http_headers)
  content_type = http_headers['Content-Type']
  return unless content_type

  content_type[/;\s*charset="?([-\w]+)/i, 1]
end
@@ -69,33 +111,58 @@ class SiteDiff
69
111
  #
70
112
  # Completion callbacks of the request wrap the given handler which is
71
113
  # assumed to accept a single ReadResult argument.
72
- def typhoeus_request(&handler)
73
- params = {
74
- :connecttimeout => 3, # Don't hang on servers that don't exist
75
- :followlocation => true, # Follow HTTP redirects (code 301 and 302)
76
- :headers => {
77
- "User-Agent" => "Sitediff - https://github.com/evolvingweb/sitediff"
78
- }
79
- }
114
+ def typhoeus_request
115
+ params = @curl_opts.dup
80
116
  # Allow basic auth
81
117
  params[:userpwd] = @uri.user + ':' + @uri.password if @uri.user
82
118
 
83
- req = Typhoeus::Request.new(self.to_s, params)
119
+ req = Typhoeus::Request.new(to_s, params)
84
120
 
85
121
  req.on_success do |resp|
86
122
  body = resp.body
87
123
  # Typhoeus does not respect HTTP headers when setting the encoding
88
124
  # resp.body; coerce if possible.
89
- if encoding = http_encoding(resp.headers)
125
+ if (encoding = charset_encoding(resp.headers))
90
126
  body.force_encoding(encoding)
91
127
  end
92
- yield ReadResult.new(body)
128
+ # Should be wrapped with rescue I guess? Maybe this entire function?
129
+ # Should at least be an option in the Cli to disable this.
130
+ # "stop on first error"
131
+ begin
132
+ yield ReadResult.new(body, encoding)
133
+ rescue ArgumentError => e
134
+ raise if @debug
135
+
136
+ yield ReadResult.error(
137
+ "Parsing error for #{@uri}: #{e.message}"
138
+ )
139
+ rescue StandardError => e
140
+ raise if @debug
141
+
142
+ yield ReadResult.error(
143
+ "Unknown parsing error for #{@uri}: #{e.message}"
144
+ )
145
+ end
93
146
  end
94
147
 
95
148
  req.on_failure do |resp|
96
- msg = 'Unknown Error'
97
- msg = resp.status_message if resp and resp.status_message
98
- yield ReadResult.error("HTTP error #{@uri}: #{msg}")
149
+ if resp&.status_message
150
+ msg = resp.status_message
151
+ yield ReadResult.error(
152
+ "HTTP error when loading #{@uri}: #{msg}",
153
+ resp.response_code
154
+ )
155
+ elsif (msg = resp.options[:return_code])
156
+ yield ReadResult.error(
157
+ "Connection error when loading #{@uri}: #{msg}",
158
+ resp.response_code
159
+ )
160
+ else
161
+ yield ReadResult.error(
162
+ "Unknown error when loading #{@uri}: #{msg}",
163
+ resp.response_code
164
+ )
165
+ end
99
166
  end
100
167
 
101
168
  req
@@ -114,5 +181,17 @@ class SiteDiff
114
181
  hydra.queue(typhoeus_request(&handler))
115
182
  end
116
183
  end
184
+
185
+ ##
186
+ # Canonicalize a path.
187
+ #
188
+ # @param [String] path
189
+ # A base relative path. Example: /foo/bar
190
def self.canonicalize(path)
  # The front page is already canonical.
  return path if path == '/'

  # Drop one trailing slash; an empty result is taken to mean the front page.
  trimmed = path.chomp('/')
  return '/' if trimmed.empty?

  trimmed
end
117
196
  end
118
197
  end
@@ -0,0 +1,94 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'webrick'
4
+
5
+ class SiteDiff
6
# SiteDiff Web Server.
#
# Simple web server for testing purposes: serves each directory of a list on
# its own port (consecutive ports from a start port), one thread per server.
class Webserver
  # Port of the first server when no start port is given.
  DEFAULT_PORT = 13_080

  attr_accessor :ports

  ##
  # Serve a list of directories.
  #
  # start_port:: first port to use; nil means DEFAULT_PORT.
  # dirs::       directories to serve, one server per entry.
  # opts::       options hash; a truthy :quiet silences WEBrick logging.
  #
  # With a block, yields the server to it and kills the server threads when
  # the block finishes, even if the block raises.
  def initialize(start_port, dirs, opts = {})
    start_port ||= DEFAULT_PORT
    @ports = (start_port...(start_port + dirs.size)).to_a
    @dirs = dirs
    @opts = opts

    setup
    start_servers

    return unless block_given?

    begin
      yield self
    ensure
      # Do not leave server threads running when the block fails.
      kill
    end
  end

  ##
  # Kills the server threads.
  def kill
    @threads.each(&:kill)
  end

  ##
  # Waits for the server threads to finish.
  def wait
    @threads.each(&:join)
  end

  ##
  # Returns one "http://localhost:<port>" URI per served directory, in the
  # same order as the ports.
  def uris
    ports.map { |p| "http://localhost:#{p}" }
  end

  protected

  # Builds the options shared by every server.
  def setup
    @server_opts = {}
    return unless @opts[:quiet]

    @server_opts[:Logger] = WEBrick::Log.new(IO::NULL)
    @server_opts[:AccessLog] = []
  end

  # Builds one server from its options hash.
  def server(opts)
    WEBrick::HTTPServer.new(opts)
  end

  # Starts one server thread per directory. Each server gets its own options
  # hash (shared options plus its :Port and :DocumentRoot) so no two servers
  # ever see the same mutable hash.
  def start_servers
    @threads = []
    @dirs.each_with_index do |dir, idx|
      srv = server(@server_opts.merge(Port: @ports[idx], DocumentRoot: dir))
      @threads << Thread.new { srv.start }
    end
  end

  public

  # SiteDiff Fixture Server: quietly serves the fixture directories
  # base/name, one per entry of names.
  class FixtureServer < Webserver
    PORT = DEFAULT_PORT + 1
    BASE = 'spec/sites/ruby-doc.org'
    NAMES = %w[core-1.9.3 core-2.0].freeze

    def initialize(port = PORT, base = BASE, names = NAMES)
      dirs = names.map { |n| File.join(base, n) }
      super(port, dirs, quiet: true)
    end

    # URI of the first served directory.
    def before
      uris.first
    end

    # URI of the last served directory.
    def after
      uris.last
    end
  end
end
94
+ end