sitediff 0.0.1 → 1.0.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,130 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'sitediff/sanitize'
4
+ require 'nokogiri'
5
+
6
class SiteDiff
  class Sanitizer
    # Base class for DOM transformations driven by a sanitization rule.
    #
    # A rule is a Hash that is read through *string* keys. Currently
    # supported transforms:
    #
    #  * { 'type' => 'unwrap_root' }
    #  * { 'type' => 'unwrap', 'selector' => 'div.field-item' }
    #  * { 'type' => 'remove', 'selector' => 'div.extra-stuff' }
    #  * { 'type' => 'remove_class', 'selector' => 'div', 'class' => 'class1' }
    #  * { 'type' => 'strip', 'selector' => 'h1' }
    #
    # 'selector' and 'class' accept either a single value or an array.
    class DomTransform
      # Registry of supported dom_transform types: rule 'type' => class.
      # Filled in by .register, so it has to stay mutable.
      TRANSFORMS = {}

      ##
      # Creates a DOM Transform.
      #
      # @param rule [Hash] the rule; only string keys are read.
      def initialize(rule)
        @rule = rule
      end

      ##
      # Often an array or scalar are both ok values. Turn either into a
      # flat array. Note that nil becomes [nil], not [].
      def to_array(val)
        [val].flatten
      end

      ##
      # Yields every node found by calling node.css with each of the rule's
      # 'selector' values, in selector order.
      #
      # Without a block an Enumerator over the same nodes is returned
      # (previously this raised LocalJumpError).
      def targets(node)
        return enum_for(:targets, node) unless block_given?

        to_array(@rule['selector']).each do |sel|
          node.css(sel).each { |n| yield n }
        end
      end

      ##
      # Applies the transformation to a DOM node by calling #process on
      # each target. Subclasses provide #process (or override #apply).
      def apply(node)
        targets(node) { |t| process(t) }
      end

      ##
      # Registers a DOM Transform plugin. Called from a subclass body, so
      # the receiver is the subclass being registered under +name+.
      def self.register(name)
        TRANSFORMS[name] = self
      end

      ##
      # Creates a DOM Transform as per rule.
      #
      # Raises InvalidSanitization when the rule has no 'type' or the type
      # is not registered.
      def self.create(rule)
        type = rule['type']
        raise InvalidSanitization, 'DOM transform needs a type' unless type

        transform = TRANSFORMS[type]
        raise InvalidSanitization, "No DOM transform named #{type}" unless transform

        transform.new(rule)
      end

      ##
      # Remove elements matching 'selector'.
      class Remove < DomTransform
        register 'remove'

        ##
        # Processes a node.
        def process(node)
          node.remove
        end
      end

      ##
      # Strip leading and trailing whitespace from the content of elements
      # matching 'selector'.
      class Strip < DomTransform
        register 'strip'

        ##
        # Processes a node: the stripped text is assigned back through
        # content=.
        # NOTE(review): in Nokogiri, content= replaces the node's children
        # with a single text node - confirm that dropping nested markup is
        # intended.
        def process(node)
          node.content = node.content.strip
        end
      end

      ##
      # Unwrap elements matching 'selector'.
      class Unwrap < DomTransform
        register 'unwrap'

        ##
        # Processes a node: its children are re-attached right after it,
        # then the node itself is removed.
        def process(node)
          node.add_next_sibling(node.children)
          node.remove
        end
      end

      ##
      # Remove classes from elements matching selector
      class RemoveClass < DomTransform
        register 'remove_class'

        ##
        # Processes a node: removes every class listed under the rule's
        # 'class' key.
        def process(node)
          # Must call remove_class on a NodeSet!
          ns = Nokogiri::XML::NodeSet.new(node.document, [node])
          to_array(@rule['class']).each do |class_name|
            ns.remove_class(class_name)
          end
        end
      end

      ##
      # Unwrap the root element.
      class UnwrapRoot < DomTransform
        register 'unwrap_root'

        ##
        # Applies the transformation to a DOM node: the node's single child
        # is replaced by that child's own children. No 'selector' is used.
        #
        # Raises InvalidSanitization unless the node has exactly one child.
        def apply(node)
          (node.children.size == 1) ||
            raise(InvalidSanitization, 'Multiple root elements in unwrap_root')
          node.children = node.children[0].children
        end
      end
    end
  end
end
@@ -0,0 +1,82 @@
1
+ # frozen_string_literal: true
2
+
3
class SiteDiff
  class Sanitizer
    # A regular-expression sanitization rule.
    #
    # The rule is a Hash read through string keys:
    #  * 'pattern'    - source of the regular expression (required)
    #  * 'substitute' - replacement text; defaults to '' (i.e. delete)
    #  * 'selector'   - optional; when present, .create builds a
    #                   WithSelector that only rewrites matching elements
    class Regexp
      ##
      # Creates a RegExp object.
      def initialize(rule)
        @rule = rule
      end

      ##
      # Whether the RegExp has a selector.
      def selector?
        false
      end

      ##
      # Whether the RegExp applies to the given markup, i.e. whether the
      # substitution would change +html+. +html+ itself is not modified.
      def applies?(html, _node)
        applies_to_string?(html)
      end

      ##
      # Applies the RegExp to the markup. +html+ is modified in place and
      # returned.
      def apply(html)
        gsub!(html)
      end

      ##
      # Creates a RegExp object as per rule.
      def self.create(rule)
        rule['selector'] ? WithSelector.new(rule) : new(rule)
      end

      ##
      # A RegExp with selector.
      class WithSelector < Regexp
        ##
        # Whether the RegExp has a selector.
        def selector?
          true
        end

        ##
        # Yields each element returned by node.css for the rule's
        # 'selector'. Without a block, returns an Enumerator over them.
        def contexts(node)
          return enum_for(:contexts, node) unless block_given?

          node.css(@rule['selector']).each { |e| yield(e) }
        end

        ##
        # Whether the RegExp applies to the given markup: true when the
        # substitution would change the HTML of at least one selected
        # element.
        def applies?(_html, node)
          contexts(node).any? { |e| applies_to_string?(e.to_html) }
        end

        ##
        # Applies the RegExp to the markup: every selected element is
        # replaced by its own HTML after substitution.
        def apply(node)
          contexts(node) { |e| e.replace(gsub!(e.to_html)) }
        end
      end

      protected

      # The compiled 'pattern'. Memoized so that it is not recompiled for
      # every string / selected element.
      def compiled_pattern
        @compiled_pattern ||= ::Regexp.new(@rule['pattern'])
      end

      def gsub!(str)
        sub = @rule['substitute'] || ''
        # Expecting a mutation here. Do not reassign the variable str
        # for the purpose of removing UTF-8 encoding errors.
        str.gsub!(compiled_pattern, sub)
        str
      end

      # Runs the substitution on a copy and reports whether it changed.
      def applies_to_string?(str)
        gsub!(str.dup) != str
      end
    end
  end
end
@@ -1,55 +1,97 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'sitediff/exception'
1
4
  require 'typhoeus'
5
+ require 'addressable/uri'
2
6
 
3
7
  class SiteDiff
4
- class SiteDiffReadFailure < Exception; end
8
+ class SiteDiffReadFailure < SiteDiffException; end
5
9
 
10
+ # SiteDiff URI Wrapper.
6
11
  class UriWrapper
12
+ # TODO: Move these CURL OPTS to Config.DEFAULT_CONFIG.
13
+ DEFAULT_CURL_OPTS = {
14
+ # Don't hang on servers that don't exist.
15
+ connecttimeout: 3,
16
+ # Follow HTTP redirects (code 301 and 302).
17
+ followlocation: true,
18
+ headers: {
19
+ 'User-Agent' => 'Sitediff - https://github.com/evolvingweb/sitediff'
20
+ }
21
+ }.freeze
22
+
7
23
  # This lets us treat errors or content as one object
8
# Outcome of a read: either content (with its encoding) or an error
# (message plus optional code), carried by the same kind of object.
class ReadResult
  attr_accessor :encoding, :content, :error_code, :error

  ##
  # Creates a ReadResult holding +content+; both error fields start nil.
  def initialize(content = nil, encoding = 'utf-8')
    @content, @encoding = content, encoding
    @error = @error_code = nil
  end

  ##
  # Creates a ReadResult with an error: no content, default encoding.
  def self.error(message, code = nil)
    new.tap do |failure|
      failure.error_code = code
      failure.error = message
    end
  end
end
14
45
 
15
# Wraps +uri+. An argument that responds to :scheme is stored as-is;
# anything else is parsed (URI.parse before this change,
# Addressable::URI.parse after it).
# curl_opts and debug are only stored here, in @curl_opts and @debug.
# typhoeus_request re-raises errors raised while handling a successful
# response when @debug is truthy, instead of yielding an error ReadResult.
- def initialize(uri)
16
- @uri = uri.respond_to?(:scheme) ? uri : URI.parse(uri)
46
+ ##
47
+ # Creates a UriWrapper.
48
+ def initialize(uri, curl_opts = DEFAULT_CURL_OPTS, debug = true)
49
+ @uri = uri.respond_to?(:scheme) ? uri : Addressable::URI.parse(uri)
17
50
  # remove trailing '/'s from local URIs
# NOTE(review): gsub! edits the path string in place, so a URI object
# passed in by the caller is modified too - confirm this is acceptable.
18
- @uri.path.gsub!(/\/*$/, '') if local?
51
+ @uri.path.gsub!(%r{/*$}, '') if local?
52
+ @curl_opts = curl_opts
53
+ @debug = debug
19
54
  end
20
55
 
56
+ ##
57
+ # Returns the "user" part of the URI.
21
58
  def user
22
59
  @uri.user
23
60
  end
24
61
 
62
+ ##
63
+ # Returns the "password" part of the URI.
25
64
  def password
26
65
  @uri.password
27
66
  end
28
67
 
68
+ ##
69
+ # Converts the URI to a string.
29
70
  def to_s
30
71
  uri = @uri.dup
31
72
  uri.user = nil
32
73
  uri.password = nil
33
- return uri.to_s
74
+ uri.to_s
34
75
  end
35
76
 
77
+ ##
36
78
  # Is this a local filesystem path?
37
79
  def local?
38
- @uri.scheme == nil
80
+ @uri.scheme.nil?
39
81
  end
40
82
 
41
##
# Builds a new wrapper of the same class from this URI's string form with
# +other+ appended. A '/' is inserted between the two when the URI is
# local or has an empty path.
# FIXME: this is not used anymore
def +(other)
  # 'path' for SiteDiff includes (parts of) path, query, and fragment.
  needs_slash = local? || @uri.path.empty?
  self.class.new(@uri.to_s + (needs_slash ? '/' : '') + other)
end
50
91
 
92
+ ##
51
93
  # Reads a file and yields to the completion handler, see .queue()
52
- def read_file(&handler)
94
+ def read_file
53
95
  File.open(@uri.to_s, 'r:UTF-8') { |f| yield ReadResult.new(f.read) }
54
96
  rescue Errno::ENOENT, Errno::ENOTDIR, Errno::EACCES, Errno::EISDIR => e
55
97
  yield ReadResult.error(e.message)
@@ -57,10 +99,10 @@ class SiteDiff
57
99
 
58
100
  # Returns the encoding of an HTTP response from headers , nil if not
59
101
  # specified.
60
def charset_encoding(http_headers)
  content_type = http_headers['Content-Type']
  return unless content_type

  # The parameter name is case-insensitive and its value may be quoted
  # (charset="utf-8"). At least one character is required, so an empty or
  # unparsable charset yields nil instead of "" - callers pass the result
  # to String#force_encoding, which raises on an empty name.
  match = /;\s*charset="?([-\w]+)/i.match(content_type)
  match && match[1]
end
@@ -69,33 +111,58 @@ class SiteDiff
69
111
  #
70
112
  # Completion callbacks of the request wrap the given handler which is
71
113
  # assumed to accept a single ReadResult argument.
72
- def typhoeus_request(&handler)
73
- params = {
74
- :connecttimeout => 3, # Don't hang on servers that don't exist
75
- :followlocation => true, # Follow HTTP redirects (code 301 and 302)
76
- :headers => {
77
- "User-Agent" => "Sitediff - https://github.com/evolvingweb/sitediff"
78
- }
79
- }
114
+ def typhoeus_request
115
+ params = @curl_opts.dup
80
116
  # Allow basic auth
81
117
  params[:userpwd] = @uri.user + ':' + @uri.password if @uri.user
82
118
 
83
- req = Typhoeus::Request.new(self.to_s, params)
119
+ req = Typhoeus::Request.new(to_s, params)
84
120
 
85
121
  req.on_success do |resp|
86
122
  body = resp.body
87
123
  # Typhoeus does not respect HTTP headers when setting the encoding
88
124
  # resp.body; coerce if possible.
89
- if encoding = http_encoding(resp.headers)
125
+ if (encoding = charset_encoding(resp.headers))
90
126
  body.force_encoding(encoding)
91
127
  end
92
- yield ReadResult.new(body)
128
+ # Should be wrapped with rescue I guess? Maybe this entire function?
129
+ # Should at least be an option in the Cli to disable this.
130
+ # "stop on first error"
131
+ begin
132
+ yield ReadResult.new(body, encoding)
133
+ rescue ArgumentError => e
134
+ raise if @debug
135
+
136
+ yield ReadResult.error(
137
+ "Parsing error for #{@uri}: #{e.message}"
138
+ )
139
+ rescue StandardError => e
140
+ raise if @debug
141
+
142
+ yield ReadResult.error(
143
+ "Unknown parsing error for #{@uri}: #{e.message}"
144
+ )
145
+ end
93
146
  end
94
147
 
95
148
  req.on_failure do |resp|
96
- msg = 'Unknown Error'
97
- msg = resp.status_message if resp and resp.status_message
98
- yield ReadResult.error("HTTP error #{@uri}: #{msg}")
149
+ if resp&.status_message
150
+ msg = resp.status_message
151
+ yield ReadResult.error(
152
+ "HTTP error when loading #{@uri}: #{msg}",
153
+ resp.response_code
154
+ )
155
+ elsif (msg = resp.options[:return_code])
156
+ yield ReadResult.error(
157
+ "Connection error when loading #{@uri}: #{msg}",
158
+ resp.response_code
159
+ )
160
+ else
161
+ yield ReadResult.error(
162
+ "Unknown error when loading #{@uri}: #{msg}",
163
+ resp.response_code
164
+ )
165
+ end
99
166
  end
100
167
 
101
168
  req
@@ -114,5 +181,17 @@ class SiteDiff
114
181
  hydra.queue(typhoeus_request(&handler))
115
182
  end
116
183
  end
184
+
185
##
# Canonicalize a path.
#
# Every trailing slash is removed, so the result is stable when the
# method is applied again ("/foo//" and "/foo/" both give "/foo"). A path
# that is empty, or consists only of slashes, is the front page: "/".
#
# @param [String] path
#   A base relative path. Example: /foo/bar
# @return [String]
#   The canonical path.
def self.canonicalize(path)
  # Ignore trailing slashes for all paths except "/" (front page).
  trimmed = path.sub(%r{/+\z}, '')
  # If the path is empty, assume that it's the front page.
  trimmed.empty? ? '/' : trimmed
end
117
196
  end
118
197
  end
@@ -0,0 +1,94 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'webrick'
4
+
5
class SiteDiff
  # SiteDiff Web Server.
  #
  # Simple web server for testing purposes: one WEBrick HTTP server per
  # directory, on consecutive ports, each running in its own thread.
  class Webserver
    # Port of the first server when no start port is given.
    DEFAULT_PORT = 13_080

    attr_accessor :ports

    ##
    # Serve a list of directories.
    #
    # start_port:: first port to use; nil means DEFAULT_PORT. Directory
    #              number i is served on start_port + i.
    # dirs::       directories to serve, one server each.
    # opts::       :quiet => true silences WEBrick logging.
    #
    # With a block, yields the running Webserver and kills the servers
    # afterwards - also when the block raises.
    def initialize(start_port, dirs, opts = {})
      start_port ||= DEFAULT_PORT
      @ports = (start_port...(start_port + dirs.size)).to_a
      @dirs = dirs
      @opts = opts

      setup
      start_servers

      return unless block_given?

      begin
        yield self
      ensure
        # Without the ensure, an exception in the block left every server
        # thread running.
        kill
      end
    end

    ##
    # Kills the server threads.
    def kill
      @threads.each(&:kill)
    end

    ##
    # Waits for the server threads to finish.
    def wait
      @threads.each(&:join)
    end

    ##
    # Maps the defined ports to URIs and returns the list of URIs.
    def uris
      ports.map { |p| "http://localhost:#{p}" }
    end

    protected

    # Builds the options shared by every server.
    def setup
      @server_opts = {}
      return unless @opts[:quiet]

      @server_opts[:Logger] = WEBrick::Log.new(IO::NULL)
      @server_opts[:AccessLog] = []
    end

    # Builds one server; kept separate so it can be overridden.
    def server(opts)
      WEBrick::HTTPServer.new(opts)
    end

    # Starts one server thread per directory. Each server gets its own
    # options hash rather than a shared one that is rewritten per loop.
    def start_servers
      @threads = []
      @dirs.zip(@ports).each do |dir, port|
        srv = server(@server_opts.merge(Port: port, DocumentRoot: dir))
        @threads << Thread.new { srv.start }
      end
    end

    public

    # SiteDiff Fixture Server.
    #
    # Serves the fixture directories quietly; the first one is exposed as
    # "before", the last one as "after".
    class FixtureServer < Webserver
      PORT = DEFAULT_PORT + 1
      BASE = 'spec/sites/ruby-doc.org'
      NAMES = %w[core-1.9.3 core-2.0].freeze

      def initialize(port = PORT, base = BASE, names = NAMES)
        dirs = names.map { |n| File.join(base, n) }
        super(port, dirs, quiet: true)
      end

      # URI of the first served directory.
      def before
        uris.first
      end

      # URI of the last served directory.
      def after
        uris.last
      end
    end
  end
end