sitediff 0.0.1 → 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +5 -5
- data/bin/sitediff +10 -4
- data/lib/sitediff.rb +179 -91
- data/lib/sitediff/cache.rb +106 -0
- data/lib/sitediff/cli.rb +391 -60
- data/lib/sitediff/config.rb +383 -37
- data/lib/sitediff/config/creator.rb +114 -0
- data/lib/sitediff/config/preset.rb +75 -0
- data/lib/sitediff/crawler.rb +131 -0
- data/lib/sitediff/diff.rb +57 -12
- data/lib/sitediff/exception.rb +5 -0
- data/lib/sitediff/fetch.rb +76 -0
- data/lib/sitediff/files/diff.html.erb +20 -2
- data/lib/sitediff/files/jquery.min.js +2 -0
- data/lib/sitediff/files/normalize.css +349 -0
- data/lib/sitediff/files/report.html.erb +144 -0
- data/lib/sitediff/files/sidebyside.html.erb +16 -0
- data/lib/sitediff/files/sitediff.css +236 -29
- data/lib/sitediff/files/sitediff.js +176 -0
- data/lib/sitediff/report.rb +238 -0
- data/lib/sitediff/result.rb +63 -26
- data/lib/sitediff/sanitize.rb +160 -141
- data/lib/sitediff/sanitize/dom_transform.rb +130 -0
- data/lib/sitediff/sanitize/regexp.rb +82 -0
- data/lib/sitediff/uriwrapper.rb +114 -35
- data/lib/sitediff/webserver.rb +94 -0
- data/lib/sitediff/webserver/resultserver.rb +134 -0
- metadata +103 -43
- data/lib/sitediff/files/html_report.html.erb +0 -47
- data/lib/sitediff/util/cache.rb +0 -32
- data/lib/sitediff/util/webserver.rb +0 -77
@@ -0,0 +1,130 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'sitediff/sanitize'
|
4
|
+
require 'nokogiri'
|
5
|
+
|
6
|
+
class SiteDiff
  class Sanitizer
    # Base class and registry for DOM transforms.
    #
    # A transform is built from a rule hash (string keys). Every transform
    # except "unwrap_root" needs a 'selector', which may be a single CSS
    # selector or an array of them.
    #
    # Currently supported transforms:
    #
    #  * { 'type' => 'unwrap_root' }
    #  * { 'type' => 'unwrap', 'selector' => 'div.field-item' }
    #  * { 'type' => 'remove', 'selector' => 'div.extra-stuff' }
    #  * { 'type' => 'remove_class', 'selector' => 'div', 'class' => 'class1' }
    #  * { 'type' => 'strip', 'selector' => 'h1' }
    class DomTransform
      # Supported dom_transform types: rule type name => implementing class.
      # Filled in by the subclasses through DomTransform.register.
      TRANSFORMS = {}

      ##
      # Creates a DOM Transform.
      #
      # @param rule [Hash] the rule; read through the string keys 'type',
      #   'selector' and (for remove_class) 'class'.
      def initialize(rule)
        @rule = rule
      end

      ##
      # Often an array or scalar are both ok values. Turn either into an array.
      def to_array(val)
        [val].flatten
      end

      ##
      # Yields every node below +node+ that matches one of the rule's
      # selectors, one selector after the other.
      #
      # @raise [InvalidSanitization] when the rule has no selector.
      def targets(node)
        selectors = to_array(@rule['selector']).compact
        if selectors.empty?
          # Fail with a readable message instead of handing nil to node.css.
          raise InvalidSanitization,
                "DOM transform #{@rule['type']} needs a selector"
        end

        selectors.each do |sel|
          node.css(sel).each { |n| yield n }
        end
      end

      ##
      # Applies the transformation to a DOM node, i.e. calls #process on
      # every target of the rule.
      def apply(node)
        targets(node) { |t| process(t) }
      end

      ##
      # Processes a single matched node. Subclasses relying on #apply must
      # override this.
      def process(_node)
        raise NotImplementedError, "#{self.class} must implement #process"
      end

      ##
      # Registers a DOM Transform plugin under the given rule type name.
      def self.register(name)
        TRANSFORMS[name] = self
      end

      ##
      # Creates a DOM Transform as per rule.
      #
      # @raise [InvalidSanitization] when the rule has no 'type' or the type
      #   is not registered.
      def self.create(rule)
        type = rule['type']
        raise InvalidSanitization, 'DOM transform needs a type' unless type

        transform = TRANSFORMS[type]
        unless transform
          raise InvalidSanitization, "No DOM transform named #{type}"
        end

        transform.new(rule)
      end

      ##
      # Remove elements matching 'selector'.
      class Remove < DomTransform
        register 'remove'

        ##
        # Processes a node.
        def process(node)
          node.remove
        end
      end

      ##
      # Strip leading and trailing whitespace from the content of a tag
      # matching 'selector'.
      class Strip < DomTransform
        register 'strip'

        ##
        # Processes a node.
        def process(node)
          node.content = node.content.strip
        end
      end

      ##
      # Unwrap elements matching 'selector'.
      class Unwrap < DomTransform
        register 'unwrap'

        ##
        # Processes a node: its children take its place in the document.
        def process(node)
          node.add_next_sibling(node.children)
          node.remove
        end
      end

      ##
      # Remove classes from elements matching selector
      class RemoveClass < DomTransform
        register 'remove_class'

        ##
        # Processes a node.
        def process(node)
          classes = to_array(@rule['class'])

          # Must call remove_class on a NodeSet!
          ns = Nokogiri::XML::NodeSet.new(node.document, [node])
          classes.each do |class_name|
            ns.remove_class(class_name)
          end
        end
      end

      ##
      # Unwrap the root element.
      class UnwrapRoot < DomTransform
        register 'unwrap_root'

        ##
        # Applies the transformation to a DOM node: the children of its only
        # child become its own children.
        #
        # @raise [InvalidSanitization] unless the node has exactly one child.
        def apply(node)
          count = node.children.size
          if count.zero?
            raise InvalidSanitization, 'No root element in unwrap_root'
          end
          if count > 1
            raise InvalidSanitization, 'Multiple root elements in unwrap_root'
          end

          node.children = node.children[0].children
        end
      end
    end
  end
end
|
@@ -0,0 +1,82 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
class SiteDiff
  class Sanitizer
    # A regular-expression sanitization rule.
    #
    # The rule hash is read through the string keys 'pattern' (source of the
    # regular expression), 'substitute' (replacement, defaults to '') and
    # 'selector' (optional; see WithSelector).
    class Regexp
      ##
      # Creates a RegExp object.
      def initialize(rule)
        @rule = rule
      end

      ##
      # Whether the RegExp has a selector.
      def selector?
        false
      end

      ##
      # Whether the RegExp applies to the given markup, i.e. whether running
      # the substitution would change +html+. The node is ignored here.
      def applies?(html, _node)
        applies_to_string?(html)
      end

      ##
      # Applies the RegExp to the markup. The string is modified in place
      # and also returned.
      def apply(html)
        gsub!(html)
      end

      ##
      # Creates a RegExp object as per rule: a WithSelector when the rule has
      # a 'selector', a plain Regexp otherwise.
      def self.create(rule)
        rule['selector'] ? WithSelector.new(rule) : new(rule)
      end

      ##
      # A RegExp with selector.
      class WithSelector < Regexp
        ##
        # Whether the RegExp has a selector.
        def selector?
          true
        end

        ##
        # Yields each element below +node+ that matches the rule's selector.
        # These elements are the only places the substitution is applied to.
        def contexts(node)
          selectors = @rule['selector']
          node.css(selectors).each { |e| yield(e) }
        end

        ##
        # Whether the RegExp applies to the given markup, i.e. whether the
        # substitution would change the HTML of any selected element.
        def applies?(_html, node)
          enum_for(:contexts, node).any? { |e| applies_to_string?(e.to_html) }
        end

        ##
        # Applies the RegExp to the markup: every selected element is
        # replaced by its substituted HTML.
        def apply(node)
          contexts(node) { |e| e.replace(gsub!(e.to_html)) }
        end
      end

      protected

      # The compiled pattern. Built once per rule instead of once per
      # string, since the same rule runs against many strings and elements.
      def regexp
        @regexp ||= ::Regexp.new(@rule['pattern'])
      end

      def gsub!(str)
        sub = @rule['substitute'] || ''
        # Expecting a mutation here. Do not reassign the variable str
        # for the purpose of removing UTF-8 encoding errors.
        str.gsub!(regexp, sub)
        str
      end

      def applies_to_string?(str)
        gsub!(str.dup) != str
      end
    end
  end
end
|
data/lib/sitediff/uriwrapper.rb
CHANGED
@@ -1,55 +1,97 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'sitediff/exception'
|
1
4
|
require 'typhoeus'
|
5
|
+
require 'addressable/uri'
|
2
6
|
|
3
7
|
class SiteDiff
|
4
|
-
class SiteDiffReadFailure <
|
8
|
+
class SiteDiffReadFailure < SiteDiffException; end
|
5
9
|
|
10
|
+
# SiteDiff URI Wrapper.
|
6
11
|
class UriWrapper
|
12
|
+
# TODO: Move these CURL OPTS to Config.DEFAULT_CONFIG.
|
13
|
+
DEFAULT_CURL_OPTS = {
|
14
|
+
# Don't hang on servers that don't exist.
|
15
|
+
connecttimeout: 3,
|
16
|
+
# Follow HTTP redirects (code 301 and 302).
|
17
|
+
followlocation: true,
|
18
|
+
headers: {
|
19
|
+
'User-Agent' => 'Sitediff - https://github.com/evolvingweb/sitediff'
|
20
|
+
}
|
21
|
+
}.freeze
|
22
|
+
|
7
23
|
# This lets us treat errors or content as one object
|
8
|
-
class ReadResult
|
9
|
-
|
10
|
-
|
24
|
+
class ReadResult
  attr_accessor :encoding, :content, :error_code, :error

  ##
  # Creates a ReadResult holding content; both error fields start out nil.
  def initialize(content = nil, encoding = 'utf-8')
    @content = content
    @encoding = encoding
    @error = @error_code = nil
  end

  ##
  # Creates a ReadResult with an error: no content, the given message and
  # an optional error code.
  def self.error(message, code = nil)
    new.tap do |result|
      result.error_code = code
      result.error = message
    end
  end
end
|
14
45
|
|
15
|
-
|
16
|
-
|
46
|
+
##
|
47
|
+
# Creates a UriWrapper.
|
48
|
+
def initialize(uri, curl_opts = DEFAULT_CURL_OPTS, debug = true)
  # Anything that already answers to #scheme is used as-is; everything
  # else is parsed.
  @uri = if uri.respond_to?(:scheme)
           uri
         else
           Addressable::URI.parse(uri)
         end
  @curl_opts = curl_opts
  @debug = debug
  # remove trailing '/'s from local URIs
  @uri.path.gsub!(%r{/*$}, '') if local?
end
|
20
55
|
|
56
|
+
##
|
57
|
+
# Returns the "user" part of the URI.
|
21
58
|
def user
  # Delegates to the wrapped URI object.
  wrapped = @uri
  wrapped.user
end
|
24
61
|
|
62
|
+
##
|
63
|
+
# Returns the "password" part of the URI.
|
25
64
|
def password
  # Delegates to the wrapped URI object.
  wrapped = @uri
  wrapped.password
end
|
28
67
|
|
68
|
+
##
|
69
|
+
# Converts the URI to a string.
|
29
70
|
def to_s
|
30
71
|
uri = @uri.dup
|
31
72
|
uri.user = nil
|
32
73
|
uri.password = nil
|
33
|
-
|
74
|
+
uri.to_s
|
34
75
|
end
|
35
76
|
|
77
|
+
##
|
36
78
|
# Is this a local filesystem path?
|
37
79
|
def local?
  # A URI without a scheme is treated as a filesystem path.
  scheme = @uri.scheme
  scheme.nil?
end
|
40
82
|
|
41
|
-
|
42
|
-
|
83
|
+
## Appends a SiteDiff path to this URI and returns a new UriWrapper.
|
84
|
+
# FIXME: this is not used anymore
|
85
|
+
def +(other)
  # 'path' for SiteDiff includes (parts of) path, query, and fragment.
  # Local paths had their trailing slashes removed in the constructor and
  # an empty path has none to begin with, so a separator is needed then.
  sep = (local? || @uri.path.empty?) ? '/' : ''
  # Hand the curl options and debug flag on, so the new wrapper behaves
  # like this one instead of silently falling back to the defaults.
  self.class.new(@uri.to_s + sep + other, @curl_opts, @debug)
end
|
50
91
|
|
92
|
+
##
|
51
93
|
# Reads a file and yields to the completion handler, see .queue()
|
52
|
-
def read_file
|
94
|
+
def read_file
|
53
95
|
File.open(@uri.to_s, 'r:UTF-8') { |f| yield ReadResult.new(f.read) }
|
54
96
|
rescue Errno::ENOENT, Errno::ENOTDIR, Errno::EACCES, Errno::EISDIR => e
|
55
97
|
yield ReadResult.error(e.message)
|
@@ -57,10 +99,10 @@ class SiteDiff
|
|
57
99
|
|
58
100
|
# Returns the encoding of an HTTP response from headers , nil if not
|
59
101
|
# specified.
|
60
|
-
def
|
61
|
-
if content_type = http_headers['Content-Type']
|
62
|
-
if md = /;\s*charset=([-\w]*)/.match(content_type)
|
63
|
-
|
102
|
+
def charset_encoding(http_headers)
  content_type = http_headers['Content-Type']
  return unless content_type

  # The parameter name is matched case-insensitively and the value may be
  # wrapped in double quotes (charset="utf-8"). At least one character is
  # required, so an empty value gives nil rather than '' (an empty name is
  # not a valid encoding for the caller to force onto the body).
  content_type[/;\s*charset="?([-\w]+)/i, 1]
end
|
@@ -69,33 +111,58 @@ class SiteDiff
|
|
69
111
|
#
|
70
112
|
# Completion callbacks of the request wrap the given handler which is
|
71
113
|
# assumed to accept a single ReadResult argument.
|
72
|
-
def typhoeus_request
|
73
|
-
params =
|
74
|
-
:connecttimeout => 3, # Don't hang on servers that don't exist
|
75
|
-
:followlocation => true, # Follow HTTP redirects (code 301 and 302)
|
76
|
-
:headers => {
|
77
|
-
"User-Agent" => "Sitediff - https://github.com/evolvingweb/sitediff"
|
78
|
-
}
|
79
|
-
}
|
114
|
+
def typhoeus_request
|
115
|
+
params = @curl_opts.dup
|
80
116
|
# Allow basic auth
|
81
117
|
params[:userpwd] = @uri.user + ':' + @uri.password if @uri.user
|
82
118
|
|
83
|
-
req = Typhoeus::Request.new(
|
119
|
+
req = Typhoeus::Request.new(to_s, params)
|
84
120
|
|
85
121
|
req.on_success do |resp|
|
86
122
|
body = resp.body
|
87
123
|
# Typhoeus does not respect HTTP headers when setting the encoding
|
88
124
|
# resp.body; coerce if possible.
|
89
|
-
if encoding =
|
125
|
+
if (encoding = charset_encoding(resp.headers))
|
90
126
|
body.force_encoding(encoding)
|
91
127
|
end
|
92
|
-
|
128
|
+
# Should be wrapped with rescue I guess? Maybe this entire function?
|
129
|
+
# Should at least be an option in the Cli to disable this.
|
130
|
+
# "stop on first error"
|
131
|
+
begin
|
132
|
+
yield ReadResult.new(body, encoding)
|
133
|
+
rescue ArgumentError => e
|
134
|
+
raise if @debug
|
135
|
+
|
136
|
+
yield ReadResult.error(
|
137
|
+
"Parsing error for #{@uri}: #{e.message}"
|
138
|
+
)
|
139
|
+
rescue StandardError => e
|
140
|
+
raise if @debug
|
141
|
+
|
142
|
+
yield ReadResult.error(
|
143
|
+
"Unknown parsing error for #{@uri}: #{e.message}"
|
144
|
+
)
|
145
|
+
end
|
93
146
|
end
|
94
147
|
|
95
148
|
req.on_failure do |resp|
|
96
|
-
|
97
|
-
|
98
|
-
|
149
|
+
if resp&.status_message
|
150
|
+
msg = resp.status_message
|
151
|
+
yield ReadResult.error(
|
152
|
+
"HTTP error when loading #{@uri}: #{msg}",
|
153
|
+
resp.response_code
|
154
|
+
)
|
155
|
+
elsif (msg = resp.options[:return_code])
|
156
|
+
yield ReadResult.error(
|
157
|
+
"Connection error when loading #{@uri}: #{msg}",
|
158
|
+
resp.response_code
|
159
|
+
)
|
160
|
+
else
|
161
|
+
yield ReadResult.error(
|
162
|
+
"Unknown error when loading #{@uri}: #{msg}",
|
163
|
+
resp.response_code
|
164
|
+
)
|
165
|
+
end
|
99
166
|
end
|
100
167
|
|
101
168
|
req
|
@@ -114,5 +181,17 @@ class SiteDiff
|
|
114
181
|
hydra.queue(typhoeus_request(&handler))
|
115
182
|
end
|
116
183
|
end
|
184
|
+
|
185
|
+
##
|
186
|
+
# Canonicalize a path.
|
187
|
+
#
|
188
|
+
# @param [String] path
|
189
|
+
# A base relative path. Example: /foo/bar
|
190
|
+
def self.canonicalize(path)
  # The front page is already canonical.
  return '/' if path == '/'

  # Drop one trailing slash; whatever ends up empty is the front page too.
  trimmed = path.chomp('/')
  trimmed.empty? ? '/' : trimmed
end
|
117
196
|
end
|
118
197
|
end
|
@@ -0,0 +1,94 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'webrick'
|
4
|
+
|
5
|
+
class SiteDiff
  # SiteDiff Web Server.
  #
  # Simple web server for testing purposes: serves each given directory on
  # its own port, every server running in its own thread.
  class Webserver
    # First port used when no start port is given.
    DEFAULT_PORT = 13_080

    # The ports in use, one per served directory and in the same order.
    attr_accessor :ports

    ##
    # Serve a list of directories.
    #
    # Directory number i is served on start_port + i. When a block is given
    # it is called with the webserver, and the servers are killed once the
    # block is done, also when it raises.
    #
    # @param start_port [Integer, nil] first port; nil means DEFAULT_PORT.
    # @param dirs [Array<String>] document roots to serve.
    # @param opts [Hash] options; :quiet turns off WEBrick logging.
    def initialize(start_port, dirs, opts = {})
      start_port ||= DEFAULT_PORT
      @ports = (start_port...(start_port + dirs.size)).to_a
      @dirs = dirs
      @opts = opts

      setup
      start_servers

      return unless block_given?

      begin
        yield self
      ensure
        # Do not leave server threads behind when the block raises.
        kill
      end
    end

    ##
    # Kills the server threads.
    def kill
      @threads.each(&:kill)
    end

    ##
    # Waits for the server threads to finish.
    def wait
      @threads.each(&:join)
    end

    ##
    # Maps URIs to defined ports and returns a list of URIs.
    def uris
      ports.map { |p| "http://localhost:#{p}" }
    end

    protected

    # Prepares the options shared by all servers.
    def setup
      @server_opts = {}
      return unless @opts[:quiet]

      @server_opts[:Logger] = WEBrick::Log.new(IO::NULL)
      @server_opts[:AccessLog] = []
    end

    # Builds a single server from its options.
    def server(opts)
      WEBrick::HTTPServer.new(opts)
    end

    # Starts one server per directory, each in a thread of its own.
    def start_servers
      @threads = []
      @dirs.each_with_index do |dir, idx|
        # Every server gets its own options hash; the shared defaults stay
        # untouched.
        srv = server(@server_opts.merge(Port: @ports[idx], DocumentRoot: dir))
        @threads << Thread.new { srv.start }
      end
    end

    public

    # SiteDiff Fixture Server: serves the fixture sites below BASE.
    class FixtureServer < Webserver
      PORT = DEFAULT_PORT + 1
      BASE = 'spec/sites/ruby-doc.org'
      NAMES = %w[core-1.9.3 core-2.0].freeze

      def initialize(port = PORT, base = BASE, names = NAMES)
        dirs = names.map { |n| File.join(base, n) }
        super(port, dirs, quiet: true)
      end

      # URI of the first served site.
      def before
        uris.first
      end

      # URI of the last served site.
      def after
        uris.last
      end
    end
  end
end
|