sitediff 0.0.1 → 1.0.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +5 -5
- data/bin/sitediff +10 -4
- data/lib/sitediff.rb +179 -91
- data/lib/sitediff/cache.rb +106 -0
- data/lib/sitediff/cli.rb +391 -60
- data/lib/sitediff/config.rb +383 -37
- data/lib/sitediff/config/creator.rb +114 -0
- data/lib/sitediff/config/preset.rb +75 -0
- data/lib/sitediff/crawler.rb +131 -0
- data/lib/sitediff/diff.rb +57 -12
- data/lib/sitediff/exception.rb +5 -0
- data/lib/sitediff/fetch.rb +76 -0
- data/lib/sitediff/files/diff.html.erb +20 -2
- data/lib/sitediff/files/jquery.min.js +2 -0
- data/lib/sitediff/files/normalize.css +349 -0
- data/lib/sitediff/files/report.html.erb +144 -0
- data/lib/sitediff/files/sidebyside.html.erb +16 -0
- data/lib/sitediff/files/sitediff.css +236 -29
- data/lib/sitediff/files/sitediff.js +176 -0
- data/lib/sitediff/report.rb +238 -0
- data/lib/sitediff/result.rb +63 -26
- data/lib/sitediff/sanitize.rb +160 -141
- data/lib/sitediff/sanitize/dom_transform.rb +130 -0
- data/lib/sitediff/sanitize/regexp.rb +82 -0
- data/lib/sitediff/uriwrapper.rb +114 -35
- data/lib/sitediff/webserver.rb +94 -0
- data/lib/sitediff/webserver/resultserver.rb +134 -0
- metadata +103 -43
- data/lib/sitediff/files/html_report.html.erb +0 -47
- data/lib/sitediff/util/cache.rb +0 -32
- data/lib/sitediff/util/webserver.rb +0 -77
@@ -0,0 +1,130 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'sitediff/sanitize'
|
4
|
+
require 'nokogiri'
|
5
|
+
|
6
|
+
class SiteDiff
|
7
|
+
class Sanitizer
|
8
|
+
# Currently supported transforms (rule keys are strings):
#
#  * { "type" => "unwrap_root" }
#  * { "type" => "unwrap", "selector" => "div.field-item" }
#  * { "type" => "remove", "selector" => "div.extra-stuff" }
#  * { "type" => "remove_class", "selector" => "div", "class" => "class1" }
#  * { "type" => "strip", "selector" => "h1" }
|
15
|
+
class DomTransform
  # Supported dom_transform types: maps a rule's 'type' string to the
  # DomTransform subclass implementing it. Filled in by DomTransform.register
  # as each subclass body below is evaluated, so it must stay mutable.
  TRANSFORMS = {}

  ##
  # Creates a DOM Transform.
  #
  # @param rule [Hash] the transform rule; this class reads its 'selector'
  #   key, and RemoveClass additionally reads 'class'.
  def initialize(rule)
    @rule = rule
  end

  ##
  # Often an array or scalar are both ok values. Turn either into an array.
  def to_array(val)
    [val].flatten
  end

  ##
  # Yields each node found under +node+ by the rule's 'selector', which may
  # be a single selector or an array of selectors (handled in order).
  #
  # Returns an Enumerator over the same nodes when no block is given,
  # instead of failing with LocalJumpError.
  def targets(node, &block)
    return enum_for(:targets, node) unless block

    to_array(@rule['selector']).each do |sel|
      node.css(sel).each(&block)
    end
  end

  ##
  # Applies the transformation to a DOM node by calling #process (supplied
  # by the subclass) on every target.
  def apply(node)
    targets(node) { |t| process(t) }
  end

  ##
  # Registers a DOM Transform plugin: the receiving class becomes the
  # handler for rules whose 'type' equals +name+.
  def self.register(name)
    TRANSFORMS[name] = self
  end

  ##
  # Creates a DOM Transform as per rule.
  #
  # Raises InvalidSanitization when the rule has no 'type' or when no
  # transform is registered under that type.
  def self.create(rule)
    type = rule['type']
    raise InvalidSanitization, 'DOM transform needs a type' unless type

    transform = TRANSFORMS[type]
    raise InvalidSanitization, "No DOM transform named #{type}" unless transform

    transform.new(rule)
  end

  ##
  # Remove elements matching 'selector'.
  class Remove < DomTransform
    register 'remove'

    ##
    # Processes a node.
    def process(node)
      node.remove
    end
  end

  ##
  # Strip leading and trailing whitespace from the content of a tag
  # matching 'selector'.
  class Strip < DomTransform
    register 'strip'

    ##
    # Processes a node: its content is replaced by the stripped content.
    def process(node)
      node.content = node.content.strip
    end
  end

  ##
  # Unwrap elements matching 'selector'.
  class Unwrap < DomTransform
    register 'unwrap'

    ##
    # Processes a node: its children are re-inserted right after it, then
    # the node itself is removed.
    def process(node)
      node.add_next_sibling(node.children)
      node.remove
    end
  end

  ##
  # Remove classes from elements matching selector
  class RemoveClass < DomTransform
    register 'remove_class'

    ##
    # Processes a node: removes every class named by the rule's 'class'
    # (scalar or array) from it.
    def process(node)
      classes = to_array(@rule['class'])

      # Must call remove_class on a NodeSet!
      ns = Nokogiri::XML::NodeSet.new(node.document, [node])
      classes.each do |class_name|
        ns.remove_class(class_name)
      end
    end
  end

  ##
  # Unwrap the root element.
  class UnwrapRoot < DomTransform
    register 'unwrap_root'

    ##
    # Applies the transformation to a DOM node: the single child of +node+
    # is replaced by that child's own children. No selector is involved.
    #
    # Raises InvalidSanitization unless +node+ has exactly one child.
    def apply(node)
      roots = node.children
      raise InvalidSanitization, 'No root element in unwrap_root' if roots.empty?
      unless roots.size == 1
        raise InvalidSanitization, 'Multiple root elements in unwrap_root'
      end

      node.children = roots[0].children
    end
  end
end
|
129
|
+
end
|
130
|
+
end
|
@@ -0,0 +1,82 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
class SiteDiff
  class Sanitizer
    # Regular Expression Object.
    #
    # Wraps a sanitization rule whose 'pattern' is substituted by its
    # 'substitute' (empty string when absent). Rules that also carry a
    # 'selector' are handled by the WithSelector subclass.
    class Regexp
      ##
      # Creates a RegExp object.
      #
      # @param rule [Hash] rule with string keys 'pattern', and optionally
      #   'substitute' and 'selector'.
      def initialize(rule)
        @rule = rule
      end

      ##
      # Whether the RegExp has a selector.
      def selector?
        false
      end

      ##
      # Whether the RegExp applies to the given markup, i.e. whether the
      # substitution would change +html+. +html+ itself is not modified.
      def applies?(html, _node)
        applies_to_string?(html)
      end

      ##
      # Applies the RegExp to the markup. +html+ is modified in place and
      # returned.
      def apply(html)
        gsub!(html)
      end

      ##
      # Creates a RegExp object as per rule: a WithSelector when the rule has
      # a 'selector', a plain Regexp otherwise.
      def self.create(rule)
        rule['selector'] ? WithSelector.new(rule) : new(rule)
      end

      ##
      # A RegExp with selector.
      class WithSelector < Regexp
        ##
        # Whether the RegExp has a selector.
        def selector?
          true
        end

        ##
        # Yields each element under +node+ matched by the rule's 'selector'.
        # Returns an Enumerator over those elements when no block is given.
        def contexts(node, &block)
          return enum_for(:contexts, node) unless block

          node.css(@rule['selector']).each(&block)
        end

        ##
        # Whether the RegExp applies to the given markup: true when the
        # substitution would change the HTML of at least one context.
        def applies?(_html, node)
          contexts(node).any? { |e| applies_to_string?(e.to_html) }
        end

        ##
        # Applies the RegExp to the markup: every context is replaced by its
        # own HTML after substitution.
        def apply(node)
          contexts(node) { |e| e.replace(gsub!(e.to_html)) }
        end
      end

      protected

      # Compiled form of the rule's 'pattern'. Built on first use and reused,
      # so a selector rule does not recompile it for every matched element.
      def pattern
        @pattern ||= ::Regexp.new(@rule['pattern'])
      end

      # Substitutes every match of the pattern in +str+ and returns +str+.
      def gsub!(str)
        sub = @rule['substitute'] || ''
        # Expecting a mutation here. Do not reassign the variable str
        # for the purpose of removing UTF-8 encoding errors.
        str.gsub!(pattern, sub)
        str
      end

      # True when substituting would change +str+; works on a copy.
      def applies_to_string?(str)
        gsub!(str.dup) != str
      end
    end
  end
end
|
data/lib/sitediff/uriwrapper.rb
CHANGED
@@ -1,55 +1,97 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'sitediff/exception'
|
1
4
|
require 'typhoeus'
|
5
|
+
require 'addressable/uri'
|
2
6
|
|
3
7
|
class SiteDiff
|
4
|
-
class SiteDiffReadFailure <
|
8
|
+
class SiteDiffReadFailure < SiteDiffException; end
|
5
9
|
|
10
|
+
# SiteDiff URI Wrapper.
|
6
11
|
class UriWrapper
|
12
|
+
# TODO: Move these CURL OPTS to Config.DEFAULT_CONFIG.
|
13
|
+
DEFAULT_CURL_OPTS = {
|
14
|
+
# Don't hang on servers that don't exist.
|
15
|
+
connecttimeout: 3,
|
16
|
+
# Follow HTTP redirects (code 301 and 302).
|
17
|
+
followlocation: true,
|
18
|
+
headers: {
|
19
|
+
'User-Agent' => 'Sitediff - https://github.com/evolvingweb/sitediff'
|
20
|
+
}
|
21
|
+
}.freeze
|
22
|
+
|
7
23
|
# This lets us treat errors or content as one object
|
8
|
-
class ReadResult
|
9
|
-
|
10
|
-
|
24
|
+
class ReadResult
  # content/encoding describe a successful read; error/error_code describe
  # a failed one. All four are plain read/write attributes.
  attr_accessor :encoding, :content, :error_code, :error

  ##
  # Builds a result holding +content+ and its +encoding+, with no error set.
  def initialize(content = nil, encoding = 'utf-8')
    @content, @encoding = content, encoding
    @error = @error_code = nil
  end

  ##
  # Builds a content-less result carrying an error +message+ and an
  # optional error +code+.
  def self.error(message, code = nil)
    new.tap do |result|
      result.error = message
      result.error_code = code
    end
  end
end
|
14
45
|
|
15
|
-
|
16
|
-
|
46
|
+
##
|
47
|
+
# Creates a UriWrapper.
|
48
|
+
def initialize(uri, curl_opts = DEFAULT_CURL_OPTS, debug = true)
  # Accept either an already-parsed URI object or anything parseable.
  @uri = uri.respond_to?(:scheme) ? uri : Addressable::URI.parse(uri)
  # Remove trailing '/'s from local URIs. The new path is assigned through
  # the URI's setter rather than by mutating the string returned by #path,
  # and \z anchors the match to the very end of the path.
  @uri.path = @uri.path.sub(%r{/+\z}, '') if local?
  @curl_opts = curl_opts
  # When set, parsing errors in #typhoeus_request are re-raised instead of
  # being turned into error results.
  @debug = debug
end
|
20
55
|
|
56
|
+
##
|
57
|
+
# Returns the "user" part of the URI.
|
21
58
|
def user
|
22
59
|
@uri.user
|
23
60
|
end
|
24
61
|
|
62
|
+
##
|
63
|
+
# Returns the "password" part of the URI.
|
25
64
|
def password
|
26
65
|
@uri.password
|
27
66
|
end
|
28
67
|
|
68
|
+
##
|
69
|
+
# Converts the URI to a string.
|
29
70
|
def to_s
|
30
71
|
uri = @uri.dup
|
31
72
|
uri.user = nil
|
32
73
|
uri.password = nil
|
33
|
-
|
74
|
+
uri.to_s
|
34
75
|
end
|
35
76
|
|
77
|
+
##
|
36
78
|
# Is this a local filesystem path?
|
37
79
|
def local?
|
38
|
-
@uri.scheme
|
80
|
+
@uri.scheme.nil?
|
39
81
|
end
|
40
82
|
|
41
|
-
|
42
|
-
|
83
|
+
## What does this one do?
|
84
|
+
# FIXME: this is not used anymore
|
85
|
+
def +(other)
  # 'path' for SiteDiff includes (parts of) path, query, and fragment.
  # A '/' is inserted only for local URIs or URIs whose path is empty.
  needs_slash = local? || @uri.path.empty?
  joined = @uri.to_s + (needs_slash ? '/' : '') + other
  self.class.new(joined)
end
|
50
91
|
|
92
|
+
##
|
51
93
|
# Reads a file and yields to the completion handler, see .queue()
|
52
|
-
def read_file
|
94
|
+
def read_file
|
53
95
|
File.open(@uri.to_s, 'r:UTF-8') { |f| yield ReadResult.new(f.read) }
|
54
96
|
rescue Errno::ENOENT, Errno::ENOTDIR, Errno::EACCES, Errno::EISDIR => e
|
55
97
|
yield ReadResult.error(e.message)
|
@@ -57,10 +99,10 @@ class SiteDiff
|
|
57
99
|
|
58
100
|
# Returns the encoding of an HTTP response from headers , nil if not
|
59
101
|
# specified.
|
60
|
-
def
|
61
|
-
if content_type = http_headers['Content-Type']
|
62
|
-
if md = /;\s*charset=([-\w]*)/.match(content_type)
|
63
|
-
|
102
|
+
def charset_encoding(http_headers)
  content_type = http_headers && http_headers['Content-Type']
  return unless content_type

  # The parameter name is matched case-insensitively and the value may be
  # quoted. At least one character is required so that an empty charset
  # yields nil rather than "" (which is truthy and not a valid encoding
  # name for String#force_encoding).
  md = /;\s*charset="?([-\w]+)/i.match(content_type)
  md && md[1]
end
|
@@ -69,33 +111,58 @@ class SiteDiff
|
|
69
111
|
#
|
70
112
|
# Completion callbacks of the request wrap the given handler which is
|
71
113
|
# assumed to accept a single ReadResult argument.
|
72
|
-
def typhoeus_request
|
73
|
-
params =
|
74
|
-
:connecttimeout => 3, # Don't hang on servers that don't exist
|
75
|
-
:followlocation => true, # Follow HTTP redirects (code 301 and 302)
|
76
|
-
:headers => {
|
77
|
-
"User-Agent" => "Sitediff - https://github.com/evolvingweb/sitediff"
|
78
|
-
}
|
79
|
-
}
|
114
|
+
def typhoeus_request
|
115
|
+
params = @curl_opts.dup
|
80
116
|
# Allow basic auth
|
81
117
|
params[:userpwd] = @uri.user + ':' + @uri.password if @uri.user
|
82
118
|
|
83
|
-
req = Typhoeus::Request.new(
|
119
|
+
req = Typhoeus::Request.new(to_s, params)
|
84
120
|
|
85
121
|
req.on_success do |resp|
|
86
122
|
body = resp.body
|
87
123
|
# Typhoeus does not respect HTTP headers when setting the encoding
|
88
124
|
# resp.body; coerce if possible.
|
89
|
-
if encoding =
|
125
|
+
if (encoding = charset_encoding(resp.headers))
|
90
126
|
body.force_encoding(encoding)
|
91
127
|
end
|
92
|
-
|
128
|
+
# Should be wrapped with rescue I guess? Maybe this entire function?
|
129
|
+
# Should at least be an option in the Cli to disable this.
|
130
|
+
# "stop on first error"
|
131
|
+
begin
|
132
|
+
yield ReadResult.new(body, encoding)
|
133
|
+
rescue ArgumentError => e
|
134
|
+
raise if @debug
|
135
|
+
|
136
|
+
yield ReadResult.error(
|
137
|
+
"Parsing error for #{@uri}: #{e.message}"
|
138
|
+
)
|
139
|
+
rescue StandardError => e
|
140
|
+
raise if @debug
|
141
|
+
|
142
|
+
yield ReadResult.error(
|
143
|
+
"Unknown parsing error for #{@uri}: #{e.message}"
|
144
|
+
)
|
145
|
+
end
|
93
146
|
end
|
94
147
|
|
95
148
|
req.on_failure do |resp|
|
96
|
-
|
97
|
-
|
98
|
-
|
149
|
+
if resp&.status_message
|
150
|
+
msg = resp.status_message
|
151
|
+
yield ReadResult.error(
|
152
|
+
"HTTP error when loading #{@uri}: #{msg}",
|
153
|
+
resp.response_code
|
154
|
+
)
|
155
|
+
elsif (msg = resp.options[:return_code])
|
156
|
+
yield ReadResult.error(
|
157
|
+
"Connection error when loading #{@uri}: #{msg}",
|
158
|
+
resp.response_code
|
159
|
+
)
|
160
|
+
else
|
161
|
+
yield ReadResult.error(
|
162
|
+
"Unknown error when loading #{@uri}: #{msg}",
|
163
|
+
resp.response_code
|
164
|
+
)
|
165
|
+
end
|
99
166
|
end
|
100
167
|
|
101
168
|
req
|
@@ -114,5 +181,17 @@ class SiteDiff
|
|
114
181
|
hydra.queue(typhoeus_request(&handler))
|
115
182
|
end
|
116
183
|
end
|
184
|
+
|
185
|
+
##
|
186
|
+
# Canonicalize a path.
|
187
|
+
#
|
188
|
+
# @param [String] path
|
189
|
+
# A base relative path. Example: /foo/bar
|
190
|
+
def self.canonicalize(path)
  # Ignore trailing slashes, however many there are, so that the result is
  # stable when canonicalized again.
  trimmed = path.sub(%r{/+\z}, '')
  # If nothing is left ("" or only slashes), assume that it's the front page.
  trimmed.empty? ? '/' : trimmed
end
|
117
196
|
end
|
118
197
|
end
|
@@ -0,0 +1,94 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'webrick'
|
4
|
+
|
5
|
+
class SiteDiff
|
6
|
+
# SiteDiff Web Server.
|
7
|
+
class Webserver
  # Simple web server for testing purposes.
  DEFAULT_PORT = 13_080

  attr_accessor :ports

  ##
  # Serve a list of directories, one server per directory, on consecutive
  # ports starting at +start_port+ (DEFAULT_PORT when nil).
  #
  # When a block is given it is called with this object and the servers are
  # killed afterwards, even if the block raises.
  #
  # @param start_port [Integer, nil] first port to use.
  # @param dirs [Array<String>] directories to serve.
  # @param opts [Hash] options; :quiet silences the servers' logging.
  def initialize(start_port, dirs, opts = {})
    start_port ||= DEFAULT_PORT
    @ports = (start_port...(start_port + dirs.size)).to_a
    @dirs = dirs
    @opts = opts

    setup
    start_servers

    return unless block_given?

    begin
      yield self
    ensure
      # Do not leave server threads running when the block fails.
      kill
    end
  end

  ##
  # Kills the server threads.
  def kill
    @threads.each(&:kill)
  end

  ##
  # Waits for the server threads to finish.
  def wait
    @threads.each(&:join)
  end

  ##
  # Maps URIs to defined ports and returns a list of URIs.
  def uris
    ports.map { |p| "http://localhost:#{p}" }
  end

  protected

  # Prepares the options shared by all servers; with :quiet, logging goes
  # to the null device and the access log is disabled.
  def setup
    @server_opts = {}
    if @opts[:quiet]
      @server_opts[:Logger] = WEBrick::Log.new(IO::NULL)
      @server_opts[:AccessLog] = []
    end
  end

  # Builds one HTTP server from +opts+.
  def server(opts)
    WEBrick::HTTPServer.new(opts)
  end

  # Creates a server per directory and starts each one in its own thread.
  def start_servers
    @threads = []
    @dirs.each_with_index do |dir, idx|
      @server_opts[:Port] = @ports[idx]
      @server_opts[:DocumentRoot] = dir
      srv = server(@server_opts)
      @threads << Thread.new { srv.start }
    end
  end

  public

  # SiteDiff Fixture Server.
  #
  # Serves the directories named NAMES under BASE, quietly, starting at PORT.
  class FixtureServer < Webserver
    PORT = DEFAULT_PORT + 1
    BASE = 'spec/sites/ruby-doc.org'
    NAMES = %w[core-1.9.3 core-2.0].freeze

    def initialize(port = PORT, base = BASE, names = NAMES)
      dirs = names.map { |n| File.join(base, n) }
      super(port, dirs, quiet: true)
    end

    # URI of the first served directory.
    def before
      uris.first
    end

    # URI of the last served directory.
    def after
      uris.last
    end
  end
end
|
94
|
+
end
|