sitediff 0.0.2 → 1.1.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/bin/sitediff +9 -3
- data/lib/sitediff.rb +153 -79
- data/lib/sitediff/api.rb +265 -0
- data/lib/sitediff/cache.rb +110 -47
- data/lib/sitediff/cli.rb +219 -165
- data/lib/sitediff/config.rb +439 -58
- data/lib/sitediff/config/creator.rb +93 -99
- data/lib/sitediff/config/preset.rb +75 -0
- data/lib/sitediff/crawler.rb +108 -72
- data/lib/sitediff/diff.rb +60 -12
- data/lib/sitediff/exception.rb +3 -1
- data/lib/sitediff/fetch.rb +62 -41
- data/lib/sitediff/files/diff.html.erb +20 -2
- data/lib/sitediff/files/jquery.min.js +2 -0
- data/lib/sitediff/files/normalize.css +349 -0
- data/lib/sitediff/files/report.html.erb +171 -0
- data/lib/sitediff/files/sidebyside.html.erb +5 -2
- data/lib/sitediff/files/sitediff.css +303 -30
- data/lib/sitediff/files/sitediff.js +367 -0
- data/lib/sitediff/report.rb +254 -0
- data/lib/sitediff/result.rb +59 -23
- data/lib/sitediff/sanitize.rb +222 -150
- data/lib/sitediff/sanitize/dom_transform.rb +111 -73
- data/lib/sitediff/sanitize/regexp.rb +69 -43
- data/lib/sitediff/uriwrapper.rb +104 -34
- data/lib/sitediff/webserver.rb +89 -77
- data/lib/sitediff/webserver/resultserver.rb +113 -77
- metadata +92 -76
- data/lib/sitediff/files/html_report.html.erb +0 -63
- data/lib/sitediff/files/rules/drupal.yaml +0 -33
- data/lib/sitediff/rules.rb +0 -65
@@ -1,92 +1,130 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
require 'sitediff/sanitize'
|
2
4
|
require 'nokogiri'
|
3
5
|
|
4
6
|
class SiteDiff
|
5
|
-
class Sanitizer
|
7
|
+
class Sanitizer
|
8
|
+
# Currently supported transforms:
|
9
|
+
#
|
10
|
+
# * { :type => "unwrap_root" }
|
11
|
+
# * { :type => "unwrap", :selector => "div.field-item" }
|
12
|
+
# * { :type => "remove", :selector => "div.extra-stuff" }
|
13
|
+
# * { :type => "remove_class", :class => 'class1' }
|
14
|
+
# * { :type => "strip", :selector => 'h1' }
|
15
|
+
class DomTransform
|
16
|
+
# Supported dom_transform types.
|
17
|
+
TRANSFORMS = {}
|
6
18
|
|
7
|
-
|
8
|
-
#
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
# * { :type => "remove_class", :class => 'class1' }
|
13
|
-
class DomTransform
|
19
|
+
##
|
20
|
+
# Creates a DOM Transform.
|
21
|
+
def initialize(rule)
|
22
|
+
@rule = rule
|
23
|
+
end
|
14
24
|
|
15
|
-
|
25
|
+
##
|
26
|
+
# Often an array or scalar are both ok values. Turn either into an array.
|
27
|
+
def to_array(val)
|
28
|
+
[val].flatten
|
29
|
+
end
|
16
30
|
|
17
|
-
|
18
|
-
|
19
|
-
|
31
|
+
##
|
32
|
+
# Yields each node under the given node that matches any of the rule's selectors.
|
33
|
+
def targets(node)
|
34
|
+
selectors = to_array(@rule['selector'])
|
35
|
+
selectors.each do |sel|
|
36
|
+
node.css(sel).each { |n| yield n }
|
37
|
+
end
|
38
|
+
end
|
20
39
|
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
40
|
+
##
|
41
|
+
# Applies the transformation to a DOM node.
|
42
|
+
def apply(node)
|
43
|
+
targets(node) { |t| process(t) }
|
44
|
+
end
|
25
45
|
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
end
|
46
|
+
##
|
47
|
+
# Registers a DOM Transform plugin.
|
48
|
+
def self.register(name)
|
49
|
+
TRANSFORMS[name] = self
|
50
|
+
end
|
32
51
|
|
33
|
-
|
34
|
-
|
35
|
-
|
52
|
+
##
|
53
|
+
# Creates a DOM Transform as per rule.
|
54
|
+
def self.create(rule)
|
55
|
+
(type = rule['type']) ||
|
56
|
+
raise(InvalidSanitization, 'DOM transform needs a type')
|
57
|
+
(transform = TRANSFORMS[type]) ||
|
58
|
+
raise(InvalidSanitization, "No DOM transform named #{type}")
|
59
|
+
transform.new(rule)
|
60
|
+
end
|
36
61
|
|
37
|
-
|
38
|
-
|
39
|
-
|
62
|
+
##
|
63
|
+
# Remove elements matching 'selector'.
|
64
|
+
class Remove < DomTransform
|
65
|
+
register 'remove'
|
40
66
|
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
end
|
67
|
+
##
|
68
|
+
# Processes a node.
|
69
|
+
def process(node)
|
70
|
+
node.remove
|
71
|
+
end
|
72
|
+
end
|
48
73
|
|
49
|
-
#
|
50
|
-
class
|
51
|
-
|
52
|
-
def process(node)
|
53
|
-
node.remove
|
54
|
-
end
|
55
|
-
end
|
74
|
+
# Strip leading and trailing whitespace from the text content of elements matching 'selector'.
|
75
|
+
class Strip < DomTransform
|
76
|
+
register 'strip'
|
56
77
|
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
end
|
64
|
-
end
|
78
|
+
##
|
79
|
+
# Processes a node.
|
80
|
+
def process(node)
|
81
|
+
node.content = node.content.strip
|
82
|
+
end
|
83
|
+
end
|
65
84
|
|
66
|
-
#
|
67
|
-
class
|
68
|
-
|
69
|
-
def process(node)
|
70
|
-
classes = to_array(@rule['class'])
|
85
|
+
# Unwrap elements matching 'selector'.
|
86
|
+
class Unwrap < DomTransform
|
87
|
+
register 'unwrap'
|
71
88
|
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
end
|
89
|
+
##
|
90
|
+
# Processes a node.
|
91
|
+
def process(node)
|
92
|
+
node.add_next_sibling(node.children)
|
93
|
+
node.remove
|
94
|
+
end
|
95
|
+
end
|
79
96
|
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
node.children.size == 1 or
|
85
|
-
raise InvalidSanitization, "Multiple root elements in unwrap_root"
|
86
|
-
node.children = node.children[0].children
|
87
|
-
end
|
88
|
-
end
|
97
|
+
##
|
98
|
+
# Remove classes from elements matching selector
|
99
|
+
class RemoveClass < DomTransform
|
100
|
+
register 'remove_class'
|
89
101
|
|
90
|
-
|
91
|
-
|
102
|
+
##
|
103
|
+
# Processes a node.
|
104
|
+
def process(node)
|
105
|
+
classes = to_array(@rule['class'])
|
106
|
+
|
107
|
+
# Must call remove_class on a NodeSet!
|
108
|
+
ns = Nokogiri::XML::NodeSet.new(node.document, [node])
|
109
|
+
classes.each do |class_name|
|
110
|
+
ns.remove_class(class_name)
|
111
|
+
end
|
112
|
+
end
|
113
|
+
end
|
114
|
+
|
115
|
+
##
|
116
|
+
# Unwrap the root element.
|
117
|
+
class UnwrapRoot < DomTransform
|
118
|
+
register 'unwrap_root'
|
119
|
+
|
120
|
+
##
|
121
|
+
# Applies the transformation to a DOM node.
|
122
|
+
def apply(node)
|
123
|
+
(node.children.size == 1) ||
|
124
|
+
raise(InvalidSanitization, 'Multiple root elements in unwrap_root')
|
125
|
+
node.children = node.children[0].children
|
126
|
+
end
|
127
|
+
end
|
128
|
+
end
|
129
|
+
end
|
92
130
|
end
|
@@ -1,56 +1,82 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
class SiteDiff
|
2
|
-
class Sanitizer
|
3
|
-
|
4
|
-
|
5
|
-
|
6
|
-
|
4
|
+
class Sanitizer
|
5
|
+
# Regular Expression Object.
|
6
|
+
class Regexp
|
7
|
+
##
|
8
|
+
# Creates a RegExp object.
|
9
|
+
def initialize(rule)
|
10
|
+
@rule = rule
|
11
|
+
end
|
7
12
|
|
8
|
-
|
9
|
-
|
10
|
-
|
13
|
+
##
|
14
|
+
# Whether the RegExp has a selector.
|
15
|
+
def selector?
|
16
|
+
false
|
17
|
+
end
|
11
18
|
|
12
|
-
|
13
|
-
|
14
|
-
|
19
|
+
##
|
20
|
+
# Whether the RegExp applies to the given markup.
|
21
|
+
def applies?(html, _node)
|
22
|
+
applies_to_string?(html)
|
23
|
+
end
|
15
24
|
|
16
|
-
|
17
|
-
|
18
|
-
|
25
|
+
##
|
26
|
+
# Applies the RegExp to the markup.
|
27
|
+
def apply(html)
|
28
|
+
gsub!(html)
|
29
|
+
end
|
19
30
|
|
20
|
-
|
21
|
-
|
22
|
-
|
31
|
+
##
|
32
|
+
# Creates a RegExp object as per rule.
|
33
|
+
def self.create(rule)
|
34
|
+
rule['selector'] ? WithSelector.new(rule) : new(rule)
|
35
|
+
end
|
23
36
|
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
37
|
+
##
|
38
|
+
# A RegExp with selector.
|
39
|
+
class WithSelector < Regexp
|
40
|
+
##
|
41
|
+
# Whether the RegExp has a selector.
|
42
|
+
def selector?
|
43
|
+
true
|
44
|
+
end
|
28
45
|
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
46
|
+
##
|
47
|
+
# Yields each element under the given node that matches the rule's selector.
|
48
|
+
def contexts(node)
|
49
|
+
selectors = @rule['selector']
|
50
|
+
node.css(selectors).each { |e| yield(e) }
|
51
|
+
end
|
33
52
|
|
34
|
-
|
35
|
-
|
36
|
-
|
53
|
+
##
|
54
|
+
# Whether the RegExp applies to the given markup.
|
55
|
+
def applies?(_html, node)
|
56
|
+
enum_for(:contexts, node).any? { |e| applies_to_string?(e.to_html) }
|
57
|
+
end
|
37
58
|
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
59
|
+
##
|
60
|
+
# Applies the RegExp to the markup.
|
61
|
+
def apply(node)
|
62
|
+
contexts(node) { |e| e.replace(gsub!(e.to_html)) }
|
63
|
+
end
|
64
|
+
end
|
42
65
|
|
43
|
-
protected
|
44
|
-
def gsub!(str)
|
45
|
-
re = ::Regexp.new(@rule['pattern'])
|
46
|
-
sub = @rule['substitute'] || ''
|
47
|
-
str.gsub!(re, sub)
|
48
|
-
str
|
49
|
-
end
|
66
|
+
protected
|
50
67
|
|
51
|
-
|
52
|
-
|
68
|
+
def gsub!(str)
|
69
|
+
re = ::Regexp.new(@rule['pattern'])
|
70
|
+
sub = @rule['substitute'] || ''
|
71
|
+
# Expecting a mutation here. Do not reassign the variable str
|
72
|
+
# for the purpose of removing UTF-8 encoding errors.
|
73
|
+
str.gsub!(re, sub)
|
74
|
+
str
|
75
|
+
end
|
76
|
+
|
77
|
+
def applies_to_string?(str)
|
78
|
+
gsub!(str.dup) != str
|
79
|
+
end
|
80
|
+
end
|
53
81
|
end
|
54
82
|
end
|
55
|
-
end
|
56
|
-
end
|
data/lib/sitediff/uriwrapper.rb
CHANGED
@@ -1,3 +1,5 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
require 'sitediff/exception'
|
2
4
|
require 'typhoeus'
|
3
5
|
require 'addressable/uri'
|
@@ -5,63 +7,94 @@ require 'addressable/uri'
|
|
5
7
|
class SiteDiff
|
6
8
|
class SiteDiffReadFailure < SiteDiffException; end
|
7
9
|
|
10
|
+
# SiteDiff URI Wrapper.
|
8
11
|
class UriWrapper
|
12
|
+
# TODO: Move these CURL OPTS to Config.DEFAULT_CONFIG.
|
13
|
+
DEFAULT_CURL_OPTS = {
|
14
|
+
# Don't hang on servers that don't exist.
|
15
|
+
connecttimeout: 3,
|
16
|
+
# Follow HTTP redirects (code 301 and 302).
|
17
|
+
followlocation: true,
|
18
|
+
headers: {
|
19
|
+
'User-Agent' => 'Sitediff - https://github.com/evolvingweb/sitediff'
|
20
|
+
},
|
21
|
+
# always accept SSL certs
|
22
|
+
ssl_verifypeer: false,
|
23
|
+
ssl_verifyhost: 0
|
24
|
+
}.freeze
|
25
|
+
|
9
26
|
# This lets us treat errors or content as one object
|
10
27
|
class ReadResult
|
11
|
-
attr_accessor :content, :error_code, :error
|
28
|
+
attr_accessor :encoding, :content, :error_code, :error
|
12
29
|
|
13
|
-
|
30
|
+
##
|
31
|
+
# Creates a ReadResult.
|
32
|
+
def initialize(content = nil, encoding = 'utf-8')
|
14
33
|
@content = content
|
34
|
+
@encoding = encoding
|
15
35
|
@error = nil
|
16
36
|
@error_code = nil
|
17
37
|
end
|
18
38
|
|
19
|
-
|
39
|
+
##
|
40
|
+
# Creates a ReadResult with an error.
|
41
|
+
def self.error(message, code = nil)
|
20
42
|
res = new
|
21
43
|
res.error_code = code
|
22
|
-
res.error =
|
23
|
-
|
44
|
+
res.error = message
|
45
|
+
res
|
24
46
|
end
|
25
47
|
end
|
26
48
|
|
27
|
-
|
49
|
+
##
|
50
|
+
# Creates a UriWrapper.
|
51
|
+
def initialize(uri, curl_opts = DEFAULT_CURL_OPTS, debug = true)
|
28
52
|
@uri = uri.respond_to?(:scheme) ? uri : Addressable::URI.parse(uri)
|
29
53
|
# remove trailing '/'s from local URIs
|
30
|
-
@uri.path.gsub!(
|
54
|
+
@uri.path.gsub!(%r{/*$}, '') if local?
|
55
|
+
@curl_opts = curl_opts
|
56
|
+
@debug = debug
|
31
57
|
end
|
32
58
|
|
59
|
+
##
|
60
|
+
# Returns the "user" part of the URI.
|
33
61
|
def user
|
34
62
|
@uri.user
|
35
63
|
end
|
36
64
|
|
65
|
+
##
|
66
|
+
# Returns the "password" part of the URI.
|
37
67
|
def password
|
38
68
|
@uri.password
|
39
69
|
end
|
40
70
|
|
71
|
+
##
|
72
|
+
# Converts the URI to a string.
|
41
73
|
def to_s
|
42
74
|
uri = @uri.dup
|
43
75
|
uri.user = nil
|
44
76
|
uri.password = nil
|
45
|
-
|
77
|
+
uri.to_s
|
46
78
|
end
|
47
79
|
|
80
|
+
##
|
48
81
|
# Is this a local filesystem path?
|
49
82
|
def local?
|
50
|
-
@uri.scheme
|
83
|
+
@uri.scheme.nil?
|
51
84
|
end
|
52
85
|
|
53
|
-
|
54
|
-
|
86
|
+
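# Returns a new UriWrapper for this URI with the given path appended; a '/' separator is inserted for local paths or when the URI path is empty.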
|
87
|
+
# FIXME: this is not used anymore
|
88
|
+
def +(other)
|
55
89
|
# 'path' for SiteDiff includes (parts of) path, query, and fragment.
|
56
90
|
sep = ''
|
57
|
-
if local? || @uri.path.empty?
|
58
|
-
|
59
|
-
end
|
60
|
-
self.class.new(@uri.to_s + sep + path)
|
91
|
+
sep = '/' if local? || @uri.path.empty?
|
92
|
+
self.class.new(@uri.to_s + sep + other)
|
61
93
|
end
|
62
94
|
|
95
|
+
##
|
63
96
|
# Reads a file and yields to the completion handler, see .queue()
|
64
|
-
def read_file
|
97
|
+
def read_file
|
65
98
|
File.open(@uri.to_s, 'r:UTF-8') { |f| yield ReadResult.new(f.read) }
|
66
99
|
rescue Errno::ENOENT, Errno::ENOTDIR, Errno::EACCES, Errno::EISDIR => e
|
67
100
|
yield ReadResult.error(e.message)
|
@@ -69,10 +102,10 @@ class SiteDiff
|
|
69
102
|
|
70
103
|
# Returns the encoding of an HTTP response from headers, nil if not
|
71
104
|
# specified.
|
72
|
-
def
|
73
|
-
if content_type = http_headers['Content-Type']
|
74
|
-
if md = /;\s*charset=([-\w]*)/.match(content_type)
|
75
|
-
|
105
|
+
def charset_encoding(http_headers)
  # @param http_headers [Hash, nil] response headers, read via the
  #   'Content-Type' key.
  # @return [String, nil] the charset name (e.g. "utf-8"), or nil when the
  #   headers are nil, have no Content-Type, or carry no non-empty charset.
  content_type = http_headers && http_headers['Content-Type']
  return unless content_type

  # Match case-insensitively, tolerate an opening quote around the value,
  # and require at least one character: an empty capture would be handed to
  # String#force_encoding by the caller, which raises on an empty name.
  md = /;\s*charset\s*=\s*"?([-\w]+)/i.match(content_type)
  md && md[1]
end
|
@@ -81,33 +114,58 @@ class SiteDiff
|
|
81
114
|
#
|
82
115
|
# Completion callbacks of the request wrap the given handler which is
|
83
116
|
# assumed to accept a single ReadResult argument.
|
84
|
-
def typhoeus_request
|
85
|
-
params =
|
86
|
-
:connecttimeout => 3, # Don't hang on servers that don't exist
|
87
|
-
:followlocation => true, # Follow HTTP redirects (code 301 and 302)
|
88
|
-
:headers => {
|
89
|
-
"User-Agent" => "Sitediff - https://github.com/evolvingweb/sitediff"
|
90
|
-
}
|
91
|
-
}
|
117
|
+
def typhoeus_request
|
118
|
+
params = @curl_opts.dup
|
92
119
|
# Allow basic auth
|
93
120
|
params[:userpwd] = @uri.user + ':' + @uri.password if @uri.user
|
94
121
|
|
95
|
-
req = Typhoeus::Request.new(
|
122
|
+
req = Typhoeus::Request.new(to_s, params)
|
96
123
|
|
97
124
|
req.on_success do |resp|
|
98
125
|
body = resp.body
|
99
126
|
# Typhoeus does not respect HTTP headers when setting the encoding
|
100
127
|
# resp.body; coerce if possible.
|
101
|
-
if encoding =
|
128
|
+
if (encoding = charset_encoding(resp.headers))
|
102
129
|
body.force_encoding(encoding)
|
103
130
|
end
|
104
|
-
|
131
|
+
# Should be wrapped with rescue I guess? Maybe this entire function?
|
132
|
+
# Should at least be an option in the Cli to disable this.
|
133
|
+
# "stop on first error"
|
134
|
+
begin
|
135
|
+
yield ReadResult.new(body, encoding)
|
136
|
+
rescue ArgumentError => e
|
137
|
+
raise if @debug
|
138
|
+
|
139
|
+
yield ReadResult.error(
|
140
|
+
"Parsing error for #{@uri}: #{e.message}"
|
141
|
+
)
|
142
|
+
rescue StandardError => e
|
143
|
+
raise if @debug
|
144
|
+
|
145
|
+
yield ReadResult.error(
|
146
|
+
"Unknown parsing error for #{@uri}: #{e.message}"
|
147
|
+
)
|
148
|
+
end
|
105
149
|
end
|
106
150
|
|
107
151
|
req.on_failure do |resp|
|
108
|
-
|
109
|
-
|
110
|
-
|
152
|
+
if resp&.status_message
|
153
|
+
msg = resp.status_message
|
154
|
+
yield ReadResult.error(
|
155
|
+
"HTTP error when loading #{@uri}: #{msg}",
|
156
|
+
resp.response_code
|
157
|
+
)
|
158
|
+
elsif (msg = resp.options[:return_code])
|
159
|
+
yield ReadResult.error(
|
160
|
+
"Connection error when loading #{@uri}: #{msg}",
|
161
|
+
resp.response_code
|
162
|
+
)
|
163
|
+
else
|
164
|
+
yield ReadResult.error(
|
165
|
+
"Unknown error when loading #{@uri}: #{msg}",
|
166
|
+
resp.response_code
|
167
|
+
)
|
168
|
+
end
|
111
169
|
end
|
112
170
|
|
113
171
|
req
|
@@ -126,5 +184,17 @@ class SiteDiff
|
|
126
184
|
hydra.queue(typhoeus_request(&handler))
|
127
185
|
end
|
128
186
|
end
|
187
|
+
|
188
|
+
##
|
189
|
+
# Canonicalize a path.
|
190
|
+
#
|
191
|
+
# @param [String] path
|
192
|
+
# A base relative path. Example: /foo/bar
|
193
|
+
def self.canonicalize(path)
  # Drop every trailing slash, not only the last one, so that "/foo/" and
  # "/foo//" canonicalize to the same path.
  trimmed = path.sub(%r{/+\z}, '')
  # Nothing left means the input was empty or consisted only of slashes;
  # treat that as the front page.
  trimmed.empty? ? '/' : trimmed
end
|
129
199
|
end
|
130
200
|
end
|