sitediff 0.0.3 → 0.0.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/bin/sitediff +2 -3
- data/lib/sitediff.rb +35 -24
- data/lib/sitediff/cache.rb +53 -47
- data/lib/sitediff/cli.rb +127 -114
- data/lib/sitediff/config.rb +35 -59
- data/lib/sitediff/config/creator.rb +95 -90
- data/lib/sitediff/crawler.rb +83 -72
- data/lib/sitediff/diff.rb +7 -5
- data/lib/sitediff/exception.rb +3 -1
- data/lib/sitediff/fetch.rb +47 -41
- data/lib/sitediff/files/html_report.html.erb +3 -0
- data/lib/sitediff/files/rules/drupal.yaml +36 -6
- data/lib/sitediff/result.rb +13 -11
- data/lib/sitediff/rules.rb +47 -47
- data/lib/sitediff/sanitize.rb +145 -150
- data/lib/sitediff/sanitize/dom_transform.rb +73 -74
- data/lib/sitediff/sanitize/regexp.rb +55 -52
- data/lib/sitediff/uriwrapper.rb +37 -26
- data/lib/sitediff/webserver.rb +80 -77
- data/lib/sitediff/webserver/resultserver.rb +117 -76
- metadata +32 -44
@@ -1,92 +1,91 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
require 'sitediff/sanitize'
|
2
4
|
require 'nokogiri'
|
3
5
|
|
4
6
|
class SiteDiff
|
5
|
-
class Sanitizer
|
7
|
+
class Sanitizer
|
8
|
+
# Currently supported transforms:
|
9
|
+
#
|
10
|
+
# * { :type => "unwrap_root" }
|
11
|
+
# * { :type => "unwrap", :selector => "div.field-item" }
|
12
|
+
# * { :type => "remove", :selector => "div.extra-stuff" }
|
13
|
+
# * { :type => "remove_class", :class => 'class1' }
|
14
|
+
class DomTransform
|
15
|
+
Transforms = {}
|
6
16
|
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
# * { :type => "unwrap", :selector => "div.field-item" }
|
11
|
-
# * { :type => "remove", :selector => "div.extra-stuff" }
|
12
|
-
# * { :type => "remove_class", :class => 'class1' }
|
13
|
-
class DomTransform
|
17
|
+
def initialize(rule)
|
18
|
+
@rule = rule
|
19
|
+
end
|
14
20
|
|
15
|
-
|
21
|
+
# Often an array or scalar are both ok values. Turn either into an array.
|
22
|
+
def to_array(val)
|
23
|
+
[val].flatten
|
24
|
+
end
|
16
25
|
|
17
|
-
def
|
18
|
-
|
19
|
-
|
26
|
+
def targets(node)
|
27
|
+
selectors = to_array(@rule['selector'])
|
28
|
+
selectors.each do |sel|
|
29
|
+
node.css(sel).each { |n| yield n }
|
30
|
+
end
|
31
|
+
end
|
20
32
|
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
end
|
25
|
-
|
26
|
-
def targets(node)
|
27
|
-
selectors = to_array(@rule['selector'])
|
28
|
-
selectors.each do |sel|
|
29
|
-
node.css(sel).each { |n| yield n }
|
30
|
-
end
|
31
|
-
end
|
33
|
+
def apply(node)
|
34
|
+
targets(node) { |t| process(t) }
|
35
|
+
end
|
32
36
|
|
33
|
-
def
|
34
|
-
|
35
|
-
end
|
37
|
+
def self.register(name)
|
38
|
+
Transforms[name] = self
|
39
|
+
end
|
36
40
|
|
37
|
-
def self.
|
38
|
-
|
39
|
-
|
41
|
+
def self.create(rule)
|
42
|
+
(type = rule['type']) ||
|
43
|
+
raise(InvalidSanitization, 'DOM transform needs a type')
|
44
|
+
(transform = Transforms[type]) ||
|
45
|
+
raise(InvalidSanitization, "No DOM transform named #{type}")
|
46
|
+
transform.new(rule)
|
47
|
+
end
|
40
48
|
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
end
|
49
|
+
# Remove elements matching 'selector'
|
50
|
+
class Remove < DomTransform
|
51
|
+
register 'remove'
|
52
|
+
def process(node)
|
53
|
+
node.remove
|
54
|
+
end
|
55
|
+
end
|
48
56
|
|
49
|
-
#
|
50
|
-
class
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
end
|
57
|
+
# Unwrap elements matching 'selector'
|
58
|
+
class Unwrap < DomTransform
|
59
|
+
register 'unwrap'
|
60
|
+
def process(node)
|
61
|
+
node.add_next_sibling(node.children)
|
62
|
+
node.remove
|
63
|
+
end
|
64
|
+
end
|
56
65
|
|
57
|
-
#
|
58
|
-
class
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
node.remove
|
63
|
-
end
|
64
|
-
end
|
66
|
+
# Remove classes from elements matching selector
|
67
|
+
class RemoveClass < DomTransform
|
68
|
+
register 'remove_class'
|
69
|
+
def process(node)
|
70
|
+
classes = to_array(@rule['class'])
|
65
71
|
|
66
|
-
#
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
72
|
+
# Must call remove_class on a NodeSet!
|
73
|
+
ns = Nokogiri::XML::NodeSet.new(node.document, [node])
|
74
|
+
classes.each do |class_name|
|
75
|
+
ns.remove_class(class_name)
|
76
|
+
end
|
77
|
+
end
|
78
|
+
end
|
71
79
|
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
80
|
+
# Unwrap the root element
|
81
|
+
class UnwrapRoot < DomTransform
|
82
|
+
register 'unwrap_root'
|
83
|
+
def apply(node)
|
84
|
+
(node.children.size == 1) ||
|
85
|
+
raise(InvalidSanitization, 'Multiple root elements in unwrap_root')
|
86
|
+
node.children = node.children[0].children
|
87
|
+
end
|
88
|
+
end
|
76
89
|
end
|
77
90
|
end
|
78
91
|
end
|
79
|
-
|
80
|
-
# Unwrap the root element
|
81
|
-
class UnwrapRoot < DomTransform
|
82
|
-
register "unwrap_root"
|
83
|
-
def apply(node)
|
84
|
-
node.children.size == 1 or
|
85
|
-
raise InvalidSanitization, "Multiple root elements in unwrap_root"
|
86
|
-
node.children = node.children[0].children
|
87
|
-
end
|
88
|
-
end
|
89
|
-
|
90
|
-
end
|
91
|
-
end
|
92
|
-
end
|
@@ -1,56 +1,59 @@
|
|
1
|
-
|
2
|
-
class Sanitizer
|
3
|
-
class Regexp
|
4
|
-
def initialize(rule)
|
5
|
-
@rule = rule
|
6
|
-
end
|
7
|
-
|
8
|
-
def selector?
|
9
|
-
false
|
10
|
-
end
|
11
|
-
|
12
|
-
def applies?(html, node)
|
13
|
-
applies_to_string?(html)
|
14
|
-
end
|
15
|
-
|
16
|
-
def apply(html)
|
17
|
-
gsub!(html)
|
18
|
-
end
|
19
|
-
|
20
|
-
def self.create(rule)
|
21
|
-
rule['selector'] ? WithSelector.new(rule) : new(rule)
|
22
|
-
end
|
23
|
-
|
24
|
-
class WithSelector < Regexp
|
25
|
-
def selector?
|
26
|
-
true
|
27
|
-
end
|
1
|
+
# frozen_string_literal: true
|
28
2
|
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
3
|
+
class SiteDiff
|
4
|
+
class Sanitizer
|
5
|
+
class Regexp
|
6
|
+
def initialize(rule)
|
7
|
+
@rule = rule
|
8
|
+
end
|
9
|
+
|
10
|
+
def selector?
|
11
|
+
false
|
12
|
+
end
|
13
|
+
|
14
|
+
def applies?(html, _node)
|
15
|
+
applies_to_string?(html)
|
16
|
+
end
|
17
|
+
|
18
|
+
def apply(html)
|
19
|
+
gsub!(html)
|
20
|
+
end
|
21
|
+
|
22
|
+
def self.create(rule)
|
23
|
+
rule['selector'] ? WithSelector.new(rule) : new(rule)
|
24
|
+
end
|
25
|
+
|
26
|
+
class WithSelector < Regexp
|
27
|
+
def selector?
|
28
|
+
true
|
29
|
+
end
|
30
|
+
|
31
|
+
def contexts(node)
|
32
|
+
sels = @rule['selector']
|
33
|
+
node.css(sels).each { |e| yield(e) }
|
34
|
+
end
|
35
|
+
|
36
|
+
def applies?(_html, node)
|
37
|
+
enum_for(:contexts, node).any? { |e| applies_to_string?(e.to_html) }
|
38
|
+
end
|
39
|
+
|
40
|
+
def apply(node)
|
41
|
+
contexts(node) { |e| e.replace(gsub!(e.to_html)) }
|
42
|
+
end
|
43
|
+
end
|
44
|
+
|
45
|
+
protected
|
46
|
+
|
47
|
+
def gsub!(str)
|
48
|
+
re = ::Regexp.new(@rule['pattern'])
|
49
|
+
sub = @rule['substitute'] || ''
|
50
|
+
str.gsub!(re, sub)
|
51
|
+
str
|
52
|
+
end
|
53
|
+
|
54
|
+
def applies_to_string?(str)
|
55
|
+
gsub!(str.dup) != str
|
56
|
+
end
|
40
57
|
end
|
41
58
|
end
|
42
|
-
|
43
|
-
protected
|
44
|
-
def gsub!(str)
|
45
|
-
re = ::Regexp.new(@rule['pattern'])
|
46
|
-
sub = @rule['substitute'] || ''
|
47
|
-
str.gsub!(re, sub)
|
48
|
-
str
|
49
|
-
end
|
50
|
-
|
51
|
-
def applies_to_string?(str)
|
52
|
-
gsub!(str.dup) != str
|
53
|
-
end
|
54
|
-
end
|
55
|
-
end
|
56
59
|
end
|
data/lib/sitediff/uriwrapper.rb
CHANGED
@@ -1,3 +1,5 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
require 'sitediff/exception'
|
2
4
|
require 'typhoeus'
|
3
5
|
require 'addressable/uri'
|
@@ -6,6 +8,14 @@ class SiteDiff
|
|
6
8
|
class SiteDiffReadFailure < SiteDiffException; end
|
7
9
|
|
8
10
|
class UriWrapper
|
11
|
+
DEFAULT_CURL_OPTS = {
|
12
|
+
connecttimeout: 3, # Don't hang on servers that don't exist
|
13
|
+
followlocation: true, # Follow HTTP redirects (code 301 and 302)
|
14
|
+
headers: {
|
15
|
+
'User-Agent' => 'Sitediff - https://github.com/evolvingweb/sitediff'
|
16
|
+
}
|
17
|
+
}.freeze
|
18
|
+
|
9
19
|
# This lets us treat errors or content as one object
|
10
20
|
class ReadResult
|
11
21
|
attr_accessor :content, :error_code, :error
|
@@ -20,14 +30,15 @@ class SiteDiff
|
|
20
30
|
res = new
|
21
31
|
res.error_code = code
|
22
32
|
res.error = err
|
23
|
-
|
33
|
+
res
|
24
34
|
end
|
25
35
|
end
|
26
36
|
|
27
|
-
def initialize(uri)
|
37
|
+
def initialize(uri, curl_opts = DEFAULT_CURL_OPTS)
|
28
38
|
@uri = uri.respond_to?(:scheme) ? uri : Addressable::URI.parse(uri)
|
29
39
|
# remove trailing '/'s from local URIs
|
30
|
-
@uri.path.gsub!(
|
40
|
+
@uri.path.gsub!(%r{/*$}, '') if local?
|
41
|
+
@curl_opts = curl_opts
|
31
42
|
end
|
32
43
|
|
33
44
|
def user
|
@@ -42,26 +53,24 @@ class SiteDiff
|
|
42
53
|
uri = @uri.dup
|
43
54
|
uri.user = nil
|
44
55
|
uri.password = nil
|
45
|
-
|
56
|
+
uri.to_s
|
46
57
|
end
|
47
58
|
|
48
59
|
# Is this a local filesystem path?
|
49
60
|
def local?
|
50
|
-
@uri.scheme
|
61
|
+
@uri.scheme.nil?
|
51
62
|
end
|
52
63
|
|
53
|
-
# FIXME this is not used anymore
|
64
|
+
# FIXME: this is not used anymore
|
54
65
|
def +(path)
|
55
66
|
# 'path' for SiteDiff includes (parts of) path, query, and fragment.
|
56
67
|
sep = ''
|
57
|
-
if local? || @uri.path.empty?
|
58
|
-
sep = '/'
|
59
|
-
end
|
68
|
+
sep = '/' if local? || @uri.path.empty?
|
60
69
|
self.class.new(@uri.to_s + sep + path)
|
61
70
|
end
|
62
71
|
|
63
72
|
# Reads a file and yields to the completion handler, see .queue()
|
64
|
-
def read_file
|
73
|
+
def read_file
|
65
74
|
File.open(@uri.to_s, 'r:UTF-8') { |f| yield ReadResult.new(f.read) }
|
66
75
|
rescue Errno::ENOENT, Errno::ENOTDIR, Errno::EACCES, Errno::EISDIR => e
|
67
76
|
yield ReadResult.error(e.message)
|
@@ -70,9 +79,9 @@ class SiteDiff
|
|
70
79
|
# Returns the encoding of an HTTP response from headers , nil if not
|
71
80
|
# specified.
|
72
81
|
def http_encoding(http_headers)
|
73
|
-
if content_type = http_headers['Content-Type']
|
74
|
-
if md = /;\s*charset=([-\w]*)/.match(content_type)
|
75
|
-
|
82
|
+
if (content_type = http_headers['Content-Type'])
|
83
|
+
if (md = /;\s*charset=([-\w]*)/.match(content_type))
|
84
|
+
md[1]
|
76
85
|
end
|
77
86
|
end
|
78
87
|
end
|
@@ -81,33 +90,35 @@ class SiteDiff
|
|
81
90
|
#
|
82
91
|
# Completion callbacks of the request wrap the given handler which is
|
83
92
|
# assumed to accept a single ReadResult argument.
|
84
|
-
def typhoeus_request
|
85
|
-
params =
|
86
|
-
:connecttimeout => 3, # Don't hang on servers that don't exist
|
87
|
-
:followlocation => true, # Follow HTTP redirects (code 301 and 302)
|
88
|
-
:headers => {
|
89
|
-
"User-Agent" => "Sitediff - https://github.com/evolvingweb/sitediff"
|
90
|
-
}
|
91
|
-
}
|
93
|
+
def typhoeus_request
|
94
|
+
params = @curl_opts.dup
|
92
95
|
# Allow basic auth
|
93
96
|
params[:userpwd] = @uri.user + ':' + @uri.password if @uri.user
|
94
97
|
|
95
|
-
req = Typhoeus::Request.new(
|
98
|
+
req = Typhoeus::Request.new(to_s, params)
|
96
99
|
|
97
100
|
req.on_success do |resp|
|
98
101
|
body = resp.body
|
99
102
|
# Typhoeus does not respect HTTP headers when setting the encoding
|
100
103
|
# resp.body; coerce if possible.
|
101
|
-
if encoding = http_encoding(resp.headers)
|
104
|
+
if (encoding = http_encoding(resp.headers))
|
102
105
|
body.force_encoding(encoding)
|
103
106
|
end
|
104
107
|
yield ReadResult.new(body)
|
105
108
|
end
|
106
109
|
|
107
110
|
req.on_failure do |resp|
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
+
if resp&.status_message
|
112
|
+
msg = resp.status_message
|
113
|
+
yield ReadResult.error("HTTP error when loading #{@uri}: #{msg}",
|
114
|
+
resp.response_code)
|
115
|
+
elsif (msg = resp.options[:return_code])
|
116
|
+
yield ReadResult.error("Connection error when loading #{@uri}: #{msg}",
|
117
|
+
resp.response_code)
|
118
|
+
else
|
119
|
+
yield ReadResult.error("Unknown error when loading #{@uri}: #{msg}",
|
120
|
+
resp.response_code)
|
121
|
+
end
|
111
122
|
end
|
112
123
|
|
113
124
|
req
|
data/lib/sitediff/webserver.rb
CHANGED
@@ -1,82 +1,85 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
require 'webrick'
|
2
4
|
|
3
5
|
class SiteDiff
|
4
|
-
class Webserver
|
5
|
-
# Simple webserver for testing purposes
|
6
|
-
DEFAULT_PORT =
|
7
|
-
|
8
|
-
attr_accessor :ports
|
9
|
-
|
10
|
-
# Serve a list of directories
|
11
|
-
def initialize(start_port, dirs, opts = {})
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
end
|
25
|
-
|
26
|
-
def kill
|
27
|
-
|
28
|
-
end
|
29
|
-
|
30
|
-
def wait
|
31
|
-
|
32
|
-
end
|
33
|
-
|
34
|
-
def uris
|
35
|
-
|
36
|
-
end
|
37
|
-
|
38
|
-
protected
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
end
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
6
|
+
class Webserver
|
7
|
+
# Simple webserver for testing purposes
|
8
|
+
DEFAULT_PORT = 13_080
|
9
|
+
|
10
|
+
attr_accessor :ports
|
11
|
+
|
12
|
+
# Serve a list of directories
|
13
|
+
def initialize(start_port, dirs, opts = {})
|
14
|
+
start_port ||= DEFAULT_PORT
|
15
|
+
@ports = (start_port...(start_port + dirs.size)).to_a
|
16
|
+
@dirs = dirs
|
17
|
+
@opts = opts
|
18
|
+
|
19
|
+
setup
|
20
|
+
start_servers
|
21
|
+
|
22
|
+
if block_given?
|
23
|
+
yield self
|
24
|
+
kill
|
25
|
+
end
|
26
|
+
end
|
27
|
+
|
28
|
+
def kill
|
29
|
+
@threads.each(&:kill)
|
30
|
+
end
|
31
|
+
|
32
|
+
def wait
|
33
|
+
@threads.each(&:join)
|
34
|
+
end
|
35
|
+
|
36
|
+
def uris
|
37
|
+
ports.map { |p| "http://localhost:#{p}" }
|
38
|
+
end
|
39
|
+
|
40
|
+
protected
|
41
|
+
|
42
|
+
def setup
|
43
|
+
@server_opts = {}
|
44
|
+
if @opts[:quiet]
|
45
|
+
@server_opts[:Logger] = WEBrick::Log.new(IO::NULL)
|
46
|
+
@server_opts[:AccessLog] = []
|
47
|
+
end
|
48
|
+
end
|
49
|
+
|
50
|
+
def server(opts)
|
51
|
+
WEBrick::HTTPServer.new(opts)
|
52
|
+
end
|
53
|
+
|
54
|
+
def start_servers
|
55
|
+
@threads = []
|
56
|
+
@dirs.each_with_index do |dir, idx|
|
57
|
+
@server_opts[:Port] = @ports[idx]
|
58
|
+
@server_opts[:DocumentRoot] = dir
|
59
|
+
srv = server(@server_opts)
|
60
|
+
@threads << Thread.new { srv.start }
|
61
|
+
end
|
62
|
+
end
|
63
|
+
|
64
|
+
public
|
65
|
+
|
66
|
+
class FixtureServer < Webserver
|
67
|
+
PORT = DEFAULT_PORT + 1
|
68
|
+
BASE = 'spec/fixtures/ruby-doc.org'
|
69
|
+
NAMES = %w[core-1.9.3 core-2.0].freeze
|
70
|
+
|
71
|
+
def initialize(port = PORT, base = BASE, names = NAMES)
|
72
|
+
dirs = names.map { |n| File.join(base, n) }
|
73
|
+
super(port, dirs, quiet: true)
|
74
|
+
end
|
75
|
+
|
76
|
+
def before
|
77
|
+
uris.first
|
78
|
+
end
|
79
|
+
|
80
|
+
def after
|
81
|
+
uris.last
|
82
|
+
end
|
83
|
+
end
|
58
84
|
end
|
59
85
|
end
|
60
|
-
|
61
|
-
public
|
62
|
-
|
63
|
-
class FixtureServer < Webserver
|
64
|
-
PORT = DEFAULT_PORT + 1
|
65
|
-
BASE = 'spec/fixtures/ruby-doc.org'
|
66
|
-
NAMES = %w[core-1.9.3 core-2.0]
|
67
|
-
|
68
|
-
def initialize(port = PORT, base = BASE, names = NAMES)
|
69
|
-
dirs = names.map { |n| File.join(base, n) }
|
70
|
-
super(port, dirs, :quiet => true)
|
71
|
-
end
|
72
|
-
|
73
|
-
def before
|
74
|
-
uris.first
|
75
|
-
end
|
76
|
-
def after
|
77
|
-
uris.last
|
78
|
-
end
|
79
|
-
end
|
80
|
-
|
81
|
-
end
|
82
|
-
end
|