sitediff 0.0.3 → 0.0.5
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/bin/sitediff +2 -3
- data/lib/sitediff.rb +35 -24
- data/lib/sitediff/cache.rb +53 -47
- data/lib/sitediff/cli.rb +127 -114
- data/lib/sitediff/config.rb +35 -59
- data/lib/sitediff/config/creator.rb +95 -90
- data/lib/sitediff/crawler.rb +83 -72
- data/lib/sitediff/diff.rb +7 -5
- data/lib/sitediff/exception.rb +3 -1
- data/lib/sitediff/fetch.rb +47 -41
- data/lib/sitediff/files/html_report.html.erb +3 -0
- data/lib/sitediff/files/rules/drupal.yaml +36 -6
- data/lib/sitediff/result.rb +13 -11
- data/lib/sitediff/rules.rb +47 -47
- data/lib/sitediff/sanitize.rb +145 -150
- data/lib/sitediff/sanitize/dom_transform.rb +73 -74
- data/lib/sitediff/sanitize/regexp.rb +55 -52
- data/lib/sitediff/uriwrapper.rb +37 -26
- data/lib/sitediff/webserver.rb +80 -77
- data/lib/sitediff/webserver/resultserver.rb +117 -76
- metadata +32 -44
@@ -1,92 +1,91 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
require 'sitediff/sanitize'
|
2
4
|
require 'nokogiri'
|
3
5
|
|
4
6
|
class SiteDiff
|
5
|
-
class Sanitizer
|
7
|
+
class Sanitizer
|
8
|
+
# Currently supported transforms:
|
9
|
+
#
|
10
|
+
# * { :type => "unwrap_root" }
|
11
|
+
# * { :type => "unwrap", :selector => "div.field-item" }
|
12
|
+
# * { :type => "remove", :selector => "div.extra-stuff" }
|
13
|
+
# * { :type => "remove_class", :class => 'class1' }
|
14
|
+
class DomTransform
|
15
|
+
Transforms = {}
|
6
16
|
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
# * { :type => "unwrap", :selector => "div.field-item" }
|
11
|
-
# * { :type => "remove", :selector => "div.extra-stuff" }
|
12
|
-
# * { :type => "remove_class", :class => 'class1' }
|
13
|
-
class DomTransform
|
17
|
+
def initialize(rule)
|
18
|
+
@rule = rule
|
19
|
+
end
|
14
20
|
|
15
|
-
|
21
|
+
# Often either an array or a scalar is an acceptable value. Turn either into an array.
|
22
|
+
def to_array(val)
|
23
|
+
[val].flatten
|
24
|
+
end
|
16
25
|
|
17
|
-
def
|
18
|
-
|
19
|
-
|
26
|
+
def targets(node)
|
27
|
+
selectors = to_array(@rule['selector'])
|
28
|
+
selectors.each do |sel|
|
29
|
+
node.css(sel).each { |n| yield n }
|
30
|
+
end
|
31
|
+
end
|
20
32
|
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
end
|
25
|
-
|
26
|
-
def targets(node)
|
27
|
-
selectors = to_array(@rule['selector'])
|
28
|
-
selectors.each do |sel|
|
29
|
-
node.css(sel).each { |n| yield n }
|
30
|
-
end
|
31
|
-
end
|
33
|
+
def apply(node)
|
34
|
+
targets(node) { |t| process(t) }
|
35
|
+
end
|
32
36
|
|
33
|
-
def
|
34
|
-
|
35
|
-
end
|
37
|
+
def self.register(name)
|
38
|
+
Transforms[name] = self
|
39
|
+
end
|
36
40
|
|
37
|
-
def self.
|
38
|
-
|
39
|
-
|
41
|
+
def self.create(rule)
|
42
|
+
(type = rule['type']) ||
|
43
|
+
raise(InvalidSanitization, 'DOM transform needs a type')
|
44
|
+
(transform = Transforms[type]) ||
|
45
|
+
raise(InvalidSanitization, "No DOM transform named #{type}")
|
46
|
+
transform.new(rule)
|
47
|
+
end
|
40
48
|
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
end
|
49
|
+
# Remove elements matching 'selector'
|
50
|
+
class Remove < DomTransform
|
51
|
+
register 'remove'
|
52
|
+
def process(node)
|
53
|
+
node.remove
|
54
|
+
end
|
55
|
+
end
|
48
56
|
|
49
|
-
#
|
50
|
-
class
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
end
|
57
|
+
# Unwrap elements matching 'selector'
|
58
|
+
class Unwrap < DomTransform
|
59
|
+
register 'unwrap'
|
60
|
+
def process(node)
|
61
|
+
node.add_next_sibling(node.children)
|
62
|
+
node.remove
|
63
|
+
end
|
64
|
+
end
|
56
65
|
|
57
|
-
#
|
58
|
-
class
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
node.remove
|
63
|
-
end
|
64
|
-
end
|
66
|
+
# Remove classes from elements matching selector
|
67
|
+
class RemoveClass < DomTransform
|
68
|
+
register 'remove_class'
|
69
|
+
def process(node)
|
70
|
+
classes = to_array(@rule['class'])
|
65
71
|
|
66
|
-
#
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
72
|
+
# Must call remove_class on a NodeSet!
|
73
|
+
ns = Nokogiri::XML::NodeSet.new(node.document, [node])
|
74
|
+
classes.each do |class_name|
|
75
|
+
ns.remove_class(class_name)
|
76
|
+
end
|
77
|
+
end
|
78
|
+
end
|
71
79
|
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
80
|
+
# Unwrap the root element
|
81
|
+
class UnwrapRoot < DomTransform
|
82
|
+
register 'unwrap_root'
|
83
|
+
def apply(node)
|
84
|
+
(node.children.size == 1) ||
|
85
|
+
raise(InvalidSanitization, 'Multiple root elements in unwrap_root')
|
86
|
+
node.children = node.children[0].children
|
87
|
+
end
|
88
|
+
end
|
76
89
|
end
|
77
90
|
end
|
78
91
|
end
|
79
|
-
|
80
|
-
# Unwrap the root element
|
81
|
-
class UnwrapRoot < DomTransform
|
82
|
-
register "unwrap_root"
|
83
|
-
def apply(node)
|
84
|
-
node.children.size == 1 or
|
85
|
-
raise InvalidSanitization, "Multiple root elements in unwrap_root"
|
86
|
-
node.children = node.children[0].children
|
87
|
-
end
|
88
|
-
end
|
89
|
-
|
90
|
-
end
|
91
|
-
end
|
92
|
-
end
|
@@ -1,56 +1,59 @@
|
|
1
|
-
|
2
|
-
class Sanitizer
|
3
|
-
class Regexp
|
4
|
-
def initialize(rule)
|
5
|
-
@rule = rule
|
6
|
-
end
|
7
|
-
|
8
|
-
def selector?
|
9
|
-
false
|
10
|
-
end
|
11
|
-
|
12
|
-
def applies?(html, node)
|
13
|
-
applies_to_string?(html)
|
14
|
-
end
|
15
|
-
|
16
|
-
def apply(html)
|
17
|
-
gsub!(html)
|
18
|
-
end
|
19
|
-
|
20
|
-
def self.create(rule)
|
21
|
-
rule['selector'] ? WithSelector.new(rule) : new(rule)
|
22
|
-
end
|
23
|
-
|
24
|
-
class WithSelector < Regexp
|
25
|
-
def selector?
|
26
|
-
true
|
27
|
-
end
|
1
|
+
# frozen_string_literal: true
|
28
2
|
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
3
|
+
class SiteDiff
|
4
|
+
class Sanitizer
|
5
|
+
class Regexp
|
6
|
+
def initialize(rule)
|
7
|
+
@rule = rule
|
8
|
+
end
|
9
|
+
|
10
|
+
def selector?
|
11
|
+
false
|
12
|
+
end
|
13
|
+
|
14
|
+
def applies?(html, _node)
|
15
|
+
applies_to_string?(html)
|
16
|
+
end
|
17
|
+
|
18
|
+
def apply(html)
|
19
|
+
gsub!(html)
|
20
|
+
end
|
21
|
+
|
22
|
+
def self.create(rule)
|
23
|
+
rule['selector'] ? WithSelector.new(rule) : new(rule)
|
24
|
+
end
|
25
|
+
|
26
|
+
class WithSelector < Regexp
|
27
|
+
def selector?
|
28
|
+
true
|
29
|
+
end
|
30
|
+
|
31
|
+
def contexts(node)
|
32
|
+
sels = @rule['selector']
|
33
|
+
node.css(sels).each { |e| yield(e) }
|
34
|
+
end
|
35
|
+
|
36
|
+
def applies?(_html, node)
|
37
|
+
enum_for(:contexts, node).any? { |e| applies_to_string?(e.to_html) }
|
38
|
+
end
|
39
|
+
|
40
|
+
def apply(node)
|
41
|
+
contexts(node) { |e| e.replace(gsub!(e.to_html)) }
|
42
|
+
end
|
43
|
+
end
|
44
|
+
|
45
|
+
protected
|
46
|
+
|
47
|
+
def gsub!(str)
|
48
|
+
re = ::Regexp.new(@rule['pattern'])
|
49
|
+
sub = @rule['substitute'] || ''
|
50
|
+
str.gsub!(re, sub)
|
51
|
+
str
|
52
|
+
end
|
53
|
+
|
54
|
+
def applies_to_string?(str)
|
55
|
+
gsub!(str.dup) != str
|
56
|
+
end
|
40
57
|
end
|
41
58
|
end
|
42
|
-
|
43
|
-
protected
|
44
|
-
def gsub!(str)
|
45
|
-
re = ::Regexp.new(@rule['pattern'])
|
46
|
-
sub = @rule['substitute'] || ''
|
47
|
-
str.gsub!(re, sub)
|
48
|
-
str
|
49
|
-
end
|
50
|
-
|
51
|
-
def applies_to_string?(str)
|
52
|
-
gsub!(str.dup) != str
|
53
|
-
end
|
54
|
-
end
|
55
|
-
end
|
56
59
|
end
|
data/lib/sitediff/uriwrapper.rb
CHANGED
@@ -1,3 +1,5 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
require 'sitediff/exception'
|
2
4
|
require 'typhoeus'
|
3
5
|
require 'addressable/uri'
|
@@ -6,6 +8,14 @@ class SiteDiff
|
|
6
8
|
class SiteDiffReadFailure < SiteDiffException; end
|
7
9
|
|
8
10
|
class UriWrapper
|
11
|
+
DEFAULT_CURL_OPTS = {
|
12
|
+
connecttimeout: 3, # Don't hang on servers that don't exist
|
13
|
+
followlocation: true, # Follow HTTP redirects (code 301 and 302)
|
14
|
+
headers: {
|
15
|
+
'User-Agent' => 'Sitediff - https://github.com/evolvingweb/sitediff'
|
16
|
+
}
|
17
|
+
}.freeze
|
18
|
+
|
9
19
|
# This lets us treat errors or content as one object
|
10
20
|
class ReadResult
|
11
21
|
attr_accessor :content, :error_code, :error
|
@@ -20,14 +30,15 @@ class SiteDiff
|
|
20
30
|
res = new
|
21
31
|
res.error_code = code
|
22
32
|
res.error = err
|
23
|
-
|
33
|
+
res
|
24
34
|
end
|
25
35
|
end
|
26
36
|
|
27
|
-
def initialize(uri)
|
37
|
+
def initialize(uri, curl_opts = DEFAULT_CURL_OPTS)
|
28
38
|
@uri = uri.respond_to?(:scheme) ? uri : Addressable::URI.parse(uri)
|
29
39
|
# remove trailing '/'s from local URIs
|
30
|
-
@uri.path.gsub!(
|
40
|
+
@uri.path.gsub!(%r{/*$}, '') if local?
|
41
|
+
@curl_opts = curl_opts
|
31
42
|
end
|
32
43
|
|
33
44
|
def user
|
@@ -42,26 +53,24 @@ class SiteDiff
|
|
42
53
|
uri = @uri.dup
|
43
54
|
uri.user = nil
|
44
55
|
uri.password = nil
|
45
|
-
|
56
|
+
uri.to_s
|
46
57
|
end
|
47
58
|
|
48
59
|
# Is this a local filesystem path?
|
49
60
|
def local?
|
50
|
-
@uri.scheme
|
61
|
+
@uri.scheme.nil?
|
51
62
|
end
|
52
63
|
|
53
|
-
# FIXME this is not used anymore
|
64
|
+
# FIXME: this is not used anymore
|
54
65
|
def +(path)
|
55
66
|
# 'path' for SiteDiff includes (parts of) path, query, and fragment.
|
56
67
|
sep = ''
|
57
|
-
if local? || @uri.path.empty?
|
58
|
-
sep = '/'
|
59
|
-
end
|
68
|
+
sep = '/' if local? || @uri.path.empty?
|
60
69
|
self.class.new(@uri.to_s + sep + path)
|
61
70
|
end
|
62
71
|
|
63
72
|
# Reads a file and yields to the completion handler, see .queue()
|
64
|
-
def read_file
|
73
|
+
def read_file
|
65
74
|
File.open(@uri.to_s, 'r:UTF-8') { |f| yield ReadResult.new(f.read) }
|
66
75
|
rescue Errno::ENOENT, Errno::ENOTDIR, Errno::EACCES, Errno::EISDIR => e
|
67
76
|
yield ReadResult.error(e.message)
|
@@ -70,9 +79,9 @@ class SiteDiff
|
|
70
79
|
# Returns the encoding of an HTTP response from headers, nil if not
|
71
80
|
# specified.
|
72
81
|
def http_encoding(http_headers)
|
73
|
-
if content_type = http_headers['Content-Type']
|
74
|
-
if md = /;\s*charset=([-\w]*)/.match(content_type)
|
75
|
-
|
82
|
+
if (content_type = http_headers['Content-Type'])
|
83
|
+
if (md = /;\s*charset=([-\w]*)/.match(content_type))
|
84
|
+
md[1]
|
76
85
|
end
|
77
86
|
end
|
78
87
|
end
|
@@ -81,33 +90,35 @@ class SiteDiff
|
|
81
90
|
#
|
82
91
|
# Completion callbacks of the request wrap the given handler which is
|
83
92
|
# assumed to accept a single ReadResult argument.
|
84
|
-
def typhoeus_request
|
85
|
-
params =
|
86
|
-
:connecttimeout => 3, # Don't hang on servers that don't exist
|
87
|
-
:followlocation => true, # Follow HTTP redirects (code 301 and 302)
|
88
|
-
:headers => {
|
89
|
-
"User-Agent" => "Sitediff - https://github.com/evolvingweb/sitediff"
|
90
|
-
}
|
91
|
-
}
|
93
|
+
def typhoeus_request
|
94
|
+
params = @curl_opts.dup
|
92
95
|
# Allow basic auth
|
93
96
|
params[:userpwd] = @uri.user + ':' + @uri.password if @uri.user
|
94
97
|
|
95
|
-
req = Typhoeus::Request.new(
|
98
|
+
req = Typhoeus::Request.new(to_s, params)
|
96
99
|
|
97
100
|
req.on_success do |resp|
|
98
101
|
body = resp.body
|
99
102
|
# Typhoeus does not respect HTTP headers when setting the encoding
|
100
103
|
# of resp.body; coerce if possible.
|
101
|
-
if encoding = http_encoding(resp.headers)
|
104
|
+
if (encoding = http_encoding(resp.headers))
|
102
105
|
body.force_encoding(encoding)
|
103
106
|
end
|
104
107
|
yield ReadResult.new(body)
|
105
108
|
end
|
106
109
|
|
107
110
|
req.on_failure do |resp|
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
+
if resp&.status_message
|
112
|
+
msg = resp.status_message
|
113
|
+
yield ReadResult.error("HTTP error when loading #{@uri}: #{msg}",
|
114
|
+
resp.response_code)
|
115
|
+
elsif (msg = resp.options[:return_code])
|
116
|
+
yield ReadResult.error("Connection error when loading #{@uri}: #{msg}",
|
117
|
+
resp.response_code)
|
118
|
+
else
|
119
|
+
yield ReadResult.error("Unknown error when loading #{@uri}: #{msg}",
|
120
|
+
resp.response_code)
|
121
|
+
end
|
111
122
|
end
|
112
123
|
|
113
124
|
req
|
data/lib/sitediff/webserver.rb
CHANGED
@@ -1,82 +1,85 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
require 'webrick'
|
2
4
|
|
3
5
|
class SiteDiff
|
4
|
-
class Webserver
|
5
|
-
# Simple webserver for testing purposes
|
6
|
-
DEFAULT_PORT =
|
7
|
-
|
8
|
-
attr_accessor :ports
|
9
|
-
|
10
|
-
# Serve a list of directories
|
11
|
-
def initialize(start_port, dirs, opts = {})
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
end
|
25
|
-
|
26
|
-
def kill
|
27
|
-
|
28
|
-
end
|
29
|
-
|
30
|
-
def wait
|
31
|
-
|
32
|
-
end
|
33
|
-
|
34
|
-
def uris
|
35
|
-
|
36
|
-
end
|
37
|
-
|
38
|
-
protected
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
end
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
6
|
+
class Webserver
|
7
|
+
# Simple webserver for testing purposes
|
8
|
+
DEFAULT_PORT = 13_080
|
9
|
+
|
10
|
+
attr_accessor :ports
|
11
|
+
|
12
|
+
# Serve a list of directories
|
13
|
+
def initialize(start_port, dirs, opts = {})
|
14
|
+
start_port ||= DEFAULT_PORT
|
15
|
+
@ports = (start_port...(start_port + dirs.size)).to_a
|
16
|
+
@dirs = dirs
|
17
|
+
@opts = opts
|
18
|
+
|
19
|
+
setup
|
20
|
+
start_servers
|
21
|
+
|
22
|
+
if block_given?
|
23
|
+
yield self
|
24
|
+
kill
|
25
|
+
end
|
26
|
+
end
|
27
|
+
|
28
|
+
def kill
|
29
|
+
@threads.each(&:kill)
|
30
|
+
end
|
31
|
+
|
32
|
+
def wait
|
33
|
+
@threads.each(&:join)
|
34
|
+
end
|
35
|
+
|
36
|
+
def uris
|
37
|
+
ports.map { |p| "http://localhost:#{p}" }
|
38
|
+
end
|
39
|
+
|
40
|
+
protected
|
41
|
+
|
42
|
+
def setup
|
43
|
+
@server_opts = {}
|
44
|
+
if @opts[:quiet]
|
45
|
+
@server_opts[:Logger] = WEBrick::Log.new(IO::NULL)
|
46
|
+
@server_opts[:AccessLog] = []
|
47
|
+
end
|
48
|
+
end
|
49
|
+
|
50
|
+
def server(opts)
|
51
|
+
WEBrick::HTTPServer.new(opts)
|
52
|
+
end
|
53
|
+
|
54
|
+
def start_servers
|
55
|
+
@threads = []
|
56
|
+
@dirs.each_with_index do |dir, idx|
|
57
|
+
@server_opts[:Port] = @ports[idx]
|
58
|
+
@server_opts[:DocumentRoot] = dir
|
59
|
+
srv = server(@server_opts)
|
60
|
+
@threads << Thread.new { srv.start }
|
61
|
+
end
|
62
|
+
end
|
63
|
+
|
64
|
+
public
|
65
|
+
|
66
|
+
class FixtureServer < Webserver
|
67
|
+
PORT = DEFAULT_PORT + 1
|
68
|
+
BASE = 'spec/fixtures/ruby-doc.org'
|
69
|
+
NAMES = %w[core-1.9.3 core-2.0].freeze
|
70
|
+
|
71
|
+
def initialize(port = PORT, base = BASE, names = NAMES)
|
72
|
+
dirs = names.map { |n| File.join(base, n) }
|
73
|
+
super(port, dirs, quiet: true)
|
74
|
+
end
|
75
|
+
|
76
|
+
def before
|
77
|
+
uris.first
|
78
|
+
end
|
79
|
+
|
80
|
+
def after
|
81
|
+
uris.last
|
82
|
+
end
|
83
|
+
end
|
58
84
|
end
|
59
85
|
end
|
60
|
-
|
61
|
-
public
|
62
|
-
|
63
|
-
class FixtureServer < Webserver
|
64
|
-
PORT = DEFAULT_PORT + 1
|
65
|
-
BASE = 'spec/fixtures/ruby-doc.org'
|
66
|
-
NAMES = %w[core-1.9.3 core-2.0]
|
67
|
-
|
68
|
-
def initialize(port = PORT, base = BASE, names = NAMES)
|
69
|
-
dirs = names.map { |n| File.join(base, n) }
|
70
|
-
super(port, dirs, :quiet => true)
|
71
|
-
end
|
72
|
-
|
73
|
-
def before
|
74
|
-
uris.first
|
75
|
-
end
|
76
|
-
def after
|
77
|
-
uris.last
|
78
|
-
end
|
79
|
-
end
|
80
|
-
|
81
|
-
end
|
82
|
-
end
|