sitediff 0.0.3 → 0.0.5
- checksums.yaml +4 -4
- data/bin/sitediff +2 -3
- data/lib/sitediff.rb +35 -24
- data/lib/sitediff/cache.rb +53 -47
- data/lib/sitediff/cli.rb +127 -114
- data/lib/sitediff/config.rb +35 -59
- data/lib/sitediff/config/creator.rb +95 -90
- data/lib/sitediff/crawler.rb +83 -72
- data/lib/sitediff/diff.rb +7 -5
- data/lib/sitediff/exception.rb +3 -1
- data/lib/sitediff/fetch.rb +47 -41
- data/lib/sitediff/files/html_report.html.erb +3 -0
- data/lib/sitediff/files/rules/drupal.yaml +36 -6
- data/lib/sitediff/result.rb +13 -11
- data/lib/sitediff/rules.rb +47 -47
- data/lib/sitediff/sanitize.rb +145 -150
- data/lib/sitediff/sanitize/dom_transform.rb +73 -74
- data/lib/sitediff/sanitize/regexp.rb +55 -52
- data/lib/sitediff/uriwrapper.rb +37 -26
- data/lib/sitediff/webserver.rb +80 -77
- data/lib/sitediff/webserver/resultserver.rb +117 -76
- metadata +32 -44
data/lib/sitediff/config.rb
CHANGED
@@ -1,3 +1,5 @@
+# frozen_string_literal: true
+
 require 'sitediff/exception'
 require 'sitediff/sanitize'
 require 'pathname'
@@ -9,7 +11,7 @@ class SiteDiff
 
     # keys allowed in configuration files
     CONF_KEYS = Sanitizer::TOOLS.values.flatten(1) +
-                %w[paths before after before_url after_url includes]
+                %w[paths before after before_url after_url includes curl_opts]
 
     class InvalidConfig < SiteDiffException; end
     class ConfigNotFound < SiteDiffException; end
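With 'curl_opts' added to CONF_KEYS, a top-level curl options block now passes key validation. A minimal sketch (hypothetical file; the option names under curl_opts are illustrative Typhoeus options, not taken from this gem):

require 'yaml'

# Hypothetical sitediff.yaml exercising the newly allowed 'curl_opts' key.
raw = YAML.safe_load(<<~CONF)
  before_url: http://legacy.example.test
  after_url: http://rebuilt.example.test
  curl_opts:
    followlocation: true
    connecttimeout: 5
  paths:
    - /
    - about
CONF

allowed = %w[paths before after before_url after_url includes curl_opts]
raw.each_key { |k| raise "unknown key: #{k}" unless allowed.include?(k) }
# Passes; this mirrors the check load_raw_yaml performs against CONF_KEYS.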
@@ -41,13 +43,14 @@ class SiteDiff
           conf[pos][key] ||= []
           conf[pos][key] += conf[key] if conf[key]
         end
-        tools[:scalar].each {|key| conf[pos][key] ||= conf[key]}
+        tools[:scalar].each { |key| conf[pos][key] ||= conf[key] }
         conf[pos]['url'] ||= conf[pos + '_url']
+        conf[pos]['curl_opts'] = conf['curl_opts']
       end
       # normalize paths
-      conf['paths'] = Config
+      conf['paths'] = Config.normalize_paths(conf['paths'])
 
-      conf.select {|k,
+      conf.select { |k, _v| %w[before after paths curl_opts].include? k }
     end
 
     # Merges two normalized Hashes according to the following rules:
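Given the hunk above, normalization now copies the shared 'curl_opts' into each side and keeps it among the selected top-level keys. A sketch of the assumed output shape (inferred from this diff, not run against the gem; note String#chomp strips a trailing newline, not a trailing slash):

# Input as loaded from YAML (keys as in the example above):
conf = {
  'before_url' => 'http://legacy.example.test',
  'after_url'  => 'http://rebuilt.example.test',
  'curl_opts'  => { 'followlocation' => true },
  'paths'      => ['about', "/contact\n"]
}

# Config.normalize(conf) should then return roughly:
# {
#   'paths'     => ['/about', '/contact'],
#   'before'    => { 'url' => 'http://legacy.example.test',
#                    'curl_opts' => { 'followlocation' => true } },
#   'after'     => { 'url' => 'http://rebuilt.example.test',
#                    'curl_opts' => { 'followlocation' => true } },
#   'curl_opts' => { 'followlocation' => true }
# }
# (plus empty arrays for the sanitizer's array tools, and with
# 'before_url'/'after_url' dropped by the final conf.select)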
@@ -72,56 +75,33 @@ class SiteDiff
           next
         end
         result[pos] = first[pos].merge!(second[pos]) do |key, a, b|
-          if Sanitizer::TOOLS[:array].include? key # rule 2a
-
-
-
-
+          result[pos][key] = if Sanitizer::TOOLS[:array].include? key # rule 2a
+                               (a || []) + (b || [])
+                             else
+                               a || b # rule 2b
+                             end
         end
       end
       result
     end
 
-
-
-    def self.search
-      subdirs = %w[. sitediff]
-      root_indicators = %w[.git .svn]
-
-      Pathname.pwd.ascend do |dir|
-        subdirs.each do |subdir|
-          d = dir + subdir + DEFAULT_FILENAME
-          if d.exist?
-            Dir.chdir(dir.+(subdir).to_s)
-            return [DEFAULT_FILENAME]
-          end
-        end
-
-        root_indicators.each { |r| return [] if dir.+(r).exist? }
-      end
-
-      return []
-    end
-
-    def initialize(files, opts = {})
-      @config = {'paths' => [], 'before' => {}, 'after' => {} }
-
-      files = Config.search if files.empty? && opts[:search]
-      files = [DEFAULT_FILENAME] if files.empty? &&
-        File.exists?(DEFAULT_FILENAME)
-      raise ConfigNotFound, "No configuration file found." if files.empty?
+    def initialize(files, dir)
+      @config = { 'paths' => [], 'before' => {}, 'after' => {} }
 
+      files = [File.join(dir, DEFAULT_FILENAME)] if files.empty?
       files.each do |file|
-
-
-
-
+        unless File.exist?(file)
+          raise InvalidConfig,
+                format('Missing config file %s.', File.expand_path(file))
+        end
+        @config = Config.merge(@config, Config.load_conf(file))
       end
     end
 
     def before
       @config['before']
     end
+
     def after
       @config['after']
     end
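A hedged illustration of rules 2a and 2b, assuming Sanitizer::TOOLS classifies 'sanitization' as an array tool (consistent with sanitize.rb elsewhere in this gem) and that both inputs are already normalized:

require 'sitediff/config'

first  = { 'paths' => [], 'after' => {},
           'before' => { 'url' => 'http://a.test',
                         'sanitization' => [{ 'pattern' => 'foo' }] } }
second = { 'paths' => [], 'after' => {},
           'before' => { 'url' => 'http://b.test',
                         'sanitization' => [{ 'pattern' => 'bar' }] } }

merged = SiteDiff::Config.merge(first, second)
merged['before']['sanitization'] # => both patterns, concatenated (rule 2a)
merged['before']['url']          # => 'http://a.test' (rule 2b: first non-nil wins)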
@@ -129,61 +109,57 @@ class SiteDiff
     def paths
       @config['paths']
     end
+
     def paths=(paths)
-      @config['paths'] = Config
+      @config['paths'] = Config.normalize_paths(paths)
     end
 
     # Checks if the configuration is usable for diff-ing.
     def validate(opts = {})
-      opts = { :
+      opts = { need_before: true }.merge(opts)
 
       raise InvalidConfig, "Undefined 'before' base URL." if \
         opts[:need_before] && !before['url']
       raise InvalidConfig, "Undefined 'after' base URL." unless after['url']
-      raise InvalidConfig, "Undefined 'paths'." unless
+      raise InvalidConfig, "Undefined 'paths'." unless paths && !paths.empty?
     end
 
     private
 
     def self.normalize_paths(paths)
       paths ||= []
-
+      paths.map { |p| (p[0] == '/' ? p : "/#{p}").chomp }
     end
 
     # reads a YAML file and raises an InvalidConfig if the file is not valid.
     def self.load_raw_yaml(file)
-      SiteDiff
+      SiteDiff.log "Reading config file: #{Pathname.new(file).expand_path}"
       conf = YAML.load_file(file) || {}
-      unless conf.is_a? Hash
-
-
-
-        unless CONF_KEYS.include? k
-          raise InvalidConfig, "Unknown configuration key (#{file}): '#{k}'"
-        end
+      raise InvalidConfig, "Invalid configuration file: '#{file}'" unless conf.is_a? Hash
+
+      conf.each_key do |k, _v|
+        raise InvalidConfig, "Unknown configuration key (#{file}): '#{k}'" unless CONF_KEYS.include? k
       end
       conf
     end
 
     # loads a single YAML configuration file, merges all its 'included' files
     # and returns a normalized Hash.
-    def self.load_conf(file, visited=[])
+    def self.load_conf(file, visited = [])
       # don't get fooled by a/../a/ or symlinks
       file = File.realpath(file)
-      if visited.include? file
-        raise InvalidConfig, "Circular dependency: #{file}"
-      end
+      raise InvalidConfig, "Circular dependency: #{file}" if visited.include? file
 
       conf = load_raw_yaml(file) # not normalized yet
       visited << file
 
       # normalize and merge includes
       includes = conf['includes'] || []
-      conf = Config
+      conf = Config.normalize(conf)
       includes.each do |dep|
         # include paths are relative to the including file.
         dep = File.join(File.dirname(file), dep)
-        conf = Config
+        conf = Config.merge(conf, load_conf(dep, visited))
       end
       conf
     end
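The circular-dependency guard is easy to exercise. A disposable sketch (file names hypothetical; assumes the gem is installed and on the load path):

require 'sitediff'
require 'sitediff/config'
require 'tmpdir'

Dir.mktmpdir do |dir|
  # Two hypothetical configs that include each other.
  File.write(File.join(dir, 'a.yaml'), "includes: ['b.yaml']\n")
  File.write(File.join(dir, 'b.yaml'), "includes: ['a.yaml']\n")
  begin
    SiteDiff::Config.load_conf(File.join(dir, 'a.yaml'))
  rescue SiteDiff::Config::InvalidConfig => e
    puts e.message # => Circular dependency: /.../a.yaml
  end
end

File.realpath makes both visits compare equal even through a/../a or symlinks, so the second arrival at a.yaml raises rather than recursing forever.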
data/lib/sitediff/config/creator.rb
CHANGED
@@ -1,3 +1,5 @@
+# frozen_string_literal: true
+
 require 'sitediff/cache'
 require 'sitediff/config'
 require 'sitediff/crawler'
@@ -7,116 +9,119 @@ require 'typhoeus'
 require 'yaml'
 
 class SiteDiff
-  class Config
-    class Creator
-
-
-
-
+  class Config
+    class Creator
+      def initialize(concurrency, curl_opts, *urls)
+        @concurrency = concurrency
+        @after = urls.pop
+        @before = urls.pop # May be nil
+        @curl_opts = curl_opts
+      end
 
-
-
-
-
-
-
-
+      def roots
+        @roots = begin
+          r = { after: @after }
+          r[:before] = @before if @before
+          r
+        end
+      end
 
-
-
-
-
+      # Build a config structure, return it
+      def create(opts, &block)
+        @config = {}
+        @callback = block
 
-
-
-
-
+        # Handle options
+        @dir = Pathname.new(opts[:directory])
+        @depth = opts[:depth]
+        @rules = Rules.new(@config, opts[:rules_disabled]) if opts[:rules]
 
-
-
+        # Create the dir. Must go before cache initialization!
+        @dir.mkpath unless @dir.directory?
 
-
-
-
-
-        @cache.write_tags << :before << :after
+        # Setup instance vars
+        @paths = Hash.new { |h, k| h[k] = Set.new }
+        @cache = Cache.new(dir: @dir.to_s, create: true)
+        @cache.write_tags << :before << :after
 
-
-
-
+        build_config
+        write_config
+      end
 
-
-
-
-        @config[tag] = {'url' => u}
-      end
+      def build_config
+        %w[before after].each do |tag|
+          next unless (u = roots[tag.to_sym])
 
-
-
-        @rules.add_config if @rules
+          @config[tag] = { 'url' => u }
+        end
 
-
-
+        crawl(@depth)
+        @rules&.add_config
 
-
-        hydra = Typhoeus::Hydra.new(max_concurrency: 10)
-        roots.each do |tag, u|
-          Crawler.new(hydra, u, depth) do |info|
-            crawled_path(tag, info)
+        @config['paths'] = @paths.values.reduce(&:|).to_a.sort
       end
-          end
-        hydra.run
-      end
 
-
-
-
-
-
-
+      def crawl(depth = nil)
+        hydra = Typhoeus::Hydra.new(max_concurrency: @concurrency)
+        roots.each do |tag, u|
+          Crawler.new(hydra, u, depth, @curl_opts) do |info|
+            crawled_path(tag, info)
+          end
+        end
+        hydra.run
+      end
 
-
-
+      # Deduplicate paths with slashes at the end
+      def canonicalize(_tag, path)
+        def altered_paths(path)
+          yield path + '/'
+          yield path.sub(%r{/$}, '')
+        end
 
-
-
-        return if dup
+        path.empty? ? '/' : path
+      end
 
-
+      def crawled_path(tag, info)
+        path, dup = canonicalize(tag, info.relative)
+        return if dup
 
-
-        @paths[tag] << path
-        @cache.set(tag, path, res)
+        res = info.read_result
 
-
-
+        @callback[tag, info]
+        @paths[tag] << path
+        @cache.set(tag, path, res)
 
-
-
+        # If single-site, cache after as before!
+        @cache.set(:before, path, res) unless roots[:before]
 
-
-
-
-
-
-        dir.+('.gitignore').open('w') do |f|
-          f.puts <<-EOF.gsub(/^\s+/, '')
-          output
-          cache.db
-          cache.db.db
-          EOF
-        end
-      end
+        # This is used to populate the list of rules we guess are
+        # applicable to the current site.
+        @rules.handle_page(tag, res.content, info.document) if @rules && !res.error
+      end
 
-
-
-
+      # Create a gitignore if we seem to be in git
+      def make_gitignore(dir)
+        # Check if we're in git
+        return unless dir.realpath.to_enum(:ascend).any? { |d| d.+('.git').exist? }
+
+        dir.+('.gitignore').open('w') do |f|
+          f.puts <<-GITIGNORE.gsub(/^\s+/, '')
+          output
+          cache.db
+          cache.db.db
+          GITIGNORE
+        end
+      end
+
+      def config_file
+        @dir + Config::DEFAULT_FILENAME
+      end
 
-
-
-
-
+      # Turn a config structure into a config file
+      def write_config
+        make_gitignore(@dir)
+        config_file.open('w') { |f| f.puts @config.to_yaml }
+      end
+    end
   end
 end
-end
-end
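Putting the new Creator signature together, a usage sketch based only on the signatures visible in this diff (URLs and option values are placeholders; create() actually crawls both sites via Typhoeus, so nothing useful happens against unreachable hosts):

require 'sitediff/config/creator'

creator = SiteDiff::Config::Creator.new(
  3,                         # concurrency, forwarded to Typhoeus::Hydra
  { followlocation: true },  # curl_opts, forwarded to each UriWrapper
  'http://old.example.test', # before (optional; omit for single-site)
  'http://new.example.test'  # after
)

creator.create(
  directory: 'sitediff-config', # where sitediff.yaml and the cache land
  depth: 2,                     # crawl depth
  rules: false                  # skip rule guessing
) do |tag, info|
  puts "[#{tag}] #{info.relative}" # invoked per crawled page via @callback
end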
data/lib/sitediff/crawler.rb
CHANGED
@@ -1,3 +1,5 @@
+# frozen_string_literal: true
+
 require 'sitediff'
 require 'sitediff/uriwrapper'
 require 'addressable/uri'
@@ -6,90 +8,99 @@ require 'ostruct'
 require 'set'
 
 class SiteDiff
-  class Crawler
-
-
-
+  class Crawler
+    class Info < OpenStruct; end
+
+    DEFAULT_DEPTH = 3
+
+    # Create a crawler with a base URL
+    def initialize(hydra, base, depth = DEFAULT_DEPTH,
+                   curl_opts = UriWrapper::DEFAULT_CURL_OPTS, &block)
+      @hydra = hydra
+      @base_uri = Addressable::URI.parse(base)
+      @base = base
+      @found = Set.new
+      @callback = block
+      @curl_opts = curl_opts
+
+      add_uri('', depth)
+    end
 
-
-
-
-      @base_uri = Addressable::URI.parse(base)
-      @base = base
-      @found = Set.new
-      @callback = block
+    # Handle a newly found relative URI
+    def add_uri(rel, depth)
+      return if @found.include? rel
 
-
-    end
+      @found << rel
 
-
-
-
-
-
-      wrapper = UriWrapper.new(@base + rel)
-      wrapper.queue(@hydra) do |res|
-        fetched_uri(rel, depth, res)
+      wrapper = UriWrapper.new(@base + rel, @curl_opts)
+      wrapper.queue(@hydra) do |res|
+        fetched_uri(rel, depth, res)
+      end
     end
-    end
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    # Handle the fetch of a URI
+    def fetched_uri(rel, depth, res)
+      if res.error
+        SiteDiff.log(res.error, :error)
+        return
+      elsif !res.content
+        SiteDiff.log('Response is missing content. Treating as an error.', :error)
+        return
+      end
+
+      base = Addressable::URI.parse(@base + rel)
+      doc = Nokogiri::HTML(res.content)
+
+      # Call the callback
+      info = Info.new(
+        relative: rel,
+        uri: base,
+        read_result: res,
+        document: doc
+      )
+      @callback[info]
+
+      return unless depth >= 1
+
+      # Find links
+      links = find_links(doc)
+      uris = links.map { |l| resolve_link(base, l) }.compact
+      uris = filter_links(uris)
+
+      # Make them relative
+      rels = uris.map { |u| relativize_link(u) }
+
+      # Queue them in turn
+      rels.each do |r|
+        next if @found.include? r
+
+        add_uri(r, depth - 1)
+      end
     end
-    end
 
-
-
-
-      return base + rel
+    # Resolve a potentially-relative link. Return nil on error.
+    def resolve_link(base, rel)
+      base + rel
     rescue Addressable::URI::InvalidURIError
-      SiteDiff.log "skipped invalid URL: '#{rel}'", :warn
-
+      SiteDiff.log "skipped invalid URL: '#{rel}' (at #{base})", :warn
+      nil
     end
-    end
 
-
-
-
-
+    # Make a link relative to @base_uri
+    def relativize_link(uri)
+      uri.path.slice(@base_uri.path.length, uri.path.length)
+    end
 
-
-
-
-
+    # Return a list of string links found on a page.
+    def find_links(doc)
+      doc.xpath('//a[@href]').map { |e| e['href'] }
+    end
 
-
-
-
-
+    # Filter out links we don't want. Links passed in are absolute URIs.
+    def filter_links(uris)
+      uris.find_all do |u|
+        u.host == @base_uri.host && u.path.start_with?(@base_uri.path)
+      end
     end
   end
 end
-end
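For reference, a usage sketch of the reworked Crawler, again based only on the signatures in this diff (the host is a placeholder; curl_opts is any hash UriWrapper forwards to Typhoeus):

require 'typhoeus'
require 'sitediff/crawler'

hydra = Typhoeus::Hydra.new(max_concurrency: 3)

SiteDiff::Crawler.new(hydra, 'http://example.test', 1,
                      followlocation: true) do |info|
  # info is a Crawler::Info (OpenStruct): relative, uri, read_result, document
  puts info.relative
end

hydra.run # fetches '/' plus same-host links one level deep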