sitediff 0.0.3 → 0.0.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/bin/sitediff +2 -3
- data/lib/sitediff.rb +35 -24
- data/lib/sitediff/cache.rb +53 -47
- data/lib/sitediff/cli.rb +127 -114
- data/lib/sitediff/config.rb +35 -59
- data/lib/sitediff/config/creator.rb +95 -90
- data/lib/sitediff/crawler.rb +83 -72
- data/lib/sitediff/diff.rb +7 -5
- data/lib/sitediff/exception.rb +3 -1
- data/lib/sitediff/fetch.rb +47 -41
- data/lib/sitediff/files/html_report.html.erb +3 -0
- data/lib/sitediff/files/rules/drupal.yaml +36 -6
- data/lib/sitediff/result.rb +13 -11
- data/lib/sitediff/rules.rb +47 -47
- data/lib/sitediff/sanitize.rb +145 -150
- data/lib/sitediff/sanitize/dom_transform.rb +73 -74
- data/lib/sitediff/sanitize/regexp.rb +55 -52
- data/lib/sitediff/uriwrapper.rb +37 -26
- data/lib/sitediff/webserver.rb +80 -77
- data/lib/sitediff/webserver/resultserver.rb +117 -76
- metadata +32 -44
data/lib/sitediff/config.rb
CHANGED
@@ -1,3 +1,5 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
require 'sitediff/exception'
|
2
4
|
require 'sitediff/sanitize'
|
3
5
|
require 'pathname'
|
@@ -9,7 +11,7 @@ class SiteDiff
|
|
9
11
|
|
10
12
|
# keys allowed in configuration files
|
11
13
|
CONF_KEYS = Sanitizer::TOOLS.values.flatten(1) +
|
12
|
-
%w[paths before after before_url after_url includes]
|
14
|
+
%w[paths before after before_url after_url includes curl_opts]
|
13
15
|
|
14
16
|
class InvalidConfig < SiteDiffException; end
|
15
17
|
class ConfigNotFound < SiteDiffException; end
|
@@ -41,13 +43,14 @@ class SiteDiff
|
|
41
43
|
conf[pos][key] ||= []
|
42
44
|
conf[pos][key] += conf[key] if conf[key]
|
43
45
|
end
|
44
|
-
tools[:scalar].each {|key| conf[pos][key] ||= conf[key]}
|
46
|
+
tools[:scalar].each { |key| conf[pos][key] ||= conf[key] }
|
45
47
|
conf[pos]['url'] ||= conf[pos + '_url']
|
48
|
+
conf[pos]['curl_opts'] = conf['curl_opts']
|
46
49
|
end
|
47
50
|
# normalize paths
|
48
|
-
conf['paths'] = Config
|
51
|
+
conf['paths'] = Config.normalize_paths(conf['paths'])
|
49
52
|
|
50
|
-
conf.select {|k,
|
53
|
+
conf.select { |k, _v| %w[before after paths curl_opts].include? k }
|
51
54
|
end
|
52
55
|
|
53
56
|
# Merges two normalized Hashes according to the following rules:
|
@@ -72,56 +75,33 @@ class SiteDiff
|
|
72
75
|
next
|
73
76
|
end
|
74
77
|
result[pos] = first[pos].merge!(second[pos]) do |key, a, b|
|
75
|
-
if Sanitizer::TOOLS[:array].include? key # rule 2a
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
78
|
+
result[pos][key] = if Sanitizer::TOOLS[:array].include? key # rule 2a
|
79
|
+
(a || []) + (b || [])
|
80
|
+
else
|
81
|
+
a || b # rule 2b
|
82
|
+
end
|
80
83
|
end
|
81
84
|
end
|
82
85
|
result
|
83
86
|
end
|
84
87
|
|
85
|
-
|
86
|
-
|
87
|
-
def self.search
|
88
|
-
subdirs = %w[. sitediff]
|
89
|
-
root_indicators = %w[.git .svn]
|
90
|
-
|
91
|
-
Pathname.pwd.ascend do |dir|
|
92
|
-
subdirs.each do |subdir|
|
93
|
-
d = dir + subdir + DEFAULT_FILENAME
|
94
|
-
if d.exist?
|
95
|
-
Dir.chdir(dir.+(subdir).to_s)
|
96
|
-
return [DEFAULT_FILENAME]
|
97
|
-
end
|
98
|
-
end
|
99
|
-
|
100
|
-
root_indicators.each { |r| return [] if dir.+(r).exist? }
|
101
|
-
end
|
102
|
-
|
103
|
-
return []
|
104
|
-
end
|
105
|
-
|
106
|
-
def initialize(files, opts = {})
|
107
|
-
@config = {'paths' => [], 'before' => {}, 'after' => {} }
|
108
|
-
|
109
|
-
files = Config.search if files.empty? && opts[:search]
|
110
|
-
files = [DEFAULT_FILENAME] if files.empty? &&
|
111
|
-
File.exists?(DEFAULT_FILENAME)
|
112
|
-
raise ConfigNotFound, "No configuration file found." if files.empty?
|
88
|
+
def initialize(files, dir)
|
89
|
+
@config = { 'paths' => [], 'before' => {}, 'after' => {} }
|
113
90
|
|
91
|
+
files = [File.join(dir, DEFAULT_FILENAME)] if files.empty?
|
114
92
|
files.each do |file|
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
|
93
|
+
unless File.exist?(file)
|
94
|
+
raise InvalidConfig,
|
95
|
+
format('Missing config file %s.', File.expand_path(file))
|
96
|
+
end
|
97
|
+
@config = Config.merge(@config, Config.load_conf(file))
|
119
98
|
end
|
120
99
|
end
|
121
100
|
|
122
101
|
def before
|
123
102
|
@config['before']
|
124
103
|
end
|
104
|
+
|
125
105
|
def after
|
126
106
|
@config['after']
|
127
107
|
end
|
@@ -129,61 +109,57 @@ class SiteDiff
|
|
129
109
|
def paths
|
130
110
|
@config['paths']
|
131
111
|
end
|
112
|
+
|
132
113
|
def paths=(paths)
|
133
|
-
@config['paths'] = Config
|
114
|
+
@config['paths'] = Config.normalize_paths(paths)
|
134
115
|
end
|
135
116
|
|
136
117
|
# Checks if the configuration is usable for diff-ing.
|
137
118
|
def validate(opts = {})
|
138
|
-
opts = { :
|
119
|
+
opts = { need_before: true }.merge(opts)
|
139
120
|
|
140
121
|
raise InvalidConfig, "Undefined 'before' base URL." if \
|
141
122
|
opts[:need_before] && !before['url']
|
142
123
|
raise InvalidConfig, "Undefined 'after' base URL." unless after['url']
|
143
|
-
raise InvalidConfig, "Undefined 'paths'." unless
|
124
|
+
raise InvalidConfig, "Undefined 'paths'." unless paths && !paths.empty?
|
144
125
|
end
|
145
126
|
|
146
127
|
private
|
147
128
|
|
148
129
|
def self.normalize_paths(paths)
|
149
130
|
paths ||= []
|
150
|
-
|
131
|
+
paths.map { |p| (p[0] == '/' ? p : "/#{p}").chomp }
|
151
132
|
end
|
152
133
|
|
153
134
|
# reads a YAML file and raises an InvalidConfig if the file is not valid.
|
154
135
|
def self.load_raw_yaml(file)
|
155
|
-
SiteDiff
|
136
|
+
SiteDiff.log "Reading config file: #{Pathname.new(file).expand_path}"
|
156
137
|
conf = YAML.load_file(file) || {}
|
157
|
-
unless conf.is_a? Hash
|
158
|
-
|
159
|
-
|
160
|
-
|
161
|
-
unless CONF_KEYS.include? k
|
162
|
-
raise InvalidConfig, "Unknown configuration key (#{file}): '#{k}'"
|
163
|
-
end
|
138
|
+
raise InvalidConfig, "Invalid configuration file: '#{file}'" unless conf.is_a? Hash
|
139
|
+
|
140
|
+
conf.each_key do |k, _v|
|
141
|
+
raise InvalidConfig, "Unknown configuration key (#{file}): '#{k}'" unless CONF_KEYS.include? k
|
164
142
|
end
|
165
143
|
conf
|
166
144
|
end
|
167
145
|
|
168
146
|
# loads a single YAML configuration file, merges all its 'included' files
|
169
147
|
# and returns a normalized Hash.
|
170
|
-
def self.load_conf(file, visited=[])
|
148
|
+
def self.load_conf(file, visited = [])
|
171
149
|
# don't get fooled by a/../a/ or symlinks
|
172
150
|
file = File.realpath(file)
|
173
|
-
if visited.include? file
|
174
|
-
raise InvalidConfig, "Circular dependency: #{file}"
|
175
|
-
end
|
151
|
+
raise InvalidConfig, "Circular dependency: #{file}" if visited.include? file
|
176
152
|
|
177
153
|
conf = load_raw_yaml(file) # not normalized yet
|
178
154
|
visited << file
|
179
155
|
|
180
156
|
# normalize and merge includes
|
181
157
|
includes = conf['includes'] || []
|
182
|
-
conf = Config
|
158
|
+
conf = Config.normalize(conf)
|
183
159
|
includes.each do |dep|
|
184
160
|
# include paths are relative to the including file.
|
185
161
|
dep = File.join(File.dirname(file), dep)
|
186
|
-
conf = Config
|
162
|
+
conf = Config.merge(conf, load_conf(dep, visited))
|
187
163
|
end
|
188
164
|
conf
|
189
165
|
end
|
@@ -1,3 +1,5 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
require 'sitediff/cache'
|
2
4
|
require 'sitediff/config'
|
3
5
|
require 'sitediff/crawler'
|
@@ -7,116 +9,119 @@ require 'typhoeus'
|
|
7
9
|
require 'yaml'
|
8
10
|
|
9
11
|
class SiteDiff
|
10
|
-
class Config
|
11
|
-
class Creator
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
12
|
+
class Config
|
13
|
+
class Creator
|
14
|
+
def initialize(concurrency, curl_opts, *urls)
|
15
|
+
@concurrency = concurrency
|
16
|
+
@after = urls.pop
|
17
|
+
@before = urls.pop # May be nil
|
18
|
+
@curl_opts = curl_opts
|
19
|
+
end
|
16
20
|
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
21
|
+
def roots
|
22
|
+
@roots = begin
|
23
|
+
r = { after: @after }
|
24
|
+
r[:before] = @before if @before
|
25
|
+
r
|
26
|
+
end
|
27
|
+
end
|
24
28
|
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
+
# Build a config structure, return it
|
30
|
+
def create(opts, &block)
|
31
|
+
@config = {}
|
32
|
+
@callback = block
|
29
33
|
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
+
# Handle options
|
35
|
+
@dir = Pathname.new(opts[:directory])
|
36
|
+
@depth = opts[:depth]
|
37
|
+
@rules = Rules.new(@config, opts[:rules_disabled]) if opts[:rules]
|
34
38
|
|
35
|
-
|
36
|
-
|
39
|
+
# Create the dir. Must go before cache initialization!
|
40
|
+
@dir.mkpath unless @dir.directory?
|
37
41
|
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
@cache.write_tags << :before << :after
|
42
|
+
# Setup instance vars
|
43
|
+
@paths = Hash.new { |h, k| h[k] = Set.new }
|
44
|
+
@cache = Cache.new(dir: @dir.to_s, create: true)
|
45
|
+
@cache.write_tags << :before << :after
|
43
46
|
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
+
build_config
|
48
|
+
write_config
|
49
|
+
end
|
47
50
|
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
@config[tag] = {'url' => u}
|
52
|
-
end
|
51
|
+
def build_config
|
52
|
+
%w[before after].each do |tag|
|
53
|
+
next unless (u = roots[tag.to_sym])
|
53
54
|
|
54
|
-
|
55
|
-
|
56
|
-
@rules.add_config if @rules
|
55
|
+
@config[tag] = { 'url' => u }
|
56
|
+
end
|
57
57
|
|
58
|
-
|
59
|
-
|
58
|
+
crawl(@depth)
|
59
|
+
@rules&.add_config
|
60
60
|
|
61
|
-
|
62
|
-
hydra = Typhoeus::Hydra.new(max_concurrency: 10)
|
63
|
-
roots.each do |tag, u|
|
64
|
-
Crawler.new(hydra, u, depth) do |info|
|
65
|
-
crawled_path(tag, info)
|
61
|
+
@config['paths'] = @paths.values.reduce(&:|).to_a.sort
|
66
62
|
end
|
67
|
-
end
|
68
|
-
hydra.run
|
69
|
-
end
|
70
63
|
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
64
|
+
def crawl(depth = nil)
|
65
|
+
hydra = Typhoeus::Hydra.new(max_concurrency: @concurrency)
|
66
|
+
roots.each do |tag, u|
|
67
|
+
Crawler.new(hydra, u, depth, @curl_opts) do |info|
|
68
|
+
crawled_path(tag, info)
|
69
|
+
end
|
70
|
+
end
|
71
|
+
hydra.run
|
72
|
+
end
|
77
73
|
|
78
|
-
|
79
|
-
|
74
|
+
# Deduplicate paths with slashes at the end
|
75
|
+
def canonicalize(_tag, path)
|
76
|
+
def altered_paths(path)
|
77
|
+
yield path + '/'
|
78
|
+
yield path.sub(%r{/$}, '')
|
79
|
+
end
|
80
80
|
|
81
|
-
|
82
|
-
|
83
|
-
return if dup
|
81
|
+
path.empty? ? '/' : path
|
82
|
+
end
|
84
83
|
|
85
|
-
|
84
|
+
def crawled_path(tag, info)
|
85
|
+
path, dup = canonicalize(tag, info.relative)
|
86
|
+
return if dup
|
86
87
|
|
87
|
-
|
88
|
-
@paths[tag] << path
|
89
|
-
@cache.set(tag, path, res)
|
88
|
+
res = info.read_result
|
90
89
|
|
91
|
-
|
92
|
-
|
90
|
+
@callback[tag, info]
|
91
|
+
@paths[tag] << path
|
92
|
+
@cache.set(tag, path, res)
|
93
93
|
|
94
|
-
|
95
|
-
|
94
|
+
# If single-site, cache after as before!
|
95
|
+
@cache.set(:before, path, res) unless roots[:before]
|
96
96
|
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
dir.+('.gitignore').open('w') do |f|
|
103
|
-
f.puts <<-EOF.gsub(/^\s+/, '')
|
104
|
-
output
|
105
|
-
cache.db
|
106
|
-
cache.db.db
|
107
|
-
EOF
|
108
|
-
end
|
109
|
-
end
|
97
|
+
# This is used to populate the list of rules we guess are
|
98
|
+
# applicable to the current site.
|
99
|
+
@rules.handle_page(tag, res.content, info.document) if @rules && !res.error
|
100
|
+
end
|
110
101
|
|
111
|
-
|
112
|
-
|
113
|
-
|
102
|
+
# Create a gitignore if we seem to be in git
|
103
|
+
def make_gitignore(dir)
|
104
|
+
# Check if we're in git
|
105
|
+
return unless dir.realpath.to_enum(:ascend).any? { |d| d.+('.git').exist? }
|
106
|
+
|
107
|
+
dir.+('.gitignore').open('w') do |f|
|
108
|
+
f.puts <<-GITIGNORE.gsub(/^\s+/, '')
|
109
|
+
output
|
110
|
+
cache.db
|
111
|
+
cache.db.db
|
112
|
+
GITIGNORE
|
113
|
+
end
|
114
|
+
end
|
115
|
+
|
116
|
+
def config_file
|
117
|
+
@dir + Config::DEFAULT_FILENAME
|
118
|
+
end
|
114
119
|
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
|
120
|
+
# Turn a config structure into a config file
|
121
|
+
def write_config
|
122
|
+
make_gitignore(@dir)
|
123
|
+
config_file.open('w') { |f| f.puts @config.to_yaml }
|
124
|
+
end
|
125
|
+
end
|
119
126
|
end
|
120
127
|
end
|
121
|
-
end
|
122
|
-
end
|
data/lib/sitediff/crawler.rb
CHANGED
@@ -1,3 +1,5 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
require 'sitediff'
|
2
4
|
require 'sitediff/uriwrapper'
|
3
5
|
require 'addressable/uri'
|
@@ -6,90 +8,99 @@ require 'ostruct'
|
|
6
8
|
require 'set'
|
7
9
|
|
8
10
|
class SiteDiff
|
9
|
-
class Crawler
|
10
|
-
|
11
|
-
|
12
|
-
|
11
|
+
class Crawler
|
12
|
+
class Info < OpenStruct; end
|
13
|
+
|
14
|
+
DEFAULT_DEPTH = 3
|
15
|
+
|
16
|
+
# Create a crawler with a base URL
|
17
|
+
def initialize(hydra, base, depth = DEFAULT_DEPTH,
|
18
|
+
curl_opts = UriWrapper::DEFAULT_CURL_OPTS, &block)
|
19
|
+
@hydra = hydra
|
20
|
+
@base_uri = Addressable::URI.parse(base)
|
21
|
+
@base = base
|
22
|
+
@found = Set.new
|
23
|
+
@callback = block
|
24
|
+
@curl_opts = curl_opts
|
25
|
+
|
26
|
+
add_uri('', depth)
|
27
|
+
end
|
13
28
|
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
@base_uri = Addressable::URI.parse(base)
|
18
|
-
@base = base
|
19
|
-
@found = Set.new
|
20
|
-
@callback = block
|
29
|
+
# Handle a newly found relative URI
|
30
|
+
def add_uri(rel, depth)
|
31
|
+
return if @found.include? rel
|
21
32
|
|
22
|
-
|
23
|
-
end
|
33
|
+
@found << rel
|
24
34
|
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
wrapper = UriWrapper.new(@base + rel)
|
31
|
-
wrapper.queue(@hydra) do |res|
|
32
|
-
fetched_uri(rel, depth, res)
|
35
|
+
wrapper = UriWrapper.new(@base + rel, @curl_opts)
|
36
|
+
wrapper.queue(@hydra) do |res|
|
37
|
+
fetched_uri(rel, depth, res)
|
38
|
+
end
|
33
39
|
end
|
34
|
-
end
|
35
40
|
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
41
|
+
# Handle the fetch of a URI
|
42
|
+
def fetched_uri(rel, depth, res)
|
43
|
+
if res.error
|
44
|
+
SiteDiff.log(res.error, :error)
|
45
|
+
return
|
46
|
+
elsif !res.content
|
47
|
+
SiteDiff.log('Response is missing content. Treating as an error.', :error)
|
48
|
+
return
|
49
|
+
end
|
50
|
+
|
51
|
+
base = Addressable::URI.parse(@base + rel)
|
52
|
+
doc = Nokogiri::HTML(res.content)
|
53
|
+
|
54
|
+
# Call the callback
|
55
|
+
info = Info.new(
|
56
|
+
relative: rel,
|
57
|
+
uri: base,
|
58
|
+
read_result: res,
|
59
|
+
document: doc
|
60
|
+
)
|
61
|
+
@callback[info]
|
62
|
+
|
63
|
+
return unless depth >= 1
|
64
|
+
|
65
|
+
# Find links
|
66
|
+
links = find_links(doc)
|
67
|
+
uris = links.map { |l| resolve_link(base, l) }.compact
|
68
|
+
uris = filter_links(uris)
|
69
|
+
|
70
|
+
# Make them relative
|
71
|
+
rels = uris.map { |u| relativize_link(u) }
|
72
|
+
|
73
|
+
# Queue them in turn
|
74
|
+
rels.each do |r|
|
75
|
+
next if @found.include? r
|
76
|
+
|
77
|
+
add_uri(r, depth - 1)
|
78
|
+
end
|
65
79
|
end
|
66
|
-
end
|
67
80
|
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
return base + rel
|
81
|
+
# Resolve a potentially-relative link. Return nil on error.
|
82
|
+
def resolve_link(base, rel)
|
83
|
+
base + rel
|
72
84
|
rescue Addressable::URI::InvalidURIError
|
73
|
-
SiteDiff.log "skipped invalid URL: '#{rel}'", :warn
|
74
|
-
|
85
|
+
SiteDiff.log "skipped invalid URL: '#{rel}' (at #{base})", :warn
|
86
|
+
nil
|
75
87
|
end
|
76
|
-
end
|
77
88
|
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
89
|
+
# Make a link relative to @base_uri
|
90
|
+
def relativize_link(uri)
|
91
|
+
uri.path.slice(@base_uri.path.length, uri.path.length)
|
92
|
+
end
|
82
93
|
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
|
94
|
+
# Return a list of string links found on a page.
|
95
|
+
def find_links(doc)
|
96
|
+
doc.xpath('//a[@href]').map { |e| e['href'] }
|
97
|
+
end
|
87
98
|
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
|
99
|
+
# Filter out links we don't want. Links passed in are absolute URIs.
|
100
|
+
def filter_links(uris)
|
101
|
+
uris.find_all do |u|
|
102
|
+
u.host == @base_uri.host && u.path.start_with?(@base_uri.path)
|
103
|
+
end
|
92
104
|
end
|
93
105
|
end
|
94
106
|
end
|
95
|
-
end
|