sitediff 0.0.1 → 0.0.2
Sign up to get free protection for your applications and to get access to all the features.
- data/bin/sitediff +1 -1
- data/lib/sitediff.rb +79 -63
- data/lib/sitediff/cache.rb +61 -0
- data/lib/sitediff/cli.rb +144 -23
- data/lib/sitediff/config.rb +46 -9
- data/lib/sitediff/config/creator.rb +122 -0
- data/lib/sitediff/crawler.rb +95 -0
- data/lib/sitediff/diff.rb +2 -1
- data/lib/sitediff/exception.rb +3 -0
- data/lib/sitediff/fetch.rb +55 -0
- data/lib/sitediff/files/html_report.html.erb +20 -4
- data/lib/sitediff/files/rules/drupal.yaml +33 -0
- data/lib/sitediff/files/sidebyside.html.erb +13 -0
- data/lib/sitediff/files/sitediff.css +11 -0
- data/lib/sitediff/result.rb +12 -9
- data/lib/sitediff/rules.rb +65 -0
- data/lib/sitediff/sanitize.rb +163 -168
- data/lib/sitediff/sanitize/dom_transform.rb +92 -0
- data/lib/sitediff/sanitize/regexp.rb +56 -0
- data/lib/sitediff/uriwrapper.rb +19 -7
- data/lib/sitediff/webserver.rb +82 -0
- data/lib/sitediff/webserver/resultserver.rb +98 -0
- metadata +70 -25
- checksums.yaml +0 -7
- data/lib/sitediff/util/cache.rb +0 -32
- data/lib/sitediff/util/webserver.rb +0 -77
data/lib/sitediff/config.rb
CHANGED
@@ -1,13 +1,18 @@
|
|
1
|
+
require 'sitediff/exception'
|
2
|
+
require 'sitediff/sanitize'
|
3
|
+
require 'pathname'
|
1
4
|
require 'yaml'
|
2
5
|
|
3
6
|
class SiteDiff
|
4
7
|
class Config
|
8
|
+
DEFAULT_FILENAME = 'sitediff.yaml'
|
5
9
|
|
6
10
|
# keys allowed in configuration files
|
7
|
-
CONF_KEYS =
|
11
|
+
CONF_KEYS = Sanitizer::TOOLS.values.flatten(1) +
|
8
12
|
%w[paths before after before_url after_url includes]
|
9
13
|
|
10
|
-
class InvalidConfig <
|
14
|
+
class InvalidConfig < SiteDiffException; end
|
15
|
+
class ConfigNotFound < SiteDiffException; end
|
11
16
|
|
12
17
|
# Takes a Hash and normalizes it to the following form by merging globals
|
13
18
|
# into before and after. A normalized config Hash looks like this:
|
@@ -27,7 +32,7 @@ class SiteDiff
|
|
27
32
|
# selector: body
|
28
33
|
#
|
29
34
|
def self.normalize(conf)
|
30
|
-
tools =
|
35
|
+
tools = Sanitizer::TOOLS
|
31
36
|
|
32
37
|
# merge globals
|
33
38
|
%w[before after].each do |pos|
|
@@ -67,7 +72,7 @@ class SiteDiff
|
|
67
72
|
next
|
68
73
|
end
|
69
74
|
result[pos] = first[pos].merge!(second[pos]) do |key, a, b|
|
70
|
-
if
|
75
|
+
if Sanitizer::TOOLS[:array].include? key # rule 2a
|
71
76
|
result[pos][key] = (a || []) + (b|| [])
|
72
77
|
else
|
73
78
|
result[pos][key] = a || b # rule 2b
|
@@ -77,9 +82,39 @@ class SiteDiff
|
|
77
82
|
result
|
78
83
|
end
|
79
84
|
|
80
|
-
|
85
|
+
# Search for a config file, walking upwards from the current working
# directory.
#
# In each directory visited, the directory itself and its `sitediff`
# subdirectory are checked for DEFAULT_FILENAME. On the first hit the process
# changes directory (Dir.chdir) to the folder containing the file and the
# file name is returned relative to that new working directory.
#
# The walk stops, without a result, at the first directory that contains a
# `.git` or `.svn` entry, or at the filesystem root.
#
# Returns an Array: [DEFAULT_FILENAME] when a file was found, [] otherwise.
def self.search
  subdirs = %w[. sitediff]
  root_indicators = %w[.git .svn]

  Pathname.pwd.ascend do |dir|
    subdirs.each do |subdir|
      candidate_dir = dir + subdir
      next unless (candidate_dir + DEFAULT_FILENAME).exist?

      # Side effect: later relative paths resolve next to the config file.
      Dir.chdir(candidate_dir.to_s)
      return [DEFAULT_FILENAME]
    end

    # Do not search above what looks like a version-control root.
    return [] if root_indicators.any? { |indicator| (dir + indicator).exist? }
  end

  []
end
|
105
|
+
|
106
|
+
# Build a configuration by loading and merging the given files in order.
#
# files - Array of config file paths. When empty, a file is looked up:
#         first via Config.search (only if opts[:search] is truthy), then
#         DEFAULT_FILENAME in the current directory.
# opts  - Hash of options; only :search is read here.
#
# Raises ConfigNotFound when no file was given and none could be located.
# Raises InvalidConfig when a listed file does not exist.
def initialize(files, opts = {})
  @config = { 'paths' => [], 'before' => {}, 'after' => {} }

  files = Config.search if files.empty? && opts[:search]
  # File.exist? rather than the deprecated File.exists?, which was removed
  # in Ruby 3.2.
  files = [DEFAULT_FILENAME] if files.empty? && File.exist?(DEFAULT_FILENAME)
  raise ConfigNotFound, "No configuration file found." if files.empty?

  files.each do |file|
    unless File.exist?(file)
      raise InvalidConfig,
            "Missing config file %s." % File.expand_path(file)
    end
    @config = Config::merge(@config, Config::load_conf(file))
  end
end
|
@@ -99,8 +134,11 @@ class SiteDiff
|
|
99
134
|
end
|
100
135
|
|
101
136
|
# Checks if the configuration is usable for diff-ing.
|
102
|
-
def validate
|
103
|
-
|
137
|
+
# opts - Hash of options:
#        :need_before - when truthy (the default) a 'before' base URL is
#                       required as well.
#
# Raises InvalidConfig describing the first missing piece: the 'before' URL
# (if needed), the 'after' URL, or a non-empty list of paths.
# Returns nil when the configuration is usable.
def validate(opts = {})
  opts = { need_before: true }.merge(opts)

  if opts[:need_before] && !before['url']
    raise InvalidConfig, "Undefined 'before' base URL."
  end
  raise InvalidConfig, "Undefined 'after' base URL." unless after['url']
  raise InvalidConfig, "Undefined 'paths'." if !paths || paths.empty?
end
|
@@ -114,7 +152,7 @@ class SiteDiff
|
|
114
152
|
|
115
153
|
# reads a YAML file and raises an InvalidConfig if the file is not valid.
|
116
154
|
def self.load_raw_yaml(file)
|
117
|
-
SiteDiff::log "Reading config file: #{file}"
|
155
|
+
SiteDiff::log "Reading config file: #{Pathname.new(file).expand_path}"
|
118
156
|
conf = YAML.load_file(file) || {}
|
119
157
|
unless conf.is_a? Hash
|
120
158
|
raise InvalidConfig, "Invalid configuration file: '#{file}'"
|
@@ -149,6 +187,5 @@ class SiteDiff
|
|
149
187
|
end
|
150
188
|
conf
|
151
189
|
end
|
152
|
-
|
153
190
|
end
|
154
191
|
end
|
@@ -0,0 +1,122 @@
|
|
1
|
+
require 'sitediff/cache'
|
2
|
+
require 'sitediff/config'
|
3
|
+
require 'sitediff/crawler'
|
4
|
+
require 'sitediff/rules'
|
5
|
+
require 'pathname'
|
6
|
+
require 'typhoeus'
|
7
|
+
require 'yaml'
|
8
|
+
|
9
|
+
class SiteDiff
  class Config
    # Builds a sitediff configuration directory by crawling one or two sites:
    # it records the paths found, stores every fetched page in a cache and
    # writes the resulting config file (plus a .gitignore when inside git).
    class Creator
      # urls  - One or two base URLs. The last one is the 'after' site, the
      #         one preceding it (if any) is the 'before' site.
      # block - Optional progress callback, used by #create when that call is
      #         not given a block of its own.
      def initialize(*urls, &block)
        @after = urls.pop
        @before = urls.pop # May be nil
        @callback = block
      end

      # Hash of tag => base URL for the sites to crawl. Always has :after;
      # has :before only when a 'before' URL was given. Memoized.
      def roots
        @roots ||= begin
          found = { after: @after }
          found[:before] = @before if @before
          found
        end
      end

      # Build a config structure, write it to disk and return it.
      #
      # opts  - Hash with :directory (target dir), :depth (crawl depth),
      #         :rules (enable rule detection) and :rules_disabled (passed
      #         through to Rules.new).
      # block - Called with (tag, info) for every page recorded.
      #
      # Returns the config Hash that was written.
      def create(opts, &block)
        @config = {}
        @callback = block || @callback

        # Handle options
        @dir = Pathname.new(opts[:directory])
        @depth = opts[:depth]
        @rules = Rules.new(@config, opts[:rules_disabled]) if opts[:rules]

        # Create the dir. Must go before cache initialization!
        @dir.mkpath unless @dir.directory?

        # Setup instance vars
        @paths = Hash.new { |hash, tag| hash[tag] = Set.new }
        @cache = Cache.new(file: (@dir + Cache::DEFAULT_FILENAME).to_s,
                           create: true)
        @cache.write_tags << :before << :after

        build_config
        write_config
        @config
      end

      # Fill @config with the base URLs, crawl the sites, let the rules (if
      # enabled) add their settings, and store the sorted union of all paths
      # found.
      def build_config
        %w[before after].each do |tag|
          url = roots[tag.to_sym]
          @config[tag] = { 'url' => url } if url
        end

        crawl(@depth)
        @cache.close
        @rules.add_config if @rules

        @config['paths'] = @paths.values.reduce(Set.new, :|).to_a.sort
      end

      # Crawl every root concurrently; each page found is handed to
      # #crawled_path together with the tag of the root it came from.
      def crawl(depth = nil)
        hydra = Typhoeus::Hydra.new(max_concurrency: 10)
        roots.each do |tag, url|
          Crawler.new(hydra, url, depth) do |info|
            crawled_path(tag, info)
          end
        end
        hydra.run
      end

      # Deduplicate paths with slashes at the end.
      #
      # Returns [path, dup]: path is the input with '' mapped to '/', and dup
      # is true when the same path, with or without a trailing slash, has
      # already been recorded for this tag.
      def canonicalize(tag, path)
        path = '/' if path.empty?

        bare = path.sub(%r{/$}, '')
        seen = @paths[tag]
        dup = [bare, bare + '/'].any? { |variant| seen.include?(variant) }

        [path, dup]
      end

      # Record one crawled page: notify the callback, remember its path, cache
      # the response and feed it to the rules.
      def crawled_path(tag, info)
        path, dup = canonicalize(tag, info.relative)
        return if dup

        res = info.read_result

        @callback.call(tag, info) if @callback
        @paths[tag] << path
        @cache.set(tag, path, res)

        # If single-site, cache after as before!
        @cache.set(:before, path, res) unless roots[:before]

        @rules.handle_page(tag, res.content, info.document) if @rules && !res.error
      end

      # Create a gitignore if we seem to be in git, i.e. when dir or one of
      # its ancestors contains a .git entry. An existing .gitignore in dir is
      # overwritten.
      def make_gitignore(dir)
        in_git = dir.realpath.to_enum(:ascend).any? { |d| (d + '.git').exist? }
        return unless in_git

        (dir + '.gitignore').open('w') do |f|
          f.puts %w[output cache.db cache.db.db]
        end
      end

      # Pathname of the config file inside the target directory.
      def config_file
        @dir + Config::DEFAULT_FILENAME
      end

      # Turn a config structure into a config file
      def write_config
        make_gitignore(@dir)
        config_file.open('w') { |f| f.puts @config.to_yaml }
      end
    end
  end
end
|
@@ -0,0 +1,95 @@
|
|
1
|
+
require 'sitediff'
|
2
|
+
require 'sitediff/uriwrapper'
|
3
|
+
require 'addressable/uri'
|
4
|
+
require 'nokogiri'
|
5
|
+
require 'ostruct'
|
6
|
+
require 'set'
|
7
|
+
|
8
|
+
class SiteDiff
  # Crawls a site starting from a base URL, following same-host links that
  # live under the base path, and reports every successfully fetched page to
  # a callback.
  class Crawler
    # Value object handed to the callback. Carries :relative, :uri,
    # :read_result and :document.
    class Info < OpenStruct; end

    DEFAULT_DEPTH = 3

    # Create a crawler with a base URL and queue the fetch of that URL.
    #
    # hydra - Request queue the fetches are added to; the caller runs it.
    # base  - Base URL as a String.
    # depth - Number of link levels to follow below the base page. nil means
    #         DEFAULT_DEPTH.
    # block - Called with an Info for every page fetched without error.
    def initialize(hydra, base, depth = DEFAULT_DEPTH, &block)
      @hydra = hydra
      @base_uri = Addressable::URI.parse(base)
      @base = base
      @found = Set.new
      @callback = block

      # Callers may pass an explicit nil; treat it like "not given".
      add_uri('', depth || DEFAULT_DEPTH)
    end

    # Handle a newly found relative URI: queue its fetch unless it has been
    # seen before.
    def add_uri(rel, depth)
      return if @found.include? rel

      @found << rel

      wrapper = UriWrapper.new(@base + rel)
      wrapper.queue(@hydra) do |res|
        fetched_uri(rel, depth, res)
      end
    end

    # Handle the fetch of a URI: report the page, then queue the links found
    # on it with one level less depth.
    def fetched_uri(rel, depth, res)
      return unless res.content # Ignore errors
      return unless depth >= 0

      base = Addressable::URI.parse(@base + rel)
      doc = Nokogiri::HTML(res.content)

      # Call the callback
      info = Info.new(
        relative: rel,
        uri: base,
        read_result: res,
        document: doc
      )
      @callback.call(info) if @callback

      # Pages one level further down would be fetched only to be ignored by
      # the depth check above, so do not queue them at all.
      return unless depth > 0

      # Find links
      uris = find_links(doc).map { |link| resolve_link(base, link) }.compact
      uris = filter_links(uris)

      # Make them relative and queue them in turn
      uris.map { |uri| relativize_link(uri) }.each do |r|
        add_uri(r, depth - 1)
      end
    end

    # Resolve a potentially-relative link. Return nil on error.
    def resolve_link(base, rel)
      base + rel
    rescue Addressable::URI::InvalidURIError
      SiteDiff.log "skipped invalid URL: '#{rel}'", :warn
      nil
    end

    # Make a link relative to @base_uri: the part of the link's path that
    # follows the base path.
    def relativize_link(uri)
      uri.path.slice(@base_uri.path.length, uri.path.length)
    end

    # Return a list of string links found on a page.
    def find_links(doc)
      doc.xpath('//a[@href]').map { |e| e['href'] }
    end

    # Filter out links we don't want. Links passed in are absolute URIs; only
    # those on the base host whose path starts with the base path are kept.
    def filter_links(uris)
      uris.select do |u|
        u.host == @base_uri.host && u.path.start_with?(@base_uri.path)
      end
    end
  end
end
|
data/lib/sitediff/diff.rb
CHANGED
@@ -1,3 +1,4 @@
|
|
1
|
+
require 'sitediff'
|
1
2
|
require 'diffy'
|
2
3
|
require 'erb'
|
3
4
|
require 'rainbow'
|
@@ -19,7 +20,7 @@ class SiteDiff
|
|
19
20
|
to_s(*args)
|
20
21
|
end
|
21
22
|
|
22
|
-
def generate_html_report(results, before, after)
|
23
|
+
def generate_html_report(results, before, after, cache)
|
23
24
|
erb_path = File.join(SiteDiff::FILES_DIR, 'html_report.html.erb')
|
24
25
|
report_html = ERB.new(File.read(erb_path)).result(binding)
|
25
26
|
return report_html
|
@@ -0,0 +1,55 @@
|
|
1
|
+
require 'sitediff/uriwrapper'
|
2
|
+
require 'typhoeus'
|
3
|
+
|
4
|
+
class SiteDiff
  # Fetches a list of paths once per tag, serving them from the cache when
  # possible and reporting each path as soon as all of its tags are done.
  class Fetch
    # Number of parallel requests used when none is given.
    DEFAULT_CONCURRENCY = 3

    # cache       - A cache object, see sitediff/cache.
    # paths       - A list of sub-paths.
    # tags        - A hash of tag names => base URLs. A nil base URL means
    #               the tag can only be served from the cache.
    # concurrency - Maximum number of parallel requests.
    def initialize(cache, paths, tags, concurrency = DEFAULT_CONCURRENCY)
      @cache = cache
      @paths = paths
      @tags = tags
      @concurrency = concurrency
    end

    # Fetch all the paths, once per tag.
    # When a path has been fetched for every tag, block will be called with the
    # path, and a hash of tag => UriWrapper::ReadResult objects.
    def run(&block)
      @callback = block
      @hydra = Typhoeus::Hydra.new(max_concurrency: @concurrency)
      @paths.each { |path| queue_path(path) }
      @hydra.run
    end

    private

    # Queue a path for fetching. Each tag is resolved from the cache, marked
    # as an error if it is cache-only and missing, or queued on the hydra
    # (and written to the cache once fetched).
    def queue_path(path)
      results = {}

      @tags.each do |tag, base|
        cached = @cache.get(tag, path)
        if cached
          results[tag] = cached
          process_results(path, results)
        elsif !base
          # We only have the cache, but this item isn't cached!
          results[tag] = UriWrapper::ReadResult.error("Not cached")
          process_results(path, results)
        else
          uri = UriWrapper.new(base + path)
          uri.queue(@hydra) do |res|
            @cache.set(tag, path, res)
            results[tag] = res
            process_results(path, results)
          end
        end
      end
    end

    # Process fetch results: invoke the callback once a result is present for
    # every tag.
    def process_results(path, results)
      return unless results.size == @tags.size

      @callback.call(path, results) if @callback
    end
  end
end
|
@@ -11,14 +11,22 @@
|
|
11
11
|
<body>
|
12
12
|
<div class="sitediff">
|
13
13
|
<div class="legend">
|
14
|
-
|
15
|
-
|
14
|
+
<%
|
15
|
+
tags = %w[before after]
|
16
|
+
tags.each do |tag| %>
|
17
|
+
<% if tags.first != tag %> | <% end %>
|
18
|
+
<% notes = ['base url']
|
19
|
+
notes << 'cached' if cache.read_tags.include?(tag.to_sym) %>
|
20
|
+
<strong><%= tag %></strong> (<%= notes.join(', ') %>):
|
21
|
+
<a href="<%= eval(tag) %>"><%= eval(tag) %></a>
|
22
|
+
<% end %>
|
16
23
|
</div>
|
17
24
|
<table class="results">
|
18
25
|
|
19
26
|
<colgroup>
|
20
27
|
<col class="before-col">
|
21
28
|
<col class="after-col">
|
29
|
+
<col class="both-col">
|
22
30
|
<col class="path-col">
|
23
31
|
<col class="diff-stat-col">
|
24
32
|
</colgroup>
|
@@ -27,6 +35,7 @@
|
|
27
35
|
<tr>
|
28
36
|
<th> Before </th>
|
29
37
|
<th> After </th>
|
38
|
+
<th> Both </th>
|
30
39
|
<th> Path </th>
|
31
40
|
<th> Status </th>
|
32
41
|
</tr>
|
@@ -34,8 +43,15 @@
|
|
34
43
|
|
35
44
|
<% results.each do |result| %>
|
36
45
|
<tr class="<%= result.status_text %>">
|
37
|
-
<td class="before"
|
38
|
-
|
46
|
+
<td class="before">
|
47
|
+
<a href="<%= result.url(:before, before, cache) %>">[before]</a>
|
48
|
+
</td>
|
49
|
+
<td class="after">
|
50
|
+
<a href="<%= result.url(:after, after, cache) %>">[after]</a>
|
51
|
+
</td>
|
52
|
+
<td class="both">
|
53
|
+
<a href="/sidebyside<%= result.path %>">[both]</a>
|
54
|
+
</td>
|
39
55
|
<td class="path"><%= result.path %></td>
|
40
56
|
<td class="status"><%= result.link %></td>
|
41
57
|
</tr>
|