sitediff 0.0.1 → 0.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/bin/sitediff +1 -1
- data/lib/sitediff.rb +79 -63
- data/lib/sitediff/cache.rb +61 -0
- data/lib/sitediff/cli.rb +144 -23
- data/lib/sitediff/config.rb +46 -9
- data/lib/sitediff/config/creator.rb +122 -0
- data/lib/sitediff/crawler.rb +95 -0
- data/lib/sitediff/diff.rb +2 -1
- data/lib/sitediff/exception.rb +3 -0
- data/lib/sitediff/fetch.rb +55 -0
- data/lib/sitediff/files/html_report.html.erb +20 -4
- data/lib/sitediff/files/rules/drupal.yaml +33 -0
- data/lib/sitediff/files/sidebyside.html.erb +13 -0
- data/lib/sitediff/files/sitediff.css +11 -0
- data/lib/sitediff/result.rb +12 -9
- data/lib/sitediff/rules.rb +65 -0
- data/lib/sitediff/sanitize.rb +163 -168
- data/lib/sitediff/sanitize/dom_transform.rb +92 -0
- data/lib/sitediff/sanitize/regexp.rb +56 -0
- data/lib/sitediff/uriwrapper.rb +19 -7
- data/lib/sitediff/webserver.rb +82 -0
- data/lib/sitediff/webserver/resultserver.rb +98 -0
- metadata +70 -25
- checksums.yaml +0 -7
- data/lib/sitediff/util/cache.rb +0 -32
- data/lib/sitediff/util/webserver.rb +0 -77
data/lib/sitediff/config.rb
CHANGED
@@ -1,13 +1,18 @@
|
|
1
|
+
require 'sitediff/exception'
|
2
|
+
require 'sitediff/sanitize'
|
3
|
+
require 'pathname'
|
1
4
|
require 'yaml'
|
2
5
|
|
3
6
|
class SiteDiff
|
4
7
|
class Config
|
8
|
+
DEFAULT_FILENAME = 'sitediff.yaml'
|
5
9
|
|
6
10
|
# keys allowed in configuration files
|
7
|
-
CONF_KEYS =
|
11
|
+
CONF_KEYS = Sanitizer::TOOLS.values.flatten(1) +
|
8
12
|
%w[paths before after before_url after_url includes]
|
9
13
|
|
10
|
-
class InvalidConfig <
|
14
|
+
class InvalidConfig < SiteDiffException; end
|
15
|
+
class ConfigNotFound < SiteDiffException; end
|
11
16
|
|
12
17
|
# Takes a Hash and normalizes it to the following form by merging globals
|
13
18
|
# into before and after. A normalized config Hash looks like this:
|
@@ -27,7 +32,7 @@ class SiteDiff
|
|
27
32
|
# selector: body
|
28
33
|
#
|
29
34
|
def self.normalize(conf)
|
30
|
-
tools =
|
35
|
+
tools = Sanitizer::TOOLS
|
31
36
|
|
32
37
|
# merge globals
|
33
38
|
%w[before after].each do |pos|
|
@@ -67,7 +72,7 @@ class SiteDiff
|
|
67
72
|
next
|
68
73
|
end
|
69
74
|
result[pos] = first[pos].merge!(second[pos]) do |key, a, b|
|
70
|
-
if
|
75
|
+
if Sanitizer::TOOLS[:array].include? key # rule 2a
|
71
76
|
result[pos][key] = (a || []) + (b|| [])
|
72
77
|
else
|
73
78
|
result[pos][key] = a || b # rule 2b
|
@@ -77,9 +82,39 @@ class SiteDiff
|
|
77
82
|
result
|
78
83
|
end
|
79
84
|
|
80
|
-
|
85
|
+
# Search for a config file. If found, change to the containing directory,
|
86
|
+
# and return an array of config files found.
|
87
|
+
def self.search
  # Locate DEFAULT_FILENAME by walking upward from the current directory.
  # At each level both the directory itself and its 'sitediff' subdirectory
  # are checked. On a hit the process changes into the directory holding the
  # file and a one-element array with the file name is returned. The walk
  # stops (returning an empty array) once a level containing a '.git' or
  # '.svn' entry has been checked without success, or when the filesystem
  # root has been passed.
  candidate_subdirs = %w[. sitediff]
  vcs_markers = %w[.git .svn]

  Pathname.pwd.ascend do |ancestor|
    candidate_subdirs.each do |name|
      location = ancestor + name
      next unless (location + DEFAULT_FILENAME).exist?

      Dir.chdir(location.to_s)
      return [DEFAULT_FILENAME]
    end

    # Do not climb above what looks like a repository root.
    return [] if vcs_markers.any? { |marker| (ancestor + marker).exist? }
  end

  []
end
|
105
|
+
|
106
|
+
# Build a configuration by loading and merging one or more config files.
#
# files - Array of config file paths. May be empty, in which case a file is
#         looked up: via Config.search when opts[:search] is truthy,
#         otherwise (or if the search finds nothing) DEFAULT_FILENAME in the
#         current directory is used when it exists.
# opts  - Hash of options; only :search is read here.
#
# Raises ConfigNotFound when no file could be determined, and InvalidConfig
# when a listed file does not exist.
def initialize(files, opts = {})
  @config = {'paths' => [], 'before' => {}, 'after' => {} }

  files = Config.search if files.empty? && opts[:search]
  # File.exist? is used instead of File.exists?: the latter is a deprecated
  # alias that was removed in Ruby 3.2.
  files = [DEFAULT_FILENAME] if files.empty? && File.exist?(DEFAULT_FILENAME)
  raise ConfigNotFound, "No configuration file found." if files.empty?

  files.each do |file|
    unless File.exist?(file)
      raise InvalidConfig,
        "Missing config file %s." % File.expand_path(file)
    end
    @config = Config::merge(@config, Config::load_conf(file))
  end
end
|
@@ -99,8 +134,11 @@ class SiteDiff
|
|
99
134
|
end
|
100
135
|
|
101
136
|
# Checks if the configuration is usable for diff-ing.
|
102
|
-
def validate
|
103
|
-
|
137
|
+
def validate(opts = {})
  # Checks that the configuration has what a diff run needs: an 'after'
  # base URL, a non-empty list of paths, and (unless :need_before is passed
  # as false) a 'before' base URL. Raises InvalidConfig on the first missing
  # piece.
  settings = { :need_before => true }.merge(opts)

  if settings[:need_before] && !before['url']
    raise InvalidConfig, "Undefined 'before' base URL."
  end
  raise InvalidConfig, "Undefined 'after' base URL." if !after['url']
  raise InvalidConfig, "Undefined 'paths'." if !paths || paths.empty?
end
|
@@ -114,7 +152,7 @@ class SiteDiff
|
|
114
152
|
|
115
153
|
# reads a YAML file and raises an InvalidConfig if the file is not valid.
|
116
154
|
def self.load_raw_yaml(file)
|
117
|
-
SiteDiff::log "Reading config file: #{file}"
|
155
|
+
SiteDiff::log "Reading config file: #{Pathname.new(file).expand_path}"
|
118
156
|
conf = YAML.load_file(file) || {}
|
119
157
|
unless conf.is_a? Hash
|
120
158
|
raise InvalidConfig, "Invalid configuration file: '#{file}'"
|
@@ -149,6 +187,5 @@ class SiteDiff
|
|
149
187
|
end
|
150
188
|
conf
|
151
189
|
end
|
152
|
-
|
153
190
|
end
|
154
191
|
end
|
@@ -0,0 +1,122 @@
|
|
1
|
+
require 'sitediff/cache'
|
2
|
+
require 'sitediff/config'
|
3
|
+
require 'sitediff/crawler'
|
4
|
+
require 'sitediff/rules'
|
5
|
+
require 'pathname'
|
6
|
+
require 'typhoeus'
|
7
|
+
require 'yaml'
|
8
|
+
|
9
|
+
class SiteDiff
  class Config
    # Creates a configuration by crawling one or two sites: collects the
    # paths found, stores the fetched pages in a cache, optionally lets a
    # Rules object inspect each page, and writes the result to a config file.
    class Creator
      # urls - one or two base URLs. The last one is the 'after' site; the
      #        one preceding it, if given, is the 'before' site.
      #
      # A block is accepted for interface compatibility but is not used
      # here; the per-page callback is the block given to #create.
      def initialize(*urls, &block)
        @after = urls.pop
        @before = urls.pop # May be nil
      end

      # Hash of tag (:after, and :before when present) => base URL.
      # Memoized: the URLs never change after construction.
      def roots
        @roots ||= begin
          r = { :after => @after }
          r[:before] = @before if @before
          r
        end
      end

      # Build a config structure and write it to the config file.
      #
      # opts  - Hash read for :directory (target directory), :depth (crawl
      #         depth), :rules (enable rule detection) and :rules_disabled
      #         (forwarded to Rules.new).
      # block - optional callback invoked with (tag, info) per crawled page.
      def create(opts, &block)
        @config = {}
        @callback = block

        # Handle options
        @dir = Pathname.new(opts[:directory])
        @depth = opts[:depth]
        @rules = Rules.new(@config, opts[:rules_disabled]) if opts[:rules]

        # Create the dir. Must go before cache initialization!
        @dir.mkpath unless @dir.directory?

        # Setup instance vars
        @paths = Hash.new { |h,k| h[k] = Set.new }
        @cache = Cache.new(:file => @dir.+(Cache::DEFAULT_FILENAME).to_s,
          :create => true)
        @cache.write_tags << :before << :after

        build_config
        write_config
      end

      # Fill @config: base URLs per tag, then crawl, then the sorted union
      # of all paths discovered for any tag.
      def build_config
        %w[before after].each do |tag|
          next unless u = roots[tag.to_sym]
          @config[tag] = {'url' => u}
        end

        crawl(@depth)
        @cache.close
        @rules.add_config if @rules

        @config['paths'] = @paths.values.reduce(&:|).to_a.sort
      end

      # Crawl every root concurrently; each fetched page is reported to
      # #crawled_path with the tag of the root it belongs to.
      def crawl(depth = nil)
        hydra = Typhoeus::Hydra.new(max_concurrency: 10)
        roots.each do |tag, u|
          Crawler.new(hydra, u, depth) do |info|
            crawled_path(tag, info)
          end
        end
        hydra.run
      end

      # Deduplicate paths with slashes at the end.
      #
      # Returns [path, dup]: path is the given path (or '/' when it is
      # empty); dup is true when that path, or the same path with a trailing
      # slash added or removed, has already been recorded for this tag.
      #
      # Previously this returned a bare string, so the caller's `dup` was
      # always nil and no de-duplication took place.
      def canonicalize(tag, path)
        path = '/' if path.empty?
        variants = [path, path + '/', path.sub(%r[/$], '')].uniq
        seen = @paths[tag]
        [path, variants.any? { |v| seen.include?(v) }]
      end

      # Handle one crawled page: notify the callback, remember the path and
      # store the result in the cache.
      def crawled_path(tag, info)
        path, dup = canonicalize(tag, info.relative)
        return if dup

        res = info.read_result

        # #create may be called without a block
        @callback.call(tag, info) if @callback
        @paths[tag] << path
        @cache.set(tag, path, res)

        # If single-site, cache after as before!
        @cache.set(:before, path, res) unless roots[:before]

        @rules.handle_page(tag, res.content, info.document) if @rules && !res.error
      end

      # Create a gitignore if we seem to be in git
      def make_gitignore(dir)
        # Check if we're in git
        return unless dir.realpath.to_enum(:ascend).any? { |d| d.+('.git').exist? }

        dir.+('.gitignore').open('w') do |f|
          # gsub strips the heredoc's leading indentation
          f.puts <<-EOF.gsub(/^\s+/, '')
            output
            cache.db
            cache.db.db
          EOF
        end
      end

      # Pathname of the config file inside the target directory.
      def config_file
        @dir + Config::DEFAULT_FILENAME
      end

      # Turn a config structure into a config file
      def write_config
        make_gitignore(@dir)
        config_file.open('w') { |f| f.puts @config.to_yaml }
      end
    end
  end
end
|
@@ -0,0 +1,95 @@
|
|
1
|
+
require 'sitediff'
|
2
|
+
require 'sitediff/uriwrapper'
|
3
|
+
require 'addressable/uri'
|
4
|
+
require 'nokogiri'
|
5
|
+
require 'ostruct'
|
6
|
+
require 'set'
|
7
|
+
|
8
|
+
class SiteDiff
  # Follows links starting from a base URL, queueing fetches on a shared
  # hydra and reporting every successfully fetched page to a callback.
  class Crawler
    # Value object handed to the callback; carries :relative, :uri,
    # :read_result and :document.
    class Info < OpenStruct; end

    DEFAULT_DEPTH = 3

    # Create a crawler with a base URL
    #
    # hydra - request queue the fetches are added to; the caller runs it.
    # base  - base URL string; crawled paths are relative to it.
    # depth - how many link levels to follow. An explicit nil is treated as
    #         DEFAULT_DEPTH (it used to crash later on `nil >= 0`).
    # block - called with an Info for each fetched page.
    def initialize(hydra, base, depth = DEFAULT_DEPTH, &block)
      @hydra = hydra
      @base_uri = Addressable::URI.parse(base)
      @base = base
      @found = Set.new
      @callback = block

      add_uri('', depth || DEFAULT_DEPTH)
    end

    # Handle a newly found relative URI
    def add_uri(rel, depth)
      return if @found.include? rel
      @found << rel

      wrapper = UriWrapper.new(@base + rel)
      wrapper.queue(@hydra) do |res|
        fetched_uri(rel, depth, res)
      end
    end

    # Handle the fetch of a URI
    def fetched_uri(rel, depth, res)
      return unless res.content # Ignore errors
      return unless depth >= 0

      base = Addressable::URI.parse(@base + rel)
      doc = Nokogiri::HTML(res.content)

      # Call the callback
      info = Info.new(
        :relative => rel,
        :uri => base,
        :read_result => res,
        :document => doc,
      )
      @callback.call(info) if @callback

      # Find links
      links = find_links(doc)
      uris = links.map { |l| resolve_link(base, l) }.compact
      uris = filter_links(uris)

      # Make them relative
      rels = uris.map { |u| relativize_link(u) }

      # Queue them in turn
      rels.each do |r|
        next if @found.include? r
        add_uri(r, depth - 1)
      end
    end

    # Resolve a potentially-relative link. Return nil on error.
    def resolve_link(base, rel)
      base + rel
    rescue Addressable::URI::InvalidURIError
      SiteDiff.log "skipped invalid URL: '#{rel}'", :warn
      nil
    end

    # Make a link relative to @base_uri
    def relativize_link(uri)
      uri.path.slice(@base_uri.path.length, uri.path.length)
    end

    # Return a list of string links found on a page.
    def find_links(doc)
      doc.xpath('//a[@href]').map { |e| e['href'] }
    end

    # Filter out links we don't want. Links passed in are absolute URIs.
    # Keeps only URIs on the base host whose path starts with the base path.
    def filter_links(uris)
      uris.select do |u|
        u.host == @base_uri.host && u.path.start_with?(@base_uri.path)
      end
    end
  end
end
|
data/lib/sitediff/diff.rb
CHANGED
@@ -1,3 +1,4 @@
|
|
1
|
+
require 'sitediff'
|
1
2
|
require 'diffy'
|
2
3
|
require 'erb'
|
3
4
|
require 'rainbow'
|
@@ -19,7 +20,7 @@ class SiteDiff
|
|
19
20
|
to_s(*args)
|
20
21
|
end
|
21
22
|
|
22
|
-
def generate_html_report(results, before, after)
|
23
|
+
def generate_html_report(results, before, after, cache)
|
23
24
|
erb_path = File.join(SiteDiff::FILES_DIR, 'html_report.html.erb')
|
24
25
|
report_html = ERB.new(File.read(erb_path)).result(binding)
|
25
26
|
return report_html
|
@@ -0,0 +1,55 @@
|
|
1
|
+
require 'sitediff/uriwrapper'
|
2
|
+
require 'typhoeus'
|
3
|
+
|
4
|
+
class SiteDiff
  # Retrieves a set of paths for several tagged sites, preferring cached
  # results and falling back to live requests.
  class Fetch
    # cache - a cache object, see sitediff/cache
    # paths - list of sub-paths to retrieve
    # tags  - hash of tag name => base URL (a falsy base means cache-only)
    def initialize(cache, paths, tags)
      @cache = cache
      @paths = paths
      @tags = tags
    end

    # Retrieve every path once per tag. As soon as a path has a result for
    # each tag, the block is called with the path and a hash of
    # tag => UriWrapper::ReadResult objects.
    def run(&block)
      @callback = block
      @hydra = Typhoeus::Hydra.new(max_concurrency: 3)
      @paths.each do |path|
        queue_path(path)
      end
      @hydra.run
    end

    private

    # Schedule retrieval of a single path for all tags.
    def queue_path(path)
      collected = {}

      # Store one tag's outcome, then check whether the path is complete.
      record = lambda do |tag, outcome|
        collected[tag] = outcome
        process_results(path, collected)
      end

      @tags.each do |tag, base|
        cached = @cache.get(tag, path)
        if cached
          record.call(tag, cached)
        elsif base
          UriWrapper.new(base + path).queue(@hydra) do |fetched|
            @cache.set(tag, path, fetched)
            record.call(tag, fetched)
          end
        else
          # We only have the cache, but this item isn't cached!
          record.call(tag, UriWrapper::ReadResult.error("Not cached"))
        end
      end
    end

    # Invoke the callback once results exist for every tag.
    def process_results(path, results)
      @callback.call(path, results) if results.size == @tags.size
    end
  end
end
|
@@ -11,14 +11,22 @@
|
|
11
11
|
<body>
|
12
12
|
<div class="sitediff">
|
13
13
|
<div class="legend">
|
14
|
-
|
15
|
-
|
14
|
+
<%
  # Legend: one entry per tag with its base URL. The URLs are looked up in
  # an explicit hash built from the `before` and `after` locals rather than
  # by eval-ing the tag name.
  tags = %w[before after]
  base_urls = { 'before' => before, 'after' => after }
  tags.each do |tag| %>
  <% if tags.first != tag %> | <% end %>
  <% notes = ['base url']
     notes << 'cached' if cache.read_tags.include?(tag.to_sym) %>
  <strong><%= tag %></strong> (<%= notes.join(', ') %>):
  <a href="<%= base_urls[tag] %>"><%= base_urls[tag] %></a>
<% end %>
|
16
23
|
</div>
|
17
24
|
<table class="results">
|
18
25
|
|
19
26
|
<colgroup>
|
20
27
|
<col class="before-col">
|
21
28
|
<col class="after-col">
|
29
|
+
<col class="both-col">
|
22
30
|
<col class="path-col">
|
23
31
|
<col class="diff-stat-col">
|
24
32
|
</colgroup>
|
@@ -27,6 +35,7 @@
|
|
27
35
|
<tr>
|
28
36
|
<th> Before </th>
|
29
37
|
<th> After </th>
|
38
|
+
<th> Both </th>
|
30
39
|
<th> Path </th>
|
31
40
|
<th> Status </th>
|
32
41
|
</tr>
|
@@ -34,8 +43,15 @@
|
|
34
43
|
|
35
44
|
<% results.each do |result| %>
|
36
45
|
<tr class="<%= result.status_text %>">
|
37
|
-
<td class="before"
|
38
|
-
|
46
|
+
<td class="before">
|
47
|
+
<a href="<%= result.url(:before, before, cache) %>">[before]</a>
|
48
|
+
</td>
|
49
|
+
<td class="after">
|
50
|
+
<a href="<%= result.url(:after, after, cache) %>">[after]</a>
|
51
|
+
</td>
|
52
|
+
<td class="both">
|
53
|
+
<a href="/sidebyside<%= result.path %>">[both]</a>
|
54
|
+
</td>
|
39
55
|
<td class="path"><%= result.path %></td>
|
40
56
|
<td class="status"><%= result.link %></td>
|
41
57
|
</tr>
|