sitediff 0.0.1 → 1.0.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +5 -5
- data/bin/sitediff +10 -4
- data/lib/sitediff.rb +179 -91
- data/lib/sitediff/cache.rb +106 -0
- data/lib/sitediff/cli.rb +391 -60
- data/lib/sitediff/config.rb +383 -37
- data/lib/sitediff/config/creator.rb +114 -0
- data/lib/sitediff/config/preset.rb +75 -0
- data/lib/sitediff/crawler.rb +131 -0
- data/lib/sitediff/diff.rb +57 -12
- data/lib/sitediff/exception.rb +5 -0
- data/lib/sitediff/fetch.rb +76 -0
- data/lib/sitediff/files/diff.html.erb +20 -2
- data/lib/sitediff/files/jquery.min.js +2 -0
- data/lib/sitediff/files/normalize.css +349 -0
- data/lib/sitediff/files/report.html.erb +144 -0
- data/lib/sitediff/files/sidebyside.html.erb +16 -0
- data/lib/sitediff/files/sitediff.css +236 -29
- data/lib/sitediff/files/sitediff.js +176 -0
- data/lib/sitediff/report.rb +238 -0
- data/lib/sitediff/result.rb +63 -26
- data/lib/sitediff/sanitize.rb +160 -141
- data/lib/sitediff/sanitize/dom_transform.rb +130 -0
- data/lib/sitediff/sanitize/regexp.rb +82 -0
- data/lib/sitediff/uriwrapper.rb +114 -35
- data/lib/sitediff/webserver.rb +94 -0
- data/lib/sitediff/webserver/resultserver.rb +134 -0
- metadata +103 -43
- data/lib/sitediff/files/html_report.html.erb +0 -47
- data/lib/sitediff/util/cache.rb +0 -32
- data/lib/sitediff/util/webserver.rb +0 -77
@@ -0,0 +1,114 @@
|
|
1
|
+
# frozen_string_literal: true

require 'sitediff/cache'
require 'sitediff/config'
require 'sitediff/crawler'
require 'pathname'
require 'set'
require 'typhoeus'
require 'yaml'

class SiteDiff
  class Config
    ##
    # SiteDiff Config Creator Object.
    #
    # Builds a sitediff configuration for one or two site URLs and
    # writes it (plus a .gitignore, when inside a git checkout) into
    # a config directory.
    class Creator
      ##
      # Creates a Creator object.
      #
      # @param [Boolean] debug
      #   Whether debugging is enabled.
      # @param [String] urls
      #   One or two URLs; the last is "after", the one before it
      #   (if present) is "before".
      def initialize(debug, *urls)
        @config = nil
        @after = urls.pop
        @before = urls.pop # May be nil
        @debug = debug
      end

      ##
      # Determine if we're dealing with one or two URLs.
      #
      # @return [Hash]
      #   A hash with an 'after' key, plus a 'before' key when a
      #   "before" URL was supplied.
      def roots
        @roots = { 'after' => @after }
        @roots['before'] = @before if @before
        @roots
      end

      ##
      # Build a config structure, return it.
      #
      # @param [Hash] options
      #   Creation options; must contain :directory.
      # @param [Proc] block
      #   Callback stored on the instance.
      def create(options, &block)
        @config = {}
        @callback = block
        @dir = Pathname.new(options[:directory])

        # Setup instance vars
        @paths = Hash.new { |h, k| h[k] = Set.new }
        @cache = Cache.new(directory: @dir.to_s, create: true)
        @cache.write_tags << :before << :after

        build_config options
        write_config
      end

      ##
      # Build and populate the config object which is being created.
      #
      # @param [Hash] options
      #   One or more options.
      def build_config(options)
        options = Config.stringify_keys options

        # Build config for "before" and "after".
        %w[before after].each do |tag|
          next unless (url = roots[tag])

          @config[tag] = { 'url' => url }
        end

        # Build other settings.
        @config['settings'] = {}
        Config::ALLOWED_SETTINGS_KEYS.each do |key|
          @config['settings'][key] = options[key]
        end
      end

      ##
      # Create a gitignore if we seem to be in git.
      #
      # @param [Pathname] dir
      #   The config directory.
      def make_gitignore(dir)
        # Check if we're in git; do nothing when no ancestor has a .git dir.
        return unless dir.realpath.to_enum(:ascend).any? { |d| d.join('.git').exist? }

        dir.join('.gitignore').open('w') do |f|
          # Squiggly heredoc strips the leading indentation for us.
          f.puts <<~GITIGNORE
            # Directories.
            diffs
            snapshot

            # Files.
            settings.yaml
            paths.txt
            failures.txt
          GITIGNORE
        end
      end

      ##
      # Returns the name of the config directory.
      #
      # @return [Pathname]
      def directory
        @dir
      end

      ##
      # Returns the name of the config file.
      #
      # @return [Pathname]
      def config_file
        @dir + Config::DEFAULT_FILENAME
      end

      ##
      # Writes the built config into the config file.
      # TODO: Exclude default params before writing.
      def write_config
        make_gitignore(@dir)
        data = Config.remove_defaults(@config)
        config_file.open('w') { |f| f.puts data.to_yaml }
      end
    end
  end
end
|
@@ -0,0 +1,75 @@
|
|
1
|
+
# frozen_string_literal: true

require 'pathname'
require 'sitediff/config'

class SiteDiff
  class Config
    ##
    # Preset helper.
    #
    # Loads, lists and caches the named preset configuration files
    # that ship with the gem.
    class Preset
      ##
      # Directory in which presets live.
      #
      # TODO: Move this outside "lib".
      DIRECTORY = (Pathname.new(__dir__).dirname + 'presets').freeze

      ##
      # Reads preset rules.
      #
      # @param [String] name
      #   The preset's name.
      #
      # @return [Hash]
      #   A hash containing the preset's rules.
      def self.read(name)
        # Lazily initialize the preset cache.
        @cache ||= {}

        # Load and cache preset config.
        if @cache[name].nil?
          exist? name, true
          @cache[name] = Config.load_conf file(name)
        end

        @cache[name]
      end

      ##
      # Get all possible rules.
      #
      # @return [Array]
      #   All presets.
      def self.all
        # Load and cache preset names (one per *.yaml file in DIRECTORY).
        @all ||= Dir.glob(DIRECTORY + '*.yaml').map do |file|
          File.basename(file, '.yaml')
        end

        @all
      end

      ##
      # Checks whether a preset exists.
      #
      # @param [String] name
      #   The preset's name.
      # @param [Boolean] exception
      #   Whether to raise when the preset is missing.
      #
      # @return [Boolean]
      #   Whether the preset file exists.
      def self.exist?(name, exception = false)
        result = File.exist? file(name)

        # Raise an exception, if required.
        if exception && !result
          raise Config::InvalidConfig, "Preset not found: #{name}"
        end

        result
      end

      ##
      # Returns the path to a preset file.
      #
      # @return [Pathname]
      def self.file(name)
        DIRECTORY + "#{name}.yaml"
      end
    end
  end
end
|
@@ -0,0 +1,131 @@
|
|
1
|
+
# frozen_string_literal: true

require 'sitediff'
require 'sitediff/uriwrapper'
require 'addressable/uri'
require 'nokogiri'
require 'ostruct'
require 'set'

class SiteDiff
  # SiteDiff Crawler.
  #
  # Starting from a base URL, fetches pages through a Typhoeus hydra and
  # follows same-site links up to a fixed depth, invoking a callback with
  # an Info object for each fetched page.
  class Crawler
    # Per-page data handed to the crawl callback.
    class Info < OpenStruct; end

    # Default number of link levels to follow from the base URL.
    DEFAULT_DEPTH = 3

    # Create a crawler with a base URL
    #
    # @param hydra [Typhoeus::Hydra] queue used for concurrent fetches.
    # @param base [String] base URL; crawled paths are relative to it.
    # @param interval [Integer] delay between fetches, in milliseconds.
    # @param whitelist [Regexp, nil] paths matching this are always kept.
    # @param blacklist [Regexp, nil] paths matching this are skipped,
    #   unless also whitelisted.
    # @param depth [Integer] how many link levels to follow.
    # @param curl_opts [Hash] options passed through to curl.
    # @param debug [Boolean] verbose fetching.
    # @param block [Proc] called with an Info for every fetched page.
    def initialize(hydra, base,
                   interval,
                   whitelist,
                   blacklist,
                   depth = DEFAULT_DEPTH,
                   curl_opts = UriWrapper::DEFAULT_CURL_OPTS,
                   debug = true,
                   &block)
      @hydra = hydra
      @base_uri = Addressable::URI.parse(base)
      @base = base
      @interval = interval
      @whitelist = whitelist
      @blacklist = blacklist
      @found = Set.new
      @callback = block
      @curl_opts = curl_opts
      @debug = debug

      # Seed the crawl with the base URL itself.
      add_uri('', depth)
    end

    # Handle a newly found relative URI
    def add_uri(rel, depth)
      return if @found.include? rel

      @found << rel

      wrapper = UriWrapper.new(@base + rel, @curl_opts, @debug)
      wrapper.queue(@hydra) do |res|
        fetched_uri(rel, depth, res)
      end
    end

    # Handle the fetch of a URI
    def fetched_uri(rel, depth, res)
      # Guard clauses: bail out on fetch errors or missing content.
      if res.error
        SiteDiff.log(res.error, :error)
        return
      end

      unless res.content
        SiteDiff.log('Response is missing content. Treating as an error.', :error)
        return
      end

      base = Addressable::URI.parse(@base + rel)
      doc = Nokogiri::HTML(res.content)

      # Call the callback
      info = Info.new(
        relative: rel,
        uri: base,
        read_result: res,
        document: doc
      )
      # Insert delay to limit fetching rate
      if @interval != 0
        SiteDiff.log("Waiting #{@interval} milliseconds.", :info)
        sleep(@interval / 1000.0)
      end
      @callback[info]

      # Stop descending once the depth budget is spent.
      return unless depth >= 1

      # Find links
      links = find_links(doc)
      uris = links.map { |l| resolve_link(base, l) }.compact
      uris = filter_links(uris)

      # Make them relative
      rels = uris.map { |u| relativize_link(u) }

      # Queue them in turn
      rels.each do |r|
        next if @found.include? r

        add_uri(r, depth - 1)
      end
    end

    # Resolve a potentially-relative link. Return nil on error.
    def resolve_link(base, rel)
      base + rel
    rescue Addressable::URI::InvalidURIError
      SiteDiff.log "skipped invalid URL: '#{rel}' (at #{base})", :warning
      nil
    end

    # Make a link relative to @base_uri
    def relativize_link(uri)
      uri.path.slice(@base_uri.path.length, uri.path.length)
    end

    # Return a list of string links found on a page.
    def find_links(doc)
      doc.xpath('//a[@href]').map { |e| e['href'] }
    end

    # Filter out links we don't want. Links passed in are absolute URIs.
    def filter_links(uris)
      uris.find_all do |u|
        # Keep only links under the base URI's host and path.
        is_sub_uri = (u.host == @base_uri.host) &&
                     u.path.start_with?(@base_uri.path)
        next unless is_sub_uri

        # Whitelist wins over blacklist; match? avoids allocating MatchData.
        is_whitelisted = @whitelist.nil? ? false : @whitelist.match?(u.path)
        is_blacklisted = @blacklist.nil? ? false : @blacklist.match?(u.path)
        if is_blacklisted && !is_whitelisted
          SiteDiff.log "Ignoring blacklisted URL #{u.path}", :info
        end
        is_whitelisted || !is_blacklisted
      end
    end
  end
end
|
data/lib/sitediff/diff.rb
CHANGED
@@ -1,37 +1,82 @@
|
|
1
|
+
# frozen_string_literal: true

require 'sitediff'
require 'diffy'
require 'erb'
require 'rainbow'
require 'digest'

class SiteDiff
  # SiteDiff Diff Object.
  module Diff
    module_function

    ##
    # Generates HTML diff.
    #
    # @return [String, nil]
    #   HTML markup for the diff, or nil when the inputs are identical.
    def html_diffy(before_html, after_html)
      diff = Diffy::Diff.new(before_html, after_html)
      # If the diff is non-empty, convert it to string.
      diff.first ? diff.to_s(:html) : nil
    end

    ##
    # Generates a description about encoding.
    def encoding_blurb(encoding)
      if encoding
        "Text content returned - charset #{encoding}"
      else
        'Binary content returned'
      end
    end

    ##
    # Computes diff of binary files using MD5 hashes.
    #
    # @return [String, nil]
    #   An HTML diff of encoding blurbs or MD5 digests, or nil when the
    #   two binary bodies are identical.
    def binary_diffy(before, after, before_encoding, after_encoding)
      if before_encoding || after_encoding
        Diffy::Diff.new(encoding_blurb(before_encoding),
                        encoding_blurb(after_encoding)).to_s(:html)
      elsif before == after
        nil
      else
        md5_before = Digest::MD5.hexdigest(before)
        md5_after = Digest::MD5.hexdigest(after)
        Diffy::Diff.new("Binary content returned md5: #{md5_before}",
                        "Binary content returned md5: #{md5_after}").to_s(:html)
      end
    end

    ##
    # Generates diff for CLI output.
    def terminal_diffy(before_html, after_html)
      args = []
      args << :color if Rainbow.enabled
      Diffy::Diff.new(before_html, after_html, context: 3)
                 .to_s(*args)
    end

    ##
    # Generates an HTML report.
    # TODO: Generate the report in SiteDiff::Report instead.
    def generate_html(results, before, after, cache, relative = false)
      erb_path = File.join(SiteDiff::FILES_DIR, 'report.html.erb')
      # The template reads the method's local variables via the binding.
      ERB.new(File.read(erb_path)).result(binding)
    end

    ##
    # Generates diff output for a single result.
    def generate_diff_output(result, relative = false)
      erb_path = File.join(SiteDiff::FILES_DIR, 'diff.html.erb')
      ERB.new(File.read(erb_path)).result(binding)
    end

    ##
    # Set configuration for Diffy.
    def diff_config(config)
      diff_options = Diffy::Diff.default_options[:diff]
      diff_options = [diff_options] unless diff_options.is_a?(Array)
      # ignore_whitespace option: add -w exactly once. The previous
      # `push('-w').uniq` discarded uniq's (non-destructive) result, so
      # repeated calls accumulated duplicate flags; `|` de-duplicates.
      diff_options |= ['-w'] if config.ignore_whitespace
      Diffy::Diff.default_options[:diff] = diff_options
    end
  end
end
|
@@ -0,0 +1,76 @@
|
|
1
|
+
# frozen_string_literal: true

require 'sitediff/uriwrapper'
require 'typhoeus'

class SiteDiff
  # SiteDiff Data Fetcher.
  # TODO: Rename this to Fetcher.
  class Fetch
    # Cache is a cache object, see sitediff/cache
    # Paths is a list of sub-paths
    # Tags is a hash of tag names => base URLs.
    def initialize(cache,
                   paths,
                   interval,
                   concurrency = 3,
                   curl_opts = nil,
                   debug = true,
                   **tags)
      @cache = cache
      @paths = paths
      @interval = interval
      @tags = tags
      @concurrency = concurrency
      @curl_opts = curl_opts || UriWrapper::DEFAULT_CURL_OPTS
      @debug = debug
    end

    # Fetch all the paths, once per tag.
    # When a path has been fetched for every tag, block will be called with the
    # path, and a hash of tag => UriWrapper::ReadResult objects.
    def run(&block)
      @callback = block
      @hydra = Typhoeus::Hydra.new(max_concurrency: @concurrency)
      @paths.each { |path| queue_path(path) }
      @hydra.run
    end

    private

    # Queue a single path for fetching under every tag.
    def queue_path(path)
      results = {}

      @tags.each do |tag, base|
        cached = @cache.get(tag, path)

        if cached
          # Cache hit: no network round-trip needed.
          results[tag] = cached
          process_results(path, results)
          next
        end

        unless base
          # We only have the cache, but this item isn't cached!
          results[tag] = UriWrapper::ReadResult.error('Not cached')
          process_results(path, results)
          next
        end

        wrapper = UriWrapper.new(base + path, @curl_opts, @debug)
        wrapper.queue(@hydra) do |response|
          # Insert delay to limit fetching rate
          if @interval != 0
            SiteDiff.log("Waiting #{@interval} milliseconds.", :info)
            sleep(@interval / 1000.0)
          end
          @cache.set(tag, path, response)
          results[tag] = response
          process_results(path, results)
        end
      end
    end

    # Fire the callback once results for every tag have arrived.
    def process_results(path, results)
      @callback[path, results] if results.size == @tags.size
    end
  end
end
|