sitediff 0.0.1 → 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +5 -5
- data/bin/sitediff +10 -4
- data/lib/sitediff.rb +179 -91
- data/lib/sitediff/cache.rb +106 -0
- data/lib/sitediff/cli.rb +391 -60
- data/lib/sitediff/config.rb +383 -37
- data/lib/sitediff/config/creator.rb +114 -0
- data/lib/sitediff/config/preset.rb +75 -0
- data/lib/sitediff/crawler.rb +131 -0
- data/lib/sitediff/diff.rb +57 -12
- data/lib/sitediff/exception.rb +5 -0
- data/lib/sitediff/fetch.rb +76 -0
- data/lib/sitediff/files/diff.html.erb +20 -2
- data/lib/sitediff/files/jquery.min.js +2 -0
- data/lib/sitediff/files/normalize.css +349 -0
- data/lib/sitediff/files/report.html.erb +144 -0
- data/lib/sitediff/files/sidebyside.html.erb +16 -0
- data/lib/sitediff/files/sitediff.css +236 -29
- data/lib/sitediff/files/sitediff.js +176 -0
- data/lib/sitediff/report.rb +238 -0
- data/lib/sitediff/result.rb +63 -26
- data/lib/sitediff/sanitize.rb +160 -141
- data/lib/sitediff/sanitize/dom_transform.rb +130 -0
- data/lib/sitediff/sanitize/regexp.rb +82 -0
- data/lib/sitediff/uriwrapper.rb +114 -35
- data/lib/sitediff/webserver.rb +94 -0
- data/lib/sitediff/webserver/resultserver.rb +134 -0
- metadata +103 -43
- data/lib/sitediff/files/html_report.html.erb +0 -47
- data/lib/sitediff/util/cache.rb +0 -32
- data/lib/sitediff/util/webserver.rb +0 -77
@@ -0,0 +1,114 @@
|
|
1
|
+
# frozen_string_literal: true

require 'sitediff/cache'
require 'sitediff/config'
require 'sitediff/crawler'
require 'pathname'
require 'set'
require 'typhoeus'
require 'yaml'

class SiteDiff
  class Config
    ##
    # SiteDiff Config Creator Object.
    #
    # Builds a new sitediff project configuration from one or two URLs and
    # writes it to disk.
    class Creator
      ##
      # Creates a Creator object.
      #
      # @param [Boolean] debug
      #   Debug flag, stored for later use.
      # @param [Array<String>] urls
      #   One or two URLs. The last is treated as "after"; the one before
      #   it (if present) is "before".
      def initialize(debug, *urls)
        @config = nil
        @after = urls.pop
        @before = urls.pop # May be nil
        @debug = debug
      end

      ##
      # Determine if we're dealing with one or two URLs.
      #
      # @return [Hash]
      #   Tag => URL map containing "after" and, when given, "before".
      def roots
        @roots = { 'after' => @after }
        @roots['before'] = @before if @before
        @roots
      end

      ##
      # Build a config structure, return it.
      #
      # @param [Hash] options
      #   Creation options; must contain :directory.
      def create(options, &block)
        @config = {}
        @callback = block
        @dir = Pathname.new(options[:directory])

        # Setup instance vars
        @paths = Hash.new { |h, k| h[k] = Set.new }
        @cache = Cache.new(directory: @dir.to_s, create: true)
        @cache.write_tags << :before << :after

        build_config options
        write_config
      end

      ##
      # Build and populate the config object which is being created.
      #
      # @param [Hash] options
      #   One or more options. (Was documented as String, but it is keyed
      #   via Config.stringify_keys and read like a Hash.)
      def build_config(options)
        options = Config.stringify_keys options

        # Build config for "before" and "after".
        %w[before after].each do |tag|
          next unless (url = roots[tag])

          @config[tag] = { 'url' => url }
        end

        # Build other settings.
        @config['settings'] = {}
        Config::ALLOWED_SETTINGS_KEYS.each do |key|
          @config['settings'][key] = options[key]
        end
      end

      ##
      # Create a gitignore if we seem to be in git.
      #
      # @param [Pathname] dir
      #   Directory in which to create the .gitignore.
      def make_gitignore(dir)
        # Check if we're in git: any ancestor directory contains ".git".
        return unless dir.realpath.to_enum(:ascend).any? { |d| (d + '.git').exist? }

        (dir + '.gitignore').open('w') do |f|
          f.puts <<~GITIGNORE
            # Directories.
            diffs
            snapshot

            # Files.
            settings.yaml
            paths.txt
            failures.txt
          GITIGNORE
        end
      end

      ##
      # Returns the name of the config directory.
      #
      # @return [Pathname]
      def directory
        @dir
      end

      ##
      # Returns the name of the config file.
      #
      # @return [Pathname]
      def config_file
        @dir + Config::DEFAULT_FILENAME
      end

      ##
      # Writes the built config into the config file.
      # TODO: Exclude default params before writing.
      def write_config
        make_gitignore(@dir)
        data = Config.remove_defaults(@config)
        config_file.open('w') { |f| f.puts data.to_yaml }
      end
    end
  end
end
|
@@ -0,0 +1,75 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'pathname'
|
4
|
+
require 'sitediff/config'
|
5
|
+
|
6
|
+
class SiteDiff
  class Config
    ##
    # Preset helper.
    #
    # Loads named preset rule files shipped with the gem and caches them
    # at the class level.
    class Preset
      ##
      # Directory in which presets live.
      #
      # TODO: Move this outside "lib".
      DIRECTORY = (Pathname.new(__dir__).dirname + 'presets').freeze

      ##
      # Reads preset rules.
      #
      # @param [String] name
      #   Preset name. (Doc previously said "preset"; the parameter is +name+.)
      #
      # @return [Hash]
      #   A hash containing the preset's rules.
      def self.read(name)
        @cache ||= {}

        # Load and cache preset config.
        if @cache[name].nil?
          exist? name, true
          @cache[name] = Config.load_conf file(name)
        end

        @cache[name]
      end

      ##
      # Get all possible rules.
      #
      # @return [Array]
      #   All preset names, derived from *.yaml files in DIRECTORY.
      def self.all
        # Load and cache preset names.
        @all ||= Dir.glob(DIRECTORY + '*.yaml').map { |f| File.basename(f, '.yaml') }
        @all
      end

      ##
      # Checks whether a preset exists.
      #
      # @param [String] name
      #   Preset name.
      # @param [Boolean] exception
      #   When true, raise Config::InvalidConfig if the preset is missing.
      #
      # @return [Boolean]
      def self.exist?(name, exception = false)
        result = File.exist? file(name)

        # Raise an exception, if required.
        raise Config::InvalidConfig, "Preset not found: #{name}" if exception && !result

        result
      end

      ##
      # Returns the path to a preset file.
      #
      # @return [Pathname]
      def self.file(name)
        DIRECTORY + "#{name}.yaml"
      end
    end
  end
end
|
@@ -0,0 +1,131 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'sitediff'
|
4
|
+
require 'sitediff/uriwrapper'
|
5
|
+
require 'addressable/uri'
|
6
|
+
require 'nokogiri'
|
7
|
+
require 'ostruct'
|
8
|
+
require 'set'
|
9
|
+
|
10
|
+
class SiteDiff
  # SiteDiff Crawler.
  #
  # Starting from a base URL, queues fetches on a Typhoeus hydra, invokes a
  # callback for every successfully fetched page, and follows same-site
  # links up to a given depth.
  class Crawler
    # Per-page record handed to the crawl callback; carries +relative+,
    # +uri+, +read_result+ and +document+ fields.
    class Info < OpenStruct; end

    # Link depth followed when the caller gives no explicit depth.
    DEFAULT_DEPTH = 3

    # Create a crawler with a base URL
    #
    # hydra     - Typhoeus hydra the fetches are queued on.
    # base      - Base URL string; crawled paths are appended to it.
    # interval  - Delay between fetches in milliseconds (0 disables; see
    #             the sleep(@interval / 1000.0) below).
    # whitelist - Matcher for paths to always keep, or nil.
    # blacklist - Matcher for paths to drop, or nil.
    # depth     - Maximum link depth to follow.
    # curl_opts - Options forwarded to UriWrapper.
    # debug     - Forwarded to UriWrapper.
    # block     - Callback invoked with an Info for every fetched page.
    def initialize(hydra, base,
                   interval,
                   whitelist,
                   blacklist,
                   depth = DEFAULT_DEPTH,
                   curl_opts = UriWrapper::DEFAULT_CURL_OPTS,
                   debug = true,
                   &block)
      @hydra = hydra
      @base_uri = Addressable::URI.parse(base)
      @base = base
      @interval = interval
      @whitelist = whitelist
      @blacklist = blacklist
      @found = Set.new
      @callback = block
      @curl_opts = curl_opts
      @debug = debug

      # Seed the crawl with the base URL itself ('' relative path).
      add_uri('', depth)
    end

    # Handle a newly found relative URI
    def add_uri(rel, depth)
      # Skip anything already queued or fetched.
      return if @found.include? rel

      @found << rel

      wrapper = UriWrapper.new(@base + rel, @curl_opts, @debug)
      wrapper.queue(@hydra) do |res|
        fetched_uri(rel, depth, res)
      end
    end

    # Handle the fetch of a URI
    def fetched_uri(rel, depth, res)
      # Bail out on transport errors or empty responses; both are logged.
      if res.error
        SiteDiff.log(res.error, :error)
        return
      elsif !res.content
        SiteDiff.log('Response is missing content. Treating as an error.', :error)
        return
      end

      base = Addressable::URI.parse(@base + rel)
      doc = Nokogiri::HTML(res.content)

      # Call the callback
      info = Info.new(
        relative: rel,
        uri: base,
        read_result: res,
        document: doc
      )
      # Insert delay to limit fetching rate
      if @interval != 0
        SiteDiff.log("Waiting #{@interval} milliseconds.", :info)
        sleep(@interval / 1000.0)
      end
      @callback[info]

      # Stop descending once the depth budget is exhausted.
      return unless depth >= 1

      # Find links
      links = find_links(doc)
      uris = links.map { |l| resolve_link(base, l) }.compact
      uris = filter_links(uris)

      # Make them relative
      rels = uris.map { |u| relativize_link(u) }

      # Queue them in turn
      rels.each do |r|
        next if @found.include? r

        add_uri(r, depth - 1)
      end
    end

    # Resolve a potentially-relative link. Return nil on error.
    def resolve_link(base, rel)
      base + rel
    rescue Addressable::URI::InvalidURIError
      SiteDiff.log "skipped invalid URL: '#{rel}' (at #{base})", :warning
      nil
    end

    # Make a link relative to @base_uri
    #
    # NOTE(review): assumes uri.path starts with @base_uri.path — true for
    # URIs that passed filter_links, which checks start_with?.
    def relativize_link(uri)
      uri.path.slice(@base_uri.path.length, uri.path.length)
    end

    # Return a list of string links found on a page.
    def find_links(doc)
      doc.xpath('//a[@href]').map { |e| e['href'] }
    end

    # Filter out links we don't want. Links passed in are absolute URIs.
    def filter_links(uris)
      uris.find_all do |u|
        # Keep only same-host links under the base path.
        is_sub_uri = (u.host == @base_uri.host) &&
                     u.path.start_with?(@base_uri.path)
        next unless is_sub_uri

        # Whitelist overrides blacklist when both match.
        is_whitelisted = @whitelist.nil? ? false : @whitelist.match(u.path)
        is_blacklisted = @blacklist.nil? ? false : @blacklist.match(u.path)
        if is_blacklisted && !is_whitelisted
          SiteDiff.log "Ignoring blacklisted URL #{u.path}", :info
        end
        is_whitelisted || !is_blacklisted
      end
    end
  end
end
|
data/lib/sitediff/diff.rb
CHANGED
@@ -1,37 +1,82 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'sitediff'
|
1
4
|
require 'diffy'
|
2
5
|
require 'erb'
|
3
6
|
require 'rainbow'
|
7
|
+
require 'digest'
|
4
8
|
|
5
9
|
class SiteDiff
  # SiteDiff Diff Object.
  #
  # Thin wrappers around Diffy plus ERB rendering for HTML/CLI diff output.
  module Diff
    module_function

    ##
    # Generates HTML diff.
    #
    # @return [String, nil]
    #   An HTML fragment, or nil when the inputs are identical.
    def html_diffy(before_html, after_html)
      diff = Diffy::Diff.new(before_html, after_html)
      # If the diff is non-empty, convert it to string.
      diff.first ? diff.to_s(:html) : nil
    end

    ##
    # Generates a description about encoding.
    def encoding_blurb(encoding)
      if encoding
        "Text content returned - charset #{encoding}"
      else
        'Binary content returned'
      end
    end

    ##
    # Computes diff of binary files using MD5 hashes.
    #
    # @return [String, nil]
    #   An HTML fragment, or nil when both binaries are identical.
    def binary_diffy(before, after, before_encoding, after_encoding)
      if before_encoding || after_encoding
        Diffy::Diff.new(encoding_blurb(before_encoding),
                        encoding_blurb(after_encoding)).to_s(:html)
      elsif before == after
        nil
      else
        md5_before = Digest::MD5.hexdigest(before)
        md5_after = Digest::MD5.hexdigest(after)
        Diffy::Diff.new("Binary content returned md5: #{md5_before}",
                        "Binary content returned md5: #{md5_after}").to_s(:html)
      end
    end

    ##
    # Generates diff for CLI output.
    def terminal_diffy(before_html, after_html)
      args = []
      args << :color if Rainbow.enabled
      Diffy::Diff.new(before_html, after_html, context: 3)
                 .to_s(*args)
    end

    ##
    # Generates an HTML report.
    # TODO: Generate the report in SiteDiff::Report instead.
    def generate_html(results, before, after, cache, relative = false)
      erb_path = File.join(SiteDiff::FILES_DIR, 'report.html.erb')
      # The template reads the local variables above via this binding.
      ERB.new(File.read(erb_path)).result(binding)
    end

    ##
    # Generates diff output for a single result.
    def generate_diff_output(result, relative = false)
      erb_path = File.join(SiteDiff::FILES_DIR, 'diff.html.erb')
      ERB.new(File.read(erb_path)).result(binding)
    end

    ##
    # Set configuration for Diffy.
    def diff_config(config)
      diff_options = Diffy::Diff.default_options[:diff]
      diff_options = [diff_options] unless diff_options.is_a?(Array)
      # ignore_whitespace option: add "-w" exactly once. The previous
      # `diff_options.push('-w').uniq` discarded the deduplicated copy,
      # so repeated calls kept appending "-w".
      diff_options |= ['-w'] if config.ignore_whitespace
      Diffy::Diff.default_options[:diff] = diff_options
    end
  end
end
|
@@ -0,0 +1,76 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'sitediff/uriwrapper'
|
4
|
+
require 'typhoeus'
|
5
|
+
|
6
|
+
class SiteDiff
  # SiteDiff Data Fetcher.
  # TODO: Rename this to Fetcher.
  #
  # Fetches every path once per tag ("before"/"after"), preferring cached
  # results, and reports a complete tag => result hash per path.
  class Fetch
    # Cache is a cache object, see sitediff/cache
    # Paths is a list of sub-paths
    # Tags is a hash of tag names => base URLs.
    #
    # interval    - Delay between fetches in milliseconds (0 disables).
    # concurrency - Max concurrent requests for the Typhoeus hydra.
    # curl_opts   - Options forwarded to UriWrapper; defaults applied if nil.
    # debug       - Forwarded to UriWrapper.
    def initialize(cache,
                   paths,
                   interval,
                   concurrency = 3,
                   curl_opts = nil,
                   debug = true,
                   **tags)
      @cache = cache
      @interval = interval
      @paths = paths
      @tags = tags
      @curl_opts = curl_opts || UriWrapper::DEFAULT_CURL_OPTS
      @concurrency = concurrency
      @debug = debug
    end

    # Fetch all the paths, once per tag.
    # When a path has been fetched for every tag, block will be called with the
    # path, and a hash of tag => UriWrapper::ReadResult objects.
    def run(&block)
      @callback = block
      @hydra = Typhoeus::Hydra.new(max_concurrency: @concurrency)
      @paths.each { |path| queue_path(path) }
      # Blocks until every queued request has completed.
      @hydra.run
    end

    private

    # Queue a path for fetching
    #
    # For each tag the result comes from (in priority order): the cache, an
    # immediate "Not cached" error when no base URL exists, or a queued
    # network fetch whose callback fires later during @hydra.run.
    def queue_path(path)
      results = {}

      @tags.each do |tag, base|
        if (res = @cache.get(tag, path))
          # Cache hit: no network request needed.
          results[tag] = res
          process_results(path, results)
        elsif !base
          # We only have the cache, but this item isn't cached!
          results[tag] = UriWrapper::ReadResult.error('Not cached')
          process_results(path, results)
        else
          uri = UriWrapper.new(base + path, @curl_opts, @debug)
          uri.queue(@hydra) do |resl|
            # Insert delay to limit fetching rate
            if @interval != 0
              SiteDiff.log("Waiting #{@interval} milliseconds.", :info)
              sleep(@interval / 1000.0)
            end
            @cache.set(tag, path, resl)
            results[tag] = resl
            process_results(path, results)
          end
        end
      end
    end

    # Process fetch results
    #
    # Invokes the callback only once every tag has produced a result for
    # this path; earlier partial calls return without firing it.
    def process_results(path, results)
      return unless results.size == @tags.size

      @callback[path, results]
    end
  end
end
|