sitediff 0.0.2 → 1.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/bin/sitediff +9 -3
- data/lib/sitediff.rb +153 -79
- data/lib/sitediff/api.rb +265 -0
- data/lib/sitediff/cache.rb +110 -47
- data/lib/sitediff/cli.rb +219 -165
- data/lib/sitediff/config.rb +439 -58
- data/lib/sitediff/config/creator.rb +93 -99
- data/lib/sitediff/config/preset.rb +75 -0
- data/lib/sitediff/crawler.rb +108 -72
- data/lib/sitediff/diff.rb +60 -12
- data/lib/sitediff/exception.rb +3 -1
- data/lib/sitediff/fetch.rb +62 -41
- data/lib/sitediff/files/diff.html.erb +20 -2
- data/lib/sitediff/files/jquery.min.js +2 -0
- data/lib/sitediff/files/normalize.css +349 -0
- data/lib/sitediff/files/report.html.erb +171 -0
- data/lib/sitediff/files/sidebyside.html.erb +5 -2
- data/lib/sitediff/files/sitediff.css +303 -30
- data/lib/sitediff/files/sitediff.js +367 -0
- data/lib/sitediff/report.rb +254 -0
- data/lib/sitediff/result.rb +59 -23
- data/lib/sitediff/sanitize.rb +222 -150
- data/lib/sitediff/sanitize/dom_transform.rb +111 -73
- data/lib/sitediff/sanitize/regexp.rb +69 -43
- data/lib/sitediff/uriwrapper.rb +104 -34
- data/lib/sitediff/webserver.rb +89 -77
- data/lib/sitediff/webserver/resultserver.rb +113 -77
- metadata +92 -76
- data/lib/sitediff/files/html_report.html.erb +0 -63
- data/lib/sitediff/files/rules/drupal.yaml +0 -33
- data/lib/sitediff/rules.rb +0 -65
data/lib/sitediff/config/creator.rb
CHANGED
@@ -1,122 +1,116 @@
+# frozen_string_literal: true
+
 require 'sitediff/cache'
 require 'sitediff/config'
 require 'sitediff/crawler'
-require 'sitediff/rules'
 require 'pathname'
 require 'typhoeus'
 require 'yaml'

 class SiteDiff
-class Config
-end
-end
-# Build a config structure, return it
-def create(opts, &block)
-@config = {}
-@callback = block
-# Handle options
-@dir = Pathname.new(opts[:directory])
-@depth = opts[:depth]
-@rules = Rules.new(@config, opts[:rules_disabled]) if opts[:rules]
-# Create the dir. Must go before cache initialization!
-@dir.mkpath unless @dir.directory?
+class Config
+##
+# SiteDiff Config Creator Object.
+class Creator
+##
+# Creates a Creator object.
+def initialize(debug, before, after)
+@config = nil
+@before = before
+@after = after
+@debug = debug
+end

+##
+# Determine if we're dealing with one or two URLs.
+def roots
+@roots = { 'after' => @after }
+@roots['before'] = @before || @after
+@roots
+end

+##
+# Build a config structure, return it.
+def create(options)
+@config = {}

-%w[before after].each do |tag|
-next unless u = roots[tag.to_sym]
-@config[tag] = {'url' => u}
-end
+# @callback = block

-@cache.close
-@rules.add_config if @rules
+@dir = Pathname.new(options[:directory])

+# Setup instance vars
+@paths = Hash.new { |h, k| h[k] = Set.new }
+@cache = Cache.new(directory: @dir.to_s, create: true)
+@cache.write_tags << :before << :after

-roots.each do |tag, u|
-Crawler.new(hydra, u, depth) do |info|
-crawled_path(tag, info)
+build_config options
+write_config
 end
-end
-hydra.run
-end
-# Deduplicate paths with slashes at the end
-def canonicalize(tag, path)
-def altered_paths(path)
-yield path + '/'
-yield path.sub(%r[/$], '')
-end
-return path.empty? ? '/' : path
-end
-def crawled_path(tag, info)
-path, dup = canonicalize(tag, info.relative)
-return if dup

+##
+# Build and populate the config object which is being created.
+#
+# @param [String] options
+# One or more options.
+def build_config(options)
+options = Config.stringify_keys options
+
+# Build config for "before" and "after".
+%w[before after].each do |tag|
+next unless (url = roots[tag])
+
+@config[tag] = { 'url' => url }
+end
+
+# Build other settings.
+@config['settings'] = {}
+Config::ALLOWED_SETTINGS_KEYS.each do |key|
+@config['settings'][key] = options[key]
+end
+end

+##
+# Create a gitignore if we seem to be in git.
+def make_gitignore(dir)
+# Check if we're in git
+unless dir.realpath.to_enum(:ascend).any? { |d| d.+('.git').exist? }
+return
+end
+
+dir.+('.gitignore').open('w') do |f|
+f.puts <<-GITIGNORE.gsub(/^\s+/, '')
+# Directories.
+diffs
+snapshot
+
+# Files.
+settings.yaml
+paths.txt
+failures.txt
+GITIGNORE
+end
+end

+##
+# Returns the name of the config directory.
+def directory
+@dir
+end

+##
+# Returns the name of the config file.
+def config_file
+@dir + Config::DEFAULT_FILENAME
+end

-cache.db
-cache.db.db
-EOF
+##
+# Writes the built config into the config file.
+# TODO: Exclude default params before writing.
+def write_config
+make_gitignore(@dir)
+data = Config.remove_defaults(@config)
+config_file.open('w') { |f| f.puts data.to_yaml }
+end
 end
 end
-def config_file
-@dir + Config::DEFAULT_FILENAME
-end
-# Turn a config structure into a config file
-def write_config
-make_gitignore(@dir)
-config_file.open('w') { |f| f.puts @config.to_yaml }
-end
-end
-end
 end
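Creator's public surface changes from the old block-based `create(opts, &block)` to an explicit constructor plus an options hash. A minimal sketch of driving the new API, assuming only the signatures visible in the hunk above; the directory name, URLs, and settings keys are placeholder values:

```ruby
require 'sitediff/config/creator'

# Hypothetical values; only the method signatures come from the diff above.
creator = SiteDiff::Config::Creator.new(
  false,                        # debug
  'http://before.example.com',  # before
  'http://after.example.com'    # after
)

# #create builds the config hash (build_config) and writes it next to the
# cache (write_config). Settings are copied from the options hash for each
# key in Config::ALLOWED_SETTINGS_KEYS; :depth and :interval here are
# illustrative guesses, not confirmed keys.
creator.create(
  { directory: 'sitediff-output', depth: 3, interval: 0 }
)

puts creator.config_file # Pathname: directory + Config::DEFAULT_FILENAME
```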
data/lib/sitediff/config/preset.rb
ADDED
@@ -0,0 +1,75 @@
+# frozen_string_literal: true
+
+require 'pathname'
+require 'sitediff/config'
+
+class SiteDiff
+class Config
+##
+# Preset helper.
+class Preset
+##
+# Directory in which presets live.
+#
+# TODO: Move this outside "lib".
+DIRECTORY = (Pathname.new(__dir__).dirname + 'presets').freeze
+
+##
+# Reads preset rules.
+#
+# @param [String] preset
+# Presets
+#
+# @return [Hash]
+# A hash containing the preset's rules.
+def self.read(name)
+@cache = {} if @cache.nil?
+
+# Load and cache preset config.
+if @cache[name].nil?
+exist? name, true
+@cache[name] = Config.load_conf file(name)
+end
+
+@cache[name]
+end
+
+##
+# Get all possible rules.
+#
+# @return [Array]
+# All presets.
+def self.all
+# Load and cache preset names.
+if @all.nil?
+@all = []
+pattern = DIRECTORY + '*.yaml'
+Dir.glob(pattern) do |file|
+@all << File.basename(file, '.yaml')
+end
+end
+
+@all
+end
+
+##
+# Checks whether a preset exists.
+def self.exist?(name, exception = false)
+result = File.exist? file(name)
+
+# Raise an exception, if required.
+if exception && !result
+raise Config::InvalidConfig, "Preset not found: #{name}"
+end
+
+result
+end
+
+##
+# Returns the path to a preset file.
+def self.file(name)
+DIRECTORY + "#{name}.yaml"
+end
+end
+end
+end
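The new Preset class replaces the removed rules machinery (data/lib/sitediff/rules.rb and files/rules/drupal.yaml in the file list above). A short usage sketch based on the class methods in this file; the 'drupal' preset name is an assumption about which YAML files ship in the presets directory:

```ruby
require 'sitediff/config/preset'

# Enumerate preset names discovered as *.yaml files under the presets directory.
SiteDiff::Config::Preset.all.each { |name| puts name }

# Read one preset's rules (memoized in @cache). Preset.read calls
# `exist? name, true`, so a missing preset raises Config::InvalidConfig.
if SiteDiff::Config::Preset.exist?('drupal')
  rules = SiteDiff::Config::Preset.read('drupal')
  puts rules.inspect
end
```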
data/lib/sitediff/crawler.rb
CHANGED
@@ -1,3 +1,5 @@
+# frozen_string_literal: true
+
 require 'sitediff'
 require 'sitediff/uriwrapper'
 require 'addressable/uri'
@@ -6,90 +8,124 @@ require 'ostruct'
 require 'set'

 class SiteDiff
+# SiteDiff Crawler.
+class Crawler
+class Info < OpenStruct; end
+
+DEFAULT_DEPTH = 3
+
+# Create a crawler with a base URL
+def initialize(hydra, base,
+interval,
+include_regex,
+exclude_regex,
+depth = DEFAULT_DEPTH,
+curl_opts = UriWrapper::DEFAULT_CURL_OPTS,
+debug = true,
+&block)
+@hydra = hydra
+@base_uri = Addressable::URI.parse(base)
+@base = base
+@interval = interval
+@include_regex = include_regex
+@exclude_regex = exclude_regex
+@found = Set.new
+@callback = block
+@curl_opts = curl_opts
+@debug = debug
+
+add_uri('', depth)
+end

-@base_uri = Addressable::URI.parse(base)
-@base = base
-@found = Set.new
-@callback = block
+# Handle a newly found relative URI
+def add_uri(rel, depth)
+return if @found.include? rel

-end
+@found << rel

-wrapper = UriWrapper.new(@base + rel)
-wrapper.queue(@hydra) do |res|
-fetched_uri(rel, depth, res)
+wrapper = UriWrapper.new(@base + rel, @curl_opts, @debug)
+wrapper.queue(@hydra) do |res|
+fetched_uri(rel, depth, res)
+end
 end
-end

+# Handle the fetch of a URI
+def fetched_uri(rel, depth, res)
+if res.error
+SiteDiff.log(res.error, :error)
+return
+elsif !res.content
+SiteDiff.log('Response is missing content. Treating as an error.', :error)
+return
+end
+
+base = Addressable::URI.parse(@base + rel)
+doc = Nokogiri::HTML(res.content)
+
+# Call the callback
+info = Info.new(
+relative: rel,
+uri: base,
+read_result: res,
+document: doc
+)
+# Insert delay to limit fetching rate
+if @interval != 0
+SiteDiff.log("Waiting #{@interval} milliseconds.", :info)
+sleep(@interval / 1000.0)
+end
+@callback[info]
+
+return unless depth >= 1
+
+# Find links
+links = find_links(doc)
+uris = links.map { |l| resolve_link(base, l) }.compact
+uris = filter_links(uris)
+
+# Make them relative
+rels = uris.map { |u| relativize_link(u) }
+
+# Queue them in turn
+rels.each do |r|
+next if @found.include? r
+
+add_uri(r, depth - 1)
+end
 end
-end

-return base + rel
+# Resolve a potentially-relative link. Return nil on error.
+def resolve_link(base, rel)
+base + rel
 rescue Addressable::URI::InvalidURIError
-SiteDiff.log "skipped invalid URL: '#{rel}'", :
+SiteDiff.log "skipped invalid URL: '#{rel}' (at #{base})", :warning
+nil
 end
-end

+# Make a link relative to @base_uri
+def relativize_link(uri)
+uri.path.slice(@base_uri.path.length, uri.path.length)
+end

+# Return a list of string links found on a page.
+def find_links(doc)
+doc.xpath('//a[@href]').map { |e| e['href'] }
+end

+# Filter out links we don't want. Links passed in are absolute URIs.
+def filter_links(uris)
+uris.find_all do |u|
+is_sub_uri = (u.host == @base_uri.host) &&
+u.path.start_with?(@base_uri.path)
+next unless is_sub_uri
+
+is_included = @include_regex.nil? ? false : @include_regex.match(u.path)
+is_excluded = @exclude_regex.nil? ? false : @exclude_regex.match(u.path)
+if is_excluded && !is_included
+SiteDiff.log "Ignoring excluded URL #{u.path}", :info
+end
+is_included || !is_excluded
+end
 end
 end
 end
-end
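The crawler constructor now threads a rate-limit interval, include/exclude regexes, curl options and a debug flag, and queues its own requests on the supplied hydra. A minimal sketch of wiring it up, assuming only the parameters shown in the hunk above; the URL and regex are placeholder values:

```ruby
require 'typhoeus'
require 'sitediff/crawler'
require 'sitediff/uriwrapper'

hydra = Typhoeus::Hydra.new(max_concurrency: 3)

# Arguments follow the new initialize signature:
# (hydra, base, interval, include_regex, exclude_regex, depth, curl_opts, debug, &block)
SiteDiff::Crawler.new(
  hydra,
  'https://example.com',                    # base (placeholder)
  0,                                        # interval in ms; 0 skips the sleep
  nil,                                      # include_regex
  %r{/(user|admin)/},                       # exclude_regex (placeholder)
  3,                                        # depth
  SiteDiff::UriWrapper::DEFAULT_CURL_OPTS,  # curl_opts
  false                                     # debug
) do |info|
  # info is the Info OpenStruct built in fetched_uri:
  # relative, uri, read_result, document (a Nokogiri doc).
  puts "#{info.relative} -> #{info.document.title}"
end

hydra.run
```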