sitediff 0.0.2 → 1.1.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/bin/sitediff +9 -3
- data/lib/sitediff.rb +153 -79
- data/lib/sitediff/api.rb +265 -0
- data/lib/sitediff/cache.rb +110 -47
- data/lib/sitediff/cli.rb +219 -165
- data/lib/sitediff/config.rb +439 -58
- data/lib/sitediff/config/creator.rb +93 -99
- data/lib/sitediff/config/preset.rb +75 -0
- data/lib/sitediff/crawler.rb +108 -72
- data/lib/sitediff/diff.rb +60 -12
- data/lib/sitediff/exception.rb +3 -1
- data/lib/sitediff/fetch.rb +62 -41
- data/lib/sitediff/files/diff.html.erb +20 -2
- data/lib/sitediff/files/jquery.min.js +2 -0
- data/lib/sitediff/files/normalize.css +349 -0
- data/lib/sitediff/files/report.html.erb +171 -0
- data/lib/sitediff/files/sidebyside.html.erb +5 -2
- data/lib/sitediff/files/sitediff.css +303 -30
- data/lib/sitediff/files/sitediff.js +367 -0
- data/lib/sitediff/report.rb +254 -0
- data/lib/sitediff/result.rb +59 -23
- data/lib/sitediff/sanitize.rb +222 -150
- data/lib/sitediff/sanitize/dom_transform.rb +111 -73
- data/lib/sitediff/sanitize/regexp.rb +69 -43
- data/lib/sitediff/uriwrapper.rb +104 -34
- data/lib/sitediff/webserver.rb +89 -77
- data/lib/sitediff/webserver/resultserver.rb +113 -77
- metadata +92 -76
- data/lib/sitediff/files/html_report.html.erb +0 -63
- data/lib/sitediff/files/rules/drupal.yaml +0 -33
- data/lib/sitediff/rules.rb +0 -65
@@ -1,122 +1,116 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
require 'sitediff/cache'
|
2
4
|
require 'sitediff/config'
|
3
5
|
require 'sitediff/crawler'
|
4
|
-
require 'sitediff/rules'
|
5
6
|
require 'pathname'
|
6
7
|
require 'typhoeus'
|
7
8
|
require 'yaml'
|
8
9
|
|
9
10
|
class SiteDiff
|
10
|
-
class Config
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
end
|
23
|
-
end
|
24
|
-
|
25
|
-
# Build a config structure, return it
|
26
|
-
def create(opts, &block)
|
27
|
-
@config = {}
|
28
|
-
@callback = block
|
29
|
-
|
30
|
-
# Handle options
|
31
|
-
@dir = Pathname.new(opts[:directory])
|
32
|
-
@depth = opts[:depth]
|
33
|
-
@rules = Rules.new(@config, opts[:rules_disabled]) if opts[:rules]
|
34
|
-
|
35
|
-
# Create the dir. Must go before cache initialization!
|
36
|
-
@dir.mkpath unless @dir.directory?
|
11
|
+
class Config
|
12
|
+
##
|
13
|
+
# SiteDiff Config Creator Object.
|
14
|
+
class Creator
|
15
|
+
##
|
16
|
+
# Creates a Creator object.
|
17
|
+
def initialize(debug, before, after)
|
18
|
+
@config = nil
|
19
|
+
@before = before
|
20
|
+
@after = after
|
21
|
+
@debug = debug
|
22
|
+
end
|
37
23
|
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
24
|
+
##
|
25
|
+
# Returns the root URLs keyed by 'before' and 'after'; 'before' falls back to the 'after' URL when unset.
|
26
|
+
def roots
|
27
|
+
@roots = { 'after' => @after }
|
28
|
+
@roots['before'] = @before || @after
|
29
|
+
@roots
|
30
|
+
end
|
43
31
|
|
44
|
-
|
45
|
-
|
46
|
-
|
32
|
+
##
|
33
|
+
# Build a config structure, return it.
|
34
|
+
def create(options)
|
35
|
+
@config = {}
|
47
36
|
|
48
|
-
|
49
|
-
%w[before after].each do |tag|
|
50
|
-
next unless u = roots[tag.to_sym]
|
51
|
-
@config[tag] = {'url' => u}
|
52
|
-
end
|
37
|
+
# @callback = block
|
53
38
|
|
54
|
-
|
55
|
-
@cache.close
|
56
|
-
@rules.add_config if @rules
|
39
|
+
@dir = Pathname.new(options[:directory])
|
57
40
|
|
58
|
-
|
59
|
-
|
41
|
+
# Setup instance vars
|
42
|
+
@paths = Hash.new { |h, k| h[k] = Set.new }
|
43
|
+
@cache = Cache.new(directory: @dir.to_s, create: true)
|
44
|
+
@cache.write_tags << :before << :after
|
60
45
|
|
61
|
-
|
62
|
-
|
63
|
-
roots.each do |tag, u|
|
64
|
-
Crawler.new(hydra, u, depth) do |info|
|
65
|
-
crawled_path(tag, info)
|
46
|
+
build_config options
|
47
|
+
write_config
|
66
48
|
end
|
67
|
-
end
|
68
|
-
hydra.run
|
69
|
-
end
|
70
|
-
|
71
|
-
# Deduplicate paths with slashes at the end
|
72
|
-
def canonicalize(tag, path)
|
73
|
-
def altered_paths(path)
|
74
|
-
yield path + '/'
|
75
|
-
yield path.sub(%r[/$], '')
|
76
|
-
end
|
77
|
-
|
78
|
-
return path.empty? ? '/' : path
|
79
|
-
end
|
80
|
-
|
81
|
-
def crawled_path(tag, info)
|
82
|
-
path, dup = canonicalize(tag, info.relative)
|
83
|
-
return if dup
|
84
49
|
|
85
|
-
|
50
|
+
##
|
51
|
+
# Build and populate the config object which is being created.
|
52
|
+
#
|
53
|
+
# @param [Hash] options
|
54
|
+
# One or more options.
|
55
|
+
def build_config(options)
|
56
|
+
options = Config.stringify_keys options
|
57
|
+
|
58
|
+
# Build config for "before" and "after".
|
59
|
+
%w[before after].each do |tag|
|
60
|
+
next unless (url = roots[tag])
|
61
|
+
|
62
|
+
@config[tag] = { 'url' => url }
|
63
|
+
end
|
64
|
+
|
65
|
+
# Build other settings.
|
66
|
+
@config['settings'] = {}
|
67
|
+
Config::ALLOWED_SETTINGS_KEYS.each do |key|
|
68
|
+
@config['settings'][key] = options[key]
|
69
|
+
end
|
70
|
+
end
|
86
71
|
|
87
|
-
|
88
|
-
|
89
|
-
|
72
|
+
##
|
73
|
+
# Create a gitignore if we seem to be in git.
|
74
|
+
def make_gitignore(dir)
|
75
|
+
# Check if we're in git
|
76
|
+
unless dir.realpath.to_enum(:ascend).any? { |d| d.+('.git').exist? }
|
77
|
+
return
|
78
|
+
end
|
79
|
+
|
80
|
+
dir.+('.gitignore').open('w') do |f|
|
81
|
+
f.puts <<-GITIGNORE.gsub(/^\s+/, '')
|
82
|
+
# Directories.
|
83
|
+
diffs
|
84
|
+
snapshot
|
85
|
+
|
86
|
+
# Files.
|
87
|
+
settings.yaml
|
88
|
+
paths.txt
|
89
|
+
failures.txt
|
90
|
+
GITIGNORE
|
91
|
+
end
|
92
|
+
end
|
90
93
|
|
91
|
-
|
92
|
-
|
94
|
+
##
|
95
|
+
# Returns the config directory (the Pathname assigned by #create).
|
96
|
+
def directory
|
97
|
+
@dir
|
98
|
+
end
|
93
99
|
|
94
|
-
|
95
|
-
|
100
|
+
##
|
101
|
+
# Returns the path to the config file inside the config directory.
|
102
|
+
def config_file
|
103
|
+
@dir + Config::DEFAULT_FILENAME
|
104
|
+
end
|
96
105
|
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
cache.db
|
106
|
-
cache.db.db
|
107
|
-
EOF
|
106
|
+
##
|
107
|
+
# Writes the built config into the config file.
|
108
|
+
# TODO: Exclude default params before writing.
|
109
|
+
def write_config
|
110
|
+
make_gitignore(@dir)
|
111
|
+
data = Config.remove_defaults(@config)
|
112
|
+
config_file.open('w') { |f| f.puts data.to_yaml }
|
113
|
+
end
|
108
114
|
end
|
109
115
|
end
|
110
|
-
|
111
|
-
def config_file
|
112
|
-
@dir + Config::DEFAULT_FILENAME
|
113
|
-
end
|
114
|
-
|
115
|
-
# Turn a config structure into a config file
|
116
|
-
def write_config
|
117
|
-
make_gitignore(@dir)
|
118
|
-
config_file.open('w') { |f| f.puts @config.to_yaml }
|
119
|
-
end
|
120
|
-
end
|
121
|
-
end
|
122
116
|
end
|
@@ -0,0 +1,75 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'pathname'
|
4
|
+
require 'sitediff/config'
|
5
|
+
|
6
|
+
class SiteDiff
|
7
|
+
class Config
|
8
|
+
##
|
9
|
+
# Preset helper.
|
10
|
+
class Preset
|
11
|
+
##
|
12
|
+
# Directory in which presets live.
|
13
|
+
#
|
14
|
+
# TODO: Move this outside "lib".
|
15
|
+
DIRECTORY = (Pathname.new(__dir__).dirname + 'presets').freeze
|
16
|
+
|
17
|
+
##
|
18
|
+
# Reads preset rules.
|
19
|
+
#
|
20
|
+
# @param [String] name
|
21
|
+
# Name of the preset to read.
|
22
|
+
#
|
23
|
+
# @return [Hash]
|
24
|
+
# A hash containing the preset's rules.
|
25
|
+
def self.read(name)
|
26
|
+
@cache = {} if @cache.nil?
|
27
|
+
|
28
|
+
# Load and cache preset config.
|
29
|
+
if @cache[name].nil?
|
30
|
+
exist? name, true
|
31
|
+
@cache[name] = Config.load_conf file(name)
|
32
|
+
end
|
33
|
+
|
34
|
+
@cache[name]
|
35
|
+
end
|
36
|
+
|
37
|
+
##
|
38
|
+
# Get the names of all available presets.
|
39
|
+
#
|
40
|
+
# @return [Array]
|
41
|
+
# All presets.
|
42
|
+
def self.all
|
43
|
+
# Load and cache preset names.
|
44
|
+
if @all.nil?
|
45
|
+
@all = []
|
46
|
+
pattern = DIRECTORY + '*.yaml'
|
47
|
+
Dir.glob(pattern) do |file|
|
48
|
+
@all << File.basename(file, '.yaml')
|
49
|
+
end
|
50
|
+
end
|
51
|
+
|
52
|
+
@all
|
53
|
+
end
|
54
|
+
|
55
|
+
##
|
56
|
+
# Checks whether a preset exists.
|
57
|
+
def self.exist?(name, exception = false)
|
58
|
+
result = File.exist? file(name)
|
59
|
+
|
60
|
+
# Raise an exception, if required.
|
61
|
+
if exception && !result
|
62
|
+
raise Config::InvalidConfig, "Preset not found: #{name}"
|
63
|
+
end
|
64
|
+
|
65
|
+
result
|
66
|
+
end
|
67
|
+
|
68
|
+
##
|
69
|
+
# Returns the path to a preset file.
|
70
|
+
def self.file(name)
|
71
|
+
DIRECTORY + "#{name}.yaml"
|
72
|
+
end
|
73
|
+
end
|
74
|
+
end
|
75
|
+
end
|
data/lib/sitediff/crawler.rb
CHANGED
@@ -1,3 +1,5 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
require 'sitediff'
|
2
4
|
require 'sitediff/uriwrapper'
|
3
5
|
require 'addressable/uri'
|
@@ -6,90 +8,124 @@ require 'ostruct'
|
|
6
8
|
require 'set'
|
7
9
|
|
8
10
|
class SiteDiff
|
9
|
-
|
10
|
-
class
|
11
|
-
|
12
|
-
|
11
|
+
# SiteDiff Crawler.
|
12
|
+
class Crawler
|
13
|
+
class Info < OpenStruct; end
|
14
|
+
|
15
|
+
DEFAULT_DEPTH = 3
|
16
|
+
|
17
|
+
# Create a crawler with a base URL
|
18
|
+
def initialize(hydra, base,
|
19
|
+
interval,
|
20
|
+
include_regex,
|
21
|
+
exclude_regex,
|
22
|
+
depth = DEFAULT_DEPTH,
|
23
|
+
curl_opts = UriWrapper::DEFAULT_CURL_OPTS,
|
24
|
+
debug = true,
|
25
|
+
&block)
|
26
|
+
@hydra = hydra
|
27
|
+
@base_uri = Addressable::URI.parse(base)
|
28
|
+
@base = base
|
29
|
+
@interval = interval
|
30
|
+
@include_regex = include_regex
|
31
|
+
@exclude_regex = exclude_regex
|
32
|
+
@found = Set.new
|
33
|
+
@callback = block
|
34
|
+
@curl_opts = curl_opts
|
35
|
+
@debug = debug
|
36
|
+
|
37
|
+
add_uri('', depth)
|
38
|
+
end
|
13
39
|
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
@base_uri = Addressable::URI.parse(base)
|
18
|
-
@base = base
|
19
|
-
@found = Set.new
|
20
|
-
@callback = block
|
40
|
+
# Handle a newly found relative URI
|
41
|
+
def add_uri(rel, depth)
|
42
|
+
return if @found.include? rel
|
21
43
|
|
22
|
-
|
23
|
-
end
|
44
|
+
@found << rel
|
24
45
|
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
wrapper = UriWrapper.new(@base + rel)
|
31
|
-
wrapper.queue(@hydra) do |res|
|
32
|
-
fetched_uri(rel, depth, res)
|
46
|
+
wrapper = UriWrapper.new(@base + rel, @curl_opts, @debug)
|
47
|
+
wrapper.queue(@hydra) do |res|
|
48
|
+
fetched_uri(rel, depth, res)
|
49
|
+
end
|
33
50
|
end
|
34
|
-
end
|
35
51
|
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
52
|
+
# Handle the fetch of a URI
|
53
|
+
def fetched_uri(rel, depth, res)
|
54
|
+
if res.error
|
55
|
+
SiteDiff.log(res.error, :error)
|
56
|
+
return
|
57
|
+
elsif !res.content
|
58
|
+
SiteDiff.log('Response is missing content. Treating as an error.', :error)
|
59
|
+
return
|
60
|
+
end
|
61
|
+
|
62
|
+
base = Addressable::URI.parse(@base + rel)
|
63
|
+
doc = Nokogiri::HTML(res.content)
|
64
|
+
|
65
|
+
# Call the callback
|
66
|
+
info = Info.new(
|
67
|
+
relative: rel,
|
68
|
+
uri: base,
|
69
|
+
read_result: res,
|
70
|
+
document: doc
|
71
|
+
)
|
72
|
+
# Insert delay to limit fetching rate
|
73
|
+
if @interval != 0
|
74
|
+
SiteDiff.log("Waiting #{@interval} milliseconds.", :info)
|
75
|
+
sleep(@interval / 1000.0)
|
76
|
+
end
|
77
|
+
@callback[info]
|
78
|
+
|
79
|
+
return unless depth >= 1
|
80
|
+
|
81
|
+
# Find links
|
82
|
+
links = find_links(doc)
|
83
|
+
uris = links.map { |l| resolve_link(base, l) }.compact
|
84
|
+
uris = filter_links(uris)
|
85
|
+
|
86
|
+
# Make them relative
|
87
|
+
rels = uris.map { |u| relativize_link(u) }
|
88
|
+
|
89
|
+
# Queue them in turn
|
90
|
+
rels.each do |r|
|
91
|
+
next if @found.include? r
|
92
|
+
|
93
|
+
add_uri(r, depth - 1)
|
94
|
+
end
|
65
95
|
end
|
66
|
-
end
|
67
96
|
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
return base + rel
|
97
|
+
# Resolve a potentially-relative link. Return nil on error.
|
98
|
+
def resolve_link(base, rel)
|
99
|
+
base + rel
|
72
100
|
rescue Addressable::URI::InvalidURIError
|
73
|
-
SiteDiff.log "skipped invalid URL: '#{rel}'", :
|
74
|
-
|
101
|
+
SiteDiff.log "skipped invalid URL: '#{rel}' (at #{base})", :warning
|
102
|
+
nil
|
75
103
|
end
|
76
|
-
end
|
77
104
|
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
105
|
+
# Make a link relative to @base_uri
|
106
|
+
def relativize_link(uri)
|
107
|
+
uri.path.slice(@base_uri.path.length, uri.path.length)
|
108
|
+
end
|
82
109
|
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
|
110
|
+
# Return a list of string links found on a page.
|
111
|
+
def find_links(doc)
|
112
|
+
doc.xpath('//a[@href]').map { |e| e['href'] }
|
113
|
+
end
|
87
114
|
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
|
115
|
+
# Filter out links we don't want. Links passed in are absolute URIs.
|
116
|
+
def filter_links(uris)
|
117
|
+
uris.find_all do |u|
|
118
|
+
is_sub_uri = (u.host == @base_uri.host) &&
|
119
|
+
u.path.start_with?(@base_uri.path)
|
120
|
+
next unless is_sub_uri
|
121
|
+
|
122
|
+
is_included = @include_regex.nil? ? false : @include_regex.match(u.path)
|
123
|
+
is_excluded = @exclude_regex.nil? ? false : @exclude_regex.match(u.path)
|
124
|
+
if is_excluded && !is_included
|
125
|
+
SiteDiff.log "Ignoring excluded URL #{u.path}", :info
|
126
|
+
end
|
127
|
+
is_included || !is_excluded
|
128
|
+
end
|
92
129
|
end
|
93
130
|
end
|
94
131
|
end
|
95
|
-
end
|