sitediff 1.1.1 → 1.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.eslintignore +1 -0
- data/.eslintrc.json +28 -0
- data/.project +11 -0
- data/.rubocop.yml +179 -0
- data/.rubocop_todo.yml +51 -0
- data/CHANGELOG.md +33 -0
- data/Dockerfile +33 -0
- data/Gemfile +11 -0
- data/Gemfile.lock +85 -0
- data/INSTALLATION.md +146 -0
- data/LICENSE +339 -0
- data/README.md +810 -0
- data/Rakefile +12 -0
- data/Thorfile +135 -0
- data/config/.gitkeep +0 -0
- data/config/sanitize_domains.example.yaml +8 -0
- data/config/sitediff.example.yaml +81 -0
- data/docker-compose.test.yml +3 -0
- data/lib/sitediff/api.rb +24 -7
- data/lib/sitediff/cache.rb +5 -3
- data/lib/sitediff/cli.rb +4 -3
- data/lib/sitediff/config/creator.rb +13 -13
- data/lib/sitediff/config/preset.rb +6 -6
- data/lib/sitediff/config.rb +9 -9
- data/lib/sitediff/crawler.rb +15 -5
- data/lib/sitediff/diff.rb +1 -1
- data/lib/sitediff/fetch.rb +2 -2
- data/lib/sitediff/files/report.html.erb +1 -1
- data/lib/sitediff/presets/drupal.yaml +63 -0
- data/lib/sitediff/report.rb +6 -6
- data/lib/sitediff/result.rb +5 -5
- data/lib/sitediff/sanitize/dom_transform.rb +2 -2
- data/lib/sitediff/sanitize/regexp.rb +2 -2
- data/lib/sitediff/sanitize.rb +5 -5
- data/lib/sitediff/uriwrapper.rb +12 -13
- data/lib/sitediff/webserver/resultserver.rb +2 -0
- data/lib/sitediff/webserver.rb +3 -0
- data/lib/sitediff.rb +9 -9
- data/misc/sitediff - overview report.png +0 -0
- data/misc/sitediff - page report.png +0 -0
- data/package-lock.json +878 -0
- data/package.json +25 -0
- data/sitediff.gemspec +51 -0
- metadata +62 -18
data/Rakefile
ADDED
data/Thorfile
ADDED
@@ -0,0 +1,135 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
# frozen_string_literal: true
|
3
|
+
|
4
|
+
# TODO: Determine the utility of this file.
|
5
|
+
|
6
|
+
LIB_DIR = File.join(File.dirname(__FILE__), 'lib')
|
7
|
+
$LOAD_PATH << LIB_DIR
|
8
|
+
require 'sitediff/webserver'
|
9
|
+
require 'sitediff/webserver/resultserver'
|
10
|
+
|
11
|
+
# Thor Base class.
#
# Shared parent of the task classes in this Thorfile. It stores the value of
# the "local" option in @local and provides executable(), which subclasses
# use to build the shell commands they run.
class Base < Thor
  method_options local: true
  # Adds the option to all Base subclasses.
  # method_options() takes different arguments than option().
  def initialize(*args)
    super(*args)
    @local = options['local']
  end

  # gives us run()
  include Thor::Actions

  # Thor, by default, exits with 0 no matter what!
  def self.exit_on_failure?
    true
  end

  protected

  # Builds the command used to invoke an executable.
  #
  # When @local is set, 'sitediff' is replaced by './bin/sitediff' and the
  # command is prefixed with 'bundle exec'. Otherwise the name is returned
  # as given, without any leading whitespace.
  #
  # @param gem [String] name of the executable.
  # @return [String] the command to run.
  def executable(gem)
    gem = './bin/sitediff' if @local && gem == 'sitediff'
    @local ? "bundle exec #{gem}" : gem.to_s
  end
end
|
36
|
+
|
37
|
+
# Thor for Docker.
#
# Tasks that build the sitediff docker image and run commands inside it.
class Docker < Base
  # Name of the docker image built and run by these tasks.
  IMAGE = 'evolvingweb/sitediff'

  desc 'build', 'Build a docker image for sitediff'
  # Make a build image for docker.
  def build
    run "docker build -t #{IMAGE} . "
  end

  desc 'run', 'Run a rake task (or a login shell if none given) inside docker'
  # NOTE: We can't override run() (which is reserved by Thor). Luckily, Thor only
  # checks for the first N necessary characters to match a command with a
  # method. Cf. Thor::normalize_command_name()
  def run_(task = 'bash')
    # File.dirname(__FILE__) may be relative (for example "."), so expand it
    # to an absolute host path before handing it to docker as a bind mount.
    source_dir = File.expand_path(File.dirname(__FILE__))
    docker_opts = ['-t', "-v #{source_dir}:/sitediff"]
    finish_exec(task, docker_opts)
  end

  desc 'compose', 'Run a task inside docker without volume mounting (not supported with compose)'
  # Run a task inside docker without volume mounting.
  def compose(task = 'bash')
    docker_opts = ['-t']
    finish_exec(task, docker_opts)
  end

  no_commands do
    # Prints and runs the "docker run" command for a task.
    #
    # For the 'bash' task, '-i' is appended to docker_opts and bash is run.
    # For any other task, the task is run through thor inside the container.
    #
    # @param task [String] task name, or 'bash'.
    # @param docker_opts [Array<String>] options for "docker run"; may be
    #   appended to.
    def finish_exec(task, docker_opts)
      if task == 'bash'
        cmd = 'bash'
        docker_opts << '-i'
      else
        # pass down the local flag to docker command
        cmd = "#{executable('thor')} #{task} #{@local ? '--local' : '--no-local'}"
      end

      # Build the command once so the printed and executed text cannot differ.
      docker_cmd = "docker run #{docker_opts.join(' ')} #{IMAGE} #{cmd}"
      puts docker_cmd
      run docker_cmd
    end
  end
end
|
78
|
+
|
79
|
+
# Thor for Spec.
#
# Tasks that run the RSpec suites. Each task prints the command and then
# runs it.
class Spec < Base
  desc 'unit', 'run RSpec unit tests'
  # Run RSpec unit tests.
  def unit
    cmd = "#{executable('rspec')} spec/unit"
    puts cmd
    run cmd
  end

  desc 'fixture', 'run RSpec integration tests'
  # Run RSpec integration tests.
  def fixture
    # Print the command that is actually run (spec/fixtures, not spec/unit).
    cmd = "#{executable('rspec')} spec/fixtures"
    puts cmd
    run cmd
  end

  desc 'all', 'runs both unit and fixture tests', hide: true
  # hidden task to lump together multiple tasks
  def all
    unit
    fixture
  end
  default_task :all
end
|
103
|
+
|
104
|
+
# Thor for fixtures.
#
# Tasks that run the sitediff executable against the fixture test cases.
class Fixture < Base
  desc 'local', 'Run a sitediff test case'
  # Runs "sitediff diff" on the CLI fixture configuration.
  def local
    sitediff = executable('sitediff')
    run "#{sitediff} diff --cached=none spec/fixtures/cli/config.yaml"
  end

  desc 'http', 'Run a sitediff test case, using web servers'
  # Runs the diff with a fixture server supplying the before/after URLs,
  # then calls kill on that server.
  def http
    server = http_fixtures(
      "#{executable('sitediff')} diff --cached=none spec/fixtures/cli/config.yaml"
    )
    server.kill
  end

  desc 'serve', 'Serve the result of the fixture test'
  # Runs the diff through a fixture server, then starts a result server and
  # waits on it.
  def serve
    http_fixtures(
      "#{executable('sitediff')} diff --cached=none --paths-file=spec/sites/ruby-doc.org/paths.txt spec/unit/cli/config.yaml"
    )
    result_server = SiteDiff::Webserver::ResultServer.new(nil, 'sitediff', quiet: true)
    result_server.wait
  end

  private

  # HTTP Fixtures.
  #
  # Creates a fixture server, runs cmd with the server's before and after
  # values appended as --before / --after, and returns the server.
  def http_fixtures(cmd)
    SiteDiff::Webserver::FixtureServer.new.tap do |server|
      run "#{cmd} --before #{server.before} --after #{server.after}"
    end
  end
end
|
data/config/.gitkeep
ADDED
File without changes
|
@@ -0,0 +1,81 @@
|
|
1
|
+
# Include other configuration files, merging them with this file.
|
2
|
+
includes:
|
3
|
+
- extra-rules.yaml
|
4
|
+
|
5
|
+
# Settings.
|
6
|
+
#
|
7
|
+
# If you use "sitediff init" with the right parameters, it will generate
|
8
|
+
# this section for you.
|
9
|
+
settings:
|
10
|
+
# Crawl 2 levels deep.
|
11
|
+
depth: 2
|
12
|
+
# Wait for 250ms between requests.
|
13
|
+
interval: 250
|
14
|
+
# Make only 1 request at a time - no simultaneous requests.
|
15
|
+
# Concurrency has to be one when an interval is set.
|
16
|
+
concurrency: 1
|
17
|
+
# Don't follow links to PDF files.
|
18
|
+
exclude: '.*\.pdf'
|
19
|
+
# Curl options, if any.
|
20
|
+
curl_opts:
|
21
|
+
max_recv_speed_large: 10000
|
22
|
+
|
23
|
+
# Rules under this element apply only to the 'before' site.
|
24
|
+
before:
|
25
|
+
# URL of the 'before' version of the site.
|
26
|
+
url: http://localhost/old
|
27
|
+
|
28
|
+
# Sanitizations and DOM transformations, just like the general ones
|
29
|
+
# demonstrated below, but applied only to the 'before' site.
|
30
|
+
dom_transform:
|
31
|
+
- title: Example
|
32
|
+
type: remove
|
33
|
+
selector: div.updates-required
|
34
|
+
|
35
|
+
# Rules under this element apply only to the 'after' site.
|
36
|
+
after:
|
37
|
+
# URL of the 'after' version of the site.
|
38
|
+
url: http://localhost/new
|
39
|
+
|
40
|
+
# The root element to compare.
|
41
|
+
#
|
42
|
+
# Usually, sitediff compares the HTML of the entire page. If you'd rather
|
43
|
+
# check just a subset of the page, specify a selector here. For example, the
|
44
|
+
# line below causes only the body to be compared, ignoring the HTML head.
|
45
|
+
selector: 'body'
|
46
|
+
|
47
|
+
# General regular expression rules, applied to both versions of the site.
|
48
|
+
sanitization:
|
49
|
+
# Normalize input tags containing random tokens.
|
50
|
+
- title: Remove form-build-id
|
51
|
+
pattern: '<input type="hidden" name="form_build_id" value="form-[a-zA-Z0-9_-]+" *\/?>'
|
52
|
+
substitute: '<input type="hidden" name="form_build_id" value="__form_build_id__">'
|
53
|
+
|
54
|
+
# Replace meta property="twitter:*" with meta name="twitter:*".
|
55
|
+
- title: Meta 'property' changed to 'name'
|
56
|
+
pattern: 'property="twitter:'
|
57
|
+
substitute: 'name="twitter:'
|
58
|
+
# 'selector' limits this rule to only within the selected elements.
|
59
|
+
selector: meta
|
60
|
+
# 'path' limits this rule to only certain pages.
|
61
|
+
path: /user
|
62
|
+
|
63
|
+
# General DOM transforms, applied to both versions of the site.
|
64
|
+
dom_transform:
|
65
|
+
# Remove article elements, replacing them with their content
|
66
|
+
- title: Unwrap article elements
|
67
|
+
type: unwrap
|
68
|
+
selector: article
|
69
|
+
|
70
|
+
# Remove classes from divs
|
71
|
+
- title: Remove classes bar and baz from divs
|
72
|
+
type: remove_class
|
73
|
+
selector: div
|
74
|
+
class:
|
75
|
+
- class-bar
|
76
|
+
- class-baz
|
77
|
+
|
78
|
+
# Remove a div selected by its ID.
|
79
|
+
- title: Remove block containing current time.
|
80
|
+
type: remove
|
81
|
+
selector: div#block-time
|
data/lib/sitediff/api.rb
CHANGED
@@ -111,8 +111,8 @@ class SiteDiff
|
|
111
111
|
sitediff = SiteDiff.new(
|
112
112
|
@config,
|
113
113
|
cache,
|
114
|
-
options[:verbose],
|
115
|
-
options[:debug]
|
114
|
+
verbose: options[:verbose],
|
115
|
+
debug: options[:debug]
|
116
116
|
)
|
117
117
|
num_failing = sitediff.run
|
118
118
|
exit_code = num_failing.positive? ? 2 : 0
|
@@ -159,7 +159,13 @@ class SiteDiff
|
|
159
159
|
max_concurrency: @config.setting(:concurrency)
|
160
160
|
)
|
161
161
|
@paths = {}
|
162
|
-
|
162
|
+
|
163
|
+
ignoreAfter = @config.roots
|
164
|
+
if @config.roots['before'] == @config.roots['after']
|
165
|
+
ignoreAfter.delete('after')
|
166
|
+
end
|
167
|
+
|
168
|
+
ignoreAfter.each do |tag, url|
|
163
169
|
Crawler.new(
|
164
170
|
hydra,
|
165
171
|
url,
|
@@ -168,7 +174,7 @@ class SiteDiff
|
|
168
174
|
@config.setting(:exclude),
|
169
175
|
@config.setting(:depth),
|
170
176
|
@config.curl_opts,
|
171
|
-
@debug
|
177
|
+
debug: @debug
|
172
178
|
) do |info|
|
173
179
|
SiteDiff.log "Visited #{info.uri}, cached."
|
174
180
|
after_crawl(tag, info)
|
@@ -254,12 +260,23 @@ class SiteDiff
|
|
254
260
|
result = info.read_result
|
255
261
|
|
256
262
|
# Write result to applicable cache.
|
257
|
-
@cache.set(tag, path, result)
|
258
|
-
|
259
|
-
@cache.set(:
|
263
|
+
# @cache.set(tag, path, result)
|
264
|
+
@cache.set(:before, path, result) if tag == 'before'
|
265
|
+
@cache.set(:after, path, result) if tag == 'after'
|
260
266
|
|
261
267
|
# TODO: Restore application of rules.
|
262
268
|
# @rules.handle_page(tag, res.content, info.document) if @rules && !res.error
|
263
269
|
end
|
270
|
+
|
271
|
+
# Builds the curl options hash.
#
# Starts from UriWrapper::DEFAULT_CURL_OPTS, merges options['curl_options']
# and then options['curl_opts'] over it (later merges win), and converts the
# string values 'true' and 'false' to booleans. A new hash is built instead
# of rewriting the merged hash while iterating over it.
#
# @param options [Hash] options with string keys.
# @return [Hash] a new hash of curl options.
def get_curl_opts(options)
  # We do want string keys here
  bool_hash = { 'true' => true, 'false' => false }
  UriWrapper::DEFAULT_CURL_OPTS
    .merge(options['curl_options'] || {})
    .merge(options['curl_opts'] || {})
    .transform_values { |value| bool_hash.fetch(value, value) }
end
|
264
281
|
end
|
265
282
|
end
|
data/lib/sitediff/cache.rb
CHANGED
@@ -71,7 +71,8 @@ class SiteDiff
|
|
71
71
|
rescue Errno::EEXIST
|
72
72
|
curdir = filepath
|
73
73
|
curdir = curdir.parent until curdir.exist?
|
74
|
-
tempname = curdir.dirname
|
74
|
+
tempname = "#{curdir.dirname}/#{curdir.basename}.temporary"
|
75
|
+
# tempname = curdir.dirname + (curdir.basename.to_s + '.temporary')
|
75
76
|
# May cause problems if action is not atomic!
|
76
77
|
# Move existing file to dir/index.html first
|
77
78
|
# Not robust! Should generate an UUID or something.
|
@@ -82,10 +83,11 @@ class SiteDiff
|
|
82
83
|
filepath.dirname.mkpath
|
83
84
|
# Should only happen in strange situations such as when the path
|
84
85
|
# is foo/index.html/bar (i.e., index.html is a directory)
|
85
|
-
if (curdir
|
86
|
+
if File.exist?("#{curdir}/index.html")
|
86
87
|
SiteDiff.log "Overwriting file #{tempname}", :warning
|
87
88
|
end
|
88
|
-
|
89
|
+
File.rename(tempname, "#{curdir}/index.html")
|
90
|
+
# tempname.rename(curdir + 'index.html')
|
89
91
|
end
|
90
92
|
end
|
91
93
|
File.open(filename, 'w') { |file| file.write(Marshal.dump(result)) }
|
data/lib/sitediff/cli.rb
CHANGED
@@ -44,8 +44,8 @@ class SiteDiff
|
|
44
44
|
output = []
|
45
45
|
output.push("Sitediff CLI #{gemspec.version}")
|
46
46
|
if options[:verbose]
|
47
|
-
output.push(
|
48
|
-
output.push(
|
47
|
+
output.push("Website: #{gemspec.homepage}")
|
48
|
+
output.push("GitHub: #{gemspec.metadata['source_code_uri']}")
|
49
49
|
end
|
50
50
|
puts output.join("\n")
|
51
51
|
end
|
@@ -199,11 +199,12 @@ class SiteDiff
|
|
199
199
|
.merge(
|
200
200
|
{
|
201
201
|
after_url: urls.pop,
|
202
|
-
before_url: urls.pop,
|
202
|
+
before_url: urls.pop,
|
203
203
|
directory: get_dir(options['directory']),
|
204
204
|
curl_opts: get_curl_opts(options)
|
205
205
|
}
|
206
206
|
)
|
207
|
+
|
207
208
|
Api.init(api_options)
|
208
209
|
end
|
209
210
|
|
@@ -73,22 +73,22 @@ class SiteDiff
|
|
73
73
|
# Create a gitignore if we seem to be in git.
|
74
74
|
def make_gitignore(dir)
|
75
75
|
# Check if we're in git
|
76
|
-
unless dir.realpath.to_enum(:ascend).any? { |d| d
|
76
|
+
unless dir.realpath.to_enum(:ascend).any? { |d| Dir.exist?("#{d}/.git") }
|
77
77
|
return
|
78
78
|
end
|
79
79
|
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
|
80
|
+
f = File.open("#{dir}/.gitignore", 'w')
|
81
|
+
f.puts <<-GITIGNORE.gsub(/^\s+/, '')
|
82
|
+
# Directories.
|
83
|
+
diffs
|
84
|
+
snapshot
|
85
|
+
|
86
|
+
# Files.
|
87
|
+
settings.yaml
|
88
|
+
paths.txt
|
89
|
+
failures.txt
|
90
|
+
GITIGNORE
|
91
|
+
f.close
|
92
92
|
end
|
93
93
|
|
94
94
|
##
|
@@ -12,7 +12,7 @@ class SiteDiff
|
|
12
12
|
# Directory in which presets live.
|
13
13
|
#
|
14
14
|
# TODO: Move this outside "lib".
|
15
|
-
DIRECTORY =
|
15
|
+
DIRECTORY = "#{Pathname.new(__dir__).dirname}/presets".freeze
|
16
16
|
|
17
17
|
##
|
18
18
|
# Reads preset rules.
|
@@ -27,7 +27,7 @@ class SiteDiff
|
|
27
27
|
|
28
28
|
# Load and cache preset config.
|
29
29
|
if @cache[name].nil?
|
30
|
-
exist? name, true
|
30
|
+
exist? name, exception: true
|
31
31
|
@cache[name] = Config.load_conf file(name)
|
32
32
|
end
|
33
33
|
|
@@ -43,7 +43,7 @@ class SiteDiff
|
|
43
43
|
# Load and cache preset names.
|
44
44
|
if @all.nil?
|
45
45
|
@all = []
|
46
|
-
pattern = DIRECTORY
|
46
|
+
pattern = "#{DIRECTORY}/*.yaml"
|
47
47
|
Dir.glob(pattern) do |file|
|
48
48
|
@all << File.basename(file, '.yaml')
|
49
49
|
end
|
@@ -54,8 +54,8 @@ class SiteDiff
|
|
54
54
|
|
55
55
|
##
|
56
56
|
# Checks whether a preset exists.
|
57
|
-
def self.exist?(name, exception
|
58
|
-
result = File.exist?
|
57
|
+
def self.exist?(name, exception: false)
|
58
|
+
result = File.exist?(file(name))
|
59
59
|
|
60
60
|
# Raise an exception, if required.
|
61
61
|
if exception && !result
|
@@ -68,7 +68,7 @@ class SiteDiff
|
|
68
68
|
##
|
69
69
|
# Returns the path to a preset file.
|
70
70
|
def self.file(name)
|
71
|
-
DIRECTORY + "
|
71
|
+
DIRECTORY + "/#{name}.yaml"
|
72
72
|
end
|
73
73
|
end
|
74
74
|
end
|
data/lib/sitediff/config.rb
CHANGED
@@ -107,7 +107,7 @@ class SiteDiff
|
|
107
107
|
conf[pos][key] += conf[key] if conf[key]
|
108
108
|
end
|
109
109
|
tools[:scalar].each { |key| conf[pos][key] ||= conf[key] }
|
110
|
-
conf[pos]['url'] ||= conf[pos
|
110
|
+
conf[pos]['url'] ||= conf["pos#{_url}"] if defined?(_url)
|
111
111
|
conf[pos]['curl_opts'] = conf['curl_opts']
|
112
112
|
end
|
113
113
|
|
@@ -260,8 +260,8 @@ class SiteDiff
|
|
260
260
|
end
|
261
261
|
|
262
262
|
# Get "before" site configuration.
|
263
|
-
def before(apply_preset
|
264
|
-
section
|
263
|
+
def before(apply_preset: false)
|
264
|
+
section(:before, with_preset: apply_preset)
|
265
265
|
end
|
266
266
|
|
267
267
|
# Get "before" site URL.
|
@@ -271,8 +271,8 @@ class SiteDiff
|
|
271
271
|
end
|
272
272
|
|
273
273
|
# Get "after" site configuration.
|
274
|
-
def after(apply_preset
|
275
|
-
section
|
274
|
+
def after(apply_preset: false)
|
275
|
+
section(:after, with_preset: apply_preset)
|
276
276
|
end
|
277
277
|
|
278
278
|
# Get "after" site URL.
|
@@ -431,7 +431,7 @@ class SiteDiff
|
|
431
431
|
end
|
432
432
|
|
433
433
|
# Validate preset.
|
434
|
-
Preset.exist? setting(:preset), true if setting(:preset)
|
434
|
+
Preset.exist? setting(:preset), exception: true if setting(:preset)
|
435
435
|
end
|
436
436
|
|
437
437
|
##
|
@@ -459,7 +459,7 @@ class SiteDiff
|
|
459
459
|
@return_value = string_param == '' ? nil : Regexp.new(string_param)
|
460
460
|
rescue SiteDiffException => e
|
461
461
|
@return_value = nil
|
462
|
-
SiteDiff.log
|
462
|
+
SiteDiff.log "Invalid RegExp: #{string_param}", :error
|
463
463
|
SiteDiff.log e.message, :error
|
464
464
|
# TODO: Use SiteDiff.log type :debug
|
465
465
|
# SiteDiff.log e.backtrace, :error if options[:verbose]
|
@@ -491,7 +491,7 @@ class SiteDiff
|
|
491
491
|
#
|
492
492
|
# @return [Hash|Nil]
|
493
493
|
# Section data or Nil.
|
494
|
-
def section(name, with_preset
|
494
|
+
def section(name, with_preset: false)
|
495
495
|
name = name.to_s if name.is_a? Symbol
|
496
496
|
|
497
497
|
# Validate section.
|
@@ -531,7 +531,7 @@ class SiteDiff
|
|
531
531
|
def self.load_raw_yaml(file)
|
532
532
|
# TODO: Only show this in verbose mode.
|
533
533
|
SiteDiff.log "Reading config file: #{Pathname.new(file).expand_path}"
|
534
|
-
conf = YAML.load_file(file) || {}
|
534
|
+
conf = YAML.load_file(file, permitted_classes: [Regexp]) || {}
|
535
535
|
|
536
536
|
unless conf.is_a? Hash
|
537
537
|
raise InvalidConfig, "Invalid configuration file: '#{file}'"
|
data/lib/sitediff/crawler.rb
CHANGED
@@ -21,7 +21,7 @@ class SiteDiff
|
|
21
21
|
exclude_regex,
|
22
22
|
depth = DEFAULT_DEPTH,
|
23
23
|
curl_opts = UriWrapper::DEFAULT_CURL_OPTS,
|
24
|
-
debug
|
24
|
+
debug: true,
|
25
25
|
&block)
|
26
26
|
@hydra = hydra
|
27
27
|
@base_uri = Addressable::URI.parse(base)
|
@@ -34,16 +34,16 @@ class SiteDiff
|
|
34
34
|
@curl_opts = curl_opts
|
35
35
|
@debug = debug
|
36
36
|
|
37
|
-
add_uri('', depth)
|
37
|
+
add_uri('', depth, referrer: '/')
|
38
38
|
end
|
39
39
|
|
40
40
|
# Handle a newly found relative URI
|
41
|
-
def add_uri(rel, depth)
|
41
|
+
def add_uri(rel, depth, referrer = '')
|
42
42
|
return if @found.include? rel
|
43
43
|
|
44
44
|
@found << rel
|
45
45
|
|
46
|
-
wrapper = UriWrapper.new(@base + rel, @curl_opts, @debug)
|
46
|
+
wrapper = UriWrapper.new(@base + rel, @curl_opts, debug: @debug, referrer: referrer)
|
47
47
|
wrapper.queue(@hydra) do |res|
|
48
48
|
fetched_uri(rel, depth, res)
|
49
49
|
end
|
@@ -90,7 +90,7 @@ class SiteDiff
|
|
90
90
|
rels.each do |r|
|
91
91
|
next if @found.include? r
|
92
92
|
|
93
|
-
add_uri(r, depth - 1)
|
93
|
+
add_uri(r, depth - 1, rel)
|
94
94
|
end
|
95
95
|
end
|
96
96
|
|
@@ -104,6 +104,16 @@ class SiteDiff
|
|
104
104
|
|
105
105
|
# Make a link relative to @base_uri
|
106
106
|
def relativize_link(uri)
|
107
|
+
# fullPath = uri.path
|
108
|
+
# if uri.query
|
109
|
+
# fullPath += "?" + uri.query
|
110
|
+
# end
|
111
|
+
#
|
112
|
+
# if uri.fragment
|
113
|
+
# fullPath += "#" + uri.fragment
|
114
|
+
# end
|
115
|
+
# fullPath.gsub(@base_uri.path, "")
|
116
|
+
#
|
107
117
|
uri.path.slice(@base_uri.path.length, uri.path.length)
|
108
118
|
end
|
109
119
|
|
data/lib/sitediff/diff.rb
CHANGED
@@ -68,7 +68,7 @@ class SiteDiff
|
|
68
68
|
|
69
69
|
##
|
70
70
|
# Generates diff output for a single result.
|
71
|
-
def generate_diff_output(result, relative
|
71
|
+
def generate_diff_output(result, relative: false)
|
72
72
|
erb_path = File.join(SiteDiff::FILES_DIR, 'diff.html.erb')
|
73
73
|
ERB.new(File.read(erb_path)).result(binding)
|
74
74
|
end
|
data/lib/sitediff/fetch.rb
CHANGED
@@ -15,7 +15,7 @@ class SiteDiff
|
|
15
15
|
interval,
|
16
16
|
concurrency = 3,
|
17
17
|
curl_opts = nil,
|
18
|
-
debug
|
18
|
+
debug: true,
|
19
19
|
**tags)
|
20
20
|
@cache = cache
|
21
21
|
@interval = interval
|
@@ -51,7 +51,7 @@ class SiteDiff
|
|
51
51
|
results[tag] = UriWrapper::ReadResult.error('Not cached')
|
52
52
|
process_results(path, results)
|
53
53
|
else
|
54
|
-
uri = UriWrapper.new(base + path, @curl_opts, @debug)
|
54
|
+
uri = UriWrapper.new(base + path, @curl_opts, debug: @debug)
|
55
55
|
uri.queue(@hydra) do |resl|
|
56
56
|
# Insert delay to limit fetching rate
|
57
57
|
if @interval != 0
|
@@ -144,7 +144,7 @@
|
|
144
144
|
<% end %>
|
145
145
|
<% end %>
|
146
146
|
<% unless result.diff_url.nil? %>
|
147
|
-
<a href="<%= result.diff_url(relative) %>" class="button button-diff">View diff</a>
|
147
|
+
<a href="<%= result.diff_url(relative: relative) %>" class="button button-diff">View diff</a>
|
148
148
|
<% end %>
|
149
149
|
</div>
|
150
150
|
</td>
|
@@ -0,0 +1,63 @@
|
|
1
|
+
sanitization:
|
2
|
+
- title: Strip Drupal.settings
|
3
|
+
selector: script
|
4
|
+
pattern: '^(<script>)?jQuery.extend\(Drupal.settings.*$'
|
5
|
+
- title: Strip IE CSS/JS cache IDs
|
6
|
+
pattern: '("[^"]*ie\d?\.(js|css))\?[a-z0-9]{6}"'
|
7
|
+
substitute: '\1'
|
8
|
+
- title: Strip form build ID
|
9
|
+
selector: input
|
10
|
+
pattern: 'name="form_build_id" value="form-[-\w]{40,43}"'
|
11
|
+
substitute: 'name="form_build_id" value="form-DRUPAL_FORM_BUILD_ID"'
|
12
|
+
- title: Strip view DOM ID
|
13
|
+
pattern: '(class="view .*) view-dom-id-[a-f0-9]{32}"'
|
14
|
+
substitute: '\1 view-dom-id-DRUPAL_VIEW_DOM_ID"'
|
15
|
+
- title: Strip CSS aggregation filenames
|
16
|
+
selector: link[rel=stylesheet]
|
17
|
+
pattern: '(href="[^"]*/files/css/css_)[-\w]{40,43}\.css"'
|
18
|
+
substitute: '\1DRUPAL_AGGREGATED_CSS.css"'
|
19
|
+
- title: Strip JS aggregation filenames
|
20
|
+
selector: script
|
21
|
+
pattern: '(src="[^"]*/files/js/js_)[-\w]{40,43}\.js"'
|
22
|
+
substitute: '\1DRUPAL_AGGREGATED_JS.js"'
|
23
|
+
- title: Strip CSS/JS cache IDs
|
24
|
+
selector: style, script
|
25
|
+
pattern: '("[^"]*\.(js|css))\?[a-z0-9]{6}"'
|
26
|
+
substitute: '\1'
|
27
|
+
- title: Strip Drupal JS version tags
|
28
|
+
selector: script
|
29
|
+
# The '?' before 'v=' is escaped so that it matches the literal query
# delimiter instead of making the preceding group optional.
pattern: '(src="[^"]*/misc/\w+\.js)\?v=\d+\.\d+"'
|
30
|
+
substitute: '\1'
|
31
|
+
- title: Strip domain names from absolute URLs
|
32
|
+
pattern: 'http:\/\/[a-zA-Z0-9.:-]+'
|
33
|
+
substitute: '__domain__'
|
34
|
+
- title: Strip form build ID
|
35
|
+
selector: input
|
36
|
+
pattern: 'autocomplete="off" data-drupal-selector="form-[-\w]{40,43}"'
|
37
|
+
substitute: 'autocomplete="off" data-drupal-selector="form-DRUPAL_FORM_BUILD_ID"'
|
38
|
+
- title: Strip form build ID 2
|
39
|
+
selector: input
|
40
|
+
pattern: 'name="form_build_id" value="form-[-\w]{40,43}"'
|
41
|
+
substitute: 'name="form_build_id" value="form-DRUPAL_FORM_BUILD_ID"'
|
42
|
+
- title: Strip Drupal CSS link queries
|
43
|
+
selector: link
|
44
|
+
pattern: '\.css\?(\w*)'
|
45
|
+
substitute: '\.css'
|
46
|
+
- title: Strip Drupal JS link queries
|
47
|
+
selector: script
|
48
|
+
pattern: '\.js\?(\w*)'
|
49
|
+
substitute: '\.js'
|
50
|
+
- title: Strip Drupal View-DOM ID
|
51
|
+
pattern: 'view-dom-id-\w*'
|
52
|
+
substitute: 'view-dom-id-_ID_'
|
53
|
+
- title: Strip Drupal View-DOM ID 2
|
54
|
+
pattern: '(views?_dom_id"?:"?)\w*'
|
55
|
+
substitute: '\1_ID_'
|
56
|
+
- title: Ignore Drupal CSS file names
|
57
|
+
selector: link
|
58
|
+
pattern: 'css_[-\w]{40,43}(\\|%5C)?\.css'
|
59
|
+
substitute: 'css__ID__.css'
|
60
|
+
- title: Ignore Drupal JS file names
|
61
|
+
selector: script
|
62
|
+
pattern: 'js_[-\w]{40,43}\\?\.js'
|
63
|
+
substitute: 'js__ID__.js'
|