sitediff 0.0.6 → 1.2.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +5 -5
- data/.eslintignore +1 -0
- data/.eslintrc.json +28 -0
- data/.project +11 -0
- data/.rubocop.yml +179 -0
- data/.rubocop_todo.yml +51 -0
- data/CHANGELOG.md +28 -0
- data/Dockerfile +33 -0
- data/Gemfile +11 -0
- data/Gemfile.lock +85 -0
- data/INSTALLATION.md +146 -0
- data/LICENSE +339 -0
- data/README.md +810 -0
- data/Rakefile +12 -0
- data/Thorfile +135 -0
- data/bin/sitediff +9 -2
- data/config/.gitkeep +0 -0
- data/config/sanitize_domains.example.yaml +8 -0
- data/config/sitediff.example.yaml +81 -0
- data/docker-compose.test.yml +3 -0
- data/lib/sitediff/api.rb +276 -0
- data/lib/sitediff/cache.rb +57 -8
- data/lib/sitediff/cli.rb +156 -176
- data/lib/sitediff/config/creator.rb +61 -77
- data/lib/sitediff/config/preset.rb +75 -0
- data/lib/sitediff/config.rb +436 -31
- data/lib/sitediff/crawler.rb +27 -21
- data/lib/sitediff/diff.rb +32 -9
- data/lib/sitediff/fetch.rb +10 -3
- data/lib/sitediff/files/diff.html.erb +20 -2
- data/lib/sitediff/files/jquery.min.js +2 -0
- data/lib/sitediff/files/normalize.css +349 -0
- data/lib/sitediff/files/report.html.erb +171 -0
- data/lib/sitediff/files/sidebyside.html.erb +5 -2
- data/lib/sitediff/files/sitediff.css +303 -30
- data/lib/sitediff/files/sitediff.js +367 -0
- data/lib/sitediff/presets/drupal.yaml +63 -0
- data/lib/sitediff/report.rb +254 -0
- data/lib/sitediff/result.rb +50 -20
- data/lib/sitediff/sanitize/dom_transform.rb +47 -8
- data/lib/sitediff/sanitize/regexp.rb +24 -3
- data/lib/sitediff/sanitize.rb +81 -12
- data/lib/sitediff/uriwrapper.rb +65 -23
- data/lib/sitediff/webserver/resultserver.rb +30 -33
- data/lib/sitediff/webserver.rb +15 -3
- data/lib/sitediff.rb +130 -83
- data/misc/sitediff - overview report.png +0 -0
- data/misc/sitediff - page report.png +0 -0
- data/package-lock.json +878 -0
- data/package.json +25 -0
- data/sitediff.gemspec +51 -0
- metadata +91 -29
- data/lib/sitediff/files/html_report.html.erb +0 -66
- data/lib/sitediff/files/rules/drupal.yaml +0 -63
- data/lib/sitediff/rules.rb +0 -65
data/Rakefile
ADDED
data/Thorfile
ADDED
@@ -0,0 +1,135 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
# frozen_string_literal: true
|
3
|
+
|
4
|
+
# TODO: Determine the utility of this file.
|
5
|
+
|
6
|
+
LIB_DIR = File.join(File.dirname(__FILE__), 'lib')
|
7
|
+
$LOAD_PATH << LIB_DIR
|
8
|
+
require 'sitediff/webserver'
|
9
|
+
require 'sitediff/webserver/resultserver'
|
10
|
+
|
11
|
+
# Thor Base class.
|
12
|
+
class Base < Thor
|
13
|
+
method_options local: true
|
14
|
+
# Adds the option to all Base subclasses.
|
15
|
+
# method_options() takes different arguments than option().
|
16
|
+
def initialize(*args)
|
17
|
+
super(*args)
|
18
|
+
@local = options['local']
|
19
|
+
end
|
20
|
+
|
21
|
+
# gives us run()
|
22
|
+
include Thor::Actions
|
23
|
+
|
24
|
+
# Thor, by default, exits with 0 no matter what!
|
25
|
+
def self.exit_on_failure?
|
26
|
+
true
|
27
|
+
end
|
28
|
+
|
29
|
+
protected
|
30
|
+
|
31
|
+
# Returns the shell command used to invoke +gem+.
#
# With the --local option set, the command is prefixed with `bundle exec`,
# and 'sitediff' is replaced by the in-tree script ./bin/sitediff.
# Without it, the name is returned unchanged (no leading whitespace).
def executable(gem)
  return gem unless @local

  target = gem == 'sitediff' ? './bin/sitediff' : gem
  "bundle exec #{target}"
end
|
35
|
+
end
|
36
|
+
|
37
|
+
# Thor for Docker.
|
38
|
+
class Docker < Base
|
39
|
+
IMAGE = 'evolvingweb/sitediff'
|
40
|
+
|
41
|
+
desc 'build', 'Build a docker image for sitediff'
|
42
|
+
# Make a build image for docker.
|
43
|
+
def build
|
44
|
+
run "docker build -t #{IMAGE} . "
|
45
|
+
end
|
46
|
+
|
47
|
+
desc 'run', 'Run a rake task (or a login shell if none given) inside docker'
|
48
|
+
# NOTE: We can't override run() (which is reserved by Thor). Luckily, Thor only
|
49
|
+
# checks for the first N necessary characters to match a command with a
|
50
|
+
# method. Cf. Thor::normalize_command_name()
|
51
|
+
def run_(task = 'bash')
|
52
|
+
docker_opts = ['-t', "-v #{File.dirname(__FILE__)}:/sitediff"]
|
53
|
+
finish_exec(task, docker_opts)
|
54
|
+
end
|
55
|
+
|
56
|
+
desc 'compose', 'Run a task inside docker without volume mounting (not supported with compose)'
|
57
|
+
# Run a task inside docker without volume mounting.
|
58
|
+
def compose(task = 'bash')
|
59
|
+
docker_opts = ['-t']
|
60
|
+
finish_exec(task, docker_opts)
|
61
|
+
end
|
62
|
+
|
63
|
+
no_commands do
|
64
|
+
# Builds the `docker run` command line for +task+, echoes it and runs it.
#
# task        - 'bash' starts an interactive shell; any other value is
#               passed to thor inside the container.
# docker_opts - Array of `docker run` option strings. '-i' is appended to
#               it in place when task is 'bash'.
def finish_exec(task, docker_opts)
  if task == 'bash'
    cmd = 'bash'
    docker_opts << '-i'
  else
    # pass down the local flag to docker command
    cmd = "#{executable('thor')} #{task} #{@local ? '--local' : '--no-local'}"
  end

  # Build the command once so the echoed line is exactly what is executed.
  command = "docker run #{docker_opts.join(' ')} #{IMAGE} #{cmd}"
  puts command
  run command
end
|
76
|
+
end
|
77
|
+
end
|
78
|
+
|
79
|
+
# Thor for Spec.
|
80
|
+
class Spec < Base
|
81
|
+
desc 'unit', 'run RSpec unit tests'
|
82
|
+
# Run RSpec unit tests.
#
# The command is built once so the echoed line always matches what is run.
def unit
  command = "#{executable('rspec')} spec/unit"
  puts command
  run command
end
|
87
|
+
|
88
|
+
desc 'fixture', 'run RSpec integration tests'
|
89
|
+
# Run RSpec integration tests.
#
# The echoed command previously named spec/unit while spec/fixtures was
# executed; building the command once keeps both in sync.
def fixture
  command = "#{executable('rspec')} spec/fixtures"
  puts command
  run command
end
|
94
|
+
|
95
|
+
desc 'all', 'runs both unit and fixture tests', hide: true
|
96
|
+
# hidden task to lump together multiple tasks
|
97
|
+
def all
|
98
|
+
unit
|
99
|
+
fixture
|
100
|
+
end
|
101
|
+
default_task :all
|
102
|
+
end
|
103
|
+
|
104
|
+
# Thor for fixtures.
|
105
|
+
class Fixture < Base
|
106
|
+
desc 'local', 'Run a sitediff test case'
|
107
|
+
# Run a sitediff test case.
|
108
|
+
def local
|
109
|
+
run "#{executable('sitediff')} diff --cached=none spec/fixtures/cli/config.yaml"
|
110
|
+
end
|
111
|
+
|
112
|
+
desc 'http', 'Run a sitediff test case, using web servers'
|
113
|
+
# Run a sitediff test case, using web servers.
|
114
|
+
def http
|
115
|
+
cmd = "#{executable('sitediff')} diff --cached=none spec/fixtures/cli/config.yaml"
|
116
|
+
http_fixtures(cmd).kill
|
117
|
+
end
|
118
|
+
|
119
|
+
desc 'serve', 'Serve the result of the fixture test'
|
120
|
+
# Serve the result of the fixture test.
|
121
|
+
def serve
|
122
|
+
cmd = "#{executable('sitediff')} diff --cached=none --paths-file=spec/sites/ruby-doc.org/paths.txt spec/unit/cli/config.yaml"
|
123
|
+
http_fixtures(cmd)
|
124
|
+
SiteDiff::Webserver::ResultServer.new(nil, 'sitediff', quiet: true).wait
|
125
|
+
end
|
126
|
+
|
127
|
+
private
|
128
|
+
|
129
|
+
# HTTP Fixtures.
|
130
|
+
def http_fixtures(cmd)
|
131
|
+
serv = SiteDiff::Webserver::FixtureServer.new
|
132
|
+
run "#{cmd} --before #{serv.before} --after #{serv.after}"
|
133
|
+
serv
|
134
|
+
end
|
135
|
+
end
|
data/bin/sitediff
CHANGED
@@ -2,8 +2,15 @@
|
|
2
2
|
# frozen_string_literal: true
|
3
3
|
|
4
4
|
# when run as gem, $0 is /usr/local/bin/sitediff not this file
|
5
|
-
|
5
|
+
if $PROGRAM_NAME == __FILE__
|
6
|
+
$LOAD_PATH.unshift File.expand_path('../lib', __dir__)
|
7
|
+
end
|
6
8
|
|
7
9
|
require 'sitediff/cli'
|
8
10
|
|
9
|
-
|
11
|
+
begin
|
12
|
+
SiteDiff::Cli.start
|
13
|
+
rescue Interrupt
|
14
|
+
puts("\n")
|
15
|
+
SiteDiff.log('Stopping. Interrupted by user.')
|
16
|
+
end
|
data/config/.gitkeep
ADDED
File without changes
|
@@ -0,0 +1,81 @@
|
|
1
|
+
# Include other configuration files, merging them with this file.
|
2
|
+
includes:
|
3
|
+
- extra-rules.yaml
|
4
|
+
|
5
|
+
# Settings.
|
6
|
+
#
|
7
|
+
# If you use "sitediff init" with the right parameters, it will generate
|
8
|
+
# this section for you.
|
9
|
+
settings:
|
10
|
+
# Crawl 2 levels deep.
|
11
|
+
depth: 2
|
12
|
+
# Wait for 250ms between requests.
|
13
|
+
interval: 250
|
14
|
+
# Make only 1 request at a time - no simultaneous requests.
|
15
|
+
# Concurrency has to be one when an interval is set.
|
16
|
+
concurrency: 1
|
17
|
+
# Don't follow links to PDF files.
|
18
|
+
exclude: '.*\.pdf'
|
19
|
+
# Curl options, if any.
|
20
|
+
curl_opts:
|
21
|
+
max_recv_speed_large: 10000
|
22
|
+
|
23
|
+
# Rules under this element apply only to the 'before' site.
|
24
|
+
before:
|
25
|
+
# URL of the 'before' version of the site.
|
26
|
+
url: http://localhost/old
|
27
|
+
|
28
|
+
# Sanitizations and DOM transformations, just like the general ones
|
29
|
+
# demonstrated below, but applied only to the 'before' site.
|
30
|
+
dom_transform:
|
31
|
+
- title: Example
|
32
|
+
type: remove
|
33
|
+
selector: div.updates-required
|
34
|
+
|
35
|
+
# Rules under this element apply only to the 'after' site.
|
36
|
+
after:
|
37
|
+
# URL of the 'after' version of the site.
|
38
|
+
url: http://localhost/new
|
39
|
+
|
40
|
+
# The root element to compare.
|
41
|
+
#
|
42
|
+
# Usually, sitediff compares the HTML of the entire page. If you'd rather
|
43
|
+
# check just a subset of the page, specify a selector here. For example, the
|
44
|
+
# line below causes only the body to be compared, ignoring the HTML head.
|
45
|
+
selector: 'body'
|
46
|
+
|
47
|
+
# General regular expression rules, applied to both versions of the site.
|
48
|
+
sanitization:
|
49
|
+
# Normalize input tags containing random tokens.
|
50
|
+
- title: Remove form-build-id
|
51
|
+
pattern: '<input type="hidden" name="form_build_id" value="form-[a-zA-Z0-9_-]+" *\/?>'
|
52
|
+
substitute: '<input type="hidden" name="form_build_id" value="__form_build_id__">'
|
53
|
+
|
54
|
+
# Replace meta property="twitter:*" with meta name="twitter:*".
|
55
|
+
- title: Meta 'property' changed to 'name'
|
56
|
+
pattern: 'property="twitter:'
|
57
|
+
substitute: 'name="twitter:'
|
58
|
+
# 'selector' limits this rule to only within the selected elements.
|
59
|
+
selector: meta
|
60
|
+
# 'path' limits this rule to only certain pages.
|
61
|
+
path: /user
|
62
|
+
|
63
|
+
# General DOM transforms, applied to both versions of the site.
|
64
|
+
dom_transform:
|
65
|
+
# Remove article elements, replacing them with their content
|
66
|
+
- title: Unwrap article elements
|
67
|
+
type: unwrap
|
68
|
+
selector: article
|
69
|
+
|
70
|
+
# Remove classes from divs
|
71
|
+
- title: Remove classes bar and baz from divs
|
72
|
+
type: remove_class
|
73
|
+
selector: div
|
74
|
+
class:
|
75
|
+
- class-bar
|
76
|
+
- class-baz
|
77
|
+
|
78
|
+
# Remove a div by its ID.
|
79
|
+
- title: Remove block containing current time.
|
80
|
+
type: remove
|
81
|
+
selector: div#block-time
|
data/lib/sitediff/api.rb
ADDED
@@ -0,0 +1,276 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'sitediff'
|
4
|
+
require 'sitediff/cache'
|
5
|
+
require 'sitediff/config'
|
6
|
+
require 'sitediff/config/creator'
|
7
|
+
require 'sitediff/config/preset'
|
8
|
+
require 'sitediff/fetch'
|
9
|
+
require 'sitediff/webserver/resultserver'
|
10
|
+
|
11
|
+
class SiteDiff
|
12
|
+
##
|
13
|
+
# Sitediff API interface.
|
14
|
+
class Api
|
15
|
+
##
|
16
|
+
# Initializes new Api object.
|
17
|
+
def initialize(directory, config_file = nil)
|
18
|
+
@dir = get_dir(directory)
|
19
|
+
@config = SiteDiff::Config.new(config_file, @dir)
|
20
|
+
end
|
21
|
+
|
22
|
+
##
|
23
|
+
# Initialize a SiteDiff project.
|
24
|
+
#
|
25
|
+
# Calling:
|
26
|
+
# SiteDiff::Api.init(
|
27
|
+
# depth: 3,
|
28
|
+
# directory: 'sitediff',
|
29
|
+
# concurrency: 3,
|
30
|
+
# interval: 0,
|
31
|
+
# include: nil,
|
32
|
+
# exclude: '*.pdf',
|
33
|
+
# preset: 'drupal',
|
34
|
+
# curl_opts: {timeout: 60},
|
35
|
+
# crawl: false
|
36
|
+
# )
|
37
|
+
def self.init(options)
|
38
|
+
# Prepare a config object and write it to the file system.
|
39
|
+
creator = SiteDiff::Config::Creator.new(options[:debug], options[:before_url], options[:after_url])
|
40
|
+
include_regex = Config.create_regexp(options[:include])
|
41
|
+
exclude_regex = Config.create_regexp(options[:exclude])
|
42
|
+
creator.create(
|
43
|
+
depth: options[:depth],
|
44
|
+
directory: options[:directory],
|
45
|
+
concurrency: options[:concurrency],
|
46
|
+
interval: options[:interval],
|
47
|
+
include: include_regex,
|
48
|
+
exclude: exclude_regex,
|
49
|
+
preset: options[:preset],
|
50
|
+
curl_opts: options[:curl_opts]
|
51
|
+
)
|
52
|
+
SiteDiff.log "Created #{creator.config_file.expand_path}", :success
|
53
|
+
|
54
|
+
# TODO: implement crawl ^^^
|
55
|
+
# Discover paths, if enabled.
|
56
|
+
# if options[:crawl]
|
57
|
+
# crawl(creator.config_file)
|
58
|
+
# SiteDiff.log 'You can now run "sitediff diff".', :success
|
59
|
+
# else
|
60
|
+
# SiteDiff.log 'Run "sitediff crawl" to discover paths. You should then be able to run "sitediff diff".', :info
|
61
|
+
# end
|
62
|
+
end
|
63
|
+
|
64
|
+
##
# Diff the `before` and `after`.
#
# Calling:
#   api.diff(
#     paths: options['paths'],
#     paths_file: options['paths-file'],
#     ignore_whitespace: options['ignore-whitespace'],
#     export: options['export'],
#     before: options['before'],
#     after: options['after'],
#     cached: options['cached'],
#     verbose: options['verbose'],
#     debug: options['debug'],
#     report_format: options['report-format'],
#     before_report: options['before-report'],
#     after_report: options['after-report'],
#     cli_mode: false
#   )
#
# Config::InvalidConfig and Config::ConfigNotFound are logged, not raised.
# When options[:cli_mode] is set and neither error occurred, the process
# exits with status 2 if any path failed and 0 otherwise.
def diff(options)
  @config.ignore_whitespace = options[:ignore_whitespace]
  @config.export = options[:export]
  # Apply "paths" override, if any.
  if options[:paths]
    @config.paths = options[:paths]
  else
    paths_file = options[:paths_file]
    paths_file ||= File.join(@dir, Config::DEFAULT_PATHS_FILENAME)
    paths_file = File.expand_path(paths_file)

    paths_count = @config.paths_file_read(paths_file)
    SiteDiff.log "Read #{paths_count} paths from: #{paths_file}"
  end

  # TODO: Why do we allow before and after override during diff?
  @config.before['url'] = options[:before] if options[:before]
  @config.after['url'] = options[:after] if options[:after]

  # Prepare cache. Reading is opt-in per side; both sides are always written.
  cache = SiteDiff::Cache.new(
    create: options[:cached] != 'none',
    directory: @dir
  )
  cache.read_tags << :before if %w[before all].include?(options[:cached])
  cache.read_tags << :after if %w[after all].include?(options[:cached])
  cache.write_tags << :before << :after

  # Run sitediff.
  sitediff = SiteDiff.new(
    @config,
    cache,
    verbose: options[:verbose],
    debug: options[:debug]
  )
  num_failing = sitediff.run
  exit_code = num_failing.positive? ? 2 : 0

  # Generate HTML report.
  if options[:report_format] == 'html' || @config.export
    sitediff.report.generate_html(
      @dir,
      options[:before_report],
      options[:after_report]
    )
  end

  # Generate JSON report. Uses truthiness (not `== false`) so that callers
  # who omit :export (nil) still get the JSON report they asked for.
  if options[:report_format] == 'json' && !@config.export
    sitediff.report.generate_json @dir
  end

  SiteDiff.log 'Run "sitediff serve" to see a report.' unless options[:export]
rescue Config::InvalidConfig, Config::ConfigNotFound => e
  SiteDiff.log "Invalid configuration: #{e.message}", :error
  SiteDiff.log e.backtrace, :error if options[:verbose]
else # no exception was raised
  # Thor::Error --> exit(1), guaranteed by exit_on_failure?
  # Failing diff --> exit(2), populated above
  exit(exit_code) if options[:cli_mode]
end
|
146
|
+
|
147
|
+
##
# Crawl every root in @config.roots to determine `paths`.
#
# Each visited page is handed to after_crawl. The unique, sorted list of
# discovered paths is then written via @config.paths_file_write.
def crawl
  # Prepare cache.
  @cache = SiteDiff::Cache.new(
    create: true,
    directory: @dir
  )
  @cache.write_tags << :before << :after

  # Crawl with Hydra to discover paths.
  hydra = Typhoeus::Hydra.new(
    max_concurrency: @config.setting(:concurrency)
  )
  @paths = {}
  @config.roots.each do |tag, url|
    Crawler.new(
      hydra,
      url,
      @config.setting(:interval),
      @config.setting(:include),
      @config.setting(:exclude),
      @config.setting(:depth),
      @config.curl_opts,
      # NOTE(review): @debug is not assigned anywhere in the visible part of
      # this class, so this looks like it is always nil - confirm.
      debug: @debug
    ) do |info|
      SiteDiff.log "Visited #{info.uri}, cached."
      after_crawl(tag, info)
    end
  end
  hydra.run

  # Write paths to a file. flatten + uniq de-duplicates even when only one
  # root produced paths (reduce(&:|) returns a lone array untouched).
  @paths = @paths.values.flatten(1).uniq.sort
  @config.paths_file_write(@paths)

  # Log output.
  file = Pathname.new(@dir) + Config::DEFAULT_PATHS_FILENAME
  SiteDiff.log ''
  SiteDiff.log "#{@paths.length} page(s) found."
  SiteDiff.log "Created #{file.expand_path}.", :success, 'done'
end
|
189
|
+
|
190
|
+
##
|
191
|
+
# Serves SiteDiff report for accessing in the browser.
|
192
|
+
#
|
193
|
+
# Calling:
|
194
|
+
# api.serve(browse: true, port: 13080)
|
195
|
+
def serve(options)
|
196
|
+
@cache = Cache.new(directory: @dir)
|
197
|
+
@cache.read_tags << :before << :after
|
198
|
+
|
199
|
+
SiteDiff::Webserver::ResultServer.new(
|
200
|
+
options[:port],
|
201
|
+
@dir,
|
202
|
+
browse: options[:browse],
|
203
|
+
cache: @cache,
|
204
|
+
config: @config
|
205
|
+
).wait
|
206
|
+
rescue SiteDiffException => e
|
207
|
+
SiteDiff.log e.message, :error
|
208
|
+
SiteDiff.log e.backtrace, :error if options[:verbose]
|
209
|
+
end
|
210
|
+
|
211
|
+
##
|
212
|
+
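# Fetches the configured paths from options[:url] (default: the `after` URL) and caches them under the :before tag.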
|
213
|
+
def store(options)
|
214
|
+
# TODO: Figure out how to remove this config.validate call.
|
215
|
+
@config.validate(need_before: false)
|
216
|
+
@config.paths_file_read
|
217
|
+
|
218
|
+
@cache = SiteDiff::Cache.new(directory: @dir, create: true)
|
219
|
+
@cache.write_tags << :before
|
220
|
+
|
221
|
+
base = options[:url] || @config.after['url']
|
222
|
+
fetcher = SiteDiff::Fetch.new(@cache,
|
223
|
+
@config.paths,
|
224
|
+
@config.setting(:interval),
|
225
|
+
@config.setting(:concurrency),
|
226
|
+
get_curl_opts(@config.settings),
|
227
|
+
options[:debug],
|
228
|
+
before: base)
|
229
|
+
fetcher.run do |path, _res|
|
230
|
+
SiteDiff.log "Visited #{path}, cached"
|
231
|
+
end
|
232
|
+
end
|
233
|
+
|
234
|
+
private
|
235
|
+
|
236
|
+
##
# Ensures that the given directory exists.
#
# directory - path of the working directory; nil falls back to '.'.
#
# Creates the directory (and any missing parents) when it is absent and
# returns its path as a String. Does not touch instance state; the caller
# decides where to store the result.
def get_dir(directory)
  # Create the dir. Must go before cache initialization!
  dir = Pathname.new(directory || '.')
  dir.mkpath unless dir.directory?
  dir.to_s
end
|
244
|
+
|
245
|
+
##
# Processes a crawled path.
#
# Records the canonical path under its tag in @paths and stores the page
# result in the cache when the tag is 'before' or 'after'.
def after_crawl(tag, info)
  path = UriWrapper.canonicalize(info.relative)

  # Register the path.
  (@paths[tag] ||= []) << path

  result = info.read_result

  # Write result to applicable cache.
  # @cache.set(tag, path, result)
  @cache.set(tag.to_sym, path, result) if %w[before after].include?(tag)

  # TODO: Restore application of rules.
  # @rules.handle_page(tag, res.content, info.document) if @rules && !res.error
end
|
264
|
+
|
265
|
+
# Builds the curl options hash for a fetch.
#
# Starts from UriWrapper::DEFAULT_CURL_OPTS, overlays options['curl_options']
# and then options['curl_opts'], and converts the string values 'true' and
# 'false' into real booleans. All other values are passed through unchanged.
def get_curl_opts(options)
  # We do want string keys here
  booleans = { 'true' => true, 'false' => false }
  merged = UriWrapper::DEFAULT_CURL_OPTS
           .merge(options['curl_options'] || {})
           .merge(options['curl_opts'] || {})
  merged.each_with_object({}) do |(name, value), opts|
    opts[name] = booleans.fetch(value, value)
  end
end
|
275
|
+
end
|
276
|
+
end
|
data/lib/sitediff/cache.rb
CHANGED
@@ -4,28 +4,45 @@ require 'set'
|
|
4
4
|
require 'fileutils'
|
5
5
|
|
6
6
|
class SiteDiff
|
7
|
+
# SiteDiff Cache Handler.
|
7
8
|
class Cache
|
9
|
+
TIMESTAMP_FILE = 'timestamp'
|
10
|
+
|
8
11
|
attr_accessor :read_tags, :write_tags
|
9
12
|
|
13
|
+
##
|
14
|
+
# Creates a Cache object.
|
10
15
|
def initialize(opts = {})
|
11
16
|
@create = opts[:create]
|
12
17
|
|
13
|
-
# Read and Write tags are sets that can contain :before and :after
|
14
|
-
# They indicate whether we should use the cache for reading or writing
|
18
|
+
# Read and Write tags are sets that can contain :before and :after.
|
19
|
+
# They indicate whether we should use the cache for reading or writing.
|
15
20
|
@read_tags = Set.new
|
16
21
|
@write_tags = Set.new
|
22
|
+
@timestamp_flag = { before: false, after: false }
|
23
|
+
|
24
|
+
# The directory used by the cache for storage.
|
17
25
|
@dir = opts[:directory] || '.'
|
18
26
|
end
|
19
27
|
|
28
|
+
##
|
20
29
|
# Is a tag cached?
|
30
|
+
# TODO: Rename it to is_cached? as it makes more sense.
|
21
31
|
def tag?(tag)
|
22
32
|
File.directory?(File.join(@dir, 'snapshot', tag.to_s))
|
23
33
|
end
|
24
34
|
|
35
|
+
##
|
36
|
+
# Get data from cache.
|
25
37
|
def get(tag, path)
|
26
38
|
return nil unless @read_tags.include? tag
|
27
39
|
|
28
|
-
filename = File.join(
|
40
|
+
filename = File.join(
|
41
|
+
@dir,
|
42
|
+
'snapshot',
|
43
|
+
tag.to_s,
|
44
|
+
*path.split(File::SEPARATOR)
|
45
|
+
)
|
29
46
|
|
30
47
|
filename = File.join(filename, 'index.html') if File.directory?(filename)
|
31
48
|
return nil unless File.file? filename
|
@@ -33,10 +50,18 @@ class SiteDiff
|
|
33
50
|
Marshal.load(File.read(filename))
|
34
51
|
end
|
35
52
|
|
53
|
+
##
|
54
|
+
# Set data to cache.
|
36
55
|
def set(tag, path, result)
|
37
56
|
return unless @write_tags.include? tag
|
38
57
|
|
39
|
-
|
58
|
+
save_timestamp(tag)
|
59
|
+
filename = File.join(
|
60
|
+
@dir,
|
61
|
+
'snapshot',
|
62
|
+
tag.to_s,
|
63
|
+
*path.split(File::SEPARATOR)
|
64
|
+
)
|
40
65
|
|
41
66
|
filename = File.join(filename, 'index.html') if File.directory?(filename)
|
42
67
|
filepath = Pathname.new(filename)
|
@@ -46,32 +71,56 @@ class SiteDiff
|
|
46
71
|
rescue Errno::EEXIST
|
47
72
|
curdir = filepath
|
48
73
|
curdir = curdir.parent until curdir.exist?
|
49
|
-
tempname = curdir.dirname
|
74
|
+
tempname = "#{curdir.dirname}/#{curdir.basename}.temporary"
|
75
|
+
# tempname = curdir.dirname + (curdir.basename.to_s + '.temporary')
|
50
76
|
# May cause problems if action is not atomic!
|
51
77
|
# Move existing file to dir/index.html first
|
52
78
|
# Not robust! Should generate an UUID or something.
|
53
|
-
|
79
|
+
if File.exist?(tempname)
|
80
|
+
SiteDiff.log "Overwriting file #{tempname}", :warning
|
81
|
+
end
|
54
82
|
curdir.rename(tempname)
|
55
83
|
filepath.dirname.mkpath
|
56
84
|
# Should only happen in strange situations such as when the path
|
57
85
|
# is foo/index.html/bar (i.e., index.html is a directory)
|
58
|
-
|
59
|
-
|
86
|
+
# Warn about the file that the rename below is about to replace.
if File.exist?("#{curdir}/index.html")
  SiteDiff.log "Overwriting file #{curdir}/index.html", :warning
end
|
89
|
+
File.rename(tempname, "#{curdir}/index.html")
|
90
|
+
# tempname.rename(curdir + 'index.html')
|
60
91
|
end
|
61
92
|
end
|
62
93
|
File.open(filename, 'w') { |file| file.write(Marshal.dump(result)) }
|
63
94
|
end
|
64
95
|
|
96
|
+
##
|
97
|
+
# TODO: Document this or remove it if unused.
|
65
98
|
def key(tag, path)
|
66
99
|
# Ensure encoding stays the same!
|
67
100
|
Marshal.dump([tag, path.encode('UTF-8')])
|
68
101
|
end
|
69
102
|
|
103
|
+
##
|
104
|
+
# Ensures that a directory exists.
|
70
105
|
def get_dir(directory)
|
71
106
|
# Create the dir. Must go before cache initialization!
|
72
107
|
@dir = Pathname.new(directory || '.')
|
73
108
|
@dir.mkpath unless @dir.directory?
|
74
109
|
@dir.to_s
|
75
110
|
end
|
111
|
+
|
112
|
+
private
|
113
|
+
|
114
|
+
# Touches the timestamp file inside a tag's snapshot directory.
#
# Only the first call per tag on this Cache object does any work; later
# calls return immediately.
def save_timestamp(tag)
  # run once
  return if @timestamp_flag[tag]

  @timestamp_flag[tag] = true
  cache_dir = File.join(@dir, 'snapshot', tag.to_s)
  # On a fresh cache the snapshot directory does not exist yet when the first
  # entry is written. Create it, otherwise the timestamp would be skipped and,
  # because the flag is already set, never written during this run.
  FileUtils.mkdir_p(cache_dir)
  FileUtils.touch(File.join(cache_dir, TIMESTAMP_FILE))
end
|
76
125
|
end
|
77
126
|
end
|