sitediff 0.0.6 → 1.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +5 -5
- data/.eslintignore +1 -0
- data/.eslintrc.json +28 -0
- data/.project +11 -0
- data/.rubocop.yml +179 -0
- data/.rubocop_todo.yml +51 -0
- data/CHANGELOG.md +28 -0
- data/Dockerfile +33 -0
- data/Gemfile +11 -0
- data/Gemfile.lock +85 -0
- data/INSTALLATION.md +146 -0
- data/LICENSE +339 -0
- data/README.md +810 -0
- data/Rakefile +12 -0
- data/Thorfile +135 -0
- data/bin/sitediff +9 -2
- data/config/.gitkeep +0 -0
- data/config/sanitize_domains.example.yaml +8 -0
- data/config/sitediff.example.yaml +81 -0
- data/docker-compose.test.yml +3 -0
- data/lib/sitediff/api.rb +276 -0
- data/lib/sitediff/cache.rb +57 -8
- data/lib/sitediff/cli.rb +156 -176
- data/lib/sitediff/config/creator.rb +61 -77
- data/lib/sitediff/config/preset.rb +75 -0
- data/lib/sitediff/config.rb +436 -31
- data/lib/sitediff/crawler.rb +27 -21
- data/lib/sitediff/diff.rb +32 -9
- data/lib/sitediff/fetch.rb +10 -3
- data/lib/sitediff/files/diff.html.erb +20 -2
- data/lib/sitediff/files/jquery.min.js +2 -0
- data/lib/sitediff/files/normalize.css +349 -0
- data/lib/sitediff/files/report.html.erb +171 -0
- data/lib/sitediff/files/sidebyside.html.erb +5 -2
- data/lib/sitediff/files/sitediff.css +303 -30
- data/lib/sitediff/files/sitediff.js +367 -0
- data/lib/sitediff/presets/drupal.yaml +63 -0
- data/lib/sitediff/report.rb +254 -0
- data/lib/sitediff/result.rb +50 -20
- data/lib/sitediff/sanitize/dom_transform.rb +47 -8
- data/lib/sitediff/sanitize/regexp.rb +24 -3
- data/lib/sitediff/sanitize.rb +81 -12
- data/lib/sitediff/uriwrapper.rb +65 -23
- data/lib/sitediff/webserver/resultserver.rb +30 -33
- data/lib/sitediff/webserver.rb +15 -3
- data/lib/sitediff.rb +130 -83
- data/misc/sitediff - overview report.png +0 -0
- data/misc/sitediff - page report.png +0 -0
- data/package-lock.json +878 -0
- data/package.json +25 -0
- data/sitediff.gemspec +51 -0
- metadata +91 -29
- data/lib/sitediff/files/html_report.html.erb +0 -66
- data/lib/sitediff/files/rules/drupal.yaml +0 -63
- data/lib/sitediff/rules.rb +0 -65
data/Rakefile
ADDED
data/Thorfile
ADDED
@@ -0,0 +1,135 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
# frozen_string_literal: true
|
3
|
+
|
4
|
+
# TODO: Determine the utility of this file.
|
5
|
+
|
6
|
+
LIB_DIR = File.join(File.dirname(__FILE__), 'lib')
|
7
|
+
$LOAD_PATH << LIB_DIR
|
8
|
+
require 'sitediff/webserver'
|
9
|
+
require 'sitediff/webserver/resultserver'
|
10
|
+
|
11
|
+
# Shared parent for the Thor task classes defined in this file.
#
# Declares the `local` option, remembers its value in @local and offers
# `executable` to build the command prefix used by the tasks.
class Base < Thor
  # Adds the option to all Base subclasses.
  # method_options() takes different arguments than option().
  # NOTE: keep this call directly above initialize; do not reorder.
  method_options local: true
  def initialize(*args)
    super(*args)
    @local = options['local']
  end

  # gives us run()
  include Thor::Actions

  # Thor, by default, exits with 0 no matter what!
  def self.exit_on_failure?
    true
  end

  protected

  # Builds the command string used to invoke +gem+.
  #
  # With @local set, the command is prefixed with `bundle exec` and
  # 'sitediff' is replaced by the in-repo ./bin/sitediff script. Without
  # it, the result is the gem name preceded by a single space.
  def executable(gem)
    prefix = @local ? 'bundle exec' : nil
    command = @local && gem == 'sitediff' ? './bin/sitediff' : gem
    "#{prefix} #{command}"
  end
end
|
36
|
+
|
37
|
+
# Thor tasks that build the sitediff Docker image and run commands in it.
class Docker < Base
  IMAGE = 'evolvingweb/sitediff'

  desc 'build', 'Build a docker image for sitediff'
  # Builds the image from the current directory and tags it as IMAGE.
  def build
    run "docker build -t #{IMAGE} . "
  end

  desc 'run', 'Run a rake task (or a login shell if none given) inside docker'
  # NOTE: We can't override run() (which is reserved by Thor). Luckily, Thor only
  # checks for the first N necessary characters to match a command with a
  # method. Cf. Thor::normalize_command_name()
  #
  # Mounts this file's directory at /sitediff inside the container.
  def run_(task = 'bash')
    finish_exec(task, ['-t', "-v #{File.dirname(__FILE__)}:/sitediff"])
  end

  desc 'compose', 'Run a task inside docker without volume mounting (not supported with compose)'
  # Same as run_, but without the volume mount.
  def compose(task = 'bash')
    finish_exec(task, ['-t'])
  end

  no_commands do
    # Assembles the `docker run` command line, prints it, then executes it.
    #
    # For 'bash' the container is started interactively ('-i' is appended to
    # +docker_opts+); for any other task the thor command is run inside the
    # container with the local flag passed down.
    def finish_exec(task, docker_opts)
      cmd =
        if task == 'bash'
          docker_opts << '-i'
          'bash'
        else
          # pass down the local flag to docker command
          "#{executable('thor')} #{task} #{@local ? '--local' : '--no-local'}"
        end
      command_line = "docker run #{docker_opts.join(' ')} #{IMAGE} #{cmd}"
      puts command_line
      run command_line
    end
  end
end
|
78
|
+
|
79
|
+
# Thor tasks that run the RSpec suites.
class Spec < Base
  desc 'unit', 'run RSpec unit tests'
  # Run RSpec unit tests (spec/unit), echoing the command line first.
  def unit
    command = "#{executable('rspec')} spec/unit"
    puts command
    run command
  end

  desc 'fixture', 'run RSpec integration tests'
  # Run RSpec integration tests (spec/fixtures), echoing the command line
  # first. The echoed line is built from the same string that is executed,
  # so the two cannot drift apart.
  def fixture
    command = "#{executable('rspec')} spec/fixtures"
    puts command
    run command
  end

  desc 'all', 'runs both unit and fixture tests', hide: true
  # hidden task to lump together multiple tasks
  def all
    unit
    fixture
  end
  default_task :all
end
|
103
|
+
|
104
|
+
# Thor tasks that run sitediff against the bundled fixtures.
class Fixture < Base
  desc 'local', 'Run a sitediff test case'
  # Runs `sitediff diff` without cache against the CLI fixture config.
  def local
    run "#{executable('sitediff')} diff --cached=none spec/fixtures/cli/config.yaml"
  end

  desc 'http', 'Run a sitediff test case, using web servers'
  # Runs the same diff against a FixtureServer, then kills the server.
  def http
    server = http_fixtures(
      "#{executable('sitediff')} diff --cached=none spec/fixtures/cli/config.yaml"
    )
    server.kill
  end

  desc 'serve', 'Serve the result of the fixture test'
  # Runs a diff against a FixtureServer and then serves the 'sitediff'
  # directory through a ResultServer, blocking until it finishes.
  #
  # NOTE(review): this task reads spec/unit/cli/config.yaml while the other
  # tasks read spec/fixtures/cli/config.yaml, and the fixture server is not
  # killed here - confirm both are intentional.
  def serve
    diff_command = [
      executable('sitediff'),
      'diff',
      '--cached=none',
      '--paths-file=spec/sites/ruby-doc.org/paths.txt',
      'spec/unit/cli/config.yaml'
    ].join(' ')
    http_fixtures(diff_command)
    SiteDiff::Webserver::ResultServer.new(nil, 'sitediff', quiet: true).wait
  end

  private

  # Starts a FixtureServer, runs +cmd+ with --before/--after pointing at the
  # server's two sites, and returns the server so the caller can stop it.
  def http_fixtures(cmd)
    SiteDiff::Webserver::FixtureServer.new.tap do |server|
      run "#{cmd} --before #{server.before} --after #{server.after}"
    end
  end
end
|
data/bin/sitediff
CHANGED
@@ -2,8 +2,15 @@
|
|
2
2
|
# frozen_string_literal: true
|
3
3
|
|
4
4
|
# when run as gem, $0 is /usr/local/bin/sitediff not this file
|
5
|
-
|
5
|
+
if $PROGRAM_NAME == __FILE__
|
6
|
+
$LOAD_PATH.unshift File.expand_path('../lib', __dir__)
|
7
|
+
end
|
6
8
|
|
7
9
|
require 'sitediff/cli'
|
8
10
|
|
9
|
-
|
11
|
+
begin
|
12
|
+
SiteDiff::Cli.start
|
13
|
+
rescue Interrupt
|
14
|
+
puts("\n")
|
15
|
+
SiteDiff.log('Stopping. Interrupted by user.')
|
16
|
+
end
|
data/config/.gitkeep
ADDED
File without changes
|
@@ -0,0 +1,81 @@
|
|
1
|
+
# Include other configuration files, merging them with this file.
|
2
|
+
includes:
|
3
|
+
- extra-rules.yaml
|
4
|
+
|
5
|
+
# Settings.
|
6
|
+
#
|
7
|
+
# If you use "sitediff init" with the right parameters, it will generate
|
8
|
+
# this section for you.
|
9
|
+
settings:
|
10
|
+
# Crawl 2 levels deep.
|
11
|
+
depth: 2
|
12
|
+
# Wait for 250ms between requests.
|
13
|
+
interval: 250
|
14
|
+
# Make only 1 request at a time - no simultaneous requests.
|
15
|
+
# Concurrency has to be one when an interval is set.
|
16
|
+
concurrency: 1
|
17
|
+
# Don't follow links to PDF files.
|
18
|
+
exclude: '.*\.pdf'
|
19
|
+
# Curl options, if any.
|
20
|
+
curl_opts:
|
21
|
+
max_recv_speed_large: 10000
|
22
|
+
|
23
|
+
# Rules under this element apply only to the 'before' site.
|
24
|
+
before:
|
25
|
+
# URL of the 'before' version of the site.
|
26
|
+
url: http://localhost/old
|
27
|
+
|
28
|
+
# Sanitizations and DOM transformations, just like the general ones
|
29
|
+
# demonstrated below, but applied only to the 'before' site.
|
30
|
+
dom_transform:
|
31
|
+
- title: Example
|
32
|
+
type: remove
|
33
|
+
selector: div.updates-required
|
34
|
+
|
35
|
+
# Rules under this element apply only to the 'after' site.
|
36
|
+
after:
|
37
|
+
# URL of the 'after' version of the site.
|
38
|
+
url: http://localhost/new
|
39
|
+
|
40
|
+
# The root element to compare.
|
41
|
+
#
|
42
|
+
# Usually, sitediff compares the HTML of the entire page. If you'd rather
|
43
|
+
# check just a subset of the page, specify a selector here. For example, the
|
44
|
+
# line below causes only the body to be compared, ignoring the HTML head.
|
45
|
+
selector: 'body'
|
46
|
+
|
47
|
+
# General regular expression rules, applied to both versions of the site.
|
48
|
+
sanitization:
|
49
|
+
# Normalize input tags containing random tokens.
|
50
|
+
- title: Remove form-build-id
|
51
|
+
pattern: '<input type="hidden" name="form_build_id" value="form-[a-zA-Z0-9_-]+" *\/?>'
|
52
|
+
substitute: '<input type="hidden" name="form_build_id" value="__form_build_id__">'
|
53
|
+
|
54
|
+
# Replace meta property="twitter:*" with meta name="twitter:*".
|
55
|
+
- title: Meta 'property' changed to 'name'
|
56
|
+
pattern: 'property="twitter:'
|
57
|
+
substitute: 'name="twitter:'
|
58
|
+
# 'selector' limits this rule to only within the selected elements.
|
59
|
+
selector: meta
|
60
|
+
# 'path' limits this rule to only certain pages.
|
61
|
+
path: /user
|
62
|
+
|
63
|
+
# General DOM transforms, applied to both versions of the site.
|
64
|
+
dom_transform:
|
65
|
+
# Remove article elements, replacing them with their content
|
66
|
+
- title: Unwrap article elements
|
67
|
+
type: unwrap
|
68
|
+
selector: article
|
69
|
+
|
70
|
+
# Remove classes from divs
|
71
|
+
- title: Remove classes bar and baz from divs
|
72
|
+
type: remove_class
|
73
|
+
selector: div
|
74
|
+
class:
|
75
|
+
- class-bar
|
76
|
+
- class-baz
|
77
|
+
|
78
|
+
# Remove a div by its ID.
|
79
|
+
- title: Remove block containing current time.
|
80
|
+
type: remove
|
81
|
+
selector: div#block-time
|
data/lib/sitediff/api.rb
ADDED
@@ -0,0 +1,276 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'sitediff'
|
4
|
+
require 'sitediff/cache'
|
5
|
+
require 'sitediff/config'
|
6
|
+
require 'sitediff/config/creator'
|
7
|
+
require 'sitediff/config/preset'
|
8
|
+
require 'sitediff/fetch'
|
9
|
+
require 'sitediff/webserver/resultserver'
|
10
|
+
|
11
|
+
class SiteDiff
|
12
|
+
##
|
13
|
+
# Sitediff API interface.
|
14
|
+
class Api
|
15
|
+
##
# Creates an Api bound to a working directory.
#
# +directory+ is passed through get_dir (which creates it when missing)
# and the resulting path string is stored in @dir. +config_file+ and that
# path are handed to SiteDiff::Config.new; the result is kept in @config.
def initialize(directory, config_file = nil)
  @dir = get_dir(directory)
  @config = SiteDiff::Config.new(config_file, @dir)
end
|
21
|
+
|
22
|
+
##
# Initialize a SiteDiff project.
#
# Builds a Config::Creator from options[:debug], options[:before_url] and
# options[:after_url], asks it to create the configuration and logs the
# location of the created config file.
#
# Calling:
#   SiteDiff::Api.init(
#     depth: 3,
#     directory: 'sitediff',
#     concurrency: 3,
#     interval: 0,
#     include: nil,
#     exclude: '*.pdf',
#     preset: 'drupal',
#     curl_opts: {timeout: 60},
#     crawl: false
#   )
def self.init(options)
  # Prepare a config object and write it to the file system.
  creator = SiteDiff::Config::Creator.new(
    options[:debug],
    options[:before_url],
    options[:after_url]
  )
  settings = {
    depth: options[:depth],
    directory: options[:directory],
    concurrency: options[:concurrency],
    interval: options[:interval],
    # include/exclude are converted with Config.create_regexp first.
    include: Config.create_regexp(options[:include]),
    exclude: Config.create_regexp(options[:exclude]),
    preset: options[:preset],
    curl_opts: options[:curl_opts]
  }
  creator.create(**settings)
  SiteDiff.log "Created #{creator.config_file.expand_path}", :success

  # TODO: implement crawl ^^^
  # Discover paths, if enabled.
  # if options[:crawl]
  #   crawl(creator.config_file)
  #   SiteDiff.log 'You can now run "sitediff diff".', :success
  # else
  #   SiteDiff.log 'Run "sitediff crawl" to discover paths. You should then be able to run "sitediff diff".', :info
  # end
end
|
63
|
+
|
64
|
+
##
# Diff the `before` and `after`.
#
# Applies the given overrides to the configuration, prepares the cache,
# runs SiteDiff and writes the requested report. Configuration errors
# (Config::InvalidConfig, Config::ConfigNotFound) are logged rather than
# raised. When options[:cli_mode] is set and no exception occurred, the
# process exits with 2 if any path failed and 0 otherwise.
#
# Calling:
#   Api.diff(
#     paths: options['paths'],
#     paths_file: options['paths-file'],
#     ignore_whitespace: options['ignore-whitespace'],
#     export: options['export'],
#     before: options['before'],
#     after: options['after'],
#     cached: options['cached'],
#     verbose: options['verbose'],
#     report_format: options['report-format'],
#     before_report: options['before-report'],
#     after_report: options['after-report'],
#     cli_mode: false
#   )
def diff(options)
  @config.ignore_whitespace = options[:ignore_whitespace]
  @config.export = options[:export]
  # Apply "paths" override, if any.
  if options[:paths]
    @config.paths = options[:paths]
  else
    # Fall back to the default paths file inside the working directory.
    paths_file = options[:paths_file]
    paths_file ||= File.join(@dir, Config::DEFAULT_PATHS_FILENAME)
    paths_file = File.expand_path(paths_file)

    paths_count = @config.paths_file_read(paths_file)
    SiteDiff.log "Read #{paths_count} paths from: #{paths_file}"
  end

  # TODO: Why do we allow before and after override during diff?
  @config.before['url'] = options[:before] if options[:before]
  @config.after['url'] = options[:after] if options[:after]

  # Prepare cache.
  cache = SiteDiff::Cache.new(
    create: options[:cached] != 'none',
    directory: @dir
  )
  cache.read_tags << :before if %w[before all].include?(options[:cached])
  cache.read_tags << :after if %w[after all].include?(options[:cached])
  cache.write_tags << :before << :after

  # Run sitediff.
  sitediff = SiteDiff.new(
    @config,
    cache,
    verbose: options[:verbose],
    debug: options[:debug]
  )
  num_failing = sitediff.run
  exit_code = num_failing.positive? ? 2 : 0

  # Generate HTML report.
  if options[:report_format] == 'html' || @config.export
    sitediff.report.generate_html(
      @dir,
      options[:before_report],
      options[:after_report]
    )
  end

  # Generate JSON report. Any falsy export value (nil as well as false)
  # counts as "not exporting", so callers that omit :export still get it.
  if options[:report_format] == 'json' && !@config.export
    sitediff.report.generate_json @dir
  end

  SiteDiff.log 'Run "sitediff serve" to see a report.' unless options[:export]
rescue Config::InvalidConfig, Config::ConfigNotFound => e
  SiteDiff.log "Invalid configuration: #{e.message}", :error
  SiteDiff.log e.backtrace, :error if options[:verbose]
else # no exception was raised
  # Thor::Error  --> exit(1), guaranteed by exit_on_failure?
  # Failing diff --> exit(2), populated above
  exit(exit_code) if options[:cli_mode]
end
|
146
|
+
|
147
|
+
##
# Crawl the configured root URLs to determine `paths`.
#
# One Crawler is queued per entry of @config.roots on a shared Typhoeus
# Hydra. Every visited page is handed to after_crawl. Afterwards the
# union of all discovered paths is sorted, written through
# @config.paths_file_write and summarised in the log.
def crawl
  # Prepare cache.
  @cache = SiteDiff::Cache.new(create: true, directory: @dir)
  @cache.write_tags << :before << :after

  # Crawl with Hydra to discover paths.
  hydra = Typhoeus::Hydra.new(max_concurrency: @config.setting(:concurrency))
  @paths = {}
  @config.roots.each do |tag, url|
    crawl_settings = [
      @config.setting(:interval),
      @config.setting(:include),
      @config.setting(:exclude),
      @config.setting(:depth),
      @config.curl_opts
    ]
    # NOTE(review): @debug is not assigned anywhere in this class as shown,
    # so this is presumably always nil - confirm.
    Crawler.new(hydra, url, *crawl_settings, debug: @debug) do |info|
      SiteDiff.log "Visited #{info.uri}, cached."
      after_crawl(tag, info)
    end
  end
  hydra.run

  # Write paths to a file.
  @paths = @paths.values.inject { |merged, found| merged | found }.to_a.sort
  @config.paths_file_write(@paths)

  # Log output.
  paths_file = Pathname.new(@dir) + Config::DEFAULT_PATHS_FILENAME
  SiteDiff.log ''
  SiteDiff.log "#{@paths.length} page(s) found."
  SiteDiff.log "Created #{paths_file.expand_path}.", :success, 'done'
end
|
189
|
+
|
190
|
+
##
# Serves SiteDiff report for accessing in the browser.
#
# Opens the cache for reading both tags, starts a ResultServer on
# options[:port] for the working directory and waits on it. A
# SiteDiffException is logged instead of raised; its backtrace is logged
# too when options[:verbose] is set.
#
# Calling:
#   api.serve(browse: true, port: 13080)
def serve(options)
  @cache = Cache.new(directory: @dir)
  @cache.read_tags << :before << :after

  server = SiteDiff::Webserver::ResultServer.new(
    options[:port],
    @dir,
    browse: options[:browse],
    cache: @cache,
    config: @config
  )
  server.wait
rescue SiteDiffException => e
  SiteDiff.log e.message, :error
  SiteDiff.log e.backtrace, :error if options[:verbose]
end
|
210
|
+
|
211
|
+
##
# Fetches the configured paths from one site and caches the results.
#
# The site is options[:url] when given, otherwise the configured `after`
# URL. It is passed to SiteDiff::Fetch as the `before` base, and the cache
# is opened with only the :before write tag. Each visited path is logged.
def store(options)
  # TODO: Figure out how to remove this config.validate call.
  @config.validate(need_before: false)
  @config.paths_file_read

  @cache = SiteDiff::Cache.new(directory: @dir, create: true)
  @cache.write_tags << :before

  base = options[:url] || @config.after['url']
  fetch_args = [
    @cache,
    @config.paths,
    @config.setting(:interval),
    @config.setting(:concurrency),
    get_curl_opts(@config.settings),
    options[:debug]
  ]
  fetcher = SiteDiff::Fetch.new(*fetch_args, before: base)
  fetcher.run { |path, _res| SiteDiff.log "Visited #{path}, cached" }
end
|
233
|
+
|
234
|
+
private
|
235
|
+
|
236
|
+
##
# Ensures that the given directory exists.
#
# A nil +directory+ means the current directory ('.'). The Pathname is
# stored in @dir, the directory is created (including parents) when it is
# missing, and the path is returned as a String.
def get_dir(directory)
  # Create the dir. Must go before cache initialization!
  root = Pathname.new(directory || '.')
  @dir = root
  root.mkpath unless root.directory?
  root.to_s
end
|
244
|
+
|
245
|
+
##
# Processes a crawled path.
#
# Canonicalizes the page's relative URI, records it in @paths under +tag+
# and stores the page's result in the cache: under :before when +tag+ is
# 'before', under :after when it is 'after'.
def after_crawl(tag, info)
  path = UriWrapper.canonicalize(info.relative)

  # Register the path.
  (@paths[tag] ||= []) << path

  result = info.read_result

  # Write result to applicable cache.
  # @cache.set(tag, path, result)
  @cache.set(:before, path, result) if tag == 'before'
  @cache.set(:after, path, result) if tag == 'after'

  # TODO: Restore application of rules.
  # @rules.handle_page(tag, res.content, info.document) if @rules && !res.error
end
|
264
|
+
|
265
|
+
# Builds the curl options hash for a fetch.
#
# Starts from UriWrapper::DEFAULT_CURL_OPTS, overlays options['curl_options']
# and then options['curl_opts'] (later entries win), and converts the
# string values 'true' / 'false' into real booleans.
def get_curl_opts(options)
  # We do want string keys here
  booleans = { 'true' => true, 'false' => false }
  overrides = [options['curl_options'] || {}, options['curl_opts'] || {}]
  merged = overrides.inject(UriWrapper::DEFAULT_CURL_OPTS.clone) do |acc, extra|
    acc.merge(extra)
  end
  merged.each_with_object({}) do |(name, value), converted|
    converted[name] = booleans.fetch(value, value)
  end
end
|
275
|
+
end
|
276
|
+
end
|
data/lib/sitediff/cache.rb
CHANGED
@@ -4,28 +4,45 @@ require 'set'
|
|
4
4
|
require 'fileutils'
|
5
5
|
|
6
6
|
class SiteDiff
|
7
|
+
# SiteDiff Cache Handler.
|
7
8
|
class Cache
|
9
|
+
TIMESTAMP_FILE = 'timestamp'
|
10
|
+
|
8
11
|
attr_accessor :read_tags, :write_tags
|
9
12
|
|
13
|
+
##
|
14
|
+
# Creates a Cache object.
|
10
15
|
def initialize(opts = {})
|
11
16
|
@create = opts[:create]
|
12
17
|
|
13
|
-
# Read and Write tags are sets that can contain :before and :after
|
14
|
-
# They indicate whether we should use the cache for reading or writing
|
18
|
+
# Read and Write tags are sets that can contain :before and :after.
|
19
|
+
# They indicate whether we should use the cache for reading or writing.
|
15
20
|
@read_tags = Set.new
|
16
21
|
@write_tags = Set.new
|
22
|
+
@timestamp_flag = { before: false, after: false }
|
23
|
+
|
24
|
+
# The directory used by the cache for storage.
|
17
25
|
@dir = opts[:directory] || '.'
|
18
26
|
end
|
19
27
|
|
28
|
+
##
|
20
29
|
# Is a tag cached?
|
30
|
+
# TODO: Rename it to is_cached? as it makes more sense.
|
21
31
|
def tag?(tag)
  # A tag counts as cached when <dir>/snapshot/<tag> exists as a directory.
  snapshot_dir = File.join(@dir, 'snapshot', tag.to_s)
  File.directory?(snapshot_dir)
end
|
24
34
|
|
35
|
+
##
|
36
|
+
# Get data from cache.
|
25
37
|
def get(tag, path)
|
26
38
|
return nil unless @read_tags.include? tag
|
27
39
|
|
28
|
-
filename = File.join(
|
40
|
+
filename = File.join(
|
41
|
+
@dir,
|
42
|
+
'snapshot',
|
43
|
+
tag.to_s,
|
44
|
+
*path.split(File::SEPARATOR)
|
45
|
+
)
|
29
46
|
|
30
47
|
filename = File.join(filename, 'index.html') if File.directory?(filename)
|
31
48
|
return nil unless File.file? filename
|
@@ -33,10 +50,18 @@ class SiteDiff
|
|
33
50
|
Marshal.load(File.read(filename))
|
34
51
|
end
|
35
52
|
|
53
|
+
##
|
54
|
+
# Set data to cache.
|
36
55
|
def set(tag, path, result)
|
37
56
|
return unless @write_tags.include? tag
|
38
57
|
|
39
|
-
|
58
|
+
save_timestamp(tag)
|
59
|
+
filename = File.join(
|
60
|
+
@dir,
|
61
|
+
'snapshot',
|
62
|
+
tag.to_s,
|
63
|
+
*path.split(File::SEPARATOR)
|
64
|
+
)
|
40
65
|
|
41
66
|
filename = File.join(filename, 'index.html') if File.directory?(filename)
|
42
67
|
filepath = Pathname.new(filename)
|
@@ -46,32 +71,56 @@ class SiteDiff
|
|
46
71
|
rescue Errno::EEXIST
|
47
72
|
curdir = filepath
|
48
73
|
curdir = curdir.parent until curdir.exist?
|
49
|
-
tempname = curdir.dirname
|
74
|
+
tempname = "#{curdir.dirname}/#{curdir.basename}.temporary"
|
75
|
+
# tempname = curdir.dirname + (curdir.basename.to_s + '.temporary')
|
50
76
|
# May cause problems if action is not atomic!
|
51
77
|
# Move existing file to dir/index.html first
|
52
78
|
# Not robust! Should generate an UUID or something.
|
53
|
-
|
79
|
+
if File.exist?(tempname)
|
80
|
+
SiteDiff.log "Overwriting file #{tempname}", :warning
|
81
|
+
end
|
54
82
|
curdir.rename(tempname)
|
55
83
|
filepath.dirname.mkpath
|
56
84
|
# Should only happen in strange situations such as when the path
|
57
85
|
# is foo/index.html/bar (i.e., index.html is a directory)
|
58
|
-
|
59
|
-
|
86
|
+
if File.exist?("#{curdir}/index.html")
|
87
|
+
SiteDiff.log "Overwriting file #{tempname}", :warning
|
88
|
+
end
|
89
|
+
File.rename(tempname, "#{curdir}/index.html")
|
90
|
+
# tempname.rename(curdir + 'index.html')
|
60
91
|
end
|
61
92
|
end
|
62
93
|
File.open(filename, 'w') { |file| file.write(Marshal.dump(result)) }
|
63
94
|
end
|
64
95
|
|
96
|
+
##
|
97
|
+
# TODO: Document this or remove it if unused.
|
65
98
|
def key(tag, path)
  # Ensure encoding stays the same!
  utf8_path = path.encode('UTF-8')
  # The key is the marshalled [tag, path] pair.
  Marshal.dump([tag, utf8_path])
end
|
69
102
|
|
103
|
+
##
|
104
|
+
# Ensures that a directory exists.
|
70
105
|
def get_dir(directory)
  # Create the dir. Must go before cache initialization!
  # nil falls back to the current directory; the Pathname is kept in @dir
  # and the path is returned as a String.
  root = Pathname.new(directory || '.')
  @dir = root
  root.mkpath unless root.directory?
  root.to_s
end
|
111
|
+
|
112
|
+
private
|
113
|
+
|
114
|
+
# Touches the timestamp file of +tag+'s snapshot directory, at most once
# per tag for the lifetime of this object.
#
# The snapshot directory is created first when it does not exist yet.
# Without that, a fresh cache would never get a timestamp: the run-once
# flag is set on the first call, before any snapshot has been written.
def save_timestamp(tag)
  # run once
  return if @timestamp_flag[tag]

  @timestamp_flag[tag] = true
  cache_dir = File.join(@dir, 'snapshot', tag.to_s)
  FileUtils.mkdir_p(cache_dir)
  FileUtils.touch(File.join(cache_dir, TIMESTAMP_FILE))
end
|
76
125
|
end
|
77
126
|
end
|