sitediff 1.0.0 → 1.1.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/sitediff.rb +3 -1
- data/lib/sitediff/api.rb +265 -0
- data/lib/sitediff/cache.rb +18 -0
- data/lib/sitediff/cli.rb +58 -214
- data/lib/sitediff/config.rb +78 -6
- data/lib/sitediff/config/creator.rb +8 -6
- data/lib/sitediff/crawler.rb +9 -9
- data/lib/sitediff/diff.rb +5 -1
- data/lib/sitediff/files/report.html.erb +35 -8
- data/lib/sitediff/files/sitediff.css +78 -1
- data/lib/sitediff/files/sitediff.js +204 -13
- data/lib/sitediff/report.rb +17 -1
- data/lib/sitediff/result.rb +2 -0
- data/lib/sitediff/sanitize.rb +49 -1
- data/lib/sitediff/uriwrapper.rb +4 -1
- metadata +7 -6
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: f75892f718764c8fd2c18d7f3f7e7cf8908d60ea07c2a765510c8ef409b9f0c1
|
4
|
+
data.tar.gz: 3b3744eca0dda04821152aab596fb67891204a1599b4db72e13b4af484693e65
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 97e9098b290742f1b3efe3c284e9392be95ffd0f7576df413a6ec612142b0573acf8b8b4d43369961c154d801db6284fcc1a8d69cea7da8ed99b64a0a1f1af75
|
7
|
+
data.tar.gz: c4b0e93bc4e0acb3d675c8d675d8f6235035aae72421794495f25223cb086eaa4c87d2cde63caa0eda257b0d91f374a0efbbb416ef8ee88c2f0ffde89a608831
|
data/lib/sitediff.rb
CHANGED
@@ -108,7 +108,9 @@ class SiteDiff
|
|
108
108
|
encoding = read_results[tag].encoding
|
109
109
|
if encoding || html.length.positive?
|
110
110
|
section = @config.send(tag, true)
|
111
|
-
|
111
|
+
opts = { path: path }
|
112
|
+
opts[:output] = @config.output if @config.output
|
113
|
+
Sanitizer.new(html, section, opts).sanitize
|
112
114
|
else
|
113
115
|
html
|
114
116
|
end
|
data/lib/sitediff/api.rb
ADDED
@@ -0,0 +1,265 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'sitediff'
|
4
|
+
require 'sitediff/cache'
|
5
|
+
require 'sitediff/config'
|
6
|
+
require 'sitediff/config/creator'
|
7
|
+
require 'sitediff/config/preset'
|
8
|
+
require 'sitediff/fetch'
|
9
|
+
require 'sitediff/webserver/resultserver'
|
10
|
+
|
11
|
+
class SiteDiff
|
12
|
+
##
|
13
|
+
# Sitediff API interface.
|
14
|
+
class Api
|
15
|
+
##
|
16
|
+
# Initializes new Api object.
|
17
|
+
def initialize(directory, config_file = nil)
|
18
|
+
@dir = get_dir(directory)
|
19
|
+
@config = SiteDiff::Config.new(config_file, @dir)
|
20
|
+
end
|
21
|
+
|
22
|
+
##
|
23
|
+
# Initialize a SiteDiff project.
|
24
|
+
#
|
25
|
+
# Calling:
|
26
|
+
# SiteDiff::Api.init(
|
27
|
+
# depth: 3,
|
28
|
+
# directory: 'sitediff',
|
29
|
+
# concurrency: 3,
|
30
|
+
# interval: 0,
|
31
|
+
# include: nil,
|
32
|
+
# exclude: '*.pdf',
|
33
|
+
# preset: 'drupal',
|
34
|
+
# curl_opts: {timeout: 60},
|
35
|
+
# crawl: false
|
36
|
+
# )
|
37
|
+
def self.init(options)
|
38
|
+
# Prepare a config object and write it to the file system.
|
39
|
+
creator = SiteDiff::Config::Creator.new(options[:debug], options[:before_url], options[:after_url])
|
40
|
+
include_regex = Config.create_regexp(options[:include])
|
41
|
+
exclude_regex = Config.create_regexp(options[:exclude])
|
42
|
+
creator.create(
|
43
|
+
depth: options[:depth],
|
44
|
+
directory: options[:directory],
|
45
|
+
concurrency: options[:concurrency],
|
46
|
+
interval: options[:interval],
|
47
|
+
include: include_regex,
|
48
|
+
exclude: exclude_regex,
|
49
|
+
preset: options[:preset],
|
50
|
+
curl_opts: options[:curl_opts]
|
51
|
+
)
|
52
|
+
SiteDiff.log "Created #{creator.config_file.expand_path}", :success
|
53
|
+
|
54
|
+
# TODO: implement crawl ^^^
|
55
|
+
# Discover paths, if enabled.
|
56
|
+
# if options[:crawl]
|
57
|
+
# crawl(creator.config_file)
|
58
|
+
# SiteDiff.log 'You can now run "sitediff diff".', :success
|
59
|
+
# else
|
60
|
+
# SiteDiff.log 'Run "sitediff crawl" to discover paths. You should then be able to run "sitediff diff".', :info
|
61
|
+
# end
|
62
|
+
end
|
63
|
+
|
64
|
+
##
|
65
|
+
# Diff the `before` and `after`.
|
66
|
+
#
|
67
|
+
# Calling:
|
68
|
+
# Api.diff(
|
69
|
+
# paths: options['paths'],
|
70
|
+
# paths_file: options['paths-file'],
|
71
|
+
# ignore_whitespace: options['ignore-whitespace'],
|
72
|
+
# export: options['export'],
|
73
|
+
# before: options['before'],
|
74
|
+
# after: options['after'],
|
75
|
+
# cached: options['cached'],
|
76
|
+
# verbose: options['verbose'],
|
77
|
+
# report_format: options['report-format'],
|
78
|
+
# before_report: options['before-report'],
|
79
|
+
# after_report: options['after-report'],
|
80
|
+
# cli_mode: false
|
81
|
+
# )
|
82
|
+
def diff(options)
|
83
|
+
@config.ignore_whitespace = options[:ignore_whitespace]
|
84
|
+
@config.export = options[:export]
|
85
|
+
# Apply "paths" override, if any.
|
86
|
+
if options[:paths]
|
87
|
+
@config.paths = options[:paths]
|
88
|
+
else
|
89
|
+
paths_file = options[:paths_file]
|
90
|
+
paths_file ||= File.join(@dir, Config::DEFAULT_PATHS_FILENAME)
|
91
|
+
paths_file = File.expand_path(paths_file)
|
92
|
+
|
93
|
+
paths_count = @config.paths_file_read(paths_file)
|
94
|
+
SiteDiff.log "Read #{paths_count} paths from: #{paths_file}"
|
95
|
+
end
|
96
|
+
|
97
|
+
# TODO: Why do we allow before and after override during diff?
|
98
|
+
@config.before['url'] = options[:before] if options[:before]
|
99
|
+
@config.after['url'] = options[:after] if options[:after]
|
100
|
+
|
101
|
+
# Prepare cache.
|
102
|
+
cache = SiteDiff::Cache.new(
|
103
|
+
create: options[:cached] != 'none',
|
104
|
+
directory: @dir
|
105
|
+
)
|
106
|
+
cache.read_tags << :before if %w[before all].include?(options[:cached])
|
107
|
+
cache.read_tags << :after if %w[after all].include?(options[:cached])
|
108
|
+
cache.write_tags << :before << :after
|
109
|
+
|
110
|
+
# Run sitediff.
|
111
|
+
sitediff = SiteDiff.new(
|
112
|
+
@config,
|
113
|
+
cache,
|
114
|
+
options[:verbose],
|
115
|
+
options[:debug]
|
116
|
+
)
|
117
|
+
num_failing = sitediff.run
|
118
|
+
exit_code = num_failing.positive? ? 2 : 0
|
119
|
+
|
120
|
+
# Generate HTML report.
|
121
|
+
if options[:report_format] == 'html' || @config.export
|
122
|
+
sitediff.report.generate_html(
|
123
|
+
@dir,
|
124
|
+
options[:before_report],
|
125
|
+
options[:after_report]
|
126
|
+
)
|
127
|
+
end
|
128
|
+
|
129
|
+
# Generate JSON report.
|
130
|
+
if options[:report_format] == 'json' && @config.export == false
|
131
|
+
sitediff.report.generate_json @dir
|
132
|
+
end
|
133
|
+
|
134
|
+
SiteDiff.log 'Run "sitediff serve" to see a report.' unless options[:export]
|
135
|
+
rescue Config::InvalidConfig => e
|
136
|
+
SiteDiff.log "Invalid configuration: #{e.message}", :error
|
137
|
+
SiteDiff.log e.backtrace, :error if options[:verbose]
|
138
|
+
rescue Config::ConfigNotFound => e
|
139
|
+
SiteDiff.log "Invalid configuration: #{e.message}", :error
|
140
|
+
SiteDiff.log e.backtrace, :error if options[:verbose]
|
141
|
+
else # no exception was raised
|
142
|
+
# Thor::Error --> exit(1), guaranteed by exit_on_failure?
|
143
|
+
# Failing diff --> exit(2), populated above
|
144
|
+
exit(exit_code) if options[:cli_mode]
|
145
|
+
end
|
146
|
+
|
147
|
+
##
|
148
|
+
# Crawl the `before` site to determine `paths`.
|
149
|
+
def crawl
|
150
|
+
# Prepare cache.
|
151
|
+
@cache = SiteDiff::Cache.new(
|
152
|
+
create: true,
|
153
|
+
directory: @dir
|
154
|
+
)
|
155
|
+
@cache.write_tags << :before << :after
|
156
|
+
|
157
|
+
# Crawl with Hydra to discover paths.
|
158
|
+
hydra = Typhoeus::Hydra.new(
|
159
|
+
max_concurrency: @config.setting(:concurrency)
|
160
|
+
)
|
161
|
+
@paths = {}
|
162
|
+
@config.roots.each do |tag, url|
|
163
|
+
Crawler.new(
|
164
|
+
hydra,
|
165
|
+
url,
|
166
|
+
@config.setting(:interval),
|
167
|
+
@config.setting(:include),
|
168
|
+
@config.setting(:exclude),
|
169
|
+
@config.setting(:depth),
|
170
|
+
@config.curl_opts,
|
171
|
+
@debug
|
172
|
+
) do |info|
|
173
|
+
SiteDiff.log "Visited #{info.uri}, cached."
|
174
|
+
after_crawl(tag, info)
|
175
|
+
end
|
176
|
+
end
|
177
|
+
hydra.run
|
178
|
+
|
179
|
+
# Write paths to a file.
|
180
|
+
@paths = @paths.values.reduce(&:|).to_a.sort
|
181
|
+
@config.paths_file_write(@paths)
|
182
|
+
|
183
|
+
# Log output.
|
184
|
+
file = Pathname.new(@dir) + Config::DEFAULT_PATHS_FILENAME
|
185
|
+
SiteDiff.log ''
|
186
|
+
SiteDiff.log "#{@paths.length} page(s) found."
|
187
|
+
SiteDiff.log "Created #{file.expand_path}.", :success, 'done'
|
188
|
+
end
|
189
|
+
|
190
|
+
##
|
191
|
+
# Serves SiteDiff report for accessing in the browser.
|
192
|
+
#
|
193
|
+
# Calling:
|
194
|
+
# api.serve(browse: true, port: 13080)
|
195
|
+
def serve(options)
|
196
|
+
@cache = Cache.new(directory: @dir)
|
197
|
+
@cache.read_tags << :before << :after
|
198
|
+
|
199
|
+
SiteDiff::Webserver::ResultServer.new(
|
200
|
+
options[:port],
|
201
|
+
@dir,
|
202
|
+
browse: options[:browse],
|
203
|
+
cache: @cache,
|
204
|
+
config: @config
|
205
|
+
).wait
|
206
|
+
rescue SiteDiffException => e
|
207
|
+
SiteDiff.log e.message, :error
|
208
|
+
SiteDiff.log e.backtrace, :error if options[:verbose]
|
209
|
+
end
|
210
|
+
|
211
|
+
##
|
212
|
+
#
|
213
|
+
def store(options)
|
214
|
+
# TODO: Figure out how to remove this config.validate call.
|
215
|
+
@config.validate(need_before: false)
|
216
|
+
@config.paths_file_read
|
217
|
+
|
218
|
+
@cache = SiteDiff::Cache.new(directory: @dir, create: true)
|
219
|
+
@cache.write_tags << :before
|
220
|
+
|
221
|
+
base = options[:url] || @config.after['url']
|
222
|
+
fetcher = SiteDiff::Fetch.new(@cache,
|
223
|
+
@config.paths,
|
224
|
+
@config.setting(:interval),
|
225
|
+
@config.setting(:concurrency),
|
226
|
+
get_curl_opts(@config.settings),
|
227
|
+
options[:debug],
|
228
|
+
before: base)
|
229
|
+
fetcher.run do |path, _res|
|
230
|
+
SiteDiff.log "Visited #{path}, cached"
|
231
|
+
end
|
232
|
+
end
|
233
|
+
|
234
|
+
private
|
235
|
+
|
236
|
+
##
|
237
|
+
# Ensures that the given directory exists.
|
238
|
+
def get_dir(directory)
|
239
|
+
# Create the dir. Must go before cache initialization!
|
240
|
+
@dir = Pathname.new(directory || '.')
|
241
|
+
@dir.mkpath unless @dir.directory?
|
242
|
+
@dir.to_s
|
243
|
+
end
|
244
|
+
|
245
|
+
##
|
246
|
+
# Processes a crawled path.
|
247
|
+
def after_crawl(tag, info)
|
248
|
+
path = UriWrapper.canonicalize(info.relative)
|
249
|
+
|
250
|
+
# Register the path.
|
251
|
+
@paths[tag] = [] unless @paths[tag]
|
252
|
+
@paths[tag] << path
|
253
|
+
|
254
|
+
result = info.read_result
|
255
|
+
|
256
|
+
# Write result to applicable cache.
|
257
|
+
@cache.set(tag, path, result)
|
258
|
+
# If single-site, cache "after" as "before".
|
259
|
+
@cache.set(:before, path, result) unless @config.roots[:before]
|
260
|
+
|
261
|
+
# TODO: Restore application of rules.
|
262
|
+
# @rules.handle_page(tag, res.content, info.document) if @rules && !res.error
|
263
|
+
end
|
264
|
+
end
|
265
|
+
end
|
data/lib/sitediff/cache.rb
CHANGED
@@ -6,6 +6,8 @@ require 'fileutils'
|
|
6
6
|
class SiteDiff
|
7
7
|
# SiteDiff Cache Handler.
|
8
8
|
class Cache
|
9
|
+
TIMESTAMP_FILE = 'timestamp'
|
10
|
+
|
9
11
|
attr_accessor :read_tags, :write_tags
|
10
12
|
|
11
13
|
##
|
@@ -17,6 +19,7 @@ class SiteDiff
|
|
17
19
|
# They indicate whether we should use the cache for reading or writing.
|
18
20
|
@read_tags = Set.new
|
19
21
|
@write_tags = Set.new
|
22
|
+
@timestamp_flag = { before: false, after: false }
|
20
23
|
|
21
24
|
# The directory used by the cache for storage.
|
22
25
|
@dir = opts[:directory] || '.'
|
@@ -52,6 +55,7 @@ class SiteDiff
|
|
52
55
|
def set(tag, path, result)
|
53
56
|
return unless @write_tags.include? tag
|
54
57
|
|
58
|
+
save_timestamp(tag)
|
55
59
|
filename = File.join(
|
56
60
|
@dir,
|
57
61
|
'snapshot',
|
@@ -102,5 +106,19 @@ class SiteDiff
|
|
102
106
|
@dir.mkpath unless @dir.directory?
|
103
107
|
@dir.to_s
|
104
108
|
end
|
109
|
+
|
110
|
+
private
|
111
|
+
|
112
|
+
def save_timestamp(tag)
|
113
|
+
# run once
|
114
|
+
return if @timestamp_flag[tag]
|
115
|
+
|
116
|
+
@timestamp_flag[tag] = true
|
117
|
+
cache_dir = File.join(@dir, 'snapshot', tag.to_s)
|
118
|
+
if File.exist? cache_dir
|
119
|
+
file = File.join(cache_dir, TIMESTAMP_FILE)
|
120
|
+
FileUtils.touch(file)
|
121
|
+
end
|
122
|
+
end
|
105
123
|
end
|
106
124
|
end
|
data/lib/sitediff/cli.rb
CHANGED
@@ -2,16 +2,10 @@
|
|
2
2
|
|
3
3
|
require 'thor'
|
4
4
|
require 'sitediff'
|
5
|
-
require 'sitediff/cache'
|
6
|
-
require 'sitediff/config'
|
7
|
-
require 'sitediff/config/creator'
|
8
|
-
require 'sitediff/config/preset'
|
9
|
-
require 'sitediff/fetch'
|
10
|
-
require 'sitediff/webserver/resultserver'
|
5
|
+
require 'sitediff/api'
|
11
6
|
|
12
7
|
class SiteDiff
|
13
8
|
# SiteDiff CLI.
|
14
|
-
# TODO: Use config.defaults to feed default values for sitediff.yaml params?
|
15
9
|
class Cli < Thor
|
16
10
|
class_option 'directory',
|
17
11
|
type: :string,
|
@@ -78,7 +72,6 @@ class SiteDiff
|
|
78
72
|
enum: %w[html json],
|
79
73
|
default: 'html',
|
80
74
|
desc: 'The format in which a report should be generated.'
|
81
|
-
# TODO: Deprecate the parameters before-report / after-report?
|
82
75
|
option 'before-report',
|
83
76
|
type: :string,
|
84
77
|
desc: 'URL to use in reports. Useful if port forwarding.',
|
@@ -107,82 +100,31 @@ class SiteDiff
|
|
107
100
|
##
|
108
101
|
# Computes diffs.
|
109
102
|
def diff(config_file = nil)
|
110
|
-
@dir = get_dir(options['directory'])
|
111
|
-
config = SiteDiff::Config.new(config_file, @dir)
|
112
|
-
|
113
103
|
# Determine "paths" override based on options.
|
114
104
|
if options['paths'] && options['paths-file']
|
115
105
|
SiteDiff.log "Can't specify both --paths-file and --paths.", :error
|
116
106
|
exit(-1)
|
117
107
|
end
|
118
108
|
|
119
|
-
|
120
|
-
|
121
|
-
|
122
|
-
|
123
|
-
|
124
|
-
|
125
|
-
|
126
|
-
|
127
|
-
|
128
|
-
|
129
|
-
|
130
|
-
|
131
|
-
|
132
|
-
|
133
|
-
|
134
|
-
|
135
|
-
SiteDiff.log "Read #{paths_count} paths from: #{paths_file}"
|
136
|
-
end
|
137
|
-
|
138
|
-
# TODO: Why do we allow before and after override during diff?
|
139
|
-
config.before['url'] = options['before'] if options['before']
|
140
|
-
config.after['url'] = options['after'] if options['after']
|
141
|
-
|
142
|
-
# Prepare cache.
|
143
|
-
cache = SiteDiff::Cache.new(
|
144
|
-
create: options['cached'] != 'none',
|
145
|
-
directory: @dir
|
146
|
-
)
|
147
|
-
cache.read_tags << :before if %w[before all].include?(options['cached'])
|
148
|
-
cache.read_tags << :after if %w[after all].include?(options['cached'])
|
149
|
-
cache.write_tags << :before << :after
|
150
|
-
|
151
|
-
# Run sitediff.
|
152
|
-
sitediff = SiteDiff.new(
|
153
|
-
config,
|
154
|
-
cache,
|
155
|
-
options['verbose'],
|
156
|
-
options[:debug]
|
157
|
-
)
|
158
|
-
num_failing = sitediff.run
|
159
|
-
exit_code = num_failing.positive? ? 2 : 0
|
160
|
-
|
161
|
-
# Generate HTML report.
|
162
|
-
if options['report-format'] == 'html' || config.export
|
163
|
-
sitediff.report.generate_html(
|
164
|
-
@dir,
|
165
|
-
options['before-report'],
|
166
|
-
options['after-report']
|
109
|
+
api = Api.new(options['directory'], config_file)
|
110
|
+
api_options =
|
111
|
+
clean_keys(
|
112
|
+
options,
|
113
|
+
:paths,
|
114
|
+
:paths_file,
|
115
|
+
:ignore_whitespace,
|
116
|
+
:export,
|
117
|
+
:before,
|
118
|
+
:after,
|
119
|
+
:cached,
|
120
|
+
:verbose,
|
121
|
+
:debug,
|
122
|
+
:report_format,
|
123
|
+
:before_report,
|
124
|
+
:after_report
|
167
125
|
)
|
168
|
-
|
169
|
-
|
170
|
-
# Generate JSON report.
|
171
|
-
if options['report-format'] == 'json' && config.export == false
|
172
|
-
sitediff.report.generate_json @dir
|
173
|
-
end
|
174
|
-
|
175
|
-
SiteDiff.log 'Run "sitediff serve" to see a report.' unless options['export']
|
176
|
-
rescue Config::InvalidConfig => e
|
177
|
-
SiteDiff.log "Invalid configuration: #{e.message}", :error
|
178
|
-
SiteDiff.log e.backtrace, :error if options[:verbose]
|
179
|
-
rescue Config::ConfigNotFound => e
|
180
|
-
SiteDiff.log "Invalid configuration: #{e.message}", :error
|
181
|
-
SiteDiff.log e.backtrace, :error if options[:verbose]
|
182
|
-
else # no exception was raised
|
183
|
-
# Thor::Error --> exit(1), guaranteed by exit_on_failure?
|
184
|
-
# Failing diff --> exit(2), populated above
|
185
|
-
exit(exit_code)
|
126
|
+
api_options[:cli_mode] = true
|
127
|
+
api.diff(api_options)
|
186
128
|
end
|
187
129
|
|
188
130
|
option :port,
|
@@ -198,22 +140,9 @@ class SiteDiff
|
|
198
140
|
##
|
199
141
|
# Serves SiteDiff report for accessing in the browser.
|
200
142
|
def serve(config_file = nil)
|
201
|
-
|
202
|
-
|
203
|
-
|
204
|
-
cache = Cache.new(directory: @dir)
|
205
|
-
cache.read_tags << :before << :after
|
206
|
-
|
207
|
-
SiteDiff::Webserver::ResultServer.new(
|
208
|
-
options[:port],
|
209
|
-
options['directory'],
|
210
|
-
browse: options[:browse],
|
211
|
-
cache: cache,
|
212
|
-
config: config
|
213
|
-
).wait
|
214
|
-
rescue SiteDiffException => e
|
215
|
-
SiteDiff.log e.message, :error
|
216
|
-
SiteDiff.log e.backtrace, :error if options[:verbose]
|
143
|
+
api = Api.new(options['directory'], config_file)
|
144
|
+
api_options = clean_keys(options, :browse, :port)
|
145
|
+
api.serve(api_options)
|
217
146
|
end
|
218
147
|
|
219
148
|
option :depth,
|
@@ -236,19 +165,14 @@ class SiteDiff
|
|
236
165
|
type: :numeric,
|
237
166
|
default: Config::DEFAULT_CONFIG['settings']['interval'],
|
238
167
|
desc: 'Crawling delay - interval in milliseconds.'
|
239
|
-
option :
|
168
|
+
option :include,
|
240
169
|
type: :string,
|
241
|
-
default: Config::DEFAULT_CONFIG['settings']['
|
242
|
-
desc: 'Optional
|
243
|
-
option :
|
170
|
+
default: Config::DEFAULT_CONFIG['settings']['include'],
|
171
|
+
desc: 'Optional URL include regex for crawling.'
|
172
|
+
option :exclude,
|
244
173
|
type: :string,
|
245
|
-
default: Config::DEFAULT_CONFIG['settings']['
|
246
|
-
desc: 'Optional
|
247
|
-
# TODO: Remove this option. Always ignore SSL errors.
|
248
|
-
option :insecure,
|
249
|
-
type: :boolean,
|
250
|
-
default: false,
|
251
|
-
desc: 'Ignore many HTTPS/SSL errors'
|
174
|
+
default: Config::DEFAULT_CONFIG['settings']['exclude'],
|
175
|
+
desc: 'Optional URL exclude regex for crawling.'
|
252
176
|
option :curl_options,
|
253
177
|
type: :hash,
|
254
178
|
default: {},
|
@@ -261,29 +185,26 @@ class SiteDiff
|
|
261
185
|
SiteDiff.log 'sitediff init requires one or two URLs', :error
|
262
186
|
exit(2)
|
263
187
|
end
|
264
|
-
|
265
|
-
|
266
|
-
|
267
|
-
|
268
|
-
|
269
|
-
|
270
|
-
|
271
|
-
|
272
|
-
|
273
|
-
|
274
|
-
|
275
|
-
|
276
|
-
|
277
|
-
|
278
|
-
|
279
|
-
|
280
|
-
|
281
|
-
|
282
|
-
|
283
|
-
|
284
|
-
else
|
285
|
-
SiteDiff.log 'Run "sitediff crawl" to discover paths. You should then be able to run "sitediff diff".', :info
|
286
|
-
end
|
188
|
+
api_options =
|
189
|
+
clean_keys(
|
190
|
+
options,
|
191
|
+
:depth,
|
192
|
+
:concurrency,
|
193
|
+
:interval,
|
194
|
+
:include,
|
195
|
+
:exclude,
|
196
|
+
:preset,
|
197
|
+
:crawl
|
198
|
+
)
|
199
|
+
.merge(
|
200
|
+
{
|
201
|
+
after_url: urls.pop,
|
202
|
+
before_url: urls.pop, # may be nil
|
203
|
+
directory: get_dir(options['directory']),
|
204
|
+
curl_opts: get_curl_opts(options)
|
205
|
+
}
|
206
|
+
)
|
207
|
+
Api.init(api_options)
|
287
208
|
end
|
288
209
|
|
289
210
|
option :url,
|
@@ -294,26 +215,9 @@ class SiteDiff
|
|
294
215
|
##
|
295
216
|
# Caches the current version of the site.
|
296
217
|
def store(config_file = nil)
|
297
|
-
|
298
|
-
|
299
|
-
|
300
|
-
config.validate(need_before: false)
|
301
|
-
config.paths_file_read
|
302
|
-
|
303
|
-
cache = SiteDiff::Cache.new(directory: @dir, create: true)
|
304
|
-
cache.write_tags << :before
|
305
|
-
|
306
|
-
base = options[:url] || config.after['url']
|
307
|
-
fetcher = SiteDiff::Fetch.new(cache,
|
308
|
-
config.paths,
|
309
|
-
config.setting(:interval),
|
310
|
-
config.setting(:concurrency),
|
311
|
-
get_curl_opts(config.settings),
|
312
|
-
options[:debug],
|
313
|
-
before: base)
|
314
|
-
fetcher.run do |path, _res|
|
315
|
-
SiteDiff.log "Visited #{path}, cached"
|
316
|
-
end
|
218
|
+
api = Api.new(options['directory'], config_file)
|
219
|
+
api_options = clean_keys(options, :url, :debug)
|
220
|
+
api.store(api_options)
|
317
221
|
end
|
318
222
|
|
319
223
|
desc 'crawl [CONFIG-FILE]',
|
@@ -321,58 +225,15 @@ class SiteDiff
|
|
321
225
|
##
|
322
226
|
# Crawls the "before" site to determine "paths".
|
323
227
|
#
|
324
|
-
# TODO: Move actual crawling to sitediff.crawl(config).
|
325
|
-
# TODO: Switch to paths = sitediff.crawl().
|
326
228
|
def crawl(config_file = nil)
|
327
|
-
|
328
|
-
|
329
|
-
@config = SiteDiff::Config.new(config_file, @dir)
|
330
|
-
|
331
|
-
# Prepare cache.
|
332
|
-
@cache = SiteDiff::Cache.new(
|
333
|
-
create: options['cached'] != 'none',
|
334
|
-
directory: @dir
|
335
|
-
)
|
336
|
-
@cache.write_tags << :before << :after
|
337
|
-
|
338
|
-
# Crawl with Hydra to discover paths.
|
339
|
-
hydra = Typhoeus::Hydra.new(
|
340
|
-
max_concurrency: @config.setting(:concurrency)
|
341
|
-
)
|
342
|
-
@paths = {}
|
343
|
-
@config.roots.each do |tag, url|
|
344
|
-
Crawler.new(
|
345
|
-
hydra,
|
346
|
-
url,
|
347
|
-
@config.setting(:interval),
|
348
|
-
@config.setting(:whitelist),
|
349
|
-
@config.setting(:blacklist),
|
350
|
-
@config.setting(:depth),
|
351
|
-
get_curl_opts(@config.settings),
|
352
|
-
@debug
|
353
|
-
) do |info|
|
354
|
-
SiteDiff.log "Visited #{info.uri}, cached."
|
355
|
-
after_crawl(tag, info)
|
356
|
-
end
|
357
|
-
end
|
358
|
-
hydra.run
|
359
|
-
|
360
|
-
# Write paths to a file.
|
361
|
-
@paths = @paths.values.reduce(&:|).to_a.sort
|
362
|
-
@config.paths_file_write(@paths)
|
363
|
-
|
364
|
-
# Log output.
|
365
|
-
file = Pathname.new(@dir) + Config::DEFAULT_PATHS_FILENAME
|
366
|
-
SiteDiff.log ''
|
367
|
-
SiteDiff.log "#{@paths.length} page(s) found."
|
368
|
-
SiteDiff.log "Created #{file.expand_path}.", :success, 'done'
|
229
|
+
api = Api.new(options['directory'], config_file)
|
230
|
+
api.crawl
|
369
231
|
end
|
370
232
|
|
371
233
|
no_commands do
|
372
234
|
# Generates CURL options.
|
373
235
|
#
|
374
|
-
# TODO:
|
375
|
-
# TODO: Make all requests insecure and avoid custom curl-opts.
|
236
|
+
# TODO: Possibly move to API class.
|
376
237
|
def get_curl_opts(options)
|
377
238
|
# We do want string keys here
|
378
239
|
bool_hash = { 'true' => true, 'false' => false }
|
@@ -381,10 +242,6 @@ class SiteDiff
|
|
381
242
|
.merge(options['curl_options'] || {})
|
382
243
|
.merge(options['curl_opts'] || {})
|
383
244
|
curl_opts.each { |k, v| curl_opts[k] = bool_hash.fetch(v, v) }
|
384
|
-
if options[:insecure]
|
385
|
-
curl_opts[:ssl_verifypeer] = false
|
386
|
-
curl_opts[:ssl_verifyhost] = 0
|
387
|
-
end
|
388
245
|
curl_opts
|
389
246
|
end
|
390
247
|
|
@@ -398,23 +255,10 @@ class SiteDiff
|
|
398
255
|
end
|
399
256
|
|
400
257
|
##
|
401
|
-
#
|
402
|
-
def after_crawl(tag, info)
|
403
|
-
|
404
|
-
|
405
|
-
# Register the path.
|
406
|
-
@paths[tag] = [] unless @paths[tag]
|
407
|
-
@paths[tag] << path
|
408
|
-
|
409
|
-
result = info.read_result
|
410
|
-
|
411
|
-
# Write result to applicable cache.
|
412
|
-
@cache.set(tag, path, result)
|
413
|
-
# If single-site, cache "after" as "before".
|
414
|
-
@cache.set(:before, path, result) unless @config.roots[:before]
|
415
|
-
|
416
|
-
# TODO: Restore application of rules.
|
417
|
-
# @rules.handle_page(tag, res.content, info.document) if @rules && !res.error
|
258
|
+
# Clean keys - return a subset of a hash with keys as symbols.
|
259
|
+
def clean_keys(hash, *keys)
|
260
|
+
new_hash = hash.transform_keys { |k| k.tr('-', '_').to_sym }
|
261
|
+
new_hash.slice(*keys)
|
418
262
|
end
|
419
263
|
end
|
420
264
|
end
|