sitediff 1.0.0 → 1.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/sitediff.rb +3 -1
- data/lib/sitediff/api.rb +265 -0
- data/lib/sitediff/cache.rb +18 -0
- data/lib/sitediff/cli.rb +58 -214
- data/lib/sitediff/config.rb +78 -6
- data/lib/sitediff/config/creator.rb +8 -6
- data/lib/sitediff/crawler.rb +9 -9
- data/lib/sitediff/diff.rb +5 -1
- data/lib/sitediff/files/report.html.erb +35 -8
- data/lib/sitediff/files/sitediff.css +78 -1
- data/lib/sitediff/files/sitediff.js +204 -13
- data/lib/sitediff/report.rb +17 -1
- data/lib/sitediff/result.rb +2 -0
- data/lib/sitediff/sanitize.rb +49 -1
- data/lib/sitediff/uriwrapper.rb +4 -1
- metadata +7 -6
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: f75892f718764c8fd2c18d7f3f7e7cf8908d60ea07c2a765510c8ef409b9f0c1
|
4
|
+
data.tar.gz: 3b3744eca0dda04821152aab596fb67891204a1599b4db72e13b4af484693e65
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 97e9098b290742f1b3efe3c284e9392be95ffd0f7576df413a6ec612142b0573acf8b8b4d43369961c154d801db6284fcc1a8d69cea7da8ed99b64a0a1f1af75
|
7
|
+
data.tar.gz: c4b0e93bc4e0acb3d675c8d675d8f6235035aae72421794495f25223cb086eaa4c87d2cde63caa0eda257b0d91f374a0efbbb416ef8ee88c2f0ffde89a608831
|
data/lib/sitediff.rb
CHANGED
@@ -108,7 +108,9 @@ class SiteDiff
|
|
108
108
|
encoding = read_results[tag].encoding
|
109
109
|
if encoding || html.length.positive?
|
110
110
|
section = @config.send(tag, true)
|
111
|
-
|
111
|
+
opts = { path: path }
|
112
|
+
opts[:output] = @config.output if @config.output
|
113
|
+
Sanitizer.new(html, section, opts).sanitize
|
112
114
|
else
|
113
115
|
html
|
114
116
|
end
|
data/lib/sitediff/api.rb
ADDED
@@ -0,0 +1,265 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'sitediff'
|
4
|
+
require 'sitediff/cache'
|
5
|
+
require 'sitediff/config'
|
6
|
+
require 'sitediff/config/creator'
|
7
|
+
require 'sitediff/config/preset'
|
8
|
+
require 'sitediff/fetch'
|
9
|
+
require 'sitediff/webserver/resultserver'
|
10
|
+
|
11
|
+
class SiteDiff
|
12
|
+
##
|
13
|
+
# Sitediff API interface.
|
14
|
+
class Api
|
15
|
+
##
|
16
|
+
# Initializes new Api object.
|
17
|
+
def initialize(directory, config_file = nil)
|
18
|
+
@dir = get_dir(directory)
|
19
|
+
@config = SiteDiff::Config.new(config_file, @dir)
|
20
|
+
end
|
21
|
+
|
22
|
+
##
|
23
|
+
# Initialize a SiteDiff project.
|
24
|
+
#
|
25
|
+
# Calling:
|
26
|
+
# SiteDiff::Api.init(
|
27
|
+
# depth: 3,
|
28
|
+
# directory: 'sitediff',
|
29
|
+
# concurrency: 3,
|
30
|
+
# interval: 0,
|
31
|
+
# include: nil,
|
32
|
+
# exclude: '*.pdf',
|
33
|
+
# preset: 'drupal',
|
34
|
+
# curl_opts: {timeout: 60},
|
35
|
+
# crawl: false
|
36
|
+
# )
|
37
|
+
def self.init(options)
|
38
|
+
# Prepare a config object and write it to the file system.
|
39
|
+
creator = SiteDiff::Config::Creator.new(options[:debug], options[:before_url], options[:after_url])
|
40
|
+
include_regex = Config.create_regexp(options[:include])
|
41
|
+
exclude_regex = Config.create_regexp(options[:exclude])
|
42
|
+
creator.create(
|
43
|
+
depth: options[:depth],
|
44
|
+
directory: options[:directory],
|
45
|
+
concurrency: options[:concurrency],
|
46
|
+
interval: options[:interval],
|
47
|
+
include: include_regex,
|
48
|
+
exclude: exclude_regex,
|
49
|
+
preset: options[:preset],
|
50
|
+
curl_opts: options[:curl_opts]
|
51
|
+
)
|
52
|
+
SiteDiff.log "Created #{creator.config_file.expand_path}", :success
|
53
|
+
|
54
|
+
# TODO: implement crawl ^^^
|
55
|
+
# Discover paths, if enabled.
|
56
|
+
# if options[:crawl]
|
57
|
+
# crawl(creator.config_file)
|
58
|
+
# SiteDiff.log 'You can now run "sitediff diff".', :success
|
59
|
+
# else
|
60
|
+
# SiteDiff.log 'Run "sitediff crawl" to discover paths. You should then be able to run "sitediff diff".', :info
|
61
|
+
# end
|
62
|
+
end
|
63
|
+
|
64
|
+
##
|
65
|
+
# Diff the `before` and `after`.
|
66
|
+
#
|
67
|
+
# Calling:
|
68
|
+
# Api.diff(
|
69
|
+
# paths: options['paths'],
|
70
|
+
# paths_file: options['paths-file'],
|
71
|
+
# ignore_whitespace: options['ignore-whitespace'],
|
72
|
+
# export: options['export'],
|
73
|
+
# before: options['before'],
|
74
|
+
# after: options['after'],
|
75
|
+
# cached: options['cached'],
|
76
|
+
# verbose: options['verbose'],
|
77
|
+
# report_format: options['report-format'],
|
78
|
+
# before_report: options['before-report'],
|
79
|
+
# after_report: options['after-report'],
|
80
|
+
# cli_mode: false
|
81
|
+
# )
|
82
|
+
def diff(options)
|
83
|
+
@config.ignore_whitespace = options[:ignore_whitespace]
|
84
|
+
@config.export = options[:export]
|
85
|
+
# Apply "paths" override, if any.
|
86
|
+
if options[:paths]
|
87
|
+
@config.paths = options[:paths]
|
88
|
+
else
|
89
|
+
paths_file = options[:paths_file]
|
90
|
+
paths_file ||= File.join(@dir, Config::DEFAULT_PATHS_FILENAME)
|
91
|
+
paths_file = File.expand_path(paths_file)
|
92
|
+
|
93
|
+
paths_count = @config.paths_file_read(paths_file)
|
94
|
+
SiteDiff.log "Read #{paths_count} paths from: #{paths_file}"
|
95
|
+
end
|
96
|
+
|
97
|
+
# TODO: Why do we allow before and after override during diff?
|
98
|
+
@config.before['url'] = options[:before] if options[:before]
|
99
|
+
@config.after['url'] = options[:after] if options[:after]
|
100
|
+
|
101
|
+
# Prepare cache.
|
102
|
+
cache = SiteDiff::Cache.new(
|
103
|
+
create: options[:cached] != 'none',
|
104
|
+
directory: @dir
|
105
|
+
)
|
106
|
+
cache.read_tags << :before if %w[before all].include?(options[:cached])
|
107
|
+
cache.read_tags << :after if %w[after all].include?(options[:cached])
|
108
|
+
cache.write_tags << :before << :after
|
109
|
+
|
110
|
+
# Run sitediff.
|
111
|
+
sitediff = SiteDiff.new(
|
112
|
+
@config,
|
113
|
+
cache,
|
114
|
+
options[:verbose],
|
115
|
+
options[:debug]
|
116
|
+
)
|
117
|
+
num_failing = sitediff.run
|
118
|
+
exit_code = num_failing.positive? ? 2 : 0
|
119
|
+
|
120
|
+
# Generate HTML report.
|
121
|
+
if options[:report_format] == 'html' || @config.export
|
122
|
+
sitediff.report.generate_html(
|
123
|
+
@dir,
|
124
|
+
options[:before_report],
|
125
|
+
options[:after_report]
|
126
|
+
)
|
127
|
+
end
|
128
|
+
|
129
|
+
# Generate JSON report.
|
130
|
+
if options[:report_format] == 'json' && @config.export == false
|
131
|
+
sitediff.report.generate_json @dir
|
132
|
+
end
|
133
|
+
|
134
|
+
SiteDiff.log 'Run "sitediff serve" to see a report.' unless options[:export]
|
135
|
+
rescue Config::InvalidConfig => e
|
136
|
+
SiteDiff.log "Invalid configuration: #{e.message}", :error
|
137
|
+
SiteDiff.log e.backtrace, :error if options[:verbose]
|
138
|
+
rescue Config::ConfigNotFound => e
|
139
|
+
SiteDiff.log "Invalid configuration: #{e.message}", :error
|
140
|
+
SiteDiff.log e.backtrace, :error if options[:verbose]
|
141
|
+
else # no exception was raised
|
142
|
+
# Thor::Error --> exit(1), guaranteed by exit_on_failure?
|
143
|
+
# Failing diff --> exit(2), populated above
|
144
|
+
exit(exit_code) if options[:cli_mode]
|
145
|
+
end
|
146
|
+
|
147
|
+
##
|
148
|
+
# Crawl the `before` site to determine `paths`.
|
149
|
+
def crawl
|
150
|
+
# Prepare cache.
|
151
|
+
@cache = SiteDiff::Cache.new(
|
152
|
+
create: true,
|
153
|
+
directory: @dir
|
154
|
+
)
|
155
|
+
@cache.write_tags << :before << :after
|
156
|
+
|
157
|
+
# Crawl with Hydra to discover paths.
|
158
|
+
hydra = Typhoeus::Hydra.new(
|
159
|
+
max_concurrency: @config.setting(:concurrency)
|
160
|
+
)
|
161
|
+
@paths = {}
|
162
|
+
@config.roots.each do |tag, url|
|
163
|
+
Crawler.new(
|
164
|
+
hydra,
|
165
|
+
url,
|
166
|
+
@config.setting(:interval),
|
167
|
+
@config.setting(:include),
|
168
|
+
@config.setting(:exclude),
|
169
|
+
@config.setting(:depth),
|
170
|
+
@config.curl_opts,
|
171
|
+
@debug
|
172
|
+
) do |info|
|
173
|
+
SiteDiff.log "Visited #{info.uri}, cached."
|
174
|
+
after_crawl(tag, info)
|
175
|
+
end
|
176
|
+
end
|
177
|
+
hydra.run
|
178
|
+
|
179
|
+
# Write paths to a file.
|
180
|
+
@paths = @paths.values.reduce(&:|).to_a.sort
|
181
|
+
@config.paths_file_write(@paths)
|
182
|
+
|
183
|
+
# Log output.
|
184
|
+
file = Pathname.new(@dir) + Config::DEFAULT_PATHS_FILENAME
|
185
|
+
SiteDiff.log ''
|
186
|
+
SiteDiff.log "#{@paths.length} page(s) found."
|
187
|
+
SiteDiff.log "Created #{file.expand_path}.", :success, 'done'
|
188
|
+
end
|
189
|
+
|
190
|
+
##
|
191
|
+
# Serves SiteDiff report for accessing in the browser.
|
192
|
+
#
|
193
|
+
# Calling:
|
194
|
+
# api.serve(browse: true, port: 13080)
|
195
|
+
def serve(options)
|
196
|
+
@cache = Cache.new(directory: @dir)
|
197
|
+
@cache.read_tags << :before << :after
|
198
|
+
|
199
|
+
SiteDiff::Webserver::ResultServer.new(
|
200
|
+
options[:port],
|
201
|
+
@dir,
|
202
|
+
browse: options[:browse],
|
203
|
+
cache: @cache,
|
204
|
+
config: @config
|
205
|
+
).wait
|
206
|
+
rescue SiteDiffException => e
|
207
|
+
SiteDiff.log e.message, :error
|
208
|
+
SiteDiff.log e.backtrace, :error if options[:verbose]
|
209
|
+
end
|
210
|
+
|
211
|
+
##
|
212
|
+
#
|
213
|
+
def store(options)
|
214
|
+
# TODO: Figure out how to remove this config.validate call.
|
215
|
+
@config.validate(need_before: false)
|
216
|
+
@config.paths_file_read
|
217
|
+
|
218
|
+
@cache = SiteDiff::Cache.new(directory: @dir, create: true)
|
219
|
+
@cache.write_tags << :before
|
220
|
+
|
221
|
+
base = options[:url] || @config.after['url']
|
222
|
+
fetcher = SiteDiff::Fetch.new(@cache,
|
223
|
+
@config.paths,
|
224
|
+
@config.setting(:interval),
|
225
|
+
@config.setting(:concurrency),
|
226
|
+
get_curl_opts(@config.settings),
|
227
|
+
options[:debug],
|
228
|
+
before: base)
|
229
|
+
fetcher.run do |path, _res|
|
230
|
+
SiteDiff.log "Visited #{path}, cached"
|
231
|
+
end
|
232
|
+
end
|
233
|
+
|
234
|
+
private
|
235
|
+
|
236
|
+
##
|
237
|
+
# Ensures that the given directory exists.
|
238
|
+
def get_dir(directory)
|
239
|
+
# Create the dir. Must go before cache initialization!
|
240
|
+
@dir = Pathname.new(directory || '.')
|
241
|
+
@dir.mkpath unless @dir.directory?
|
242
|
+
@dir.to_s
|
243
|
+
end
|
244
|
+
|
245
|
+
##
|
246
|
+
# Processes a crawled path.
|
247
|
+
def after_crawl(tag, info)
|
248
|
+
path = UriWrapper.canonicalize(info.relative)
|
249
|
+
|
250
|
+
# Register the path.
|
251
|
+
@paths[tag] = [] unless @paths[tag]
|
252
|
+
@paths[tag] << path
|
253
|
+
|
254
|
+
result = info.read_result
|
255
|
+
|
256
|
+
# Write result to applicable cache.
|
257
|
+
@cache.set(tag, path, result)
|
258
|
+
# If single-site, cache "after" as "before".
|
259
|
+
@cache.set(:before, path, result) unless @config.roots[:before]
|
260
|
+
|
261
|
+
# TODO: Restore application of rules.
|
262
|
+
# @rules.handle_page(tag, res.content, info.document) if @rules && !res.error
|
263
|
+
end
|
264
|
+
end
|
265
|
+
end
|
data/lib/sitediff/cache.rb
CHANGED
@@ -6,6 +6,8 @@ require 'fileutils'
|
|
6
6
|
class SiteDiff
|
7
7
|
# SiteDiff Cache Handler.
|
8
8
|
class Cache
|
9
|
+
TIMESTAMP_FILE = 'timestamp'
|
10
|
+
|
9
11
|
attr_accessor :read_tags, :write_tags
|
10
12
|
|
11
13
|
##
|
@@ -17,6 +19,7 @@ class SiteDiff
|
|
17
19
|
# They indicate whether we should use the cache for reading or writing.
|
18
20
|
@read_tags = Set.new
|
19
21
|
@write_tags = Set.new
|
22
|
+
@timestamp_flag = { before: false, after: false }
|
20
23
|
|
21
24
|
# The directory used by the cache for storage.
|
22
25
|
@dir = opts[:directory] || '.'
|
@@ -52,6 +55,7 @@ class SiteDiff
|
|
52
55
|
def set(tag, path, result)
|
53
56
|
return unless @write_tags.include? tag
|
54
57
|
|
58
|
+
save_timestamp(tag)
|
55
59
|
filename = File.join(
|
56
60
|
@dir,
|
57
61
|
'snapshot',
|
@@ -102,5 +106,19 @@ class SiteDiff
|
|
102
106
|
@dir.mkpath unless @dir.directory?
|
103
107
|
@dir.to_s
|
104
108
|
end
|
109
|
+
|
110
|
+
private
|
111
|
+
|
112
|
+
def save_timestamp(tag)
|
113
|
+
# run once
|
114
|
+
return if @timestamp_flag[tag]
|
115
|
+
|
116
|
+
@timestamp_flag[tag] = true
|
117
|
+
cache_dir = File.join(@dir, 'snapshot', tag.to_s)
|
118
|
+
if File.exist? cache_dir
|
119
|
+
file = File.join(cache_dir, TIMESTAMP_FILE)
|
120
|
+
FileUtils.touch(file)
|
121
|
+
end
|
122
|
+
end
|
105
123
|
end
|
106
124
|
end
|
data/lib/sitediff/cli.rb
CHANGED
@@ -2,16 +2,10 @@
|
|
2
2
|
|
3
3
|
require 'thor'
|
4
4
|
require 'sitediff'
|
5
|
-
require 'sitediff/
|
6
|
-
require 'sitediff/config'
|
7
|
-
require 'sitediff/config/creator'
|
8
|
-
require 'sitediff/config/preset'
|
9
|
-
require 'sitediff/fetch'
|
10
|
-
require 'sitediff/webserver/resultserver'
|
5
|
+
require 'sitediff/api'
|
11
6
|
|
12
7
|
class SiteDiff
|
13
8
|
# SiteDiff CLI.
|
14
|
-
# TODO: Use config.defaults to feed default values for sitediff.yaml params?
|
15
9
|
class Cli < Thor
|
16
10
|
class_option 'directory',
|
17
11
|
type: :string,
|
@@ -78,7 +72,6 @@ class SiteDiff
|
|
78
72
|
enum: %w[html json],
|
79
73
|
default: 'html',
|
80
74
|
desc: 'The format in which a report should be generated.'
|
81
|
-
# TODO: Deprecate the parameters before-report / after-report?
|
82
75
|
option 'before-report',
|
83
76
|
type: :string,
|
84
77
|
desc: 'URL to use in reports. Useful if port forwarding.',
|
@@ -107,82 +100,31 @@ class SiteDiff
|
|
107
100
|
##
|
108
101
|
# Computes diffs.
|
109
102
|
def diff(config_file = nil)
|
110
|
-
@dir = get_dir(options['directory'])
|
111
|
-
config = SiteDiff::Config.new(config_file, @dir)
|
112
|
-
|
113
103
|
# Determine "paths" override based on options.
|
114
104
|
if options['paths'] && options['paths-file']
|
115
105
|
SiteDiff.log "Can't specify both --paths-file and --paths.", :error
|
116
106
|
exit(-1)
|
117
107
|
end
|
118
108
|
|
119
|
-
|
120
|
-
|
121
|
-
|
122
|
-
|
123
|
-
|
124
|
-
|
125
|
-
|
126
|
-
|
127
|
-
|
128
|
-
|
129
|
-
|
130
|
-
|
131
|
-
|
132
|
-
|
133
|
-
|
134
|
-
|
135
|
-
SiteDiff.log "Read #{paths_count} paths from: #{paths_file}"
|
136
|
-
end
|
137
|
-
|
138
|
-
# TODO: Why do we allow before and after override during diff?
|
139
|
-
config.before['url'] = options['before'] if options['before']
|
140
|
-
config.after['url'] = options['after'] if options['after']
|
141
|
-
|
142
|
-
# Prepare cache.
|
143
|
-
cache = SiteDiff::Cache.new(
|
144
|
-
create: options['cached'] != 'none',
|
145
|
-
directory: @dir
|
146
|
-
)
|
147
|
-
cache.read_tags << :before if %w[before all].include?(options['cached'])
|
148
|
-
cache.read_tags << :after if %w[after all].include?(options['cached'])
|
149
|
-
cache.write_tags << :before << :after
|
150
|
-
|
151
|
-
# Run sitediff.
|
152
|
-
sitediff = SiteDiff.new(
|
153
|
-
config,
|
154
|
-
cache,
|
155
|
-
options['verbose'],
|
156
|
-
options[:debug]
|
157
|
-
)
|
158
|
-
num_failing = sitediff.run
|
159
|
-
exit_code = num_failing.positive? ? 2 : 0
|
160
|
-
|
161
|
-
# Generate HTML report.
|
162
|
-
if options['report-format'] == 'html' || config.export
|
163
|
-
sitediff.report.generate_html(
|
164
|
-
@dir,
|
165
|
-
options['before-report'],
|
166
|
-
options['after-report']
|
109
|
+
api = Api.new(options['directory'], config_file)
|
110
|
+
api_options =
|
111
|
+
clean_keys(
|
112
|
+
options,
|
113
|
+
:paths,
|
114
|
+
:paths_file,
|
115
|
+
:ignore_whitespace,
|
116
|
+
:export,
|
117
|
+
:before,
|
118
|
+
:after,
|
119
|
+
:cached,
|
120
|
+
:verbose,
|
121
|
+
:debug,
|
122
|
+
:report_format,
|
123
|
+
:before_report,
|
124
|
+
:after_report
|
167
125
|
)
|
168
|
-
|
169
|
-
|
170
|
-
# Generate JSON report.
|
171
|
-
if options['report-format'] == 'json' && config.export == false
|
172
|
-
sitediff.report.generate_json @dir
|
173
|
-
end
|
174
|
-
|
175
|
-
SiteDiff.log 'Run "sitediff serve" to see a report.' unless options['export']
|
176
|
-
rescue Config::InvalidConfig => e
|
177
|
-
SiteDiff.log "Invalid configuration: #{e.message}", :error
|
178
|
-
SiteDiff.log e.backtrace, :error if options[:verbose]
|
179
|
-
rescue Config::ConfigNotFound => e
|
180
|
-
SiteDiff.log "Invalid configuration: #{e.message}", :error
|
181
|
-
SiteDiff.log e.backtrace, :error if options[:verbose]
|
182
|
-
else # no exception was raised
|
183
|
-
# Thor::Error --> exit(1), guaranteed by exit_on_failure?
|
184
|
-
# Failing diff --> exit(2), populated above
|
185
|
-
exit(exit_code)
|
126
|
+
api_options[:cli_mode] = true
|
127
|
+
api.diff(api_options)
|
186
128
|
end
|
187
129
|
|
188
130
|
option :port,
|
@@ -198,22 +140,9 @@ class SiteDiff
|
|
198
140
|
##
|
199
141
|
# Serves SiteDiff report for accessing in the browser.
|
200
142
|
def serve(config_file = nil)
|
201
|
-
|
202
|
-
|
203
|
-
|
204
|
-
cache = Cache.new(directory: @dir)
|
205
|
-
cache.read_tags << :before << :after
|
206
|
-
|
207
|
-
SiteDiff::Webserver::ResultServer.new(
|
208
|
-
options[:port],
|
209
|
-
options['directory'],
|
210
|
-
browse: options[:browse],
|
211
|
-
cache: cache,
|
212
|
-
config: config
|
213
|
-
).wait
|
214
|
-
rescue SiteDiffException => e
|
215
|
-
SiteDiff.log e.message, :error
|
216
|
-
SiteDiff.log e.backtrace, :error if options[:verbose]
|
143
|
+
api = Api.new(options['directory'], config_file)
|
144
|
+
api_options = clean_keys(options, :browse, :port)
|
145
|
+
api.serve(api_options)
|
217
146
|
end
|
218
147
|
|
219
148
|
option :depth,
|
@@ -236,19 +165,14 @@ class SiteDiff
|
|
236
165
|
type: :numeric,
|
237
166
|
default: Config::DEFAULT_CONFIG['settings']['interval'],
|
238
167
|
desc: 'Crawling delay - interval in milliseconds.'
|
239
|
-
option :
|
168
|
+
option :include,
|
240
169
|
type: :string,
|
241
|
-
default: Config::DEFAULT_CONFIG['settings']['
|
242
|
-
desc: 'Optional
|
243
|
-
option :
|
170
|
+
default: Config::DEFAULT_CONFIG['settings']['include'],
|
171
|
+
desc: 'Optional URL include regex for crawling.'
|
172
|
+
option :exclude,
|
244
173
|
type: :string,
|
245
|
-
default: Config::DEFAULT_CONFIG['settings']['
|
246
|
-
desc: 'Optional
|
247
|
-
# TODO: Remove this option. Always ignore SSL errors.
|
248
|
-
option :insecure,
|
249
|
-
type: :boolean,
|
250
|
-
default: false,
|
251
|
-
desc: 'Ignore many HTTPS/SSL errors'
|
174
|
+
default: Config::DEFAULT_CONFIG['settings']['exclude'],
|
175
|
+
desc: 'Optional URL exclude regex for crawling.'
|
252
176
|
option :curl_options,
|
253
177
|
type: :hash,
|
254
178
|
default: {},
|
@@ -261,29 +185,26 @@ class SiteDiff
|
|
261
185
|
SiteDiff.log 'sitediff init requires one or two URLs', :error
|
262
186
|
exit(2)
|
263
187
|
end
|
264
|
-
|
265
|
-
|
266
|
-
|
267
|
-
|
268
|
-
|
269
|
-
|
270
|
-
|
271
|
-
|
272
|
-
|
273
|
-
|
274
|
-
|
275
|
-
|
276
|
-
|
277
|
-
|
278
|
-
|
279
|
-
|
280
|
-
|
281
|
-
|
282
|
-
|
283
|
-
|
284
|
-
else
|
285
|
-
SiteDiff.log 'Run "sitediff crawl" to discover paths. You should then be able to run "sitediff diff".', :info
|
286
|
-
end
|
188
|
+
api_options =
|
189
|
+
clean_keys(
|
190
|
+
options,
|
191
|
+
:depth,
|
192
|
+
:concurrency,
|
193
|
+
:interval,
|
194
|
+
:include,
|
195
|
+
:exclude,
|
196
|
+
:preset,
|
197
|
+
:crawl
|
198
|
+
)
|
199
|
+
.merge(
|
200
|
+
{
|
201
|
+
after_url: urls.pop,
|
202
|
+
before_url: urls.pop, # may be nil
|
203
|
+
directory: get_dir(options['directory']),
|
204
|
+
curl_opts: get_curl_opts(options)
|
205
|
+
}
|
206
|
+
)
|
207
|
+
Api.init(api_options)
|
287
208
|
end
|
288
209
|
|
289
210
|
option :url,
|
@@ -294,26 +215,9 @@ class SiteDiff
|
|
294
215
|
##
|
295
216
|
# Caches the current version of the site.
|
296
217
|
def store(config_file = nil)
|
297
|
-
|
298
|
-
|
299
|
-
|
300
|
-
config.validate(need_before: false)
|
301
|
-
config.paths_file_read
|
302
|
-
|
303
|
-
cache = SiteDiff::Cache.new(directory: @dir, create: true)
|
304
|
-
cache.write_tags << :before
|
305
|
-
|
306
|
-
base = options[:url] || config.after['url']
|
307
|
-
fetcher = SiteDiff::Fetch.new(cache,
|
308
|
-
config.paths,
|
309
|
-
config.setting(:interval),
|
310
|
-
config.setting(:concurrency),
|
311
|
-
get_curl_opts(config.settings),
|
312
|
-
options[:debug],
|
313
|
-
before: base)
|
314
|
-
fetcher.run do |path, _res|
|
315
|
-
SiteDiff.log "Visited #{path}, cached"
|
316
|
-
end
|
218
|
+
api = Api.new(options['directory'], config_file)
|
219
|
+
api_options = clean_keys(options, :url, :debug)
|
220
|
+
api.store(api_options)
|
317
221
|
end
|
318
222
|
|
319
223
|
desc 'crawl [CONFIG-FILE]',
|
@@ -321,58 +225,15 @@ class SiteDiff
|
|
321
225
|
##
|
322
226
|
# Crawls the "before" site to determine "paths".
|
323
227
|
#
|
324
|
-
# TODO: Move actual crawling to sitediff.crawl(config).
|
325
|
-
# TODO: Switch to paths = sitediff.crawl().
|
326
228
|
def crawl(config_file = nil)
|
327
|
-
|
328
|
-
|
329
|
-
@config = SiteDiff::Config.new(config_file, @dir)
|
330
|
-
|
331
|
-
# Prepare cache.
|
332
|
-
@cache = SiteDiff::Cache.new(
|
333
|
-
create: options['cached'] != 'none',
|
334
|
-
directory: @dir
|
335
|
-
)
|
336
|
-
@cache.write_tags << :before << :after
|
337
|
-
|
338
|
-
# Crawl with Hydra to discover paths.
|
339
|
-
hydra = Typhoeus::Hydra.new(
|
340
|
-
max_concurrency: @config.setting(:concurrency)
|
341
|
-
)
|
342
|
-
@paths = {}
|
343
|
-
@config.roots.each do |tag, url|
|
344
|
-
Crawler.new(
|
345
|
-
hydra,
|
346
|
-
url,
|
347
|
-
@config.setting(:interval),
|
348
|
-
@config.setting(:whitelist),
|
349
|
-
@config.setting(:blacklist),
|
350
|
-
@config.setting(:depth),
|
351
|
-
get_curl_opts(@config.settings),
|
352
|
-
@debug
|
353
|
-
) do |info|
|
354
|
-
SiteDiff.log "Visited #{info.uri}, cached."
|
355
|
-
after_crawl(tag, info)
|
356
|
-
end
|
357
|
-
end
|
358
|
-
hydra.run
|
359
|
-
|
360
|
-
# Write paths to a file.
|
361
|
-
@paths = @paths.values.reduce(&:|).to_a.sort
|
362
|
-
@config.paths_file_write(@paths)
|
363
|
-
|
364
|
-
# Log output.
|
365
|
-
file = Pathname.new(@dir) + Config::DEFAULT_PATHS_FILENAME
|
366
|
-
SiteDiff.log ''
|
367
|
-
SiteDiff.log "#{@paths.length} page(s) found."
|
368
|
-
SiteDiff.log "Created #{file.expand_path}.", :success, 'done'
|
229
|
+
api = Api.new(options['directory'], config_file)
|
230
|
+
api.crawl
|
369
231
|
end
|
370
232
|
|
371
233
|
no_commands do
|
372
234
|
# Generates CURL options.
|
373
235
|
#
|
374
|
-
# TODO:
|
375
|
-
# TODO: Make all requests insecure and avoid custom curl-opts.
|
236
|
+
# TODO: Possibly move to API class.
|
376
237
|
def get_curl_opts(options)
|
377
238
|
# We do want string keys here
|
378
239
|
bool_hash = { 'true' => true, 'false' => false }
|
@@ -381,10 +242,6 @@ class SiteDiff
|
|
381
242
|
.merge(options['curl_options'] || {})
|
382
243
|
.merge(options['curl_opts'] || {})
|
383
244
|
curl_opts.each { |k, v| curl_opts[k] = bool_hash.fetch(v, v) }
|
384
|
-
if options[:insecure]
|
385
|
-
curl_opts[:ssl_verifypeer] = false
|
386
|
-
curl_opts[:ssl_verifyhost] = 0
|
387
|
-
end
|
388
245
|
curl_opts
|
389
246
|
end
|
390
247
|
|
@@ -398,23 +255,10 @@ class SiteDiff
|
|
398
255
|
end
|
399
256
|
|
400
257
|
##
|
401
|
-
#
|
402
|
-
def
|
403
|
-
|
404
|
-
|
405
|
-
# Register the path.
|
406
|
-
@paths[tag] = [] unless @paths[tag]
|
407
|
-
@paths[tag] << path
|
408
|
-
|
409
|
-
result = info.read_result
|
410
|
-
|
411
|
-
# Write result to applicable cache.
|
412
|
-
@cache.set(tag, path, result)
|
413
|
-
# If single-site, cache "after" as "before".
|
414
|
-
@cache.set(:before, path, result) unless @config.roots[:before]
|
415
|
-
|
416
|
-
# TODO: Restore application of rules.
|
417
|
-
# @rules.handle_page(tag, res.content, info.document) if @rules && !res.error
|
258
|
+
# Clean keys - return a subset of a hash with keys as symbols.
|
259
|
+
def clean_keys(hash, *keys)
|
260
|
+
new_hash = hash.transform_keys { |k| k.tr('-', '_').to_sym }
|
261
|
+
new_hash.slice(*keys)
|
418
262
|
end
|
419
263
|
end
|
420
264
|
end
|