sitediff 0.0.5 → 0.0.6
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/sitediff.rb +27 -11
- data/lib/sitediff/cache.rb +11 -1
- data/lib/sitediff/cli.rb +78 -16
- data/lib/sitediff/config/creator.rb +13 -8
- data/lib/sitediff/crawler.rb +33 -4
- data/lib/sitediff/diff.rb +23 -0
- data/lib/sitediff/fetch.rb +10 -2
- data/lib/sitediff/result.rb +6 -2
- data/lib/sitediff/sanitize.rb +8 -0
- data/lib/sitediff/sanitize/regexp.rb +2 -0
- data/lib/sitediff/uriwrapper.rb +21 -6
- metadata +3 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 1377b6bafe658b4a8a8f50ef0f54e577e99f1a87
|
4
|
+
data.tar.gz: 9a80e20a89b7f2f60506bbcccdc2b9f7037320f8
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 7fe3ce1b2e7bc1762d5e8f4a1bfd4ab9280963732d033d4b403087f71a4d6caa394669eadb9d82064ad963dd62918f95cae7e0b0495dd92979f105be1bfe6f5e
|
7
|
+
data.tar.gz: fb98c439544172ae40c0ba347272ec1287a1dc9042ab238e4abd8f720d52307ffa8a4b3fec7a70df68c7fdf845b324fe69c99693c5bbbc369f9e2c22fbe8c404
|
data/lib/sitediff.rb
CHANGED
@@ -54,10 +54,11 @@ class SiteDiff
|
|
54
54
|
@config.after['url']
|
55
55
|
end
|
56
56
|
|
57
|
-
def initialize(config, cache, concurrency, verbose = true)
|
57
|
+
def initialize(config, cache, concurrency, interval, verbose = true, debug = false)
|
58
58
|
@cache = cache
|
59
59
|
@verbose = verbose
|
60
|
-
|
60
|
+
@debug = debug
|
61
|
+
@interval = interval
|
61
62
|
# Check for single-site mode
|
62
63
|
validate_opts = {}
|
63
64
|
if !config.before['url'] && @cache.tag?(:before)
|
@@ -77,18 +78,33 @@ class SiteDiff
|
|
77
78
|
def sanitize(path, read_results)
|
78
79
|
%i[before after].map do |tag|
|
79
80
|
html = read_results[tag].content
|
80
|
-
|
81
|
-
|
81
|
+
encoding = read_results[tag].encoding
|
82
|
+
if encoding
|
83
|
+
config = @config.send(tag)
|
84
|
+
Sanitizer.new(html, config, path: path).sanitize
|
85
|
+
else
|
86
|
+
html
|
87
|
+
end
|
82
88
|
end
|
83
89
|
end
|
84
90
|
|
85
91
|
# Process a set of read results
|
86
92
|
def process_results(path, read_results)
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
|
93
|
+
if (error = (read_results[:before].error || read_results[:after].error))
|
94
|
+
diff = Result.new(path, nil, nil, nil, nil, error)
|
95
|
+
else
|
96
|
+
begin
|
97
|
+
diff = Result.new(path,
|
98
|
+
*sanitize(path, read_results),
|
99
|
+
read_results[:before].encoding,
|
100
|
+
read_results[:after].encoding,
|
101
|
+
nil)
|
102
|
+
rescue => e
|
103
|
+
raise if @debug
|
104
|
+
|
105
|
+
Result.new(path, nil, nil, nil, nil, "Sanitization error: #{e}")
|
106
|
+
end
|
107
|
+
end
|
92
108
|
@results[path] = diff
|
93
109
|
|
94
110
|
# Print results in order!
|
@@ -100,7 +116,7 @@ class SiteDiff
|
|
100
116
|
|
101
117
|
# Perform the comparison, populate @results and return the number of failing
|
102
118
|
# paths (paths with non-zero diff).
|
103
|
-
def run(curl_opts = {})
|
119
|
+
def run(curl_opts = {}, debug = true)
|
104
120
|
# Map of path -> Result object, populated by process_results
|
105
121
|
@results = {}
|
106
122
|
@ordered = @config.paths.dup
|
@@ -115,7 +131,7 @@ class SiteDiff
|
|
115
131
|
# so passing this instead but @config.after['curl_opts'] is ignored.
|
116
132
|
config_curl_opts = @config.before['curl_opts']
|
117
133
|
curl_opts = config_curl_opts.clone.merge(curl_opts) if config_curl_opts
|
118
|
-
fetcher = Fetch.new(@cache, @config.paths, @concurrency, curl_opts,
|
134
|
+
fetcher = Fetch.new(@cache, @config.paths, @interval, @concurrency, curl_opts, debug,
|
119
135
|
before: before, after: after)
|
120
136
|
fetcher.run(&method(:process_results))
|
121
137
|
|
data/lib/sitediff/cache.rb
CHANGED
@@ -8,10 +8,13 @@ class SiteDiff
|
|
8
8
|
attr_accessor :read_tags, :write_tags
|
9
9
|
|
10
10
|
def initialize(opts = {})
|
11
|
-
@dir = opts[:dir] || '.'
|
12
11
|
@create = opts[:create]
|
12
|
+
|
13
|
+
# Read and Write tags are sets that can contain :before and :after
|
14
|
+
# They indicate whether we should use the cache for reading or writing
|
13
15
|
@read_tags = Set.new
|
14
16
|
@write_tags = Set.new
|
17
|
+
@dir = opts[:directory] || '.'
|
15
18
|
end
|
16
19
|
|
17
20
|
# Is a tag cached?
|
@@ -63,5 +66,12 @@ class SiteDiff
|
|
63
66
|
# Ensure encoding stays the same!
|
64
67
|
Marshal.dump([tag, path.encode('UTF-8')])
|
65
68
|
end
|
69
|
+
|
70
|
+
def get_dir(directory)
|
71
|
+
# Create the dir. Must go before cache initialization!
|
72
|
+
@dir = Pathname.new(directory || '.')
|
73
|
+
@dir.mkpath unless @dir.directory?
|
74
|
+
@dir.to_s
|
75
|
+
end
|
66
76
|
end
|
67
77
|
end
|
data/lib/sitediff/cli.rb
CHANGED
@@ -23,6 +23,14 @@ class SiteDiff
|
|
23
23
|
type: :boolean,
|
24
24
|
default: false,
|
25
25
|
desc: 'Ignore many HTTPS/SSL errors'
|
26
|
+
class_option :debug,
|
27
|
+
type: :boolean,
|
28
|
+
default: false,
|
29
|
+
desc: 'Debug mode. Stop on certain errors and produce a traceback.'
|
30
|
+
class_option :interval,
|
31
|
+
type: :numeric,
|
32
|
+
default: 0,
|
33
|
+
desc: 'Crawling delay - interval in milliseconds'
|
26
34
|
|
27
35
|
# Thor, by default, exits with 0 no matter what!
|
28
36
|
def self.exit_on_failure?
|
@@ -75,7 +83,10 @@ class SiteDiff
|
|
75
83
|
desc: 'Max number of concurrent connections made'
|
76
84
|
desc 'diff [OPTIONS] [CONFIGFILES]', 'Perform systematic diff on given URLs'
|
77
85
|
def diff(*config_files)
|
78
|
-
|
86
|
+
@interval = options['interval']
|
87
|
+
check_interval(@interval)
|
88
|
+
@dir = get_dir(options['directory'])
|
89
|
+
config = SiteDiff::Config.new(config_files, @dir)
|
79
90
|
|
80
91
|
# override config based on options
|
81
92
|
paths = options['paths']
|
@@ -100,20 +111,21 @@ class SiteDiff
|
|
100
111
|
|
101
112
|
# Setup cache
|
102
113
|
cache = SiteDiff::Cache.new(create: options['cached'] != 'none',
|
103
|
-
|
114
|
+
directory: @dir)
|
104
115
|
cache.read_tags << :before if %w[before all].include?(options['cached'])
|
105
116
|
cache.read_tags << :after if %w[after all].include?(options['cached'])
|
106
117
|
cache.write_tags << :before << :after
|
107
118
|
|
108
|
-
sitediff = SiteDiff.new(config, cache, options[:concurrency],
|
109
|
-
options['verbose'])
|
110
|
-
num_failing = sitediff.run(get_curl_opts(options))
|
119
|
+
sitediff = SiteDiff.new(config, cache, options[:concurrency], @interval,
|
120
|
+
options['verbose'], options[:debug])
|
121
|
+
num_failing = sitediff.run(get_curl_opts(options), options[:debug])
|
111
122
|
exit_code = num_failing > 0 ? 2 : 0
|
112
123
|
|
113
|
-
sitediff.dump(
|
124
|
+
sitediff.dump(@dir, options['before-report'],
|
114
125
|
options['after-report'])
|
115
126
|
rescue Config::InvalidConfig => e
|
116
127
|
SiteDiff.log "Invalid configuration: #{e.message}", :error
|
128
|
+
SiteDiff.log "at #{e.backtrace}", :error
|
117
129
|
else # no exception was raised
|
118
130
|
# Thor::Error --> exit(1), guaranteed by exit_on_failure?
|
119
131
|
# Failing diff --> exit(2), populated above
|
@@ -132,8 +144,8 @@ class SiteDiff
|
|
132
144
|
def serve(*config_files)
|
133
145
|
config = SiteDiff::Config.new(config_files, options['directory'])
|
134
146
|
# Could check non-empty config here but currently errors are already raised.
|
135
|
-
|
136
|
-
cache = Cache.new(
|
147
|
+
@dir = get_dir(options['directory'])
|
148
|
+
cache = Cache.new(directory: @dir)
|
137
149
|
cache.read_tags << :before << :after
|
138
150
|
|
139
151
|
SiteDiff::Webserver::ResultServer.new(
|
@@ -145,6 +157,7 @@ class SiteDiff
|
|
145
157
|
).wait
|
146
158
|
rescue SiteDiffException => e
|
147
159
|
SiteDiff.log e.message, :error
|
160
|
+
SiteDiff.log e.backtrace, :error
|
148
161
|
end
|
149
162
|
|
150
163
|
option :depth,
|
@@ -160,19 +173,37 @@ class SiteDiff
|
|
160
173
|
type: :numeric,
|
161
174
|
default: 3,
|
162
175
|
desc: 'Max number of concurrent connections made'
|
176
|
+
option :whitelist,
|
177
|
+
type: :string,
|
178
|
+
default: '',
|
179
|
+
desc: 'Optional whitelist for crawling'
|
180
|
+
option :blacklist,
|
181
|
+
type: :string,
|
182
|
+
default: '',
|
183
|
+
desc: 'Optional blacklist for crawling'
|
163
184
|
desc 'init URL [URL]', 'Create a sitediff configuration'
|
164
185
|
def init(*urls)
|
165
186
|
unless (1..2).cover? urls.size
|
166
187
|
SiteDiff.log 'sitediff init requires one or two URLs', :error
|
167
|
-
exit
|
188
|
+
exit(2)
|
168
189
|
end
|
169
190
|
|
191
|
+
@interval = options['interval']
|
192
|
+
check_interval(@interval)
|
193
|
+
@dir = get_dir(options['directory'])
|
170
194
|
curl_opts = get_curl_opts(options)
|
171
|
-
|
172
|
-
|
195
|
+
@whitelist = create_regexp(options['whitelist'])
|
196
|
+
@blacklist = create_regexp(options['blacklist'])
|
197
|
+
creator = SiteDiff::Config::Creator.new(options[:concurrency],
|
198
|
+
options['interval'],
|
199
|
+
@whitelist,
|
200
|
+
@blacklist,
|
201
|
+
curl_opts,
|
202
|
+
options[:debug],
|
203
|
+
*urls)
|
173
204
|
creator.create(
|
174
205
|
depth: options[:depth],
|
175
|
-
directory:
|
206
|
+
directory: @dir,
|
176
207
|
rules: options[:rules] != 'no',
|
177
208
|
rules_disabled: (options[:rules] == 'disabled')
|
178
209
|
) do |_tag, info|
|
@@ -193,14 +224,19 @@ class SiteDiff
|
|
193
224
|
desc 'store [CONFIGFILES]',
|
194
225
|
'Cache the current contents of a site for later comparison'
|
195
226
|
def store(*config_files)
|
196
|
-
|
227
|
+
@dir = get_dir(options['directory'])
|
228
|
+
config = SiteDiff::Config.new(config_files, @dir)
|
197
229
|
config.validate(need_before: false)
|
198
|
-
|
199
|
-
cache = SiteDiff::Cache.new(create: true)
|
230
|
+
cache = SiteDiff::Cache.new(directory: @dir, create: true)
|
200
231
|
cache.write_tags << :before
|
201
232
|
|
202
233
|
base = options[:url] || config.after['url']
|
203
|
-
fetcher = SiteDiff::Fetch.new(cache,
|
234
|
+
fetcher = SiteDiff::Fetch.new(cache,
|
235
|
+
config.paths,
|
236
|
+
options[:interval],
|
237
|
+
options[:concurrency],
|
238
|
+
get_curl_opts(options),
|
239
|
+
options[:debug],
|
204
240
|
before: base)
|
205
241
|
fetcher.run do |path, _res|
|
206
242
|
SiteDiff.log "Visited #{path}, cached"
|
@@ -219,6 +255,32 @@ class SiteDiff
|
|
219
255
|
end
|
220
256
|
curl_opts
|
221
257
|
end
|
258
|
+
|
259
|
+
def check_interval(interval)
|
260
|
+
if interval != 0 && options[:concurrency] != 1
|
261
|
+
SiteDiff.log '--concurrency must be set to 1 in order to enable the interval feature'
|
262
|
+
exit(2)
|
263
|
+
end
|
264
|
+
end
|
265
|
+
|
266
|
+
def get_dir(directory)
|
267
|
+
# Create the dir. Must go before cache initialization!
|
268
|
+
@dir = Pathname.new(directory || '.')
|
269
|
+
@dir.mkpath unless @dir.directory?
|
270
|
+
@dir.to_s
|
271
|
+
end
|
272
|
+
|
273
|
+
def create_regexp(string_param)
|
274
|
+
begin
|
275
|
+
@return_value = string_param == '' ? nil : Regexp.new(string_param)
|
276
|
+
rescue SiteDiffException => e
|
277
|
+
@return_value = nil
|
278
|
+
SiteDiff.log 'whitelist and blacklist parameters must be valid regular expressions', :error
|
279
|
+
SiteDiff.log e.message, :error
|
280
|
+
SiteDiff.log e.backtrace, :error
|
281
|
+
end
|
282
|
+
return @return_value
|
283
|
+
end
|
222
284
|
end
|
223
285
|
end
|
224
286
|
end
|
data/lib/sitediff/config/creator.rb
CHANGED
@@ -11,11 +11,15 @@ require 'yaml'
|
|
11
11
|
class SiteDiff
|
12
12
|
class Config
|
13
13
|
class Creator
|
14
|
-
def initialize(concurrency, curl_opts, *urls)
|
14
|
+
def initialize(concurrency, interval, whitelist, blacklist, curl_opts, debug, *urls)
|
15
15
|
@concurrency = concurrency
|
16
|
+
@interval = interval
|
17
|
+
@whitelist = whitelist
|
18
|
+
@blacklist = blacklist
|
16
19
|
@after = urls.pop
|
17
20
|
@before = urls.pop # May be nil
|
18
21
|
@curl_opts = curl_opts
|
22
|
+
@debug = debug
|
19
23
|
end
|
20
24
|
|
21
25
|
def roots
|
@@ -30,18 +34,15 @@ class SiteDiff
|
|
30
34
|
def create(opts, &block)
|
31
35
|
@config = {}
|
32
36
|
@callback = block
|
33
|
-
|
34
|
-
# Handle options
|
35
37
|
@dir = Pathname.new(opts[:directory])
|
38
|
+
|
39
|
+
# Handle other options
|
36
40
|
@depth = opts[:depth]
|
37
41
|
@rules = Rules.new(@config, opts[:rules_disabled]) if opts[:rules]
|
38
42
|
|
39
|
-
# Create the dir. Must go before cache initialization!
|
40
|
-
@dir.mkpath unless @dir.directory?
|
41
|
-
|
42
43
|
# Setup instance vars
|
43
44
|
@paths = Hash.new { |h, k| h[k] = Set.new }
|
44
|
-
@cache = Cache.new(
|
45
|
+
@cache = Cache.new(directory: @dir.to_s, create: true)
|
45
46
|
@cache.write_tags << :before << :after
|
46
47
|
|
47
48
|
build_config
|
@@ -64,7 +65,7 @@ class SiteDiff
|
|
64
65
|
def crawl(depth = nil)
|
65
66
|
hydra = Typhoeus::Hydra.new(max_concurrency: @concurrency)
|
66
67
|
roots.each do |tag, u|
|
67
|
-
Crawler.new(hydra, u, depth, @curl_opts) do |info|
|
68
|
+
Crawler.new(hydra, u, @interval, @whitelist, @blacklist, depth, @curl_opts, @debug) do |info|
|
68
69
|
crawled_path(tag, info)
|
69
70
|
end
|
70
71
|
end
|
@@ -113,6 +114,10 @@ class SiteDiff
|
|
113
114
|
end
|
114
115
|
end
|
115
116
|
|
117
|
+
def directory
|
118
|
+
@dir
|
119
|
+
end
|
120
|
+
|
116
121
|
def config_file
|
117
122
|
@dir + Config::DEFAULT_FILENAME
|
118
123
|
end
|
data/lib/sitediff/crawler.rb
CHANGED
@@ -14,14 +14,24 @@ class SiteDiff
|
|
14
14
|
DEFAULT_DEPTH = 3
|
15
15
|
|
16
16
|
# Create a crawler with a base URL
|
17
|
-
def initialize(hydra, base,
|
18
|
-
|
17
|
+
def initialize(hydra, base,
|
18
|
+
interval,
|
19
|
+
whitelist,
|
20
|
+
blacklist,
|
21
|
+
depth = DEFAULT_DEPTH,
|
22
|
+
curl_opts = UriWrapper::DEFAULT_CURL_OPTS,
|
23
|
+
debug = true,
|
24
|
+
&block)
|
19
25
|
@hydra = hydra
|
20
26
|
@base_uri = Addressable::URI.parse(base)
|
21
27
|
@base = base
|
28
|
+
@interval = interval
|
29
|
+
@whitelist = whitelist
|
30
|
+
@blacklist = blacklist
|
22
31
|
@found = Set.new
|
23
32
|
@callback = block
|
24
33
|
@curl_opts = curl_opts
|
34
|
+
@debug = debug
|
25
35
|
|
26
36
|
add_uri('', depth)
|
27
37
|
end
|
@@ -32,7 +42,7 @@ class SiteDiff
|
|
32
42
|
|
33
43
|
@found << rel
|
34
44
|
|
35
|
-
wrapper = UriWrapper.new(@base + rel, @curl_opts)
|
45
|
+
wrapper = UriWrapper.new(@base + rel, @curl_opts, @debug)
|
36
46
|
wrapper.queue(@hydra) do |res|
|
37
47
|
fetched_uri(rel, depth, res)
|
38
48
|
end
|
@@ -58,6 +68,11 @@ class SiteDiff
|
|
58
68
|
read_result: res,
|
59
69
|
document: doc
|
60
70
|
)
|
71
|
+
# Insert delay to limit fetching rate
|
72
|
+
if @interval != 0
|
73
|
+
SiteDiff.log("Waiting #{@interval} milliseconds.", :info)
|
74
|
+
sleep(@interval / 1000.0)
|
75
|
+
end
|
61
76
|
@callback[info]
|
62
77
|
|
63
78
|
return unless depth >= 1
|
@@ -99,7 +114,21 @@ class SiteDiff
|
|
99
114
|
# Filter out links we don't want. Links passed in are absolute URIs.
|
100
115
|
def filter_links(uris)
|
101
116
|
uris.find_all do |u|
|
102
|
-
u.host == @base_uri.host && u.path.start_with?(@base_uri.path)
|
117
|
+
is_sub_uri = (u.host == @base_uri.host) && u.path.start_with?(@base_uri.path)
|
118
|
+
if is_sub_uri
|
119
|
+
is_whitelisted = @whitelist.nil? ? false : @whitelist.match(u.path)
|
120
|
+
is_blacklisted = @blacklist.nil? ? false : @blacklist.match(u.path)
|
121
|
+
if is_blacklisted && !is_whitelisted
|
122
|
+
SiteDiff.log "Ignoring blacklisted URL #{u.path}", :info
|
123
|
+
end
|
124
|
+
is_whitelisted || !is_blacklisted
|
125
|
+
end
|
126
|
+
# SiteDiff.log "Filtering URL #{u.path}", :info
|
127
|
+
# SiteDiff.log Regexp.new(@blacklist).match(u.path).inspect, :info
|
128
|
+
# (u.host == @base_uri.host) &&
|
129
|
+
# (u.path.start_with?(@base_uri.path)) &&
|
130
|
+
# (@whitelist == '' || Regexp.new(@whitelist).match(u.path)) &&
|
131
|
+
# (@blacklist == '' || !(Regexp.new(@blacklist).match(u.path)))
|
103
132
|
end
|
104
133
|
end
|
105
134
|
end
|
data/lib/sitediff/diff.rb
CHANGED
@@ -4,6 +4,7 @@ require 'sitediff'
|
|
4
4
|
require 'diffy'
|
5
5
|
require 'erb'
|
6
6
|
require 'rainbow'
|
7
|
+
require 'digest'
|
7
8
|
|
8
9
|
class SiteDiff
|
9
10
|
module Diff
|
@@ -15,6 +16,28 @@ class SiteDiff
|
|
15
16
|
diff.to_s(:html) : nil
|
16
17
|
end
|
17
18
|
|
19
|
+
def encoding_blurb(encoding)
|
20
|
+
if encoding
|
21
|
+
"Text content returned - charset #{encoding}"
|
22
|
+
else
|
23
|
+
'Binary content returned'
|
24
|
+
end
|
25
|
+
end
|
26
|
+
|
27
|
+
def binary_diffy(before, after, before_encoding, after_encoding)
|
28
|
+
if before_encoding || after_encoding
|
29
|
+
Diffy::Diff.new(encoding_blurb(before_encoding),
|
30
|
+
encoding_blurb(after_encoding)).to_s(:html)
|
31
|
+
elsif before == after
|
32
|
+
nil
|
33
|
+
else
|
34
|
+
md5_before = Digest::MD5.hexdigest(before)
|
35
|
+
md5_after = Digest::MD5.hexdigest(after)
|
36
|
+
Diffy::Diff.new("Binary content returned md5: #{md5_before}",
|
37
|
+
"Binary content returned md5: #{md5_after}").to_s(:html)
|
38
|
+
end
|
39
|
+
end
|
40
|
+
|
18
41
|
def terminal_diffy(before_html, after_html)
|
19
42
|
args = []
|
20
43
|
args << :color if Rainbow.enabled
|
data/lib/sitediff/fetch.rb
CHANGED
@@ -8,12 +8,15 @@ class SiteDiff
|
|
8
8
|
# Cache is a cache object, see sitediff/cache
|
9
9
|
# Paths is a list of sub-paths
|
10
10
|
# Tags is a hash of tag names => base URLs.
|
11
|
-
def initialize(cache, paths, concurrency = 3, curl_opts = nil,
|
11
|
+
def initialize(cache, paths, interval, concurrency = 3, curl_opts = nil,
|
12
|
+
debug = true, **tags)
|
12
13
|
@cache = cache
|
14
|
+
@interval = interval
|
13
15
|
@paths = paths
|
14
16
|
@tags = tags
|
15
17
|
@curl_opts = curl_opts || UriWrapper::DEFAULT_CURL_OPTS
|
16
18
|
@concurrency = concurrency
|
19
|
+
@debug = debug
|
17
20
|
end
|
18
21
|
|
19
22
|
# Fetch all the paths, once per tag.
|
@@ -41,8 +44,13 @@ class SiteDiff
|
|
41
44
|
results[tag] = UriWrapper::ReadResult.error('Not cached')
|
42
45
|
process_results(path, results)
|
43
46
|
else
|
44
|
-
uri = UriWrapper.new(base + path, @curl_opts)
|
47
|
+
uri = UriWrapper.new(base + path, @curl_opts, @debug)
|
45
48
|
uri.queue(@hydra) do |resl|
|
49
|
+
# Insert delay to limit fetching rate
|
50
|
+
if @interval != 0
|
51
|
+
SiteDiff.log("Waiting #{@interval} milliseconds.", :info)
|
52
|
+
sleep(@interval / 1000.0)
|
53
|
+
end
|
46
54
|
@cache.set(tag, path, resl)
|
47
55
|
results[tag] = resl
|
48
56
|
process_results(path, results)
|
data/lib/sitediff/result.rb
CHANGED
@@ -6,7 +6,7 @@ require 'digest/sha1'
|
|
6
6
|
require 'fileutils'
|
7
7
|
|
8
8
|
class SiteDiff
|
9
|
-
class Result < Struct.new(:path, :before, :after, :error, :verbose)
|
9
|
+
class Result < Struct.new(:path, :before, :after, :before_encoding, :after_encoding, :error, :verbose)
|
10
10
|
STATUS_SUCCESS = 0 # Identical before and after
|
11
11
|
STATUS_FAILURE = 1 # Different before and after
|
12
12
|
STATUS_ERROR = 2 # Couldn't fetch page
|
@@ -19,7 +19,11 @@ class SiteDiff
|
|
19
19
|
if error
|
20
20
|
@status = STATUS_ERROR
|
21
21
|
else
|
22
|
-
|
22
|
+
if !before_encoding || !after_encoding
|
23
|
+
@diff = Diff.binary_diffy(before, after, before_encoding, after_encoding)
|
24
|
+
else
|
25
|
+
@diff = Diff.html_diffy(before, after)
|
26
|
+
end
|
23
27
|
@status = @diff ? STATUS_FAILURE : STATUS_SUCCESS
|
24
28
|
end
|
25
29
|
end
|
data/lib/sitediff/sanitize.rb
CHANGED
@@ -96,6 +96,10 @@ class SiteDiff
|
|
96
96
|
selector.each { |r| r.apply(@node) }
|
97
97
|
@html = Sanitizer.prettify(@node)
|
98
98
|
@node = nil
|
99
|
+
# Prevent potential UTF-8 encoding errors by removing bytes
|
100
|
+
# Not the only solution. An alternative is to return the
|
101
|
+
# string unmodified.
|
102
|
+
@html = @html.encode('UTF-8', 'binary', invalid: :replace, undef: :replace, replace: '')
|
99
103
|
global.each { |r| r.apply(@html) }
|
100
104
|
end
|
101
105
|
|
@@ -144,6 +148,10 @@ class SiteDiff
|
|
144
148
|
|
145
149
|
# There's a lot of cruft left over,that we don't want
|
146
150
|
|
151
|
+
# Prevent potential UTF-8 encoding errors by removing invalid bytes.
|
152
|
+
# Not the only solution.
|
153
|
+
# An alternative is to return the string unmodified.
|
154
|
+
str = str.encode('UTF-8', 'binary', invalid: :replace, undef: :replace, replace: '')
|
147
155
|
# Remove xml declaration and <html> tags
|
148
156
|
str.sub!(/\A<\?xml.*$\n/, '')
|
149
157
|
str.sub!(/\A^<html>$\n/, '')
|
data/lib/sitediff/sanitize/regexp.rb
CHANGED
@@ -47,6 +47,8 @@ class SiteDiff
|
|
47
47
|
def gsub!(str)
|
48
48
|
re = ::Regexp.new(@rule['pattern'])
|
49
49
|
sub = @rule['substitute'] || ''
|
50
|
+
# Expecting a mutation here. Do not reassign the variable str
|
51
|
+
# for the purpose of removing UTF-8 encoding errors.
|
50
52
|
str.gsub!(re, sub)
|
51
53
|
str
|
52
54
|
end
|
data/lib/sitediff/uriwrapper.rb
CHANGED
@@ -18,10 +18,11 @@ class SiteDiff
|
|
18
18
|
|
19
19
|
# This lets us treat errors or content as one object
|
20
20
|
class ReadResult
|
21
|
-
attr_accessor :content, :error_code, :error
|
21
|
+
attr_accessor :encoding, :content, :error_code, :error
|
22
22
|
|
23
|
-
def initialize(content = nil)
|
23
|
+
def initialize(content = nil, encoding = 'utf-8')
|
24
24
|
@content = content
|
25
|
+
@encoding = encoding
|
25
26
|
@error = nil
|
26
27
|
@error_code = nil
|
27
28
|
end
|
@@ -34,11 +35,12 @@ class SiteDiff
|
|
34
35
|
end
|
35
36
|
end
|
36
37
|
|
37
|
-
def initialize(uri, curl_opts = DEFAULT_CURL_OPTS)
|
38
|
+
def initialize(uri, curl_opts = DEFAULT_CURL_OPTS, debug = true)
|
38
39
|
@uri = uri.respond_to?(:scheme) ? uri : Addressable::URI.parse(uri)
|
39
40
|
# remove trailing '/'s from local URIs
|
40
41
|
@uri.path.gsub!(%r{/*$}, '') if local?
|
41
42
|
@curl_opts = curl_opts
|
43
|
+
@debug = debug
|
42
44
|
end
|
43
45
|
|
44
46
|
def user
|
@@ -78,7 +80,7 @@ class SiteDiff
|
|
78
80
|
|
79
81
|
# Returns the encoding of an HTTP response from headers , nil if not
|
80
82
|
# specified.
|
81
|
-
def
|
83
|
+
def charset_encoding(http_headers)
|
82
84
|
if (content_type = http_headers['Content-Type'])
|
83
85
|
if (md = /;\s*charset=([-\w]*)/.match(content_type))
|
84
86
|
md[1]
|
@@ -101,10 +103,23 @@ class SiteDiff
|
|
101
103
|
body = resp.body
|
102
104
|
# Typhoeus does not respect HTTP headers when setting the encoding
|
103
105
|
# resp.body; coerce if possible.
|
104
|
-
if (encoding =
|
106
|
+
if (encoding = charset_encoding(resp.headers))
|
105
107
|
body.force_encoding(encoding)
|
106
108
|
end
|
107
|
-
|
109
|
+
# Should be wrapped with rescue I guess? Maybe this entire function?
|
110
|
+
# Should at least be an option in the Cli to disable this.
|
111
|
+
# "stop on first error"
|
112
|
+
begin
|
113
|
+
yield ReadResult.new(body, encoding)
|
114
|
+
rescue ArgumentError => e
|
115
|
+
raise if @debug
|
116
|
+
|
117
|
+
yield ReadResult.error("Parsing error for #{@uri}: #{e.message}")
|
118
|
+
rescue => e
|
119
|
+
raise if @debug
|
120
|
+
|
121
|
+
yield ReadResult.error("Unknown parsing error for #{@uri}: #{e.message}")
|
122
|
+
end
|
108
123
|
end
|
109
124
|
|
110
125
|
req.on_failure do |resp|
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: sitediff
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.5
|
4
|
+
version: 0.0.6
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Alex Dergachev
|
@@ -10,7 +10,7 @@ authors:
|
|
10
10
|
autorequire:
|
11
11
|
bindir: bin
|
12
12
|
cert_chain: []
|
13
|
-
date:
|
13
|
+
date: 2019-04-02 00:00:00.000000000 Z
|
14
14
|
dependencies:
|
15
15
|
- !ruby/object:Gem::Dependency
|
16
16
|
name: pkg-config
|
@@ -167,7 +167,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
167
167
|
version: '0'
|
168
168
|
requirements: []
|
169
169
|
rubyforge_project:
|
170
|
-
rubygems_version: 2.5.2
|
170
|
+
rubygems_version: 2.5.2.3
|
171
171
|
signing_key:
|
172
172
|
specification_version: 4
|
173
173
|
summary: Compare two versions of a site with ease!
|