sitediff 0.0.5 → 0.0.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/sitediff.rb +27 -11
- data/lib/sitediff/cache.rb +11 -1
- data/lib/sitediff/cli.rb +78 -16
- data/lib/sitediff/config/creator.rb +13 -8
- data/lib/sitediff/crawler.rb +33 -4
- data/lib/sitediff/diff.rb +23 -0
- data/lib/sitediff/fetch.rb +10 -2
- data/lib/sitediff/result.rb +6 -2
- data/lib/sitediff/sanitize.rb +8 -0
- data/lib/sitediff/sanitize/regexp.rb +2 -0
- data/lib/sitediff/uriwrapper.rb +21 -6
- metadata +3 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 1377b6bafe658b4a8a8f50ef0f54e577e99f1a87
|
4
|
+
data.tar.gz: 9a80e20a89b7f2f60506bbcccdc2b9f7037320f8
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 7fe3ce1b2e7bc1762d5e8f4a1bfd4ab9280963732d033d4b403087f71a4d6caa394669eadb9d82064ad963dd62918f95cae7e0b0495dd92979f105be1bfe6f5e
|
7
|
+
data.tar.gz: fb98c439544172ae40c0ba347272ec1287a1dc9042ab238e4abd8f720d52307ffa8a4b3fec7a70df68c7fdf845b324fe69c99693c5bbbc369f9e2c22fbe8c404
|
data/lib/sitediff.rb
CHANGED
@@ -54,10 +54,11 @@ class SiteDiff
|
|
54
54
|
@config.after['url']
|
55
55
|
end
|
56
56
|
|
57
|
-
def initialize(config, cache, concurrency, verbose = true)
|
57
|
+
def initialize(config, cache, concurrency, interval, verbose = true, debug = false)
|
58
58
|
@cache = cache
|
59
59
|
@verbose = verbose
|
60
|
-
|
60
|
+
@debug = debug
|
61
|
+
@interval = interval
|
61
62
|
# Check for single-site mode
|
62
63
|
validate_opts = {}
|
63
64
|
if !config.before['url'] && @cache.tag?(:before)
|
@@ -77,18 +78,33 @@ class SiteDiff
|
|
77
78
|
def sanitize(path, read_results)
|
78
79
|
%i[before after].map do |tag|
|
79
80
|
html = read_results[tag].content
|
80
|
-
|
81
|
-
|
81
|
+
encoding = read_results[tag].encoding
|
82
|
+
if encoding
|
83
|
+
config = @config.send(tag)
|
84
|
+
Sanitizer.new(html, config, path: path).sanitize
|
85
|
+
else
|
86
|
+
html
|
87
|
+
end
|
82
88
|
end
|
83
89
|
end
|
84
90
|
|
85
91
|
# Process a set of read results
|
86
92
|
def process_results(path, read_results)
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
|
93
|
+
if (error = (read_results[:before].error || read_results[:after].error))
|
94
|
+
diff = Result.new(path, nil, nil, nil, nil, error)
|
95
|
+
else
|
96
|
+
begin
|
97
|
+
diff = Result.new(path,
|
98
|
+
*sanitize(path, read_results),
|
99
|
+
read_results[:before].encoding,
|
100
|
+
read_results[:after].encoding,
|
101
|
+
nil)
|
102
|
+
rescue => e
|
103
|
+
raise if @debug
|
104
|
+
|
105
|
+
Result.new(path, nil, nil, nil, nil, "Sanitization error: #{e}")
|
106
|
+
end
|
107
|
+
end
|
92
108
|
@results[path] = diff
|
93
109
|
|
94
110
|
# Print results in order!
|
@@ -100,7 +116,7 @@ class SiteDiff
|
|
100
116
|
|
101
117
|
# Perform the comparison, populate @results and return the number of failing
|
102
118
|
# paths (paths with non-zero diff).
|
103
|
-
def run(curl_opts = {})
|
119
|
+
def run(curl_opts = {}, debug = true)
|
104
120
|
# Map of path -> Result object, populated by process_results
|
105
121
|
@results = {}
|
106
122
|
@ordered = @config.paths.dup
|
@@ -115,7 +131,7 @@ class SiteDiff
|
|
115
131
|
# so passing this instead but @config.after['curl_opts'] is ignored.
|
116
132
|
config_curl_opts = @config.before['curl_opts']
|
117
133
|
curl_opts = config_curl_opts.clone.merge(curl_opts) if config_curl_opts
|
118
|
-
fetcher = Fetch.new(@cache, @config.paths, @concurrency, curl_opts,
|
134
|
+
fetcher = Fetch.new(@cache, @config.paths, @interval, @concurrency, curl_opts, debug,
|
119
135
|
before: before, after: after)
|
120
136
|
fetcher.run(&method(:process_results))
|
121
137
|
|
data/lib/sitediff/cache.rb
CHANGED
@@ -8,10 +8,13 @@ class SiteDiff
|
|
8
8
|
attr_accessor :read_tags, :write_tags
|
9
9
|
|
10
10
|
def initialize(opts = {})
|
11
|
-
@dir = opts[:dir] || '.'
|
12
11
|
@create = opts[:create]
|
12
|
+
|
13
|
+
# Read and Write tags are sets that can contain :before and :after
|
14
|
+
# They indicate whether we should use the cache for reading or writing
|
13
15
|
@read_tags = Set.new
|
14
16
|
@write_tags = Set.new
|
17
|
+
@dir = opts[:directory] || '.'
|
15
18
|
end
|
16
19
|
|
17
20
|
# Is a tag cached?
|
@@ -63,5 +66,12 @@ class SiteDiff
|
|
63
66
|
# Ensure encoding stays the same!
|
64
67
|
Marshal.dump([tag, path.encode('UTF-8')])
|
65
68
|
end
|
69
|
+
|
70
|
+
def get_dir(directory)
|
71
|
+
# Create the dir. Must go before cache initialization!
|
72
|
+
@dir = Pathname.new(directory || '.')
|
73
|
+
@dir.mkpath unless @dir.directory?
|
74
|
+
@dir.to_s
|
75
|
+
end
|
66
76
|
end
|
67
77
|
end
|
data/lib/sitediff/cli.rb
CHANGED
@@ -23,6 +23,14 @@ class SiteDiff
|
|
23
23
|
type: :boolean,
|
24
24
|
default: false,
|
25
25
|
desc: 'Ignore many HTTPS/SSL errors'
|
26
|
+
class_option :debug,
|
27
|
+
type: :boolean,
|
28
|
+
default: false,
|
29
|
+
desc: 'Debug mode. Stop on certain errors and produce a traceback.'
|
30
|
+
class_option :interval,
|
31
|
+
type: :numeric,
|
32
|
+
default: 0,
|
33
|
+
desc: 'Crawling delay - interval in milliseconds'
|
26
34
|
|
27
35
|
# Thor, by default, exits with 0 no matter what!
|
28
36
|
def self.exit_on_failure?
|
@@ -75,7 +83,10 @@ class SiteDiff
|
|
75
83
|
desc: 'Max number of concurrent connections made'
|
76
84
|
desc 'diff [OPTIONS] [CONFIGFILES]', 'Perform systematic diff on given URLs'
|
77
85
|
def diff(*config_files)
|
78
|
-
|
86
|
+
@interval = options['interval']
|
87
|
+
check_interval(@interval)
|
88
|
+
@dir = get_dir(options['directory'])
|
89
|
+
config = SiteDiff::Config.new(config_files, @dir)
|
79
90
|
|
80
91
|
# override config based on options
|
81
92
|
paths = options['paths']
|
@@ -100,20 +111,21 @@ class SiteDiff
|
|
100
111
|
|
101
112
|
# Setup cache
|
102
113
|
cache = SiteDiff::Cache.new(create: options['cached'] != 'none',
|
103
|
-
|
114
|
+
directory: @dir)
|
104
115
|
cache.read_tags << :before if %w[before all].include?(options['cached'])
|
105
116
|
cache.read_tags << :after if %w[after all].include?(options['cached'])
|
106
117
|
cache.write_tags << :before << :after
|
107
118
|
|
108
|
-
sitediff = SiteDiff.new(config, cache, options[:concurrency],
|
109
|
-
options['verbose'])
|
110
|
-
num_failing = sitediff.run(get_curl_opts(options))
|
119
|
+
sitediff = SiteDiff.new(config, cache, options[:concurrency], @interval,
|
120
|
+
options['verbose'], options[:debug])
|
121
|
+
num_failing = sitediff.run(get_curl_opts(options), options[:debug])
|
111
122
|
exit_code = num_failing > 0 ? 2 : 0
|
112
123
|
|
113
|
-
sitediff.dump(
|
124
|
+
sitediff.dump(@dir, options['before-report'],
|
114
125
|
options['after-report'])
|
115
126
|
rescue Config::InvalidConfig => e
|
116
127
|
SiteDiff.log "Invalid configuration: #{e.message}", :error
|
128
|
+
SiteDiff.log "at #{e.backtrace}", :error
|
117
129
|
else # no exception was raised
|
118
130
|
# Thor::Error --> exit(1), guaranteed by exit_on_failure?
|
119
131
|
# Failing diff --> exit(2), populated above
|
@@ -132,8 +144,8 @@ class SiteDiff
|
|
132
144
|
def serve(*config_files)
|
133
145
|
config = SiteDiff::Config.new(config_files, options['directory'])
|
134
146
|
# Could check non-empty config here but currently errors are already raised.
|
135
|
-
|
136
|
-
cache = Cache.new(
|
147
|
+
@dir = get_dir(options['directory'])
|
148
|
+
cache = Cache.new(directory: @dir)
|
137
149
|
cache.read_tags << :before << :after
|
138
150
|
|
139
151
|
SiteDiff::Webserver::ResultServer.new(
|
@@ -145,6 +157,7 @@ class SiteDiff
|
|
145
157
|
).wait
|
146
158
|
rescue SiteDiffException => e
|
147
159
|
SiteDiff.log e.message, :error
|
160
|
+
SiteDiff.log e.backtrace, :error
|
148
161
|
end
|
149
162
|
|
150
163
|
option :depth,
|
@@ -160,19 +173,37 @@ class SiteDiff
|
|
160
173
|
type: :numeric,
|
161
174
|
default: 3,
|
162
175
|
desc: 'Max number of concurrent connections made'
|
176
|
+
option :whitelist,
|
177
|
+
type: :string,
|
178
|
+
default: '',
|
179
|
+
desc: 'Optional whitelist for crawling'
|
180
|
+
option :blacklist,
|
181
|
+
type: :string,
|
182
|
+
default: '',
|
183
|
+
desc: 'Optional blacklist for crawling'
|
163
184
|
desc 'init URL [URL]', 'Create a sitediff configuration'
|
164
185
|
def init(*urls)
|
165
186
|
unless (1..2).cover? urls.size
|
166
187
|
SiteDiff.log 'sitediff init requires one or two URLs', :error
|
167
|
-
exit
|
188
|
+
exit(2)
|
168
189
|
end
|
169
190
|
|
191
|
+
@interval = options['interval']
|
192
|
+
check_interval(@interval)
|
193
|
+
@dir = get_dir(options['directory'])
|
170
194
|
curl_opts = get_curl_opts(options)
|
171
|
-
|
172
|
-
|
195
|
+
@whitelist = create_regexp(options['whitelist'])
|
196
|
+
@blacklist = create_regexp(options['blacklist'])
|
197
|
+
creator = SiteDiff::Config::Creator.new(options[:concurrency],
|
198
|
+
options['interval'],
|
199
|
+
@whitelist,
|
200
|
+
@blacklist,
|
201
|
+
curl_opts,
|
202
|
+
options[:debug],
|
203
|
+
*urls)
|
173
204
|
creator.create(
|
174
205
|
depth: options[:depth],
|
175
|
-
directory:
|
206
|
+
directory: @dir,
|
176
207
|
rules: options[:rules] != 'no',
|
177
208
|
rules_disabled: (options[:rules] == 'disabled')
|
178
209
|
) do |_tag, info|
|
@@ -193,14 +224,19 @@ class SiteDiff
|
|
193
224
|
desc 'store [CONFIGFILES]',
|
194
225
|
'Cache the current contents of a site for later comparison'
|
195
226
|
def store(*config_files)
|
196
|
-
|
227
|
+
@dir = get_dir(options['directory'])
|
228
|
+
config = SiteDiff::Config.new(config_files, @dir)
|
197
229
|
config.validate(need_before: false)
|
198
|
-
|
199
|
-
cache = SiteDiff::Cache.new(create: true)
|
230
|
+
cache = SiteDiff::Cache.new(directory: @dir, create: true)
|
200
231
|
cache.write_tags << :before
|
201
232
|
|
202
233
|
base = options[:url] || config.after['url']
|
203
|
-
fetcher = SiteDiff::Fetch.new(cache,
|
234
|
+
fetcher = SiteDiff::Fetch.new(cache,
|
235
|
+
config.paths,
|
236
|
+
options[:interval],
|
237
|
+
options[:concurrency],
|
238
|
+
get_curl_opts(options),
|
239
|
+
options[:debug],
|
204
240
|
before: base)
|
205
241
|
fetcher.run do |path, _res|
|
206
242
|
SiteDiff.log "Visited #{path}, cached"
|
@@ -219,6 +255,32 @@ class SiteDiff
|
|
219
255
|
end
|
220
256
|
curl_opts
|
221
257
|
end
|
258
|
+
|
259
|
+
def check_interval(interval)
|
260
|
+
if interval != 0 && options[:concurrency] != 1
|
261
|
+
SiteDiff.log '--concurrency must be set to 1 in order to enable the interval feature'
|
262
|
+
exit(2)
|
263
|
+
end
|
264
|
+
end
|
265
|
+
|
266
|
+
def get_dir(directory)
|
267
|
+
# Create the dir. Must go before cache initialization!
|
268
|
+
@dir = Pathname.new(directory || '.')
|
269
|
+
@dir.mkpath unless @dir.directory?
|
270
|
+
@dir.to_s
|
271
|
+
end
|
272
|
+
|
273
|
+
def create_regexp(string_param)
|
274
|
+
begin
|
275
|
+
@return_value = string_param == '' ? nil : Regexp.new(string_param)
|
276
|
+
rescue SiteDiffException => e
|
277
|
+
@return_value = nil
|
278
|
+
SiteDiff.log 'whitelist and blacklist parameters must be valid regular expressions', :error
|
279
|
+
SiteDiff.log e.message, :error
|
280
|
+
SiteDiff.log e.backtrace, :error
|
281
|
+
end
|
282
|
+
return @return_value
|
283
|
+
end
|
222
284
|
end
|
223
285
|
end
|
224
286
|
end
|
@@ -11,11 +11,15 @@ require 'yaml'
|
|
11
11
|
class SiteDiff
|
12
12
|
class Config
|
13
13
|
class Creator
|
14
|
-
def initialize(concurrency, curl_opts, *urls)
|
14
|
+
def initialize(concurrency, interval, whitelist, blacklist, curl_opts, debug, *urls)
|
15
15
|
@concurrency = concurrency
|
16
|
+
@interval = interval
|
17
|
+
@whitelist = whitelist
|
18
|
+
@blacklist = blacklist
|
16
19
|
@after = urls.pop
|
17
20
|
@before = urls.pop # May be nil
|
18
21
|
@curl_opts = curl_opts
|
22
|
+
@debug = debug
|
19
23
|
end
|
20
24
|
|
21
25
|
def roots
|
@@ -30,18 +34,15 @@ class SiteDiff
|
|
30
34
|
def create(opts, &block)
|
31
35
|
@config = {}
|
32
36
|
@callback = block
|
33
|
-
|
34
|
-
# Handle options
|
35
37
|
@dir = Pathname.new(opts[:directory])
|
38
|
+
|
39
|
+
# Handle other options
|
36
40
|
@depth = opts[:depth]
|
37
41
|
@rules = Rules.new(@config, opts[:rules_disabled]) if opts[:rules]
|
38
42
|
|
39
|
-
# Create the dir. Must go before cache initialization!
|
40
|
-
@dir.mkpath unless @dir.directory?
|
41
|
-
|
42
43
|
# Setup instance vars
|
43
44
|
@paths = Hash.new { |h, k| h[k] = Set.new }
|
44
|
-
@cache = Cache.new(
|
45
|
+
@cache = Cache.new(directory: @dir.to_s, create: true)
|
45
46
|
@cache.write_tags << :before << :after
|
46
47
|
|
47
48
|
build_config
|
@@ -64,7 +65,7 @@ class SiteDiff
|
|
64
65
|
def crawl(depth = nil)
|
65
66
|
hydra = Typhoeus::Hydra.new(max_concurrency: @concurrency)
|
66
67
|
roots.each do |tag, u|
|
67
|
-
Crawler.new(hydra, u, depth, @curl_opts) do |info|
|
68
|
+
Crawler.new(hydra, u, @interval, @whitelist, @blacklist, depth, @curl_opts, @debug) do |info|
|
68
69
|
crawled_path(tag, info)
|
69
70
|
end
|
70
71
|
end
|
@@ -113,6 +114,10 @@ class SiteDiff
|
|
113
114
|
end
|
114
115
|
end
|
115
116
|
|
117
|
+
def directory
|
118
|
+
@dir
|
119
|
+
end
|
120
|
+
|
116
121
|
def config_file
|
117
122
|
@dir + Config::DEFAULT_FILENAME
|
118
123
|
end
|
data/lib/sitediff/crawler.rb
CHANGED
@@ -14,14 +14,24 @@ class SiteDiff
|
|
14
14
|
DEFAULT_DEPTH = 3
|
15
15
|
|
16
16
|
# Create a crawler with a base URL
|
17
|
-
def initialize(hydra, base,
|
18
|
-
|
17
|
+
def initialize(hydra, base,
|
18
|
+
interval,
|
19
|
+
whitelist,
|
20
|
+
blacklist,
|
21
|
+
depth = DEFAULT_DEPTH,
|
22
|
+
curl_opts = UriWrapper::DEFAULT_CURL_OPTS,
|
23
|
+
debug = true,
|
24
|
+
&block)
|
19
25
|
@hydra = hydra
|
20
26
|
@base_uri = Addressable::URI.parse(base)
|
21
27
|
@base = base
|
28
|
+
@interval = interval
|
29
|
+
@whitelist = whitelist
|
30
|
+
@blacklist = blacklist
|
22
31
|
@found = Set.new
|
23
32
|
@callback = block
|
24
33
|
@curl_opts = curl_opts
|
34
|
+
@debug = debug
|
25
35
|
|
26
36
|
add_uri('', depth)
|
27
37
|
end
|
@@ -32,7 +42,7 @@ class SiteDiff
|
|
32
42
|
|
33
43
|
@found << rel
|
34
44
|
|
35
|
-
wrapper = UriWrapper.new(@base + rel, @curl_opts)
|
45
|
+
wrapper = UriWrapper.new(@base + rel, @curl_opts, @debug)
|
36
46
|
wrapper.queue(@hydra) do |res|
|
37
47
|
fetched_uri(rel, depth, res)
|
38
48
|
end
|
@@ -58,6 +68,11 @@ class SiteDiff
|
|
58
68
|
read_result: res,
|
59
69
|
document: doc
|
60
70
|
)
|
71
|
+
# Insert delay to limit fetching rate
|
72
|
+
if @interval != 0
|
73
|
+
SiteDiff.log("Waiting #{@interval} milliseconds.", :info)
|
74
|
+
sleep(@interval / 1000.0)
|
75
|
+
end
|
61
76
|
@callback[info]
|
62
77
|
|
63
78
|
return unless depth >= 1
|
@@ -99,7 +114,21 @@ class SiteDiff
|
|
99
114
|
# Filter out links we don't want. Links passed in are absolute URIs.
|
100
115
|
def filter_links(uris)
|
101
116
|
uris.find_all do |u|
|
102
|
-
u.host == @base_uri.host && u.path.start_with?(@base_uri.path)
|
117
|
+
is_sub_uri = (u.host == @base_uri.host) && u.path.start_with?(@base_uri.path)
|
118
|
+
if is_sub_uri
|
119
|
+
is_whitelisted = @whitelist.nil? ? false : @whitelist.match(u.path)
|
120
|
+
is_blacklisted = @blacklist.nil? ? false : @blacklist.match(u.path)
|
121
|
+
if is_blacklisted && !is_whitelisted
|
122
|
+
SiteDiff.log "Ignoring blacklisted URL #{u.path}", :info
|
123
|
+
end
|
124
|
+
is_whitelisted || !is_blacklisted
|
125
|
+
end
|
126
|
+
# SiteDiff.log "Filtering URL #{u.path}", :info
|
127
|
+
# SiteDiff.log Regexp.new(@blacklist).match(u.path).inspect, :info
|
128
|
+
# (u.host == @base_uri.host) &&
|
129
|
+
# (u.path.start_with?(@base_uri.path)) &&
|
130
|
+
# (@whitelist == '' || Regexp.new(@whitelist).match(u.path)) &&
|
131
|
+
# (@blacklist == '' || !(Regexp.new(@blacklist).match(u.path)))
|
103
132
|
end
|
104
133
|
end
|
105
134
|
end
|
data/lib/sitediff/diff.rb
CHANGED
@@ -4,6 +4,7 @@ require 'sitediff'
|
|
4
4
|
require 'diffy'
|
5
5
|
require 'erb'
|
6
6
|
require 'rainbow'
|
7
|
+
require 'digest'
|
7
8
|
|
8
9
|
class SiteDiff
|
9
10
|
module Diff
|
@@ -15,6 +16,28 @@ class SiteDiff
|
|
15
16
|
diff.to_s(:html) : nil
|
16
17
|
end
|
17
18
|
|
19
|
+
def encoding_blurb(encoding)
|
20
|
+
if encoding
|
21
|
+
"Text content returned - charset #{encoding}"
|
22
|
+
else
|
23
|
+
'Binary content returned'
|
24
|
+
end
|
25
|
+
end
|
26
|
+
|
27
|
+
def binary_diffy(before, after, before_encoding, after_encoding)
|
28
|
+
if before_encoding || after_encoding
|
29
|
+
Diffy::Diff.new(encoding_blurb(before_encoding),
|
30
|
+
encoding_blurb(after_encoding)).to_s(:html)
|
31
|
+
elsif before == after
|
32
|
+
nil
|
33
|
+
else
|
34
|
+
md5_before = Digest::MD5.hexdigest(before)
|
35
|
+
md5_after = Digest::MD5.hexdigest(after)
|
36
|
+
Diffy::Diff.new("Binary content returned md5: #{md5_before}",
|
37
|
+
"Binary content returned md5: #{md5_after}").to_s(:html)
|
38
|
+
end
|
39
|
+
end
|
40
|
+
|
18
41
|
def terminal_diffy(before_html, after_html)
|
19
42
|
args = []
|
20
43
|
args << :color if Rainbow.enabled
|
data/lib/sitediff/fetch.rb
CHANGED
@@ -8,12 +8,15 @@ class SiteDiff
|
|
8
8
|
# Cache is a cache object, see sitediff/cache
|
9
9
|
# Paths is a list of sub-paths
|
10
10
|
# Tags is a hash of tag names => base URLs.
|
11
|
-
def initialize(cache, paths, concurrency = 3, curl_opts = nil,
|
11
|
+
def initialize(cache, paths, interval, concurrency = 3, curl_opts = nil,
|
12
|
+
debug = true, **tags)
|
12
13
|
@cache = cache
|
14
|
+
@interval = interval
|
13
15
|
@paths = paths
|
14
16
|
@tags = tags
|
15
17
|
@curl_opts = curl_opts || UriWrapper::DEFAULT_CURL_OPTS
|
16
18
|
@concurrency = concurrency
|
19
|
+
@debug = debug
|
17
20
|
end
|
18
21
|
|
19
22
|
# Fetch all the paths, once per tag.
|
@@ -41,8 +44,13 @@ class SiteDiff
|
|
41
44
|
results[tag] = UriWrapper::ReadResult.error('Not cached')
|
42
45
|
process_results(path, results)
|
43
46
|
else
|
44
|
-
uri = UriWrapper.new(base + path, @curl_opts)
|
47
|
+
uri = UriWrapper.new(base + path, @curl_opts, @debug)
|
45
48
|
uri.queue(@hydra) do |resl|
|
49
|
+
# Insert delay to limit fetching rate
|
50
|
+
if @interval != 0
|
51
|
+
SiteDiff.log("Waiting #{@interval} milliseconds.", :info)
|
52
|
+
sleep(@interval / 1000.0)
|
53
|
+
end
|
46
54
|
@cache.set(tag, path, resl)
|
47
55
|
results[tag] = resl
|
48
56
|
process_results(path, results)
|
data/lib/sitediff/result.rb
CHANGED
@@ -6,7 +6,7 @@ require 'digest/sha1'
|
|
6
6
|
require 'fileutils'
|
7
7
|
|
8
8
|
class SiteDiff
|
9
|
-
class Result < Struct.new(:path, :before, :after, :error, :verbose)
|
9
|
+
class Result < Struct.new(:path, :before, :after, :before_encoding, :after_encoding, :error, :verbose)
|
10
10
|
STATUS_SUCCESS = 0 # Identical before and after
|
11
11
|
STATUS_FAILURE = 1 # Different before and after
|
12
12
|
STATUS_ERROR = 2 # Couldn't fetch page
|
@@ -19,7 +19,11 @@ class SiteDiff
|
|
19
19
|
if error
|
20
20
|
@status = STATUS_ERROR
|
21
21
|
else
|
22
|
-
|
22
|
+
if !before_encoding || !after_encoding
|
23
|
+
@diff = Diff.binary_diffy(before, after, before_encoding, after_encoding)
|
24
|
+
else
|
25
|
+
@diff = Diff.html_diffy(before, after)
|
26
|
+
end
|
23
27
|
@status = @diff ? STATUS_FAILURE : STATUS_SUCCESS
|
24
28
|
end
|
25
29
|
end
|
data/lib/sitediff/sanitize.rb
CHANGED
@@ -96,6 +96,10 @@ class SiteDiff
|
|
96
96
|
selector.each { |r| r.apply(@node) }
|
97
97
|
@html = Sanitizer.prettify(@node)
|
98
98
|
@node = nil
|
99
|
+
# Prevent potential UTF-8 encoding errors by removing bytes
|
100
|
+
# Not the only solution. An alternative is to return the
|
101
|
+
# string unmodified.
|
102
|
+
@html = @html.encode('UTF-8', 'binary', invalid: :replace, undef: :replace, replace: '')
|
99
103
|
global.each { |r| r.apply(@html) }
|
100
104
|
end
|
101
105
|
|
@@ -144,6 +148,10 @@ class SiteDiff
|
|
144
148
|
|
145
149
|
# There's a lot of cruft left over,that we don't want
|
146
150
|
|
151
|
+
# Prevent potential UTF-8 encoding errors by removing invalid bytes.
|
152
|
+
# Not the only solution.
|
153
|
+
# An alternative is to return the string unmodified.
|
154
|
+
str = str.encode('UTF-8', 'binary', invalid: :replace, undef: :replace, replace: '')
|
147
155
|
# Remove xml declaration and <html> tags
|
148
156
|
str.sub!(/\A<\?xml.*$\n/, '')
|
149
157
|
str.sub!(/\A^<html>$\n/, '')
|
@@ -47,6 +47,8 @@ class SiteDiff
|
|
47
47
|
def gsub!(str)
|
48
48
|
re = ::Regexp.new(@rule['pattern'])
|
49
49
|
sub = @rule['substitute'] || ''
|
50
|
+
# Expecting a mutation here. Do not reassign the variable str
|
51
|
+
# for the purpose of removing UTF-8 encoding errors.
|
50
52
|
str.gsub!(re, sub)
|
51
53
|
str
|
52
54
|
end
|
data/lib/sitediff/uriwrapper.rb
CHANGED
@@ -18,10 +18,11 @@ class SiteDiff
|
|
18
18
|
|
19
19
|
# This lets us treat errors or content as one object
|
20
20
|
class ReadResult
|
21
|
-
attr_accessor :content, :error_code, :error
|
21
|
+
attr_accessor :encoding, :content, :error_code, :error
|
22
22
|
|
23
|
-
def initialize(content = nil)
|
23
|
+
def initialize(content = nil, encoding = 'utf-8')
|
24
24
|
@content = content
|
25
|
+
@encoding = encoding
|
25
26
|
@error = nil
|
26
27
|
@error_code = nil
|
27
28
|
end
|
@@ -34,11 +35,12 @@ class SiteDiff
|
|
34
35
|
end
|
35
36
|
end
|
36
37
|
|
37
|
-
def initialize(uri, curl_opts = DEFAULT_CURL_OPTS)
|
38
|
+
def initialize(uri, curl_opts = DEFAULT_CURL_OPTS, debug = true)
|
38
39
|
@uri = uri.respond_to?(:scheme) ? uri : Addressable::URI.parse(uri)
|
39
40
|
# remove trailing '/'s from local URIs
|
40
41
|
@uri.path.gsub!(%r{/*$}, '') if local?
|
41
42
|
@curl_opts = curl_opts
|
43
|
+
@debug = debug
|
42
44
|
end
|
43
45
|
|
44
46
|
def user
|
@@ -78,7 +80,7 @@ class SiteDiff
|
|
78
80
|
|
79
81
|
# Returns the encoding of an HTTP response from headers , nil if not
|
80
82
|
# specified.
|
81
|
-
def
|
83
|
+
def charset_encoding(http_headers)
|
82
84
|
if (content_type = http_headers['Content-Type'])
|
83
85
|
if (md = /;\s*charset=([-\w]*)/.match(content_type))
|
84
86
|
md[1]
|
@@ -101,10 +103,23 @@ class SiteDiff
|
|
101
103
|
body = resp.body
|
102
104
|
# Typhoeus does not respect HTTP headers when setting the encoding
|
103
105
|
# resp.body; coerce if possible.
|
104
|
-
if (encoding =
|
106
|
+
if (encoding = charset_encoding(resp.headers))
|
105
107
|
body.force_encoding(encoding)
|
106
108
|
end
|
107
|
-
|
109
|
+
# Should be wrapped with rescue I guess? Maybe this entire function?
|
110
|
+
# Should at least be an option in the Cli to disable this.
|
111
|
+
# "stop on first error"
|
112
|
+
begin
|
113
|
+
yield ReadResult.new(body, encoding)
|
114
|
+
rescue ArgumentError => e
|
115
|
+
raise if @debug
|
116
|
+
|
117
|
+
yield ReadResult.error("Parsing error for #{@uri}: #{e.message}")
|
118
|
+
rescue => e
|
119
|
+
raise if @debug
|
120
|
+
|
121
|
+
yield ReadResult.error("Unknown parsing error for #{@uri}: #{e.message}")
|
122
|
+
end
|
108
123
|
end
|
109
124
|
|
110
125
|
req.on_failure do |resp|
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: sitediff
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.
|
4
|
+
version: 0.0.6
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Alex Dergachev
|
@@ -10,7 +10,7 @@ authors:
|
|
10
10
|
autorequire:
|
11
11
|
bindir: bin
|
12
12
|
cert_chain: []
|
13
|
-
date:
|
13
|
+
date: 2019-04-02 00:00:00.000000000 Z
|
14
14
|
dependencies:
|
15
15
|
- !ruby/object:Gem::Dependency
|
16
16
|
name: pkg-config
|
@@ -167,7 +167,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
167
167
|
version: '0'
|
168
168
|
requirements: []
|
169
169
|
rubyforge_project:
|
170
|
-
rubygems_version: 2.5.2
|
170
|
+
rubygems_version: 2.5.2.3
|
171
171
|
signing_key:
|
172
172
|
specification_version: 4
|
173
173
|
summary: Compare two versions of a site with ease!
|