sitediff 0.0.5 → 0.0.6

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: 93ab2ffc296a3c9de8ea835e47f435e0193e7854
-  data.tar.gz: 5a80c5bb738912114aeb029c4ff283ff890f2ce2
+  metadata.gz: 1377b6bafe658b4a8a8f50ef0f54e577e99f1a87
+  data.tar.gz: 9a80e20a89b7f2f60506bbcccdc2b9f7037320f8
 SHA512:
-  metadata.gz: fd07a6ff9a14a8da965f4f5b09acb1c7f30be594d97f6383c3bec6545d1b5bb3fc26451a9943ed3fc3c81d261a0fa4ca6c5fcc5355d7263ae09700800af5af9d
-  data.tar.gz: bb940ede7b68b1e047dbda46d66fade88d06e0f54d9e257842a311f119efb4043939ea9b83021cdf2fd5c76ac3a23da92e7da91072cc5d4421a1717902093e17
+  metadata.gz: 7fe3ce1b2e7bc1762d5e8f4a1bfd4ab9280963732d033d4b403087f71a4d6caa394669eadb9d82064ad963dd62918f95cae7e0b0495dd92979f105be1bfe6f5e
+  data.tar.gz: fb98c439544172ae40c0ba347272ec1287a1dc9042ab238e4abd8f720d52307ffa8a4b3fec7a70df68c7fdf845b324fe69c99693c5bbbc369f9e2c22fbe8c404
data/lib/sitediff.rb CHANGED
@@ -54,10 +54,11 @@ class SiteDiff
     @config.after['url']
   end
 
-  def initialize(config, cache, concurrency, verbose = true)
+  def initialize(config, cache, concurrency, interval, verbose = true, debug = false)
     @cache = cache
     @verbose = verbose
-
+    @debug = debug
+    @interval = interval
     # Check for single-site mode
     validate_opts = {}
     if !config.before['url'] && @cache.tag?(:before)
@@ -77,18 +78,33 @@ class SiteDiff
   def sanitize(path, read_results)
     %i[before after].map do |tag|
       html = read_results[tag].content
-      config = @config.send(tag)
-      Sanitizer.new(html, config, path: path).sanitize
+      encoding = read_results[tag].encoding
+      if encoding
+        config = @config.send(tag)
+        Sanitizer.new(html, config, path: path).sanitize
+      else
+        html
+      end
     end
   end
 
   # Process a set of read results
   def process_results(path, read_results)
-    diff = if (error = (read_results[:before].error || read_results[:after].error))
-             Result.new(path, nil, nil, error)
-           else
-             Result.new(path, *sanitize(path, read_results), nil)
-           end
+    if (error = (read_results[:before].error || read_results[:after].error))
+      diff = Result.new(path, nil, nil, nil, nil, error)
+    else
+      begin
+        diff = Result.new(path,
+                          *sanitize(path, read_results),
+                          read_results[:before].encoding,
+                          read_results[:after].encoding,
+                          nil)
+      rescue => e
+        raise if @debug
+
+        Result.new(path, nil, nil, nil, nil, "Sanitization error: #{e}")
+      end
+    end
     @results[path] = diff
 
     # Print results in order!
@@ -100,7 +116,7 @@ class SiteDiff
 
   # Perform the comparison, populate @results and return the number of failing
   # paths (paths with non-zero diff).
-  def run(curl_opts = {})
+  def run(curl_opts = {}, debug = true)
     # Map of path -> Result object, populated by process_results
     @results = {}
     @ordered = @config.paths.dup
@@ -115,7 +131,7 @@ class SiteDiff
     # so passing this instead but @config.after['curl_opts'] is ignored.
     config_curl_opts = @config.before['curl_opts']
     curl_opts = config_curl_opts.clone.merge(curl_opts) if config_curl_opts
-    fetcher = Fetch.new(@cache, @config.paths, @concurrency, curl_opts,
+    fetcher = Fetch.new(@cache, @config.paths, @interval, @concurrency, curl_opts, debug,
                         before: before, after: after)
     fetcher.run(&method(:process_results))
 
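
A couple of behaviors in this file are easy to miss: sanitize now skips the Sanitizer entirely when a response has no encoding (binary content), and in the non-debug rescue branch of process_results the fallback Result is never assigned to diff, so a sanitization failure ends up storing nil in @results[path]. Callers must also thread the new interval and debug arguments through. A minimal driver sketch, assuming a sitediff.yaml in the working directory (file name and values are illustrative, not from the gem's docs):

    require 'sitediff'
    require 'sitediff/cache'
    require 'sitediff/config'

    config = SiteDiff::Config.new(['sitediff.yaml'], '.')
    cache  = SiteDiff::Cache.new(directory: '.', create: true)
    cache.write_tags << :before << :after

    # New signature: (config, cache, concurrency, interval, verbose, debug).
    # A non-zero interval (in milliseconds) requires concurrency = 1.
    sitediff = SiteDiff.new(config, cache, 1, 500, true, false)
    num_failing = sitediff.run({}, false) # (curl_opts, debug)
    exit(num_failing > 0 ? 2 : 0)
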
data/lib/sitediff/cache.rb CHANGED
@@ -8,10 +8,13 @@ class SiteDiff
     attr_accessor :read_tags, :write_tags
 
     def initialize(opts = {})
-      @dir = opts[:dir] || '.'
       @create = opts[:create]
+
+      # Read and Write tags are sets that can contain :before and :after
+      # They indicate whether we should use the cache for reading or writing
       @read_tags = Set.new
       @write_tags = Set.new
+      @dir = opts[:directory] || '.'
     end
 
     # Is a tag cached?
@@ -63,5 +66,12 @@ class SiteDiff
       # Ensure encoding stays the same!
      Marshal.dump([tag, path.encode('UTF-8')])
     end
+
+    def get_dir(directory)
+      # Create the dir. Must go before cache initialization!
+      @dir = Pathname.new(directory || '.')
+      @dir.mkpath unless @dir.directory?
+      @dir.to_s
+    end
   end
 end
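
The rename of the cache's option key from :dir to :directory is silently breaking: unknown keys in the opts hash are ignored, so an un-migrated caller gets the default '.' instead of an error. A quick sketch (paths hypothetical):

    require 'sitediff/cache'

    cache = SiteDiff::Cache.new(directory: 'out', create: true) # 0.0.6 key
    stale = SiteDiff::Cache.new(dir: 'out') # old key is ignored; @dir falls back to '.'
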
data/lib/sitediff/cli.rb CHANGED
@@ -23,6 +23,14 @@ class SiteDiff
                  type: :boolean,
                  default: false,
                  desc: 'Ignore many HTTPS/SSL errors'
+    class_option :debug,
+                 type: :boolean,
+                 default: false,
+                 desc: 'Debug mode. Stop on certain errors and produce a traceback.'
+    class_option :interval,
+                 type: :numeric,
+                 default: 0,
+                 desc: 'Crawling delay - interval in milliseconds'
 
     # Thor, by default, exits with 0 no matter what!
     def self.exit_on_failure?
@@ -75,7 +83,10 @@ class SiteDiff
            desc: 'Max number of concurrent connections made'
     desc 'diff [OPTIONS] [CONFIGFILES]', 'Perform systematic diff on given URLs'
     def diff(*config_files)
-      config = SiteDiff::Config.new(config_files, options[:directory])
+      @interval = options['interval']
+      check_interval(@interval)
+      @dir = get_dir(options['directory'])
+      config = SiteDiff::Config.new(config_files, @dir)
 
       # override config based on options
       paths = options['paths']
@@ -100,20 +111,21 @@ class SiteDiff
 
       # Setup cache
       cache = SiteDiff::Cache.new(create: options['cached'] != 'none',
-                                  dir: options['directory'])
+                                  directory: @dir)
       cache.read_tags << :before if %w[before all].include?(options['cached'])
       cache.read_tags << :after if %w[after all].include?(options['cached'])
       cache.write_tags << :before << :after
 
-      sitediff = SiteDiff.new(config, cache, options[:concurrency],
-                              options['verbose'])
-      num_failing = sitediff.run(get_curl_opts(options))
+      sitediff = SiteDiff.new(config, cache, options[:concurrency], @interval,
+                              options['verbose'], options[:debug])
+      num_failing = sitediff.run(get_curl_opts(options), options[:debug])
       exit_code = num_failing > 0 ? 2 : 0
 
-      sitediff.dump(options['directory'], options['before-report'],
+      sitediff.dump(@dir, options['before-report'],
                     options['after-report'])
     rescue Config::InvalidConfig => e
       SiteDiff.log "Invalid configuration: #{e.message}", :error
+      SiteDiff.log "at #{e.backtrace}", :error
     else # no exception was raised
       # Thor::Error --> exit(1), guaranteed by exit_on_failure?
       # Failing diff --> exit(2), populated above
@@ -132,8 +144,8 @@ class SiteDiff
     def serve(*config_files)
       config = SiteDiff::Config.new(config_files, options['directory'])
       # Could check non-empty config here but currently errors are already raised.
-
-      cache = Cache.new(dir: options['directory'])
+      @dir = get_dir(options['directory'])
+      cache = Cache.new(directory: @dir)
       cache.read_tags << :before << :after
 
       SiteDiff::Webserver::ResultServer.new(
@@ -145,6 +157,7 @@ class SiteDiff
       ).wait
     rescue SiteDiffException => e
       SiteDiff.log e.message, :error
+      SiteDiff.log e.backtrace, :error
     end
 
     option :depth,
@@ -160,19 +173,37 @@ class SiteDiff
            type: :numeric,
           default: 3,
            desc: 'Max number of concurrent connections made'
+    option :whitelist,
+           type: :string,
+           default: '',
+           desc: 'Optional whitelist for crawling'
+    option :blacklist,
+           type: :string,
+           default: '',
+           desc: 'Optional blacklist for crawling'
     desc 'init URL [URL]', 'Create a sitediff configuration'
     def init(*urls)
       unless (1..2).cover? urls.size
         SiteDiff.log 'sitediff init requires one or two URLs', :error
-        exit 2
+        exit(2)
       end
 
+      @interval = options['interval']
+      check_interval(@interval)
+      @dir = get_dir(options['directory'])
      curl_opts = get_curl_opts(options)
-
-      creator = SiteDiff::Config::Creator.new(options[:concurrency], curl_opts, *urls)
+      @whitelist = create_regexp(options['whitelist'])
+      @blacklist = create_regexp(options['blacklist'])
+      creator = SiteDiff::Config::Creator.new(options[:concurrency],
+                                              options['interval'],
+                                              @whitelist,
+                                              @blacklist,
+                                              curl_opts,
+                                              options[:debug],
+                                              *urls)
       creator.create(
         depth: options[:depth],
-        directory: options[:directory],
+        directory: @dir,
         rules: options[:rules] != 'no',
         rules_disabled: (options[:rules] == 'disabled')
       ) do |_tag, info|
@@ -193,14 +224,19 @@ class SiteDiff
     desc 'store [CONFIGFILES]',
          'Cache the current contents of a site for later comparison'
     def store(*config_files)
-      config = SiteDiff::Config.new(config_files, options['directory'])
+      @dir = get_dir(options['directory'])
+      config = SiteDiff::Config.new(config_files, @dir)
       config.validate(need_before: false)
-
-      cache = SiteDiff::Cache.new(create: true)
+      cache = SiteDiff::Cache.new(directory: @dir, create: true)
      cache.write_tags << :before
 
       base = options[:url] || config.after['url']
-      fetcher = SiteDiff::Fetch.new(cache, config.paths, options['concurrency'],
+      fetcher = SiteDiff::Fetch.new(cache,
+                                    config.paths,
+                                    options[:interval],
+                                    options[:concurrency],
+                                    get_curl_opts(options),
+                                    options[:debug],
                                     before: base)
       fetcher.run do |path, _res|
         SiteDiff.log "Visited #{path}, cached"
@@ -219,6 +255,32 @@ class SiteDiff
         end
         curl_opts
       end
+
+      def check_interval(interval)
+        if interval != 0 && options[:concurrency] != 1
+          SiteDiff.log '--concurrency must be set to 1 in order to enable the interval feature'
+          exit(2)
+        end
+      end
+
+      def get_dir(directory)
+        # Create the dir. Must go before cache initialization!
+        @dir = Pathname.new(directory || '.')
+        @dir.mkpath unless @dir.directory?
+        @dir.to_s
+      end
+
+      def create_regexp(string_param)
+        begin
+          @return_value = string_param == '' ? nil : Regexp.new(string_param)
+        rescue SiteDiffException => e
+          @return_value = nil
+          SiteDiff.log 'whitelist and blacklist parameters must be valid regular expressions', :error
+          SiteDiff.log e.message, :error
+          SiteDiff.log e.backtrace, :error
+        end
+        return @return_value
+      end
     end
   end
 end
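
Two details of the new option handling are worth spelling out: check_interval rejects a non-zero --interval unless --concurrency is 1, and create_regexp maps the empty-string defaults of --whitelist/--blacklist to nil (no filter). Note also that an invalid pattern raises RegexpError, which the rescue SiteDiffException clause above does not catch, so a malformed pattern still aborts. A condensed sketch of the same handling outside Thor (the options hash here stands in for Thor's):

    options = { 'interval' => 500, 'concurrency' => 1,
                'whitelist' => '^/en/', 'blacklist' => '\.pdf\z' }

    if options['interval'] != 0 && options['concurrency'] != 1
      abort '--concurrency must be set to 1 in order to enable the interval feature'
    end

    # Empty string means "no filter"; anything else is compiled to a Regexp.
    whitelist = options['whitelist'].empty? ? nil : Regexp.new(options['whitelist'])
    blacklist = options['blacklist'].empty? ? nil : Regexp.new(options['blacklist'])
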
data/lib/sitediff/config/creator.rb CHANGED
@@ -11,11 +11,15 @@ require 'yaml'
 class SiteDiff
   class Config
     class Creator
-      def initialize(concurrency, curl_opts, *urls)
+      def initialize(concurrency, interval, whitelist, blacklist, curl_opts, debug, *urls)
         @concurrency = concurrency
+        @interval = interval
+        @whitelist = whitelist
+        @blacklist = blacklist
         @after = urls.pop
         @before = urls.pop # May be nil
         @curl_opts = curl_opts
+        @debug = debug
       end
 
       def roots
@@ -30,18 +34,15 @@ class SiteDiff
       def create(opts, &block)
         @config = {}
         @callback = block
-
-        # Handle options
         @dir = Pathname.new(opts[:directory])
+
+        # Handle other options
         @depth = opts[:depth]
         @rules = Rules.new(@config, opts[:rules_disabled]) if opts[:rules]
 
-        # Create the dir. Must go before cache initialization!
-        @dir.mkpath unless @dir.directory?
-
         # Setup instance vars
         @paths = Hash.new { |h, k| h[k] = Set.new }
-        @cache = Cache.new(dir: @dir.to_s, create: true)
+        @cache = Cache.new(directory: @dir.to_s, create: true)
         @cache.write_tags << :before << :after
 
         build_config
@@ -64,7 +65,7 @@ class SiteDiff
       def crawl(depth = nil)
         hydra = Typhoeus::Hydra.new(max_concurrency: @concurrency)
         roots.each do |tag, u|
-          Crawler.new(hydra, u, depth, @curl_opts) do |info|
+          Crawler.new(hydra, u, @interval, @whitelist, @blacklist, depth, @curl_opts, @debug) do |info|
             crawled_path(tag, info)
           end
         end
@@ -113,6 +114,10 @@ class SiteDiff
         end
       end
 
+      def directory
+        @dir
+      end
+
       def config_file
         @dir + Config::DEFAULT_FILENAME
       end
data/lib/sitediff/crawler.rb CHANGED
@@ -14,14 +14,24 @@ class SiteDiff
     DEFAULT_DEPTH = 3
 
     # Create a crawler with a base URL
-    def initialize(hydra, base, depth = DEFAULT_DEPTH,
-                   curl_opts = UriWrapper::DEFAULT_CURL_OPTS, &block)
+    def initialize(hydra, base,
+                   interval,
+                   whitelist,
+                   blacklist,
+                   depth = DEFAULT_DEPTH,
+                   curl_opts = UriWrapper::DEFAULT_CURL_OPTS,
+                   debug = true,
+                   &block)
       @hydra = hydra
       @base_uri = Addressable::URI.parse(base)
       @base = base
+      @interval = interval
+      @whitelist = whitelist
+      @blacklist = blacklist
       @found = Set.new
       @callback = block
       @curl_opts = curl_opts
+      @debug = debug
 
       add_uri('', depth)
     end
@@ -32,7 +42,7 @@ class SiteDiff
 
       @found << rel
 
-      wrapper = UriWrapper.new(@base + rel, @curl_opts)
+      wrapper = UriWrapper.new(@base + rel, @curl_opts, @debug)
       wrapper.queue(@hydra) do |res|
         fetched_uri(rel, depth, res)
       end
@@ -58,6 +68,11 @@ class SiteDiff
         read_result: res,
         document: doc
       )
+      # Insert delay to limit fetching rate
+      if @interval != 0
+        SiteDiff.log("Waiting #{@interval} milliseconds.", :info)
+        sleep(@interval / 1000.0)
+      end
      @callback[info]
 
       return unless depth >= 1
@@ -99,7 +114,21 @@ class SiteDiff
     # Filter out links we don't want. Links passed in are absolute URIs.
     def filter_links(uris)
       uris.find_all do |u|
-        u.host == @base_uri.host && u.path.start_with?(@base_uri.path)
+        is_sub_uri = (u.host == @base_uri.host) && u.path.start_with?(@base_uri.path)
+        if is_sub_uri
+          is_whitelisted = @whitelist.nil? ? false : @whitelist.match(u.path)
+          is_blacklisted = @blacklist.nil? ? false : @blacklist.match(u.path)
+          if is_blacklisted && !is_whitelisted
+            SiteDiff.log "Ignoring blacklisted URL #{u.path}", :info
+          end
+          is_whitelisted || !is_blacklisted
+        end
+        # SiteDiff.log "Filtering URL #{u.path}", :info
+        # SiteDiff.log Regexp.new(@blacklist).match(u.path).inspect, :info
+        # (u.host == @base_uri.host) &&
+        # (u.path.start_with?(@base_uri.path)) &&
+        # (@whitelist == '' || Regexp.new(@whitelist).match(u.path)) &&
+        # (@blacklist == '' || !(Regexp.new(@blacklist).match(u.path)))
       end
     end
   end
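
The filter gives the whitelist precedence: a path matching both lists is still crawled, and only paths that are blacklisted without being whitelisted get logged and dropped. URIs outside the base host and path make the find_all block yield nil, which is falsy, so external links are excluded as before. The decision rule as a standalone sketch:

    # Keep a path if it is whitelisted, or simply not blacklisted.
    def keep_path?(path, whitelist, blacklist)
      whitelisted = whitelist ? !whitelist.match(path).nil? : false
      blacklisted = blacklist ? !blacklist.match(path).nil? : false
      whitelisted || !blacklisted
    end

    keep_path?('/news/a', /^\/news/, /^\/news/)  # => true (whitelist wins)
    keep_path?('/private/b', nil, /^\/private/)  # => false
    keep_path?('/public/c', nil, nil)            # => true
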
data/lib/sitediff/diff.rb CHANGED
@@ -4,6 +4,7 @@ require 'sitediff'
 require 'diffy'
 require 'erb'
 require 'rainbow'
+require 'digest'
 
 class SiteDiff
   module Diff
@@ -15,6 +16,28 @@ class SiteDiff
         diff.to_s(:html) : nil
     end
 
+    def encoding_blurb(encoding)
+      if encoding
+        "Text content returned - charset #{encoding}"
+      else
+        'Binary content returned'
+      end
+    end
+
+    def binary_diffy(before, after, before_encoding, after_encoding)
+      if before_encoding || after_encoding
+        Diffy::Diff.new(encoding_blurb(before_encoding),
+                        encoding_blurb(after_encoding)).to_s(:html)
+      elsif before == after
+        nil
+      else
+        md5_before = Digest::MD5.hexdigest(before)
+        md5_after = Digest::MD5.hexdigest(after)
+        Diffy::Diff.new("Binary content returned md5: #{md5_before}",
+                        "Binary content returned md5: #{md5_after}").to_s(:html)
+      end
+    end
+
     def terminal_diffy(before_html, after_html)
       args = []
       args << :color if Rainbow.enabled
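
binary_diffy never diffs raw bytes. When either side carries a charset it diffs the one-line blurbs, and otherwise it compares MD5 digests, so the report can only say that binary content changed, not how. The digest branch in isolation (file names hypothetical):

    require 'digest'

    before = File.binread('before.png')
    after  = File.binread('after.png')

    unless before == after
      puts "md5 before: #{Digest::MD5.hexdigest(before)}"
      puts "md5 after:  #{Digest::MD5.hexdigest(after)}"
    end
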
data/lib/sitediff/fetch.rb CHANGED
@@ -8,12 +8,15 @@ class SiteDiff
   # Cache is a cache object, see sitediff/cache
   # Paths is a list of sub-paths
   # Tags is a hash of tag names => base URLs.
-  def initialize(cache, paths, concurrency = 3, curl_opts = nil, **tags)
+  def initialize(cache, paths, interval, concurrency = 3, curl_opts = nil,
+                 debug = true, **tags)
     @cache = cache
+    @interval = interval
     @paths = paths
     @tags = tags
     @curl_opts = curl_opts || UriWrapper::DEFAULT_CURL_OPTS
     @concurrency = concurrency
+    @debug = debug
   end
 
   # Fetch all the paths, once per tag.
@@ -41,8 +44,13 @@ class SiteDiff
       results[tag] = UriWrapper::ReadResult.error('Not cached')
       process_results(path, results)
     else
-      uri = UriWrapper.new(base + path, @curl_opts)
+      uri = UriWrapper.new(base + path, @curl_opts, @debug)
       uri.queue(@hydra) do |resl|
+        # Insert delay to limit fetching rate
+        if @interval != 0
+          SiteDiff.log("Waiting #{@interval} milliseconds.", :info)
+          sleep(@interval / 1000.0)
+        end
        @cache.set(tag, path, resl)
        results[tag] = resl
        process_results(path, results)
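
The interval arrives in milliseconds while Kernel#sleep takes seconds, hence the floating-point division; and since the sleep runs inside the response callback, it only throttles cleanly when concurrency is 1, which the CLI enforces:

    interval = 250            # milliseconds, as given on the command line
    sleep(interval / 1000.0)  # 0.25 s; integer division (interval / 1000) would sleep 0 s
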
data/lib/sitediff/result.rb CHANGED
@@ -6,7 +6,7 @@ require 'digest/sha1'
 require 'fileutils'
 
 class SiteDiff
-  class Result < Struct.new(:path, :before, :after, :error, :verbose)
+  class Result < Struct.new(:path, :before, :after, :before_encoding, :after_encoding, :error, :verbose)
     STATUS_SUCCESS = 0 # Identical before and after
     STATUS_FAILURE = 1 # Different before and after
     STATUS_ERROR = 2 # Couldn't fetch page
@@ -19,7 +19,11 @@ class SiteDiff
     if error
       @status = STATUS_ERROR
     else
-      @diff = Diff.html_diffy(before, after)
+      if !before_encoding || !after_encoding
+        @diff = Diff.binary_diffy(before, after, before_encoding, after_encoding)
+      else
+        @diff = Diff.html_diffy(before, after)
+      end
       @status = @diff ? STATUS_FAILURE : STATUS_SUCCESS
     end
   end
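
Because Result is a positional Struct, inserting the two encoding members ahead of :error shifts every later argument, which is why the Result.new call sites above gained extra nils. A bare-Struct stand-in showing the hazard of a stale call:

    R = Struct.new(:path, :before, :after, :before_encoding, :after_encoding, :error, :verbose)

    ok = R.new('/a', '<p>x</p>', '<p>y</p>', 'utf-8', 'utf-8', nil)
    ok.error    # => nil
    stale = R.new('/a', nil, nil, 'boom') # old 0.0.5-style four-argument call
    stale.error # => nil; the error string landed in before_encoding instead
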
data/lib/sitediff/sanitize.rb CHANGED
@@ -96,6 +96,10 @@ class SiteDiff
       selector.each { |r| r.apply(@node) }
       @html = Sanitizer.prettify(@node)
       @node = nil
+      # Prevent potential UTF-8 encoding errors by removing invalid bytes.
+      # Not the only solution. An alternative is to return the
+      # string unmodified.
+      @html = @html.encode('UTF-8', 'binary', invalid: :replace, undef: :replace, replace: '')
       global.each { |r| r.apply(@html) }
     end
 
@@ -144,6 +148,10 @@ class SiteDiff
 
     # There's a lot of cruft left over, that we don't want
 
+    # Prevent potential UTF-8 encoding errors by removing invalid bytes.
+    # Not the only solution.
+    # An alternative is to return the string unmodified.
+    str = str.encode('UTF-8', 'binary', invalid: :replace, undef: :replace, replace: '')
     # Remove xml declaration and <html> tags
     str.sub!(/\A<\?xml.*$\n/, '')
     str.sub!(/\A^<html>$\n/, '')
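
One caveat with encode('UTF-8', 'binary', ...): transcoding from ASCII-8BIT leaves every byte above 0x7F undefined, so this strips all non-ASCII characters, valid multibyte UTF-8 included, not just the invalid bytes (String#scrub, by contrast, drops only the invalid ones). For example:

    s = "caf\xC3\xA9 \xFF!".dup.force_encoding('UTF-8') # "café" plus an invalid \xFF byte
    s.valid_encoding? # => false
    s.encode('UTF-8', 'binary', invalid: :replace, undef: :replace, replace: '')
    # => "caf !" (both the é and the \xFF are removed)
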
data/lib/sitediff/sanitize/regexp.rb CHANGED
@@ -47,6 +47,8 @@ class SiteDiff
     def gsub!(str)
       re = ::Regexp.new(@rule['pattern'])
       sub = @rule['substitute'] || ''
+      # Expecting a mutation here. Do not reassign the variable str
+      # for the purpose of removing UTF-8 encoding errors.
       str.gsub!(re, sub)
       str
     end
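
The new comment pins down a real constraint: callers rely on gsub! mutating the string they passed in, so rewriting this as str = str.gsub(...) or str = str.encode(...) would silently break them. A minimal illustration of the difference:

    def mutate!(str)
      str.gsub!(/\d+/, 'N') # in place: the caller's object changes
      str
    end

    def rebind(str)
      str = str.gsub(/\d+/, 'N') # rebinds the parameter only
      str
    end

    a = 'page 42'; mutate!(a); a # => "page N"
    b = 'page 42'; rebind(b);  b # => "page 42"
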
data/lib/sitediff/uriwrapper.rb CHANGED
@@ -18,10 +18,11 @@ class SiteDiff
 
   # This lets us treat errors or content as one object
   class ReadResult
-    attr_accessor :content, :error_code, :error
+    attr_accessor :encoding, :content, :error_code, :error
 
-    def initialize(content = nil)
+    def initialize(content = nil, encoding = 'utf-8')
       @content = content
+      @encoding = encoding
       @error = nil
       @error_code = nil
     end
@@ -34,11 +35,12 @@ class SiteDiff
     end
   end
 
-  def initialize(uri, curl_opts = DEFAULT_CURL_OPTS)
+  def initialize(uri, curl_opts = DEFAULT_CURL_OPTS, debug = true)
     @uri = uri.respond_to?(:scheme) ? uri : Addressable::URI.parse(uri)
     # remove trailing '/'s from local URIs
     @uri.path.gsub!(%r{/*$}, '') if local?
     @curl_opts = curl_opts
+    @debug = debug
   end
 
   def user
@@ -78,7 +80,7 @@ class SiteDiff
 
   # Returns the encoding of an HTTP response from headers, nil if not
   # specified.
-  def http_encoding(http_headers)
+  def charset_encoding(http_headers)
     if (content_type = http_headers['Content-Type'])
       if (md = /;\s*charset=([-\w]*)/.match(content_type))
         md[1]
@@ -101,10 +103,23 @@ class SiteDiff
     body = resp.body
     # Typhoeus does not respect HTTP headers when setting the encoding
     # resp.body; coerce if possible.
-    if (encoding = http_encoding(resp.headers))
+    if (encoding = charset_encoding(resp.headers))
       body.force_encoding(encoding)
     end
-    yield ReadResult.new(body)
+    # Should be wrapped with rescue I guess? Maybe this entire function?
+    # Should at least be an option in the Cli to disable this.
+    # "stop on first error"
+    begin
+      yield ReadResult.new(body, encoding)
+    rescue ArgumentError => e
+      raise if @debug
+
+      yield ReadResult.error("Parsing error for #{@uri}: #{e.message}")
+    rescue => e
+      raise if @debug
+
+      yield ReadResult.error("Unknown parsing error for #{@uri}: #{e.message}")
+    end
   end
 
   req.on_failure do |resp|
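
The rename from http_encoding to charset_encoding matches what the method actually does: it extracts the charset parameter from the Content-Type header and returns nil when there is none, and that nil is what later marks a response as binary. A condensed standalone equivalent:

    def charset_encoding(http_headers)
      content_type = http_headers['Content-Type']
      return nil unless content_type

      md = /;\s*charset=([-\w]*)/.match(content_type)
      md && md[1]
    end

    charset_encoding('Content-Type' => 'text/html; charset=utf-8') # => "utf-8"
    charset_encoding('Content-Type' => 'image/png')                # => nil
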
metadata CHANGED
@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: sitediff
 version: !ruby/object:Gem::Version
-  version: 0.0.5
+  version: 0.0.6
 platform: ruby
 authors:
 - Alex Dergachev
@@ -10,7 +10,7 @@ authors:
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2018-12-14 00:00:00.000000000 Z
+date: 2019-04-02 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: pkg-config
@@ -167,7 +167,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
     version: '0'
 requirements: []
 rubyforge_project:
-rubygems_version: 2.5.2
+rubygems_version: 2.5.2.3
 signing_key:
 specification_version: 4
 summary: Compare two versions of a site with ease!