sitediff 0.0.5 → 0.0.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 93ab2ffc296a3c9de8ea835e47f435e0193e7854
4
- data.tar.gz: 5a80c5bb738912114aeb029c4ff283ff890f2ce2
3
+ metadata.gz: 1377b6bafe658b4a8a8f50ef0f54e577e99f1a87
4
+ data.tar.gz: 9a80e20a89b7f2f60506bbcccdc2b9f7037320f8
5
5
  SHA512:
6
- metadata.gz: fd07a6ff9a14a8da965f4f5b09acb1c7f30be594d97f6383c3bec6545d1b5bb3fc26451a9943ed3fc3c81d261a0fa4ca6c5fcc5355d7263ae09700800af5af9d
7
- data.tar.gz: bb940ede7b68b1e047dbda46d66fade88d06e0f54d9e257842a311f119efb4043939ea9b83021cdf2fd5c76ac3a23da92e7da91072cc5d4421a1717902093e17
6
+ metadata.gz: 7fe3ce1b2e7bc1762d5e8f4a1bfd4ab9280963732d033d4b403087f71a4d6caa394669eadb9d82064ad963dd62918f95cae7e0b0495dd92979f105be1bfe6f5e
7
+ data.tar.gz: fb98c439544172ae40c0ba347272ec1287a1dc9042ab238e4abd8f720d52307ffa8a4b3fec7a70df68c7fdf845b324fe69c99693c5bbbc369f9e2c22fbe8c404
data/lib/sitediff.rb CHANGED
@@ -54,10 +54,11 @@ class SiteDiff
54
54
  @config.after['url']
55
55
  end
56
56
 
57
- def initialize(config, cache, concurrency, verbose = true)
57
+ def initialize(config, cache, concurrency, interval, verbose = true, debug = false)
58
58
  @cache = cache
59
59
  @verbose = verbose
60
-
60
+ @debug = debug
61
+ @interval = interval
61
62
  # Check for single-site mode
62
63
  validate_opts = {}
63
64
  if !config.before['url'] && @cache.tag?(:before)
@@ -77,18 +78,33 @@ class SiteDiff
77
78
  def sanitize(path, read_results)
78
79
  %i[before after].map do |tag|
79
80
  html = read_results[tag].content
80
- config = @config.send(tag)
81
- Sanitizer.new(html, config, path: path).sanitize
81
+ encoding = read_results[tag].encoding
82
+ if encoding
83
+ config = @config.send(tag)
84
+ Sanitizer.new(html, config, path: path).sanitize
85
+ else
86
+ html
87
+ end
82
88
  end
83
89
  end
84
90
 
85
91
  # Process a set of read results
86
92
  def process_results(path, read_results)
87
- diff = if (error = (read_results[:before].error || read_results[:after].error))
88
- Result.new(path, nil, nil, error)
89
- else
90
- Result.new(path, *sanitize(path, read_results), nil)
91
- end
93
+ if (error = (read_results[:before].error || read_results[:after].error))
94
+ diff = Result.new(path, nil, nil, nil, nil, error)
95
+ else
96
+ begin
97
+ diff = Result.new(path,
98
+ *sanitize(path, read_results),
99
+ read_results[:before].encoding,
100
+ read_results[:after].encoding,
101
+ nil)
102
+ rescue => e
103
+ raise if @debug
104
+
105
+ Result.new(path, nil, nil, nil, nil, "Sanitization error: #{e}")
106
+ end
107
+ end
92
108
  @results[path] = diff
93
109
 
94
110
  # Print results in order!
@@ -100,7 +116,7 @@ class SiteDiff
100
116
 
101
117
  # Perform the comparison, populate @results and return the number of failing
102
118
  # paths (paths with non-zero diff).
103
- def run(curl_opts = {})
119
+ def run(curl_opts = {}, debug = true)
104
120
  # Map of path -> Result object, populated by process_results
105
121
  @results = {}
106
122
  @ordered = @config.paths.dup
@@ -115,7 +131,7 @@ class SiteDiff
115
131
  # so passing this instead but @config.after['curl_opts'] is ignored.
116
132
  config_curl_opts = @config.before['curl_opts']
117
133
  curl_opts = config_curl_opts.clone.merge(curl_opts) if config_curl_opts
118
- fetcher = Fetch.new(@cache, @config.paths, @concurrency, curl_opts,
134
+ fetcher = Fetch.new(@cache, @config.paths, @interval, @concurrency, curl_opts, debug,
119
135
  before: before, after: after)
120
136
  fetcher.run(&method(:process_results))
121
137
 
@@ -8,10 +8,13 @@ class SiteDiff
8
8
  attr_accessor :read_tags, :write_tags
9
9
 
10
10
  def initialize(opts = {})
11
- @dir = opts[:dir] || '.'
12
11
  @create = opts[:create]
12
+
13
+ # Read and Write tags are sets that can contain :before and :after
14
+ # They indicate whether we should use the cache for reading or writing
13
15
  @read_tags = Set.new
14
16
  @write_tags = Set.new
17
+ @dir = opts[:directory] || '.'
15
18
  end
16
19
 
17
20
  # Is a tag cached?
@@ -63,5 +66,12 @@ class SiteDiff
63
66
  # Ensure encoding stays the same!
64
67
  Marshal.dump([tag, path.encode('UTF-8')])
65
68
  end
69
+
70
+ def get_dir(directory)
71
+ # Create the dir. Must go before cache initialization!
72
+ @dir = Pathname.new(directory || '.')
73
+ @dir.mkpath unless @dir.directory?
74
+ @dir.to_s
75
+ end
66
76
  end
67
77
  end
data/lib/sitediff/cli.rb CHANGED
@@ -23,6 +23,14 @@ class SiteDiff
23
23
  type: :boolean,
24
24
  default: false,
25
25
  desc: 'Ignore many HTTPS/SSL errors'
26
+ class_option :debug,
27
+ type: :boolean,
28
+ default: false,
29
+ desc: 'Debug mode. Stop on certain errors and produce a traceback.'
30
+ class_option :interval,
31
+ type: :numeric,
32
+ default: 0,
33
+ desc: 'Crawling delay - interval in milliseconds'
26
34
 
27
35
  # Thor, by default, exits with 0 no matter what!
28
36
  def self.exit_on_failure?
@@ -75,7 +83,10 @@ class SiteDiff
75
83
  desc: 'Max number of concurrent connections made'
76
84
  desc 'diff [OPTIONS] [CONFIGFILES]', 'Perform systematic diff on given URLs'
77
85
  def diff(*config_files)
78
- config = SiteDiff::Config.new(config_files, options[:directory])
86
+ @interval = options['interval']
87
+ check_interval(@interval)
88
+ @dir = get_dir(options['directory'])
89
+ config = SiteDiff::Config.new(config_files, @dir)
79
90
 
80
91
  # override config based on options
81
92
  paths = options['paths']
@@ -100,20 +111,21 @@ class SiteDiff
100
111
 
101
112
  # Setup cache
102
113
  cache = SiteDiff::Cache.new(create: options['cached'] != 'none',
103
- dir: options['directory'])
114
+ directory: @dir)
104
115
  cache.read_tags << :before if %w[before all].include?(options['cached'])
105
116
  cache.read_tags << :after if %w[after all].include?(options['cached'])
106
117
  cache.write_tags << :before << :after
107
118
 
108
- sitediff = SiteDiff.new(config, cache, options[:concurrency],
109
- options['verbose'])
110
- num_failing = sitediff.run(get_curl_opts(options))
119
+ sitediff = SiteDiff.new(config, cache, options[:concurrency], @interval,
120
+ options['verbose'], options[:debug])
121
+ num_failing = sitediff.run(get_curl_opts(options), options[:debug])
111
122
  exit_code = num_failing > 0 ? 2 : 0
112
123
 
113
- sitediff.dump(options['directory'], options['before-report'],
124
+ sitediff.dump(@dir, options['before-report'],
114
125
  options['after-report'])
115
126
  rescue Config::InvalidConfig => e
116
127
  SiteDiff.log "Invalid configuration: #{e.message}", :error
128
+ SiteDiff.log "at #{e.backtrace}", :error
117
129
  else # no exception was raised
118
130
  # Thor::Error --> exit(1), guaranteed by exit_on_failure?
119
131
  # Failing diff --> exit(2), populated above
@@ -132,8 +144,8 @@ class SiteDiff
132
144
  def serve(*config_files)
133
145
  config = SiteDiff::Config.new(config_files, options['directory'])
134
146
  # Could check non-empty config here but currently errors are already raised.
135
-
136
- cache = Cache.new(dir: options['directory'])
147
+ @dir = get_dir(options['directory'])
148
+ cache = Cache.new(directory: @dir)
137
149
  cache.read_tags << :before << :after
138
150
 
139
151
  SiteDiff::Webserver::ResultServer.new(
@@ -145,6 +157,7 @@ class SiteDiff
145
157
  ).wait
146
158
  rescue SiteDiffException => e
147
159
  SiteDiff.log e.message, :error
160
+ SiteDiff.log e.backtrace, :error
148
161
  end
149
162
 
150
163
  option :depth,
@@ -160,19 +173,37 @@ class SiteDiff
160
173
  type: :numeric,
161
174
  default: 3,
162
175
  desc: 'Max number of concurrent connections made'
176
+ option :whitelist,
177
+ type: :string,
178
+ default: '',
179
+ desc: 'Optional whitelist for crawling'
180
+ option :blacklist,
181
+ type: :string,
182
+ default: '',
183
+ desc: 'Optional blacklist for crawling'
163
184
  desc 'init URL [URL]', 'Create a sitediff configuration'
164
185
  def init(*urls)
165
186
  unless (1..2).cover? urls.size
166
187
  SiteDiff.log 'sitediff init requires one or two URLs', :error
167
- exit 2
188
+ exit(2)
168
189
  end
169
190
 
191
+ @interval = options['interval']
192
+ check_interval(@interval)
193
+ @dir = get_dir(options['directory'])
170
194
  curl_opts = get_curl_opts(options)
171
-
172
- creator = SiteDiff::Config::Creator.new(options[:concurrency], curl_opts, *urls)
195
+ @whitelist = create_regexp(options['whitelist'])
196
+ @blacklist = create_regexp(options['blacklist'])
197
+ creator = SiteDiff::Config::Creator.new(options[:concurrency],
198
+ options['interval'],
199
+ @whitelist,
200
+ @blacklist,
201
+ curl_opts,
202
+ options[:debug],
203
+ *urls)
173
204
  creator.create(
174
205
  depth: options[:depth],
175
- directory: options[:directory],
206
+ directory: @dir,
176
207
  rules: options[:rules] != 'no',
177
208
  rules_disabled: (options[:rules] == 'disabled')
178
209
  ) do |_tag, info|
@@ -193,14 +224,19 @@ class SiteDiff
193
224
  desc 'store [CONFIGFILES]',
194
225
  'Cache the current contents of a site for later comparison'
195
226
  def store(*config_files)
196
- config = SiteDiff::Config.new(config_files, options['directory'])
227
+ @dir = get_dir(options['directory'])
228
+ config = SiteDiff::Config.new(config_files, @dir)
197
229
  config.validate(need_before: false)
198
-
199
- cache = SiteDiff::Cache.new(create: true)
230
+ cache = SiteDiff::Cache.new(directory: @dir, create: true)
200
231
  cache.write_tags << :before
201
232
 
202
233
  base = options[:url] || config.after['url']
203
- fetcher = SiteDiff::Fetch.new(cache, config.paths, options['concurrency'],
234
+ fetcher = SiteDiff::Fetch.new(cache,
235
+ config.paths,
236
+ options[:interval],
237
+ options[:concurrency],
238
+ get_curl_opts(options),
239
+ options[:debug],
204
240
  before: base)
205
241
  fetcher.run do |path, _res|
206
242
  SiteDiff.log "Visited #{path}, cached"
@@ -219,6 +255,32 @@ class SiteDiff
219
255
  end
220
256
  curl_opts
221
257
  end
258
+
259
# Abort when a crawl delay is requested together with parallel fetching.
#
# interval - value of the --interval option (crawling delay in
#            milliseconds); 0 means no delay was requested.
#
# Returns nil when the settings are compatible. Otherwise logs the problem
# at :error severity (like the other fatal CLI messages) and exits the
# process with status 2.
def check_interval(interval)
  # The delay is only allowed when requests are issued one at a time.
  return if interval == 0 || options[:concurrency] == 1

  SiteDiff.log '--concurrency must be set to 1 in order to enable the interval feature', :error
  exit(2)
end
265
+
266
# Resolve the working directory and make sure it exists.
# Must go before cache initialization!
#
# directory - path of the directory to use; nil falls back to '.'.
#
# Returns the directory path as a String.
def get_dir(directory)
  # Kept in a local: callers assign the returned String to @dir themselves,
  # so storing the intermediate Pathname in @dir here would only leave the
  # instance variable holding a different type until that assignment runs.
  dir = Pathname.new(directory || '.')
  dir.mkpath unless dir.directory?
  dir.to_s
end
272
+
273
# Compile a --whitelist / --blacklist option value into a Regexp.
#
# string_param - pattern source; nil or '' means that no pattern was given.
#
# Returns the compiled Regexp, or nil when no pattern was given or when the
# pattern is not a valid regular expression. In the latter case the problem
# is logged at :error severity instead of being raised.
def create_regexp(string_param)
  return nil if string_param.nil? || string_param == ''

  Regexp.new(string_param)
rescue RegexpError => e
  # Regexp.new reports a malformed pattern with RegexpError; rescuing any
  # other class would let the error escape and skip the messages below.
  SiteDiff.log 'whitelist and blacklist parameters must be valid regular expressions', :error
  SiteDiff.log e.message, :error
  SiteDiff.log e.backtrace, :error
  nil
end
222
284
  end
223
285
  end
224
286
  end
@@ -11,11 +11,15 @@ require 'yaml'
11
11
  class SiteDiff
12
12
  class Config
13
13
  class Creator
14
- def initialize(concurrency, curl_opts, *urls)
14
+ def initialize(concurrency, interval, whitelist, blacklist, curl_opts, debug, *urls)
15
15
  @concurrency = concurrency
16
+ @interval = interval
17
+ @whitelist = whitelist
18
+ @blacklist = blacklist
16
19
  @after = urls.pop
17
20
  @before = urls.pop # May be nil
18
21
  @curl_opts = curl_opts
22
+ @debug = debug
19
23
  end
20
24
 
21
25
  def roots
@@ -30,18 +34,15 @@ class SiteDiff
30
34
  def create(opts, &block)
31
35
  @config = {}
32
36
  @callback = block
33
-
34
- # Handle options
35
37
  @dir = Pathname.new(opts[:directory])
38
+
39
+ # Handle other options
36
40
  @depth = opts[:depth]
37
41
  @rules = Rules.new(@config, opts[:rules_disabled]) if opts[:rules]
38
42
 
39
- # Create the dir. Must go before cache initialization!
40
- @dir.mkpath unless @dir.directory?
41
-
42
43
  # Setup instance vars
43
44
  @paths = Hash.new { |h, k| h[k] = Set.new }
44
- @cache = Cache.new(dir: @dir.to_s, create: true)
45
+ @cache = Cache.new(directory: @dir.to_s, create: true)
45
46
  @cache.write_tags << :before << :after
46
47
 
47
48
  build_config
@@ -64,7 +65,7 @@ class SiteDiff
64
65
  def crawl(depth = nil)
65
66
  hydra = Typhoeus::Hydra.new(max_concurrency: @concurrency)
66
67
  roots.each do |tag, u|
67
- Crawler.new(hydra, u, depth, @curl_opts) do |info|
68
+ Crawler.new(hydra, u, @interval, @whitelist, @blacklist, depth, @curl_opts, @debug) do |info|
68
69
  crawled_path(tag, info)
69
70
  end
70
71
  end
@@ -113,6 +114,10 @@ class SiteDiff
113
114
  end
114
115
  end
115
116
 
117
+ def directory
118
+ @dir
119
+ end
120
+
116
121
  def config_file
117
122
  @dir + Config::DEFAULT_FILENAME
118
123
  end
@@ -14,14 +14,24 @@ class SiteDiff
14
14
  DEFAULT_DEPTH = 3
15
15
 
16
16
  # Create a crawler with a base URL
17
- def initialize(hydra, base, depth = DEFAULT_DEPTH,
18
- curl_opts = UriWrapper::DEFAULT_CURL_OPTS, &block)
17
+ def initialize(hydra, base,
18
+ interval,
19
+ whitelist,
20
+ blacklist,
21
+ depth = DEFAULT_DEPTH,
22
+ curl_opts = UriWrapper::DEFAULT_CURL_OPTS,
23
+ debug = true,
24
+ &block)
19
25
  @hydra = hydra
20
26
  @base_uri = Addressable::URI.parse(base)
21
27
  @base = base
28
+ @interval = interval
29
+ @whitelist = whitelist
30
+ @blacklist = blacklist
22
31
  @found = Set.new
23
32
  @callback = block
24
33
  @curl_opts = curl_opts
34
+ @debug = debug
25
35
 
26
36
  add_uri('', depth)
27
37
  end
@@ -32,7 +42,7 @@ class SiteDiff
32
42
 
33
43
  @found << rel
34
44
 
35
- wrapper = UriWrapper.new(@base + rel, @curl_opts)
45
+ wrapper = UriWrapper.new(@base + rel, @curl_opts, @debug)
36
46
  wrapper.queue(@hydra) do |res|
37
47
  fetched_uri(rel, depth, res)
38
48
  end
@@ -58,6 +68,11 @@ class SiteDiff
58
68
  read_result: res,
59
69
  document: doc
60
70
  )
71
+ # Insert delay to limit fetching rate
72
+ if @interval != 0
73
+ SiteDiff.log("Waiting #{@interval} milliseconds.", :info)
74
+ sleep(@interval / 1000.0)
75
+ end
61
76
  @callback[info]
62
77
 
63
78
  return unless depth >= 1
@@ -99,7 +114,21 @@ class SiteDiff
99
114
  # Filter out links we don't want. Links passed in are absolute URIs.
100
115
  def filter_links(uris)
101
116
  uris.find_all do |u|
102
- u.host == @base_uri.host && u.path.start_with?(@base_uri.path)
117
+ is_sub_uri = (u.host == @base_uri.host) && u.path.start_with?(@base_uri.path)
118
+ if is_sub_uri
119
+ is_whitelisted = @whitelist.nil? ? false : @whitelist.match(u.path)
120
+ is_blacklisted = @blacklist.nil? ? false : @blacklist.match(u.path)
121
+ if is_blacklisted && !is_whitelisted
122
+ SiteDiff.log "Ignoring blacklisted URL #{u.path}", :info
123
+ end
124
+ is_whitelisted || !is_blacklisted
125
+ end
126
+ # SiteDiff.log "Filtering URL #{u.path}", :info
127
+ # SiteDiff.log Regexp.new(@blacklist).match(u.path).inspect, :info
128
+ # (u.host == @base_uri.host) &&
129
+ # (u.path.start_with?(@base_uri.path)) &&
130
+ # (@whitelist == '' || Regexp.new(@whitelist).match(u.path)) &&
131
+ # (@blacklist == '' || !(Regexp.new(@blacklist).match(u.path)))
103
132
  end
104
133
  end
105
134
  end
data/lib/sitediff/diff.rb CHANGED
@@ -4,6 +4,7 @@ require 'sitediff'
4
4
  require 'diffy'
5
5
  require 'erb'
6
6
  require 'rainbow'
7
+ require 'digest'
7
8
 
8
9
  class SiteDiff
9
10
  module Diff
@@ -15,6 +16,28 @@ class SiteDiff
15
16
  diff.to_s(:html) : nil
16
17
  end
17
18
 
19
+ def encoding_blurb(encoding)
20
+ if encoding
21
+ "Text content returned - charset #{encoding}"
22
+ else
23
+ 'Binary content returned'
24
+ end
25
+ end
26
+
27
+ def binary_diffy(before, after, before_encoding, after_encoding)
28
+ if before_encoding || after_encoding
29
+ Diffy::Diff.new(encoding_blurb(before_encoding),
30
+ encoding_blurb(after_encoding)).to_s(:html)
31
+ elsif before == after
32
+ nil
33
+ else
34
+ md5_before = Digest::MD5.hexdigest(before)
35
+ md5_after = Digest::MD5.hexdigest(after)
36
+ Diffy::Diff.new("Binary content returned md5: #{md5_before}",
37
+ "Binary content returned md5: #{md5_after}").to_s(:html)
38
+ end
39
+ end
40
+
18
41
  def terminal_diffy(before_html, after_html)
19
42
  args = []
20
43
  args << :color if Rainbow.enabled
@@ -8,12 +8,15 @@ class SiteDiff
8
8
  # Cache is a cache object, see sitediff/cache
9
9
  # Paths is a list of sub-paths
10
10
  # Tags is a hash of tag names => base URLs.
11
- def initialize(cache, paths, concurrency = 3, curl_opts = nil, **tags)
11
+ def initialize(cache, paths, interval, concurrency = 3, curl_opts = nil,
12
+ debug = true, **tags)
12
13
  @cache = cache
14
+ @interval = interval
13
15
  @paths = paths
14
16
  @tags = tags
15
17
  @curl_opts = curl_opts || UriWrapper::DEFAULT_CURL_OPTS
16
18
  @concurrency = concurrency
19
+ @debug = debug
17
20
  end
18
21
 
19
22
  # Fetch all the paths, once per tag.
@@ -41,8 +44,13 @@ class SiteDiff
41
44
  results[tag] = UriWrapper::ReadResult.error('Not cached')
42
45
  process_results(path, results)
43
46
  else
44
- uri = UriWrapper.new(base + path, @curl_opts)
47
+ uri = UriWrapper.new(base + path, @curl_opts, @debug)
45
48
  uri.queue(@hydra) do |resl|
49
+ # Insert delay to limit fetching rate
50
+ if @interval != 0
51
+ SiteDiff.log("Waiting #{@interval} milliseconds.", :info)
52
+ sleep(@interval / 1000.0)
53
+ end
46
54
  @cache.set(tag, path, resl)
47
55
  results[tag] = resl
48
56
  process_results(path, results)
@@ -6,7 +6,7 @@ require 'digest/sha1'
6
6
  require 'fileutils'
7
7
 
8
8
  class SiteDiff
9
- class Result < Struct.new(:path, :before, :after, :error, :verbose)
9
+ class Result < Struct.new(:path, :before, :after, :before_encoding, :after_encoding, :error, :verbose)
10
10
  STATUS_SUCCESS = 0 # Identical before and after
11
11
  STATUS_FAILURE = 1 # Different before and after
12
12
  STATUS_ERROR = 2 # Couldn't fetch page
@@ -19,7 +19,11 @@ class SiteDiff
19
19
  if error
20
20
  @status = STATUS_ERROR
21
21
  else
22
- @diff = Diff.html_diffy(before, after)
22
+ if !before_encoding || !after_encoding
23
+ @diff = Diff.binary_diffy(before, after, before_encoding, after_encoding)
24
+ else
25
+ @diff = Diff.html_diffy(before, after)
26
+ end
23
27
  @status = @diff ? STATUS_FAILURE : STATUS_SUCCESS
24
28
  end
25
29
  end
@@ -96,6 +96,10 @@ class SiteDiff
96
96
  selector.each { |r| r.apply(@node) }
97
97
  @html = Sanitizer.prettify(@node)
98
98
  @node = nil
99
+ # Prevent potential UTF-8 encoding errors by removing bytes
100
+ # Not the only solution. An alternative is to return the
101
+ # string unmodified.
102
+ @html = @html.encode('UTF-8', 'binary', invalid: :replace, undef: :replace, replace: '')
99
103
  global.each { |r| r.apply(@html) }
100
104
  end
101
105
 
@@ -144,6 +148,10 @@ class SiteDiff
144
148
 
145
149
  # There's a lot of cruft left over,that we don't want
146
150
 
151
+ # Prevent potential UTF-8 encoding errors by removing invalid bytes.
152
+ # Not the only solution.
153
+ # An alternative is to return the string unmodified.
154
+ str = str.encode('UTF-8', 'binary', invalid: :replace, undef: :replace, replace: '')
147
155
  # Remove xml declaration and <html> tags
148
156
  str.sub!(/\A<\?xml.*$\n/, '')
149
157
  str.sub!(/\A^<html>$\n/, '')
@@ -47,6 +47,8 @@ class SiteDiff
47
47
  def gsub!(str)
48
48
  re = ::Regexp.new(@rule['pattern'])
49
49
  sub = @rule['substitute'] || ''
50
+ # Expecting a mutation here. Do not reassign the variable str
51
+ # for the purpose of removing UTF-8 encoding errors.
50
52
  str.gsub!(re, sub)
51
53
  str
52
54
  end
@@ -18,10 +18,11 @@ class SiteDiff
18
18
 
19
19
  # This lets us treat errors or content as one object
20
20
  class ReadResult
21
- attr_accessor :content, :error_code, :error
21
+ attr_accessor :encoding, :content, :error_code, :error
22
22
 
23
- def initialize(content = nil)
23
+ def initialize(content = nil, encoding = 'utf-8')
24
24
  @content = content
25
+ @encoding = encoding
25
26
  @error = nil
26
27
  @error_code = nil
27
28
  end
@@ -34,11 +35,12 @@ class SiteDiff
34
35
  end
35
36
  end
36
37
 
37
- def initialize(uri, curl_opts = DEFAULT_CURL_OPTS)
38
+ def initialize(uri, curl_opts = DEFAULT_CURL_OPTS, debug = true)
38
39
  @uri = uri.respond_to?(:scheme) ? uri : Addressable::URI.parse(uri)
39
40
  # remove trailing '/'s from local URIs
40
41
  @uri.path.gsub!(%r{/*$}, '') if local?
41
42
  @curl_opts = curl_opts
43
+ @debug = debug
42
44
  end
43
45
 
44
46
  def user
@@ -78,7 +80,7 @@ class SiteDiff
78
80
 
79
81
  # Returns the encoding of an HTTP response from headers , nil if not
80
82
  # specified.
81
- def http_encoding(http_headers)
83
+ def charset_encoding(http_headers)
82
84
  if (content_type = http_headers['Content-Type'])
83
85
  if (md = /;\s*charset=([-\w]*)/.match(content_type))
84
86
  md[1]
@@ -101,10 +103,23 @@ class SiteDiff
101
103
  body = resp.body
102
104
  # Typhoeus does not respect HTTP headers when setting the encoding
103
105
  # resp.body; coerce if possible.
104
- if (encoding = http_encoding(resp.headers))
106
+ if (encoding = charset_encoding(resp.headers))
105
107
  body.force_encoding(encoding)
106
108
  end
107
- yield ReadResult.new(body)
109
+ # Should be wrapped with rescue I guess? Maybe this entire function?
110
+ # Should at least be an option in the Cli to disable this.
111
+ # "stop on first error"
112
+ begin
113
+ yield ReadResult.new(body, encoding)
114
+ rescue ArgumentError => e
115
+ raise if @debug
116
+
117
+ yield ReadResult.error("Parsing error for #{@uri}: #{e.message}")
118
+ rescue => e
119
+ raise if @debug
120
+
121
+ yield ReadResult.error("Unknown parsing error for #{@uri}: #{e.message}")
122
+ end
108
123
  end
109
124
 
110
125
  req.on_failure do |resp|
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: sitediff
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.5
4
+ version: 0.0.6
5
5
  platform: ruby
6
6
  authors:
7
7
  - Alex Dergachev
@@ -10,7 +10,7 @@ authors:
10
10
  autorequire:
11
11
  bindir: bin
12
12
  cert_chain: []
13
- date: 2018-12-14 00:00:00.000000000 Z
13
+ date: 2019-04-02 00:00:00.000000000 Z
14
14
  dependencies:
15
15
  - !ruby/object:Gem::Dependency
16
16
  name: pkg-config
@@ -167,7 +167,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
167
167
  version: '0'
168
168
  requirements: []
169
169
  rubyforge_project:
170
- rubygems_version: 2.5.2
170
+ rubygems_version: 2.5.2.3
171
171
  signing_key:
172
172
  specification_version: 4
173
173
  summary: Compare two versions of a site with ease!