utf8_validator 0.0.1 → 0.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/README.rdoc CHANGED
@@ -16,6 +16,7 @@ That functionality is left as an exercise for the reader.
16
16
 
17
17
  The Unicode Consortium:: At http://unicode.org/ for all the information published there.
18
18
  Frank Yung-Fong Tang:: For the state machine algorithm. See: http://unicode.org/mail-arch/unicode-ml/y2003-m02/att-0467/01-The_Algorithm_to_Valide_an_UTF-8_String
19
+ Markus Kuhn:: For invalid test data. http://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-test.txt
19
20
 
20
21
  == A Word On Ruby Versions
21
22
 
data/Rakefile CHANGED
@@ -21,11 +21,11 @@ Validation algorithm.}
21
21
  gem.email = "allard.guy.m@gmail.com"
22
22
  gem.authors = ["Guy Allard"]
23
23
 
24
- # Include your dependencies below. Runtime dependencies are required when using your gem,
25
- # and development dependencies are only needed for development (ie running rake tasks, tests, etc)
24
+ # Runtime Dependencies - None at present
26
25
  # gem.add_runtime_dependency 'jabber4r', '> 0.1'
27
-
28
- gem.add_development_dependency 'bundler', '>= 2.1.2'
26
+ #
27
+ # Bundler/Jeweler takes care of this via the Gemfile.lock process
28
+ # gem.add_development_dependency 'bundler', '>= 2.1.2'
29
29
  end
30
30
  Jeweler::RubygemsDotOrgTasks.new
31
31
 
data/VERSION CHANGED
@@ -1 +1 @@
1
- 0.0.1
1
+ 0.0.2
@@ -0,0 +1,79 @@
1
+ # encoding: utf-8
2
+ #
3
+ # Show how to parse a full string with multiple UTF8 validation failures.
4
+ # Accumulate error information, and report it.
5
+ #
6
+ require 'rubygems' unless RUBY_VERSION =~ /1\.9/
7
+ require 'utf8_validator'
8
+ #
9
+ # = Purpose
10
+ #
11
+ # A helper class for processing multiple validation errors in a single string.
12
+ #
13
+ class ValidationHelper
14
+ #
15
+ attr_reader :error_list
16
+ #
17
+ # Get a validator instance.
18
+ #
19
+ def initialize
20
+ @validator = UTF8::Validator.new
21
+ end
22
+ #
23
+ # Validate the whole string.
24
+ #
25
+ def scanstring(string)
26
+ @error_list = ""
27
+ work_string = string
28
+ run_pos = 0
29
+ begin
30
+ @validator.valid_encoding?(work_string, true)
31
+ rescue UTF8::ValidationError => e
32
+ # Extract offset of error, keep running offset up to date
33
+ last_colon = e.message.rindex(':')
34
+ last_lparen = e.message.rindex('(')
35
+ epos = e.message[last_colon+1..last_lparen-1]
36
+ sub_start = epos.to_i
37
+ if run_pos == 0
38
+ run_pos += sub_start
39
+ else
40
+ run_pos += sub_start + 1
41
+ end
42
+ # Start again at error offset + 1
43
+ work_string = work_string[sub_start+1..-1]
44
+ # Build next error message
45
+ next_emsg = e.message[0..last_colon] # Part A of current message
46
+ # Add running offset position
47
+ run_pos_str = sprintf "%d(0x%x)", run_pos, run_pos
48
+ next_emsg += run_pos_str
49
+ #
50
+ @error_list += next_emsg
51
+ @error_list += "\n"
52
+ retry
53
+ end
54
+ end
55
+ end
56
+ #
57
+ puts "Started"
58
+ puts
59
+ #
60
+ helper = ValidationHelper.new
61
+ #
62
+ test_data = [
63
+ "a\xffbc\xfed",
64
+ "abcdefghijk\xffbcdefghijk\xfecdefg",
65
+ "anoerrorsz",
66
+ "errorlast\x80",
67
+ "a\xffbcd\xfeefgh\xfd123",
68
+ ]
69
+ #
70
+ test_data.each do |string|
71
+ puts "/" * 60
72
+ puts "#{string}"
73
+ helper.scanstring(string)
74
+ puts "#{helper.error_list}"
75
+ end
76
+ #
77
+ puts
78
+ puts "Complete"
79
+
@@ -1,4 +1,6 @@
1
- #
1
+ #--
2
+ # Copyright (c) 2011 Guy Allard
3
+ #--
2
4
  require 'validation/validator'
3
5
  require 'validation/errors'
4
6
 
@@ -1,3 +1,6 @@
1
+ #--
2
+ # Copyright (c) 2011 Guy Allard
3
+ #--
1
4
  module UTF8
2
5
  #
3
6
  # == Purpose
@@ -7,8 +10,5 @@ module UTF8
7
10
  #
8
11
  class ValidationError < ::RuntimeError
9
12
  #
10
- def message()
11
- "general UTF-8 validation error"
12
- end
13
13
  end
14
14
  end
@@ -60,30 +60,33 @@ module UTF8
60
60
  # Instances of this class are thread safe, and a single instance may be used
61
61
  # safely by multiple concurrent threads, with one caveat:
62
62
  #
63
- # The value of #{Validator::DEBUG} must not be changed by any thread.
63
+ # The value of #{DEBUG} must not be changed by any thread.
64
64
  #
65
65
  #--
66
66
  # Copyright (c) 2011 Guy Allard
67
- #
67
+ #--
68
68
  class Validator
69
69
  #
70
70
  # For use during development only.
71
71
  #
72
72
  DEBUG=false
73
-
74
73
  #
75
74
  # Validate the supplied string for proper UTF-8 encoding.
76
75
  #
77
76
  # Calling Sequence:
78
77
  #
78
+ # validator = UTF8::Validator.new -> validator
79
79
  # validator.valid_encoding?(string) -> true or false
80
- # validator.valid_encoding?(string, raise_on_error) -> true or exception
80
+ # validator.valid_encoding?(string, true) -> true or exception
81
81
  #
82
82
  # Parameters:
83
83
  #
84
84
  # string:: the string to validate
85
85
  # raise_on_error:: a flag to indicate failure behavior
86
- #
86
+ #
87
+ # When raise_on_error is _true_ and a string fails validation, an
88
+ # error of type #{UTF8::ValidationError} is raised. The byte in error
89
+ # and the location of that byte are described in the error message.
87
90
  #
88
91
  def valid_encoding?(string, raise_on_error = false)
89
92
  bytes = string.bytes
@@ -169,7 +172,7 @@ class Validator
169
172
  else
170
173
  valid = false
171
174
  break
172
- end # of the inner case
175
+ end # of the inner case, the 'start' state
173
176
 
174
177
  # The last continuation byte of a 2, 3, or 4 byte character
175
178
  # State: 'a'
@@ -185,6 +188,7 @@ class Validator
185
188
  end
186
189
 
187
190
  # The first continuation byte for most 3 byte characters
191
+ # (those with start bytes in: 0xe1-0xec or 0xee-0xef)
188
192
  # State: 'b'
189
193
  # o Input = 0x80-0xBF: change state to A
190
194
  # o Others: ERROR
@@ -198,6 +202,7 @@ class Validator
198
202
  end
199
203
 
200
204
  # The first continuation byte for some special 3 byte characters
205
+ # (those with start byte 0xe0)
201
206
  # State: 'c'
202
207
  # o Input = 0xA0-0xBF: change state to A
203
208
  # o Others: ERROR
@@ -211,6 +216,7 @@ class Validator
211
216
  end
212
217
 
213
218
  # The first continuation byte for the remaining 3 byte characters
219
+ # (those with start byte 0xed)
214
220
  # State: 'd'
215
221
  # o Input = 0x80-0x9F: change state to A
216
222
  # o Others: ERROR
@@ -224,6 +230,7 @@ class Validator
224
230
  end
225
231
 
226
232
  # The first continuation byte for some 4 byte characters
233
+ # (those with start bytes in: 0xf1-0xf3)
227
234
  # State: 'e'
228
235
  # o Input = 0x80-0xBF: change state to B
229
236
  # o Others: ERROR
@@ -237,6 +244,7 @@ class Validator
237
244
  end
238
245
 
239
246
  # The first continuation byte for some special 4 byte characters
247
+ # (those with start byte 0xf0)
240
248
  # State: 'f'
241
249
  # o Input = 0x90-0xBF: change state to B
242
250
  # o Others: ERROR
@@ -250,6 +258,7 @@ class Validator
250
258
  end
251
259
 
252
260
  # The first continuation byte for the remaining 4 byte characters
261
+ # (those with start byte 0xf4)
253
262
  # State: 'g'
254
263
  # o Input = 0x80-0x8F: change state to B
255
264
  # o Others: ERROR
@@ -271,10 +280,12 @@ class Validator
271
280
  puts "State at end: #{state}" if DEBUG
272
281
  # Catch truncation at end of string
273
282
  if valid and state != 'start'
283
+ puts "Resetting valid value" if DEBUG
274
284
  valid = false
275
285
  end
276
286
  #
277
287
  if !valid and raise_on_error
288
+ puts "Raising Error" if DEBUG
278
289
  raise ValidationError, "Invalid byte:#{next_byte_save}(0x#{nb_hex}),index:#{index}(0x#{ni_hex})"
279
290
  end
280
291
  #
@@ -34,5 +34,15 @@ class TestRaiseRequect < Test::Unit::TestCase
34
34
  end
35
35
  end
36
36
 
37
+ # Check message from raise
38
+ def test_0030_check_raise_message
39
+ #
40
+ begin
41
+ @validator.valid_encoding?("a\xffb\xfec", true)
42
+ rescue UTF8::ValidationError => e
43
+ assert e.message =~ /^Invalid byte/
44
+ end
45
+ end
46
+
37
47
  end
38
48
 
@@ -7,6 +7,10 @@ require 'helper'
7
7
  #
8
8
  # Tests for the #{UTF8::Validator} implementation.
9
9
  #
10
+ # Some test data pulled directly from:
11
+ #
12
+ # http://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-test.txt
13
+ #
10
14
  class TestUtf8Validator < Test::Unit::TestCase
11
15
  #
12
16
  def setup
@@ -92,6 +96,22 @@ class TestUtf8Validator < Test::Unit::TestCase
92
96
  end
93
97
  end
94
98
 
99
+ # Boundary conditions
100
+ def test_0070_boundary_conditions
101
+ test_data = [
102
+ "\xed\x9f\xbf", # = "\ud7ff"
103
+ "\xee\x80\x80", # = "\ue000"
104
+ "\xef\xbf\xbd", # = "\ufffd"
105
+ # "\xf4\x8f\xbf\xbf", # = "\U0010ffff" / maybe _should_ fail ??
106
+ # "\xf4\x90\x80\x80", # = "\ufffd" / maybe _should_ fail ?? / research
107
+
108
+ ]
109
+ test_data.each do |string|
110
+ assert @validator.valid_encoding?(string), "boundary conditions: #{string}"
111
+ assert string.force_encoding("UTF-8").valid_encoding?, "boundary conditions 19: #{string}" if RUBY_VERSION =~ /1\.9/
112
+ end
113
+ end
114
+
95
115
  #--
96
116
  # Validation should fail for the following tests
97
117
  #--
@@ -108,8 +128,13 @@ class TestUtf8Validator < Test::Unit::TestCase
108
128
  # UTF-16 Surrogate Halves
109
129
  def test0520_utf16_surrogate_halves
110
130
  test_data = [
111
- "\xed\xa0\x80", # u-800 (lowest)
112
- "\xed\xbf\xbf", # u-fff (highest)
131
+ "\xed\xa0\x80",
132
+ "\xed\xad\xbf",
133
+ "\xed\xae\x80",
134
+ "\xed\xaf\xbf",
135
+ "\xed\xb0\x80",
136
+ "\xed\xbe\x80",
137
+ "\xed\xbf\xbf",
113
138
  ]
114
139
  test_data.each do |string|
115
140
  assert !@validator.valid_encoding?(string), "UTF-16 Surrogate Halves: #{string}"
@@ -117,6 +142,14 @@ class TestUtf8Validator < Test::Unit::TestCase
117
142
  end
118
143
  end
119
144
 
145
+ #--
146
+ # I do not see a need to test UTF-16 surrogate pairs. They are guaranteed
147
+ # to always fail if the preceding test succeeds. This is because the
148
+ # preceding test data values are always the first surrogate of the pair.
149
+ #
150
+ # UTF-16 surrogates are clearly something I do not understand.
151
+ #--
152
+
120
153
  # Invalid single bytes
121
154
  def test0530_invalid_single_bytes
122
155
  test_data = [
@@ -222,5 +255,21 @@ class TestUtf8Validator < Test::Unit::TestCase
222
255
  end
223
256
  end
224
257
 
258
+
259
+ # Maximum overlong sequences
260
+ def test0580_max_overlong_seqs
261
+ test_data = [
262
+ "\xc1\xbf",
263
+ "\xe0\x9f\xbf",
264
+ "\xf0\x8f\xbf\xbf",
265
+ "\xf8\x87\xbf\xbf\xbf",
266
+ "\xfc\x83\xbf\xbf\xbf\xbf",
267
+ ]
268
+ test_data.each do |string|
269
+ assert !@validator.valid_encoding?(string), "max overlong seq: #{string}"
270
+ assert !string.force_encoding("UTF-8").valid_encoding?, "max overlong seq 19: #{string}" if RUBY_VERSION =~ /1\.9/
271
+ end
272
+ end
273
+
225
274
  end
226
275
 
@@ -5,11 +5,11 @@
5
5
 
6
6
  Gem::Specification.new do |s|
7
7
  s.name = %q{utf8_validator}
8
- s.version = "0.0.1"
8
+ s.version = "0.0.2"
9
9
 
10
10
  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
11
11
  s.authors = ["Guy Allard"]
12
- s.date = %q{2011-01-25}
12
+ s.date = %q{2011-01-26}
13
13
  s.description = %q{A State Machine implementation of a UTF-8 Encoding
14
14
  Validation algorithm.}
15
15
  s.email = %q{allard.guy.m@gmail.com}
@@ -25,6 +25,7 @@ Validation algorithm.}
25
25
  "README.rdoc",
26
26
  "Rakefile",
27
27
  "VERSION",
28
+ "examples/fullstring.rb",
28
29
  "lib/utf8_validator.rb",
29
30
  "lib/validation/errors.rb",
30
31
  "lib/validation/validator.rb",
@@ -39,6 +40,7 @@ Validation algorithm.}
39
40
  s.rubygems_version = %q{1.3.7}
40
41
  s.summary = %q{A UTF-8 Encoding Validator.}
41
42
  s.test_files = [
43
+ "examples/fullstring.rb",
42
44
  "test/helper.rb",
43
45
  "test/test_raise_request.rb",
44
46
  "test/test_utf8_validator.rb"
@@ -52,18 +54,15 @@ Validation algorithm.}
52
54
  s.add_development_dependency(%q<bundler>, ["~> 1.0.0"])
53
55
  s.add_development_dependency(%q<jeweler>, ["~> 1.5.2"])
54
56
  s.add_development_dependency(%q<rcov>, [">= 0"])
55
- s.add_development_dependency(%q<bundler>, [">= 2.1.2"])
56
57
  else
57
58
  s.add_dependency(%q<bundler>, ["~> 1.0.0"])
58
59
  s.add_dependency(%q<jeweler>, ["~> 1.5.2"])
59
60
  s.add_dependency(%q<rcov>, [">= 0"])
60
- s.add_dependency(%q<bundler>, [">= 2.1.2"])
61
61
  end
62
62
  else
63
63
  s.add_dependency(%q<bundler>, ["~> 1.0.0"])
64
64
  s.add_dependency(%q<jeweler>, ["~> 1.5.2"])
65
65
  s.add_dependency(%q<rcov>, [">= 0"])
66
- s.add_dependency(%q<bundler>, [">= 2.1.2"])
67
66
  end
68
67
  end
69
68
 
metadata CHANGED
@@ -5,8 +5,8 @@ version: !ruby/object:Gem::Version
5
5
  segments:
6
6
  - 0
7
7
  - 0
8
- - 1
9
- version: 0.0.1
8
+ - 2
9
+ version: 0.0.2
10
10
  platform: ruby
11
11
  authors:
12
12
  - Guy Allard
@@ -14,7 +14,7 @@ autorequire:
14
14
  bindir: bin
15
15
  cert_chain: []
16
16
 
17
- date: 2011-01-25 00:00:00 -05:00
17
+ date: 2011-01-26 00:00:00 -05:00
18
18
  default_executable:
19
19
  dependencies:
20
20
  - !ruby/object:Gem::Dependency
@@ -60,21 +60,6 @@ dependencies:
60
60
  type: :development
61
61
  prerelease: false
62
62
  version_requirements: *id003
63
- - !ruby/object:Gem::Dependency
64
- name: bundler
65
- requirement: &id004 !ruby/object:Gem::Requirement
66
- none: false
67
- requirements:
68
- - - ">="
69
- - !ruby/object:Gem::Version
70
- segments:
71
- - 2
72
- - 1
73
- - 2
74
- version: 2.1.2
75
- type: :development
76
- prerelease: false
77
- version_requirements: *id004
78
63
  description: |-
79
64
  A State Machine implementation of a UTF-8 Encoding
80
65
  Validation algorithm.
@@ -94,6 +79,7 @@ files:
94
79
  - README.rdoc
95
80
  - Rakefile
96
81
  - VERSION
82
+ - examples/fullstring.rb
97
83
  - lib/utf8_validator.rb
98
84
  - lib/validation/errors.rb
99
85
  - lib/validation/validator.rb
@@ -115,7 +101,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
115
101
  requirements:
116
102
  - - ">="
117
103
  - !ruby/object:Gem::Version
118
- hash: -1120544117494340473
104
+ hash: 2884485592009813991
119
105
  segments:
120
106
  - 0
121
107
  version: "0"
@@ -135,6 +121,7 @@ signing_key:
135
121
  specification_version: 3
136
122
  summary: A UTF-8 Encoding Validator.
137
123
  test_files:
124
+ - examples/fullstring.rb
138
125
  - test/helper.rb
139
126
  - test/test_raise_request.rb
140
127
  - test/test_utf8_validator.rb