utf8_validator 0.0.1 → 0.0.2

Sign up to get free protection for your applications and to get access to all the features.
data/README.rdoc CHANGED
@@ -16,6 +16,7 @@ That functionality is left as an exercise for the reader.
16
16
 
17
17
  The Unicode Consortium:: At http://unicode.org/ for all the information published there.
18
18
  Frank Yung-Fong Tang:: For the state machine algorithm. See: http://unicode.org/mail-arch/unicode-ml/y2003-m02/att-0467/01-The_Algorithm_to_Valide_an_UTF-8_String
19
+ Markus Kuhn:: For invalid test data. http://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-test.txt
19
20
 
20
21
  == A Word On Ruby Versions
21
22
 
data/Rakefile CHANGED
@@ -21,11 +21,11 @@ Validation algorithm.}
21
21
  gem.email = "allard.guy.m@gmail.com"
22
22
  gem.authors = ["Guy Allard"]
23
23
 
24
- # Include your dependencies below. Runtime dependencies are required when using your gem,
25
- # and development dependencies are only needed for development (ie running rake tasks, tests, etc)
24
+ # Runtime Dependencies - None at present
26
25
  # gem.add_runtime_dependency 'jabber4r', '> 0.1'
27
-
28
- gem.add_development_dependency 'bundler', '>= 2.1.2'
26
+ #
27
+ # Bundler/Jeweler takes care of this via the Gemfile.lock process
28
+ # gem.add_development_dependency 'bundler', '>= 2.1.2'
29
29
  end
30
30
  Jeweler::RubygemsDotOrgTasks.new
31
31
 
data/VERSION CHANGED
@@ -1 +1 @@
1
- 0.0.1
1
+ 0.0.2
@@ -0,0 +1,79 @@
1
+ # encoding: utf-8
2
+ #
3
+ # Show how to parse a full string with multiple UTF8 validation failures.
4
+ # Accumulate error information, and report it.
5
+ #
6
+ require 'rubygems' unless RUBY_VERSION =~ /1\.9/
7
+ require 'utf8_validator'
8
+ #
9
+ # = Purpose
10
+ #
11
+ # A helper class for processing multiple validation errors in a single string.
12
+ #
13
+ class ValidationHelper
14
+ #
15
+ attr_reader :error_list
16
+ #
17
+ # Get a validator instance.
18
+ #
19
+ def initialize
20
+ @validator = UTF8::Validator.new
21
+ end
22
+ #
23
+ # Validate the whole string.
24
+ #
25
+ def scanstring(string)
26
+ @error_list = ""
27
+ work_string = string
28
+ run_pos = 0
29
+ begin
30
+ @validator.valid_encoding?(work_string, true)
31
+ rescue UTF8::ValidationError => e
32
+ # Extract offset of error, keep running offset up to date
33
+ last_colon = e.message.rindex(':')
34
+ last_lparen = e.message.rindex('(')
35
+ epos = e.message[last_colon+1..last_lparen-1]
36
+ sub_start = epos.to_i
37
+ if run_pos == 0
38
+ run_pos += sub_start
39
+ else
40
+ run_pos += sub_start + 1
41
+ end
42
+ # Start again at error offset + 1
43
+ work_string = work_string[sub_start+1..-1]
44
+ # Build next error message
45
+ next_emsg = e.message[0..last_colon] # Part A of current message
46
+ # Add running offset position
47
+ run_pos_str = sprintf "%d(0x%x)", run_pos, run_pos
48
+ next_emsg += run_pos_str
49
+ #
50
+ @error_list += next_emsg
51
+ @error_list += "\n"
52
+ retry
53
+ end
54
+ end
55
+ end
56
+ #
57
+ puts "Started"
58
+ puts
59
+ #
60
+ helper = ValidationHelper.new
61
+ #
62
+ test_data = [
63
+ "a\xffbc\xfed",
64
+ "abcdefghijk\xffbcdefghijk\xfecdefg",
65
+ "anoerrorsz",
66
+ "errorlast\x80",
67
+ "a\xffbcd\xfeefgh\xfd123",
68
+ ]
69
+ #
70
+ test_data.each do |string|
71
+ puts "/" * 60
72
+ puts "#{string}"
73
+ helper.scanstring(string)
74
+ puts "#{helper.error_list}"
75
+ end
76
+ #
77
+ puts
78
+ puts "Complete"
79
+
@@ -1,4 +1,6 @@
1
- #
1
+ #--
2
+ # Copyright (c) 2011 Guy Allard
3
+ #--
2
4
  require 'validation/validator'
3
5
  require 'validation/errors'
4
6
 
@@ -1,3 +1,6 @@
1
+ #--
2
+ # Copyright (c) 2011 Guy Allard
3
+ #--
1
4
  module UTF8
2
5
  #
3
6
  # == Purpose
@@ -7,8 +10,5 @@ module UTF8
7
10
  #
8
11
  class ValidationError < ::RuntimeError
9
12
  #
10
- def message()
11
- "general UTF-8 validation error"
12
- end
13
13
  end
14
14
  end
@@ -60,30 +60,33 @@ module UTF8
60
60
  # Instances of this class are thread safe, and a single instance may be used
61
61
  # safely by multiple concurrent threads, with one caveat:
62
62
  #
63
- # The value of #{Validator::DEBUG} must not be changed by any thread.
63
+ # The value of #{DEBUG} must not be changed by any thread.
64
64
  #
65
65
  #--
66
66
  # Copyright (c) 2011 Guy Allard
67
- #
67
+ #--
68
68
  class Validator
69
69
  #
70
70
  # For use during development only.
71
71
  #
72
72
  DEBUG=false
73
-
74
73
  #
75
74
  # Validate the supplied string for proper UTF-8 encoding.
76
75
  #
77
76
  # Calling Sequence:
78
77
  #
78
+ # validator = UTF8::Validator.new -> validator
79
79
  # validator.valid_encoding?(string) -> true or false
80
- # validator.valid_encoding?(string, raise_on_error) -> true or exception
80
+ # validator.valid_encoding?(string, true) -> true or exception
81
81
  #
82
82
  # Parameters:
83
83
  #
84
84
  # string:: the string to validate
85
85
  # raise_on_error:: a flag to indicate failure behavior
86
- #
86
+ #
87
+ # When raise_on_error is _true_ and a string fails validation, an
88
+ # error of type #{UTF8::ValidationError} is raised. The byte in error
89
+ # and the location of that byte are described in the error message.
87
90
  #
88
91
  def valid_encoding?(string, raise_on_error = false)
89
92
  bytes = string.bytes
@@ -169,7 +172,7 @@ class Validator
169
172
  else
170
173
  valid = false
171
174
  break
172
- end # of the inner case
175
+ end # of the inner case, the 'start' state
173
176
 
174
177
  # The last continuation byte of a 2, 3, or 4 byte character
175
178
  # State: 'a'
@@ -185,6 +188,7 @@ class Validator
185
188
  end
186
189
 
187
190
  # The first continuation byte for most 3 byte characters
191
+ # (those with start bytes in: 0xe1-0xec or 0xee-0xef)
188
192
  # State: 'b'
189
193
  # o Input = 0x80-0xBF: change state to A
190
194
  # o Others: ERROR
@@ -198,6 +202,7 @@ class Validator
198
202
  end
199
203
 
200
204
  # The first continuation byte for some special 3 byte characters
205
+ # (those with start byte 0xe0)
201
206
  # State: 'c'
202
207
  # o Input = 0xA0-0xBF: change state to A
203
208
  # o Others: ERROR
@@ -211,6 +216,7 @@ class Validator
211
216
  end
212
217
 
213
218
  # The first continuation byte for the remaining 3 byte characters
219
+ # (those with start byte 0xed)
214
220
  # State: 'd'
215
221
  # o Input = 0x80-0x9F: change state to A
216
222
  # o Others: ERROR
@@ -224,6 +230,7 @@ class Validator
224
230
  end
225
231
 
226
232
  # The first continuation byte for some 4 byte characters
233
+ # (those with start bytes in: 0xf1-0xf3)
227
234
  # State: 'e'
228
235
  # o Input = 0x80-0xBF: change state to B
229
236
  # o Others: ERROR
@@ -237,6 +244,7 @@ class Validator
237
244
  end
238
245
 
239
246
  # The first continuation byte for some special 4 byte characters
247
+ # (those with start byte 0xf0)
240
248
  # State: 'f'
241
249
  # o Input = 0x90-0xBF: change state to B
242
250
  # o Others: ERROR
@@ -250,6 +258,7 @@ class Validator
250
258
  end
251
259
 
252
260
  # The first continuation byte for the remaining 4 byte characters
261
+ # (those with start byte 0xf4)
253
262
  # State: 'g'
254
263
  # o Input = 0x80-0x8F: change state to B
255
264
  # o Others: ERROR
@@ -271,10 +280,12 @@ class Validator
271
280
  puts "State at end: #{state}" if DEBUG
272
281
  # Catch truncation at end of string
273
282
  if valid and state != 'start'
283
+ puts "Resetting valid value" if DEBUG
274
284
  valid = false
275
285
  end
276
286
  #
277
287
  if !valid and raise_on_error
288
+ puts "Raising Error" if DEBUG
278
289
  raise ValidationError, "Invalid byte:#{next_byte_save}(0x#{nb_hex}),index:#{index}(0x#{ni_hex})"
279
290
  end
280
291
  #
@@ -34,5 +34,15 @@ class TestRaiseRequect < Test::Unit::TestCase
34
34
  end
35
35
  end
36
36
 
37
+ # Check message from raise
38
+ def test_0030_check_raise_message
39
+ #
40
+ begin
41
+ @validator.valid_encoding?("a\xffb\xfec", true)
42
+ rescue UTF8::ValidationError => e
43
+ assert e.message =~ /^Invalid byte/
44
+ end
45
+ end
46
+
37
47
  end
38
48
 
@@ -7,6 +7,10 @@ require 'helper'
7
7
  #
8
8
  # Tests for the #{UTF8::Validator} implementation.
9
9
  #
10
+ # Some test data pulled directly from:
11
+ #
12
+ # http://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-test.txt
13
+ #
10
14
  class TestUtf8Validator < Test::Unit::TestCase
11
15
  #
12
16
  def setup
@@ -92,6 +96,22 @@ class TestUtf8Validator < Test::Unit::TestCase
92
96
  end
93
97
  end
94
98
 
99
+ # Boundary conditions
100
+ def test_0070_boundary_conditions
101
+ test_data = [
102
+ "\xed\x9f\xbf", # = "\ud7ff"
103
+ "\xee\x80\x80", # = "\ue000"
104
+ "\xef\xbf\xbd", # = "\ufffd"
105
+ # "\xf4\x8f\xbf\xbf", # = "\U0010ffff" / maybe _should_ fail ??
106
+ # "\xf4\x90\x80\x80", # = "\ufffd" / maybe _should_ fail ?? / research
107
+
108
+ ]
109
+ test_data.each do |string|
110
+ assert @validator.valid_encoding?(string), "boundary conditions: #{string}"
111
+ assert string.force_encoding("UTF-8").valid_encoding?, "boundary conditions 19: #{string}" if RUBY_VERSION =~ /1\.9/
112
+ end
113
+ end
114
+
95
115
  #--
96
116
  # Validation should fail for the following tests
97
117
  #--
@@ -108,8 +128,13 @@ class TestUtf8Validator < Test::Unit::TestCase
108
128
  # UTF-16 Surrogate Halves
109
129
  def test0520_utf16_surrogate_halves
110
130
  test_data = [
111
- "\xed\xa0\x80", # u-800 (lowest)
112
- "\xed\xbf\xbf", # u-fff (highest)
131
+ "\xed\xa0\x80",
132
+ "\xed\xad\xbf",
133
+ "\xed\xae\x80",
134
+ "\xed\xaf\xbf",
135
+ "\xed\xb0\x80",
136
+ "\xed\xbe\x80",
137
+ "\xed\xbf\xbf",
113
138
  ]
114
139
  test_data.each do |string|
115
140
  assert !@validator.valid_encoding?(string), "UTF-16 Surrogate Halves: #{string}"
@@ -117,6 +142,14 @@ class TestUtf8Validator < Test::Unit::TestCase
117
142
  end
118
143
  end
119
144
 
145
+ #--
146
+ # I do not see a need to test UTF-16 surrogate pairs. They are guaranteed
147
+ # to always fail if the preceding test succeeds. This is because the
148
+ # preceding test data values are always the first surrogate of the pair.
149
+ #
150
+ # UTF-16 surrogates are clearly something I do not understand.
151
+ #--
152
+
120
153
  # Invalid single bytes
121
154
  def test0530_invalid_single_bytes
122
155
  test_data = [
@@ -222,5 +255,21 @@ class TestUtf8Validator < Test::Unit::TestCase
222
255
  end
223
256
  end
224
257
 
258
+
259
+ # Maximum overlong sequences
260
+ def test0580_max_overlong_seqs
261
+ test_data = [
262
+ "\xc1\xbf",
263
+ "\xe0\x9f\xbf",
264
+ "\xf0\x8f\xbf\xbf",
265
+ "\xf8\x87\xbf\xbf\xbf",
266
+ "\xfc\x83\xbf\xbf\xbf\xbf",
267
+ ]
268
+ test_data.each do |string|
269
+ assert !@validator.valid_encoding?(string), "max overlong seq: #{string}"
270
+ assert !string.force_encoding("UTF-8").valid_encoding?, "max overlong seq 19: #{string}" if RUBY_VERSION =~ /1\.9/
271
+ end
272
+ end
273
+
225
274
  end
226
275
 
@@ -5,11 +5,11 @@
5
5
 
6
6
  Gem::Specification.new do |s|
7
7
  s.name = %q{utf8_validator}
8
- s.version = "0.0.1"
8
+ s.version = "0.0.2"
9
9
 
10
10
  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
11
11
  s.authors = ["Guy Allard"]
12
- s.date = %q{2011-01-25}
12
+ s.date = %q{2011-01-26}
13
13
  s.description = %q{A State Machine implementation of a UTF-8 Encoding
14
14
  Validation algorithm.}
15
15
  s.email = %q{allard.guy.m@gmail.com}
@@ -25,6 +25,7 @@ Validation algorithm.}
25
25
  "README.rdoc",
26
26
  "Rakefile",
27
27
  "VERSION",
28
+ "examples/fullstring.rb",
28
29
  "lib/utf8_validator.rb",
29
30
  "lib/validation/errors.rb",
30
31
  "lib/validation/validator.rb",
@@ -39,6 +40,7 @@ Validation algorithm.}
39
40
  s.rubygems_version = %q{1.3.7}
40
41
  s.summary = %q{A UTF-8 Encoding Validator.}
41
42
  s.test_files = [
43
+ "examples/fullstring.rb",
42
44
  "test/helper.rb",
43
45
  "test/test_raise_request.rb",
44
46
  "test/test_utf8_validator.rb"
@@ -52,18 +54,15 @@ Validation algorithm.}
52
54
  s.add_development_dependency(%q<bundler>, ["~> 1.0.0"])
53
55
  s.add_development_dependency(%q<jeweler>, ["~> 1.5.2"])
54
56
  s.add_development_dependency(%q<rcov>, [">= 0"])
55
- s.add_development_dependency(%q<bundler>, [">= 2.1.2"])
56
57
  else
57
58
  s.add_dependency(%q<bundler>, ["~> 1.0.0"])
58
59
  s.add_dependency(%q<jeweler>, ["~> 1.5.2"])
59
60
  s.add_dependency(%q<rcov>, [">= 0"])
60
- s.add_dependency(%q<bundler>, [">= 2.1.2"])
61
61
  end
62
62
  else
63
63
  s.add_dependency(%q<bundler>, ["~> 1.0.0"])
64
64
  s.add_dependency(%q<jeweler>, ["~> 1.5.2"])
65
65
  s.add_dependency(%q<rcov>, [">= 0"])
66
- s.add_dependency(%q<bundler>, [">= 2.1.2"])
67
66
  end
68
67
  end
69
68
 
metadata CHANGED
@@ -5,8 +5,8 @@ version: !ruby/object:Gem::Version
5
5
  segments:
6
6
  - 0
7
7
  - 0
8
- - 1
9
- version: 0.0.1
8
+ - 2
9
+ version: 0.0.2
10
10
  platform: ruby
11
11
  authors:
12
12
  - Guy Allard
@@ -14,7 +14,7 @@ autorequire:
14
14
  bindir: bin
15
15
  cert_chain: []
16
16
 
17
- date: 2011-01-25 00:00:00 -05:00
17
+ date: 2011-01-26 00:00:00 -05:00
18
18
  default_executable:
19
19
  dependencies:
20
20
  - !ruby/object:Gem::Dependency
@@ -60,21 +60,6 @@ dependencies:
60
60
  type: :development
61
61
  prerelease: false
62
62
  version_requirements: *id003
63
- - !ruby/object:Gem::Dependency
64
- name: bundler
65
- requirement: &id004 !ruby/object:Gem::Requirement
66
- none: false
67
- requirements:
68
- - - ">="
69
- - !ruby/object:Gem::Version
70
- segments:
71
- - 2
72
- - 1
73
- - 2
74
- version: 2.1.2
75
- type: :development
76
- prerelease: false
77
- version_requirements: *id004
78
63
  description: |-
79
64
  A State Machine implementation of a UTF-8 Encoding
80
65
  Validation algorithm.
@@ -94,6 +79,7 @@ files:
94
79
  - README.rdoc
95
80
  - Rakefile
96
81
  - VERSION
82
+ - examples/fullstring.rb
97
83
  - lib/utf8_validator.rb
98
84
  - lib/validation/errors.rb
99
85
  - lib/validation/validator.rb
@@ -115,7 +101,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
115
101
  requirements:
116
102
  - - ">="
117
103
  - !ruby/object:Gem::Version
118
- hash: -1120544117494340473
104
+ hash: 2884485592009813991
119
105
  segments:
120
106
  - 0
121
107
  version: "0"
@@ -135,6 +121,7 @@ signing_key:
135
121
  specification_version: 3
136
122
  summary: A UTF-8 Encoding Validator.
137
123
  test_files:
124
+ - examples/fullstring.rb
138
125
  - test/helper.rb
139
126
  - test/test_raise_request.rb
140
127
  - test/test_utf8_validator.rb