utf8_validator 0.0.1 → 0.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.rdoc +1 -0
- data/Rakefile +4 -4
- data/VERSION +1 -1
- data/examples/fullstring.rb +79 -0
- data/lib/utf8_validator.rb +3 -1
- data/lib/validation/errors.rb +3 -3
- data/lib/validation/validator.rb +17 -6
- data/test/test_raise_request.rb +10 -0
- data/test/test_utf8_validator.rb +51 -2
- data/utf8_validator.gemspec +4 -5
- metadata +6 -19
data/README.rdoc
CHANGED
@@ -16,6 +16,7 @@ That functionality is left as an exercise for the reader.
|
|
16
16
|
|
17
17
|
The Unicode Consortium:: At http://unicode.org/ for all the information published there.
|
18
18
|
Frank Yung-Fong Tang:: For the state machine algorithm. See: http://unicode.org/mail-arch/unicode-ml/y2003-m02/att-0467/01-The_Algorithm_to_Valide_an_UTF-8_String
|
19
|
+
Markus Kuhn:: For invalid test data. http://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-test.txt
|
19
20
|
|
20
21
|
== A Word On Ruby Versions
|
21
22
|
|
data/Rakefile
CHANGED
@@ -21,11 +21,11 @@ Validation algorithm.}
|
|
21
21
|
gem.email = "allard.guy.m@gmail.com"
|
22
22
|
gem.authors = ["Guy Allard"]
|
23
23
|
|
24
|
-
#
|
25
|
-
# and development dependencies are only needed for development (ie running rake tasks, tests, etc)
|
24
|
+
# Runtime Dependencies - None at present
|
26
25
|
# gem.add_runtime_dependency 'jabber4r', '> 0.1'
|
27
|
-
|
28
|
-
|
26
|
+
#
|
27
|
+
# Bundler/Jeweler takes care of this via the Gemfile.lock process
|
28
|
+
# gem.add_development_dependency 'bundler', '>= 2.1.2'
|
29
29
|
end
|
30
30
|
Jeweler::RubygemsDotOrgTasks.new
|
31
31
|
|
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.0.1
|
1
|
+
0.0.2
|
@@ -0,0 +1,79 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
#
|
3
|
+
# Show how to parse a full string with multiple UTF8 validation failures.
|
4
|
+
# Accumulate error information, and report it.
|
5
|
+
#
|
6
|
+
require 'rubygems' unless RUBY_VERSION =~ /1\.9/
|
7
|
+
require 'utf8_validator'
|
8
|
+
#
|
9
|
+
# = Purpose
|
10
|
+
#
|
11
|
+
# A helper class for processing multiple validation errors in a single string.
|
12
|
+
#
|
13
|
+
class ValidationHelper
|
14
|
+
#
|
15
|
+
attr_reader :error_list
|
16
|
+
#
|
17
|
+
# Get a validator instance.
|
18
|
+
#
|
19
|
+
def initialize
|
20
|
+
@validator = UTF8::Validator.new
|
21
|
+
end
|
22
|
+
#
|
23
|
+
# Validate the whole string.
|
24
|
+
#
|
25
|
+
def scanstring(string)
|
26
|
+
@error_list = ""
|
27
|
+
work_string = string
|
28
|
+
run_pos = 0
|
29
|
+
begin
|
30
|
+
@validator.valid_encoding?(work_string, true)
|
31
|
+
rescue UTF8::ValidationError => e
|
32
|
+
# Extract offset of error, keep running offset up to date
|
33
|
+
last_colon = e.message.rindex(':')
|
34
|
+
last_lparen = e.message.rindex('(')
|
35
|
+
epos = e.message[last_colon+1..last_lparen-1]
|
36
|
+
sub_start = epos.to_i
|
37
|
+
if run_pos == 0
|
38
|
+
run_pos += sub_start
|
39
|
+
else
|
40
|
+
run_pos += sub_start + 1
|
41
|
+
end
|
42
|
+
# Start again at error offset + 1
|
43
|
+
work_string = work_string[sub_start+1..-1]
|
44
|
+
# Build next error message
|
45
|
+
next_emsg = e.message[0..last_colon] # Part A of current message
|
46
|
+
# Add running offset position
|
47
|
+
run_pos_str = sprintf "%d(0x%x)", run_pos, run_pos
|
48
|
+
next_emsg += run_pos_str
|
49
|
+
#
|
50
|
+
@error_list += next_emsg
|
51
|
+
@error_list += "\n"
|
52
|
+
retry
|
53
|
+
end
|
54
|
+
end
|
55
|
+
end
|
56
|
+
#
|
57
|
+
puts "Started"
|
58
|
+
puts
|
59
|
+
#
|
60
|
+
helper = ValidationHelper.new
|
61
|
+
#
|
62
|
+
test_data = [
|
63
|
+
"a\xffbc\xfed",
|
64
|
+
"abcdefghijk\xffbcdefghijk\xfecdefg",
|
65
|
+
"anoerrorsz",
|
66
|
+
"errorlast\x80",
|
67
|
+
"a\xffbcd\xfeefgh\xfd123",
|
68
|
+
]
|
69
|
+
#
|
70
|
+
test_data.each do |string|
|
71
|
+
puts "/" * 60
|
72
|
+
puts "#{string}"
|
73
|
+
helper.scanstring(string)
|
74
|
+
puts "#{helper.error_list}"
|
75
|
+
end
|
76
|
+
#
|
77
|
+
puts
|
78
|
+
puts "Complete"
|
79
|
+
|
data/lib/utf8_validator.rb
CHANGED
data/lib/validation/errors.rb
CHANGED
data/lib/validation/validator.rb
CHANGED
@@ -60,30 +60,33 @@ module UTF8
|
|
60
60
|
# Instances of this class are thread safe, and a single instance may be used
|
61
61
|
# safely by multiple concurrent threads, with one caveat:
|
62
62
|
#
|
63
|
-
# The value of #{
|
63
|
+
# The value of #{DEBUG} must not be changed by any thread.
|
64
64
|
#
|
65
65
|
#--
|
66
66
|
# Copyright (c) 2011 Guy Allard
|
67
|
-
|
67
|
+
#--
|
68
68
|
class Validator
|
69
69
|
#
|
70
70
|
# For use during development only.
|
71
71
|
#
|
72
72
|
DEBUG=false
|
73
|
-
|
74
73
|
#
|
75
74
|
# Validate the supplied string for proper UTF-8 encoding.
|
76
75
|
#
|
77
76
|
# Calling Sequence:
|
78
77
|
#
|
78
|
+
# validator = UTF8::Validator.new -> validator
|
79
79
|
# validator.valid_encoding?(string) -> true or false
|
80
|
-
# validator.valid_encoding?(string,
|
80
|
+
# validator.valid_encoding?(string, true) -> true or exception
|
81
81
|
#
|
82
82
|
# Parameters:
|
83
83
|
#
|
84
84
|
# string:: the string to validate
|
85
85
|
# raise_on_error:: a flag to indicate failure behavior
|
86
|
-
#
|
86
|
+
#
|
87
|
+
# When raise_on_error is _true_ and a string fails validation, an
|
88
|
+
# error of type #{UTF8::ValidationError} is raised. The byte in error
|
89
|
+
# and the location of that byte are described in the error message.
|
87
90
|
#
|
88
91
|
def valid_encoding?(string, raise_on_error = false)
|
89
92
|
bytes = string.bytes
|
@@ -169,7 +172,7 @@ class Validator
|
|
169
172
|
else
|
170
173
|
valid = false
|
171
174
|
break
|
172
|
-
end # of the inner case
|
175
|
+
end # of the inner case, the 'start' state
|
173
176
|
|
174
177
|
# The last continuation byte of a 2, 3, or 4 byte character
|
175
178
|
# State: 'a'
|
@@ -185,6 +188,7 @@ class Validator
|
|
185
188
|
end
|
186
189
|
|
187
190
|
# The first continuation byte for most 3 byte characters
|
191
|
+
# (those with start bytes in: 0xe1-0xec or 0xee-0xef)
|
188
192
|
# State: 'b'
|
189
193
|
# o Input = 0x80-0xBF: change state to A
|
190
194
|
# o Others: ERROR
|
@@ -198,6 +202,7 @@ class Validator
|
|
198
202
|
end
|
199
203
|
|
200
204
|
# The first continuation byte for some special 3 byte characters
|
205
|
+
# (those with start byte 0xe0)
|
201
206
|
# State: 'c'
|
202
207
|
# o Input = 0xA0-0xBF: change state to A
|
203
208
|
# o Others: ERROR
|
@@ -211,6 +216,7 @@ class Validator
|
|
211
216
|
end
|
212
217
|
|
213
218
|
# The first continuation byte for the remaining 3 byte characters
|
219
|
+
# (those with start byte 0xed)
|
214
220
|
# State: 'd'
|
215
221
|
# o Input = 0x80-0x9F: change state to A
|
216
222
|
# o Others: ERROR
|
@@ -224,6 +230,7 @@ class Validator
|
|
224
230
|
end
|
225
231
|
|
226
232
|
# The first continuation byte for some 4 byte characters
|
233
|
+
# (those with start bytes in: 0xf1-0xf3)
|
227
234
|
# State: 'e'
|
228
235
|
# o Input = 0x80-0xBF: change state to B
|
229
236
|
# o Others: ERROR
|
@@ -237,6 +244,7 @@ class Validator
|
|
237
244
|
end
|
238
245
|
|
239
246
|
# The first continuation byte for some special 4 byte characters
|
247
|
+
# (those with start byte 0xf0)
|
240
248
|
# State: 'f'
|
241
249
|
# o Input = 0x90-0xBF: change state to B
|
242
250
|
# o Others: ERROR
|
@@ -250,6 +258,7 @@ class Validator
|
|
250
258
|
end
|
251
259
|
|
252
260
|
# The first continuation byte for the remaining 4 byte characters
|
261
|
+
# (those with start byte 0xf4)
|
253
262
|
# State: 'g'
|
254
263
|
# o Input = 0x80-0x8F: change state to B
|
255
264
|
# o Others: ERROR
|
@@ -271,10 +280,12 @@ class Validator
|
|
271
280
|
puts "State at end: #{state}" if DEBUG
|
272
281
|
# Catch truncation at end of string
|
273
282
|
if valid and state != 'start'
|
283
|
+
puts "Resetting valid value" if DEBUG
|
274
284
|
valid = false
|
275
285
|
end
|
276
286
|
#
|
277
287
|
if !valid and raise_on_error
|
288
|
+
puts "Raising Error" if DEBUG
|
278
289
|
raise ValidationError, "Invalid byte:#{next_byte_save}(0x#{nb_hex}),index:#{index}(0x#{ni_hex})"
|
279
290
|
end
|
280
291
|
#
|
data/test/test_raise_request.rb
CHANGED
@@ -34,5 +34,15 @@ class TestRaiseRequect < Test::Unit::TestCase
|
|
34
34
|
end
|
35
35
|
end
|
36
36
|
|
37
|
+
# Check message from raise
|
38
|
+
def test_0030_check_raise_message
|
39
|
+
#
|
40
|
+
begin
|
41
|
+
@validator.valid_encoding?("a\xffb\xfec", true)
|
42
|
+
rescue UTF8::ValidationError => e
|
43
|
+
assert e.message =~ /^Invalid byte/
|
44
|
+
end
|
45
|
+
end
|
46
|
+
|
37
47
|
end
|
38
48
|
|
data/test/test_utf8_validator.rb
CHANGED
@@ -7,6 +7,10 @@ require 'helper'
|
|
7
7
|
#
|
8
8
|
# Tests for the #{UTF8::Validator} implementation.
|
9
9
|
#
|
10
|
+
# Some test data pulled directly from:
|
11
|
+
#
|
12
|
+
# http://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-test.txt
|
13
|
+
#
|
10
14
|
class TestUtf8Validator < Test::Unit::TestCase
|
11
15
|
#
|
12
16
|
def setup
|
@@ -92,6 +96,22 @@ class TestUtf8Validator < Test::Unit::TestCase
|
|
92
96
|
end
|
93
97
|
end
|
94
98
|
|
99
|
+
# Boundary conditions
|
100
|
+
def test_0070_boundary_conditions
|
101
|
+
test_data = [
|
102
|
+
"\xed\x9f\xbf", # = "\ud7ff"
|
103
|
+
"\xee\x80\x80", # = "\ue000"
|
104
|
+
"\xef\xbf\xbd", # = "\ufffd"
|
105
|
+
# "\xf4\x8f\xbf\xbf", # = "\U0010ffff" / maybe _should_ fail ??
|
106
|
+
# "\xf4\x90\x80\x80", # = "\ufffd" / maybe _should_ fail ?? / research
|
107
|
+
|
108
|
+
]
|
109
|
+
test_data.each do |string|
|
110
|
+
assert @validator.valid_encoding?(string), "boundary conditions: #{string}"
|
111
|
+
assert string.force_encoding("UTF-8").valid_encoding?, "boundary conditions 19: #{string}" if RUBY_VERSION =~ /1\.9/
|
112
|
+
end
|
113
|
+
end
|
114
|
+
|
95
115
|
#--
|
96
116
|
# Validation should fail for the following tests
|
97
117
|
#--
|
@@ -108,8 +128,13 @@ class TestUtf8Validator < Test::Unit::TestCase
|
|
108
128
|
# UTF-16 Surrogate Halves
|
109
129
|
def test0520_utf16_surrogate_halves
|
110
130
|
test_data = [
|
111
|
-
"\xed\xa0\x80",
|
112
|
-
"\xed\
|
131
|
+
"\xed\xa0\x80",
|
132
|
+
"\xed\xad\xbf",
|
133
|
+
"\xed\xae\x80",
|
134
|
+
"\xed\xaf\xbf",
|
135
|
+
"\xed\xb0\x80",
|
136
|
+
"\xed\xbe\x80",
|
137
|
+
"\xed\xbf\xbf",
|
113
138
|
]
|
114
139
|
test_data.each do |string|
|
115
140
|
assert !@validator.valid_encoding?(string), "UTF-16 Surrogate Halves: #{string}"
|
@@ -117,6 +142,14 @@ class TestUtf8Validator < Test::Unit::TestCase
|
|
117
142
|
end
|
118
143
|
end
|
119
144
|
|
145
|
+
#--
|
146
|
+
# I do not see a need to test UTF-16 surrogate pairs. They are guaranteed
|
147
|
+
# to always fail if the preceding test succeeds. This is because the
|
148
|
+
# preceding test data values are always the first surrogate of the pair.
|
149
|
+
#
|
150
|
+
# UTF-16 surrogates are clearly something I do not understand.
|
151
|
+
#--
|
152
|
+
|
120
153
|
# Invalid single bytes
|
121
154
|
def test0530_invalid_single_bytes
|
122
155
|
test_data = [
|
@@ -222,5 +255,21 @@ class TestUtf8Validator < Test::Unit::TestCase
|
|
222
255
|
end
|
223
256
|
end
|
224
257
|
|
258
|
+
|
259
|
+
# Maximum overlong sequences
|
260
|
+
def test0580_max_overlong_seqs
|
261
|
+
test_data = [
|
262
|
+
"\xc1\xbf",
|
263
|
+
"\xe0\x9f\xbf",
|
264
|
+
"\xf0\x8f\xbf\xbf",
|
265
|
+
"\xf8\x87\xbf\xbf\xbf",
|
266
|
+
"\xfc\x83\xbf\xbf\xbf\xbf",
|
267
|
+
]
|
268
|
+
test_data.each do |string|
|
269
|
+
assert !@validator.valid_encoding?(string), "max overlong seq: #{string}"
|
270
|
+
assert !string.force_encoding("UTF-8").valid_encoding?, "max overlong seq 19: #{string}" if RUBY_VERSION =~ /1\.9/
|
271
|
+
end
|
272
|
+
end
|
273
|
+
|
225
274
|
end
|
226
275
|
|
data/utf8_validator.gemspec
CHANGED
@@ -5,11 +5,11 @@
|
|
5
5
|
|
6
6
|
Gem::Specification.new do |s|
|
7
7
|
s.name = %q{utf8_validator}
|
8
|
-
s.version = "0.0.1"
|
8
|
+
s.version = "0.0.2"
|
9
9
|
|
10
10
|
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
11
11
|
s.authors = ["Guy Allard"]
|
12
|
-
s.date = %q{2011-01-
|
12
|
+
s.date = %q{2011-01-26}
|
13
13
|
s.description = %q{A State Machine implementation of a UTF-8 Encoding
|
14
14
|
Validation algorithm.}
|
15
15
|
s.email = %q{allard.guy.m@gmail.com}
|
@@ -25,6 +25,7 @@ Validation algorithm.}
|
|
25
25
|
"README.rdoc",
|
26
26
|
"Rakefile",
|
27
27
|
"VERSION",
|
28
|
+
"examples/fullstring.rb",
|
28
29
|
"lib/utf8_validator.rb",
|
29
30
|
"lib/validation/errors.rb",
|
30
31
|
"lib/validation/validator.rb",
|
@@ -39,6 +40,7 @@ Validation algorithm.}
|
|
39
40
|
s.rubygems_version = %q{1.3.7}
|
40
41
|
s.summary = %q{A UTF-8 Encoding Validator.}
|
41
42
|
s.test_files = [
|
43
|
+
"examples/fullstring.rb",
|
42
44
|
"test/helper.rb",
|
43
45
|
"test/test_raise_request.rb",
|
44
46
|
"test/test_utf8_validator.rb"
|
@@ -52,18 +54,15 @@ Validation algorithm.}
|
|
52
54
|
s.add_development_dependency(%q<bundler>, ["~> 1.0.0"])
|
53
55
|
s.add_development_dependency(%q<jeweler>, ["~> 1.5.2"])
|
54
56
|
s.add_development_dependency(%q<rcov>, [">= 0"])
|
55
|
-
s.add_development_dependency(%q<bundler>, [">= 2.1.2"])
|
56
57
|
else
|
57
58
|
s.add_dependency(%q<bundler>, ["~> 1.0.0"])
|
58
59
|
s.add_dependency(%q<jeweler>, ["~> 1.5.2"])
|
59
60
|
s.add_dependency(%q<rcov>, [">= 0"])
|
60
|
-
s.add_dependency(%q<bundler>, [">= 2.1.2"])
|
61
61
|
end
|
62
62
|
else
|
63
63
|
s.add_dependency(%q<bundler>, ["~> 1.0.0"])
|
64
64
|
s.add_dependency(%q<jeweler>, ["~> 1.5.2"])
|
65
65
|
s.add_dependency(%q<rcov>, [">= 0"])
|
66
|
-
s.add_dependency(%q<bundler>, [">= 2.1.2"])
|
67
66
|
end
|
68
67
|
end
|
69
68
|
|
metadata
CHANGED
@@ -5,8 +5,8 @@ version: !ruby/object:Gem::Version
|
|
5
5
|
segments:
|
6
6
|
- 0
|
7
7
|
- 0
|
8
|
-
- 1
|
9
|
-
version: 0.0.1
|
8
|
+
- 2
|
9
|
+
version: 0.0.2
|
10
10
|
platform: ruby
|
11
11
|
authors:
|
12
12
|
- Guy Allard
|
@@ -14,7 +14,7 @@ autorequire:
|
|
14
14
|
bindir: bin
|
15
15
|
cert_chain: []
|
16
16
|
|
17
|
-
date: 2011-01-
|
17
|
+
date: 2011-01-26 00:00:00 -05:00
|
18
18
|
default_executable:
|
19
19
|
dependencies:
|
20
20
|
- !ruby/object:Gem::Dependency
|
@@ -60,21 +60,6 @@ dependencies:
|
|
60
60
|
type: :development
|
61
61
|
prerelease: false
|
62
62
|
version_requirements: *id003
|
63
|
-
- !ruby/object:Gem::Dependency
|
64
|
-
name: bundler
|
65
|
-
requirement: &id004 !ruby/object:Gem::Requirement
|
66
|
-
none: false
|
67
|
-
requirements:
|
68
|
-
- - ">="
|
69
|
-
- !ruby/object:Gem::Version
|
70
|
-
segments:
|
71
|
-
- 2
|
72
|
-
- 1
|
73
|
-
- 2
|
74
|
-
version: 2.1.2
|
75
|
-
type: :development
|
76
|
-
prerelease: false
|
77
|
-
version_requirements: *id004
|
78
63
|
description: |-
|
79
64
|
A State Machine implementation of a UTF-8 Encoding
|
80
65
|
Validation algorithm.
|
@@ -94,6 +79,7 @@ files:
|
|
94
79
|
- README.rdoc
|
95
80
|
- Rakefile
|
96
81
|
- VERSION
|
82
|
+
- examples/fullstring.rb
|
97
83
|
- lib/utf8_validator.rb
|
98
84
|
- lib/validation/errors.rb
|
99
85
|
- lib/validation/validator.rb
|
@@ -115,7 +101,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
115
101
|
requirements:
|
116
102
|
- - ">="
|
117
103
|
- !ruby/object:Gem::Version
|
118
|
-
hash:
|
104
|
+
hash: 2884485592009813991
|
119
105
|
segments:
|
120
106
|
- 0
|
121
107
|
version: "0"
|
@@ -135,6 +121,7 @@ signing_key:
|
|
135
121
|
specification_version: 3
|
136
122
|
summary: A UTF-8 Encoding Validator.
|
137
123
|
test_files:
|
124
|
+
- examples/fullstring.rb
|
138
125
|
- test/helper.rb
|
139
126
|
- test/test_raise_request.rb
|
140
127
|
- test/test_utf8_validator.rb
|