utf8_validator 0.0.1 → 0.0.2
Sign up to get free protection for your applications and to get access to all the features.
- data/README.rdoc +1 -0
- data/Rakefile +4 -4
- data/VERSION +1 -1
- data/examples/fullstring.rb +79 -0
- data/lib/utf8_validator.rb +3 -1
- data/lib/validation/errors.rb +3 -3
- data/lib/validation/validator.rb +17 -6
- data/test/test_raise_request.rb +10 -0
- data/test/test_utf8_validator.rb +51 -2
- data/utf8_validator.gemspec +4 -5
- metadata +6 -19
data/README.rdoc
CHANGED
@@ -16,6 +16,7 @@ That functionality is left as an exercise for the reader.
|
|
16
16
|
|
17
17
|
The Unicode Consortium:: At http://unicode.org/ for all the information published there.
|
18
18
|
Frank Yung-Fong Tang:: For the state machine algorithm. See: http://unicode.org/mail-arch/unicode-ml/y2003-m02/att-0467/01-The_Algorithm_to_Valide_an_UTF-8_String
|
19
|
+
Markus Kuhn:: For invalid test data. http://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-test.txt
|
19
20
|
|
20
21
|
== A Word On Ruby Versions
|
21
22
|
|
data/Rakefile
CHANGED
@@ -21,11 +21,11 @@ Validation algorithm.}
|
|
21
21
|
gem.email = "allard.guy.m@gmail.com"
|
22
22
|
gem.authors = ["Guy Allard"]
|
23
23
|
|
24
|
-
#
|
25
|
-
# and development dependencies are only needed for development (ie running rake tasks, tests, etc)
|
24
|
+
# Runtime Dependencies - None at present
|
26
25
|
# gem.add_runtime_dependency 'jabber4r', '> 0.1'
|
27
|
-
|
28
|
-
|
26
|
+
#
|
27
|
+
# Bundler/Jeweler takes care of this via the Gemfile.lock process
|
28
|
+
# gem.add_development_dependency 'bundler', '>= 2.1.2'
|
29
29
|
end
|
30
30
|
Jeweler::RubygemsDotOrgTasks.new
|
31
31
|
|
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.0.1
|
1
|
+
0.0.2
|
@@ -0,0 +1,79 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
#
|
3
|
+
# Show how to parse a full string with multiple UTF8 validation failures.
|
4
|
+
# Accumulate error information, and report it.
|
5
|
+
#
|
6
|
+
require 'rubygems' unless RUBY_VERSION =~ /1\.9/
|
7
|
+
require 'utf8_validator'
|
8
|
+
#
|
9
|
+
# = Purpose
|
10
|
+
#
|
11
|
+
# A helper class for processing multiple validation errors in a single string.
|
12
|
+
#
|
13
|
+
class ValidationHelper
|
14
|
+
#
|
15
|
+
attr_reader :error_list
|
16
|
+
#
|
17
|
+
# Get a validator instance.
|
18
|
+
#
|
19
|
+
def initialize
|
20
|
+
@validator = UTF8::Validator.new
|
21
|
+
end
|
22
|
+
#
|
23
|
+
# Validate the whole string.
|
24
|
+
#
|
25
|
+
def scanstring(string)
|
26
|
+
@error_list = ""
|
27
|
+
work_string = string
|
28
|
+
run_pos = 0
|
29
|
+
begin
|
30
|
+
@validator.valid_encoding?(work_string, true)
|
31
|
+
rescue UTF8::ValidationError => e
|
32
|
+
# Extract offset of error, keep running offset up to date
|
33
|
+
last_colon = e.message.rindex(':')
|
34
|
+
last_lparen = e.message.rindex('(')
|
35
|
+
epos = e.message[last_colon+1..last_lparen-1]
|
36
|
+
sub_start = epos.to_i
|
37
|
+
if run_pos == 0
|
38
|
+
run_pos += sub_start
|
39
|
+
else
|
40
|
+
run_pos += sub_start + 1
|
41
|
+
end
|
42
|
+
# Start again at error offset + 1
|
43
|
+
work_string = work_string[sub_start+1..-1]
|
44
|
+
# Build next error message
|
45
|
+
next_emsg = e.message[0..last_colon] # Part A of current message
|
46
|
+
# Add running offset position
|
47
|
+
run_pos_str = sprintf "%d(0x%x)", run_pos, run_pos
|
48
|
+
next_emsg += run_pos_str
|
49
|
+
#
|
50
|
+
@error_list += next_emsg
|
51
|
+
@error_list += "\n"
|
52
|
+
retry
|
53
|
+
end
|
54
|
+
end
|
55
|
+
end
|
56
|
+
#
|
57
|
+
puts "Started"
|
58
|
+
puts
|
59
|
+
#
|
60
|
+
helper = ValidationHelper.new
|
61
|
+
#
|
62
|
+
test_data = [
|
63
|
+
"a\xffbc\xfed",
|
64
|
+
"abcdefghijk\xffbcdefghijk\xfecdefg",
|
65
|
+
"anoerrorsz",
|
66
|
+
"errorlast\x80",
|
67
|
+
"a\xffbcd\xfeefgh\xfd123",
|
68
|
+
]
|
69
|
+
#
|
70
|
+
test_data.each do |string|
|
71
|
+
puts "/" * 60
|
72
|
+
puts "#{string}"
|
73
|
+
helper.scanstring(string)
|
74
|
+
puts "#{helper.error_list}"
|
75
|
+
end
|
76
|
+
#
|
77
|
+
puts
|
78
|
+
puts "Complete"
|
79
|
+
|
data/lib/utf8_validator.rb
CHANGED
data/lib/validation/errors.rb
CHANGED
data/lib/validation/validator.rb
CHANGED
@@ -60,30 +60,33 @@ module UTF8
|
|
60
60
|
# Instances of this class are thread safe, and a single instance may be used
|
61
61
|
# safely by multiple concurrent threads, with one caveat:
|
62
62
|
#
|
63
|
-
# The value of #{
|
63
|
+
# The value of #{DEBUG} must not be changed by any thread.
|
64
64
|
#
|
65
65
|
#--
|
66
66
|
# Copyright (c) 2011 Guy Allard
|
67
|
-
|
67
|
+
#--
|
68
68
|
class Validator
|
69
69
|
#
|
70
70
|
# For use during development only.
|
71
71
|
#
|
72
72
|
DEBUG=false
|
73
|
-
|
74
73
|
#
|
75
74
|
# Validate the supplied string for proper UTF-8 encoding.
|
76
75
|
#
|
77
76
|
# Calling Sequence:
|
78
77
|
#
|
78
|
+
# validator = UTF8::Validator.new -> validator
|
79
79
|
# validator.valid_encoding?(string) -> true or false
|
80
|
-
# validator.valid_encoding?(string,
|
80
|
+
# validator.valid_encoding?(string, true) -> true or exception
|
81
81
|
#
|
82
82
|
# Parameters:
|
83
83
|
#
|
84
84
|
# string:: the string to validate
|
85
85
|
# raise_on_error:: a flag to indicate failure behavior
|
86
|
-
#
|
86
|
+
#
|
87
|
+
# When raise_on_error is _true_ and a string fails validation, an
|
88
|
+
# error of type #{UTF8::ValidationError} is raised. The byte in error
|
89
|
+
# and the location of that byte are described in the error message.
|
87
90
|
#
|
88
91
|
def valid_encoding?(string, raise_on_error = false)
|
89
92
|
bytes = string.bytes
|
@@ -169,7 +172,7 @@ class Validator
|
|
169
172
|
else
|
170
173
|
valid = false
|
171
174
|
break
|
172
|
-
end # of the inner case
|
175
|
+
end # of the inner case, the 'start' state
|
173
176
|
|
174
177
|
# The last continuation byte of a 2, 3, or 4 byte character
|
175
178
|
# State: 'a'
|
@@ -185,6 +188,7 @@ class Validator
|
|
185
188
|
end
|
186
189
|
|
187
190
|
# The first continuation byte for most 3 byte characters
|
191
|
+
# (those with start bytes in: 0xe1-0xec or 0xee-0xef)
|
188
192
|
# State: 'b'
|
189
193
|
# o Input = 0x80-0xBF: change state to A
|
190
194
|
# o Others: ERROR
|
@@ -198,6 +202,7 @@ class Validator
|
|
198
202
|
end
|
199
203
|
|
200
204
|
# The first continuation byte for some special 3 byte characters
|
205
|
+
# (those with start byte 0xe0)
|
201
206
|
# State: 'c'
|
202
207
|
# o Input = 0xA0-0xBF: change state to A
|
203
208
|
# o Others: ERROR
|
@@ -211,6 +216,7 @@ class Validator
|
|
211
216
|
end
|
212
217
|
|
213
218
|
# The first continuation byte for the remaining 3 byte characters
|
219
|
+
# (those with start byte 0xed)
|
214
220
|
# State: 'd'
|
215
221
|
# o Input = 0x80-0x9F: change state to A
|
216
222
|
# o Others: ERROR
|
@@ -224,6 +230,7 @@ class Validator
|
|
224
230
|
end
|
225
231
|
|
226
232
|
# The first continuation byte for some 4 byte characters
|
233
|
+
# (those with start bytes in: 0xf1-0xf3)
|
227
234
|
# State: 'e'
|
228
235
|
# o Input = 0x80-0xBF: change state to B
|
229
236
|
# o Others: ERROR
|
@@ -237,6 +244,7 @@ class Validator
|
|
237
244
|
end
|
238
245
|
|
239
246
|
# The first continuation byte for some special 4 byte characters
|
247
|
+
# (those with start byte 0xf0)
|
240
248
|
# State: 'f'
|
241
249
|
# o Input = 0x90-0xBF: change state to B
|
242
250
|
# o Others: ERROR
|
@@ -250,6 +258,7 @@ class Validator
|
|
250
258
|
end
|
251
259
|
|
252
260
|
# The first continuation byte for the remaining 4 byte characters
|
261
|
+
# (those with start byte 0xf4)
|
253
262
|
# State: 'g'
|
254
263
|
# o Input = 0x80-0x8F: change state to B
|
255
264
|
# o Others: ERROR
|
@@ -271,10 +280,12 @@ class Validator
|
|
271
280
|
puts "State at end: #{state}" if DEBUG
|
272
281
|
# Catch truncation at end of string
|
273
282
|
if valid and state != 'start'
|
283
|
+
puts "Resetting valid value" if DEBUG
|
274
284
|
valid = false
|
275
285
|
end
|
276
286
|
#
|
277
287
|
if !valid and raise_on_error
|
288
|
+
puts "Raising Error" if DEBUG
|
278
289
|
raise ValidationError, "Invalid byte:#{next_byte_save}(0x#{nb_hex}),index:#{index}(0x#{ni_hex})"
|
279
290
|
end
|
280
291
|
#
|
data/test/test_raise_request.rb
CHANGED
@@ -34,5 +34,15 @@ class TestRaiseRequect < Test::Unit::TestCase
|
|
34
34
|
end
|
35
35
|
end
|
36
36
|
|
37
|
+
# Check message from raise
|
38
|
+
def test_0030_check_raise_message
|
39
|
+
#
|
40
|
+
begin
|
41
|
+
@validator.valid_encoding?("a\xffb\xfec", true)
|
42
|
+
rescue UTF8::ValidationError => e
|
43
|
+
assert e.message =~ /^Invalid byte/
|
44
|
+
end
|
45
|
+
end
|
46
|
+
|
37
47
|
end
|
38
48
|
|
data/test/test_utf8_validator.rb
CHANGED
@@ -7,6 +7,10 @@ require 'helper'
|
|
7
7
|
#
|
8
8
|
# Tests for the #{UTF8::Validator} implementation.
|
9
9
|
#
|
10
|
+
# Some test data pulled directly from:
|
11
|
+
#
|
12
|
+
# http://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-test.txt
|
13
|
+
#
|
10
14
|
class TestUtf8Validator < Test::Unit::TestCase
|
11
15
|
#
|
12
16
|
def setup
|
@@ -92,6 +96,22 @@ class TestUtf8Validator < Test::Unit::TestCase
|
|
92
96
|
end
|
93
97
|
end
|
94
98
|
|
99
|
+
# Boundary conditions
|
100
|
+
def test_0070_boundary_conditions
|
101
|
+
test_data = [
|
102
|
+
"\xed\x9f\xbf", # = "\ud7ff"
|
103
|
+
"\xee\x80\x80", # = "\ue000"
|
104
|
+
"\xef\xbf\xbd", # = "\ufffd"
|
105
|
+
# "\xf4\x8f\xbf\xbf", # = "\U0010ffff" / maybe _should_ fail ??
|
106
|
+
# "\xf4\x90\x80\x80", # = "\ufffd" / maybe _should_ fail ?? / research
|
107
|
+
|
108
|
+
]
|
109
|
+
test_data.each do |string|
|
110
|
+
assert @validator.valid_encoding?(string), "boundary conditions: #{string}"
|
111
|
+
assert string.force_encoding("UTF-8").valid_encoding?, "boundary conditions 19: #{string}" if RUBY_VERSION =~ /1\.9/
|
112
|
+
end
|
113
|
+
end
|
114
|
+
|
95
115
|
#--
|
96
116
|
# Validation should fail for the following tests
|
97
117
|
#--
|
@@ -108,8 +128,13 @@ class TestUtf8Validator < Test::Unit::TestCase
|
|
108
128
|
# UTF-16 Surrogate Halves
|
109
129
|
def test0520_utf16_surrogate_halves
|
110
130
|
test_data = [
|
111
|
-
"\xed\xa0\x80",
|
112
|
-
"\xed\
|
131
|
+
"\xed\xa0\x80",
|
132
|
+
"\xed\xad\xbf",
|
133
|
+
"\xed\xae\x80",
|
134
|
+
"\xed\xaf\xbf",
|
135
|
+
"\xed\xb0\x80",
|
136
|
+
"\xed\xbe\x80",
|
137
|
+
"\xed\xbf\xbf",
|
113
138
|
]
|
114
139
|
test_data.each do |string|
|
115
140
|
assert !@validator.valid_encoding?(string), "UTF-16 Surrogate Halves: #{string}"
|
@@ -117,6 +142,14 @@ class TestUtf8Validator < Test::Unit::TestCase
|
|
117
142
|
end
|
118
143
|
end
|
119
144
|
|
145
|
+
#--
|
146
|
+
# I do not see a need to test UTF-16 surrogate pairs. They are guaranteed
|
147
|
+
# to always fail if the preceding test succeeds. This is because the
|
148
|
+
# preceding test data values are always the first surrogate of the pair.
|
149
|
+
#
|
150
|
+
# UTF-16 surrogates are clearly something I do not understand.
|
151
|
+
#--
|
152
|
+
|
120
153
|
# Invalid single bytes
|
121
154
|
def test0530_invalid_single_bytes
|
122
155
|
test_data = [
|
@@ -222,5 +255,21 @@ class TestUtf8Validator < Test::Unit::TestCase
|
|
222
255
|
end
|
223
256
|
end
|
224
257
|
|
258
|
+
|
259
|
+
# Maximum overlong sequences
|
260
|
+
def test0580_max_overlong_seqs
|
261
|
+
test_data = [
|
262
|
+
"\xc1\xbf",
|
263
|
+
"\xe0\x9f\xbf",
|
264
|
+
"\xf0\x8f\xbf\xbf",
|
265
|
+
"\xf8\x87\xbf\xbf\xbf",
|
266
|
+
"\xfc\x83\xbf\xbf\xbf\xbf",
|
267
|
+
]
|
268
|
+
test_data.each do |string|
|
269
|
+
assert !@validator.valid_encoding?(string), "max overlong seq: #{string}"
|
270
|
+
assert !string.force_encoding("UTF-8").valid_encoding?, "max overlong seq 19: #{string}" if RUBY_VERSION =~ /1\.9/
|
271
|
+
end
|
272
|
+
end
|
273
|
+
|
225
274
|
end
|
226
275
|
|
data/utf8_validator.gemspec
CHANGED
@@ -5,11 +5,11 @@
|
|
5
5
|
|
6
6
|
Gem::Specification.new do |s|
|
7
7
|
s.name = %q{utf8_validator}
|
8
|
-
s.version = "0.0.1"
|
8
|
+
s.version = "0.0.2"
|
9
9
|
|
10
10
|
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
11
11
|
s.authors = ["Guy Allard"]
|
12
|
-
s.date = %q{2011-01-
|
12
|
+
s.date = %q{2011-01-26}
|
13
13
|
s.description = %q{A State Machine implementation of a UTF-8 Encoding
|
14
14
|
Validation algorithm.}
|
15
15
|
s.email = %q{allard.guy.m@gmail.com}
|
@@ -25,6 +25,7 @@ Validation algorithm.}
|
|
25
25
|
"README.rdoc",
|
26
26
|
"Rakefile",
|
27
27
|
"VERSION",
|
28
|
+
"examples/fullstring.rb",
|
28
29
|
"lib/utf8_validator.rb",
|
29
30
|
"lib/validation/errors.rb",
|
30
31
|
"lib/validation/validator.rb",
|
@@ -39,6 +40,7 @@ Validation algorithm.}
|
|
39
40
|
s.rubygems_version = %q{1.3.7}
|
40
41
|
s.summary = %q{A UTF-8 Encoding Validator.}
|
41
42
|
s.test_files = [
|
43
|
+
"examples/fullstring.rb",
|
42
44
|
"test/helper.rb",
|
43
45
|
"test/test_raise_request.rb",
|
44
46
|
"test/test_utf8_validator.rb"
|
@@ -52,18 +54,15 @@ Validation algorithm.}
|
|
52
54
|
s.add_development_dependency(%q<bundler>, ["~> 1.0.0"])
|
53
55
|
s.add_development_dependency(%q<jeweler>, ["~> 1.5.2"])
|
54
56
|
s.add_development_dependency(%q<rcov>, [">= 0"])
|
55
|
-
s.add_development_dependency(%q<bundler>, [">= 2.1.2"])
|
56
57
|
else
|
57
58
|
s.add_dependency(%q<bundler>, ["~> 1.0.0"])
|
58
59
|
s.add_dependency(%q<jeweler>, ["~> 1.5.2"])
|
59
60
|
s.add_dependency(%q<rcov>, [">= 0"])
|
60
|
-
s.add_dependency(%q<bundler>, [">= 2.1.2"])
|
61
61
|
end
|
62
62
|
else
|
63
63
|
s.add_dependency(%q<bundler>, ["~> 1.0.0"])
|
64
64
|
s.add_dependency(%q<jeweler>, ["~> 1.5.2"])
|
65
65
|
s.add_dependency(%q<rcov>, [">= 0"])
|
66
|
-
s.add_dependency(%q<bundler>, [">= 2.1.2"])
|
67
66
|
end
|
68
67
|
end
|
69
68
|
|
metadata
CHANGED
@@ -5,8 +5,8 @@ version: !ruby/object:Gem::Version
|
|
5
5
|
segments:
|
6
6
|
- 0
|
7
7
|
- 0
|
8
|
-
- 1
|
9
|
-
version: 0.0.1
|
8
|
+
- 2
|
9
|
+
version: 0.0.2
|
10
10
|
platform: ruby
|
11
11
|
authors:
|
12
12
|
- Guy Allard
|
@@ -14,7 +14,7 @@ autorequire:
|
|
14
14
|
bindir: bin
|
15
15
|
cert_chain: []
|
16
16
|
|
17
|
-
date: 2011-01-
|
17
|
+
date: 2011-01-26 00:00:00 -05:00
|
18
18
|
default_executable:
|
19
19
|
dependencies:
|
20
20
|
- !ruby/object:Gem::Dependency
|
@@ -60,21 +60,6 @@ dependencies:
|
|
60
60
|
type: :development
|
61
61
|
prerelease: false
|
62
62
|
version_requirements: *id003
|
63
|
-
- !ruby/object:Gem::Dependency
|
64
|
-
name: bundler
|
65
|
-
requirement: &id004 !ruby/object:Gem::Requirement
|
66
|
-
none: false
|
67
|
-
requirements:
|
68
|
-
- - ">="
|
69
|
-
- !ruby/object:Gem::Version
|
70
|
-
segments:
|
71
|
-
- 2
|
72
|
-
- 1
|
73
|
-
- 2
|
74
|
-
version: 2.1.2
|
75
|
-
type: :development
|
76
|
-
prerelease: false
|
77
|
-
version_requirements: *id004
|
78
63
|
description: |-
|
79
64
|
A State Machine implementation of a UTF-8 Encoding
|
80
65
|
Validation algorithm.
|
@@ -94,6 +79,7 @@ files:
|
|
94
79
|
- README.rdoc
|
95
80
|
- Rakefile
|
96
81
|
- VERSION
|
82
|
+
- examples/fullstring.rb
|
97
83
|
- lib/utf8_validator.rb
|
98
84
|
- lib/validation/errors.rb
|
99
85
|
- lib/validation/validator.rb
|
@@ -115,7 +101,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
115
101
|
requirements:
|
116
102
|
- - ">="
|
117
103
|
- !ruby/object:Gem::Version
|
118
|
-
hash:
|
104
|
+
hash: 2884485592009813991
|
119
105
|
segments:
|
120
106
|
- 0
|
121
107
|
version: "0"
|
@@ -135,6 +121,7 @@ signing_key:
|
|
135
121
|
specification_version: 3
|
136
122
|
summary: A UTF-8 Encoding Validator.
|
137
123
|
test_files:
|
124
|
+
- examples/fullstring.rb
|
138
125
|
- test/helper.rb
|
139
126
|
- test/test_raise_request.rb
|
140
127
|
- test/test_utf8_validator.rb
|