utf8_validator 0.0.2 → 1.0.0
Sign up to get free protection for your applications and to get access to all the features.
- data/README.rdoc +17 -2
- data/VERSION +2 -1
- data/lib/validation/validator.rb +2 -2
- data/test/test_utf8_validator.rb +81 -6
- data/utf8_validator.gemspec +2 -2
- metadata +4 -4
data/README.rdoc
CHANGED
@@ -18,13 +18,28 @@ The Unicode Consortium:: At http://unicode.org/ for all the information publishe
|
|
18
18
|
Frank Yung-Fong Tang:: For the state machine algorithm. See: http://unicode.org/mail-arch/unicode-ml/y2003-m02/att-0467/01-The_Algorithm_to_Valide_an_UTF-8_String
|
19
19
|
Markus Kuhn:: For invalid test data. http://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-test.txt
|
20
20
|
|
21
|
+
== Useful Information
|
22
|
+
|
23
|
+
Other interesting and/or useful information can be found:
|
24
|
+
|
25
|
+
* http://software.hixie.ch/utilities/cgi/unicode-decoder/utf8-decoder
|
26
|
+
* http://bjoern.hoehrmann.de/utf-8/decoder/dfa/
|
27
|
+
|
21
28
|
== A Word On Ruby Versions
|
22
29
|
|
23
|
-
It is expected that this validator will be used in Ruby environments prior to 1.9.x. However, nothing prohibits
|
30
|
+
It is expected that this validator will be used in Ruby environments prior to 1.9.x. However, nothing prohibits use with Ruby 1.9. Tests recognize a 1.9 environment, and ensure that 1.9 native behavior matches the validator behavior.
|
24
31
|
|
25
32
|
== Reporting Issues
|
26
33
|
|
27
|
-
Please report issues on the tracker at github
|
34
|
+
Please report issues on the tracker at github:
|
35
|
+
|
36
|
+
* https://github.com/gmallard/utf8_validator/issues
|
37
|
+
|
38
|
+
== Web Based Documentation
|
39
|
+
|
40
|
+
Human readable documentation can be found at:
|
41
|
+
|
42
|
+
* http://gmallard.github.com/utf8_validator
|
28
43
|
|
29
44
|
== Contributing to the utf8_validator gem
|
30
45
|
|
data/VERSION
CHANGED
@@ -1 +1,2 @@
|
|
1
|
-
0.0.2
|
1
|
+
1.0.0
|
2
|
+
|
data/lib/validation/validator.rb
CHANGED
@@ -77,12 +77,12 @@ class Validator
|
|
77
77
|
#
|
78
78
|
# validator = UTF8::Validator.new -> validator
|
79
79
|
# validator.valid_encoding?(string) -> true or false
|
80
|
-
# validator.valid_encoding?(string,
|
80
|
+
# validator.valid_encoding?(string, raise_on_error) -> true or exception
|
81
81
|
#
|
82
82
|
# Parameters:
|
83
83
|
#
|
84
84
|
# string:: the string to validate
|
85
|
-
# raise_on_error:: a flag to indicate failure behavior
|
85
|
+
# raise_on_error:: a boolean flag to indicate requested failure behavior
|
86
86
|
#
|
87
87
|
# When raise_on_error is _true_ and a string fails validation, an
|
88
88
|
# error of type #{UTF8::ValidationError} is raised. The byte in error
|
data/test/test_utf8_validator.rb
CHANGED
@@ -102,9 +102,7 @@ class TestUtf8Validator < Test::Unit::TestCase
|
|
102
102
|
"\xed\x9f\xbf", # = "\ud7ff"
|
103
103
|
"\xee\x80\x80", # = "\ue000"
|
104
104
|
"\xef\xbf\xbd", # = "\ufffd"
|
105
|
-
|
106
|
-
# "\xf4\x90\x80\x80", # = "\ufffd" / maybe _should_ fail ?? / research
|
107
|
-
|
105
|
+
"\xf4\x8f\xbf\xbf", # = "\U10ffff" / _should_ this fail ??
|
108
106
|
]
|
109
107
|
test_data.each do |string|
|
110
108
|
assert @validator.valid_encoding?(string), "boundary conditions: #{string}"
|
@@ -144,7 +142,7 @@ class TestUtf8Validator < Test::Unit::TestCase
|
|
144
142
|
|
145
143
|
#--
|
146
144
|
# I do not see a need to test UTF-16 surrogate pairs. They are guaranteed
|
147
|
-
# to
|
145
|
+
# to always fail if the preceding test succeeds. This is because the
|
148
146
|
# preceding test data values are always the first surrogate of the pair.
|
149
147
|
#
|
150
148
|
# UTF-16 surrogates are clearly something I do not understand.
|
@@ -168,10 +166,11 @@ class TestUtf8Validator < Test::Unit::TestCase
|
|
168
166
|
def test0540_not_shortest
|
169
167
|
test_data = [
|
170
168
|
"\xc0\x80",
|
169
|
+
"\xc1\x80",
|
170
|
+
"\xc0\x30",
|
171
|
+
"\xc1\x30",
|
171
172
|
"\xe0\x80\x80",
|
172
173
|
"\xf0\x80\x80\x80",
|
173
|
-
"\xf8\x80\x80\x80\x80",
|
174
|
-
"\xfc\x80\x80\x80\x80\x80",
|
175
174
|
]
|
176
175
|
test_data.each do |string|
|
177
176
|
assert !@validator.valid_encoding?(string), "Not shortest: #{string}"
|
@@ -271,5 +270,81 @@ class TestUtf8Validator < Test::Unit::TestCase
|
|
271
270
|
end
|
272
271
|
end
|
273
272
|
|
273
|
+
# Boundary conditions
|
274
|
+
def test_0590_boundary_conditions
|
275
|
+
test_data = [
|
276
|
+
"\xf4\x90\x80\x80", # See: http://software.hixie.ch/utilities/cgi/unicode-decoder/utf8-decoder
|
277
|
+
]
|
278
|
+
test_data.each do |string|
|
279
|
+
assert !@validator.valid_encoding?(string), "boundary conditions: #{string}"
|
280
|
+
assert !string.force_encoding("UTF-8").valid_encoding?, "boundary conditions 19: #{string}" if RUBY_VERSION =~ /1\.9/
|
281
|
+
end
|
282
|
+
end
|
283
|
+
|
284
|
+
=begin
|
285
|
+
|
286
|
+
The next test is based on examples provided in the Unicode 6.0 specification.
|
287
|
+
See pages 91-92.
|
288
|
+
|
289
|
+
From that specification:
|
290
|
+
|
291
|
+
• The UTF-8 code unit sequence <41 C3 B1 42> is well-formed, because it can be
|
292
|
+
partitioned into subsequences, all of which match the specification for UTF-8
|
293
|
+
in Table 3-7. It consists of the following minimal well-formed code unit subse-
|
294
|
+
quences: <41>, <C3 B1>, and <42>.
|
295
|
+
|
296
|
+
• The UTF-8 code unit sequence <41 C2 C3 B1 42> is ill-formed, because it con-
|
297
|
+
tains one ill-formed subsequence. There is no subsequence for the C2 byte
|
298
|
+
which matches the specification for UTF-8 in Table 3-7. The code unit sequence
|
299
|
+
is partitioned into one minimal well-formed code unit subsequence, <41>, fol-
|
300
|
+
lowed by one ill-formed code unit subsequence, <C2>, followed by two mini-
|
301
|
+
mal well-formed code unit subsequences, <C3 B1> and <42>.
|
302
|
+
|
303
|
+
• In isolation, the UTF-8 code unit sequence <C2 C3> would be ill-formed, but
|
304
|
+
in the context of the UTF-8 code unit sequence <41 C2 C3 B1 42>, <C2 C3>
|
305
|
+
does not constitute an ill-formed code unit subsequence, because the C3 byte is
|
306
|
+
actually the first byte of the minimal well-formed UTF-8 code unit subse-
|
307
|
+
quence <C3 B1>. Ill-formed code unit subsequences do not overlap with mini-
|
308
|
+
mal well-formed code unit subsequences.
|
309
|
+
|
310
|
+
// Above:
|
311
|
+
|
312
|
+
straight from the Unicode 6.0 spec. See page 91.
|
313
|
+
|
314
|
+
• As another example, the code unit sequence <C0 80 61 F3> is a Unicode 8-bit
|
315
|
+
string, but does not consist of a well-formed UTF-8 code unit sequence. That
|
316
|
+
code unit sequence could not result from the specification of the UTF-8 encod-
|
317
|
+
ing form and is thus ill-formed. (The same code unit sequence could, of course,
|
318
|
+
be well-formed in the context of some other character encoding standard using
|
319
|
+
8-bit code units, such as ISO/IEC 8859-1, or vendor code pages.)
|
320
|
+
|
321
|
+
// Above:
|
322
|
+
|
323
|
+
straight from the Unicode 6.0 spec. See page 92.
|
324
|
+
|
325
|
+
=end
|
326
|
+
|
327
|
+
# Tests from examples in the Unicode specification
|
328
|
+
def test_0600_unicode_specs
|
329
|
+
|
330
|
+
good_data = [
|
331
|
+
"\x41\xc3\xb1\x42",
|
332
|
+
]
|
333
|
+
good_data.each do |string|
|
334
|
+
assert @validator.valid_encoding?(string), "good unicode specs 01: #{string}"
|
335
|
+
assert string.force_encoding("UTF-8").valid_encoding?,
|
336
|
+
"good unicode specs 01 19: #{string}" if RUBY_VERSION =~ /1\.9/
|
337
|
+
end
|
338
|
+
|
339
|
+
bad_data = [
|
340
|
+
"\x41\xc2\xc3\xb1\x42",
|
341
|
+
]
|
342
|
+
bad_data.each do |string|
|
343
|
+
assert !@validator.valid_encoding?(string), "bad unicode specs 01: #{string}"
|
344
|
+
assert !string.force_encoding("UTF-8").valid_encoding?,
|
345
|
+
"bad unicode specs 01 19: #{string}" if RUBY_VERSION =~ /1\.9/
|
346
|
+
end
|
347
|
+
|
348
|
+
end
|
274
349
|
end
|
275
350
|
|
data/utf8_validator.gemspec
CHANGED
@@ -5,11 +5,11 @@
|
|
5
5
|
|
6
6
|
Gem::Specification.new do |s|
|
7
7
|
s.name = %q{utf8_validator}
|
8
|
-
s.version = "0.0.2"
|
8
|
+
s.version = "1.0.0"
|
9
9
|
|
10
10
|
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
11
11
|
s.authors = ["Guy Allard"]
|
12
|
-
s.date = %q{2011-
|
12
|
+
s.date = %q{2011-03-21}
|
13
13
|
s.description = %q{A State Machine implementation of a UTF-8 Encoding
|
14
14
|
Validation algorithm.}
|
15
15
|
s.email = %q{allard.guy.m@gmail.com}
|
metadata
CHANGED
@@ -3,10 +3,10 @@ name: utf8_validator
|
|
3
3
|
version: !ruby/object:Gem::Version
|
4
4
|
prerelease: false
|
5
5
|
segments:
|
6
|
+
- 1
|
6
7
|
- 0
|
7
8
|
- 0
|
8
|
-
|
9
|
-
version: 0.0.2
|
9
|
+
version: 1.0.0
|
10
10
|
platform: ruby
|
11
11
|
authors:
|
12
12
|
- Guy Allard
|
@@ -14,7 +14,7 @@ autorequire:
|
|
14
14
|
bindir: bin
|
15
15
|
cert_chain: []
|
16
16
|
|
17
|
-
date: 2011-
|
17
|
+
date: 2011-03-21 00:00:00 -04:00
|
18
18
|
default_executable:
|
19
19
|
dependencies:
|
20
20
|
- !ruby/object:Gem::Dependency
|
@@ -101,7 +101,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
101
101
|
requirements:
|
102
102
|
- - ">="
|
103
103
|
- !ruby/object:Gem::Version
|
104
|
-
hash:
|
104
|
+
hash: -2830837862218191460
|
105
105
|
segments:
|
106
106
|
- 0
|
107
107
|
version: "0"
|