utf8_validator 0.0.2 → 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.rdoc +17 -2
- data/VERSION +2 -1
- data/lib/validation/validator.rb +2 -2
- data/test/test_utf8_validator.rb +81 -6
- data/utf8_validator.gemspec +2 -2
- metadata +4 -4
data/README.rdoc
CHANGED
@@ -18,13 +18,28 @@ The Unicode Consortium:: At http://unicode.org/ for all the information publishe
|
|
18
18
|
Frank Yung-Fong Tang:: For the state machine algorithm. See: http://unicode.org/mail-arch/unicode-ml/y2003-m02/att-0467/01-The_Algorithm_to_Valide_an_UTF-8_String
|
19
19
|
Markus Kuhn:: For invalid test data. http://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-test.txt
|
20
20
|
|
21
|
+
== Useful Information
|
22
|
+
|
23
|
+
Other interesting and/or useful information can be found:
|
24
|
+
|
25
|
+
* http://software.hixie.ch/utilities/cgi/unicode-decoder/utf8-decoder
|
26
|
+
* http://bjoern.hoehrmann.de/utf-8/decoder/dfa/
|
27
|
+
|
21
28
|
== A Word On Ruby Versions
|
22
29
|
|
23
|
-
It is expected that this validator will be used in Ruby environments prior to 1.9.x. However, nothing prohibits
|
30
|
+
It is expected that this validator will be used in Ruby environments prior to 1.9.x. However, nothing prohibits use with Ruby 1.9. Tests recognize a 1.9 environment, and ensure that 1.9 native behavior matches the validator behavior.
|
24
31
|
|
25
32
|
== Reporting Issues
|
26
33
|
|
27
|
-
Please report issues on the tracker at github
|
34
|
+
Please report issues on the tracker at github:
|
35
|
+
|
36
|
+
* https://github.com/gmallard/utf8_validator/issues
|
37
|
+
|
38
|
+
== Web Based Documentation
|
39
|
+
|
40
|
+
Human readable documentation can be found at:
|
41
|
+
|
42
|
+
* http://gmallard.github.com/utf8_validator
|
28
43
|
|
29
44
|
== Contributing to the utf8_validator gem
|
30
45
|
|
data/VERSION
CHANGED
@@ -1 +1,2 @@
|
|
1
|
-
0.0
|
1
|
+
1.0.0
|
2
|
+
|
data/lib/validation/validator.rb
CHANGED
@@ -77,12 +77,12 @@ class Validator
|
|
77
77
|
#
|
78
78
|
# validator = UTF8::Validator.new -> validator
|
79
79
|
# validator.valid_encoding?(string) -> true or false
|
80
|
-
# validator.valid_encoding?(string,
|
80
|
+
# validator.valid_encoding?(string, raise_on_error) -> true or exception
|
81
81
|
#
|
82
82
|
# Parameters:
|
83
83
|
#
|
84
84
|
# string:: the string to validate
|
85
|
-
# raise_on_error:: a flag to indicate failure behavior
|
85
|
+
# raise_on_error:: a boolean flag to indicate requested failure behavior
|
86
86
|
#
|
87
87
|
# When raise_on_error is _true_ and a string fails validation, an
|
88
88
|
# error of type #{UTF8::ValidationError} is raised. The byte in error
|
data/test/test_utf8_validator.rb
CHANGED
@@ -102,9 +102,7 @@ class TestUtf8Validator < Test::Unit::TestCase
|
|
102
102
|
"\xed\x9f\xbf", # = "\ud7ff"
|
103
103
|
"\xee\x80\x80", # = "\ue000"
|
104
104
|
"\xef\xbf\xbd", # = "\ufffd"
|
105
|
-
|
106
|
-
# "\xf4\x90\x80\x80", # = "\ufffd" / maybe _should_ fail ?? / research
|
107
|
-
|
105
|
+
"\xf4\x8f\xbf\xbf", # = "\U10ffff" / _should_ this fail ??
|
108
106
|
]
|
109
107
|
test_data.each do |string|
|
110
108
|
assert @validator.valid_encoding?(string), "boundary conditions: #{string}"
|
@@ -144,7 +142,7 @@ class TestUtf8Validator < Test::Unit::TestCase
|
|
144
142
|
|
145
143
|
#--
|
146
144
|
# I do not see a need to test UTF-16 surrogate pairs. They are guaranteed
|
147
|
-
# to
|
145
|
+
# to always fail if the preceding test succeeds. This is because the
|
148
146
|
# preceding test data values are always the first surrogate of the pair.
|
149
147
|
#
|
150
148
|
# UTF-16 surrogates are clearly something I do not understand.
|
@@ -168,10 +166,11 @@ class TestUtf8Validator < Test::Unit::TestCase
|
|
168
166
|
def test0540_not_shortest
|
169
167
|
test_data = [
|
170
168
|
"\xc0\x80",
|
169
|
+
"\xc1\x80",
|
170
|
+
"\xc0\x30",
|
171
|
+
"\xc1\x30",
|
171
172
|
"\xe0\x80\x80",
|
172
173
|
"\xf0\x80\x80\x80",
|
173
|
-
"\xf8\x80\x80\x80\x80",
|
174
|
-
"\xfc\x80\x80\x80\x80\x80",
|
175
174
|
]
|
176
175
|
test_data.each do |string|
|
177
176
|
assert !@validator.valid_encoding?(string), "Not shortest: #{string}"
|
@@ -271,5 +270,81 @@ class TestUtf8Validator < Test::Unit::TestCase
|
|
271
270
|
end
|
272
271
|
end
|
273
272
|
|
273
|
+
# Boundary conditions
|
274
|
+
def test_0590_boundary_conditions
|
275
|
+
test_data = [
|
276
|
+
"\xf4\x90\x80\x80", # See: http://software.hixie.ch/utilities/cgi/unicode-decoder/utf8-decoder
|
277
|
+
]
|
278
|
+
test_data.each do |string|
|
279
|
+
assert !@validator.valid_encoding?(string), "boundary conditions: #{string}"
|
280
|
+
assert !string.force_encoding("UTF-8").valid_encoding?, "boundary conditions 19: #{string}" if RUBY_VERSION =~ /1\.9/
|
281
|
+
end
|
282
|
+
end
|
283
|
+
|
284
|
+
=begin
|
285
|
+
|
286
|
+
The next test is based on examples provided in the Unicode 6.0 specification.
|
287
|
+
See pages 91-92.
|
288
|
+
|
289
|
+
From that specification:
|
290
|
+
|
291
|
+
• The UTF-8 code unit sequence <41 C3 B1 42> is well-formed, because it can be
|
292
|
+
partitioned into subsequences, all of which match the specification for UTF-8
|
293
|
+
in Table 3-7. It consists of the following minimal well-formed code unit subse-
|
294
|
+
quences: <41>, <C3 B1>, and <42>.
|
295
|
+
|
296
|
+
• The UTF-8 code unit sequence <41 C2 C3 B1 42> is ill-formed, because it con-
|
297
|
+
tains one ill-formed subsequence. There is no subsequence for the C2 byte
|
298
|
+
which matches the specification for UTF-8 in Table 3-7. The code unit sequence
|
299
|
+
is partitioned into one minimal well-formed code unit subsequence, <41>, fol-
|
300
|
+
lowed by one ill-formed code unit subsequence, <C2>, followed by two mini-
|
301
|
+
mal well-formed code unit subsequences, <C3 B1> and <42>.
|
302
|
+
|
303
|
+
• In isolation, the UTF-8 code unit sequence <C2 C3> would be ill-formed, but
|
304
|
+
in the context of the UTF-8 code unit sequence <41 C2 C3 B1 42>, <C2 C3>
|
305
|
+
does not constitute an ill-formed code unit subsequence, because the C3 byte is
|
306
|
+
actually the first byte of the minimal well-formed UTF-8 code unit subse-
|
307
|
+
quence <C3 B1>. Ill-formed code unit subsequences do not overlap with mini-
|
308
|
+
mal well-formed code unit subsequences.
|
309
|
+
|
310
|
+
// Above:
|
311
|
+
|
312
|
+
straight from the Unicode 6.0 spec. See page 91.
|
313
|
+
|
314
|
+
• As another example, the code unit sequence <C0 80 61 F3> is a Unicode 8-bit
|
315
|
+
string, but does not consist of a well-formed UTF-8 code unit sequence. That
|
316
|
+
code unit sequence could not result from the specification of the UTF-8 encod-
|
317
|
+
ing form and is thus ill-formed. (The same code unit sequence could, of course,
|
318
|
+
be well-formed in the context of some other character encoding standard using
|
319
|
+
8-bit code units, such as ISO/IEC 8859-1, or vendor code pages.)
|
320
|
+
|
321
|
+
// Above:
|
322
|
+
|
323
|
+
straight from the Unicode 6.0 spec. See page 92.
|
324
|
+
|
325
|
+
=end
|
326
|
+
|
327
|
+
# Tests from examples in the Unicode specification
|
328
|
+
def test_0600_unicode_specs
|
329
|
+
|
330
|
+
good_data = [
|
331
|
+
"\x41\xc3\xb1\x42",
|
332
|
+
]
|
333
|
+
good_data.each do |string|
|
334
|
+
assert @validator.valid_encoding?(string), "good unicode specs 01: #{string}"
|
335
|
+
assert string.force_encoding("UTF-8").valid_encoding?,
|
336
|
+
"good unicode specs 01 19: #{string}" if RUBY_VERSION =~ /1\.9/
|
337
|
+
end
|
338
|
+
|
339
|
+
bad_data = [
|
340
|
+
"\x41\xc2\xc3\xb1\x42",
|
341
|
+
]
|
342
|
+
bad_data.each do |string|
|
343
|
+
assert !@validator.valid_encoding?(string), "bad unicode specs 01: #{string}"
|
344
|
+
assert !string.force_encoding("UTF-8").valid_encoding?,
|
345
|
+
"bad unicode specs 01 19: #{string}" if RUBY_VERSION =~ /1\.9/
|
346
|
+
end
|
347
|
+
|
348
|
+
end
|
274
349
|
end
|
275
350
|
|
data/utf8_validator.gemspec
CHANGED
@@ -5,11 +5,11 @@
|
|
5
5
|
|
6
6
|
Gem::Specification.new do |s|
|
7
7
|
s.name = %q{utf8_validator}
|
8
|
-
s.version = "0.0
|
8
|
+
s.version = "1.0.0"
|
9
9
|
|
10
10
|
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
11
11
|
s.authors = ["Guy Allard"]
|
12
|
-
s.date = %q{2011-
|
12
|
+
s.date = %q{2011-03-21}
|
13
13
|
s.description = %q{A State Machine implementation of a UTF-8 Encoding
|
14
14
|
Validation algorithm.}
|
15
15
|
s.email = %q{allard.guy.m@gmail.com}
|
metadata
CHANGED
@@ -3,10 +3,10 @@ name: utf8_validator
|
|
3
3
|
version: !ruby/object:Gem::Version
|
4
4
|
prerelease: false
|
5
5
|
segments:
|
6
|
+
- 1
|
6
7
|
- 0
|
7
8
|
- 0
|
8
|
-
|
9
|
-
version: 0.0.2
|
9
|
+
version: 1.0.0
|
10
10
|
platform: ruby
|
11
11
|
authors:
|
12
12
|
- Guy Allard
|
@@ -14,7 +14,7 @@ autorequire:
|
|
14
14
|
bindir: bin
|
15
15
|
cert_chain: []
|
16
16
|
|
17
|
-
date: 2011-
|
17
|
+
date: 2011-03-21 00:00:00 -04:00
|
18
18
|
default_executable:
|
19
19
|
dependencies:
|
20
20
|
- !ruby/object:Gem::Dependency
|
@@ -101,7 +101,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
101
101
|
requirements:
|
102
102
|
- - ">="
|
103
103
|
- !ruby/object:Gem::Version
|
104
|
-
hash:
|
104
|
+
hash: -2830837862218191460
|
105
105
|
segments:
|
106
106
|
- 0
|
107
107
|
version: "0"
|