utf8_validator 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.document +5 -0
- data/Gemfile +12 -0
- data/Gemfile.lock +18 -0
- data/LICENSE.txt +20 -0
- data/README.rdoc +43 -0
- data/Rakefile +56 -0
- data/VERSION +1 -0
- data/lib/utf8_validator.rb +4 -0
- data/lib/validation/errors.rb +14 -0
- data/lib/validation/validator.rb +284 -0
- data/test/helper.rb +17 -0
- data/test/test_raise_request.rb +38 -0
- data/test/test_utf8_validator.rb +226 -0
- data/utf8_validator.gemspec +69 -0
- metadata +140 -0
data/.document
ADDED
data/Gemfile
ADDED
@@ -0,0 +1,12 @@
|
|
1
|
+
source "http://rubygems.org"
|
2
|
+
# Add dependencies required to use your gem here.
|
3
|
+
# Example:
|
4
|
+
# gem "activesupport", ">= 2.3.5"
|
5
|
+
|
6
|
+
# Add dependencies to develop your gem here.
|
7
|
+
# Include everything needed to run rake, tests, features, etc.
|
8
|
+
group :development do
|
9
|
+
gem "bundler", "~> 1.0.0"
|
10
|
+
gem "jeweler", "~> 1.5.2"
|
11
|
+
gem "rcov", ">= 0"
|
12
|
+
end
|
data/Gemfile.lock
ADDED
@@ -0,0 +1,18 @@
|
|
1
|
+
GEM
|
2
|
+
remote: http://rubygems.org/
|
3
|
+
specs:
|
4
|
+
git (1.2.5)
|
5
|
+
jeweler (1.5.2)
|
6
|
+
bundler (~> 1.0.0)
|
7
|
+
git (>= 1.2.5)
|
8
|
+
rake
|
9
|
+
rake (0.8.7)
|
10
|
+
rcov (0.9.9)
|
11
|
+
|
12
|
+
PLATFORMS
|
13
|
+
ruby
|
14
|
+
|
15
|
+
DEPENDENCIES
|
16
|
+
bundler (~> 1.0.0)
|
17
|
+
jeweler (~> 1.5.2)
|
18
|
+
rcov
|
data/LICENSE.txt
ADDED
@@ -0,0 +1,20 @@
|
|
1
|
+
Copyright (c) 2011 Guy Allard
|
2
|
+
|
3
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
4
|
+
a copy of this software and associated documentation files (the
|
5
|
+
"Software"), to deal in the Software without restriction, including
|
6
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
7
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
8
|
+
permit persons to whom the Software is furnished to do so, subject to
|
9
|
+
the following conditions:
|
10
|
+
|
11
|
+
The above copyright notice and this permission notice shall be
|
12
|
+
included in all copies or substantial portions of the Software.
|
13
|
+
|
14
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
15
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
16
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
17
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
18
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
19
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
20
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README.rdoc
ADDED
@@ -0,0 +1,43 @@
|
|
1
|
+
= A UTF-8 Validator State Machine
|
2
|
+
|
3
|
+
Provides an implementation of a state machine for validating UTF-8 encoded strings. Clients may request that encoding errors be reported in several ways:
|
4
|
+
|
5
|
+
* simple true / false indicator
|
6
|
+
* a raised exception
|
7
|
+
|
8
|
+
== What This Gem Does Not Provide
|
9
|
+
|
10
|
+
* UTF-8 Encoding
|
11
|
+
* UTF-8 Decoding
|
12
|
+
|
13
|
+
That functionality is left as an exercise for the reader.
|
14
|
+
|
15
|
+
== Thanks To
|
16
|
+
|
17
|
+
The Unicode Consortium:: At http://unicode.org/ for all the information published there.
|
18
|
+
Frank Yung-Fong Tang:: For the state machine algorithm. See: http://unicode.org/mail-arch/unicode-ml/y2003-m02/att-0467/01-The_Algorithm_to_Valide_an_UTF-8_String
|
19
|
+
|
20
|
+
== A Word On Ruby Versions
|
21
|
+
|
22
|
+
It is expected that this validator will be used in Ruby environments prior to 1.9.x. However, nothing prohibits its use with Ruby 1.9.
|
23
|
+
|
24
|
+
== Reporting Issues
|
25
|
+
|
26
|
+
Please report issues on the tracker at github.
|
27
|
+
|
28
|
+
== Contributing to the utf8_validator gem
|
29
|
+
|
30
|
+
* Check out the latest master to make sure the feature hasn't been implemented or the bug hasn't been fixed yet.
|
31
|
+
* Check out the issue tracker to make sure someone already hasn't requested it and/or contributed it.
|
32
|
+
* Fork the project.
|
33
|
+
* Start a feature/bugfix branch.
|
34
|
+
* Commit and push until you are happy with your contribution.
|
35
|
+
* Make sure to add tests for it. This is important so it does not break in a future version unintentionally.
|
36
|
+
* Please try not to modify the Rakefile or VERSION file. If you require your own version please isolate the version update to its own commit so cherry-pick or rebase can be used to skip it.
|
37
|
+
* Request a pull.
|
38
|
+
|
39
|
+
== Copyright
|
40
|
+
|
41
|
+
Copyright (c) 2011 Guy Allard. See LICENSE.txt for
|
42
|
+
further details.
|
43
|
+
|
data/Rakefile
ADDED
@@ -0,0 +1,56 @@
|
|
1
|
+
require 'rubygems'
|
2
|
+
require 'bundler'
|
3
|
+
begin
|
4
|
+
Bundler.setup(:default, :development)
|
5
|
+
rescue Bundler::BundlerError => e
|
6
|
+
$stderr.puts e.message
|
7
|
+
$stderr.puts "Run `bundle install` to install missing gems"
|
8
|
+
exit e.status_code
|
9
|
+
end
|
10
|
+
require 'rake'
|
11
|
+
|
12
|
+
require 'jeweler'
|
13
|
+
# Describe the gem for jeweler, which generates utf8_validator.gemspec from
# this block (run 'rake gemspec' after changing it).
Jeweler::Tasks.new do |gem|
  # gem is a Gem::Specification... see http://docs.rubygems.org/read/chapter/20 for more options
  gem.name = "utf8_validator"
  gem.homepage = "http://github.com/gmallard/utf8_validator"
  gem.license = "MIT"
  gem.summary = %Q{A UTF-8 Encoding Validator.}
  gem.description = %Q{A State Machine implementation of a UTF-8 Encoding
Validation algorithm.}
  gem.email = "allard.guy.m@gmail.com"
  gem.authors = ["Guy Allard"]

  # Include your dependencies below. Runtime dependencies are required when using your gem,
  # and development dependencies are only needed for development (ie running rake tasks, tests, etc)
  #  gem.add_runtime_dependency 'jabber4r', '> 0.1'
  #
  # Do NOT re-declare bundler here. The generated gemspec already carries the
  # Gemfile's :development group (bundler '~> 1.0.0', jeweler, rcov); adding a
  # second bundler requirement such as '>= 2.1.2' produces two requirements on
  # the same gem that can never be satisfied together.
end
|
30
|
+
Jeweler::RubygemsDotOrgTasks.new
|
31
|
+
|
32
|
+
require 'rake/testtask'
|
33
|
+
Rake::TestTask.new(:test) do |test|
|
34
|
+
test.libs << 'lib' << 'test'
|
35
|
+
test.pattern = 'test/**/test_*.rb'
|
36
|
+
test.verbose = true
|
37
|
+
end
|
38
|
+
|
39
|
+
require 'rcov/rcovtask'
|
40
|
+
Rcov::RcovTask.new do |test|
|
41
|
+
test.libs << 'test'
|
42
|
+
test.pattern = 'test/**/test_*.rb'
|
43
|
+
test.verbose = true
|
44
|
+
end
|
45
|
+
|
46
|
+
task :default => :test
|
47
|
+
|
48
|
+
require 'rake/rdoctask'
|
49
|
+
Rake::RDocTask.new do |rdoc|
|
50
|
+
version = File.exist?('VERSION') ? File.read('VERSION') : ""
|
51
|
+
|
52
|
+
rdoc.rdoc_dir = 'rdoc'
|
53
|
+
rdoc.title = "UTF-8 Validator #{version}"
|
54
|
+
rdoc.rdoc_files.include('README*')
|
55
|
+
rdoc.rdoc_files.include('lib/**/*.rb')
|
56
|
+
end
|
data/VERSION
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
0.0.1
|
@@ -0,0 +1,14 @@
|
|
1
|
+
module UTF8
  #
  # == Purpose
  #
  # General UTF-8 validation error class. Clients that raise this error
  # should override the default message, e.g.
  #
  #   raise UTF8::ValidationError, "Invalid byte: ..."
  #
  # When no message is supplied the default text is used.
  #
  class ValidationError < ::RuntimeError
    #
    # Text reported when the error is raised without an explicit message.
    #
    DEFAULT_MESSAGE = "general UTF-8 validation error"

    #
    # Parameters:
    #
    # message:: the text to report; nil (or omitted) selects DEFAULT_MESSAGE
    #
    # The default is installed here rather than by overriding #message, so
    # that a message supplied by the raiser is what #message returns.
    #
    def initialize(message = DEFAULT_MESSAGE)
      super(message || DEFAULT_MESSAGE)
    end
  end
end
|
@@ -0,0 +1,284 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
#
|
3
|
+
=begin
|
4
|
+
|
5
|
+
http://unicode.org/mail-arch/unicode-ml/y2003-m02/att-0467/01-The_Algorithm_to_Valide_an_UTF-8_String
|
6
|
+
|
7
|
+
* state START
|
8
|
+
|
9
|
+
* Input = 0x00-0x7F : change state to START
|
10
|
+
* Input = 0xC2-0xDF: change state to A
|
11
|
+
* Input = 0xE1-0xEC, 0xEE-0xEF: change state to B
|
12
|
+
* Input = 0xE0: change state to C
|
13
|
+
* Input = 0xED: change state to D
|
14
|
+
* Input = 0xF1-0xF3:change state to E
|
15
|
+
* Input = 0xF0: change state to F
|
16
|
+
* Input = 0xF4: change state to G
|
17
|
+
* Input = Others (0x80-0xBF,0xC0-0xC1, 0xF5-0xFF): ERROR
|
18
|
+
|
19
|
+
* state A
|
20
|
+
o Input = 0x80-0xBF: change state to START
|
21
|
+
o Others: ERROR
|
22
|
+
* state B
|
23
|
+
o Input = 0x80-0xBF: change state to A
|
24
|
+
o Others: ERROR
|
25
|
+
* state C
|
26
|
+
o Input = 0xA0-0xBF: change state to A
|
27
|
+
o Others: ERROR
|
28
|
+
* state D
|
29
|
+
o Input = 0x80-0x9F: change state to A
|
30
|
+
o Others: ERROR
|
31
|
+
* state E
|
32
|
+
o Input = 0x80-0xBF: change state to B
|
33
|
+
o Others: ERROR
|
34
|
+
* state F
|
35
|
+
o Input = 0x90-0xBF: change state to B
|
36
|
+
o Others: ERROR
|
37
|
+
* state G
|
38
|
+
o Input = 0x80-0x8F: change state to B
|
39
|
+
o Others: ERROR
|
40
|
+
|
41
|
+
This state machine can be easily understood by:
|
42
|
+
|
43
|
+
a) examining the machine behavior as documented
|
44
|
+
b) reference to an excellent UTF-8 article with accompanying table here:
|
45
|
+
|
46
|
+
http://en.wikipedia.org/wiki/UTF-8
|
47
|
+
|
48
|
+
=end
|
49
|
+
#
|
50
|
+
# == Purpose
|
51
|
+
#
|
52
|
+
# Container for UTF-8 validator.
|
53
|
+
#
|
54
|
+
module UTF8
  #
  # == Purpose
  #
  # Validate UTF-8, primarily in Ruby environments other than 1.9.
  #
  # The validator is a byte-at-a-time state machine. The 'start' state
  # accepts ASCII bytes and the lead bytes of 2-4 byte characters; states
  # 'a' through 'g' each accept one range of continuation bytes. Any byte
  # outside the range allowed by the current state is an encoding error, as
  # is reaching the end of the string in any state other than 'start'.
  #
  # #valid_encoding? keeps all of its working state in local variables and
  # the transition tables are frozen constants, so a single instance may be
  # used by multiple concurrent threads, with one caveat:
  #
  # The value of Validator::DEBUG must not be changed by any thread.
  #
  #--
  # Copyright (c) 2011 Guy Allard
  #
  class Validator
    #
    # For use during development only. When true, a trace of every byte and
    # state is written to standard output.
    #
    DEBUG = false

    #
    # Transitions out of the 'start' state, as [byte range, next state].
    # Bytes that match no entry (0x80-0xBF, 0xC0-0xC1, 0xF5-0xFF) are errors.
    #
    START_TRANSITIONS = [
      [(0x00..0x7f), "start"], # ASCII
      [(0xc2..0xdf), "a"],     # lead byte of two byte characters
      [(0xe1..0xec), "b"],     # lead byte of most three byte characters
      [(0xee..0xef), "b"],
      [(0xe0..0xe0), "c"],     # three bytes; second byte restricted (no overlongs)
      [(0xed..0xed), "d"],     # three bytes; second byte restricted (no surrogates)
      [(0xf1..0xf3), "e"],     # lead byte of most four byte characters
      [(0xf0..0xf0), "f"],     # four bytes; second byte restricted (no overlongs)
      [(0xf4..0xf4), "g"],     # four bytes; second byte restricted (max U+10FFFF)
    ].freeze

    #
    # Transitions out of the continuation states, keyed by state, as
    # [accepted byte range, next state]. Any other byte is an error.
    #
    CONTINUATION_TRANSITIONS = {
      "a" => [(0x80..0xbf), "start"], # last continuation byte of any character
      "b" => [(0x80..0xbf), "a"],
      "c" => [(0xa0..0xbf), "a"],
      "d" => [(0x80..0x9f), "a"],
      "e" => [(0x80..0xbf), "b"],
      "f" => [(0x90..0xbf), "b"],
      "g" => [(0x80..0x8f), "b"],
    }.freeze

    #
    # Validate the supplied string for proper UTF-8 encoding.
    #
    # Calling Sequence:
    #
    #  validator.valid_encoding?(string) -> true or false
    #  validator.valid_encoding?(string, raise_on_error) -> true or exception
    #
    # Parameters:
    #
    # string:: the string to validate
    # raise_on_error:: a flag to indicate failure behavior
    #
    # Returns true when every byte of the string is part of a well formed
    # UTF-8 sequence (an empty string is valid), false otherwise.
    #
    # Raises ValidationError instead of returning false when raise_on_error
    # is true. The message names the offending byte and its index, or, for a
    # string that ends in the middle of a character, the last byte seen.
    #
    def valid_encoding?(string, raise_on_error = false)
      state = "start"
      last_byte = nil
      last_index = -1
      invalid_byte = false
      #
      string.bytes.each_with_index do |next_byte, index|
        last_byte = next_byte
        last_index = index
        if DEBUG
          puts format("Top: %d(0x%x), index: %d(0x%x)", next_byte, next_byte, index, index)
          puts "state: #{state}"
        end
        following = next_state(state, next_byte)
        if following.nil?
          invalid_byte = true
          break
        end
        state = following
      end
      #
      puts "State at end: #{state}" if DEBUG
      # Catch truncation at end of string
      truncated = !invalid_byte && state != "start"
      return true unless invalid_byte || truncated
      return false unless raise_on_error
      # The message is only formatted here, on the failure path, so valid
      # input pays nothing for error reporting.
      prefix = invalid_byte ? "Invalid byte" : "Truncated character at end of string, last byte"
      raise ValidationError, failure_message(prefix, last_byte, last_index)
    end # of valid_encoding?

    private

    #
    # Return the state that follows +state+ on input +byte+, or nil when the
    # byte is not allowed in that state.
    #
    def next_state(state, byte)
      if state == "start"
        transition = START_TRANSITIONS.find { |range, _target| range.include?(byte) }
        transition && transition[1]
      else
        range, target = CONTINUATION_TRANSITIONS.fetch(state)
        range.include?(byte) ? target : nil
      end
    end

    #
    # Build the exception text: the byte and its index, each in decimal and
    # in hexadecimal.
    #
    def failure_message(prefix, byte, index)
      format("%s:%d(0x%x),index:%d(0x%x)", prefix, byte, byte, index, index)
    end
  end # of class
end # of module
|
data/test/helper.rb
ADDED
@@ -0,0 +1,17 @@
|
|
1
|
+
require 'rubygems'
|
2
|
+
require 'bundler'
|
3
|
+
begin
|
4
|
+
Bundler.setup(:default, :development)
|
5
|
+
rescue Bundler::BundlerError => e
|
6
|
+
$stderr.puts e.message
|
7
|
+
$stderr.puts "Run `bundle install` to install missing gems"
|
8
|
+
exit e.status_code
|
9
|
+
end
|
10
|
+
require 'test/unit'
|
11
|
+
|
12
|
+
$LOAD_PATH.unshift(File.dirname(__FILE__))
|
13
|
+
$LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib'))
|
14
|
+
require 'utf8_validator'
|
15
|
+
|
16
|
+
class Test::Unit::TestCase
|
17
|
+
end
|
@@ -0,0 +1,38 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
#
|
3
|
+
require 'helper'
|
4
|
+
|
5
|
+
#
|
6
|
+
# == Purpose
|
7
|
+
#
|
8
|
+
# Test raise_on_error functionality.
|
9
|
+
#
|
10
|
+
# NOTE: the class name is kept as published (it reads "Requect", presumably a
# typo for "Request") so that anything selecting the test case by name still
# finds it.
class TestRaiseRequect < Test::Unit::TestCase
  # Create a fresh validator for every test.
  def setup
    @validator = UTF8::Validator.new
  end

  # Drop the validator after every test.
  def teardown
    @validator = nil
  end

  # ISO-8859-1 characters: a lone 0x80 byte must raise when requested.
  def test_0010_check_raise_iso
    assert_raise(UTF8::ValidationError) do
      @validator.valid_encoding?(0x80.chr, true)
    end
  end

  # A regular old bad byte in the middle of otherwise good text.
  def test_0020_check_raise_badbyte
    assert_raise(UTF8::ValidationError) do
      @validator.valid_encoding?("a" + 0xff.chr + "b", true)
    end
  end

  # A character truncated at the end of the string must raise as well.
  def test_0030_check_raise_truncated
    assert_raise(UTF8::ValidationError) do
      @validator.valid_encoding?("\xe0\xa0", true)
    end
  end

  # Requesting exceptions must not change the result for valid input.
  def test_0040_no_raise_when_valid
    assert @validator.valid_encoding?("abc", true), "valid ASCII with raise_on_error"
    assert @validator.valid_encoding?("\xc2\x80", true), "valid 2 byte character with raise_on_error"
  end
end
|
38
|
+
|
@@ -0,0 +1,226 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
#
|
3
|
+
require 'helper'
|
4
|
+
|
5
|
+
#
|
6
|
+
# == Purpose
|
7
|
+
#
|
8
|
+
# Tests for the #{UTF8::Validator} implementation.
|
9
|
+
#
|
10
|
+
class TestUtf8Validator < Test::Unit::TestCase
  # Multi-byte characters with their final byte(s) missing. Shared by the
  # "truncated last" and "truncated in good text" tests.
  TRUNCATED_SEQUENCES = [
    "\xc2", # truncated 2 byte characters
    "\xdf",
    "\xe0\xa0", # truncated 3 byte characters
    "\xe0\xbf",
    "\xf1\x80\x80", # truncated 4 byte characters
    "\xf1\xbf\xbf",
    "\xf2\x80\x80",
    "\xf2\xbf\xbf",
    "\xf3\x80\x80",
    "\xf3\xbf\xbf",
  ].freeze

  # Create a fresh validator for every test.
  def setup
    @validator = UTF8::Validator.new
  end

  # Drop the validator after every test.
  def teardown
    @validator = nil
  end

  # The validator can be instantiated at all.
  def test_0010_linkages
    assert_not_nil @validator
  end

  #--
  # Validation should succeed for the following tests
  #--

  # Some simple ASCII characters
  def test_0020_simple_ascii
    test_data = [
      "a",
      "abcdefghijjlmnopqrstuvwxyz",
      "\x00",
    ]
    test_data.each { |string| assert_valid("Simple ASCII", string) }
  end

  # All ASCII
  def test_0030_all_ascii
    0.upto(127) { |i| assert_valid("All ASCII", i.chr) }
  end

  # Simple UTF8 - 2 byte characters
  def test_0040_simple_utf8_2byte
    test_data = [
      "\xc2\x80",
      "\xc2\xbf",
      "\xdf\x80",
      "\xdf\xbf",
    ]
    test_data.each { |string| assert_valid("Simple UTF-8, 2bytes", string) }
  end

  # Simple UTF8 - 3 byte characters (every class of lead byte: 0xE0,
  # 0xE1-0xEC, 0xED and 0xEE-0xEF)
  def test_0050_simple_utf8_3byte
    test_data = [
      "\xe0\xa0\x80",
      "\xe0\xbf\x80",
      "\xe0\xa0\xbf",
      "\xe0\xbf\xbf",
      "\xe1\x80\x80",
      "\xec\xbf\xbf",
      "\xed\x80\x80",
      "\xed\x9f\xbf",
      "\xee\x80\x80",
      "\xef\xbf\xbf",
    ]
    test_data.each { |string| assert_valid("Simple UTF-8, 3bytes", string) }
  end

  # Simple UTF8 - 4 byte characters (every class of lead byte: 0xF0,
  # 0xF1-0xF3 and 0xF4)
  def test_0060_simple_utf8_4byte
    test_data = [
      "\xf1\x80\x80\x80",
      "\xf1\xbf\xbf\xbf",
      "\xf2\x80\x80\x80",
      "\xf2\xbf\xbf\xbf",
      "\xf3\x80\x80\x80",
      "\xf3\xbf\xbf\xbf",
      "\xf0\x90\x80\x80",
      "\xf0\xbf\xbf\xbf",
      "\xf4\x80\x80\x80",
      "\xf4\x8f\xbf\xbf",
    ]
    test_data.each { |string| assert_valid("Simple UTF-8, 4bytes", string) }
  end

  #--
  # Validation should fail for the following tests
  #--

  # ISO-8859-1 (C1 points)
  def test0510_iso_5559_1
    0x80.upto(0x9f) { |i| assert_invalid("ISO-8859-1", i.chr) }
  end

  # UTF-16 Surrogate Halves
  def test0520_utf16_surrogate_halves
    test_data = [
      "\xed\xa0\x80", # U+D800 (lowest surrogate)
      "\xed\xbf\xbf", # U+DFFF (highest surrogate)
    ]
    test_data.each { |string| assert_invalid("UTF-16 Surrogate Halves", string) }
  end

  # Invalid single bytes
  def test0530_invalid_single_bytes
    test_data = [
      "\xc0",
      "\xc1",
      "\xf5","\xf6","\xf7","\xf8","\xf9","\xfa","\xfb","\xfc",
      "\xfd","\xfe","\xff",
    ]
    test_data.each { |string| assert_invalid("Invalid single bytes", string) }
  end

  # Not shortest representation
  def test0540_not_shortest
    test_data = [
      "\xc0\x80",
      "\xe0\x80\x80",
      "\xf0\x80\x80\x80",
      "\xf8\x80\x80\x80\x80",
      "\xfc\x80\x80\x80\x80\x80",
    ]
    test_data.each { |string| assert_invalid("Not shortest", string) }
  end

  # Truncated last character
  def test0550_truncated_last
    TRUNCATED_SEQUENCES.each { |string| assert_invalid("truncated last", string) }
  end

  # Truncated in good text
  def test0560_truncated_in_good
    TRUNCATED_SEQUENCES.each do |string|
      assert_invalid("truncated in good", "a" + string + "b")
    end
  end

  # Miscellaneous Bad
  def test0570_miscellaneous_bad
    # perhaps some duplication here
    test_data = [
      "bad byte: \372",
      "\004\b{\f:\tbody\"\001\207\004\b{\b:\016statusmsg\"\aOK:\017statuscodei\000:\tdata{\t:\voutput\"3Enabled, not running, last run 693 seconds ago:\frunningi\000:\fenabledi\006:\flastrunl+\aE\021\022M:\rsenderid\"\032xx.xx.xx.xx:\016requestid\"%849d647bbe3e421ea19ac9f947bbdde4:\020senderagent\"\fpuppetd:\016msgtarget\"%/topic/mcollective.puppetd.reply:\thash\"\001\257ZdQqtaDmmdD0jZinnEcpN+YbkxQDn8uuCnwsQdvGHau6d+gxnnfPLUddWRSb\nZNMs+sQUXgJNfcV1eVBn1H+Z8QQmzYXVDMqz7J43jmgloz5PsLVbN9K3PmX/\ngszqV/WpvIyAqm98ennWqSzpwMuiCC4q2Jr3s3Gm6bUJ6UkKXnY=\n:\fmsgtimel+\a\372\023\022M",
      "\207",
      "\xf4\x90\x80\x80",
      "\xbf",
      "\xe0\x9f\xbf",
      "\xf0\x8f\xbf\xbf",
      "\xf8\x87\xbf\xbf\xbf",
      "\xfc\x83\xbf\xbf\xbf\xbf",
      "\xc0\x80",
      "\xe0\x80\x80",
      "\xf0\x80\x80\x80",
      "\xf8\x80\x80\x80\x80",
      "\xfc\x80\x80\x80\x80\x80",
      "\xed\xa0\x80",
      "\xed\xad\xbf",
      "\xed\xae\x80",
      "\xed\xaf\xbf",
      "\xed\xb0\x80",
      "\xed\xbe\x80",
      "\xed\xbf\xbf",
      "\xc0\x00", # too long for \x00
      "\xe0\x00\x00", # too long for \x00
      "\xf0\x00\x00\x00", # too long for \x00
    ]
    test_data.each { |string| assert_invalid("miscellaneous bad", string) }
  end

  private

  # True when this Ruby has String#force_encoding, i.e. when the validator's
  # verdict can be cross-checked against Ruby's own. Feature detection is
  # used rather than matching RUBY_VERSION against 1.9, so the cross-check
  # also runs on later Ruby versions.
  def native_check_available?
    "".respond_to?(:force_encoding)
  end

  # Ruby's own verdict on the bytes of +string+ when read as UTF-8. Works on
  # a copy so the caller's string keeps its encoding.
  def natively_valid?(string)
    string.dup.force_encoding("UTF-8").valid_encoding?
  end

  # Assert that the validator (and Ruby itself, when available) accepts
  # +string+. Failure messages show the string via #inspect so that
  # unprintable bytes stay readable.
  def assert_valid(label, string)
    assert @validator.valid_encoding?(string), "#{label}: #{string.inspect}"
    return unless native_check_available?
    assert natively_valid?(string), "#{label} 19: #{string.inspect}"
  end

  # Assert that the validator (and Ruby itself, when available) rejects
  # +string+.
  def assert_invalid(label, string)
    assert !@validator.valid_encoding?(string), "#{label}: #{string.inspect}"
    return unless native_check_available?
    assert !natively_valid?(string), "#{label} 19: #{string.inspect}"
  end
end
|
226
|
+
|
@@ -0,0 +1,69 @@
|
|
1
|
+
# Generated by jeweler
|
2
|
+
# DO NOT EDIT THIS FILE DIRECTLY
|
3
|
+
# Instead, edit Jeweler::Tasks in Rakefile, and run 'rake gemspec'
|
4
|
+
# -*- encoding: utf-8 -*-
|
5
|
+
|
6
|
+
Gem::Specification.new do |s|
|
7
|
+
s.name = %q{utf8_validator}
|
8
|
+
s.version = "0.0.1"
|
9
|
+
|
10
|
+
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
11
|
+
s.authors = ["Guy Allard"]
|
12
|
+
s.date = %q{2011-01-25}
|
13
|
+
s.description = %q{A State Machine implementation of a UTF-8 Encoding
|
14
|
+
Validation algorithm.}
|
15
|
+
s.email = %q{allard.guy.m@gmail.com}
|
16
|
+
s.extra_rdoc_files = [
|
17
|
+
"LICENSE.txt",
|
18
|
+
"README.rdoc"
|
19
|
+
]
|
20
|
+
s.files = [
|
21
|
+
".document",
|
22
|
+
"Gemfile",
|
23
|
+
"Gemfile.lock",
|
24
|
+
"LICENSE.txt",
|
25
|
+
"README.rdoc",
|
26
|
+
"Rakefile",
|
27
|
+
"VERSION",
|
28
|
+
"lib/utf8_validator.rb",
|
29
|
+
"lib/validation/errors.rb",
|
30
|
+
"lib/validation/validator.rb",
|
31
|
+
"test/helper.rb",
|
32
|
+
"test/test_raise_request.rb",
|
33
|
+
"test/test_utf8_validator.rb",
|
34
|
+
"utf8_validator.gemspec"
|
35
|
+
]
|
36
|
+
s.homepage = %q{http://github.com/gmallard/utf8_validator}
|
37
|
+
s.licenses = ["MIT"]
|
38
|
+
s.require_paths = ["lib"]
|
39
|
+
s.rubygems_version = %q{1.3.7}
|
40
|
+
s.summary = %q{A UTF-8 Encoding Validator.}
|
41
|
+
s.test_files = [
|
42
|
+
"test/helper.rb",
|
43
|
+
"test/test_raise_request.rb",
|
44
|
+
"test/test_utf8_validator.rb"
|
45
|
+
]
|
46
|
+
|
47
|
+
if s.respond_to? :specification_version then
|
48
|
+
current_version = Gem::Specification::CURRENT_SPECIFICATION_VERSION
|
49
|
+
s.specification_version = 3
|
50
|
+
|
51
|
+
if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
|
52
|
+
s.add_development_dependency(%q<bundler>, ["~> 1.0.0"])
|
53
|
+
s.add_development_dependency(%q<jeweler>, ["~> 1.5.2"])
|
54
|
+
s.add_development_dependency(%q<rcov>, [">= 0"])
|
55
|
+
s.add_development_dependency(%q<bundler>, [">= 2.1.2"])
|
56
|
+
else
|
57
|
+
s.add_dependency(%q<bundler>, ["~> 1.0.0"])
|
58
|
+
s.add_dependency(%q<jeweler>, ["~> 1.5.2"])
|
59
|
+
s.add_dependency(%q<rcov>, [">= 0"])
|
60
|
+
s.add_dependency(%q<bundler>, [">= 2.1.2"])
|
61
|
+
end
|
62
|
+
else
|
63
|
+
s.add_dependency(%q<bundler>, ["~> 1.0.0"])
|
64
|
+
s.add_dependency(%q<jeweler>, ["~> 1.5.2"])
|
65
|
+
s.add_dependency(%q<rcov>, [">= 0"])
|
66
|
+
s.add_dependency(%q<bundler>, [">= 2.1.2"])
|
67
|
+
end
|
68
|
+
end
|
69
|
+
|
metadata
ADDED
@@ -0,0 +1,140 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: utf8_validator
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
prerelease: false
|
5
|
+
segments:
|
6
|
+
- 0
|
7
|
+
- 0
|
8
|
+
- 1
|
9
|
+
version: 0.0.1
|
10
|
+
platform: ruby
|
11
|
+
authors:
|
12
|
+
- Guy Allard
|
13
|
+
autorequire:
|
14
|
+
bindir: bin
|
15
|
+
cert_chain: []
|
16
|
+
|
17
|
+
date: 2011-01-25 00:00:00 -05:00
|
18
|
+
default_executable:
|
19
|
+
dependencies:
|
20
|
+
- !ruby/object:Gem::Dependency
|
21
|
+
name: bundler
|
22
|
+
requirement: &id001 !ruby/object:Gem::Requirement
|
23
|
+
none: false
|
24
|
+
requirements:
|
25
|
+
- - ~>
|
26
|
+
- !ruby/object:Gem::Version
|
27
|
+
segments:
|
28
|
+
- 1
|
29
|
+
- 0
|
30
|
+
- 0
|
31
|
+
version: 1.0.0
|
32
|
+
type: :development
|
33
|
+
prerelease: false
|
34
|
+
version_requirements: *id001
|
35
|
+
- !ruby/object:Gem::Dependency
|
36
|
+
name: jeweler
|
37
|
+
requirement: &id002 !ruby/object:Gem::Requirement
|
38
|
+
none: false
|
39
|
+
requirements:
|
40
|
+
- - ~>
|
41
|
+
- !ruby/object:Gem::Version
|
42
|
+
segments:
|
43
|
+
- 1
|
44
|
+
- 5
|
45
|
+
- 2
|
46
|
+
version: 1.5.2
|
47
|
+
type: :development
|
48
|
+
prerelease: false
|
49
|
+
version_requirements: *id002
|
50
|
+
- !ruby/object:Gem::Dependency
|
51
|
+
name: rcov
|
52
|
+
requirement: &id003 !ruby/object:Gem::Requirement
|
53
|
+
none: false
|
54
|
+
requirements:
|
55
|
+
- - ">="
|
56
|
+
- !ruby/object:Gem::Version
|
57
|
+
segments:
|
58
|
+
- 0
|
59
|
+
version: "0"
|
60
|
+
type: :development
|
61
|
+
prerelease: false
|
62
|
+
version_requirements: *id003
|
63
|
+
- !ruby/object:Gem::Dependency
|
64
|
+
name: bundler
|
65
|
+
requirement: &id004 !ruby/object:Gem::Requirement
|
66
|
+
none: false
|
67
|
+
requirements:
|
68
|
+
- - ">="
|
69
|
+
- !ruby/object:Gem::Version
|
70
|
+
segments:
|
71
|
+
- 2
|
72
|
+
- 1
|
73
|
+
- 2
|
74
|
+
version: 2.1.2
|
75
|
+
type: :development
|
76
|
+
prerelease: false
|
77
|
+
version_requirements: *id004
|
78
|
+
description: |-
|
79
|
+
A State Machine implementation of a UTF-8 Encoding
|
80
|
+
Validation algorithm.
|
81
|
+
email: allard.guy.m@gmail.com
|
82
|
+
executables: []
|
83
|
+
|
84
|
+
extensions: []
|
85
|
+
|
86
|
+
extra_rdoc_files:
|
87
|
+
- LICENSE.txt
|
88
|
+
- README.rdoc
|
89
|
+
files:
|
90
|
+
- .document
|
91
|
+
- Gemfile
|
92
|
+
- Gemfile.lock
|
93
|
+
- LICENSE.txt
|
94
|
+
- README.rdoc
|
95
|
+
- Rakefile
|
96
|
+
- VERSION
|
97
|
+
- lib/utf8_validator.rb
|
98
|
+
- lib/validation/errors.rb
|
99
|
+
- lib/validation/validator.rb
|
100
|
+
- test/helper.rb
|
101
|
+
- test/test_raise_request.rb
|
102
|
+
- test/test_utf8_validator.rb
|
103
|
+
- utf8_validator.gemspec
|
104
|
+
has_rdoc: true
|
105
|
+
homepage: http://github.com/gmallard/utf8_validator
|
106
|
+
licenses:
|
107
|
+
- MIT
|
108
|
+
post_install_message:
|
109
|
+
rdoc_options: []
|
110
|
+
|
111
|
+
require_paths:
|
112
|
+
- lib
|
113
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
114
|
+
none: false
|
115
|
+
requirements:
|
116
|
+
- - ">="
|
117
|
+
- !ruby/object:Gem::Version
|
118
|
+
hash: -1120544117494340473
|
119
|
+
segments:
|
120
|
+
- 0
|
121
|
+
version: "0"
|
122
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
123
|
+
none: false
|
124
|
+
requirements:
|
125
|
+
- - ">="
|
126
|
+
- !ruby/object:Gem::Version
|
127
|
+
segments:
|
128
|
+
- 0
|
129
|
+
version: "0"
|
130
|
+
requirements: []
|
131
|
+
|
132
|
+
rubyforge_project:
|
133
|
+
rubygems_version: 1.3.7
|
134
|
+
signing_key:
|
135
|
+
specification_version: 3
|
136
|
+
summary: A UTF-8 Encoding Validator.
|
137
|
+
test_files:
|
138
|
+
- test/helper.rb
|
139
|
+
- test/test_raise_request.rb
|
140
|
+
- test/test_utf8_validator.rb
|