utf8_validator 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
data/.document ADDED
@@ -0,0 +1,5 @@
1
+ lib/**/*.rb
2
+ bin/*
3
+ -
4
+ features/**/*.feature
5
+ LICENSE.txt
data/Gemfile ADDED
@@ -0,0 +1,12 @@
1
+ source "http://rubygems.org"
2
+ # Add dependencies required to use your gem here.
3
+ # Example:
4
+ # gem "activesupport", ">= 2.3.5"
5
+
6
+ # Add dependencies to develop your gem here.
7
+ # Include everything needed to run rake, tests, features, etc.
8
+ group :development do
9
+ gem "bundler", "~> 1.0.0"
10
+ gem "jeweler", "~> 1.5.2"
11
+ gem "rcov", ">= 0"
12
+ end
data/Gemfile.lock ADDED
@@ -0,0 +1,18 @@
1
+ GEM
2
+ remote: http://rubygems.org/
3
+ specs:
4
+ git (1.2.5)
5
+ jeweler (1.5.2)
6
+ bundler (~> 1.0.0)
7
+ git (>= 1.2.5)
8
+ rake
9
+ rake (0.8.7)
10
+ rcov (0.9.9)
11
+
12
+ PLATFORMS
13
+ ruby
14
+
15
+ DEPENDENCIES
16
+ bundler (~> 1.0.0)
17
+ jeweler (~> 1.5.2)
18
+ rcov
data/LICENSE.txt ADDED
@@ -0,0 +1,20 @@
1
+ Copyright (c) 2011 Guy Allard
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining
4
+ a copy of this software and associated documentation files (the
5
+ "Software"), to deal in the Software without restriction, including
6
+ without limitation the rights to use, copy, modify, merge, publish,
7
+ distribute, sublicense, and/or sell copies of the Software, and to
8
+ permit persons to whom the Software is furnished to do so, subject to
9
+ the following conditions:
10
+
11
+ The above copyright notice and this permission notice shall be
12
+ included in all copies or substantial portions of the Software.
13
+
14
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
17
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
18
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
19
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
20
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.rdoc ADDED
@@ -0,0 +1,43 @@
1
+ = A UTF-8 Validator State Machine
2
+
3
+ Provides an implementation of a state machine for validating UTF-8 encoded strings. Clients may request that encoding errors be reported in one of two ways:
4
+
5
+ * simple true / false indicator
6
+ * a raised exception
7
+
8
+ == What This gem does Not Provide
9
+
10
+ * UTF-8 Encoding
11
+ * UTF-8 Decoding
12
+
13
+ That functionality is left as an exercise for the reader.
14
+
15
+ == Thanks To
16
+
17
+ The Unicode Consortium:: At http://unicode.org/ for all the information published there.
18
+ Frank Yung-Fong Tang:: For the state machine algorithm. See: http://unicode.org/mail-arch/unicode-ml/y2003-m02/att-0467/01-The_Algorithm_to_Valide_an_UTF-8_String
19
+
20
+ == A Word On Ruby Versions
21
+
22
+ It is expected that this validator will be used in Ruby environments prior to 1.9.x. However, nothing prohibits its use with Ruby 1.9.
23
+
24
+ == Reporting Issues
25
+
26
+ Please report issues on the tracker at github.
27
+
28
+ == Contributing to the utf8_validator gem
29
+
30
+ * Check out the latest master to make sure the feature hasn't been implemented or the bug hasn't been fixed yet.
31
+ * Check out the issue tracker to make sure someone hasn't already requested it and/or contributed it.
32
+ * Fork the project.
33
+ * Start a feature/bugfix branch.
34
+ * Commit and push until you are happy with your contribution.
35
+ * Make sure to add tests for it. This is important so it does not break in a future version unintentionally.
36
+ * Please try not to modify the Rakefile or VERSION file. If you require your own version please isolate the version update to its own commit so cherry-pick or rebase can be used to skip it.
37
+ * Request a pull.
38
+
39
+ == Copyright
40
+
41
+ Copyright (c) 2011 Guy Allard. See LICENSE.txt for
42
+ further details.
43
+
data/Rakefile ADDED
@@ -0,0 +1,56 @@
1
+ require 'rubygems'
2
+ require 'bundler'
3
+ begin
4
+ Bundler.setup(:default, :development)
5
+ rescue Bundler::BundlerError => e
6
+ $stderr.puts e.message
7
+ $stderr.puts "Run `bundle install` to install missing gems"
8
+ exit e.status_code
9
+ end
10
+ require 'rake'
11
+
12
+ require 'jeweler'
13
+ Jeweler::Tasks.new do |gem|
14
+ # gem is a Gem::Specification... see http://docs.rubygems.org/read/chapter/20 for more options
15
+ gem.name = "utf8_validator"
16
+ gem.homepage = "http://github.com/gmallard/utf8_validator"
17
+ gem.license = "MIT"
18
+ gem.summary = %Q{A UTF-8 Encoding Validator.}
19
+ gem.description = %Q{A State Machine implementation of a UTF-8 Encoding
20
+ Validation algorithm.}
21
+ gem.email = "allard.guy.m@gmail.com"
22
+ gem.authors = ["Guy Allard"]
23
+
24
+ # Include your dependencies below. Runtime dependencies are required when using your gem,
25
+ # and development dependencies are only needed for development (ie running rake tasks, tests, etc)
26
+ # gem.add_runtime_dependency 'jabber4r', '> 0.1'
27
+
28
+ gem.add_development_dependency 'bundler', '>= 2.1.2'
29
+ end
30
+ Jeweler::RubygemsDotOrgTasks.new
31
+
32
+ require 'rake/testtask'
33
+ Rake::TestTask.new(:test) do |test|
34
+ test.libs << 'lib' << 'test'
35
+ test.pattern = 'test/**/test_*.rb'
36
+ test.verbose = true
37
+ end
38
+
39
+ require 'rcov/rcovtask'
40
+ Rcov::RcovTask.new do |test|
41
+ test.libs << 'test'
42
+ test.pattern = 'test/**/test_*.rb'
43
+ test.verbose = true
44
+ end
45
+
46
+ task :default => :test
47
+
48
+ require 'rake/rdoctask'
49
+ Rake::RDocTask.new do |rdoc|
50
+ version = File.exist?('VERSION') ? File.read('VERSION') : ""
51
+
52
+ rdoc.rdoc_dir = 'rdoc'
53
+ rdoc.title = "UTF-8 Validator #{version}"
54
+ rdoc.rdoc_files.include('README*')
55
+ rdoc.rdoc_files.include('lib/**/*.rb')
56
+ end
data/VERSION ADDED
@@ -0,0 +1 @@
1
+ 0.0.1
@@ -0,0 +1,4 @@
1
+ #
2
+ require 'validation/validator'
3
+ require 'validation/errors'
4
+
@@ -0,0 +1,14 @@
1
module UTF8
  #
  # == Purpose
  #
  # General UTF-8 validation error class. Clients that raise this error
  # should supply their own message; the default text is used only when
  # no message (or a nil message) is given.
  #
  class ValidationError < ::RuntimeError
    #
    # Text reported when the error is raised without a message.
    #
    DEFAULT_MESSAGE = "general UTF-8 validation error"

    #
    # Parameters:
    #
    # msg:: the text to report; falls back to DEFAULT_MESSAGE
    #
    # The default is installed here rather than by overriding #message,
    # because an overridden #message would hide the text supplied to
    # +raise+ from every caller.
    #
    def initialize(msg = DEFAULT_MESSAGE)
      super(msg || DEFAULT_MESSAGE)
    end
  end
end
@@ -0,0 +1,284 @@
1
+ # encoding: utf-8
2
+ #
3
+ =begin
4
+
5
+ http://unicode.org/mail-arch/unicode-ml/y2003-m02/att-0467/01-The_Algorithm_to_Valide_an_UTF-8_String
6
+
7
+ * state START
8
+
9
+ * Input = 0x00-0x7F : change state to START
10
+ * Input = 0xC2-0xDF: change state to A
11
+ * Input = 0xE1-0xEC, 0xEE-0xEF: change state to B
12
+ * Input = 0xE0: change state to C
13
+ * Input = 0xED: change state to D
14
+ * Input = 0xF1-0xF3:change state to E
15
+ * Input = 0xF0: change state to F
16
+ * Input = 0xF4: change state to G
17
+ * Input = Others (0x80-0xBF,0xC0-0xC1, 0xF5-0xFF): ERROR
18
+
19
+ * state A
20
+ o Input = 0x80-0xBF: change state to START
21
+ o Others: ERROR
22
+ * state B
23
+ o Input = 0x80-0xBF: change state to A
24
+ o Others: ERROR
25
+ * state C
26
+ o Input = 0xA0-0xBF: change state to A
27
+ o Others: ERROR
28
+ * state D
29
+ o Input = 0x80-0x9F: change state to A
30
+ o Others: ERROR
31
+ * state E
32
+ o Input = 0x80-0xBF: change state to B
33
+ o Others: ERROR
34
+ * state F
35
+ o Input = 0x90-0xBF: change state to B
36
+ o Others: ERROR
37
+ * state G
38
+ o Input = 0x80-0x8F: change state to B
39
+ o Others: ERROR
40
+
41
+ This state machine can be easily understood by:
42
+
43
+ a) examining the machine behavior as documented
44
+ b) reference to an excellent UTF-8 article with accompanying table here:
45
+
46
+ http://en.wikipedia.org/wiki/UTF-8
47
+
48
+ =end
49
+ #
50
+ # == Purpose
51
+ #
52
+ # Container for UTF-8 validator.
53
+ #
54
module UTF8
  #
  # == Purpose
  #
  # Validate UTF-8, primarily in Ruby environments other than 1.9.
  #
  # The validator walks the input one byte at a time through a small state
  # machine. Each state lists the byte values it accepts and the state that
  # follows; any byte not listed makes the input invalid.
  #
  # #valid_encoding? keeps all of its working data in local variables, so a
  # single instance may be used by multiple concurrent threads, with one
  # caveat:
  #
  # The value of Validator::DEBUG must not be changed by any thread.
  #
  #--
  # Copyright (c) 2011 Guy Allard
  #
  class Validator
    #
    # For use during development only. When true, a trace of every byte and
    # state is written to standard output.
    #
    DEBUG = false

    #
    # State transition table: state name => ordered list of
    # [accepted byte(s), next state]. An entry matches when
    # <tt>accepted === byte</tt>, so both Ranges and single Integers work.
    #
    # "start":: between characters; accepts ASCII and every legal lead byte
    # "a"::     expects the last continuation byte of a 2, 3 or 4 byte character
    # "b"::     expects the second-to-last continuation byte
    # "c"::     after lead byte 0xE0 (second byte restricted to 0xA0-0xBF)
    # "d"::     after lead byte 0xED (second byte restricted to 0x80-0x9F)
    # "e"::     after lead bytes 0xF1-0xF3
    # "f"::     after lead byte 0xF0 (second byte restricted to 0x90-0xBF)
    # "g"::     after lead byte 0xF4 (second byte restricted to 0x80-0x8F)
    #
    # The order of the "start" entries is significant only for the DEBUG
    # trace, which reports the 1-based position of the matching entry.
    #
    TRANSITIONS = {
      "start" => [
        [(0x00..0x7f), "start"], # ASCII
        [(0xc2..0xdf), "a"],     # start byte of two byte characters
        [(0xe1..0xec), "b"],     # start byte of some three byte characters
        [(0xee..0xef), "b"],
        [0xe0,         "c"],     # start byte of special three byte characters
        [0xed,         "d"],     # start byte of the remaining three byte characters
        [(0xf1..0xf3), "e"],     # start byte of some four byte characters
        [0xf0,         "f"],     # start byte of special four byte characters
        [0xf4,         "g"],     # start byte of very special four byte characters
      ],
      "a" => [[(0x80..0xbf), "start"]],
      "b" => [[(0x80..0xbf), "a"]],
      "c" => [[(0xa0..0xbf), "a"]],
      "d" => [[(0x80..0x9f), "a"]],
      "e" => [[(0x80..0xbf), "b"]],
      "f" => [[(0x90..0xbf), "b"]],
      "g" => [[(0x80..0x8f), "b"]],
    }.freeze

    #
    # Validate the supplied string for proper UTF-8 encoding.
    #
    # Calling Sequence:
    #
    #  validator.valid_encoding?(string) -> true or false
    #  validator.valid_encoding?(string, raise_on_error) -> true or exception
    #
    # Parameters:
    #
    # string:: the string to validate
    # raise_on_error:: a flag to indicate failure behavior
    #
    # Returns true when every byte is consumed and the machine ends in the
    # "start" state; false otherwise. An empty string is valid.
    #
    # Raises ValidationError instead of returning false when raise_on_error
    # is true. The message names the last byte examined and its index; for
    # input truncated in the middle of a character that is the final byte
    # of the string.
    #
    def valid_encoding?(string, raise_on_error = false)
      valid = true
      index = -1
      last_byte = nil
      state = "start"
      #
      string.bytes.each do |next_byte|
        index += 1
        last_byte = next_byte
        # Hex strings are built only when a trace line actually needs them.
        puts "Top: #{next_byte}(0x#{to_hex(next_byte)}), index: #{index}(0x#{to_hex(index)})" if DEBUG
        puts "state: #{state}" if DEBUG
        rules = TRANSITIONS[state]
        hit = rules.index { |accepted, _target| accepted === next_byte }
        if hit.nil?
          # No entry accepts this byte in the current state.
          valid = false
          break
        end
        puts "state: start #{hit + 1}" if DEBUG && state == "start"
        state = rules[hit][1]
      end
      #
      puts "State at end: #{state}" if DEBUG
      # Catch truncation at end of string
      valid = false if valid && state != "start"
      #
      if !valid && raise_on_error
        raise ValidationError, "Invalid byte:#{last_byte}(0x#{to_hex(last_byte)}),index:#{index}(0x#{to_hex(index)})"
      end
      #
      valid
    end # of valid_encoding?

    private

    #
    # Lower-case hexadecimal text for an integer, without a prefix.
    #
    def to_hex(value)
      sprintf "%x", value
    end
  end # of class
end # of module
data/test/helper.rb ADDED
@@ -0,0 +1,17 @@
1
+ require 'rubygems'
2
+ require 'bundler'
3
+ begin
4
+ Bundler.setup(:default, :development)
5
+ rescue Bundler::BundlerError => e
6
+ $stderr.puts e.message
7
+ $stderr.puts "Run `bundle install` to install missing gems"
8
+ exit e.status_code
9
+ end
10
+ require 'test/unit'
11
+
12
+ $LOAD_PATH.unshift(File.dirname(__FILE__))
13
+ $LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib'))
14
+ require 'utf8_validator'
15
+
16
+ class Test::Unit::TestCase
17
+ end
@@ -0,0 +1,38 @@
1
+ # encoding: utf-8
2
+ #
3
+ require 'helper'
4
+
5
+ #
6
+ # == Purpose
7
+ #
8
+ # Test raise_on_error functionality.
9
+ #
10
class TestRaiseRequect < Test::Unit::TestCase
  # Fresh validator for every test.
  def setup
    @validator = UTF8::Validator.new
  end

  # Drop the validator reference after every test.
  def teardown
    @validator = nil
  end

  # ISO-8859-1 characters: a lone 0x80 byte is never valid UTF-8
  def test_0010_check_raise_iso
    assert_raise(UTF8::ValidationError) do
      @validator.valid_encoding?(0x80.chr, true)
    end
  end

  # A regular old bad byte in the middle of good text
  def test_0020_check_raise_badbyte
    assert_raise(UTF8::ValidationError) do
      @validator.valid_encoding?("a" + 0xff.chr + "b", true)
    end
  end

  # Valid input must return true, not raise, when raise_on_error is set
  def test_0030_no_raise_on_valid
    assert @validator.valid_encoding?("", true), "empty string"
    assert @validator.valid_encoding?("abc", true), "ASCII"
    assert @validator.valid_encoding?("\xc2\x80", true), "2 byte character"
  end
end
38
+
@@ -0,0 +1,226 @@
1
+ # encoding: utf-8
2
+ #
3
+ require 'helper'
4
+
5
+ #
6
+ # == Purpose
7
+ #
8
+ # Tests for the #{UTF8::Validator} implementation.
9
+ #
10
class TestUtf8Validator < Test::Unit::TestCase
  # Fresh validator for every test.
  def setup
    @validator = UTF8::Validator.new
  end

  # Drop the validator reference after every test.
  def teardown
    @validator = nil
  end

  # The validator can be instantiated.
  def test_0010_linkages
    assert_not_nil @validator
  end

  #--
  # Validation should succeed for the following tests
  #--

  # Some simple ASCII characters
  def test_0020_simple_ascii
    test_data = [
      "a",
      "abcdefghijjlmnopqrstuvwxyz",
      "\x00",
    ]
    assert_all_valid(test_data, "Simple ASCII")
  end

  # All ASCII
  def test_0030_all_ascii
    assert_all_valid((0..127).map { |i| i.chr }, "All ASCII")
  end

  # Simple UTF8 - 2 byte characters
  def test_0040_simple_utf8_2byte
    test_data = [
      "\xc2\x80",
      "\xc2\xbf",
      "\xdf\x80",
      "\xdf\xbf",
    ]
    assert_all_valid(test_data, "Simple UTF-8, 2bytes")
  end

  # Simple UTF8 - 3 byte characters
  def test_0050_simple_utf8_3byte
    test_data = [
      "\xe0\xa0\x80",
      "\xe0\xbf\x80",
      "\xe0\xa0\xbf",
      "\xe0\xbf\xbf",
    ]
    assert_all_valid(test_data, "Simple UTF-8, 3bytes")
  end

  # Simple UTF8 - 4 byte characters
  def test_0060_simple_utf8_4byte
    test_data = [
      "\xf1\x80\x80\x80",
      "\xf1\xbf\xbf\xbf",
      "\xf2\x80\x80\x80",
      "\xf2\xbf\xbf\xbf",
      "\xf3\x80\x80\x80",
      "\xf3\xbf\xbf\xbf",
    ]
    assert_all_valid(test_data, "Simple UTF-8, 4bytes")
  end

  #--
  # Validation should fail for the following tests
  #--

  # ISO-8859-1 (C1 points)
  def test0510_iso_8859_1
    assert_all_invalid((0x80..0x9f).map { |i| i.chr }, "ISO-8859-1")
  end

  # UTF-16 Surrogate Halves
  def test0520_utf16_surrogate_halves
    test_data = [
      "\xed\xa0\x80", # U+D800 (lowest surrogate)
      "\xed\xbf\xbf", # U+DFFF (highest surrogate)
    ]
    assert_all_invalid(test_data, "UTF-16 Surrogate Halves")
  end

  # Invalid single bytes
  def test0530_invalid_single_bytes
    test_data = [
      "\xc0",
      "\xc1",
      "\xf5","\xf6","\xf7","\xf8","\xf9","\xfa","\xfb","\xfc",
      "\xfd","\xfe","\xff",
    ]
    assert_all_invalid(test_data, "Invalid single bytes")
  end

  # Not shortest representation
  def test0540_not_shortest
    test_data = [
      "\xc0\x80",
      "\xe0\x80\x80",
      "\xf0\x80\x80\x80",
      "\xf8\x80\x80\x80\x80",
      "\xfc\x80\x80\x80\x80\x80",
    ]
    assert_all_invalid(test_data, "Not shortest")
  end

  # Truncated last character
  def test0550_truncated_last
    assert_all_invalid(truncated_characters, "truncated last")
  end

  # Truncated in good text
  def test0560_truncated_in_good
    test_data = truncated_characters.map { |string| "a" + string + "b" }
    assert_all_invalid(test_data, "truncated in good")
  end

  # Miscellaneous Bad
  def test0570_miscellaneous_bad
    # perhaps some duplication here
    test_data = [
      "bad byte: \372",
      "\004\b{\f:\tbody\"\001\207\004\b{\b:\016statusmsg\"\aOK:\017statuscodei\000:\tdata{\t:\voutput\"3Enabled, not running, last run 693 seconds ago:\frunningi\000:\fenabledi\006:\flastrunl+\aE\021\022M:\rsenderid\"\032xx.xx.xx.xx:\016requestid\"%849d647bbe3e421ea19ac9f947bbdde4:\020senderagent\"\fpuppetd:\016msgtarget\"%/topic/mcollective.puppetd.reply:\thash\"\001\257ZdQqtaDmmdD0jZinnEcpN+YbkxQDn8uuCnwsQdvGHau6d+gxnnfPLUddWRSb\nZNMs+sQUXgJNfcV1eVBn1H+Z8QQmzYXVDMqz7J43jmgloz5PsLVbN9K3PmX/\ngszqV/WpvIyAqm98ennWqSzpwMuiCC4q2Jr3s3Gm6bUJ6UkKXnY=\n:\fmsgtimel+\a\372\023\022M",
      "\207",
      "\xf4\x90\x80\x80",
      "\xbf",
      "\xe0\x9f\xbf",
      "\xf0\x8f\xbf\xbf",
      "\xf8\x87\xbf\xbf\xbf",
      "\xfc\x83\xbf\xbf\xbf\xbf",
      "\xc0\x80",
      "\xe0\x80\x80",
      "\xf0\x80\x80\x80",
      "\xf8\x80\x80\x80\x80",
      "\xfc\x80\x80\x80\x80\x80",
      "\xed\xa0\x80",
      "\xed\xad\xbf",
      "\xed\xae\x80",
      "\xed\xaf\xbf",
      "\xed\xb0\x80",
      "\xed\xbe\x80",
      "\xed\xbf\xbf",
      "\xc0\x00", # too long for \x00
      "\xe0\x00\x00", # too long for \x00
      "\xf0\x00\x00\x00", # too long for \x00
    ]
    assert_all_invalid(test_data, "miscellaneous bad")
  end

  private

  # Multi-byte characters with their final byte(s) missing.
  def truncated_characters
    [
      "\xc2", # truncated 2 byte characters
      "\xdf",
      "\xe0\xa0", # truncated 3 byte characters
      "\xe0\xbf",
      "\xf1\x80\x80", # truncated 4 byte characters
      "\xf1\xbf\xbf",
      "\xf2\x80\x80",
      "\xf2\xbf\xbf",
      "\xf3\x80\x80",
      "\xf3\xbf\xbf",
    ]
  end

  # True when this interpreter has its own String#valid_encoding? to
  # cross-check against. Detected by capability rather than by matching
  # RUBY_VERSION, so the cross-check also runs on Ruby 2.x and later.
  def native_check?
    "".respond_to?(:force_encoding)
  end

  # Result of the interpreter's own UTF-8 check. Works on a copy so the
  # test data (often string literals) is never mutated.
  def natively_valid?(string)
    string.dup.force_encoding("UTF-8").valid_encoding?
  end

  # Every string must be accepted by the validator and, where available,
  # by the interpreter's own check.
  def assert_all_valid(strings, label)
    strings.each do |string|
      assert @validator.valid_encoding?(string), "#{label}: #{string.inspect}"
      assert natively_valid?(string), "#{label} 19: #{string.inspect}" if native_check?
    end
  end

  # Every string must be rejected by the validator and, where available,
  # by the interpreter's own check.
  def assert_all_invalid(strings, label)
    strings.each do |string|
      assert !@validator.valid_encoding?(string), "#{label}: #{string.inspect}"
      assert !natively_valid?(string), "#{label} 19: #{string.inspect}" if native_check?
    end
  end
end
226
+
@@ -0,0 +1,69 @@
1
+ # Generated by jeweler
2
+ # DO NOT EDIT THIS FILE DIRECTLY
3
+ # Instead, edit Jeweler::Tasks in Rakefile, and run 'rake gemspec'
4
+ # -*- encoding: utf-8 -*-
5
+
6
+ Gem::Specification.new do |s|
7
+ s.name = %q{utf8_validator}
8
+ s.version = "0.0.1"
9
+
10
+ s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
11
+ s.authors = ["Guy Allard"]
12
+ s.date = %q{2011-01-25}
13
+ s.description = %q{A State Machine implementation of a UTF-8 Encoding
14
+ Validation algorithm.}
15
+ s.email = %q{allard.guy.m@gmail.com}
16
+ s.extra_rdoc_files = [
17
+ "LICENSE.txt",
18
+ "README.rdoc"
19
+ ]
20
+ s.files = [
21
+ ".document",
22
+ "Gemfile",
23
+ "Gemfile.lock",
24
+ "LICENSE.txt",
25
+ "README.rdoc",
26
+ "Rakefile",
27
+ "VERSION",
28
+ "lib/utf8_validator.rb",
29
+ "lib/validation/errors.rb",
30
+ "lib/validation/validator.rb",
31
+ "test/helper.rb",
32
+ "test/test_raise_request.rb",
33
+ "test/test_utf8_validator.rb",
34
+ "utf8_validator.gemspec"
35
+ ]
36
+ s.homepage = %q{http://github.com/gmallard/utf8_validator}
37
+ s.licenses = ["MIT"]
38
+ s.require_paths = ["lib"]
39
+ s.rubygems_version = %q{1.3.7}
40
+ s.summary = %q{A UTF-8 Encoding Validator.}
41
+ s.test_files = [
42
+ "test/helper.rb",
43
+ "test/test_raise_request.rb",
44
+ "test/test_utf8_validator.rb"
45
+ ]
46
+
47
+ if s.respond_to? :specification_version then
48
+ current_version = Gem::Specification::CURRENT_SPECIFICATION_VERSION
49
+ s.specification_version = 3
50
+
51
+ if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
52
+ s.add_development_dependency(%q<bundler>, ["~> 1.0.0"])
53
+ s.add_development_dependency(%q<jeweler>, ["~> 1.5.2"])
54
+ s.add_development_dependency(%q<rcov>, [">= 0"])
55
+ s.add_development_dependency(%q<bundler>, [">= 2.1.2"])
56
+ else
57
+ s.add_dependency(%q<bundler>, ["~> 1.0.0"])
58
+ s.add_dependency(%q<jeweler>, ["~> 1.5.2"])
59
+ s.add_dependency(%q<rcov>, [">= 0"])
60
+ s.add_dependency(%q<bundler>, [">= 2.1.2"])
61
+ end
62
+ else
63
+ s.add_dependency(%q<bundler>, ["~> 1.0.0"])
64
+ s.add_dependency(%q<jeweler>, ["~> 1.5.2"])
65
+ s.add_dependency(%q<rcov>, [">= 0"])
66
+ s.add_dependency(%q<bundler>, [">= 2.1.2"])
67
+ end
68
+ end
69
+
metadata ADDED
@@ -0,0 +1,140 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: utf8_validator
3
+ version: !ruby/object:Gem::Version
4
+ prerelease: false
5
+ segments:
6
+ - 0
7
+ - 0
8
+ - 1
9
+ version: 0.0.1
10
+ platform: ruby
11
+ authors:
12
+ - Guy Allard
13
+ autorequire:
14
+ bindir: bin
15
+ cert_chain: []
16
+
17
+ date: 2011-01-25 00:00:00 -05:00
18
+ default_executable:
19
+ dependencies:
20
+ - !ruby/object:Gem::Dependency
21
+ name: bundler
22
+ requirement: &id001 !ruby/object:Gem::Requirement
23
+ none: false
24
+ requirements:
25
+ - - ~>
26
+ - !ruby/object:Gem::Version
27
+ segments:
28
+ - 1
29
+ - 0
30
+ - 0
31
+ version: 1.0.0
32
+ type: :development
33
+ prerelease: false
34
+ version_requirements: *id001
35
+ - !ruby/object:Gem::Dependency
36
+ name: jeweler
37
+ requirement: &id002 !ruby/object:Gem::Requirement
38
+ none: false
39
+ requirements:
40
+ - - ~>
41
+ - !ruby/object:Gem::Version
42
+ segments:
43
+ - 1
44
+ - 5
45
+ - 2
46
+ version: 1.5.2
47
+ type: :development
48
+ prerelease: false
49
+ version_requirements: *id002
50
+ - !ruby/object:Gem::Dependency
51
+ name: rcov
52
+ requirement: &id003 !ruby/object:Gem::Requirement
53
+ none: false
54
+ requirements:
55
+ - - ">="
56
+ - !ruby/object:Gem::Version
57
+ segments:
58
+ - 0
59
+ version: "0"
60
+ type: :development
61
+ prerelease: false
62
+ version_requirements: *id003
63
+ - !ruby/object:Gem::Dependency
64
+ name: bundler
65
+ requirement: &id004 !ruby/object:Gem::Requirement
66
+ none: false
67
+ requirements:
68
+ - - ">="
69
+ - !ruby/object:Gem::Version
70
+ segments:
71
+ - 2
72
+ - 1
73
+ - 2
74
+ version: 2.1.2
75
+ type: :development
76
+ prerelease: false
77
+ version_requirements: *id004
78
+ description: |-
79
+ A State Machine implementation of a UTF-8 Encoding
80
+ Validation algorithm.
81
+ email: allard.guy.m@gmail.com
82
+ executables: []
83
+
84
+ extensions: []
85
+
86
+ extra_rdoc_files:
87
+ - LICENSE.txt
88
+ - README.rdoc
89
+ files:
90
+ - .document
91
+ - Gemfile
92
+ - Gemfile.lock
93
+ - LICENSE.txt
94
+ - README.rdoc
95
+ - Rakefile
96
+ - VERSION
97
+ - lib/utf8_validator.rb
98
+ - lib/validation/errors.rb
99
+ - lib/validation/validator.rb
100
+ - test/helper.rb
101
+ - test/test_raise_request.rb
102
+ - test/test_utf8_validator.rb
103
+ - utf8_validator.gemspec
104
+ has_rdoc: true
105
+ homepage: http://github.com/gmallard/utf8_validator
106
+ licenses:
107
+ - MIT
108
+ post_install_message:
109
+ rdoc_options: []
110
+
111
+ require_paths:
112
+ - lib
113
+ required_ruby_version: !ruby/object:Gem::Requirement
114
+ none: false
115
+ requirements:
116
+ - - ">="
117
+ - !ruby/object:Gem::Version
118
+ hash: -1120544117494340473
119
+ segments:
120
+ - 0
121
+ version: "0"
122
+ required_rubygems_version: !ruby/object:Gem::Requirement
123
+ none: false
124
+ requirements:
125
+ - - ">="
126
+ - !ruby/object:Gem::Version
127
+ segments:
128
+ - 0
129
+ version: "0"
130
+ requirements: []
131
+
132
+ rubyforge_project:
133
+ rubygems_version: 1.3.7
134
+ signing_key:
135
+ specification_version: 3
136
+ summary: A UTF-8 Encoding Validator.
137
+ test_files:
138
+ - test/helper.rb
139
+ - test/test_raise_request.rb
140
+ - test/test_utf8_validator.rb