utf8_validator 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/.document ADDED
@@ -0,0 +1,5 @@
1
+ lib/**/*.rb
2
+ bin/*
3
+ -
4
+ features/**/*.feature
5
+ LICENSE.txt
data/Gemfile ADDED
@@ -0,0 +1,12 @@
1
+ source "http://rubygems.org"
2
+ # Add dependencies required to use your gem here.
3
+ # Example:
4
+ # gem "activesupport", ">= 2.3.5"
5
+
6
+ # Add dependencies to develop your gem here.
7
+ # Include everything needed to run rake, tests, features, etc.
8
+ group :development do
9
+ gem "bundler", "~> 1.0.0"
10
+ gem "jeweler", "~> 1.5.2"
11
+ gem "rcov", ">= 0"
12
+ end
data/Gemfile.lock ADDED
@@ -0,0 +1,18 @@
1
+ GEM
2
+ remote: http://rubygems.org/
3
+ specs:
4
+ git (1.2.5)
5
+ jeweler (1.5.2)
6
+ bundler (~> 1.0.0)
7
+ git (>= 1.2.5)
8
+ rake
9
+ rake (0.8.7)
10
+ rcov (0.9.9)
11
+
12
+ PLATFORMS
13
+ ruby
14
+
15
+ DEPENDENCIES
16
+ bundler (~> 1.0.0)
17
+ jeweler (~> 1.5.2)
18
+ rcov
data/LICENSE.txt ADDED
@@ -0,0 +1,20 @@
1
+ Copyright (c) 2011 Guy Allard
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining
4
+ a copy of this software and associated documentation files (the
5
+ "Software"), to deal in the Software without restriction, including
6
+ without limitation the rights to use, copy, modify, merge, publish,
7
+ distribute, sublicense, and/or sell copies of the Software, and to
8
+ permit persons to whom the Software is furnished to do so, subject to
9
+ the following conditions:
10
+
11
+ The above copyright notice and this permission notice shall be
12
+ included in all copies or substantial portions of the Software.
13
+
14
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
17
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
18
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
19
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
20
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.rdoc ADDED
@@ -0,0 +1,43 @@
1
+ = A UTF-8 Validator State Machine
2
+
3
+ Provides an implementation of a state machine for validating UTF-8 encoded strings. Clients may request that encoding errors be reported in several ways:
4
+
5
+ * simple true / false indicator
6
+ * a raised exception
7
+
8
+ == What This gem does Not Provide
9
+
10
+ * UTF-8 Encoding
11
+ * UTF-8 Decoding
12
+
13
+ That functionality is left as an exercise for the reader.
14
+
15
+ == Thanks To
16
+
17
+ The Unicode Consortium:: At http://unicode.org/ for all the information published there.
18
+ Frank Yung-Fong Tang:: For the state machine algorithm. See: http://unicode.org/mail-arch/unicode-ml/y2003-m02/att-0467/01-The_Algorithm_to_Valide_an_UTF-8_String
19
+
20
+ == A Word On Ruby Versions
21
+
22
+ It is expected that this validator will be used in Ruby environments prior to 1.9.x. However, nothing prohibits its use with Ruby 1.9.
23
+
24
+ == Reporting Issues
25
+
26
+ Please report issues on the tracker at github.
27
+
28
+ == Contributing to the utf8_validator gem
29
+
30
+ * Check out the latest master to make sure the feature hasn't been implemented or the bug hasn't been fixed yet.
31
+ * Check out the issue tracker to make sure no one has already requested it and/or contributed it.
32
+ * Fork the project.
33
+ * Start a feature/bugfix branch.
34
+ * Commit and push until you are happy with your contribution.
35
+ * Make sure to add tests for it. This is important so it does not break in a future version unintentionally.
36
+ * Please try not to modify the Rakefile or VERSION file. If you require your own version please isolate the version update to its own commit so cherry-pick or rebase can be used to skip it.
37
+ * Request a pull.
38
+
39
+ == Copyright
40
+
41
+ Copyright (c) 2011 Guy Allard. See LICENSE.txt for
42
+ further details.
43
+
data/Rakefile ADDED
@@ -0,0 +1,56 @@
# Build, test, coverage and documentation tasks for the utf8_validator gem.

require 'rubygems'
require 'bundler'
begin
  # Activate the gems listed in the Gemfile before anything else is required.
  Bundler.setup(:default, :development)
rescue Bundler::BundlerError => e
  $stderr.puts e.message
  $stderr.puts "Run `bundle install` to install missing gems"
  exit e.status_code
end
require 'rake'

require 'jeweler'
Jeweler::Tasks.new do |gem|
  # gem is a Gem::Specification... see http://docs.rubygems.org/read/chapter/20 for more options
  gem.name = "utf8_validator"
  gem.homepage = "http://github.com/gmallard/utf8_validator"
  gem.license = "MIT"
  gem.summary = %Q{A UTF-8 Encoding Validator.}
  gem.description = %Q{A State Machine implementation of a UTF-8 Encoding
Validation algorithm.}
  gem.email = "allard.guy.m@gmail.com"
  gem.authors = ["Guy Allard"]

  # Include your dependencies below. Runtime dependencies are required when using your gem,
  # and development dependencies are only needed for development (ie running rake tasks, tests, etc)
  # gem.add_runtime_dependency 'jabber4r', '> 0.1'
  #
  # Development dependencies (bundler, jeweler, rcov) come from the Gemfile's
  # :development group and already appear in the generated gemspec. Do not
  # declare bundler again here: a second requirement such as '>= 2.1.2'
  # contradicts the Gemfile's '~> 1.0.0' and makes the gemspec's development
  # dependencies impossible to satisfy.
end
Jeweler::RubygemsDotOrgTasks.new

# Unit tests: every test/**/test_*.rb file, with lib/ and test/ on the load path.
require 'rake/testtask'
Rake::TestTask.new(:test) do |test|
  test.libs << 'lib' << 'test'
  test.pattern = 'test/**/test_*.rb'
  test.verbose = true
end

# Coverage run over the same test files.
require 'rcov/rcovtask'
Rcov::RcovTask.new do |test|
  test.libs << 'test'
  test.pattern = 'test/**/test_*.rb'
  test.verbose = true
end

task :default => :test

# RDoc generation; the title carries the contents of VERSION when that file exists.
require 'rake/rdoctask'
Rake::RDocTask.new do |rdoc|
  version = File.exist?('VERSION') ? File.read('VERSION') : ""

  rdoc.rdoc_dir = 'rdoc'
  rdoc.title = "UTF-8 Validator #{version}"
  rdoc.rdoc_files.include('README*')
  rdoc.rdoc_files.include('lib/**/*.rb')
end
data/VERSION ADDED
@@ -0,0 +1 @@
1
+ 0.0.1
@@ -0,0 +1,4 @@
1
+ #
2
+ require 'validation/validator'
3
+ require 'validation/errors'
4
+
@@ -0,0 +1,14 @@
module UTF8
  #
  # == Purpose
  #
  # General UTF-8 validation error class. Clients that raise this error
  # should override the default message by passing their own text to +raise+.
  #
  class ValidationError < ::RuntimeError
    #
    # Message reported when the raiser supplies none.
    #
    DEFAULT_MESSAGE = "general UTF-8 validation error"

    #
    # Parameters:
    #
    # message:: the text describing the failure; defaults to DEFAULT_MESSAGE
    #
    # The default lives here rather than in an overridden #message, because
    # overriding #message would discard whatever text the raiser provides.
    #
    def initialize(message = DEFAULT_MESSAGE)
      super(message || DEFAULT_MESSAGE)
    end
  end
end
@@ -0,0 +1,284 @@
# encoding: utf-8
#
# The validator is a state machine following the algorithm described at:
#
# http://unicode.org/mail-arch/unicode-ml/y2003-m02/att-0467/01-The_Algorithm_to_Valide_an_UTF-8_String
#
# * state START
#   * Input = 0x00-0x7F: change state to START
#   * Input = 0xC2-0xDF: change state to A
#   * Input = 0xE1-0xEC, 0xEE-0xEF: change state to B
#   * Input = 0xE0: change state to C
#   * Input = 0xED: change state to D
#   * Input = 0xF1-0xF3: change state to E
#   * Input = 0xF0: change state to F
#   * Input = 0xF4: change state to G
#   * Input = Others (0x80-0xBF, 0xC0-0xC1, 0xF5-0xFF): ERROR
# * state A
#   * Input = 0x80-0xBF: change state to START
#   * Others: ERROR
# * state B
#   * Input = 0x80-0xBF: change state to A
#   * Others: ERROR
# * state C
#   * Input = 0xA0-0xBF: change state to A
#   * Others: ERROR
# * state D
#   * Input = 0x80-0x9F: change state to A
#   * Others: ERROR
# * state E
#   * Input = 0x80-0xBF: change state to B
#   * Others: ERROR
# * state F
#   * Input = 0x90-0xBF: change state to B
#   * Others: ERROR
# * state G
#   * Input = 0x80-0x8F: change state to B
#   * Others: ERROR
#
# The machine is easiest to follow next to the byte-layout table in:
#
# http://en.wikipedia.org/wiki/UTF-8

#
# == Purpose
#
# Container for UTF-8 validator.
#
module UTF8
  #
  # == Purpose
  #
  # Validate UTF-8, primarily in Ruby environments other than 1.9.
  #
  # Instances of this class are thread safe, and a single instance may be used
  # safely by multiple concurrent threads, with one caveat:
  #
  # The value of #{Validator::DEBUG} must not be changed by any thread.
  #
  #--
  # Copyright (c) 2011 Guy Allard
  #
  class Validator
    #
    # For use during development only.
    #
    DEBUG=false

    #
    # Transitions out of the 'start' state, tried in order. Each entry is
    # [range of accepted bytes, next state]. A byte matching no entry
    # (0x80-0xBF, 0xC0-0xC1, 0xF5-0xFF) is an error.
    #
    START_TRANSITIONS = [
      [(0x00..0x7f), :start], # ASCII
      [(0xc2..0xdf), :a],     # start byte of two byte characters
      [(0xe1..0xec), :b],     # start byte of some three byte characters
      [(0xee..0xef), :b],     # start byte of some three byte characters
      [(0xe0..0xe0), :c],     # start byte of special three byte characters
      [(0xed..0xed), :d],     # start byte of the remaining three byte characters
      [(0xf1..0xf3), :e],     # start byte of some four byte characters
      [(0xf0..0xf0), :f],     # start byte of special four byte characters
      [(0xf4..0xf4), :g],     # start byte of very special four byte characters
    ].freeze

    #
    # Transitions out of the continuation states. Each value is
    # [range of accepted bytes, next state]; any other byte is an error.
    #
    CONTINUATION_TRANSITIONS = {
      :a => [(0x80..0xbf), :start], # last continuation byte of a 2, 3, or 4 byte character
      :b => [(0x80..0xbf), :a],     # continuation byte that must be followed by one more
      :c => [(0xa0..0xbf), :a],     # first continuation byte after 0xE0
      :d => [(0x80..0x9f), :a],     # first continuation byte after 0xED
      :e => [(0x80..0xbf), :b],     # first continuation byte after 0xF1-0xF3
      :f => [(0x90..0xbf), :b],     # first continuation byte after 0xF0
      :g => [(0x80..0x8f), :b],     # first continuation byte after 0xF4
    }.freeze

    #
    # Validate the supplied string for proper UTF-8 encoding.
    #
    # Calling Sequence:
    #
    #   validator.valid_encoding?(string) -> true or false
    #   validator.valid_encoding?(string, raise_on_error) -> true or exception
    #
    # Parameters:
    #
    # string:: the string to validate
    # raise_on_error:: a flag to indicate failure behavior
    #
    # Raises ValidationError when validation fails and raise_on_error is
    # true. The message names the offending byte and its index, or reports
    # that the input ended in the middle of a multi-byte character.
    #
    def valid_encoding?(string, raise_on_error = false)
      state = :start
      index = -1
      last_byte = nil
      rejected = false
      #
      string.bytes.each do |next_byte|
        index += 1
        last_byte = next_byte
        puts "Top: #{next_byte}(0x#{hex_of(next_byte)}), index: #{index}(0x#{hex_of(index)})" if DEBUG
        following = transition(state, next_byte)
        if following.nil?
          # Leave state untouched so the end-of-run trace shows where we failed.
          rejected = true
          break
        end
        state = following
      end
      #
      puts "State at end: #{state}" if DEBUG
      # Catch truncation at end of string
      truncated = !rejected && state != :start
      return true unless rejected || truncated
      return false unless raise_on_error
      # Hex text is built only here (and for DEBUG traces), not for every byte.
      position = "#{last_byte}(0x#{hex_of(last_byte)}),index:#{index}(0x#{hex_of(index)})"
      if truncated
        raise ValidationError, "Truncated character at end of input, last byte:#{position}"
      end
      raise ValidationError, "Invalid byte:#{position}"
    end # of valid_encoding?

    private

    #
    # Return the state that follows +state+ when +byte+ is read, or nil when
    # the byte is not permitted in that state.
    #
    def transition(state, byte)
      puts "state: #{state}" if DEBUG
      if state == :start
        START_TRANSITIONS.each_with_index do |(accepted, target), position|
          next unless accepted.include?(byte)
          puts "state: start #{position + 1}" if DEBUG
          return target
        end
        return nil
      end
      accepted, target = CONTINUATION_TRANSITIONS[state] || raise(RuntimeError, "state: default")
      accepted.include?(byte) ? target : nil
    end

    #
    # Lower case hexadecimal text for an integer.
    #
    def hex_of(number)
      sprintf("%x", number)
    end
  end # of class
end # of module
data/test/helper.rb ADDED
@@ -0,0 +1,17 @@
1
+ require 'rubygems'
2
+ require 'bundler'
3
+ begin
4
+ Bundler.setup(:default, :development)
5
+ rescue Bundler::BundlerError => e
6
+ $stderr.puts e.message
7
+ $stderr.puts "Run `bundle install` to install missing gems"
8
+ exit e.status_code
9
+ end
10
+ require 'test/unit'
11
+
12
+ $LOAD_PATH.unshift(File.dirname(__FILE__))
13
+ $LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib'))
14
+ require 'utf8_validator'
15
+
16
+ class Test::Unit::TestCase
17
+ end
@@ -0,0 +1,38 @@
1
+ # encoding: utf-8
2
+ #
3
+ require 'helper'
4
+
5
+ #
6
+ # == Purpose
7
+ #
8
+ # Test raise_on_error functionality.
9
+ #
10
+ class TestRaiseRequect < Test::Unit::TestCase
11
+ #
12
+ def setup
13
+ @validator = UTF8::Validator.new
14
+ end
15
+
16
+ #
17
+ def teardown
18
+ @validator = nil
19
+ end
20
+
21
+ # ISO-8859-1 characters
22
+ def test_0010_check_raise_iso
23
+ #
24
+ assert_raise(UTF8::ValidationError) do
25
+ !@validator.valid_encoding?(0x80.chr, true)
26
+ end
27
+ end
28
+
29
+ # A regular old bad byte
30
+ def test_0020_check_raise_badbyte
31
+ #
32
+ assert_raise(UTF8::ValidationError) do
33
+ !@validator.valid_encoding?("a" + 0xff.chr + "b", true)
34
+ end
35
+ end
36
+
37
+ end
38
+
@@ -0,0 +1,226 @@
1
+ # encoding: utf-8
2
+ #
3
+ require 'helper'
4
+
5
+ #
6
+ # == Purpose
7
+ #
8
+ # Tests for the #{UTF8::Validator} implementation.
9
+ #
10
+ class TestUtf8Validator < Test::Unit::TestCase
11
+ #
12
+ def setup
13
+ @validator = UTF8::Validator.new
14
+ end
15
+
16
+ #
17
+ def teardown
18
+ @validator = nil
19
+ end
20
+
21
+ #
22
+ def test_0010_linkages
23
+ assert_not_nil @validator
24
+ end
25
+
26
+ #--
27
+ # Validation should succeed for the following tests
28
+ #--
29
+
30
+ # Some simple ASCII characters
31
+ def test_0020_simple_ascii
32
+ test_data = [
33
+ "a",
34
+ "abcdefghijjlmnopqrstuvwxyz",
35
+ "\x00",
36
+ ]
37
+ test_data.each do |string|
38
+ assert @validator.valid_encoding?(string), "Simple ASCII: #{string}"
39
+ assert string.force_encoding("UTF-8").valid_encoding?, "Simple ASCII 19: #{string}" if RUBY_VERSION =~ /1\.9/
40
+ end
41
+ end
42
+
43
+ # All ASCII
44
+ def test_0030_all_ascii
45
+ 0.upto(127) do |i|
46
+ assert @validator.valid_encoding?(i.chr), "All ASCII: #{i.chr}"
47
+ assert i.chr.force_encoding("UTF-8").valid_encoding?, "All ASCII 19: #{i.chr}" if RUBY_VERSION =~ /1\.9/
48
+ end
49
+ end
50
+
51
+ # Simple UTF8 - 2 byte characters
52
+ def test_0040_simple_utf8_2byte
53
+ test_data = [
54
+ "\xc2\x80",
55
+ "\xc2\xbf",
56
+ "\xdf\x80",
57
+ "\xdf\xbf",
58
+ ]
59
+ test_data.each do |string|
60
+ assert @validator.valid_encoding?(string), "Simple UTF-8, 2bytes: #{string}"
61
+ assert string.force_encoding("UTF-8").valid_encoding?, "Simple UTF-8 19, 2bytes: #{string}" if RUBY_VERSION =~ /1\.9/
62
+ end
63
+ end
64
+
65
+ # Simple UTF8 - 3 byte characters
66
+ def test_0050_simple_utf8_3byte
67
+ test_data = [
68
+ "\xe0\xa0\x80",
69
+ "\xe0\xbf\x80",
70
+ "\xe0\xa0\xbf",
71
+ "\xe0\xbf\xbf",
72
+ ]
73
+ test_data.each do |string|
74
+ assert @validator.valid_encoding?(string), "Simple UTF-8, 3bytes: #{string}"
75
+ assert string.force_encoding("UTF-8").valid_encoding?, "Simple UTF-8 19, 3bytes: #{string}" if RUBY_VERSION =~ /1\.9/
76
+ end
77
+ end
78
+
79
+ # Simple UTF8 - 4 byte characters
80
+ def test_0060_simple_utf8_4byte
81
+ test_data = [
82
+ "\xf1\x80\x80\x80",
83
+ "\xf1\xbf\xbf\xbf",
84
+ "\xf2\x80\x80\x80",
85
+ "\xf2\xbf\xbf\xbf",
86
+ "\xf3\x80\x80\x80",
87
+ "\xf3\xbf\xbf\xbf",
88
+ ]
89
+ test_data.each do |string|
90
+ assert @validator.valid_encoding?(string), "Simple UTF-8, 4bytes: #{string}"
91
+ assert string.force_encoding("UTF-8").valid_encoding?, "Simple UTF-8 19, 4bytes: #{string}" if RUBY_VERSION =~ /1\.9/
92
+ end
93
+ end
94
+
95
+ #--
96
+ # Validation should fail for the following tests
97
+ #--
98
+
99
+
100
+ # ISO-8859-1 (C1 points)
101
+ def test0510_iso_5559_1
102
+ 0x80.upto(0x9f) do |i|
103
+ assert !@validator.valid_encoding?(i.chr), "ISO-8859-1: #{i}"
104
+ assert !i.chr.force_encoding("UTF-8").valid_encoding?, "ISO-8859-1 19: #{i}" if RUBY_VERSION =~ /1\.9/
105
+ end
106
+ end
107
+
108
+ # UTF-16 Surrogate Halves
109
+ def test0520_utf16_surrogate_halves
110
+ test_data = [
111
+ "\xed\xa0\x80", # u-800 (lowest)
112
+ "\xed\xbf\xbf", # u-fff (highest)
113
+ ]
114
+ test_data.each do |string|
115
+ assert !@validator.valid_encoding?(string), "UTF-16 Surrogate Halves: #{string}"
116
+ assert !string.force_encoding("UTF-8").valid_encoding?, "UTF-16 Surrogate Halves 19: #{string}" if RUBY_VERSION =~ /1\.9/
117
+ end
118
+ end
119
+
120
+ # Invalid single bytes
121
+ def test0530_invalid_single_bytes
122
+ test_data = [
123
+ "\xc0",
124
+ "\xc1",
125
+ "\xf5","\xf6","\xf7","\xf8","\xf9","\xfa","\xfb","\xfc",
126
+ "\xfd","\xfe","\xff",
127
+ ]
128
+ test_data.each do |string|
129
+ assert !@validator.valid_encoding?(string), "Invalid single bytes: #{string}"
130
+ assert !string.force_encoding("UTF-8").valid_encoding?, "Invalid single bytes 10: #{string}" if RUBY_VERSION =~ /1\.9/
131
+ end
132
+ end
133
+
134
+ # Not shortest representation
135
+ def test0540_not_shortest
136
+ test_data = [
137
+ "\xc0\x80",
138
+ "\xe0\x80\x80",
139
+ "\xf0\x80\x80\x80",
140
+ "\xf8\x80\x80\x80\x80",
141
+ "\xfc\x80\x80\x80\x80\x80",
142
+ ]
143
+ test_data.each do |string|
144
+ assert !@validator.valid_encoding?(string), "Not shortest: #{string}"
145
+ assert !string.force_encoding("UTF-8").valid_encoding?, "Not shortest 19: #{string}" if RUBY_VERSION =~ /1\.9/
146
+ end
147
+ end
148
+
149
+ # Truncated last character
150
+ def test0550_truncated_last
151
+ test_data = [
152
+ "\xc2", # truncated 2 byte characters
153
+ "\xdf",
154
+ "\xe0\xa0", # truncated 3 byte characters
155
+ "\xe0\xbf",
156
+ "\xf1\x80\x80", # truncated 4 byte characters
157
+ "\xf1\xbf\xbf",
158
+ "\xf2\x80\x80",
159
+ "\xf2\xbf\xbf",
160
+ "\xf3\x80\x80",
161
+ "\xf3\xbf\xbf",
162
+ ]
163
+ test_data.each do |string|
164
+ assert !@validator.valid_encoding?(string), "truncated last: #{string}"
165
+ assert !string.force_encoding("UTF-8").valid_encoding?, "truncated last 19: #{string}" if RUBY_VERSION =~ /1\.9/
166
+ end
167
+ end
168
+
169
+ # Truncated in good text
170
+ def test0560_truncated_in_good
171
+ test_data = [
172
+ "\xc2", # truncated 2 byte characters
173
+ "\xdf",
174
+ "\xe0\xa0", # truncated 3 byte characters
175
+ "\xe0\xbf",
176
+ "\xf1\x80\x80", # truncated 4 byte characters
177
+ "\xf1\xbf\xbf",
178
+ "\xf2\x80\x80",
179
+ "\xf2\xbf\xbf",
180
+ "\xf3\x80\x80",
181
+ "\xf3\xbf\xbf",
182
+ ]
183
+ test_data.each do |string|
184
+ string = "a" + string + "b"
185
+ assert !@validator.valid_encoding?(string), "truncated in good: #{string}"
186
+ assert !string.force_encoding("UTF-8").valid_encoding?, "truncated in good 19: #{string}" if RUBY_VERSION =~ /1\.9/
187
+ end
188
+ end
189
+
190
+ # Miscellaneous Bad
191
+ def test0570_miscellaneous_bad
192
+ # perhaps some duplication here
193
+ test_data = [
194
+ "bad byte: \372",
195
+ "\004\b{\f:\tbody\"\001\207\004\b{\b:\016statusmsg\"\aOK:\017statuscodei\000:\tdata{\t:\voutput\"3Enabled, not running, last run 693 seconds ago:\frunningi\000:\fenabledi\006:\flastrunl+\aE\021\022M:\rsenderid\"\032xx.xx.xx.xx:\016requestid\"%849d647bbe3e421ea19ac9f947bbdde4:\020senderagent\"\fpuppetd:\016msgtarget\"%/topic/mcollective.puppetd.reply:\thash\"\001\257ZdQqtaDmmdD0jZinnEcpN+YbkxQDn8uuCnwsQdvGHau6d+gxnnfPLUddWRSb\nZNMs+sQUXgJNfcV1eVBn1H+Z8QQmzYXVDMqz7J43jmgloz5PsLVbN9K3PmX/\ngszqV/WpvIyAqm98ennWqSzpwMuiCC4q2Jr3s3Gm6bUJ6UkKXnY=\n:\fmsgtimel+\a\372\023\022M",
196
+ "\207",
197
+ "\xf4\x90\x80\x80",
198
+ "\xbf",
199
+ "\xe0\x9f\xbf",
200
+ "\xf0\x8f\xbf\xbf",
201
+ "\xf8\x87\xbf\xbf\xbf",
202
+ "\xfc\x83\xbf\xbf\xbf\xbf",
203
+ "\xc0\x80",
204
+ "\xe0\x80\x80",
205
+ "\xf0\x80\x80\x80",
206
+ "\xf8\x80\x80\x80\x80",
207
+ "\xfc\x80\x80\x80\x80\x80",
208
+ "\xed\xa0\x80",
209
+ "\xed\xad\xbf",
210
+ "\xed\xae\x80",
211
+ "\xed\xaf\xbf",
212
+ "\xed\xb0\x80",
213
+ "\xed\xbe\x80",
214
+ "\xed\xbf\xbf",
215
+ "\xc0\x00", # too long for \x00
216
+ "\xe0\x00\x00", # too long for \x00
217
+ "\xf0\x00\x00\x00", # too long for \x00
218
+ ]
219
+ test_data.each do |string|
220
+ assert !@validator.valid_encoding?(string), "miscellaneous bad: #{string}"
221
+ assert !string.force_encoding("UTF-8").valid_encoding?, "miscellaneous bad 19: #{string}" if RUBY_VERSION =~ /1\.9/
222
+ end
223
+ end
224
+
225
+ end
226
+
@@ -0,0 +1,69 @@
1
+ # Generated by jeweler
2
+ # DO NOT EDIT THIS FILE DIRECTLY
3
+ # Instead, edit Jeweler::Tasks in Rakefile, and run 'rake gemspec'
4
+ # -*- encoding: utf-8 -*-
5
+
6
+ Gem::Specification.new do |s|
7
+ s.name = %q{utf8_validator}
8
+ s.version = "0.0.1"
9
+
10
+ s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
11
+ s.authors = ["Guy Allard"]
12
+ s.date = %q{2011-01-25}
13
+ s.description = %q{A State Machine implementation of a UTF-8 Encoding
14
+ Validation algorithm.}
15
+ s.email = %q{allard.guy.m@gmail.com}
16
+ s.extra_rdoc_files = [
17
+ "LICENSE.txt",
18
+ "README.rdoc"
19
+ ]
20
+ s.files = [
21
+ ".document",
22
+ "Gemfile",
23
+ "Gemfile.lock",
24
+ "LICENSE.txt",
25
+ "README.rdoc",
26
+ "Rakefile",
27
+ "VERSION",
28
+ "lib/utf8_validator.rb",
29
+ "lib/validation/errors.rb",
30
+ "lib/validation/validator.rb",
31
+ "test/helper.rb",
32
+ "test/test_raise_request.rb",
33
+ "test/test_utf8_validator.rb",
34
+ "utf8_validator.gemspec"
35
+ ]
36
+ s.homepage = %q{http://github.com/gmallard/utf8_validator}
37
+ s.licenses = ["MIT"]
38
+ s.require_paths = ["lib"]
39
+ s.rubygems_version = %q{1.3.7}
40
+ s.summary = %q{A UTF-8 Encoding Validator.}
41
+ s.test_files = [
42
+ "test/helper.rb",
43
+ "test/test_raise_request.rb",
44
+ "test/test_utf8_validator.rb"
45
+ ]
46
+
47
+ if s.respond_to? :specification_version then
48
+ current_version = Gem::Specification::CURRENT_SPECIFICATION_VERSION
49
+ s.specification_version = 3
50
+
51
+ if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
52
+ s.add_development_dependency(%q<bundler>, ["~> 1.0.0"])
53
+ s.add_development_dependency(%q<jeweler>, ["~> 1.5.2"])
54
+ s.add_development_dependency(%q<rcov>, [">= 0"])
55
+ s.add_development_dependency(%q<bundler>, [">= 2.1.2"])
56
+ else
57
+ s.add_dependency(%q<bundler>, ["~> 1.0.0"])
58
+ s.add_dependency(%q<jeweler>, ["~> 1.5.2"])
59
+ s.add_dependency(%q<rcov>, [">= 0"])
60
+ s.add_dependency(%q<bundler>, [">= 2.1.2"])
61
+ end
62
+ else
63
+ s.add_dependency(%q<bundler>, ["~> 1.0.0"])
64
+ s.add_dependency(%q<jeweler>, ["~> 1.5.2"])
65
+ s.add_dependency(%q<rcov>, [">= 0"])
66
+ s.add_dependency(%q<bundler>, [">= 2.1.2"])
67
+ end
68
+ end
69
+
metadata ADDED
@@ -0,0 +1,140 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: utf8_validator
3
+ version: !ruby/object:Gem::Version
4
+ prerelease: false
5
+ segments:
6
+ - 0
7
+ - 0
8
+ - 1
9
+ version: 0.0.1
10
+ platform: ruby
11
+ authors:
12
+ - Guy Allard
13
+ autorequire:
14
+ bindir: bin
15
+ cert_chain: []
16
+
17
+ date: 2011-01-25 00:00:00 -05:00
18
+ default_executable:
19
+ dependencies:
20
+ - !ruby/object:Gem::Dependency
21
+ name: bundler
22
+ requirement: &id001 !ruby/object:Gem::Requirement
23
+ none: false
24
+ requirements:
25
+ - - ~>
26
+ - !ruby/object:Gem::Version
27
+ segments:
28
+ - 1
29
+ - 0
30
+ - 0
31
+ version: 1.0.0
32
+ type: :development
33
+ prerelease: false
34
+ version_requirements: *id001
35
+ - !ruby/object:Gem::Dependency
36
+ name: jeweler
37
+ requirement: &id002 !ruby/object:Gem::Requirement
38
+ none: false
39
+ requirements:
40
+ - - ~>
41
+ - !ruby/object:Gem::Version
42
+ segments:
43
+ - 1
44
+ - 5
45
+ - 2
46
+ version: 1.5.2
47
+ type: :development
48
+ prerelease: false
49
+ version_requirements: *id002
50
+ - !ruby/object:Gem::Dependency
51
+ name: rcov
52
+ requirement: &id003 !ruby/object:Gem::Requirement
53
+ none: false
54
+ requirements:
55
+ - - ">="
56
+ - !ruby/object:Gem::Version
57
+ segments:
58
+ - 0
59
+ version: "0"
60
+ type: :development
61
+ prerelease: false
62
+ version_requirements: *id003
63
+ - !ruby/object:Gem::Dependency
64
+ name: bundler
65
+ requirement: &id004 !ruby/object:Gem::Requirement
66
+ none: false
67
+ requirements:
68
+ - - ">="
69
+ - !ruby/object:Gem::Version
70
+ segments:
71
+ - 2
72
+ - 1
73
+ - 2
74
+ version: 2.1.2
75
+ type: :development
76
+ prerelease: false
77
+ version_requirements: *id004
78
+ description: |-
79
+ A State Machine implementation of a UTF-8 Encoding
80
+ Validation algorithm.
81
+ email: allard.guy.m@gmail.com
82
+ executables: []
83
+
84
+ extensions: []
85
+
86
+ extra_rdoc_files:
87
+ - LICENSE.txt
88
+ - README.rdoc
89
+ files:
90
+ - .document
91
+ - Gemfile
92
+ - Gemfile.lock
93
+ - LICENSE.txt
94
+ - README.rdoc
95
+ - Rakefile
96
+ - VERSION
97
+ - lib/utf8_validator.rb
98
+ - lib/validation/errors.rb
99
+ - lib/validation/validator.rb
100
+ - test/helper.rb
101
+ - test/test_raise_request.rb
102
+ - test/test_utf8_validator.rb
103
+ - utf8_validator.gemspec
104
+ has_rdoc: true
105
+ homepage: http://github.com/gmallard/utf8_validator
106
+ licenses:
107
+ - MIT
108
+ post_install_message:
109
+ rdoc_options: []
110
+
111
+ require_paths:
112
+ - lib
113
+ required_ruby_version: !ruby/object:Gem::Requirement
114
+ none: false
115
+ requirements:
116
+ - - ">="
117
+ - !ruby/object:Gem::Version
118
+ hash: -1120544117494340473
119
+ segments:
120
+ - 0
121
+ version: "0"
122
+ required_rubygems_version: !ruby/object:Gem::Requirement
123
+ none: false
124
+ requirements:
125
+ - - ">="
126
+ - !ruby/object:Gem::Version
127
+ segments:
128
+ - 0
129
+ version: "0"
130
+ requirements: []
131
+
132
+ rubyforge_project:
133
+ rubygems_version: 1.3.7
134
+ signing_key:
135
+ specification_version: 3
136
+ summary: A UTF-8 Encoding Validator.
137
+ test_files:
138
+ - test/helper.rb
139
+ - test/test_raise_request.rb
140
+ - test/test_utf8_validator.rb