utf8_utils 0.0.1 → 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.md +29 -7
- data/Rakefile +9 -0
- data/lib/utf8_utils.rb +46 -149
- data/lib/utf8_utils/byte.rb +86 -0
- data/lib/utf8_utils/char.rb +52 -0
- data/lib/utf8_utils/chars.rb +59 -0
- data/lib/utf8_utils/version.rb +3 -3
- data/test/utf8_utils_test.rb +46 -25
- metadata +19 -5
data/README.md
CHANGED
@@ -11,7 +11,7 @@ issues with it, I'll probably try patching it into ActiveSupport.
|
|
11
11
|
|
12
12
|
Here's what happens when you try to access a string with invalid UTF-8 characters in Ruby 1.9:
|
13
13
|
|
14
|
-
ruby-1.9.1-p378 > "my messed up \x92 string".split(//)
|
14
|
+
ruby-1.9.1-p378 > "my messed up \x92 string".split(//u)
|
15
15
|
ArgumentError: invalid byte sequence in UTF-8
|
16
16
|
from (irb):3:in `split'
|
17
17
|
from (irb):3
|
@@ -19,7 +19,7 @@ Here's what happens when you try to access a string with invalid UTF-8 character
|
|
19
19
|
|
20
20
|
## The Solution
|
21
21
|
|
22
|
-
ruby-1.9.1-p378 > "my messed up \x92 string".
|
22
|
+
ruby-1.9.1-p378 > "my messed up \x92 string".to_utf8_chars.tidy_bytes.to_s.split(//u)
|
23
23
|
=> ["m", "y", " ", "m", "e", "s", "s", "e", "d", " ", "u", "p", " ", "’", " ", "s", "t", "r", "i", "n", "g"]
|
24
24
|
|
25
25
|
Amazing in its brevity and elegance, huh? Ok, maybe not really but if you have
|
@@ -30,6 +30,25 @@ Note that like ActiveSupport, it naively assumes if you have invalid UTF8
|
|
30
30
|
characters, they are either Windows CP1252 (named `CP1251` in the source) or ISO8859-1. In practice this isn't
|
31
31
|
a bad assumption, but may not always work.
|
32
32
|
|
33
|
+
Unlike ActiveSupport, however, the performance of this library is **very** poor
|
34
|
+
right now. Since my intention is for this to be used mostly for very short
|
35
|
+
strings, it should, however, be good enough for many kinds of applications.
|
36
|
+
|
37
|
+
How poor is "very poor?" Have a look:
|
38
|
+
|
39
|
+
|
40
|
+
| ACTIVE_SUPPORT | UTF8_UTILS |
|
41
|
+
----------------------------------------------------------
|
42
|
+
tidy bytes x2000 | 0.087 | 1.225 |
|
43
|
+
==========================================================
|
44
|
+
Total | 0.087 | 1.225 |
|
45
|
+
|
46
|
+
|
47
|
+
This will improve quite a bit soon, as I'm pretty well aware of where the
|
48
|
+
slowness is coming from. If performance is important for you now though, by all
|
49
|
+
means use another library (if you can find one) until I've made a few more
|
50
|
+
releases.
|
51
|
+
|
33
52
|
## Getting it
|
34
53
|
|
35
54
|
gem install utf8_utils
|
@@ -37,15 +56,16 @@ a bad assumption, but may not always work.
|
|
37
56
|
|
38
57
|
## Using it
|
39
58
|
|
59
|
+
# encoding: utf-8
|
40
60
|
require "utf8_utils"
|
41
61
|
|
42
|
-
#
|
43
|
-
"hello
|
44
|
-
puts
|
62
|
+
# Iterate over multibyte characters
|
63
|
+
"hello ーチエンジンの日本".to_utf8_chars.each_char do |char|
|
64
|
+
puts char.valid?
|
45
65
|
end
|
46
66
|
|
47
67
|
# tidy bytes
|
48
|
-
good_string = bad_string.
|
68
|
+
good_string = bad_string.to_utf8_chars.tidy_bytes.to_s
|
49
69
|
|
50
70
|
## API Docs
|
51
71
|
|
@@ -53,6 +73,8 @@ a bad assumption, but may not always work.
|
|
53
73
|
|
54
74
|
## Credits
|
55
75
|
|
56
|
-
Created by Norman Clarke
|
76
|
+
Created by Norman Clarke. Some code was taken from
|
77
|
+
[ActiveSupport](http://github.com/rails/rails/tree/master/activesupport/), as
|
78
|
+
indicated in the source code.
|
57
79
|
|
58
80
|
Copyright (c) 2010, released under the MIT license.
|
data/Rakefile
CHANGED
@@ -9,6 +9,15 @@ CLEAN << "pkg" << "doc" << "coverage" << ".yardoc"
|
|
9
9
|
Rake::GemPackageTask.new(eval(File.read("utf8_utils.gemspec"))) { |pkg| }
|
10
10
|
Rake::TestTask.new(:test) { |t| t.pattern = "test/**/*_test.rb" }
|
11
11
|
|
12
|
+
begin
|
13
|
+
require "yard"
|
14
|
+
YARD::Rake::YardocTask.new do |t|
|
15
|
+
t.options = ["--output-dir=doc"]
|
16
|
+
t.options << "--files" << "README.md"
|
17
|
+
end
|
18
|
+
rescue LoadError
|
19
|
+
end
|
20
|
+
|
12
21
|
Rake::RDocTask.new do |r|
|
13
22
|
r.rdoc_dir = "doc"
|
14
23
|
r.rdoc_files.include "lib/**/*.rb"
|
data/lib/utf8_utils.rb
CHANGED
@@ -1,156 +1,53 @@
|
|
1
|
-
|
2
|
-
|
3
|
-
|
4
|
-
module UTF8Utils
|
5
|
-
|
6
|
-
class Codepoints
|
7
|
-
|
8
|
-
attr_accessor :chars
|
9
|
-
attr :position
|
10
|
-
|
11
|
-
include Enumerable
|
12
|
-
|
13
|
-
CP1251 = {
|
14
|
-
128 => [226, 130, 172],
|
15
|
-
129 => nil,
|
16
|
-
130 => [226, 128, 154],
|
17
|
-
131 => [198, 146],
|
18
|
-
132 => [226, 128, 158],
|
19
|
-
133 => [226, 128, 166],
|
20
|
-
134 => [226, 128, 160],
|
21
|
-
135 => [226, 128, 161],
|
22
|
-
136 => [203, 134],
|
23
|
-
137 => [226, 128, 176],
|
24
|
-
138 => [197, 160],
|
25
|
-
139 => [226, 128, 185],
|
26
|
-
140 => [197, 146],
|
27
|
-
141 => nil,
|
28
|
-
142 => [197, 189],
|
29
|
-
143 => nil,
|
30
|
-
144 => nil,
|
31
|
-
145 => [226, 128, 152],
|
32
|
-
146 => [226, 128, 153],
|
33
|
-
147 => [226, 128, 156],
|
34
|
-
148 => [226, 128, 157],
|
35
|
-
149 => [226, 128, 162],
|
36
|
-
150 => [226, 128, 147],
|
37
|
-
151 => [226, 128, 148],
|
38
|
-
152 => [203, 156],
|
39
|
-
153 => [226, 132, 162],
|
40
|
-
154 => [197, 161],
|
41
|
-
155 => [226, 128, 186],
|
42
|
-
156 => [197, 147],
|
43
|
-
157 => nil,
|
44
|
-
158 => [197, 190],
|
45
|
-
159 => [197, 184]
|
46
|
-
}
|
47
|
-
|
48
|
-
def initialize(string)
|
49
|
-
@position = 0
|
50
|
-
# 1.8.6's `each_byte` does not return an Enumerable
|
51
|
-
if RUBY_VERSION < "1.8.7"
|
52
|
-
@chars = []
|
53
|
-
string.each_byte { |b| @chars << b }
|
54
|
-
else
|
55
|
-
# Create an array of bytes without raising an ArgumentError in 1.9.x
|
56
|
-
# when the string contains invalid UTF-8 characters
|
57
|
-
@chars = string.each_byte.entries
|
58
|
-
end
|
59
|
-
end
|
60
|
-
|
61
|
-
# Attempt to clean up malformed characters.
|
62
|
-
def tidy_bytes
|
63
|
-
Codepoints.new(entries.map {|c| c.tidy.to_char}.compact.join)
|
64
|
-
end
|
65
|
-
|
66
|
-
# Cast to string.
|
67
|
-
def to_s
|
68
|
-
entries.map {|e| e.to_char}.join
|
69
|
-
end
|
70
|
-
|
71
|
-
private
|
72
|
-
|
73
|
-
def each(&block)
|
74
|
-
while codepoint = next_codepoint
|
75
|
-
yield codepoint
|
76
|
-
end
|
77
|
-
@position = 0
|
78
|
-
end
|
79
|
-
|
80
|
-
alias :each_codepoint :each
|
81
|
-
public :each_codepoint
|
1
|
+
require File.expand_path("../utf8_utils/byte", __FILE__)
|
2
|
+
require File.expand_path("../utf8_utils/char", __FILE__)
|
3
|
+
require File.expand_path("../utf8_utils/chars", __FILE__)
|
82
4
|
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
when 224..239 then 3
|
88
|
-
else 4
|
89
|
-
end
|
90
|
-
end
|
91
|
-
|
92
|
-
def next_codepoint
|
93
|
-
codepoint = Codepoint.new(chars.slice(position, bytes_to_pull))
|
94
|
-
if codepoint.invalid?
|
95
|
-
codepoint = Codepoint.new(chars.slice(position, 1))
|
96
|
-
end
|
97
|
-
@position = position + codepoint.size
|
98
|
-
codepoint unless codepoint.empty?
|
99
|
-
end
|
100
|
-
|
101
|
-
end
|
102
|
-
|
103
|
-
class Codepoint < Array
|
104
|
-
|
105
|
-
# Borrowed from the regexp in ActiveSupport, which in turn had been borrowed from
|
106
|
-
# the Kconv library by Shinji KONO - (also as seen on the W3C site).
|
107
|
-
# See also http://en.wikipedia.org/wiki/UTF-8
|
108
|
-
def valid?
|
109
|
-
if length == 1
|
110
|
-
(0..127) === self[0]
|
111
|
-
elsif length == 2
|
112
|
-
(192..223) === self[0] && (128..191) === self[1]
|
113
|
-
elsif length == 3
|
114
|
-
(self[0] == 224 && ((160..191) === self[1] && (128..191) === self[2])) ||
|
115
|
-
((225..239) === self[0] && (128..191) === self[1] && (128..191) === self[2])
|
116
|
-
elsif length == 4
|
117
|
-
(self[0] == 240 && (144..191) === self[1] && (128..191) === self[2] && (128..191) === self[3]) ||
|
118
|
-
((241..243) === self[0] && (128..191) === self[1] && (128..191) === self[2] && (128..191) === self[3]) ||
|
119
|
-
(self[0] == 244 && (128..143) === self[1] && (128..191) === self[2] && (128..191) === self[3])
|
120
|
-
end
|
121
|
-
end
|
122
|
-
|
123
|
-
# Attempt to rescue a valid UTF-8 character from a malformed codepoint. It will first
|
124
|
-
# attempt to convert from CP1251, and if this isn't possible, it prepends a valid leading
|
125
|
-
# byte, treating the character as the last byte in a two-byte codepoint.
|
126
|
-
# Note that much of the logic here is taken from ActiveSupport; the difference is that this
|
127
|
-
# works for Ruby 1.8.6 - 1.9.1.
|
128
|
-
def tidy
|
129
|
-
return self if valid?
|
130
|
-
if Codepoints::CP1251.key? self[0]
|
131
|
-
self.class.new [Codepoints::CP1251[self[0]]]
|
132
|
-
elsif self[0] < 192
|
133
|
-
self.class.new [194, self[0]]
|
134
|
-
else
|
135
|
-
self.class.new [195, self[0] - 64]
|
136
|
-
end
|
137
|
-
end
|
138
|
-
|
139
|
-
def invalid?
|
140
|
-
!valid?
|
141
|
-
end
|
5
|
+
# Wraps a string as an array of bytes and allows some naive cleanup operations
|
6
|
+
# as a workaround for Ruby 1.9's crappy encoding support that throws exceptions
|
7
|
+
# when attempting to access UTF8 strings with invalid characters.
|
8
|
+
module UTF8Utils
|
142
9
|
|
143
|
-
|
144
|
-
|
145
|
-
|
146
|
-
|
10
|
+
# Windows-1252 (named CP1251 here) decimal byte => UTF-8 equivalent as an array of bytes
|
11
|
+
CP1251 = {
|
12
|
+
128 => [226, 130, 172],
|
13
|
+
129 => nil,
|
14
|
+
130 => [226, 128, 154],
|
15
|
+
131 => [198, 146],
|
16
|
+
132 => [226, 128, 158],
|
17
|
+
133 => [226, 128, 166],
|
18
|
+
134 => [226, 128, 160],
|
19
|
+
135 => [226, 128, 161],
|
20
|
+
136 => [203, 134],
|
21
|
+
137 => [226, 128, 176],
|
22
|
+
138 => [197, 160],
|
23
|
+
139 => [226, 128, 185],
|
24
|
+
140 => [197, 146],
|
25
|
+
141 => nil,
|
26
|
+
142 => [197, 189],
|
27
|
+
143 => nil,
|
28
|
+
144 => nil,
|
29
|
+
145 => [226, 128, 152],
|
30
|
+
146 => [226, 128, 153],
|
31
|
+
147 => [226, 128, 156],
|
32
|
+
148 => [226, 128, 157],
|
33
|
+
149 => [226, 128, 162],
|
34
|
+
150 => [226, 128, 147],
|
35
|
+
151 => [226, 128, 148],
|
36
|
+
152 => [203, 156],
|
37
|
+
153 => [226, 132, 162],
|
38
|
+
154 => [197, 161],
|
39
|
+
155 => [226, 128, 186],
|
40
|
+
156 => [197, 147],
|
41
|
+
157 => nil,
|
42
|
+
158 => [197, 190],
|
43
|
+
159 => [197, 184]
|
44
|
+
}
|
147
45
|
|
148
|
-
end
|
149
46
|
end
|
150
47
|
|
151
|
-
# Get an array of UTF8
|
48
|
+
# Get an array of UTF8 chars from a string.
|
152
49
|
class String
|
153
|
-
def
|
154
|
-
UTF8Utils::
|
50
|
+
def to_utf8_chars
|
51
|
+
UTF8Utils::Chars.new self
|
155
52
|
end
|
156
|
-
end
|
53
|
+
end
|
@@ -0,0 +1,86 @@
|
|
1
|
+
module UTF8Utils
|
2
|
+
|
3
|
+
# A single UTF-8 byte.
|
4
|
+
class Byte
|
5
|
+
|
6
|
+
attr_reader :byte
|
7
|
+
|
8
|
+
def initialize(byte)
|
9
|
+
@byte = byte
|
10
|
+
end
|
11
|
+
|
12
|
+
def codepoint_mask
|
13
|
+
case leading_1_bits
|
14
|
+
when 0 then 0
|
15
|
+
when 1 then 0b1000_0000
|
16
|
+
when 2 then 0b1100_0000
|
17
|
+
when 3 then 0b1110_0000
|
18
|
+
when 4 then 0b1111_0000
|
19
|
+
end
|
20
|
+
end
|
21
|
+
|
22
|
+
# Is this a continuation byte?
|
23
|
+
def continuation?
|
24
|
+
leading_1_bits == 1
|
25
|
+
end
|
26
|
+
|
27
|
+
# How many continuation bytes should follow this byte?
|
28
|
+
def continuations
|
29
|
+
bits = leading_1_bits
|
30
|
+
bits < 2 ? 0 : bits - 1
|
31
|
+
end
|
32
|
+
|
33
|
+
def invalid?
|
34
|
+
!valid?
|
35
|
+
end
|
36
|
+
|
37
|
+
# From Wikipedia's entry on UTF-8:
|
38
|
+
#
|
39
|
+
# The UTF-8 encoding is variable-width, with each character represented by 1
|
40
|
+
# to 4 bytes. Each byte has 0–4 leading consecutive 1 bits followed by a zero bit
|
41
|
+
# to indicate its type. N 1 bits indicates the first byte in a N-byte sequence,
|
42
|
+
# with the exception that zero 1 bits indicates a one-byte sequence while one 1
|
43
|
+
# bit indicates a continuation byte in a multi-byte sequence (this was done for
|
44
|
+
# ASCII compatibility).
|
45
|
+
# @see http://en.wikipedia.org/wiki/Utf-8
|
46
|
+
def leading_1_bits
|
47
|
+
nibble = byte >> 4
|
48
|
+
if nibble < 0b1000 then 0 # single-byte chars
|
49
|
+
elsif nibble < 0b1100 then 1 # continuation byte
|
50
|
+
elsif nibble < 0b1110 then 2 # start of 2-byte char
|
51
|
+
elsif nibble < 0b1111 then 3 # 3-byte char
|
52
|
+
else 4 # 4-byte char
|
53
|
+
end
|
54
|
+
end
|
55
|
+
|
56
|
+
# Start of a 2-byte sequence, but code point ≤ 127
|
57
|
+
# @see http://tools.ietf.org/html/rfc3629
|
58
|
+
def overlong?
|
59
|
+
(192..193) === byte
|
60
|
+
end
|
61
|
+
|
62
|
+
# RFC 3629 disallows bytes 245-253: they would lead 4-6 byte sequences beyond U+10FFFF.
|
63
|
+
# @see http://tools.ietf.org/html/rfc3629
|
64
|
+
def restricted?
|
65
|
+
(245..253) === byte
|
66
|
+
end
|
67
|
+
|
68
|
+
def to_i
|
69
|
+
byte
|
70
|
+
end
|
71
|
+
|
72
|
+
# Bytes 254 and 255 are not defined by the original UTF-8 spec.
|
73
|
+
def undefined?
|
74
|
+
(254..255) === byte
|
75
|
+
end
|
76
|
+
|
77
|
+
def valid?
|
78
|
+
!(overlong? or restricted? or undefined?)
|
79
|
+
end
|
80
|
+
|
81
|
+
def codepoint_bits
|
82
|
+
byte ^ codepoint_mask
|
83
|
+
end
|
84
|
+
|
85
|
+
end
|
86
|
+
end
|
@@ -0,0 +1,52 @@
|
|
1
|
+
module UTF8Utils
|
2
|
+
|
3
|
+
class Char < Array
|
4
|
+
|
5
|
+
# Given the first byte, how many bytes long should this character be?
|
6
|
+
def expected_length
|
7
|
+
(first.continuations rescue 0) + 1
|
8
|
+
end
|
9
|
+
|
10
|
+
# Is the character invalid?
|
11
|
+
def invalid?
|
12
|
+
!valid?
|
13
|
+
end
|
14
|
+
|
15
|
+
# Attempt to rescue a valid UTF-8 character from a malformed character. It
|
16
|
+
# will first attempt to convert from CP1251, and if this isn't possible, it
|
17
|
+
# prepends a valid leading byte, treating the character as the last byte in
|
18
|
+
# a two-byte character. Note that much of the logic here is taken from
|
19
|
+
# ActiveSupport; the difference is that this works for Ruby 1.8.6 - 1.9.1.
|
20
|
+
def tidy
|
21
|
+
return self if valid?
|
22
|
+
byte = first.to_i
|
23
|
+
if UTF8Utils::CP1251.key? byte
|
24
|
+
self.class.new [UTF8Utils::CP1251[byte]]
|
25
|
+
elsif byte < 192
|
26
|
+
self.class.new [194, byte]
|
27
|
+
else
|
28
|
+
self.class.new [195, byte - 64]
|
29
|
+
end
|
30
|
+
end
|
31
|
+
|
32
|
+
# Get a multibyte character from the bytes.
|
33
|
+
def to_s
|
34
|
+
flatten.map {|b| b.to_i }.pack("C*").unpack("U*").pack("U*")
|
35
|
+
end
|
36
|
+
|
37
|
+
def to_codepoint
|
38
|
+
flatten.map {|b| b.to_i }.pack("C*").unpack("U*")[0]
|
39
|
+
end
|
40
|
+
|
41
|
+
def valid?
|
42
|
+
return false if length != expected_length
|
43
|
+
each_with_index do |byte, index|
|
44
|
+
return false if byte.invalid?
|
45
|
+
return false if index == 0 and byte.continuation?
|
46
|
+
return false if index > 0 and !byte.continuation?
|
47
|
+
end
|
48
|
+
true
|
49
|
+
end
|
50
|
+
|
51
|
+
end
|
52
|
+
end
|
@@ -0,0 +1,59 @@
|
|
1
|
+
module UTF8Utils
|
2
|
+
class Chars
|
3
|
+
|
4
|
+
attr :bytes
|
5
|
+
attr :position
|
6
|
+
|
7
|
+
include Enumerable
|
8
|
+
|
9
|
+
def initialize(string)
|
10
|
+
@position = 0
|
11
|
+
begin
|
12
|
+
# Create an array of bytes without raising an ArgumentError in 1.9.x
|
13
|
+
# when the string contains invalid UTF-8 characters
|
14
|
+
@bytes = string.each_byte.map {|b| Byte.new(b)}
|
15
|
+
rescue LocalJumpError
|
16
|
+
# 1.8.6's `each_byte` does not return an Enumerable
|
17
|
+
@bytes = []
|
18
|
+
string.each_byte { |b| @bytes << Byte.new(b) }
|
19
|
+
end
|
20
|
+
end
|
21
|
+
|
22
|
+
# Attempt to clean up malformed characters.
|
23
|
+
def tidy_bytes
|
24
|
+
Chars.new(entries.map {|c| c.tidy.to_s}.compact.join)
|
25
|
+
end
|
26
|
+
|
27
|
+
# Cast to string.
|
28
|
+
def to_s
|
29
|
+
entries.flatten.map {|b| b.to_i }.pack("C*").unpack("U*").pack("U*")
|
30
|
+
end
|
31
|
+
|
32
|
+
def first
|
33
|
+
entries.first
|
34
|
+
end
|
35
|
+
|
36
|
+
private
|
37
|
+
|
38
|
+
def each(&block)
|
39
|
+
while char = next_char
|
40
|
+
yield char
|
41
|
+
end
|
42
|
+
@position = 0
|
43
|
+
end
|
44
|
+
|
45
|
+
alias :each_char :each
|
46
|
+
public :each_char
|
47
|
+
|
48
|
+
def next_char
|
49
|
+
return if !bytes[position]
|
50
|
+
char = Char.new(bytes.slice(position, bytes[position].continuations + 1))
|
51
|
+
if char.invalid?
|
52
|
+
char = Char.new(bytes.slice(position, 1))
|
53
|
+
end
|
54
|
+
@position = position + char.size
|
55
|
+
char unless char.empty?
|
56
|
+
end
|
57
|
+
|
58
|
+
end
|
59
|
+
end
|
data/lib/utf8_utils/version.rb
CHANGED
data/test/utf8_utils_test.rb
CHANGED
@@ -1,49 +1,70 @@
|
|
1
1
|
# encoding: utf-8
|
2
2
|
|
3
|
+
require "rubygems"
|
3
4
|
require "test/unit"
|
4
|
-
require
|
5
|
+
require "mocha"
|
6
|
+
require File.expand_path("../../lib/utf8_utils", __FILE__)
|
5
7
|
|
6
|
-
|
8
|
+
module UTF8ByteTest
|
7
9
|
|
8
|
-
def
|
9
|
-
|
10
|
+
def test_leading_1_bits
|
11
|
+
[0, 128, 194, 224, 240].each_with_index do |n, i|
|
12
|
+
byte = UTF8Utils::Byte.new(n)
|
13
|
+
assert_equal i, byte.leading_1_bits
|
14
|
+
end
|
10
15
|
end
|
11
16
|
|
12
|
-
def
|
13
|
-
|
17
|
+
def test_invalid_bytes
|
18
|
+
[192, 193, 245, 255].each do |n|
|
19
|
+
assert !UTF8Utils::Byte.new(n).valid?
|
20
|
+
end
|
14
21
|
end
|
15
22
|
|
16
|
-
def
|
17
|
-
|
23
|
+
def test_continuation
|
24
|
+
assert UTF8Utils::Byte.new(130).continuation?
|
18
25
|
end
|
19
26
|
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
27
|
+
end
|
28
|
+
|
29
|
+
class UTF8UtilsTest < Test::Unit::TestCase
|
30
|
+
|
31
|
+
include UTF8ByteTest
|
32
|
+
|
33
|
+
def test_entries_should_be_one_byte_for_ascii_char
|
34
|
+
assert_equal 1, "a".to_utf8_chars.first.length
|
35
|
+
end
|
36
|
+
|
37
|
+
def test_entries_should_be_two_bytes_for_latin_char_with_diacritics
|
38
|
+
assert_equal 2, "¡".to_utf8_chars.first.length
|
26
39
|
end
|
27
40
|
|
28
|
-
def
|
29
|
-
"
|
41
|
+
def test_entries_should_be_three_bytes_for_basic_multilingual_char
|
42
|
+
assert_equal 3, "आ".to_utf8_chars.first.length
|
30
43
|
end
|
31
44
|
|
32
|
-
def
|
33
|
-
|
45
|
+
def test_entries_should_be_four_bytes_for_other_chars
|
46
|
+
u = UTF8Utils::Chars.new("")
|
47
|
+
# Editors tend to freak out with chars in this plane, so just stub the
|
48
|
+
# bytes field instead. This char is U+10404, DESERET CAPITAL LETTER LONG O.
|
49
|
+
u.stubs(:bytes).returns([240, 144, 144, 132].map { |b| UTF8Utils::Byte.new(b)})
|
50
|
+
assert_equal 4, u.first.length
|
34
51
|
end
|
35
52
|
|
36
|
-
def
|
37
|
-
|
53
|
+
def test_should_detect_valid_chars
|
54
|
+
"cañón आ".to_utf8_chars.each_char {|c| assert c.valid? }
|
38
55
|
end
|
39
56
|
|
40
|
-
def
|
41
|
-
|
57
|
+
def test_should_detect_invalid_chars
|
58
|
+
"\x92".to_utf8_chars.each_char {|c| assert c.invalid? }
|
42
59
|
end
|
43
60
|
|
44
|
-
def
|
45
|
-
|
46
|
-
|
61
|
+
def test_should_split_correctly_with_invalid_chars
|
62
|
+
assert_equal 3, "a\x92a".to_utf8_chars.entries.length
|
63
|
+
end
|
64
|
+
|
65
|
+
def test_should_tidy_bytes
|
66
|
+
assert_equal "a’a", "a\x92a".to_utf8_chars.tidy_bytes.to_s
|
67
|
+
assert_equal "Simón Bolívar", "Sim\xF3n Bol\xEDvar".to_utf8_chars.tidy_bytes.to_s
|
47
68
|
end
|
48
69
|
|
49
70
|
end
|
metadata
CHANGED
@@ -3,10 +3,10 @@ name: utf8_utils
|
|
3
3
|
version: !ruby/object:Gem::Version
|
4
4
|
prerelease: false
|
5
5
|
segments:
|
6
|
+
- 1
|
6
7
|
- 0
|
7
8
|
- 0
|
8
|
-
|
9
|
-
version: 0.0.1
|
9
|
+
version: 1.0.0
|
10
10
|
platform: ruby
|
11
11
|
authors:
|
12
12
|
- Norman Clarke
|
@@ -14,10 +14,21 @@ autorequire:
|
|
14
14
|
bindir: bin
|
15
15
|
cert_chain: []
|
16
16
|
|
17
|
-
date: 2010-
|
17
|
+
date: 2010-04-07 00:00:00 -03:00
|
18
18
|
default_executable:
|
19
|
-
dependencies:
|
20
|
-
|
19
|
+
dependencies:
|
20
|
+
- !ruby/object:Gem::Dependency
|
21
|
+
name: mocha
|
22
|
+
prerelease: false
|
23
|
+
requirement: &id001 !ruby/object:Gem::Requirement
|
24
|
+
requirements:
|
25
|
+
- - ">="
|
26
|
+
- !ruby/object:Gem::Version
|
27
|
+
segments:
|
28
|
+
- 0
|
29
|
+
version: "0"
|
30
|
+
type: :development
|
31
|
+
version_requirements: *id001
|
21
32
|
description: Utilities for cleaning up UTF8 strings. Compatible with Ruby 1.8.6 - 1.9.x
|
22
33
|
email: norman@njclarke.com
|
23
34
|
executables: []
|
@@ -27,6 +38,9 @@ extensions: []
|
|
27
38
|
extra_rdoc_files: []
|
28
39
|
|
29
40
|
files:
|
41
|
+
- lib/utf8_utils/byte.rb
|
42
|
+
- lib/utf8_utils/char.rb
|
43
|
+
- lib/utf8_utils/chars.rb
|
30
44
|
- lib/utf8_utils/version.rb
|
31
45
|
- lib/utf8_utils.rb
|
32
46
|
- README.md
|