utf8_utils 0.0.1 → 1.0.0
Sign up to get free protection for your applications and to get access to all the features.
- data/README.md +29 -7
- data/Rakefile +9 -0
- data/lib/utf8_utils.rb +46 -149
- data/lib/utf8_utils/byte.rb +86 -0
- data/lib/utf8_utils/char.rb +52 -0
- data/lib/utf8_utils/chars.rb +59 -0
- data/lib/utf8_utils/version.rb +3 -3
- data/test/utf8_utils_test.rb +46 -25
- metadata +19 -5
data/README.md
CHANGED
@@ -11,7 +11,7 @@ issues with it, I'll probably try patching it into ActiveSupport.
|
|
11
11
|
|
12
12
|
Here's what happens when you try to access a string with invalid UTF-8 characters in Ruby 1.9:
|
13
13
|
|
14
|
-
ruby-1.9.1-p378 > "my messed up \x92 string".split(//)
|
14
|
+
ruby-1.9.1-p378 > "my messed up \x92 string".split(//u)
|
15
15
|
ArgumentError: invalid byte sequence in UTF-8
|
16
16
|
from (irb):3:in `split'
|
17
17
|
from (irb):3
|
@@ -19,7 +19,7 @@ Here's what happens when you try to access a string with invalid UTF-8 character
|
|
19
19
|
|
20
20
|
## The Solution
|
21
21
|
|
22
|
-
ruby-1.9.1-p378 > "my messed up \x92 string".
|
22
|
+
ruby-1.9.1-p378 > "my messed up \x92 string".to_utf8_chars.tidy_bytes.to_s.split(//u)
|
23
23
|
=> ["m", "y", " ", "m", "e", "s", "s", "e", "d", " ", "u", "p", " ", "’", " ", "s", "t", "r", "i", "n", "g"]
|
24
24
|
|
25
25
|
Amazing in its brevity and elegance, huh? Ok, maybe not really but if you have
|
@@ -30,6 +30,25 @@ Note that like ActiveSupport, it naively assumes if you have invalid UTF8
|
|
30
30
|
characters, they are either Windows CP1252 or ISO8859-1. In practice this isn't
|
31
31
|
a bad assumption, but may not always work.
|
32
32
|
|
33
|
+
Unlike ActiveSupport, however, the performance of this library is **very** poor
|
34
|
+
right now. Since my intention is for this to be used mostly for very short
|
35
|
+
strings, it should, however, be good enough for many kinds of applications.
|
36
|
+
|
37
|
+
How poor is "very poor?" Have a look:
|
38
|
+
|
39
|
+
|
40
|
+
| ACTIVE_SUPPORT | UTF8_UTILS |
|
41
|
+
----------------------------------------------------------
|
42
|
+
tidy bytes x2000 | 0.087 | 1.225 |
|
43
|
+
==========================================================
|
44
|
+
Total | 0.087 | 1.225 |
|
45
|
+
|
46
|
+
|
47
|
+
This will improve quite a bit soon, as I'm pretty well aware of where the
|
48
|
+
slowness is coming from. If performance is important for you now though, by all
|
49
|
+
means use another library (if you can find one) until I've made a few more
|
50
|
+
releases.
|
51
|
+
|
33
52
|
## Getting it
|
34
53
|
|
35
54
|
gem install utf8_utils
|
@@ -37,15 +56,16 @@ a bad assumption, but may not always work.
|
|
37
56
|
|
38
57
|
## Using it
|
39
58
|
|
59
|
+
# encoding: utf-8
|
40
60
|
require "utf8_utils"
|
41
61
|
|
42
|
-
#
|
43
|
-
"hello
|
44
|
-
puts
|
62
|
+
# Iterate over multibyte characters
|
63
|
+
"hello ーチエンジンの日本".to_utf8_chars.each_char do |char|
|
64
|
+
puts char.valid?
|
45
65
|
end
|
46
66
|
|
47
67
|
# tidy bytes
|
48
|
-
good_string = bad_string.
|
68
|
+
good_string = bad_string.to_utf8_chars.tidy_bytes.to_s
|
49
69
|
|
50
70
|
## API Docs
|
51
71
|
|
@@ -53,6 +73,8 @@ a bad assumption, but may not always work.
|
|
53
73
|
|
54
74
|
## Credits
|
55
75
|
|
56
|
-
Created by Norman Clarke
|
76
|
+
Created by Norman Clarke. Some code was taken from
|
77
|
+
[ActiveSupport](http://github.com/rails/rails/tree/master/activesupport/), as
|
78
|
+
indicated in the source code.
|
57
79
|
|
58
80
|
Copyright (c) 2010, released under the MIT license.
|
data/Rakefile
CHANGED
@@ -9,6 +9,15 @@ CLEAN << "pkg" << "doc" << "coverage" << ".yardoc"
|
|
9
9
|
Rake::GemPackageTask.new(eval(File.read("utf8_utils.gemspec"))) { |pkg| }
|
10
10
|
Rake::TestTask.new(:test) { |t| t.pattern = "test/**/*_test.rb" }
|
11
11
|
|
12
|
+
begin
|
13
|
+
require "yard"
|
14
|
+
YARD::Rake::YardocTask.new do |t|
|
15
|
+
t.options = ["--output-dir=doc"]
|
16
|
+
t.options << "--files" << "README.md"
|
17
|
+
end
|
18
|
+
rescue LoadError
|
19
|
+
end
|
20
|
+
|
12
21
|
Rake::RDocTask.new do |r|
|
13
22
|
r.rdoc_dir = "doc"
|
14
23
|
r.rdoc_files.include "lib/**/*.rb"
|
data/lib/utf8_utils.rb
CHANGED
@@ -1,156 +1,53 @@
|
|
1
|
-
|
2
|
-
|
3
|
-
|
4
|
-
module UTF8Utils
|
5
|
-
|
6
|
-
class Codepoints
|
7
|
-
|
8
|
-
attr_accessor :chars
|
9
|
-
attr :position
|
10
|
-
|
11
|
-
include Enumerable
|
12
|
-
|
13
|
-
CP1251 = {
|
14
|
-
128 => [226, 130, 172],
|
15
|
-
129 => nil,
|
16
|
-
130 => [226, 128, 154],
|
17
|
-
131 => [198, 146],
|
18
|
-
132 => [226, 128, 158],
|
19
|
-
133 => [226, 128, 166],
|
20
|
-
134 => [226, 128, 160],
|
21
|
-
135 => [226, 128, 161],
|
22
|
-
136 => [203, 134],
|
23
|
-
137 => [226, 128, 176],
|
24
|
-
138 => [197, 160],
|
25
|
-
139 => [226, 128, 185],
|
26
|
-
140 => [197, 146],
|
27
|
-
141 => nil,
|
28
|
-
142 => [197, 189],
|
29
|
-
143 => nil,
|
30
|
-
144 => nil,
|
31
|
-
145 => [226, 128, 152],
|
32
|
-
146 => [226, 128, 153],
|
33
|
-
147 => [226, 128, 156],
|
34
|
-
148 => [226, 128, 157],
|
35
|
-
149 => [226, 128, 162],
|
36
|
-
150 => [226, 128, 147],
|
37
|
-
151 => [226, 128, 148],
|
38
|
-
152 => [203, 156],
|
39
|
-
153 => [226, 132, 162],
|
40
|
-
154 => [197, 161],
|
41
|
-
155 => [226, 128, 186],
|
42
|
-
156 => [197, 147],
|
43
|
-
157 => nil,
|
44
|
-
158 => [197, 190],
|
45
|
-
159 => [197, 184]
|
46
|
-
}
|
47
|
-
|
48
|
-
def initialize(string)
|
49
|
-
@position = 0
|
50
|
-
# 1.8.6's `each_byte` does not return an Enumerable
|
51
|
-
if RUBY_VERSION < "1.8.7"
|
52
|
-
@chars = []
|
53
|
-
string.each_byte { |b| @chars << b }
|
54
|
-
else
|
55
|
-
# Create an array of bytes without raising an ArgumentError in 1.9.x
|
56
|
-
# when the string contains invalid UTF-8 characters
|
57
|
-
@chars = string.each_byte.entries
|
58
|
-
end
|
59
|
-
end
|
60
|
-
|
61
|
-
# Attempt to clean up malformed characters.
|
62
|
-
def tidy_bytes
|
63
|
-
Codepoints.new(entries.map {|c| c.tidy.to_char}.compact.join)
|
64
|
-
end
|
65
|
-
|
66
|
-
# Cast to string.
|
67
|
-
def to_s
|
68
|
-
entries.map {|e| e.to_char}.join
|
69
|
-
end
|
70
|
-
|
71
|
-
private
|
72
|
-
|
73
|
-
def each(&block)
|
74
|
-
while codepoint = next_codepoint
|
75
|
-
yield codepoint
|
76
|
-
end
|
77
|
-
@position = 0
|
78
|
-
end
|
79
|
-
|
80
|
-
alias :each_codepoint :each
|
81
|
-
public :each_codepoint
|
1
|
+
require File.expand_path("../utf8_utils/byte", __FILE__)
|
2
|
+
require File.expand_path("../utf8_utils/char", __FILE__)
|
3
|
+
require File.expand_path("../utf8_utils/chars", __FILE__)
|
82
4
|
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
when 224..239 then 3
|
88
|
-
else 4
|
89
|
-
end
|
90
|
-
end
|
91
|
-
|
92
|
-
def next_codepoint
|
93
|
-
codepoint = Codepoint.new(chars.slice(position, bytes_to_pull))
|
94
|
-
if codepoint.invalid?
|
95
|
-
codepoint = Codepoint.new(chars.slice(position, 1))
|
96
|
-
end
|
97
|
-
@position = position + codepoint.size
|
98
|
-
codepoint unless codepoint.empty?
|
99
|
-
end
|
100
|
-
|
101
|
-
end
|
102
|
-
|
103
|
-
class Codepoint < Array
|
104
|
-
|
105
|
-
# Borrowed from the regexp in ActiveSupport, which in turn had been borrowed from
|
106
|
-
# the Kconv library by Shinji KONO - (also as seen on the W3C site).
|
107
|
-
# See also http://en.wikipedia.org/wiki/UTF-8
|
108
|
-
def valid?
|
109
|
-
if length == 1
|
110
|
-
(0..127) === self[0]
|
111
|
-
elsif length == 2
|
112
|
-
(192..223) === self[0] && (128..191) === self[1]
|
113
|
-
elsif length == 3
|
114
|
-
(self[0] == 224 && ((160..191) === self[1] && (128..191) === self[2])) ||
|
115
|
-
((225..239) === self[0] && (128..191) === self[1] && (128..191) === self[2])
|
116
|
-
elsif length == 4
|
117
|
-
(self[0] == 240 && (144..191) === self[1] && (128..191) === self[2] && (128..191) === self[3]) ||
|
118
|
-
((241..243) === self[0] && (128..191) === self[1] && (128..191) === self[2] && (128..191) === self[3]) ||
|
119
|
-
(self[0] == 244 && (128..143) === self[1] && (128..191) === self[2] && (128..191) === self[3])
|
120
|
-
end
|
121
|
-
end
|
122
|
-
|
123
|
-
# Attempt to rescue a valid UTF-8 character from a malformed codepoint. It will first
|
124
|
-
# attempt to convert from CP1251, and if this isn't possible, it prepends a valid leading
|
125
|
-
# byte, treating the character as the last byte in a two-byte codepoint.
|
126
|
-
# Note that much of the logic here is taken from ActiveSupport; the difference is that this
|
127
|
-
# works for Ruby 1.8.6 - 1.9.1.
|
128
|
-
def tidy
|
129
|
-
return self if valid?
|
130
|
-
if Codepoints::CP1251.key? self[0]
|
131
|
-
self.class.new [Codepoints::CP1251[self[0]]]
|
132
|
-
elsif self[0] < 192
|
133
|
-
self.class.new [194, self[0]]
|
134
|
-
else
|
135
|
-
self.class.new [195, self[0] - 64]
|
136
|
-
end
|
137
|
-
end
|
138
|
-
|
139
|
-
def invalid?
|
140
|
-
!valid?
|
141
|
-
end
|
5
|
+
# Wraps a string as an array of bytes and allows some naive cleanup operations
|
6
|
+
# as a workaround for Ruby 1.9's crappy encoding support that throws exceptions
|
7
|
+
# when attempting to access UTF8 strings with invalid characters.
|
8
|
+
module UTF8Utils
|
142
9
|
|
143
|
-
|
144
|
-
|
145
|
-
|
146
|
-
|
10
|
+
# CP1251 decimal byte => UTF-8 approximation as an array of bytes
|
11
|
+
CP1251 = {
|
12
|
+
128 => [226, 130, 172],
|
13
|
+
129 => nil,
|
14
|
+
130 => [226, 128, 154],
|
15
|
+
131 => [198, 146],
|
16
|
+
132 => [226, 128, 158],
|
17
|
+
133 => [226, 128, 166],
|
18
|
+
134 => [226, 128, 160],
|
19
|
+
135 => [226, 128, 161],
|
20
|
+
136 => [203, 134],
|
21
|
+
137 => [226, 128, 176],
|
22
|
+
138 => [197, 160],
|
23
|
+
139 => [226, 128, 185],
|
24
|
+
140 => [197, 146],
|
25
|
+
141 => nil,
|
26
|
+
142 => [197, 189],
|
27
|
+
143 => nil,
|
28
|
+
144 => nil,
|
29
|
+
145 => [226, 128, 152],
|
30
|
+
146 => [226, 128, 153],
|
31
|
+
147 => [226, 128, 156],
|
32
|
+
148 => [226, 128, 157],
|
33
|
+
149 => [226, 128, 162],
|
34
|
+
150 => [226, 128, 147],
|
35
|
+
151 => [226, 128, 148],
|
36
|
+
152 => [203, 156],
|
37
|
+
153 => [226, 132, 162],
|
38
|
+
154 => [197, 161],
|
39
|
+
155 => [226, 128, 186],
|
40
|
+
156 => [197, 147],
|
41
|
+
157 => nil,
|
42
|
+
158 => [197, 190],
|
43
|
+
159 => [197, 184]
|
44
|
+
}
|
147
45
|
|
148
|
-
end
|
149
46
|
end
|
150
47
|
|
151
|
-
# Get an array of UTF8
|
48
|
+
# Get an array of UTF8 chars from a string.
|
152
49
|
class String
|
153
|
-
def
|
154
|
-
UTF8Utils::
|
50
|
+
def to_utf8_chars
|
51
|
+
UTF8Utils::Chars.new self
|
155
52
|
end
|
156
|
-
end
|
53
|
+
end
|
@@ -0,0 +1,86 @@
|
|
1
|
+
module UTF8Utils
|
2
|
+
|
3
|
+
# A single UTF-8 byte.
|
4
|
+
class Byte
|
5
|
+
|
6
|
+
attr_reader :byte
|
7
|
+
|
8
|
+
def initialize(byte)
|
9
|
+
@byte = byte
|
10
|
+
end
|
11
|
+
|
12
|
+
def codepoint_mask
|
13
|
+
case leading_1_bits
|
14
|
+
when 0 then 0
|
15
|
+
when 1 then 0b1000_0000
|
16
|
+
when 2 then 0b1100_0000
|
17
|
+
when 3 then 0b1110_0000
|
18
|
+
when 4 then 0b1111_0000
|
19
|
+
end
|
20
|
+
end
|
21
|
+
|
22
|
+
# Is this a continuation byte?
|
23
|
+
def continuation?
|
24
|
+
leading_1_bits == 1
|
25
|
+
end
|
26
|
+
|
27
|
+
# How many continuation bytes should follow this byte?
|
28
|
+
def continuations
|
29
|
+
bits = leading_1_bits
|
30
|
+
bits < 2 ? 0 : bits - 1
|
31
|
+
end
|
32
|
+
|
33
|
+
def invalid?
|
34
|
+
!valid?
|
35
|
+
end
|
36
|
+
|
37
|
+
# From Wikipedia's entry on UTF-8:
|
38
|
+
#
|
39
|
+
# The UTF-8 encoding is variable-width, with each character represented by 1
|
40
|
+
# to 4 bytes. Each byte has 0–4 leading consecutive 1 bits followed by a zero bit
|
41
|
+
# to indicate its type. N 1 bits indicates the first byte in a N-byte sequence,
|
42
|
+
# with the exception that zero 1 bits indicates a one-byte sequence while one 1
|
43
|
+
# bit indicates a continuation byte in a multi-byte sequence (this was done for
|
44
|
+
# ASCII compatibility).
|
45
|
+
# @see http://en.wikipedia.org/wiki/Utf-8
|
46
|
+
def leading_1_bits
|
47
|
+
nibble = byte >> 4
|
48
|
+
if nibble < 0b1000 then 0 # single-byte chars
|
49
|
+
elsif nibble < 0b1100 then 1 # continuation byte
|
50
|
+
elsif nibble < 0b1110 then 2 # start of 2-byte char
|
51
|
+
elsif nibble < 0b1111 then 3 # 3-byte char
|
52
|
+
else 4 # 4-byte char
|
53
|
+
end
|
54
|
+
end
|
55
|
+
|
56
|
+
# Start of a 2-byte sequence, but code point ≤ 127
|
57
|
+
# @see http://tools.ietf.org/html/rfc3629
|
58
|
+
def overlong?
|
59
|
+
(192..193) === byte
|
60
|
+
end
|
61
|
+
|
62
|
+
# RFC 3629 reserves 245-253 for the leading bytes of 4-6 byte sequences.
|
63
|
+
# @see http://tools.ietf.org/html/rfc3629
|
64
|
+
def restricted?
|
65
|
+
(245..253) === byte
|
66
|
+
end
|
67
|
+
|
68
|
+
def to_i
|
69
|
+
byte
|
70
|
+
end
|
71
|
+
|
72
|
+
# Bytes 254 and 255 are not defined by the original UTF-8 spec.
|
73
|
+
def undefined?
|
74
|
+
(254..255) === byte
|
75
|
+
end
|
76
|
+
|
77
|
+
def valid?
|
78
|
+
!(overlong? or restricted? or undefined?)
|
79
|
+
end
|
80
|
+
|
81
|
+
def codepoint_bits
|
82
|
+
byte ^ codepoint_mask
|
83
|
+
end
|
84
|
+
|
85
|
+
end
|
86
|
+
end
|
@@ -0,0 +1,52 @@
|
|
1
|
+
module UTF8Utils
|
2
|
+
|
3
|
+
class Char < Array
|
4
|
+
|
5
|
+
# Given the first byte, how many bytes long should this character be?
|
6
|
+
def expected_length
|
7
|
+
(first.continuations rescue 0) + 1
|
8
|
+
end
|
9
|
+
|
10
|
+
# Is the character invalid?
|
11
|
+
def invalid?
|
12
|
+
!valid?
|
13
|
+
end
|
14
|
+
|
15
|
+
# Attempt to rescue a valid UTF-8 character from a malformed character. It
|
16
|
+
# will first attempt to convert from CP1251, and if this isn't possible, it
|
17
|
+
# prepends a valid leading byte, treating the character as the last byte in
|
18
|
+
# a two-byte character. Note that much of the logic here is taken from
|
19
|
+
# ActiveSupport; the difference is that this works for Ruby 1.8.6 - 1.9.1.
|
20
|
+
def tidy
|
21
|
+
return self if valid?
|
22
|
+
byte = first.to_i
|
23
|
+
if UTF8Utils::CP1251.key? byte
|
24
|
+
self.class.new [UTF8Utils::CP1251[byte]]
|
25
|
+
elsif byte < 192
|
26
|
+
self.class.new [194, byte]
|
27
|
+
else
|
28
|
+
self.class.new [195, byte - 64]
|
29
|
+
end
|
30
|
+
end
|
31
|
+
|
32
|
+
# Get a multibyte character from the bytes.
|
33
|
+
def to_s
|
34
|
+
flatten.map {|b| b.to_i }.pack("C*").unpack("U*").pack("U*")
|
35
|
+
end
|
36
|
+
|
37
|
+
def to_codepoint
|
38
|
+
flatten.map {|b| b.to_i }.pack("C*").unpack("U*")[0]
|
39
|
+
end
|
40
|
+
|
41
|
+
def valid?
|
42
|
+
return false if length != expected_length
|
43
|
+
each_with_index do |byte, index|
|
44
|
+
return false if byte.invalid?
|
45
|
+
return false if index == 0 and byte.continuation?
|
46
|
+
return false if index > 0 and !byte.continuation?
|
47
|
+
end
|
48
|
+
true
|
49
|
+
end
|
50
|
+
|
51
|
+
end
|
52
|
+
end
|
@@ -0,0 +1,59 @@
|
|
1
|
+
module UTF8Utils
|
2
|
+
class Chars
|
3
|
+
|
4
|
+
attr :bytes
|
5
|
+
attr :position
|
6
|
+
|
7
|
+
include Enumerable
|
8
|
+
|
9
|
+
def initialize(string)
|
10
|
+
@position = 0
|
11
|
+
begin
|
12
|
+
# Create an array of bytes without raising an ArgumentError in 1.9.x
|
13
|
+
# when the string contains invalid UTF-8 characters
|
14
|
+
@bytes = string.each_byte.map {|b| Byte.new(b)}
|
15
|
+
rescue LocalJumpError
|
16
|
+
# 1.8.6's `each_byte` does not return an Enumerable
|
17
|
+
@bytes = []
|
18
|
+
string.each_byte { |b| @bytes << Byte.new(b) }
|
19
|
+
end
|
20
|
+
end
|
21
|
+
|
22
|
+
# Attempt to clean up malformed characters.
|
23
|
+
def tidy_bytes
|
24
|
+
Chars.new(entries.map {|c| c.tidy.to_s}.compact.join)
|
25
|
+
end
|
26
|
+
|
27
|
+
# Cast to string.
|
28
|
+
def to_s
|
29
|
+
entries.flatten.map {|b| b.to_i }.pack("C*").unpack("U*").pack("U*")
|
30
|
+
end
|
31
|
+
|
32
|
+
def first
|
33
|
+
entries.first
|
34
|
+
end
|
35
|
+
|
36
|
+
private
|
37
|
+
|
38
|
+
def each(&block)
|
39
|
+
while char = next_char
|
40
|
+
yield char
|
41
|
+
end
|
42
|
+
@position = 0
|
43
|
+
end
|
44
|
+
|
45
|
+
alias :each_char :each
|
46
|
+
public :each_char
|
47
|
+
|
48
|
+
def next_char
|
49
|
+
return if !bytes[position]
|
50
|
+
char = Char.new(bytes.slice(position, bytes[position].continuations + 1))
|
51
|
+
if char.invalid?
|
52
|
+
char = Char.new(bytes.slice(position, 1))
|
53
|
+
end
|
54
|
+
@position = position + char.size
|
55
|
+
char unless char.empty?
|
56
|
+
end
|
57
|
+
|
58
|
+
end
|
59
|
+
end
|
data/lib/utf8_utils/version.rb
CHANGED
data/test/utf8_utils_test.rb
CHANGED
@@ -1,49 +1,70 @@
|
|
1
1
|
# encoding: utf-8
|
2
2
|
|
3
|
+
require "rubygems"
|
3
4
|
require "test/unit"
|
4
|
-
require
|
5
|
+
require "mocha"
|
6
|
+
require File.expand_path("../../lib/utf8_utils", __FILE__)
|
5
7
|
|
6
|
-
|
8
|
+
module UTF8ByteTest
|
7
9
|
|
8
|
-
def
|
9
|
-
|
10
|
+
def test_leading_1_bits
|
11
|
+
[0, 128, 194, 224, 240].each_with_index do |n, i|
|
12
|
+
byte = UTF8Utils::Byte.new(n)
|
13
|
+
assert_equal i, byte.leading_1_bits
|
14
|
+
end
|
10
15
|
end
|
11
16
|
|
12
|
-
def
|
13
|
-
|
17
|
+
def test_invalid_bytes
|
18
|
+
[192, 193, 245, 255].each do |n|
|
19
|
+
assert !UTF8Utils::Byte.new(n).valid?
|
20
|
+
end
|
14
21
|
end
|
15
22
|
|
16
|
-
def
|
17
|
-
|
23
|
+
def test_continuation
|
24
|
+
assert UTF8Utils::Byte.new(130).continuation?
|
18
25
|
end
|
19
26
|
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
27
|
+
end
|
28
|
+
|
29
|
+
class UTF8UtilsTest < Test::Unit::TestCase
|
30
|
+
|
31
|
+
include UTF8ByteTest
|
32
|
+
|
33
|
+
def test_entries_should_be_one_byte_for_ascii_char
|
34
|
+
assert_equal 1, "a".to_utf8_chars.first.length
|
35
|
+
end
|
36
|
+
|
37
|
+
def test_entries_should_be_two_bytes_for_latin_char_with_diacritics
|
38
|
+
assert_equal 2, "¡".to_utf8_chars.first.length
|
26
39
|
end
|
27
40
|
|
28
|
-
def
|
29
|
-
"
|
41
|
+
def test_entries_should_be_three_bytes_for_basic_multilingual_char
|
42
|
+
assert_equal 3, "आ".to_utf8_chars.first.length
|
30
43
|
end
|
31
44
|
|
32
|
-
def
|
33
|
-
|
45
|
+
def test_entries_should_be_four_bytes_for_other_chars
|
46
|
+
u = UTF8Utils::Chars.new("")
|
47
|
+
# Editors tend to freak out with chars in this plane, so just stub the
|
48
|
+
# bytes field instead. This char is U+10404, DESERET CAPITAL LETTER LONG O.
|
49
|
+
u.stubs(:bytes).returns([240, 144, 144, 132].map { |b| UTF8Utils::Byte.new(b)})
|
50
|
+
assert_equal 4, u.first.length
|
34
51
|
end
|
35
52
|
|
36
|
-
def
|
37
|
-
|
53
|
+
def test_should_detect_valid_chars
|
54
|
+
"cañón आ".to_utf8_chars.each_char {|c| assert c.valid? }
|
38
55
|
end
|
39
56
|
|
40
|
-
def
|
41
|
-
|
57
|
+
def test_should_detect_invalid_chars
|
58
|
+
"\x92".to_utf8_chars.each_char {|c| assert c.invalid? }
|
42
59
|
end
|
43
60
|
|
44
|
-
def
|
45
|
-
|
46
|
-
|
61
|
+
def test_should_split_correctly_with_invalid_chars
|
62
|
+
assert_equal 3, "a\x92a".to_utf8_chars.entries.length
|
63
|
+
end
|
64
|
+
|
65
|
+
def test_should_tidy_bytes
|
66
|
+
assert_equal "a’a", "a\x92a".to_utf8_chars.tidy_bytes.to_s
|
67
|
+
assert_equal "Simón Bolívar", "Sim\xF3n Bol\xEDvar".to_utf8_chars.tidy_bytes.to_s
|
47
68
|
end
|
48
69
|
|
49
70
|
end
|
metadata
CHANGED
@@ -3,10 +3,10 @@ name: utf8_utils
|
|
3
3
|
version: !ruby/object:Gem::Version
|
4
4
|
prerelease: false
|
5
5
|
segments:
|
6
|
+
- 1
|
6
7
|
- 0
|
7
8
|
- 0
|
8
|
-
|
9
|
-
version: 0.0.1
|
9
|
+
version: 1.0.0
|
10
10
|
platform: ruby
|
11
11
|
authors:
|
12
12
|
- Norman Clarke
|
@@ -14,10 +14,21 @@ autorequire:
|
|
14
14
|
bindir: bin
|
15
15
|
cert_chain: []
|
16
16
|
|
17
|
-
date: 2010-
|
17
|
+
date: 2010-04-07 00:00:00 -03:00
|
18
18
|
default_executable:
|
19
|
-
dependencies:
|
20
|
-
|
19
|
+
dependencies:
|
20
|
+
- !ruby/object:Gem::Dependency
|
21
|
+
name: mocha
|
22
|
+
prerelease: false
|
23
|
+
requirement: &id001 !ruby/object:Gem::Requirement
|
24
|
+
requirements:
|
25
|
+
- - ">="
|
26
|
+
- !ruby/object:Gem::Version
|
27
|
+
segments:
|
28
|
+
- 0
|
29
|
+
version: "0"
|
30
|
+
type: :development
|
31
|
+
version_requirements: *id001
|
21
32
|
description: Utilities for cleaning up UTF8 strings. Compatible with Ruby 1.8.6 - 1.9.x
|
22
33
|
email: norman@njclarke.com
|
23
34
|
executables: []
|
@@ -27,6 +38,9 @@ extensions: []
|
|
27
38
|
extra_rdoc_files: []
|
28
39
|
|
29
40
|
files:
|
41
|
+
- lib/utf8_utils/byte.rb
|
42
|
+
- lib/utf8_utils/char.rb
|
43
|
+
- lib/utf8_utils/chars.rb
|
30
44
|
- lib/utf8_utils/version.rb
|
31
45
|
- lib/utf8_utils.rb
|
32
46
|
- README.md
|