unicode 0.4.2-x86-mswin32-60 → 0.4.3-x86-mswin32-60
Sign up to get free protection for your applications and to get access to all the features.
- data/README +29 -7
- data/ext/unicode/unicode.c +379 -16
- data/ext/unicode/unidata.map +24536 -24435
- data/ext/unicode/wstring.c +69 -1
- data/ext/unicode/wstring.h +2 -0
- data/lib/unicode/1.8/unicode_native.so +0 -0
- data/lib/unicode/1.9/unicode_native.so +0 -0
- data/tools/README +3 -2
- data/tools/mkunidata.rb +136 -12
- data/unicode.gemspec +2 -2
- metadata +5 -5
data/ext/unicode/wstring.c
CHANGED
@@ -43,7 +43,10 @@ WStr_free(WString* str)
|
|
43
43
|
{
|
44
44
|
str->size = 0;
|
45
45
|
str->len = 0;
|
46
|
-
|
46
|
+
if (str->str) {
|
47
|
+
free(str->str);
|
48
|
+
str->str = NULL;
|
49
|
+
}
|
47
50
|
}
|
48
51
|
|
49
52
|
int
|
@@ -164,6 +167,59 @@ WStr_allocWithUTF8(WString* s, const char* in)
|
|
164
167
|
return s;
|
165
168
|
}
|
166
169
|
|
170
|
+
/*
 * Initialise `s` with WStr_alloc() and fill it with the code points
 * decoded from the first `len` bytes of the UTF-8 buffer `in`.
 *
 *   s   - string to initialise and append to
 *   in  - UTF-8 bytes; need not be NUL-terminated.  When NULL, `s` is
 *         left as WStr_alloc() produced it.
 *   len - number of bytes of `in` to read; values <= 0 read nothing
 *
 * Returns `s`, or NULL when a continuation byte (0b10nnnnnn) arrives
 * while none is expected, or when a byte is 0xFE/0xFF.  On a NULL
 * return `s` is not released here and keeps whatever was appended
 * before the offending byte.
 *
 * The decoder is lenient in the following ways:
 *   - lead bytes for the old 5- and 6-byte forms are accepted;
 *   - overlong forms and the resulting code point value are not checked;
 *   - a multi-byte sequence that is cut short (by an ASCII byte, by
 *     another lead byte, or by the end of the input) is discarded
 *     without reporting an error.
 */
WString*
WStr_allocWithUTF8L(WString* s, const char* in, int len)
{
  int pos;
  int pending = 0;  /* continuation bytes still expected */
  int code = 0;     /* code point being assembled */

  WStr_alloc(s);
  if (in == NULL)
    return s;

  for (pos = 0; pos < len; pos++) {
    unsigned char byte = in[pos];

    if (byte < 0x80) {                 /* 0b0nnnnnnn (7bit) */
      WStr_addWChar(s, byte);
      pending = 0;                     /* abandons any unfinished sequence */
      continue;
    }

    if (byte < 0xc0) {                 /* 0b10nnnnnn (continuation) */
      if (pending == 0)
        return NULL;
      code = (code << 6) | (byte & 0x3f);
      pending--;
      if (pending == 0)
        WStr_addWChar(s, code);
      continue;
    }

    /* Lead byte: the count of leading 1 bits selects the sequence length. */
    if (byte < 0xe0) {                 /* 0b110nnnnn (11bit) */
      pending = 1;
      code = byte & 0x1f;
    }
    else if (byte < 0xf0) {            /* 0b1110nnnn (16bit) */
      pending = 2;
      code = byte & 0x0f;
    }
    else if (byte < 0xf8) {            /* 0b11110nnn (21bit) */
      pending = 3;
      code = byte & 0x07;
    }
    else if (byte < 0xfc) {            /* 0b111110nn (26bit) */
      pending = 4;
      code = byte & 0x03;
    }
    else if (byte < 0xfe) {            /* 0b1111110n (31bit) */
      pending = 5;
      code = byte & 0x01;
    }
    else {                             /* 0xFE / 0xFF */
      return NULL;
    }
  }

  return s;
}
|
222
|
+
|
167
223
|
UString*
|
168
224
|
WStr_convertIntoUString(WString* wstr, UString* ustr)
|
169
225
|
{
|
@@ -176,6 +232,18 @@ WStr_convertIntoUString(WString* wstr, UString* ustr)
|
|
176
232
|
return ustr;
|
177
233
|
}
|
178
234
|
|
235
|
+
UString*
|
236
|
+
WStr_convertIntoUString2(WString* wstr, int start, int len, UString* ustr)
|
237
|
+
{
|
238
|
+
int i;
|
239
|
+
|
240
|
+
for (i = start; i < wstr->len && i < start + len; i++) {
|
241
|
+
UniStr_addWChar(ustr, wstr->str[i]);
|
242
|
+
}
|
243
|
+
|
244
|
+
return ustr;
|
245
|
+
}
|
246
|
+
|
179
247
|
void
|
180
248
|
WStr_dump(WString* s)
|
181
249
|
{
|
data/ext/unicode/wstring.h
CHANGED
@@ -24,6 +24,7 @@ typedef struct _WString {
|
|
24
24
|
|
25
25
|
WString* WStr_alloc(WString* str);
|
26
26
|
WString* WStr_allocWithUTF8(WString* s, const char* u);
|
27
|
+
WString* WStr_allocWithUTF8L(WString* s, const char* u, int len);
|
27
28
|
WString* WStr_enlarge(WString* str, int size);
|
28
29
|
void WStr_free(WString* str);
|
29
30
|
int WStr_addWChars(WString* s, const int* a, int len);
|
@@ -32,6 +33,7 @@ int WStr_pushWString(WString* s, const WString* add);
|
|
32
33
|
int WStr_addWChar2(WString* s, int a1, int a2);
|
33
34
|
int WStr_addWChar3(WString* s, int a1, int a2, int a3);
|
34
35
|
UString* WStr_convertIntoUString(WString* wstr, UString* ustr);
|
36
|
+
UString* WStr_convertIntoUString2(WString* wstr, int start, int len, UString* ustr);
|
35
37
|
void WStr_dump(WString* s);
|
36
38
|
|
37
39
|
#ifdef __cplusplus
|
Binary file
|
Binary file
|
data/tools/README
CHANGED
@@ -1,6 +1,7 @@
|
|
1
1
|
The bundled unidata.map is created from UnicodeData.txt,
|
2
|
-
DerivedNormalizationProps.txt
|
2
|
+
DerivedNormalizationProps.txt, SpecialCasing.txt and EastAsianWidth.txt
|
3
|
+
of Unicode 6.0.
|
3
4
|
|
4
5
|
To update unidata.map,
|
5
6
|
|
6
|
-
ruby mkunidata.rb UnicodeData.txt DerivedNormalizationProps.txt SpecialCasing.txt > unidata.map
|
7
|
+
ruby mkunidata.rb UnicodeData.txt DerivedNormalizationProps.txt SpecialCasing.txt EastAsianWidth.txt > unidata.map
|
data/tools/mkunidata.rb
CHANGED
@@ -7,22 +7,102 @@
|
|
7
7
|
HEAD=<<EOS
|
8
8
|
/*
|
9
9
|
* UnicodeData
|
10
|
-
* Copyright 1999, 2004, 2010 by yoshidam
|
10
|
+
* Copyright 1999, 2004, 2010, 2012 by yoshidam
|
11
11
|
*
|
12
12
|
*/
|
13
13
|
|
14
14
|
#ifndef _UNIDATA_MAP
|
15
15
|
#define _UNIDATA_MAP
|
16
16
|
|
17
|
+
EOS
|
18
|
+
|
19
|
+
HEAD1=<<EOS
|
20
|
+
|
21
|
+
enum GeneralCategory {
|
22
|
+
/* Letter */
|
23
|
+
c_Lu = 1, c_Ll, c_Lt, c_LC, c_Lm, c_Lo,
|
24
|
+
/* Mark */
|
25
|
+
c_Mn, c_Mc, c_Me,
|
26
|
+
/* Number */
|
27
|
+
c_Nd, c_Nl, c_No,
|
28
|
+
/* Punctuation */
|
29
|
+
c_Pc, c_Pd, c_Ps, c_Pe, c_Pi, c_Pf, c_Po,
|
30
|
+
/* Symbol */
|
31
|
+
c_Sm, c_Sc, c_Sk, c_So,
|
32
|
+
/* Separator */
|
33
|
+
c_Zs, c_Zl, c_Zp,
|
34
|
+
/* Other */
|
35
|
+
c_Cc, c_Cf, c_Cs, c_Co, c_Cn
|
36
|
+
};
|
37
|
+
|
38
|
+
const char* const gencat_abbr[] = {
|
39
|
+
"", /* 0 */
|
40
|
+
/* Letter */
|
41
|
+
"Lu", "Ll", "Lt", "LC", "Lm", "Lo",
|
42
|
+
/* Mark */
|
43
|
+
"Mn", "Mc", "Me",
|
44
|
+
/* Number */
|
45
|
+
"Nd", "Nl", "No",
|
46
|
+
/* Punctuation */
|
47
|
+
"Pc", "Pd", "Ps", "Pe", "Pi", "Pf", "Po",
|
48
|
+
/* Symbol */
|
49
|
+
"Sm", "Sc", "Sk", "So",
|
50
|
+
/* Separator */
|
51
|
+
"Zs", "Zl", "Zp",
|
52
|
+
/* Other */
|
53
|
+
"Cc", "Cf", "Cs", "Co", "Cn"
|
54
|
+
};
|
55
|
+
|
56
|
+
const char* const gencat_long[] = {
|
57
|
+
"",
|
58
|
+
"Uppercase_Letter",
|
59
|
+
"Lowercase_Letter",
|
60
|
+
"Titlecase_Letter",
|
61
|
+
"Cased_Letter",
|
62
|
+
"Modifier_Letter",
|
63
|
+
"Other_Letter",
|
64
|
+
"Nonspacing_Mark",
|
65
|
+
"Spacing_Mark",
|
66
|
+
"Enclosing_Mark",
|
67
|
+
"Decimal_Number",
|
68
|
+
"Letter_Number",
|
69
|
+
"Other_Number",
|
70
|
+
"Connector_Punctuation",
|
71
|
+
"Dash_Punctuation",
|
72
|
+
"Open_Punctuation",
|
73
|
+
"Close_Punctuation",
|
74
|
+
"Initial_Punctuation",
|
75
|
+
"Final_Punctuation",
|
76
|
+
"Other_Punctuation",
|
77
|
+
"Math_Symbol",
|
78
|
+
"Currency_Symbol",
|
79
|
+
"Modifier_Symbol",
|
80
|
+
"Other_Symbol",
|
81
|
+
"Space_Separator",
|
82
|
+
"Line_Separator",
|
83
|
+
"Paragraph_Separator",
|
84
|
+
"Control",
|
85
|
+
"Format",
|
86
|
+
"Surrogate",
|
87
|
+
"Private_Use",
|
88
|
+
"Unassigned"
|
89
|
+
};
|
90
|
+
|
91
|
+
enum EastAsianWidth {
|
92
|
+
w_N = 1, w_A, w_H, w_W, w_F, w_Na
|
93
|
+
};
|
94
|
+
|
17
95
|
struct unicode_data {
|
18
96
|
const int code;
|
19
|
-
const int combining_class;
|
20
|
-
const int exclusion;
|
21
97
|
const char* const canon;
|
22
98
|
const char* const compat;
|
23
|
-
const char* uppercase;
|
24
|
-
const char* lowercase;
|
25
|
-
const char* titlecase;
|
99
|
+
const char* const uppercase;
|
100
|
+
const char* const lowercase;
|
101
|
+
const char* const titlecase;
|
102
|
+
const unsigned char combining_class;
|
103
|
+
const unsigned char exclusion;
|
104
|
+
const unsigned char general_category;
|
105
|
+
const unsigned char east_asian_width;
|
26
106
|
};
|
27
107
|
|
28
108
|
static const struct unicode_data unidata[] = {
|
@@ -81,6 +161,11 @@ def printstr(str)
|
|
81
161
|
return '"' + ret + '"'
|
82
162
|
end
|
83
163
|
|
164
|
+
# The script needs exactly four input files.  Its standard output is the
# generated unidata.map, so the usage text must go to standard error, and a
# wrong invocation must end with a non-zero exit status.
if ARGV.length != 4
  $stderr.puts "Usage: #{$0} <UnicodeData.txt> <DerivedNormalizationProps.txt> <SpecialCasing.txt> <EastAsianWidth.txt>"
  exit 1
end
|
168
|
+
|
84
169
|
## scan Composition Exclusions
|
85
170
|
exclusion = {}
|
86
171
|
open(ARGV[1]) do |f|
|
@@ -123,6 +208,7 @@ end
|
|
123
208
|
|
124
209
|
## scan UnicodeData
|
125
210
|
udata = {}
|
211
|
+
range_data = []
|
126
212
|
open(ARGV[0]) do |f|
|
127
213
|
while l = f.gets
|
128
214
|
l.chomp!
|
@@ -135,13 +221,46 @@ open(ARGV[0]) do |f|
|
|
135
221
|
upcase = hex_or_nil(upcase)
|
136
222
|
lowcase = hex_or_nil(lowcase)
|
137
223
|
titlecase = hex_or_nil(titlecase)
|
138
|
-
udata[code] = [ccclass, canon, compat, upcase, lowcase, titlecase]
|
224
|
+
udata[code] = [ccclass, canon, compat, upcase, lowcase, titlecase, gencat]
|
225
|
+
if charname =~ /^<(.*, (First|Last))>$/
|
226
|
+
charname = $1.upcase.gsub(/,? /, '_')
|
227
|
+
range_data << [charname, code]
|
228
|
+
end
|
229
|
+
end
|
230
|
+
end
|
231
|
+
|
232
|
+
## scan EastAsianWidth
# Builds ea_width, mapping each code point (Integer) to the width value
# found in the file named by ARGV[3].  Each data line has the form
# "CODE;WIDTH" or "FIRST..LAST;WIDTH", optionally followed by "# comment".
ea_width = {}
open(ARGV[3]) do |f|
  while l = f.gets
    l.chomp!
    next if l =~ /^\#/ || l =~ /^$/
    # Strip a trailing comment when there is one; lines without a comment
    # are kept as they are.
    fields = l.sub(/\s*#.*$/, '')
    code, width = fields.split(/;/)
    next if code.nil? || width.nil?   # not a "code;width" data line
    code = code.strip
    width = width.strip
    if code =~ /\.\./
      # A range assigns the same width to every code point it covers.
      first_code, last_code = code.split('..').map { |c| c.hex }
      (first_code..last_code).each do |cp|
        ea_width[cp] = width
      end
    else
      ea_width[code.hex] = width
    end
  end
end
|
141
255
|
|
142
256
|
print HEAD
|
257
|
+
# Emit one C "#define" per collected range marker: the macro name followed
# by its code point in hexadecimal.
range_data.each do |macro_name, boundary_code|
  print "#define %s\t(0x%04x)\n" % [macro_name, boundary_code]
end
|
260
|
+
|
261
|
+
print HEAD1
|
143
262
|
udata.sort.each do |code, data|
|
144
|
-
ccclass, canon, compat, upcase, lowcase, titlecase = data
|
263
|
+
ccclass, canon, compat, upcase, lowcase, titlecase, gencat = data
|
145
264
|
## Exclusions
|
146
265
|
ex = 0
|
147
266
|
if exclusion[code] ## Script-specifics or Post Composition Version
|
@@ -160,10 +279,15 @@ udata.sort.each do |code, data|
|
|
160
279
|
titlecase = casing[code][1] if casing[code][1]
|
161
280
|
upcase = casing[code][2] if casing[code][2]
|
162
281
|
end
|
163
|
-
|
164
|
-
|
282
|
+
width = 'N'
|
283
|
+
if ea_width[code]
|
284
|
+
width = ea_width[code]
|
285
|
+
end
|
286
|
+
|
287
|
+
printf(" { 0x%04x, %s, %s, %s, %s, %s, %d, %d, c_%s, w_%s }, \n",
|
288
|
+
code, printstr(canon),
|
165
289
|
printstr(compat), printstr(upcase), printstr(lowcase),
|
166
|
-
printstr(titlecase))
|
290
|
+
printstr(titlecase), ccclass, ex, gencat, width)
|
167
291
|
end
|
168
|
-
printf(" { -1,
|
292
|
+
printf(" { -1, NULL, NULL, NULL, NULL, NULL, 0, 0, 0, 0 }\n")
|
169
293
|
print TAIL
|
data/unicode.gemspec
CHANGED
@@ -2,11 +2,11 @@
|
|
2
2
|
|
3
3
|
Gem::Specification.new do |s|
|
4
4
|
s.name = %q{unicode}
|
5
|
-
s.version = "0.4.2"
|
5
|
+
s.version = "0.4.3"
|
6
6
|
|
7
7
|
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
8
8
|
s.authors = [%q{Yoshida Masato}]
|
9
|
-
s.date = %q{
|
9
|
+
s.date = %q{2012-08-07}
|
10
10
|
s.email = %q{yoshidam@yoshidam.net}
|
11
11
|
s.extensions = [%q{ext/unicode/extconf.rb}]
|
12
12
|
s.extra_rdoc_files = [%q{README}]
|
metadata
CHANGED
@@ -1,13 +1,13 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: unicode
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
hash:
|
4
|
+
hash: 9
|
5
5
|
prerelease:
|
6
6
|
segments:
|
7
7
|
- 0
|
8
8
|
- 4
|
9
|
-
- 2
|
10
|
-
version: 0.4.2
|
9
|
+
- 3
|
10
|
+
version: 0.4.3
|
11
11
|
platform: x86-mswin32-60
|
12
12
|
authors:
|
13
13
|
- Yoshida Masato
|
@@ -15,7 +15,7 @@ autorequire:
|
|
15
15
|
bindir: bin
|
16
16
|
cert_chain: []
|
17
17
|
|
18
|
-
date:
|
18
|
+
date: 2012-08-07 00:00:00 Z
|
19
19
|
dependencies: []
|
20
20
|
|
21
21
|
description: Unicode normalization library.
|
@@ -73,7 +73,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
73
73
|
requirements: []
|
74
74
|
|
75
75
|
rubyforge_project:
|
76
|
-
rubygems_version: 1.8.
|
76
|
+
rubygems_version: 1.8.24
|
77
77
|
signing_key:
|
78
78
|
specification_version: 3
|
79
79
|
summary: Unicode normalization library.
|