unicode 0.4.2-x86-mswin32-60 → 0.4.3-x86-mswin32-60
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README +29 -7
- data/ext/unicode/unicode.c +379 -16
- data/ext/unicode/unidata.map +24536 -24435
- data/ext/unicode/wstring.c +69 -1
- data/ext/unicode/wstring.h +2 -0
- data/lib/unicode/1.8/unicode_native.so +0 -0
- data/lib/unicode/1.9/unicode_native.so +0 -0
- data/tools/README +3 -2
- data/tools/mkunidata.rb +136 -12
- data/unicode.gemspec +2 -2
- metadata +5 -5
data/ext/unicode/wstring.c
CHANGED
@@ -43,7 +43,10 @@ WStr_free(WString* str)
|
|
43
43
|
{
|
44
44
|
str->size = 0;
|
45
45
|
str->len = 0;
|
46
|
-
|
46
|
+
if (str->str) {
|
47
|
+
free(str->str);
|
48
|
+
str->str = NULL;
|
49
|
+
}
|
47
50
|
}
|
48
51
|
|
49
52
|
int
|
@@ -164,6 +167,59 @@ WStr_allocWithUTF8(WString* s, const char* in)
|
|
164
167
|
return s;
|
165
168
|
}
|
166
169
|
|
170
|
+
/*
 * Decode the first `len` bytes of the UTF-8 buffer `in` into `s`.
 *
 * `s` is passed to WStr_alloc() before anything else.  When `in` is NULL
 * nothing further is done and `s` is returned.  Because the length is
 * explicit, a zero byte inside the buffer is appended like any other
 * 7-bit byte.
 *
 * Returns `s`, or NULL as soon as either
 *   - a continuation byte (0b10nnnnnn) arrives while no multi-byte
 *     sequence is pending, or
 *   - a byte matching no lead-byte pattern (0xfe, 0xff) is seen.
 * Characters decoded before the failure have already been appended.
 *
 * A pending multi-byte sequence that is interrupted by a 7-bit byte or by
 * a new lead byte, or that is cut off by the end of the buffer, is
 * dropped without reporting an error.
 */
WString*
WStr_allocWithUTF8L(WString* s, const char* in, int len)
{
  const unsigned char* bytes = (const unsigned char*)in;
  int pending = 0;  /* continuation bytes still expected */
  int cp = 0;       /* code point being assembled */
  int pos;

  WStr_alloc(s);
  if (in == NULL)
    return s;

  for (pos = 0; pos < len; pos++) {
    unsigned char b = bytes[pos];

    if (b < 0x80) {            /* 0b0nnnnnnn (7bit) */
      WStr_addWChar(s, b);
      pending = 0;
      continue;
    }

    if (b < 0xc0) {            /* 0b10nnnnnn (continuation) */
      if (pending == 0)
        return NULL;
      cp = (cp << 6) | (b & 0x3f);
      pending--;
      if (pending == 0)
        WStr_addWChar(s, cp);
      continue;
    }

    /* Lead byte: remember how many continuation bytes follow and keep
       the payload bits of this byte. */
    if (b < 0xe0) {            /* 0b110nnnnn (11bit) */
      pending = 1;
      cp = b & 0x1f;
    }
    else if (b < 0xf0) {       /* 0b1110nnnn (16bit) */
      pending = 2;
      cp = b & 0x0f;
    }
    else if (b < 0xf8) {       /* 0b11110nnn (21bit) */
      pending = 3;
      cp = b & 0x07;
    }
    else if (b < 0xfc) {       /* 0b111110nn (26bit) */
      pending = 4;
      cp = b & 0x03;
    }
    else if (b < 0xfe) {       /* 0b1111110n (31bit) */
      pending = 5;
      cp = b & 0x01;
    }
    else {                     /* 0xfe / 0xff never occur in UTF-8 */
      return NULL;
    }
  }

  return s;
}
|
222
|
+
|
167
223
|
UString*
|
168
224
|
WStr_convertIntoUString(WString* wstr, UString* ustr)
|
169
225
|
{
|
@@ -176,6 +232,18 @@ WStr_convertIntoUString(WString* wstr, UString* ustr)
|
|
176
232
|
return ustr;
|
177
233
|
}
|
178
234
|
|
235
|
+
UString*
|
236
|
+
WStr_convertIntoUString2(WString* wstr, int start, int len, UString* ustr)
|
237
|
+
{
|
238
|
+
int i;
|
239
|
+
|
240
|
+
for (i = start; i < wstr->len && i < start + len; i++) {
|
241
|
+
UniStr_addWChar(ustr, wstr->str[i]);
|
242
|
+
}
|
243
|
+
|
244
|
+
return ustr;
|
245
|
+
}
|
246
|
+
|
179
247
|
void
|
180
248
|
WStr_dump(WString* s)
|
181
249
|
{
|
data/ext/unicode/wstring.h
CHANGED
@@ -24,6 +24,7 @@ typedef struct _WString {
|
|
24
24
|
|
25
25
|
WString* WStr_alloc(WString* str);
|
26
26
|
WString* WStr_allocWithUTF8(WString* s, const char* u);
|
27
|
+
WString* WStr_allocWithUTF8L(WString* s, const char* u, int len);
|
27
28
|
WString* WStr_enlarge(WString* str, int size);
|
28
29
|
void WStr_free(WString* str);
|
29
30
|
int WStr_addWChars(WString* s, const int* a, int len);
|
@@ -32,6 +33,7 @@ int WStr_pushWString(WString* s, const WString* add);
|
|
32
33
|
int WStr_addWChar2(WString* s, int a1, int a2);
|
33
34
|
int WStr_addWChar3(WString* s, int a1, int a2, int a3);
|
34
35
|
UString* WStr_convertIntoUString(WString* wstr, UString* ustr);
|
36
|
+
UString* WStr_convertIntoUString2(WString* wstr, int start, int len, UString* ustr);
|
35
37
|
void WStr_dump(WString* s);
|
36
38
|
|
37
39
|
#ifdef __cplusplus
|
Binary file
|
Binary file
|
data/tools/README
CHANGED
@@ -1,6 +1,7 @@
|
|
1
1
|
The bundled unidata.map is created from UnicodeData.txt,
|
2
|
-
DerivedNormalizationProps.txt
|
2
|
+
DerivedNormalizationProps.txt, SpecialCasing.txt and EastAsianWidth.txt
|
3
|
+
of Unicode 6.0.
|
3
4
|
|
4
5
|
To update unidata.map,
|
5
6
|
|
6
|
-
ruby mkunidata.rb UnicodeData.txt DerivedNormalizationProps.txt SpecialCasing.txt > unidata.map
|
7
|
+
ruby mkunidata.rb UnicodeData.txt DerivedNormalizationProps.txt SpecialCasing.txt EastAsianWidth.txt > unidata.map
|
data/tools/mkunidata.rb
CHANGED
@@ -7,22 +7,102 @@
|
|
7
7
|
HEAD=<<EOS
|
8
8
|
/*
|
9
9
|
* UnicodeData
|
10
|
-
* Copyright 1999, 2004, 2010 by yoshidam
|
10
|
+
* Copyright 1999, 2004, 2010, 2012 by yoshidam
|
11
11
|
*
|
12
12
|
*/
|
13
13
|
|
14
14
|
#ifndef _UNIDATA_MAP
|
15
15
|
#define _UNIDATA_MAP
|
16
16
|
|
17
|
+
EOS
|
18
|
+
|
19
|
+
HEAD1=<<EOS
|
20
|
+
|
21
|
+
enum GeneralCategory {
|
22
|
+
/* Letter */
|
23
|
+
c_Lu = 1, c_Ll, c_Lt, c_LC, c_Lm, c_Lo,
|
24
|
+
/* Mark */
|
25
|
+
c_Mn, c_Mc, c_Me,
|
26
|
+
/* Number */
|
27
|
+
c_Nd, c_Nl, c_No,
|
28
|
+
/* Punctuation */
|
29
|
+
c_Pc, c_Pd, c_Ps, c_Pe, c_Pi, c_Pf, c_Po,
|
30
|
+
/* Symbol */
|
31
|
+
c_Sm, c_Sc, c_Sk, c_So,
|
32
|
+
/* Separator */
|
33
|
+
c_Zs, c_Zl, c_Zp,
|
34
|
+
/* Other */
|
35
|
+
c_Cc, c_Cf, c_Cs, c_Co, c_Cn
|
36
|
+
};
|
37
|
+
|
38
|
+
const char* const gencat_abbr[] = {
|
39
|
+
"", /* 0 */
|
40
|
+
/* Letter */
|
41
|
+
"Lu", "Ll", "Lt", "LC", "Lm", "Lo",
|
42
|
+
/* Mark */
|
43
|
+
"Mn", "Mc", "Me",
|
44
|
+
/* Number */
|
45
|
+
"Nd", "Nl", "No",
|
46
|
+
/* Punctuation */
|
47
|
+
"Pc", "Pd", "Ps", "Pe", "Pi", "Pf", "Po",
|
48
|
+
/* Symbol */
|
49
|
+
"Sm", "Sc", "Sk", "So",
|
50
|
+
/* Separator */
|
51
|
+
"Zs", "Zl", "Zp",
|
52
|
+
/* Other */
|
53
|
+
"Cc", "Cf", "Cs", "Co", "Cn"
|
54
|
+
};
|
55
|
+
|
56
|
+
const char* const gencat_long[] = {
|
57
|
+
"",
|
58
|
+
"Uppercase_Letter",
|
59
|
+
"Lowercase_Letter",
|
60
|
+
"Titlecase_Letter",
|
61
|
+
"Cased_Letter",
|
62
|
+
"Modifier_Letter",
|
63
|
+
"Other_Letter",
|
64
|
+
"Nonspacing_Mark",
|
65
|
+
"Spacing_Mark",
|
66
|
+
"Enclosing_Mark",
|
67
|
+
"Decimal_Number",
|
68
|
+
"Letter_Number",
|
69
|
+
"Other_Number",
|
70
|
+
"Connector_Punctuation",
|
71
|
+
"Dash_Punctuation",
|
72
|
+
"Open_Punctuation",
|
73
|
+
"Close_Punctuation",
|
74
|
+
"Initial_Punctuation",
|
75
|
+
"Final_Punctuation",
|
76
|
+
"Other_Punctuation",
|
77
|
+
"Math_Symbol",
|
78
|
+
"Currency_Symbol",
|
79
|
+
"Modifier_Symbol",
|
80
|
+
"Other_Symbol",
|
81
|
+
"Space_Separator",
|
82
|
+
"Line_Separator",
|
83
|
+
"Paragraph_Separator",
|
84
|
+
"Control",
|
85
|
+
"Format",
|
86
|
+
"Surrogate",
|
87
|
+
"Private_Use",
|
88
|
+
"Unassigned"
|
89
|
+
};
|
90
|
+
|
91
|
+
enum EastAsianWidth {
|
92
|
+
w_N = 1, w_A, w_H, w_W, w_F, w_Na
|
93
|
+
};
|
94
|
+
|
17
95
|
struct unicode_data {
|
18
96
|
const int code;
|
19
|
-
const int combining_class;
|
20
|
-
const int exclusion;
|
21
97
|
const char* const canon;
|
22
98
|
const char* const compat;
|
23
|
-
const char* uppercase;
|
24
|
-
const char* lowercase;
|
25
|
-
const char* titlecase;
|
99
|
+
const char* const uppercase;
|
100
|
+
const char* const lowercase;
|
101
|
+
const char* const titlecase;
|
102
|
+
const unsigned char combining_class;
|
103
|
+
const unsigned char exclusion;
|
104
|
+
const unsigned char general_category;
|
105
|
+
const unsigned char east_asian_width;
|
26
106
|
};
|
27
107
|
|
28
108
|
static const struct unicode_data unidata[] = {
|
@@ -81,6 +161,11 @@ def printstr(str)
|
|
81
161
|
return '"' + ret + '"'
|
82
162
|
end
|
83
163
|
|
164
|
+
## The generated map is written to stdout (the documented invocation
## redirects it into unidata.map), so the usage message has to go to
## stderr and the exit status has to signal failure.  Kernel#abort does
## both: it prints the message on stderr and exits with status 1.
if ARGV.length != 4
  abort "Usage: #{$0} <UnicodeData.txt> <DerivedNormalizationProps.txt> <SpecialCasing.txt> <EastAsianWidth.txt>"
end
|
168
|
+
|
84
169
|
## scan Composition Exclusions
|
85
170
|
exclusion = {}
|
86
171
|
open(ARGV[1]) do |f|
|
@@ -123,6 +208,7 @@ end
|
|
123
208
|
|
124
209
|
## scan UnicodeData
|
125
210
|
udata = {}
|
211
|
+
range_data = []
|
126
212
|
open(ARGV[0]) do |f|
|
127
213
|
while l = f.gets
|
128
214
|
l.chomp!
|
@@ -135,13 +221,46 @@ open(ARGV[0]) do |f|
|
|
135
221
|
upcase = hex_or_nil(upcase)
|
136
222
|
lowcase = hex_or_nil(lowcase)
|
137
223
|
titlecase = hex_or_nil(titlecase)
|
138
|
-
udata[code] = [ccclass, canon, compat, upcase, lowcase, titlecase]
|
224
|
+
udata[code] = [ccclass, canon, compat, upcase, lowcase, titlecase, gencat]
|
225
|
+
if charname =~ /^<(.*, (First|Last))>$/
|
226
|
+
charname = $1.upcase.gsub(/,? /, '_')
|
227
|
+
range_data << [charname, code]
|
228
|
+
end
|
229
|
+
end
|
230
|
+
end
|
231
|
+
|
232
|
+
## scan EastAsianWidth
## Data lines look like "CODE;WIDTH # comment" or
## "FIRST..LAST;WIDTH # comment".  ea_width maps each code point to its
## width abbreviation, which is later used as the suffix of a w_* name.
ea_width = {}
open(ARGV[3]) do |f|
  while l = f.gets
    l.chomp!
    next if l =~ /^\#/ || l =~ /^$/
    ## Drop the trailing comment.  A data line without a comment is used
    ## as it is instead of leaving nothing to split.
    l = l.sub(/\s*#.*$/, '')
    ## Tolerate padding around the ';' separator.
    code, width = l.split(/;/).map { |field| field.strip }
    next if code.nil? || code.empty? || width.nil? || width.empty?
    ## A single code point is handled as a one-element range.
    first, last = code.split('..')
    last ||= first
    (first.hex..last.hex).each do |cp|
      ea_width[cp] = width
    end
  end
end
|
141
255
|
|
142
256
|
print HEAD
|
257
|
+
range_data.each do |charname, code|
|
258
|
+
printf("#define %s\t(0x%04x)\n", charname, code)
|
259
|
+
end
|
260
|
+
|
261
|
+
print HEAD1
|
143
262
|
udata.sort.each do |code, data|
|
144
|
-
ccclass, canon, compat, upcase, lowcase, titlecase = data
|
263
|
+
ccclass, canon, compat, upcase, lowcase, titlecase, gencat = data
|
145
264
|
## Exclusions
|
146
265
|
ex = 0
|
147
266
|
if exclusion[code] ## Script-specifics or Post Composition Version
|
@@ -160,10 +279,15 @@ udata.sort.each do |code, data|
|
|
160
279
|
titlecase = casing[code][1] if casing[code][1]
|
161
280
|
upcase = casing[code][2] if casing[code][2]
|
162
281
|
end
|
163
|
-
|
164
|
-
|
282
|
+
width = 'N'
|
283
|
+
if ea_width[code]
|
284
|
+
width = ea_width[code]
|
285
|
+
end
|
286
|
+
|
287
|
+
printf(" { 0x%04x, %s, %s, %s, %s, %s, %d, %d, c_%s, w_%s }, \n",
|
288
|
+
code, printstr(canon),
|
165
289
|
printstr(compat), printstr(upcase), printstr(lowcase),
|
166
|
-
printstr(titlecase))
|
290
|
+
printstr(titlecase), ccclass, ex, gencat, width)
|
167
291
|
end
|
168
|
-
printf(" { -1,
|
292
|
+
printf(" { -1, NULL, NULL, NULL, NULL, NULL, 0, 0, 0, 0 }\n")
|
169
293
|
print TAIL
|
data/unicode.gemspec
CHANGED
@@ -2,11 +2,11 @@
|
|
2
2
|
|
3
3
|
Gem::Specification.new do |s|
|
4
4
|
s.name = %q{unicode}
|
5
|
-
s.version = "0.4.2"
|
5
|
+
s.version = "0.4.3"
|
6
6
|
|
7
7
|
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
8
8
|
s.authors = [%q{Yoshida Masato}]
|
9
|
-
s.date = %q{
|
9
|
+
s.date = %q{2012-08-07}
|
10
10
|
s.email = %q{yoshidam@yoshidam.net}
|
11
11
|
s.extensions = [%q{ext/unicode/extconf.rb}]
|
12
12
|
s.extra_rdoc_files = [%q{README}]
|
metadata
CHANGED
@@ -1,13 +1,13 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: unicode
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
hash:
|
4
|
+
hash: 9
|
5
5
|
prerelease:
|
6
6
|
segments:
|
7
7
|
- 0
|
8
8
|
- 4
|
9
|
-
- 2
|
10
|
-
version: 0.4.2
|
9
|
+
- 3
|
10
|
+
version: 0.4.3
|
11
11
|
platform: x86-mswin32-60
|
12
12
|
authors:
|
13
13
|
- Yoshida Masato
|
@@ -15,7 +15,7 @@ autorequire:
|
|
15
15
|
bindir: bin
|
16
16
|
cert_chain: []
|
17
17
|
|
18
|
-
date:
|
18
|
+
date: 2012-08-07 00:00:00 Z
|
19
19
|
dependencies: []
|
20
20
|
|
21
21
|
description: Unicode normalization library.
|
@@ -73,7 +73,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
73
73
|
requirements: []
|
74
74
|
|
75
75
|
rubyforge_project:
|
76
|
-
rubygems_version: 1.8.
|
76
|
+
rubygems_version: 1.8.24
|
77
77
|
signing_key:
|
78
78
|
specification_version: 3
|
79
79
|
summary: Unicode normalization library.
|