unicode 0.4.2-x86-mingw32 → 0.4.3-x86-mingw32

Sign up to get free protection for your applications and to get access to all the features.
@@ -43,7 +43,10 @@ WStr_free(WString* str)
43
43
  {
44
44
  str->size = 0;
45
45
  str->len = 0;
46
- free(str->str);
46
+ if (str->str) {
47
+ free(str->str);
48
+ str->str = NULL;
49
+ }
47
50
  }
48
51
 
49
52
  int
@@ -164,6 +167,59 @@ WStr_allocWithUTF8(WString* s, const char* in)
164
167
  return s;
165
168
  }
166
169
 
170
+ WString*
171
+ WStr_allocWithUTF8L(WString* s, const char* in, int len)
172
+ {
173
+ int i;
174
+ int u = 0;
175
+ int rest = 0;
176
+
177
+ WStr_alloc(s);
178
+ if (in == NULL)
179
+ return s;
180
+ for (i = 0; i < len; i++) {
181
+ unsigned char c = in[i];
182
+ if ((c & 0xc0) == 0x80) {
183
+ if (rest == 0)
184
+ return NULL;
185
+ u = (u << 6) | (c & 63);
186
+ rest--;
187
+ if (rest == 0) {
188
+ WStr_addWChar(s, u);
189
+ }
190
+ }
191
+ else if ((c & 0x80) == 0) { /* 0b0nnnnnnn (7bit) */
192
+ WStr_addWChar(s, c);
193
+ rest = 0;
194
+ }
195
+ else if ((c & 0xe0) == 0xc0) { /* 0b110nnnnn (11bit) */
196
+ rest = 1;
197
+ u = c & 31;
198
+ }
199
+ else if ((c & 0xf0) == 0xe0) { /* 0b1110nnnn (16bit) */
200
+ rest = 2;
201
+ u = c & 15;
202
+ }
203
+ else if ((c & 0xf8) == 0xf0) { /* 0b11110nnn (21bit) */
204
+ rest = 3;
205
+ u = c & 7;
206
+ }
207
+ else if ((c & 0xfc) == 0xf8) { /* 0b111110nn (26bit) */
208
+ rest = 4;
209
+ u = c & 3;
210
+ }
211
+ else if ((c & 0xfe) == 0xfc) { /* 0b1111110n (31bit) */
212
+ rest = 5;
213
+ u = c & 1;
214
+ }
215
+ else {
216
+ return NULL;
217
+ }
218
+ }
219
+
220
+ return s;
221
+ }
222
+
167
223
  UString*
168
224
  WStr_convertIntoUString(WString* wstr, UString* ustr)
169
225
  {
@@ -176,6 +232,18 @@ WStr_convertIntoUString(WString* wstr, UString* ustr)
176
232
  return ustr;
177
233
  }
178
234
 
235
+ UString*
236
+ WStr_convertIntoUString2(WString* wstr, int start, int len, UString* ustr)
237
+ {
238
+ int i;
239
+
240
+ for (i = start; i < wstr->len && i < start + len; i++) {
241
+ UniStr_addWChar(ustr, wstr->str[i]);
242
+ }
243
+
244
+ return ustr;
245
+ }
246
+
179
247
  void
180
248
  WStr_dump(WString* s)
181
249
  {
@@ -24,6 +24,7 @@ typedef struct _WString {
24
24
 
25
25
  WString* WStr_alloc(WString* str);
26
26
  WString* WStr_allocWithUTF8(WString* s, const char* u);
27
+ WString* WStr_allocWithUTF8L(WString* s, const char* u, int len);
27
28
  WString* WStr_enlarge(WString* str, int size);
28
29
  void WStr_free(WString* str);
29
30
  int WStr_addWChars(WString* s, const int* a, int len);
@@ -32,6 +33,7 @@ int WStr_pushWString(WString* s, const WString* add);
32
33
  int WStr_addWChar2(WString* s, int a1, int a2);
33
34
  int WStr_addWChar3(WString* s, int a1, int a2, int a3);
34
35
  UString* WStr_convertIntoUString(WString* wstr, UString* ustr);
36
+ UString* WStr_convertIntoUString2(WString* wstr, int start, int len, UString* ustr);
35
37
  void WStr_dump(WString* s);
36
38
 
37
39
  #ifdef __cplusplus
Binary file
Binary file
data/tools/README CHANGED
@@ -1,6 +1,7 @@
1
1
  The bundled unidata.map is created from UnicodeData.txt,
2
- DerivedNormalizationProps.txt and SpecialCasing.txt of Unicode 6.0.
2
+ DerivedNormalizationProps.txt, SpecialCasing.txt and EastAsianWidth.txt
3
+ of Unicode 6.0.
3
4
 
4
5
  To update unidata.map,
5
6
 
6
- ruby mkunidata.rb UnicodeData.txt DerivedNormalizationProps.txt SpecialCasing.txt > unidata.map
7
+ ruby mkunidata.rb UnicodeData.txt DerivedNormalizationProps.txt SpecialCasing.txt EastAsianWidth.txt > unidata.map
data/tools/mkunidata.rb CHANGED
@@ -7,22 +7,102 @@
7
7
  HEAD=<<EOS
8
8
  /*
9
9
  * UnicodeData
10
- * Copyright 1999, 2004, 2010 by yoshidam
10
+ * Copyright 1999, 2004, 2010, 2012 by yoshidam
11
11
  *
12
12
  */
13
13
 
14
14
  #ifndef _UNIDATA_MAP
15
15
  #define _UNIDATA_MAP
16
16
 
17
+ EOS
18
+
19
+ HEAD1=<<EOS
20
+
21
+ enum GeneralCategory {
22
+ /* Letter */
23
+ c_Lu = 1, c_Ll, c_Lt, c_LC, c_Lm, c_Lo,
24
+ /* Mark */
25
+ c_Mn, c_Mc, c_Me,
26
+ /* Number */
27
+ c_Nd, c_Nl, c_No,
28
+ /* Punctuation */
29
+ c_Pc, c_Pd, c_Ps, c_Pe, c_Pi, c_Pf, c_Po,
30
+ /* Symbol */
31
+ c_Sm, c_Sc, c_Sk, c_So,
32
+ /* Separator */
33
+ c_Zs, c_Zl, c_Zp,
34
+ /* Other */
35
+ c_Cc, c_Cf, c_Cs, c_Co, c_Cn
36
+ };
37
+
38
+ const char* const gencat_abbr[] = {
39
+ "", /* 0 */
40
+ /* Letter */
41
+ "Lu", "Ll", "Lt", "LC", "Lm", "Lo",
42
+ /* Mark */
43
+ "Mn", "Mc", "Me",
44
+ /* Number */
45
+ "Nd", "Nl", "No",
46
+ /* Punctuation */
47
+ "Pc", "Pd", "Ps", "Pe", "Pi", "Pf", "Po",
48
+ /* Symbol */
49
+ "Sm", "Sc", "Sk", "So",
50
+ /* Separator */
51
+ "Zs", "Zl", "Zp",
52
+ /* Other */
53
+ "Cc", "Cf", "Cs", "Co", "Cn"
54
+ };
55
+
56
+ const char* const gencat_long[] = {
57
+ "",
58
+ "Uppercase_Letter",
59
+ "Lowercase_Letter",
60
+ "Titlecase_Letter",
61
+ "Cased_Letter",
62
+ "Modifier_Letter",
63
+ "Other_Letter",
64
+ "Nonspacing_Mark",
65
+ "Spacing_Mark",
66
+ "Enclosing_Mark",
67
+ "Decimal_Number",
68
+ "Letter_Number",
69
+ "Other_Number",
70
+ "Connector_Punctuation",
71
+ "Dash_Punctuation",
72
+ "Open_Punctuation",
73
+ "Close_Punctuation",
74
+ "Initial_Punctuation",
75
+ "Final_Punctuation",
76
+ "Other_Punctuation",
77
+ "Math_Symbol",
78
+ "Currency_Symbol",
79
+ "Modifier_Symbol",
80
+ "Other_Symbol",
81
+ "Space_Separator",
82
+ "Line_Separator",
83
+ "Paragraph_Separator",
84
+ "Control",
85
+ "Format",
86
+ "Surrogate",
87
+ "Private_Use",
88
+ "Unassigned"
89
+ };
90
+
91
+ enum EastAsianWidth {
92
+ w_N = 1, w_A, w_H, w_W, w_F, w_Na
93
+ };
94
+
17
95
  struct unicode_data {
18
96
  const int code;
19
- const int combining_class;
20
- const int exclusion;
21
97
  const char* const canon;
22
98
  const char* const compat;
23
- const char* uppercase;
24
- const char* lowercase;
25
- const char* titlecase;
99
+ const char* const uppercase;
100
+ const char* const lowercase;
101
+ const char* const titlecase;
102
+ const unsigned char combining_class;
103
+ const unsigned char exclusion;
104
+ const unsigned char general_category;
105
+ const unsigned char east_asian_width;
26
106
  };
27
107
 
28
108
  static const struct unicode_data unidata[] = {
@@ -81,6 +161,11 @@ def printstr(str)
81
161
  return '"' + ret + '"'
82
162
  end
83
163
 
164
+ if ARGV.length != 4
165
+ puts "Usage: #{$0} <UnicodeData.txt> <DerivedNormalizationProps.txt> <SpecialCasing.txt> <EastAsianWidth.txt>"
166
+ exit 0
167
+ end
168
+
84
169
  ## scan Composition Exclusions
85
170
  exclusion = {}
86
171
  open(ARGV[1]) do |f|
@@ -123,6 +208,7 @@ end
123
208
 
124
209
  ## scan UnicodeData
125
210
  udata = {}
211
+ range_data = []
126
212
  open(ARGV[0]) do |f|
127
213
  while l = f.gets
128
214
  l.chomp!
@@ -135,13 +221,46 @@ open(ARGV[0]) do |f|
135
221
  upcase = hex_or_nil(upcase)
136
222
  lowcase = hex_or_nil(lowcase)
137
223
  titlecase = hex_or_nil(titlecase)
138
- udata[code] = [ccclass, canon, compat, upcase, lowcase, titlecase]
224
+ udata[code] = [ccclass, canon, compat, upcase, lowcase, titlecase, gencat]
225
+ if charname =~ /^<(.*, (First|Last))>$/
226
+ charname = $1.upcase.gsub(/,? /, '_')
227
+ range_data << [charname, code]
228
+ end
229
+ end
230
+ end
231
+
232
+ ## scan EastAsianWidth
233
+ ea_width = {}
234
+ open(ARGV[3]) do |f|
235
+ while l = f.gets
236
+ l.chomp!
237
+ next if l =~ /^\#/ || l =~ /^$/
238
+ l =~ /^(.*)\s+#\s*(.*)$/
239
+ l = $1
240
+ comment = $2
241
+ code,width = l.split(/;/)
242
+ if code =~ /\.\./
243
+ start_code, end_code = code.split('..')
244
+ start_code = start_code.hex
245
+ end_code = end_code.hex
246
+ (start_code..end_code).each do |code|
247
+ ea_width[code] = width
248
+ end
249
+ next
250
+ end
251
+ code = code.hex
252
+ ea_width[code] = width
139
253
  end
140
254
  end
141
255
 
142
256
  print HEAD
257
+ range_data.each do |charname, code|
258
+ printf("#define %s\t(0x%04x)\n", charname, code)
259
+ end
260
+
261
+ print HEAD1
143
262
  udata.sort.each do |code, data|
144
- ccclass, canon, compat, upcase, lowcase, titlecase = data
263
+ ccclass, canon, compat, upcase, lowcase, titlecase, gencat = data
145
264
  ## Exclusions
146
265
  ex = 0
147
266
  if exclusion[code] ## Script-specifics or Post Composition Version
@@ -160,10 +279,15 @@ udata.sort.each do |code, data|
160
279
  titlecase = casing[code][1] if casing[code][1]
161
280
  upcase = casing[code][2] if casing[code][2]
162
281
  end
163
- printf(" { 0x%04x, %d, %d, %s, %s, %s, %s, %s }, \n",
164
- code, ccclass, ex, printstr(canon),
282
+ width = 'N'
283
+ if ea_width[code]
284
+ width = ea_width[code]
285
+ end
286
+
287
+ printf(" { 0x%04x, %s, %s, %s, %s, %s, %d, %d, c_%s, w_%s }, \n",
288
+ code, printstr(canon),
165
289
  printstr(compat), printstr(upcase), printstr(lowcase),
166
- printstr(titlecase))
290
+ printstr(titlecase), ccclass, ex, gencat, width)
167
291
  end
168
- printf(" { -1, 0, 0, NULL, NULL, NULL, NULL, NULL }\n")
292
+ printf(" { -1, NULL, NULL, NULL, NULL, NULL, 0, 0, 0, 0 }\n")
169
293
  print TAIL
data/unicode.gemspec CHANGED
@@ -2,11 +2,11 @@
2
2
 
3
3
  Gem::Specification.new do |s|
4
4
  s.name = %q{unicode}
5
- s.version = "0.4.2"
5
+ s.version = "0.4.3"
6
6
 
7
7
  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
8
8
  s.authors = [%q{Yoshida Masato}]
9
- s.date = %q{2011-02-03}
9
+ s.date = %q{2012-08-07}
10
10
  s.email = %q{yoshidam@yoshidam.net}
11
11
  s.extensions = [%q{ext/unicode/extconf.rb}]
12
12
  s.extra_rdoc_files = [%q{README}]
metadata CHANGED
@@ -1,13 +1,13 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: unicode
3
3
  version: !ruby/object:Gem::Version
4
- hash: 11
4
+ hash: 9
5
5
  prerelease:
6
6
  segments:
7
7
  - 0
8
8
  - 4
9
- - 2
10
- version: 0.4.2
9
+ - 3
10
+ version: 0.4.3
11
11
  platform: x86-mingw32
12
12
  authors:
13
13
  - Yoshida Masato
@@ -15,7 +15,7 @@ autorequire:
15
15
  bindir: bin
16
16
  cert_chain: []
17
17
 
18
- date: 2011-02-03 00:00:00 Z
18
+ date: 2012-08-07 00:00:00 Z
19
19
  dependencies: []
20
20
 
21
21
  description: Unicode normalization library.
@@ -73,7 +73,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
73
73
  requirements: []
74
74
 
75
75
  rubyforge_project:
76
- rubygems_version: 1.8.17
76
+ rubygems_version: 1.8.24
77
77
  signing_key:
78
78
  specification_version: 3
79
79
  summary: Unicode normalization library.