unicode 0.4.2-x86-mswin32-60 → 0.4.3-x86-mswin32-60

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -43,7 +43,10 @@ WStr_free(WString* str)
43
43
  {
44
44
  str->size = 0;
45
45
  str->len = 0;
46
- free(str->str);
46
+ if (str->str) {
47
+ free(str->str);
48
+ str->str = NULL;
49
+ }
47
50
  }
48
51
 
49
52
  int
@@ -164,6 +167,59 @@ WStr_allocWithUTF8(WString* s, const char* in)
164
167
  return s;
165
168
  }
166
169
 
170
+ WString*
171
+ WStr_allocWithUTF8L(WString* s, const char* in, int len)
172
+ {
173
+ int i;
174
+ int u = 0;
175
+ int rest = 0;
176
+
177
+ WStr_alloc(s);
178
+ if (in == NULL)
179
+ return s;
180
+ for (i = 0; i < len; i++) {
181
+ unsigned char c = in[i];
182
+ if ((c & 0xc0) == 0x80) {
183
+ if (rest == 0)
184
+ return NULL;
185
+ u = (u << 6) | (c & 63);
186
+ rest--;
187
+ if (rest == 0) {
188
+ WStr_addWChar(s, u);
189
+ }
190
+ }
191
+ else if ((c & 0x80) == 0) { /* 0b0nnnnnnn (7bit) */
192
+ WStr_addWChar(s, c);
193
+ rest = 0;
194
+ }
195
+ else if ((c & 0xe0) == 0xc0) { /* 0b110nnnnn (11bit) */
196
+ rest = 1;
197
+ u = c & 31;
198
+ }
199
+ else if ((c & 0xf0) == 0xe0) { /* 0b1110nnnn (16bit) */
200
+ rest = 2;
201
+ u = c & 15;
202
+ }
203
+ else if ((c & 0xf8) == 0xf0) { /* 0b11110nnn (21bit) */
204
+ rest = 3;
205
+ u = c & 7;
206
+ }
207
+ else if ((c & 0xfc) == 0xf8) { /* 0b111110nn (26bit) */
208
+ rest = 4;
209
+ u = c & 3;
210
+ }
211
+ else if ((c & 0xfe) == 0xfc) { /* 0b1111110n (31bit) */
212
+ rest = 5;
213
+ u = c & 1;
214
+ }
215
+ else {
216
+ return NULL;
217
+ }
218
+ }
219
+
220
+ return s;
221
+ }
222
+
167
223
  UString*
168
224
  WStr_convertIntoUString(WString* wstr, UString* ustr)
169
225
  {
@@ -176,6 +232,18 @@ WStr_convertIntoUString(WString* wstr, UString* ustr)
176
232
  return ustr;
177
233
  }
178
234
 
235
+ UString*
236
+ WStr_convertIntoUString2(WString* wstr, int start, int len, UString* ustr)
237
+ {
238
+ int i;
239
+
240
+ for (i = start; i < wstr->len && i < start + len; i++) {
241
+ UniStr_addWChar(ustr, wstr->str[i]);
242
+ }
243
+
244
+ return ustr;
245
+ }
246
+
179
247
  void
180
248
  WStr_dump(WString* s)
181
249
  {
@@ -24,6 +24,7 @@ typedef struct _WString {
24
24
 
25
25
  WString* WStr_alloc(WString* str);
26
26
  WString* WStr_allocWithUTF8(WString* s, const char* u);
27
+ WString* WStr_allocWithUTF8L(WString* s, const char* u, int len);
27
28
  WString* WStr_enlarge(WString* str, int size);
28
29
  void WStr_free(WString* str);
29
30
  int WStr_addWChars(WString* s, const int* a, int len);
@@ -32,6 +33,7 @@ int WStr_pushWString(WString* s, const WString* add);
32
33
  int WStr_addWChar2(WString* s, int a1, int a2);
33
34
  int WStr_addWChar3(WString* s, int a1, int a2, int a3);
34
35
  UString* WStr_convertIntoUString(WString* wstr, UString* ustr);
36
+ UString* WStr_convertIntoUString2(WString* wstr, int start, int len, UString* ustr);
35
37
  void WStr_dump(WString* s);
36
38
 
37
39
  #ifdef __cplusplus
Binary file
Binary file
data/tools/README CHANGED
@@ -1,6 +1,7 @@
1
1
  The bundled unidata.map is created from UnicodeData.txt,
2
- DerivedNormalizationProps.txt and SpecialCasing.txt of Unicode 6.0.
2
+ DerivedNormalizationProps.txt, SpecialCasing.txt and EastAsianWidth.txt
3
+ of Unicode 6.0.
3
4
 
4
5
  To update unidata.map,
5
6
 
6
- ruby mkunidata.rb UnicodeData.txt DerivedNormalizationProps.txt SpecialCasing.txt > unidata.map
7
+ ruby mkunidata.rb UnicodeData.txt DerivedNormalizationProps.txt SpecialCasing.txt EastAsianWidth.txt > unidata.map
data/tools/mkunidata.rb CHANGED
@@ -7,22 +7,102 @@
7
7
  HEAD=<<EOS
8
8
  /*
9
9
  * UnicodeData
10
- * Copyright 1999, 2004, 2010 by yoshidam
10
+ * Copyright 1999, 2004, 2010, 2012 by yoshidam
11
11
  *
12
12
  */
13
13
 
14
14
  #ifndef _UNIDATA_MAP
15
15
  #define _UNIDATA_MAP
16
16
 
17
+ EOS
18
+
19
+ HEAD1=<<EOS
20
+
21
+ enum GeneralCategory {
22
+ /* Letter */
23
+ c_Lu = 1, c_Ll, c_Lt, c_LC, c_Lm, c_Lo,
24
+ /* Mark */
25
+ c_Mn, c_Mc, c_Me,
26
+ /* Number */
27
+ c_Nd, c_Nl, c_No,
28
+ /* Punctuation */
29
+ c_Pc, c_Pd, c_Ps, c_Pe, c_Pi, c_Pf, c_Po,
30
+ /* Symbol */
31
+ c_Sm, c_Sc, c_Sk, c_So,
32
+ /* Separator */
33
+ c_Zs, c_Zl, c_Zp,
34
+ /* Other */
35
+ c_Cc, c_Cf, c_Cs, c_Co, c_Cn
36
+ };
37
+
38
+ const char* const gencat_abbr[] = {
39
+ "", /* 0 */
40
+ /* Letter */
41
+ "Lu", "Ll", "Lt", "LC", "Lm", "Lo",
42
+ /* Mark */
43
+ "Mn", "Mc", "Me",
44
+ /* Number */
45
+ "Nd", "Nl", "No",
46
+ /* Punctuation */
47
+ "Pc", "Pd", "Ps", "Pe", "Pi", "Pf", "Po",
48
+ /* Symbol */
49
+ "Sm", "Sc", "Sk", "So",
50
+ /* Separator */
51
+ "Zs", "Zl", "Zp",
52
+ /* Other */
53
+ "Cc", "Cf", "Cs", "Co", "Cn"
54
+ };
55
+
56
+ const char* const gencat_long[] = {
57
+ "",
58
+ "Uppercase_Letter",
59
+ "Lowercase_Letter",
60
+ "Titlecase_Letter",
61
+ "Cased_Letter",
62
+ "Modifier_Letter",
63
+ "Other_Letter",
64
+ "Nonspacing_Mark",
65
+ "Spacing_Mark",
66
+ "Enclosing_Mark",
67
+ "Decimal_Number",
68
+ "Letter_Number",
69
+ "Other_Number",
70
+ "Connector_Punctuation",
71
+ "Dash_Punctuation",
72
+ "Open_Punctuation",
73
+ "Close_Punctuation",
74
+ "Initial_Punctuation",
75
+ "Final_Punctuation",
76
+ "Other_Punctuation",
77
+ "Math_Symbol",
78
+ "Currency_Symbol",
79
+ "Modifier_Symbol",
80
+ "Other_Symbol",
81
+ "Space_Separator",
82
+ "Line_Separator",
83
+ "Paragraph_Separator",
84
+ "Control",
85
+ "Format",
86
+ "Surrogate",
87
+ "Private_Use",
88
+ "Unassigned"
89
+ };
90
+
91
+ enum EastAsianWidth {
92
+ w_N = 1, w_A, w_H, w_W, w_F, w_Na
93
+ };
94
+
17
95
  struct unicode_data {
18
96
  const int code;
19
- const int combining_class;
20
- const int exclusion;
21
97
  const char* const canon;
22
98
  const char* const compat;
23
- const char* uppercase;
24
- const char* lowercase;
25
- const char* titlecase;
99
+ const char* const uppercase;
100
+ const char* const lowercase;
101
+ const char* const titlecase;
102
+ const unsigned char combining_class;
103
+ const unsigned char exclusion;
104
+ const unsigned char general_category;
105
+ const unsigned char east_asian_width;
26
106
  };
27
107
 
28
108
  static const struct unicode_data unidata[] = {
@@ -81,6 +161,11 @@ def printstr(str)
81
161
  return '"' + ret + '"'
82
162
  end
83
163
 
164
## The generated map is written to stdout (normally redirected into
## unidata.map), so the usage text goes to stderr and the exit status is
## non-zero to report the failure.
if ARGV.length != 4
  $stderr.puts "Usage: #{$0} <UnicodeData.txt> <DerivedNormalizationProps.txt> <SpecialCasing.txt> <EastAsianWidth.txt>"
  exit 1
end
168
+
84
169
  ## scan Composition Exclusions
85
170
  exclusion = {}
86
171
  open(ARGV[1]) do |f|
@@ -123,6 +208,7 @@ end
123
208
 
124
209
  ## scan UnicodeData
125
210
  udata = {}
211
+ range_data = []
126
212
  open(ARGV[0]) do |f|
127
213
  while l = f.gets
128
214
  l.chomp!
@@ -135,13 +221,46 @@ open(ARGV[0]) do |f|
135
221
  upcase = hex_or_nil(upcase)
136
222
  lowcase = hex_or_nil(lowcase)
137
223
  titlecase = hex_or_nil(titlecase)
138
- udata[code] = [ccclass, canon, compat, upcase, lowcase, titlecase]
224
+ udata[code] = [ccclass, canon, compat, upcase, lowcase, titlecase, gencat]
225
+ if charname =~ /^<(.*, (First|Last))>$/
226
+ charname = $1.upcase.gsub(/,? /, '_')
227
+ range_data << [charname, code]
228
+ end
229
+ end
230
+ end
231
+
232
## scan EastAsianWidth
## Builds ea_width: code point => width string taken from the second
## ';'-separated field of each data line.  "XXXX..YYYY" ranges are expanded
## to one entry per code point.
ea_width = {}
open(ARGV[3]) do |f|
  while l = f.gets
    l.chomp!
    next if l =~ /^\#/ || l =~ /^$/
    ## drop the trailing "# ..." comment when present; a line without one
    ## is kept as is instead of becoming nil
    l = l.sub(/\s*#.*$/, '')
    code, width = l.split(/;/)
    next if code.nil? || width.nil?
    ## the width is later emitted as a "w_<width>" identifier, so remove
    ## any whitespace left around the field
    width = width.strip
    if code =~ /\.\./
      first, last = code.split('..')
      (first.hex..last.hex).each do |cp|
        ea_width[cp] = width
      end
      next
    end
    ea_width[code.hex] = width
  end
end
141
255
 
142
256
  print HEAD
257
+ range_data.each do |charname, code|
258
+ printf("#define %s\t(0x%04x)\n", charname, code)
259
+ end
260
+
261
+ print HEAD1
143
262
  udata.sort.each do |code, data|
144
- ccclass, canon, compat, upcase, lowcase, titlecase = data
263
+ ccclass, canon, compat, upcase, lowcase, titlecase, gencat = data
145
264
  ## Exclusions
146
265
  ex = 0
147
266
  if exclusion[code] ## Script-specifics or Post Composition Version
@@ -160,10 +279,15 @@ udata.sort.each do |code, data|
160
279
  titlecase = casing[code][1] if casing[code][1]
161
280
  upcase = casing[code][2] if casing[code][2]
162
281
  end
163
- printf(" { 0x%04x, %d, %d, %s, %s, %s, %s, %s }, \n",
164
- code, ccclass, ex, printstr(canon),
282
+ width = 'N'
283
+ if ea_width[code]
284
+ width = ea_width[code]
285
+ end
286
+
287
+ printf(" { 0x%04x, %s, %s, %s, %s, %s, %d, %d, c_%s, w_%s }, \n",
288
+ code, printstr(canon),
165
289
  printstr(compat), printstr(upcase), printstr(lowcase),
166
- printstr(titlecase))
290
+ printstr(titlecase), ccclass, ex, gencat, width)
167
291
  end
168
- printf(" { -1, 0, 0, NULL, NULL, NULL, NULL, NULL }\n")
292
+ printf(" { -1, NULL, NULL, NULL, NULL, NULL, 0, 0, 0, 0 }\n")
169
293
  print TAIL
data/unicode.gemspec CHANGED
@@ -2,11 +2,11 @@
2
2
 
3
3
  Gem::Specification.new do |s|
4
4
  s.name = %q{unicode}
5
- s.version = "0.4.2"
5
+ s.version = "0.4.3"
6
6
 
7
7
  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
8
8
  s.authors = [%q{Yoshida Masato}]
9
- s.date = %q{2011-02-03}
9
+ s.date = %q{2012-08-07}
10
10
  s.email = %q{yoshidam@yoshidam.net}
11
11
  s.extensions = [%q{ext/unicode/extconf.rb}]
12
12
  s.extra_rdoc_files = [%q{README}]
metadata CHANGED
@@ -1,13 +1,13 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: unicode
3
3
  version: !ruby/object:Gem::Version
4
- hash: 11
4
+ hash: 9
5
5
  prerelease:
6
6
  segments:
7
7
  - 0
8
8
  - 4
9
- - 2
10
- version: 0.4.2
9
+ - 3
10
+ version: 0.4.3
11
11
  platform: x86-mswin32-60
12
12
  authors:
13
13
  - Yoshida Masato
@@ -15,7 +15,7 @@ autorequire:
15
15
  bindir: bin
16
16
  cert_chain: []
17
17
 
18
- date: 2011-02-03 00:00:00 Z
18
+ date: 2012-08-07 00:00:00 Z
19
19
  dependencies: []
20
20
 
21
21
  description: Unicode normalization library.
@@ -73,7 +73,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
73
73
  requirements: []
74
74
 
75
75
  rubyforge_project:
76
- rubygems_version: 1.8.17
76
+ rubygems_version: 1.8.24
77
77
  signing_key:
78
78
  specification_version: 3
79
79
  summary: Unicode normalization library.