unicode 0.4.2 → 0.4.3

Sign up to get free protection for your applications and to get access to all the features.
Files changed (8) hide show
  1. data/README +29 -7
  2. data/tools/README +3 -2
  3. data/tools/mkunidata.rb +136 -12
  4. data/unicode.c +379 -16
  5. data/unidata.map +24536 -24435
  6. data/wstring.c +69 -1
  7. data/wstring.h +2 -0
  8. metadata +20 -38
data/README CHANGED
@@ -1,5 +1,5 @@
1
1
  Unicode Library for Ruby
2
- Version 0.4.2
2
+ Version 0.4.3
3
3
 
4
4
  Yoshida Masato
5
5
 
@@ -7,14 +7,14 @@
7
7
  - Introduction
8
8
 
9
9
  Unicode string manipulation library for Ruby.
10
- This library is based on UTR #15 Unicode Normalization Forms(*1).
10
+ This library is based on UAX #15 Unicode Normalization Forms(*1).
11
11
 
12
12
  *1 <URL:http://www.unicode.org/unicode/reports/tr15/>
13
13
 
14
14
 
15
15
  - Install
16
16
 
17
- This can work with ruby-1.8 or later. I recommend you to
17
+ This can work with ruby-1.8.7 or later. I recommend you to
18
18
  use ruby-1.9.3 or later.
19
19
 
20
20
  Make and install usually.
@@ -79,7 +79,7 @@
79
79
  These are aliases of decompose/decompose_compat.
80
80
 
81
81
  Unicode::normalize_D_safe(str) (Unicode::nfd_safe(str))
82
- This is an aliase of decompose_safe.
82
+ This is an alias of decompose_safe.
83
83
 
84
84
  Unicode::normalize_C(str) (Unicode::nfc(str))
85
85
  Unicode::normalize_KC(str) (Unicode::nfkc(str))
@@ -98,14 +98,35 @@
98
98
  The mappings that are used by these functions are not normative
99
99
  in UnicodeData.txt.
100
100
 
101
+ Unicode::categories(str)
102
+ Unicode::abbr_categories(str)
103
+ Get an array of general category names of the string.
104
+ abbr_categories returns abbreviated names.
105
+ These can be called with a block.
106
+
107
+ Unicode.categories(str) do |category| p category end
108
+
109
+ Unicode::text_elements(str)
110
+ Get an array of text elements.
111
+ A text element is a unit that is displayed as a single character.
112
+ These can be called with a block.
113
+
114
+ Unicode::width(str[, cjk])
115
+ Estimate the display width on a fixed-pitch text terminal.
116
+ It is based on Markus Kuhn's mk_wcwidth.
117
+ If the optional argument 'cjk' is true, East Asian
118
+ Ambiguous characters are treated as wide characters.
119
+
120
+ Unicode.width("\u03b1") #=> 1
121
+ Unicode.width("\u03b1", true) #=> 2
122
+
123
+
101
124
  - Bugs
102
125
 
103
- UTR #15 suggests that the look up for Normalization Form C
126
+ UAX #15 suggests that the look up for Normalization Form C
104
127
  should not be implemented with a hash of string for better
105
128
  performance.
106
129
 
107
- Case conversion functions should reflecte UTR #21.
108
-
109
130
 
110
131
  - Copying
111
132
 
@@ -123,6 +144,7 @@
123
144
 
124
145
  - History
125
146
 
147
+ Aug 8, 2012 version 0.4.3 add categories, text_elements and width
126
148
  Feb 29, 2012 version 0.4.2 add decompose_safe
127
149
  Feb 3, 2012 version 0.4.1 update unidata.map for Unicode 6.1
128
150
  Oct 14, 2010 version 0.4.0 fix the composition algorithm, and support Unicode 6.0
data/tools/README CHANGED
@@ -1,6 +1,7 @@
1
1
  The bundled unidata.map is created from UnicodeData.txt,
2
- DerivedNormalizationProps.txt and SpecialCasing.txt of Unicode 6.0.
2
+ DerivedNormalizationProps.txt, SpecialCasing.txt and EastAsianWidth.txt
3
+ of Unicode 6.0.
3
4
 
4
5
  To update unidata.map,
5
6
 
6
- ruby mkunidata.rb UnicodeData.txt DerivedNormalizationProps.txt SpecialCasing.txt > unidata.map
7
+ ruby mkunidata.rb UnicodeData.txt DerivedNormalizationProps.txt SpecialCasing.txt EastAsianWidth.txt > unidata.map
data/tools/mkunidata.rb CHANGED
@@ -7,22 +7,102 @@
7
7
  HEAD=<<EOS
8
8
  /*
9
9
  * UnicodeData
10
- * Copyright 1999, 2004, 2010 by yoshidam
10
+ * Copyright 1999, 2004, 2010, 2012 by yoshidam
11
11
  *
12
12
  */
13
13
 
14
14
  #ifndef _UNIDATA_MAP
15
15
  #define _UNIDATA_MAP
16
16
 
17
+ EOS
18
+
19
+ HEAD1=<<EOS
20
+
21
+ enum GeneralCategory {
22
+ /* Letter */
23
+ c_Lu = 1, c_Ll, c_Lt, c_LC, c_Lm, c_Lo,
24
+ /* Mark */
25
+ c_Mn, c_Mc, c_Me,
26
+ /* Number */
27
+ c_Nd, c_Nl, c_No,
28
+ /* Punctuation */
29
+ c_Pc, c_Pd, c_Ps, c_Pe, c_Pi, c_Pf, c_Po,
30
+ /* Symbol */
31
+ c_Sm, c_Sc, c_Sk, c_So,
32
+ /* Separator */
33
+ c_Zs, c_Zl, c_Zp,
34
+ /* Other */
35
+ c_Cc, c_Cf, c_Cs, c_Co, c_Cn
36
+ };
37
+
38
+ const char* const gencat_abbr[] = {
39
+ "", /* 0 */
40
+ /* Letter */
41
+ "Lu", "Ll", "Lt", "LC", "Lm", "Lo",
42
+ /* Mark */
43
+ "Mn", "Mc", "Me",
44
+ /* Number */
45
+ "Nd", "Nl", "No",
46
+ /* Punctuation */
47
+ "Pc", "Pd", "Ps", "Pe", "Pi", "Pf", "Po",
48
+ /* Symbol */
49
+ "Sm", "Sc", "Sk", "So",
50
+ /* Separator */
51
+ "Zs", "Zl", "Zp",
52
+ /* Other */
53
+ "Cc", "Cf", "Cs", "Co", "Cn"
54
+ };
55
+
56
+ const char* const gencat_long[] = {
57
+ "",
58
+ "Uppercase_Letter",
59
+ "Lowercase_Letter",
60
+ "Titlecase_Letter",
61
+ "Cased_Letter",
62
+ "Modifier_Letter",
63
+ "Other_Letter",
64
+ "Nonspacing_Mark",
65
+ "Spacing_Mark",
66
+ "Enclosing_Mark",
67
+ "Decimal_Number",
68
+ "Letter_Number",
69
+ "Other_Number",
70
+ "Connector_Punctuation",
71
+ "Dash_Punctuation",
72
+ "Open_Punctuation",
73
+ "Close_Punctuation",
74
+ "Initial_Punctuation",
75
+ "Final_Punctuation",
76
+ "Other_Punctuation",
77
+ "Math_Symbol",
78
+ "Currency_Symbol",
79
+ "Modifier_Symbol",
80
+ "Other_Symbol",
81
+ "Space_Separator",
82
+ "Line_Separator",
83
+ "Paragraph_Separator",
84
+ "Control",
85
+ "Format",
86
+ "Surrogate",
87
+ "Private_Use",
88
+ "Unassigned"
89
+ };
90
+
91
+ enum EastAsianWidth {
92
+ w_N = 1, w_A, w_H, w_W, w_F, w_Na
93
+ };
94
+
17
95
  struct unicode_data {
18
96
  const int code;
19
- const int combining_class;
20
- const int exclusion;
21
97
  const char* const canon;
22
98
  const char* const compat;
23
- const char* uppercase;
24
- const char* lowercase;
25
- const char* titlecase;
99
+ const char* const uppercase;
100
+ const char* const lowercase;
101
+ const char* const titlecase;
102
+ const unsigned char combining_class;
103
+ const unsigned char exclusion;
104
+ const unsigned char general_category;
105
+ const unsigned char east_asian_width;
26
106
  };
27
107
 
28
108
  static const struct unicode_data unidata[] = {
@@ -81,6 +161,11 @@ def printstr(str)
81
161
  return '"' + ret + '"'
82
162
  end
83
163
 
164
+ if ARGV.length != 4
165
+ puts "Usage: #{$0} <UnicodeData.txt> <DerivedNormalizationProps.txt> <SpecialCasing.txt> <EastAsianWidth.txt>"
166
+ exit 0
167
+ end
168
+
84
169
  ## scan Composition Exclusions
85
170
  exclusion = {}
86
171
  open(ARGV[1]) do |f|
@@ -123,6 +208,7 @@ end
123
208
 
124
209
  ## scan UnicodeData
125
210
  udata = {}
211
+ range_data = []
126
212
  open(ARGV[0]) do |f|
127
213
  while l = f.gets
128
214
  l.chomp!
@@ -135,13 +221,46 @@ open(ARGV[0]) do |f|
135
221
  upcase = hex_or_nil(upcase)
136
222
  lowcase = hex_or_nil(lowcase)
137
223
  titlecase = hex_or_nil(titlecase)
138
- udata[code] = [ccclass, canon, compat, upcase, lowcase, titlecase]
224
+ udata[code] = [ccclass, canon, compat, upcase, lowcase, titlecase, gencat]
225
+ if charname =~ /^<(.*, (First|Last))>$/
226
+ charname = $1.upcase.gsub(/,? /, '_')
227
+ range_data << [charname, code]
228
+ end
229
+ end
230
+ end
231
+
232
+ ## scan EastAsianWidth
233
+ ea_width = {}
234
+ open(ARGV[3]) do |f|
235
+ while l = f.gets
236
+ l.chomp!
237
+ next if l =~ /^\#/ || l =~ /^$/
238
+ l =~ /^(.*)\s+#\s*(.*)$/
239
+ l = $1
240
+ comment = $2
241
+ code,width = l.split(/;/)
242
+ if code =~ /\.\./
243
+ start_code, end_code = code.split('..')
244
+ start_code = start_code.hex
245
+ end_code = end_code.hex
246
+ (start_code..end_code).each do |code|
247
+ ea_width[code] = width
248
+ end
249
+ next
250
+ end
251
+ code = code.hex
252
+ ea_width[code] = width
139
253
  end
140
254
  end
141
255
 
142
256
  print HEAD
257
+ range_data.each do |charname, code|
258
+ printf("#define %s\t(0x%04x)\n", charname, code)
259
+ end
260
+
261
+ print HEAD1
143
262
  udata.sort.each do |code, data|
144
- ccclass, canon, compat, upcase, lowcase, titlecase = data
263
+ ccclass, canon, compat, upcase, lowcase, titlecase, gencat = data
145
264
  ## Exclusions
146
265
  ex = 0
147
266
  if exclusion[code] ## Script-specifics or Post Composition Version
@@ -160,10 +279,15 @@ udata.sort.each do |code, data|
160
279
  titlecase = casing[code][1] if casing[code][1]
161
280
  upcase = casing[code][2] if casing[code][2]
162
281
  end
163
- printf(" { 0x%04x, %d, %d, %s, %s, %s, %s, %s }, \n",
164
- code, ccclass, ex, printstr(canon),
282
+ width = 'N'
283
+ if ea_width[code]
284
+ width = ea_width[code]
285
+ end
286
+
287
+ printf(" { 0x%04x, %s, %s, %s, %s, %s, %d, %d, c_%s, w_%s }, \n",
288
+ code, printstr(canon),
165
289
  printstr(compat), printstr(upcase), printstr(lowcase),
166
- printstr(titlecase))
290
+ printstr(titlecase), ccclass, ex, gencat, width)
167
291
  end
168
- printf(" { -1, 0, 0, NULL, NULL, NULL, NULL, NULL }\n")
292
+ printf(" { -1, NULL, NULL, NULL, NULL, NULL, 0, 0, 0, 0 }\n")
169
293
  print TAIL
data/unicode.c CHANGED
@@ -1,5 +1,6 @@
1
1
  /*
2
- * Unicode Library version 0.4
2
+ * Unicode Library version 0.4.3
3
+ * Aug 8, 2012: version 0.4.3
3
4
  * Oct 14, 2010: version 0.4
4
5
  * Feb 26, 2010: version 0.3
5
6
  * Dec 29, 2009: version 0.2
@@ -7,7 +8,7 @@
7
8
  *
8
9
  */
9
10
 
10
- #define UNICODE_VERSION "0.4.2"
11
+ #define UNICODE_VERSION "0.4.3"
11
12
 
12
13
  #include "ruby.h"
13
14
  #ifdef HAVE_RUBY_IO_H
@@ -54,6 +55,8 @@ taintObject(VALUE src, VALUE obj) {
54
55
  static VALUE mUnicode;
55
56
  static VALUE unicode_data;
56
57
  static VALUE composition_table;
58
+ static VALUE catname_long[c_Cn+1];
59
+ static VALUE catname_abbr[c_Cn+1];
57
60
 
58
61
  /* Hangul */
59
62
  #define SBASE (0xac00)
@@ -66,6 +69,86 @@ static VALUE composition_table;
66
69
  #define NCOUNT (VCOUNT * TCOUNT) /* 588 */
67
70
  #define SCOUNT (LCOUNT * NCOUNT) /* 11172 */
68
71
 
72
+ VALUE
73
+ get_unidata(int ucs) {
74
+ VALUE ch = rb_hash_aref(unicode_data, INT2FIX(ucs));
75
+ if (!NIL_P(ch))
76
+ return ch;
77
+ #ifdef CJK_IDEOGRAPH_EXTENSION_A_FIRST
78
+ else if (ucs >= CJK_IDEOGRAPH_EXTENSION_A_FIRST &&
79
+ ucs <= CJK_IDEOGRAPH_EXTENSION_A_LAST)
80
+ return rb_hash_aref(unicode_data,
81
+ INT2FIX(CJK_IDEOGRAPH_EXTENSION_A_FIRST));
82
+ #endif
83
+ #ifdef CJK_IDEOGRAPH_FIRST
84
+ else if (ucs >= CJK_IDEOGRAPH_FIRST &&
85
+ ucs <= CJK_IDEOGRAPH_LAST)
86
+ return rb_hash_aref(unicode_data,
87
+ INT2FIX(CJK_IDEOGRAPH_FIRST));
88
+ #endif
89
+ #ifdef HANGUL_SYLLABLE_FIRST
90
+ else if (ucs >= HANGUL_SYLLABLE_FIRST &&
91
+ ucs <= HANGUL_SYLLABLE_LAST)
92
+ return rb_hash_aref(unicode_data,
93
+ INT2FIX(HANGUL_SYLLABLE_FIRST));
94
+ #endif
95
+ #ifdef NON_PRIVATE_USE_HIGH_SURROGATE_FIRST
96
+ else if (ucs >= NON_PRIVATE_USE_HIGH_SURROGATE_FIRST &&
97
+ ucs <= NON_PRIVATE_USE_HIGH_SURROGATE_LAST)
98
+ return rb_hash_aref(unicode_data,
99
+ INT2FIX(NON_PRIVATE_USE_HIGH_SURROGATE_FIRST));
100
+ #endif
101
+ #ifdef PRIVATE_USE_HIGH_SURROGATE_FIRST
102
+ else if (ucs >= PRIVATE_USE_HIGH_SURROGATE_FIRST &&
103
+ ucs <= PRIVATE_USE_HIGH_SURROGATE_LAST)
104
+ return rb_hash_aref(unicode_data,
105
+ INT2FIX(PRIVATE_USE_HIGH_SURROGATE_FIRST));
106
+ #endif
107
+ #ifdef LOW_SURROGATE_FIRST
108
+ else if (ucs >= LOW_SURROGATE_FIRST &&
109
+ ucs <= LOW_SURROGATE_LAST)
110
+ return rb_hash_aref(unicode_data,
111
+ INT2FIX(LOW_SURROGATE_FIRST));
112
+ #endif
113
+ #ifdef PRIVATE_USE_FIRST
114
+ else if (ucs >= PRIVATE_USE_FIRST &&
115
+ ucs <= PRIVATE_USE_LAST)
116
+ return rb_hash_aref(unicode_data,
117
+ INT2FIX(PRIVATE_USE_FIRST));
118
+ #endif
119
+ #ifdef CJK_IDEOGRAPH_EXTENSION_B_FIRST
120
+ else if (ucs >= CJK_IDEOGRAPH_EXTENSION_B_FIRST &&
121
+ ucs <= CJK_IDEOGRAPH_EXTENSION_B_LAST)
122
+ return rb_hash_aref(unicode_data,
123
+ INT2FIX(CJK_IDEOGRAPH_EXTENSION_B_FIRST));
124
+ #endif
125
+ #ifdef CJK_IDEOGRAPH_EXTENSION_C_FIRST
126
+ else if (ucs >= CJK_IDEOGRAPH_EXTENSION_C_FIRST &&
127
+ ucs <= CJK_IDEOGRAPH_EXTENSION_C_LAST)
128
+ return rb_hash_aref(unicode_data,
129
+ INT2FIX(CJK_IDEOGRAPH_EXTENSION_C_FIRST));
130
+ #endif
131
+ #ifdef CJK_IDEOGRAPH_EXTENSION_D_FIRST
132
+ else if (ucs >= CJK_IDEOGRAPH_EXTENSION_D_FIRST &&
133
+ ucs <= CJK_IDEOGRAPH_EXTENSION_D_LAST)
134
+ return rb_hash_aref(unicode_data,
135
+ INT2FIX(CJK_IDEOGRAPH_EXTENSION_D_FIRST));
136
+ #endif
137
+ #ifdef PLANE_15_PRIVATE_USE_FIRST
138
+ else if (ucs >= PLANE_15_PRIVATE_USE_FIRST &&
139
+ ucs <= PLANE_15_PRIVATE_USE_LAST)
140
+ return rb_hash_aref(unicode_data,
141
+ INT2FIX(PLANE_15_PRIVATE_USE_FIRST));
142
+ #endif
143
+ #ifdef PLANE_16_PRIVATE_USE_FIRST
144
+ else if (ucs >= PLANE_16_PRIVATE_USE_FIRST &&
145
+ ucs <= PLANE_16_PRIVATE_USE_LAST)
146
+ return rb_hash_aref(unicode_data,
147
+ INT2FIX(PLANE_16_PRIVATE_USE_FIRST));
148
+ #endif
149
+ return Qnil;
150
+ }
151
+
69
152
  static int
70
153
  get_cc(int ucs)
71
154
  {
@@ -77,6 +160,28 @@ get_cc(int ucs)
77
160
  return 0;
78
161
  }
79
162
 
163
+ static int
164
+ get_gencat(int ucs)
165
+ {
166
+ VALUE ch = get_unidata(ucs);
167
+
168
+ if (!NIL_P(ch)) {
169
+ return unidata[FIX2INT(ch)].general_category;
170
+ }
171
+ return c_Cn; /* Unassigned */
172
+ }
173
+
174
+ static int
175
+ get_eawidth(int ucs)
176
+ {
177
+ VALUE ch = get_unidata(ucs);
178
+
179
+ if (!NIL_P(ch)) {
180
+ return unidata[FIX2INT(ch)].east_asian_width;
181
+ }
182
+ return w_N; /* Neutral */
183
+ }
184
+
80
185
  static const char*
81
186
  get_canon(int ucs)
82
187
  {
@@ -538,8 +643,8 @@ unicode_strcmp(VALUE obj, VALUE str1, VALUE str2)
538
643
  CONVERT_TO_UTF8(str1);
539
644
  CONVERT_TO_UTF8(str2);
540
645
  #endif
541
- WStr_allocWithUTF8(&wstr1, RSTRING_PTR(str1));
542
- WStr_allocWithUTF8(&wstr2, RSTRING_PTR(str2));
646
+ WStr_allocWithUTF8L(&wstr1, RSTRING_PTR(str1), RSTRING_LEN(str1));
647
+ WStr_allocWithUTF8L(&wstr2, RSTRING_PTR(str2), RSTRING_LEN(str2));
543
648
  WStr_alloc(&result1);
544
649
  WStr_alloc(&result2);
545
650
  decompose_internal(&wstr1, &result1);
@@ -580,8 +685,8 @@ unicode_strcmp_compat(VALUE obj, VALUE str1, VALUE str2)
580
685
  CONVERT_TO_UTF8(str1);
581
686
  CONVERT_TO_UTF8(str2);
582
687
  #endif
583
- WStr_allocWithUTF8(&wstr1, RSTRING_PTR(str1));
584
- WStr_allocWithUTF8(&wstr2, RSTRING_PTR(str2));
688
+ WStr_allocWithUTF8L(&wstr1, RSTRING_PTR(str1), RSTRING_LEN(str1));
689
+ WStr_allocWithUTF8L(&wstr2, RSTRING_PTR(str2), RSTRING_LEN(str2));
585
690
  WStr_alloc(&result1);
586
691
  WStr_alloc(&result2);
587
692
  decompose_compat_internal(&wstr1, &result1);
@@ -617,7 +722,7 @@ unicode_decompose(VALUE obj, VALUE str)
617
722
  #ifdef HAVE_RUBY_ENCODING_H
618
723
  CONVERT_TO_UTF8(str);
619
724
  #endif
620
- WStr_allocWithUTF8(&ustr, RSTRING_PTR(str));
725
+ WStr_allocWithUTF8L(&ustr, RSTRING_PTR(str), RSTRING_LEN(str));
621
726
  WStr_alloc(&result);
622
727
  decompose_internal(&ustr, &result);
623
728
  WStr_free(&ustr);
@@ -643,7 +748,7 @@ unicode_decompose_safe(VALUE obj, VALUE str)
643
748
  #ifdef HAVE_RUBY_ENCODING_H
644
749
  CONVERT_TO_UTF8(str);
645
750
  #endif
646
- WStr_allocWithUTF8(&ustr, RSTRING_PTR(str));
751
+ WStr_allocWithUTF8L(&ustr, RSTRING_PTR(str), RSTRING_LEN(str));
647
752
  WStr_alloc(&result);
648
753
  decompose_safe_internal(&ustr, &result);
649
754
  WStr_free(&ustr);
@@ -669,7 +774,7 @@ unicode_decompose_compat(VALUE obj, VALUE str)
669
774
  #ifdef HAVE_RUBY_ENCODING_H
670
775
  CONVERT_TO_UTF8(str);
671
776
  #endif
672
- WStr_allocWithUTF8(&ustr, RSTRING_PTR(str));
777
+ WStr_allocWithUTF8L(&ustr, RSTRING_PTR(str), RSTRING_LEN(str));
673
778
  WStr_alloc(&result);
674
779
  decompose_compat_internal(&ustr, &result);
675
780
  WStr_free(&ustr);
@@ -695,7 +800,7 @@ unicode_compose(VALUE obj, VALUE str)
695
800
  #ifdef HAVE_RUBY_ENCODING_H
696
801
  CONVERT_TO_UTF8(str);
697
802
  #endif
698
- WStr_allocWithUTF8(&ustr, RSTRING_PTR(str));
803
+ WStr_allocWithUTF8L(&ustr, RSTRING_PTR(str), RSTRING_LEN(str));
699
804
  sort_canonical(&ustr);
700
805
  WStr_alloc(&result);
701
806
  compose_internal(&ustr, &result);
@@ -722,7 +827,7 @@ unicode_normalize_C(VALUE obj, VALUE str)
722
827
  #ifdef HAVE_RUBY_ENCODING_H
723
828
  CONVERT_TO_UTF8(str);
724
829
  #endif
725
- WStr_allocWithUTF8(&ustr1, RSTRING_PTR(str));
830
+ WStr_allocWithUTF8L(&ustr1, RSTRING_PTR(str), RSTRING_LEN(str));
726
831
  WStr_alloc(&ustr2);
727
832
  decompose_internal(&ustr1, &ustr2);
728
833
  WStr_free(&ustr1);
@@ -752,7 +857,7 @@ unicode_normalize_safe(VALUE obj, VALUE str)
752
857
  #ifdef HAVE_RUBY_ENCODING_H
753
858
  CONVERT_TO_UTF8(str);
754
859
  #endif
755
- WStr_allocWithUTF8(&ustr1, RSTRING_PTR(str));
860
+ WStr_allocWithUTF8L(&ustr1, RSTRING_PTR(str), RSTRING_LEN(str));
756
861
  WStr_alloc(&ustr2);
757
862
  decompose_safe_internal(&ustr1, &ustr2);
758
863
  WStr_free(&ustr1);
@@ -782,7 +887,7 @@ unicode_normalize_KC(VALUE obj, VALUE str)
782
887
  #ifdef HAVE_RUBY_ENCODING_H
783
888
  CONVERT_TO_UTF8(str);
784
889
  #endif
785
- WStr_allocWithUTF8(&ustr1, RSTRING_PTR(str));
890
+ WStr_allocWithUTF8L(&ustr1, RSTRING_PTR(str), RSTRING_LEN(str));
786
891
  WStr_alloc(&ustr2);
787
892
  decompose_compat_internal(&ustr1, &ustr2);
788
893
  WStr_free(&ustr1);
@@ -811,7 +916,7 @@ unicode_upcase(VALUE obj, VALUE str)
811
916
  #ifdef HAVE_RUBY_ENCODING_H
812
917
  CONVERT_TO_UTF8(str);
813
918
  #endif
814
- WStr_allocWithUTF8(&ustr, RSTRING_PTR(str));
919
+ WStr_allocWithUTF8L(&ustr, RSTRING_PTR(str), RSTRING_LEN(str));
815
920
  WStr_alloc(&result);
816
921
  upcase_internal(&ustr, &result);
817
922
  //sort_canonical(&result);
@@ -837,7 +942,7 @@ unicode_downcase(VALUE obj, VALUE str)
837
942
  #ifdef HAVE_RUBY_ENCODING_H
838
943
  CONVERT_TO_UTF8(str);
839
944
  #endif
840
- WStr_allocWithUTF8(&ustr, RSTRING_PTR(str));
945
+ WStr_allocWithUTF8L(&ustr, RSTRING_PTR(str), RSTRING_LEN(str));
841
946
  WStr_alloc(&result);
842
947
  downcase_internal(&ustr, &result);
843
948
  //sort_canonical(&result);
@@ -868,7 +973,7 @@ unicode_capitalize(VALUE obj, VALUE str)
868
973
  #ifdef HAVE_RUBY_ENCODING_H
869
974
  CONVERT_TO_UTF8(str);
870
975
  #endif
871
- WStr_allocWithUTF8(&ustr, RSTRING_PTR(str));
976
+ WStr_allocWithUTF8L(&ustr, RSTRING_PTR(str), RSTRING_LEN(str));
872
977
  WStr_alloc(&result);
873
978
  capitalize_internal(&ustr, &result);
874
979
  //sort_canonical(&result);
@@ -882,6 +987,248 @@ unicode_capitalize(VALUE obj, VALUE str)
882
987
  return vret;
883
988
  }
884
989
 
990
+ typedef struct _get_categories_param {
991
+ WString* wstr;
992
+ VALUE str;
993
+ VALUE* catname;
994
+ } get_categories_param;
995
+
996
+ static VALUE
997
+ get_categories_internal(get_categories_param* param)
998
+ {
999
+ WString* wstr = param->wstr;
1000
+ VALUE str = param->str;
1001
+ VALUE* catname = param->catname;
1002
+ int pos;
1003
+ int block_p = rb_block_given_p();
1004
+ volatile VALUE ret = str;
1005
+
1006
+ if (!block_p)
1007
+ ret = rb_ary_new();
1008
+ for (pos = 0; pos < wstr->len; pos++) {
1009
+ int gencat = get_gencat(wstr->str[pos]);
1010
+ if (!block_p)
1011
+ rb_ary_push(ret, catname[gencat]);
1012
+ else {
1013
+ rb_yield(catname[gencat]);
1014
+ }
1015
+ }
1016
+
1017
+ return ret;
1018
+ }
1019
+
1020
+ VALUE
1021
+ get_categories_ensure(WString* wstr)
1022
+ {
1023
+ WStr_free(wstr);
1024
+ return Qnil;
1025
+ }
1026
+
1027
+ VALUE
1028
+ unicode_get_categories(VALUE obj, VALUE str)
1029
+ {
1030
+ WString wstr;
1031
+ get_categories_param param = { &wstr, str, catname_long };
1032
+
1033
+ Check_Type(str, T_STRING);
1034
+ #ifdef HAVE_RUBY_ENCODING_H
1035
+ CONVERT_TO_UTF8(str);
1036
+ #endif
1037
+ WStr_allocWithUTF8L(&wstr, RSTRING_PTR(str), RSTRING_LEN(str));
1038
+
1039
+ return rb_ensure(get_categories_internal, (VALUE)&param,
1040
+ get_categories_ensure, (VALUE)&wstr);
1041
+ /* wstr will be freed in get_categories_ensure() */
1042
+ }
1043
+
1044
+
1045
+ VALUE
1046
+ unicode_get_abbr_categories(VALUE obj, VALUE str)
1047
+ {
1048
+ WString wstr;
1049
+ get_categories_param param = { &wstr, str, catname_abbr };
1050
+
1051
+ Check_Type(str, T_STRING);
1052
+ #ifdef HAVE_RUBY_ENCODING_H
1053
+ CONVERT_TO_UTF8(str);
1054
+ #endif
1055
+ WStr_allocWithUTF8L(&wstr, RSTRING_PTR(str), RSTRING_LEN(str));
1056
+
1057
+ return rb_ensure(get_categories_internal, (VALUE)&param,
1058
+ get_categories_ensure, (VALUE)&wstr);
1059
+ /* wstr will be freed in get_categories_ensure() */
1060
+ }
1061
+
1062
+ VALUE
1063
+ unicode_wcswidth(int argc, VALUE* argv, VALUE obj)
1064
+ {
1065
+ WString wstr;
1066
+ int i, count;
1067
+ int width = 0;
1068
+ int cjk_p = 0;
1069
+ VALUE str;
1070
+ VALUE cjk;
1071
+
1072
+ count = rb_scan_args(argc, argv, "11", &str, &cjk);
1073
+ if (count > 1)
1074
+ cjk_p = RTEST(cjk);
1075
+ Check_Type(str, T_STRING);
1076
+ #ifdef HAVE_RUBY_ENCODING_H
1077
+ CONVERT_TO_UTF8(str);
1078
+ #endif
1079
+ WStr_allocWithUTF8L(&wstr, RSTRING_PTR(str), RSTRING_LEN(str));
1080
+ for (i = 0; i <wstr.len; i++) {
1081
+ int c = wstr.str[i];
1082
+ int cat = get_gencat(c);
1083
+ int eaw = get_eawidth(c);
1084
+ if ((c > 0 && c < 32) || (c >= 0x7f && c < 0xa0)) {
1085
+ /* Control Characters */
1086
+ width = -1;
1087
+ break;
1088
+ }
1089
+ else if (c != 0x00ad && /* SOFT HYPHEN */
1090
+ (cat == c_Mn || cat == c_Me || /* Non-spacing Marks */
1091
+ cat == c_Cf || /* Format */
1092
+ c == 0 || /* NUL */
1093
+ (c >= 0x1160 && c <= 0x11ff))) /* HANGUL JUNGSEONG/JONGSEONG */
1094
+ /* zero width */ ;
1095
+ else if (eaw == w_F || eaw == w_W || /* Fullwidth or Wide */
1096
+ (c >= 0x4db6 && c <= 0x4dbf) || /* CJK Reserved */
1097
+ (c >= 0x9fcd && c <= 0x9fff) || /* CJK Reserved */
1098
+ (c >= 0xfa6e && c <= 0xfa6f) || /* CJK Reserved */
1099
+ (c >= 0xfada && c <= 0xfaff) || /* CJK Reserved */
1100
+ (c >= 0x2a6d7 && c <= 0x2a6ff) || /* CJK Reserved */
1101
+ (c >= 0x2b735 && c <= 0x2b73f) || /* CJK Reserved */
1102
+ (c >= 0x2b81e && c <= 0x2f7ff) || /* CJK Reserved */
1103
+ (c >= 0x2fa1e && c <= 0x2fffd) || /* CJK Reserved */
1104
+ (c >= 0x30000 && c <= 0x3fffd) || /* CJK Reserved */
1105
+ (cjk_p && eaw == w_A)) /* East Asian Ambiguous */
1106
+ width += 2;
1107
+ else
1108
+ width++; /* Halfwidth or Neutral */
1109
+ }
1110
+ WStr_free(&wstr);
1111
+
1112
+ return INT2FIX(width);
1113
+ }
1114
+
1115
+ VALUE
1116
+ wstring_to_rstring(WString* wstr, int start, int len) {
1117
+ UString ret;
1118
+ volatile VALUE vret;
1119
+
1120
+ UniStr_alloc(&ret);
1121
+ WStr_convertIntoUString2(wstr, start, len, &ret);
1122
+ vret = ENC_(rb_str_new((char*)ret.str, ret.len));
1123
+ UniStr_free(&ret);
1124
+
1125
+ return vret;
1126
+ }
1127
+
1128
+ typedef struct _get_text_elements_param {
1129
+ WString* wstr;
1130
+ VALUE str;
1131
+ } get_text_elements_param;
1132
+
1133
+ VALUE
1134
+ get_text_elements_internal(get_text_elements_param* param)
1135
+ {
1136
+ WString* wstr = param->wstr;
1137
+ VALUE str = param->str;
1138
+ int start_pos;
1139
+ int block_p = rb_block_given_p();
1140
+ volatile VALUE ret = str;
1141
+
1142
+ if (!block_p)
1143
+ ret = rb_ary_new();
1144
+ for (start_pos = 0; start_pos < wstr->len;) {
1145
+ int c0 = wstr->str[start_pos];
1146
+ int cat = get_gencat(c0);
1147
+ int length = 1;
1148
+ int j;
1149
+
1150
+ if (cat == c_Mn || cat == c_Mc || cat == c_Me) {
1151
+ volatile VALUE rstr = TO_(str, wstring_to_rstring(wstr, start_pos, length));
1152
+ if (!block_p)
1153
+ rb_ary_push(ret, rstr);
1154
+ else
1155
+ rb_yield(rstr);
1156
+ start_pos++;
1157
+ continue;
1158
+ }
1159
+
1160
+ for (j = start_pos + 1; j < wstr->len; j++) {
1161
+ int c1 = wstr->str[j];
1162
+ int cat = get_gencat(c1);
1163
+ if (c0 >= LBASE && c0 < LBASE + LCOUNT &&
1164
+ j + 1 < wstr->len &&
1165
+ c1 >= VBASE && c1 < VBASE + VCOUNT &&
1166
+ wstr->str[j+1] >= TBASE && wstr->str[j+1] < TBASE + TCOUNT) {
1167
+ /* Hangul L+V+T */
1168
+ length += 2;
1169
+ j++;
1170
+ }
1171
+ else if (c0 >= LBASE && c0 < LBASE + LCOUNT &&
1172
+ c1 >= VBASE && c1< VBASE + VCOUNT) {
1173
+ /* Hangul L+V */
1174
+ length++;
1175
+ }
1176
+ else if (c0 >= SBASE && c0 < SBASE + SCOUNT &&
1177
+ (c0 - SBASE) % TCOUNT == 0 &&
1178
+ c1 >= TBASE && c1 < TBASE + TCOUNT) {
1179
+ /* Hangul LV+T */
1180
+ length++;
1181
+ }
1182
+ else if (cat == c_Mn || cat == c_Mc || cat == c_Me) {
1183
+ /* Mark */
1184
+ length++;
1185
+ }
1186
+ else {
1187
+ volatile VALUE rstr = TO_(str, wstring_to_rstring(wstr, start_pos, length));
1188
+ if (!block_p)
1189
+ rb_ary_push(ret, rstr);
1190
+ else
1191
+ rb_yield(rstr);
1192
+ length = 0;
1193
+ break;
1194
+ }
1195
+ }
1196
+ if (length > 0) {
1197
+ volatile VALUE rstr = TO_(str, wstring_to_rstring(wstr, start_pos, length));
1198
+ if (!block_p)
1199
+ rb_ary_push(ret, rstr);
1200
+ else
1201
+ rb_yield(rstr);
1202
+ }
1203
+ start_pos = j;
1204
+ }
1205
+ return ret;
1206
+ }
1207
+
1208
+ VALUE
1209
+ get_text_elements_ensure(WString* wstr)
1210
+ {
1211
+ WStr_free(wstr);
1212
+ return Qnil;
1213
+ }
1214
+
1215
+ VALUE
1216
+ unicode_get_text_elements(VALUE obj, VALUE str)
1217
+ {
1218
+ WString wstr;
1219
+ get_text_elements_param param = { &wstr, str };
1220
+
1221
+ Check_Type(str, T_STRING);
1222
+ #ifdef HAVE_RUBY_ENCODING_H
1223
+ CONVERT_TO_UTF8(str);
1224
+ #endif
1225
+ WStr_allocWithUTF8L(&wstr, RSTRING_PTR(str), RSTRING_LEN(str));
1226
+
1227
+ return rb_ensure(get_text_elements_internal, (VALUE)&param,
1228
+ get_text_elements_ensure, (VALUE)&wstr);
1229
+ /* wstr will be freed in get_text_elements_ensure() */
1230
+ }
1231
+
885
1232
  void
886
1233
  Init_unicode()
887
1234
  {
@@ -909,6 +1256,13 @@ Init_unicode()
909
1256
  }
910
1257
  }
911
1258
 
1259
+ for (i = 0; i < c_Cn + 1; i++) {
1260
+ catname_abbr[i] = ID2SYM(rb_intern(gencat_abbr[i]));
1261
+ catname_long[i] = ID2SYM(rb_intern(gencat_long[i]));
1262
+ rb_global_variable(&catname_abbr[i]);
1263
+ rb_global_variable(&catname_long[i]);
1264
+ }
1265
+
912
1266
  rb_define_module_function(mUnicode, "strcmp",
913
1267
  unicode_strcmp, 2);
914
1268
  rb_define_module_function(mUnicode, "strcmp_compat",
@@ -957,6 +1311,15 @@ Init_unicode()
957
1311
  rb_define_module_function(mUnicode, "capitalize",
958
1312
  unicode_capitalize, 1);
959
1313
 
1314
+ rb_define_module_function(mUnicode, "categories",
1315
+ unicode_get_categories, 1);
1316
+ rb_define_module_function(mUnicode, "abbr_categories",
1317
+ unicode_get_abbr_categories, 1);
1318
+ rb_define_module_function(mUnicode, "width",
1319
+ unicode_wcswidth, -1);
1320
+ rb_define_module_function(mUnicode, "text_elements",
1321
+ unicode_get_text_elements, 1);
1322
+
960
1323
  rb_define_const(mUnicode, "VERSION",
961
1324
  rb_str_new2(UNICODE_VERSION));
962
1325
  }