unicode 0.4.2-x86-mingw32 → 0.4.3-x86-mingw32

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/README CHANGED
@@ -1,5 +1,5 @@
1
1
  Unicode Library for Ruby
2
- Version 0.4.2
2
+ Version 0.4.3
3
3
 
4
4
  Yoshida Masato
5
5
 
@@ -7,14 +7,14 @@
7
7
  - Introduction
8
8
 
9
9
  Unicode string manipulation library for Ruby.
10
- This library is based on UTR #15 Unicode Normalization Forms(*1).
10
+ This library is based on UAX #15 Unicode Normalization Forms(*1).
11
11
 
12
12
  *1 <URL:http://www.unicode.org/unicode/reports/tr15/>
13
13
 
14
14
 
15
15
  - Install
16
16
 
17
- This can work with ruby-1.8 or later. I recommend you to
17
+ This can work with ruby-1.8.7 or later. I recommend you to
18
18
  use ruby-1.9.3 or later.
19
19
 
20
20
  Make and install usually.
@@ -79,7 +79,7 @@
79
79
  These are aliases of decompose/decompose_compat.
80
80
 
81
81
  Unicode::normalize_D_safe(str) (Unicode::nfd_safe(str))
82
- This is an aliase of decompose_safe.
82
+ This is an alias of decompose_safe.
83
83
 
84
84
  Unicode::normalize_C(str) (Unicode::nfc(str))
85
85
  Unicode::normalize_KC(str) (Unicode::nfkc(str))
@@ -98,14 +98,35 @@
98
98
  The mappings that are used by these functions are not normative
99
99
  in UnicodeData.txt.
100
100
 
101
+ Unicode::categories(str)
102
+ Unicode::abbr_categories(str)
103
+ Get an array of general category names of the string.
104
+ abbr_categories returns abbreviated names.
105
+ These can be called with a block.
106
+
107
+ Unicode.categories(str) do |category| p category end
108
+
109
+ Unicode::text_elements(str)
110
+ Get an array of text elements.
111
+ A text element is a unit that is displayed as a single character.
112
+ These can be called with a block.
113
+
114
+ Unicode::width(str[, cjk])
115
+ Estimate the display width on a fixed-pitch text terminal.
116
+ It is based on Markus Kuhn's mk_wcwidth.
117
+ If the optional argument 'cjk' is true, East Asian
118
+ Ambiguous characters are treated as wide characters.
119
+
120
+ Unicode.width("\u03b1") #=> 1
121
+ Unicode.width("\u03b1", true) #=> 2
122
+
123
+
101
124
  - Bugs
102
125
 
103
- UTR #15 suggests that the look up for Normalization Form C
126
+ UAX #15 suggests that the look up for Normalization Form C
104
127
  should not be implemented with a hash of string for better
105
128
  performance.
106
129
 
107
- Case conversion functions should reflecte UTR #21.
108
-
109
130
 
110
131
  - Copying
111
132
 
@@ -123,6 +144,7 @@
123
144
 
124
145
  - History
125
146
 
147
+ Aug 8, 2012 version 0.4.3 add categories, text_elements and width
126
148
  Feb 29, 2012 version 0.4.2 add decompose_safe
127
149
  Feb 3, 2012 version 0.4.1 update unidata.map for Unicode 6.1
128
150
  Oct 14, 2010 version 0.4.0 fix the composition algorithm, and support Unicode 6.0
@@ -1,5 +1,6 @@
1
1
  /*
2
- * Unicode Library version 0.4
2
+ * Unicode Library version 0.4.3
3
+ * Aug 8, 2012: version 0.4.3
3
4
  * Oct 14, 2010: version 0.4
4
5
  * Feb 26, 2010: version 0.3
5
6
  * Dec 29, 2009: version 0.2
@@ -7,7 +8,7 @@
7
8
  *
8
9
  */
9
10
 
10
- #define UNICODE_VERSION "0.4.2"
11
+ #define UNICODE_VERSION "0.4.3"
11
12
 
12
13
  #include "ruby.h"
13
14
  #ifdef HAVE_RUBY_IO_H
@@ -54,6 +55,8 @@ taintObject(VALUE src, VALUE obj) {
54
55
  static VALUE mUnicode;
55
56
  static VALUE unicode_data;
56
57
  static VALUE composition_table;
58
+ static VALUE catname_long[c_Cn+1];
59
+ static VALUE catname_abbr[c_Cn+1];
57
60
 
58
61
  /* Hangul */
59
62
  #define SBASE (0xac00)
@@ -66,6 +69,86 @@ static VALUE composition_table;
66
69
  #define NCOUNT (VCOUNT * TCOUNT) /* 588 */
67
70
  #define SCOUNT (LCOUNT * NCOUNT) /* 11172 */
68
71
 
72
+ VALUE
73
+ get_unidata(int ucs) {
74
+ VALUE ch = rb_hash_aref(unicode_data, INT2FIX(ucs));
75
+ if (!NIL_P(ch))
76
+ return ch;
77
+ #ifdef CJK_IDEOGRAPH_EXTENSION_A_FIRST
78
+ else if (ucs >= CJK_IDEOGRAPH_EXTENSION_A_FIRST &&
79
+ ucs <= CJK_IDEOGRAPH_EXTENSION_A_LAST)
80
+ return rb_hash_aref(unicode_data,
81
+ INT2FIX(CJK_IDEOGRAPH_EXTENSION_A_FIRST));
82
+ #endif
83
+ #ifdef CJK_IDEOGRAPH_FIRST
84
+ else if (ucs >= CJK_IDEOGRAPH_FIRST &&
85
+ ucs <= CJK_IDEOGRAPH_LAST)
86
+ return rb_hash_aref(unicode_data,
87
+ INT2FIX(CJK_IDEOGRAPH_FIRST));
88
+ #endif
89
+ #ifdef HANGUL_SYLLABLE_FIRST
90
+ else if (ucs >= HANGUL_SYLLABLE_FIRST &&
91
+ ucs <= HANGUL_SYLLABLE_LAST)
92
+ return rb_hash_aref(unicode_data,
93
+ INT2FIX(HANGUL_SYLLABLE_FIRST));
94
+ #endif
95
+ #ifdef NON_PRIVATE_USE_HIGH_SURROGATE_FIRST
96
+ else if (ucs >= NON_PRIVATE_USE_HIGH_SURROGATE_FIRST &&
97
+ ucs <= NON_PRIVATE_USE_HIGH_SURROGATE_LAST)
98
+ return rb_hash_aref(unicode_data,
99
+ INT2FIX(NON_PRIVATE_USE_HIGH_SURROGATE_FIRST));
100
+ #endif
101
+ #ifdef PRIVATE_USE_HIGH_SURROGATE_FIRST
102
+ else if (ucs >= PRIVATE_USE_HIGH_SURROGATE_FIRST &&
103
+ ucs <= PRIVATE_USE_HIGH_SURROGATE_LAST)
104
+ return rb_hash_aref(unicode_data,
105
+ INT2FIX(PRIVATE_USE_HIGH_SURROGATE_FIRST));
106
+ #endif
107
+ #ifdef LOW_SURROGATE_FIRST
108
+ else if (ucs >= LOW_SURROGATE_FIRST &&
109
+ ucs <= LOW_SURROGATE_LAST)
110
+ return rb_hash_aref(unicode_data,
111
+ INT2FIX(LOW_SURROGATE_FIRST));
112
+ #endif
113
+ #ifdef PRIVATE_USE_FIRST
114
+ else if (ucs >= PRIVATE_USE_FIRST &&
115
+ ucs <= PRIVATE_USE_LAST)
116
+ return rb_hash_aref(unicode_data,
117
+ INT2FIX(PRIVATE_USE_FIRST));
118
+ #endif
119
+ #ifdef CJK_IDEOGRAPH_EXTENSION_B_FIRST
120
+ else if (ucs >= CJK_IDEOGRAPH_EXTENSION_B_FIRST &&
121
+ ucs <= CJK_IDEOGRAPH_EXTENSION_B_LAST)
122
+ return rb_hash_aref(unicode_data,
123
+ INT2FIX(CJK_IDEOGRAPH_EXTENSION_B_FIRST));
124
+ #endif
125
+ #ifdef CJK_IDEOGRAPH_EXTENSION_C_FIRST
126
+ else if (ucs >= CJK_IDEOGRAPH_EXTENSION_C_FIRST &&
127
+ ucs <= CJK_IDEOGRAPH_EXTENSION_C_LAST)
128
+ return rb_hash_aref(unicode_data,
129
+ INT2FIX(CJK_IDEOGRAPH_EXTENSION_C_FIRST));
130
+ #endif
131
+ #ifdef CJK_IDEOGRAPH_EXTENSION_D_FIRST
132
+ else if (ucs >= CJK_IDEOGRAPH_EXTENSION_D_FIRST &&
133
+ ucs <= CJK_IDEOGRAPH_EXTENSION_D_LAST)
134
+ return rb_hash_aref(unicode_data,
135
+ INT2FIX(CJK_IDEOGRAPH_EXTENSION_D_FIRST));
136
+ #endif
137
+ #ifdef PLANE_15_PRIVATE_USE_FIRST
138
+ else if (ucs >= PLANE_15_PRIVATE_USE_FIRST &&
139
+ ucs <= PLANE_15_PRIVATE_USE_LAST)
140
+ return rb_hash_aref(unicode_data,
141
+ INT2FIX(PLANE_15_PRIVATE_USE_FIRST));
142
+ #endif
143
+ #ifdef PLANE_16_PRIVATE_USE_FIRST
144
+ else if (ucs >= PLANE_16_PRIVATE_USE_FIRST &&
145
+ ucs <= PLANE_16_PRIVATE_USE_LAST)
146
+ return rb_hash_aref(unicode_data,
147
+ INT2FIX(PLANE_16_PRIVATE_USE_FIRST));
148
+ #endif
149
+ return Qnil;
150
+ }
151
+
69
152
  static int
70
153
  get_cc(int ucs)
71
154
  {
@@ -77,6 +160,28 @@ get_cc(int ucs)
77
160
  return 0;
78
161
  }
79
162
 
163
+ static int
164
+ get_gencat(int ucs)
165
+ {
166
+ VALUE ch = get_unidata(ucs);
167
+
168
+ if (!NIL_P(ch)) {
169
+ return unidata[FIX2INT(ch)].general_category;
170
+ }
171
+ return c_Cn; /* Unassigned */
172
+ }
173
+
174
+ static int
175
+ get_eawidth(int ucs)
176
+ {
177
+ VALUE ch = get_unidata(ucs);
178
+
179
+ if (!NIL_P(ch)) {
180
+ return unidata[FIX2INT(ch)].east_asian_width;
181
+ }
182
+ return w_N; /* Neutral */
183
+ }
184
+
80
185
  static const char*
81
186
  get_canon(int ucs)
82
187
  {
@@ -538,8 +643,8 @@ unicode_strcmp(VALUE obj, VALUE str1, VALUE str2)
538
643
  CONVERT_TO_UTF8(str1);
539
644
  CONVERT_TO_UTF8(str2);
540
645
  #endif
541
- WStr_allocWithUTF8(&wstr1, RSTRING_PTR(str1));
542
- WStr_allocWithUTF8(&wstr2, RSTRING_PTR(str2));
646
+ WStr_allocWithUTF8L(&wstr1, RSTRING_PTR(str1), RSTRING_LEN(str1));
647
+ WStr_allocWithUTF8L(&wstr2, RSTRING_PTR(str2), RSTRING_LEN(str2));
543
648
  WStr_alloc(&result1);
544
649
  WStr_alloc(&result2);
545
650
  decompose_internal(&wstr1, &result1);
@@ -580,8 +685,8 @@ unicode_strcmp_compat(VALUE obj, VALUE str1, VALUE str2)
580
685
  CONVERT_TO_UTF8(str1);
581
686
  CONVERT_TO_UTF8(str2);
582
687
  #endif
583
- WStr_allocWithUTF8(&wstr1, RSTRING_PTR(str1));
584
- WStr_allocWithUTF8(&wstr2, RSTRING_PTR(str2));
688
+ WStr_allocWithUTF8L(&wstr1, RSTRING_PTR(str1), RSTRING_LEN(str1));
689
+ WStr_allocWithUTF8L(&wstr2, RSTRING_PTR(str2), RSTRING_LEN(str2));
585
690
  WStr_alloc(&result1);
586
691
  WStr_alloc(&result2);
587
692
  decompose_compat_internal(&wstr1, &result1);
@@ -617,7 +722,7 @@ unicode_decompose(VALUE obj, VALUE str)
617
722
  #ifdef HAVE_RUBY_ENCODING_H
618
723
  CONVERT_TO_UTF8(str);
619
724
  #endif
620
- WStr_allocWithUTF8(&ustr, RSTRING_PTR(str));
725
+ WStr_allocWithUTF8L(&ustr, RSTRING_PTR(str), RSTRING_LEN(str));
621
726
  WStr_alloc(&result);
622
727
  decompose_internal(&ustr, &result);
623
728
  WStr_free(&ustr);
@@ -643,7 +748,7 @@ unicode_decompose_safe(VALUE obj, VALUE str)
643
748
  #ifdef HAVE_RUBY_ENCODING_H
644
749
  CONVERT_TO_UTF8(str);
645
750
  #endif
646
- WStr_allocWithUTF8(&ustr, RSTRING_PTR(str));
751
+ WStr_allocWithUTF8L(&ustr, RSTRING_PTR(str), RSTRING_LEN(str));
647
752
  WStr_alloc(&result);
648
753
  decompose_safe_internal(&ustr, &result);
649
754
  WStr_free(&ustr);
@@ -669,7 +774,7 @@ unicode_decompose_compat(VALUE obj, VALUE str)
669
774
  #ifdef HAVE_RUBY_ENCODING_H
670
775
  CONVERT_TO_UTF8(str);
671
776
  #endif
672
- WStr_allocWithUTF8(&ustr, RSTRING_PTR(str));
777
+ WStr_allocWithUTF8L(&ustr, RSTRING_PTR(str), RSTRING_LEN(str));
673
778
  WStr_alloc(&result);
674
779
  decompose_compat_internal(&ustr, &result);
675
780
  WStr_free(&ustr);
@@ -695,7 +800,7 @@ unicode_compose(VALUE obj, VALUE str)
695
800
  #ifdef HAVE_RUBY_ENCODING_H
696
801
  CONVERT_TO_UTF8(str);
697
802
  #endif
698
- WStr_allocWithUTF8(&ustr, RSTRING_PTR(str));
803
+ WStr_allocWithUTF8L(&ustr, RSTRING_PTR(str), RSTRING_LEN(str));
699
804
  sort_canonical(&ustr);
700
805
  WStr_alloc(&result);
701
806
  compose_internal(&ustr, &result);
@@ -722,7 +827,7 @@ unicode_normalize_C(VALUE obj, VALUE str)
722
827
  #ifdef HAVE_RUBY_ENCODING_H
723
828
  CONVERT_TO_UTF8(str);
724
829
  #endif
725
- WStr_allocWithUTF8(&ustr1, RSTRING_PTR(str));
830
+ WStr_allocWithUTF8L(&ustr1, RSTRING_PTR(str), RSTRING_LEN(str));
726
831
  WStr_alloc(&ustr2);
727
832
  decompose_internal(&ustr1, &ustr2);
728
833
  WStr_free(&ustr1);
@@ -752,7 +857,7 @@ unicode_normalize_safe(VALUE obj, VALUE str)
752
857
  #ifdef HAVE_RUBY_ENCODING_H
753
858
  CONVERT_TO_UTF8(str);
754
859
  #endif
755
- WStr_allocWithUTF8(&ustr1, RSTRING_PTR(str));
860
+ WStr_allocWithUTF8L(&ustr1, RSTRING_PTR(str), RSTRING_LEN(str));
756
861
  WStr_alloc(&ustr2);
757
862
  decompose_safe_internal(&ustr1, &ustr2);
758
863
  WStr_free(&ustr1);
@@ -782,7 +887,7 @@ unicode_normalize_KC(VALUE obj, VALUE str)
782
887
  #ifdef HAVE_RUBY_ENCODING_H
783
888
  CONVERT_TO_UTF8(str);
784
889
  #endif
785
- WStr_allocWithUTF8(&ustr1, RSTRING_PTR(str));
890
+ WStr_allocWithUTF8L(&ustr1, RSTRING_PTR(str), RSTRING_LEN(str));
786
891
  WStr_alloc(&ustr2);
787
892
  decompose_compat_internal(&ustr1, &ustr2);
788
893
  WStr_free(&ustr1);
@@ -811,7 +916,7 @@ unicode_upcase(VALUE obj, VALUE str)
811
916
  #ifdef HAVE_RUBY_ENCODING_H
812
917
  CONVERT_TO_UTF8(str);
813
918
  #endif
814
- WStr_allocWithUTF8(&ustr, RSTRING_PTR(str));
919
+ WStr_allocWithUTF8L(&ustr, RSTRING_PTR(str), RSTRING_LEN(str));
815
920
  WStr_alloc(&result);
816
921
  upcase_internal(&ustr, &result);
817
922
  //sort_canonical(&result);
@@ -837,7 +942,7 @@ unicode_downcase(VALUE obj, VALUE str)
837
942
  #ifdef HAVE_RUBY_ENCODING_H
838
943
  CONVERT_TO_UTF8(str);
839
944
  #endif
840
- WStr_allocWithUTF8(&ustr, RSTRING_PTR(str));
945
+ WStr_allocWithUTF8L(&ustr, RSTRING_PTR(str), RSTRING_LEN(str));
841
946
  WStr_alloc(&result);
842
947
  downcase_internal(&ustr, &result);
843
948
  //sort_canonical(&result);
@@ -868,7 +973,7 @@ unicode_capitalize(VALUE obj, VALUE str)
868
973
  #ifdef HAVE_RUBY_ENCODING_H
869
974
  CONVERT_TO_UTF8(str);
870
975
  #endif
871
- WStr_allocWithUTF8(&ustr, RSTRING_PTR(str));
976
+ WStr_allocWithUTF8L(&ustr, RSTRING_PTR(str), RSTRING_LEN(str));
872
977
  WStr_alloc(&result);
873
978
  capitalize_internal(&ustr, &result);
874
979
  //sort_canonical(&result);
@@ -882,6 +987,248 @@ unicode_capitalize(VALUE obj, VALUE str)
882
987
  return vret;
883
988
  }
884
989
 
990
+ typedef struct _get_categories_param {
991
+ WString* wstr;
992
+ VALUE str;
993
+ VALUE* catname;
994
+ } get_categories_param;
995
+
996
+ static VALUE
997
+ get_categories_internal(get_categories_param* param)
998
+ {
999
+ WString* wstr = param->wstr;
1000
+ VALUE str = param->str;
1001
+ VALUE* catname = param->catname;
1002
+ int pos;
1003
+ int block_p = rb_block_given_p();
1004
+ volatile VALUE ret = str;
1005
+
1006
+ if (!block_p)
1007
+ ret = rb_ary_new();
1008
+ for (pos = 0; pos < wstr->len; pos++) {
1009
+ int gencat = get_gencat(wstr->str[pos]);
1010
+ if (!block_p)
1011
+ rb_ary_push(ret, catname[gencat]);
1012
+ else {
1013
+ rb_yield(catname[gencat]);
1014
+ }
1015
+ }
1016
+
1017
+ return ret;
1018
+ }
1019
+
1020
+ VALUE
1021
+ get_categories_ensure(WString* wstr)
1022
+ {
1023
+ WStr_free(wstr);
1024
+ return Qnil;
1025
+ }
1026
+
1027
+ VALUE
1028
+ unicode_get_categories(VALUE obj, VALUE str)
1029
+ {
1030
+ WString wstr;
1031
+ get_categories_param param = { &wstr, str, catname_long };
1032
+
1033
+ Check_Type(str, T_STRING);
1034
+ #ifdef HAVE_RUBY_ENCODING_H
1035
+ CONVERT_TO_UTF8(str);
1036
+ #endif
1037
+ WStr_allocWithUTF8L(&wstr, RSTRING_PTR(str), RSTRING_LEN(str));
1038
+
1039
+ return rb_ensure(get_categories_internal, (VALUE)&param,
1040
+ get_categories_ensure, (VALUE)&wstr);
1041
+ /* wstr will be freed in get_categories_ensure() */
1042
+ }
1043
+
1044
+
1045
+ VALUE
1046
+ unicode_get_abbr_categories(VALUE obj, VALUE str)
1047
+ {
1048
+ WString wstr;
1049
+ get_categories_param param = { &wstr, str, catname_abbr };
1050
+
1051
+ Check_Type(str, T_STRING);
1052
+ #ifdef HAVE_RUBY_ENCODING_H
1053
+ CONVERT_TO_UTF8(str);
1054
+ #endif
1055
+ WStr_allocWithUTF8L(&wstr, RSTRING_PTR(str), RSTRING_LEN(str));
1056
+
1057
+ return rb_ensure(get_categories_internal, (VALUE)&param,
1058
+ get_categories_ensure, (VALUE)&wstr);
1059
+ /* wstr will be freed in get_categories_ensure() */
1060
+ }
1061
+
1062
+ VALUE
1063
+ unicode_wcswidth(int argc, VALUE* argv, VALUE obj)
1064
+ {
1065
+ WString wstr;
1066
+ int i, count;
1067
+ int width = 0;
1068
+ int cjk_p = 0;
1069
+ VALUE str;
1070
+ VALUE cjk;
1071
+
1072
+ count = rb_scan_args(argc, argv, "11", &str, &cjk);
1073
+ if (count > 1)
1074
+ cjk_p = RTEST(cjk);
1075
+ Check_Type(str, T_STRING);
1076
+ #ifdef HAVE_RUBY_ENCODING_H
1077
+ CONVERT_TO_UTF8(str);
1078
+ #endif
1079
+ WStr_allocWithUTF8L(&wstr, RSTRING_PTR(str), RSTRING_LEN(str));
1080
+ for (i = 0; i <wstr.len; i++) {
1081
+ int c = wstr.str[i];
1082
+ int cat = get_gencat(c);
1083
+ int eaw = get_eawidth(c);
1084
+ if ((c > 0 && c < 32) || (c >= 0x7f && c < 0xa0)) {
1085
+ /* Control Characters */
1086
+ width = -1;
1087
+ break;
1088
+ }
1089
+ else if (c != 0x00ad && /* SOFT HYPHEN */
1090
+ (cat == c_Mn || cat == c_Me || /* Non-spacing Marks */
1091
+ cat == c_Cf || /* Format */
1092
+ c == 0 || /* NUL */
1093
+ (c >= 0x1160 && c <= 0x11ff))) /* HANGUL JUNGSEONG/JONGSEONG */
1094
+ /* zero width */ ;
1095
+ else if (eaw == w_F || eaw == w_W || /* Fullwidth or Wide */
1096
+ (c >= 0x4db6 && c <= 0x4dbf) || /* CJK Reserved */
1097
+ (c >= 0x9fcd && c <= 0x9fff) || /* CJK Reserved */
1098
+ (c >= 0xfa6e && c <= 0xfa6f) || /* CJK Reserved */
1099
+ (c >= 0xfada && c <= 0xfaff) || /* CJK Reserved */
1100
+ (c >= 0x2a6d7 && c <= 0x2a6ff) || /* CJK Reserved */
1101
+ (c >= 0x2b735 && c <= 0x2b73f) || /* CJK Reserved */
1102
+ (c >= 0x2b81e && c <= 0x2f7ff) || /* CJK Reserved */
1103
+ (c >= 0x2fa1e && c <= 0x2fffd) || /* CJK Reserved */
1104
+ (c >= 0x30000 && c <= 0x3fffd) || /* CJK Reserved */
1105
+ (cjk_p && eaw == w_A)) /* East Asian Ambiguous */
1106
+ width += 2;
1107
+ else
1108
+ width++; /* Halfwidth or Neutral */
1109
+ }
1110
+ WStr_free(&wstr);
1111
+
1112
+ return INT2FIX(width);
1113
+ }
1114
+
1115
+ VALUE
1116
+ wstring_to_rstring(WString* wstr, int start, int len) {
1117
+ UString ret;
1118
+ volatile VALUE vret;
1119
+
1120
+ UniStr_alloc(&ret);
1121
+ WStr_convertIntoUString2(wstr, start, len, &ret);
1122
+ vret = ENC_(rb_str_new((char*)ret.str, ret.len));
1123
+ UniStr_free(&ret);
1124
+
1125
+ return vret;
1126
+ }
1127
+
1128
+ typedef struct _get_text_elements_param {
1129
+ WString* wstr;
1130
+ VALUE str;
1131
+ } get_text_elements_param;
1132
+
1133
+ VALUE
1134
+ get_text_elements_internal(get_text_elements_param* param)
1135
+ {
1136
+ WString* wstr = param->wstr;
1137
+ VALUE str = param->str;
1138
+ int start_pos;
1139
+ int block_p = rb_block_given_p();
1140
+ volatile VALUE ret = str;
1141
+
1142
+ if (!block_p)
1143
+ ret = rb_ary_new();
1144
+ for (start_pos = 0; start_pos < wstr->len;) {
1145
+ int c0 = wstr->str[start_pos];
1146
+ int cat = get_gencat(c0);
1147
+ int length = 1;
1148
+ int j;
1149
+
1150
+ if (cat == c_Mn || cat == c_Mc || cat == c_Me) {
1151
+ volatile VALUE rstr = TO_(str, wstring_to_rstring(wstr, start_pos, length));
1152
+ if (!block_p)
1153
+ rb_ary_push(ret, rstr);
1154
+ else
1155
+ rb_yield(rstr);
1156
+ start_pos++;
1157
+ continue;
1158
+ }
1159
+
1160
+ for (j = start_pos + 1; j < wstr->len; j++) {
1161
+ int c1 = wstr->str[j];
1162
+ int cat = get_gencat(c1);
1163
+ if (c0 >= LBASE && c0 < LBASE + LCOUNT &&
1164
+ j + 1 < wstr->len &&
1165
+ c1 >= VBASE && c1 < VBASE + VCOUNT &&
1166
+ wstr->str[j+1] >= TBASE && wstr->str[j+1] < TBASE + TCOUNT) {
1167
+ /* Hangul L+V+T */
1168
+ length += 2;
1169
+ j++;
1170
+ }
1171
+ else if (c0 >= LBASE && c0 < LBASE + LCOUNT &&
1172
+ c1 >= VBASE && c1< VBASE + VCOUNT) {
1173
+ /* Hangul L+V */
1174
+ length++;
1175
+ }
1176
+ else if (c0 >= SBASE && c0 < SBASE + SCOUNT &&
1177
+ (c0 - SBASE) % TCOUNT == 0 &&
1178
+ c1 >= TBASE && c1 < TBASE + TCOUNT) {
1179
+ /* Hangul LV+T */
1180
+ length++;
1181
+ }
1182
+ else if (cat == c_Mn || cat == c_Mc || cat == c_Me) {
1183
+ /* Mark */
1184
+ length++;
1185
+ }
1186
+ else {
1187
+ volatile VALUE rstr = TO_(str, wstring_to_rstring(wstr, start_pos, length));
1188
+ if (!block_p)
1189
+ rb_ary_push(ret, rstr);
1190
+ else
1191
+ rb_yield(rstr);
1192
+ length = 0;
1193
+ break;
1194
+ }
1195
+ }
1196
+ if (length > 0) {
1197
+ volatile VALUE rstr = TO_(str, wstring_to_rstring(wstr, start_pos, length));
1198
+ if (!block_p)
1199
+ rb_ary_push(ret, rstr);
1200
+ else
1201
+ rb_yield(rstr);
1202
+ }
1203
+ start_pos = j;
1204
+ }
1205
+ return ret;
1206
+ }
1207
+
1208
+ VALUE
1209
+ get_text_elements_ensure(WString* wstr)
1210
+ {
1211
+ WStr_free(wstr);
1212
+ return Qnil;
1213
+ }
1214
+
1215
+ VALUE
1216
+ unicode_get_text_elements(VALUE obj, VALUE str)
1217
+ {
1218
+ WString wstr;
1219
+ get_text_elements_param param = { &wstr, str };
1220
+
1221
+ Check_Type(str, T_STRING);
1222
+ #ifdef HAVE_RUBY_ENCODING_H
1223
+ CONVERT_TO_UTF8(str);
1224
+ #endif
1225
+ WStr_allocWithUTF8L(&wstr, RSTRING_PTR(str), RSTRING_LEN(str));
1226
+
1227
+ return rb_ensure(get_text_elements_internal, (VALUE)&param,
1228
+ get_text_elements_ensure, (VALUE)&wstr);
1229
+ /* wstr will be freed in get_text_elements_ensure() */
1230
+ }
1231
+
885
1232
  void
886
1233
  Init_unicode_native()
887
1234
  {
@@ -909,6 +1256,13 @@ Init_unicode_native()
909
1256
  }
910
1257
  }
911
1258
 
1259
+ for (i = 0; i < c_Cn + 1; i++) {
1260
+ catname_abbr[i] = ID2SYM(rb_intern(gencat_abbr[i]));
1261
+ catname_long[i] = ID2SYM(rb_intern(gencat_long[i]));
1262
+ rb_global_variable(&catname_abbr[i]);
1263
+ rb_global_variable(&catname_long[i]);
1264
+ }
1265
+
912
1266
  rb_define_module_function(mUnicode, "strcmp",
913
1267
  unicode_strcmp, 2);
914
1268
  rb_define_module_function(mUnicode, "strcmp_compat",
@@ -957,6 +1311,15 @@ Init_unicode_native()
957
1311
  rb_define_module_function(mUnicode, "capitalize",
958
1312
  unicode_capitalize, 1);
959
1313
 
1314
+ rb_define_module_function(mUnicode, "categories",
1315
+ unicode_get_categories, 1);
1316
+ rb_define_module_function(mUnicode, "abbr_categories",
1317
+ unicode_get_abbr_categories, 1);
1318
+ rb_define_module_function(mUnicode, "width",
1319
+ unicode_wcswidth, -1);
1320
+ rb_define_module_function(mUnicode, "text_elements",
1321
+ unicode_get_text_elements, 1);
1322
+
960
1323
  rb_define_const(mUnicode, "VERSION",
961
1324
  rb_str_new2(UNICODE_VERSION));
962
1325
  }