unicode 0.4.2-x86-mswin32-60 → 0.4.3-x86-mswin32-60

Sign up to get free protection for your applications and to get access to all the features.
data/README CHANGED
@@ -1,5 +1,5 @@
1
1
  Unicode Library for Ruby
2
- Version 0.4.2
2
+ Version 0.4.3
3
3
 
4
4
  Yoshida Masato
5
5
 
@@ -7,14 +7,14 @@
7
7
  - Introduction
8
8
 
9
9
  Unicode string manipulation library for Ruby.
10
- This library is based on UTR #15 Unicode Normalization Forms(*1).
10
+ This library is based on UAX #15 Unicode Normalization Forms(*1).
11
11
 
12
12
  *1 <URL:http://www.unicode.org/unicode/reports/tr15/>
13
13
 
14
14
 
15
15
  - Install
16
16
 
17
- This can work with ruby-1.8 or later. I recommend you to
17
+ This can work with ruby-1.8.7 or later. I recommend you to
18
18
  use ruby-1.9.3 or later.
19
19
 
20
20
  Make and install usually.
@@ -79,7 +79,7 @@
79
79
  These are aliases of decompose/decompose_compat.
80
80
 
81
81
  Unicode::normalize_D_safe(str) (Unicode::nfd_safe(str))
82
- This is an aliase of decompose_safe.
82
+ This is an alias of decompose_safe.
83
83
 
84
84
  Unicode::normalize_C(str) (Unicode::nfc(str))
85
85
  Unicode::normalize_KC(str) (Unicode::nfkc(str))
@@ -98,14 +98,35 @@
98
98
  The mappings that are used by these functions are not normative
99
99
  in UnicodeData.txt.
100
100
 
101
+ Unicode::categories(str)
102
+ Unicode::abbr_categories(str)
103
+ Get an array of general category names of the string.
104
+ abbr_categories returns abbreviated names.
105
+ These can be called with a block.
106
+
107
+ Unicode.categories(str) do |category| p category end
108
+
109
+ Unicode::text_elements(str)
110
+ Get an array of text elements.
111
+ A text element is a unit that is displayed as a single character.
112
+ These can be called with a block.
113
+
114
+ Unicode::width(str[, cjk])
115
+ Estimate the display width on the fixed pitch text terminal.
116
+ It is based on Markus Kuhn's mk_wcwidth.
117
+ If the optional argument 'cjk' is true, East Asian
118
+ Ambiguous characters are treated as wide characters.
119
+
120
+ Unicode.width("\u03b1") #=> 1
121
+ Unicode.width("\u03b1", true) #=> 2
122
+
123
+
101
124
  - Bugs
102
125
 
103
- UTR #15 suggests that the look up for Normalization Form C
126
+ UAX #15 suggests that the look up for Normalization Form C
104
127
  should not be implemented with a hash of string for better
105
128
  performance.
106
129
 
107
- Case conversion functions should reflecte UTR #21.
108
-
109
130
 
110
131
  - Copying
111
132
 
@@ -123,6 +144,7 @@
123
144
 
124
145
  - History
125
146
 
147
+ Aug 8, 2012 version 0.4.3 add categories, text_elements and width
126
148
  Feb 29, 2012 version 0.4.2 add decompose_safe
127
149
  Feb 3, 2012 version 0.4.1 update unidata.map for Unicode 6.1
128
150
  Oct 14, 2010 version 0.4.0 fix the composition algorithm, and support Unicode 6.0
@@ -1,5 +1,6 @@
1
1
  /*
2
- * Unicode Library version 0.4
2
+ * Unicode Library version 0.4.3
3
+ * Aug 8, 2012: version 0.4.3
3
4
  * Oct 14, 2010: version 0.4
4
5
  * Feb 26, 2010: version 0.3
5
6
  * Dec 29, 2009: version 0.2
@@ -7,7 +8,7 @@
7
8
  *
8
9
  */
9
10
 
10
- #define UNICODE_VERSION "0.4.2"
11
+ #define UNICODE_VERSION "0.4.3"
11
12
 
12
13
  #include "ruby.h"
13
14
  #ifdef HAVE_RUBY_IO_H
@@ -54,6 +55,8 @@ taintObject(VALUE src, VALUE obj) {
54
55
  static VALUE mUnicode;
55
56
  static VALUE unicode_data;
56
57
  static VALUE composition_table;
58
+ static VALUE catname_long[c_Cn+1];
59
+ static VALUE catname_abbr[c_Cn+1];
57
60
 
58
61
  /* Hangul */
59
62
  #define SBASE (0xac00)
@@ -66,6 +69,86 @@ static VALUE composition_table;
66
69
  #define NCOUNT (VCOUNT * TCOUNT) /* 588 */
67
70
  #define SCOUNT (LCOUNT * NCOUNT) /* 11172 */
68
71
 
72
+ VALUE
73
+ get_unidata(int ucs) {
74
+ VALUE ch = rb_hash_aref(unicode_data, INT2FIX(ucs));
75
+ if (!NIL_P(ch))
76
+ return ch;
77
+ #ifdef CJK_IDEOGRAPH_EXTENSION_A_FIRST
78
+ else if (ucs >= CJK_IDEOGRAPH_EXTENSION_A_FIRST &&
79
+ ucs <= CJK_IDEOGRAPH_EXTENSION_A_LAST)
80
+ return rb_hash_aref(unicode_data,
81
+ INT2FIX(CJK_IDEOGRAPH_EXTENSION_A_FIRST));
82
+ #endif
83
+ #ifdef CJK_IDEOGRAPH_FIRST
84
+ else if (ucs >= CJK_IDEOGRAPH_FIRST &&
85
+ ucs <= CJK_IDEOGRAPH_LAST)
86
+ return rb_hash_aref(unicode_data,
87
+ INT2FIX(CJK_IDEOGRAPH_FIRST));
88
+ #endif
89
+ #ifdef HANGUL_SYLLABLE_FIRST
90
+ else if (ucs >= HANGUL_SYLLABLE_FIRST &&
91
+ ucs <= HANGUL_SYLLABLE_LAST)
92
+ return rb_hash_aref(unicode_data,
93
+ INT2FIX(HANGUL_SYLLABLE_FIRST));
94
+ #endif
95
+ #ifdef NON_PRIVATE_USE_HIGH_SURROGATE_FIRST
96
+ else if (ucs >= NON_PRIVATE_USE_HIGH_SURROGATE_FIRST &&
97
+ ucs <= NON_PRIVATE_USE_HIGH_SURROGATE_LAST)
98
+ return rb_hash_aref(unicode_data,
99
+ INT2FIX(NON_PRIVATE_USE_HIGH_SURROGATE_FIRST));
100
+ #endif
101
+ #ifdef PRIVATE_USE_HIGH_SURROGATE_FIRST
102
+ else if (ucs >= PRIVATE_USE_HIGH_SURROGATE_FIRST &&
103
+ ucs <= PRIVATE_USE_HIGH_SURROGATE_LAST)
104
+ return rb_hash_aref(unicode_data,
105
+ INT2FIX(PRIVATE_USE_HIGH_SURROGATE_FIRST));
106
+ #endif
107
+ #ifdef LOW_SURROGATE_FIRST
108
+ else if (ucs >= LOW_SURROGATE_FIRST &&
109
+ ucs <= LOW_SURROGATE_LAST)
110
+ return rb_hash_aref(unicode_data,
111
+ INT2FIX(LOW_SURROGATE_FIRST));
112
+ #endif
113
+ #ifdef PRIVATE_USE_FIRST
114
+ else if (ucs >= PRIVATE_USE_FIRST &&
115
+ ucs <= PRIVATE_USE_LAST)
116
+ return rb_hash_aref(unicode_data,
117
+ INT2FIX(PRIVATE_USE_FIRST));
118
+ #endif
119
+ #ifdef CJK_IDEOGRAPH_EXTENSION_B_FIRST
120
+ else if (ucs >= CJK_IDEOGRAPH_EXTENSION_B_FIRST &&
121
+ ucs <= CJK_IDEOGRAPH_EXTENSION_B_LAST)
122
+ return rb_hash_aref(unicode_data,
123
+ INT2FIX(CJK_IDEOGRAPH_EXTENSION_B_FIRST));
124
+ #endif
125
+ #ifdef CJK_IDEOGRAPH_EXTENSION_C_FIRST
126
+ else if (ucs >= CJK_IDEOGRAPH_EXTENSION_C_FIRST &&
127
+ ucs <= CJK_IDEOGRAPH_EXTENSION_C_LAST)
128
+ return rb_hash_aref(unicode_data,
129
+ INT2FIX(CJK_IDEOGRAPH_EXTENSION_C_FIRST));
130
+ #endif
131
+ #ifdef CJK_IDEOGRAPH_EXTENSION_D_FIRST
132
+ else if (ucs >= CJK_IDEOGRAPH_EXTENSION_D_FIRST &&
133
+ ucs <= CJK_IDEOGRAPH_EXTENSION_D_LAST)
134
+ return rb_hash_aref(unicode_data,
135
+ INT2FIX(CJK_IDEOGRAPH_EXTENSION_D_FIRST));
136
+ #endif
137
+ #ifdef PLANE_15_PRIVATE_USE_FIRST
138
+ else if (ucs >= PLANE_15_PRIVATE_USE_FIRST &&
139
+ ucs <= PLANE_15_PRIVATE_USE_LAST)
140
+ return rb_hash_aref(unicode_data,
141
+ INT2FIX(PLANE_15_PRIVATE_USE_FIRST));
142
+ #endif
143
+ #ifdef PLANE_16_PRIVATE_USE_FIRST
144
+ else if (ucs >= PLANE_16_PRIVATE_USE_FIRST &&
145
+ ucs <= PLANE_16_PRIVATE_USE_LAST)
146
+ return rb_hash_aref(unicode_data,
147
+ INT2FIX(PLANE_16_PRIVATE_USE_FIRST));
148
+ #endif
149
+ return Qnil;
150
+ }
151
+
69
152
  static int
70
153
  get_cc(int ucs)
71
154
  {
@@ -77,6 +160,28 @@ get_cc(int ucs)
77
160
  return 0;
78
161
  }
79
162
 
163
+ static int
164
+ get_gencat(int ucs)
165
+ {
166
+ VALUE ch = get_unidata(ucs);
167
+
168
+ if (!NIL_P(ch)) {
169
+ return unidata[FIX2INT(ch)].general_category;
170
+ }
171
+ return c_Cn; /* Unassigned */
172
+ }
173
+
174
+ static int
175
+ get_eawidth(int ucs)
176
+ {
177
+ VALUE ch = get_unidata(ucs);
178
+
179
+ if (!NIL_P(ch)) {
180
+ return unidata[FIX2INT(ch)].east_asian_width;
181
+ }
182
+ return w_N; /* Neutral */
183
+ }
184
+
80
185
  static const char*
81
186
  get_canon(int ucs)
82
187
  {
@@ -538,8 +643,8 @@ unicode_strcmp(VALUE obj, VALUE str1, VALUE str2)
538
643
  CONVERT_TO_UTF8(str1);
539
644
  CONVERT_TO_UTF8(str2);
540
645
  #endif
541
- WStr_allocWithUTF8(&wstr1, RSTRING_PTR(str1));
542
- WStr_allocWithUTF8(&wstr2, RSTRING_PTR(str2));
646
+ WStr_allocWithUTF8L(&wstr1, RSTRING_PTR(str1), RSTRING_LEN(str1));
647
+ WStr_allocWithUTF8L(&wstr2, RSTRING_PTR(str2), RSTRING_LEN(str2));
543
648
  WStr_alloc(&result1);
544
649
  WStr_alloc(&result2);
545
650
  decompose_internal(&wstr1, &result1);
@@ -580,8 +685,8 @@ unicode_strcmp_compat(VALUE obj, VALUE str1, VALUE str2)
580
685
  CONVERT_TO_UTF8(str1);
581
686
  CONVERT_TO_UTF8(str2);
582
687
  #endif
583
- WStr_allocWithUTF8(&wstr1, RSTRING_PTR(str1));
584
- WStr_allocWithUTF8(&wstr2, RSTRING_PTR(str2));
688
+ WStr_allocWithUTF8L(&wstr1, RSTRING_PTR(str1), RSTRING_LEN(str1));
689
+ WStr_allocWithUTF8L(&wstr2, RSTRING_PTR(str2), RSTRING_LEN(str2));
585
690
  WStr_alloc(&result1);
586
691
  WStr_alloc(&result2);
587
692
  decompose_compat_internal(&wstr1, &result1);
@@ -617,7 +722,7 @@ unicode_decompose(VALUE obj, VALUE str)
617
722
  #ifdef HAVE_RUBY_ENCODING_H
618
723
  CONVERT_TO_UTF8(str);
619
724
  #endif
620
- WStr_allocWithUTF8(&ustr, RSTRING_PTR(str));
725
+ WStr_allocWithUTF8L(&ustr, RSTRING_PTR(str), RSTRING_LEN(str));
621
726
  WStr_alloc(&result);
622
727
  decompose_internal(&ustr, &result);
623
728
  WStr_free(&ustr);
@@ -643,7 +748,7 @@ unicode_decompose_safe(VALUE obj, VALUE str)
643
748
  #ifdef HAVE_RUBY_ENCODING_H
644
749
  CONVERT_TO_UTF8(str);
645
750
  #endif
646
- WStr_allocWithUTF8(&ustr, RSTRING_PTR(str));
751
+ WStr_allocWithUTF8L(&ustr, RSTRING_PTR(str), RSTRING_LEN(str));
647
752
  WStr_alloc(&result);
648
753
  decompose_safe_internal(&ustr, &result);
649
754
  WStr_free(&ustr);
@@ -669,7 +774,7 @@ unicode_decompose_compat(VALUE obj, VALUE str)
669
774
  #ifdef HAVE_RUBY_ENCODING_H
670
775
  CONVERT_TO_UTF8(str);
671
776
  #endif
672
- WStr_allocWithUTF8(&ustr, RSTRING_PTR(str));
777
+ WStr_allocWithUTF8L(&ustr, RSTRING_PTR(str), RSTRING_LEN(str));
673
778
  WStr_alloc(&result);
674
779
  decompose_compat_internal(&ustr, &result);
675
780
  WStr_free(&ustr);
@@ -695,7 +800,7 @@ unicode_compose(VALUE obj, VALUE str)
695
800
  #ifdef HAVE_RUBY_ENCODING_H
696
801
  CONVERT_TO_UTF8(str);
697
802
  #endif
698
- WStr_allocWithUTF8(&ustr, RSTRING_PTR(str));
803
+ WStr_allocWithUTF8L(&ustr, RSTRING_PTR(str), RSTRING_LEN(str));
699
804
  sort_canonical(&ustr);
700
805
  WStr_alloc(&result);
701
806
  compose_internal(&ustr, &result);
@@ -722,7 +827,7 @@ unicode_normalize_C(VALUE obj, VALUE str)
722
827
  #ifdef HAVE_RUBY_ENCODING_H
723
828
  CONVERT_TO_UTF8(str);
724
829
  #endif
725
- WStr_allocWithUTF8(&ustr1, RSTRING_PTR(str));
830
+ WStr_allocWithUTF8L(&ustr1, RSTRING_PTR(str), RSTRING_LEN(str));
726
831
  WStr_alloc(&ustr2);
727
832
  decompose_internal(&ustr1, &ustr2);
728
833
  WStr_free(&ustr1);
@@ -752,7 +857,7 @@ unicode_normalize_safe(VALUE obj, VALUE str)
752
857
  #ifdef HAVE_RUBY_ENCODING_H
753
858
  CONVERT_TO_UTF8(str);
754
859
  #endif
755
- WStr_allocWithUTF8(&ustr1, RSTRING_PTR(str));
860
+ WStr_allocWithUTF8L(&ustr1, RSTRING_PTR(str), RSTRING_LEN(str));
756
861
  WStr_alloc(&ustr2);
757
862
  decompose_safe_internal(&ustr1, &ustr2);
758
863
  WStr_free(&ustr1);
@@ -782,7 +887,7 @@ unicode_normalize_KC(VALUE obj, VALUE str)
782
887
  #ifdef HAVE_RUBY_ENCODING_H
783
888
  CONVERT_TO_UTF8(str);
784
889
  #endif
785
- WStr_allocWithUTF8(&ustr1, RSTRING_PTR(str));
890
+ WStr_allocWithUTF8L(&ustr1, RSTRING_PTR(str), RSTRING_LEN(str));
786
891
  WStr_alloc(&ustr2);
787
892
  decompose_compat_internal(&ustr1, &ustr2);
788
893
  WStr_free(&ustr1);
@@ -811,7 +916,7 @@ unicode_upcase(VALUE obj, VALUE str)
811
916
  #ifdef HAVE_RUBY_ENCODING_H
812
917
  CONVERT_TO_UTF8(str);
813
918
  #endif
814
- WStr_allocWithUTF8(&ustr, RSTRING_PTR(str));
919
+ WStr_allocWithUTF8L(&ustr, RSTRING_PTR(str), RSTRING_LEN(str));
815
920
  WStr_alloc(&result);
816
921
  upcase_internal(&ustr, &result);
817
922
  //sort_canonical(&result);
@@ -837,7 +942,7 @@ unicode_downcase(VALUE obj, VALUE str)
837
942
  #ifdef HAVE_RUBY_ENCODING_H
838
943
  CONVERT_TO_UTF8(str);
839
944
  #endif
840
- WStr_allocWithUTF8(&ustr, RSTRING_PTR(str));
945
+ WStr_allocWithUTF8L(&ustr, RSTRING_PTR(str), RSTRING_LEN(str));
841
946
  WStr_alloc(&result);
842
947
  downcase_internal(&ustr, &result);
843
948
  //sort_canonical(&result);
@@ -868,7 +973,7 @@ unicode_capitalize(VALUE obj, VALUE str)
868
973
  #ifdef HAVE_RUBY_ENCODING_H
869
974
  CONVERT_TO_UTF8(str);
870
975
  #endif
871
- WStr_allocWithUTF8(&ustr, RSTRING_PTR(str));
976
+ WStr_allocWithUTF8L(&ustr, RSTRING_PTR(str), RSTRING_LEN(str));
872
977
  WStr_alloc(&result);
873
978
  capitalize_internal(&ustr, &result);
874
979
  //sort_canonical(&result);
@@ -882,6 +987,248 @@ unicode_capitalize(VALUE obj, VALUE str)
882
987
  return vret;
883
988
  }
884
989
 
990
+ typedef struct _get_categories_param {
991
+ WString* wstr;
992
+ VALUE str;
993
+ VALUE* catname;
994
+ } get_categories_param;
995
+
996
+ static VALUE
997
+ get_categories_internal(get_categories_param* param)
998
+ {
999
+ WString* wstr = param->wstr;
1000
+ VALUE str = param->str;
1001
+ VALUE* catname = param->catname;
1002
+ int pos;
1003
+ int block_p = rb_block_given_p();
1004
+ volatile VALUE ret = str;
1005
+
1006
+ if (!block_p)
1007
+ ret = rb_ary_new();
1008
+ for (pos = 0; pos < wstr->len; pos++) {
1009
+ int gencat = get_gencat(wstr->str[pos]);
1010
+ if (!block_p)
1011
+ rb_ary_push(ret, catname[gencat]);
1012
+ else {
1013
+ rb_yield(catname[gencat]);
1014
+ }
1015
+ }
1016
+
1017
+ return ret;
1018
+ }
1019
+
1020
+ VALUE
1021
+ get_categories_ensure(WString* wstr)
1022
+ {
1023
+ WStr_free(wstr);
1024
+ return Qnil;
1025
+ }
1026
+
1027
+ VALUE
1028
+ unicode_get_categories(VALUE obj, VALUE str)
1029
+ {
1030
+ WString wstr;
1031
+ get_categories_param param = { &wstr, str, catname_long };
1032
+
1033
+ Check_Type(str, T_STRING);
1034
+ #ifdef HAVE_RUBY_ENCODING_H
1035
+ CONVERT_TO_UTF8(str);
1036
+ #endif
1037
+ WStr_allocWithUTF8L(&wstr, RSTRING_PTR(str), RSTRING_LEN(str));
1038
+
1039
+ return rb_ensure(get_categories_internal, (VALUE)&param,
1040
+ get_categories_ensure, (VALUE)&wstr);
1041
+ /* wstr will be freed in get_categories_ensure() */
1042
+ }
1043
+
1044
+
1045
+ VALUE
1046
+ unicode_get_abbr_categories(VALUE obj, VALUE str)
1047
+ {
1048
+ WString wstr;
1049
+ get_categories_param param = { &wstr, str, catname_abbr };
1050
+
1051
+ Check_Type(str, T_STRING);
1052
+ #ifdef HAVE_RUBY_ENCODING_H
1053
+ CONVERT_TO_UTF8(str);
1054
+ #endif
1055
+ WStr_allocWithUTF8L(&wstr, RSTRING_PTR(str), RSTRING_LEN(str));
1056
+
1057
+ return rb_ensure(get_categories_internal, (VALUE)&param,
1058
+ get_categories_ensure, (VALUE)&wstr);
1059
+ /* wstr will be freed in get_categories_ensure() */
1060
+ }
1061
+
1062
+ VALUE
1063
+ unicode_wcswidth(int argc, VALUE* argv, VALUE obj)
1064
+ {
1065
+ WString wstr;
1066
+ int i, count;
1067
+ int width = 0;
1068
+ int cjk_p = 0;
1069
+ VALUE str;
1070
+ VALUE cjk;
1071
+
1072
+ count = rb_scan_args(argc, argv, "11", &str, &cjk);
1073
+ if (count > 1)
1074
+ cjk_p = RTEST(cjk);
1075
+ Check_Type(str, T_STRING);
1076
+ #ifdef HAVE_RUBY_ENCODING_H
1077
+ CONVERT_TO_UTF8(str);
1078
+ #endif
1079
+ WStr_allocWithUTF8L(&wstr, RSTRING_PTR(str), RSTRING_LEN(str));
1080
+ for (i = 0; i <wstr.len; i++) {
1081
+ int c = wstr.str[i];
1082
+ int cat = get_gencat(c);
1083
+ int eaw = get_eawidth(c);
1084
+ if ((c > 0 && c < 32) || (c >= 0x7f && c < 0xa0)) {
1085
+ /* Control Characters */
1086
+ width = -1;
1087
+ break;
1088
+ }
1089
+ else if (c != 0x00ad && /* SOFT HYPHEN */
1090
+ (cat == c_Mn || cat == c_Me || /* Non-spacing Marks */
1091
+ cat == c_Cf || /* Format */
1092
+ c == 0 || /* NUL */
1093
+ (c >= 0x1160 && c <= 0x11ff))) /* HANGUL JUNGSEONG/JONGSEONG */
1094
+ /* zero width */ ;
1095
+ else if (eaw == w_F || eaw == w_W || /* Fullwidth or Wide */
1096
+ (c >= 0x4db6 && c <= 0x4dbf) || /* CJK Reserved */
1097
+ (c >= 0x9fcd && c <= 0x9fff) || /* CJK Reserved */
1098
+ (c >= 0xfa6e && c <= 0xfa6f) || /* CJK Reserved */
1099
+ (c >= 0xfada && c <= 0xfaff) || /* CJK Reserved */
1100
+ (c >= 0x2a6d7 && c <= 0x2a6ff) || /* CJK Reserved */
1101
+ (c >= 0x2b735 && c <= 0x2b73f) || /* CJK Reserved */
1102
+ (c >= 0x2b81e && c <= 0x2f7ff) || /* CJK Reserved */
1103
+ (c >= 0x2fa1e && c <= 0x2fffd) || /* CJK Reserved */
1104
+ (c >= 0x30000 && c <= 0x3fffd) || /* CJK Reserved */
1105
+ (cjk_p && eaw == w_A)) /* East Asian Ambiguous */
1106
+ width += 2;
1107
+ else
1108
+ width++; /* Halfwidth or Neutral */
1109
+ }
1110
+ WStr_free(&wstr);
1111
+
1112
+ return INT2FIX(width);
1113
+ }
1114
+
1115
+ VALUE
1116
+ wstring_to_rstring(WString* wstr, int start, int len) {
1117
+ UString ret;
1118
+ volatile VALUE vret;
1119
+
1120
+ UniStr_alloc(&ret);
1121
+ WStr_convertIntoUString2(wstr, start, len, &ret);
1122
+ vret = ENC_(rb_str_new((char*)ret.str, ret.len));
1123
+ UniStr_free(&ret);
1124
+
1125
+ return vret;
1126
+ }
1127
+
1128
+ typedef struct _get_text_elements_param {
1129
+ WString* wstr;
1130
+ VALUE str;
1131
+ } get_text_elements_param;
1132
+
1133
+ VALUE
1134
+ get_text_elements_internal(get_text_elements_param* param)
1135
+ {
1136
+ WString* wstr = param->wstr;
1137
+ VALUE str = param->str;
1138
+ int start_pos;
1139
+ int block_p = rb_block_given_p();
1140
+ volatile VALUE ret = str;
1141
+
1142
+ if (!block_p)
1143
+ ret = rb_ary_new();
1144
+ for (start_pos = 0; start_pos < wstr->len;) {
1145
+ int c0 = wstr->str[start_pos];
1146
+ int cat = get_gencat(c0);
1147
+ int length = 1;
1148
+ int j;
1149
+
1150
+ if (cat == c_Mn || cat == c_Mc || cat == c_Me) {
1151
+ volatile VALUE rstr = TO_(str, wstring_to_rstring(wstr, start_pos, length));
1152
+ if (!block_p)
1153
+ rb_ary_push(ret, rstr);
1154
+ else
1155
+ rb_yield(rstr);
1156
+ start_pos++;
1157
+ continue;
1158
+ }
1159
+
1160
+ for (j = start_pos + 1; j < wstr->len; j++) {
1161
+ int c1 = wstr->str[j];
1162
+ int cat = get_gencat(c1);
1163
+ if (c0 >= LBASE && c0 < LBASE + LCOUNT &&
1164
+ j + 1 < wstr->len &&
1165
+ c1 >= VBASE && c1 < VBASE + VCOUNT &&
1166
+ wstr->str[j+1] >= TBASE && wstr->str[j+1] < TBASE + TCOUNT) {
1167
+ /* Hangul L+V+T */
1168
+ length += 2;
1169
+ j++;
1170
+ }
1171
+ else if (c0 >= LBASE && c0 < LBASE + LCOUNT &&
1172
+ c1 >= VBASE && c1< VBASE + VCOUNT) {
1173
+ /* Hangul L+V */
1174
+ length++;
1175
+ }
1176
+ else if (c0 >= SBASE && c0 < SBASE + SCOUNT &&
1177
+ (c0 - SBASE) % TCOUNT == 0 &&
1178
+ c1 >= TBASE && c1 < TBASE + TCOUNT) {
1179
+ /* Hangul LV+T */
1180
+ length++;
1181
+ }
1182
+ else if (cat == c_Mn || cat == c_Mc || cat == c_Me) {
1183
+ /* Mark */
1184
+ length++;
1185
+ }
1186
+ else {
1187
+ volatile VALUE rstr = TO_(str, wstring_to_rstring(wstr, start_pos, length));
1188
+ if (!block_p)
1189
+ rb_ary_push(ret, rstr);
1190
+ else
1191
+ rb_yield(rstr);
1192
+ length = 0;
1193
+ break;
1194
+ }
1195
+ }
1196
+ if (length > 0) {
1197
+ volatile VALUE rstr = TO_(str, wstring_to_rstring(wstr, start_pos, length));
1198
+ if (!block_p)
1199
+ rb_ary_push(ret, rstr);
1200
+ else
1201
+ rb_yield(rstr);
1202
+ }
1203
+ start_pos = j;
1204
+ }
1205
+ return ret;
1206
+ }
1207
+
1208
+ VALUE
1209
+ get_text_elements_ensure(WString* wstr)
1210
+ {
1211
+ WStr_free(wstr);
1212
+ return Qnil;
1213
+ }
1214
+
1215
+ VALUE
1216
+ unicode_get_text_elements(VALUE obj, VALUE str)
1217
+ {
1218
+ WString wstr;
1219
+ get_text_elements_param param = { &wstr, str };
1220
+
1221
+ Check_Type(str, T_STRING);
1222
+ #ifdef HAVE_RUBY_ENCODING_H
1223
+ CONVERT_TO_UTF8(str);
1224
+ #endif
1225
+ WStr_allocWithUTF8L(&wstr, RSTRING_PTR(str), RSTRING_LEN(str));
1226
+
1227
+ return rb_ensure(get_text_elements_internal, (VALUE)&param,
1228
+ get_text_elements_ensure, (VALUE)&wstr);
1229
+ /* wstr will be freed in get_text_elements_ensure() */
1230
+ }
1231
+
885
1232
  void
886
1233
  Init_unicode_native()
887
1234
  {
@@ -909,6 +1256,13 @@ Init_unicode_native()
909
1256
  }
910
1257
  }
911
1258
 
1259
+ for (i = 0; i < c_Cn + 1; i++) {
1260
+ catname_abbr[i] = ID2SYM(rb_intern(gencat_abbr[i]));
1261
+ catname_long[i] = ID2SYM(rb_intern(gencat_long[i]));
1262
+ rb_global_variable(&catname_abbr[i]);
1263
+ rb_global_variable(&catname_long[i]);
1264
+ }
1265
+
912
1266
  rb_define_module_function(mUnicode, "strcmp",
913
1267
  unicode_strcmp, 2);
914
1268
  rb_define_module_function(mUnicode, "strcmp_compat",
@@ -957,6 +1311,15 @@ Init_unicode_native()
957
1311
  rb_define_module_function(mUnicode, "capitalize",
958
1312
  unicode_capitalize, 1);
959
1313
 
1314
+ rb_define_module_function(mUnicode, "categories",
1315
+ unicode_get_categories, 1);
1316
+ rb_define_module_function(mUnicode, "abbr_categories",
1317
+ unicode_get_abbr_categories, 1);
1318
+ rb_define_module_function(mUnicode, "width",
1319
+ unicode_wcswidth, -1);
1320
+ rb_define_module_function(mUnicode, "text_elements",
1321
+ unicode_get_text_elements, 1);
1322
+
960
1323
  rb_define_const(mUnicode, "VERSION",
961
1324
  rb_str_new2(UNICODE_VERSION));
962
1325
  }