unicode 0.4.2-x86-mswin32-60 → 0.4.3-x86-mswin32-60
Sign up to get free protection for your applications and to get access to all the features.
- data/README +29 -7
- data/ext/unicode/unicode.c +379 -16
- data/ext/unicode/unidata.map +24536 -24435
- data/ext/unicode/wstring.c +69 -1
- data/ext/unicode/wstring.h +2 -0
- data/lib/unicode/1.8/unicode_native.so +0 -0
- data/lib/unicode/1.9/unicode_native.so +0 -0
- data/tools/README +3 -2
- data/tools/mkunidata.rb +136 -12
- data/unicode.gemspec +2 -2
- metadata +5 -5
data/README
CHANGED
@@ -1,5 +1,5 @@
|
|
1
1
|
Unicode Library for Ruby
|
2
|
-
Version 0.4.
|
2
|
+
Version 0.4.3
|
3
3
|
|
4
4
|
Yoshida Masato
|
5
5
|
|
@@ -7,14 +7,14 @@
|
|
7
7
|
- Introduction
|
8
8
|
|
9
9
|
Unicode string manipulation library for Ruby.
|
10
|
-
This library is based on
|
10
|
+
This library is based on UAX #15 Unicode Normalization Forms(*1).
|
11
11
|
|
12
12
|
*1 <URL:http://www.unicode.org/unicode/reports/tr15/>
|
13
13
|
|
14
14
|
|
15
15
|
- Install
|
16
16
|
|
17
|
-
This can work with ruby-1.8 or later. I recommend you to
|
17
|
+
This works with ruby-1.8.7 or later. I recommend that you
|
18
18
|
use ruby-1.9.3 or later.
|
19
19
|
|
20
20
|
Make and install usually.
|
@@ -79,7 +79,7 @@
|
|
79
79
|
These are aliases of decompose/decompose_compat.
|
80
80
|
|
81
81
|
Unicode::normalize_D_safe(str) (Unicode::nfd_safe(str))
|
82
|
-
This is an
|
82
|
+
This is an alias of decompose_safe.
|
83
83
|
|
84
84
|
Unicode::normalize_C(str) (Unicode::nfc(str))
|
85
85
|
Unicode::normalize_KC(str) (Unicode::nfkc(str))
|
@@ -98,14 +98,35 @@
|
|
98
98
|
The mappings that are used by these functions are not normative
|
99
99
|
in UnicodeData.txt.
|
100
100
|
|
101
|
+
Unicode::categories(str)
|
102
|
+
Unicode::abbr_categories(str)
|
103
|
+
Get an array of general category names of the string.
|
104
|
+
abbr_categories returns abbreviated names.
|
105
|
+
These can be called with a block.
|
106
|
+
|
107
|
+
Unicode.categories(str) do |category| p category end
|
108
|
+
|
109
|
+
Unicode::text_elements(str)
|
110
|
+
Get an array of text elements.
|
111
|
+
A text element is a unit that is displayed as a single character.
|
112
|
+
This can be called with a block.
|
113
|
+
|
114
|
+
Unicode::width(str[, cjk])
|
115
|
+
Estimate the display width on a fixed-pitch text terminal.
|
116
|
+
It is based on Markus Kuhn's mk_wcwidth.
|
117
|
+
If the optional argument 'cjk' is true, East Asian
|
118
|
+
Ambiguous characters are treated as wide characters.
|
119
|
+
|
120
|
+
Unicode.width("\u03b1") #=> 1
|
121
|
+
Unicode.width("\u03b1", true) #=> 2
|
122
|
+
|
123
|
+
|
101
124
|
- Bugs
|
102
125
|
|
103
|
-
|
126
|
+
UAX #15 suggests that the look up for Normalization Form C
|
104
127
|
should not be implemented with a hash of string for better
|
105
128
|
performance.
|
106
129
|
|
107
|
-
Case conversion functions should reflecte UTR #21.
|
108
|
-
|
109
130
|
|
110
131
|
- Copying
|
111
132
|
|
@@ -123,6 +144,7 @@
|
|
123
144
|
|
124
145
|
- History
|
125
146
|
|
147
|
+
Aug 8, 2012 version 0.4.3 add categories, text_elements and width
|
126
148
|
Feb 29, 2012 version 0.4.2 add decompose_safe
|
127
149
|
Feb 3, 2012 version 0.4.1 update unidata.map for Unicode 6.1
|
128
150
|
Oct 14, 2010 version 0.4.0 fix the composition algorithm, and support Unicode 6.0
|
data/ext/unicode/unicode.c
CHANGED
@@ -1,5 +1,6 @@
|
|
1
1
|
/*
|
2
|
-
* Unicode Library version 0.4
|
2
|
+
* Unicode Library version 0.4.3
|
3
|
+
* Aug 8, 2012: version 0.4.3
|
3
4
|
* Oct 14, 2010: version 0.4
|
4
5
|
* Feb 26, 2010: version 0.3
|
5
6
|
* Dec 29, 2009: version 0.2
|
@@ -7,7 +8,7 @@
|
|
7
8
|
*
|
8
9
|
*/
|
9
10
|
|
10
|
-
#define UNICODE_VERSION "0.4.
|
11
|
+
#define UNICODE_VERSION "0.4.3"
|
11
12
|
|
12
13
|
#include "ruby.h"
|
13
14
|
#ifdef HAVE_RUBY_IO_H
|
@@ -54,6 +55,8 @@ taintObject(VALUE src, VALUE obj) {
|
|
54
55
|
static VALUE mUnicode;
|
55
56
|
static VALUE unicode_data;
|
56
57
|
static VALUE composition_table;
|
58
|
+
static VALUE catname_long[c_Cn+1];
|
59
|
+
static VALUE catname_abbr[c_Cn+1];
|
57
60
|
|
58
61
|
/* Hangul */
|
59
62
|
#define SBASE (0xac00)
|
@@ -66,6 +69,86 @@ static VALUE composition_table;
|
|
66
69
|
#define NCOUNT (VCOUNT * TCOUNT) /* 588 */
|
67
70
|
#define SCOUNT (LCOUNT * NCOUNT) /* 11172 */
|
68
71
|
|
72
|
+
VALUE
|
73
|
+
get_unidata(int ucs) {
|
74
|
+
VALUE ch = rb_hash_aref(unicode_data, INT2FIX(ucs));
|
75
|
+
if (!NIL_P(ch))
|
76
|
+
return ch;
|
77
|
+
#ifdef CJK_IDEOGRAPH_EXTENSION_A_FIRST
|
78
|
+
else if (ucs >= CJK_IDEOGRAPH_EXTENSION_A_FIRST &&
|
79
|
+
ucs <= CJK_IDEOGRAPH_EXTENSION_A_LAST)
|
80
|
+
return rb_hash_aref(unicode_data,
|
81
|
+
INT2FIX(CJK_IDEOGRAPH_EXTENSION_A_FIRST));
|
82
|
+
#endif
|
83
|
+
#ifdef CJK_IDEOGRAPH_FIRST
|
84
|
+
else if (ucs >= CJK_IDEOGRAPH_FIRST &&
|
85
|
+
ucs <= CJK_IDEOGRAPH_LAST)
|
86
|
+
return rb_hash_aref(unicode_data,
|
87
|
+
INT2FIX(CJK_IDEOGRAPH_FIRST));
|
88
|
+
#endif
|
89
|
+
#ifdef HANGUL_SYLLABLE_FIRST
|
90
|
+
else if (ucs >= HANGUL_SYLLABLE_FIRST &&
|
91
|
+
ucs <= HANGUL_SYLLABLE_LAST)
|
92
|
+
return rb_hash_aref(unicode_data,
|
93
|
+
INT2FIX(HANGUL_SYLLABLE_FIRST));
|
94
|
+
#endif
|
95
|
+
#ifdef NON_PRIVATE_USE_HIGH_SURROGATE_FIRST
|
96
|
+
else if (ucs >= NON_PRIVATE_USE_HIGH_SURROGATE_FIRST &&
|
97
|
+
ucs <= NON_PRIVATE_USE_HIGH_SURROGATE_LAST)
|
98
|
+
return rb_hash_aref(unicode_data,
|
99
|
+
INT2FIX(NON_PRIVATE_USE_HIGH_SURROGATE_FIRST));
|
100
|
+
#endif
|
101
|
+
#ifdef PRIVATE_USE_HIGH_SURROGATE_FIRST
|
102
|
+
else if (ucs >= PRIVATE_USE_HIGH_SURROGATE_FIRST &&
|
103
|
+
ucs <= PRIVATE_USE_HIGH_SURROGATE_LAST)
|
104
|
+
return rb_hash_aref(unicode_data,
|
105
|
+
INT2FIX(PRIVATE_USE_HIGH_SURROGATE_FIRST));
|
106
|
+
#endif
|
107
|
+
#ifdef LOW_SURROGATE_FIRST
|
108
|
+
else if (ucs >= LOW_SURROGATE_FIRST &&
|
109
|
+
ucs <= LOW_SURROGATE_LAST)
|
110
|
+
return rb_hash_aref(unicode_data,
|
111
|
+
INT2FIX(LOW_SURROGATE_FIRST));
|
112
|
+
#endif
|
113
|
+
#ifdef PRIVATE_USE_FIRST
|
114
|
+
else if (ucs >= PRIVATE_USE_FIRST &&
|
115
|
+
ucs <= PRIVATE_USE_LAST)
|
116
|
+
return rb_hash_aref(unicode_data,
|
117
|
+
INT2FIX(PRIVATE_USE_FIRST));
|
118
|
+
#endif
|
119
|
+
#ifdef CJK_IDEOGRAPH_EXTENSION_B_FIRST
|
120
|
+
else if (ucs >= CJK_IDEOGRAPH_EXTENSION_B_FIRST &&
|
121
|
+
ucs <= CJK_IDEOGRAPH_EXTENSION_B_LAST)
|
122
|
+
return rb_hash_aref(unicode_data,
|
123
|
+
INT2FIX(CJK_IDEOGRAPH_EXTENSION_B_FIRST));
|
124
|
+
#endif
|
125
|
+
#ifdef CJK_IDEOGRAPH_EXTENSION_C_FIRST
|
126
|
+
else if (ucs >= CJK_IDEOGRAPH_EXTENSION_C_FIRST &&
|
127
|
+
ucs <= CJK_IDEOGRAPH_EXTENSION_C_LAST)
|
128
|
+
return rb_hash_aref(unicode_data,
|
129
|
+
INT2FIX(CJK_IDEOGRAPH_EXTENSION_C_FIRST));
|
130
|
+
#endif
|
131
|
+
#ifdef CJK_IDEOGRAPH_EXTENSION_D_FIRST
|
132
|
+
else if (ucs >= CJK_IDEOGRAPH_EXTENSION_D_FIRST &&
|
133
|
+
ucs <= CJK_IDEOGRAPH_EXTENSION_D_LAST)
|
134
|
+
return rb_hash_aref(unicode_data,
|
135
|
+
INT2FIX(CJK_IDEOGRAPH_EXTENSION_D_FIRST));
|
136
|
+
#endif
|
137
|
+
#ifdef PLANE_15_PRIVATE_USE_FIRST
|
138
|
+
else if (ucs >= PLANE_15_PRIVATE_USE_FIRST &&
|
139
|
+
ucs <= PLANE_15_PRIVATE_USE_LAST)
|
140
|
+
return rb_hash_aref(unicode_data,
|
141
|
+
INT2FIX(PLANE_15_PRIVATE_USE_FIRST));
|
142
|
+
#endif
|
143
|
+
#ifdef PLANE_16_PRIVATE_USE_FIRST
|
144
|
+
else if (ucs >= PLANE_16_PRIVATE_USE_FIRST &&
|
145
|
+
ucs <= PLANE_16_PRIVATE_USE_LAST)
|
146
|
+
return rb_hash_aref(unicode_data,
|
147
|
+
INT2FIX(PLANE_16_PRIVATE_USE_FIRST));
|
148
|
+
#endif
|
149
|
+
return Qnil;
|
150
|
+
}
|
151
|
+
|
69
152
|
static int
|
70
153
|
get_cc(int ucs)
|
71
154
|
{
|
@@ -77,6 +160,28 @@ get_cc(int ucs)
|
|
77
160
|
return 0;
|
78
161
|
}
|
79
162
|
|
163
|
+
static int
|
164
|
+
get_gencat(int ucs)
|
165
|
+
{
|
166
|
+
VALUE ch = get_unidata(ucs);
|
167
|
+
|
168
|
+
if (!NIL_P(ch)) {
|
169
|
+
return unidata[FIX2INT(ch)].general_category;
|
170
|
+
}
|
171
|
+
return c_Cn; /* Unassigned */
|
172
|
+
}
|
173
|
+
|
174
|
+
static int
|
175
|
+
get_eawidth(int ucs)
|
176
|
+
{
|
177
|
+
VALUE ch = get_unidata(ucs);
|
178
|
+
|
179
|
+
if (!NIL_P(ch)) {
|
180
|
+
return unidata[FIX2INT(ch)].east_asian_width;
|
181
|
+
}
|
182
|
+
return w_N; /* Neutral */
|
183
|
+
}
|
184
|
+
|
80
185
|
static const char*
|
81
186
|
get_canon(int ucs)
|
82
187
|
{
|
@@ -538,8 +643,8 @@ unicode_strcmp(VALUE obj, VALUE str1, VALUE str2)
|
|
538
643
|
CONVERT_TO_UTF8(str1);
|
539
644
|
CONVERT_TO_UTF8(str2);
|
540
645
|
#endif
|
541
|
-
|
542
|
-
|
646
|
+
WStr_allocWithUTF8L(&wstr1, RSTRING_PTR(str1), RSTRING_LEN(str1));
|
647
|
+
WStr_allocWithUTF8L(&wstr2, RSTRING_PTR(str2), RSTRING_LEN(str2));
|
543
648
|
WStr_alloc(&result1);
|
544
649
|
WStr_alloc(&result2);
|
545
650
|
decompose_internal(&wstr1, &result1);
|
@@ -580,8 +685,8 @@ unicode_strcmp_compat(VALUE obj, VALUE str1, VALUE str2)
|
|
580
685
|
CONVERT_TO_UTF8(str1);
|
581
686
|
CONVERT_TO_UTF8(str2);
|
582
687
|
#endif
|
583
|
-
|
584
|
-
|
688
|
+
WStr_allocWithUTF8L(&wstr1, RSTRING_PTR(str1), RSTRING_LEN(str1));
|
689
|
+
WStr_allocWithUTF8L(&wstr2, RSTRING_PTR(str2), RSTRING_LEN(str2));
|
585
690
|
WStr_alloc(&result1);
|
586
691
|
WStr_alloc(&result2);
|
587
692
|
decompose_compat_internal(&wstr1, &result1);
|
@@ -617,7 +722,7 @@ unicode_decompose(VALUE obj, VALUE str)
|
|
617
722
|
#ifdef HAVE_RUBY_ENCODING_H
|
618
723
|
CONVERT_TO_UTF8(str);
|
619
724
|
#endif
|
620
|
-
|
725
|
+
WStr_allocWithUTF8L(&ustr, RSTRING_PTR(str), RSTRING_LEN(str));
|
621
726
|
WStr_alloc(&result);
|
622
727
|
decompose_internal(&ustr, &result);
|
623
728
|
WStr_free(&ustr);
|
@@ -643,7 +748,7 @@ unicode_decompose_safe(VALUE obj, VALUE str)
|
|
643
748
|
#ifdef HAVE_RUBY_ENCODING_H
|
644
749
|
CONVERT_TO_UTF8(str);
|
645
750
|
#endif
|
646
|
-
|
751
|
+
WStr_allocWithUTF8L(&ustr, RSTRING_PTR(str), RSTRING_LEN(str));
|
647
752
|
WStr_alloc(&result);
|
648
753
|
decompose_safe_internal(&ustr, &result);
|
649
754
|
WStr_free(&ustr);
|
@@ -669,7 +774,7 @@ unicode_decompose_compat(VALUE obj, VALUE str)
|
|
669
774
|
#ifdef HAVE_RUBY_ENCODING_H
|
670
775
|
CONVERT_TO_UTF8(str);
|
671
776
|
#endif
|
672
|
-
|
777
|
+
WStr_allocWithUTF8L(&ustr, RSTRING_PTR(str), RSTRING_LEN(str));
|
673
778
|
WStr_alloc(&result);
|
674
779
|
decompose_compat_internal(&ustr, &result);
|
675
780
|
WStr_free(&ustr);
|
@@ -695,7 +800,7 @@ unicode_compose(VALUE obj, VALUE str)
|
|
695
800
|
#ifdef HAVE_RUBY_ENCODING_H
|
696
801
|
CONVERT_TO_UTF8(str);
|
697
802
|
#endif
|
698
|
-
|
803
|
+
WStr_allocWithUTF8L(&ustr, RSTRING_PTR(str), RSTRING_LEN(str));
|
699
804
|
sort_canonical(&ustr);
|
700
805
|
WStr_alloc(&result);
|
701
806
|
compose_internal(&ustr, &result);
|
@@ -722,7 +827,7 @@ unicode_normalize_C(VALUE obj, VALUE str)
|
|
722
827
|
#ifdef HAVE_RUBY_ENCODING_H
|
723
828
|
CONVERT_TO_UTF8(str);
|
724
829
|
#endif
|
725
|
-
|
830
|
+
WStr_allocWithUTF8L(&ustr1, RSTRING_PTR(str), RSTRING_LEN(str));
|
726
831
|
WStr_alloc(&ustr2);
|
727
832
|
decompose_internal(&ustr1, &ustr2);
|
728
833
|
WStr_free(&ustr1);
|
@@ -752,7 +857,7 @@ unicode_normalize_safe(VALUE obj, VALUE str)
|
|
752
857
|
#ifdef HAVE_RUBY_ENCODING_H
|
753
858
|
CONVERT_TO_UTF8(str);
|
754
859
|
#endif
|
755
|
-
|
860
|
+
WStr_allocWithUTF8L(&ustr1, RSTRING_PTR(str), RSTRING_LEN(str));
|
756
861
|
WStr_alloc(&ustr2);
|
757
862
|
decompose_safe_internal(&ustr1, &ustr2);
|
758
863
|
WStr_free(&ustr1);
|
@@ -782,7 +887,7 @@ unicode_normalize_KC(VALUE obj, VALUE str)
|
|
782
887
|
#ifdef HAVE_RUBY_ENCODING_H
|
783
888
|
CONVERT_TO_UTF8(str);
|
784
889
|
#endif
|
785
|
-
|
890
|
+
WStr_allocWithUTF8L(&ustr1, RSTRING_PTR(str), RSTRING_LEN(str));
|
786
891
|
WStr_alloc(&ustr2);
|
787
892
|
decompose_compat_internal(&ustr1, &ustr2);
|
788
893
|
WStr_free(&ustr1);
|
@@ -811,7 +916,7 @@ unicode_upcase(VALUE obj, VALUE str)
|
|
811
916
|
#ifdef HAVE_RUBY_ENCODING_H
|
812
917
|
CONVERT_TO_UTF8(str);
|
813
918
|
#endif
|
814
|
-
|
919
|
+
WStr_allocWithUTF8L(&ustr, RSTRING_PTR(str), RSTRING_LEN(str));
|
815
920
|
WStr_alloc(&result);
|
816
921
|
upcase_internal(&ustr, &result);
|
817
922
|
//sort_canonical(&result);
|
@@ -837,7 +942,7 @@ unicode_downcase(VALUE obj, VALUE str)
|
|
837
942
|
#ifdef HAVE_RUBY_ENCODING_H
|
838
943
|
CONVERT_TO_UTF8(str);
|
839
944
|
#endif
|
840
|
-
|
945
|
+
WStr_allocWithUTF8L(&ustr, RSTRING_PTR(str), RSTRING_LEN(str));
|
841
946
|
WStr_alloc(&result);
|
842
947
|
downcase_internal(&ustr, &result);
|
843
948
|
//sort_canonical(&result);
|
@@ -868,7 +973,7 @@ unicode_capitalize(VALUE obj, VALUE str)
|
|
868
973
|
#ifdef HAVE_RUBY_ENCODING_H
|
869
974
|
CONVERT_TO_UTF8(str);
|
870
975
|
#endif
|
871
|
-
|
976
|
+
WStr_allocWithUTF8L(&ustr, RSTRING_PTR(str), RSTRING_LEN(str));
|
872
977
|
WStr_alloc(&result);
|
873
978
|
capitalize_internal(&ustr, &result);
|
874
979
|
//sort_canonical(&result);
|
@@ -882,6 +987,248 @@ unicode_capitalize(VALUE obj, VALUE str)
|
|
882
987
|
return vret;
|
883
988
|
}
|
884
989
|
|
990
|
+
typedef struct _get_categories_param {
|
991
|
+
WString* wstr;
|
992
|
+
VALUE str;
|
993
|
+
VALUE* catname;
|
994
|
+
} get_categories_param;
|
995
|
+
|
996
|
+
static VALUE
|
997
|
+
get_categories_internal(get_categories_param* param)
|
998
|
+
{
|
999
|
+
WString* wstr = param->wstr;
|
1000
|
+
VALUE str = param->str;
|
1001
|
+
VALUE* catname = param->catname;
|
1002
|
+
int pos;
|
1003
|
+
int block_p = rb_block_given_p();
|
1004
|
+
volatile VALUE ret = str;
|
1005
|
+
|
1006
|
+
if (!block_p)
|
1007
|
+
ret = rb_ary_new();
|
1008
|
+
for (pos = 0; pos < wstr->len; pos++) {
|
1009
|
+
int gencat = get_gencat(wstr->str[pos]);
|
1010
|
+
if (!block_p)
|
1011
|
+
rb_ary_push(ret, catname[gencat]);
|
1012
|
+
else {
|
1013
|
+
rb_yield(catname[gencat]);
|
1014
|
+
}
|
1015
|
+
}
|
1016
|
+
|
1017
|
+
return ret;
|
1018
|
+
}
|
1019
|
+
|
1020
|
+
VALUE
|
1021
|
+
get_categories_ensure(WString* wstr)
|
1022
|
+
{
|
1023
|
+
WStr_free(wstr);
|
1024
|
+
return Qnil;
|
1025
|
+
}
|
1026
|
+
|
1027
|
+
VALUE
|
1028
|
+
unicode_get_categories(VALUE obj, VALUE str)
|
1029
|
+
{
|
1030
|
+
WString wstr;
|
1031
|
+
get_categories_param param = { &wstr, str, catname_long };
|
1032
|
+
|
1033
|
+
Check_Type(str, T_STRING);
|
1034
|
+
#ifdef HAVE_RUBY_ENCODING_H
|
1035
|
+
CONVERT_TO_UTF8(str);
|
1036
|
+
#endif
|
1037
|
+
WStr_allocWithUTF8L(&wstr, RSTRING_PTR(str), RSTRING_LEN(str));
|
1038
|
+
|
1039
|
+
return rb_ensure(get_categories_internal, (VALUE)¶m,
|
1040
|
+
get_categories_ensure, (VALUE)&wstr);
|
1041
|
+
/* wstr will be freed in get_categories_ensure() */
|
1042
|
+
}
|
1043
|
+
|
1044
|
+
|
1045
|
+
VALUE
|
1046
|
+
unicode_get_abbr_categories(VALUE obj, VALUE str)
|
1047
|
+
{
|
1048
|
+
WString wstr;
|
1049
|
+
get_categories_param param = { &wstr, str, catname_abbr };
|
1050
|
+
|
1051
|
+
Check_Type(str, T_STRING);
|
1052
|
+
#ifdef HAVE_RUBY_ENCODING_H
|
1053
|
+
CONVERT_TO_UTF8(str);
|
1054
|
+
#endif
|
1055
|
+
WStr_allocWithUTF8L(&wstr, RSTRING_PTR(str), RSTRING_LEN(str));
|
1056
|
+
|
1057
|
+
return rb_ensure(get_categories_internal, (VALUE)¶m,
|
1058
|
+
get_categories_ensure, (VALUE)&wstr);
|
1059
|
+
/* wstr will be freed in get_categories_ensure() */
|
1060
|
+
}
|
1061
|
+
|
1062
|
+
VALUE
|
1063
|
+
unicode_wcswidth(int argc, VALUE* argv, VALUE obj)
|
1064
|
+
{
|
1065
|
+
WString wstr;
|
1066
|
+
int i, count;
|
1067
|
+
int width = 0;
|
1068
|
+
int cjk_p = 0;
|
1069
|
+
VALUE str;
|
1070
|
+
VALUE cjk;
|
1071
|
+
|
1072
|
+
count = rb_scan_args(argc, argv, "11", &str, &cjk);
|
1073
|
+
if (count > 1)
|
1074
|
+
cjk_p = RTEST(cjk);
|
1075
|
+
Check_Type(str, T_STRING);
|
1076
|
+
#ifdef HAVE_RUBY_ENCODING_H
|
1077
|
+
CONVERT_TO_UTF8(str);
|
1078
|
+
#endif
|
1079
|
+
WStr_allocWithUTF8L(&wstr, RSTRING_PTR(str), RSTRING_LEN(str));
|
1080
|
+
for (i = 0; i <wstr.len; i++) {
|
1081
|
+
int c = wstr.str[i];
|
1082
|
+
int cat = get_gencat(c);
|
1083
|
+
int eaw = get_eawidth(c);
|
1084
|
+
if ((c > 0 && c < 32) || (c >= 0x7f && c < 0xa0)) {
|
1085
|
+
/* Control Characters */
|
1086
|
+
width = -1;
|
1087
|
+
break;
|
1088
|
+
}
|
1089
|
+
else if (c != 0x00ad && /* SOFT HYPHEN */
|
1090
|
+
(cat == c_Mn || cat == c_Me || /* Non-spacing Marks */
|
1091
|
+
cat == c_Cf || /* Format */
|
1092
|
+
c == 0 || /* NUL */
|
1093
|
+
(c >= 0x1160 && c <= 0x11ff))) /* HANGUL JUNGSEONG/JONGSEONG */
|
1094
|
+
/* zero width */ ;
|
1095
|
+
else if (eaw == w_F || eaw == w_W || /* Fullwidth or Wide */
|
1096
|
+
(c >= 0x4db6 && c <= 0x4dbf) || /* CJK Reserved */
|
1097
|
+
(c >= 0x9fcd && c <= 0x9fff) || /* CJK Reserved */
|
1098
|
+
(c >= 0xfa6e && c <= 0xfa6f) || /* CJK Reserved */
|
1099
|
+
(c >= 0xfada && c <= 0xfaff) || /* CJK Reserved */
|
1100
|
+
(c >= 0x2a6d7 && c <= 0x2a6ff) || /* CJK Reserved */
|
1101
|
+
(c >= 0x2b735 && c <= 0x2b73f) || /* CJK Reserved */
|
1102
|
+
(c >= 0x2b81e && c <= 0x2f7ff) || /* CJK Reserved */
|
1103
|
+
(c >= 0x2fa1e && c <= 0x2fffd) || /* CJK Reserved */
|
1104
|
+
(c >= 0x30000 && c <= 0x3fffd) || /* CJK Reserved */
|
1105
|
+
(cjk_p && eaw == w_A)) /* East Asian Ambiguous */
|
1106
|
+
width += 2;
|
1107
|
+
else
|
1108
|
+
width++; /* Halfwidth or Neutral */
|
1109
|
+
}
|
1110
|
+
WStr_free(&wstr);
|
1111
|
+
|
1112
|
+
return INT2FIX(width);
|
1113
|
+
}
|
1114
|
+
|
1115
|
+
VALUE
|
1116
|
+
wstring_to_rstring(WString* wstr, int start, int len) {
|
1117
|
+
UString ret;
|
1118
|
+
volatile VALUE vret;
|
1119
|
+
|
1120
|
+
UniStr_alloc(&ret);
|
1121
|
+
WStr_convertIntoUString2(wstr, start, len, &ret);
|
1122
|
+
vret = ENC_(rb_str_new((char*)ret.str, ret.len));
|
1123
|
+
UniStr_free(&ret);
|
1124
|
+
|
1125
|
+
return vret;
|
1126
|
+
}
|
1127
|
+
|
1128
|
+
typedef struct _get_text_elements_param {
|
1129
|
+
WString* wstr;
|
1130
|
+
VALUE str;
|
1131
|
+
} get_text_elements_param;
|
1132
|
+
|
1133
|
+
VALUE
|
1134
|
+
get_text_elements_internal(get_text_elements_param* param)
|
1135
|
+
{
|
1136
|
+
WString* wstr = param->wstr;
|
1137
|
+
VALUE str = param->str;
|
1138
|
+
int start_pos;
|
1139
|
+
int block_p = rb_block_given_p();
|
1140
|
+
volatile VALUE ret = str;
|
1141
|
+
|
1142
|
+
if (!block_p)
|
1143
|
+
ret = rb_ary_new();
|
1144
|
+
for (start_pos = 0; start_pos < wstr->len;) {
|
1145
|
+
int c0 = wstr->str[start_pos];
|
1146
|
+
int cat = get_gencat(c0);
|
1147
|
+
int length = 1;
|
1148
|
+
int j;
|
1149
|
+
|
1150
|
+
if (cat == c_Mn || cat == c_Mc || cat == c_Me) {
|
1151
|
+
volatile VALUE rstr = TO_(str, wstring_to_rstring(wstr, start_pos, length));
|
1152
|
+
if (!block_p)
|
1153
|
+
rb_ary_push(ret, rstr);
|
1154
|
+
else
|
1155
|
+
rb_yield(rstr);
|
1156
|
+
start_pos++;
|
1157
|
+
continue;
|
1158
|
+
}
|
1159
|
+
|
1160
|
+
for (j = start_pos + 1; j < wstr->len; j++) {
|
1161
|
+
int c1 = wstr->str[j];
|
1162
|
+
int cat = get_gencat(c1);
|
1163
|
+
if (c0 >= LBASE && c0 < LBASE + LCOUNT &&
|
1164
|
+
j + 1 < wstr->len &&
|
1165
|
+
c1 >= VBASE && c1 < VBASE + VCOUNT &&
|
1166
|
+
wstr->str[j+1] >= TBASE && wstr->str[j+1] < TBASE + TCOUNT) {
|
1167
|
+
/* Hangul L+V+T */
|
1168
|
+
length += 2;
|
1169
|
+
j++;
|
1170
|
+
}
|
1171
|
+
else if (c0 >= LBASE && c0 < LBASE + LCOUNT &&
|
1172
|
+
c1 >= VBASE && c1< VBASE + VCOUNT) {
|
1173
|
+
/* Hangul L+V */
|
1174
|
+
length++;
|
1175
|
+
}
|
1176
|
+
else if (c0 >= SBASE && c0 < SBASE + SCOUNT &&
|
1177
|
+
(c0 - SBASE) % TCOUNT == 0 &&
|
1178
|
+
c1 >= TBASE && c1 < TBASE + TCOUNT) {
|
1179
|
+
/* Hangul LV+T */
|
1180
|
+
length++;
|
1181
|
+
}
|
1182
|
+
else if (cat == c_Mn || cat == c_Mc || cat == c_Me) {
|
1183
|
+
/* Mark */
|
1184
|
+
length++;
|
1185
|
+
}
|
1186
|
+
else {
|
1187
|
+
volatile VALUE rstr = TO_(str, wstring_to_rstring(wstr, start_pos, length));
|
1188
|
+
if (!block_p)
|
1189
|
+
rb_ary_push(ret, rstr);
|
1190
|
+
else
|
1191
|
+
rb_yield(rstr);
|
1192
|
+
length = 0;
|
1193
|
+
break;
|
1194
|
+
}
|
1195
|
+
}
|
1196
|
+
if (length > 0) {
|
1197
|
+
volatile VALUE rstr = TO_(str, wstring_to_rstring(wstr, start_pos, length));
|
1198
|
+
if (!block_p)
|
1199
|
+
rb_ary_push(ret, rstr);
|
1200
|
+
else
|
1201
|
+
rb_yield(rstr);
|
1202
|
+
}
|
1203
|
+
start_pos = j;
|
1204
|
+
}
|
1205
|
+
return ret;
|
1206
|
+
}
|
1207
|
+
|
1208
|
+
VALUE
|
1209
|
+
get_text_elements_ensure(WString* wstr)
|
1210
|
+
{
|
1211
|
+
WStr_free(wstr);
|
1212
|
+
return Qnil;
|
1213
|
+
}
|
1214
|
+
|
1215
|
+
VALUE
|
1216
|
+
unicode_get_text_elements(VALUE obj, VALUE str)
|
1217
|
+
{
|
1218
|
+
WString wstr;
|
1219
|
+
get_text_elements_param param = { &wstr, str };
|
1220
|
+
|
1221
|
+
Check_Type(str, T_STRING);
|
1222
|
+
#ifdef HAVE_RUBY_ENCODING_H
|
1223
|
+
CONVERT_TO_UTF8(str);
|
1224
|
+
#endif
|
1225
|
+
WStr_allocWithUTF8L(&wstr, RSTRING_PTR(str), RSTRING_LEN(str));
|
1226
|
+
|
1227
|
+
return rb_ensure(get_text_elements_internal, (VALUE)¶m,
|
1228
|
+
get_text_elements_ensure, (VALUE)&wstr);
|
1229
|
+
/* wstr will be freed in get_text_elements_ensure() */
|
1230
|
+
}
|
1231
|
+
|
885
1232
|
void
|
886
1233
|
Init_unicode_native()
|
887
1234
|
{
|
@@ -909,6 +1256,13 @@ Init_unicode_native()
|
|
909
1256
|
}
|
910
1257
|
}
|
911
1258
|
|
1259
|
+
for (i = 0; i < c_Cn + 1; i++) {
|
1260
|
+
catname_abbr[i] = ID2SYM(rb_intern(gencat_abbr[i]));
|
1261
|
+
catname_long[i] = ID2SYM(rb_intern(gencat_long[i]));
|
1262
|
+
rb_global_variable(&catname_abbr[i]);
|
1263
|
+
rb_global_variable(&catname_long[i]);
|
1264
|
+
}
|
1265
|
+
|
912
1266
|
rb_define_module_function(mUnicode, "strcmp",
|
913
1267
|
unicode_strcmp, 2);
|
914
1268
|
rb_define_module_function(mUnicode, "strcmp_compat",
|
@@ -957,6 +1311,15 @@ Init_unicode_native()
|
|
957
1311
|
rb_define_module_function(mUnicode, "capitalize",
|
958
1312
|
unicode_capitalize, 1);
|
959
1313
|
|
1314
|
+
rb_define_module_function(mUnicode, "categories",
|
1315
|
+
unicode_get_categories, 1);
|
1316
|
+
rb_define_module_function(mUnicode, "abbr_categories",
|
1317
|
+
unicode_get_abbr_categories, 1);
|
1318
|
+
rb_define_module_function(mUnicode, "width",
|
1319
|
+
unicode_wcswidth, -1);
|
1320
|
+
rb_define_module_function(mUnicode, "text_elements",
|
1321
|
+
unicode_get_text_elements, 1);
|
1322
|
+
|
960
1323
|
rb_define_const(mUnicode, "VERSION",
|
961
1324
|
rb_str_new2(UNICODE_VERSION));
|
962
1325
|
}
|