unicode 0.4.2-x86-mingw32 → 0.4.3-x86-mingw32
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README +29 -7
- data/ext/unicode/unicode.c +379 -16
- data/ext/unicode/unidata.map +24536 -24435
- data/ext/unicode/wstring.c +69 -1
- data/ext/unicode/wstring.h +2 -0
- data/lib/unicode/1.8/unicode_native.so +0 -0
- data/lib/unicode/1.9/unicode_native.so +0 -0
- data/tools/README +3 -2
- data/tools/mkunidata.rb +136 -12
- data/unicode.gemspec +2 -2
- metadata +5 -5
data/README
CHANGED
@@ -1,5 +1,5 @@
|
|
1
1
|
Unicode Library for Ruby
|
2
|
-
Version 0.4.2
|
2
|
+
Version 0.4.3
|
3
3
|
|
4
4
|
Yoshida Masato
|
5
5
|
|
@@ -7,14 +7,14 @@
|
|
7
7
|
- Introduction
|
8
8
|
|
9
9
|
Unicode string manipulation library for Ruby.
|
10
|
-
This library is based on
|
10
|
+
This library is based on UAX #15 Unicode Normalization Forms(*1).
|
11
11
|
|
12
12
|
*1 <URL:http://www.unicode.org/unicode/reports/tr15/>
|
13
13
|
|
14
14
|
|
15
15
|
- Install
|
16
16
|
|
17
|
-
This can work with ruby-1.8 or later. I recommend you to
|
17
|
+
This can work with ruby-1.8.7 or later. I recommend you to
|
18
18
|
use ruby-1.9.3 or later.
|
19
19
|
|
20
20
|
Make and install usually.
|
@@ -79,7 +79,7 @@
|
|
79
79
|
These are aliases of decompose/decompose_compat.
|
80
80
|
|
81
81
|
Unicode::normalize_D_safe(str) (Unicode::nfd_safe(str))
|
82
|
-
This is an
|
82
|
+
This is an alias of decompose_safe.
|
83
83
|
|
84
84
|
Unicode::normalize_C(str) (Unicode::nfc(str))
|
85
85
|
Unicode::normalize_KC(str) (Unicode::nfkc(str))
|
@@ -98,14 +98,35 @@
|
|
98
98
|
The mappings that are used by these functions are not normative
|
99
99
|
in UnicodeData.txt.
|
100
100
|
|
101
|
+
Unicode::categories(str)
|
102
|
+
Unicode::abbr_categories(str)
|
103
|
+
Get an array of general category names of the string.
|
104
|
+
abbr_categories returns abbreviated names.
|
105
|
+
These can be called with a block.
|
106
|
+
|
107
|
+
Unicode.categories(str) do |category| p category end
|
108
|
+
|
109
|
+
Unicode::text_elements(str)
|
110
|
+
Get an array of text elements.
|
111
|
+
A text element is a unit that is displayed as a single character.
|
112
|
+
This can be called with a block.
|
113
|
+
|
114
|
+
Unicode::width(str[, cjk])
|
115
|
+
Estimate the display width on a fixed-pitch text terminal.
|
116
|
+
It is based on Markus Kuhn's mk_wcwidth.
|
117
|
+
If the optional argument 'cjk' is true, East Asian
|
118
|
+
Ambiguous characters are treated as wide characters.
|
119
|
+
|
120
|
+
Unicode.width("\u03b1") #=> 1
|
121
|
+
Unicode.width("\u03b1", true) #=> 2
|
122
|
+
|
123
|
+
|
101
124
|
- Bugs
|
102
125
|
|
103
|
-
|
126
|
+
UAX #15 suggests that the look up for Normalization Form C
|
104
127
|
should not be implemented with a hash of string for better
|
105
128
|
performance.
|
106
129
|
|
107
|
-
Case conversion functions should reflecte UTR #21.
|
108
|
-
|
109
130
|
|
110
131
|
- Copying
|
111
132
|
|
@@ -123,6 +144,7 @@
|
|
123
144
|
|
124
145
|
- History
|
125
146
|
|
147
|
+
Aug 8, 2012 version 0.4.3 add categories, text_elements and width
|
126
148
|
Feb 29, 2012 version 0.4.2 add decompose_safe
|
127
149
|
Feb 3, 2012 version 0.4.1 update unidata.map for Unicode 6.1
|
128
150
|
Oct 14, 2010 version 0.4.0 fix the composition algorithm, and support Unicode 6.0
|
data/ext/unicode/unicode.c
CHANGED
@@ -1,5 +1,6 @@
|
|
1
1
|
/*
|
2
|
-
* Unicode Library version 0.4
|
2
|
+
* Unicode Library version 0.4.3
|
3
|
+
* Aug 8, 2012: version 0.4.3
|
3
4
|
* Oct 14, 2010: version 0.4
|
4
5
|
* Feb 26, 2010: version 0.3
|
5
6
|
* Dec 29, 2009: version 0.2
|
@@ -7,7 +8,7 @@
|
|
7
8
|
*
|
8
9
|
*/
|
9
10
|
|
10
|
-
#define UNICODE_VERSION "0.4.2"
|
11
|
+
#define UNICODE_VERSION "0.4.3"
|
11
12
|
|
12
13
|
#include "ruby.h"
|
13
14
|
#ifdef HAVE_RUBY_IO_H
|
@@ -54,6 +55,8 @@ taintObject(VALUE src, VALUE obj) {
|
|
54
55
|
static VALUE mUnicode;
|
55
56
|
static VALUE unicode_data;
|
56
57
|
static VALUE composition_table;
|
58
|
+
static VALUE catname_long[c_Cn+1];
|
59
|
+
static VALUE catname_abbr[c_Cn+1];
|
57
60
|
|
58
61
|
/* Hangul */
|
59
62
|
#define SBASE (0xac00)
|
@@ -66,6 +69,86 @@ static VALUE composition_table;
|
|
66
69
|
#define NCOUNT (VCOUNT * TCOUNT) /* 588 */
|
67
70
|
#define SCOUNT (LCOUNT * NCOUNT) /* 11172 */
|
68
71
|
|
72
|
+
VALUE
|
73
|
+
get_unidata(int ucs) {
|
74
|
+
VALUE ch = rb_hash_aref(unicode_data, INT2FIX(ucs));
|
75
|
+
if (!NIL_P(ch))
|
76
|
+
return ch;
|
77
|
+
#ifdef CJK_IDEOGRAPH_EXTENSION_A_FIRST
|
78
|
+
else if (ucs >= CJK_IDEOGRAPH_EXTENSION_A_FIRST &&
|
79
|
+
ucs <= CJK_IDEOGRAPH_EXTENSION_A_LAST)
|
80
|
+
return rb_hash_aref(unicode_data,
|
81
|
+
INT2FIX(CJK_IDEOGRAPH_EXTENSION_A_FIRST));
|
82
|
+
#endif
|
83
|
+
#ifdef CJK_IDEOGRAPH_FIRST
|
84
|
+
else if (ucs >= CJK_IDEOGRAPH_FIRST &&
|
85
|
+
ucs <= CJK_IDEOGRAPH_LAST)
|
86
|
+
return rb_hash_aref(unicode_data,
|
87
|
+
INT2FIX(CJK_IDEOGRAPH_FIRST));
|
88
|
+
#endif
|
89
|
+
#ifdef HANGUL_SYLLABLE_FIRST
|
90
|
+
else if (ucs >= HANGUL_SYLLABLE_FIRST &&
|
91
|
+
ucs <= HANGUL_SYLLABLE_LAST)
|
92
|
+
return rb_hash_aref(unicode_data,
|
93
|
+
INT2FIX(HANGUL_SYLLABLE_FIRST));
|
94
|
+
#endif
|
95
|
+
#ifdef NON_PRIVATE_USE_HIGH_SURROGATE_FIRST
|
96
|
+
else if (ucs >= NON_PRIVATE_USE_HIGH_SURROGATE_FIRST &&
|
97
|
+
ucs <= NON_PRIVATE_USE_HIGH_SURROGATE_LAST)
|
98
|
+
return rb_hash_aref(unicode_data,
|
99
|
+
INT2FIX(NON_PRIVATE_USE_HIGH_SURROGATE_FIRST));
|
100
|
+
#endif
|
101
|
+
#ifdef PRIVATE_USE_HIGH_SURROGATE_FIRST
|
102
|
+
else if (ucs >= PRIVATE_USE_HIGH_SURROGATE_FIRST &&
|
103
|
+
ucs <= PRIVATE_USE_HIGH_SURROGATE_LAST)
|
104
|
+
return rb_hash_aref(unicode_data,
|
105
|
+
INT2FIX(PRIVATE_USE_HIGH_SURROGATE_FIRST));
|
106
|
+
#endif
|
107
|
+
#ifdef LOW_SURROGATE_FIRST
|
108
|
+
else if (ucs >= LOW_SURROGATE_FIRST &&
|
109
|
+
ucs <= LOW_SURROGATE_LAST)
|
110
|
+
return rb_hash_aref(unicode_data,
|
111
|
+
INT2FIX(LOW_SURROGATE_FIRST));
|
112
|
+
#endif
|
113
|
+
#ifdef PRIVATE_USE_FIRST
|
114
|
+
else if (ucs >= PRIVATE_USE_FIRST &&
|
115
|
+
ucs <= PRIVATE_USE_LAST)
|
116
|
+
return rb_hash_aref(unicode_data,
|
117
|
+
INT2FIX(PRIVATE_USE_FIRST));
|
118
|
+
#endif
|
119
|
+
#ifdef CJK_IDEOGRAPH_EXTENSION_B_FIRST
|
120
|
+
else if (ucs >= CJK_IDEOGRAPH_EXTENSION_B_FIRST &&
|
121
|
+
ucs <= CJK_IDEOGRAPH_EXTENSION_B_LAST)
|
122
|
+
return rb_hash_aref(unicode_data,
|
123
|
+
INT2FIX(CJK_IDEOGRAPH_EXTENSION_B_FIRST));
|
124
|
+
#endif
|
125
|
+
#ifdef CJK_IDEOGRAPH_EXTENSION_C_FIRST
|
126
|
+
else if (ucs >= CJK_IDEOGRAPH_EXTENSION_C_FIRST &&
|
127
|
+
ucs <= CJK_IDEOGRAPH_EXTENSION_C_LAST)
|
128
|
+
return rb_hash_aref(unicode_data,
|
129
|
+
INT2FIX(CJK_IDEOGRAPH_EXTENSION_C_FIRST));
|
130
|
+
#endif
|
131
|
+
#ifdef CJK_IDEOGRAPH_EXTENSION_D_FIRST
|
132
|
+
else if (ucs >= CJK_IDEOGRAPH_EXTENSION_D_FIRST &&
|
133
|
+
ucs <= CJK_IDEOGRAPH_EXTENSION_D_LAST)
|
134
|
+
return rb_hash_aref(unicode_data,
|
135
|
+
INT2FIX(CJK_IDEOGRAPH_EXTENSION_D_FIRST));
|
136
|
+
#endif
|
137
|
+
#ifdef PLANE_15_PRIVATE_USE_FIRST
|
138
|
+
else if (ucs >= PLANE_15_PRIVATE_USE_FIRST &&
|
139
|
+
ucs <= PLANE_15_PRIVATE_USE_LAST)
|
140
|
+
return rb_hash_aref(unicode_data,
|
141
|
+
INT2FIX(PLANE_15_PRIVATE_USE_FIRST));
|
142
|
+
#endif
|
143
|
+
#ifdef PLANE_16_PRIVATE_USE_FIRST
|
144
|
+
else if (ucs >= PLANE_16_PRIVATE_USE_FIRST &&
|
145
|
+
ucs <= PLANE_16_PRIVATE_USE_LAST)
|
146
|
+
return rb_hash_aref(unicode_data,
|
147
|
+
INT2FIX(PLANE_16_PRIVATE_USE_FIRST));
|
148
|
+
#endif
|
149
|
+
return Qnil;
|
150
|
+
}
|
151
|
+
|
69
152
|
static int
|
70
153
|
get_cc(int ucs)
|
71
154
|
{
|
@@ -77,6 +160,28 @@ get_cc(int ucs)
|
|
77
160
|
return 0;
|
78
161
|
}
|
79
162
|
|
163
|
+
static int
|
164
|
+
get_gencat(int ucs)
|
165
|
+
{
|
166
|
+
VALUE ch = get_unidata(ucs);
|
167
|
+
|
168
|
+
if (!NIL_P(ch)) {
|
169
|
+
return unidata[FIX2INT(ch)].general_category;
|
170
|
+
}
|
171
|
+
return c_Cn; /* Unassigned */
|
172
|
+
}
|
173
|
+
|
174
|
+
static int
|
175
|
+
get_eawidth(int ucs)
|
176
|
+
{
|
177
|
+
VALUE ch = get_unidata(ucs);
|
178
|
+
|
179
|
+
if (!NIL_P(ch)) {
|
180
|
+
return unidata[FIX2INT(ch)].east_asian_width;
|
181
|
+
}
|
182
|
+
return w_N; /* Neutral */
|
183
|
+
}
|
184
|
+
|
80
185
|
static const char*
|
81
186
|
get_canon(int ucs)
|
82
187
|
{
|
@@ -538,8 +643,8 @@ unicode_strcmp(VALUE obj, VALUE str1, VALUE str2)
|
|
538
643
|
CONVERT_TO_UTF8(str1);
|
539
644
|
CONVERT_TO_UTF8(str2);
|
540
645
|
#endif
|
541
|
-
|
542
|
-
|
646
|
+
WStr_allocWithUTF8L(&wstr1, RSTRING_PTR(str1), RSTRING_LEN(str1));
|
647
|
+
WStr_allocWithUTF8L(&wstr2, RSTRING_PTR(str2), RSTRING_LEN(str2));
|
543
648
|
WStr_alloc(&result1);
|
544
649
|
WStr_alloc(&result2);
|
545
650
|
decompose_internal(&wstr1, &result1);
|
@@ -580,8 +685,8 @@ unicode_strcmp_compat(VALUE obj, VALUE str1, VALUE str2)
|
|
580
685
|
CONVERT_TO_UTF8(str1);
|
581
686
|
CONVERT_TO_UTF8(str2);
|
582
687
|
#endif
|
583
|
-
|
584
|
-
|
688
|
+
WStr_allocWithUTF8L(&wstr1, RSTRING_PTR(str1), RSTRING_LEN(str1));
|
689
|
+
WStr_allocWithUTF8L(&wstr2, RSTRING_PTR(str2), RSTRING_LEN(str2));
|
585
690
|
WStr_alloc(&result1);
|
586
691
|
WStr_alloc(&result2);
|
587
692
|
decompose_compat_internal(&wstr1, &result1);
|
@@ -617,7 +722,7 @@ unicode_decompose(VALUE obj, VALUE str)
|
|
617
722
|
#ifdef HAVE_RUBY_ENCODING_H
|
618
723
|
CONVERT_TO_UTF8(str);
|
619
724
|
#endif
|
620
|
-
|
725
|
+
WStr_allocWithUTF8L(&ustr, RSTRING_PTR(str), RSTRING_LEN(str));
|
621
726
|
WStr_alloc(&result);
|
622
727
|
decompose_internal(&ustr, &result);
|
623
728
|
WStr_free(&ustr);
|
@@ -643,7 +748,7 @@ unicode_decompose_safe(VALUE obj, VALUE str)
|
|
643
748
|
#ifdef HAVE_RUBY_ENCODING_H
|
644
749
|
CONVERT_TO_UTF8(str);
|
645
750
|
#endif
|
646
|
-
|
751
|
+
WStr_allocWithUTF8L(&ustr, RSTRING_PTR(str), RSTRING_LEN(str));
|
647
752
|
WStr_alloc(&result);
|
648
753
|
decompose_safe_internal(&ustr, &result);
|
649
754
|
WStr_free(&ustr);
|
@@ -669,7 +774,7 @@ unicode_decompose_compat(VALUE obj, VALUE str)
|
|
669
774
|
#ifdef HAVE_RUBY_ENCODING_H
|
670
775
|
CONVERT_TO_UTF8(str);
|
671
776
|
#endif
|
672
|
-
|
777
|
+
WStr_allocWithUTF8L(&ustr, RSTRING_PTR(str), RSTRING_LEN(str));
|
673
778
|
WStr_alloc(&result);
|
674
779
|
decompose_compat_internal(&ustr, &result);
|
675
780
|
WStr_free(&ustr);
|
@@ -695,7 +800,7 @@ unicode_compose(VALUE obj, VALUE str)
|
|
695
800
|
#ifdef HAVE_RUBY_ENCODING_H
|
696
801
|
CONVERT_TO_UTF8(str);
|
697
802
|
#endif
|
698
|
-
|
803
|
+
WStr_allocWithUTF8L(&ustr, RSTRING_PTR(str), RSTRING_LEN(str));
|
699
804
|
sort_canonical(&ustr);
|
700
805
|
WStr_alloc(&result);
|
701
806
|
compose_internal(&ustr, &result);
|
@@ -722,7 +827,7 @@ unicode_normalize_C(VALUE obj, VALUE str)
|
|
722
827
|
#ifdef HAVE_RUBY_ENCODING_H
|
723
828
|
CONVERT_TO_UTF8(str);
|
724
829
|
#endif
|
725
|
-
|
830
|
+
WStr_allocWithUTF8L(&ustr1, RSTRING_PTR(str), RSTRING_LEN(str));
|
726
831
|
WStr_alloc(&ustr2);
|
727
832
|
decompose_internal(&ustr1, &ustr2);
|
728
833
|
WStr_free(&ustr1);
|
@@ -752,7 +857,7 @@ unicode_normalize_safe(VALUE obj, VALUE str)
|
|
752
857
|
#ifdef HAVE_RUBY_ENCODING_H
|
753
858
|
CONVERT_TO_UTF8(str);
|
754
859
|
#endif
|
755
|
-
|
860
|
+
WStr_allocWithUTF8L(&ustr1, RSTRING_PTR(str), RSTRING_LEN(str));
|
756
861
|
WStr_alloc(&ustr2);
|
757
862
|
decompose_safe_internal(&ustr1, &ustr2);
|
758
863
|
WStr_free(&ustr1);
|
@@ -782,7 +887,7 @@ unicode_normalize_KC(VALUE obj, VALUE str)
|
|
782
887
|
#ifdef HAVE_RUBY_ENCODING_H
|
783
888
|
CONVERT_TO_UTF8(str);
|
784
889
|
#endif
|
785
|
-
|
890
|
+
WStr_allocWithUTF8L(&ustr1, RSTRING_PTR(str), RSTRING_LEN(str));
|
786
891
|
WStr_alloc(&ustr2);
|
787
892
|
decompose_compat_internal(&ustr1, &ustr2);
|
788
893
|
WStr_free(&ustr1);
|
@@ -811,7 +916,7 @@ unicode_upcase(VALUE obj, VALUE str)
|
|
811
916
|
#ifdef HAVE_RUBY_ENCODING_H
|
812
917
|
CONVERT_TO_UTF8(str);
|
813
918
|
#endif
|
814
|
-
|
919
|
+
WStr_allocWithUTF8L(&ustr, RSTRING_PTR(str), RSTRING_LEN(str));
|
815
920
|
WStr_alloc(&result);
|
816
921
|
upcase_internal(&ustr, &result);
|
817
922
|
//sort_canonical(&result);
|
@@ -837,7 +942,7 @@ unicode_downcase(VALUE obj, VALUE str)
|
|
837
942
|
#ifdef HAVE_RUBY_ENCODING_H
|
838
943
|
CONVERT_TO_UTF8(str);
|
839
944
|
#endif
|
840
|
-
|
945
|
+
WStr_allocWithUTF8L(&ustr, RSTRING_PTR(str), RSTRING_LEN(str));
|
841
946
|
WStr_alloc(&result);
|
842
947
|
downcase_internal(&ustr, &result);
|
843
948
|
//sort_canonical(&result);
|
@@ -868,7 +973,7 @@ unicode_capitalize(VALUE obj, VALUE str)
|
|
868
973
|
#ifdef HAVE_RUBY_ENCODING_H
|
869
974
|
CONVERT_TO_UTF8(str);
|
870
975
|
#endif
|
871
|
-
|
976
|
+
WStr_allocWithUTF8L(&ustr, RSTRING_PTR(str), RSTRING_LEN(str));
|
872
977
|
WStr_alloc(&result);
|
873
978
|
capitalize_internal(&ustr, &result);
|
874
979
|
//sort_canonical(&result);
|
@@ -882,6 +987,248 @@ unicode_capitalize(VALUE obj, VALUE str)
|
|
882
987
|
return vret;
|
883
988
|
}
|
884
989
|
|
990
|
+
typedef struct _get_categories_param {
|
991
|
+
WString* wstr;
|
992
|
+
VALUE str;
|
993
|
+
VALUE* catname;
|
994
|
+
} get_categories_param;
|
995
|
+
|
996
|
+
static VALUE
|
997
|
+
get_categories_internal(get_categories_param* param)
|
998
|
+
{
|
999
|
+
WString* wstr = param->wstr;
|
1000
|
+
VALUE str = param->str;
|
1001
|
+
VALUE* catname = param->catname;
|
1002
|
+
int pos;
|
1003
|
+
int block_p = rb_block_given_p();
|
1004
|
+
volatile VALUE ret = str;
|
1005
|
+
|
1006
|
+
if (!block_p)
|
1007
|
+
ret = rb_ary_new();
|
1008
|
+
for (pos = 0; pos < wstr->len; pos++) {
|
1009
|
+
int gencat = get_gencat(wstr->str[pos]);
|
1010
|
+
if (!block_p)
|
1011
|
+
rb_ary_push(ret, catname[gencat]);
|
1012
|
+
else {
|
1013
|
+
rb_yield(catname[gencat]);
|
1014
|
+
}
|
1015
|
+
}
|
1016
|
+
|
1017
|
+
return ret;
|
1018
|
+
}
|
1019
|
+
|
1020
|
+
VALUE
|
1021
|
+
get_categories_ensure(WString* wstr)
|
1022
|
+
{
|
1023
|
+
WStr_free(wstr);
|
1024
|
+
return Qnil;
|
1025
|
+
}
|
1026
|
+
|
1027
|
+
VALUE
|
1028
|
+
unicode_get_categories(VALUE obj, VALUE str)
|
1029
|
+
{
|
1030
|
+
WString wstr;
|
1031
|
+
get_categories_param param = { &wstr, str, catname_long };
|
1032
|
+
|
1033
|
+
Check_Type(str, T_STRING);
|
1034
|
+
#ifdef HAVE_RUBY_ENCODING_H
|
1035
|
+
CONVERT_TO_UTF8(str);
|
1036
|
+
#endif
|
1037
|
+
WStr_allocWithUTF8L(&wstr, RSTRING_PTR(str), RSTRING_LEN(str));
|
1038
|
+
|
1039
|
+
return rb_ensure(get_categories_internal, (VALUE)&param,
|
1040
|
+
get_categories_ensure, (VALUE)&wstr);
|
1041
|
+
/* wstr will be freed in get_categories_ensure() */
|
1042
|
+
}
|
1043
|
+
|
1044
|
+
|
1045
|
+
VALUE
|
1046
|
+
unicode_get_abbr_categories(VALUE obj, VALUE str)
|
1047
|
+
{
|
1048
|
+
WString wstr;
|
1049
|
+
get_categories_param param = { &wstr, str, catname_abbr };
|
1050
|
+
|
1051
|
+
Check_Type(str, T_STRING);
|
1052
|
+
#ifdef HAVE_RUBY_ENCODING_H
|
1053
|
+
CONVERT_TO_UTF8(str);
|
1054
|
+
#endif
|
1055
|
+
WStr_allocWithUTF8L(&wstr, RSTRING_PTR(str), RSTRING_LEN(str));
|
1056
|
+
|
1057
|
+
return rb_ensure(get_categories_internal, (VALUE)&param,
|
1058
|
+
get_categories_ensure, (VALUE)&wstr);
|
1059
|
+
/* wstr will be freed in get_categories_ensure() */
|
1060
|
+
}
|
1061
|
+
|
1062
|
+
VALUE
|
1063
|
+
unicode_wcswidth(int argc, VALUE* argv, VALUE obj)
|
1064
|
+
{
|
1065
|
+
WString wstr;
|
1066
|
+
int i, count;
|
1067
|
+
int width = 0;
|
1068
|
+
int cjk_p = 0;
|
1069
|
+
VALUE str;
|
1070
|
+
VALUE cjk;
|
1071
|
+
|
1072
|
+
count = rb_scan_args(argc, argv, "11", &str, &cjk);
|
1073
|
+
if (count > 1)
|
1074
|
+
cjk_p = RTEST(cjk);
|
1075
|
+
Check_Type(str, T_STRING);
|
1076
|
+
#ifdef HAVE_RUBY_ENCODING_H
|
1077
|
+
CONVERT_TO_UTF8(str);
|
1078
|
+
#endif
|
1079
|
+
WStr_allocWithUTF8L(&wstr, RSTRING_PTR(str), RSTRING_LEN(str));
|
1080
|
+
for (i = 0; i <wstr.len; i++) {
|
1081
|
+
int c = wstr.str[i];
|
1082
|
+
int cat = get_gencat(c);
|
1083
|
+
int eaw = get_eawidth(c);
|
1084
|
+
if ((c > 0 && c < 32) || (c >= 0x7f && c < 0xa0)) {
|
1085
|
+
/* Control Characters */
|
1086
|
+
width = -1;
|
1087
|
+
break;
|
1088
|
+
}
|
1089
|
+
else if (c != 0x00ad && /* SOFT HYPHEN */
|
1090
|
+
(cat == c_Mn || cat == c_Me || /* Non-spacing Marks */
|
1091
|
+
cat == c_Cf || /* Format */
|
1092
|
+
c == 0 || /* NUL */
|
1093
|
+
(c >= 0x1160 && c <= 0x11ff))) /* HANGUL JUNGSEONG/JONGSEONG */
|
1094
|
+
/* zero width */ ;
|
1095
|
+
else if (eaw == w_F || eaw == w_W || /* Fullwidth or Wide */
|
1096
|
+
(c >= 0x4db6 && c <= 0x4dbf) || /* CJK Reserved */
|
1097
|
+
(c >= 0x9fcd && c <= 0x9fff) || /* CJK Reserved */
|
1098
|
+
(c >= 0xfa6e && c <= 0xfa6f) || /* CJK Reserved */
|
1099
|
+
(c >= 0xfada && c <= 0xfaff) || /* CJK Reserved */
|
1100
|
+
(c >= 0x2a6d7 && c <= 0x2a6ff) || /* CJK Reserved */
|
1101
|
+
(c >= 0x2b735 && c <= 0x2b73f) || /* CJK Reserved */
|
1102
|
+
(c >= 0x2b81e && c <= 0x2f7ff) || /* CJK Reserved */
|
1103
|
+
(c >= 0x2fa1e && c <= 0x2fffd) || /* CJK Reserved */
|
1104
|
+
(c >= 0x30000 && c <= 0x3fffd) || /* CJK Reserved */
|
1105
|
+
(cjk_p && eaw == w_A)) /* East Asian Ambiguous */
|
1106
|
+
width += 2;
|
1107
|
+
else
|
1108
|
+
width++; /* Halfwidth or Neutral */
|
1109
|
+
}
|
1110
|
+
WStr_free(&wstr);
|
1111
|
+
|
1112
|
+
return INT2FIX(width);
|
1113
|
+
}
|
1114
|
+
|
1115
|
+
VALUE
|
1116
|
+
wstring_to_rstring(WString* wstr, int start, int len) {
|
1117
|
+
UString ret;
|
1118
|
+
volatile VALUE vret;
|
1119
|
+
|
1120
|
+
UniStr_alloc(&ret);
|
1121
|
+
WStr_convertIntoUString2(wstr, start, len, &ret);
|
1122
|
+
vret = ENC_(rb_str_new((char*)ret.str, ret.len));
|
1123
|
+
UniStr_free(&ret);
|
1124
|
+
|
1125
|
+
return vret;
|
1126
|
+
}
|
1127
|
+
|
1128
|
+
typedef struct _get_text_elements_param {
|
1129
|
+
WString* wstr;
|
1130
|
+
VALUE str;
|
1131
|
+
} get_text_elements_param;
|
1132
|
+
|
1133
|
+
VALUE
|
1134
|
+
get_text_elements_internal(get_text_elements_param* param)
|
1135
|
+
{
|
1136
|
+
WString* wstr = param->wstr;
|
1137
|
+
VALUE str = param->str;
|
1138
|
+
int start_pos;
|
1139
|
+
int block_p = rb_block_given_p();
|
1140
|
+
volatile VALUE ret = str;
|
1141
|
+
|
1142
|
+
if (!block_p)
|
1143
|
+
ret = rb_ary_new();
|
1144
|
+
for (start_pos = 0; start_pos < wstr->len;) {
|
1145
|
+
int c0 = wstr->str[start_pos];
|
1146
|
+
int cat = get_gencat(c0);
|
1147
|
+
int length = 1;
|
1148
|
+
int j;
|
1149
|
+
|
1150
|
+
if (cat == c_Mn || cat == c_Mc || cat == c_Me) {
|
1151
|
+
volatile VALUE rstr = TO_(str, wstring_to_rstring(wstr, start_pos, length));
|
1152
|
+
if (!block_p)
|
1153
|
+
rb_ary_push(ret, rstr);
|
1154
|
+
else
|
1155
|
+
rb_yield(rstr);
|
1156
|
+
start_pos++;
|
1157
|
+
continue;
|
1158
|
+
}
|
1159
|
+
|
1160
|
+
for (j = start_pos + 1; j < wstr->len; j++) {
|
1161
|
+
int c1 = wstr->str[j];
|
1162
|
+
int cat = get_gencat(c1);
|
1163
|
+
if (c0 >= LBASE && c0 < LBASE + LCOUNT &&
|
1164
|
+
j + 1 < wstr->len &&
|
1165
|
+
c1 >= VBASE && c1 < VBASE + VCOUNT &&
|
1166
|
+
wstr->str[j+1] >= TBASE && wstr->str[j+1] < TBASE + TCOUNT) {
|
1167
|
+
/* Hangul L+V+T */
|
1168
|
+
length += 2;
|
1169
|
+
j++;
|
1170
|
+
}
|
1171
|
+
else if (c0 >= LBASE && c0 < LBASE + LCOUNT &&
|
1172
|
+
c1 >= VBASE && c1< VBASE + VCOUNT) {
|
1173
|
+
/* Hangul L+V */
|
1174
|
+
length++;
|
1175
|
+
}
|
1176
|
+
else if (c0 >= SBASE && c0 < SBASE + SCOUNT &&
|
1177
|
+
(c0 - SBASE) % TCOUNT == 0 &&
|
1178
|
+
c1 >= TBASE && c1 < TBASE + TCOUNT) {
|
1179
|
+
/* Hangul LV+T */
|
1180
|
+
length++;
|
1181
|
+
}
|
1182
|
+
else if (cat == c_Mn || cat == c_Mc || cat == c_Me) {
|
1183
|
+
/* Mark */
|
1184
|
+
length++;
|
1185
|
+
}
|
1186
|
+
else {
|
1187
|
+
volatile VALUE rstr = TO_(str, wstring_to_rstring(wstr, start_pos, length));
|
1188
|
+
if (!block_p)
|
1189
|
+
rb_ary_push(ret, rstr);
|
1190
|
+
else
|
1191
|
+
rb_yield(rstr);
|
1192
|
+
length = 0;
|
1193
|
+
break;
|
1194
|
+
}
|
1195
|
+
}
|
1196
|
+
if (length > 0) {
|
1197
|
+
volatile VALUE rstr = TO_(str, wstring_to_rstring(wstr, start_pos, length));
|
1198
|
+
if (!block_p)
|
1199
|
+
rb_ary_push(ret, rstr);
|
1200
|
+
else
|
1201
|
+
rb_yield(rstr);
|
1202
|
+
}
|
1203
|
+
start_pos = j;
|
1204
|
+
}
|
1205
|
+
return ret;
|
1206
|
+
}
|
1207
|
+
|
1208
|
+
VALUE
|
1209
|
+
get_text_elements_ensure(WString* wstr)
|
1210
|
+
{
|
1211
|
+
WStr_free(wstr);
|
1212
|
+
return Qnil;
|
1213
|
+
}
|
1214
|
+
|
1215
|
+
VALUE
|
1216
|
+
unicode_get_text_elements(VALUE obj, VALUE str)
|
1217
|
+
{
|
1218
|
+
WString wstr;
|
1219
|
+
get_text_elements_param param = { &wstr, str };
|
1220
|
+
|
1221
|
+
Check_Type(str, T_STRING);
|
1222
|
+
#ifdef HAVE_RUBY_ENCODING_H
|
1223
|
+
CONVERT_TO_UTF8(str);
|
1224
|
+
#endif
|
1225
|
+
WStr_allocWithUTF8L(&wstr, RSTRING_PTR(str), RSTRING_LEN(str));
|
1226
|
+
|
1227
|
+
return rb_ensure(get_text_elements_internal, (VALUE)&param,
|
1228
|
+
get_text_elements_ensure, (VALUE)&wstr);
|
1229
|
+
/* wstr will be freed in get_text_elements_ensure() */
|
1230
|
+
}
|
1231
|
+
|
885
1232
|
void
|
886
1233
|
Init_unicode_native()
|
887
1234
|
{
|
@@ -909,6 +1256,13 @@ Init_unicode_native()
|
|
909
1256
|
}
|
910
1257
|
}
|
911
1258
|
|
1259
|
+
for (i = 0; i < c_Cn + 1; i++) {
|
1260
|
+
catname_abbr[i] = ID2SYM(rb_intern(gencat_abbr[i]));
|
1261
|
+
catname_long[i] = ID2SYM(rb_intern(gencat_long[i]));
|
1262
|
+
rb_global_variable(&catname_abbr[i]);
|
1263
|
+
rb_global_variable(&catname_long[i]);
|
1264
|
+
}
|
1265
|
+
|
912
1266
|
rb_define_module_function(mUnicode, "strcmp",
|
913
1267
|
unicode_strcmp, 2);
|
914
1268
|
rb_define_module_function(mUnicode, "strcmp_compat",
|
@@ -957,6 +1311,15 @@ Init_unicode_native()
|
|
957
1311
|
rb_define_module_function(mUnicode, "capitalize",
|
958
1312
|
unicode_capitalize, 1);
|
959
1313
|
|
1314
|
+
rb_define_module_function(mUnicode, "categories",
|
1315
|
+
unicode_get_categories, 1);
|
1316
|
+
rb_define_module_function(mUnicode, "abbr_categories",
|
1317
|
+
unicode_get_abbr_categories, 1);
|
1318
|
+
rb_define_module_function(mUnicode, "width",
|
1319
|
+
unicode_wcswidth, -1);
|
1320
|
+
rb_define_module_function(mUnicode, "text_elements",
|
1321
|
+
unicode_get_text_elements, 1);
|
1322
|
+
|
960
1323
|
rb_define_const(mUnicode, "VERSION",
|
961
1324
|
rb_str_new2(UNICODE_VERSION));
|
962
1325
|
}
|