unicode 0.4.1 → 0.4.2
Sign up to get free protection for your applications and to get access to all the features.
- data/README +15 -2
- data/unicode.c +118 -0
- metadata +4 -4
data/README
CHANGED
@@ -1,5 +1,5 @@
|
|
1
1
|
Unicode Library for Ruby
|
2
|
-
Version 0.4.1
|
2
|
+
Version 0.4.2
|
3
3
|
|
4
4
|
Yoshida Masato
|
5
5
|
|
@@ -15,7 +15,7 @@
|
|
15
15
|
- Install
|
16
16
|
|
17
17
|
This works with ruby-1.8 or later. I recommend that you
|
18
|
-
use ruby-1.9.
|
18
|
+
use ruby-1.9.3 or later.
|
19
19
|
|
20
20
|
Make and install usually.
|
21
21
|
For example, when Ruby supports dynamic linking on your OS,
|
@@ -59,6 +59,11 @@
|
|
59
59
|
mapping in UnicodeData.txt and the Hangul decomposition
|
60
60
|
algorithm.
|
61
61
|
|
62
|
+
Unicode::decompose_safe(str)
|
63
|
+
Decompose Unicode string with a non-standard mapping.
|
64
|
+
It does not decompose the characters in
|
65
|
+
CompositionExclusions.txt.
|
66
|
+
|
62
67
|
Unicode::compose(str)
|
63
68
|
Compose Unicode string. Before composing, the trailing
|
64
69
|
characters are sorted in canonical order.
|
@@ -73,12 +78,19 @@
|
|
73
78
|
Normalize Unicode string in form D or form KD.
|
74
79
|
These are aliases of decompose/decompose_compat.
|
75
80
|
|
81
|
+
Unicode::normalize_D_safe(str) (Unicode::nfd_safe(str))
|
82
|
+
This is an alias of decompose_safe.
|
83
|
+
|
76
84
|
Unicode::normalize_C(str) (Unicode::nfc(str))
|
77
85
|
Unicode::normalize_KC(str) (Unicode::nfkc(str))
|
78
86
|
Normalize Unicode string in form C or form KC.
|
79
87
|
normalize_C = decompose + compose
|
80
88
|
normalize_KC = decompose_compat + compose
|
81
89
|
|
90
|
+
Unicode::normalize_C_safe(str) (Unicode::nfc_safe(str))
|
91
|
+
Normalize Unicode string with decompose_safe.
|
92
|
+
normalize_C_safe = decompose_safe + compose
|
93
|
+
|
82
94
|
Unicode::upcase(str)
|
83
95
|
Unicode::downcase(str)
|
84
96
|
Unicode::capitalize(str)
|
@@ -111,6 +123,7 @@
|
|
111
123
|
|
112
124
|
- History
|
113
125
|
|
126
|
+
Feb 29, 2012 version 0.4.2 add decompose_safe
|
114
127
|
Feb 3, 2012 version 0.4.1 update unidata.map for Unicode 6.1
|
115
128
|
Oct 14, 2010 version 0.4.0 fix the composition algorithm, and support Unicode 6.0
|
116
129
|
Feb 26, 2010 version 0.3.0 fix a capitalize bug and support SpecialCasing
|
data/unicode.c
CHANGED
@@ -7,6 +7,8 @@
|
|
7
7
|
*
|
8
8
|
*/
|
9
9
|
|
10
|
+
#define UNICODE_VERSION "0.4.2"
|
11
|
+
|
10
12
|
#include "ruby.h"
|
11
13
|
#ifdef HAVE_RUBY_IO_H
|
12
14
|
# include "ruby/io.h"
|
@@ -86,6 +88,19 @@ get_canon(int ucs)
|
|
86
88
|
return NULL;
|
87
89
|
}
|
88
90
|
|
91
|
+
static const char*
|
92
|
+
get_canon_ex(int ucs)
|
93
|
+
{
|
94
|
+
VALUE ch = rb_hash_aref(unicode_data, INT2FIX(ucs));
|
95
|
+
|
96
|
+
if (!NIL_P(ch)) {
|
97
|
+
int i = FIX2INT(ch);
|
98
|
+
if (!unidata[i].exclusion)
|
99
|
+
return unidata[i].canon;
|
100
|
+
}
|
101
|
+
return NULL;
|
102
|
+
}
|
103
|
+
|
89
104
|
static const char*
|
90
105
|
get_compat(int ucs)
|
91
106
|
{
|
@@ -215,6 +230,40 @@ decompose_internal(WString* ustr, WString* result)
|
|
215
230
|
return result;
|
216
231
|
}
|
217
232
|
|
233
|
+
/*
|
234
|
+
* push decomposed str into result
|
235
|
+
*/
|
236
|
+
static WString*
|
237
|
+
decompose_safe_internal(WString* ustr, WString* result)
|
238
|
+
{
|
239
|
+
int i;
|
240
|
+
int len = ustr->len;
|
241
|
+
|
242
|
+
for (i = 0; i < len; i++) {
|
243
|
+
int ucs = ustr->str[i];
|
244
|
+
if (ucs >= SBASE && ucs < SBASE + SCOUNT) {
|
245
|
+
int l, v, t;
|
246
|
+
decompose_hangul(ucs, &l, &v, &t);
|
247
|
+
WStr_addWChar(result, l);
|
248
|
+
if (v) WStr_addWChar(result, v);
|
249
|
+
if (t) WStr_addWChar(result, t);
|
250
|
+
}
|
251
|
+
else {
|
252
|
+
const char* dc = get_canon_ex(ucs);
|
253
|
+
if (!dc) {
|
254
|
+
WStr_addWChar(result, ucs);
|
255
|
+
}
|
256
|
+
else {
|
257
|
+
WString wdc;
|
258
|
+
WStr_allocWithUTF8(&wdc, dc);
|
259
|
+
decompose_safe_internal(&wdc, result);
|
260
|
+
WStr_free(&wdc);
|
261
|
+
}
|
262
|
+
}
|
263
|
+
}
|
264
|
+
return result;
|
265
|
+
}
|
266
|
+
|
218
267
|
/*
|
219
268
|
* push compatibility decomposed str into result
|
220
269
|
*/
|
@@ -582,6 +631,32 @@ unicode_decompose(VALUE obj, VALUE str)
|
|
582
631
|
return vret;
|
583
632
|
}
|
584
633
|
|
634
|
+
static VALUE
|
635
|
+
unicode_decompose_safe(VALUE obj, VALUE str)
|
636
|
+
{
|
637
|
+
WString ustr;
|
638
|
+
WString result;
|
639
|
+
UString ret;
|
640
|
+
VALUE vret;
|
641
|
+
|
642
|
+
Check_Type(str, T_STRING);
|
643
|
+
#ifdef HAVE_RUBY_ENCODING_H
|
644
|
+
CONVERT_TO_UTF8(str);
|
645
|
+
#endif
|
646
|
+
WStr_allocWithUTF8(&ustr, RSTRING_PTR(str));
|
647
|
+
WStr_alloc(&result);
|
648
|
+
decompose_safe_internal(&ustr, &result);
|
649
|
+
WStr_free(&ustr);
|
650
|
+
sort_canonical(&result);
|
651
|
+
UniStr_alloc(&ret);
|
652
|
+
WStr_convertIntoUString(&result, &ret);
|
653
|
+
WStr_free(&result);
|
654
|
+
vret = TO_(str, ENC_(rb_str_new((char*)ret.str, ret.len)));
|
655
|
+
UniStr_free(&ret);
|
656
|
+
|
657
|
+
return vret;
|
658
|
+
}
|
659
|
+
|
585
660
|
static VALUE
|
586
661
|
unicode_decompose_compat(VALUE obj, VALUE str)
|
587
662
|
{
|
@@ -664,6 +739,36 @@ unicode_normalize_C(VALUE obj, VALUE str)
|
|
664
739
|
return vret;
|
665
740
|
}
|
666
741
|
|
742
|
+
static VALUE
|
743
|
+
unicode_normalize_safe(VALUE obj, VALUE str)
|
744
|
+
{
|
745
|
+
WString ustr1;
|
746
|
+
WString ustr2;
|
747
|
+
WString result;
|
748
|
+
UString ret;
|
749
|
+
VALUE vret;
|
750
|
+
|
751
|
+
Check_Type(str, T_STRING);
|
752
|
+
#ifdef HAVE_RUBY_ENCODING_H
|
753
|
+
CONVERT_TO_UTF8(str);
|
754
|
+
#endif
|
755
|
+
WStr_allocWithUTF8(&ustr1, RSTRING_PTR(str));
|
756
|
+
WStr_alloc(&ustr2);
|
757
|
+
decompose_safe_internal(&ustr1, &ustr2);
|
758
|
+
WStr_free(&ustr1);
|
759
|
+
sort_canonical(&ustr2);
|
760
|
+
WStr_alloc(&result);
|
761
|
+
compose_internal(&ustr2, &result);
|
762
|
+
WStr_free(&ustr2);
|
763
|
+
UniStr_alloc(&ret);
|
764
|
+
WStr_convertIntoUString(&result, &ret);
|
765
|
+
WStr_free(&result);
|
766
|
+
vret = TO_(str, ENC_(rb_str_new((char*)ret.str, ret.len)));
|
767
|
+
UniStr_free(&ret);
|
768
|
+
|
769
|
+
return vret;
|
770
|
+
}
|
771
|
+
|
667
772
|
static VALUE
|
668
773
|
unicode_normalize_KC(VALUE obj, VALUE str)
|
669
774
|
{
|
@@ -811,6 +916,8 @@ Init_unicode()
|
|
811
916
|
|
812
917
|
rb_define_module_function(mUnicode, "decompose",
|
813
918
|
unicode_decompose, 1);
|
919
|
+
rb_define_module_function(mUnicode, "decompose_safe",
|
920
|
+
unicode_decompose_safe, 1);
|
814
921
|
rb_define_module_function(mUnicode, "decompose_compat",
|
815
922
|
unicode_decompose_compat, 1);
|
816
923
|
rb_define_module_function(mUnicode, "compose",
|
@@ -818,20 +925,28 @@ Init_unicode()
|
|
818
925
|
|
819
926
|
rb_define_module_function(mUnicode, "normalize_D",
|
820
927
|
unicode_decompose, 1);
|
928
|
+
rb_define_module_function(mUnicode, "normalize_D_safe",
|
929
|
+
unicode_decompose_safe, 1);
|
821
930
|
rb_define_module_function(mUnicode, "normalize_KD",
|
822
931
|
unicode_decompose_compat, 1);
|
823
932
|
rb_define_module_function(mUnicode, "normalize_C",
|
824
933
|
unicode_normalize_C, 1);
|
934
|
+
rb_define_module_function(mUnicode, "normalize_C_safe",
|
935
|
+
unicode_normalize_safe, 1);
|
825
936
|
rb_define_module_function(mUnicode, "normalize_KC",
|
826
937
|
unicode_normalize_KC, 1);
|
827
938
|
|
828
939
|
/* aliases */
|
829
940
|
rb_define_module_function(mUnicode, "nfd",
|
830
941
|
unicode_decompose, 1);
|
942
|
+
rb_define_module_function(mUnicode, "nfd_safe",
|
943
|
+
unicode_decompose_safe, 1);
|
831
944
|
rb_define_module_function(mUnicode, "nfkd",
|
832
945
|
unicode_decompose_compat, 1);
|
833
946
|
rb_define_module_function(mUnicode, "nfc",
|
834
947
|
unicode_normalize_C, 1);
|
948
|
+
rb_define_module_function(mUnicode, "nfc_safe",
|
949
|
+
unicode_normalize_safe, 1);
|
835
950
|
rb_define_module_function(mUnicode, "nfkc",
|
836
951
|
unicode_normalize_KC, 1);
|
837
952
|
|
@@ -841,4 +956,7 @@ Init_unicode()
|
|
841
956
|
unicode_downcase, 1);
|
842
957
|
rb_define_module_function(mUnicode, "capitalize",
|
843
958
|
unicode_capitalize, 1);
|
959
|
+
|
960
|
+
rb_define_const(mUnicode, "VERSION",
|
961
|
+
rb_str_new2(UNICODE_VERSION));
|
844
962
|
}
|
metadata
CHANGED
@@ -1,13 +1,13 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: unicode
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
hash:
|
4
|
+
hash: 11
|
5
5
|
prerelease:
|
6
6
|
segments:
|
7
7
|
- 0
|
8
8
|
- 4
|
9
|
-
- 1
|
10
|
-
version: 0.4.1
|
9
|
+
- 2
|
10
|
+
version: 0.4.2
|
11
11
|
platform: ruby
|
12
12
|
authors:
|
13
13
|
- Yoshida Masato
|
@@ -67,7 +67,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
67
67
|
requirements: []
|
68
68
|
|
69
69
|
rubyforge_project:
|
70
|
-
rubygems_version: 1.8.
|
70
|
+
rubygems_version: 1.8.17
|
71
71
|
signing_key:
|
72
72
|
specification_version: 3
|
73
73
|
summary: Unicode normalization library.
|