unicode 0.4.1-x86-mswin32-60 → 0.4.2-x86-mswin32-60
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README +15 -2
- data/ext/unicode/unicode.c +119 -1
- data/lib/unicode/1.8/unicode_native.so +0 -0
- data/lib/unicode/1.9/unicode_native.so +0 -0
- data/unicode.gemspec +1 -1
- metadata +4 -4
data/README
CHANGED
@@ -1,5 +1,5 @@
|
|
1
1
|
Unicode Library for Ruby
|
2
|
-
Version 0.4.1
|
2
|
+
Version 0.4.2
|
3
3
|
|
4
4
|
Yoshida Masato
|
5
5
|
|
@@ -15,7 +15,7 @@
|
|
15
15
|
- Install
|
16
16
|
|
17
17
|
This can work with ruby-1.8 or later. I recommend you to
|
18
|
-
use ruby-1.9.
|
18
|
+
use ruby-1.9.3 or later.
|
19
19
|
|
20
20
|
Make and install usually.
|
21
21
|
For example, when Ruby supports dynamic linking on your OS,
|
@@ -59,6 +59,11 @@
|
|
59
59
|
mapping in UnicodeData.txt and the Hangul decomposition
|
60
60
|
algorithm.
|
61
61
|
|
62
|
+
Unicode::decompose_safe(str)
|
63
|
+
Decompose Unicode string with a non-standard mapping.
|
64
|
+
It does not decompose the characters in
|
65
|
+
CompositionExclusions.txt.
|
66
|
+
|
62
67
|
Unicode::compose(str)
|
63
68
|
Compose Unicode string. Before composing, the trailing
|
64
69
|
characters are sorted in canonical order.
|
@@ -73,12 +78,19 @@
|
|
73
78
|
Normalize Unicode string in form D or form KD.
|
74
79
|
These are aliases of decompose/decompose_compat.
|
75
80
|
|
81
|
+
Unicode::normalize_D_safe(str) (Unicode::nfd_safe(str))
|
82
|
+
This is an alias of decompose_safe.
|
83
|
+
|
76
84
|
Unicode::normalize_C(str) (Unicode::nfc(str))
|
77
85
|
Unicode::normalize_KC(str) (Unicode::nfkc(str))
|
78
86
|
Normalize Unicode string in form C or form KC.
|
79
87
|
normalize_C = decompose + compose
|
80
88
|
normalize_KC = decompose_compat + compose
|
81
89
|
|
90
|
+
Unicode::normalize_C_safe(str) (Unicode::nfc_safe(str))
|
91
|
+
Normalize Unicode string with decompose_safe.
|
92
|
+
normalize_C_safe = decompose_safe + compose
|
93
|
+
|
82
94
|
Unicode::upcase(str)
|
83
95
|
Unicode::downcase(str)
|
84
96
|
Unicode::capitalize(str)
|
@@ -111,6 +123,7 @@
|
|
111
123
|
|
112
124
|
- History
|
113
125
|
|
126
|
+
Feb 29, 2012 version 0.4.2 add decompose_safe
|
114
127
|
Feb 3, 2012 version 0.4.1 update unidata.map for Unicode 6.1
|
115
128
|
Oct 14, 2010 version 0.4.0 fix the composition algorithm, and support Unicode 6.0
|
116
129
|
Feb 26, 2010 version 0.3.0 fix a capitalize bug and support SpecialCasing
|
data/ext/unicode/unicode.c
CHANGED
@@ -7,6 +7,8 @@
|
|
7
7
|
*
|
8
8
|
*/
|
9
9
|
|
10
|
+
#define UNICODE_VERSION "0.4.2"
|
11
|
+
|
10
12
|
#include "ruby.h"
|
11
13
|
#ifdef HAVE_RUBY_IO_H
|
12
14
|
# include "ruby/io.h"
|
@@ -86,6 +88,19 @@ get_canon(int ucs)
|
|
86
88
|
return NULL;
|
87
89
|
}
|
88
90
|
|
91
|
+
static const char*
|
92
|
+
get_canon_ex(int ucs)
|
93
|
+
{
|
94
|
+
VALUE ch = rb_hash_aref(unicode_data, INT2FIX(ucs));
|
95
|
+
|
96
|
+
if (!NIL_P(ch)) {
|
97
|
+
int i = FIX2INT(ch);
|
98
|
+
if (!unidata[i].exclusion)
|
99
|
+
return unidata[i].canon;
|
100
|
+
}
|
101
|
+
return NULL;
|
102
|
+
}
|
103
|
+
|
89
104
|
static const char*
|
90
105
|
get_compat(int ucs)
|
91
106
|
{
|
@@ -216,7 +231,41 @@ decompose_internal(WString* ustr, WString* result)
|
|
216
231
|
}
|
217
232
|
|
218
233
|
/*
|
219
|
-
* push
|
234
|
+
* push decomposed str into result
|
235
|
+
*/
|
236
|
+
static WString*
|
237
|
+
decompose_safe_internal(WString* ustr, WString* result)
|
238
|
+
{
|
239
|
+
int i;
|
240
|
+
int len = ustr->len;
|
241
|
+
|
242
|
+
for (i = 0; i < len; i++) {
|
243
|
+
int ucs = ustr->str[i];
|
244
|
+
if (ucs >= SBASE && ucs < SBASE + SCOUNT) {
|
245
|
+
int l, v, t;
|
246
|
+
decompose_hangul(ucs, &l, &v, &t);
|
247
|
+
WStr_addWChar(result, l);
|
248
|
+
if (v) WStr_addWChar(result, v);
|
249
|
+
if (t) WStr_addWChar(result, t);
|
250
|
+
}
|
251
|
+
else {
|
252
|
+
const char* dc = get_canon_ex(ucs);
|
253
|
+
if (!dc) {
|
254
|
+
WStr_addWChar(result, ucs);
|
255
|
+
}
|
256
|
+
else {
|
257
|
+
WString wdc;
|
258
|
+
WStr_allocWithUTF8(&wdc, dc);
|
259
|
+
decompose_safe_internal(&wdc, result);
|
260
|
+
WStr_free(&wdc);
|
261
|
+
}
|
262
|
+
}
|
263
|
+
}
|
264
|
+
return result;
|
265
|
+
}
|
266
|
+
|
267
|
+
/*
|
268
|
+
* push compatibility decomposed str into result
|
220
269
|
*/
|
221
270
|
static WString*
|
222
271
|
decompose_compat_internal(WString* ustr, WString* result)
|
@@ -582,6 +631,32 @@ unicode_decompose(VALUE obj, VALUE str)
|
|
582
631
|
return vret;
|
583
632
|
}
|
584
633
|
|
634
|
+
static VALUE
|
635
|
+
unicode_decompose_safe(VALUE obj, VALUE str)
|
636
|
+
{
|
637
|
+
WString ustr;
|
638
|
+
WString result;
|
639
|
+
UString ret;
|
640
|
+
VALUE vret;
|
641
|
+
|
642
|
+
Check_Type(str, T_STRING);
|
643
|
+
#ifdef HAVE_RUBY_ENCODING_H
|
644
|
+
CONVERT_TO_UTF8(str);
|
645
|
+
#endif
|
646
|
+
WStr_allocWithUTF8(&ustr, RSTRING_PTR(str));
|
647
|
+
WStr_alloc(&result);
|
648
|
+
decompose_safe_internal(&ustr, &result);
|
649
|
+
WStr_free(&ustr);
|
650
|
+
sort_canonical(&result);
|
651
|
+
UniStr_alloc(&ret);
|
652
|
+
WStr_convertIntoUString(&result, &ret);
|
653
|
+
WStr_free(&result);
|
654
|
+
vret = TO_(str, ENC_(rb_str_new((char*)ret.str, ret.len)));
|
655
|
+
UniStr_free(&ret);
|
656
|
+
|
657
|
+
return vret;
|
658
|
+
}
|
659
|
+
|
585
660
|
static VALUE
|
586
661
|
unicode_decompose_compat(VALUE obj, VALUE str)
|
587
662
|
{
|
@@ -664,6 +739,36 @@ unicode_normalize_C(VALUE obj, VALUE str)
|
|
664
739
|
return vret;
|
665
740
|
}
|
666
741
|
|
742
|
+
static VALUE
|
743
|
+
unicode_normalize_safe(VALUE obj, VALUE str)
|
744
|
+
{
|
745
|
+
WString ustr1;
|
746
|
+
WString ustr2;
|
747
|
+
WString result;
|
748
|
+
UString ret;
|
749
|
+
VALUE vret;
|
750
|
+
|
751
|
+
Check_Type(str, T_STRING);
|
752
|
+
#ifdef HAVE_RUBY_ENCODING_H
|
753
|
+
CONVERT_TO_UTF8(str);
|
754
|
+
#endif
|
755
|
+
WStr_allocWithUTF8(&ustr1, RSTRING_PTR(str));
|
756
|
+
WStr_alloc(&ustr2);
|
757
|
+
decompose_safe_internal(&ustr1, &ustr2);
|
758
|
+
WStr_free(&ustr1);
|
759
|
+
sort_canonical(&ustr2);
|
760
|
+
WStr_alloc(&result);
|
761
|
+
compose_internal(&ustr2, &result);
|
762
|
+
WStr_free(&ustr2);
|
763
|
+
UniStr_alloc(&ret);
|
764
|
+
WStr_convertIntoUString(&result, &ret);
|
765
|
+
WStr_free(&result);
|
766
|
+
vret = TO_(str, ENC_(rb_str_new((char*)ret.str, ret.len)));
|
767
|
+
UniStr_free(&ret);
|
768
|
+
|
769
|
+
return vret;
|
770
|
+
}
|
771
|
+
|
667
772
|
static VALUE
|
668
773
|
unicode_normalize_KC(VALUE obj, VALUE str)
|
669
774
|
{
|
@@ -811,6 +916,8 @@ Init_unicode_native()
|
|
811
916
|
|
812
917
|
rb_define_module_function(mUnicode, "decompose",
|
813
918
|
unicode_decompose, 1);
|
919
|
+
rb_define_module_function(mUnicode, "decompose_safe",
|
920
|
+
unicode_decompose_safe, 1);
|
814
921
|
rb_define_module_function(mUnicode, "decompose_compat",
|
815
922
|
unicode_decompose_compat, 1);
|
816
923
|
rb_define_module_function(mUnicode, "compose",
|
@@ -818,20 +925,28 @@ Init_unicode_native()
|
|
818
925
|
|
819
926
|
rb_define_module_function(mUnicode, "normalize_D",
|
820
927
|
unicode_decompose, 1);
|
928
|
+
rb_define_module_function(mUnicode, "normalize_D_safe",
|
929
|
+
unicode_decompose_safe, 1);
|
821
930
|
rb_define_module_function(mUnicode, "normalize_KD",
|
822
931
|
unicode_decompose_compat, 1);
|
823
932
|
rb_define_module_function(mUnicode, "normalize_C",
|
824
933
|
unicode_normalize_C, 1);
|
934
|
+
rb_define_module_function(mUnicode, "normalize_C_safe",
|
935
|
+
unicode_normalize_safe, 1);
|
825
936
|
rb_define_module_function(mUnicode, "normalize_KC",
|
826
937
|
unicode_normalize_KC, 1);
|
827
938
|
|
828
939
|
/* aliases */
|
829
940
|
rb_define_module_function(mUnicode, "nfd",
|
830
941
|
unicode_decompose, 1);
|
942
|
+
rb_define_module_function(mUnicode, "nfd_safe",
|
943
|
+
unicode_decompose_safe, 1);
|
831
944
|
rb_define_module_function(mUnicode, "nfkd",
|
832
945
|
unicode_decompose_compat, 1);
|
833
946
|
rb_define_module_function(mUnicode, "nfc",
|
834
947
|
unicode_normalize_C, 1);
|
948
|
+
rb_define_module_function(mUnicode, "nfc_safe",
|
949
|
+
unicode_normalize_safe, 1);
|
835
950
|
rb_define_module_function(mUnicode, "nfkc",
|
836
951
|
unicode_normalize_KC, 1);
|
837
952
|
|
@@ -841,4 +956,7 @@ Init_unicode_native()
|
|
841
956
|
unicode_downcase, 1);
|
842
957
|
rb_define_module_function(mUnicode, "capitalize",
|
843
958
|
unicode_capitalize, 1);
|
959
|
+
|
960
|
+
rb_define_const(mUnicode, "VERSION",
|
961
|
+
rb_str_new2(UNICODE_VERSION));
|
844
962
|
}
|
Binary file
|
Binary file
|
data/unicode.gemspec
CHANGED
metadata
CHANGED
@@ -1,13 +1,13 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: unicode
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
hash:
|
4
|
+
hash: 11
|
5
5
|
prerelease:
|
6
6
|
segments:
|
7
7
|
- 0
|
8
8
|
- 4
|
9
|
-
- 1
|
10
|
-
version: 0.4.1
|
9
|
+
- 2
|
10
|
+
version: 0.4.2
|
11
11
|
platform: x86-mswin32-60
|
12
12
|
authors:
|
13
13
|
- Yoshida Masato
|
@@ -73,7 +73,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
73
73
|
requirements: []
|
74
74
|
|
75
75
|
rubyforge_project:
|
76
|
-
rubygems_version: 1.8.
|
76
|
+
rubygems_version: 1.8.17
|
77
77
|
signing_key:
|
78
78
|
specification_version: 3
|
79
79
|
summary: Unicode normalization library.
|