ninjudd-icunicode 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.rdoc +43 -0
- data/VERSION.yml +4 -0
- data/ext/extconf.rb +4 -0
- data/ext/icunicode.c +85 -0
- data/test/icunicode_test.rb +7 -0
- data/test/test_helper.rb +10 -0
- metadata +60 -0
data/README.rdoc
ADDED
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
= ICUnicode
|
|
2
|
+
|
|
3
|
+
Unicode sorting is complicated (http://unicode.org/reports/tr10), and Ruby doesn't do it
|
|
4
|
+
correctly. But there is a widely-used implementation of the Unicode collation algorithm in
|
|
5
|
+
the ICU (International Components for Unicode) libraries. There is also no way to do
|
|
6
|
+
Transliteration in Ruby (http://userguide.icu-project.org/transforms/general). This gem is
|
|
7
|
+
a simple C wrapper around ucol_getSortKey from the ICU Collation API and utrans_transUChars
|
|
8
|
+
from the ICU Transliteration API. These are added as simple methods on String.
|
|
9
|
+
|
|
10
|
+
== Usage:
|
|
11
|
+
|
|
12
|
+
["cafe", "cafes", "caf\303\251"].sort
|
|
13
|
+
=> ["cafe", "cafes", "caf\303\251"]
|
|
14
|
+
|
|
15
|
+
require 'icunicode'
|
|
16
|
+
|
|
17
|
+
["cafe", "cafes", "caf\303\251"].sort_by {|s| s.unicode_sort_key}
|
|
18
|
+
=> ["cafe", "caf\303\251", "cafes"]
|
|
19
|
+
|
|
20
|
+
"blueberry".transliterate("Katakana").transliterate("Latin")
|
|
21
|
+
=> "burueberrui"
|
|
22
|
+
|
|
23
|
+
"blueberry".transliterate("Greek").transliterate("Latin")
|
|
24
|
+
=> "blyeberry"
|
|
25
|
+
|
|
26
|
+
== Install:
|
|
27
|
+
|
|
28
|
+
You must install ICU first. You can download the source from http://site.icu-project.org/download,
|
|
29
|
+
or on Mac, you can install with MacPorts:
|
|
30
|
+
|
|
31
|
+
sudo port install icu
|
|
32
|
+
|
|
33
|
+
Then install the gem:
|
|
34
|
+
|
|
35
|
+
sudo gem install ninjudd-icunicode -s http://gems.github.com
|
|
36
|
+
|
|
37
|
+
== To do:
|
|
38
|
+
|
|
39
|
+
Add support for locales other than en-US. Increase buffer size or make it grow dynamically.
|
|
40
|
+
|
|
41
|
+
== License:
|
|
42
|
+
|
|
43
|
+
Copyright (c) 2009 Justin Balthrop, Geni.com; Published under The MIT License, see LICENSE
|
data/VERSION.yml
ADDED
data/ext/extconf.rb
ADDED
data/ext/icunicode.c
ADDED
|
@@ -0,0 +1,85 @@
|
|
|
1
|
+
#include "ruby.h"
|
|
2
|
+
#include "unicode/ucol.h"
|
|
3
|
+
#include "unicode/utrans.h"
|
|
4
|
+
#include "unicode/ustring.h"
|
|
5
|
+
#include "unicode/ustdio.h"
|
|
6
|
+
|
|
7
|
+
#define BUF_SIZE 1000
|
|
8
|
+
|
|
9
|
+
/*
 * Converts a Ruby string (expected to hold UTF-8 bytes) to UTF-16.
 *
 *   string - the Ruby object to convert; coerced with StringValue first.
 *   ustr   - destination buffer; must have room for BUF_SIZE UChars.
 *   ulen   - receives the number of UTF-16 units written to ustr.
 *
 * If the conversion fails for any reason (malformed UTF-8, or a result that
 * does not fit in BUF_SIZE units) *ulen is set to 0, so callers always see a
 * length that lies inside ustr.
 */
static void to_utf16(VALUE string, UChar *ustr, int32_t *ulen) {
  UErrorCode status = U_ZERO_ERROR;

  string = StringValue(string);
  u_strFromUTF8(ustr, BUF_SIZE, ulen, RSTRING_PTR(string), (int32_t)RSTRING_LEN(string), &status);

  /*
   * Write through the pointer: assigning to `ulen` itself would only change
   * the local copy. U_FAILURE also covers U_BUFFER_OVERFLOW_ERROR, in which
   * case ICU reports the *required* length, which is larger than the buffer.
   */
  if (U_FAILURE(status)) *ulen = 0;
}
|
|
16
|
+
|
|
17
|
+
/*
 * Converts ulen UTF-16 units starting at ustr into a new Ruby string holding
 * the UTF-8 encoding.
 *
 * The required byte length is measured first (ICU "preflighting") and the
 * Ruby string is allocated at exactly that size, so the result is not limited
 * to BUF_SIZE bytes: a UTF-16 unit can expand to as many as three UTF-8 bytes.
 *
 * Returns an empty string if ICU reports a conversion error.
 */
static VALUE to_utf8(UChar *ustr, int32_t ulen) {
  UErrorCode status = U_ZERO_ERROR;
  int32_t len = 0;
  VALUE result;

  /* Preflight: a zero-capacity destination makes ICU only compute the length. */
  u_strToUTF8(NULL, 0, &len, ustr, ulen, &status);
  if (status != U_BUFFER_OVERFLOW_ERROR && U_FAILURE(status)) return rb_str_new("", 0);
  if (len <= 0) return rb_str_new("", 0);

  /* Convert straight into the Ruby string's own storage. */
  status = U_ZERO_ERROR;
  result = rb_str_new(NULL, len);
  u_strToUTF8(RSTRING_PTR(result), len, &len, ustr, ulen, &status);
  if (U_FAILURE(status)) return rb_str_new("", 0);

  return result;
}
|
|
26
|
+
|
|
27
|
+
/*
|
|
28
|
+
* call-seq:
|
|
29
|
+
* string.unicode_sort_key -> string
|
|
30
|
+
*
|
|
31
|
+
* Returns a string that will sort according to the Unicode collation algorithm.
|
|
32
|
+
*
|
|
33
|
+
*/
|
|
34
|
+
static VALUE unicode_sort_key(VALUE string) {
|
|
35
|
+
char str[BUF_SIZE];
|
|
36
|
+
UChar ustr[BUF_SIZE];
|
|
37
|
+
int32_t len = 0;
|
|
38
|
+
int32_t ulen = 0;
|
|
39
|
+
UErrorCode status = U_ZERO_ERROR;
|
|
40
|
+
UCollator *col;
|
|
41
|
+
|
|
42
|
+
to_utf16(string, ustr, &ulen);
|
|
43
|
+
|
|
44
|
+
col = ucol_open("en_US", &status);
|
|
45
|
+
if (U_SUCCESS(status)) {
|
|
46
|
+
len = ucol_getSortKey(col, ustr, ulen, (uint8_t*)str, BUF_SIZE);
|
|
47
|
+
ucol_close(col);
|
|
48
|
+
}
|
|
49
|
+
|
|
50
|
+
return rb_str_new(str, len - 1);
|
|
51
|
+
}
|
|
52
|
+
|
|
53
|
+
/*
|
|
54
|
+
* call-seq:
|
|
55
|
+
* string.transliterate(transform) -> string
|
|
56
|
+
*
|
|
57
|
+
* Transliterates string using transform.
|
|
58
|
+
*
|
|
59
|
+
*/
|
|
60
|
+
static VALUE unicode_transliterate(VALUE string, VALUE transform) {
|
|
61
|
+
UChar str[BUF_SIZE];
|
|
62
|
+
UChar trn[BUF_SIZE];
|
|
63
|
+
int32_t slen = 0;
|
|
64
|
+
int32_t tlen = 0;
|
|
65
|
+
UErrorCode status = U_ZERO_ERROR;
|
|
66
|
+
UTransliterator *trans;
|
|
67
|
+
|
|
68
|
+
to_utf16(string, str, &slen);
|
|
69
|
+
to_utf16(transform, trn, &tlen);
|
|
70
|
+
|
|
71
|
+
trans = utrans_openU(trn, tlen, UTRANS_FORWARD, NULL, 0, NULL, &status);
|
|
72
|
+
if (trans) {
|
|
73
|
+
utrans_transUChars(trans, str, &slen, BUF_SIZE, 0, &slen, &status);
|
|
74
|
+
utrans_close(trans);
|
|
75
|
+
} else {
|
|
76
|
+
rb_raise(rb_eArgError, "invalid transform: %s", RSTRING_PTR(transform));
|
|
77
|
+
}
|
|
78
|
+
|
|
79
|
+
to_utf8(str, slen);
|
|
80
|
+
}
|
|
81
|
+
|
|
82
|
+
void Init_icunicode() {
|
|
83
|
+
rb_define_method(rb_cString, "unicode_sort_key", unicode_sort_key, 0);
|
|
84
|
+
rb_define_method(rb_cString, "transliterate", unicode_transliterate, 1);
|
|
85
|
+
}
|
data/test/test_helper.rb
ADDED
metadata
ADDED
|
@@ -0,0 +1,60 @@
|
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
|
2
|
+
name: ninjudd-icunicode
|
|
3
|
+
version: !ruby/object:Gem::Version
|
|
4
|
+
version: 0.0.1
|
|
5
|
+
platform: ruby
|
|
6
|
+
authors:
|
|
7
|
+
- Justin Balthrop
|
|
8
|
+
autorequire:
|
|
9
|
+
bindir: bin
|
|
10
|
+
cert_chain: []
|
|
11
|
+
|
|
12
|
+
date: 2009-08-15 00:00:00 -07:00
|
|
13
|
+
default_executable:
|
|
14
|
+
dependencies: []
|
|
15
|
+
|
|
16
|
+
description: ICU Unicode Transliteration and Collation in Ruby.
|
|
17
|
+
email: code@justinbalthrop.com
|
|
18
|
+
executables: []
|
|
19
|
+
|
|
20
|
+
extensions:
|
|
21
|
+
- ext/extconf.rb
|
|
22
|
+
extra_rdoc_files: []
|
|
23
|
+
|
|
24
|
+
files:
|
|
25
|
+
- README.rdoc
|
|
26
|
+
- VERSION.yml
|
|
27
|
+
- ext/icunicode.c
|
|
28
|
+
- ext/extconf.rb
|
|
29
|
+
- test/test_helper.rb
|
|
30
|
+
- test/icunicode_test.rb
|
|
31
|
+
has_rdoc: true
|
|
32
|
+
homepage: http://github.com/ninjudd/unicode_collation
|
|
33
|
+
licenses:
|
|
34
|
+
post_install_message:
|
|
35
|
+
rdoc_options:
|
|
36
|
+
- --inline-source
|
|
37
|
+
- --charset=UTF-8
|
|
38
|
+
require_paths:
|
|
39
|
+
- ext
|
|
40
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
|
41
|
+
requirements:
|
|
42
|
+
- - ">="
|
|
43
|
+
- !ruby/object:Gem::Version
|
|
44
|
+
version: "0"
|
|
45
|
+
version:
|
|
46
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
|
47
|
+
requirements:
|
|
48
|
+
- - ">="
|
|
49
|
+
- !ruby/object:Gem::Version
|
|
50
|
+
version: "0"
|
|
51
|
+
version:
|
|
52
|
+
requirements: []
|
|
53
|
+
|
|
54
|
+
rubyforge_project:
|
|
55
|
+
rubygems_version: 1.3.5
|
|
56
|
+
signing_key:
|
|
57
|
+
specification_version: 2
|
|
58
|
+
summary: Unicode Transliteration and Collation in Ruby.
|
|
59
|
+
test_files: []
|
|
60
|
+
|