icunicode 0.1.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +6 -0
- data/LICENSE +20 -0
- data/README.rdoc +43 -0
- data/Rakefile +63 -0
- data/VERSION.yml +5 -0
- data/ext/.gitignore +5 -0
- data/ext/extconf.rb +4 -0
- data/ext/icunicode.c +114 -0
- data/icunicode.gemspec +53 -0
- data/test/icunicode_test.rb +16 -0
- data/test/test_helper.rb +10 -0
- metadata +79 -0
data/.gitignore
ADDED
data/LICENSE
ADDED
@@ -0,0 +1,20 @@
|
|
1
|
+
Copyright (c) 2008 Justin Balthrop
|
2
|
+
|
3
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
4
|
+
a copy of this software and associated documentation files (the
|
5
|
+
"Software"), to deal in the Software without restriction, including
|
6
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
7
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
8
|
+
permit persons to whom the Software is furnished to do so, subject to
|
9
|
+
the following conditions:
|
10
|
+
|
11
|
+
The above copyright notice and this permission notice shall be
|
12
|
+
included in all copies or substantial portions of the Software.
|
13
|
+
|
14
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
15
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
16
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
17
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
18
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
19
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
20
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README.rdoc
ADDED
@@ -0,0 +1,43 @@
|
|
1
|
+
= ICUnicode
|
2
|
+
|
3
|
+
Unicode sorting is complicated (http://unicode.org/reports/tr10), and Ruby doesn't do it
|
4
|
+
correctly. But there is a widely-used implementation of the Unicode collation algorithm in
|
5
|
+
the ICU (International Components for Unicode) libraries. There is also no way to do
|
6
|
+
Transliteration in Ruby (http://userguide.icu-project.org/transforms/general). This gem is
|
7
|
+
a simple C wrapper around ucol_getSortKey from the ICU Collation API and utrans_transUChars
|
8
|
+
from the ICU Transliteration API. These are added as simple methods on String.
|
9
|
+
|
10
|
+
== Usage:
|
11
|
+
|
12
|
+
["cafe", "cafes", "caf\303\251"].sort
|
13
|
+
=> ["cafe", "cafes", "caf\303\251"]
|
14
|
+
|
15
|
+
require 'icunicode'
|
16
|
+
|
17
|
+
["cafe", "cafes", "caf\303\251"].sort_by {|s| s.unicode_sort_key}
|
18
|
+
=> ["cafe", "caf\303\251", "cafes"]
|
19
|
+
|
20
|
+
"blueberry".transliterate("Katakana").transliterate("Latin")
|
21
|
+
=> "burueberrui"
|
22
|
+
|
23
|
+
"blueberry".transliterate("Greek").transliterate("Latin")
|
24
|
+
=> "blyeberry"
|
25
|
+
|
26
|
+
== Install:
|
27
|
+
|
28
|
+
You must install ICU first. You can download the source from http://site.icu-project.org/download,
|
29
|
+
or on Mac, you can install with MacPorts:
|
30
|
+
|
31
|
+
sudo port install icu
|
32
|
+
|
33
|
+
Then install the gem:
|
34
|
+
|
35
|
+
sudo gem install icunicode
|
36
|
+
|
37
|
+
== To do:
|
38
|
+
|
39
|
+
Add support for locales other than en-US. Increase buffer size or make it grow dynamically.
|
40
|
+
|
41
|
+
== License:
|
42
|
+
|
43
|
+
Copyright (c) 2009 Justin Balthrop, Geni.com; Published under The MIT License, see LICENSE
|
data/Rakefile
ADDED
@@ -0,0 +1,63 @@
|
|
1
|
+
require 'rubygems'
|
2
|
+
require 'rake'
|
3
|
+
|
4
|
+
begin
|
5
|
+
require 'jeweler'
|
6
|
+
Jeweler::Tasks.new do |gem|
|
7
|
+
gem.name = "icunicode"
|
8
|
+
gem.summary = %Q{Unicode Transliteration and Collation in Ruby.}
|
9
|
+
gem.description = "ICU Unicode Transliteration and Collation in Ruby."
|
10
|
+
gem.email = "code@justinbalthrop.com"
|
11
|
+
gem.homepage = "http://github.com/ninjudd/icunicode"
|
12
|
+
gem.authors = ["Justin Balthrop"]
|
13
|
+
# gem is a Gem::Specification... see http://www.rubygems.org/read/chapter/20 for additional settings
|
14
|
+
end
|
15
|
+
Jeweler::GemcutterTasks.new
|
16
|
+
rescue LoadError
|
17
|
+
puts "Jeweler (or a dependency) not available. Install it with: sudo gem install jeweler"
|
18
|
+
end
|
19
|
+
|
20
|
+
require 'rake/testtask'
|
21
|
+
Rake::TestTask.new(:test) do |test|
|
22
|
+
test.libs << 'lib' << 'test'
|
23
|
+
test.pattern = 'test/**/*_test.rb'
|
24
|
+
test.verbose = true
|
25
|
+
end
|
26
|
+
|
27
|
+
begin
|
28
|
+
require 'rcov/rcovtask'
|
29
|
+
Rcov::RcovTask.new do |test|
|
30
|
+
test.libs << 'test'
|
31
|
+
test.pattern = 'test/**/*_test.rb'
|
32
|
+
test.verbose = true
|
33
|
+
end
|
34
|
+
rescue LoadError
|
35
|
+
task :rcov do
|
36
|
+
abort "RCov is not available. In order to run rcov, you must: sudo gem install spicycode-rcov"
|
37
|
+
end
|
38
|
+
end
|
39
|
+
|
40
|
+
task :test => :check_dependencies do # Adds a :check_dependencies prerequisite and a build action to :test (check_dependencies is not defined in this file; presumably provided by Jeweler -- confirm).
|
41
|
+
`cd ext && ruby extconf.rb && make && cp icunicode.bundle icunicode.o ../test/` # Builds the extension in ext/ and copies the build products into test/. NOTE(review): backticks discard the output and ignore the exit status, so a failed build is silent.
|
42
|
+
end
|
43
|
+
|
44
|
+
|
45
|
+
task :default => :test
|
46
|
+
|
47
|
+
task :clean do
|
48
|
+
`rm -rf ext/lib ext/bin ext/sbin ext/share ext/include`
|
49
|
+
end
|
50
|
+
|
51
|
+
require 'rake/rdoctask'
|
52
|
+
Rake::RDocTask.new do |rdoc|
|
53
|
+
if File.exist?('VERSION')
|
54
|
+
version = File.read('VERSION')
|
55
|
+
else
|
56
|
+
version = ""
|
57
|
+
end
|
58
|
+
|
59
|
+
rdoc.rdoc_dir = 'rdoc'
|
60
|
+
rdoc.title = "icunicode #{version}"
|
61
|
+
rdoc.rdoc_files.include('README*')
|
62
|
+
rdoc.rdoc_files.include('lib/**/*.rb')
|
63
|
+
end
|
data/VERSION.yml
ADDED
data/ext/extconf.rb
ADDED
data/ext/icunicode.c
ADDED
@@ -0,0 +1,114 @@
|
|
1
|
+
#include "ruby.h"
|
2
|
+
#include "unicode/ucol.h"
|
3
|
+
#include "unicode/utrans.h"
|
4
|
+
#include "unicode/ustring.h"
|
5
|
+
#include "unicode/ustdio.h"
|
6
|
+
|
7
|
+
#define BUF_SIZE 1000
|
8
|
+
|
9
|
+
VALUE cTransliterator;
|
10
|
+
VALUE trans_hash;
|
11
|
+
|
12
|
+
static void to_utf16(VALUE string, UChar *ustr, int32_t *ulen) { /* Convert a Ruby string (treated as UTF-8) into UTF-16 in ustr, which must hold BUF_SIZE units; the unit count is stored in *ulen. */
|
13
|
+
UErrorCode status = U_ZERO_ERROR;
|
14
|
+
|
15
|
+
string = StringValue(string);
|
16
|
+
u_strFromUTF8(ustr, BUF_SIZE, ulen, RSTRING_PTR(string), RSTRING_LEN(string), &status);
|
17
|
+
if (U_FAILURE(status)) *ulen = 0; /* write through the pointer (the old code nulled the pointer itself); on invalid input or overflow report an empty result so callers never read past ustr */
|
18
|
+
}
|
19
|
+
|
20
|
+
static VALUE to_utf8(UChar *ustr, int32_t ulen) { /* Return a new Ruby string holding the UTF-8 encoding of the ulen UTF-16 units in ustr. */
|
21
|
+
char str[BUF_SIZE * 3]; /* a UTF-16 unit needs at most 3 UTF-8 bytes, so a full BUF_SIZE-unit buffer always fits */
|
22
|
+
int32_t len = 0;
|
23
|
+
UErrorCode status = U_ZERO_ERROR;
|
24
|
+
|
25
|
+
u_strToUTF8(str, (int32_t)sizeof(str), &len, ustr, ulen, &status);
|
26
|
+
if (U_FAILURE(status)) len = 0; /* on failure len may be the required size rather than the bytes written; return "" instead of reading past str */
|
27
|
+
return rb_str_new(str, len);
|
28
|
+
}
|
29
|
+
|
30
|
+
/*
|
31
|
+
* call-seq:
|
32
|
+
* string.unicode_sort_key -> string
|
33
|
+
*
|
34
|
+
* Returns a binary sort key built with the ICU "en_US" collator; comparing keys bytewise sorts the strings according to the Unicode collation algorithm. Returns an empty string if no key could be built or the key does not fit in the internal buffer.
|
35
|
+
*
|
36
|
+
*/
|
37
|
+
static VALUE unicode_sort_key(VALUE string) {
|
38
|
+
char str[BUF_SIZE];
|
39
|
+
UChar ustr[BUF_SIZE];
|
40
|
+
int32_t len = 0;
|
41
|
+
int32_t ulen = 0;
|
42
|
+
UErrorCode status = U_ZERO_ERROR;
|
43
|
+
UCollator *col;
|
44
|
+
|
45
|
+
to_utf16(string, ustr, &ulen);
|
46
|
+
|
47
|
+
col = ucol_open("en_US", &status);
|
48
|
+
if (U_SUCCESS(status)) {
|
49
|
+
len = ucol_getSortKey(col, ustr, ulen, (uint8_t*)str, BUF_SIZE); /* returns the size needed for the key, including its trailing NUL; 0 on internal error */
|
50
|
+
ucol_close(col);
|
51
|
+
}
|
52
|
+
|
53
|
+
return rb_str_new(str, (len > 0 && len <= BUF_SIZE) ? len - 1 : 0); /* drop the trailing NUL; never pass a negative length or one larger than what was written into str */
|
54
|
+
}
|
55
|
+
|
56
|
+
static void trans_free(void *trans) { /* Free callback handed to Data_Wrap_Struct in get_trans: closes the wrapped ICU transliterator. */
|
57
|
+
utrans_close(trans);
|
58
|
+
}
|
59
|
+
|
60
|
+
static UTransliterator* get_trans(VALUE transform) { /* Return the ICU transliterator for the given transform string, opening it on first use and caching it in trans_hash; raises ArgumentError if ICU cannot open it. */
|
61
|
+
UChar str[BUF_SIZE];
|
62
|
+
int32_t len = 0;
|
63
|
+
UTransliterator *trans;
|
64
|
+
UErrorCode status = U_ZERO_ERROR;
|
65
|
+
VALUE obj;
|
66
|
+
|
67
|
+
obj = rb_hash_aref(trans_hash, transform); /* cache lookup keyed by the transform string */
|
68
|
+
if (NIL_P(obj)) {
|
69
|
+
to_utf16(transform, str, &len);
|
70
|
+
trans = utrans_openU(str, len, UTRANS_FORWARD, NULL, 0, NULL, &status);
|
71
|
+
if (trans) {
|
72
|
+
obj = Data_Wrap_Struct(rb_cObject, 0, trans_free, trans); /* wrapper object calls trans_free when it is garbage collected */
|
73
|
+
rb_hash_aset(trans_hash, transform, obj);
|
74
|
+
} else {
|
75
|
+
rb_raise(rb_eArgError, "invalid transform: %s", StringValueCStr(transform)); /* StringValueCStr guarantees a NUL-terminated C string for %s */
|
76
|
+
}
|
77
|
+
}
|
78
|
+
|
79
|
+
Data_Get_Struct(obj, UTransliterator, trans);
|
80
|
+
return trans;
|
81
|
+
}
|
82
|
+
|
83
|
+
/*
|
84
|
+
* call-seq:
|
85
|
+
* string.transliterate(transform_string) -> string
|
86
|
+
*
|
87
|
+
* Transliterates string using transform. The argument is optional: when it is omitted or nil, a default transform (Latin, lower-cased, with everything except letters, spaces, digits and punctuation removed) is used. Raises ArgumentError for an invalid transform.
|
88
|
+
*
|
89
|
+
*/
|
90
|
+
static VALUE unicode_transliterate(int argc, VALUE *argv, VALUE string) {
|
91
|
+
UChar str[BUF_SIZE];
|
92
|
+
int32_t slen = 0;
|
93
|
+
UErrorCode status = U_ZERO_ERROR;
|
94
|
+
UTransliterator *trans;
|
95
|
+
VALUE transform;
|
96
|
+
|
97
|
+
rb_scan_args(argc, argv, "01", &transform);
|
98
|
+
if (NIL_P(transform)) transform = rb_str_new2("Latin; Lower; NFD; [^[:letter:] [:space:] [0-9] [:punctuation:]] Remove; NFC");
|
99
|
+
|
100
|
+
to_utf16(string, str, &slen);
|
101
|
+
|
102
|
+
trans = get_trans(transform);
|
103
|
+
utrans_transUChars(trans, str, &slen, BUF_SIZE, 0, &slen, &status); /* transforms str in place; slen is passed as both the text length and the limit and is updated by the call */
|
104
|
+
|
105
|
+
return to_utf8(str, slen); /* the result must be returned; without `return` the method's value was undefined */
|
106
|
+
}
|
107
|
+
|
108
|
+
void Init_icunicode() { /* Extension entry point: defines String#unicode_sort_key and String#transliterate and creates the transliterator cache. */
|
109
|
+
rb_define_method(rb_cString, "unicode_sort_key", unicode_sort_key, 0);
|
110
|
+
rb_define_method(rb_cString, "transliterate", unicode_transliterate, -1); /* -1: arguments arrive as argc/argv */
|
111
|
+
|
112
|
+
trans_hash = rb_hash_new(); /* cache used by get_trans, keyed by transform string */
|
113
|
+
rb_global_variable(&trans_hash); /* register the global with the GC so the cache is kept alive */
|
114
|
+
}
|
data/icunicode.gemspec
ADDED
@@ -0,0 +1,53 @@
|
|
1
|
+
# Generated by jeweler
|
2
|
+
# DO NOT EDIT THIS FILE DIRECTLY
|
3
|
+
# Instead, edit Jeweler::Tasks in Rakefile, and run the gemspec command
|
4
|
+
# -*- encoding: utf-8 -*-
|
5
|
+
|
6
|
+
Gem::Specification.new do |s|
|
7
|
+
s.name = %q{icunicode}
|
8
|
+
s.version = "0.1.4"
|
9
|
+
|
10
|
+
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
11
|
+
s.authors = ["Justin Balthrop"]
|
12
|
+
s.date = %q{2010-08-26}
|
13
|
+
s.description = %q{ICU Unicode Transliteration and Collation in Ruby.}
|
14
|
+
s.email = %q{code@justinbalthrop.com}
|
15
|
+
s.extensions = ["ext/extconf.rb"]
|
16
|
+
s.extra_rdoc_files = [
|
17
|
+
"LICENSE",
|
18
|
+
"README.rdoc"
|
19
|
+
]
|
20
|
+
s.files = [
|
21
|
+
".gitignore",
|
22
|
+
"LICENSE",
|
23
|
+
"README.rdoc",
|
24
|
+
"Rakefile",
|
25
|
+
"VERSION.yml",
|
26
|
+
"ext/.gitignore",
|
27
|
+
"ext/extconf.rb",
|
28
|
+
"ext/icunicode.c",
|
29
|
+
"icunicode.gemspec",
|
30
|
+
"test/icunicode_test.rb",
|
31
|
+
"test/test_helper.rb"
|
32
|
+
]
|
33
|
+
s.homepage = %q{http://github.com/ninjudd/icunicode}
|
34
|
+
s.rdoc_options = ["--charset=UTF-8"]
|
35
|
+
s.require_paths = ["lib"]
|
36
|
+
s.rubygems_version = %q{1.3.7}
|
37
|
+
s.summary = %q{Unicode Transliteration and Collation in Ruby.}
|
38
|
+
s.test_files = [
|
39
|
+
"test/icunicode_test.rb",
|
40
|
+
"test/test_helper.rb"
|
41
|
+
]
|
42
|
+
|
43
|
+
if s.respond_to? :specification_version then
|
44
|
+
current_version = Gem::Specification::CURRENT_SPECIFICATION_VERSION
|
45
|
+
s.specification_version = 3
|
46
|
+
|
47
|
+
if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
|
48
|
+
else
|
49
|
+
end
|
50
|
+
else
|
51
|
+
end
|
52
|
+
end
|
53
|
+
|
@@ -0,0 +1,16 @@
|
|
1
|
+
require File.dirname(__FILE__) + '/test_helper'
|
2
|
+
|
3
|
+
class UnicodeCollationTest < Test::Unit::TestCase
|
4
|
+
should "sort using unicode collation" do
|
5
|
+
assert_equal ["cafe", "café", "cafes"], ["cafe", "cafes", "café"].sort_by {|s| s.unicode_sort_key}
|
6
|
+
assert_equal ["role", "Role", "rôle"], ["rôle", "role", "Role"].sort_by {|s| s.unicode_sort_key}
|
7
|
+
assert_equal ["cote", "coté", "côte", "côté"], ["côté", "coté", "cote", "côte"].sort_by {|s| s.unicode_sort_key}
|
8
|
+
end
|
9
|
+
|
10
|
+
should "transliterate" do
|
11
|
+
assert_equal "ムクドナルデ'ス", "mcdonald's".transliterate('Katakana')
|
12
|
+
assert_equal "さむらい", "samurai".transliterate('Hiragana')
|
13
|
+
assert_equal "θε γρεατ γρεεκ", "the great greek".transliterate('Greek')
|
14
|
+
assert_equal "фром руссиа уитх лове", "from russia with love".transliterate('Cyrillic')
|
15
|
+
end
|
16
|
+
end
|
data/test/test_helper.rb
ADDED
metadata
ADDED
@@ -0,0 +1,79 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: icunicode
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
hash: 19
|
5
|
+
prerelease: false
|
6
|
+
segments:
|
7
|
+
- 0
|
8
|
+
- 1
|
9
|
+
- 4
|
10
|
+
version: 0.1.4
|
11
|
+
platform: ruby
|
12
|
+
authors:
|
13
|
+
- Justin Balthrop
|
14
|
+
autorequire:
|
15
|
+
bindir: bin
|
16
|
+
cert_chain: []
|
17
|
+
|
18
|
+
date: 2010-08-26 00:00:00 -07:00
|
19
|
+
default_executable:
|
20
|
+
dependencies: []
|
21
|
+
|
22
|
+
description: ICU Unicode Transliteration and Collation in Ruby.
|
23
|
+
email: code@justinbalthrop.com
|
24
|
+
executables: []
|
25
|
+
|
26
|
+
extensions:
|
27
|
+
- ext/extconf.rb
|
28
|
+
extra_rdoc_files:
|
29
|
+
- LICENSE
|
30
|
+
- README.rdoc
|
31
|
+
files:
|
32
|
+
- .gitignore
|
33
|
+
- LICENSE
|
34
|
+
- README.rdoc
|
35
|
+
- Rakefile
|
36
|
+
- VERSION.yml
|
37
|
+
- ext/.gitignore
|
38
|
+
- ext/extconf.rb
|
39
|
+
- ext/icunicode.c
|
40
|
+
- icunicode.gemspec
|
41
|
+
- test/icunicode_test.rb
|
42
|
+
- test/test_helper.rb
|
43
|
+
has_rdoc: true
|
44
|
+
homepage: http://github.com/ninjudd/icunicode
|
45
|
+
licenses: []
|
46
|
+
|
47
|
+
post_install_message:
|
48
|
+
rdoc_options:
|
49
|
+
- --charset=UTF-8
|
50
|
+
require_paths:
|
51
|
+
- lib
|
52
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
53
|
+
none: false
|
54
|
+
requirements:
|
55
|
+
- - ">="
|
56
|
+
- !ruby/object:Gem::Version
|
57
|
+
hash: 3
|
58
|
+
segments:
|
59
|
+
- 0
|
60
|
+
version: "0"
|
61
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
62
|
+
none: false
|
63
|
+
requirements:
|
64
|
+
- - ">="
|
65
|
+
- !ruby/object:Gem::Version
|
66
|
+
hash: 3
|
67
|
+
segments:
|
68
|
+
- 0
|
69
|
+
version: "0"
|
70
|
+
requirements: []
|
71
|
+
|
72
|
+
rubyforge_project:
|
73
|
+
rubygems_version: 1.3.7
|
74
|
+
signing_key:
|
75
|
+
specification_version: 3
|
76
|
+
summary: Unicode Transliteration and Collation in Ruby.
|
77
|
+
test_files:
|
78
|
+
- test/icunicode_test.rb
|
79
|
+
- test/test_helper.rb
|