icunicode 0.1.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,6 @@
1
+ *.sw?
2
+ .DS_Store
3
+ *.bundle
4
+ *.o
5
+ pkg
6
+ coverage
data/LICENSE ADDED
@@ -0,0 +1,20 @@
1
+ Copyright (c) 2008 Justin Balthrop
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining
4
+ a copy of this software and associated documentation files (the
5
+ "Software"), to deal in the Software without restriction, including
6
+ without limitation the rights to use, copy, modify, merge, publish,
7
+ distribute, sublicense, and/or sell copies of the Software, and to
8
+ permit persons to whom the Software is furnished to do so, subject to
9
+ the following conditions:
10
+
11
+ The above copyright notice and this permission notice shall be
12
+ included in all copies or substantial portions of the Software.
13
+
14
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
17
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
18
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
19
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
20
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
@@ -0,0 +1,43 @@
1
+ = ICUnicode
2
+
3
+ Unicode sorting is complicated (http://unicode.org/reports/tr10), and Ruby doesn't do it
4
+ correctly. But there is a widely-used implementation of the Unicode collation algorithm in
5
+ the ICU (International Components for Unicode) libraries. There is also no way to do
6
+ transliteration in Ruby (http://userguide.icu-project.org/transforms/general). This gem is
7
+ a simple C wrapper around ucol_getSortKey from the ICU Collation API and utrans_transUChars
8
+ from the ICU Transliteration API. These are added as simple methods on String.
9
+
10
+ == Usage:
11
+
12
+ ["cafe", "cafes", "caf\303\251"].sort
13
+ => ["cafe", "cafes", "caf\303\251"]
14
+
15
+ require 'icunicode'
16
+
17
+ ["cafe", "cafes", "caf\303\251"].sort_by {|s| s.unicode_sort_key}
18
+ => ["cafe", "caf\303\251", "cafes"]
19
+
20
+ "blueberry".transliterate("Katakana").transliterate("Latin")
21
+ => "burueberrui"
22
+
23
+ "blueberry".transliterate("Greek").transliterate("Latin")
24
+ => "blyeberry"
25
+
26
+ == Install:
27
+
28
+ You must install ICU first. You can download the source from http://site.icu-project.org/download,
29
+ or on Mac, you can install with MacPorts:
30
+
31
+ sudo port install icu
32
+
33
+ Then install the gem:
34
+
35
+ sudo gem install icunicode
36
+
37
+ == To do:
38
+
39
+ Add support for locales other than en-US. Increase buffer size or make it grow dynamically.
40
+
41
+ == License:
42
+
43
+ Copyright (c) 2009 Justin Balthrop, Geni.com; Published under The MIT License, see LICENSE
@@ -0,0 +1,63 @@
1
require 'rubygems'
require 'rake'
require 'rbconfig'

# Gem packaging tasks (gemspec generation, build, release) come from Jeweler.
# Jeweler is optional: without it only the packaging tasks are unavailable.
begin
  require 'jeweler'
  Jeweler::Tasks.new do |gem|
    gem.name        = "icunicode"
    gem.summary     = %Q{Unicode Transliteration and Collation in Ruby.}
    gem.description = "ICU Unicode Transliteration and Collation in Ruby."
    gem.email       = "code@justinbalthrop.com"
    gem.homepage    = "http://github.com/ninjudd/icunicode"
    gem.authors     = ["Justin Balthrop"]
    # gem is a Gem::Specification... see http://www.rubygems.org/read/chapter/20 for additional settings
  end
  Jeweler::GemcutterTasks.new
rescue LoadError
  puts "Jeweler (or a dependency) not available. Install it with: sudo gem install jeweler"
end

# Build the C extension and copy the resulting library into test/, which
# test_helper.rb puts on the load path. `ruby` and `sh` abort the task when
# a step fails, so a broken build is reported instead of silently ignored.
desc 'Compile the C extension and copy it into test/'
task :compile do
  Dir.chdir('ext') do
    ruby 'extconf.rb'
    sh 'make'
  end
  # DLEXT is the platform's extension suffix ("bundle" on Mac OS X, "so" on Linux).
  cp "ext/icunicode.#{RbConfig::CONFIG['DLEXT']}", 'test/'
end

require 'rake/testtask'
Rake::TestTask.new(:test) do |test|
  test.libs << 'lib' << 'test'
  test.pattern = 'test/**/*_test.rb'
  test.verbose = true
end

# The extension must exist before the tests run, so it is a prerequisite
# (prerequisites run before a task's own actions).
task :test => :compile
# :check_dependencies is defined by Jeweler; only depend on it when present.
task :test => :check_dependencies if Rake::Task.task_defined?(:check_dependencies)

begin
  require 'rcov/rcovtask'
  Rcov::RcovTask.new do |test|
    test.libs << 'test'
    test.pattern = 'test/**/*_test.rb'
    test.verbose = true
  end
rescue LoadError
  task :rcov do
    abort "RCov is not available. In order to run rcov, you must: sudo gem install spicycode-rcov"
  end
end

task :default => :test

task :clean do
  rm_rf %w[ext/lib ext/bin ext/sbin ext/share ext/include]
end

# Newer Rake versions dropped rake/rdoctask in favour of rdoc/task.
begin
  require 'rdoc/task'
  rdoc_task_class = RDoc::Task
rescue LoadError
  require 'rake/rdoctask'
  rdoc_task_class = Rake::RDocTask
end

rdoc_task_class.new do |rdoc|
  # The version lives in VERSION.yml (Jeweler's :major/:minor/:patch format);
  # a plain VERSION file is still honoured if one exists.
  version =
    if File.exist?('VERSION')
      File.read('VERSION').strip
    elsif File.exist?('VERSION.yml')
      fields = Hash[File.read('VERSION.yml').scan(/^:(major|minor|patch):\s*(\d+)/)]
      %w[major minor patch].map { |part| fields[part] }.compact.join('.')
    else
      ""
    end

  rdoc.rdoc_dir = 'rdoc'
  rdoc.title = "icunicode #{version}"
  rdoc.rdoc_files.include('README*')
  rdoc.rdoc_files.include('lib/**/*.rb')
end
@@ -0,0 +1,5 @@
1
+ ---
2
+ :build:
3
+ :major: 0
4
+ :minor: 1
5
+ :patch: 4
@@ -0,0 +1,5 @@
1
+ Makefile
2
+ *.bundle
3
+ *.o
4
+ *.so
5
+ mkmf.log
@@ -0,0 +1,4 @@
1
require 'mkmf'

# Libraries the extension cannot be built without:
#   icuuc   - ICU common library (string conversion functions from unicode/ustring.h)
#   icui18n - ICU collation and transliteration APIs
# have_library prepends to the link line, so listing icuuc first leaves it
# last on the command line, after the libraries that depend on it.
%w[icuuc icui18n].each do |lib|
  unless have_library(lib)
    abort "ICU library '#{lib}' not found. Install ICU first (see README)."
  end
end

# Linked when available, as before; its absence is not treated as fatal.
have_library('icuio')

create_makefile('icunicode')
@@ -0,0 +1,114 @@
1
+ #include "ruby.h"
2
+ #include "unicode/ucol.h"
3
+ #include "unicode/utrans.h"
4
+ #include "unicode/ustring.h"
5
+ #include "unicode/ustdio.h"
6
+
7
+ #define BUF_SIZE 1000
8
+
9
+ VALUE cTransliterator;
10
+ VALUE trans_hash;
11
+
12
+ static void to_utf16(VALUE string, UChar *ustr, int32_t *ulen) {
13
+ UErrorCode status = U_ZERO_ERROR;
14
+
15
+ string = StringValue(string);
16
+ u_strFromUTF8(ustr, BUF_SIZE, ulen, RSTRING_PTR(string), RSTRING_LEN(string), &status);
17
+ if (status == U_INVALID_CHAR_FOUND) ulen = 0;
18
+ }
19
+
20
+ static VALUE to_utf8(UChar *ustr, int32_t ulen) {
21
+ char str[BUF_SIZE];
22
+ int32_t len = 0;
23
+ UErrorCode status = U_ZERO_ERROR;
24
+
25
+ u_strToUTF8(str, BUF_SIZE, &len, ustr, ulen, &status);
26
+ if (status == U_INVALID_CHAR_FOUND) len = 0;
27
+ return rb_str_new(str, len);
28
+ }
29
+
30
+ /*
31
+ * call-seq:
32
+ * string.unicode_sort_key -> string
33
+ *
34
+ * Returns a string that will sort according to the Unicode collation algorithm.
35
+ *
36
+ */
37
+ static VALUE unicode_sort_key(VALUE string) {
38
+ char str[BUF_SIZE];
39
+ UChar ustr[BUF_SIZE];
40
+ int32_t len = 0;
41
+ int32_t ulen = 0;
42
+ UErrorCode status = U_ZERO_ERROR;
43
+ UCollator *col;
44
+
45
+ to_utf16(string, ustr, &ulen);
46
+
47
+ col = ucol_open("en_US", &status);
48
+ if (U_SUCCESS(status)) {
49
+ len = ucol_getSortKey(col, ustr, ulen, (uint8_t*)str, BUF_SIZE);
50
+ ucol_close(col);
51
+ }
52
+
53
+ return rb_str_new(str, len - 1);
54
+ }
55
+
56
+ static void trans_free(void *trans) {
57
+ utrans_close(trans);
58
+ }
59
+
60
+ static UTransliterator* get_trans(VALUE transform) {
61
+ UChar str[BUF_SIZE];
62
+ int32_t len = 0;
63
+ UTransliterator *trans;
64
+ UErrorCode status = U_ZERO_ERROR;
65
+ VALUE obj;
66
+
67
+ obj = rb_hash_aref(trans_hash, transform);
68
+ if (NIL_P(obj)) {
69
+ to_utf16(transform, str, &len);
70
+ trans = utrans_openU(str, len, UTRANS_FORWARD, NULL, 0, NULL, &status);
71
+ if (trans) {
72
+ obj = Data_Wrap_Struct(rb_cObject, 0, trans_free, trans);
73
+ rb_hash_aset(trans_hash, transform, obj);
74
+ } else {
75
+ rb_raise(rb_eArgError, "invalid transform: %s", RSTRING_PTR(transform));
76
+ }
77
+ }
78
+
79
+ Data_Get_Struct(obj, UTransliterator, trans);
80
+ return trans;
81
+ }
82
+
83
+ /*
84
+ * call-seq:
85
+ * string.transliterate(transform_string) -> string
86
+ *
87
+ * Transliterates string using transform.
88
+ *
89
+ */
90
+ static VALUE unicode_transliterate(int argc, VALUE *argv, VALUE string) {
91
+ UChar str[BUF_SIZE];
92
+ int32_t slen = 0;
93
+ UErrorCode status = U_ZERO_ERROR;
94
+ UTransliterator *trans;
95
+ VALUE transform;
96
+
97
+ rb_scan_args(argc, argv, "01", &transform);
98
+ if (NIL_P(transform)) transform = rb_str_new2("Latin; Lower; NFD; [^[:letter:] [:space:] [0-9] [:punctuation:]] Remove; NFC");
99
+
100
+ to_utf16(string, str, &slen);
101
+
102
+ trans = get_trans(transform);
103
+ utrans_transUChars(trans, str, &slen, BUF_SIZE, 0, &slen, &status);
104
+
105
+ to_utf8(str, slen);
106
+ }
107
+
108
+ void Init_icunicode() {
109
+ rb_define_method(rb_cString, "unicode_sort_key", unicode_sort_key, 0);
110
+ rb_define_method(rb_cString, "transliterate", unicode_transliterate, -1);
111
+
112
+ trans_hash = rb_hash_new();
113
+ rb_global_variable(&trans_hash);
114
+ }
@@ -0,0 +1,53 @@
1
+ # Generated by jeweler
2
+ # DO NOT EDIT THIS FILE DIRECTLY
3
+ # Instead, edit Jeweler::Tasks in Rakefile, and run the gemspec command
4
+ # -*- encoding: utf-8 -*-
5
+
6
+ Gem::Specification.new do |s|
7
+ s.name = %q{icunicode}
8
+ s.version = "0.1.4"
9
+
10
+ s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
11
+ s.authors = ["Justin Balthrop"]
12
+ s.date = %q{2010-08-26}
13
+ s.description = %q{ICU Unicode Transliteration and Collation in Ruby.}
14
+ s.email = %q{code@justinbalthrop.com}
15
+ s.extensions = ["ext/extconf.rb"]
16
+ s.extra_rdoc_files = [
17
+ "LICENSE",
18
+ "README.rdoc"
19
+ ]
20
+ s.files = [
21
+ ".gitignore",
22
+ "LICENSE",
23
+ "README.rdoc",
24
+ "Rakefile",
25
+ "VERSION.yml",
26
+ "ext/.gitignore",
27
+ "ext/extconf.rb",
28
+ "ext/icunicode.c",
29
+ "icunicode.gemspec",
30
+ "test/icunicode_test.rb",
31
+ "test/test_helper.rb"
32
+ ]
33
+ s.homepage = %q{http://github.com/ninjudd/icunicode}
34
+ s.rdoc_options = ["--charset=UTF-8"]
35
+ s.require_paths = ["lib"]
36
+ s.rubygems_version = %q{1.3.7}
37
+ s.summary = %q{Unicode Transliteration and Collation in Ruby.}
38
+ s.test_files = [
39
+ "test/icunicode_test.rb",
40
+ "test/test_helper.rb"
41
+ ]
42
+
43
+ if s.respond_to? :specification_version then
44
+ current_version = Gem::Specification::CURRENT_SPECIFICATION_VERSION
45
+ s.specification_version = 3
46
+
47
+ if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
48
+ else
49
+ end
50
+ else
51
+ end
52
+ end
53
+
@@ -0,0 +1,16 @@
1
+ require File.dirname(__FILE__) + '/test_helper'
2
+
3
# Exercises the two String methods added by the icunicode extension.
class UnicodeCollationTest < Test::Unit::TestCase
  should "sort using unicode collation" do
    # Each row: [expected order, unsorted input]
    [
      [["cafe", "café", "cafes"],        ["cafe", "cafes", "café"]],
      [["role", "Role", "rôle"],         ["rôle", "role", "Role"]],
      [["cote", "coté", "côte", "côté"], ["côté", "coté", "cote", "côte"]]
    ].each do |expected, unsorted|
      assert_equal expected, unsorted.sort_by { |word| word.unicode_sort_key }
    end
  end

  should "transliterate" do
    # Each row: [expected output, input, transform]
    [
      ["ムクドナルデ'ス",        "mcdonald's",            'Katakana'],
      ["さむらい",               "samurai",               'Hiragana'],
      ["θε γρεατ γρεεκ",        "the great greek",       'Greek'],
      ["фром руссиа уитх лове", "from russia with love", 'Cyrillic']
    ].each do |expected, input, transform|
      assert_equal expected, input.transliterate(transform)
    end
  end
end
@@ -0,0 +1,10 @@
1
+ require 'rubygems'
2
+ require 'test/unit'
3
+ require 'shoulda'
4
+ require 'mocha'
5
+
6
+ $LOAD_PATH.unshift(File.dirname(__FILE__))
7
+ require 'icunicode'
8
+
9
+ class Test::Unit::TestCase
10
+ end
metadata ADDED
@@ -0,0 +1,79 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: icunicode
3
+ version: !ruby/object:Gem::Version
4
+ hash: 19
5
+ prerelease: false
6
+ segments:
7
+ - 0
8
+ - 1
9
+ - 4
10
+ version: 0.1.4
11
+ platform: ruby
12
+ authors:
13
+ - Justin Balthrop
14
+ autorequire:
15
+ bindir: bin
16
+ cert_chain: []
17
+
18
+ date: 2010-08-26 00:00:00 -07:00
19
+ default_executable:
20
+ dependencies: []
21
+
22
+ description: ICU Unicode Transliteration and Collation in Ruby.
23
+ email: code@justinbalthrop.com
24
+ executables: []
25
+
26
+ extensions:
27
+ - ext/extconf.rb
28
+ extra_rdoc_files:
29
+ - LICENSE
30
+ - README.rdoc
31
+ files:
32
+ - .gitignore
33
+ - LICENSE
34
+ - README.rdoc
35
+ - Rakefile
36
+ - VERSION.yml
37
+ - ext/.gitignore
38
+ - ext/extconf.rb
39
+ - ext/icunicode.c
40
+ - icunicode.gemspec
41
+ - test/icunicode_test.rb
42
+ - test/test_helper.rb
43
+ has_rdoc: true
44
+ homepage: http://github.com/ninjudd/icunicode
45
+ licenses: []
46
+
47
+ post_install_message:
48
+ rdoc_options:
49
+ - --charset=UTF-8
50
+ require_paths:
51
+ - lib
52
+ required_ruby_version: !ruby/object:Gem::Requirement
53
+ none: false
54
+ requirements:
55
+ - - ">="
56
+ - !ruby/object:Gem::Version
57
+ hash: 3
58
+ segments:
59
+ - 0
60
+ version: "0"
61
+ required_rubygems_version: !ruby/object:Gem::Requirement
62
+ none: false
63
+ requirements:
64
+ - - ">="
65
+ - !ruby/object:Gem::Version
66
+ hash: 3
67
+ segments:
68
+ - 0
69
+ version: "0"
70
+ requirements: []
71
+
72
+ rubyforge_project:
73
+ rubygems_version: 1.3.7
74
+ signing_key:
75
+ specification_version: 3
76
+ summary: Unicode Transliteration and Collation in Ruby.
77
+ test_files:
78
+ - test/icunicode_test.rb
79
+ - test/test_helper.rb