icunicode 0.1.4

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,6 @@
1
+ *.sw?
2
+ .DS_Store
3
+ *.bundle
4
+ *.o
5
+ pkg
6
+ coverage
data/LICENSE ADDED
@@ -0,0 +1,20 @@
1
+ Copyright (c) 2008 Justin Balthrop
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining
4
+ a copy of this software and associated documentation files (the
5
+ "Software"), to deal in the Software without restriction, including
6
+ without limitation the rights to use, copy, modify, merge, publish,
7
+ distribute, sublicense, and/or sell copies of the Software, and to
8
+ permit persons to whom the Software is furnished to do so, subject to
9
+ the following conditions:
10
+
11
+ The above copyright notice and this permission notice shall be
12
+ included in all copies or substantial portions of the Software.
13
+
14
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
17
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
18
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
19
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
20
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
@@ -0,0 +1,43 @@
1
+ = ICUnicode
2
+
3
+ Unicode sorting is complicated (http://unicode.org/reports/tr10), and Ruby doesn't do it
4
+ correctly. But there is a widely-used implementation of the Unicode collation algorithm in
5
+ the ICU (International Components for Unicode) libraries. There is also no way to do
6
+ Transliteration in Ruby (http://userguide.icu-project.org/transforms/general). This gem is
7
+ a simple C wrapper around ucol_getSortKey from the ICU Collation API and utrans_transUChars
8
+ from the ICU Transliteration API. These are added as simple methods on String.
9
+
10
+ == Usage:
11
+
12
+ ["cafe", "cafes", "caf\303\251"].sort
13
+ => ["cafe", "cafes", "caf\303\251"]
14
+
15
+ require 'icunicode'
16
+
17
+ ["cafe", "cafes", "caf\303\251"].sort_by {|s| s.unicode_sort_key}
18
+ => ["cafe", "caf\303\251", "cafes"]
19
+
20
+ "blueberry".transliterate("Katakana").transliterate("Latin")
21
+ => "burueberrui"
22
+
23
+ "blueberry".transliterate("Greek").transliterate("Latin")
24
+ => "blyeberry"
25
+
26
+ == Install:
27
+
28
+ You must install ICU first. You can download the source from http://site.icu-project.org/download,
29
+ or on Mac, you can install with MacPorts:
30
+
31
+ sudo port install icu
32
+
33
+ Then install the gem:
34
+
35
+ sudo gem install icunicode
36
+
37
+ == To do:
38
+
39
+ Add support for locales other than en-US. Increase buffer size or make it grow dynamically.
40
+
41
+ == License:
42
+
43
+ Copyright (c) 2009 Justin Balthrop, Geni.com; Published under The MIT License, see LICENSE
@@ -0,0 +1,63 @@
1
+ require 'rubygems'
2
+ require 'rake'
3
+
4
# Gem packaging and release tasks. They are only defined when Jeweler can be
# loaded; otherwise a hint is printed and the rest of the Rakefile still works.
begin
  require 'jeweler'

  Jeweler::Tasks.new do |spec|
    # spec is a Gem::Specification... see http://www.rubygems.org/read/chapter/20 for additional settings
    spec.name        = "icunicode"
    spec.authors     = ["Justin Balthrop"]
    spec.email       = "code@justinbalthrop.com"
    spec.homepage    = "http://github.com/ninjudd/icunicode"
    spec.summary     = "Unicode Transliteration and Collation in Ruby."
    spec.description = "ICU Unicode Transliteration and Collation in Ruby."
  end

  Jeweler::GemcutterTasks.new
rescue LoadError
  puts "Jeweler (or a dependency) not available. Install it with: sudo gem install jeweler"
end
19
+
20
require 'rake/testtask'

# `rake test` runs every file matching test/**/*_test.rb with lib/ and test/
# added to the load path.
Rake::TestTask.new(:test) do |t|
  t.libs.push('lib', 'test')
  t.verbose = true
  t.pattern = 'test/**/*_test.rb'
end
26
+
27
# Coverage task. When rcov cannot be loaded, `rake rcov` is still defined but
# aborts with installation instructions.
begin
  require 'rcov/rcovtask'

  Rcov::RcovTask.new do |coverage|
    coverage.pattern = 'test/**/*_test.rb'
    coverage.verbose = true
    coverage.libs << 'test'
  end
rescue LoadError
  task :rcov do
    abort "RCov is not available. In order to run rcov, you must: sudo gem install spicycode-rcov"
  end
end
39
+
40
# Compiles the C extension and copies the loadable library into test/, which
# test_helper.rb puts on the load path.
#
# This is a prerequisite of :test rather than an extra action on it: Rake runs
# a task's actions in definition order, so an action appended here would only
# run after the tests had already executed. `ruby`/`sh`/`cp` raise on failure,
# so a broken build stops the run instead of being silently ignored. The
# library suffix comes from RbConfig so the copy also works where the
# extension is not called `.bundle`.
task :build_extension do
  Dir.chdir('ext') do
    ruby 'extconf.rb'
    sh 'make'
    cp "icunicode.#{RbConfig::CONFIG['DLEXT']}", '../test/'
  end
end

task :test => [:check_dependencies, :build_extension]

task :default => :test
46
+
47
# Removes the install-style directories that may be left under ext/.
# Uses Rake's FileUtils wrapper instead of shelling out to `rm -rf`, so it
# does not depend on a Unix shell being available.
task :clean do
  rm_rf %w[ext/lib ext/bin ext/sbin ext/share ext/include]
end
50
+
51
# Returns the project version used in the RDoc title.
#
# Reads a plain VERSION file when present; otherwise assembles
# "major.minor.patch" from VERSION.yml (the format this project ships).
# Returns "" when neither file exists.
def icunicode_version
  return File.read('VERSION').strip if File.exist?('VERSION')
  return '' unless File.exist?('VERSION.yml')

  yml = File.read('VERSION.yml')
  %w[major minor patch].map { |part| yml[/^\s*:?#{part}:\s*(\d+)/, 1] }.compact.join('.')
end

# Prefer the task shipped with RDoc; fall back to the legacy one that older
# Rake versions bundled as rake/rdoctask.
begin
  require 'rdoc/task'
  rdoc_task_class = RDoc::Task
rescue LoadError
  require 'rake/rdoctask'
  rdoc_task_class = Rake::RDocTask
end

rdoc_task_class.new do |rdoc|
  rdoc.rdoc_dir = 'rdoc'
  rdoc.title = "icunicode #{icunicode_version}"
  rdoc.rdoc_files.include('README*')
  rdoc.rdoc_files.include('lib/**/*.rb')
  # The String methods are documented with call-seq comments in the C source.
  rdoc.rdoc_files.include('ext/**/*.c')
end
@@ -0,0 +1,5 @@
1
+ ---
2
+ :build:
3
+ :major: 0
4
+ :minor: 1
5
+ :patch: 4
@@ -0,0 +1,5 @@
1
+ Makefile
2
+ *.bundle
3
+ *.o
4
+ *.so
5
+ mkmf.log
@@ -0,0 +1,4 @@
1
# Build configuration for the icunicode C extension: checks for the ICU
# libraries and writes the Makefile that compiles icunicode.c.
#
# Pass --with-icu-dir=PATH (or --with-icu-include=... / --with-icu-lib=...)
# when ICU is installed outside the compiler's default search paths.
require 'mkmf'

dir_config('icu')

# icunicode.c calls the ICU string-conversion, collation and transliteration
# APIs; without these libraries the extension cannot link, so fail here with a
# clear message instead of later inside `make`.
%w[icuuc icui18n].each do |lib|
  abort "ICU library '#{lib}' not found. Install ICU first (see README.rdoc)." unless have_library(lib)
end

# icunicode.c includes unicode/ustdio.h, so keep linking icuio when it is available.
have_library('icuio')

create_makefile('icunicode')
@@ -0,0 +1,114 @@
1
+ #include "ruby.h"
2
+ #include "unicode/ucol.h"
3
+ #include "unicode/utrans.h"
4
+ #include "unicode/ustring.h"
5
+ #include "unicode/ustdio.h"
6
+
7
+ #define BUF_SIZE 1000
8
+
9
+ VALUE cTransliterator;
10
+ VALUE trans_hash;
11
+
12
+ static void to_utf16(VALUE string, UChar *ustr, int32_t *ulen) {
13
+ UErrorCode status = U_ZERO_ERROR;
14
+
15
+ string = StringValue(string);
16
+ u_strFromUTF8(ustr, BUF_SIZE, ulen, RSTRING_PTR(string), RSTRING_LEN(string), &status);
17
+ if (status == U_INVALID_CHAR_FOUND) ulen = 0;
18
+ }
19
+
20
+ static VALUE to_utf8(UChar *ustr, int32_t ulen) {
21
+ char str[BUF_SIZE];
22
+ int32_t len = 0;
23
+ UErrorCode status = U_ZERO_ERROR;
24
+
25
+ u_strToUTF8(str, BUF_SIZE, &len, ustr, ulen, &status);
26
+ if (status == U_INVALID_CHAR_FOUND) len = 0;
27
+ return rb_str_new(str, len);
28
+ }
29
+
30
+ /*
31
+ * call-seq:
32
+ * string.unicode_sort_key -> string
33
+ *
34
+ * Returns a string that will sort according to the Unicode collation algorithm.
35
+ *
36
+ */
37
+ static VALUE unicode_sort_key(VALUE string) {
38
+ char str[BUF_SIZE];
39
+ UChar ustr[BUF_SIZE];
40
+ int32_t len = 0;
41
+ int32_t ulen = 0;
42
+ UErrorCode status = U_ZERO_ERROR;
43
+ UCollator *col;
44
+
45
+ to_utf16(string, ustr, &ulen);
46
+
47
+ col = ucol_open("en_US", &status);
48
+ if (U_SUCCESS(status)) {
49
+ len = ucol_getSortKey(col, ustr, ulen, (uint8_t*)str, BUF_SIZE);
50
+ ucol_close(col);
51
+ }
52
+
53
+ return rb_str_new(str, len - 1);
54
+ }
55
+
56
+ static void trans_free(void *trans) {
57
+ utrans_close(trans);
58
+ }
59
+
60
+ static UTransliterator* get_trans(VALUE transform) {
61
+ UChar str[BUF_SIZE];
62
+ int32_t len = 0;
63
+ UTransliterator *trans;
64
+ UErrorCode status = U_ZERO_ERROR;
65
+ VALUE obj;
66
+
67
+ obj = rb_hash_aref(trans_hash, transform);
68
+ if (NIL_P(obj)) {
69
+ to_utf16(transform, str, &len);
70
+ trans = utrans_openU(str, len, UTRANS_FORWARD, NULL, 0, NULL, &status);
71
+ if (trans) {
72
+ obj = Data_Wrap_Struct(rb_cObject, 0, trans_free, trans);
73
+ rb_hash_aset(trans_hash, transform, obj);
74
+ } else {
75
+ rb_raise(rb_eArgError, "invalid transform: %s", RSTRING_PTR(transform));
76
+ }
77
+ }
78
+
79
+ Data_Get_Struct(obj, UTransliterator, trans);
80
+ return trans;
81
+ }
82
+
83
+ /*
84
+ * call-seq:
85
+ * string.transliterate(transform_string) -> string
86
+ *
87
+ * Transliterates string using transform.
88
+ *
89
+ */
90
+ static VALUE unicode_transliterate(int argc, VALUE *argv, VALUE string) {
91
+ UChar str[BUF_SIZE];
92
+ int32_t slen = 0;
93
+ UErrorCode status = U_ZERO_ERROR;
94
+ UTransliterator *trans;
95
+ VALUE transform;
96
+
97
+ rb_scan_args(argc, argv, "01", &transform);
98
+ if (NIL_P(transform)) transform = rb_str_new2("Latin; Lower; NFD; [^[:letter:] [:space:] [0-9] [:punctuation:]] Remove; NFC");
99
+
100
+ to_utf16(string, str, &slen);
101
+
102
+ trans = get_trans(transform);
103
+ utrans_transUChars(trans, str, &slen, BUF_SIZE, 0, &slen, &status);
104
+
105
+ to_utf8(str, slen);
106
+ }
107
+
108
+ void Init_icunicode() {
109
+ rb_define_method(rb_cString, "unicode_sort_key", unicode_sort_key, 0);
110
+ rb_define_method(rb_cString, "transliterate", unicode_transliterate, -1);
111
+
112
+ trans_hash = rb_hash_new();
113
+ rb_global_variable(&trans_hash);
114
+ }
@@ -0,0 +1,53 @@
1
+ # Generated by jeweler
2
+ # DO NOT EDIT THIS FILE DIRECTLY
3
+ # Instead, edit Jeweler::Tasks in Rakefile, and run the gemspec command
4
+ # -*- encoding: utf-8 -*-
5
+
6
# Gem specification for icunicode 0.1.4: identity, packaged files, and the
# C extension to build at install time.
Gem::Specification.new do |spec|
  spec.name        = %q{icunicode}
  spec.version     = "0.1.4"
  spec.date        = %q{2010-08-26}
  spec.authors     = ["Justin Balthrop"]
  spec.email       = %q{code@justinbalthrop.com}
  spec.homepage    = %q{http://github.com/ninjudd/icunicode}
  spec.summary     = %q{Unicode Transliteration and Collation in Ruby.}
  spec.description = %q{ICU Unicode Transliteration and Collation in Ruby.}

  # Guarded because older RubyGems releases may not provide these setters.
  spec.required_rubygems_version = Gem::Requirement.new(">= 0") if spec.respond_to?(:required_rubygems_version=)
  spec.specification_version = 3 if spec.respond_to?(:specification_version)
  spec.rubygems_version = %q{1.3.7}

  spec.extensions    = ["ext/extconf.rb"]
  spec.require_paths = ["lib"]
  spec.rdoc_options  = ["--charset=UTF-8"]

  spec.extra_rdoc_files = [
    "LICENSE",
    "README.rdoc"
  ]
  spec.files = [
    ".gitignore",
    "LICENSE",
    "README.rdoc",
    "Rakefile",
    "VERSION.yml",
    "ext/.gitignore",
    "ext/extconf.rb",
    "ext/icunicode.c",
    "icunicode.gemspec",
    "test/icunicode_test.rb",
    "test/test_helper.rb"
  ]
  spec.test_files = [
    "test/icunicode_test.rb",
    "test/test_helper.rb"
  ]
end
53
+
@@ -0,0 +1,16 @@
1
+ require File.dirname(__FILE__) + '/test_helper'
2
+
3
# Exercises the two String methods added by the extension:
# String#unicode_sort_key and String#transliterate.
class UnicodeCollationTest < Test::Unit::TestCase
  should "sort using unicode collation" do
    # Each pair is [unsorted input, expected order after sorting by collation key].
    [
      [["cafe", "cafes", "café"],        ["cafe", "café", "cafes"]],
      [["rôle", "role", "Role"],         ["role", "Role", "rôle"]],
      [["côté", "coté", "cote", "côte"], ["cote", "coté", "côte", "côté"]]
    ].each do |input, expected|
      assert_equal expected, input.sort_by { |word| word.unicode_sort_key }
    end
  end

  should "transliterate" do
    # Each row is [transform id, source text, expected transliteration].
    [
      ['Katakana', "mcdonald's",            "ムクドナルデ'ス"],
      ['Hiragana', "samurai",               "さむらい"],
      ['Greek',    "the great greek",       "θε γρεατ γρεεκ"],
      ['Cyrillic', "from russia with love", "фром руссиа уитх лове"]
    ].each do |transform, source, expected|
      assert_equal expected, source.transliterate(transform)
    end
  end
end
@@ -0,0 +1,10 @@
1
+ require 'rubygems'
2
+ require 'test/unit'
3
+ require 'shoulda'
4
+ require 'mocha'
5
+
6
+ $LOAD_PATH.unshift(File.dirname(__FILE__))
7
+ require 'icunicode'
8
+
9
+ class Test::Unit::TestCase
10
+ end
metadata ADDED
@@ -0,0 +1,79 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: icunicode
3
+ version: !ruby/object:Gem::Version
4
+ hash: 19
5
+ prerelease: false
6
+ segments:
7
+ - 0
8
+ - 1
9
+ - 4
10
+ version: 0.1.4
11
+ platform: ruby
12
+ authors:
13
+ - Justin Balthrop
14
+ autorequire:
15
+ bindir: bin
16
+ cert_chain: []
17
+
18
+ date: 2010-08-26 00:00:00 -07:00
19
+ default_executable:
20
+ dependencies: []
21
+
22
+ description: ICU Unicode Transliteration and Collation in Ruby.
23
+ email: code@justinbalthrop.com
24
+ executables: []
25
+
26
+ extensions:
27
+ - ext/extconf.rb
28
+ extra_rdoc_files:
29
+ - LICENSE
30
+ - README.rdoc
31
+ files:
32
+ - .gitignore
33
+ - LICENSE
34
+ - README.rdoc
35
+ - Rakefile
36
+ - VERSION.yml
37
+ - ext/.gitignore
38
+ - ext/extconf.rb
39
+ - ext/icunicode.c
40
+ - icunicode.gemspec
41
+ - test/icunicode_test.rb
42
+ - test/test_helper.rb
43
+ has_rdoc: true
44
+ homepage: http://github.com/ninjudd/icunicode
45
+ licenses: []
46
+
47
+ post_install_message:
48
+ rdoc_options:
49
+ - --charset=UTF-8
50
+ require_paths:
51
+ - lib
52
+ required_ruby_version: !ruby/object:Gem::Requirement
53
+ none: false
54
+ requirements:
55
+ - - ">="
56
+ - !ruby/object:Gem::Version
57
+ hash: 3
58
+ segments:
59
+ - 0
60
+ version: "0"
61
+ required_rubygems_version: !ruby/object:Gem::Requirement
62
+ none: false
63
+ requirements:
64
+ - - ">="
65
+ - !ruby/object:Gem::Version
66
+ hash: 3
67
+ segments:
68
+ - 0
69
+ version: "0"
70
+ requirements: []
71
+
72
+ rubyforge_project:
73
+ rubygems_version: 1.3.7
74
+ signing_key:
75
+ specification_version: 3
76
+ summary: Unicode Transliteration and Collation in Ruby.
77
+ test_files:
78
+ - test/icunicode_test.rb
79
+ - test/test_helper.rb