ffi-icu 0.0.2 → 0.0.3
Sign up to get free protection for your applications and to get access to all the features.
- data/.gitignore +1 -0
- data/Gemfile +3 -0
- data/LICENSE +1 -1
- data/README.rdoc +1 -1
- data/Rakefile +7 -26
- data/ffi-icu.gemspec +33 -0
- data/lib/ffi-icu.rb +2 -0
- data/lib/ffi-icu/break_iterator.rb +67 -0
- data/lib/ffi-icu/core_ext/string.rb +5 -0
- data/lib/ffi-icu/lib.rb +33 -2
- data/lib/ffi-icu/normalization.rb +10 -9
- data/lib/ffi-icu/transliteration.rb +18 -7
- data/lib/ffi-icu/uchar.rb +6 -2
- data/lib/ffi-icu/version.rb +3 -0
- data/spec/break_iterator_spec.rb +28 -0
- data/spec/collation_spec.rb +1 -0
- data/spec/spec_helper.rb +2 -3
- data/spec/transliteration_spec.rb +5 -0
- metadata +19 -35
- data/VERSION +0 -1
data/.gitignore
CHANGED
data/Gemfile
ADDED
data/LICENSE
CHANGED
data/README.rdoc
CHANGED
data/Rakefile
CHANGED
@@ -1,39 +1,20 @@
|
|
1
1
|
require 'rubygems'
|
2
2
|
require 'rake'
|
3
3
|
|
4
|
-
|
5
|
-
|
6
|
-
Jeweler::Tasks.new do |gem|
|
7
|
-
gem.name = "ffi-icu"
|
8
|
-
gem.summary = %Q{Simple FFI wrappers for things I need from ICU.}
|
9
|
-
gem.description = %Q{Provides charset detection, locale sensitive collation and more.}
|
10
|
-
gem.email = "jari.bakken@gmail.com"
|
11
|
-
gem.homepage = "http://github.com/jarib/ffi-icu"
|
12
|
-
gem.authors = ["Jari Bakken"]
|
13
|
-
|
14
|
-
gem.add_dependency "ffi", ">= 0.6.3"
|
15
|
-
gem.add_development_dependency "rspec", ">= 1.3.0"
|
16
|
-
# gem is a Gem::Specification... see http://www.rubygems.org/read/chapter/20 for additional settings
|
17
|
-
end
|
4
|
+
require 'bundler'
|
5
|
+
Bundler::GemHelper.install_tasks
|
18
6
|
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
end
|
23
|
-
|
24
|
-
require 'spec/rake/spectask'
|
25
|
-
Spec::Rake::SpecTask.new(:spec) do |spec|
|
26
|
-
spec.libs << 'lib' << 'spec'
|
27
|
-
spec.spec_files = FileList['spec/**/*_spec.rb']
|
7
|
+
require 'rspec/core/rake_task'
|
8
|
+
RSpec::Core::RakeTask.new(:spec) do |spec|
|
9
|
+
spec.pattern = 'spec/**/*_spec.rb'
|
28
10
|
end
|
29
11
|
|
30
|
-
|
31
|
-
spec.libs << 'lib' << 'spec'
|
12
|
+
RSpec::Core::RakeTask.new(:rcov) do |spec|
|
32
13
|
spec.pattern = 'spec/**/*_spec.rb'
|
33
14
|
spec.rcov = true
|
34
15
|
end
|
35
16
|
|
36
|
-
task :spec
|
17
|
+
task :spec
|
37
18
|
|
38
19
|
task :default => :spec
|
39
20
|
|
data/ffi-icu.gemspec
ADDED
@@ -0,0 +1,33 @@
|
|
1
|
+
# Originally generated by jeweler; now maintained by hand.
|
2
|
+
# The Rakefile no longer defines Jeweler::Tasks (it uses Bundler::GemHelper),
|
3
|
+
# so edit this file directly.
|
4
|
+
# -*- encoding: utf-8 -*-
|
5
|
+
|
6
|
+
require File.expand_path("../lib/ffi-icu/version", __FILE__)
|
7
|
+
|
8
|
+
Gem::Specification.new do |s|
|
9
|
+
s.name = %q{ffi-icu}
|
10
|
+
s.version = ICU::VERSION
|
11
|
+
|
12
|
+
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
13
|
+
s.authors = ["Jari Bakken"]
|
14
|
+
s.date = %q{2010-08-23}
|
15
|
+
s.description = %q{Provides charset detection, locale sensitive collation and more. Depends on libicu.}
|
16
|
+
s.email = %q{jari.bakken@gmail.com}
|
17
|
+
s.extra_rdoc_files = [
|
18
|
+
"LICENSE",
|
19
|
+
"README.rdoc"
|
20
|
+
]
|
21
|
+
s.files = `git ls-files`.split("\n")
|
22
|
+
s.test_files = `git ls-files -- {test,spec,features}/*`.split("\n")
|
23
|
+
s.executables = `git ls-files -- bin/*`.split("\n").map{ |f| File.basename(f) }
|
24
|
+
s.require_paths = ["lib"]
|
25
|
+
|
26
|
+
s.homepage = %q{http://github.com/jarib/ffi-icu}
|
27
|
+
s.rdoc_options = ["--charset=UTF-8"]
|
28
|
+
s.summary = %q{Simple FFI wrappers for things I need from ICU.}
|
29
|
+
|
30
|
+
s.add_runtime_dependency(%q<ffi>, ["~> 1.0.9"])
|
31
|
+
s.add_development_dependency(%q<rspec>, ["~> 2.5.0"])
|
32
|
+
end
|
33
|
+
|
data/lib/ffi-icu.rb
CHANGED
@@ -20,12 +20,14 @@ module ICU
|
|
20
20
|
end
|
21
21
|
end
|
22
22
|
|
23
|
+
require "ffi-icu/core_ext/string"
|
23
24
|
require "ffi-icu/lib"
|
24
25
|
require "ffi-icu/uchar"
|
25
26
|
require "ffi-icu/chardet"
|
26
27
|
require "ffi-icu/collation"
|
27
28
|
require "ffi-icu/transliteration"
|
28
29
|
require "ffi-icu/normalization"
|
30
|
+
require "ffi-icu/break_iterator"
|
29
31
|
|
30
32
|
unless ICU.ruby19?
|
31
33
|
require 'jcode'
|
@@ -0,0 +1,67 @@
|
|
1
|
+
module ICU
  # Finds text boundaries (the iterator kinds are declared by the
  # :iterator_type enum in ICU::Lib) by delegating to ICU's ubrk_* C
  # functions through ICU::Lib.
  #
  # Enumerable is mixed in: #each yields every boundary offset, starting at
  # the first boundary and stopping when ICU reports UBRK_DONE.
  class BreakIterator
    include Enumerable

    # Sentinel returned by the ubrk_* navigation calls once there is no
    # further boundary in the requested direction.
    UBRK_DONE = -1

    # Returns an Array with one locale name per index reported by
    # ubrk_countAvailable.
    def self.available_locales
      (0...Lib.ubrk_countAvailable).map do |idx|
        Lib.ubrk_getAvailable idx
      end
    end

    # Opens a break iterator of the given type for the given locale.
    #
    # type   - iterator kind, passed straight to ubrk_open
    # locale - locale name String, e.g. "en_US"
    #
    # Errors reported through the status argument are raised by
    # Lib.check_error.
    def initialize(type, locale)
      ptr = Lib.check_error { |err| Lib.ubrk_open(type, locale, nil, 0, err) }

      # AutoPointer invokes ubrk_close when the Ruby object is collected.
      @iterator = FFI::AutoPointer.new(ptr, Lib.method(:ubrk_close))
    end

    # Sets the text to analyse.
    #
    # ICU does not copy the buffer handed to ubrk_setText; it keeps pointing
    # at it. The converted buffer is therefore stored in an instance variable
    # so it stays reachable for as long as this iterator uses it.
    #
    # NOTE(review): str.length is passed as the UChar count; this assumes
    # one UChar per element of the converted string - confirm for
    # non-BMP input and for Ruby 1.8 byte-length semantics.
    def text=(str)
      @text_ptr = UCharPointer.from_string(str)

      Lib.check_error { |err|
        Lib.ubrk_setText @iterator, @text_ptr, str.length, err
      }
    end

    # Yields each boundary offset in order, beginning with #first.
    def each
      int = first

      while int != UBRK_DONE
        yield int
        int = self.next
      end
    end

    # Advances to the boundary after the current one and returns its offset
    # (or UBRK_DONE).
    def next
      Lib.ubrk_next @iterator
    end

    # Moves to the boundary before the current one and returns its offset
    # (or UBRK_DONE).
    def previous
      Lib.ubrk_previous @iterator
    end

    # Moves to the first boundary and returns its offset.
    def first
      Lib.ubrk_first @iterator
    end

    # Moves to the last boundary and returns its offset.
    def last
      Lib.ubrk_last @iterator
    end

    # Returns the last boundary before +offset+. The ICU function takes the
    # offset as its second argument, so it is required here.
    # Requires Lib to attach ubrk_preceding.
    def preceding(offset)
      Lib.ubrk_preceding @iterator, Integer(offset)
    end

    # Returns the first boundary after +offset+.
    # Requires Lib to attach ubrk_following.
    def following(offset)
      Lib.ubrk_following @iterator, Integer(offset)
    end

    # Returns the offset of the most recently visited boundary.
    def current
      Lib.ubrk_current @iterator
    end

    # True when +index+ is a boundary position. ICU answers with a UBool,
    # hence the comparison against zero.
    # Requires Lib to attach ubrk_isBoundary.
    def boundary?(index)
      Lib.ubrk_isBoundary(@iterator, Integer(index)) != 0
    end

  end # BreakIterator
end # ICU
|
data/lib/ffi-icu/lib.rb
CHANGED
@@ -20,7 +20,7 @@ module ICU
|
|
20
20
|
# let the user tell us where the lib is
|
21
21
|
if ENV['FFI_ICU_LIB']
|
22
22
|
libs = ENV['FFI_ICU_LIB'].split(",")
|
23
|
-
ffi_lib
|
23
|
+
ffi_lib(*libs)
|
24
24
|
|
25
25
|
if ENV['FFI_ICU_VERSION_SUFFIX']
|
26
26
|
return ENV['FFI_ICU_VERSION_SUFFIX']
|
@@ -145,7 +145,9 @@ module ICU
|
|
145
145
|
:pre_context, :pointer,
|
146
146
|
:post_context, :pointer
|
147
147
|
|
148
|
-
|
148
|
+
# Inspect-style summary of the parse error: class name, a hex identifier
# derived from #hash, and the :line / :offset struct fields.
# The closing ">" balances the leading "#<".
def to_s
  "#<%s:%x line: %d offset: %d>" % [self.class, hash*2, self[:line], self[:offset]]
end
|
149
151
|
end
|
150
152
|
|
151
153
|
class UTransPosition < FFI::Struct
|
@@ -179,5 +181,34 @@ module ICU
|
|
179
181
|
]
|
180
182
|
|
181
183
|
attach_function :unorm_normalize, "unorm_normalize#{suffix}", [:pointer, :int32_t, :normalization_mode, :int32_t, :pointer, :int32_t, :pointer], :int32_t
|
184
|
+
|
185
|
+
#
|
186
|
+
# Text Boundary Analysis
|
187
|
+
#
|
188
|
+
|
189
|
+
enum :iterator_type, [ :character, :word, :line, :sentence, :title]
|
190
|
+
# Word-break rule status tags, mirroring ICU's UWordBreak enum. Every
# category is a [tag, tag_limit) pair, so :ideo_limit must be 500
# (UBRK_WORD_IDEO_LIMIT), not a repeat of :ideo's 400.
enum :word_break, [ :none,           0,
                    :none_limit,   100,
                    :number,       100,
                    :number_limit, 200,
                    :letter,       200,
                    :letter_limit, 300,
                    :kana,         300,
                    :kana_limit,   400,
                    :ideo,         400,
                    :ideo_limit,   500
]
|
201
|
+
|
202
|
+
# Locale discovery for break iterators.
attach_function :ubrk_countAvailable, "ubrk_countAvailable#{suffix}", [], :int32_t
attach_function :ubrk_getAvailable, "ubrk_getAvailable#{suffix}", [:int32_t], :string

# Lifecycle and text assignment. The trailing :pointer argument of
# ubrk_open / ubrk_setText is the UErrorCode out-parameter.
attach_function :ubrk_open, "ubrk_open#{suffix}", [:iterator_type, :string, :pointer, :int32_t, :pointer], :pointer
attach_function :ubrk_close, "ubrk_close#{suffix}", [:pointer], :void
attach_function :ubrk_setText, "ubrk_setText#{suffix}", [:pointer, :pointer, :int32_t, :pointer], :void

# Navigation; each returns a boundary offset.
attach_function :ubrk_current, "ubrk_current#{suffix}", [:pointer], :int32_t
attach_function :ubrk_next, "ubrk_next#{suffix}", [:pointer], :int32_t
attach_function :ubrk_previous, "ubrk_previous#{suffix}", [:pointer], :int32_t
attach_function :ubrk_first, "ubrk_first#{suffix}", [:pointer], :int32_t
attach_function :ubrk_last, "ubrk_last#{suffix}", [:pointer], :int32_t

# Offset-relative queries used by BreakIterator#preceding, #following and
# #boundary?. ubrk_isBoundary returns a UBool, which ICU defines as int8_t.
attach_function :ubrk_preceding, "ubrk_preceding#{suffix}", [:pointer, :int32_t], :int32_t
attach_function :ubrk_following, "ubrk_following#{suffix}", [:pointer, :int32_t], :int32_t
attach_function :ubrk_isBoundary, "ubrk_isBoundary#{suffix}", [:pointer, :int32_t], :int8_t
|
182
213
|
end # Lib
|
183
214
|
end # ICU
|
@@ -2,27 +2,28 @@ module ICU
|
|
2
2
|
module Normalization
|
3
3
|
|
4
4
|
def self.normalize(input, mode = :default)
|
5
|
-
input_length
|
6
|
-
needed_length
|
7
|
-
|
5
|
+
input_length = input.unpack("U*").size
|
6
|
+
needed_length = out_length = options = 0
|
7
|
+
in_ptr = UCharPointer.from_string(input)
|
8
|
+
out_ptr = UCharPointer.new(out_length)
|
8
9
|
|
9
10
|
retried = false
|
10
|
-
ptr = nil
|
11
11
|
|
12
12
|
begin
|
13
13
|
Lib.check_error do |error|
|
14
|
-
needed_length = Lib.unorm_normalize(
|
14
|
+
needed_length = Lib.unorm_normalize(in_ptr, input_length, mode, options, out_ptr, out_length, error)
|
15
15
|
end
|
16
16
|
rescue BufferOverflowError
|
17
|
-
raise if retried
|
18
|
-
|
19
|
-
|
17
|
+
raise BufferOverflowError, "needed: #{needed_length}" if retried
|
18
|
+
|
19
|
+
out_ptr = out_ptr.resized_to needed_length
|
20
|
+
out_length = needed_length + 1
|
20
21
|
|
21
22
|
retried = true
|
22
23
|
retry
|
23
24
|
end
|
24
25
|
|
25
|
-
|
26
|
+
out_ptr.string
|
26
27
|
end
|
27
28
|
|
28
29
|
end # Normalization
|
@@ -2,8 +2,8 @@ module ICU
|
|
2
2
|
module Transliteration
|
3
3
|
|
4
4
|
class << self
|
5
|
-
def transliterate(translit_id, str)
|
6
|
-
t = Transliterator.new translit_id
|
5
|
+
def transliterate(translit_id, str, rules = nil)
|
6
|
+
t = Transliterator.new translit_id, rules
|
7
7
|
res = t.transliterate str
|
8
8
|
t.close
|
9
9
|
|
@@ -25,11 +25,22 @@ module ICU
|
|
25
25
|
|
26
26
|
class Transliterator
|
27
27
|
|
28
|
-
def initialize(id, direction = :forward)
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
28
|
+
# Opens an ICU transliterator.
#
# id        - transliterator ID, passed to utrans_open
# rules     - optional custom rule String; converted to a UChar buffer
#             when given, otherwise nil with a length of 0
# direction - transliteration direction (defaults to :forward)
#
# Raises the ICU::Error produced by Lib.check_error, with the parse error
# details appended to its message.
def initialize(id, rules = nil, direction = :forward)
  if rules
    rules_length = rules.length + 1
    rules = UCharPointer.from_string(rules)
  else
    rules_length = 0
  end

  parse_error = Lib::UParseError.new
  begin
    Lib.check_error do |status|
      # couldn't get utrans_openU to work properly, so using deprecated utrans_open for now
      #
      # Pass the local struct (not an unset instance variable) so ICU can
      # fill in the details reported by the rescue clause below.
      @tr = Lib.utrans_open(id, direction, rules, rules_length, parse_error, status)
    end
  rescue ICU::Error => ex
    raise ex, "#{ex.message} (#{parse_error})"
  end
end
|
35
46
|
|
data/lib/ffi-icu/uchar.rb
CHANGED
@@ -8,15 +8,19 @@ module ICU
|
|
8
8
|
str = str.encode("UTF-8") if str.respond_to? :encode
|
9
9
|
bytes = str.unpack("U*")
|
10
10
|
|
11
|
-
ptr = new
|
11
|
+
ptr = new bytes.size
|
12
12
|
ptr.put_array_of_uint16 0, bytes
|
13
13
|
|
14
14
|
ptr
|
15
15
|
end
|
16
16
|
|
17
|
+
def initialize(size)
|
18
|
+
super UCHAR_TYPE, size
|
19
|
+
end
|
20
|
+
|
17
21
|
def resized_to(new_size)
|
18
22
|
raise "new_size must be larger than current size" if new_size < size
|
19
|
-
resized = self.class.new
|
23
|
+
resized = self.class.new new_size
|
20
24
|
resized.put_bytes(0, get_bytes(0, size))
|
21
25
|
|
22
26
|
resized
|
@@ -0,0 +1,28 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
require "spec_helper"
|
4
|
+
|
5
|
+
module ICU
|
6
|
+
describe BreakIterator do
|
7
|
+
|
8
|
+
it "should return available locales" do
|
9
|
+
locales = ICU::BreakIterator.available_locales
|
10
|
+
locales.should be_kind_of(Array)
|
11
|
+
locales.should_not be_empty
|
12
|
+
locales.should include("en_US")
|
13
|
+
end
|
14
|
+
|
15
|
+
it "finds all word boundaries in an English string" do
|
16
|
+
iterator = BreakIterator.new :word, "en_US"
|
17
|
+
iterator.text = "Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua."
|
18
|
+
iterator.to_a.should == [0, 5, 6, 11, 12, 17, 18, 21, 22, 26, 27, 28, 39, 40, 51, 52, 56, 57, 58, 61, 62, 64, 65, 72, 73, 79, 80, 90, 91, 93, 94, 100, 101, 103, 104, 110, 111, 116, 117, 123, 124]
|
19
|
+
end
|
20
|
+
|
21
|
+
it "finds all sentence boundaries in an English string" do
|
22
|
+
iterator = BreakIterator.new :sentence, "en_US"
|
23
|
+
iterator.text = "This is a sentence. This is another sentence, with a comma in it."
|
24
|
+
iterator.to_a.should == [0, 20, 65]
|
25
|
+
end
|
26
|
+
|
27
|
+
end # BreakIterator
|
28
|
+
end # ICU
|
data/spec/collation_spec.rb
CHANGED
data/spec/spec_helper.rb
CHANGED
@@ -29,5 +29,10 @@ module ICU
|
|
29
29
|
ids.should be_kind_of(Array)
|
30
30
|
ids.should_not be_empty
|
31
31
|
end
|
32
|
+
|
33
|
+
# it "should transliterate custom rules" do
|
34
|
+
# ICU::Transliteration.translit("Accents-Any", "âêîôû", "NFD; [:Nonspacing Mark:] Remove; NFC").should == "aeiou"
|
35
|
+
# end
|
36
|
+
|
32
37
|
end # Transliteration
|
33
38
|
end # ICU
|
metadata
CHANGED
@@ -1,13 +1,8 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: ffi-icu
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
|
5
|
-
|
6
|
-
segments:
|
7
|
-
- 0
|
8
|
-
- 0
|
9
|
-
- 2
|
10
|
-
version: 0.0.2
|
4
|
+
prerelease:
|
5
|
+
version: 0.0.3
|
11
6
|
platform: ruby
|
12
7
|
authors:
|
13
8
|
- Jari Bakken
|
@@ -15,8 +10,7 @@ autorequire:
|
|
15
10
|
bindir: bin
|
16
11
|
cert_chain: []
|
17
12
|
|
18
|
-
date: 2010-
|
19
|
-
default_executable:
|
13
|
+
date: 2010-08-23 00:00:00 Z
|
20
14
|
dependencies:
|
21
15
|
- !ruby/object:Gem::Dependency
|
22
16
|
name: ffi
|
@@ -24,14 +18,9 @@ dependencies:
|
|
24
18
|
requirement: &id001 !ruby/object:Gem::Requirement
|
25
19
|
none: false
|
26
20
|
requirements:
|
27
|
-
- -
|
21
|
+
- - ~>
|
28
22
|
- !ruby/object:Gem::Version
|
29
|
-
|
30
|
-
segments:
|
31
|
-
- 0
|
32
|
-
- 6
|
33
|
-
- 3
|
34
|
-
version: 0.6.3
|
23
|
+
version: 1.0.9
|
35
24
|
type: :runtime
|
36
25
|
version_requirements: *id001
|
37
26
|
- !ruby/object:Gem::Dependency
|
@@ -40,17 +29,12 @@ dependencies:
|
|
40
29
|
requirement: &id002 !ruby/object:Gem::Requirement
|
41
30
|
none: false
|
42
31
|
requirements:
|
43
|
-
- -
|
32
|
+
- - ~>
|
44
33
|
- !ruby/object:Gem::Version
|
45
|
-
|
46
|
-
segments:
|
47
|
-
- 1
|
48
|
-
- 3
|
49
|
-
- 0
|
50
|
-
version: 1.3.0
|
34
|
+
version: 2.5.0
|
51
35
|
type: :development
|
52
36
|
version_requirements: *id002
|
53
|
-
description: Provides charset detection, locale sensitive collation and more.
|
37
|
+
description: Provides charset detection, locale sensitive collation and more. Depends on libicu.
|
54
38
|
email: jari.bakken@gmail.com
|
55
39
|
executables: []
|
56
40
|
|
@@ -62,19 +46,24 @@ extra_rdoc_files:
|
|
62
46
|
files:
|
63
47
|
- .document
|
64
48
|
- .gitignore
|
49
|
+
- Gemfile
|
65
50
|
- LICENSE
|
66
51
|
- README.rdoc
|
67
52
|
- Rakefile
|
68
|
-
- VERSION
|
69
53
|
- benchmark/detect.rb
|
70
54
|
- benchmark/shared.rb
|
55
|
+
- ffi-icu.gemspec
|
71
56
|
- lib/ffi-icu.rb
|
57
|
+
- lib/ffi-icu/break_iterator.rb
|
72
58
|
- lib/ffi-icu/chardet.rb
|
73
59
|
- lib/ffi-icu/collation.rb
|
60
|
+
- lib/ffi-icu/core_ext/string.rb
|
74
61
|
- lib/ffi-icu/lib.rb
|
75
62
|
- lib/ffi-icu/normalization.rb
|
76
63
|
- lib/ffi-icu/transliteration.rb
|
77
64
|
- lib/ffi-icu/uchar.rb
|
65
|
+
- lib/ffi-icu/version.rb
|
66
|
+
- spec/break_iterator_spec.rb
|
78
67
|
- spec/chardet_spec.rb
|
79
68
|
- spec/collation_spec.rb
|
80
69
|
- spec/normalization_spec.rb
|
@@ -82,7 +71,6 @@ files:
|
|
82
71
|
- spec/spec_helper.rb
|
83
72
|
- spec/transliteration_spec.rb
|
84
73
|
- test.c
|
85
|
-
has_rdoc: true
|
86
74
|
homepage: http://github.com/jarib/ffi-icu
|
87
75
|
licenses: []
|
88
76
|
|
@@ -96,29 +84,25 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
96
84
|
requirements:
|
97
85
|
- - ">="
|
98
86
|
- !ruby/object:Gem::Version
|
99
|
-
hash: 3
|
100
|
-
segments:
|
101
|
-
- 0
|
102
87
|
version: "0"
|
103
88
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
104
89
|
none: false
|
105
90
|
requirements:
|
106
91
|
- - ">="
|
107
92
|
- !ruby/object:Gem::Version
|
108
|
-
hash: 3
|
109
|
-
segments:
|
110
|
-
- 0
|
111
93
|
version: "0"
|
112
94
|
requirements: []
|
113
95
|
|
114
96
|
rubyforge_project:
|
115
|
-
rubygems_version: 1.
|
97
|
+
rubygems_version: 1.8.2
|
116
98
|
signing_key:
|
117
99
|
specification_version: 3
|
118
100
|
summary: Simple FFI wrappers for things I need from ICU.
|
119
101
|
test_files:
|
102
|
+
- spec/break_iterator_spec.rb
|
120
103
|
- spec/chardet_spec.rb
|
104
|
+
- spec/collation_spec.rb
|
121
105
|
- spec/normalization_spec.rb
|
122
|
-
- spec/
|
106
|
+
- spec/spec.opts
|
123
107
|
- spec/spec_helper.rb
|
124
|
-
- spec/
|
108
|
+
- spec/transliteration_spec.rb
|
data/VERSION
DELETED
@@ -1 +0,0 @@
|
|
1
|
-
0.0.2
|