ffi-icu 0.0.2 → 0.0.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +1 -0
- data/Gemfile +3 -0
- data/LICENSE +1 -1
- data/README.rdoc +1 -1
- data/Rakefile +7 -26
- data/ffi-icu.gemspec +33 -0
- data/lib/ffi-icu.rb +2 -0
- data/lib/ffi-icu/break_iterator.rb +67 -0
- data/lib/ffi-icu/core_ext/string.rb +5 -0
- data/lib/ffi-icu/lib.rb +33 -2
- data/lib/ffi-icu/normalization.rb +10 -9
- data/lib/ffi-icu/transliteration.rb +18 -7
- data/lib/ffi-icu/uchar.rb +6 -2
- data/lib/ffi-icu/version.rb +3 -0
- data/spec/break_iterator_spec.rb +28 -0
- data/spec/collation_spec.rb +1 -0
- data/spec/spec_helper.rb +2 -3
- data/spec/transliteration_spec.rb +5 -0
- metadata +19 -35
- data/VERSION +0 -1
data/.gitignore
CHANGED
data/Gemfile
ADDED
data/LICENSE
CHANGED
data/README.rdoc
CHANGED
data/Rakefile
CHANGED
@@ -1,39 +1,20 @@
|
|
1
1
|
require 'rubygems'
|
2
2
|
require 'rake'
|
3
3
|
|
4
|
-
|
5
|
-
|
6
|
-
Jeweler::Tasks.new do |gem|
|
7
|
-
gem.name = "ffi-icu"
|
8
|
-
gem.summary = %Q{Simple FFI wrappers for things I need from ICU.}
|
9
|
-
gem.description = %Q{Provides charset detection, locale sensitive collation and more.}
|
10
|
-
gem.email = "jari.bakken@gmail.com"
|
11
|
-
gem.homepage = "http://github.com/jarib/ffi-icu"
|
12
|
-
gem.authors = ["Jari Bakken"]
|
13
|
-
|
14
|
-
gem.add_dependency "ffi", ">= 0.6.3"
|
15
|
-
gem.add_development_dependency "rspec", ">= 1.3.0"
|
16
|
-
# gem is a Gem::Specification... see http://www.rubygems.org/read/chapter/20 for additional settings
|
17
|
-
end
|
4
|
+
require 'bundler'
|
5
|
+
Bundler::GemHelper.install_tasks
|
18
6
|
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
end
|
23
|
-
|
24
|
-
require 'spec/rake/spectask'
|
25
|
-
Spec::Rake::SpecTask.new(:spec) do |spec|
|
26
|
-
spec.libs << 'lib' << 'spec'
|
27
|
-
spec.spec_files = FileList['spec/**/*_spec.rb']
|
7
|
+
require 'rspec/core/rake_task'
|
8
|
+
RSpec::Core::RakeTask.new(:spec) do |spec|
|
9
|
+
spec.pattern = 'spec/**/*_spec.rb'
|
28
10
|
end
|
29
11
|
|
30
|
-
|
31
|
-
spec.libs << 'lib' << 'spec'
|
12
|
+
RSpec::Core::RakeTask.new(:rcov) do |spec|
|
32
13
|
spec.pattern = 'spec/**/*_spec.rb'
|
33
14
|
spec.rcov = true
|
34
15
|
end
|
35
16
|
|
36
|
-
task :spec
|
17
|
+
task :spec
|
37
18
|
|
38
19
|
task :default => :spec
|
39
20
|
|
data/ffi-icu.gemspec
ADDED
@@ -0,0 +1,33 @@
|
|
1
|
+
# -*- encoding: utf-8 -*-
#
# Gem specification for ffi-icu.
#
# The encoding magic comment must stay on the first line of the file; Ruby
# ignores it anywhere else. The version number is read from
# lib/ffi-icu/version.rb so that it is defined in exactly one place.

require File.expand_path("../lib/ffi-icu/version", __FILE__)

Gem::Specification.new do |s|
  s.name = %q{ffi-icu}
  s.version = ICU::VERSION

  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
  s.authors = ["Jari Bakken"]
  s.date = %q{2010-08-23}
  s.description = %q{Provides charset detection, locale sensitive collation and more. Depends on libicu.}
  s.email = %q{jari.bakken@gmail.com}
  s.extra_rdoc_files = [
    "LICENSE",
    "README.rdoc"
  ]

  # The file lists are taken from git, so the gem must be built from a git checkout.
  s.files = `git ls-files`.split("\n")
  s.test_files = `git ls-files -- {test,spec,features}/*`.split("\n")
  s.executables = `git ls-files -- bin/*`.split("\n").map{ |f| File.basename(f) }
  s.require_paths = ["lib"]

  s.homepage = %q{http://github.com/jarib/ffi-icu}
  s.rdoc_options = ["--charset=UTF-8"]
  s.summary = %q{Simple FFI wrappers for things I need from ICU.}

  s.add_runtime_dependency(%q<ffi>, ["~> 1.0.9"])
  s.add_development_dependency(%q<rspec>, ["~> 2.5.0"])
end
|
33
|
+
|
data/lib/ffi-icu.rb
CHANGED
@@ -20,12 +20,14 @@ module ICU
|
|
20
20
|
end
|
21
21
|
end
|
22
22
|
|
23
|
+
require "ffi-icu/core_ext/string"
|
23
24
|
require "ffi-icu/lib"
|
24
25
|
require "ffi-icu/uchar"
|
25
26
|
require "ffi-icu/chardet"
|
26
27
|
require "ffi-icu/collation"
|
27
28
|
require "ffi-icu/transliteration"
|
28
29
|
require "ffi-icu/normalization"
|
30
|
+
require "ffi-icu/break_iterator"
|
29
31
|
|
30
32
|
unless ICU.ruby19?
|
31
33
|
require 'jcode'
|
@@ -0,0 +1,67 @@
|
|
1
|
+
module ICU
  # Ruby wrapper around ICU's text boundary analysis (the ubrk_* C functions).
  #
  # An instance is opened for a boundary type and a locale, is given a text
  # with #text=, and then reports boundary positions as integers. Enumerable
  # is mixed in, so #to_a, #map etc. operate on the boundaries yielded by #each.
  class BreakIterator
    include Enumerable

    # Value the navigation calls return once there are no more boundaries;
    # #each stops when it sees it.
    UBRK_DONE = -1

    # Lists the locales reported by ubrk_countAvailable/ubrk_getAvailable.
    #
    # @return [Array<String>] locale identifiers
    def self.available_locales
      (0...Lib.ubrk_countAvailable).map do |idx|
        Lib.ubrk_getAvailable idx
      end
    end

    # Opens a native break iterator with no text attached yet.
    #
    # @param type [Symbol] a member of the Lib :iterator_type enum
    # @param locale [String] locale identifier, e.g. "en_US"
    # @raise whatever Lib.check_error raises when ICU reports a failure
    def initialize(type, locale)
      ptr = Lib.check_error { |err| Lib.ubrk_open(type, locale, nil, 0, err) }

      # ubrk_close is registered as the releaser for the native handle.
      @iterator = FFI::AutoPointer.new(ptr, Lib.method(:ubrk_close))
    end

    # Sets the text to analyse.
    #
    # NOTE(review): the length handed to ICU is str.length, while
    # Normalization.normalize counts code points with unpack("U*").size;
    # confirm both agree for multi-byte input on Ruby 1.8.
    #
    # @param str [String] text to find boundaries in
    def text=(str)
      Lib.check_error { |err|
        Lib.ubrk_setText @iterator, UCharPointer.from_string(str), str.length, err
      }
    end

    # Yields every boundary from the first one onwards, stopping at UBRK_DONE.
    # Without a block an enumerator is returned instead of raising LocalJumpError.
    #
    # @yieldparam int [Integer] boundary position
    def each
      return to_enum(:each) unless block_given?

      int = first

      while int != UBRK_DONE
        yield int
        int = self.next
      end
    end

    # Advances to the boundary after the current one.
    #
    # @return [Integer] the new position, or UBRK_DONE
    def next
      Lib.ubrk_next @iterator
    end

    # Moves back to the boundary before the current one.
    #
    # @return [Integer] the new position, or UBRK_DONE
    def previous
      # Must call ubrk_previous; calling ubrk_next here moved the iterator forward.
      Lib.ubrk_previous @iterator
    end

    # Moves to the first boundary of the text.
    #
    # @return [Integer] the position of the first boundary
    def first
      Lib.ubrk_first @iterator
    end

    # Moves to the last boundary of the text.
    #
    # @return [Integer] the position of the last boundary
    def last
      Lib.ubrk_last @iterator
    end

    # NOTE(review): ICU's C function ubrk_preceding takes (iterator, offset);
    # this wrapper passes only the iterator. Confirm the binding and add the
    # offset argument before relying on this method.
    def preceding
      Lib.ubrk_preceding @iterator
    end

    # NOTE(review): ICU's C function ubrk_following takes (iterator, offset);
    # this wrapper passes only the iterator. Confirm the binding and add the
    # offset argument before relying on this method.
    def following
      Lib.ubrk_following @iterator
    end

    # @return [Integer] the position most recently returned by a navigation call
    def current
      Lib.ubrk_current @iterator
    end

    # Tells whether the given offset is a boundary.
    #
    # @param index [#to_int, String] offset, coerced with Integer()
    # @return [Boolean]
    def boundary?(index)
      Lib.ubrk_isBoundary(@iterator, Integer(index)) != 0
    end

  end # BreakIterator
end # ICU
|
data/lib/ffi-icu/lib.rb
CHANGED
@@ -20,7 +20,7 @@ module ICU
|
|
20
20
|
# let the user tell us where the lib is
|
21
21
|
if ENV['FFI_ICU_LIB']
|
22
22
|
libs = ENV['FFI_ICU_LIB'].split(",")
|
23
|
-
ffi_lib
|
23
|
+
ffi_lib(*libs)
|
24
24
|
|
25
25
|
if ENV['FFI_ICU_VERSION_SUFFIX']
|
26
26
|
return ENV['FFI_ICU_VERSION_SUFFIX']
|
@@ -145,7 +145,9 @@ module ICU
|
|
145
145
|
:pre_context, :pointer,
|
146
146
|
:post_context, :pointer
|
147
147
|
|
148
|
-
|
148
|
+
# Inspect-style one-line summary of the parse error: the class name, an
# identifier derived from #hash, and the :line and :offset struct members.
# The closing ">" balances the opening "#<".
def to_s
  "#<%s:%x line: %d offset: %d>" % [self.class, hash*2, self[:line], self[:offset]]
end
|
149
151
|
end
|
150
152
|
|
151
153
|
class UTransPosition < FFI::Struct
|
@@ -179,5 +181,34 @@ module ICU
|
|
179
181
|
]
|
180
182
|
|
181
183
|
attach_function :unorm_normalize, "unorm_normalize#{suffix}", [:pointer, :int32_t, :normalization_mode, :int32_t, :pointer, :int32_t, :pointer], :int32_t
|
184
|
+
|
185
|
+
#
|
186
|
+
# Text Boundary Analysis
|
187
|
+
#
|
188
|
+
|
189
|
+
enum :iterator_type, [ :character, :word, :line, :sentence, :title]
|
190
|
+
# Word-break rule status values, mirroring ICU's UWordBreak enum.
# Each category starts at its own value and its *_limit entry equals the
# start of the next category, so :ideo_limit is 500, not 400.
enum :word_break, [ :none,         0,
                    :none_limit,   100,
                    :number,       100,
                    :number_limit, 200,
                    :letter,       200,
                    :letter_limit, 300,
                    :kana,         300,
                    :kana_limit,   400,
                    :ideo,         400,
                    :ideo_limit,   500
                  ]
|
201
|
+
|
202
|
+
# Locale discovery.
attach_function :ubrk_countAvailable, "ubrk_countAvailable#{suffix}", [], :int32_t
attach_function :ubrk_getAvailable, "ubrk_getAvailable#{suffix}", [:int32_t], :string

# Lifecycle and text assignment. The trailing :pointer argument of
# ubrk_open and ubrk_setText is the error-status out parameter.
attach_function :ubrk_open, "ubrk_open#{suffix}", [:iterator_type, :string, :pointer, :int32_t, :pointer], :pointer
attach_function :ubrk_close, "ubrk_close#{suffix}", [:pointer], :void
attach_function :ubrk_setText, "ubrk_setText#{suffix}", [:pointer, :pointer, :int32_t, :pointer], :void

# Navigation; each call returns a boundary position.
attach_function :ubrk_current, "ubrk_current#{suffix}", [:pointer], :int32_t
attach_function :ubrk_next, "ubrk_next#{suffix}", [:pointer], :int32_t
attach_function :ubrk_previous, "ubrk_previous#{suffix}", [:pointer], :int32_t
attach_function :ubrk_first, "ubrk_first#{suffix}", [:pointer], :int32_t
attach_function :ubrk_last, "ubrk_last#{suffix}", [:pointer], :int32_t

# Offset-based queries. In the ICU C API these take (iterator, offset);
# ubrk_isBoundary returns a UBool, bound here as an 8-bit integer so that
# callers can compare the result against 0.
attach_function :ubrk_preceding, "ubrk_preceding#{suffix}", [:pointer, :int32_t], :int32_t
attach_function :ubrk_following, "ubrk_following#{suffix}", [:pointer, :int32_t], :int32_t
attach_function :ubrk_isBoundary, "ubrk_isBoundary#{suffix}", [:pointer, :int32_t], :int8_t
|
182
213
|
end # Lib
|
183
214
|
end # ICU
|
@@ -2,27 +2,28 @@ module ICU
|
|
2
2
|
module Normalization
|
3
3
|
|
4
4
|
def self.normalize(input, mode = :default)
|
5
|
-
input_length
|
6
|
-
needed_length
|
7
|
-
|
5
|
+
input_length = input.unpack("U*").size
|
6
|
+
needed_length = out_length = options = 0
|
7
|
+
in_ptr = UCharPointer.from_string(input)
|
8
|
+
out_ptr = UCharPointer.new(out_length)
|
8
9
|
|
9
10
|
retried = false
|
10
|
-
ptr = nil
|
11
11
|
|
12
12
|
begin
|
13
13
|
Lib.check_error do |error|
|
14
|
-
needed_length = Lib.unorm_normalize(
|
14
|
+
needed_length = Lib.unorm_normalize(in_ptr, input_length, mode, options, out_ptr, out_length, error)
|
15
15
|
end
|
16
16
|
rescue BufferOverflowError
|
17
|
-
raise if retried
|
18
|
-
|
19
|
-
|
17
|
+
raise BufferOverflowError, "needed: #{needed_length}" if retried
|
18
|
+
|
19
|
+
out_ptr = out_ptr.resized_to needed_length
|
20
|
+
out_length = needed_length + 1
|
20
21
|
|
21
22
|
retried = true
|
22
23
|
retry
|
23
24
|
end
|
24
25
|
|
25
|
-
|
26
|
+
out_ptr.string
|
26
27
|
end
|
27
28
|
|
28
29
|
end # Normalization
|
@@ -2,8 +2,8 @@ module ICU
|
|
2
2
|
module Transliteration
|
3
3
|
|
4
4
|
class << self
|
5
|
-
def transliterate(translit_id, str)
|
6
|
-
t = Transliterator.new translit_id
|
5
|
+
def transliterate(translit_id, str, rules = nil)
|
6
|
+
t = Transliterator.new translit_id, rules
|
7
7
|
res = t.transliterate str
|
8
8
|
t.close
|
9
9
|
|
@@ -25,11 +25,22 @@ module ICU
|
|
25
25
|
|
26
26
|
class Transliterator
|
27
27
|
|
28
|
-
def initialize(id, direction = :forward)
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
28
|
+
# Opens a native transliterator and stores its handle in @tr.
#
# @param id [String] transliterator identifier
# @param rules [String, nil] optional custom rules; when given they are
#   converted with UCharPointer.from_string and their length + 1 is passed on
# @param direction [Symbol] transliteration direction, :forward by default
# @raise [ICU::Error] re-raised with the parse error summary appended to the message
def initialize(id, rules = nil, direction = :forward)
  if rules
    rules_length = rules.length + 1
    rules = UCharPointer.from_string(rules)
  else
    rules_length = 0
  end

  parse_error = Lib::UParseError.new
  begin
    Lib.check_error do |status|
      # couldn't get utrans_openU to work properly, so using deprecated utrans_open for now
      # Pass the local struct (not an unset instance variable) so that the
      # message built in the rescue clause reports what ICU wrote into it.
      @tr = Lib.utrans_open(id, direction, rules, rules_length, parse_error, status)
    end
  rescue ICU::Error => ex
    raise ex, "#{ex.message} (#{parse_error})"
  end
end
|
35
46
|
|
data/lib/ffi-icu/uchar.rb
CHANGED
@@ -8,15 +8,19 @@ module ICU
|
|
8
8
|
str = str.encode("UTF-8") if str.respond_to? :encode
|
9
9
|
bytes = str.unpack("U*")
|
10
10
|
|
11
|
-
ptr = new
|
11
|
+
ptr = new bytes.size
|
12
12
|
ptr.put_array_of_uint16 0, bytes
|
13
13
|
|
14
14
|
ptr
|
15
15
|
end
|
16
16
|
|
17
|
+
def initialize(size)
|
18
|
+
super UCHAR_TYPE, size
|
19
|
+
end
|
20
|
+
|
17
21
|
def resized_to(new_size)
|
18
22
|
raise "new_size must be larger than current size" if new_size < size
|
19
|
-
resized = self.class.new
|
23
|
+
resized = self.class.new new_size
|
20
24
|
resized.put_bytes(0, get_bytes(0, size))
|
21
25
|
|
22
26
|
resized
|
@@ -0,0 +1,28 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
require "spec_helper"
|
4
|
+
|
5
|
+
module ICU
|
6
|
+
describe BreakIterator do
|
7
|
+
|
8
|
+
it "should return available locales" do
|
9
|
+
locales = ICU::BreakIterator.available_locales
|
10
|
+
locales.should be_kind_of(Array)
|
11
|
+
locales.should_not be_empty
|
12
|
+
locales.should include("en_US")
|
13
|
+
end
|
14
|
+
|
15
|
+
it "finds all word boundaries in an English string" do
|
16
|
+
iterator = BreakIterator.new :word, "en_US"
|
17
|
+
iterator.text = "Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua."
|
18
|
+
iterator.to_a.should == [0, 5, 6, 11, 12, 17, 18, 21, 22, 26, 27, 28, 39, 40, 51, 52, 56, 57, 58, 61, 62, 64, 65, 72, 73, 79, 80, 90, 91, 93, 94, 100, 101, 103, 104, 110, 111, 116, 117, 123, 124]
|
19
|
+
end
|
20
|
+
|
21
|
+
it "finds all sentence boundaries in an English string" do
|
22
|
+
iterator = BreakIterator.new :sentence, "en_US"
|
23
|
+
iterator.text = "This is a sentence. This is another sentence, with a comma in it."
|
24
|
+
iterator.to_a.should == [0, 20, 65]
|
25
|
+
end
|
26
|
+
|
27
|
+
end # BreakIterator
|
28
|
+
end # ICU
|
data/spec/collation_spec.rb
CHANGED
data/spec/spec_helper.rb
CHANGED
@@ -29,5 +29,10 @@ module ICU
|
|
29
29
|
ids.should be_kind_of(Array)
|
30
30
|
ids.should_not be_empty
|
31
31
|
end
|
32
|
+
|
33
|
+
# it "should transliterate custom rules" do
|
34
|
+
# ICU::Transliteration.translit("Accents-Any", "âêîôû", "NFD; [:Nonspacing Mark:] Remove; NFC").should == "aeiou"
|
35
|
+
# end
|
36
|
+
|
32
37
|
end # Transliteration
|
33
38
|
end # ICU
|
metadata
CHANGED
@@ -1,13 +1,8 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: ffi-icu
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
|
5
|
-
|
6
|
-
segments:
|
7
|
-
- 0
|
8
|
-
- 0
|
9
|
-
- 2
|
10
|
-
version: 0.0.2
|
4
|
+
prerelease:
|
5
|
+
version: 0.0.3
|
11
6
|
platform: ruby
|
12
7
|
authors:
|
13
8
|
- Jari Bakken
|
@@ -15,8 +10,7 @@ autorequire:
|
|
15
10
|
bindir: bin
|
16
11
|
cert_chain: []
|
17
12
|
|
18
|
-
date: 2010-
|
19
|
-
default_executable:
|
13
|
+
date: 2010-08-23 00:00:00 Z
|
20
14
|
dependencies:
|
21
15
|
- !ruby/object:Gem::Dependency
|
22
16
|
name: ffi
|
@@ -24,14 +18,9 @@ dependencies:
|
|
24
18
|
requirement: &id001 !ruby/object:Gem::Requirement
|
25
19
|
none: false
|
26
20
|
requirements:
|
27
|
-
- -
|
21
|
+
- - ~>
|
28
22
|
- !ruby/object:Gem::Version
|
29
|
-
|
30
|
-
segments:
|
31
|
-
- 0
|
32
|
-
- 6
|
33
|
-
- 3
|
34
|
-
version: 0.6.3
|
23
|
+
version: 1.0.9
|
35
24
|
type: :runtime
|
36
25
|
version_requirements: *id001
|
37
26
|
- !ruby/object:Gem::Dependency
|
@@ -40,17 +29,12 @@ dependencies:
|
|
40
29
|
requirement: &id002 !ruby/object:Gem::Requirement
|
41
30
|
none: false
|
42
31
|
requirements:
|
43
|
-
- -
|
32
|
+
- - ~>
|
44
33
|
- !ruby/object:Gem::Version
|
45
|
-
|
46
|
-
segments:
|
47
|
-
- 1
|
48
|
-
- 3
|
49
|
-
- 0
|
50
|
-
version: 1.3.0
|
34
|
+
version: 2.5.0
|
51
35
|
type: :development
|
52
36
|
version_requirements: *id002
|
53
|
-
description: Provides charset detection, locale sensitive collation and more.
|
37
|
+
description: Provides charset detection, locale sensitive collation and more. Depends on libicu.
|
54
38
|
email: jari.bakken@gmail.com
|
55
39
|
executables: []
|
56
40
|
|
@@ -62,19 +46,24 @@ extra_rdoc_files:
|
|
62
46
|
files:
|
63
47
|
- .document
|
64
48
|
- .gitignore
|
49
|
+
- Gemfile
|
65
50
|
- LICENSE
|
66
51
|
- README.rdoc
|
67
52
|
- Rakefile
|
68
|
-
- VERSION
|
69
53
|
- benchmark/detect.rb
|
70
54
|
- benchmark/shared.rb
|
55
|
+
- ffi-icu.gemspec
|
71
56
|
- lib/ffi-icu.rb
|
57
|
+
- lib/ffi-icu/break_iterator.rb
|
72
58
|
- lib/ffi-icu/chardet.rb
|
73
59
|
- lib/ffi-icu/collation.rb
|
60
|
+
- lib/ffi-icu/core_ext/string.rb
|
74
61
|
- lib/ffi-icu/lib.rb
|
75
62
|
- lib/ffi-icu/normalization.rb
|
76
63
|
- lib/ffi-icu/transliteration.rb
|
77
64
|
- lib/ffi-icu/uchar.rb
|
65
|
+
- lib/ffi-icu/version.rb
|
66
|
+
- spec/break_iterator_spec.rb
|
78
67
|
- spec/chardet_spec.rb
|
79
68
|
- spec/collation_spec.rb
|
80
69
|
- spec/normalization_spec.rb
|
@@ -82,7 +71,6 @@ files:
|
|
82
71
|
- spec/spec_helper.rb
|
83
72
|
- spec/transliteration_spec.rb
|
84
73
|
- test.c
|
85
|
-
has_rdoc: true
|
86
74
|
homepage: http://github.com/jarib/ffi-icu
|
87
75
|
licenses: []
|
88
76
|
|
@@ -96,29 +84,25 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
96
84
|
requirements:
|
97
85
|
- - ">="
|
98
86
|
- !ruby/object:Gem::Version
|
99
|
-
hash: 3
|
100
|
-
segments:
|
101
|
-
- 0
|
102
87
|
version: "0"
|
103
88
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
104
89
|
none: false
|
105
90
|
requirements:
|
106
91
|
- - ">="
|
107
92
|
- !ruby/object:Gem::Version
|
108
|
-
hash: 3
|
109
|
-
segments:
|
110
|
-
- 0
|
111
93
|
version: "0"
|
112
94
|
requirements: []
|
113
95
|
|
114
96
|
rubyforge_project:
|
115
|
-
rubygems_version: 1.
|
97
|
+
rubygems_version: 1.8.2
|
116
98
|
signing_key:
|
117
99
|
specification_version: 3
|
118
100
|
summary: Simple FFI wrappers for things I need from ICU.
|
119
101
|
test_files:
|
102
|
+
- spec/break_iterator_spec.rb
|
120
103
|
- spec/chardet_spec.rb
|
104
|
+
- spec/collation_spec.rb
|
121
105
|
- spec/normalization_spec.rb
|
122
|
-
- spec/
|
106
|
+
- spec/spec.opts
|
123
107
|
- spec/spec_helper.rb
|
124
|
-
- spec/
|
108
|
+
- spec/transliteration_spec.rb
|
data/VERSION
DELETED
@@ -1 +0,0 @@
|
|
1
|
-
0.0.2
|