ffi-icu 0.0.6 → 0.0.7
Sign up to get free protection for your applications and to get access to all the features.
- data/README.md +116 -0
- data/Rakefile +1 -1
- data/ffi-icu.gemspec +3 -6
- data/lib/ffi-icu/break_iterator.rb +27 -3
- data/lib/ffi-icu/lib.rb +1 -1
- data/lib/ffi-icu/version.rb +1 -1
- data/spec/break_iterator_spec.rb +15 -1
- metadata +4 -4
- data/README.rdoc +0 -85
data/README.md
ADDED
@@ -0,0 +1,116 @@
|
|
1
|
+
ffi-icu
|
2
|
+
=======
|
3
|
+
|
4
|
+
Simple FFI wrappers for things I need from ICU. For the full thing, check out [ICU4R](http://icu4r.rubyforge.org/) instead.
|
5
|
+
|
6
|
+
Gem
|
7
|
+
---
|
8
|
+
|
9
|
+
[Rubygem](http://rubygems.org/gems/ffi-icu "ffi-icu")
|
10
|
+
|
11
|
+
gem install ffi-icu
|
12
|
+
|
13
|
+
Dependencies
|
14
|
+
------------
|
15
|
+
|
16
|
+
ICU.
|
17
|
+
|
18
|
+
If you get messages that the library or functions are not found, you can
|
19
|
+
set some environment variables to tell ffi-icu where to find it, e.g.:
|
20
|
+
|
21
|
+
$ export FFI_ICU_LIB="icui18n.so"
|
22
|
+
$ export FFI_ICU_VERSION_SUFFIX="_3_8"
|
23
|
+
$ ruby -r ffi-icu program.rb
|
24
|
+
|
25
|
+
Features
|
26
|
+
========
|
27
|
+
|
28
|
+
Character Encoding Detection
|
29
|
+
----------------------------
|
30
|
+
|
31
|
+
Examples:
|
32
|
+
|
33
|
+
```ruby
|
34
|
+
|
35
|
+
match = ICU::CharDet.detect(str)
|
36
|
+
match.name # => "UTF-8"
|
37
|
+
match.confidence # => 80
|
38
|
+
```
|
39
|
+
|
40
|
+
or
|
41
|
+
|
42
|
+
```ruby
|
43
|
+
detector = ICU::CharDet::Detector.new
|
44
|
+
detector.detect(str) # => #<struct ICU::CharDet::Detector::Match ...>
|
45
|
+
```
|
46
|
+
|
47
|
+
Why not just use rchardet?
|
48
|
+
|
49
|
+
* speed
|
50
|
+
* 1.9 support
|
51
|
+
|
52
|
+
Locale Sensitive Collation
|
53
|
+
--------------------------
|
54
|
+
|
55
|
+
Examples:
|
56
|
+
|
57
|
+
```ruby
|
58
|
+
ICU::Collation.collate("nb", %w[å æ ø]) == %w[æ ø å] #=> true
|
59
|
+
```
|
60
|
+
|
61
|
+
or
|
62
|
+
|
63
|
+
```ruby
|
64
|
+
collator = ICU::Collation::Collator.new("nb")
|
65
|
+
collator.compare("a", "b") #=> -1
|
66
|
+
collator.greater?("z", "a") #=> true
|
67
|
+
collator.collate(%w[å æ ø]) #=> ["æ", "ø", "å"]
|
68
|
+
```
|
69
|
+
|
70
|
+
Text Boundary Analysis
|
71
|
+
----------------------
|
72
|
+
|
73
|
+
Examples:
|
74
|
+
|
75
|
+
```ruby
|
76
|
+
iterator = ICU::BreakIterator.new(:word, "en_US")
|
77
|
+
iterator.text = "This is a sentence."
|
78
|
+
iterator.to_a #=> [0, 4, 5, 7, 8, 9, 10, 18, 19]
|
79
|
+
```
|
80
|
+
|
81
|
+
Tested on:
|
82
|
+
==========
|
83
|
+
|
84
|
+
Platforms:
|
85
|
+
|
86
|
+
* OS X 10.6
|
87
|
+
* Arch Linux
|
88
|
+
|
89
|
+
Rubies:
|
90
|
+
|
91
|
+
* MRI 1.9.1
|
92
|
+
* MRI 1.8.7
|
93
|
+
|
94
|
+
TODO:
|
95
|
+
=====
|
96
|
+
|
97
|
+
* Useful ICU stuff:
|
98
|
+
- number formatting (decimal points, thousand separators, currency)
|
99
|
+
- date formatting
|
100
|
+
* Windows?!
|
101
|
+
|
102
|
+
Note on Patches/Pull Requests
|
103
|
+
=============================
|
104
|
+
|
105
|
+
* Fork the project.
|
106
|
+
* Make your feature addition or bug fix.
|
107
|
+
* Add tests for it. This is important so I don't break it in a
|
108
|
+
future version unintentionally.
|
109
|
+
* Commit, do not mess with rakefile, version, or history.
|
110
|
+
(if you want to have your own version, that is fine but bump version in a commit by itself I can ignore when I pull)
|
111
|
+
* Send me a pull request. Bonus points for topic branches.
|
112
|
+
|
113
|
+
Copyright
|
114
|
+
=========
|
115
|
+
|
116
|
+
Copyright (c) 2010-2011 Jari Bakken. See LICENSE for details.
|
data/Rakefile
CHANGED
data/ffi-icu.gemspec
CHANGED
@@ -14,18 +14,15 @@ Gem::Specification.new do |s|
|
|
14
14
|
s.date = %q{2010-08-23}
|
15
15
|
s.description = %q{Provides charset detection, locale sensitive collation and more. Depends on libicu.}
|
16
16
|
s.email = %q{jari.bakken@gmail.com}
|
17
|
-
s.extra_rdoc_files = [
|
18
|
-
"LICENSE",
|
19
|
-
"README.rdoc"
|
20
|
-
]
|
17
|
+
s.extra_rdoc_files = ["LICENSE", "README.md"]
|
21
18
|
s.files = `git ls-files`.split("\n")
|
22
19
|
s.test_files = `git ls-files -- {test,spec,features}/*`.split("\n")
|
23
20
|
s.executables = `git ls-files -- bin/*`.split("\n").map{ |f| File.basename(f) }
|
24
21
|
s.require_paths = ["lib"]
|
25
|
-
|
22
|
+
|
26
23
|
s.homepage = %q{http://github.com/jarib/ffi-icu}
|
27
24
|
s.rdoc_options = ["--charset=UTF-8"]
|
28
|
-
s.summary = %q{Simple FFI wrappers for things I need from ICU.}
|
25
|
+
s.summary = %q{Simple Ruby FFI wrappers for things I need from ICU.}
|
29
26
|
|
30
27
|
s.add_runtime_dependency(%q<ffi>, ["~> 1.0.9"])
|
31
28
|
s.add_development_dependency(%q<rspec>, ["~> 2.5.0"])
|
data/lib/ffi-icu/break_iterator.rb
CHANGED
@@ -2,7 +2,9 @@ module ICU
|
|
2
2
|
class BreakIterator
|
3
3
|
include Enumerable
|
4
4
|
|
5
|
-
|
5
|
+
attr_reader :text
|
6
|
+
|
7
|
+
DONE = -1
|
6
8
|
|
7
9
|
def self.available_locales
|
8
10
|
(0...Lib.ubrk_countAvailable).map do |idx|
|
@@ -12,11 +14,12 @@ module ICU
|
|
12
14
|
|
13
15
|
def initialize(type, locale)
|
14
16
|
ptr = Lib.check_error { |err| Lib.ubrk_open(type, locale, nil, 0, err) }
|
15
|
-
|
16
17
|
@iterator = FFI::AutoPointer.new(ptr, Lib.method(:ubrk_close))
|
17
18
|
end
|
18
19
|
|
19
20
|
def text=(str)
|
21
|
+
@text = str
|
22
|
+
|
20
23
|
Lib.check_error { |err|
|
21
24
|
Lib.ubrk_setText @iterator, UCharPointer.from_string(str), str.jlength, err
|
22
25
|
}
|
@@ -27,10 +30,31 @@ module ICU
|
|
27
30
|
|
28
31
|
int = first
|
29
32
|
|
30
|
-
while int != -1
|
33
|
+
while int != DONE
|
31
34
|
yield int
|
32
35
|
int = self.next
|
33
36
|
end
|
37
|
+
|
38
|
+
self
|
39
|
+
end
|
40
|
+
|
41
|
+
def each_substring(&blk)
|
42
|
+
return to_enum(:each_substring) unless block_given?
|
43
|
+
|
44
|
+
# each_char needed for 1.8, where String#[] works on bytes, not characters
|
45
|
+
chars = text.each_char.to_a
|
46
|
+
low = first
|
47
|
+
|
48
|
+
while (high = self.next) != DONE
|
49
|
+
yield chars[low...high].join
|
50
|
+
low = high
|
51
|
+
end
|
52
|
+
|
53
|
+
self
|
54
|
+
end
|
55
|
+
|
56
|
+
def substrings
|
57
|
+
each_substring.to_a
|
34
58
|
end
|
35
59
|
|
36
60
|
def next
|
data/lib/ffi-icu/lib.rb
CHANGED
data/lib/ffi-icu/version.rb
CHANGED
data/spec/break_iterator_spec.rb
CHANGED
@@ -18,7 +18,21 @@ module ICU
|
|
18
18
|
iterator.to_a.should == [0, 5, 6, 11, 12, 17, 18, 21, 22, 26, 27, 28, 39, 40, 51, 52, 56, 57, 58, 61, 62, 64, 65, 72, 73, 79, 80, 90, 91, 93, 94, 100, 101, 103, 104, 110, 111, 116, 117, 123, 124]
|
19
19
|
end
|
20
20
|
|
21
|
-
it "finds all word boundaries in a non-ASCII string" do
|
21
|
+
it "returns each substring" do
|
22
|
+
iterator = BreakIterator.new :word, "en_US"
|
23
|
+
iterator.text = "Lorem ipsum dolor sit amet."
|
24
|
+
|
25
|
+
iterator.substrings.should == ["Lorem", " ", "ipsum", " ", "dolor", " ", "sit", " ", "amet", "."]
|
26
|
+
end
|
27
|
+
|
28
|
+
it "returns the substrings of a non-ASCII string" do
|
29
|
+
iterator = BreakIterator.new :word, "th_TH"
|
30
|
+
iterator.text = "รู้อะไรไม่สู้รู้วิชา รู้รักษาตัวรอดเป็นยอดดี"
|
31
|
+
|
32
|
+
iterator.substrings.should == ["รู้", "อะไร", "ไม่สู้", "รู้", "วิชา", " ", "รู้", "รักษา", "ตัว", "รอด", "เป็น", "ยอดดี"]
|
33
|
+
end
|
34
|
+
|
35
|
+
it "finds all word boundaries in a non-ASCII string" do
|
22
36
|
iterator = BreakIterator.new :word, "th_TH"
|
23
37
|
iterator.text = "การทดลอง"
|
24
38
|
iterator.to_a.should == [0, 3, 8]
|
metadata
CHANGED
@@ -2,7 +2,7 @@
|
|
2
2
|
name: ffi-icu
|
3
3
|
version: !ruby/object:Gem::Version
|
4
4
|
prerelease:
|
5
|
-
version: 0.0.6
|
5
|
+
version: 0.0.7
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
8
8
|
- Jari Bakken
|
@@ -42,13 +42,13 @@ extensions: []
|
|
42
42
|
|
43
43
|
extra_rdoc_files:
|
44
44
|
- LICENSE
|
45
|
-
- README.rdoc
|
45
|
+
- README.md
|
46
46
|
files:
|
47
47
|
- .document
|
48
48
|
- .gitignore
|
49
49
|
- Gemfile
|
50
50
|
- LICENSE
|
51
|
-
- README.rdoc
|
51
|
+
- README.md
|
52
52
|
- Rakefile
|
53
53
|
- benchmark/detect.rb
|
54
54
|
- benchmark/shared.rb
|
@@ -97,7 +97,7 @@ rubyforge_project:
|
|
97
97
|
rubygems_version: 1.8.2
|
98
98
|
signing_key:
|
99
99
|
specification_version: 3
|
100
|
-
summary: Simple FFI wrappers for things I need from ICU.
|
100
|
+
summary: Simple Ruby FFI wrappers for things I need from ICU.
|
101
101
|
test_files:
|
102
102
|
- spec/break_iterator_spec.rb
|
103
103
|
- spec/chardet_spec.rb
|
data/README.rdoc
DELETED
@@ -1,85 +0,0 @@
|
|
1
|
-
= ffi-icu
|
2
|
-
|
3
|
-
Simple FFI wrappers for things I need from ICU. For the full thing, check out ICU4R instead.
|
4
|
-
|
5
|
-
= Gem
|
6
|
-
|
7
|
-
* http://rubygems.org/gems/ffi-icu
|
8
|
-
|
9
|
-
gem install ffi-icu
|
10
|
-
|
11
|
-
= Dependencies
|
12
|
-
|
13
|
-
ICU. If you get messages that the library or functions are not found, you can
|
14
|
-
set some environment variables to tell ffi-icu where to find it, i.e.:
|
15
|
-
|
16
|
-
FFI_ICU_LIB="icui18n.so" FFI_ICU_VERSION_SUFFIX="_3_8" ruby -r ffi-icu
|
17
|
-
|
18
|
-
= Features
|
19
|
-
|
20
|
-
== Character Encoding Detection
|
21
|
-
|
22
|
-
=== Examples:
|
23
|
-
|
24
|
-
match = ICU::CharDet.detect(str)
|
25
|
-
match.name # => "UTF-8"
|
26
|
-
match.confidence # => 80
|
27
|
-
|
28
|
-
or
|
29
|
-
|
30
|
-
detector = ICU::CharDet::Detector.new
|
31
|
-
detector.detect(str)
|
32
|
-
detector.close
|
33
|
-
|
34
|
-
=== Why not just use rchardet?
|
35
|
-
|
36
|
-
* this is faster
|
37
|
-
* rchardet does not work well on 1.9
|
38
|
-
* none of the rchardet forks claiming to work on 1.9 actually does
|
39
|
-
|
40
|
-
== Locale Sensitive Collation
|
41
|
-
|
42
|
-
=== Examples:
|
43
|
-
|
44
|
-
ICU::Collation.collate("nb", %w[å æ ø]) == %w[æ ø å] #=> true
|
45
|
-
|
46
|
-
or
|
47
|
-
|
48
|
-
collator = ICU::Collation::Collator.new("nb")
|
49
|
-
collator.compare("a", "b") #=> -1
|
50
|
-
collator.greater?("z", "a") #=> true
|
51
|
-
collator.collate(%w[å æ ø]) #=> ["æ", "ø", "å"]
|
52
|
-
|
53
|
-
= Tested on:
|
54
|
-
|
55
|
-
Platforms:
|
56
|
-
|
57
|
-
* OS X 10.6
|
58
|
-
* Debian Linux
|
59
|
-
* Arch Linux
|
60
|
-
|
61
|
-
Rubies:
|
62
|
-
|
63
|
-
* MRI 1.9.1
|
64
|
-
* MRI 1.8.7
|
65
|
-
|
66
|
-
= TODO:
|
67
|
-
|
68
|
-
* Useful ICU stuff:
|
69
|
-
- number formatting (decimal points, thousand separators, currency)
|
70
|
-
- date formatting
|
71
|
-
* Windows?!
|
72
|
-
|
73
|
-
== Note on Patches/Pull Requests
|
74
|
-
|
75
|
-
* Fork the project.
|
76
|
-
* Make your feature addition or bug fix.
|
77
|
-
* Add tests for it. This is important so I don't break it in a
|
78
|
-
future version unintentionally.
|
79
|
-
* Commit, do not mess with rakefile, version, or history.
|
80
|
-
(if you want to have your own version, that is fine but bump version in a commit by itself I can ignore when I pull)
|
81
|
-
* Send me a pull request. Bonus points for topic branches.
|
82
|
-
|
83
|
-
== Copyright
|
84
|
-
|
85
|
-
Copyright (c) 2010-2011 Jari Bakken. See LICENSE for details.
|