ffi-icu 0.0.6 → 0.0.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.md +116 -0
- data/Rakefile +1 -1
- data/ffi-icu.gemspec +3 -6
- data/lib/ffi-icu/break_iterator.rb +27 -3
- data/lib/ffi-icu/lib.rb +1 -1
- data/lib/ffi-icu/version.rb +1 -1
- data/spec/break_iterator_spec.rb +15 -1
- metadata +4 -4
- data/README.rdoc +0 -85
data/README.md
ADDED
@@ -0,0 +1,116 @@
|
|
1
|
+
ffi-icu
|
2
|
+
=======
|
3
|
+
|
4
|
+
Simple FFI wrappers for things I need from ICU. For the full thing, check out [ICU4R](http://icu4r.rubyforge.org/) instead.
|
5
|
+
|
6
|
+
Gem
|
7
|
+
---
|
8
|
+
|
9
|
+
[Rubygem](http://rubygems.org/gems/ffi-icu "ffi-icu")
|
10
|
+
|
11
|
+
gem install ffi-icu
|
12
|
+
|
13
|
+
Dependencies
|
14
|
+
------------
|
15
|
+
|
16
|
+
ICU.
|
17
|
+
|
18
|
+
If you get messages that the library or functions are not found, you can
|
19
|
+
set some environment variables to tell ffi-icu where to find it, e.g.:
|
20
|
+
|
21
|
+
$ export FFI_ICU_LIB="icui18n.so"
|
22
|
+
$ export FFI_ICU_VERSION_SUFFIX="_3_8"
|
23
|
+
$ ruby -r ffi-icu program.rb
|
24
|
+
|
25
|
+
Features
|
26
|
+
========
|
27
|
+
|
28
|
+
Character Encoding Detection
|
29
|
+
----------------------------
|
30
|
+
|
31
|
+
Examples:
|
32
|
+
|
33
|
+
```ruby
|
34
|
+
|
35
|
+
match = ICU::CharDet.detect(str)
|
36
|
+
match.name # => "UTF-8"
|
37
|
+
match.confidence # => 80
|
38
|
+
```
|
39
|
+
|
40
|
+
or
|
41
|
+
|
42
|
+
```ruby
|
43
|
+
detector = ICU::CharDet::Detector.new
|
44
|
+
detector.detect(str) # => #<struct ICU::CharDet::Detector::Match ...>
|
45
|
+
```
|
46
|
+
|
47
|
+
Why not just use rchardet?
|
48
|
+
|
49
|
+
* speed
|
50
|
+
* 1.9 support
|
51
|
+
|
52
|
+
Locale Sensitive Collation
|
53
|
+
--------------------------
|
54
|
+
|
55
|
+
Examples:
|
56
|
+
|
57
|
+
```ruby
|
58
|
+
ICU::Collation.collate("nb", %w[å æ ø]) == %w[æ ø å] #=> true
|
59
|
+
```
|
60
|
+
|
61
|
+
or
|
62
|
+
|
63
|
+
```ruby
|
64
|
+
collator = ICU::Collation::Collator.new("nb")
|
65
|
+
collator.compare("a", "b") #=> -1
|
66
|
+
collator.greater?("z", "a") #=> true
|
67
|
+
collator.collate(%w[å æ ø]) #=> ["æ", "ø", "å"]
|
68
|
+
```
|
69
|
+
|
70
|
+
Text Boundary Analysis
|
71
|
+
----------------------
|
72
|
+
|
73
|
+
Examples:
|
74
|
+
|
75
|
+
```ruby
|
76
|
+
iterator = ICU::BreakIterator.new(:word, "en_US")
|
77
|
+
iterator.text = "This is a sentence."
|
78
|
+
iterator.to_a #=> [0, 4, 5, 7, 8, 9, 10, 18, 19]
|
79
|
+
```
|
80
|
+
|
81
|
+
Tested on:
|
82
|
+
==========
|
83
|
+
|
84
|
+
Platforms:
|
85
|
+
|
86
|
+
* OS X 10.6
|
87
|
+
* Arch Linux
|
88
|
+
|
89
|
+
Rubies:
|
90
|
+
|
91
|
+
* MRI 1.9.1
|
92
|
+
* MRI 1.8.7
|
93
|
+
|
94
|
+
TODO:
|
95
|
+
=====
|
96
|
+
|
97
|
+
* Useful ICU stuff:
|
98
|
+
- number formatting (decimal points, thousand separators, currency)
|
99
|
+
- date formatting
|
100
|
+
* Windows?!
|
101
|
+
|
102
|
+
Note on Patches/Pull Requests
|
103
|
+
=============================
|
104
|
+
|
105
|
+
* Fork the project.
|
106
|
+
* Make your feature addition or bug fix.
|
107
|
+
* Add tests for it. This is important so I don't break it in a
|
108
|
+
future version unintentionally.
|
109
|
+
* Commit, do not mess with rakefile, version, or history.
|
110
|
+
(if you want to have your own version, that is fine but bump version in a commit by itself I can ignore when I pull)
|
111
|
+
* Send me a pull request. Bonus points for topic branches.
|
112
|
+
|
113
|
+
Copyright
|
114
|
+
=========
|
115
|
+
|
116
|
+
Copyright (c) 2010-2011 Jari Bakken. See LICENSE for details.
|
data/Rakefile
CHANGED
data/ffi-icu.gemspec
CHANGED
@@ -14,18 +14,15 @@ Gem::Specification.new do |s|
|
|
14
14
|
s.date = %q{2010-08-23}
|
15
15
|
s.description = %q{Provides charset detection, locale sensitive collation and more. Depends on libicu.}
|
16
16
|
s.email = %q{jari.bakken@gmail.com}
|
17
|
-
s.extra_rdoc_files = [
|
18
|
-
"LICENSE",
|
19
|
-
"README.rdoc"
|
20
|
-
]
|
17
|
+
s.extra_rdoc_files = ["LICENSE", "README.md"]
|
21
18
|
s.files = `git ls-files`.split("\n")
|
22
19
|
s.test_files = `git ls-files -- {test,spec,features}/*`.split("\n")
|
23
20
|
s.executables = `git ls-files -- bin/*`.split("\n").map{ |f| File.basename(f) }
|
24
21
|
s.require_paths = ["lib"]
|
25
|
-
|
22
|
+
|
26
23
|
s.homepage = %q{http://github.com/jarib/ffi-icu}
|
27
24
|
s.rdoc_options = ["--charset=UTF-8"]
|
28
|
-
s.summary = %q{Simple FFI wrappers for things I need from ICU.}
|
25
|
+
s.summary = %q{Simple Ruby FFI wrappers for things I need from ICU.}
|
29
26
|
|
30
27
|
s.add_runtime_dependency(%q<ffi>, ["~> 1.0.9"])
|
31
28
|
s.add_development_dependency(%q<rspec>, ["~> 2.5.0"])
|
@@ -2,7 +2,9 @@ module ICU
|
|
2
2
|
class BreakIterator
|
3
3
|
include Enumerable
|
4
4
|
|
5
|
-
|
5
|
+
attr_reader :text
|
6
|
+
|
7
|
+
DONE = -1
|
6
8
|
|
7
9
|
def self.available_locales
|
8
10
|
(0...Lib.ubrk_countAvailable).map do |idx|
|
@@ -12,11 +14,12 @@ module ICU
|
|
12
14
|
|
13
15
|
def initialize(type, locale)
|
14
16
|
ptr = Lib.check_error { |err| Lib.ubrk_open(type, locale, nil, 0, err) }
|
15
|
-
|
16
17
|
@iterator = FFI::AutoPointer.new(ptr, Lib.method(:ubrk_close))
|
17
18
|
end
|
18
19
|
|
19
20
|
def text=(str)
|
21
|
+
@text = str
|
22
|
+
|
20
23
|
Lib.check_error { |err|
|
21
24
|
Lib.ubrk_setText @iterator, UCharPointer.from_string(str), str.jlength, err
|
22
25
|
}
|
@@ -27,10 +30,31 @@ module ICU
|
|
27
30
|
|
28
31
|
int = first
|
29
32
|
|
30
|
-
while int != -1
|
33
|
+
while int != DONE
|
31
34
|
yield int
|
32
35
|
int = self.next
|
33
36
|
end
|
37
|
+
|
38
|
+
self
|
39
|
+
end
|
40
|
+
|
41
|
+
def each_substring(&blk)
|
42
|
+
return to_enum(:each_substring) unless block_given?
|
43
|
+
|
44
|
+
# each_char needed for 1.8, where String#[] works on bytes, not characters
|
45
|
+
chars = text.each_char.to_a
|
46
|
+
low = first
|
47
|
+
|
48
|
+
while (high = self.next) != DONE
|
49
|
+
yield chars[low...high].join
|
50
|
+
low = high
|
51
|
+
end
|
52
|
+
|
53
|
+
self
|
54
|
+
end
|
55
|
+
|
56
|
+
def substrings
|
57
|
+
each_substring.to_a
|
34
58
|
end
|
35
59
|
|
36
60
|
def next
|
data/lib/ffi-icu/lib.rb
CHANGED
data/lib/ffi-icu/version.rb
CHANGED
data/spec/break_iterator_spec.rb
CHANGED
@@ -18,7 +18,21 @@ module ICU
|
|
18
18
|
iterator.to_a.should == [0, 5, 6, 11, 12, 17, 18, 21, 22, 26, 27, 28, 39, 40, 51, 52, 56, 57, 58, 61, 62, 64, 65, 72, 73, 79, 80, 90, 91, 93, 94, 100, 101, 103, 104, 110, 111, 116, 117, 123, 124]
|
19
19
|
end
|
20
20
|
|
21
|
-
it "finds all word boundaries in a non-ASCII string" do
|
21
|
+
it "returns each substring" do
|
22
|
+
iterator = BreakIterator.new :word, "en_US"
|
23
|
+
iterator.text = "Lorem ipsum dolor sit amet."
|
24
|
+
|
25
|
+
iterator.substrings.should == ["Lorem", " ", "ipsum", " ", "dolor", " ", "sit", " ", "amet", "."]
|
26
|
+
end
|
27
|
+
|
28
|
+
it "returns the substrings of a non-ASCII string" do
|
29
|
+
iterator = BreakIterator.new :word, "th_TH"
|
30
|
+
iterator.text = "รู้อะไรไม่สู้รู้วิชา รู้รักษาตัวรอดเป็นยอดดี"
|
31
|
+
|
32
|
+
iterator.substrings.should == ["รู้", "อะไร", "ไม่สู้", "รู้", "วิชา", " ", "รู้", "รักษา", "ตัว", "รอด", "เป็น", "ยอดดี"]
|
33
|
+
end
|
34
|
+
|
35
|
+
it "finds all word boundaries in a non-ASCII string" do
|
22
36
|
iterator = BreakIterator.new :word, "th_TH"
|
23
37
|
iterator.text = "การทดลอง"
|
24
38
|
iterator.to_a.should == [0, 3, 8]
|
metadata
CHANGED
@@ -2,7 +2,7 @@
|
|
2
2
|
name: ffi-icu
|
3
3
|
version: !ruby/object:Gem::Version
|
4
4
|
prerelease:
|
5
|
-
version: 0.0.6
|
5
|
+
version: 0.0.7
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
8
8
|
- Jari Bakken
|
@@ -42,13 +42,13 @@ extensions: []
|
|
42
42
|
|
43
43
|
extra_rdoc_files:
|
44
44
|
- LICENSE
|
45
|
-
- README.rdoc
|
45
|
+
- README.md
|
46
46
|
files:
|
47
47
|
- .document
|
48
48
|
- .gitignore
|
49
49
|
- Gemfile
|
50
50
|
- LICENSE
|
51
|
-
- README.rdoc
|
51
|
+
- README.md
|
52
52
|
- Rakefile
|
53
53
|
- benchmark/detect.rb
|
54
54
|
- benchmark/shared.rb
|
@@ -97,7 +97,7 @@ rubyforge_project:
|
|
97
97
|
rubygems_version: 1.8.2
|
98
98
|
signing_key:
|
99
99
|
specification_version: 3
|
100
|
-
summary: Simple FFI wrappers for things I need from ICU.
|
100
|
+
summary: Simple Ruby FFI wrappers for things I need from ICU.
|
101
101
|
test_files:
|
102
102
|
- spec/break_iterator_spec.rb
|
103
103
|
- spec/chardet_spec.rb
|
data/README.rdoc
DELETED
@@ -1,85 +0,0 @@
|
|
1
|
-
= ffi-icu
|
2
|
-
|
3
|
-
Simple FFI wrappers for things I need from ICU. For the full thing, check out ICU4R instead.
|
4
|
-
|
5
|
-
= Gem
|
6
|
-
|
7
|
-
* http://rubygems.org/gems/ffi-icu
|
8
|
-
|
9
|
-
gem install ffi-icu
|
10
|
-
|
11
|
-
= Dependencies
|
12
|
-
|
13
|
-
ICU. If you get messages that the library or functions are not found, you can
|
14
|
-
set some environment variables to tell ffi-icu where to find it, i.e.:
|
15
|
-
|
16
|
-
FFI_ICU_LIB="icui18n.so" FFI_ICU_VERSION_SUFFIX="_3_8" ruby -r ffi-icu
|
17
|
-
|
18
|
-
= Features
|
19
|
-
|
20
|
-
== Character Encoding Detection
|
21
|
-
|
22
|
-
=== Examples:
|
23
|
-
|
24
|
-
match = ICU::CharDet.detect(str)
|
25
|
-
match.name # => "UTF-8"
|
26
|
-
match.confidence # => 80
|
27
|
-
|
28
|
-
or
|
29
|
-
|
30
|
-
detector = ICU::CharDet::Detector.new
|
31
|
-
detector.detect(str)
|
32
|
-
detector.close
|
33
|
-
|
34
|
-
=== Why not just use rchardet?
|
35
|
-
|
36
|
-
* this is faster
|
37
|
-
* rchardet does not work well on 1.9
|
38
|
-
* none of the rchardet forks claiming to work on 1.9 actually does
|
39
|
-
|
40
|
-
== Locale Sensitive Collation
|
41
|
-
|
42
|
-
=== Examples:
|
43
|
-
|
44
|
-
ICU::Collation.collate("nb", %w[å æ ø]) == %w[æ ø å] #=> true
|
45
|
-
|
46
|
-
or
|
47
|
-
|
48
|
-
collator = ICU::Collation::Collator.new("nb")
|
49
|
-
collator.compare("a", "b") #=> -1
|
50
|
-
collator.greater?("z", "a") #=> true
|
51
|
-
collator.collate(%w[å æ ø]) #=> ["æ", "ø", "å"]
|
52
|
-
|
53
|
-
= Tested on:
|
54
|
-
|
55
|
-
Platforms:
|
56
|
-
|
57
|
-
* OS X 10.6
|
58
|
-
* Debian Linux
|
59
|
-
* Arch Linux
|
60
|
-
|
61
|
-
Rubies:
|
62
|
-
|
63
|
-
* MRI 1.9.1
|
64
|
-
* MRI 1.8.7
|
65
|
-
|
66
|
-
= TODO:
|
67
|
-
|
68
|
-
* Useful ICU stuff:
|
69
|
-
- number formatting (decimal points, thousand separators, currency)
|
70
|
-
- date formatting
|
71
|
-
* Windows?!
|
72
|
-
|
73
|
-
== Note on Patches/Pull Requests
|
74
|
-
|
75
|
-
* Fork the project.
|
76
|
-
* Make your feature addition or bug fix.
|
77
|
-
* Add tests for it. This is important so I don't break it in a
|
78
|
-
future version unintentionally.
|
79
|
-
* Commit, do not mess with rakefile, version, or history.
|
80
|
-
(if you want to have your own version, that is fine but bump version in a commit by itself I can ignore when I pull)
|
81
|
-
* Send me a pull request. Bonus points for topic branches.
|
82
|
-
|
83
|
-
== Copyright
|
84
|
-
|
85
|
-
Copyright (c) 2010-2011 Jari Bakken. See LICENSE for details.
|