ffi-icu 0.0.6 → 0.0.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/README.md ADDED
@@ -0,0 +1,116 @@
1
+ ffi-icu
2
+ =======
3
+
4
+ Simple FFI wrappers for things I need from ICU. For the full thing, check out [ICU4R](http://icu4r.rubyforge.org/) instead.
5
+
6
+ Gem
7
+ ---
8
+
9
+ [Rubygem](http://rubygems.org/gems/ffi-icu "ffi-icu")
10
+
11
+ gem install ffi-icu
12
+
13
+ Dependencies
14
+ ------------
15
+
16
+ ICU.
17
+
18
+ If you get messages that the library or functions are not found, you can
19
+ set some environment variables to tell ffi-icu where to find it, e.g.:
20
+
21
+ $ export FFI_ICU_LIB="icui18n.so"
22
+ $ export FFI_ICU_VERSION_SUFFIX="_3_8"
23
+ $ ruby -r ffi-icu program.rb
24
+
25
+ Features
26
+ ========
27
+
28
+ Character Encoding Detection
29
+ ----------------------------
30
+
31
+ Examples:
32
+
33
+ ```ruby
34
+
35
+ match = ICU::CharDet.detect(str)
36
+ match.name # => "UTF-8"
37
+ match.confidence # => 80
38
+ ```
39
+
40
+ or
41
+
42
+ ```ruby
43
+ detector = ICU::CharDet::Detector.new
44
+ detector.detect(str) # => #<struct ICU::CharDet::Detector::Match ...>
45
+ ```
46
+
47
+ Why not just use rchardet?
48
+
49
+ * speed
50
+ * 1.9 support
51
+
52
+ Locale Sensitive Collation
53
+ --------------------------
54
+
55
+ Examples:
56
+
57
+ ```ruby
58
+ ICU::Collation.collate("nb", %w[å æ ø]) == %w[æ ø å] #=> true
59
+ ```
60
+
61
+ or
62
+
63
+ ```ruby
64
+ collator = ICU::Collation::Collator.new("nb")
65
+ collator.compare("a", "b") #=> -1
66
+ collator.greater?("z", "a") #=> true
67
+ collator.collate(%w[å æ ø]) #=> ["æ", "ø", "å"]
68
+ ```
69
+
70
+ Text Boundary Analysis
71
+ ----------------------
72
+
73
+ Examples:
74
+
75
+ ```ruby
76
+ iterator = ICU::BreakIterator.new(:word, "en_US")
77
+ iterator.text = "This is a sentence."
78
+ iterator.to_a #=> [0, 4, 5, 7, 8, 9, 10, 18, 19]
79
+ ```
80
+
81
+ Tested on:
82
+ ==========
83
+
84
+ Platforms:
85
+
86
+ * OS X 10.6
87
+ * Arch Linux
88
+
89
+ Rubies:
90
+
91
+ * MRI 1.9.1
92
+ * MRI 1.8.7
93
+
94
+ TODO:
95
+ =====
96
+
97
+ * Useful ICU stuff:
98
+ - number formatting (decimal points, thousand separators, currency)
99
+ - date formatting
100
+ * Windows?!
101
+
102
+ Note on Patches/Pull Requests
103
+ =============================
104
+
105
+ * Fork the project.
106
+ * Make your feature addition or bug fix.
107
+ * Add tests for it. This is important so I don't break it in a
108
+ future version unintentionally.
109
+ * Commit, do not mess with rakefile, version, or history.
110
+ (If you want to have your own version, that is fine, but bump the version in a separate commit that I can ignore when I pull.)
111
+ * Send me a pull request. Bonus points for topic branches.
112
+
113
+ Copyright
114
+ =========
115
+
116
+ Copyright (c) 2010-2011 Jari Bakken. See LICENSE for details.
data/Rakefile CHANGED
@@ -21,7 +21,7 @@ task :default => :spec
21
21
  begin
22
22
  require 'yard'
23
23
  YARD::Rake::YardocTask.new
24
- rescue LoadError
24
+ rescue
25
25
  task :yardoc do
26
26
  abort "YARD is not available. In order to run yardoc, you must: sudo gem install yard"
27
27
  end
data/ffi-icu.gemspec CHANGED
@@ -14,18 +14,15 @@ Gem::Specification.new do |s|
14
14
  s.date = %q{2010-08-23}
15
15
  s.description = %q{Provides charset detection, locale sensitive collation and more. Depends on libicu.}
16
16
  s.email = %q{jari.bakken@gmail.com}
17
- s.extra_rdoc_files = [
18
- "LICENSE",
19
- "README.rdoc"
20
- ]
17
+ s.extra_rdoc_files = ["LICENSE", "README.md"]
21
18
  s.files = `git ls-files`.split("\n")
22
19
  s.test_files = `git ls-files -- {test,spec,features}/*`.split("\n")
23
20
  s.executables = `git ls-files -- bin/*`.split("\n").map{ |f| File.basename(f) }
24
21
  s.require_paths = ["lib"]
25
-
22
+
26
23
  s.homepage = %q{http://github.com/jarib/ffi-icu}
27
24
  s.rdoc_options = ["--charset=UTF-8"]
28
- s.summary = %q{Simple FFI wrappers for things I need from ICU.}
25
+ s.summary = %q{Simple Ruby FFI wrappers for things I need from ICU.}
29
26
 
30
27
  s.add_runtime_dependency(%q<ffi>, ["~> 1.0.9"])
31
28
  s.add_development_dependency(%q<rspec>, ["~> 2.5.0"])
@@ -2,7 +2,9 @@ module ICU
2
2
  class BreakIterator
3
3
  include Enumerable
4
4
 
5
- UBRK_DONE = -1
5
+ attr_reader :text
6
+
7
+ DONE = -1
6
8
 
7
9
  def self.available_locales
8
10
  (0...Lib.ubrk_countAvailable).map do |idx|
@@ -12,11 +14,12 @@ module ICU
12
14
 
13
15
  def initialize(type, locale)
14
16
  ptr = Lib.check_error { |err| Lib.ubrk_open(type, locale, nil, 0, err) }
15
-
16
17
  @iterator = FFI::AutoPointer.new(ptr, Lib.method(:ubrk_close))
17
18
  end
18
19
 
19
20
  def text=(str)
21
+ @text = str
22
+
20
23
  Lib.check_error { |err|
21
24
  Lib.ubrk_setText @iterator, UCharPointer.from_string(str), str.jlength, err
22
25
  }
@@ -27,10 +30,31 @@ module ICU
27
30
 
28
31
  int = first
29
32
 
30
- while int != UBRK_DONE
33
+ while int != DONE
31
34
  yield int
32
35
  int = self.next
33
36
  end
37
+
38
+ self
39
+ end
40
+
41
+ def each_substring(&blk)
42
+ return to_enum(:each_substring) unless block_given?
43
+
44
+ # each_char needed for 1.8, where String#[] works on bytes, not characters
45
+ chars = text.each_char.to_a
46
+ low = first
47
+
48
+ while (high = self.next) != DONE
49
+ yield chars[low...high].join
50
+ low = high
51
+ end
52
+
53
+ self
54
+ end
55
+
56
+ def substrings
57
+ each_substring.to_a
34
58
  end
35
59
 
36
60
  def next
data/lib/ffi-icu/lib.rb CHANGED
@@ -77,7 +77,7 @@ module ICU
77
77
  raise Error, name
78
78
  end
79
79
  elsif error_code < 0
80
- warn "ffi-icu: #{Lib.u_errorName error_code}"
80
+ $stderr.puts "ffi-icu: #{Lib.u_errorName error_code}" if $DEBUG || $VERBOSE
81
81
  end
82
82
 
83
83
  ret
@@ -1,3 +1,3 @@
1
1
  module ICU
2
- VERSION = "0.0.6"
2
+ VERSION = "0.0.7"
3
3
  end
@@ -18,7 +18,21 @@ module ICU
18
18
  iterator.to_a.should == [0, 5, 6, 11, 12, 17, 18, 21, 22, 26, 27, 28, 39, 40, 51, 52, 56, 57, 58, 61, 62, 64, 65, 72, 73, 79, 80, 90, 91, 93, 94, 100, 101, 103, 104, 110, 111, 116, 117, 123, 124]
19
19
  end
20
20
 
21
- it "finds all word boundaries in a Thai string" do
21
+ it "returns each substring" do
22
+ iterator = BreakIterator.new :word, "en_US"
23
+ iterator.text = "Lorem ipsum dolor sit amet."
24
+
25
+ iterator.substrings.should == ["Lorem", " ", "ipsum", " ", "dolor", " ", "sit", " ", "amet", "."]
26
+ end
27
+
28
+ it "returns the substrings of a non-ASCII string" do
29
+ iterator = BreakIterator.new :word, "th_TH"
30
+ iterator.text = "รู้อะไรไม่สู้รู้วิชา รู้รักษาตัวรอดเป็นยอดดี"
31
+
32
+ iterator.substrings.should == ["รู้", "อะไร", "ไม่สู้", "รู้", "วิชา", " ", "รู้", "รักษา", "ตัว", "รอด", "เป็น", "ยอดดี"]
33
+ end
34
+
35
+ it "finds all word boundaries in a non-ASCII string" do
22
36
  iterator = BreakIterator.new :word, "th_TH"
23
37
  iterator.text = "การทดลอง"
24
38
  iterator.to_a.should == [0, 3, 8]
metadata CHANGED
@@ -2,7 +2,7 @@
2
2
  name: ffi-icu
3
3
  version: !ruby/object:Gem::Version
4
4
  prerelease:
5
- version: 0.0.6
5
+ version: 0.0.7
6
6
  platform: ruby
7
7
  authors:
8
8
  - Jari Bakken
@@ -42,13 +42,13 @@ extensions: []
42
42
 
43
43
  extra_rdoc_files:
44
44
  - LICENSE
45
- - README.rdoc
45
+ - README.md
46
46
  files:
47
47
  - .document
48
48
  - .gitignore
49
49
  - Gemfile
50
50
  - LICENSE
51
- - README.rdoc
51
+ - README.md
52
52
  - Rakefile
53
53
  - benchmark/detect.rb
54
54
  - benchmark/shared.rb
@@ -97,7 +97,7 @@ rubyforge_project:
97
97
  rubygems_version: 1.8.2
98
98
  signing_key:
99
99
  specification_version: 3
100
- summary: Simple FFI wrappers for things I need from ICU.
100
+ summary: Simple Ruby FFI wrappers for things I need from ICU.
101
101
  test_files:
102
102
  - spec/break_iterator_spec.rb
103
103
  - spec/chardet_spec.rb
data/README.rdoc DELETED
@@ -1,85 +0,0 @@
1
- = ffi-icu
2
-
3
- Simple FFI wrappers for things I need from ICU. For the full thing, check out ICU4R instead.
4
-
5
- = Gem
6
-
7
- * http://rubygems.org/gems/ffi-icu
8
-
9
- gem install ffi-icu
10
-
11
- = Dependencies
12
-
13
- ICU. If you get messages that the library or functions are not found, you can
14
- set some environment varibles to tell ffi-icu where to find it, i.e.:
15
-
16
- FFI_ICU_LIB="icui18n.so" FFI_ICU_VERSION_SUFFIX="_3_8" ruby -r ffi-icu
17
-
18
- = Features
19
-
20
- == Character Encoding Detection
21
-
22
- === Examples:
23
-
24
- match = ICU::CharDet.detect(str)
25
- match.name # => "UTF-8"
26
- match.confidence # => 80
27
-
28
- or
29
-
30
- detector = ICU::CharDet::Detector.new
31
- detector.detect(str)
32
- detector.close
33
-
34
- === Why not just use rchardet?
35
-
36
- * this is faster
37
- * rchardet does not work well on 1.9
38
- * none of the rchardet forks claiming to work on 1.9 actually does
39
-
40
- == Locale Sensitive Collation
41
-
42
- === Examples:
43
-
44
- ICU::Collation.collate("nb", %w[å æ ø]) == %w[æ ø å] #=> true
45
-
46
- or
47
-
48
- collator = ICU::Collation::Collator.new("nb")
49
- collator.compare("a", "b") #=> -1
50
- collator.greater?("z", "a") #=> true
51
- collator.collate(%w[å æ ø]) #=> ["æ", "ø", "å"]
52
-
53
- = Tested on:
54
-
55
- Platforms:
56
-
57
- * OS X 10.6
58
- * Debian Linux
59
- * Arch Linux
60
-
61
- Rubies:
62
-
63
- * MRI 1.9.1
64
- * MRI 1.8.7
65
-
66
- = TODO:
67
-
68
- * Useful ICU stuff:
69
- - number formatting (decimal points, thousand separators, currency)
70
- - date formatting
71
- * Windows?!
72
-
73
- == Note on Patches/Pull Requests
74
-
75
- * Fork the project.
76
- * Make your feature addition or bug fix.
77
- * Add tests for it. This is important so I don't break it in a
78
- future version unintentionally.
79
- * Commit, do not mess with rakefile, version, or history.
80
- (if you want to have your own version, that is fine but bump version in a commit by itself I can ignore when I pull)
81
- * Send me a pull request. Bonus points for topic branches.
82
-
83
- == Copyright
84
-
85
- Copyright (c) 2010-2011 Jari Bakken. See LICENSE for details.