ruby-stemmer 0.8.5 → 0.9.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,9 +1,10 @@
1
- = Ruby-Stemmer
1
+ = Ruby-Stemmer {Project Status}[http://stillmaintained.com/aurelian/ruby-stemmer.png]
2
2
 
3
3
  Ruby-Stemmer exposes SnowBall API to Ruby.
4
4
 
5
5
  This package includes libstemmer_c library released under BSD licence
6
- and available for free at: http://snowball.tartarus.org/dist/libstemmer_c.tgz.
6
+ and available for free {here}[http://snowball.tartarus.org/dist/libstemmer_c.tgz].
7
+
7
8
  Support for the Latin language is also included; it was generated with the Snowball compiler using
8
9
  {schinke contribution}[http://snowball.tartarus.org/otherapps/schinke/intro.html]
9
10
 
@@ -88,7 +89,7 @@ For further reference on stem vs. root, please check wikipedia articles on the t
88
89
  * {fast-stemmer}[http://github.com/romanbsd/fast-stemmer] (ext)
89
90
  * {uea-stemmer}[http://github.com/ealdent/uea-stemmer] (ext)
90
91
  * {stemmer}[http://rubyforge.org/projects/stemmer] (pure ruby)
91
- * [add yours]
92
+ * add yours
92
93
 
93
94
  == Copyright
94
95
 
@@ -96,11 +97,12 @@ Copyright (c) 2008-2011 {Aurelian Oancea}[http://locknet.ro]. See MIT-LICENSE fo
96
97
 
97
98
  == Contributors
98
99
 
99
- * Aurelian Oancea
100
- * Yury Korolev - various bug fixes
101
- * Aaron Patterson - rake compiler (windows support), code cleanup
100
+ * {Aurelian Oancea}[https://github.com/aurelian]
101
+ * {Yury Korolev}[https://github.com/yury] - various bug fixes
102
+ * {Aaron Patterson}[https://github.com/tenderlove] - rake compiler (windows support), code cleanup
103
+ * {Damián Silvani}[https://github.com/munshkr] - Ruby 1.9 encoding
102
104
 
103
105
  == Real life usage
104
-
105
106
  * http://planet33.ru is using Ruby-Stemmer together with {Classifier}[http://github.com/yury/classifier] to automatically rate places based on users' comments.
106
- * {textamatch_rb}[http://github.com/dimus/taxamatch_rb] is using the Ruby-Stemmer to catch errors in suffixes while it discovers if two scientific names are actually the same.
107
+
108
+ # encoding: utf-8
data/VERSION CHANGED
@@ -1 +1 @@
1
- 0.8.5
1
+ 0.9.0
@@ -1,6 +1,27 @@
1
1
  #include "ruby.h"
2
2
  #include <libstemmer.h>
3
3
 
4
+
5
+ #ifdef HAVE_RUBY_ENCODING_H
6
+
7
+ #include <ruby/encoding.h>
8
+
9
+ #define ENCODED_STR_NEW2(str, encoding) \
10
+ ({ \
11
+ VALUE _string = rb_str_new2((const char *)str); \
12
+ int _enc = rb_enc_get_index(encoding); \
13
+ rb_enc_associate_index(_string, _enc); \
14
+ _string; \
15
+ })
16
+
17
+ #else
18
+
19
+ #define ENCODED_STR_NEW2(str, encoding) \
20
+ rb_str_new2((const char *)str)
21
+
22
+ #endif
23
+
24
+
4
25
  VALUE rb_mLingua;
5
26
  VALUE rb_cStemmer;
6
27
  VALUE rb_eStemmerError;
@@ -63,7 +84,9 @@ rb_stemmer_stem(VALUE self, VALUE word) {
63
84
  (sb_symbol *)RSTRING_PTR(s_word),
64
85
  RSTRING_LEN(s_word)
65
86
  );
66
- return rb_str_new2((char *)stemmed);
87
+
88
+ VALUE rb_enc = rb_iv_get(self, "@encoding");
89
+ return ENCODED_STR_NEW2((char *)stemmed, rb_enc);
67
90
  }
68
91
 
69
92
  static void
@@ -85,7 +108,7 @@ void Init_stemmer_native() {
85
108
  rb_mLingua = rb_define_module("Lingua");
86
109
  rb_cStemmer = rb_define_class_under(rb_mLingua, "Stemmer", rb_cObject);
87
110
  rb_define_alloc_func(rb_cStemmer, sb_stemmer_alloc);
88
- rb_eStemmerError = rb_define_class_under(rb_mLingua, "StemmerError", rb_eException);
111
+ rb_eStemmerError = rb_define_class_under(rb_mLingua, "StemmerError", rb_eException);
89
112
  rb_define_private_method(rb_cStemmer, "native_init", rb_stemmer_init, 2);
90
113
  rb_define_method(rb_cStemmer, "stem", rb_stemmer_stem, 1);
91
114
  }
@@ -37,10 +37,24 @@ module Lingua
37
37
  # require 'lingua/stemmer'
38
38
  # s = Lingua::Stemmer.new :language => 'fr'
39
39
  #
40
- def initialize options = {}
40
+ def initialize(options={})
41
41
  @language = (options[:language] || 'en').to_s
42
42
  @encoding = (options[:encoding] || 'UTF_8').to_s
43
- native_init @language, @encoding
43
+
44
+ if RUBY_VERSION >= "1.9"
45
+ if not @encoding.is_a?(Encoding)
46
+ @encoding = Encoding.find(@encoding.gsub("_", "-"))
47
+ end
48
+ else
49
+ @encoding = @encoding.upcase.gsub("-", "_")
50
+ end
51
+
52
+ native_init(@language, native_encoding(@encoding))
53
+ end
54
+
55
+ private
56
+ def native_encoding(enc)
57
+ RUBY_VERSION >= "1.9" ? enc.name.gsub('-', '_') : enc
44
58
  end
45
59
  end
46
60
  end
@@ -1,3 +1,4 @@
1
+ # encoding: utf-8
1
2
  require 'helper'
2
3
 
3
4
  class TestStemmer < Test::Unit::TestCase
@@ -35,7 +36,12 @@ class TestStemmer < Test::Unit::TestCase
35
36
  assert_equal word, "install"
36
37
  end
37
38
  assert_kind_of ::Lingua::Stemmer, stemmer
38
- assert_equal stemmer.encoding, "UTF_8"
39
+
40
+ if RUBY_VERSION >= '1.9'
41
+ assert_equal stemmer.encoding, Encoding::UTF_8
42
+ else
43
+ assert_equal stemmer.encoding, "UTF_8"
44
+ end
39
45
  end
40
46
 
41
47
  def test_array_stemmer
@@ -51,4 +57,43 @@ class TestStemmer < Test::Unit::TestCase
51
57
  }.new.stem('cow')
52
58
  end
53
59
  end
60
+
61
+ def test_default_encoding_option
62
+ if RUBY_VERSION >= '1.9'
63
+ assert_equal ::Lingua::Stemmer.new.encoding, Encoding::UTF_8
64
+ else
65
+ assert_equal ::Lingua::Stemmer.new.encoding, "UTF_8"
66
+ end
67
+ end
68
+
69
+ def test_different_encoding_options
70
+ if RUBY_VERSION >= '1.9'
71
+ assert_equal ::Lingua::Stemmer.new(:encoding => "ISO_8859_1").encoding, Encoding::ISO_8859_1
72
+ assert_equal ::Lingua::Stemmer.new(:encoding => "UTF-8").encoding, Encoding::UTF_8
73
+ assert_equal ::Lingua::Stemmer.new(:encoding => "utf-8").encoding, Encoding::UTF_8
74
+ assert_equal ::Lingua::Stemmer.new(:encoding => :ISO_8859_1).encoding, Encoding::ISO_8859_1
75
+ assert_equal ::Lingua::Stemmer.new(:encoding => Encoding::UTF_8).encoding, Encoding::UTF_8
76
+ else
77
+ assert_equal ::Lingua::Stemmer.new(:encoding => "ISO_8859_1").encoding, "ISO_8859_1"
78
+ assert_equal ::Lingua::Stemmer.new(:encoding => "UTF-8").encoding, "UTF_8"
79
+ assert_equal ::Lingua::Stemmer.new(:encoding => "utf-8").encoding, "UTF_8"
80
+ assert_equal ::Lingua::Stemmer.new(:encoding => :ISO_8859_1).encoding, "ISO_8859_1"
81
+ end
82
+ end
83
+
84
+ if RUBY_VERSION >= '1.9'
85
+ def test_string_encoding
86
+ word = "așezare"
87
+
88
+ stem = ::Lingua.stemmer(word, :language => "ro", :encoding => "UTF_8")
89
+ assert_equal word.encoding, stem.encoding
90
+
91
+ s = ::Lingua::Stemmer.new(:language => "ro", :encoding => "UTF_8")
92
+ assert_equal s.stem(word).encoding, word.encoding
93
+
94
+ stem = ::Lingua.stemmer("installation", :language => "fr", :encoding => "ISO-8859-1")
95
+ assert_equal stem.encoding, Encoding::ISO_8859_1
96
+ end
97
+ end
98
+
54
99
  end
metadata CHANGED
@@ -1,12 +1,8 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: ruby-stemmer
3
3
  version: !ruby/object:Gem::Version
4
- prerelease: false
5
- segments:
6
- - 0
7
- - 8
8
- - 5
9
- version: 0.8.5
4
+ prerelease:
5
+ version: 0.9.0
10
6
  platform: ruby
11
7
  authors:
12
8
  - Aurelian Oancea
@@ -15,7 +11,7 @@ autorequire:
15
11
  bindir: bin
16
12
  cert_chain: []
17
13
 
18
- date: 2011-03-08 00:00:00 +01:00
14
+ date: 2011-03-12 00:00:00 +01:00
19
15
  default_executable:
20
16
  dependencies: []
21
17
 
@@ -135,21 +131,17 @@ required_ruby_version: !ruby/object:Gem::Requirement
135
131
  requirements:
136
132
  - - ">="
137
133
  - !ruby/object:Gem::Version
138
- segments:
139
- - 0
140
134
  version: "0"
141
135
  required_rubygems_version: !ruby/object:Gem::Requirement
142
136
  none: false
143
137
  requirements:
144
138
  - - ">="
145
139
  - !ruby/object:Gem::Version
146
- segments:
147
- - 0
148
140
  version: "0"
149
141
  requirements: []
150
142
 
151
143
  rubyforge_project: ruby-stemmer
152
- rubygems_version: 1.3.7
144
+ rubygems_version: 1.6.2
153
145
  signing_key:
154
146
  specification_version: 3
155
147
  summary: Expose libstemmer_c to Ruby.