ruby-stemmer 0.8.5 → 0.9.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,9 +1,10 @@
1
- = Ruby-Stemmer
1
+ = Ruby-Stemmer {Project Status}[http://stillmaintained.com/aurelian/ruby-stemmer.png]
2
2
 
3
3
  Ruby-Stemmer exposes the Snowball API to Ruby.
4
4
 
5
5
  This package includes the libstemmer_c library, released under the BSD licence
6
- and available for free at: http://snowball.tartarus.org/dist/libstemmer_c.tgz.
6
+ and available for free {here}[http://snowball.tartarus.org/dist/libstemmer_c.tgz].
7
+
7
8
  Support for the Latin language is also included; it was generated with the Snowball compiler using
8
9
  {schinke contribution}[http://snowball.tartarus.org/otherapps/schinke/intro.html]
9
10
 
@@ -88,7 +89,7 @@ For further reference on stem vs. root, please check wikipedia articles on the t
88
89
  * {fast-stemmer}[http://github.com/romanbsd/fast-stemmer] (ext)
89
90
  * {uea-stemmer}[http://github.com/ealdent/uea-stemmer] (ext)
90
91
  * {stemmer}[http://rubyforge.org/projects/stemmer] (pure ruby)
91
- * [add yours]
92
+ * add yours
92
93
 
93
94
  == Copyright
94
95
 
@@ -96,11 +97,12 @@ Copyright (c) 2008-2011 {Aurelian Oancea}[http://locknet.ro]. See MIT-LICENSE fo
96
97
 
97
98
  == Contributors
98
99
 
99
- * Aurelian Oancea
100
- * Yury Korolev - various bug fixes
101
- * Aaron Patterson - rake compiler (windows support), code cleanup
100
+ * {Aurelian Oancea}[https://github.com/aurelian]
101
+ * {Yury Korolev}[https://github.com/yury] - various bug fixes
102
+ * {Aaron Patterson}[https://github.com/tenderlove] - rake compiler (windows support), code cleanup
103
+ * {Damián Silvani}[https://github.com/munshkr] - Ruby 1.9 encoding
102
104
 
103
105
  == Real life usage
104
-
105
106
  * http://planet33.ru is using Ruby-Stemmer together with {Classifier}[http://github.com/yury/classifier] to automatically rate places based on users comments.
106
- * {taxamatch_rb}[http://github.com/dimus/taxamatch_rb] is using Ruby-Stemmer to catch errors in suffixes while it discovers whether two scientific names are actually the same.
107
+
108
+ # encoding: utf-8
data/VERSION CHANGED
@@ -1 +1 @@
1
- 0.8.5
1
+ 0.9.0
@@ -1,6 +1,27 @@
1
1
  #include "ruby.h"
2
2
  #include <libstemmer.h>
3
3
 
4
+
5
+ #ifdef HAVE_RUBY_ENCODING_H
6
+
7
+ #include <ruby/encoding.h>
8
+
9
+ #define ENCODED_STR_NEW2(str, encoding) \
10
+ ({ \
11
+ VALUE _string = rb_str_new2((const char *)str); \
12
+ int _enc = rb_enc_get_index(encoding); \
13
+ rb_enc_associate_index(_string, _enc); \
14
+ _string; \
15
+ })
16
+
17
+ #else
18
+
19
+ #define ENCODED_STR_NEW2(str, encoding) \
20
+ rb_str_new2((const char *)str)
21
+
22
+ #endif
23
+
24
+
4
25
  VALUE rb_mLingua;
5
26
  VALUE rb_cStemmer;
6
27
  VALUE rb_eStemmerError;
@@ -63,7 +84,9 @@ rb_stemmer_stem(VALUE self, VALUE word) {
63
84
  (sb_symbol *)RSTRING_PTR(s_word),
64
85
  RSTRING_LEN(s_word)
65
86
  );
66
- return rb_str_new2((char *)stemmed);
87
+
88
+ VALUE rb_enc = rb_iv_get(self, "@encoding");
89
+ return ENCODED_STR_NEW2((char *)stemmed, rb_enc);
67
90
  }
68
91
 
69
92
  static void
@@ -85,7 +108,7 @@ void Init_stemmer_native() {
85
108
  rb_mLingua = rb_define_module("Lingua");
86
109
  rb_cStemmer = rb_define_class_under(rb_mLingua, "Stemmer", rb_cObject);
87
110
  rb_define_alloc_func(rb_cStemmer, sb_stemmer_alloc);
88
- rb_eStemmerError = rb_define_class_under(rb_mLingua, "StemmerError", rb_eException);
111
+ rb_eStemmerError = rb_define_class_under(rb_mLingua, "StemmerError", rb_eException);
89
112
  rb_define_private_method(rb_cStemmer, "native_init", rb_stemmer_init, 2);
90
113
  rb_define_method(rb_cStemmer, "stem", rb_stemmer_stem, 1);
91
114
  }
@@ -37,10 +37,24 @@ module Lingua
37
37
  # require 'lingua/stemmer'
38
38
  # s = Lingua::Stemmer.new :language => 'fr'
39
39
  #
40
- def initialize options = {}
40
+ def initialize(options={})
41
41
  @language = (options[:language] || 'en').to_s
42
42
  @encoding = (options[:encoding] || 'UTF_8').to_s
43
- native_init @language, @encoding
43
+
44
+ if RUBY_VERSION >= "1.9"
45
+ if not @encoding.is_a?(Encoding)
46
+ @encoding = Encoding.find(@encoding.gsub("_", "-"))
47
+ end
48
+ else
49
+ @encoding = @encoding.upcase.gsub("-", "_")
50
+ end
51
+
52
+ native_init(@language, native_encoding(@encoding))
53
+ end
54
+
55
+ private
56
+ def native_encoding(enc)
57
+ RUBY_VERSION >= "1.9" ? enc.name.gsub('-', '_') : enc
44
58
  end
45
59
  end
46
60
  end
@@ -1,3 +1,4 @@
1
+ # encoding: utf-8
1
2
  require 'helper'
2
3
 
3
4
  class TestStemmer < Test::Unit::TestCase
@@ -35,7 +36,12 @@ class TestStemmer < Test::Unit::TestCase
35
36
  assert_equal word, "install"
36
37
  end
37
38
  assert_kind_of ::Lingua::Stemmer, stemmer
38
- assert_equal stemmer.encoding, "UTF_8"
39
+
40
+ if RUBY_VERSION >= '1.9'
41
+ assert_equal stemmer.encoding, Encoding::UTF_8
42
+ else
43
+ assert_equal stemmer.encoding, "UTF_8"
44
+ end
39
45
  end
40
46
 
41
47
  def test_array_stemmer
@@ -51,4 +57,43 @@ class TestStemmer < Test::Unit::TestCase
51
57
  }.new.stem('cow')
52
58
  end
53
59
  end
60
+
61
+ def test_default_encoding_option
62
+ if RUBY_VERSION >= '1.9'
63
+ assert_equal ::Lingua::Stemmer.new.encoding, Encoding::UTF_8
64
+ else
65
+ assert_equal ::Lingua::Stemmer.new.encoding, "UTF_8"
66
+ end
67
+ end
68
+
69
+ def test_different_encoding_options
70
+ if RUBY_VERSION >= '1.9'
71
+ assert_equal ::Lingua::Stemmer.new(:encoding => "ISO_8859_1").encoding, Encoding::ISO_8859_1
72
+ assert_equal ::Lingua::Stemmer.new(:encoding => "UTF-8").encoding, Encoding::UTF_8
73
+ assert_equal ::Lingua::Stemmer.new(:encoding => "utf-8").encoding, Encoding::UTF_8
74
+ assert_equal ::Lingua::Stemmer.new(:encoding => :ISO_8859_1).encoding, Encoding::ISO_8859_1
75
+ assert_equal ::Lingua::Stemmer.new(:encoding => Encoding::UTF_8).encoding, Encoding::UTF_8
76
+ else
77
+ assert_equal ::Lingua::Stemmer.new(:encoding => "ISO_8859_1").encoding, "ISO_8859_1"
78
+ assert_equal ::Lingua::Stemmer.new(:encoding => "UTF-8").encoding, "UTF_8"
79
+ assert_equal ::Lingua::Stemmer.new(:encoding => "utf-8").encoding, "UTF_8"
80
+ assert_equal ::Lingua::Stemmer.new(:encoding => :ISO_8859_1).encoding, "ISO_8859_1"
81
+ end
82
+ end
83
+
84
+ if RUBY_VERSION >= '1.9'
85
+ def test_string_encoding
86
+ word = "așezare"
87
+
88
+ stem = ::Lingua.stemmer(word, :language => "ro", :encoding => "UTF_8")
89
+ assert_equal word.encoding, stem.encoding
90
+
91
+ s = ::Lingua::Stemmer.new(:language => "ro", :encoding => "UTF_8")
92
+ assert_equal s.stem(word).encoding, word.encoding
93
+
94
+ stem = ::Lingua.stemmer("installation", :language => "fr", :encoding => "ISO-8859-1")
95
+ assert_equal stem.encoding, Encoding::ISO_8859_1
96
+ end
97
+ end
98
+
54
99
  end
metadata CHANGED
@@ -1,12 +1,8 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: ruby-stemmer
3
3
  version: !ruby/object:Gem::Version
4
- prerelease: false
5
- segments:
6
- - 0
7
- - 8
8
- - 5
9
- version: 0.8.5
4
+ prerelease:
5
+ version: 0.9.0
10
6
  platform: ruby
11
7
  authors:
12
8
  - Aurelian Oancea
@@ -15,7 +11,7 @@ autorequire:
15
11
  bindir: bin
16
12
  cert_chain: []
17
13
 
18
- date: 2011-03-08 00:00:00 +01:00
14
+ date: 2011-03-12 00:00:00 +01:00
19
15
  default_executable:
20
16
  dependencies: []
21
17
 
@@ -135,21 +131,17 @@ required_ruby_version: !ruby/object:Gem::Requirement
135
131
  requirements:
136
132
  - - ">="
137
133
  - !ruby/object:Gem::Version
138
- segments:
139
- - 0
140
134
  version: "0"
141
135
  required_rubygems_version: !ruby/object:Gem::Requirement
142
136
  none: false
143
137
  requirements:
144
138
  - - ">="
145
139
  - !ruby/object:Gem::Version
146
- segments:
147
- - 0
148
140
  version: "0"
149
141
  requirements: []
150
142
 
151
143
  rubyforge_project: ruby-stemmer
152
- rubygems_version: 1.3.7
144
+ rubygems_version: 1.6.2
153
145
  signing_key:
154
146
  specification_version: 3
155
147
  summary: Expose libstemmer_c to Ruby.