stemmer4r 0.4 → 0.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -74,12 +74,22 @@ stemmer_free(struct sb_stemmer* stemmer)
74
74
  *
75
75
  * === Examples
76
76
  *
77
- * stemmer = Stemmer.new('en')
78
- * puts stemmer.stem('testing')
77
+ * stemmer = Stemmer.new('en')
78
+ * puts stemmer.stem('testing')
79
79
  *
80
- * stemmer = Stemmer.new('fr')
81
- * puts stemmer.stem('été')
80
+ * stemmer = Stemmer.new('fr')
82
81
  *
82
+ * You can stem:
83
+ *
84
+ * * a single word:
85
+ * puts stemmer.stem('été')
86
+ *
87
+ * * an array:
88
+ * puts stemmer.stem(%w{télévision chapeau ordinateur})
89
+ *
90
+ * * a sentence:
91
+ * puts stemmer.stem("Une télévision sur un chapeau d'ordinateur")
92
+ *
83
93
  */
84
94
  static VALUE
85
95
  stemmer_allocate(VALUE klass, VALUE algorithm)
@@ -107,30 +117,67 @@ stemmer_allocate(VALUE klass, VALUE algorithm)
107
117
 
108
118
  /*
109
119
  * Document-method: stem
110
- * call-seq: stem(str)
120
+ * call-seq: stem(obj)
111
121
  *
112
- * Stems string and returns the result.
122
+ * Stems a word, a sentence or an array of words and returns the result (always in lowercase).
123
+ * +obj+ is always converted to lowercase before stemming (mandatory for snowball algorithms to work).
113
124
  *
114
125
  * === Parameters
115
126
  *
116
- * +str+:: string to stem. String must be encoded the 'right' way (iso-8859-1 for french for example).
127
+ * +obj+:: word, sentence (string) or array of words to stem. All strings must be encoded the 'right' way (iso-8859-1 for french for example).
117
128
  */
118
129
  static VALUE
119
- stemmer_stem(VALUE self, VALUE word)
130
+ stemmer_stem(VALUE self, VALUE obj)
120
131
  {
121
132
  VALUE ret;
133
+ VALUE word_lowercase;
122
134
  sb_symbol *cword;
135
+ sb_symbol *t1;
123
136
  const sb_symbol *rcword;
124
137
  struct sb_stemmer *stemmer;
138
+ int i;
125
139
 
126
- Check_Type(word, T_STRING);
127
-
128
- cword = strdup(STR2CSTR(word));
129
140
  Data_Get_Struct(self, struct sb_stemmer, stemmer);
130
- rcword = sb_stemmer_stem(stemmer, cword, RSTRING(word)->len);
131
141
 
132
- ret = rb_str_new2(rcword);
133
- free(cword);
142
+ switch (TYPE(obj))
143
+ {
144
+ case T_STRING:
145
+ word_lowercase = rb_funcall2(obj, rb_intern("downcase"), 0, 0);
146
+ cword = strdup(STR2CSTR(word_lowercase));
147
+ // Sentence
148
+ if (strchr(cword, ' '))
149
+ {
150
+ ret = rb_str_new2("");
151
+ for (t1 = strtok(cword, " "); t1 != NULL; t1 = strtok(NULL, " "))
152
+ {
153
+ rb_str_cat2(ret, sb_stemmer_stem(stemmer, t1, strlen(t1)));
154
+ rb_str_cat2(ret, " ");
155
+ }
156
+ RSTRING(ret)->len--;
157
+ RSTRING(ret)->ptr[RSTRING(ret)->len] = '\0';
158
+ }
159
+ else
160
+ {
161
+ // A word
162
+ rcword = sb_stemmer_stem(stemmer, cword, RSTRING(obj)->len);
163
+ ret = rb_str_new2(rcword);
164
+ }
165
+ free(cword);
166
+ break;
167
+ case T_ARRAY:
168
+ ret = rb_ary_new2(RARRAY(obj)->len);
169
+ for (i = 0; i < RARRAY(obj)->len; i++)
170
+ {
171
+ word_lowercase = rb_funcall2(RARRAY(obj)->ptr[i], rb_intern("downcase"), 0, 0);
172
+ cword = strdup(STR2CSTR(word_lowercase));
173
+ rb_ary_push(ret, rb_str_new2(sb_stemmer_stem(stemmer, cword, RSTRING(RARRAY(obj)->ptr[i])->len)));
174
+ }
175
+ free(cword);
176
+ break;
177
+ default:
178
+ rb_raise(rb_eTypeError, "not valid value");
179
+ break;
180
+ }
134
181
 
135
182
  return ret;
136
183
  }
data/stemmer4r.gemspec CHANGED
@@ -3,7 +3,7 @@ require 'rake'
3
3
 
4
4
  spec = Gem::Specification.new do |s|
5
5
  s.name = 'stemmer4r'
6
- s.version = '0.4'
6
+ s.version = '0.5'
7
7
  s.author = "Fabien POTENCIER"
8
8
  s.email = "fabien.potencier@gmail.com"
9
9
  s.homepage = "http://stemmer4r.rubyforge.org"
data/test/test.rb CHANGED
@@ -7,9 +7,25 @@ rescue LoadError
7
7
  require 'stemmer4r'
8
8
  end
9
9
 
10
+ require 'benchmark'
10
11
  require 'test/unit'
11
12
 
12
13
  class StemmerTest < Test::Unit::TestCase
14
+ def test_sentence
15
+ s = Stemmer.new('fr')
16
+ assert_equal('il aim utilis le ordin dan le avion', s.stem('il aime utiliser les ordinateurs dans les avions'))
17
+ end
18
+
19
+ def test_array
20
+ s = Stemmer.new('fr')
21
+ assert_equal('ordin jeux avion aim', s.stem(%w{ordinateur jeux avion aimer}).join(' '))
22
+ end
23
+
24
+ def test_lowercase
25
+ s = Stemmer.new('fr')
26
+ assert_equal(s.stem('aimera'), s.stem('AIMERA'))
27
+ end
28
+
13
29
  def test_valid_language
14
30
  assert_not_nil(Stemmer.new('fr'))
15
31
  assert_nothing_raised() { Stemmer.new('fr') }
@@ -18,13 +34,34 @@ class StemmerTest < Test::Unit::TestCase
18
34
 
19
35
  def test_stemmer
20
36
  dir = File.dirname(__FILE__)
37
+ input = {}
38
+ output = {}
39
+ s = {}
21
40
  %w{da de nl en es fi fr it no pt ru sv}.each { |language|
22
- puts "Testing '#{language}' algorithm..."
23
- stemmer = Stemmer.new(language)
24
- input = File.new("#{dir}/tests/#{language}/voc.txt")
25
- output = File.new("#{dir}/tests/#{language}/output.txt")
26
- input.each_line { |word|
27
- assert_equal(output.gets.chomp!, stemmer.stem(word.chomp!))
41
+ input[language] = IO.read("#{dir}/tests/#{language}/voc.txt").split(/\s+/)
42
+ output[language] = IO.read("#{dir}/tests/#{language}/output.txt").split(/\s+/)
43
+ s[language] = Stemmer.new(language)
44
+ }
45
+ Benchmark.bm { |x|
46
+ x.report {
47
+ %w{da de nl en es fi fr it no pt ru sv}.each { |language|
48
+ puts "Testing '#{language}' algorithm (one word at a time)..."
49
+ input[language].length.times { |i|
50
+ assert_equal(output[language][i], s[language].stem(input[language][i]))
51
+ }
52
+ }
53
+ }
54
+ x.report {
55
+ %w{fr da de nl en es fi fr it no pt ru sv}.each { |language|
56
+ puts "Testing '#{language}' algorithm (a single array)..."
57
+ assert_equal(output[language], s[language].stem(input[language]))
58
+ }
59
+ }
60
+ x.report {
61
+ %w{fr da de nl en es fi fr it no pt ru sv}.each { |language|
62
+ puts "Testing '#{language}' algorithm (a single sentence)..."
63
+ assert_equal(output[language].join(' '), s[language].stem(input[language].join(' ')))
64
+ }
28
65
  }
29
66
  }
30
67
  end
metadata CHANGED
@@ -3,8 +3,8 @@ rubygems_version: 0.8.10
3
3
  specification_version: 1
4
4
  name: stemmer4r
5
5
  version: !ruby/object:Gem::Version
6
- version: "0.4"
7
- date: 2005-05-11
6
+ version: "0.5"
7
+ date: 2005-05-12
8
8
  summary: Stemmer4r is a Ruby extension that wraps the snowball stemmer library (libstemmer).
9
9
  require_paths:
10
10
  - "."