stemmer4r 0.4 → 0.5

Sign up to get free protection for your applications and to get access to all the features.
@@ -74,12 +74,22 @@ stemmer_free(struct sb_stemmer* stemmer)
74
74
  *
75
75
  * === Examples
76
76
  *
77
- * stemmer = Stemmer.new('en')
78
- * puts stemmer.stem('testing')
77
+ * stemmer = Stemmer.new('en')
78
+ * puts stemmer.stem('testing')
79
79
  *
80
- * stemmer = Stemmer.new('fr')
81
- * puts stemmer.stem('été')
80
+ * stemmer = Stemmer.new('fr')
82
81
  *
82
+ * You can stem:
83
+ *
84
+ * * a single word:
85
+ * puts stemmer.stem('été')
86
+ *
87
+ * * an array:
88
+ * puts stemmer.stem(%w{télévision chapeau ordinateur})
89
+ *
90
+ * * a sentence:
91
+ * puts stemmer.stem("Une télévision sur un chapeau d'ordinateur")
92
+ *
83
93
  */
84
94
  static VALUE
85
95
  stemmer_allocate(VALUE klass, VALUE algorithm)
@@ -107,30 +117,67 @@ stemmer_allocate(VALUE klass, VALUE algorithm)
107
117
 
108
118
  /*
109
119
  * Document-method: stem
110
- * call-seq: stem(str)
120
+ * call-seq: stem(obj)
111
121
  *
112
- * Stems string and returns the result.
122
+ * Stems a word, a sentence or an array of words and returns the result (always in lowercase).
123
+ * +obj+ is always converted to lowercase before stemming (mandatory for snowball algorithms to work).
113
124
  *
114
125
  * === Parameters
115
126
  *
116
- * +str+:: string to stem. String must be encoding the 'right' way (iso-8859-1 for french for example).
127
+ * +obj+:: word, sentence (string) or array of words to stem. All strings must be encoding the 'right' way (iso-8859-1 for french for example).
117
128
  */
118
129
  static VALUE
119
- stemmer_stem(VALUE self, VALUE word)
130
+ stemmer_stem(VALUE self, VALUE obj)
120
131
  {
121
132
  VALUE ret;
133
+ VALUE word_lowercase;
122
134
  sb_symbol *cword;
135
+ sb_symbol *t1;
123
136
  const sb_symbol *rcword;
124
137
  struct sb_stemmer *stemmer;
138
+ int i;
125
139
 
126
- Check_Type(word, T_STRING);
127
-
128
- cword = strdup(STR2CSTR(word));
129
140
  Data_Get_Struct(self, struct sb_stemmer, stemmer);
130
- rcword = sb_stemmer_stem(stemmer, cword, RSTRING(word)->len);
131
141
 
132
- ret = rb_str_new2(rcword);
133
- free(cword);
142
+ switch (TYPE(obj))
143
+ {
144
+ case T_STRING:
145
+ word_lowercase = rb_funcall2(obj, rb_intern("downcase"), 0, 0);
146
+ cword = strdup(STR2CSTR(word_lowercase));
147
+ // Sentence
148
+ if (strchr(cword, ' '))
149
+ {
150
+ ret = rb_str_new2("");
151
+ for (t1 = strtok(cword, " "); t1 != NULL; t1 = strtok(NULL, " "))
152
+ {
153
+ rb_str_cat2(ret, sb_stemmer_stem(stemmer, t1, strlen(t1)));
154
+ rb_str_cat2(ret, " ");
155
+ }
156
+ RSTRING(ret)->len--;
157
+ RSTRING(ret)->ptr[RSTRING(ret)->len] = '\0';
158
+ }
159
+ else
160
+ {
161
+ // A word
162
+ rcword = sb_stemmer_stem(stemmer, cword, RSTRING(obj)->len);
163
+ ret = rb_str_new2(rcword);
164
+ }
165
+ free(cword);
166
+ break;
167
+ case T_ARRAY:
168
+ ret = rb_ary_new2(RARRAY(obj)->len);
169
+ for (i = 0; i < RARRAY(obj)->len; i++)
170
+ {
171
+ word_lowercase = rb_funcall2(RARRAY(obj)->ptr[i], rb_intern("downcase"), 0, 0);
172
+ cword = strdup(STR2CSTR(word_lowercase));
173
+ rb_ary_push(ret, rb_str_new2(sb_stemmer_stem(stemmer, cword, RSTRING(RARRAY(obj)->ptr[i])->len)));
174
+ }
175
+ free(cword);
176
+ break;
177
+ default:
178
+ rb_raise(rb_eTypeError, "not valid value");
179
+ break;
180
+ }
134
181
 
135
182
  return ret;
136
183
  }
data/stemmer4r.gemspec CHANGED
@@ -3,7 +3,7 @@ require 'rake'
3
3
 
4
4
  spec = Gem::Specification.new do |s|
5
5
  s.name = 'stemmer4r'
6
- s.version = '0.4'
6
+ s.version = '0.5'
7
7
  s.author = "Fabien POTENCIER"
8
8
  s.email = "fabien.potencier@gmail.com"
9
9
  s.homepage = "http://stemmer4r.rubyforge.org"
data/test/test.rb CHANGED
@@ -7,9 +7,25 @@ rescue LoadError
7
7
  require 'stemmer4r'
8
8
  end
9
9
 
10
+ require 'benchmark'
10
11
  require 'test/unit'
11
12
 
12
13
  class StemmerTest < Test::Unit::TestCase
14
+ def test_sentence
15
+ s = Stemmer.new('fr')
16
+ assert_equal('il aim utilis le ordin dan le avion', s.stem('il aime utiliser les ordinateurs dans les avions'))
17
+ end
18
+
19
+ def test_array
20
+ s = Stemmer.new('fr')
21
+ assert_equal('ordin jeux avion aim', s.stem(%w{ordinateur jeux avion aimer}).join(' '))
22
+ end
23
+
24
+ def test_lowercase
25
+ s = Stemmer.new('fr')
26
+ assert_equal(s.stem('aimera'), s.stem('AIMERA'))
27
+ end
28
+
13
29
  def test_valid_language
14
30
  assert_not_nil(Stemmer.new('fr'))
15
31
  assert_nothing_raised() { Stemmer.new('fr') }
@@ -18,13 +34,34 @@ class StemmerTest < Test::Unit::TestCase
18
34
 
19
35
  def test_stemmer
20
36
  dir = File.dirname(__FILE__)
37
+ input = {}
38
+ output = {}
39
+ s = {}
21
40
  %w{da de nl en es fi fr it no pt ru sv}.each { |language|
22
- puts "Testing '#{language}' algorithm..."
23
- stemmer = Stemmer.new(language)
24
- input = File.new("#{dir}/tests/#{language}/voc.txt")
25
- output = File.new("#{dir}/tests/#{language}/output.txt")
26
- input.each_line { |word|
27
- assert_equal(output.gets.chomp!, stemmer.stem(word.chomp!))
41
+ input[language] = IO.read("#{dir}/tests/#{language}/voc.txt").split(/\s+/)
42
+ output[language] = IO.read("#{dir}/tests/#{language}/output.txt").split(/\s+/)
43
+ s[language] = Stemmer.new(language)
44
+ }
45
+ Benchmark.bm { |x|
46
+ x.report {
47
+ %w{da de nl en es fi fr it no pt ru sv}.each { |language|
48
+ puts "Testing '#{language}' algorithm (one word at a time)..."
49
+ input[language].length.times { |i|
50
+ assert_equal(output[language][i], s[language].stem(input[language][i]))
51
+ }
52
+ }
53
+ }
54
+ x.report {
55
+ %w{fr da de nl en es fi fr it no pt ru sv}.each { |language|
56
+ puts "Testing '#{language}' algorithm (a single array)..."
57
+ assert_equal(output[language], s[language].stem(input[language]))
58
+ }
59
+ }
60
+ x.report {
61
+ %w{fr da de nl en es fi fr it no pt ru sv}.each { |language|
62
+ puts "Testing '#{language}' algorithm (a single sentence)..."
63
+ assert_equal(output[language].join(' '), s[language].stem(input[language].join(' ')))
64
+ }
28
65
  }
29
66
  }
30
67
  end
metadata CHANGED
@@ -3,8 +3,8 @@ rubygems_version: 0.8.10
3
3
  specification_version: 1
4
4
  name: stemmer4r
5
5
  version: !ruby/object:Gem::Version
6
- version: "0.4"
7
- date: 2005-05-11
6
+ version: "0.5"
7
+ date: 2005-05-12
8
8
  summary: Stemmer4r is a Ruby extension that wraps the snowball stemmer library (libstemmer).
9
9
  require_paths:
10
10
  - "."