stemmer4r 0.4 → 0.5
Sign up to get free protection for your applications and to get access to all the features.
- data/ext/stemmer4r/stemmer4r.c +61 -14
- data/stemmer4r.gemspec +1 -1
- data/test/test.rb +43 -6
- metadata +2 -2
data/ext/stemmer4r/stemmer4r.c
CHANGED
@@ -74,12 +74,22 @@ stemmer_free(struct sb_stemmer* stemmer)
|
|
74
74
|
*
|
75
75
|
* === Examples
|
76
76
|
*
|
77
|
-
*
|
78
|
-
*
|
77
|
+
* stemmer = Stemmer.new('en')
|
78
|
+
* puts stemmer.stem('testing')
|
79
79
|
*
|
80
|
-
*
|
81
|
-
* puts stemmer.stem('été')
|
80
|
+
* stemmer = Stemmer.new('fr')
|
82
81
|
*
|
82
|
+
* You can stem:
|
83
|
+
*
|
84
|
+
* * a single word:
|
85
|
+
* puts stemmer.stem('été')
|
86
|
+
*
|
87
|
+
* * an array:
|
88
|
+
* puts stemmer.stem(%w{télévision chapeau ordinateur})
|
89
|
+
*
|
90
|
+
* * a sentence:
|
91
|
+
* puts stemmer.stem("Une télévision sur un chapeau d'ordinateur")
|
92
|
+
*
|
83
93
|
*/
|
84
94
|
static VALUE
|
85
95
|
stemmer_allocate(VALUE klass, VALUE algorithm)
|
@@ -107,30 +117,67 @@ stemmer_allocate(VALUE klass, VALUE algorithm)
|
|
107
117
|
|
108
118
|
/*
|
109
119
|
* Document-method: stem
|
110
|
-
* call-seq: stem(
|
120
|
+
* call-seq: stem(obj)
|
111
121
|
*
|
112
|
-
* Stems
|
122
|
+
* Stems a word, a sentence or an array of words and returns the result (always in lowercase).
|
123
|
+
* +obj+ is always converted to lowercase before stemming (mandatory for snowball algorithms to work).
|
113
124
|
*
|
114
125
|
* === Parameters
|
115
126
|
*
|
116
|
-
* +
|
127
|
+
* +obj+:: word, sentence (string) or array of words to stem. All strings must be encoded the 'right' way (iso-8859-1 for French, for example).
|
117
128
|
*/
|
118
129
|
static VALUE
|
119
|
-
stemmer_stem(VALUE self, VALUE word)
|
130
|
+
stemmer_stem(VALUE self, VALUE obj)
|
120
131
|
{
|
121
132
|
VALUE ret;
|
133
|
+
VALUE word_lowercase;
|
122
134
|
sb_symbol *cword;
|
135
|
+
sb_symbol *t1;
|
123
136
|
const sb_symbol *rcword;
|
124
137
|
struct sb_stemmer *stemmer;
|
138
|
+
int i;
|
125
139
|
|
126
|
-
Check_Type(word, T_STRING);
|
127
|
-
|
128
|
-
cword = strdup(STR2CSTR(word));
|
129
140
|
Data_Get_Struct(self, struct sb_stemmer, stemmer);
|
130
|
-
rcword = sb_stemmer_stem(stemmer, cword, RSTRING(word)->len);
|
131
141
|
|
132
|
-
|
133
|
-
|
142
|
+
switch (TYPE(obj))
|
143
|
+
{
|
144
|
+
case T_STRING:
|
145
|
+
word_lowercase = rb_funcall2(obj, rb_intern("downcase"), 0, 0);
|
146
|
+
cword = strdup(STR2CSTR(word_lowercase));
|
147
|
+
// Sentence
|
148
|
+
if (strchr(cword, ' '))
|
149
|
+
{
|
150
|
+
ret = rb_str_new2("");
|
151
|
+
for (t1 = strtok(cword, " "); t1 != NULL; t1 = strtok(NULL, " "))
|
152
|
+
{
|
153
|
+
rb_str_cat2(ret, sb_stemmer_stem(stemmer, t1, strlen(t1)));
|
154
|
+
rb_str_cat2(ret, " ");
|
155
|
+
}
|
156
|
+
RSTRING(ret)->len--;
|
157
|
+
RSTRING(ret)->ptr[RSTRING(ret)->len] = '\0';
|
158
|
+
}
|
159
|
+
else
|
160
|
+
{
|
161
|
+
// A word
|
162
|
+
rcword = sb_stemmer_stem(stemmer, cword, RSTRING(obj)->len);
|
163
|
+
ret = rb_str_new2(rcword);
|
164
|
+
}
|
165
|
+
free(cword);
|
166
|
+
break;
|
167
|
+
case T_ARRAY:
|
168
|
+
ret = rb_ary_new2(RARRAY(obj)->len);
|
169
|
+
for (i = 0; i < RARRAY(obj)->len; i++)
|
170
|
+
{
|
171
|
+
word_lowercase = rb_funcall2(RARRAY(obj)->ptr[i], rb_intern("downcase"), 0, 0);
|
172
|
+
cword = strdup(STR2CSTR(word_lowercase));
|
173
|
+
rb_ary_push(ret, rb_str_new2(sb_stemmer_stem(stemmer, cword, RSTRING(RARRAY(obj)->ptr[i])->len)));
|
174
|
+
}
|
175
|
+
free(cword);
|
176
|
+
break;
|
177
|
+
default:
|
178
|
+
rb_raise(rb_eTypeError, "not valid value");
|
179
|
+
break;
|
180
|
+
}
|
134
181
|
|
135
182
|
return ret;
|
136
183
|
}
|
data/stemmer4r.gemspec
CHANGED
data/test/test.rb
CHANGED
@@ -7,9 +7,25 @@ rescue LoadError
|
|
7
7
|
require 'stemmer4r'
|
8
8
|
end
|
9
9
|
|
10
|
+
require 'benchmark'
|
10
11
|
require 'test/unit'
|
11
12
|
|
12
13
|
class StemmerTest < Test::Unit::TestCase
|
14
|
+
def test_sentence
|
15
|
+
s = Stemmer.new('fr')
|
16
|
+
assert_equal('il aim utilis le ordin dan le avion', s.stem('il aime utiliser les ordinateurs dans les avions'))
|
17
|
+
end
|
18
|
+
|
19
|
+
def test_array
|
20
|
+
s = Stemmer.new('fr')
|
21
|
+
assert_equal('ordin jeux avion aim', s.stem(%w{ordinateur jeux avion aimer}).join(' '))
|
22
|
+
end
|
23
|
+
|
24
|
+
def test_lowercase
|
25
|
+
s = Stemmer.new('fr')
|
26
|
+
assert_equal(s.stem('aimera'), s.stem('AIMERA'))
|
27
|
+
end
|
28
|
+
|
13
29
|
def test_valid_language
|
14
30
|
assert_not_nil(Stemmer.new('fr'))
|
15
31
|
assert_nothing_raised() { Stemmer.new('fr') }
|
@@ -18,13 +34,34 @@ class StemmerTest < Test::Unit::TestCase
|
|
18
34
|
|
19
35
|
def test_stemmer
|
20
36
|
dir = File.dirname(__FILE__)
|
37
|
+
input = {}
|
38
|
+
output = {}
|
39
|
+
s = {}
|
21
40
|
%w{da de nl en es fi fr it no pt ru sv}.each { |language|
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
41
|
+
input[language] = IO.read("#{dir}/tests/#{language}/voc.txt").split(/\s+/)
|
42
|
+
output[language] = IO.read("#{dir}/tests/#{language}/output.txt").split(/\s+/)
|
43
|
+
s[language] = Stemmer.new(language)
|
44
|
+
}
|
45
|
+
Benchmark.bm { |x|
|
46
|
+
x.report {
|
47
|
+
%w{da de nl en es fi fr it no pt ru sv}.each { |language|
|
48
|
+
puts "Testing '#{language}' algorithm (one word at a time)..."
|
49
|
+
input[language].length.times { |i|
|
50
|
+
assert_equal(output[language][i], s[language].stem(input[language][i]))
|
51
|
+
}
|
52
|
+
}
|
53
|
+
}
|
54
|
+
x.report {
|
55
|
+
%w{fr da de nl en es fi fr it no pt ru sv}.each { |language|
|
56
|
+
puts "Testing '#{language}' algorithm (a single array)..."
|
57
|
+
assert_equal(output[language], s[language].stem(input[language]))
|
58
|
+
}
|
59
|
+
}
|
60
|
+
x.report {
|
61
|
+
%w{fr da de nl en es fi fr it no pt ru sv}.each { |language|
|
62
|
+
puts "Testing '#{language}' algorithm (a single sentence)..."
|
63
|
+
assert_equal(output[language].join(' '), s[language].stem(input[language].join(' ')))
|
64
|
+
}
|
28
65
|
}
|
29
66
|
}
|
30
67
|
end
|
metadata
CHANGED
@@ -3,8 +3,8 @@ rubygems_version: 0.8.10
|
|
3
3
|
specification_version: 1
|
4
4
|
name: stemmer4r
|
5
5
|
version: !ruby/object:Gem::Version
|
6
|
-
version: "0.4"
|
7
|
-
date: 2005-05-
|
6
|
+
version: "0.5"
|
7
|
+
date: 2005-05-12
|
8
8
|
summary: Stemmer4r is a Ruby extension that wraps the snowball stemmer library (libstemmer).
|
9
9
|
require_paths:
|
10
10
|
- "."
|