stemmer4r 0.4 → 0.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/ext/stemmer4r/stemmer4r.c +61 -14
- data/stemmer4r.gemspec +1 -1
- data/test/test.rb +43 -6
- metadata +2 -2
data/ext/stemmer4r/stemmer4r.c
CHANGED
@@ -74,12 +74,22 @@ stemmer_free(struct sb_stemmer* stemmer)
|
|
74
74
|
*
|
75
75
|
* === Examples
|
76
76
|
*
|
77
|
-
*
|
78
|
-
*
|
77
|
+
* stemmer = Stemmer.new('en')
|
78
|
+
* puts stemmer.stem('testing')
|
79
79
|
*
|
80
|
-
*
|
81
|
-
* puts stemmer.stem('été')
|
80
|
+
* stemmer = Stemmer.new('fr')
|
82
81
|
*
|
82
|
+
* You can stem:
|
83
|
+
*
|
84
|
+
* * a single word:
|
85
|
+
* puts stemmer.stem('été')
|
86
|
+
*
|
87
|
+
* * an array:
|
88
|
+
* puts stemmer.stem(%w{télévision chapeau ordinateur})
|
89
|
+
*
|
90
|
+
* * a sentence:
|
91
|
+
* puts stemmer.stem("Une télévision sur un chapeau d'ordinateur")
|
92
|
+
*
|
83
93
|
*/
|
84
94
|
static VALUE
|
85
95
|
stemmer_allocate(VALUE klass, VALUE algorithm)
|
@@ -107,30 +117,67 @@ stemmer_allocate(VALUE klass, VALUE algorithm)
|
|
107
117
|
|
108
118
|
/*
|
109
119
|
* Document-method: stem
|
110
|
-
* call-seq: stem(
|
120
|
+
* call-seq: stem(obj)
|
111
121
|
*
|
112
|
-
* Stems
|
122
|
+
* Stems a word, a sentence or an array of words and returns the result (always in lowercase).
|
123
|
+
* +obj+ is always converted to lowercase before stemming (mandatory for snowball algorithms to work).
|
113
124
|
*
|
114
125
|
* === Parameters
|
115
126
|
*
|
116
|
-
* +
|
127
|
+
* +obj+:: word, sentence (string) or array of words to stem. All strings must be encoding the 'right' way (iso-8859-1 for french for example).
|
117
128
|
*/
|
118
129
|
static VALUE
|
119
|
-
stemmer_stem(VALUE self, VALUE word)
|
130
|
+
stemmer_stem(VALUE self, VALUE obj)
|
120
131
|
{
|
121
132
|
VALUE ret;
|
133
|
+
VALUE word_lowercase;
|
122
134
|
sb_symbol *cword;
|
135
|
+
sb_symbol *t1;
|
123
136
|
const sb_symbol *rcword;
|
124
137
|
struct sb_stemmer *stemmer;
|
138
|
+
int i;
|
125
139
|
|
126
|
-
Check_Type(word, T_STRING);
|
127
|
-
|
128
|
-
cword = strdup(STR2CSTR(word));
|
129
140
|
Data_Get_Struct(self, struct sb_stemmer, stemmer);
|
130
|
-
rcword = sb_stemmer_stem(stemmer, cword, RSTRING(word)->len);
|
131
141
|
|
132
|
-
|
133
|
-
|
142
|
+
switch (TYPE(obj))
|
143
|
+
{
|
144
|
+
case T_STRING:
|
145
|
+
word_lowercase = rb_funcall2(obj, rb_intern("downcase"), 0, 0);
|
146
|
+
cword = strdup(STR2CSTR(word_lowercase));
|
147
|
+
// Sentence
|
148
|
+
if (strchr(cword, ' '))
|
149
|
+
{
|
150
|
+
ret = rb_str_new2("");
|
151
|
+
for (t1 = strtok(cword, " "); t1 != NULL; t1 = strtok(NULL, " "))
|
152
|
+
{
|
153
|
+
rb_str_cat2(ret, sb_stemmer_stem(stemmer, t1, strlen(t1)));
|
154
|
+
rb_str_cat2(ret, " ");
|
155
|
+
}
|
156
|
+
RSTRING(ret)->len--;
|
157
|
+
RSTRING(ret)->ptr[RSTRING(ret)->len] = '\0';
|
158
|
+
}
|
159
|
+
else
|
160
|
+
{
|
161
|
+
// A word
|
162
|
+
rcword = sb_stemmer_stem(stemmer, cword, RSTRING(obj)->len);
|
163
|
+
ret = rb_str_new2(rcword);
|
164
|
+
}
|
165
|
+
free(cword);
|
166
|
+
break;
|
167
|
+
case T_ARRAY:
|
168
|
+
ret = rb_ary_new2(RARRAY(obj)->len);
|
169
|
+
for (i = 0; i < RARRAY(obj)->len; i++)
|
170
|
+
{
|
171
|
+
word_lowercase = rb_funcall2(RARRAY(obj)->ptr[i], rb_intern("downcase"), 0, 0);
|
172
|
+
cword = strdup(STR2CSTR(word_lowercase));
|
173
|
+
rb_ary_push(ret, rb_str_new2(sb_stemmer_stem(stemmer, cword, RSTRING(RARRAY(obj)->ptr[i])->len)));
|
174
|
+
}
|
175
|
+
free(cword);
|
176
|
+
break;
|
177
|
+
default:
|
178
|
+
rb_raise(rb_eTypeError, "not valid value");
|
179
|
+
break;
|
180
|
+
}
|
134
181
|
|
135
182
|
return ret;
|
136
183
|
}
|
data/stemmer4r.gemspec
CHANGED
data/test/test.rb
CHANGED
@@ -7,9 +7,25 @@ rescue LoadError
|
|
7
7
|
require 'stemmer4r'
|
8
8
|
end
|
9
9
|
|
10
|
+
require 'benchmark'
|
10
11
|
require 'test/unit'
|
11
12
|
|
12
13
|
class StemmerTest < Test::Unit::TestCase
|
14
|
+
def test_sentence
|
15
|
+
s = Stemmer.new('fr')
|
16
|
+
assert_equal('il aim utilis le ordin dan le avion', s.stem('il aime utiliser les ordinateurs dans les avions'))
|
17
|
+
end
|
18
|
+
|
19
|
+
def test_array
|
20
|
+
s = Stemmer.new('fr')
|
21
|
+
assert_equal('ordin jeux avion aim', s.stem(%w{ordinateur jeux avion aimer}).join(' '))
|
22
|
+
end
|
23
|
+
|
24
|
+
def test_lowercase
|
25
|
+
s = Stemmer.new('fr')
|
26
|
+
assert_equal(s.stem('aimera'), s.stem('AIMERA'))
|
27
|
+
end
|
28
|
+
|
13
29
|
def test_valid_language
|
14
30
|
assert_not_nil(Stemmer.new('fr'))
|
15
31
|
assert_nothing_raised() { Stemmer.new('fr') }
|
@@ -18,13 +34,34 @@ class StemmerTest < Test::Unit::TestCase
|
|
18
34
|
|
19
35
|
def test_stemmer
|
20
36
|
dir = File.dirname(__FILE__)
|
37
|
+
input = {}
|
38
|
+
output = {}
|
39
|
+
s = {}
|
21
40
|
%w{da de nl en es fi fr it no pt ru sv}.each { |language|
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
41
|
+
input[language] = IO.read("#{dir}/tests/#{language}/voc.txt").split(/\s+/)
|
42
|
+
output[language] = IO.read("#{dir}/tests/#{language}/output.txt").split(/\s+/)
|
43
|
+
s[language] = Stemmer.new(language)
|
44
|
+
}
|
45
|
+
Benchmark.bm { |x|
|
46
|
+
x.report {
|
47
|
+
%w{da de nl en es fi fr it no pt ru sv}.each { |language|
|
48
|
+
puts "Testing '#{language}' algorithm (one word at a time)..."
|
49
|
+
input[language].length.times { |i|
|
50
|
+
assert_equal(output[language][i], s[language].stem(input[language][i]))
|
51
|
+
}
|
52
|
+
}
|
53
|
+
}
|
54
|
+
x.report {
|
55
|
+
%w{fr da de nl en es fi fr it no pt ru sv}.each { |language|
|
56
|
+
puts "Testing '#{language}' algorithm (a single array)..."
|
57
|
+
assert_equal(output[language], s[language].stem(input[language]))
|
58
|
+
}
|
59
|
+
}
|
60
|
+
x.report {
|
61
|
+
%w{fr da de nl en es fi fr it no pt ru sv}.each { |language|
|
62
|
+
puts "Testing '#{language}' algorithm (a single sentence)..."
|
63
|
+
assert_equal(output[language].join(' '), s[language].stem(input[language].join(' ')))
|
64
|
+
}
|
28
65
|
}
|
29
66
|
}
|
30
67
|
end
|
metadata
CHANGED
@@ -3,8 +3,8 @@ rubygems_version: 0.8.10
|
|
3
3
|
specification_version: 1
|
4
4
|
name: stemmer4r
|
5
5
|
version: !ruby/object:Gem::Version
|
6
|
-
version: "0.4"
|
7
|
-
date: 2005-05-
|
6
|
+
version: "0.5"
|
7
|
+
date: 2005-05-12
|
8
8
|
summary: Stemmer4r is a Ruby extension that wraps the snowball stemmer library (libstemmer).
|
9
9
|
require_paths:
|
10
10
|
- "."
|