fast-stemmer 1.0.0

Sign up to get free protection for your applications and to get access to all the features.
data/LICENSE ADDED
@@ -0,0 +1,20 @@
1
+ Copyright (c) 2008 Roman Shterenzon
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining
4
+ a copy of this software and associated documentation files (the
5
+ "Software"), to deal in the Software without restriction, including
6
+ without limitation the rights to use, copy, modify, merge, publish,
7
+ distribute, sublicense, and/or sell copies of the Software, and to
8
+ permit persons to whom the Software is furnished to do so, subject to
9
+ the following conditions:
10
+
11
+ The above copyright notice and this permission notice shall be
12
+ included in all copies or substantial portions of the Software.
13
+
14
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
17
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
18
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
19
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
20
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README ADDED
@@ -0,0 +1,22 @@
1
+ Fast-stemmer is simply a wrapper around the thread-safe C implementation
2
+ of the Porter stemming algorithm.
3
+
4
+ This gem adds a String#stem method, and it conflicts with the stemmer gem.
5
+ It's an order of magnitude faster (and uses much less memory) than the latter.
6
+
7
+ For the original work please see:
8
+ http://tartarus.org/~martin/PorterStemmer/
9
+
10
+ Usage:
11
+
12
+ require 'rubygems'
13
+ require 'fast_stemmer'
14
+
15
+ Stemmer::stem_word('running') # -> 'run'
16
+ 'running'.stem # -> 'run'
17
+
18
+
19
+ COPYRIGHT
20
+ =========
21
+
22
+ Copyright (c) 2008 Roman Shterenzon. See LICENSE for details.
data/Rakefile ADDED
@@ -0,0 +1,35 @@
1
+ require 'rake'
2
+
3
+ begin
4
+ require 'jeweler'
5
+ Jeweler::Tasks.new do |s|
6
+ s.name = "fast-stemmer"
7
+ s.summary = %Q{Fast Porter stemmer based on a C version of algorithm}
8
+ s.email = "romanbsd@yahoo.com"
9
+ s.homepage = "http://github.com/romanbsd/fast-stemmer"
10
+ s.description = s.summary
11
+ s.authors = ["Roman Shterenzon"]
12
+ s.extensions = ['ext/extconf.rb']
13
+ s.files = FileList["[A-Z]*", "{ext,lib,test}/**/*"]
14
+ end
15
+ rescue LoadError
16
+ puts "Jeweler not available. Install it with: sudo gem install technicalpickles-jeweler -s http://gems.github.com"
17
+ end
18
+
19
+ require 'rake/rdoctask'
20
+ Rake::RDocTask.new do |rdoc|
21
+ rdoc.rdoc_dir = 'rdoc'
22
+ rdoc.title = 'fast-stemmer'
23
+ rdoc.options << '--line-numbers' << '--inline-source'
24
+ rdoc.rdoc_files.include('README*')
25
+ rdoc.rdoc_files.include('lib/**/*.rb')
26
+ end
27
+
28
+ require 'rake/testtask'
29
+ Rake::TestTask.new(:test) do |t|
30
+ t.libs << 'lib' << 'test'
31
+ t.pattern = 'test/**/*_test.rb'
32
+ t.verbose = false
33
+ end
34
+
35
+ task :default => :test
data/VERSION.yml ADDED
@@ -0,0 +1,4 @@
1
+ ---
2
+ :major: 1
3
+ :minor: 0
4
+ :patch: 0
data/ext/extconf.rb ADDED
@@ -0,0 +1,2 @@
1
+ require 'mkmf'
2
+ create_makefile('stemmer')
data/ext/porter.c ADDED
@@ -0,0 +1,441 @@
1
+
2
+ /* This is the Porter stemming algorithm, coded up as thread-safe ANSI C
3
+ by the author.
4
+
5
+ It may be regarded as canonical, in that it follows the algorithm
6
+ presented in
7
+
8
+ Porter, 1980, An algorithm for suffix stripping, Program, Vol. 14,
9
+ no. 3, pp 130-137,
10
+
11
+ only differing from it at the points marked --DEPARTURE-- below.
12
+
13
+ See also http://www.tartarus.org/~martin/PorterStemmer
14
+
15
+ The algorithm as described in the paper could be exactly replicated
16
+ by adjusting the points of DEPARTURE, but this is barely necessary,
17
+ because (a) the points of DEPARTURE are definitely improvements, and
18
+ (b) no encoding of the Porter stemmer I have seen is anything like
19
+ as exact as this version, even with the points of DEPARTURE!
20
+
21
+ You can compile it on Unix with 'gcc -O3 -o stem stem.c' after which
22
+ 'stem' takes a list of inputs and sends the stemmed equivalent to
23
+ stdout.
24
+
25
+ The algorithm as encoded here is particularly fast.
26
+
27
+ Release 2 (the more old-fashioned, non-thread-safe version may be
28
+ regarded as release 1.)
29
+ */
30
+
31
+ #include <stdlib.h> /* for malloc, free */
32
+ #include <string.h> /* for memcmp, memmove */
33
+
34
+ /* You will probably want to move the following declarations to a central
35
+ header file.
36
+ */
37
+
38
+ struct stemmer;
39
+
40
+ extern struct stemmer * create_stemmer(void);
41
+ extern void free_stemmer(struct stemmer * z);
42
+
43
+ extern int stem(struct stemmer * z, char * b, int k);
44
+
45
+
46
+
47
+ /* The main part of the stemming algorithm starts here.
48
+ */
49
+
50
#define TRUE 1
#define FALSE 0

/* struct stemmer holds the working state of one stemming run; the
   algorithm keeps all of its state here rather than in globals.

   Member b is a buffer holding a word to be stemmed. The letters are in
   b[0], b[1] ... ending at b[z->k]. Member k is readjusted downwards as
   the stemming progresses. Zero termination is not in fact used in the
   algorithm.

   Note that only lower case sequences are stemmed. Forcing to lower case
   should be done before stem(...) is called.


   Typical usage is:

       struct stemmer * z = create_stemmer();
       char b[] = "pencils";
       int res = stem(z, b, 6);
           /- stem the 7 characters of b[0] to b[6]. The result, res,
              will be 5 (the 's' is removed). -/
       free_stemmer(z);
*/

struct stemmer {
   char * b;       /* buffer for word to be stemmed */
   int k;          /* offset to the end of the string */
   int j;          /* a general offset into the string */
};


/* create_stemmer() allocates a zero-filled stemmer. It returns NULL if the
   allocation fails. Release the result with free_stemmer(). */

struct stemmer * create_stemmer(void)
{
   return calloc(1, sizeof(struct stemmer));
}

/* free_stemmer(z) releases a stemmer obtained from create_stemmer(). The
   word buffer z->b belongs to the caller and is not freed. */

void free_stemmer(struct stemmer * z)
{
   free(z);
}


/* cons(z, i) is TRUE <=> b[i] is a consonant. ('b' means 'z->b', but here
   and below we drop 'z->' in comments.)

   A 'y' is a consonant at the start of the word, and otherwise exactly
   when the letter before it is not a consonant. Runs of 'y' are walked
   backwards in a loop, so the stack depth does not grow with the run.
*/

static int cons(struct stemmer * z, int i)
{
   int flipped = FALSE;   /* TRUE after stepping over an odd number of 'y's */
   for (;;)
   {  int consonant;
      switch (z->b[i])
      {  case 'a': case 'e': case 'i': case 'o': case 'u':
            consonant = FALSE; break;
         case 'y':
            if (i > 0) { flipped = !flipped; i--; continue; }
            consonant = TRUE; break;
         default:
            consonant = TRUE; break;
      }
      return flipped ? !consonant : consonant;
   }
}

/* m(z) measures the number of consonant sequences between 0 and j. if c is
   a consonant sequence and v a vowel sequence, and <..> indicates arbitrary
   presence,

      <c><v>       gives 0
      <c>vc<v>     gives 1
      <c>vcvc<v>   gives 2
      <c>vcvcvc<v> gives 3
      ....
*/

static int m(struct stemmer * z)
{
   int n = 0;
   int i = 0;
   int j = z->j;

   /* skip the optional leading consonant sequence */
   while (TRUE)
   {  if (i > j) return n;
      if (! cons(z, i)) break;
      i++;
   }
   i++;
   while (TRUE)
   {  /* skip a vowel sequence; reaching a consonant completes one 'vc' */
      while (TRUE)
      {  if (i > j) return n;
         if (cons(z, i)) break;
         i++;
      }
      i++;
      n++;
      /* skip the rest of that consonant sequence */
      while (TRUE)
      {  if (i > j) return n;
         if (! cons(z, i)) break;
         i++;
      }
      i++;
   }
}

/* vowelinstem(z) is TRUE <=> 0,...j contains a vowel */

static int vowelinstem(struct stemmer * z)
{
   int j = z->j;
   int i;
   for (i = 0; i <= j; i++)
   {  if (! cons(z, i)) return TRUE;
   }
   return FALSE;
}

/* doublec(z, j) is TRUE <=> j,(j-1) contain a double consonant. */

static int doublec(struct stemmer * z, int j)
{
   char * b = z->b;
   if (j < 1) return FALSE;
   if (b[j] != b[j - 1]) return FALSE;
   return cons(z, j);
}

/* cvc(z, i) is TRUE <=> i-2,i-1,i has the form consonant - vowel - consonant
   and also if the second c is not w,x or y. this is used when trying to
   restore an e at the end of a short word. e.g.

      cav(e), lov(e), hop(e), crim(e), but
      snow, box, tray.

*/

static int cvc(struct stemmer * z, int i)
{
   if (i < 2 || !cons(z, i) || cons(z, i - 1) || !cons(z, i - 2)) return FALSE;
   {  int ch = z->b[i];
      if (ch == 'w' || ch == 'x' || ch == 'y') return FALSE;
   }
   return TRUE;
}

/* ends(z, s) is TRUE <=> 0,...k ends with the string s. On success j is
   set to the offset of the last character before the suffix.

   Here and in setto() and r(), s is a counted string: s[0] holds the
   length and the characters follow from s[1]. */

static int ends(struct stemmer * z, const char * s)
{
   int length = s[0];
   char * b = z->b;
   int k = z->k;
   if (s[length] != b[k]) return FALSE; /* tiny speed-up */
   if (length > k + 1) return FALSE;
   if (memcmp(b + k - length + 1, s + 1, (size_t) length) != 0) return FALSE;
   z->j = k - length;
   return TRUE;
}

/* setto(z, s) sets (j+1),...k to the characters in the string s, readjusting
   k. */

static void setto(struct stemmer * z, const char * s)
{
   int length = s[0];
   int j = z->j;
   memmove(z->b + j + 1, s + 1, (size_t) length);
   z->k = j + length;
}

/* r(z, s) is used further down: it applies setto(z, s) only if m(z) > 0. */

static void r(struct stemmer * z, const char * s)
{
   if (m(z) > 0) setto(z, s);
}

/* step1ab(z) gets rid of plurals and -ed or -ing. e.g.

       caresses  ->  caress
       ponies    ->  poni
       ties      ->  ti
       caress    ->  caress
       cats      ->  cat

       feed      ->  feed
       agreed    ->  agree
       disabled  ->  disable

       matting   ->  mat
       mating    ->  mate
       meeting   ->  meet
       milling   ->  mill
       messing   ->  mess

       meetings  ->  meet

*/

static void step1ab(struct stemmer * z)
{
   char * b = z->b;
   if (b[z->k] == 's')
   {  if (ends(z, "\04" "sses")) z->k -= 2; else
      if (ends(z, "\03" "ies")) setto(z, "\01" "i"); else
      if (b[z->k - 1] != 's') z->k--;
   }
   if (ends(z, "\03" "eed")) { if (m(z) > 0) z->k--; } else
   if ((ends(z, "\02" "ed") || ends(z, "\03" "ing")) && vowelinstem(z))
   {  z->k = z->j;
      if (ends(z, "\02" "at")) setto(z, "\03" "ate"); else
      if (ends(z, "\02" "bl")) setto(z, "\03" "ble"); else
      if (ends(z, "\02" "iz")) setto(z, "\03" "ize"); else
      if (doublec(z, z->k))
      {  z->k--;
         {  int ch = b[z->k];
            if (ch == 'l' || ch == 's' || ch == 'z') z->k++;
         }
      }
      else if (m(z) == 1 && cvc(z, z->k)) setto(z, "\01" "e");
   }
}

/* step1c(z) turns terminal y to i when there is another vowel in the stem. */

static void step1c(struct stemmer * z)
{
   if (ends(z, "\01" "y") && vowelinstem(z)) z->b[z->k] = 'i';
}


/* step2(z) maps double suffices to single ones. so -ization ( = -ize plus
   -ation) maps to -ize etc. note that the string before the suffix must give
   m(z) > 0. */

static void step2(struct stemmer * z)
{
   /* step 1 can leave a single letter (k == 0); no suffix below can match
      it, and b[k-1] would lie before the start of the buffer */
   if (z->k < 1) return;

   switch (z->b[z->k - 1])
   {
      case 'a': if (ends(z, "\07" "ational")) { r(z, "\03" "ate"); break; }
                if (ends(z, "\06" "tional")) { r(z, "\04" "tion"); break; }
                break;
      case 'c': if (ends(z, "\04" "enci")) { r(z, "\04" "ence"); break; }
                if (ends(z, "\04" "anci")) { r(z, "\04" "ance"); break; }
                break;
      case 'e': if (ends(z, "\04" "izer")) { r(z, "\03" "ize"); break; }
                break;
      case 'l': if (ends(z, "\03" "bli")) { r(z, "\03" "ble"); break; } /*-DEPARTURE-*/

    /* To match the published algorithm, replace this line with
      case 'l': if (ends(z, "\04" "abli")) { r(z, "\04" "able"); break; } */

                if (ends(z, "\04" "alli")) { r(z, "\02" "al"); break; }
                if (ends(z, "\05" "entli")) { r(z, "\03" "ent"); break; }
                if (ends(z, "\03" "eli")) { r(z, "\01" "e"); break; }
                if (ends(z, "\05" "ousli")) { r(z, "\03" "ous"); break; }
                break;
      case 'o': if (ends(z, "\07" "ization")) { r(z, "\03" "ize"); break; }
                if (ends(z, "\05" "ation")) { r(z, "\03" "ate"); break; }
                if (ends(z, "\04" "ator")) { r(z, "\03" "ate"); break; }
                break;
      case 's': if (ends(z, "\05" "alism")) { r(z, "\02" "al"); break; }
                if (ends(z, "\07" "iveness")) { r(z, "\03" "ive"); break; }
                if (ends(z, "\07" "fulness")) { r(z, "\03" "ful"); break; }
                if (ends(z, "\07" "ousness")) { r(z, "\03" "ous"); break; }
                break;
      case 't': if (ends(z, "\05" "aliti")) { r(z, "\02" "al"); break; }
                if (ends(z, "\05" "iviti")) { r(z, "\03" "ive"); break; }
                if (ends(z, "\06" "biliti")) { r(z, "\03" "ble"); break; }
                break;
      case 'g': if (ends(z, "\04" "logi")) { r(z, "\03" "log"); break; } /*-DEPARTURE-*/

    /* To match the published algorithm, delete this line */

   }
}

/* step3(z) deals with -ic-, -full, -ness etc. similar strategy to step2. */

static void step3(struct stemmer * z)
{
   switch (z->b[z->k])
   {
      case 'e': if (ends(z, "\05" "icate")) { r(z, "\02" "ic"); break; }
                if (ends(z, "\05" "ative")) { r(z, "\00" ""); break; }
                if (ends(z, "\05" "alize")) { r(z, "\02" "al"); break; }
                break;
      case 'i': if (ends(z, "\05" "iciti")) { r(z, "\02" "ic"); break; }
                break;
      case 'l': if (ends(z, "\04" "ical")) { r(z, "\02" "ic"); break; }
                if (ends(z, "\03" "ful")) { r(z, "\00" ""); break; }
                break;
      case 's': if (ends(z, "\04" "ness")) { r(z, "\00" ""); break; }
                break;
   }
}

/* step4(z) takes off -ant, -ence etc., in context <c>vcvc<v>. */

static void step4(struct stemmer * z)
{
   /* as in step2: a single remaining letter has no b[k-1] to inspect */
   if (z->k < 1) return;

   switch (z->b[z->k - 1])
   {  case 'a': if (ends(z, "\02" "al")) break; return;
      case 'c': if (ends(z, "\04" "ance")) break;
                if (ends(z, "\04" "ence")) break; return;
      case 'e': if (ends(z, "\02" "er")) break; return;
      case 'i': if (ends(z, "\02" "ic")) break; return;
      case 'l': if (ends(z, "\04" "able")) break;
                if (ends(z, "\04" "ible")) break; return;
      case 'n': if (ends(z, "\03" "ant")) break;
                if (ends(z, "\05" "ement")) break;
                if (ends(z, "\04" "ment")) break;
                if (ends(z, "\03" "ent")) break; return;
                /* j is -1 when the whole word is "ion"; b[j] must not be
                   read then */
      case 'o': if (ends(z, "\03" "ion") && z->j >= 0 &&
                    (z->b[z->j] == 's' || z->b[z->j] == 't')) break;
                if (ends(z, "\02" "ou")) break; return;
                /* takes care of -ous */
      case 's': if (ends(z, "\03" "ism")) break; return;
      case 't': if (ends(z, "\03" "ate")) break;
                if (ends(z, "\03" "iti")) break; return;
      case 'u': if (ends(z, "\03" "ous")) break; return;
      case 'v': if (ends(z, "\03" "ive")) break; return;
      case 'z': if (ends(z, "\03" "ize")) break; return;
      default: return;
   }
   if (m(z) > 1) z->k = z->j;
}

/* step5(z) removes a final -e if m(z) > 1, and changes -ll to -l if
   m(z) > 1. */

static void step5(struct stemmer * z)
{
   char * b = z->b;
   z->j = z->k;
   if (b[z->k] == 'e')
   {  int a = m(z);
      if (a > 1 || (a == 1 && !cvc(z, z->k - 1))) z->k--;
   }
   if (b[z->k] == 'l' && doublec(z, z->k) && m(z) > 1) z->k--;
}

/* In stem(z, b, k), b is a char pointer, and the string to be stemmed is
   from b[0] to b[k] inclusive. Possibly b[k+1] == '\0', but it is not
   important. The stemmer adjusts the characters b[0] ... b[k] and returns
   the new end-point of the string, k'. Stemming never increases word
   length, so 0 <= k' <= k. Values of k <= 1 are returned unchanged.
*/

int stem(struct stemmer * z, char * b, int k)
{
   /* With this line, strings of length 1 or 2 don't go through the
      stemming process, although no mention is made of this in the
      published algorithm. Remove the line to match the published
      algorithm. */

   if (k <= 1) return k; /*-DEPARTURE-*/

   z->b = b; z->k = k; /* copy the parameters into z */

   step1ab(z); step1c(z); step2(z); step3(z); step4(z); step5(z);
   return z->k;
}
384
+
385
+ /*--------------------stemmer definition ends here------------------------*/
386
#if 0
/* Demonstration driver, compiled out by the surrounding #if 0. It stems
   every word of the files named on the command line and writes the result
   to stdout. */

#include <stdio.h>
#include <stdlib.h>      /* for malloc, realloc, free, exit */
#include <ctype.h>       /* for isupper, islower, tolower */

static char * s;         /* buffer for words to be stemmed */

#define INC 50           /* size units in which s is increased */
static int i_max = INC;  /* maximum offset in s */

#define LETTER(ch) (isupper(ch) || islower(ch))

/* stemfile(z, f) copies f to stdout. Each run of letters is forced to
   lower case, stemmed and printed; every other character is passed
   through unchanged. */

void stemfile(struct stemmer * z, FILE * f)
{  while(TRUE)
   {  int ch = getc(f);
      if (ch == EOF) return;
      if (LETTER(ch))
      {  int i = 0;
         while(TRUE)
         {  if (i == i_max)
            {  /* grow through a temporary so s is not lost on failure */
               char * grown = realloc(s, i_max + INC + 1);
               if (grown == 0) { fprintf(stderr,"Out of memory\n"); exit(1); }
               s = grown;
               i_max += INC;
            }
            ch = tolower(ch); /* forces lower case */

            s[i] = ch; i++;
            ch = getc(f);
            if (!LETTER(ch)) { ungetc(ch,f); break; }
         }
         s[stem(z, s, i - 1) + 1] = 0;
         /* the previous line calls the stemmer and uses its result to
            zero-terminate the string in s */
         printf("%s",s);
      }
      else putchar(ch);
   }
}

/* main stems each file named in argv in turn; it exits with status 1 if a
   file cannot be opened or memory runs out. */

int main(int argc, char * argv[])
{  int i;

   struct stemmer * z = create_stemmer();

   s = (char *) malloc(i_max + 1);
   if (z == 0 || s == 0) { fprintf(stderr,"Out of memory\n"); exit(1); }
   for (i = 1; i < argc; i++)
   {  FILE * f = fopen(argv[i],"r");
      if (f == 0) { fprintf(stderr,"File %s not found\n",argv[i]); exit(1); }
      stemfile(z, f);
      fclose(f);
   }
   free(s);

   free_stemmer(z);

   return 0;
}
#endif
data/ext/porter_wrap.c ADDED
@@ -0,0 +1,47 @@
1
+ #include <ruby.h>
2
+
3
+ #ifndef RSTRING_PTR
4
+ #define RSTRING_PTR(str) (RSTRING(str)->ptr)
5
+ #endif
6
+
7
+ extern struct stemmer * create_stemmer(void);
8
+ extern void free_stemmer(struct stemmer * z);
9
+ extern int stem(struct stemmer * z, char * b, int k);
10
+
11
/* Working state handed to stem(). This definition is copied from porter.c
   and must be kept identical to the one there. */
struct stemmer {
  char *b;  /* buffer for word to be stemmed */
  int k;    /* offset to the end of the string */
  int j;    /* a general offset into the string */
};
17
+
18
+ static VALUE stem_word(VALUE self, VALUE arg)
19
+ {
20
+ int length, i;
21
+ char *word;
22
+ char *res;
23
+ struct stemmer z;
24
+ VALUE str, rv;
25
+
26
+ str = StringValue(arg);
27
+ word = RSTRING_PTR(str);
28
+
29
+ length = stem(&z, word, strlen(word)-1);
30
+ /* length is the index of last char, add one for size and one for '\0' */
31
+ res = (char *)malloc((length+2) * sizeof(char));
32
+ for (i=0; i<=length; i++)
33
+ {
34
+ res[i] = word[i];
35
+ }
36
+ res[length+1] = 0;
37
+ rv = rb_str_new2(res);
38
+ free(res);
39
+ return rv;
40
+ }
41
+
42
+ VALUE mStemmer;
43
+
44
+ void Init_stemmer(void) {
45
+ mStemmer = rb_define_module("Stemmer");
46
+ rb_define_module_function(mStemmer, "stem_word", stem_word, 1);
47
+ }
@@ -0,0 +1,7 @@
1
+ require 'stemmer'
2
+
3
# Reopens String to add a stemming shortcut.
class String
  # Returns the stem of this string, as computed by Stemmer.stem_word.
  #
  #   'running'.stem # -> 'run'
  def stem
    Stemmer.stem_word(self)
  end
end
@@ -0,0 +1,19 @@
1
+ #!/usr/bin/env ruby
2
+ require 'test/unit'
3
+ require File.join(File.dirname(__FILE__), '..', 'lib', 'fast_stemmer')
4
+
5
# Checks String#stem against a table of known word => stem pairs.
class TestStemmer < Test::Unit::TestCase
  # Builds the word => expected stem table used by test_stems.
  def setup
    @stems = {
      'riding'      => 'ride',
      'forestalled' => 'forestal',
      'combined'    => 'combin',
      'ran'         => 'ran',
      'seen'        => 'seen',
      'excused'     => 'excus'
    }
  end

  # Every word in the table must stem to its expected value.
  def test_stems
    @stems.each_pair do |word, expected|
      assert_equal(expected, word.stem)
    end
  end
end
metadata ADDED
@@ -0,0 +1,64 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: fast-stemmer
3
+ version: !ruby/object:Gem::Version
4
+ version: 1.0.0
5
+ platform: ruby
6
+ authors:
7
+ - Roman Shterenzon
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+
12
+ date: 2009-11-18 00:00:00 -08:00
13
+ default_executable:
14
+ dependencies: []
15
+
16
+ description: Fast Porter stemmer based on a C version of algorithm
17
+ email: romanbsd@yahoo.com
18
+ executables: []
19
+
20
+ extensions:
21
+ - ext/extconf.rb
22
+ extra_rdoc_files:
23
+ - LICENSE
24
+ - README
25
+ files:
26
+ - LICENSE
27
+ - README
28
+ - Rakefile
29
+ - VERSION.yml
30
+ - ext/extconf.rb
31
+ - ext/porter.c
32
+ - ext/porter_wrap.c
33
+ - lib/fast_stemmer.rb
34
+ - test/fast_stemmer_test.rb
35
+ has_rdoc: true
36
+ homepage: http://github.com/romanbsd/fast-stemmer
37
+ licenses: []
38
+
39
+ post_install_message:
40
+ rdoc_options:
41
+ - --charset=UTF-8
42
+ require_paths:
43
+ - lib
44
+ required_ruby_version: !ruby/object:Gem::Requirement
45
+ requirements:
46
+ - - ">="
47
+ - !ruby/object:Gem::Version
48
+ version: "0"
49
+ version:
50
+ required_rubygems_version: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - ">="
53
+ - !ruby/object:Gem::Version
54
+ version: "0"
55
+ version:
56
+ requirements: []
57
+
58
+ rubyforge_project:
59
+ rubygems_version: 1.3.5
60
+ signing_key:
61
+ specification_version: 3
62
+ summary: Fast Porter stemmer based on a C version of algorithm
63
+ test_files:
64
+ - test/fast_stemmer_test.rb