stemmer4r 0.1
Sign up to get free protection for your applications and to get access to all the features.
- data/CVS/Entries +5 -0
- data/CVS/Repository +1 -0
- data/CVS/Root +1 -0
- data/LICENSE +20 -0
- data/README +9 -0
- data/ext/CVS/Entries +1 -0
- data/ext/CVS/Repository +1 -0
- data/ext/CVS/Root +1 -0
- data/ext/stemmer4r/CVS/Entries +4 -0
- data/ext/stemmer4r/CVS/Repository +1 -0
- data/ext/stemmer4r/CVS/Root +1 -0
- data/ext/stemmer4r/depend +14 -0
- data/ext/stemmer4r/extconf.rb +8 -0
- data/ext/stemmer4r/libstemmer_c/CVS/Entries +7 -0
- data/ext/stemmer4r/libstemmer_c/CVS/Repository +1 -0
- data/ext/stemmer4r/libstemmer_c/CVS/Root +1 -0
- data/ext/stemmer4r/libstemmer_c/MANIFEST +39 -0
- data/ext/stemmer4r/libstemmer_c/Makefile +5 -0
- data/ext/stemmer4r/libstemmer_c/include/CVS/Entries +2 -0
- data/ext/stemmer4r/libstemmer_c/include/CVS/Repository +1 -0
- data/ext/stemmer4r/libstemmer_c/include/CVS/Root +1 -0
- data/ext/stemmer4r/libstemmer_c/include/libstemmer.h +63 -0
- data/ext/stemmer4r/libstemmer_c/libstemmer/CVS/Entries +3 -0
- data/ext/stemmer4r/libstemmer_c/libstemmer/CVS/Repository +1 -0
- data/ext/stemmer4r/libstemmer_c/libstemmer/CVS/Root +1 -0
- data/ext/stemmer4r/libstemmer_c/libstemmer/libstemmer.c +78 -0
- data/ext/stemmer4r/libstemmer_c/libstemmer/modules.h +96 -0
- data/ext/stemmer4r/libstemmer_c/mkinc.mak +42 -0
- data/ext/stemmer4r/libstemmer_c/runtime/CVS/Entries +5 -0
- data/ext/stemmer4r/libstemmer_c/runtime/CVS/Repository +1 -0
- data/ext/stemmer4r/libstemmer_c/runtime/CVS/Root +1 -0
- data/ext/stemmer4r/libstemmer_c/runtime/api.c +69 -0
- data/ext/stemmer4r/libstemmer_c/runtime/api.h +27 -0
- data/ext/stemmer4r/libstemmer_c/runtime/header.h +56 -0
- data/ext/stemmer4r/libstemmer_c/runtime/utilities.c +403 -0
- data/ext/stemmer4r/libstemmer_c/src_c/CVS/Entries +33 -0
- data/ext/stemmer4r/libstemmer_c/src_c/CVS/Repository +1 -0
- data/ext/stemmer4r/libstemmer_c/src_c/CVS/Root +1 -0
- data/ext/stemmer4r/libstemmer_c/src_c/stem_danish.c +330 -0
- data/ext/stemmer4r/libstemmer_c/src_c/stem_danish.h +16 -0
- data/ext/stemmer4r/libstemmer_c/src_c/stem_dutch.c +635 -0
- data/ext/stemmer4r/libstemmer_c/src_c/stem_dutch.h +16 -0
- data/ext/stemmer4r/libstemmer_c/src_c/stem_english.c +1109 -0
- data/ext/stemmer4r/libstemmer_c/src_c/stem_english.h +16 -0
- data/ext/stemmer4r/libstemmer_c/src_c/stem_finnish.c +792 -0
- data/ext/stemmer4r/libstemmer_c/src_c/stem_finnish.h +16 -0
- data/ext/stemmer4r/libstemmer_c/src_c/stem_french.c +1276 -0
- data/ext/stemmer4r/libstemmer_c/src_c/stem_french.h +16 -0
- data/ext/stemmer4r/libstemmer_c/src_c/stem_german.c +504 -0
- data/ext/stemmer4r/libstemmer_c/src_c/stem_german.h +16 -0
- data/ext/stemmer4r/libstemmer_c/src_c/stem_german2.c +549 -0
- data/ext/stemmer4r/libstemmer_c/src_c/stem_german2.h +16 -0
- data/ext/stemmer4r/libstemmer_c/src_c/stem_italian.c +1087 -0
- data/ext/stemmer4r/libstemmer_c/src_c/stem_italian.h +16 -0
- data/ext/stemmer4r/libstemmer_c/src_c/stem_kraaij_pohlmann.c +1780 -0
- data/ext/stemmer4r/libstemmer_c/src_c/stem_kraaij_pohlmann.h +16 -0
- data/ext/stemmer4r/libstemmer_c/src_c/stem_lovins.c +1752 -0
- data/ext/stemmer4r/libstemmer_c/src_c/stem_lovins.h +16 -0
- data/ext/stemmer4r/libstemmer_c/src_c/stem_norwegian.c +279 -0
- data/ext/stemmer4r/libstemmer_c/src_c/stem_norwegian.h +16 -0
- data/ext/stemmer4r/libstemmer_c/src_c/stem_porter.c +776 -0
- data/ext/stemmer4r/libstemmer_c/src_c/stem_porter.h +16 -0
- data/ext/stemmer4r/libstemmer_c/src_c/stem_portuguese.c +1027 -0
- data/ext/stemmer4r/libstemmer_c/src_c/stem_portuguese.h +16 -0
- data/ext/stemmer4r/libstemmer_c/src_c/stem_russian.c +701 -0
- data/ext/stemmer4r/libstemmer_c/src_c/stem_russian.h +16 -0
- data/ext/stemmer4r/libstemmer_c/src_c/stem_spanish.c +1109 -0
- data/ext/stemmer4r/libstemmer_c/src_c/stem_spanish.h +16 -0
- data/ext/stemmer4r/libstemmer_c/src_c/stem_swedish.c +299 -0
- data/ext/stemmer4r/libstemmer_c/src_c/stem_swedish.h +16 -0
- data/ext/stemmer4r/stemmer4r.c +146 -0
- data/stemmer4r.gemspec +23 -0
- data/test/CVS/Entries +2 -0
- data/test/CVS/Repository +1 -0
- data/test/CVS/Root +1 -0
- data/test/test.rb +31 -0
- data/test/tests/CVS/Entries +12 -0
- data/test/tests/CVS/Repository +1 -0
- data/test/tests/CVS/Root +1 -0
- data/test/tests/da/CVS/Entries +3 -0
- data/test/tests/da/CVS/Repository +1 -0
- data/test/tests/da/CVS/Root +1 -0
- data/test/tests/da/output.txt +23829 -0
- data/test/tests/da/voc.txt +23829 -0
- data/test/tests/de/CVS/Entries +3 -0
- data/test/tests/de/CVS/Repository +1 -0
- data/test/tests/de/CVS/Root +1 -0
- data/test/tests/de/output.txt +35033 -0
- data/test/tests/de/voc.txt +35033 -0
- data/test/tests/en/CVS/Entries +3 -0
- data/test/tests/en/CVS/Repository +1 -0
- data/test/tests/en/CVS/Root +1 -0
- data/test/tests/en/output.txt +29400 -0
- data/test/tests/en/voc.txt +29400 -0
- data/test/tests/es/CVS/Entries +3 -0
- data/test/tests/es/CVS/Repository +1 -0
- data/test/tests/es/CVS/Root +1 -0
- data/test/tests/es/output.txt +28390 -0
- data/test/tests/es/voc.txt +28390 -0
- data/test/tests/fi/CVS/Entries +3 -0
- data/test/tests/fi/CVS/Repository +1 -0
- data/test/tests/fi/CVS/Root +1 -0
- data/test/tests/fi/output.txt +50000 -0
- data/test/tests/fi/voc.txt +50000 -0
- data/test/tests/fr/CVS/Entries +3 -0
- data/test/tests/fr/CVS/Repository +1 -0
- data/test/tests/fr/CVS/Root +1 -0
- data/test/tests/fr/output.txt +20403 -0
- data/test/tests/fr/voc.txt +20403 -0
- data/test/tests/it/CVS/Entries +3 -0
- data/test/tests/it/CVS/Repository +1 -0
- data/test/tests/it/CVS/Root +1 -0
- data/test/tests/it/output.txt +35494 -0
- data/test/tests/it/voc.txt +35494 -0
- data/test/tests/nl/CVS/Entries +3 -0
- data/test/tests/nl/CVS/Repository +1 -0
- data/test/tests/nl/CVS/Root +1 -0
- data/test/tests/nl/output.txt +45669 -0
- data/test/tests/nl/voc.txt +45669 -0
- data/test/tests/no/CVS/Entries +3 -0
- data/test/tests/no/CVS/Repository +1 -0
- data/test/tests/no/CVS/Root +1 -0
- data/test/tests/no/output.txt +20628 -0
- data/test/tests/no/voc.txt +20628 -0
- data/test/tests/pt/CVS/Entries +3 -0
- data/test/tests/pt/CVS/Repository +1 -0
- data/test/tests/pt/CVS/Root +1 -0
- data/test/tests/pt/output.txt +32016 -0
- data/test/tests/pt/voc.txt +32016 -0
- data/test/tests/ru/CVS/Entries +3 -0
- data/test/tests/ru/CVS/Repository +1 -0
- data/test/tests/ru/CVS/Root +1 -0
- data/test/tests/ru/output.txt +49673 -0
- data/test/tests/ru/voc.txt +49673 -0
- data/test/tests/sv/CVS/Entries +3 -0
- data/test/tests/sv/CVS/Repository +1 -0
- data/test/tests/sv/CVS/Root +1 -0
- data/test/tests/sv/output.txt +30623 -0
- data/test/tests/sv/voc.txt +30623 -0
- metadata +221 -0
data/CVS/Entries
ADDED
data/CVS/Repository
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
stemmer4r
|
data/CVS/Root
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
:ext:fabpot@rubyforge.org:/var/cvs/stemmer4r
|
data/LICENSE
ADDED
@@ -0,0 +1,20 @@
|
|
1
|
+
Copyright (c) 2005 Fabien POTENCIER
|
2
|
+
|
3
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
4
|
+
a copy of this software and associated documentation files (the
|
5
|
+
"Software"), to deal in the Software without restriction, including
|
6
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
7
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
8
|
+
permit persons to whom the Software is furnished to do so, subject to
|
9
|
+
the following conditions:
|
10
|
+
|
11
|
+
The above copyright notice and this permission notice shall be included
|
12
|
+
in all copies or substantial portions of the Software.
|
13
|
+
|
14
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY
|
15
|
+
KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
|
16
|
+
WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
17
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
18
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
19
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
20
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README
ADDED
@@ -0,0 +1,9 @@
|
|
1
|
+
This is stemmer4r, a Ruby extension that wraps the snowball stemmer library (libstemmer).
|
2
|
+
|
3
|
+
For more information on libstemmer itself please refer to its web page at:
|
4
|
+
|
5
|
+
http://snowball.tartarus.org/
|
6
|
+
|
7
|
+
Please direct any questions to the author,
|
8
|
+
|
9
|
+
Fabien POTENCIER <fabien.potencier@gmail.com>
|
data/ext/CVS/Entries
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
D/stemmer4r////
|
data/ext/CVS/Repository
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
stemmer4r/ext
|
data/ext/CVS/Root
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
:ext:fabpot@rubyforge.org:/var/cvs/stemmer4r
|
@@ -0,0 +1 @@
|
|
1
|
+
stemmer4r/ext/stemmer4r
|
@@ -0,0 +1 @@
|
|
1
|
+
:ext:fabpot@rubyforge.org:/var/cvs/stemmer4r
|
@@ -0,0 +1,14 @@
|
|
1
|
+
OBJS += libstemmer_c/libstemmer.o
|
2
|
+
|
3
|
+
stemmer4r.so: libstemmer_c/libstemmer.o
|
4
|
+
|
5
|
+
stemmer4r.o: stemmer4r.c /usr/local/ruby/lib/ruby/1.8/i686-linux/ruby.h \
|
6
|
+
/usr/local/ruby/lib/ruby/1.8/i686-linux/config.h \
|
7
|
+
/usr/local/ruby/lib/ruby/1.8/i686-linux/defines.h \
|
8
|
+
/usr/local/ruby/lib/ruby/1.8/i686-linux/missing.h \
|
9
|
+
/usr/local/ruby/lib/ruby/1.8/i686-linux/intern.h \
|
10
|
+
libstemmer_c/include/libstemmer.h
|
11
|
+
|
12
|
+
libstemmer_c/libstemmer.o:
|
13
|
+
@$(MAKE) -C libstemmer_c
|
14
|
+
|
@@ -0,0 +1 @@
|
|
1
|
+
stemmer4r/ext/stemmer4r/libstemmer_c
|
@@ -0,0 +1 @@
|
|
1
|
+
:ext:fabpot@rubyforge.org:/var/cvs/stemmer4r
|
@@ -0,0 +1,39 @@
|
|
1
|
+
src_c/stem_danish.c
|
2
|
+
src_c/stem_danish.h
|
3
|
+
src_c/stem_dutch.c
|
4
|
+
src_c/stem_dutch.h
|
5
|
+
src_c/stem_english.c
|
6
|
+
src_c/stem_english.h
|
7
|
+
src_c/stem_finnish.c
|
8
|
+
src_c/stem_finnish.h
|
9
|
+
src_c/stem_french.c
|
10
|
+
src_c/stem_french.h
|
11
|
+
src_c/stem_german.c
|
12
|
+
src_c/stem_german.h
|
13
|
+
src_c/stem_german2.c
|
14
|
+
src_c/stem_german2.h
|
15
|
+
src_c/stem_italian.c
|
16
|
+
src_c/stem_italian.h
|
17
|
+
src_c/stem_kraaij_pohlmann.c
|
18
|
+
src_c/stem_kraaij_pohlmann.h
|
19
|
+
src_c/stem_lovins.c
|
20
|
+
src_c/stem_lovins.h
|
21
|
+
src_c/stem_norwegian.c
|
22
|
+
src_c/stem_norwegian.h
|
23
|
+
src_c/stem_porter.c
|
24
|
+
src_c/stem_porter.h
|
25
|
+
src_c/stem_portuguese.c
|
26
|
+
src_c/stem_portuguese.h
|
27
|
+
src_c/stem_russian.c
|
28
|
+
src_c/stem_russian.h
|
29
|
+
src_c/stem_spanish.c
|
30
|
+
src_c/stem_spanish.h
|
31
|
+
src_c/stem_swedish.c
|
32
|
+
src_c/stem_swedish.h
|
33
|
+
runtime/api.c
|
34
|
+
runtime/api.h
|
35
|
+
runtime/header.h
|
36
|
+
runtime/utilities.c
|
37
|
+
libstemmer/libstemmer.c
|
38
|
+
libstemmer/modules.h
|
39
|
+
include/libstemmer.h
|
@@ -0,0 +1 @@
|
|
1
|
+
stemmer4r/ext/stemmer4r/libstemmer_c/include
|
@@ -0,0 +1 @@
|
|
1
|
+
:ext:fabpot@rubyforge.org:/var/cvs/stemmer4r
|
@@ -0,0 +1,63 @@
|
|
1
|
+
|
2
|
+
/* Make header file work when included from C++ */
|
3
|
+
#ifdef __cplusplus
|
4
|
+
extern "C" {
|
5
|
+
#endif
|
6
|
+
|
7
|
+
struct sb_stemmer;
|
8
|
+
typedef char sb_symbol;
|
9
|
+
|
10
|
+
/* FIXME - should be able to get a version number for each stemming
|
11
|
+
* algorithm (which will be incremented each time the output changes). */
|
12
|
+
|
13
|
+
/** Returns an array of the names of the available stemming algorithms.
|
14
|
+
* Note that these are the canonical names - aliases (i.e., other names for
|
15
|
+
* the same algorithm) will not be included in the list.
|
16
|
+
* The list is terminated with a null pointer.
|
17
|
+
*
|
18
|
+
* The list must not be modified in any way.
|
19
|
+
*/
|
20
|
+
const char ** sb_stemmer_list(void);
|
21
|
+
|
22
|
+
/** Create a new stemmer object, using the specified algorithm.
|
23
|
+
*
|
24
|
+
* @return If the specified algorithm is not recognised, 0 will be
|
25
|
+
* returned; otherwise a pointer to a newly created stemmer for that
|
26
|
+
* algorithm will be returned.
|
27
|
+
*
|
28
|
+
* @note NULL will also be returned if an out of memory error occurs.
|
29
|
+
*/
|
30
|
+
struct sb_stemmer * sb_stemmer_new(const char * algorithm);
|
31
|
+
|
32
|
+
/** Delete a stemmer object.
|
33
|
+
*
|
34
|
+
* This frees all resources allocated for the stemmer. After calling
|
35
|
+
* this function, the supplied stemmer may no longer be used in any way.
|
36
|
+
*
|
37
|
+
* It is safe to pass a null pointer to this function - this will have
|
38
|
+
* no effect.
|
39
|
+
*/
|
40
|
+
void sb_stemmer_delete(struct sb_stemmer * stemmer);
|
41
|
+
|
42
|
+
/** Stem a word.
|
43
|
+
*
|
44
|
+
* The return value is owned by the stemmer - it must not be freed or
|
45
|
+
* modified, and it will become invalid when the stemmer is called again,
|
46
|
+
* or if the stemmer is freed.
|
47
|
+
*
|
48
|
+
* The length of the return value can be obtained using sb_stemmer_length().
|
49
|
+
*
|
50
|
+
* If an out-of-memory error occurs, this will return NULL.
|
51
|
+
*/
|
52
|
+
const sb_symbol * sb_stemmer_stem(struct sb_stemmer * stemmer,
|
53
|
+
const sb_symbol * word, int size);
|
54
|
+
|
55
|
+
/** Get the length of the result of the last stemmed word.
|
56
|
+
* This should not be called before sb_stemmer_stem() has been called.
|
57
|
+
*/
|
58
|
+
int sb_stemmer_length(struct sb_stemmer * stemmer);
|
59
|
+
|
60
|
+
#ifdef __cplusplus
|
61
|
+
}
|
62
|
+
#endif
|
63
|
+
|
@@ -0,0 +1 @@
|
|
1
|
+
stemmer4r/ext/stemmer4r/libstemmer_c/libstemmer
|
@@ -0,0 +1 @@
|
|
1
|
+
:ext:fabpot@rubyforge.org:/var/cvs/stemmer4r
|
@@ -0,0 +1,78 @@
|
|
1
|
+
|
2
|
+
#include <stdlib.h>
|
3
|
+
#include <string.h>
|
4
|
+
#include "../include/libstemmer.h"
|
5
|
+
#include "../runtime/api.h"
|
6
|
+
#include "modules.h"
|
7
|
+
|
8
|
+
struct sb_stemmer {
|
9
|
+
struct SN_env * (*create)(void);
|
10
|
+
void (*close)(struct SN_env *);
|
11
|
+
int (*stem)(struct SN_env *);
|
12
|
+
|
13
|
+
struct SN_env * env;
|
14
|
+
};
|
15
|
+
|
16
|
+
extern const char **
|
17
|
+
sb_stemmer_list(void)
|
18
|
+
{
|
19
|
+
return algorithm_names;
|
20
|
+
}
|
21
|
+
|
22
|
+
extern struct sb_stemmer *
|
23
|
+
sb_stemmer_new(const char * algorithm)
|
24
|
+
{
|
25
|
+
struct stemmer_modules * module;
|
26
|
+
struct sb_stemmer * stemmer =
|
27
|
+
(struct sb_stemmer *) malloc(sizeof(struct sb_stemmer));
|
28
|
+
if (stemmer == 0) return NULL;
|
29
|
+
|
30
|
+
for (module = modules; module->name != 0; module++) {
|
31
|
+
if (strcmp(module->name, algorithm) == 0) break;
|
32
|
+
}
|
33
|
+
if (module->name == 0) return NULL;
|
34
|
+
|
35
|
+
stemmer->create = module->create;
|
36
|
+
stemmer->close = module->close;
|
37
|
+
stemmer->stem = module->stem;
|
38
|
+
|
39
|
+
stemmer->env = stemmer->create();
|
40
|
+
if (stemmer->env == NULL)
|
41
|
+
{
|
42
|
+
sb_stemmer_delete(stemmer);
|
43
|
+
return NULL;
|
44
|
+
}
|
45
|
+
|
46
|
+
return stemmer;
|
47
|
+
}
|
48
|
+
|
49
|
+
void
|
50
|
+
sb_stemmer_delete(struct sb_stemmer * stemmer)
|
51
|
+
{
|
52
|
+
if (stemmer == 0) return;
|
53
|
+
if (stemmer->close == 0) return;
|
54
|
+
stemmer->close(stemmer->env);
|
55
|
+
stemmer->close = 0;
|
56
|
+
free(stemmer);
|
57
|
+
}
|
58
|
+
|
59
|
+
const sb_symbol *
|
60
|
+
sb_stemmer_stem(struct sb_stemmer * stemmer, const sb_symbol * word, int size)
|
61
|
+
{
|
62
|
+
int ret;
|
63
|
+
if (SN_set_current(stemmer->env, size, word))
|
64
|
+
{
|
65
|
+
stemmer->env->l = 0;
|
66
|
+
return NULL;
|
67
|
+
}
|
68
|
+
ret = stemmer->stem(stemmer->env);
|
69
|
+
if (ret < 0) return NULL;
|
70
|
+
stemmer->env->p[stemmer->env->l] = 0;
|
71
|
+
return stemmer->env->p;
|
72
|
+
}
|
73
|
+
|
74
|
+
int
|
75
|
+
sb_stemmer_length(struct sb_stemmer * stemmer)
|
76
|
+
{
|
77
|
+
return stemmer->env->l;
|
78
|
+
}
|
@@ -0,0 +1,96 @@
|
|
1
|
+
/* libstemmer/modules.h: List of stemming modules.
|
2
|
+
*
|
3
|
+
* This file is generated by mkmodules.pl from a list of module names.
|
4
|
+
* Do not edit manually.
|
5
|
+
*
|
6
|
+
* Modules included by this file are: danish, dutch, english, finnish, french,
|
7
|
+
* german, german2, italian, kraaij_pohlmann, lovins, norwegian, porter,
|
8
|
+
* portuguese, russian, spanish, swedish
|
9
|
+
*/
|
10
|
+
|
11
|
+
#include "../src_c/stem_danish.h"
|
12
|
+
#include "../src_c/stem_dutch.h"
|
13
|
+
#include "../src_c/stem_english.h"
|
14
|
+
#include "../src_c/stem_finnish.h"
|
15
|
+
#include "../src_c/stem_french.h"
|
16
|
+
#include "../src_c/stem_german.h"
|
17
|
+
#include "../src_c/stem_german2.h"
|
18
|
+
#include "../src_c/stem_italian.h"
|
19
|
+
#include "../src_c/stem_kraaij_pohlmann.h"
|
20
|
+
#include "../src_c/stem_lovins.h"
|
21
|
+
#include "../src_c/stem_norwegian.h"
|
22
|
+
#include "../src_c/stem_porter.h"
|
23
|
+
#include "../src_c/stem_portuguese.h"
|
24
|
+
#include "../src_c/stem_russian.h"
|
25
|
+
#include "../src_c/stem_spanish.h"
|
26
|
+
#include "../src_c/stem_swedish.h"
|
27
|
+
|
28
|
+
struct stemmer_modules {
|
29
|
+
const char * name;
|
30
|
+
struct SN_env * (*create)(void);
|
31
|
+
void (*close)(struct SN_env *);
|
32
|
+
int (*stem)(struct SN_env *);
|
33
|
+
};
|
34
|
+
static struct stemmer_modules modules[] = {
|
35
|
+
{"da", danish_create_env, danish_close_env, danish_stem},
|
36
|
+
{"dan", danish_create_env, danish_close_env, danish_stem},
|
37
|
+
{"danish", danish_create_env, danish_close_env, danish_stem},
|
38
|
+
{"de", german_create_env, german_close_env, german_stem},
|
39
|
+
{"deu", german_create_env, german_close_env, german_stem},
|
40
|
+
{"dut", dutch_create_env, dutch_close_env, dutch_stem},
|
41
|
+
{"dutch", dutch_create_env, dutch_close_env, dutch_stem},
|
42
|
+
{"en", english_create_env, english_close_env, english_stem},
|
43
|
+
{"eng", english_create_env, english_close_env, english_stem},
|
44
|
+
{"english", english_create_env, english_close_env, english_stem},
|
45
|
+
{"fi", finnish_create_env, finnish_close_env, finnish_stem},
|
46
|
+
{"fin", finnish_create_env, finnish_close_env, finnish_stem},
|
47
|
+
{"finnish", finnish_create_env, finnish_close_env, finnish_stem},
|
48
|
+
{"fr", french_create_env, french_close_env, french_stem},
|
49
|
+
{"fra", french_create_env, french_close_env, french_stem},
|
50
|
+
{"fre", french_create_env, french_close_env, french_stem},
|
51
|
+
{"french", french_create_env, french_close_env, french_stem},
|
52
|
+
{"ger", german_create_env, german_close_env, german_stem},
|
53
|
+
{"german", german_create_env, german_close_env, german_stem},
|
54
|
+
{"german2", german2_create_env, german2_close_env, german2_stem},
|
55
|
+
{"it", italian_create_env, italian_close_env, italian_stem},
|
56
|
+
{"ita", italian_create_env, italian_close_env, italian_stem},
|
57
|
+
{"italian", italian_create_env, italian_close_env, italian_stem},
|
58
|
+
{"kraaij_pohlmann", kraaij_pohlmann_create_env, kraaij_pohlmann_close_env, kraaij_pohlmann_stem},
|
59
|
+
{"lovins", lovins_create_env, lovins_close_env, lovins_stem},
|
60
|
+
{"nl", dutch_create_env, dutch_close_env, dutch_stem},
|
61
|
+
{"nld", dutch_create_env, dutch_close_env, dutch_stem},
|
62
|
+
{"no", norwegian_create_env, norwegian_close_env, norwegian_stem},
|
63
|
+
{"nor", norwegian_create_env, norwegian_close_env, norwegian_stem},
|
64
|
+
{"norwegian", norwegian_create_env, norwegian_close_env, norwegian_stem},
|
65
|
+
{"por", portuguese_create_env, portuguese_close_env, portuguese_stem},
|
66
|
+
{"porter", porter_create_env, porter_close_env, porter_stem},
|
67
|
+
{"portuguese", portuguese_create_env, portuguese_close_env, portuguese_stem},
|
68
|
+
{"pt", portuguese_create_env, portuguese_close_env, portuguese_stem},
|
69
|
+
{"ru", russian_create_env, russian_close_env, russian_stem},
|
70
|
+
{"rus", russian_create_env, russian_close_env, russian_stem},
|
71
|
+
{"russian", russian_create_env, russian_close_env, russian_stem},
|
72
|
+
{"spanish", spanish_create_env, spanish_close_env, spanish_stem},
|
73
|
+
{"sv", swedish_create_env, swedish_close_env, swedish_stem},
|
74
|
+
{"swe", swedish_create_env, swedish_close_env, swedish_stem},
|
75
|
+
{"swedish", swedish_create_env, swedish_close_env, swedish_stem},
|
76
|
+
{0,0,0,0}
|
77
|
+
};
|
78
|
+
static const char * algorithm_names[] = {
|
79
|
+
"danish",
|
80
|
+
"dutch",
|
81
|
+
"english",
|
82
|
+
"finnish",
|
83
|
+
"french",
|
84
|
+
"german",
|
85
|
+
"german2",
|
86
|
+
"italian",
|
87
|
+
"kraaij_pohlmann",
|
88
|
+
"lovins",
|
89
|
+
"norwegian",
|
90
|
+
"porter",
|
91
|
+
"portuguese",
|
92
|
+
"russian",
|
93
|
+
"spanish",
|
94
|
+
"swedish",
|
95
|
+
0
|
96
|
+
};
|
@@ -0,0 +1,42 @@
|
|
1
|
+
snowball_sources= \
|
2
|
+
libstemmer/libstemmer.c \
|
3
|
+
runtime/api.c \
|
4
|
+
runtime/utilities.c \
|
5
|
+
src_c/stem_danish.c \
|
6
|
+
src_c/stem_dutch.c \
|
7
|
+
src_c/stem_english.c \
|
8
|
+
src_c/stem_finnish.c \
|
9
|
+
src_c/stem_french.c \
|
10
|
+
src_c/stem_german.c \
|
11
|
+
src_c/stem_german2.c \
|
12
|
+
src_c/stem_italian.c \
|
13
|
+
src_c/stem_kraaij_pohlmann.c \
|
14
|
+
src_c/stem_lovins.c \
|
15
|
+
src_c/stem_norwegian.c \
|
16
|
+
src_c/stem_porter.c \
|
17
|
+
src_c/stem_portuguese.c \
|
18
|
+
src_c/stem_russian.c \
|
19
|
+
src_c/stem_spanish.c \
|
20
|
+
src_c/stem_swedish.c \
|
21
|
+
|
22
|
+
snowball_headers= \
|
23
|
+
include/libstemmer.h \
|
24
|
+
libstemmer/modules.h \
|
25
|
+
runtime/api.h \
|
26
|
+
runtime/header.h \
|
27
|
+
src_c/stem_danish.h \
|
28
|
+
src_c/stem_dutch.h \
|
29
|
+
src_c/stem_english.h \
|
30
|
+
src_c/stem_finnish.h \
|
31
|
+
src_c/stem_french.h \
|
32
|
+
src_c/stem_german.h \
|
33
|
+
src_c/stem_german2.h \
|
34
|
+
src_c/stem_italian.h \
|
35
|
+
src_c/stem_kraaij_pohlmann.h \
|
36
|
+
src_c/stem_lovins.h \
|
37
|
+
src_c/stem_norwegian.h \
|
38
|
+
src_c/stem_porter.h \
|
39
|
+
src_c/stem_portuguese.h \
|
40
|
+
src_c/stem_russian.h \
|
41
|
+
src_c/stem_spanish.h \
|
42
|
+
src_c/stem_swedish.h \
|
@@ -0,0 +1 @@
|
|
1
|
+
stemmer4r/ext/stemmer4r/libstemmer_c/runtime
|
@@ -0,0 +1 @@
|
|
1
|
+
:ext:fabpot@rubyforge.org:/var/cvs/stemmer4r
|
@@ -0,0 +1,69 @@
|
|
1
|
+
|
2
|
+
#include <stdlib.h> /* for calloc, free */
|
3
|
+
#include "header.h"
|
4
|
+
|
5
|
+
extern struct SN_env * SN_create_env(int S_size, int I_size, int B_size)
|
6
|
+
{
|
7
|
+
struct SN_env * z = (struct SN_env *) calloc(1, sizeof(struct SN_env));
|
8
|
+
if (z == NULL) return NULL;
|
9
|
+
z->p = create_s();
|
10
|
+
if (z->p == NULL) goto error;
|
11
|
+
if (S_size)
|
12
|
+
{
|
13
|
+
int i;
|
14
|
+
z->S = (symbol * *) calloc(S_size, sizeof(symbol *));
|
15
|
+
if (z->S == NULL) goto error;
|
16
|
+
|
17
|
+
for (i = 0; i < S_size; i++)
|
18
|
+
{
|
19
|
+
z->S[i] = create_s();
|
20
|
+
if (z->S[i] == NULL) goto error;
|
21
|
+
}
|
22
|
+
z->S_size = S_size;
|
23
|
+
}
|
24
|
+
|
25
|
+
if (I_size)
|
26
|
+
{
|
27
|
+
z->I = (int *) calloc(I_size, sizeof(int));
|
28
|
+
if (z->I == NULL) goto error;
|
29
|
+
z->I_size = I_size;
|
30
|
+
}
|
31
|
+
|
32
|
+
if (B_size)
|
33
|
+
{
|
34
|
+
z->B = (symbol *) calloc(B_size, sizeof(symbol));
|
35
|
+
if (z->B == NULL) goto error;
|
36
|
+
z->B_size = B_size;
|
37
|
+
}
|
38
|
+
|
39
|
+
return z;
|
40
|
+
error:
|
41
|
+
SN_close_env(z);
|
42
|
+
return NULL;
|
43
|
+
}
|
44
|
+
|
45
|
+
extern void SN_close_env(struct SN_env * z)
|
46
|
+
{
|
47
|
+
if (z == NULL) return;
|
48
|
+
if (z->S_size)
|
49
|
+
{
|
50
|
+
int i;
|
51
|
+
for (i = 0; i < z->S_size; i++)
|
52
|
+
{
|
53
|
+
lose_s(z->S[i]);
|
54
|
+
}
|
55
|
+
free(z->S);
|
56
|
+
}
|
57
|
+
if (z->I_size) free(z->I);
|
58
|
+
if (z->B_size) free(z->B);
|
59
|
+
if (z->p) lose_s(z->p);
|
60
|
+
free(z);
|
61
|
+
}
|
62
|
+
|
63
|
+
extern int SN_set_current(struct SN_env * z, int size, const symbol * s)
|
64
|
+
{
|
65
|
+
int err = replace_s(z, 0, z->l, size, s, NULL);
|
66
|
+
z->c = 0;
|
67
|
+
return err;
|
68
|
+
}
|
69
|
+
|
@@ -0,0 +1,27 @@
|
|
1
|
+
|
2
|
+
typedef unsigned char symbol;
|
3
|
+
|
4
|
+
/* Or replace 'char' above with 'short' for 16 bit characters.
|
5
|
+
|
6
|
+
More precisely, replace 'char' with whatever type guarantees the
|
7
|
+
character width you need. Note however that sizeof(symbol) should divide
|
8
|
+
HEAD, defined in header.h as 2*sizeof(int), without remainder, otherwise
|
9
|
+
there is an alignment problem. In the unlikely event of a problem here,
|
10
|
+
consult Martin Porter.
|
11
|
+
|
12
|
+
*/
|
13
|
+
|
14
|
+
struct SN_env {
|
15
|
+
symbol * p;
|
16
|
+
int c; int a; int l; int lb; int bra; int ket;
|
17
|
+
int S_size; int I_size; int B_size;
|
18
|
+
symbol * * S;
|
19
|
+
int * I;
|
20
|
+
symbol * B;
|
21
|
+
};
|
22
|
+
|
23
|
+
extern struct SN_env * SN_create_env(int S_size, int I_size, int B_size);
|
24
|
+
extern void SN_close_env(struct SN_env * z);
|
25
|
+
|
26
|
+
extern int SN_set_current(struct SN_env * z, int size, const symbol * s);
|
27
|
+
|
@@ -0,0 +1,56 @@
|
|
1
|
+
|
2
|
+
#include <limits.h>
|
3
|
+
|
4
|
+
#include "api.h"
|
5
|
+
|
6
|
+
#define MAXINT INT_MAX
|
7
|
+
#define MININT INT_MIN
|
8
|
+
|
9
|
+
#define HEAD 2*sizeof(int)
|
10
|
+
|
11
|
+
#define SIZE(p) ((int *)(p))[-1]
|
12
|
+
#define SET_SIZE(p, n) ((int *)(p))[-1] = n
|
13
|
+
#define CAPACITY(p) ((int *)(p))[-2]
|
14
|
+
|
15
|
+
struct among
|
16
|
+
{ int s_size; /* number of chars in string */
|
17
|
+
symbol * s; /* search string */
|
18
|
+
int substring_i;/* index to longest matching substring */
|
19
|
+
int result; /* result of the lookup */
|
20
|
+
int (* function)(struct SN_env *);
|
21
|
+
};
|
22
|
+
|
23
|
+
extern symbol * create_s(void);
|
24
|
+
extern void lose_s(symbol * p);
|
25
|
+
|
26
|
+
extern int in_grouping(struct SN_env * z, unsigned char * s, int min, int max);
|
27
|
+
extern int in_grouping_b(struct SN_env * z, unsigned char * s, int min, int max);
|
28
|
+
extern int out_grouping(struct SN_env * z, unsigned char * s, int min, int max);
|
29
|
+
extern int out_grouping_b(struct SN_env * z, unsigned char * s, int min, int max);
|
30
|
+
|
31
|
+
extern int in_range(struct SN_env * z, int min, int max);
|
32
|
+
extern int in_range_b(struct SN_env * z, int min, int max);
|
33
|
+
extern int out_range(struct SN_env * z, int min, int max);
|
34
|
+
extern int out_range_b(struct SN_env * z, int min, int max);
|
35
|
+
|
36
|
+
extern int eq_s(struct SN_env * z, int s_size, symbol * s);
|
37
|
+
extern int eq_s_b(struct SN_env * z, int s_size, symbol * s);
|
38
|
+
extern int eq_v(struct SN_env * z, symbol * p);
|
39
|
+
extern int eq_v_b(struct SN_env * z, symbol * p);
|
40
|
+
|
41
|
+
extern int find_among(struct SN_env * z, struct among * v, int v_size);
|
42
|
+
extern int find_among_b(struct SN_env * z, struct among * v, int v_size);
|
43
|
+
|
44
|
+
extern int replace_s(struct SN_env * z, int c_bra, int c_ket, int s_size, const symbol * s, int * adjustment);
|
45
|
+
extern int slice_from_s(struct SN_env * z, int s_size, symbol * s);
|
46
|
+
extern int slice_from_v(struct SN_env * z, symbol * p);
|
47
|
+
extern int slice_del(struct SN_env * z);
|
48
|
+
|
49
|
+
extern int insert_s(struct SN_env * z, int bra, int ket, int s_size, symbol * s);
|
50
|
+
extern int insert_v(struct SN_env * z, int bra, int ket, symbol * p);
|
51
|
+
|
52
|
+
extern symbol * slice_to(struct SN_env * z, symbol * p);
|
53
|
+
extern symbol * assign_to(struct SN_env * z, symbol * p);
|
54
|
+
|
55
|
+
extern void debug(struct SN_env * z, int number, int line_count);
|
56
|
+
|