stemmer4r 0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/CVS/Entries +5 -0
- data/CVS/Repository +1 -0
- data/CVS/Root +1 -0
- data/LICENSE +20 -0
- data/README +9 -0
- data/ext/CVS/Entries +1 -0
- data/ext/CVS/Repository +1 -0
- data/ext/CVS/Root +1 -0
- data/ext/stemmer4r/CVS/Entries +4 -0
- data/ext/stemmer4r/CVS/Repository +1 -0
- data/ext/stemmer4r/CVS/Root +1 -0
- data/ext/stemmer4r/depend +14 -0
- data/ext/stemmer4r/extconf.rb +8 -0
- data/ext/stemmer4r/libstemmer_c/CVS/Entries +7 -0
- data/ext/stemmer4r/libstemmer_c/CVS/Repository +1 -0
- data/ext/stemmer4r/libstemmer_c/CVS/Root +1 -0
- data/ext/stemmer4r/libstemmer_c/MANIFEST +39 -0
- data/ext/stemmer4r/libstemmer_c/Makefile +5 -0
- data/ext/stemmer4r/libstemmer_c/include/CVS/Entries +2 -0
- data/ext/stemmer4r/libstemmer_c/include/CVS/Repository +1 -0
- data/ext/stemmer4r/libstemmer_c/include/CVS/Root +1 -0
- data/ext/stemmer4r/libstemmer_c/include/libstemmer.h +63 -0
- data/ext/stemmer4r/libstemmer_c/libstemmer/CVS/Entries +3 -0
- data/ext/stemmer4r/libstemmer_c/libstemmer/CVS/Repository +1 -0
- data/ext/stemmer4r/libstemmer_c/libstemmer/CVS/Root +1 -0
- data/ext/stemmer4r/libstemmer_c/libstemmer/libstemmer.c +78 -0
- data/ext/stemmer4r/libstemmer_c/libstemmer/modules.h +96 -0
- data/ext/stemmer4r/libstemmer_c/mkinc.mak +42 -0
- data/ext/stemmer4r/libstemmer_c/runtime/CVS/Entries +5 -0
- data/ext/stemmer4r/libstemmer_c/runtime/CVS/Repository +1 -0
- data/ext/stemmer4r/libstemmer_c/runtime/CVS/Root +1 -0
- data/ext/stemmer4r/libstemmer_c/runtime/api.c +69 -0
- data/ext/stemmer4r/libstemmer_c/runtime/api.h +27 -0
- data/ext/stemmer4r/libstemmer_c/runtime/header.h +56 -0
- data/ext/stemmer4r/libstemmer_c/runtime/utilities.c +403 -0
- data/ext/stemmer4r/libstemmer_c/src_c/CVS/Entries +33 -0
- data/ext/stemmer4r/libstemmer_c/src_c/CVS/Repository +1 -0
- data/ext/stemmer4r/libstemmer_c/src_c/CVS/Root +1 -0
- data/ext/stemmer4r/libstemmer_c/src_c/stem_danish.c +330 -0
- data/ext/stemmer4r/libstemmer_c/src_c/stem_danish.h +16 -0
- data/ext/stemmer4r/libstemmer_c/src_c/stem_dutch.c +635 -0
- data/ext/stemmer4r/libstemmer_c/src_c/stem_dutch.h +16 -0
- data/ext/stemmer4r/libstemmer_c/src_c/stem_english.c +1109 -0
- data/ext/stemmer4r/libstemmer_c/src_c/stem_english.h +16 -0
- data/ext/stemmer4r/libstemmer_c/src_c/stem_finnish.c +792 -0
- data/ext/stemmer4r/libstemmer_c/src_c/stem_finnish.h +16 -0
- data/ext/stemmer4r/libstemmer_c/src_c/stem_french.c +1276 -0
- data/ext/stemmer4r/libstemmer_c/src_c/stem_french.h +16 -0
- data/ext/stemmer4r/libstemmer_c/src_c/stem_german.c +504 -0
- data/ext/stemmer4r/libstemmer_c/src_c/stem_german.h +16 -0
- data/ext/stemmer4r/libstemmer_c/src_c/stem_german2.c +549 -0
- data/ext/stemmer4r/libstemmer_c/src_c/stem_german2.h +16 -0
- data/ext/stemmer4r/libstemmer_c/src_c/stem_italian.c +1087 -0
- data/ext/stemmer4r/libstemmer_c/src_c/stem_italian.h +16 -0
- data/ext/stemmer4r/libstemmer_c/src_c/stem_kraaij_pohlmann.c +1780 -0
- data/ext/stemmer4r/libstemmer_c/src_c/stem_kraaij_pohlmann.h +16 -0
- data/ext/stemmer4r/libstemmer_c/src_c/stem_lovins.c +1752 -0
- data/ext/stemmer4r/libstemmer_c/src_c/stem_lovins.h +16 -0
- data/ext/stemmer4r/libstemmer_c/src_c/stem_norwegian.c +279 -0
- data/ext/stemmer4r/libstemmer_c/src_c/stem_norwegian.h +16 -0
- data/ext/stemmer4r/libstemmer_c/src_c/stem_porter.c +776 -0
- data/ext/stemmer4r/libstemmer_c/src_c/stem_porter.h +16 -0
- data/ext/stemmer4r/libstemmer_c/src_c/stem_portuguese.c +1027 -0
- data/ext/stemmer4r/libstemmer_c/src_c/stem_portuguese.h +16 -0
- data/ext/stemmer4r/libstemmer_c/src_c/stem_russian.c +701 -0
- data/ext/stemmer4r/libstemmer_c/src_c/stem_russian.h +16 -0
- data/ext/stemmer4r/libstemmer_c/src_c/stem_spanish.c +1109 -0
- data/ext/stemmer4r/libstemmer_c/src_c/stem_spanish.h +16 -0
- data/ext/stemmer4r/libstemmer_c/src_c/stem_swedish.c +299 -0
- data/ext/stemmer4r/libstemmer_c/src_c/stem_swedish.h +16 -0
- data/ext/stemmer4r/stemmer4r.c +146 -0
- data/stemmer4r.gemspec +23 -0
- data/test/CVS/Entries +2 -0
- data/test/CVS/Repository +1 -0
- data/test/CVS/Root +1 -0
- data/test/test.rb +31 -0
- data/test/tests/CVS/Entries +12 -0
- data/test/tests/CVS/Repository +1 -0
- data/test/tests/CVS/Root +1 -0
- data/test/tests/da/CVS/Entries +3 -0
- data/test/tests/da/CVS/Repository +1 -0
- data/test/tests/da/CVS/Root +1 -0
- data/test/tests/da/output.txt +23829 -0
- data/test/tests/da/voc.txt +23829 -0
- data/test/tests/de/CVS/Entries +3 -0
- data/test/tests/de/CVS/Repository +1 -0
- data/test/tests/de/CVS/Root +1 -0
- data/test/tests/de/output.txt +35033 -0
- data/test/tests/de/voc.txt +35033 -0
- data/test/tests/en/CVS/Entries +3 -0
- data/test/tests/en/CVS/Repository +1 -0
- data/test/tests/en/CVS/Root +1 -0
- data/test/tests/en/output.txt +29400 -0
- data/test/tests/en/voc.txt +29400 -0
- data/test/tests/es/CVS/Entries +3 -0
- data/test/tests/es/CVS/Repository +1 -0
- data/test/tests/es/CVS/Root +1 -0
- data/test/tests/es/output.txt +28390 -0
- data/test/tests/es/voc.txt +28390 -0
- data/test/tests/fi/CVS/Entries +3 -0
- data/test/tests/fi/CVS/Repository +1 -0
- data/test/tests/fi/CVS/Root +1 -0
- data/test/tests/fi/output.txt +50000 -0
- data/test/tests/fi/voc.txt +50000 -0
- data/test/tests/fr/CVS/Entries +3 -0
- data/test/tests/fr/CVS/Repository +1 -0
- data/test/tests/fr/CVS/Root +1 -0
- data/test/tests/fr/output.txt +20403 -0
- data/test/tests/fr/voc.txt +20403 -0
- data/test/tests/it/CVS/Entries +3 -0
- data/test/tests/it/CVS/Repository +1 -0
- data/test/tests/it/CVS/Root +1 -0
- data/test/tests/it/output.txt +35494 -0
- data/test/tests/it/voc.txt +35494 -0
- data/test/tests/nl/CVS/Entries +3 -0
- data/test/tests/nl/CVS/Repository +1 -0
- data/test/tests/nl/CVS/Root +1 -0
- data/test/tests/nl/output.txt +45669 -0
- data/test/tests/nl/voc.txt +45669 -0
- data/test/tests/no/CVS/Entries +3 -0
- data/test/tests/no/CVS/Repository +1 -0
- data/test/tests/no/CVS/Root +1 -0
- data/test/tests/no/output.txt +20628 -0
- data/test/tests/no/voc.txt +20628 -0
- data/test/tests/pt/CVS/Entries +3 -0
- data/test/tests/pt/CVS/Repository +1 -0
- data/test/tests/pt/CVS/Root +1 -0
- data/test/tests/pt/output.txt +32016 -0
- data/test/tests/pt/voc.txt +32016 -0
- data/test/tests/ru/CVS/Entries +3 -0
- data/test/tests/ru/CVS/Repository +1 -0
- data/test/tests/ru/CVS/Root +1 -0
- data/test/tests/ru/output.txt +49673 -0
- data/test/tests/ru/voc.txt +49673 -0
- data/test/tests/sv/CVS/Entries +3 -0
- data/test/tests/sv/CVS/Repository +1 -0
- data/test/tests/sv/CVS/Root +1 -0
- data/test/tests/sv/output.txt +30623 -0
- data/test/tests/sv/voc.txt +30623 -0
- metadata +221 -0
data/CVS/Entries
ADDED
data/CVS/Repository
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
stemmer4r
|
data/CVS/Root
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
:ext:fabpot@rubyforge.org:/var/cvs/stemmer4r
|
data/LICENSE
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
Copyright (c) 2005 Fabien POTENCIER
|
|
2
|
+
|
|
3
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
|
4
|
+
a copy of this software and associated documentation files (the
|
|
5
|
+
"Software"), to deal in the Software without restriction, including
|
|
6
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
|
7
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
|
8
|
+
permit persons to whom the Software is furnished to do so, subject to
|
|
9
|
+
the following conditions:
|
|
10
|
+
|
|
11
|
+
The above copyright notice and this permission notice shall be included
|
|
12
|
+
in all copies or substantial portions of the Software.
|
|
13
|
+
|
|
14
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY
|
|
15
|
+
KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
|
|
16
|
+
WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
|
17
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
|
18
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
|
19
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
|
20
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
1
|
+
This is stemmer4r, a Ruby extension that wraps the snowball stemmer library (libstemmer).
|
|
2
|
+
|
|
3
|
+
For more information on libstemmer itself, please refer to its web page at:
|
|
4
|
+
|
|
5
|
+
http://snowball.tartarus.org/
|
|
6
|
+
|
|
7
|
+
Please direct any questions to the author,
|
|
8
|
+
|
|
9
|
+
Fabien POTENCIER <fabien.potencier@gmail.com>
|
data/ext/CVS/Entries
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
D/stemmer4r////
|
data/ext/CVS/Repository
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
stemmer4r/ext
|
data/ext/CVS/Root
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
:ext:fabpot@rubyforge.org:/var/cvs/stemmer4r
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
stemmer4r/ext/stemmer4r
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
:ext:fabpot@rubyforge.org:/var/cvs/stemmer4r
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
OBJS += libstemmer_c/libstemmer.o
|
|
2
|
+
|
|
3
|
+
stemmer4r.so: libstemmer_c/libstemmer.o
|
|
4
|
+
|
|
5
|
+
stemmer4r.o: stemmer4r.c /usr/local/ruby/lib/ruby/1.8/i686-linux/ruby.h \
|
|
6
|
+
/usr/local/ruby/lib/ruby/1.8/i686-linux/config.h \
|
|
7
|
+
/usr/local/ruby/lib/ruby/1.8/i686-linux/defines.h \
|
|
8
|
+
/usr/local/ruby/lib/ruby/1.8/i686-linux/missing.h \
|
|
9
|
+
/usr/local/ruby/lib/ruby/1.8/i686-linux/intern.h \
|
|
10
|
+
libstemmer_c/include/libstemmer.h
|
|
11
|
+
|
|
12
|
+
libstemmer_c/libstemmer.o:
|
|
13
|
+
@$(MAKE) -C libstemmer_c
|
|
14
|
+
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
stemmer4r/ext/stemmer4r/libstemmer_c
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
:ext:fabpot@rubyforge.org:/var/cvs/stemmer4r
|
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
src_c/stem_danish.c
|
|
2
|
+
src_c/stem_danish.h
|
|
3
|
+
src_c/stem_dutch.c
|
|
4
|
+
src_c/stem_dutch.h
|
|
5
|
+
src_c/stem_english.c
|
|
6
|
+
src_c/stem_english.h
|
|
7
|
+
src_c/stem_finnish.c
|
|
8
|
+
src_c/stem_finnish.h
|
|
9
|
+
src_c/stem_french.c
|
|
10
|
+
src_c/stem_french.h
|
|
11
|
+
src_c/stem_german.c
|
|
12
|
+
src_c/stem_german.h
|
|
13
|
+
src_c/stem_german2.c
|
|
14
|
+
src_c/stem_german2.h
|
|
15
|
+
src_c/stem_italian.c
|
|
16
|
+
src_c/stem_italian.h
|
|
17
|
+
src_c/stem_kraaij_pohlmann.c
|
|
18
|
+
src_c/stem_kraaij_pohlmann.h
|
|
19
|
+
src_c/stem_lovins.c
|
|
20
|
+
src_c/stem_lovins.h
|
|
21
|
+
src_c/stem_norwegian.c
|
|
22
|
+
src_c/stem_norwegian.h
|
|
23
|
+
src_c/stem_porter.c
|
|
24
|
+
src_c/stem_porter.h
|
|
25
|
+
src_c/stem_portuguese.c
|
|
26
|
+
src_c/stem_portuguese.h
|
|
27
|
+
src_c/stem_russian.c
|
|
28
|
+
src_c/stem_russian.h
|
|
29
|
+
src_c/stem_spanish.c
|
|
30
|
+
src_c/stem_spanish.h
|
|
31
|
+
src_c/stem_swedish.c
|
|
32
|
+
src_c/stem_swedish.h
|
|
33
|
+
runtime/api.c
|
|
34
|
+
runtime/api.h
|
|
35
|
+
runtime/header.h
|
|
36
|
+
runtime/utilities.c
|
|
37
|
+
libstemmer/libstemmer.c
|
|
38
|
+
libstemmer/modules.h
|
|
39
|
+
include/libstemmer.h
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
stemmer4r/ext/stemmer4r/libstemmer_c/include
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
:ext:fabpot@rubyforge.org:/var/cvs/stemmer4r
|
|
@@ -0,0 +1,63 @@
|
|
|
1
|
+
|
|
2
|
+
/* Make header file work when included from C++ */
|
|
3
|
+
#ifdef __cplusplus
|
|
4
|
+
extern "C" {
|
|
5
|
+
#endif
|
|
6
|
+
|
|
7
|
+
struct sb_stemmer;
|
|
8
|
+
typedef char sb_symbol;
|
|
9
|
+
|
|
10
|
+
/* FIXME - should be able to get a version number for each stemming
|
|
11
|
+
* algorithm (which will be incremented each time the output changes). */
|
|
12
|
+
|
|
13
|
+
/** Returns an array of the names of the available stemming algorithms.
|
|
14
|
+
* Note that these are the canonical names - aliases (ie, other names for
|
|
15
|
+
* the same algorithm) will not be included in the list.
|
|
16
|
+
* The list is terminated with a null pointer.
|
|
17
|
+
*
|
|
18
|
+
* The list must not be modified in any way.
|
|
19
|
+
*/
|
|
20
|
+
const char ** sb_stemmer_list(void);
|
|
21
|
+
|
|
22
|
+
/** Create a new stemmer object, using the specified algorithm.
|
|
23
|
+
*
|
|
24
|
+
* @return If the specified algorithm is not recognised, 0 will be
|
|
25
|
+
* returned; otherwise a pointer to a newly created stemmer for that
|
|
26
|
+
* algorithm will be returned.
|
|
27
|
+
*
|
|
28
|
+
* @note NULL will also be returned if an out of memory error occurs.
|
|
29
|
+
*/
|
|
30
|
+
struct sb_stemmer * sb_stemmer_new(const char * algorithm);
|
|
31
|
+
|
|
32
|
+
/** Delete a stemmer object.
|
|
33
|
+
*
|
|
34
|
+
* This frees all resources allocated for the stemmer. After calling
|
|
35
|
+
* this function, the supplied stemmer may no longer be used in any way.
|
|
36
|
+
*
|
|
37
|
+
* It is safe to pass a null pointer to this function - this will have
|
|
38
|
+
* no effect.
|
|
39
|
+
*/
|
|
40
|
+
void sb_stemmer_delete(struct sb_stemmer * stemmer);
|
|
41
|
+
|
|
42
|
+
/** Stem a word.
|
|
43
|
+
*
|
|
44
|
+
* The return value is owned by the stemmer - it must not be freed or
|
|
45
|
+
* modified, and it will become invalid when the stemmer is called again,
|
|
46
|
+
* or if the stemmer is freed.
|
|
47
|
+
*
|
|
48
|
+
* The length of the return value can be obtained using sb_stemmer_length().
|
|
49
|
+
*
|
|
50
|
+
* If an out-of-memory error occurs, this will return NULL.
|
|
51
|
+
*/
|
|
52
|
+
const sb_symbol * sb_stemmer_stem(struct sb_stemmer * stemmer,
|
|
53
|
+
const sb_symbol * word, int size);
|
|
54
|
+
|
|
55
|
+
/** Get the length of the result of the last stemmed word.
|
|
56
|
+
* This should not be called before sb_stemmer_stem() has been called.
|
|
57
|
+
*/
|
|
58
|
+
int sb_stemmer_length(struct sb_stemmer * stemmer);
|
|
59
|
+
|
|
60
|
+
#ifdef __cplusplus
|
|
61
|
+
}
|
|
62
|
+
#endif
|
|
63
|
+
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
stemmer4r/ext/stemmer4r/libstemmer_c/libstemmer
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
:ext:fabpot@rubyforge.org:/var/cvs/stemmer4r
|
|
@@ -0,0 +1,78 @@
|
|
|
1
|
+
|
|
2
|
+
#include <stdlib.h>
|
|
3
|
+
#include <string.h>
|
|
4
|
+
#include "../include/libstemmer.h"
|
|
5
|
+
#include "../runtime/api.h"
|
|
6
|
+
#include "modules.h"
|
|
7
|
+
|
|
8
|
+
/* One stemmer instance.
 *
 * The three function pointers are copied by sb_stemmer_new() from the
 * matching row of the module table; env is whatever create() returned.
 */
struct sb_stemmer {
    /* Called once by sb_stemmer_new() to obtain env. */
    struct SN_env * (*create)(void);

    /* Called by sb_stemmer_delete() with env. */
    void (*close)(struct SN_env *);

    /* Called by sb_stemmer_stem() with env; a negative result is
     * reported to the caller as NULL. */
    int (*stem)(struct SN_env *);

    /* Environment the callbacks above operate on. */
    struct SN_env * env;
};
|
|
15
|
+
|
|
16
|
+
extern const char **
|
|
17
|
+
sb_stemmer_list(void)
|
|
18
|
+
{
|
|
19
|
+
return algorithm_names;
|
|
20
|
+
}
|
|
21
|
+
|
|
22
|
+
extern struct sb_stemmer *
|
|
23
|
+
sb_stemmer_new(const char * algorithm)
|
|
24
|
+
{
|
|
25
|
+
struct stemmer_modules * module;
|
|
26
|
+
struct sb_stemmer * stemmer =
|
|
27
|
+
(struct sb_stemmer *) malloc(sizeof(struct sb_stemmer));
|
|
28
|
+
if (stemmer == 0) return NULL;
|
|
29
|
+
|
|
30
|
+
for (module = modules; module->name != 0; module++) {
|
|
31
|
+
if (strcmp(module->name, algorithm) == 0) break;
|
|
32
|
+
}
|
|
33
|
+
if (module->name == 0) return NULL;
|
|
34
|
+
|
|
35
|
+
stemmer->create = module->create;
|
|
36
|
+
stemmer->close = module->close;
|
|
37
|
+
stemmer->stem = module->stem;
|
|
38
|
+
|
|
39
|
+
stemmer->env = stemmer->create();
|
|
40
|
+
if (stemmer->env == NULL)
|
|
41
|
+
{
|
|
42
|
+
sb_stemmer_delete(stemmer);
|
|
43
|
+
return NULL;
|
|
44
|
+
}
|
|
45
|
+
|
|
46
|
+
return stemmer;
|
|
47
|
+
}
|
|
48
|
+
|
|
49
|
+
void
|
|
50
|
+
sb_stemmer_delete(struct sb_stemmer * stemmer)
|
|
51
|
+
{
|
|
52
|
+
if (stemmer == 0) return;
|
|
53
|
+
if (stemmer->close == 0) return;
|
|
54
|
+
stemmer->close(stemmer->env);
|
|
55
|
+
stemmer->close = 0;
|
|
56
|
+
free(stemmer);
|
|
57
|
+
}
|
|
58
|
+
|
|
59
|
+
const sb_symbol *
|
|
60
|
+
sb_stemmer_stem(struct sb_stemmer * stemmer, const sb_symbol * word, int size)
|
|
61
|
+
{
|
|
62
|
+
int ret;
|
|
63
|
+
if (SN_set_current(stemmer->env, size, word))
|
|
64
|
+
{
|
|
65
|
+
stemmer->env->l = 0;
|
|
66
|
+
return NULL;
|
|
67
|
+
}
|
|
68
|
+
ret = stemmer->stem(stemmer->env);
|
|
69
|
+
if (ret < 0) return NULL;
|
|
70
|
+
stemmer->env->p[stemmer->env->l] = 0;
|
|
71
|
+
return stemmer->env->p;
|
|
72
|
+
}
|
|
73
|
+
|
|
74
|
+
int
|
|
75
|
+
sb_stemmer_length(struct sb_stemmer * stemmer)
|
|
76
|
+
{
|
|
77
|
+
return stemmer->env->l;
|
|
78
|
+
}
|
|
@@ -0,0 +1,96 @@
|
|
|
1
|
+
/* libstemmer/modules.h: List of stemming modules.
|
|
2
|
+
*
|
|
3
|
+
* This file is generated by mkmodules.pl from a list of module names.
|
|
4
|
+
* Do not edit manually.
|
|
5
|
+
*
|
|
6
|
+
* Modules included by this file are: danish, dutch, english, finnish, french,
|
|
7
|
+
* german, german2, italian, kraaij_pohlmann, lovins, norwegian, porter,
|
|
8
|
+
* portuguese, russian, spanish, swedish
|
|
9
|
+
*/
|
|
10
|
+
|
|
11
|
+
#include "../src_c/stem_danish.h"
|
|
12
|
+
#include "../src_c/stem_dutch.h"
|
|
13
|
+
#include "../src_c/stem_english.h"
|
|
14
|
+
#include "../src_c/stem_finnish.h"
|
|
15
|
+
#include "../src_c/stem_french.h"
|
|
16
|
+
#include "../src_c/stem_german.h"
|
|
17
|
+
#include "../src_c/stem_german2.h"
|
|
18
|
+
#include "../src_c/stem_italian.h"
|
|
19
|
+
#include "../src_c/stem_kraaij_pohlmann.h"
|
|
20
|
+
#include "../src_c/stem_lovins.h"
|
|
21
|
+
#include "../src_c/stem_norwegian.h"
|
|
22
|
+
#include "../src_c/stem_porter.h"
|
|
23
|
+
#include "../src_c/stem_portuguese.h"
|
|
24
|
+
#include "../src_c/stem_russian.h"
|
|
25
|
+
#include "../src_c/stem_spanish.h"
|
|
26
|
+
#include "../src_c/stem_swedish.h"
|
|
27
|
+
|
|
28
|
+
struct stemmer_modules {
|
|
29
|
+
const char * name;
|
|
30
|
+
struct SN_env * (*create)(void);
|
|
31
|
+
void (*close)(struct SN_env *);
|
|
32
|
+
int (*stem)(struct SN_env *);
|
|
33
|
+
};
|
|
34
|
+
static struct stemmer_modules modules[] = {
|
|
35
|
+
{"da", danish_create_env, danish_close_env, danish_stem},
|
|
36
|
+
{"dan", danish_create_env, danish_close_env, danish_stem},
|
|
37
|
+
{"danish", danish_create_env, danish_close_env, danish_stem},
|
|
38
|
+
{"de", german_create_env, german_close_env, german_stem},
|
|
39
|
+
{"deu", german_create_env, german_close_env, german_stem},
|
|
40
|
+
{"dut", dutch_create_env, dutch_close_env, dutch_stem},
|
|
41
|
+
{"dutch", dutch_create_env, dutch_close_env, dutch_stem},
|
|
42
|
+
{"en", english_create_env, english_close_env, english_stem},
|
|
43
|
+
{"eng", english_create_env, english_close_env, english_stem},
|
|
44
|
+
{"english", english_create_env, english_close_env, english_stem},
|
|
45
|
+
{"fi", finnish_create_env, finnish_close_env, finnish_stem},
|
|
46
|
+
{"fin", finnish_create_env, finnish_close_env, finnish_stem},
|
|
47
|
+
{"finnish", finnish_create_env, finnish_close_env, finnish_stem},
|
|
48
|
+
{"fr", french_create_env, french_close_env, french_stem},
|
|
49
|
+
{"fra", french_create_env, french_close_env, french_stem},
|
|
50
|
+
{"fre", french_create_env, french_close_env, french_stem},
|
|
51
|
+
{"french", french_create_env, french_close_env, french_stem},
|
|
52
|
+
{"ger", german_create_env, german_close_env, german_stem},
|
|
53
|
+
{"german", german_create_env, german_close_env, german_stem},
|
|
54
|
+
{"german2", german2_create_env, german2_close_env, german2_stem},
|
|
55
|
+
{"it", italian_create_env, italian_close_env, italian_stem},
|
|
56
|
+
{"ita", italian_create_env, italian_close_env, italian_stem},
|
|
57
|
+
{"italian", italian_create_env, italian_close_env, italian_stem},
|
|
58
|
+
{"kraaij_pohlmann", kraaij_pohlmann_create_env, kraaij_pohlmann_close_env, kraaij_pohlmann_stem},
|
|
59
|
+
{"lovins", lovins_create_env, lovins_close_env, lovins_stem},
|
|
60
|
+
{"nl", dutch_create_env, dutch_close_env, dutch_stem},
|
|
61
|
+
{"nld", dutch_create_env, dutch_close_env, dutch_stem},
|
|
62
|
+
{"no", norwegian_create_env, norwegian_close_env, norwegian_stem},
|
|
63
|
+
{"nor", norwegian_create_env, norwegian_close_env, norwegian_stem},
|
|
64
|
+
{"norwegian", norwegian_create_env, norwegian_close_env, norwegian_stem},
|
|
65
|
+
{"por", portuguese_create_env, portuguese_close_env, portuguese_stem},
|
|
66
|
+
{"porter", porter_create_env, porter_close_env, porter_stem},
|
|
67
|
+
{"portuguese", portuguese_create_env, portuguese_close_env, portuguese_stem},
|
|
68
|
+
{"pt", portuguese_create_env, portuguese_close_env, portuguese_stem},
|
|
69
|
+
{"ru", russian_create_env, russian_close_env, russian_stem},
|
|
70
|
+
{"rus", russian_create_env, russian_close_env, russian_stem},
|
|
71
|
+
{"russian", russian_create_env, russian_close_env, russian_stem},
|
|
72
|
+
{"spanish", spanish_create_env, spanish_close_env, spanish_stem},
|
|
73
|
+
{"sv", swedish_create_env, swedish_close_env, swedish_stem},
|
|
74
|
+
{"swe", swedish_create_env, swedish_close_env, swedish_stem},
|
|
75
|
+
{"swedish", swedish_create_env, swedish_close_env, swedish_stem},
|
|
76
|
+
{0,0,0,0}
|
|
77
|
+
};
|
|
78
|
+
static const char * algorithm_names[] = {
|
|
79
|
+
"danish",
|
|
80
|
+
"dutch",
|
|
81
|
+
"english",
|
|
82
|
+
"finnish",
|
|
83
|
+
"french",
|
|
84
|
+
"german",
|
|
85
|
+
"german2",
|
|
86
|
+
"italian",
|
|
87
|
+
"kraaij_pohlmann",
|
|
88
|
+
"lovins",
|
|
89
|
+
"norwegian",
|
|
90
|
+
"porter",
|
|
91
|
+
"portuguese",
|
|
92
|
+
"russian",
|
|
93
|
+
"spanish",
|
|
94
|
+
"swedish",
|
|
95
|
+
0
|
|
96
|
+
};
|
|
@@ -0,0 +1,42 @@
|
|
|
1
|
+
snowball_sources= \
|
|
2
|
+
libstemmer/libstemmer.c \
|
|
3
|
+
runtime/api.c \
|
|
4
|
+
runtime/utilities.c \
|
|
5
|
+
src_c/stem_danish.c \
|
|
6
|
+
src_c/stem_dutch.c \
|
|
7
|
+
src_c/stem_english.c \
|
|
8
|
+
src_c/stem_finnish.c \
|
|
9
|
+
src_c/stem_french.c \
|
|
10
|
+
src_c/stem_german.c \
|
|
11
|
+
src_c/stem_german2.c \
|
|
12
|
+
src_c/stem_italian.c \
|
|
13
|
+
src_c/stem_kraaij_pohlmann.c \
|
|
14
|
+
src_c/stem_lovins.c \
|
|
15
|
+
src_c/stem_norwegian.c \
|
|
16
|
+
src_c/stem_porter.c \
|
|
17
|
+
src_c/stem_portuguese.c \
|
|
18
|
+
src_c/stem_russian.c \
|
|
19
|
+
src_c/stem_spanish.c \
|
|
20
|
+
src_c/stem_swedish.c \
|
|
21
|
+
|
|
22
|
+
snowball_headers= \
|
|
23
|
+
include/libstemmer.h \
|
|
24
|
+
libstemmer/modules.h \
|
|
25
|
+
runtime/api.h \
|
|
26
|
+
runtime/header.h \
|
|
27
|
+
src_c/stem_danish.h \
|
|
28
|
+
src_c/stem_dutch.h \
|
|
29
|
+
src_c/stem_english.h \
|
|
30
|
+
src_c/stem_finnish.h \
|
|
31
|
+
src_c/stem_french.h \
|
|
32
|
+
src_c/stem_german.h \
|
|
33
|
+
src_c/stem_german2.h \
|
|
34
|
+
src_c/stem_italian.h \
|
|
35
|
+
src_c/stem_kraaij_pohlmann.h \
|
|
36
|
+
src_c/stem_lovins.h \
|
|
37
|
+
src_c/stem_norwegian.h \
|
|
38
|
+
src_c/stem_porter.h \
|
|
39
|
+
src_c/stem_portuguese.h \
|
|
40
|
+
src_c/stem_russian.h \
|
|
41
|
+
src_c/stem_spanish.h \
|
|
42
|
+
src_c/stem_swedish.h \
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
stemmer4r/ext/stemmer4r/libstemmer_c/runtime
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
:ext:fabpot@rubyforge.org:/var/cvs/stemmer4r
|
|
@@ -0,0 +1,69 @@
|
|
|
1
|
+
|
|
2
|
+
#include <stdlib.h> /* for calloc, free */
|
|
3
|
+
#include "header.h"
|
|
4
|
+
|
|
5
|
+
extern struct SN_env * SN_create_env(int S_size, int I_size, int B_size)
|
|
6
|
+
{
|
|
7
|
+
struct SN_env * z = (struct SN_env *) calloc(1, sizeof(struct SN_env));
|
|
8
|
+
if (z == NULL) return NULL;
|
|
9
|
+
z->p = create_s();
|
|
10
|
+
if (z->p == NULL) goto error;
|
|
11
|
+
if (S_size)
|
|
12
|
+
{
|
|
13
|
+
int i;
|
|
14
|
+
z->S = (symbol * *) calloc(S_size, sizeof(symbol *));
|
|
15
|
+
if (z->S == NULL) goto error;
|
|
16
|
+
|
|
17
|
+
for (i = 0; i < S_size; i++)
|
|
18
|
+
{
|
|
19
|
+
z->S[i] = create_s();
|
|
20
|
+
if (z->S[i] == NULL) goto error;
|
|
21
|
+
}
|
|
22
|
+
z->S_size = S_size;
|
|
23
|
+
}
|
|
24
|
+
|
|
25
|
+
if (I_size)
|
|
26
|
+
{
|
|
27
|
+
z->I = (int *) calloc(I_size, sizeof(int));
|
|
28
|
+
if (z->I == NULL) goto error;
|
|
29
|
+
z->I_size = I_size;
|
|
30
|
+
}
|
|
31
|
+
|
|
32
|
+
if (B_size)
|
|
33
|
+
{
|
|
34
|
+
z->B = (symbol *) calloc(B_size, sizeof(symbol));
|
|
35
|
+
if (z->B == NULL) goto error;
|
|
36
|
+
z->B_size = B_size;
|
|
37
|
+
}
|
|
38
|
+
|
|
39
|
+
return z;
|
|
40
|
+
error:
|
|
41
|
+
SN_close_env(z);
|
|
42
|
+
return NULL;
|
|
43
|
+
}
|
|
44
|
+
|
|
45
|
+
extern void SN_close_env(struct SN_env * z)
|
|
46
|
+
{
|
|
47
|
+
if (z == NULL) return;
|
|
48
|
+
if (z->S_size)
|
|
49
|
+
{
|
|
50
|
+
int i;
|
|
51
|
+
for (i = 0; i < z->S_size; i++)
|
|
52
|
+
{
|
|
53
|
+
lose_s(z->S[i]);
|
|
54
|
+
}
|
|
55
|
+
free(z->S);
|
|
56
|
+
}
|
|
57
|
+
if (z->I_size) free(z->I);
|
|
58
|
+
if (z->B_size) free(z->B);
|
|
59
|
+
if (z->p) lose_s(z->p);
|
|
60
|
+
free(z);
|
|
61
|
+
}
|
|
62
|
+
|
|
63
|
+
extern int SN_set_current(struct SN_env * z, int size, const symbol * s)
|
|
64
|
+
{
|
|
65
|
+
int err = replace_s(z, 0, z->l, size, s, NULL);
|
|
66
|
+
z->c = 0;
|
|
67
|
+
return err;
|
|
68
|
+
}
|
|
69
|
+
|
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
|
|
2
|
+
typedef unsigned char symbol;
|
|
3
|
+
|
|
4
|
+
/* Or replace 'char' above with 'short' for 16 bit characters.
|
|
5
|
+
|
|
6
|
+
More precisely, replace 'char' with whatever type guarantees the
|
|
7
|
+
character width you need. Note however that sizeof(symbol) should divide
|
|
8
|
+
HEAD, defined in header.h as 2*sizeof(int), without remainder, otherwise
|
|
9
|
+
there is an alignment problem. In the unlikely event of a problem here,
|
|
10
|
+
consult Martin Porter.
|
|
11
|
+
|
|
12
|
+
*/
|
|
13
|
+
|
|
14
|
+
struct SN_env {
|
|
15
|
+
symbol * p;
|
|
16
|
+
int c; int a; int l; int lb; int bra; int ket;
|
|
17
|
+
int S_size; int I_size; int B_size;
|
|
18
|
+
symbol * * S;
|
|
19
|
+
int * I;
|
|
20
|
+
symbol * B;
|
|
21
|
+
};
|
|
22
|
+
|
|
23
|
+
extern struct SN_env * SN_create_env(int S_size, int I_size, int B_size);
|
|
24
|
+
extern void SN_close_env(struct SN_env * z);
|
|
25
|
+
|
|
26
|
+
extern int SN_set_current(struct SN_env * z, int size, const symbol * s);
|
|
27
|
+
|
|
@@ -0,0 +1,56 @@
|
|
|
1
|
+
|
|
2
|
+
#include <limits.h>
|
|
3
|
+
|
|
4
|
+
#include "api.h"
|
|
5
|
+
|
|
6
|
+
#define MAXINT INT_MAX
|
|
7
|
+
#define MININT INT_MIN
|
|
8
|
+
|
|
9
|
+
/* Size in bytes of two ints, matching the two int slots that SIZE() and
 * CAPACITY() read at indices -1 and -2.  Parenthesised so HEAD keeps its
 * value inside any larger expression (e.g. x / HEAD). */
#define HEAD (2*sizeof(int))
|
|
10
|
+
|
|
11
|
+
#define SIZE(p) ((int *)(p))[-1]
|
|
12
|
+
/* Store n in the int slot at index -1 of p (the slot SIZE() reads).
 * Both the argument and the whole expansion are parenthesised so the
 * macro can be used as an operand, e.g. in the arms of ?:. */
#define SET_SIZE(p, n) (((int *)(p))[-1] = (n))
|
|
13
|
+
#define CAPACITY(p) ((int *)(p))[-2]
|
|
14
|
+
|
|
15
|
+
struct among
|
|
16
|
+
{ int s_size; /* number of chars in string */
|
|
17
|
+
symbol * s; /* search string */
|
|
18
|
+
int substring_i;/* index to longest matching substring */
|
|
19
|
+
int result; /* result of the lookup */
|
|
20
|
+
int (* function)(struct SN_env *);
|
|
21
|
+
};
|
|
22
|
+
|
|
23
|
+
extern symbol * create_s(void);
|
|
24
|
+
extern void lose_s(symbol * p);
|
|
25
|
+
|
|
26
|
+
extern int in_grouping(struct SN_env * z, unsigned char * s, int min, int max);
|
|
27
|
+
extern int in_grouping_b(struct SN_env * z, unsigned char * s, int min, int max);
|
|
28
|
+
extern int out_grouping(struct SN_env * z, unsigned char * s, int min, int max);
|
|
29
|
+
extern int out_grouping_b(struct SN_env * z, unsigned char * s, int min, int max);
|
|
30
|
+
|
|
31
|
+
extern int in_range(struct SN_env * z, int min, int max);
|
|
32
|
+
extern int in_range_b(struct SN_env * z, int min, int max);
|
|
33
|
+
extern int out_range(struct SN_env * z, int min, int max);
|
|
34
|
+
extern int out_range_b(struct SN_env * z, int min, int max);
|
|
35
|
+
|
|
36
|
+
extern int eq_s(struct SN_env * z, int s_size, symbol * s);
|
|
37
|
+
extern int eq_s_b(struct SN_env * z, int s_size, symbol * s);
|
|
38
|
+
extern int eq_v(struct SN_env * z, symbol * p);
|
|
39
|
+
extern int eq_v_b(struct SN_env * z, symbol * p);
|
|
40
|
+
|
|
41
|
+
extern int find_among(struct SN_env * z, struct among * v, int v_size);
|
|
42
|
+
extern int find_among_b(struct SN_env * z, struct among * v, int v_size);
|
|
43
|
+
|
|
44
|
+
extern int replace_s(struct SN_env * z, int c_bra, int c_ket, int s_size, const symbol * s, int * adjustment);
|
|
45
|
+
extern int slice_from_s(struct SN_env * z, int s_size, symbol * s);
|
|
46
|
+
extern int slice_from_v(struct SN_env * z, symbol * p);
|
|
47
|
+
extern int slice_del(struct SN_env * z);
|
|
48
|
+
|
|
49
|
+
extern int insert_s(struct SN_env * z, int bra, int ket, int s_size, symbol * s);
|
|
50
|
+
extern int insert_v(struct SN_env * z, int bra, int ket, symbol * p);
|
|
51
|
+
|
|
52
|
+
extern symbol * slice_to(struct SN_env * z, symbol * p);
|
|
53
|
+
extern symbol * assign_to(struct SN_env * z, symbol * p);
|
|
54
|
+
|
|
55
|
+
extern void debug(struct SN_env * z, int number, int line_count);
|
|
56
|
+
|