ruby-stemmer-dimelo 0.9.3.dimelo1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/MIT-LICENSE +21 -0
- data/README.rdoc +117 -0
- data/Rakefile +70 -0
- data/VERSION +1 -0
- data/ext/lingua/extconf.rb +40 -0
- data/ext/lingua/stemmer.c +115 -0
- data/lib/lingua/stemmer.rb +60 -0
- data/libstemmer_c/MANIFEST +72 -0
- data/libstemmer_c/Makefile +9 -0
- data/libstemmer_c/Makefile.windows +15 -0
- data/libstemmer_c/README +125 -0
- data/libstemmer_c/examples/stemwords.c +209 -0
- data/libstemmer_c/include/libstemmer.h +79 -0
- data/libstemmer_c/libstemmer/libstemmer.c +93 -0
- data/libstemmer_c/libstemmer/libstemmer_utf8.c +93 -0
- data/libstemmer_c/libstemmer/modules.h +195 -0
- data/libstemmer_c/libstemmer/modules.txt +51 -0
- data/libstemmer_c/libstemmer/modules_utf8.h +123 -0
- data/libstemmer_c/libstemmer/modules_utf8.txt +50 -0
- data/libstemmer_c/mkinc.mak +86 -0
- data/libstemmer_c/mkinc_utf8.mak +54 -0
- data/libstemmer_c/runtime/api.c +66 -0
- data/libstemmer_c/runtime/api.h +26 -0
- data/libstemmer_c/runtime/header.h +58 -0
- data/libstemmer_c/runtime/utilities.c +478 -0
- data/libstemmer_c/src_c/stem_ISO_8859_1_danish.c +337 -0
- data/libstemmer_c/src_c/stem_ISO_8859_1_danish.h +16 -0
- data/libstemmer_c/src_c/stem_ISO_8859_1_dutch.c +624 -0
- data/libstemmer_c/src_c/stem_ISO_8859_1_dutch.h +16 -0
- data/libstemmer_c/src_c/stem_ISO_8859_1_english.c +1117 -0
- data/libstemmer_c/src_c/stem_ISO_8859_1_english.h +16 -0
- data/libstemmer_c/src_c/stem_ISO_8859_1_finnish.c +762 -0
- data/libstemmer_c/src_c/stem_ISO_8859_1_finnish.h +16 -0
- data/libstemmer_c/src_c/stem_ISO_8859_1_french.c +1230 -0
- data/libstemmer_c/src_c/stem_ISO_8859_1_french.h +16 -0
- data/libstemmer_c/src_c/stem_ISO_8859_1_german.c +503 -0
- data/libstemmer_c/src_c/stem_ISO_8859_1_german.h +16 -0
- data/libstemmer_c/src_c/stem_ISO_8859_1_hungarian.c +1230 -0
- data/libstemmer_c/src_c/stem_ISO_8859_1_hungarian.h +16 -0
- data/libstemmer_c/src_c/stem_ISO_8859_1_italian.c +1065 -0
- data/libstemmer_c/src_c/stem_ISO_8859_1_italian.h +16 -0
- data/libstemmer_c/src_c/stem_ISO_8859_1_latin.c +443 -0
- data/libstemmer_c/src_c/stem_ISO_8859_1_latin.h +16 -0
- data/libstemmer_c/src_c/stem_ISO_8859_1_norwegian.c +297 -0
- data/libstemmer_c/src_c/stem_ISO_8859_1_norwegian.h +16 -0
- data/libstemmer_c/src_c/stem_ISO_8859_1_porter.c +749 -0
- data/libstemmer_c/src_c/stem_ISO_8859_1_porter.h +16 -0
- data/libstemmer_c/src_c/stem_ISO_8859_1_portuguese.c +1017 -0
- data/libstemmer_c/src_c/stem_ISO_8859_1_portuguese.h +16 -0
- data/libstemmer_c/src_c/stem_ISO_8859_1_spanish.c +1093 -0
- data/libstemmer_c/src_c/stem_ISO_8859_1_spanish.h +16 -0
- data/libstemmer_c/src_c/stem_ISO_8859_1_swedish.c +307 -0
- data/libstemmer_c/src_c/stem_ISO_8859_1_swedish.h +16 -0
- data/libstemmer_c/src_c/stem_ISO_8859_2_romanian.c +998 -0
- data/libstemmer_c/src_c/stem_ISO_8859_2_romanian.h +16 -0
- data/libstemmer_c/src_c/stem_KOI8_R_russian.c +700 -0
- data/libstemmer_c/src_c/stem_KOI8_R_russian.h +16 -0
- data/libstemmer_c/src_c/stem_UTF_8_danish.c +339 -0
- data/libstemmer_c/src_c/stem_UTF_8_danish.h +16 -0
- data/libstemmer_c/src_c/stem_UTF_8_dutch.c +634 -0
- data/libstemmer_c/src_c/stem_UTF_8_dutch.h +16 -0
- data/libstemmer_c/src_c/stem_UTF_8_english.c +1125 -0
- data/libstemmer_c/src_c/stem_UTF_8_english.h +16 -0
- data/libstemmer_c/src_c/stem_UTF_8_finnish.c +768 -0
- data/libstemmer_c/src_c/stem_UTF_8_finnish.h +16 -0
- data/libstemmer_c/src_c/stem_UTF_8_french.c +1230 -0
- data/libstemmer_c/src_c/stem_UTF_8_french.h +16 -0
- data/libstemmer_c/src_c/stem_UTF_8_german.c +509 -0
- data/libstemmer_c/src_c/stem_UTF_8_german.h +16 -0
- data/libstemmer_c/src_c/stem_UTF_8_hungarian.c +1234 -0
- data/libstemmer_c/src_c/stem_UTF_8_hungarian.h +16 -0
- data/libstemmer_c/src_c/stem_UTF_8_italian.c +1073 -0
- data/libstemmer_c/src_c/stem_UTF_8_italian.h +16 -0
- data/libstemmer_c/src_c/stem_UTF_8_latin.c +443 -0
- data/libstemmer_c/src_c/stem_UTF_8_latin.h +16 -0
- data/libstemmer_c/src_c/stem_UTF_8_norwegian.c +299 -0
- data/libstemmer_c/src_c/stem_UTF_8_norwegian.h +16 -0
- data/libstemmer_c/src_c/stem_UTF_8_porter.c +755 -0
- data/libstemmer_c/src_c/stem_UTF_8_porter.h +16 -0
- data/libstemmer_c/src_c/stem_UTF_8_portuguese.c +1023 -0
- data/libstemmer_c/src_c/stem_UTF_8_portuguese.h +16 -0
- data/libstemmer_c/src_c/stem_UTF_8_romanian.c +1004 -0
- data/libstemmer_c/src_c/stem_UTF_8_romanian.h +16 -0
- data/libstemmer_c/src_c/stem_UTF_8_russian.c +694 -0
- data/libstemmer_c/src_c/stem_UTF_8_russian.h +16 -0
- data/libstemmer_c/src_c/stem_UTF_8_spanish.c +1097 -0
- data/libstemmer_c/src_c/stem_UTF_8_spanish.h +16 -0
- data/libstemmer_c/src_c/stem_UTF_8_swedish.c +309 -0
- data/libstemmer_c/src_c/stem_UTF_8_swedish.h +16 -0
- data/libstemmer_c/src_c/stem_UTF_8_turkish.c +2205 -0
- data/libstemmer_c/src_c/stem_UTF_8_turkish.h +16 -0
- data/test/helper.rb +3 -0
- data/test/lingua/test_stemmer.rb +99 -0
- metadata +141 -0
@@ -0,0 +1,93 @@
|
|
1
|
+
|
2
|
+
#include <stdlib.h>
|
3
|
+
#include <string.h>
|
4
|
+
#include "../include/libstemmer.h"
|
5
|
+
#include "../runtime/api.h"
|
6
|
+
#include "modules_utf8.h"
|
7
|
+
|
8
|
+
/* One stemmer instance: the three entry points of the selected algorithm
 * (copied from the module table by sb_stemmer_new) and the environment
 * they operate on. */
struct sb_stemmer {
    /* Called by sb_stemmer_new to obtain env; a NULL result is treated as
     * failure there. */
    struct SN_env *(*create)(void);
    /* Called by sb_stemmer_delete with env. */
    void (*close)(struct SN_env *);
    /* Called by sb_stemmer_stem with env; a negative result is treated as
     * failure there. */
    int (*stem)(struct SN_env *);

    /* Environment returned by create. */
    struct SN_env *env;
};
|
15
|
+
|
16
|
+
extern const char **
|
17
|
+
sb_stemmer_list(void)
|
18
|
+
{
|
19
|
+
return algorithm_names;
|
20
|
+
}
|
21
|
+
|
22
|
+
static stemmer_encoding_t
|
23
|
+
sb_getenc(const char * charenc)
|
24
|
+
{
|
25
|
+
struct stemmer_encoding * encoding;
|
26
|
+
if (charenc == NULL) return ENC_UTF_8;
|
27
|
+
for (encoding = encodings; encoding->name != 0; encoding++) {
|
28
|
+
if (strcmp(encoding->name, charenc) == 0) break;
|
29
|
+
}
|
30
|
+
if (encoding->name == NULL) return ENC_UNKNOWN;
|
31
|
+
return encoding->enc;
|
32
|
+
}
|
33
|
+
|
34
|
+
extern struct sb_stemmer *
|
35
|
+
sb_stemmer_new(const char * algorithm, const char * charenc)
|
36
|
+
{
|
37
|
+
stemmer_encoding_t enc;
|
38
|
+
struct stemmer_modules * module;
|
39
|
+
struct sb_stemmer * stemmer =
|
40
|
+
(struct sb_stemmer *) malloc(sizeof(struct sb_stemmer));
|
41
|
+
if (stemmer == NULL) return NULL;
|
42
|
+
enc = sb_getenc(charenc);
|
43
|
+
if (enc == ENC_UNKNOWN) return NULL;
|
44
|
+
|
45
|
+
for (module = modules; module->name != 0; module++) {
|
46
|
+
if (strcmp(module->name, algorithm) == 0 && module->enc == enc) break;
|
47
|
+
}
|
48
|
+
if (module->name == NULL) return NULL;
|
49
|
+
|
50
|
+
stemmer->create = module->create;
|
51
|
+
stemmer->close = module->close;
|
52
|
+
stemmer->stem = module->stem;
|
53
|
+
|
54
|
+
stemmer->env = stemmer->create();
|
55
|
+
if (stemmer->env == NULL)
|
56
|
+
{
|
57
|
+
sb_stemmer_delete(stemmer);
|
58
|
+
return NULL;
|
59
|
+
}
|
60
|
+
|
61
|
+
return stemmer;
|
62
|
+
}
|
63
|
+
|
64
|
+
void
|
65
|
+
sb_stemmer_delete(struct sb_stemmer * stemmer)
|
66
|
+
{
|
67
|
+
if (stemmer == 0) return;
|
68
|
+
if (stemmer->close == 0) return;
|
69
|
+
stemmer->close(stemmer->env);
|
70
|
+
stemmer->close = 0;
|
71
|
+
free(stemmer);
|
72
|
+
}
|
73
|
+
|
74
|
+
const sb_symbol *
|
75
|
+
sb_stemmer_stem(struct sb_stemmer * stemmer, const sb_symbol * word, int size)
|
76
|
+
{
|
77
|
+
int ret;
|
78
|
+
if (SN_set_current(stemmer->env, size, (const symbol *)(word)))
|
79
|
+
{
|
80
|
+
stemmer->env->l = 0;
|
81
|
+
return NULL;
|
82
|
+
}
|
83
|
+
ret = stemmer->stem(stemmer->env);
|
84
|
+
if (ret < 0) return NULL;
|
85
|
+
stemmer->env->p[stemmer->env->l] = 0;
|
86
|
+
return (const sb_symbol *)(stemmer->env->p);
|
87
|
+
}
|
88
|
+
|
89
|
+
int
|
90
|
+
sb_stemmer_length(struct sb_stemmer * stemmer)
|
91
|
+
{
|
92
|
+
return stemmer->env->l;
|
93
|
+
}
|
@@ -0,0 +1,195 @@
|
|
1
|
+
/* libstemmer/modules.h: List of stemming modules.
|
2
|
+
*
|
3
|
+
* This file is generated by mkmodules.pl from a list of module names.
|
4
|
+
* Do not edit manually.
|
5
|
+
*
|
6
|
+
* Modules included by this file are: latin, danish, dutch, english, finnish, french,
|
7
|
+
* german, hungarian, italian, norwegian, porter, portuguese, romanian,
|
8
|
+
* russian, spanish, swedish, turkish
|
9
|
+
*/
|
10
|
+
|
11
|
+
#include "../src_c/stem_ISO_8859_1_latin.h"
|
12
|
+
#include "../src_c/stem_UTF_8_latin.h"
|
13
|
+
#include "../src_c/stem_ISO_8859_1_danish.h"
|
14
|
+
#include "../src_c/stem_UTF_8_danish.h"
|
15
|
+
#include "../src_c/stem_ISO_8859_1_dutch.h"
|
16
|
+
#include "../src_c/stem_UTF_8_dutch.h"
|
17
|
+
#include "../src_c/stem_ISO_8859_1_english.h"
|
18
|
+
#include "../src_c/stem_UTF_8_english.h"
|
19
|
+
#include "../src_c/stem_ISO_8859_1_finnish.h"
|
20
|
+
#include "../src_c/stem_UTF_8_finnish.h"
|
21
|
+
#include "../src_c/stem_ISO_8859_1_french.h"
|
22
|
+
#include "../src_c/stem_UTF_8_french.h"
|
23
|
+
#include "../src_c/stem_ISO_8859_1_german.h"
|
24
|
+
#include "../src_c/stem_UTF_8_german.h"
|
25
|
+
#include "../src_c/stem_ISO_8859_1_hungarian.h"
|
26
|
+
#include "../src_c/stem_UTF_8_hungarian.h"
|
27
|
+
#include "../src_c/stem_ISO_8859_1_italian.h"
|
28
|
+
#include "../src_c/stem_UTF_8_italian.h"
|
29
|
+
#include "../src_c/stem_ISO_8859_1_norwegian.h"
|
30
|
+
#include "../src_c/stem_UTF_8_norwegian.h"
|
31
|
+
#include "../src_c/stem_ISO_8859_1_porter.h"
|
32
|
+
#include "../src_c/stem_UTF_8_porter.h"
|
33
|
+
#include "../src_c/stem_ISO_8859_1_portuguese.h"
|
34
|
+
#include "../src_c/stem_UTF_8_portuguese.h"
|
35
|
+
#include "../src_c/stem_ISO_8859_2_romanian.h"
|
36
|
+
#include "../src_c/stem_UTF_8_romanian.h"
|
37
|
+
#include "../src_c/stem_KOI8_R_russian.h"
|
38
|
+
#include "../src_c/stem_UTF_8_russian.h"
|
39
|
+
#include "../src_c/stem_ISO_8859_1_spanish.h"
|
40
|
+
#include "../src_c/stem_UTF_8_spanish.h"
|
41
|
+
#include "../src_c/stem_ISO_8859_1_swedish.h"
|
42
|
+
#include "../src_c/stem_UTF_8_swedish.h"
|
43
|
+
#include "../src_c/stem_UTF_8_turkish.h"
|
44
|
+
|
45
|
+
/* Character encodings available in this build. ENC_UNKNOWN is 0 and is
 * also the value stored in the terminating entry of the tables below. */
typedef enum {
    ENC_UNKNOWN    = 0,
    ENC_ISO_8859_1 = 1,
    ENC_ISO_8859_2 = 2,
    ENC_KOI8_R     = 3,
    ENC_UTF_8      = 4
} stemmer_encoding_t;

/* Pairs an encoding's textual name with its enum value. */
struct stemmer_encoding {
    const char * name;
    stemmer_encoding_t enc;
};

/* Name-to-encoding table; the last entry has a null name. */
static struct stemmer_encoding encodings[] = {
    { "ISO_8859_1", ENC_ISO_8859_1 },
    { "ISO_8859_2", ENC_ISO_8859_2 },
    { "KOI8_R",     ENC_KOI8_R },
    { "UTF_8",      ENC_UTF_8 },
    { 0,            ENC_UNKNOWN }
};
|
64
|
+
|
65
|
+
struct stemmer_modules {
|
66
|
+
const char * name;
|
67
|
+
stemmer_encoding_t enc;
|
68
|
+
struct SN_env * (*create)(void);
|
69
|
+
void (*close)(struct SN_env *);
|
70
|
+
int (*stem)(struct SN_env *);
|
71
|
+
};
|
72
|
+
static struct stemmer_modules modules[] = {
|
73
|
+
{"latin", ENC_ISO_8859_1, latin_ISO_8859_1_create_env, latin_ISO_8859_1_close_env, latin_ISO_8859_1_stem},
|
74
|
+
{"latin", ENC_UTF_8, latin_UTF_8_create_env, latin_UTF_8_close_env, latin_UTF_8_stem},
|
75
|
+
{"da", ENC_ISO_8859_1, danish_ISO_8859_1_create_env, danish_ISO_8859_1_close_env, danish_ISO_8859_1_stem},
|
76
|
+
{"da", ENC_UTF_8, danish_UTF_8_create_env, danish_UTF_8_close_env, danish_UTF_8_stem},
|
77
|
+
{"dan", ENC_ISO_8859_1, danish_ISO_8859_1_create_env, danish_ISO_8859_1_close_env, danish_ISO_8859_1_stem},
|
78
|
+
{"dan", ENC_UTF_8, danish_UTF_8_create_env, danish_UTF_8_close_env, danish_UTF_8_stem},
|
79
|
+
{"danish", ENC_ISO_8859_1, danish_ISO_8859_1_create_env, danish_ISO_8859_1_close_env, danish_ISO_8859_1_stem},
|
80
|
+
{"danish", ENC_UTF_8, danish_UTF_8_create_env, danish_UTF_8_close_env, danish_UTF_8_stem},
|
81
|
+
{"de", ENC_ISO_8859_1, german_ISO_8859_1_create_env, german_ISO_8859_1_close_env, german_ISO_8859_1_stem},
|
82
|
+
{"de", ENC_UTF_8, german_UTF_8_create_env, german_UTF_8_close_env, german_UTF_8_stem},
|
83
|
+
{"deu", ENC_ISO_8859_1, german_ISO_8859_1_create_env, german_ISO_8859_1_close_env, german_ISO_8859_1_stem},
|
84
|
+
{"deu", ENC_UTF_8, german_UTF_8_create_env, german_UTF_8_close_env, german_UTF_8_stem},
|
85
|
+
{"dut", ENC_ISO_8859_1, dutch_ISO_8859_1_create_env, dutch_ISO_8859_1_close_env, dutch_ISO_8859_1_stem},
|
86
|
+
{"dut", ENC_UTF_8, dutch_UTF_8_create_env, dutch_UTF_8_close_env, dutch_UTF_8_stem},
|
87
|
+
{"dutch", ENC_ISO_8859_1, dutch_ISO_8859_1_create_env, dutch_ISO_8859_1_close_env, dutch_ISO_8859_1_stem},
|
88
|
+
{"dutch", ENC_UTF_8, dutch_UTF_8_create_env, dutch_UTF_8_close_env, dutch_UTF_8_stem},
|
89
|
+
{"en", ENC_ISO_8859_1, english_ISO_8859_1_create_env, english_ISO_8859_1_close_env, english_ISO_8859_1_stem},
|
90
|
+
{"en", ENC_UTF_8, english_UTF_8_create_env, english_UTF_8_close_env, english_UTF_8_stem},
|
91
|
+
{"eng", ENC_ISO_8859_1, english_ISO_8859_1_create_env, english_ISO_8859_1_close_env, english_ISO_8859_1_stem},
|
92
|
+
{"eng", ENC_UTF_8, english_UTF_8_create_env, english_UTF_8_close_env, english_UTF_8_stem},
|
93
|
+
{"english", ENC_ISO_8859_1, english_ISO_8859_1_create_env, english_ISO_8859_1_close_env, english_ISO_8859_1_stem},
|
94
|
+
{"english", ENC_UTF_8, english_UTF_8_create_env, english_UTF_8_close_env, english_UTF_8_stem},
|
95
|
+
{"es", ENC_ISO_8859_1, spanish_ISO_8859_1_create_env, spanish_ISO_8859_1_close_env, spanish_ISO_8859_1_stem},
|
96
|
+
{"es", ENC_UTF_8, spanish_UTF_8_create_env, spanish_UTF_8_close_env, spanish_UTF_8_stem},
|
97
|
+
{"esl", ENC_ISO_8859_1, spanish_ISO_8859_1_create_env, spanish_ISO_8859_1_close_env, spanish_ISO_8859_1_stem},
|
98
|
+
{"esl", ENC_UTF_8, spanish_UTF_8_create_env, spanish_UTF_8_close_env, spanish_UTF_8_stem},
|
99
|
+
{"fi", ENC_ISO_8859_1, finnish_ISO_8859_1_create_env, finnish_ISO_8859_1_close_env, finnish_ISO_8859_1_stem},
|
100
|
+
{"fi", ENC_UTF_8, finnish_UTF_8_create_env, finnish_UTF_8_close_env, finnish_UTF_8_stem},
|
101
|
+
{"fin", ENC_ISO_8859_1, finnish_ISO_8859_1_create_env, finnish_ISO_8859_1_close_env, finnish_ISO_8859_1_stem},
|
102
|
+
{"fin", ENC_UTF_8, finnish_UTF_8_create_env, finnish_UTF_8_close_env, finnish_UTF_8_stem},
|
103
|
+
{"finnish", ENC_ISO_8859_1, finnish_ISO_8859_1_create_env, finnish_ISO_8859_1_close_env, finnish_ISO_8859_1_stem},
|
104
|
+
{"finnish", ENC_UTF_8, finnish_UTF_8_create_env, finnish_UTF_8_close_env, finnish_UTF_8_stem},
|
105
|
+
{"fr", ENC_ISO_8859_1, french_ISO_8859_1_create_env, french_ISO_8859_1_close_env, french_ISO_8859_1_stem},
|
106
|
+
{"fr", ENC_UTF_8, french_UTF_8_create_env, french_UTF_8_close_env, french_UTF_8_stem},
|
107
|
+
{"fra", ENC_ISO_8859_1, french_ISO_8859_1_create_env, french_ISO_8859_1_close_env, french_ISO_8859_1_stem},
|
108
|
+
{"fra", ENC_UTF_8, french_UTF_8_create_env, french_UTF_8_close_env, french_UTF_8_stem},
|
109
|
+
{"fre", ENC_ISO_8859_1, french_ISO_8859_1_create_env, french_ISO_8859_1_close_env, french_ISO_8859_1_stem},
|
110
|
+
{"fre", ENC_UTF_8, french_UTF_8_create_env, french_UTF_8_close_env, french_UTF_8_stem},
|
111
|
+
{"french", ENC_ISO_8859_1, french_ISO_8859_1_create_env, french_ISO_8859_1_close_env, french_ISO_8859_1_stem},
|
112
|
+
{"french", ENC_UTF_8, french_UTF_8_create_env, french_UTF_8_close_env, french_UTF_8_stem},
|
113
|
+
{"ger", ENC_ISO_8859_1, german_ISO_8859_1_create_env, german_ISO_8859_1_close_env, german_ISO_8859_1_stem},
|
114
|
+
{"ger", ENC_UTF_8, german_UTF_8_create_env, german_UTF_8_close_env, german_UTF_8_stem},
|
115
|
+
{"german", ENC_ISO_8859_1, german_ISO_8859_1_create_env, german_ISO_8859_1_close_env, german_ISO_8859_1_stem},
|
116
|
+
{"german", ENC_UTF_8, german_UTF_8_create_env, german_UTF_8_close_env, german_UTF_8_stem},
|
117
|
+
{"hu", ENC_ISO_8859_1, hungarian_ISO_8859_1_create_env, hungarian_ISO_8859_1_close_env, hungarian_ISO_8859_1_stem},
|
118
|
+
{"hu", ENC_UTF_8, hungarian_UTF_8_create_env, hungarian_UTF_8_close_env, hungarian_UTF_8_stem},
|
119
|
+
{"hun", ENC_ISO_8859_1, hungarian_ISO_8859_1_create_env, hungarian_ISO_8859_1_close_env, hungarian_ISO_8859_1_stem},
|
120
|
+
{"hun", ENC_UTF_8, hungarian_UTF_8_create_env, hungarian_UTF_8_close_env, hungarian_UTF_8_stem},
|
121
|
+
{"hungarian", ENC_ISO_8859_1, hungarian_ISO_8859_1_create_env, hungarian_ISO_8859_1_close_env, hungarian_ISO_8859_1_stem},
|
122
|
+
{"hungarian", ENC_UTF_8, hungarian_UTF_8_create_env, hungarian_UTF_8_close_env, hungarian_UTF_8_stem},
|
123
|
+
{"it", ENC_ISO_8859_1, italian_ISO_8859_1_create_env, italian_ISO_8859_1_close_env, italian_ISO_8859_1_stem},
|
124
|
+
{"it", ENC_UTF_8, italian_UTF_8_create_env, italian_UTF_8_close_env, italian_UTF_8_stem},
|
125
|
+
{"ita", ENC_ISO_8859_1, italian_ISO_8859_1_create_env, italian_ISO_8859_1_close_env, italian_ISO_8859_1_stem},
|
126
|
+
{"ita", ENC_UTF_8, italian_UTF_8_create_env, italian_UTF_8_close_env, italian_UTF_8_stem},
|
127
|
+
{"italian", ENC_ISO_8859_1, italian_ISO_8859_1_create_env, italian_ISO_8859_1_close_env, italian_ISO_8859_1_stem},
|
128
|
+
{"italian", ENC_UTF_8, italian_UTF_8_create_env, italian_UTF_8_close_env, italian_UTF_8_stem},
|
129
|
+
{"nl", ENC_ISO_8859_1, dutch_ISO_8859_1_create_env, dutch_ISO_8859_1_close_env, dutch_ISO_8859_1_stem},
|
130
|
+
{"nl", ENC_UTF_8, dutch_UTF_8_create_env, dutch_UTF_8_close_env, dutch_UTF_8_stem},
|
131
|
+
{"nld", ENC_ISO_8859_1, dutch_ISO_8859_1_create_env, dutch_ISO_8859_1_close_env, dutch_ISO_8859_1_stem},
|
132
|
+
{"nld", ENC_UTF_8, dutch_UTF_8_create_env, dutch_UTF_8_close_env, dutch_UTF_8_stem},
|
133
|
+
{"no", ENC_ISO_8859_1, norwegian_ISO_8859_1_create_env, norwegian_ISO_8859_1_close_env, norwegian_ISO_8859_1_stem},
|
134
|
+
{"no", ENC_UTF_8, norwegian_UTF_8_create_env, norwegian_UTF_8_close_env, norwegian_UTF_8_stem},
|
135
|
+
{"nor", ENC_ISO_8859_1, norwegian_ISO_8859_1_create_env, norwegian_ISO_8859_1_close_env, norwegian_ISO_8859_1_stem},
|
136
|
+
{"nor", ENC_UTF_8, norwegian_UTF_8_create_env, norwegian_UTF_8_close_env, norwegian_UTF_8_stem},
|
137
|
+
{"norwegian", ENC_ISO_8859_1, norwegian_ISO_8859_1_create_env, norwegian_ISO_8859_1_close_env, norwegian_ISO_8859_1_stem},
|
138
|
+
{"norwegian", ENC_UTF_8, norwegian_UTF_8_create_env, norwegian_UTF_8_close_env, norwegian_UTF_8_stem},
|
139
|
+
{"por", ENC_ISO_8859_1, portuguese_ISO_8859_1_create_env, portuguese_ISO_8859_1_close_env, portuguese_ISO_8859_1_stem},
|
140
|
+
{"por", ENC_UTF_8, portuguese_UTF_8_create_env, portuguese_UTF_8_close_env, portuguese_UTF_8_stem},
|
141
|
+
{"porter", ENC_ISO_8859_1, porter_ISO_8859_1_create_env, porter_ISO_8859_1_close_env, porter_ISO_8859_1_stem},
|
142
|
+
{"porter", ENC_UTF_8, porter_UTF_8_create_env, porter_UTF_8_close_env, porter_UTF_8_stem},
|
143
|
+
{"portuguese", ENC_ISO_8859_1, portuguese_ISO_8859_1_create_env, portuguese_ISO_8859_1_close_env, portuguese_ISO_8859_1_stem},
|
144
|
+
{"portuguese", ENC_UTF_8, portuguese_UTF_8_create_env, portuguese_UTF_8_close_env, portuguese_UTF_8_stem},
|
145
|
+
{"pt", ENC_ISO_8859_1, portuguese_ISO_8859_1_create_env, portuguese_ISO_8859_1_close_env, portuguese_ISO_8859_1_stem},
|
146
|
+
{"pt", ENC_UTF_8, portuguese_UTF_8_create_env, portuguese_UTF_8_close_env, portuguese_UTF_8_stem},
|
147
|
+
{"ro", ENC_ISO_8859_2, romanian_ISO_8859_2_create_env, romanian_ISO_8859_2_close_env, romanian_ISO_8859_2_stem},
|
148
|
+
{"ro", ENC_UTF_8, romanian_UTF_8_create_env, romanian_UTF_8_close_env, romanian_UTF_8_stem},
|
149
|
+
{"romanian", ENC_ISO_8859_2, romanian_ISO_8859_2_create_env, romanian_ISO_8859_2_close_env, romanian_ISO_8859_2_stem},
|
150
|
+
{"romanian", ENC_UTF_8, romanian_UTF_8_create_env, romanian_UTF_8_close_env, romanian_UTF_8_stem},
|
151
|
+
{"ron", ENC_ISO_8859_2, romanian_ISO_8859_2_create_env, romanian_ISO_8859_2_close_env, romanian_ISO_8859_2_stem},
|
152
|
+
{"ron", ENC_UTF_8, romanian_UTF_8_create_env, romanian_UTF_8_close_env, romanian_UTF_8_stem},
|
153
|
+
{"ru", ENC_KOI8_R, russian_KOI8_R_create_env, russian_KOI8_R_close_env, russian_KOI8_R_stem},
|
154
|
+
{"ru", ENC_UTF_8, russian_UTF_8_create_env, russian_UTF_8_close_env, russian_UTF_8_stem},
|
155
|
+
{"rum", ENC_ISO_8859_2, romanian_ISO_8859_2_create_env, romanian_ISO_8859_2_close_env, romanian_ISO_8859_2_stem},
|
156
|
+
{"rum", ENC_UTF_8, romanian_UTF_8_create_env, romanian_UTF_8_close_env, romanian_UTF_8_stem},
|
157
|
+
{"rus", ENC_KOI8_R, russian_KOI8_R_create_env, russian_KOI8_R_close_env, russian_KOI8_R_stem},
|
158
|
+
{"rus", ENC_UTF_8, russian_UTF_8_create_env, russian_UTF_8_close_env, russian_UTF_8_stem},
|
159
|
+
{"russian", ENC_KOI8_R, russian_KOI8_R_create_env, russian_KOI8_R_close_env, russian_KOI8_R_stem},
|
160
|
+
{"russian", ENC_UTF_8, russian_UTF_8_create_env, russian_UTF_8_close_env, russian_UTF_8_stem},
|
161
|
+
{"spa", ENC_ISO_8859_1, spanish_ISO_8859_1_create_env, spanish_ISO_8859_1_close_env, spanish_ISO_8859_1_stem},
|
162
|
+
{"spa", ENC_UTF_8, spanish_UTF_8_create_env, spanish_UTF_8_close_env, spanish_UTF_8_stem},
|
163
|
+
{"spanish", ENC_ISO_8859_1, spanish_ISO_8859_1_create_env, spanish_ISO_8859_1_close_env, spanish_ISO_8859_1_stem},
|
164
|
+
{"spanish", ENC_UTF_8, spanish_UTF_8_create_env, spanish_UTF_8_close_env, spanish_UTF_8_stem},
|
165
|
+
{"sv", ENC_ISO_8859_1, swedish_ISO_8859_1_create_env, swedish_ISO_8859_1_close_env, swedish_ISO_8859_1_stem},
|
166
|
+
{"sv", ENC_UTF_8, swedish_UTF_8_create_env, swedish_UTF_8_close_env, swedish_UTF_8_stem},
|
167
|
+
{"swe", ENC_ISO_8859_1, swedish_ISO_8859_1_create_env, swedish_ISO_8859_1_close_env, swedish_ISO_8859_1_stem},
|
168
|
+
{"swe", ENC_UTF_8, swedish_UTF_8_create_env, swedish_UTF_8_close_env, swedish_UTF_8_stem},
|
169
|
+
{"swedish", ENC_ISO_8859_1, swedish_ISO_8859_1_create_env, swedish_ISO_8859_1_close_env, swedish_ISO_8859_1_stem},
|
170
|
+
{"swedish", ENC_UTF_8, swedish_UTF_8_create_env, swedish_UTF_8_close_env, swedish_UTF_8_stem},
|
171
|
+
{"tr", ENC_UTF_8, turkish_UTF_8_create_env, turkish_UTF_8_close_env, turkish_UTF_8_stem},
|
172
|
+
{"tur", ENC_UTF_8, turkish_UTF_8_create_env, turkish_UTF_8_close_env, turkish_UTF_8_stem},
|
173
|
+
{"turkish", ENC_UTF_8, turkish_UTF_8_create_env, turkish_UTF_8_close_env, turkish_UTF_8_stem},
|
174
|
+
{0,ENC_UNKNOWN,0,0,0}
|
175
|
+
};
|
176
|
+
/* Canonical algorithm names, one per stemmer (aliases are not listed).
 * The array ends with a null pointer. */
static const char * algorithm_names[] = {
    "latin",
    "danish",     "dutch",      "english",    "finnish",
    "french",     "german",     "hungarian",  "italian",
    "norwegian",  "porter",     "portuguese", "romanian",
    "russian",    "spanish",    "swedish",    "turkish",
    0
};
|
@@ -0,0 +1,51 @@
|
|
1
|
+
# This file contains a list of stemmers to include in the distribution.
|
2
|
+
# The format is a set of space separated lines - on each line:
|
3
|
+
# First item is name of stemmer.
|
4
|
+
# Second item is comma separated list of character sets.
|
5
|
+
# Third item is comma separated list of names to refer to the stemmer by.
|
6
|
+
#
|
7
|
+
# Lines starting with a #, or blank lines, are ignored.
|
8
|
+
|
9
|
+
# List all the main algorithms for each language, in UTF-8, and also with
|
10
|
+
# the most commonly used encoding.
|
11
|
+
|
12
|
+
latin UTF_8,ISO_8859_1 latin
|
13
|
+
danish UTF_8,ISO_8859_1 danish,da,dan
|
14
|
+
dutch UTF_8,ISO_8859_1 dutch,nl,dut,nld
|
15
|
+
english UTF_8,ISO_8859_1 english,en,eng
|
16
|
+
finnish UTF_8,ISO_8859_1 finnish,fi,fin
|
17
|
+
french UTF_8,ISO_8859_1 french,fr,fre,fra
|
18
|
+
german UTF_8,ISO_8859_1 german,de,ger,deu
|
19
|
+
hungarian UTF_8,ISO_8859_1 hungarian,hu,hun
|
20
|
+
italian UTF_8,ISO_8859_1 italian,it,ita
|
21
|
+
norwegian UTF_8,ISO_8859_1 norwegian,no,nor
|
22
|
+
portuguese UTF_8,ISO_8859_1 portuguese,pt,por
|
23
|
+
romanian UTF_8,ISO_8859_2 romanian,ro,rum,ron
|
24
|
+
russian UTF_8,KOI8_R russian,ru,rus
|
25
|
+
spanish UTF_8,ISO_8859_1 spanish,es,esl,spa
|
26
|
+
swedish UTF_8,ISO_8859_1 swedish,sv,swe
|
27
|
+
turkish UTF_8 turkish,tr,tur
|
28
|
+
|
29
|
+
# Also include the traditional porter algorithm for english.
|
30
|
+
# The porter algorithm is included in the libstemmer distribution to assist
|
31
|
+
# with backwards compatibility, but for new systems the english algorithm
|
32
|
+
# should be used in preference.
|
33
|
+
porter UTF_8,ISO_8859_1 porter
|
34
|
+
|
35
|
+
# Some other stemmers in the snowball project are not included in the standard
|
36
|
+
# distribution. To compile a libstemmer with them in, add them to this list,
|
37
|
+
# and regenerate the distribution. (You will need a full source checkout for
|
38
|
+
# this.) They are included in the snowball website as curiosities, but are not
|
39
|
+
# intended for general use, and use of them is not fully supported. These
|
40
|
+
# algorithms are:
|
41
|
+
#
|
42
|
+
# german2 - This is a slight modification of the german stemmer.
|
43
|
+
#german2 UTF_8,ISO_8859_1 german2
|
44
|
+
#
|
45
|
+
# kraaij_pohlmann - This is a different dutch stemmer.
|
46
|
+
#kraaij_pohlmann UTF_8,ISO_8859_1 kraaij_pohlmann
|
47
|
+
#
|
48
|
+
# lovins - This is an english stemmer, but fairly outdated, and
|
49
|
+
# only really applicable to a restricted type of input text
|
50
|
+
# (keywords in academic publications).
|
51
|
+
#lovins UTF_8,ISO_8859_1 lovins
|
@@ -0,0 +1,123 @@
|
|
1
|
+
/* libstemmer/modules_utf8.h: List of stemming modules.
|
2
|
+
*
|
3
|
+
* This file is generated by mkmodules.pl from a list of module names.
|
4
|
+
* Do not edit manually.
|
5
|
+
*
|
6
|
+
* Modules included by this file are: latin, danish, dutch, english, finnish, french,
|
7
|
+
* german, hungarian, italian, norwegian, porter, portuguese, romanian,
|
8
|
+
* russian, spanish, swedish, turkish
|
9
|
+
*/
|
10
|
+
|
11
|
+
#include "../src_c/stem_UTF_8_latin.h"
|
12
|
+
#include "../src_c/stem_UTF_8_danish.h"
|
13
|
+
#include "../src_c/stem_UTF_8_dutch.h"
|
14
|
+
#include "../src_c/stem_UTF_8_english.h"
|
15
|
+
#include "../src_c/stem_UTF_8_finnish.h"
|
16
|
+
#include "../src_c/stem_UTF_8_french.h"
|
17
|
+
#include "../src_c/stem_UTF_8_german.h"
|
18
|
+
#include "../src_c/stem_UTF_8_hungarian.h"
|
19
|
+
#include "../src_c/stem_UTF_8_italian.h"
|
20
|
+
#include "../src_c/stem_UTF_8_norwegian.h"
|
21
|
+
#include "../src_c/stem_UTF_8_porter.h"
|
22
|
+
#include "../src_c/stem_UTF_8_portuguese.h"
|
23
|
+
#include "../src_c/stem_UTF_8_romanian.h"
|
24
|
+
#include "../src_c/stem_UTF_8_russian.h"
|
25
|
+
#include "../src_c/stem_UTF_8_spanish.h"
|
26
|
+
#include "../src_c/stem_UTF_8_swedish.h"
|
27
|
+
#include "../src_c/stem_UTF_8_turkish.h"
|
28
|
+
|
29
|
+
/* Character encodings available in this UTF-8-only build. ENC_UNKNOWN is 0
 * and is also the value stored in the terminating entry of the tables
 * below. */
typedef enum {
    ENC_UNKNOWN = 0,
    ENC_UTF_8   = 1
} stemmer_encoding_t;

/* Pairs an encoding's textual name with its enum value. */
struct stemmer_encoding {
    const char * name;
    stemmer_encoding_t enc;
};

/* Name-to-encoding table; the last entry has a null name. */
static struct stemmer_encoding encodings[] = {
    { "UTF_8", ENC_UTF_8 },
    { 0,       ENC_UNKNOWN }
};
|
42
|
+
|
43
|
+
struct stemmer_modules {
|
44
|
+
const char * name;
|
45
|
+
stemmer_encoding_t enc;
|
46
|
+
struct SN_env * (*create)(void);
|
47
|
+
void (*close)(struct SN_env *);
|
48
|
+
int (*stem)(struct SN_env *);
|
49
|
+
};
|
50
|
+
static struct stemmer_modules modules[] = {
|
51
|
+
{"latin", ENC_UTF_8, latin_UTF_8_create_env, latin_UTF_8_close_env, latin_UTF_8_stem},
|
52
|
+
{"da", ENC_UTF_8, danish_UTF_8_create_env, danish_UTF_8_close_env, danish_UTF_8_stem},
|
53
|
+
{"dan", ENC_UTF_8, danish_UTF_8_create_env, danish_UTF_8_close_env, danish_UTF_8_stem},
|
54
|
+
{"danish", ENC_UTF_8, danish_UTF_8_create_env, danish_UTF_8_close_env, danish_UTF_8_stem},
|
55
|
+
{"de", ENC_UTF_8, german_UTF_8_create_env, german_UTF_8_close_env, german_UTF_8_stem},
|
56
|
+
{"deu", ENC_UTF_8, german_UTF_8_create_env, german_UTF_8_close_env, german_UTF_8_stem},
|
57
|
+
{"dut", ENC_UTF_8, dutch_UTF_8_create_env, dutch_UTF_8_close_env, dutch_UTF_8_stem},
|
58
|
+
{"dutch", ENC_UTF_8, dutch_UTF_8_create_env, dutch_UTF_8_close_env, dutch_UTF_8_stem},
|
59
|
+
{"en", ENC_UTF_8, english_UTF_8_create_env, english_UTF_8_close_env, english_UTF_8_stem},
|
60
|
+
{"eng", ENC_UTF_8, english_UTF_8_create_env, english_UTF_8_close_env, english_UTF_8_stem},
|
61
|
+
{"english", ENC_UTF_8, english_UTF_8_create_env, english_UTF_8_close_env, english_UTF_8_stem},
|
62
|
+
{"es", ENC_UTF_8, spanish_UTF_8_create_env, spanish_UTF_8_close_env, spanish_UTF_8_stem},
|
63
|
+
{"esl", ENC_UTF_8, spanish_UTF_8_create_env, spanish_UTF_8_close_env, spanish_UTF_8_stem},
|
64
|
+
{"fi", ENC_UTF_8, finnish_UTF_8_create_env, finnish_UTF_8_close_env, finnish_UTF_8_stem},
|
65
|
+
{"fin", ENC_UTF_8, finnish_UTF_8_create_env, finnish_UTF_8_close_env, finnish_UTF_8_stem},
|
66
|
+
{"finnish", ENC_UTF_8, finnish_UTF_8_create_env, finnish_UTF_8_close_env, finnish_UTF_8_stem},
|
67
|
+
{"fr", ENC_UTF_8, french_UTF_8_create_env, french_UTF_8_close_env, french_UTF_8_stem},
|
68
|
+
{"fra", ENC_UTF_8, french_UTF_8_create_env, french_UTF_8_close_env, french_UTF_8_stem},
|
69
|
+
{"fre", ENC_UTF_8, french_UTF_8_create_env, french_UTF_8_close_env, french_UTF_8_stem},
|
70
|
+
{"french", ENC_UTF_8, french_UTF_8_create_env, french_UTF_8_close_env, french_UTF_8_stem},
|
71
|
+
{"ger", ENC_UTF_8, german_UTF_8_create_env, german_UTF_8_close_env, german_UTF_8_stem},
|
72
|
+
{"german", ENC_UTF_8, german_UTF_8_create_env, german_UTF_8_close_env, german_UTF_8_stem},
|
73
|
+
{"hu", ENC_UTF_8, hungarian_UTF_8_create_env, hungarian_UTF_8_close_env, hungarian_UTF_8_stem},
|
74
|
+
{"hun", ENC_UTF_8, hungarian_UTF_8_create_env, hungarian_UTF_8_close_env, hungarian_UTF_8_stem},
|
75
|
+
{"hungarian", ENC_UTF_8, hungarian_UTF_8_create_env, hungarian_UTF_8_close_env, hungarian_UTF_8_stem},
|
76
|
+
{"it", ENC_UTF_8, italian_UTF_8_create_env, italian_UTF_8_close_env, italian_UTF_8_stem},
|
77
|
+
{"ita", ENC_UTF_8, italian_UTF_8_create_env, italian_UTF_8_close_env, italian_UTF_8_stem},
|
78
|
+
{"italian", ENC_UTF_8, italian_UTF_8_create_env, italian_UTF_8_close_env, italian_UTF_8_stem},
|
79
|
+
{"nl", ENC_UTF_8, dutch_UTF_8_create_env, dutch_UTF_8_close_env, dutch_UTF_8_stem},
|
80
|
+
{"nld", ENC_UTF_8, dutch_UTF_8_create_env, dutch_UTF_8_close_env, dutch_UTF_8_stem},
|
81
|
+
{"no", ENC_UTF_8, norwegian_UTF_8_create_env, norwegian_UTF_8_close_env, norwegian_UTF_8_stem},
|
82
|
+
{"nor", ENC_UTF_8, norwegian_UTF_8_create_env, norwegian_UTF_8_close_env, norwegian_UTF_8_stem},
|
83
|
+
{"norwegian", ENC_UTF_8, norwegian_UTF_8_create_env, norwegian_UTF_8_close_env, norwegian_UTF_8_stem},
|
84
|
+
{"por", ENC_UTF_8, portuguese_UTF_8_create_env, portuguese_UTF_8_close_env, portuguese_UTF_8_stem},
|
85
|
+
{"porter", ENC_UTF_8, porter_UTF_8_create_env, porter_UTF_8_close_env, porter_UTF_8_stem},
|
86
|
+
{"portuguese", ENC_UTF_8, portuguese_UTF_8_create_env, portuguese_UTF_8_close_env, portuguese_UTF_8_stem},
|
87
|
+
{"pt", ENC_UTF_8, portuguese_UTF_8_create_env, portuguese_UTF_8_close_env, portuguese_UTF_8_stem},
|
88
|
+
{"ro", ENC_UTF_8, romanian_UTF_8_create_env, romanian_UTF_8_close_env, romanian_UTF_8_stem},
|
89
|
+
{"romanian", ENC_UTF_8, romanian_UTF_8_create_env, romanian_UTF_8_close_env, romanian_UTF_8_stem},
|
90
|
+
{"ron", ENC_UTF_8, romanian_UTF_8_create_env, romanian_UTF_8_close_env, romanian_UTF_8_stem},
|
91
|
+
{"ru", ENC_UTF_8, russian_UTF_8_create_env, russian_UTF_8_close_env, russian_UTF_8_stem},
|
92
|
+
{"rum", ENC_UTF_8, romanian_UTF_8_create_env, romanian_UTF_8_close_env, romanian_UTF_8_stem},
|
93
|
+
{"rus", ENC_UTF_8, russian_UTF_8_create_env, russian_UTF_8_close_env, russian_UTF_8_stem},
|
94
|
+
{"russian", ENC_UTF_8, russian_UTF_8_create_env, russian_UTF_8_close_env, russian_UTF_8_stem},
|
95
|
+
{"spa", ENC_UTF_8, spanish_UTF_8_create_env, spanish_UTF_8_close_env, spanish_UTF_8_stem},
|
96
|
+
{"spanish", ENC_UTF_8, spanish_UTF_8_create_env, spanish_UTF_8_close_env, spanish_UTF_8_stem},
|
97
|
+
{"sv", ENC_UTF_8, swedish_UTF_8_create_env, swedish_UTF_8_close_env, swedish_UTF_8_stem},
|
98
|
+
{"swe", ENC_UTF_8, swedish_UTF_8_create_env, swedish_UTF_8_close_env, swedish_UTF_8_stem},
|
99
|
+
{"swedish", ENC_UTF_8, swedish_UTF_8_create_env, swedish_UTF_8_close_env, swedish_UTF_8_stem},
|
100
|
+
{"tr", ENC_UTF_8, turkish_UTF_8_create_env, turkish_UTF_8_close_env, turkish_UTF_8_stem},
|
101
|
+
{"tur", ENC_UTF_8, turkish_UTF_8_create_env, turkish_UTF_8_close_env, turkish_UTF_8_stem},
|
102
|
+
{"turkish", ENC_UTF_8, turkish_UTF_8_create_env, turkish_UTF_8_close_env, turkish_UTF_8_stem},
|
103
|
+
{0,ENC_UNKNOWN,0,0,0}
|
104
|
+
};
|
105
|
+
/* Canonical algorithm names, one per stemmer registered in the module
 * table (aliases are not listed). The array ends with a null pointer.
 *
 * "latin" is listed first, matching the order used in modules.h; the
 * module table in this header registers a latin UTF-8 stemmer, so the
 * name list has to advertise it as well.
 */
static const char * algorithm_names[] = {
    "latin",
    "danish",
    "dutch",
    "english",
    "finnish",
    "french",
    "german",
    "hungarian",
    "italian",
    "norwegian",
    "porter",
    "portuguese",
    "romanian",
    "russian",
    "spanish",
    "swedish",
    "turkish",
    0
};
|