chipper 0.4.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.rdoc +51 -0
- data/ext/extconf.rb +58 -0
- data/ext/libstemmer_c/Makefile +10 -0
- data/ext/libstemmer_c/examples/stemwords.c +209 -0
- data/ext/libstemmer_c/include/libstemmer.h +79 -0
- data/ext/libstemmer_c/libstemmer/libstemmer.c +95 -0
- data/ext/libstemmer_c/libstemmer/libstemmer_utf8.c +95 -0
- data/ext/libstemmer_c/libstemmer/modules.h +190 -0
- data/ext/libstemmer_c/libstemmer/modules_utf8.h +121 -0
- data/ext/libstemmer_c/mkinc.mak +82 -0
- data/ext/libstemmer_c/mkinc_utf8.mak +52 -0
- data/ext/libstemmer_c/runtime/api.c +66 -0
- data/ext/libstemmer_c/runtime/api.h +26 -0
- data/ext/libstemmer_c/runtime/header.h +58 -0
- data/ext/libstemmer_c/runtime/utilities.c +478 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_danish.c +337 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_danish.h +16 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_dutch.c +624 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_dutch.h +16 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_english.c +1117 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_english.h +16 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_finnish.c +762 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_finnish.h +16 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_french.c +1246 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_french.h +16 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_german.c +521 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_german.h +16 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_hungarian.c +1230 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_hungarian.h +16 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_italian.c +1065 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_italian.h +16 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_norwegian.c +297 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_norwegian.h +16 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_porter.c +749 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_porter.h +16 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_portuguese.c +1017 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_portuguese.h +16 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_spanish.c +1093 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_spanish.h +16 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_swedish.c +307 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_swedish.h +16 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_2_romanian.c +998 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_2_romanian.h +16 -0
- data/ext/libstemmer_c/src_c/stem_KOI8_R_russian.c +700 -0
- data/ext/libstemmer_c/src_c/stem_KOI8_R_russian.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_danish.c +339 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_danish.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_dutch.c +634 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_dutch.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_english.c +1125 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_english.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_finnish.c +768 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_finnish.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_french.c +1256 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_french.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_german.c +527 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_german.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_hungarian.c +1234 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_hungarian.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_italian.c +1073 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_italian.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_norwegian.c +299 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_norwegian.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_porter.c +755 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_porter.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_portuguese.c +1023 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_portuguese.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_romanian.c +1004 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_romanian.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_russian.c +694 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_russian.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_spanish.c +1097 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_spanish.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_swedish.c +309 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_swedish.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_turkish.c +2205 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_turkish.h +16 -0
- data/ext/re2/bitstate.cc +378 -0
- data/ext/re2/compile.cc +1138 -0
- data/ext/re2/dfa.cc +2086 -0
- data/ext/re2/filtered_re2.cc +100 -0
- data/ext/re2/filtered_re2.h +99 -0
- data/ext/re2/hash.cc +231 -0
- data/ext/re2/mimics_pcre.cc +185 -0
- data/ext/re2/nfa.cc +709 -0
- data/ext/re2/onepass.cc +614 -0
- data/ext/re2/parse.cc +2202 -0
- data/ext/re2/perl_groups.cc +119 -0
- data/ext/re2/prefilter.cc +671 -0
- data/ext/re2/prefilter.h +105 -0
- data/ext/re2/prefilter_tree.cc +398 -0
- data/ext/re2/prefilter_tree.h +130 -0
- data/ext/re2/prog.cc +341 -0
- data/ext/re2/prog.h +376 -0
- data/ext/re2/re2.cc +1180 -0
- data/ext/re2/re2.h +837 -0
- data/ext/re2/regexp.cc +920 -0
- data/ext/re2/regexp.h +632 -0
- data/ext/re2/rune.cc +258 -0
- data/ext/re2/set.cc +113 -0
- data/ext/re2/set.h +55 -0
- data/ext/re2/simplify.cc +393 -0
- data/ext/re2/stringpiece.cc +87 -0
- data/ext/re2/stringpiece.h +182 -0
- data/ext/re2/tostring.cc +341 -0
- data/ext/re2/unicode_casefold.cc +469 -0
- data/ext/re2/unicode_casefold.h +75 -0
- data/ext/re2/unicode_groups.cc +4851 -0
- data/ext/re2/unicode_groups.h +64 -0
- data/ext/re2/valgrind.cc +24 -0
- data/ext/re2/variadic_function.h +346 -0
- data/ext/re2/walker-inl.h +244 -0
- data/ext/src/chipper.cc +626 -0
- data/ext/src/version.h +1 -0
- data/ext/stemmer.rb +40 -0
- data/ext/util/arena.h +103 -0
- data/ext/util/atomicops.h +79 -0
- data/ext/util/benchmark.h +41 -0
- data/ext/util/flags.h +27 -0
- data/ext/util/logging.h +78 -0
- data/ext/util/mutex.h +190 -0
- data/ext/util/pcre.h +679 -0
- data/ext/util/random.h +29 -0
- data/ext/util/sparse_array.h +451 -0
- data/ext/util/sparse_set.h +177 -0
- data/ext/util/test.h +57 -0
- data/ext/util/thread.h +26 -0
- data/ext/util/utf.h +43 -0
- data/ext/util/util.h +127 -0
- data/ext/util/valgrind.h +4517 -0
- data/test/helper.rb +5 -0
- data/test/test_entities.rb +57 -0
- data/test/test_tokens.rb +118 -0
- metadata +199 -0
|
@@ -0,0 +1,190 @@
|
|
|
1
|
+
/* libstemmer/modules.h: List of stemming modules.
|
|
2
|
+
*
|
|
3
|
+
* This file is generated by mkmodules.pl from a list of module names.
|
|
4
|
+
* Do not edit manually.
|
|
5
|
+
*
|
|
6
|
+
* Modules included by this file are: danish, dutch, english, finnish, french,
|
|
7
|
+
* german, hungarian, italian, norwegian, porter, portuguese, romanian,
|
|
8
|
+
* russian, spanish, swedish, turkish
|
|
9
|
+
*/
|
|
10
|
+
|
|
11
|
+
#include "../src_c/stem_ISO_8859_1_danish.h"
|
|
12
|
+
#include "../src_c/stem_UTF_8_danish.h"
|
|
13
|
+
#include "../src_c/stem_ISO_8859_1_dutch.h"
|
|
14
|
+
#include "../src_c/stem_UTF_8_dutch.h"
|
|
15
|
+
#include "../src_c/stem_ISO_8859_1_english.h"
|
|
16
|
+
#include "../src_c/stem_UTF_8_english.h"
|
|
17
|
+
#include "../src_c/stem_ISO_8859_1_finnish.h"
|
|
18
|
+
#include "../src_c/stem_UTF_8_finnish.h"
|
|
19
|
+
#include "../src_c/stem_ISO_8859_1_french.h"
|
|
20
|
+
#include "../src_c/stem_UTF_8_french.h"
|
|
21
|
+
#include "../src_c/stem_ISO_8859_1_german.h"
|
|
22
|
+
#include "../src_c/stem_UTF_8_german.h"
|
|
23
|
+
#include "../src_c/stem_ISO_8859_1_hungarian.h"
|
|
24
|
+
#include "../src_c/stem_UTF_8_hungarian.h"
|
|
25
|
+
#include "../src_c/stem_ISO_8859_1_italian.h"
|
|
26
|
+
#include "../src_c/stem_UTF_8_italian.h"
|
|
27
|
+
#include "../src_c/stem_ISO_8859_1_norwegian.h"
|
|
28
|
+
#include "../src_c/stem_UTF_8_norwegian.h"
|
|
29
|
+
#include "../src_c/stem_ISO_8859_1_porter.h"
|
|
30
|
+
#include "../src_c/stem_UTF_8_porter.h"
|
|
31
|
+
#include "../src_c/stem_ISO_8859_1_portuguese.h"
|
|
32
|
+
#include "../src_c/stem_UTF_8_portuguese.h"
|
|
33
|
+
#include "../src_c/stem_ISO_8859_2_romanian.h"
|
|
34
|
+
#include "../src_c/stem_UTF_8_romanian.h"
|
|
35
|
+
#include "../src_c/stem_KOI8_R_russian.h"
|
|
36
|
+
#include "../src_c/stem_UTF_8_russian.h"
|
|
37
|
+
#include "../src_c/stem_ISO_8859_1_spanish.h"
|
|
38
|
+
#include "../src_c/stem_UTF_8_spanish.h"
|
|
39
|
+
#include "../src_c/stem_ISO_8859_1_swedish.h"
|
|
40
|
+
#include "../src_c/stem_UTF_8_swedish.h"
|
|
41
|
+
#include "../src_c/stem_UTF_8_turkish.h"
|
|
42
|
+
|
|
43
|
+
typedef enum {
|
|
44
|
+
ENC_UNKNOWN=0,
|
|
45
|
+
ENC_ISO_8859_1,
|
|
46
|
+
ENC_ISO_8859_2,
|
|
47
|
+
ENC_KOI8_R,
|
|
48
|
+
ENC_UTF_8
|
|
49
|
+
} stemmer_encoding_t;
|
|
50
|
+
|
|
51
|
+
struct stemmer_encoding {
|
|
52
|
+
const char * name;
|
|
53
|
+
stemmer_encoding_t enc;
|
|
54
|
+
};
|
|
55
|
+
static struct stemmer_encoding encodings[] = {
|
|
56
|
+
{"ISO_8859_1", ENC_ISO_8859_1},
|
|
57
|
+
{"ISO_8859_2", ENC_ISO_8859_2},
|
|
58
|
+
{"KOI8_R", ENC_KOI8_R},
|
|
59
|
+
{"UTF_8", ENC_UTF_8},
|
|
60
|
+
{0,ENC_UNKNOWN}
|
|
61
|
+
};
|
|
62
|
+
|
|
63
|
+
struct stemmer_modules {
|
|
64
|
+
const char * name;
|
|
65
|
+
stemmer_encoding_t enc;
|
|
66
|
+
struct SN_env * (*create)(void);
|
|
67
|
+
void (*close)(struct SN_env *);
|
|
68
|
+
int (*stem)(struct SN_env *);
|
|
69
|
+
};
|
|
70
|
+
static struct stemmer_modules modules[] = {
|
|
71
|
+
{"da", ENC_ISO_8859_1, danish_ISO_8859_1_create_env, danish_ISO_8859_1_close_env, danish_ISO_8859_1_stem},
|
|
72
|
+
{"da", ENC_UTF_8, danish_UTF_8_create_env, danish_UTF_8_close_env, danish_UTF_8_stem},
|
|
73
|
+
{"dan", ENC_ISO_8859_1, danish_ISO_8859_1_create_env, danish_ISO_8859_1_close_env, danish_ISO_8859_1_stem},
|
|
74
|
+
{"dan", ENC_UTF_8, danish_UTF_8_create_env, danish_UTF_8_close_env, danish_UTF_8_stem},
|
|
75
|
+
{"danish", ENC_ISO_8859_1, danish_ISO_8859_1_create_env, danish_ISO_8859_1_close_env, danish_ISO_8859_1_stem},
|
|
76
|
+
{"danish", ENC_UTF_8, danish_UTF_8_create_env, danish_UTF_8_close_env, danish_UTF_8_stem},
|
|
77
|
+
{"de", ENC_ISO_8859_1, german_ISO_8859_1_create_env, german_ISO_8859_1_close_env, german_ISO_8859_1_stem},
|
|
78
|
+
{"de", ENC_UTF_8, german_UTF_8_create_env, german_UTF_8_close_env, german_UTF_8_stem},
|
|
79
|
+
{"deu", ENC_ISO_8859_1, german_ISO_8859_1_create_env, german_ISO_8859_1_close_env, german_ISO_8859_1_stem},
|
|
80
|
+
{"deu", ENC_UTF_8, german_UTF_8_create_env, german_UTF_8_close_env, german_UTF_8_stem},
|
|
81
|
+
{"dut", ENC_ISO_8859_1, dutch_ISO_8859_1_create_env, dutch_ISO_8859_1_close_env, dutch_ISO_8859_1_stem},
|
|
82
|
+
{"dut", ENC_UTF_8, dutch_UTF_8_create_env, dutch_UTF_8_close_env, dutch_UTF_8_stem},
|
|
83
|
+
{"dutch", ENC_ISO_8859_1, dutch_ISO_8859_1_create_env, dutch_ISO_8859_1_close_env, dutch_ISO_8859_1_stem},
|
|
84
|
+
{"dutch", ENC_UTF_8, dutch_UTF_8_create_env, dutch_UTF_8_close_env, dutch_UTF_8_stem},
|
|
85
|
+
{"en", ENC_ISO_8859_1, english_ISO_8859_1_create_env, english_ISO_8859_1_close_env, english_ISO_8859_1_stem},
|
|
86
|
+
{"en", ENC_UTF_8, english_UTF_8_create_env, english_UTF_8_close_env, english_UTF_8_stem},
|
|
87
|
+
{"eng", ENC_ISO_8859_1, english_ISO_8859_1_create_env, english_ISO_8859_1_close_env, english_ISO_8859_1_stem},
|
|
88
|
+
{"eng", ENC_UTF_8, english_UTF_8_create_env, english_UTF_8_close_env, english_UTF_8_stem},
|
|
89
|
+
{"english", ENC_ISO_8859_1, english_ISO_8859_1_create_env, english_ISO_8859_1_close_env, english_ISO_8859_1_stem},
|
|
90
|
+
{"english", ENC_UTF_8, english_UTF_8_create_env, english_UTF_8_close_env, english_UTF_8_stem},
|
|
91
|
+
{"es", ENC_ISO_8859_1, spanish_ISO_8859_1_create_env, spanish_ISO_8859_1_close_env, spanish_ISO_8859_1_stem},
|
|
92
|
+
{"es", ENC_UTF_8, spanish_UTF_8_create_env, spanish_UTF_8_close_env, spanish_UTF_8_stem},
|
|
93
|
+
{"esl", ENC_ISO_8859_1, spanish_ISO_8859_1_create_env, spanish_ISO_8859_1_close_env, spanish_ISO_8859_1_stem},
|
|
94
|
+
{"esl", ENC_UTF_8, spanish_UTF_8_create_env, spanish_UTF_8_close_env, spanish_UTF_8_stem},
|
|
95
|
+
{"fi", ENC_ISO_8859_1, finnish_ISO_8859_1_create_env, finnish_ISO_8859_1_close_env, finnish_ISO_8859_1_stem},
|
|
96
|
+
{"fi", ENC_UTF_8, finnish_UTF_8_create_env, finnish_UTF_8_close_env, finnish_UTF_8_stem},
|
|
97
|
+
{"fin", ENC_ISO_8859_1, finnish_ISO_8859_1_create_env, finnish_ISO_8859_1_close_env, finnish_ISO_8859_1_stem},
|
|
98
|
+
{"fin", ENC_UTF_8, finnish_UTF_8_create_env, finnish_UTF_8_close_env, finnish_UTF_8_stem},
|
|
99
|
+
{"finnish", ENC_ISO_8859_1, finnish_ISO_8859_1_create_env, finnish_ISO_8859_1_close_env, finnish_ISO_8859_1_stem},
|
|
100
|
+
{"finnish", ENC_UTF_8, finnish_UTF_8_create_env, finnish_UTF_8_close_env, finnish_UTF_8_stem},
|
|
101
|
+
{"fr", ENC_ISO_8859_1, french_ISO_8859_1_create_env, french_ISO_8859_1_close_env, french_ISO_8859_1_stem},
|
|
102
|
+
{"fr", ENC_UTF_8, french_UTF_8_create_env, french_UTF_8_close_env, french_UTF_8_stem},
|
|
103
|
+
{"fra", ENC_ISO_8859_1, french_ISO_8859_1_create_env, french_ISO_8859_1_close_env, french_ISO_8859_1_stem},
|
|
104
|
+
{"fra", ENC_UTF_8, french_UTF_8_create_env, french_UTF_8_close_env, french_UTF_8_stem},
|
|
105
|
+
{"fre", ENC_ISO_8859_1, french_ISO_8859_1_create_env, french_ISO_8859_1_close_env, french_ISO_8859_1_stem},
|
|
106
|
+
{"fre", ENC_UTF_8, french_UTF_8_create_env, french_UTF_8_close_env, french_UTF_8_stem},
|
|
107
|
+
{"french", ENC_ISO_8859_1, french_ISO_8859_1_create_env, french_ISO_8859_1_close_env, french_ISO_8859_1_stem},
|
|
108
|
+
{"french", ENC_UTF_8, french_UTF_8_create_env, french_UTF_8_close_env, french_UTF_8_stem},
|
|
109
|
+
{"ger", ENC_ISO_8859_1, german_ISO_8859_1_create_env, german_ISO_8859_1_close_env, german_ISO_8859_1_stem},
|
|
110
|
+
{"ger", ENC_UTF_8, german_UTF_8_create_env, german_UTF_8_close_env, german_UTF_8_stem},
|
|
111
|
+
{"german", ENC_ISO_8859_1, german_ISO_8859_1_create_env, german_ISO_8859_1_close_env, german_ISO_8859_1_stem},
|
|
112
|
+
{"german", ENC_UTF_8, german_UTF_8_create_env, german_UTF_8_close_env, german_UTF_8_stem},
|
|
113
|
+
{"hu", ENC_ISO_8859_1, hungarian_ISO_8859_1_create_env, hungarian_ISO_8859_1_close_env, hungarian_ISO_8859_1_stem},
|
|
114
|
+
{"hu", ENC_UTF_8, hungarian_UTF_8_create_env, hungarian_UTF_8_close_env, hungarian_UTF_8_stem},
|
|
115
|
+
{"hun", ENC_ISO_8859_1, hungarian_ISO_8859_1_create_env, hungarian_ISO_8859_1_close_env, hungarian_ISO_8859_1_stem},
|
|
116
|
+
{"hun", ENC_UTF_8, hungarian_UTF_8_create_env, hungarian_UTF_8_close_env, hungarian_UTF_8_stem},
|
|
117
|
+
{"hungarian", ENC_ISO_8859_1, hungarian_ISO_8859_1_create_env, hungarian_ISO_8859_1_close_env, hungarian_ISO_8859_1_stem},
|
|
118
|
+
{"hungarian", ENC_UTF_8, hungarian_UTF_8_create_env, hungarian_UTF_8_close_env, hungarian_UTF_8_stem},
|
|
119
|
+
{"it", ENC_ISO_8859_1, italian_ISO_8859_1_create_env, italian_ISO_8859_1_close_env, italian_ISO_8859_1_stem},
|
|
120
|
+
{"it", ENC_UTF_8, italian_UTF_8_create_env, italian_UTF_8_close_env, italian_UTF_8_stem},
|
|
121
|
+
{"ita", ENC_ISO_8859_1, italian_ISO_8859_1_create_env, italian_ISO_8859_1_close_env, italian_ISO_8859_1_stem},
|
|
122
|
+
{"ita", ENC_UTF_8, italian_UTF_8_create_env, italian_UTF_8_close_env, italian_UTF_8_stem},
|
|
123
|
+
{"italian", ENC_ISO_8859_1, italian_ISO_8859_1_create_env, italian_ISO_8859_1_close_env, italian_ISO_8859_1_stem},
|
|
124
|
+
{"italian", ENC_UTF_8, italian_UTF_8_create_env, italian_UTF_8_close_env, italian_UTF_8_stem},
|
|
125
|
+
{"nl", ENC_ISO_8859_1, dutch_ISO_8859_1_create_env, dutch_ISO_8859_1_close_env, dutch_ISO_8859_1_stem},
|
|
126
|
+
{"nl", ENC_UTF_8, dutch_UTF_8_create_env, dutch_UTF_8_close_env, dutch_UTF_8_stem},
|
|
127
|
+
{"nld", ENC_ISO_8859_1, dutch_ISO_8859_1_create_env, dutch_ISO_8859_1_close_env, dutch_ISO_8859_1_stem},
|
|
128
|
+
{"nld", ENC_UTF_8, dutch_UTF_8_create_env, dutch_UTF_8_close_env, dutch_UTF_8_stem},
|
|
129
|
+
{"no", ENC_ISO_8859_1, norwegian_ISO_8859_1_create_env, norwegian_ISO_8859_1_close_env, norwegian_ISO_8859_1_stem},
|
|
130
|
+
{"no", ENC_UTF_8, norwegian_UTF_8_create_env, norwegian_UTF_8_close_env, norwegian_UTF_8_stem},
|
|
131
|
+
{"nor", ENC_ISO_8859_1, norwegian_ISO_8859_1_create_env, norwegian_ISO_8859_1_close_env, norwegian_ISO_8859_1_stem},
|
|
132
|
+
{"nor", ENC_UTF_8, norwegian_UTF_8_create_env, norwegian_UTF_8_close_env, norwegian_UTF_8_stem},
|
|
133
|
+
{"norwegian", ENC_ISO_8859_1, norwegian_ISO_8859_1_create_env, norwegian_ISO_8859_1_close_env, norwegian_ISO_8859_1_stem},
|
|
134
|
+
{"norwegian", ENC_UTF_8, norwegian_UTF_8_create_env, norwegian_UTF_8_close_env, norwegian_UTF_8_stem},
|
|
135
|
+
{"por", ENC_ISO_8859_1, portuguese_ISO_8859_1_create_env, portuguese_ISO_8859_1_close_env, portuguese_ISO_8859_1_stem},
|
|
136
|
+
{"por", ENC_UTF_8, portuguese_UTF_8_create_env, portuguese_UTF_8_close_env, portuguese_UTF_8_stem},
|
|
137
|
+
{"porter", ENC_ISO_8859_1, porter_ISO_8859_1_create_env, porter_ISO_8859_1_close_env, porter_ISO_8859_1_stem},
|
|
138
|
+
{"porter", ENC_UTF_8, porter_UTF_8_create_env, porter_UTF_8_close_env, porter_UTF_8_stem},
|
|
139
|
+
{"portuguese", ENC_ISO_8859_1, portuguese_ISO_8859_1_create_env, portuguese_ISO_8859_1_close_env, portuguese_ISO_8859_1_stem},
|
|
140
|
+
{"portuguese", ENC_UTF_8, portuguese_UTF_8_create_env, portuguese_UTF_8_close_env, portuguese_UTF_8_stem},
|
|
141
|
+
{"pt", ENC_ISO_8859_1, portuguese_ISO_8859_1_create_env, portuguese_ISO_8859_1_close_env, portuguese_ISO_8859_1_stem},
|
|
142
|
+
{"pt", ENC_UTF_8, portuguese_UTF_8_create_env, portuguese_UTF_8_close_env, portuguese_UTF_8_stem},
|
|
143
|
+
{"ro", ENC_ISO_8859_2, romanian_ISO_8859_2_create_env, romanian_ISO_8859_2_close_env, romanian_ISO_8859_2_stem},
|
|
144
|
+
{"ro", ENC_UTF_8, romanian_UTF_8_create_env, romanian_UTF_8_close_env, romanian_UTF_8_stem},
|
|
145
|
+
{"romanian", ENC_ISO_8859_2, romanian_ISO_8859_2_create_env, romanian_ISO_8859_2_close_env, romanian_ISO_8859_2_stem},
|
|
146
|
+
{"romanian", ENC_UTF_8, romanian_UTF_8_create_env, romanian_UTF_8_close_env, romanian_UTF_8_stem},
|
|
147
|
+
{"ron", ENC_ISO_8859_2, romanian_ISO_8859_2_create_env, romanian_ISO_8859_2_close_env, romanian_ISO_8859_2_stem},
|
|
148
|
+
{"ron", ENC_UTF_8, romanian_UTF_8_create_env, romanian_UTF_8_close_env, romanian_UTF_8_stem},
|
|
149
|
+
{"ru", ENC_KOI8_R, russian_KOI8_R_create_env, russian_KOI8_R_close_env, russian_KOI8_R_stem},
|
|
150
|
+
{"ru", ENC_UTF_8, russian_UTF_8_create_env, russian_UTF_8_close_env, russian_UTF_8_stem},
|
|
151
|
+
{"rum", ENC_ISO_8859_2, romanian_ISO_8859_2_create_env, romanian_ISO_8859_2_close_env, romanian_ISO_8859_2_stem},
|
|
152
|
+
{"rum", ENC_UTF_8, romanian_UTF_8_create_env, romanian_UTF_8_close_env, romanian_UTF_8_stem},
|
|
153
|
+
{"rus", ENC_KOI8_R, russian_KOI8_R_create_env, russian_KOI8_R_close_env, russian_KOI8_R_stem},
|
|
154
|
+
{"rus", ENC_UTF_8, russian_UTF_8_create_env, russian_UTF_8_close_env, russian_UTF_8_stem},
|
|
155
|
+
{"russian", ENC_KOI8_R, russian_KOI8_R_create_env, russian_KOI8_R_close_env, russian_KOI8_R_stem},
|
|
156
|
+
{"russian", ENC_UTF_8, russian_UTF_8_create_env, russian_UTF_8_close_env, russian_UTF_8_stem},
|
|
157
|
+
{"spa", ENC_ISO_8859_1, spanish_ISO_8859_1_create_env, spanish_ISO_8859_1_close_env, spanish_ISO_8859_1_stem},
|
|
158
|
+
{"spa", ENC_UTF_8, spanish_UTF_8_create_env, spanish_UTF_8_close_env, spanish_UTF_8_stem},
|
|
159
|
+
{"spanish", ENC_ISO_8859_1, spanish_ISO_8859_1_create_env, spanish_ISO_8859_1_close_env, spanish_ISO_8859_1_stem},
|
|
160
|
+
{"spanish", ENC_UTF_8, spanish_UTF_8_create_env, spanish_UTF_8_close_env, spanish_UTF_8_stem},
|
|
161
|
+
{"sv", ENC_ISO_8859_1, swedish_ISO_8859_1_create_env, swedish_ISO_8859_1_close_env, swedish_ISO_8859_1_stem},
|
|
162
|
+
{"sv", ENC_UTF_8, swedish_UTF_8_create_env, swedish_UTF_8_close_env, swedish_UTF_8_stem},
|
|
163
|
+
{"swe", ENC_ISO_8859_1, swedish_ISO_8859_1_create_env, swedish_ISO_8859_1_close_env, swedish_ISO_8859_1_stem},
|
|
164
|
+
{"swe", ENC_UTF_8, swedish_UTF_8_create_env, swedish_UTF_8_close_env, swedish_UTF_8_stem},
|
|
165
|
+
{"swedish", ENC_ISO_8859_1, swedish_ISO_8859_1_create_env, swedish_ISO_8859_1_close_env, swedish_ISO_8859_1_stem},
|
|
166
|
+
{"swedish", ENC_UTF_8, swedish_UTF_8_create_env, swedish_UTF_8_close_env, swedish_UTF_8_stem},
|
|
167
|
+
{"tr", ENC_UTF_8, turkish_UTF_8_create_env, turkish_UTF_8_close_env, turkish_UTF_8_stem},
|
|
168
|
+
{"tur", ENC_UTF_8, turkish_UTF_8_create_env, turkish_UTF_8_close_env, turkish_UTF_8_stem},
|
|
169
|
+
{"turkish", ENC_UTF_8, turkish_UTF_8_create_env, turkish_UTF_8_close_env, turkish_UTF_8_stem},
|
|
170
|
+
{0,ENC_UNKNOWN,0,0,0}
|
|
171
|
+
};
|
|
172
|
+
static const char * algorithm_names[] = {
|
|
173
|
+
"danish",
|
|
174
|
+
"dutch",
|
|
175
|
+
"english",
|
|
176
|
+
"finnish",
|
|
177
|
+
"french",
|
|
178
|
+
"german",
|
|
179
|
+
"hungarian",
|
|
180
|
+
"italian",
|
|
181
|
+
"norwegian",
|
|
182
|
+
"porter",
|
|
183
|
+
"portuguese",
|
|
184
|
+
"romanian",
|
|
185
|
+
"russian",
|
|
186
|
+
"spanish",
|
|
187
|
+
"swedish",
|
|
188
|
+
"turkish",
|
|
189
|
+
0
|
|
190
|
+
};
|
|
@@ -0,0 +1,121 @@
|
|
|
1
|
+
/* libstemmer/modules_utf8.h: List of stemming modules.
|
|
2
|
+
*
|
|
3
|
+
* This file is generated by mkmodules.pl from a list of module names.
|
|
4
|
+
* Do not edit manually.
|
|
5
|
+
*
|
|
6
|
+
* Modules included by this file are: danish, dutch, english, finnish, french,
|
|
7
|
+
* german, hungarian, italian, norwegian, porter, portuguese, romanian,
|
|
8
|
+
* russian, spanish, swedish, turkish
|
|
9
|
+
*/
|
|
10
|
+
|
|
11
|
+
#include "../src_c/stem_UTF_8_danish.h"
|
|
12
|
+
#include "../src_c/stem_UTF_8_dutch.h"
|
|
13
|
+
#include "../src_c/stem_UTF_8_english.h"
|
|
14
|
+
#include "../src_c/stem_UTF_8_finnish.h"
|
|
15
|
+
#include "../src_c/stem_UTF_8_french.h"
|
|
16
|
+
#include "../src_c/stem_UTF_8_german.h"
|
|
17
|
+
#include "../src_c/stem_UTF_8_hungarian.h"
|
|
18
|
+
#include "../src_c/stem_UTF_8_italian.h"
|
|
19
|
+
#include "../src_c/stem_UTF_8_norwegian.h"
|
|
20
|
+
#include "../src_c/stem_UTF_8_porter.h"
|
|
21
|
+
#include "../src_c/stem_UTF_8_portuguese.h"
|
|
22
|
+
#include "../src_c/stem_UTF_8_romanian.h"
|
|
23
|
+
#include "../src_c/stem_UTF_8_russian.h"
|
|
24
|
+
#include "../src_c/stem_UTF_8_spanish.h"
|
|
25
|
+
#include "../src_c/stem_UTF_8_swedish.h"
|
|
26
|
+
#include "../src_c/stem_UTF_8_turkish.h"
|
|
27
|
+
|
|
28
|
+
typedef enum {
|
|
29
|
+
ENC_UNKNOWN=0,
|
|
30
|
+
ENC_UTF_8
|
|
31
|
+
} stemmer_encoding_t;
|
|
32
|
+
|
|
33
|
+
struct stemmer_encoding {
|
|
34
|
+
const char * name;
|
|
35
|
+
stemmer_encoding_t enc;
|
|
36
|
+
};
|
|
37
|
+
static struct stemmer_encoding encodings[] = {
|
|
38
|
+
{"UTF_8", ENC_UTF_8},
|
|
39
|
+
{0,ENC_UNKNOWN}
|
|
40
|
+
};
|
|
41
|
+
|
|
42
|
+
struct stemmer_modules {
|
|
43
|
+
const char * name;
|
|
44
|
+
stemmer_encoding_t enc;
|
|
45
|
+
struct SN_env * (*create)(void);
|
|
46
|
+
void (*close)(struct SN_env *);
|
|
47
|
+
int (*stem)(struct SN_env *);
|
|
48
|
+
};
|
|
49
|
+
static struct stemmer_modules modules[] = {
|
|
50
|
+
{"da", ENC_UTF_8, danish_UTF_8_create_env, danish_UTF_8_close_env, danish_UTF_8_stem},
|
|
51
|
+
{"dan", ENC_UTF_8, danish_UTF_8_create_env, danish_UTF_8_close_env, danish_UTF_8_stem},
|
|
52
|
+
{"danish", ENC_UTF_8, danish_UTF_8_create_env, danish_UTF_8_close_env, danish_UTF_8_stem},
|
|
53
|
+
{"de", ENC_UTF_8, german_UTF_8_create_env, german_UTF_8_close_env, german_UTF_8_stem},
|
|
54
|
+
{"deu", ENC_UTF_8, german_UTF_8_create_env, german_UTF_8_close_env, german_UTF_8_stem},
|
|
55
|
+
{"dut", ENC_UTF_8, dutch_UTF_8_create_env, dutch_UTF_8_close_env, dutch_UTF_8_stem},
|
|
56
|
+
{"dutch", ENC_UTF_8, dutch_UTF_8_create_env, dutch_UTF_8_close_env, dutch_UTF_8_stem},
|
|
57
|
+
{"en", ENC_UTF_8, english_UTF_8_create_env, english_UTF_8_close_env, english_UTF_8_stem},
|
|
58
|
+
{"eng", ENC_UTF_8, english_UTF_8_create_env, english_UTF_8_close_env, english_UTF_8_stem},
|
|
59
|
+
{"english", ENC_UTF_8, english_UTF_8_create_env, english_UTF_8_close_env, english_UTF_8_stem},
|
|
60
|
+
{"es", ENC_UTF_8, spanish_UTF_8_create_env, spanish_UTF_8_close_env, spanish_UTF_8_stem},
|
|
61
|
+
{"esl", ENC_UTF_8, spanish_UTF_8_create_env, spanish_UTF_8_close_env, spanish_UTF_8_stem},
|
|
62
|
+
{"fi", ENC_UTF_8, finnish_UTF_8_create_env, finnish_UTF_8_close_env, finnish_UTF_8_stem},
|
|
63
|
+
{"fin", ENC_UTF_8, finnish_UTF_8_create_env, finnish_UTF_8_close_env, finnish_UTF_8_stem},
|
|
64
|
+
{"finnish", ENC_UTF_8, finnish_UTF_8_create_env, finnish_UTF_8_close_env, finnish_UTF_8_stem},
|
|
65
|
+
{"fr", ENC_UTF_8, french_UTF_8_create_env, french_UTF_8_close_env, french_UTF_8_stem},
|
|
66
|
+
{"fra", ENC_UTF_8, french_UTF_8_create_env, french_UTF_8_close_env, french_UTF_8_stem},
|
|
67
|
+
{"fre", ENC_UTF_8, french_UTF_8_create_env, french_UTF_8_close_env, french_UTF_8_stem},
|
|
68
|
+
{"french", ENC_UTF_8, french_UTF_8_create_env, french_UTF_8_close_env, french_UTF_8_stem},
|
|
69
|
+
{"ger", ENC_UTF_8, german_UTF_8_create_env, german_UTF_8_close_env, german_UTF_8_stem},
|
|
70
|
+
{"german", ENC_UTF_8, german_UTF_8_create_env, german_UTF_8_close_env, german_UTF_8_stem},
|
|
71
|
+
{"hu", ENC_UTF_8, hungarian_UTF_8_create_env, hungarian_UTF_8_close_env, hungarian_UTF_8_stem},
|
|
72
|
+
{"hun", ENC_UTF_8, hungarian_UTF_8_create_env, hungarian_UTF_8_close_env, hungarian_UTF_8_stem},
|
|
73
|
+
{"hungarian", ENC_UTF_8, hungarian_UTF_8_create_env, hungarian_UTF_8_close_env, hungarian_UTF_8_stem},
|
|
74
|
+
{"it", ENC_UTF_8, italian_UTF_8_create_env, italian_UTF_8_close_env, italian_UTF_8_stem},
|
|
75
|
+
{"ita", ENC_UTF_8, italian_UTF_8_create_env, italian_UTF_8_close_env, italian_UTF_8_stem},
|
|
76
|
+
{"italian", ENC_UTF_8, italian_UTF_8_create_env, italian_UTF_8_close_env, italian_UTF_8_stem},
|
|
77
|
+
{"nl", ENC_UTF_8, dutch_UTF_8_create_env, dutch_UTF_8_close_env, dutch_UTF_8_stem},
|
|
78
|
+
{"nld", ENC_UTF_8, dutch_UTF_8_create_env, dutch_UTF_8_close_env, dutch_UTF_8_stem},
|
|
79
|
+
{"no", ENC_UTF_8, norwegian_UTF_8_create_env, norwegian_UTF_8_close_env, norwegian_UTF_8_stem},
|
|
80
|
+
{"nor", ENC_UTF_8, norwegian_UTF_8_create_env, norwegian_UTF_8_close_env, norwegian_UTF_8_stem},
|
|
81
|
+
{"norwegian", ENC_UTF_8, norwegian_UTF_8_create_env, norwegian_UTF_8_close_env, norwegian_UTF_8_stem},
|
|
82
|
+
{"por", ENC_UTF_8, portuguese_UTF_8_create_env, portuguese_UTF_8_close_env, portuguese_UTF_8_stem},
|
|
83
|
+
{"porter", ENC_UTF_8, porter_UTF_8_create_env, porter_UTF_8_close_env, porter_UTF_8_stem},
|
|
84
|
+
{"portuguese", ENC_UTF_8, portuguese_UTF_8_create_env, portuguese_UTF_8_close_env, portuguese_UTF_8_stem},
|
|
85
|
+
{"pt", ENC_UTF_8, portuguese_UTF_8_create_env, portuguese_UTF_8_close_env, portuguese_UTF_8_stem},
|
|
86
|
+
{"ro", ENC_UTF_8, romanian_UTF_8_create_env, romanian_UTF_8_close_env, romanian_UTF_8_stem},
|
|
87
|
+
{"romanian", ENC_UTF_8, romanian_UTF_8_create_env, romanian_UTF_8_close_env, romanian_UTF_8_stem},
|
|
88
|
+
{"ron", ENC_UTF_8, romanian_UTF_8_create_env, romanian_UTF_8_close_env, romanian_UTF_8_stem},
|
|
89
|
+
{"ru", ENC_UTF_8, russian_UTF_8_create_env, russian_UTF_8_close_env, russian_UTF_8_stem},
|
|
90
|
+
{"rum", ENC_UTF_8, romanian_UTF_8_create_env, romanian_UTF_8_close_env, romanian_UTF_8_stem},
|
|
91
|
+
{"rus", ENC_UTF_8, russian_UTF_8_create_env, russian_UTF_8_close_env, russian_UTF_8_stem},
|
|
92
|
+
{"russian", ENC_UTF_8, russian_UTF_8_create_env, russian_UTF_8_close_env, russian_UTF_8_stem},
|
|
93
|
+
{"spa", ENC_UTF_8, spanish_UTF_8_create_env, spanish_UTF_8_close_env, spanish_UTF_8_stem},
|
|
94
|
+
{"spanish", ENC_UTF_8, spanish_UTF_8_create_env, spanish_UTF_8_close_env, spanish_UTF_8_stem},
|
|
95
|
+
{"sv", ENC_UTF_8, swedish_UTF_8_create_env, swedish_UTF_8_close_env, swedish_UTF_8_stem},
|
|
96
|
+
{"swe", ENC_UTF_8, swedish_UTF_8_create_env, swedish_UTF_8_close_env, swedish_UTF_8_stem},
|
|
97
|
+
{"swedish", ENC_UTF_8, swedish_UTF_8_create_env, swedish_UTF_8_close_env, swedish_UTF_8_stem},
|
|
98
|
+
{"tr", ENC_UTF_8, turkish_UTF_8_create_env, turkish_UTF_8_close_env, turkish_UTF_8_stem},
|
|
99
|
+
{"tur", ENC_UTF_8, turkish_UTF_8_create_env, turkish_UTF_8_close_env, turkish_UTF_8_stem},
|
|
100
|
+
{"turkish", ENC_UTF_8, turkish_UTF_8_create_env, turkish_UTF_8_close_env, turkish_UTF_8_stem},
|
|
101
|
+
{0,ENC_UNKNOWN,0,0,0}
|
|
102
|
+
};
|
|
103
|
+
static const char * algorithm_names[] = {
|
|
104
|
+
"danish",
|
|
105
|
+
"dutch",
|
|
106
|
+
"english",
|
|
107
|
+
"finnish",
|
|
108
|
+
"french",
|
|
109
|
+
"german",
|
|
110
|
+
"hungarian",
|
|
111
|
+
"italian",
|
|
112
|
+
"norwegian",
|
|
113
|
+
"porter",
|
|
114
|
+
"portuguese",
|
|
115
|
+
"romanian",
|
|
116
|
+
"russian",
|
|
117
|
+
"spanish",
|
|
118
|
+
"swedish",
|
|
119
|
+
"turkish",
|
|
120
|
+
0
|
|
121
|
+
};
|
|
@@ -0,0 +1,82 @@
|
|
|
1
|
+
# libstemmer/mkinc.mak: List of stemming module source files
|
|
2
|
+
#
|
|
3
|
+
# This file is generated by mkmodules.pl from a list of module names.
|
|
4
|
+
# Do not edit manually.
|
|
5
|
+
#
|
|
6
|
+
# Modules included by this file are: danish, dutch, english, finnish, french,
|
|
7
|
+
# german, hungarian, italian, norwegian, porter, portuguese, romanian,
|
|
8
|
+
# russian, spanish, swedish, turkish
|
|
9
|
+
|
|
10
|
+
snowball_sources= \
|
|
11
|
+
src_c/stem_ISO_8859_1_danish.c \
|
|
12
|
+
src_c/stem_UTF_8_danish.c \
|
|
13
|
+
src_c/stem_ISO_8859_1_dutch.c \
|
|
14
|
+
src_c/stem_UTF_8_dutch.c \
|
|
15
|
+
src_c/stem_ISO_8859_1_english.c \
|
|
16
|
+
src_c/stem_UTF_8_english.c \
|
|
17
|
+
src_c/stem_ISO_8859_1_finnish.c \
|
|
18
|
+
src_c/stem_UTF_8_finnish.c \
|
|
19
|
+
src_c/stem_ISO_8859_1_french.c \
|
|
20
|
+
src_c/stem_UTF_8_french.c \
|
|
21
|
+
src_c/stem_ISO_8859_1_german.c \
|
|
22
|
+
src_c/stem_UTF_8_german.c \
|
|
23
|
+
src_c/stem_ISO_8859_1_hungarian.c \
|
|
24
|
+
src_c/stem_UTF_8_hungarian.c \
|
|
25
|
+
src_c/stem_ISO_8859_1_italian.c \
|
|
26
|
+
src_c/stem_UTF_8_italian.c \
|
|
27
|
+
src_c/stem_ISO_8859_1_norwegian.c \
|
|
28
|
+
src_c/stem_UTF_8_norwegian.c \
|
|
29
|
+
src_c/stem_ISO_8859_1_porter.c \
|
|
30
|
+
src_c/stem_UTF_8_porter.c \
|
|
31
|
+
src_c/stem_ISO_8859_1_portuguese.c \
|
|
32
|
+
src_c/stem_UTF_8_portuguese.c \
|
|
33
|
+
src_c/stem_ISO_8859_2_romanian.c \
|
|
34
|
+
src_c/stem_UTF_8_romanian.c \
|
|
35
|
+
src_c/stem_KOI8_R_russian.c \
|
|
36
|
+
src_c/stem_UTF_8_russian.c \
|
|
37
|
+
src_c/stem_ISO_8859_1_spanish.c \
|
|
38
|
+
src_c/stem_UTF_8_spanish.c \
|
|
39
|
+
src_c/stem_ISO_8859_1_swedish.c \
|
|
40
|
+
src_c/stem_UTF_8_swedish.c \
|
|
41
|
+
src_c/stem_UTF_8_turkish.c \
|
|
42
|
+
runtime/api.c \
|
|
43
|
+
runtime/utilities.c \
|
|
44
|
+
libstemmer/libstemmer.c
|
|
45
|
+
|
|
46
|
+
snowball_headers= \
|
|
47
|
+
src_c/stem_ISO_8859_1_danish.h \
|
|
48
|
+
src_c/stem_UTF_8_danish.h \
|
|
49
|
+
src_c/stem_ISO_8859_1_dutch.h \
|
|
50
|
+
src_c/stem_UTF_8_dutch.h \
|
|
51
|
+
src_c/stem_ISO_8859_1_english.h \
|
|
52
|
+
src_c/stem_UTF_8_english.h \
|
|
53
|
+
src_c/stem_ISO_8859_1_finnish.h \
|
|
54
|
+
src_c/stem_UTF_8_finnish.h \
|
|
55
|
+
src_c/stem_ISO_8859_1_french.h \
|
|
56
|
+
src_c/stem_UTF_8_french.h \
|
|
57
|
+
src_c/stem_ISO_8859_1_german.h \
|
|
58
|
+
src_c/stem_UTF_8_german.h \
|
|
59
|
+
src_c/stem_ISO_8859_1_hungarian.h \
|
|
60
|
+
src_c/stem_UTF_8_hungarian.h \
|
|
61
|
+
src_c/stem_ISO_8859_1_italian.h \
|
|
62
|
+
src_c/stem_UTF_8_italian.h \
|
|
63
|
+
src_c/stem_ISO_8859_1_norwegian.h \
|
|
64
|
+
src_c/stem_UTF_8_norwegian.h \
|
|
65
|
+
src_c/stem_ISO_8859_1_porter.h \
|
|
66
|
+
src_c/stem_UTF_8_porter.h \
|
|
67
|
+
src_c/stem_ISO_8859_1_portuguese.h \
|
|
68
|
+
src_c/stem_UTF_8_portuguese.h \
|
|
69
|
+
src_c/stem_ISO_8859_2_romanian.h \
|
|
70
|
+
src_c/stem_UTF_8_romanian.h \
|
|
71
|
+
src_c/stem_KOI8_R_russian.h \
|
|
72
|
+
src_c/stem_UTF_8_russian.h \
|
|
73
|
+
src_c/stem_ISO_8859_1_spanish.h \
|
|
74
|
+
src_c/stem_UTF_8_spanish.h \
|
|
75
|
+
src_c/stem_ISO_8859_1_swedish.h \
|
|
76
|
+
src_c/stem_UTF_8_swedish.h \
|
|
77
|
+
src_c/stem_UTF_8_turkish.h \
|
|
78
|
+
include/libstemmer.h \
|
|
79
|
+
libstemmer/modules.h \
|
|
80
|
+
runtime/api.h \
|
|
81
|
+
runtime/header.h
|
|
82
|
+
|
|
@@ -0,0 +1,52 @@
|
|
|
1
|
+
# libstemmer/mkinc_utf8.mak: List of stemming module source files
|
|
2
|
+
#
|
|
3
|
+
# This file is generated by mkmodules.pl from a list of module names.
|
|
4
|
+
# Do not edit manually.
|
|
5
|
+
#
|
|
6
|
+
# Modules included by this file are: danish, dutch, english, finnish, french,
|
|
7
|
+
# german, hungarian, italian, norwegian, porter, portuguese, romanian,
|
|
8
|
+
# russian, spanish, swedish, turkish
|
|
9
|
+
|
|
10
|
+
snowball_sources= \
|
|
11
|
+
src_c/stem_UTF_8_danish.c \
|
|
12
|
+
src_c/stem_UTF_8_dutch.c \
|
|
13
|
+
src_c/stem_UTF_8_english.c \
|
|
14
|
+
src_c/stem_UTF_8_finnish.c \
|
|
15
|
+
src_c/stem_UTF_8_french.c \
|
|
16
|
+
src_c/stem_UTF_8_german.c \
|
|
17
|
+
src_c/stem_UTF_8_hungarian.c \
|
|
18
|
+
src_c/stem_UTF_8_italian.c \
|
|
19
|
+
src_c/stem_UTF_8_norwegian.c \
|
|
20
|
+
src_c/stem_UTF_8_porter.c \
|
|
21
|
+
src_c/stem_UTF_8_portuguese.c \
|
|
22
|
+
src_c/stem_UTF_8_romanian.c \
|
|
23
|
+
src_c/stem_UTF_8_russian.c \
|
|
24
|
+
src_c/stem_UTF_8_spanish.c \
|
|
25
|
+
src_c/stem_UTF_8_swedish.c \
|
|
26
|
+
src_c/stem_UTF_8_turkish.c \
|
|
27
|
+
runtime/api.c \
|
|
28
|
+
runtime/utilities.c \
|
|
29
|
+
libstemmer/libstemmer_utf8.c
|
|
30
|
+
|
|
31
|
+
snowball_headers= \
|
|
32
|
+
src_c/stem_UTF_8_danish.h \
|
|
33
|
+
src_c/stem_UTF_8_dutch.h \
|
|
34
|
+
src_c/stem_UTF_8_english.h \
|
|
35
|
+
src_c/stem_UTF_8_finnish.h \
|
|
36
|
+
src_c/stem_UTF_8_french.h \
|
|
37
|
+
src_c/stem_UTF_8_german.h \
|
|
38
|
+
src_c/stem_UTF_8_hungarian.h \
|
|
39
|
+
src_c/stem_UTF_8_italian.h \
|
|
40
|
+
src_c/stem_UTF_8_norwegian.h \
|
|
41
|
+
src_c/stem_UTF_8_porter.h \
|
|
42
|
+
src_c/stem_UTF_8_portuguese.h \
|
|
43
|
+
src_c/stem_UTF_8_romanian.h \
|
|
44
|
+
src_c/stem_UTF_8_russian.h \
|
|
45
|
+
src_c/stem_UTF_8_spanish.h \
|
|
46
|
+
src_c/stem_UTF_8_swedish.h \
|
|
47
|
+
src_c/stem_UTF_8_turkish.h \
|
|
48
|
+
include/libstemmer.h \
|
|
49
|
+
libstemmer/modules_utf8.h \
|
|
50
|
+
runtime/api.h \
|
|
51
|
+
runtime/header.h
|
|
52
|
+
|
|
@@ -0,0 +1,66 @@
|
|
|
1
|
+
|
|
2
|
+
#include <stdlib.h> /* for calloc, free */
|
|
3
|
+
#include "header.h"
|
|
4
|
+
|
|
5
|
+
extern struct SN_env * SN_create_env(int S_size, int I_size, int B_size)
|
|
6
|
+
{
|
|
7
|
+
struct SN_env * z = (struct SN_env *) calloc(1, sizeof(struct SN_env));
|
|
8
|
+
if (z == NULL) return NULL;
|
|
9
|
+
z->p = create_s();
|
|
10
|
+
if (z->p == NULL) goto error;
|
|
11
|
+
if (S_size)
|
|
12
|
+
{
|
|
13
|
+
int i;
|
|
14
|
+
z->S = (symbol * *) calloc(S_size, sizeof(symbol *));
|
|
15
|
+
if (z->S == NULL) goto error;
|
|
16
|
+
|
|
17
|
+
for (i = 0; i < S_size; i++)
|
|
18
|
+
{
|
|
19
|
+
z->S[i] = create_s();
|
|
20
|
+
if (z->S[i] == NULL) goto error;
|
|
21
|
+
}
|
|
22
|
+
}
|
|
23
|
+
|
|
24
|
+
if (I_size)
|
|
25
|
+
{
|
|
26
|
+
z->I = (int *) calloc(I_size, sizeof(int));
|
|
27
|
+
if (z->I == NULL) goto error;
|
|
28
|
+
}
|
|
29
|
+
|
|
30
|
+
if (B_size)
|
|
31
|
+
{
|
|
32
|
+
z->B = (unsigned char *) calloc(B_size, sizeof(unsigned char));
|
|
33
|
+
if (z->B == NULL) goto error;
|
|
34
|
+
}
|
|
35
|
+
|
|
36
|
+
return z;
|
|
37
|
+
error:
|
|
38
|
+
SN_close_env(z, S_size);
|
|
39
|
+
return NULL;
|
|
40
|
+
}
|
|
41
|
+
|
|
42
|
+
extern void SN_close_env(struct SN_env * z, int S_size)
|
|
43
|
+
{
|
|
44
|
+
if (z == NULL) return;
|
|
45
|
+
if (S_size)
|
|
46
|
+
{
|
|
47
|
+
int i;
|
|
48
|
+
for (i = 0; i < S_size; i++)
|
|
49
|
+
{
|
|
50
|
+
lose_s(z->S[i]);
|
|
51
|
+
}
|
|
52
|
+
free(z->S);
|
|
53
|
+
}
|
|
54
|
+
free(z->I);
|
|
55
|
+
free(z->B);
|
|
56
|
+
if (z->p) lose_s(z->p);
|
|
57
|
+
free(z);
|
|
58
|
+
}
|
|
59
|
+
|
|
60
|
+
extern int SN_set_current(struct SN_env * z, int size, const symbol * s)
|
|
61
|
+
{
|
|
62
|
+
int err = replace_s(z, 0, z->l, size, s, NULL);
|
|
63
|
+
z->c = 0;
|
|
64
|
+
return err;
|
|
65
|
+
}
|
|
66
|
+
|
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
|
|
2
|
+
typedef unsigned char symbol;
|
|
3
|
+
|
|
4
|
+
/* Or replace 'char' above with 'short' for 16 bit characters.
|
|
5
|
+
|
|
6
|
+
More precisely, replace 'char' with whatever type guarantees the
|
|
7
|
+
character width you need. Note however that sizeof(symbol) should divide
|
|
8
|
+
HEAD, defined in header.h as 2*sizeof(int), without remainder, otherwise
|
|
9
|
+
there is an alignment problem. In the unlikely event of a problem here,
|
|
10
|
+
consult Martin Porter.
|
|
11
|
+
|
|
12
|
+
*/
|
|
13
|
+
|
|
14
|
+
struct SN_env {
|
|
15
|
+
symbol * p;
|
|
16
|
+
int c; int l; int lb; int bra; int ket;
|
|
17
|
+
symbol * * S;
|
|
18
|
+
int * I;
|
|
19
|
+
unsigned char * B;
|
|
20
|
+
};
|
|
21
|
+
|
|
22
|
+
extern struct SN_env * SN_create_env(int S_size, int I_size, int B_size);
|
|
23
|
+
extern void SN_close_env(struct SN_env * z, int S_size);
|
|
24
|
+
|
|
25
|
+
extern int SN_set_current(struct SN_env * z, int size, const symbol * s);
|
|
26
|
+
|
|
@@ -0,0 +1,58 @@
|
|
|
1
|
+
|
|
2
|
+
#include <limits.h>
|
|
3
|
+
|
|
4
|
+
#include "api.h"
|
|
5
|
+
|
|
6
|
+
/* Integer range limits, aliased from <limits.h>. */
#define MAXINT INT_MAX
#define MININT INT_MIN

/* Size in bytes of two ints. Parenthesized so that the expansion stays a
   single operand next to operators such as %, / or a cast. */
#define HEAD (2*sizeof(int))

/* Accessors for the two ints stored immediately before pointer p:
   index -1 holds the size and index -2 the capacity. Each expansion is
   fully parenthesized; SIZE and CAPACITY remain usable as lvalues. */
#define SIZE(p) (((int *)(p))[-1])
#define SET_SIZE(p, n) (((int *)(p))[-1] = (n))
#define CAPACITY(p) (((int *)(p))[-2])
|
|
14
|
+
|
|
15
|
+
struct among
|
|
16
|
+
{ int s_size; /* number of chars in string */
|
|
17
|
+
const symbol * s; /* search string */
|
|
18
|
+
int substring_i;/* index to longest matching substring */
|
|
19
|
+
int result; /* result of the lookup */
|
|
20
|
+
int (* function)(struct SN_env *);
|
|
21
|
+
};
|
|
22
|
+
|
|
23
|
+
extern symbol * create_s(void);
|
|
24
|
+
extern void lose_s(symbol * p);
|
|
25
|
+
|
|
26
|
+
extern int skip_utf8(const symbol * p, int c, int lb, int l, int n);
|
|
27
|
+
|
|
28
|
+
extern int in_grouping_U(struct SN_env * z, const unsigned char * s, int min, int max, int repeat);
|
|
29
|
+
extern int in_grouping_b_U(struct SN_env * z, const unsigned char * s, int min, int max, int repeat);
|
|
30
|
+
extern int out_grouping_U(struct SN_env * z, const unsigned char * s, int min, int max, int repeat);
|
|
31
|
+
extern int out_grouping_b_U(struct SN_env * z, const unsigned char * s, int min, int max, int repeat);
|
|
32
|
+
|
|
33
|
+
extern int in_grouping(struct SN_env * z, const unsigned char * s, int min, int max, int repeat);
|
|
34
|
+
extern int in_grouping_b(struct SN_env * z, const unsigned char * s, int min, int max, int repeat);
|
|
35
|
+
extern int out_grouping(struct SN_env * z, const unsigned char * s, int min, int max, int repeat);
|
|
36
|
+
extern int out_grouping_b(struct SN_env * z, const unsigned char * s, int min, int max, int repeat);
|
|
37
|
+
|
|
38
|
+
extern int eq_s(struct SN_env * z, int s_size, const symbol * s);
|
|
39
|
+
extern int eq_s_b(struct SN_env * z, int s_size, const symbol * s);
|
|
40
|
+
extern int eq_v(struct SN_env * z, const symbol * p);
|
|
41
|
+
extern int eq_v_b(struct SN_env * z, const symbol * p);
|
|
42
|
+
|
|
43
|
+
extern int find_among(struct SN_env * z, const struct among * v, int v_size);
|
|
44
|
+
extern int find_among_b(struct SN_env * z, const struct among * v, int v_size);
|
|
45
|
+
|
|
46
|
+
extern int replace_s(struct SN_env * z, int c_bra, int c_ket, int s_size, const symbol * s, int * adjustment);
|
|
47
|
+
extern int slice_from_s(struct SN_env * z, int s_size, const symbol * s);
|
|
48
|
+
extern int slice_from_v(struct SN_env * z, const symbol * p);
|
|
49
|
+
extern int slice_del(struct SN_env * z);
|
|
50
|
+
|
|
51
|
+
extern int insert_s(struct SN_env * z, int bra, int ket, int s_size, const symbol * s);
|
|
52
|
+
extern int insert_v(struct SN_env * z, int bra, int ket, const symbol * p);
|
|
53
|
+
|
|
54
|
+
extern symbol * slice_to(struct SN_env * z, symbol * p);
|
|
55
|
+
extern symbol * assign_to(struct SN_env * z, symbol * p);
|
|
56
|
+
|
|
57
|
+
extern void debug(struct SN_env * z, int number, int line_count);
|
|
58
|
+
|