isomorfeus-ferret 0.12.4 → 0.12.5
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/LICENSE +612 -612
- data/README.md +80 -48
- data/ext/isomorfeus_ferret_ext/bm_hash.c +9 -6
- data/ext/isomorfeus_ferret_ext/bm_micro_string.c +4 -2
- data/ext/isomorfeus_ferret_ext/frb_store.c +34 -5
- data/ext/isomorfeus_ferret_ext/frt_posh.h +11 -19
- data/ext/isomorfeus_ferret_ext/frt_q_parser.c +1844 -1911
- data/ext/isomorfeus_ferret_ext/frt_q_phrase.c +7 -7
- data/ext/isomorfeus_ferret_ext/frt_scanner.c +1 -0
- data/ext/isomorfeus_ferret_ext/frt_scanner_mb.c +1 -0
- data/ext/isomorfeus_ferret_ext/frt_scanner_utf8.c +1 -0
- data/ext/isomorfeus_ferret_ext/frt_search.h +1 -1
- data/ext/isomorfeus_ferret_ext/libstemmer.c +14 -11
- data/ext/isomorfeus_ferret_ext/libstemmer.h +4 -9
- data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_basque.c +1167 -0
- data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_basque.h +6 -0
- data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_catalan.c +1433 -0
- data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_catalan.h +6 -0
- data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_danish.c +120 -143
- data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_danish.h +1 -2
- data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_dutch.c +217 -237
- data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_dutch.h +1 -1
- data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_english.c +377 -432
- data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_english.h +1 -1
- data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_finnish.c +298 -342
- data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_finnish.h +1 -2
- data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_french.c +530 -524
- data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_french.h +1 -1
- data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_german.c +201 -214
- data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_german.h +1 -1
- data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_hungarian.c +1 -1
- data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_indonesian.c +394 -0
- data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_indonesian.h +6 -0
- data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_irish.c +457 -0
- data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_irish.h +6 -0
- data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_italian.c +396 -439
- data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_italian.h +1 -1
- data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_norwegian.c +104 -128
- data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_norwegian.h +1 -1
- data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_porter.c +242 -273
- data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_porter.h +1 -1
- data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_portuguese.c +406 -461
- data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_portuguese.h +1 -2
- data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_spanish.c +405 -456
- data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_spanish.h +1 -1
- data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_swedish.c +108 -126
- data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_swedish.h +1 -1
- data/ext/isomorfeus_ferret_ext/stem_ISO_8859_2_hungarian.c +849 -0
- data/ext/isomorfeus_ferret_ext/stem_ISO_8859_2_hungarian.h +6 -0
- data/ext/isomorfeus_ferret_ext/stem_ISO_8859_2_romanian.c +373 -405
- data/ext/isomorfeus_ferret_ext/stem_ISO_8859_2_romanian.h +1 -1
- data/ext/isomorfeus_ferret_ext/stem_KOI8_R_russian.c +288 -305
- data/ext/isomorfeus_ferret_ext/stem_KOI8_R_russian.h +1 -1
- data/ext/isomorfeus_ferret_ext/stem_UTF_8_arabic.c +1651 -0
- data/ext/isomorfeus_ferret_ext/stem_UTF_8_arabic.h +6 -0
- data/ext/isomorfeus_ferret_ext/stem_UTF_8_armenian.c +546 -0
- data/ext/isomorfeus_ferret_ext/stem_UTF_8_armenian.h +6 -0
- data/ext/isomorfeus_ferret_ext/stem_UTF_8_basque.c +1171 -0
- data/ext/isomorfeus_ferret_ext/stem_UTF_8_basque.h +6 -0
- data/ext/isomorfeus_ferret_ext/stem_UTF_8_catalan.c +1436 -0
- data/ext/isomorfeus_ferret_ext/stem_UTF_8_catalan.h +6 -0
- data/ext/isomorfeus_ferret_ext/stem_UTF_8_danish.c +121 -141
- data/ext/isomorfeus_ferret_ext/stem_UTF_8_danish.h +1 -1
- data/ext/isomorfeus_ferret_ext/stem_UTF_8_dutch.c +221 -241
- data/ext/isomorfeus_ferret_ext/stem_UTF_8_dutch.h +1 -1
- data/ext/isomorfeus_ferret_ext/stem_UTF_8_english.c +381 -431
- data/ext/isomorfeus_ferret_ext/stem_UTF_8_english.h +1 -1
- data/ext/isomorfeus_ferret_ext/stem_UTF_8_finnish.c +300 -345
- data/ext/isomorfeus_ferret_ext/stem_UTF_8_finnish.h +1 -1
- data/ext/isomorfeus_ferret_ext/stem_UTF_8_french.c +518 -511
- data/ext/isomorfeus_ferret_ext/stem_UTF_8_french.h +1 -1
- data/ext/isomorfeus_ferret_ext/stem_UTF_8_german.c +201 -209
- data/ext/isomorfeus_ferret_ext/stem_UTF_8_german.h +1 -1
- data/ext/isomorfeus_ferret_ext/stem_UTF_8_greek.c +3660 -0
- data/ext/isomorfeus_ferret_ext/stem_UTF_8_greek.h +6 -0
- data/ext/isomorfeus_ferret_ext/stem_UTF_8_hindi.c +309 -0
- data/ext/isomorfeus_ferret_ext/stem_UTF_8_hindi.h +6 -0
- data/ext/isomorfeus_ferret_ext/stem_UTF_8_hungarian.c +306 -671
- data/ext/isomorfeus_ferret_ext/stem_UTF_8_hungarian.h +1 -1
- data/ext/isomorfeus_ferret_ext/stem_UTF_8_indonesian.c +394 -0
- data/ext/isomorfeus_ferret_ext/stem_UTF_8_indonesian.h +6 -0
- data/ext/isomorfeus_ferret_ext/stem_UTF_8_irish.c +457 -0
- data/ext/isomorfeus_ferret_ext/stem_UTF_8_irish.h +6 -0
- data/ext/isomorfeus_ferret_ext/stem_UTF_8_italian.c +400 -442
- data/ext/isomorfeus_ferret_ext/stem_UTF_8_italian.h +1 -1
- data/ext/isomorfeus_ferret_ext/stem_UTF_8_lithuanian.c +824 -0
- data/ext/isomorfeus_ferret_ext/stem_UTF_8_lithuanian.h +6 -0
- data/ext/isomorfeus_ferret_ext/stem_UTF_8_nepali.c +408 -0
- data/ext/isomorfeus_ferret_ext/stem_UTF_8_nepali.h +6 -0
- data/ext/isomorfeus_ferret_ext/stem_UTF_8_norwegian.c +105 -127
- data/ext/isomorfeus_ferret_ext/stem_UTF_8_norwegian.h +1 -1
- data/ext/isomorfeus_ferret_ext/stem_UTF_8_porter.c +245 -276
- data/ext/isomorfeus_ferret_ext/stem_UTF_8_porter.h +1 -1
- data/ext/isomorfeus_ferret_ext/stem_UTF_8_portuguese.c +409 -464
- data/ext/isomorfeus_ferret_ext/stem_UTF_8_portuguese.h +1 -1
- data/ext/isomorfeus_ferret_ext/stem_UTF_8_romanian.c +376 -408
- data/ext/isomorfeus_ferret_ext/stem_UTF_8_romanian.h +1 -1
- data/ext/isomorfeus_ferret_ext/stem_UTF_8_russian.c +272 -287
- data/ext/isomorfeus_ferret_ext/stem_UTF_8_russian.h +1 -1
- data/ext/isomorfeus_ferret_ext/stem_UTF_8_serbian.c +6530 -0
- data/ext/isomorfeus_ferret_ext/stem_UTF_8_serbian.h +6 -0
- data/ext/isomorfeus_ferret_ext/stem_UTF_8_spanish.c +407 -458
- data/ext/isomorfeus_ferret_ext/stem_UTF_8_spanish.h +1 -1
- data/ext/isomorfeus_ferret_ext/stem_UTF_8_swedish.c +110 -125
- data/ext/isomorfeus_ferret_ext/stem_UTF_8_swedish.h +1 -1
- data/ext/isomorfeus_ferret_ext/stem_UTF_8_tamil.c +1865 -0
- data/ext/isomorfeus_ferret_ext/stem_UTF_8_tamil.h +6 -0
- data/ext/isomorfeus_ferret_ext/stem_UTF_8_turkish.c +698 -806
- data/ext/isomorfeus_ferret_ext/stem_UTF_8_turkish.h +1 -1
- data/ext/isomorfeus_ferret_ext/stem_UTF_8_yiddish.c +1220 -0
- data/ext/isomorfeus_ferret_ext/stem_UTF_8_yiddish.h +6 -0
- data/ext/isomorfeus_ferret_ext/stem_api.c +1 -9
- data/ext/isomorfeus_ferret_ext/stem_api.h +1 -3
- data/ext/isomorfeus_ferret_ext/stem_header.h +30 -26
- data/ext/isomorfeus_ferret_ext/stem_modules.h +113 -26
- data/ext/isomorfeus_ferret_ext/stem_modules.txt +18 -5
- data/ext/isomorfeus_ferret_ext/stem_utilities.c +167 -132
- data/ext/isomorfeus_ferret_ext/test.c +7 -1
- data/ext/isomorfeus_ferret_ext/test_search.c +0 -1
- data/lib/isomorfeus/ferret/version.rb +1 -1
- metadata +39 -4
- data/ext/isomorfeus_ferret_ext/q_parser.y +0 -1366
@@ -2,7 +2,7 @@
|
|
2
2
|
#include <stdlib.h> /* for calloc, free */
|
3
3
|
#include "stem_header.h"
|
4
4
|
|
5
|
-
extern struct SN_env * SN_create_env(int S_size, int I_size, int B_size)
|
5
|
+
extern struct SN_env * SN_create_env(int S_size, int I_size)
|
6
6
|
{
|
7
7
|
struct SN_env * z = (struct SN_env *) calloc(1, sizeof(struct SN_env));
|
8
8
|
if (z == NULL) return NULL;
|
@@ -27,12 +27,6 @@ extern struct SN_env * SN_create_env(int S_size, int I_size, int B_size)
|
|
27
27
|
if (z->I == NULL) goto error;
|
28
28
|
}
|
29
29
|
|
30
|
-
if (B_size)
|
31
|
-
{
|
32
|
-
z->B = (unsigned char *) calloc(B_size, sizeof(unsigned char));
|
33
|
-
if (z->B == NULL) goto error;
|
34
|
-
}
|
35
|
-
|
36
30
|
return z;
|
37
31
|
error:
|
38
32
|
SN_close_env(z, S_size);
|
@@ -52,7 +46,6 @@ extern void SN_close_env(struct SN_env * z, int S_size)
|
|
52
46
|
free(z->S);
|
53
47
|
}
|
54
48
|
free(z->I);
|
55
|
-
free(z->B);
|
56
49
|
if (z->p) lose_s(z->p);
|
57
50
|
free(z);
|
58
51
|
}
|
@@ -63,4 +56,3 @@ extern int SN_set_current(struct SN_env * z, int size, const symbol * s)
|
|
63
56
|
z->c = 0;
|
64
57
|
return err;
|
65
58
|
}
|
66
|
-
|
@@ -16,11 +16,9 @@ struct SN_env {
|
|
16
16
|
int c; int l; int lb; int bra; int ket;
|
17
17
|
symbol * * S;
|
18
18
|
int * I;
|
19
|
-
unsigned char * B;
|
20
19
|
};
|
21
20
|
|
22
|
-
extern struct SN_env * SN_create_env(int S_size, int I_size, int B_size);
|
21
|
+
extern struct SN_env * SN_create_env(int S_size, int I_size);
|
23
22
|
extern void SN_close_env(struct SN_env * z, int S_size);
|
24
23
|
|
25
24
|
extern int SN_set_current(struct SN_env * z, int size, const symbol * s);
|
26
|
-
|
@@ -20,38 +20,42 @@ struct among
|
|
20
20
|
int (* function)(struct SN_env *);
|
21
21
|
};
|
22
22
|
|
23
|
-
symbol * create_s(void);
|
24
|
-
void lose_s(symbol * p);
|
23
|
+
extern symbol * create_s(void);
|
24
|
+
extern void lose_s(symbol * p);
|
25
25
|
|
26
|
-
int skip_utf8(const symbol * p, int c, int
|
26
|
+
extern int skip_utf8(const symbol * p, int c, int limit, int n);
|
27
27
|
|
28
|
-
int
|
29
|
-
int in_grouping_b_U(struct SN_env * z, const unsigned char * s, int min, int max, int repeat);
|
30
|
-
int out_grouping_U(struct SN_env * z, const unsigned char * s, int min, int max, int repeat);
|
31
|
-
int out_grouping_b_U(struct SN_env * z, const unsigned char * s, int min, int max, int repeat);
|
28
|
+
extern int skip_b_utf8(const symbol * p, int c, int limit, int n);
|
32
29
|
|
33
|
-
int
|
34
|
-
int
|
35
|
-
int
|
36
|
-
int
|
30
|
+
extern int in_grouping_U(struct SN_env * z, const unsigned char * s, int min, int max, int repeat);
|
31
|
+
extern int in_grouping_b_U(struct SN_env * z, const unsigned char * s, int min, int max, int repeat);
|
32
|
+
extern int out_grouping_U(struct SN_env * z, const unsigned char * s, int min, int max, int repeat);
|
33
|
+
extern int out_grouping_b_U(struct SN_env * z, const unsigned char * s, int min, int max, int repeat);
|
37
34
|
|
38
|
-
int
|
39
|
-
int
|
40
|
-
int
|
41
|
-
int
|
35
|
+
extern int in_grouping(struct SN_env * z, const unsigned char * s, int min, int max, int repeat);
|
36
|
+
extern int in_grouping_b(struct SN_env * z, const unsigned char * s, int min, int max, int repeat);
|
37
|
+
extern int out_grouping(struct SN_env * z, const unsigned char * s, int min, int max, int repeat);
|
38
|
+
extern int out_grouping_b(struct SN_env * z, const unsigned char * s, int min, int max, int repeat);
|
42
39
|
|
43
|
-
int
|
44
|
-
int
|
40
|
+
extern int eq_s(struct SN_env * z, int s_size, const symbol * s);
|
41
|
+
extern int eq_s_b(struct SN_env * z, int s_size, const symbol * s);
|
42
|
+
extern int eq_v(struct SN_env * z, const symbol * p);
|
43
|
+
extern int eq_v_b(struct SN_env * z, const symbol * p);
|
45
44
|
|
46
|
-
int
|
47
|
-
int
|
48
|
-
int slice_from_v(struct SN_env * z, const symbol * p);
|
49
|
-
int slice_del(struct SN_env * z);
|
45
|
+
extern int find_among(struct SN_env * z, const struct among * v, int v_size);
|
46
|
+
extern int find_among_b(struct SN_env * z, const struct among * v, int v_size);
|
50
47
|
|
51
|
-
int
|
52
|
-
int
|
48
|
+
extern int replace_s(struct SN_env * z, int c_bra, int c_ket, int s_size, const symbol * s, int * adjustment);
|
49
|
+
extern int slice_from_s(struct SN_env * z, int s_size, const symbol * s);
|
50
|
+
extern int slice_from_v(struct SN_env * z, const symbol * p);
|
51
|
+
extern int slice_del(struct SN_env * z);
|
53
52
|
|
54
|
-
|
55
|
-
|
53
|
+
extern int insert_s(struct SN_env * z, int bra, int ket, int s_size, const symbol * s);
|
54
|
+
extern int insert_v(struct SN_env * z, int bra, int ket, const symbol * p);
|
56
55
|
|
57
|
-
|
56
|
+
extern symbol * slice_to(struct SN_env * z, symbol * p);
|
57
|
+
extern symbol * assign_to(struct SN_env * z, symbol * p);
|
58
|
+
|
59
|
+
extern int len_utf8(const symbol * p);
|
60
|
+
|
61
|
+
extern void debug(struct SN_env * z, int number, int line_count);
|
@@ -3,11 +3,19 @@
|
|
3
3
|
* This file is generated by mkmodules.pl from a list of module names.
|
4
4
|
* Do not edit manually.
|
5
5
|
*
|
6
|
-
* Modules included by this file are:
|
7
|
-
*
|
8
|
-
*
|
6
|
+
* Modules included by this file are: arabic, armenian, basque, catalan,
|
7
|
+
* danish, dutch, english, finnish, french, german, greek, hindi, hungarian,
|
8
|
+
* indonesian, irish, italian, lithuanian, nepali, norwegian, porter,
|
9
|
+
* portuguese, romanian, russian, serbian, spanish, swedish, tamil, turkish,
|
10
|
+
* yiddish
|
9
11
|
*/
|
10
12
|
|
13
|
+
#include "stem_UTF_8_arabic.h"
|
14
|
+
#include "stem_UTF_8_armenian.h"
|
15
|
+
#include "stem_ISO_8859_1_basque.h"
|
16
|
+
#include "stem_UTF_8_basque.h"
|
17
|
+
#include "stem_ISO_8859_1_catalan.h"
|
18
|
+
#include "stem_UTF_8_catalan.h"
|
11
19
|
#include "stem_ISO_8859_1_danish.h"
|
12
20
|
#include "stem_UTF_8_danish.h"
|
13
21
|
#include "stem_ISO_8859_1_dutch.h"
|
@@ -20,10 +28,18 @@
|
|
20
28
|
#include "stem_UTF_8_french.h"
|
21
29
|
#include "stem_ISO_8859_1_german.h"
|
22
30
|
#include "stem_UTF_8_german.h"
|
23
|
-
#include "
|
31
|
+
#include "stem_UTF_8_greek.h"
|
32
|
+
#include "stem_UTF_8_hindi.h"
|
33
|
+
#include "stem_ISO_8859_2_hungarian.h"
|
24
34
|
#include "stem_UTF_8_hungarian.h"
|
35
|
+
#include "stem_ISO_8859_1_indonesian.h"
|
36
|
+
#include "stem_UTF_8_indonesian.h"
|
37
|
+
#include "stem_ISO_8859_1_irish.h"
|
38
|
+
#include "stem_UTF_8_irish.h"
|
25
39
|
#include "stem_ISO_8859_1_italian.h"
|
26
40
|
#include "stem_UTF_8_italian.h"
|
41
|
+
#include "stem_UTF_8_lithuanian.h"
|
42
|
+
#include "stem_UTF_8_nepali.h"
|
27
43
|
#include "stem_ISO_8859_1_norwegian.h"
|
28
44
|
#include "stem_UTF_8_norwegian.h"
|
29
45
|
#include "stem_ISO_8859_1_porter.h"
|
@@ -34,11 +50,14 @@
|
|
34
50
|
#include "stem_UTF_8_romanian.h"
|
35
51
|
#include "stem_KOI8_R_russian.h"
|
36
52
|
#include "stem_UTF_8_russian.h"
|
53
|
+
#include "stem_UTF_8_serbian.h"
|
37
54
|
#include "stem_ISO_8859_1_spanish.h"
|
38
55
|
#include "stem_UTF_8_spanish.h"
|
39
56
|
#include "stem_ISO_8859_1_swedish.h"
|
40
57
|
#include "stem_UTF_8_swedish.h"
|
58
|
+
#include "stem_UTF_8_tamil.h"
|
41
59
|
#include "stem_UTF_8_turkish.h"
|
60
|
+
#include "stem_UTF_8_yiddish.h"
|
42
61
|
|
43
62
|
typedef enum {
|
44
63
|
ENC_UNKNOWN=0,
|
@@ -52,7 +71,7 @@ struct stemmer_encoding {
|
|
52
71
|
const char * name;
|
53
72
|
stemmer_encoding_t enc;
|
54
73
|
};
|
55
|
-
static struct stemmer_encoding encodings[] = {
|
74
|
+
static const struct stemmer_encoding encodings[] = {
|
56
75
|
{"ISO_8859_1", ENC_ISO_8859_1},
|
57
76
|
{"ISO_8859_2", ENC_ISO_8859_2},
|
58
77
|
{"KOI8_R", ENC_KOI8_R},
|
@@ -62,12 +81,27 @@ static struct stemmer_encoding encodings[] = {
|
|
62
81
|
|
63
82
|
struct stemmer_modules {
|
64
83
|
const char * name;
|
65
|
-
stemmer_encoding_t enc;
|
84
|
+
stemmer_encoding_t enc;
|
66
85
|
struct SN_env * (*create)(void);
|
67
86
|
void (*close)(struct SN_env *);
|
68
87
|
int (*stem)(struct SN_env *);
|
69
88
|
};
|
70
|
-
static struct stemmer_modules modules[] = {
|
89
|
+
static const struct stemmer_modules modules[] = {
|
90
|
+
{"ar", ENC_UTF_8, arabic_UTF_8_create_env, arabic_UTF_8_close_env, arabic_UTF_8_stem},
|
91
|
+
{"ara", ENC_UTF_8, arabic_UTF_8_create_env, arabic_UTF_8_close_env, arabic_UTF_8_stem},
|
92
|
+
{"arabic", ENC_UTF_8, arabic_UTF_8_create_env, arabic_UTF_8_close_env, arabic_UTF_8_stem},
|
93
|
+
{"arm", ENC_UTF_8, armenian_UTF_8_create_env, armenian_UTF_8_close_env, armenian_UTF_8_stem},
|
94
|
+
{"armenian", ENC_UTF_8, armenian_UTF_8_create_env, armenian_UTF_8_close_env, armenian_UTF_8_stem},
|
95
|
+
{"baq", ENC_ISO_8859_1, basque_ISO_8859_1_create_env, basque_ISO_8859_1_close_env, basque_ISO_8859_1_stem},
|
96
|
+
{"baq", ENC_UTF_8, basque_UTF_8_create_env, basque_UTF_8_close_env, basque_UTF_8_stem},
|
97
|
+
{"basque", ENC_ISO_8859_1, basque_ISO_8859_1_create_env, basque_ISO_8859_1_close_env, basque_ISO_8859_1_stem},
|
98
|
+
{"basque", ENC_UTF_8, basque_UTF_8_create_env, basque_UTF_8_close_env, basque_UTF_8_stem},
|
99
|
+
{"ca", ENC_ISO_8859_1, catalan_ISO_8859_1_create_env, catalan_ISO_8859_1_close_env, catalan_ISO_8859_1_stem},
|
100
|
+
{"ca", ENC_UTF_8, catalan_UTF_8_create_env, catalan_UTF_8_close_env, catalan_UTF_8_stem},
|
101
|
+
{"cat", ENC_ISO_8859_1, catalan_ISO_8859_1_create_env, catalan_ISO_8859_1_close_env, catalan_ISO_8859_1_stem},
|
102
|
+
{"cat", ENC_UTF_8, catalan_UTF_8_create_env, catalan_UTF_8_close_env, catalan_UTF_8_stem},
|
103
|
+
{"catalan", ENC_ISO_8859_1, catalan_ISO_8859_1_create_env, catalan_ISO_8859_1_close_env, catalan_ISO_8859_1_stem},
|
104
|
+
{"catalan", ENC_UTF_8, catalan_UTF_8_create_env, catalan_UTF_8_close_env, catalan_UTF_8_stem},
|
71
105
|
{"da", ENC_ISO_8859_1, danish_ISO_8859_1_create_env, danish_ISO_8859_1_close_env, danish_ISO_8859_1_stem},
|
72
106
|
{"da", ENC_UTF_8, danish_UTF_8_create_env, danish_UTF_8_close_env, danish_UTF_8_stem},
|
73
107
|
{"dan", ENC_ISO_8859_1, danish_ISO_8859_1_create_env, danish_ISO_8859_1_close_env, danish_ISO_8859_1_stem},
|
@@ -82,6 +116,8 @@ static struct stemmer_modules modules[] = {
|
|
82
116
|
{"dut", ENC_UTF_8, dutch_UTF_8_create_env, dutch_UTF_8_close_env, dutch_UTF_8_stem},
|
83
117
|
{"dutch", ENC_ISO_8859_1, dutch_ISO_8859_1_create_env, dutch_ISO_8859_1_close_env, dutch_ISO_8859_1_stem},
|
84
118
|
{"dutch", ENC_UTF_8, dutch_UTF_8_create_env, dutch_UTF_8_close_env, dutch_UTF_8_stem},
|
119
|
+
{"el", ENC_UTF_8, greek_UTF_8_create_env, greek_UTF_8_close_env, greek_UTF_8_stem},
|
120
|
+
{"ell", ENC_UTF_8, greek_UTF_8_create_env, greek_UTF_8_close_env, greek_UTF_8_stem},
|
85
121
|
{"en", ENC_ISO_8859_1, english_ISO_8859_1_create_env, english_ISO_8859_1_close_env, english_ISO_8859_1_stem},
|
86
122
|
{"en", ENC_UTF_8, english_UTF_8_create_env, english_UTF_8_close_env, english_UTF_8_stem},
|
87
123
|
{"eng", ENC_ISO_8859_1, english_ISO_8859_1_create_env, english_ISO_8859_1_close_env, english_ISO_8859_1_stem},
|
@@ -92,6 +128,10 @@ static struct stemmer_modules modules[] = {
|
|
92
128
|
{"es", ENC_UTF_8, spanish_UTF_8_create_env, spanish_UTF_8_close_env, spanish_UTF_8_stem},
|
93
129
|
{"esl", ENC_ISO_8859_1, spanish_ISO_8859_1_create_env, spanish_ISO_8859_1_close_env, spanish_ISO_8859_1_stem},
|
94
130
|
{"esl", ENC_UTF_8, spanish_UTF_8_create_env, spanish_UTF_8_close_env, spanish_UTF_8_stem},
|
131
|
+
{"eu", ENC_ISO_8859_1, basque_ISO_8859_1_create_env, basque_ISO_8859_1_close_env, basque_ISO_8859_1_stem},
|
132
|
+
{"eu", ENC_UTF_8, basque_UTF_8_create_env, basque_UTF_8_close_env, basque_UTF_8_stem},
|
133
|
+
{"eus", ENC_ISO_8859_1, basque_ISO_8859_1_create_env, basque_ISO_8859_1_close_env, basque_ISO_8859_1_stem},
|
134
|
+
{"eus", ENC_UTF_8, basque_UTF_8_create_env, basque_UTF_8_close_env, basque_UTF_8_stem},
|
95
135
|
{"fi", ENC_ISO_8859_1, finnish_ISO_8859_1_create_env, finnish_ISO_8859_1_close_env, finnish_ISO_8859_1_stem},
|
96
136
|
{"fi", ENC_UTF_8, finnish_UTF_8_create_env, finnish_UTF_8_close_env, finnish_UTF_8_stem},
|
97
137
|
{"fin", ENC_ISO_8859_1, finnish_ISO_8859_1_create_env, finnish_ISO_8859_1_close_env, finnish_ISO_8859_1_stem},
|
@@ -106,22 +146,47 @@ static struct stemmer_modules modules[] = {
|
|
106
146
|
{"fre", ENC_UTF_8, french_UTF_8_create_env, french_UTF_8_close_env, french_UTF_8_stem},
|
107
147
|
{"french", ENC_ISO_8859_1, french_ISO_8859_1_create_env, french_ISO_8859_1_close_env, french_ISO_8859_1_stem},
|
108
148
|
{"french", ENC_UTF_8, french_UTF_8_create_env, french_UTF_8_close_env, french_UTF_8_stem},
|
149
|
+
{"ga", ENC_ISO_8859_1, irish_ISO_8859_1_create_env, irish_ISO_8859_1_close_env, irish_ISO_8859_1_stem},
|
150
|
+
{"ga", ENC_UTF_8, irish_UTF_8_create_env, irish_UTF_8_close_env, irish_UTF_8_stem},
|
109
151
|
{"ger", ENC_ISO_8859_1, german_ISO_8859_1_create_env, german_ISO_8859_1_close_env, german_ISO_8859_1_stem},
|
110
152
|
{"ger", ENC_UTF_8, german_UTF_8_create_env, german_UTF_8_close_env, german_UTF_8_stem},
|
111
153
|
{"german", ENC_ISO_8859_1, german_ISO_8859_1_create_env, german_ISO_8859_1_close_env, german_ISO_8859_1_stem},
|
112
154
|
{"german", ENC_UTF_8, german_UTF_8_create_env, german_UTF_8_close_env, german_UTF_8_stem},
|
113
|
-
{"
|
155
|
+
{"gle", ENC_ISO_8859_1, irish_ISO_8859_1_create_env, irish_ISO_8859_1_close_env, irish_ISO_8859_1_stem},
|
156
|
+
{"gle", ENC_UTF_8, irish_UTF_8_create_env, irish_UTF_8_close_env, irish_UTF_8_stem},
|
157
|
+
{"gre", ENC_UTF_8, greek_UTF_8_create_env, greek_UTF_8_close_env, greek_UTF_8_stem},
|
158
|
+
{"greek", ENC_UTF_8, greek_UTF_8_create_env, greek_UTF_8_close_env, greek_UTF_8_stem},
|
159
|
+
{"hi", ENC_UTF_8, hindi_UTF_8_create_env, hindi_UTF_8_close_env, hindi_UTF_8_stem},
|
160
|
+
{"hin", ENC_UTF_8, hindi_UTF_8_create_env, hindi_UTF_8_close_env, hindi_UTF_8_stem},
|
161
|
+
{"hindi", ENC_UTF_8, hindi_UTF_8_create_env, hindi_UTF_8_close_env, hindi_UTF_8_stem},
|
162
|
+
{"hu", ENC_ISO_8859_2, hungarian_ISO_8859_2_create_env, hungarian_ISO_8859_2_close_env, hungarian_ISO_8859_2_stem},
|
114
163
|
{"hu", ENC_UTF_8, hungarian_UTF_8_create_env, hungarian_UTF_8_close_env, hungarian_UTF_8_stem},
|
115
|
-
{"hun",
|
164
|
+
{"hun", ENC_ISO_8859_2, hungarian_ISO_8859_2_create_env, hungarian_ISO_8859_2_close_env, hungarian_ISO_8859_2_stem},
|
116
165
|
{"hun", ENC_UTF_8, hungarian_UTF_8_create_env, hungarian_UTF_8_close_env, hungarian_UTF_8_stem},
|
117
|
-
{"hungarian",
|
166
|
+
{"hungarian", ENC_ISO_8859_2, hungarian_ISO_8859_2_create_env, hungarian_ISO_8859_2_close_env, hungarian_ISO_8859_2_stem},
|
118
167
|
{"hungarian", ENC_UTF_8, hungarian_UTF_8_create_env, hungarian_UTF_8_close_env, hungarian_UTF_8_stem},
|
168
|
+
{"hy", ENC_UTF_8, armenian_UTF_8_create_env, armenian_UTF_8_close_env, armenian_UTF_8_stem},
|
169
|
+
{"hye", ENC_UTF_8, armenian_UTF_8_create_env, armenian_UTF_8_close_env, armenian_UTF_8_stem},
|
170
|
+
{"id", ENC_ISO_8859_1, indonesian_ISO_8859_1_create_env, indonesian_ISO_8859_1_close_env, indonesian_ISO_8859_1_stem},
|
171
|
+
{"id", ENC_UTF_8, indonesian_UTF_8_create_env, indonesian_UTF_8_close_env, indonesian_UTF_8_stem},
|
172
|
+
{"ind", ENC_ISO_8859_1, indonesian_ISO_8859_1_create_env, indonesian_ISO_8859_1_close_env, indonesian_ISO_8859_1_stem},
|
173
|
+
{"ind", ENC_UTF_8, indonesian_UTF_8_create_env, indonesian_UTF_8_close_env, indonesian_UTF_8_stem},
|
174
|
+
{"indonesian", ENC_ISO_8859_1, indonesian_ISO_8859_1_create_env, indonesian_ISO_8859_1_close_env, indonesian_ISO_8859_1_stem},
|
175
|
+
{"indonesian", ENC_UTF_8, indonesian_UTF_8_create_env, indonesian_UTF_8_close_env, indonesian_UTF_8_stem},
|
176
|
+
{"irish", ENC_ISO_8859_1, irish_ISO_8859_1_create_env, irish_ISO_8859_1_close_env, irish_ISO_8859_1_stem},
|
177
|
+
{"irish", ENC_UTF_8, irish_UTF_8_create_env, irish_UTF_8_close_env, irish_UTF_8_stem},
|
119
178
|
{"it", ENC_ISO_8859_1, italian_ISO_8859_1_create_env, italian_ISO_8859_1_close_env, italian_ISO_8859_1_stem},
|
120
179
|
{"it", ENC_UTF_8, italian_UTF_8_create_env, italian_UTF_8_close_env, italian_UTF_8_stem},
|
121
180
|
{"ita", ENC_ISO_8859_1, italian_ISO_8859_1_create_env, italian_ISO_8859_1_close_env, italian_ISO_8859_1_stem},
|
122
181
|
{"ita", ENC_UTF_8, italian_UTF_8_create_env, italian_UTF_8_close_env, italian_UTF_8_stem},
|
123
182
|
{"italian", ENC_ISO_8859_1, italian_ISO_8859_1_create_env, italian_ISO_8859_1_close_env, italian_ISO_8859_1_stem},
|
124
183
|
{"italian", ENC_UTF_8, italian_UTF_8_create_env, italian_UTF_8_close_env, italian_UTF_8_stem},
|
184
|
+
{"lit", ENC_UTF_8, lithuanian_UTF_8_create_env, lithuanian_UTF_8_close_env, lithuanian_UTF_8_stem},
|
185
|
+
{"lithuanian", ENC_UTF_8, lithuanian_UTF_8_create_env, lithuanian_UTF_8_close_env, lithuanian_UTF_8_stem},
|
186
|
+
{"lt", ENC_UTF_8, lithuanian_UTF_8_create_env, lithuanian_UTF_8_close_env, lithuanian_UTF_8_stem},
|
187
|
+
{"ne", ENC_UTF_8, nepali_UTF_8_create_env, nepali_UTF_8_close_env, nepali_UTF_8_stem},
|
188
|
+
{"nep", ENC_UTF_8, nepali_UTF_8_create_env, nepali_UTF_8_close_env, nepali_UTF_8_stem},
|
189
|
+
{"nepali", ENC_UTF_8, nepali_UTF_8_create_env, nepali_UTF_8_close_env, nepali_UTF_8_stem},
|
125
190
|
{"nl", ENC_ISO_8859_1, dutch_ISO_8859_1_create_env, dutch_ISO_8859_1_close_env, dutch_ISO_8859_1_stem},
|
126
191
|
{"nl", ENC_UTF_8, dutch_UTF_8_create_env, dutch_UTF_8_close_env, dutch_UTF_8_stem},
|
127
192
|
{"nld", ENC_ISO_8859_1, dutch_ISO_8859_1_create_env, dutch_ISO_8859_1_close_env, dutch_ISO_8859_1_stem},
|
@@ -154,37 +219,59 @@ static struct stemmer_modules modules[] = {
|
|
154
219
|
{"rus", ENC_UTF_8, russian_UTF_8_create_env, russian_UTF_8_close_env, russian_UTF_8_stem},
|
155
220
|
{"russian", ENC_KOI8_R, russian_KOI8_R_create_env, russian_KOI8_R_close_env, russian_KOI8_R_stem},
|
156
221
|
{"russian", ENC_UTF_8, russian_UTF_8_create_env, russian_UTF_8_close_env, russian_UTF_8_stem},
|
222
|
+
{"serbian", ENC_UTF_8, serbian_UTF_8_create_env, serbian_UTF_8_close_env, serbian_UTF_8_stem},
|
157
223
|
{"spa", ENC_ISO_8859_1, spanish_ISO_8859_1_create_env, spanish_ISO_8859_1_close_env, spanish_ISO_8859_1_stem},
|
158
224
|
{"spa", ENC_UTF_8, spanish_UTF_8_create_env, spanish_UTF_8_close_env, spanish_UTF_8_stem},
|
159
225
|
{"spanish", ENC_ISO_8859_1, spanish_ISO_8859_1_create_env, spanish_ISO_8859_1_close_env, spanish_ISO_8859_1_stem},
|
160
226
|
{"spanish", ENC_UTF_8, spanish_UTF_8_create_env, spanish_UTF_8_close_env, spanish_UTF_8_stem},
|
227
|
+
{"sr", ENC_UTF_8, serbian_UTF_8_create_env, serbian_UTF_8_close_env, serbian_UTF_8_stem},
|
228
|
+
{"srp", ENC_UTF_8, serbian_UTF_8_create_env, serbian_UTF_8_close_env, serbian_UTF_8_stem},
|
161
229
|
{"sv", ENC_ISO_8859_1, swedish_ISO_8859_1_create_env, swedish_ISO_8859_1_close_env, swedish_ISO_8859_1_stem},
|
162
230
|
{"sv", ENC_UTF_8, swedish_UTF_8_create_env, swedish_UTF_8_close_env, swedish_UTF_8_stem},
|
163
231
|
{"swe", ENC_ISO_8859_1, swedish_ISO_8859_1_create_env, swedish_ISO_8859_1_close_env, swedish_ISO_8859_1_stem},
|
164
232
|
{"swe", ENC_UTF_8, swedish_UTF_8_create_env, swedish_UTF_8_close_env, swedish_UTF_8_stem},
|
165
233
|
{"swedish", ENC_ISO_8859_1, swedish_ISO_8859_1_create_env, swedish_ISO_8859_1_close_env, swedish_ISO_8859_1_stem},
|
166
234
|
{"swedish", ENC_UTF_8, swedish_UTF_8_create_env, swedish_UTF_8_close_env, swedish_UTF_8_stem},
|
235
|
+
{"ta", ENC_UTF_8, tamil_UTF_8_create_env, tamil_UTF_8_close_env, tamil_UTF_8_stem},
|
236
|
+
{"tam", ENC_UTF_8, tamil_UTF_8_create_env, tamil_UTF_8_close_env, tamil_UTF_8_stem},
|
237
|
+
{"tamil", ENC_UTF_8, tamil_UTF_8_create_env, tamil_UTF_8_close_env, tamil_UTF_8_stem},
|
167
238
|
{"tr", ENC_UTF_8, turkish_UTF_8_create_env, turkish_UTF_8_close_env, turkish_UTF_8_stem},
|
168
239
|
{"tur", ENC_UTF_8, turkish_UTF_8_create_env, turkish_UTF_8_close_env, turkish_UTF_8_stem},
|
169
240
|
{"turkish", ENC_UTF_8, turkish_UTF_8_create_env, turkish_UTF_8_close_env, turkish_UTF_8_stem},
|
241
|
+
{"yi", ENC_UTF_8, yiddish_UTF_8_create_env, yiddish_UTF_8_close_env, yiddish_UTF_8_stem},
|
242
|
+
{"yid", ENC_UTF_8, yiddish_UTF_8_create_env, yiddish_UTF_8_close_env, yiddish_UTF_8_stem},
|
243
|
+
{"yiddish", ENC_UTF_8, yiddish_UTF_8_create_env, yiddish_UTF_8_close_env, yiddish_UTF_8_stem},
|
170
244
|
{0,ENC_UNKNOWN,0,0,0}
|
171
245
|
};
|
172
246
|
static const char * algorithm_names[] = {
|
173
|
-
"
|
174
|
-
"
|
175
|
-
"
|
176
|
-
"
|
177
|
-
"
|
178
|
-
"
|
179
|
-
"
|
180
|
-
"
|
181
|
-
"
|
182
|
-
"
|
183
|
-
"
|
184
|
-
"
|
185
|
-
"
|
186
|
-
"
|
187
|
-
"
|
188
|
-
"
|
247
|
+
"arabic",
|
248
|
+
"armenian",
|
249
|
+
"basque",
|
250
|
+
"catalan",
|
251
|
+
"danish",
|
252
|
+
"dutch",
|
253
|
+
"english",
|
254
|
+
"finnish",
|
255
|
+
"french",
|
256
|
+
"german",
|
257
|
+
"greek",
|
258
|
+
"hindi",
|
259
|
+
"hungarian",
|
260
|
+
"indonesian",
|
261
|
+
"irish",
|
262
|
+
"italian",
|
263
|
+
"lithuanian",
|
264
|
+
"nepali",
|
265
|
+
"norwegian",
|
266
|
+
"porter",
|
267
|
+
"portuguese",
|
268
|
+
"romanian",
|
269
|
+
"russian",
|
270
|
+
"serbian",
|
271
|
+
"spanish",
|
272
|
+
"swedish",
|
273
|
+
"tamil",
|
274
|
+
"turkish",
|
275
|
+
"yiddish",
|
189
276
|
0
|
190
277
|
};
|
@@ -9,27 +9,40 @@
|
|
9
9
|
# List all the main algorithms for each language, in UTF-8, and also with
|
10
10
|
# the most commonly used encoding.
|
11
11
|
|
12
|
+
arabic UTF_8 arabic,ar,ara
|
13
|
+
armenian UTF_8 armenian,hy,hye,arm
|
14
|
+
basque UTF_8,ISO_8859_1 basque,eu,eus,baq
|
15
|
+
catalan UTF_8,ISO_8859_1 catalan,ca,cat
|
12
16
|
danish UTF_8,ISO_8859_1 danish,da,dan
|
13
17
|
dutch UTF_8,ISO_8859_1 dutch,nl,dut,nld
|
14
18
|
english UTF_8,ISO_8859_1 english,en,eng
|
15
19
|
finnish UTF_8,ISO_8859_1 finnish,fi,fin
|
16
20
|
french UTF_8,ISO_8859_1 french,fr,fre,fra
|
17
21
|
german UTF_8,ISO_8859_1 german,de,ger,deu
|
18
|
-
|
22
|
+
greek UTF_8 greek,el,gre,ell
|
23
|
+
hindi UTF_8 hindi,hi,hin
|
24
|
+
hungarian UTF_8,ISO_8859_2 hungarian,hu,hun
|
25
|
+
indonesian UTF_8,ISO_8859_1 indonesian,id,ind
|
26
|
+
irish UTF_8,ISO_8859_1 irish,ga,gle
|
19
27
|
italian UTF_8,ISO_8859_1 italian,it,ita
|
28
|
+
lithuanian UTF_8 lithuanian,lt,lit
|
29
|
+
nepali UTF_8 nepali,ne,nep
|
20
30
|
norwegian UTF_8,ISO_8859_1 norwegian,no,nor
|
21
31
|
portuguese UTF_8,ISO_8859_1 portuguese,pt,por
|
22
32
|
romanian UTF_8,ISO_8859_2 romanian,ro,rum,ron
|
23
33
|
russian UTF_8,KOI8_R russian,ru,rus
|
34
|
+
serbian UTF_8 serbian,sr,srp
|
24
35
|
spanish UTF_8,ISO_8859_1 spanish,es,esl,spa
|
25
36
|
swedish UTF_8,ISO_8859_1 swedish,sv,swe
|
37
|
+
tamil UTF_8 tamil,ta,tam
|
26
38
|
turkish UTF_8 turkish,tr,tur
|
39
|
+
yiddish UTF_8 yiddish,yi,yid
|
27
40
|
|
28
41
|
# Also include the traditional porter algorithm for english.
|
29
42
|
# The porter algorithm is included in the libstemmer distribution to assist
|
30
43
|
# with backwards compatibility, but for new systems the english algorithm
|
31
44
|
# should be used in preference.
|
32
|
-
porter UTF_8,ISO_8859_1 porter
|
45
|
+
porter UTF_8,ISO_8859_1 porter english
|
33
46
|
|
34
47
|
# Some other stemmers in the snowball project are not included in the standard
|
35
48
|
# distribution. To compile a libstemmer with them in, add them to this list,
|
@@ -39,12 +52,12 @@ porter UTF_8,ISO_8859_1 porter
|
|
39
52
|
# algorithms are:
|
40
53
|
#
|
41
54
|
# german2 - This is a slight modification of the german stemmer.
|
42
|
-
#german2 UTF_8,ISO_8859_1 german2
|
55
|
+
#german2 UTF_8,ISO_8859_1 german2 german
|
43
56
|
#
|
44
57
|
# kraaij_pohlmann - This is a different dutch stemmer.
|
45
|
-
#kraaij_pohlmann UTF_8,ISO_8859_1 kraaij_pohlmann
|
58
|
+
#kraaij_pohlmann UTF_8,ISO_8859_1 kraaij_pohlmann dutch
|
46
59
|
#
|
47
60
|
# lovins - This is an english stemmer, but fairly outdated, and
|
48
61
|
# only really applicable to a restricted type of input text
|
49
62
|
# (keywords in academic publications).
|
50
|
-
#lovins UTF_8,ISO_8859_1 lovins
|
63
|
+
#lovins UTF_8,ISO_8859_1 lovins english
|