isomorfeus-ferret 0.12.2 → 0.12.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/LICENSE +612 -612
- data/README.md +80 -44
- data/ext/isomorfeus_ferret_ext/bm_hash.c +9 -6
- data/ext/isomorfeus_ferret_ext/bm_micro_string.c +4 -2
- data/ext/isomorfeus_ferret_ext/frb_search.c +14 -2
- data/ext/isomorfeus_ferret_ext/frb_store.c +34 -5
- data/ext/isomorfeus_ferret_ext/frt_posh.h +11 -19
- data/ext/isomorfeus_ferret_ext/frt_q_parser.c +1844 -1911
- data/ext/isomorfeus_ferret_ext/frt_q_phrase.c +12 -15
- data/ext/isomorfeus_ferret_ext/frt_scanner.c +1 -0
- data/ext/isomorfeus_ferret_ext/frt_scanner_mb.c +1 -0
- data/ext/isomorfeus_ferret_ext/frt_scanner_utf8.c +1 -0
- data/ext/isomorfeus_ferret_ext/frt_search.h +1 -1
- data/ext/isomorfeus_ferret_ext/libstemmer.c +14 -11
- data/ext/isomorfeus_ferret_ext/libstemmer.h +4 -9
- data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_basque.c +1167 -0
- data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_basque.h +6 -0
- data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_catalan.c +1433 -0
- data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_catalan.h +6 -0
- data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_danish.c +120 -143
- data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_danish.h +1 -2
- data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_dutch.c +217 -237
- data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_dutch.h +1 -1
- data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_english.c +377 -432
- data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_english.h +1 -1
- data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_finnish.c +298 -342
- data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_finnish.h +1 -2
- data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_french.c +530 -524
- data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_french.h +1 -1
- data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_german.c +201 -214
- data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_german.h +1 -1
- data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_hungarian.c +1 -1
- data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_indonesian.c +394 -0
- data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_indonesian.h +6 -0
- data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_irish.c +457 -0
- data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_irish.h +6 -0
- data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_italian.c +396 -439
- data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_italian.h +1 -1
- data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_norwegian.c +104 -128
- data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_norwegian.h +1 -1
- data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_porter.c +242 -273
- data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_porter.h +1 -1
- data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_portuguese.c +406 -461
- data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_portuguese.h +1 -2
- data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_spanish.c +405 -456
- data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_spanish.h +1 -1
- data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_swedish.c +108 -126
- data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_swedish.h +1 -1
- data/ext/isomorfeus_ferret_ext/stem_ISO_8859_2_hungarian.c +849 -0
- data/ext/isomorfeus_ferret_ext/stem_ISO_8859_2_hungarian.h +6 -0
- data/ext/isomorfeus_ferret_ext/stem_ISO_8859_2_romanian.c +373 -405
- data/ext/isomorfeus_ferret_ext/stem_ISO_8859_2_romanian.h +1 -1
- data/ext/isomorfeus_ferret_ext/stem_KOI8_R_russian.c +288 -305
- data/ext/isomorfeus_ferret_ext/stem_KOI8_R_russian.h +1 -1
- data/ext/isomorfeus_ferret_ext/stem_UTF_8_arabic.c +1651 -0
- data/ext/isomorfeus_ferret_ext/stem_UTF_8_arabic.h +6 -0
- data/ext/isomorfeus_ferret_ext/stem_UTF_8_armenian.c +546 -0
- data/ext/isomorfeus_ferret_ext/stem_UTF_8_armenian.h +6 -0
- data/ext/isomorfeus_ferret_ext/stem_UTF_8_basque.c +1171 -0
- data/ext/isomorfeus_ferret_ext/stem_UTF_8_basque.h +6 -0
- data/ext/isomorfeus_ferret_ext/stem_UTF_8_catalan.c +1436 -0
- data/ext/isomorfeus_ferret_ext/stem_UTF_8_catalan.h +6 -0
- data/ext/isomorfeus_ferret_ext/stem_UTF_8_danish.c +121 -141
- data/ext/isomorfeus_ferret_ext/stem_UTF_8_danish.h +1 -1
- data/ext/isomorfeus_ferret_ext/stem_UTF_8_dutch.c +221 -241
- data/ext/isomorfeus_ferret_ext/stem_UTF_8_dutch.h +1 -1
- data/ext/isomorfeus_ferret_ext/stem_UTF_8_english.c +381 -431
- data/ext/isomorfeus_ferret_ext/stem_UTF_8_english.h +1 -1
- data/ext/isomorfeus_ferret_ext/stem_UTF_8_finnish.c +300 -345
- data/ext/isomorfeus_ferret_ext/stem_UTF_8_finnish.h +1 -1
- data/ext/isomorfeus_ferret_ext/stem_UTF_8_french.c +518 -511
- data/ext/isomorfeus_ferret_ext/stem_UTF_8_french.h +1 -1
- data/ext/isomorfeus_ferret_ext/stem_UTF_8_german.c +201 -209
- data/ext/isomorfeus_ferret_ext/stem_UTF_8_german.h +1 -1
- data/ext/isomorfeus_ferret_ext/stem_UTF_8_greek.c +3660 -0
- data/ext/isomorfeus_ferret_ext/stem_UTF_8_greek.h +6 -0
- data/ext/isomorfeus_ferret_ext/stem_UTF_8_hindi.c +309 -0
- data/ext/isomorfeus_ferret_ext/stem_UTF_8_hindi.h +6 -0
- data/ext/isomorfeus_ferret_ext/stem_UTF_8_hungarian.c +306 -671
- data/ext/isomorfeus_ferret_ext/stem_UTF_8_hungarian.h +1 -1
- data/ext/isomorfeus_ferret_ext/stem_UTF_8_indonesian.c +394 -0
- data/ext/isomorfeus_ferret_ext/stem_UTF_8_indonesian.h +6 -0
- data/ext/isomorfeus_ferret_ext/stem_UTF_8_irish.c +457 -0
- data/ext/isomorfeus_ferret_ext/stem_UTF_8_irish.h +6 -0
- data/ext/isomorfeus_ferret_ext/stem_UTF_8_italian.c +400 -442
- data/ext/isomorfeus_ferret_ext/stem_UTF_8_italian.h +1 -1
- data/ext/isomorfeus_ferret_ext/stem_UTF_8_lithuanian.c +824 -0
- data/ext/isomorfeus_ferret_ext/stem_UTF_8_lithuanian.h +6 -0
- data/ext/isomorfeus_ferret_ext/stem_UTF_8_nepali.c +408 -0
- data/ext/isomorfeus_ferret_ext/stem_UTF_8_nepali.h +6 -0
- data/ext/isomorfeus_ferret_ext/stem_UTF_8_norwegian.c +105 -127
- data/ext/isomorfeus_ferret_ext/stem_UTF_8_norwegian.h +1 -1
- data/ext/isomorfeus_ferret_ext/stem_UTF_8_porter.c +245 -276
- data/ext/isomorfeus_ferret_ext/stem_UTF_8_porter.h +1 -1
- data/ext/isomorfeus_ferret_ext/stem_UTF_8_portuguese.c +409 -464
- data/ext/isomorfeus_ferret_ext/stem_UTF_8_portuguese.h +1 -1
- data/ext/isomorfeus_ferret_ext/stem_UTF_8_romanian.c +376 -408
- data/ext/isomorfeus_ferret_ext/stem_UTF_8_romanian.h +1 -1
- data/ext/isomorfeus_ferret_ext/stem_UTF_8_russian.c +272 -287
- data/ext/isomorfeus_ferret_ext/stem_UTF_8_russian.h +1 -1
- data/ext/isomorfeus_ferret_ext/stem_UTF_8_serbian.c +6530 -0
- data/ext/isomorfeus_ferret_ext/stem_UTF_8_serbian.h +6 -0
- data/ext/isomorfeus_ferret_ext/stem_UTF_8_spanish.c +407 -458
- data/ext/isomorfeus_ferret_ext/stem_UTF_8_spanish.h +1 -1
- data/ext/isomorfeus_ferret_ext/stem_UTF_8_swedish.c +110 -125
- data/ext/isomorfeus_ferret_ext/stem_UTF_8_swedish.h +1 -1
- data/ext/isomorfeus_ferret_ext/stem_UTF_8_tamil.c +1865 -0
- data/ext/isomorfeus_ferret_ext/stem_UTF_8_tamil.h +6 -0
- data/ext/isomorfeus_ferret_ext/stem_UTF_8_turkish.c +698 -806
- data/ext/isomorfeus_ferret_ext/stem_UTF_8_turkish.h +1 -1
- data/ext/isomorfeus_ferret_ext/stem_UTF_8_yiddish.c +1220 -0
- data/ext/isomorfeus_ferret_ext/stem_UTF_8_yiddish.h +6 -0
- data/ext/isomorfeus_ferret_ext/stem_api.c +1 -9
- data/ext/isomorfeus_ferret_ext/stem_api.h +1 -3
- data/ext/isomorfeus_ferret_ext/stem_header.h +30 -26
- data/ext/isomorfeus_ferret_ext/stem_modules.h +113 -26
- data/ext/isomorfeus_ferret_ext/stem_modules.txt +18 -5
- data/ext/isomorfeus_ferret_ext/stem_utilities.c +167 -132
- data/ext/isomorfeus_ferret_ext/test.c +7 -1
- data/ext/isomorfeus_ferret_ext/test_search.c +0 -1
- data/lib/isomorfeus/ferret/index/index.rb +1 -1
- data/lib/isomorfeus/ferret/version.rb +1 -1
- metadata +43 -7
- data/ext/isomorfeus_ferret_ext/q_parser.y +0 -1366
@@ -2,7 +2,7 @@
|
|
2
2
|
#include <stdlib.h> /* for calloc, free */
|
3
3
|
#include "stem_header.h"
|
4
4
|
|
5
|
-
extern struct SN_env * SN_create_env(int S_size, int I_size, int B_size)
|
5
|
+
extern struct SN_env * SN_create_env(int S_size, int I_size)
|
6
6
|
{
|
7
7
|
struct SN_env * z = (struct SN_env *) calloc(1, sizeof(struct SN_env));
|
8
8
|
if (z == NULL) return NULL;
|
@@ -27,12 +27,6 @@ extern struct SN_env * SN_create_env(int S_size, int I_size, int B_size)
|
|
27
27
|
if (z->I == NULL) goto error;
|
28
28
|
}
|
29
29
|
|
30
|
-
if (B_size)
|
31
|
-
{
|
32
|
-
z->B = (unsigned char *) calloc(B_size, sizeof(unsigned char));
|
33
|
-
if (z->B == NULL) goto error;
|
34
|
-
}
|
35
|
-
|
36
30
|
return z;
|
37
31
|
error:
|
38
32
|
SN_close_env(z, S_size);
|
@@ -52,7 +46,6 @@ extern void SN_close_env(struct SN_env * z, int S_size)
|
|
52
46
|
free(z->S);
|
53
47
|
}
|
54
48
|
free(z->I);
|
55
|
-
free(z->B);
|
56
49
|
if (z->p) lose_s(z->p);
|
57
50
|
free(z);
|
58
51
|
}
|
@@ -63,4 +56,3 @@ extern int SN_set_current(struct SN_env * z, int size, const symbol * s)
|
|
63
56
|
z->c = 0;
|
64
57
|
return err;
|
65
58
|
}
|
66
|
-
|
@@ -16,11 +16,9 @@ struct SN_env {
|
|
16
16
|
int c; int l; int lb; int bra; int ket;
|
17
17
|
symbol * * S;
|
18
18
|
int * I;
|
19
|
-
unsigned char * B;
|
20
19
|
};
|
21
20
|
|
22
|
-
extern struct SN_env * SN_create_env(int S_size, int I_size, int B_size);
|
21
|
+
extern struct SN_env * SN_create_env(int S_size, int I_size);
|
23
22
|
extern void SN_close_env(struct SN_env * z, int S_size);
|
24
23
|
|
25
24
|
extern int SN_set_current(struct SN_env * z, int size, const symbol * s);
|
26
|
-
|
@@ -20,38 +20,42 @@ struct among
|
|
20
20
|
int (* function)(struct SN_env *);
|
21
21
|
};
|
22
22
|
|
23
|
-
symbol * create_s(void);
|
24
|
-
void lose_s(symbol * p);
|
23
|
+
extern symbol * create_s(void);
|
24
|
+
extern void lose_s(symbol * p);
|
25
25
|
|
26
|
-
int skip_utf8(const symbol * p, int c, int
|
26
|
+
extern int skip_utf8(const symbol * p, int c, int limit, int n);
|
27
27
|
|
28
|
-
int
|
29
|
-
int in_grouping_b_U(struct SN_env * z, const unsigned char * s, int min, int max, int repeat);
|
30
|
-
int out_grouping_U(struct SN_env * z, const unsigned char * s, int min, int max, int repeat);
|
31
|
-
int out_grouping_b_U(struct SN_env * z, const unsigned char * s, int min, int max, int repeat);
|
28
|
+
extern int skip_b_utf8(const symbol * p, int c, int limit, int n);
|
32
29
|
|
33
|
-
int
|
34
|
-
int
|
35
|
-
int
|
36
|
-
int
|
30
|
+
extern int in_grouping_U(struct SN_env * z, const unsigned char * s, int min, int max, int repeat);
|
31
|
+
extern int in_grouping_b_U(struct SN_env * z, const unsigned char * s, int min, int max, int repeat);
|
32
|
+
extern int out_grouping_U(struct SN_env * z, const unsigned char * s, int min, int max, int repeat);
|
33
|
+
extern int out_grouping_b_U(struct SN_env * z, const unsigned char * s, int min, int max, int repeat);
|
37
34
|
|
38
|
-
int
|
39
|
-
int
|
40
|
-
int
|
41
|
-
int
|
35
|
+
extern int in_grouping(struct SN_env * z, const unsigned char * s, int min, int max, int repeat);
|
36
|
+
extern int in_grouping_b(struct SN_env * z, const unsigned char * s, int min, int max, int repeat);
|
37
|
+
extern int out_grouping(struct SN_env * z, const unsigned char * s, int min, int max, int repeat);
|
38
|
+
extern int out_grouping_b(struct SN_env * z, const unsigned char * s, int min, int max, int repeat);
|
42
39
|
|
43
|
-
int
|
44
|
-
int
|
40
|
+
extern int eq_s(struct SN_env * z, int s_size, const symbol * s);
|
41
|
+
extern int eq_s_b(struct SN_env * z, int s_size, const symbol * s);
|
42
|
+
extern int eq_v(struct SN_env * z, const symbol * p);
|
43
|
+
extern int eq_v_b(struct SN_env * z, const symbol * p);
|
45
44
|
|
46
|
-
int
|
47
|
-
int
|
48
|
-
int slice_from_v(struct SN_env * z, const symbol * p);
|
49
|
-
int slice_del(struct SN_env * z);
|
45
|
+
extern int find_among(struct SN_env * z, const struct among * v, int v_size);
|
46
|
+
extern int find_among_b(struct SN_env * z, const struct among * v, int v_size);
|
50
47
|
|
51
|
-
int
|
52
|
-
int
|
48
|
+
extern int replace_s(struct SN_env * z, int c_bra, int c_ket, int s_size, const symbol * s, int * adjustment);
|
49
|
+
extern int slice_from_s(struct SN_env * z, int s_size, const symbol * s);
|
50
|
+
extern int slice_from_v(struct SN_env * z, const symbol * p);
|
51
|
+
extern int slice_del(struct SN_env * z);
|
53
52
|
|
54
|
-
|
55
|
-
|
53
|
+
extern int insert_s(struct SN_env * z, int bra, int ket, int s_size, const symbol * s);
|
54
|
+
extern int insert_v(struct SN_env * z, int bra, int ket, const symbol * p);
|
56
55
|
|
57
|
-
|
56
|
+
extern symbol * slice_to(struct SN_env * z, symbol * p);
|
57
|
+
extern symbol * assign_to(struct SN_env * z, symbol * p);
|
58
|
+
|
59
|
+
extern int len_utf8(const symbol * p);
|
60
|
+
|
61
|
+
extern void debug(struct SN_env * z, int number, int line_count);
|
@@ -3,11 +3,19 @@
|
|
3
3
|
* This file is generated by mkmodules.pl from a list of module names.
|
4
4
|
* Do not edit manually.
|
5
5
|
*
|
6
|
-
* Modules included by this file are:
|
7
|
-
*
|
8
|
-
*
|
6
|
+
* Modules included by this file are: arabic, armenian, basque, catalan,
|
7
|
+
* danish, dutch, english, finnish, french, german, greek, hindi, hungarian,
|
8
|
+
* indonesian, irish, italian, lithuanian, nepali, norwegian, porter,
|
9
|
+
* portuguese, romanian, russian, serbian, spanish, swedish, tamil, turkish,
|
10
|
+
* yiddish
|
9
11
|
*/
|
10
12
|
|
13
|
+
#include "stem_UTF_8_arabic.h"
|
14
|
+
#include "stem_UTF_8_armenian.h"
|
15
|
+
#include "stem_ISO_8859_1_basque.h"
|
16
|
+
#include "stem_UTF_8_basque.h"
|
17
|
+
#include "stem_ISO_8859_1_catalan.h"
|
18
|
+
#include "stem_UTF_8_catalan.h"
|
11
19
|
#include "stem_ISO_8859_1_danish.h"
|
12
20
|
#include "stem_UTF_8_danish.h"
|
13
21
|
#include "stem_ISO_8859_1_dutch.h"
|
@@ -20,10 +28,18 @@
|
|
20
28
|
#include "stem_UTF_8_french.h"
|
21
29
|
#include "stem_ISO_8859_1_german.h"
|
22
30
|
#include "stem_UTF_8_german.h"
|
23
|
-
#include "
|
31
|
+
#include "stem_UTF_8_greek.h"
|
32
|
+
#include "stem_UTF_8_hindi.h"
|
33
|
+
#include "stem_ISO_8859_2_hungarian.h"
|
24
34
|
#include "stem_UTF_8_hungarian.h"
|
35
|
+
#include "stem_ISO_8859_1_indonesian.h"
|
36
|
+
#include "stem_UTF_8_indonesian.h"
|
37
|
+
#include "stem_ISO_8859_1_irish.h"
|
38
|
+
#include "stem_UTF_8_irish.h"
|
25
39
|
#include "stem_ISO_8859_1_italian.h"
|
26
40
|
#include "stem_UTF_8_italian.h"
|
41
|
+
#include "stem_UTF_8_lithuanian.h"
|
42
|
+
#include "stem_UTF_8_nepali.h"
|
27
43
|
#include "stem_ISO_8859_1_norwegian.h"
|
28
44
|
#include "stem_UTF_8_norwegian.h"
|
29
45
|
#include "stem_ISO_8859_1_porter.h"
|
@@ -34,11 +50,14 @@
|
|
34
50
|
#include "stem_UTF_8_romanian.h"
|
35
51
|
#include "stem_KOI8_R_russian.h"
|
36
52
|
#include "stem_UTF_8_russian.h"
|
53
|
+
#include "stem_UTF_8_serbian.h"
|
37
54
|
#include "stem_ISO_8859_1_spanish.h"
|
38
55
|
#include "stem_UTF_8_spanish.h"
|
39
56
|
#include "stem_ISO_8859_1_swedish.h"
|
40
57
|
#include "stem_UTF_8_swedish.h"
|
58
|
+
#include "stem_UTF_8_tamil.h"
|
41
59
|
#include "stem_UTF_8_turkish.h"
|
60
|
+
#include "stem_UTF_8_yiddish.h"
|
42
61
|
|
43
62
|
typedef enum {
|
44
63
|
ENC_UNKNOWN=0,
|
@@ -52,7 +71,7 @@ struct stemmer_encoding {
|
|
52
71
|
const char * name;
|
53
72
|
stemmer_encoding_t enc;
|
54
73
|
};
|
55
|
-
static struct stemmer_encoding encodings[] = {
|
74
|
+
static const struct stemmer_encoding encodings[] = {
|
56
75
|
{"ISO_8859_1", ENC_ISO_8859_1},
|
57
76
|
{"ISO_8859_2", ENC_ISO_8859_2},
|
58
77
|
{"KOI8_R", ENC_KOI8_R},
|
@@ -62,12 +81,27 @@ static struct stemmer_encoding encodings[] = {
|
|
62
81
|
|
63
82
|
struct stemmer_modules {
|
64
83
|
const char * name;
|
65
|
-
stemmer_encoding_t enc;
|
84
|
+
stemmer_encoding_t enc;
|
66
85
|
struct SN_env * (*create)(void);
|
67
86
|
void (*close)(struct SN_env *);
|
68
87
|
int (*stem)(struct SN_env *);
|
69
88
|
};
|
70
|
-
static struct stemmer_modules modules[] = {
|
89
|
+
static const struct stemmer_modules modules[] = {
|
90
|
+
{"ar", ENC_UTF_8, arabic_UTF_8_create_env, arabic_UTF_8_close_env, arabic_UTF_8_stem},
|
91
|
+
{"ara", ENC_UTF_8, arabic_UTF_8_create_env, arabic_UTF_8_close_env, arabic_UTF_8_stem},
|
92
|
+
{"arabic", ENC_UTF_8, arabic_UTF_8_create_env, arabic_UTF_8_close_env, arabic_UTF_8_stem},
|
93
|
+
{"arm", ENC_UTF_8, armenian_UTF_8_create_env, armenian_UTF_8_close_env, armenian_UTF_8_stem},
|
94
|
+
{"armenian", ENC_UTF_8, armenian_UTF_8_create_env, armenian_UTF_8_close_env, armenian_UTF_8_stem},
|
95
|
+
{"baq", ENC_ISO_8859_1, basque_ISO_8859_1_create_env, basque_ISO_8859_1_close_env, basque_ISO_8859_1_stem},
|
96
|
+
{"baq", ENC_UTF_8, basque_UTF_8_create_env, basque_UTF_8_close_env, basque_UTF_8_stem},
|
97
|
+
{"basque", ENC_ISO_8859_1, basque_ISO_8859_1_create_env, basque_ISO_8859_1_close_env, basque_ISO_8859_1_stem},
|
98
|
+
{"basque", ENC_UTF_8, basque_UTF_8_create_env, basque_UTF_8_close_env, basque_UTF_8_stem},
|
99
|
+
{"ca", ENC_ISO_8859_1, catalan_ISO_8859_1_create_env, catalan_ISO_8859_1_close_env, catalan_ISO_8859_1_stem},
|
100
|
+
{"ca", ENC_UTF_8, catalan_UTF_8_create_env, catalan_UTF_8_close_env, catalan_UTF_8_stem},
|
101
|
+
{"cat", ENC_ISO_8859_1, catalan_ISO_8859_1_create_env, catalan_ISO_8859_1_close_env, catalan_ISO_8859_1_stem},
|
102
|
+
{"cat", ENC_UTF_8, catalan_UTF_8_create_env, catalan_UTF_8_close_env, catalan_UTF_8_stem},
|
103
|
+
{"catalan", ENC_ISO_8859_1, catalan_ISO_8859_1_create_env, catalan_ISO_8859_1_close_env, catalan_ISO_8859_1_stem},
|
104
|
+
{"catalan", ENC_UTF_8, catalan_UTF_8_create_env, catalan_UTF_8_close_env, catalan_UTF_8_stem},
|
71
105
|
{"da", ENC_ISO_8859_1, danish_ISO_8859_1_create_env, danish_ISO_8859_1_close_env, danish_ISO_8859_1_stem},
|
72
106
|
{"da", ENC_UTF_8, danish_UTF_8_create_env, danish_UTF_8_close_env, danish_UTF_8_stem},
|
73
107
|
{"dan", ENC_ISO_8859_1, danish_ISO_8859_1_create_env, danish_ISO_8859_1_close_env, danish_ISO_8859_1_stem},
|
@@ -82,6 +116,8 @@ static struct stemmer_modules modules[] = {
|
|
82
116
|
{"dut", ENC_UTF_8, dutch_UTF_8_create_env, dutch_UTF_8_close_env, dutch_UTF_8_stem},
|
83
117
|
{"dutch", ENC_ISO_8859_1, dutch_ISO_8859_1_create_env, dutch_ISO_8859_1_close_env, dutch_ISO_8859_1_stem},
|
84
118
|
{"dutch", ENC_UTF_8, dutch_UTF_8_create_env, dutch_UTF_8_close_env, dutch_UTF_8_stem},
|
119
|
+
{"el", ENC_UTF_8, greek_UTF_8_create_env, greek_UTF_8_close_env, greek_UTF_8_stem},
|
120
|
+
{"ell", ENC_UTF_8, greek_UTF_8_create_env, greek_UTF_8_close_env, greek_UTF_8_stem},
|
85
121
|
{"en", ENC_ISO_8859_1, english_ISO_8859_1_create_env, english_ISO_8859_1_close_env, english_ISO_8859_1_stem},
|
86
122
|
{"en", ENC_UTF_8, english_UTF_8_create_env, english_UTF_8_close_env, english_UTF_8_stem},
|
87
123
|
{"eng", ENC_ISO_8859_1, english_ISO_8859_1_create_env, english_ISO_8859_1_close_env, english_ISO_8859_1_stem},
|
@@ -92,6 +128,10 @@ static struct stemmer_modules modules[] = {
|
|
92
128
|
{"es", ENC_UTF_8, spanish_UTF_8_create_env, spanish_UTF_8_close_env, spanish_UTF_8_stem},
|
93
129
|
{"esl", ENC_ISO_8859_1, spanish_ISO_8859_1_create_env, spanish_ISO_8859_1_close_env, spanish_ISO_8859_1_stem},
|
94
130
|
{"esl", ENC_UTF_8, spanish_UTF_8_create_env, spanish_UTF_8_close_env, spanish_UTF_8_stem},
|
131
|
+
{"eu", ENC_ISO_8859_1, basque_ISO_8859_1_create_env, basque_ISO_8859_1_close_env, basque_ISO_8859_1_stem},
|
132
|
+
{"eu", ENC_UTF_8, basque_UTF_8_create_env, basque_UTF_8_close_env, basque_UTF_8_stem},
|
133
|
+
{"eus", ENC_ISO_8859_1, basque_ISO_8859_1_create_env, basque_ISO_8859_1_close_env, basque_ISO_8859_1_stem},
|
134
|
+
{"eus", ENC_UTF_8, basque_UTF_8_create_env, basque_UTF_8_close_env, basque_UTF_8_stem},
|
95
135
|
{"fi", ENC_ISO_8859_1, finnish_ISO_8859_1_create_env, finnish_ISO_8859_1_close_env, finnish_ISO_8859_1_stem},
|
96
136
|
{"fi", ENC_UTF_8, finnish_UTF_8_create_env, finnish_UTF_8_close_env, finnish_UTF_8_stem},
|
97
137
|
{"fin", ENC_ISO_8859_1, finnish_ISO_8859_1_create_env, finnish_ISO_8859_1_close_env, finnish_ISO_8859_1_stem},
|
@@ -106,22 +146,47 @@ static struct stemmer_modules modules[] = {
|
|
106
146
|
{"fre", ENC_UTF_8, french_UTF_8_create_env, french_UTF_8_close_env, french_UTF_8_stem},
|
107
147
|
{"french", ENC_ISO_8859_1, french_ISO_8859_1_create_env, french_ISO_8859_1_close_env, french_ISO_8859_1_stem},
|
108
148
|
{"french", ENC_UTF_8, french_UTF_8_create_env, french_UTF_8_close_env, french_UTF_8_stem},
|
149
|
+
{"ga", ENC_ISO_8859_1, irish_ISO_8859_1_create_env, irish_ISO_8859_1_close_env, irish_ISO_8859_1_stem},
|
150
|
+
{"ga", ENC_UTF_8, irish_UTF_8_create_env, irish_UTF_8_close_env, irish_UTF_8_stem},
|
109
151
|
{"ger", ENC_ISO_8859_1, german_ISO_8859_1_create_env, german_ISO_8859_1_close_env, german_ISO_8859_1_stem},
|
110
152
|
{"ger", ENC_UTF_8, german_UTF_8_create_env, german_UTF_8_close_env, german_UTF_8_stem},
|
111
153
|
{"german", ENC_ISO_8859_1, german_ISO_8859_1_create_env, german_ISO_8859_1_close_env, german_ISO_8859_1_stem},
|
112
154
|
{"german", ENC_UTF_8, german_UTF_8_create_env, german_UTF_8_close_env, german_UTF_8_stem},
|
113
|
-
{"
|
155
|
+
{"gle", ENC_ISO_8859_1, irish_ISO_8859_1_create_env, irish_ISO_8859_1_close_env, irish_ISO_8859_1_stem},
|
156
|
+
{"gle", ENC_UTF_8, irish_UTF_8_create_env, irish_UTF_8_close_env, irish_UTF_8_stem},
|
157
|
+
{"gre", ENC_UTF_8, greek_UTF_8_create_env, greek_UTF_8_close_env, greek_UTF_8_stem},
|
158
|
+
{"greek", ENC_UTF_8, greek_UTF_8_create_env, greek_UTF_8_close_env, greek_UTF_8_stem},
|
159
|
+
{"hi", ENC_UTF_8, hindi_UTF_8_create_env, hindi_UTF_8_close_env, hindi_UTF_8_stem},
|
160
|
+
{"hin", ENC_UTF_8, hindi_UTF_8_create_env, hindi_UTF_8_close_env, hindi_UTF_8_stem},
|
161
|
+
{"hindi", ENC_UTF_8, hindi_UTF_8_create_env, hindi_UTF_8_close_env, hindi_UTF_8_stem},
|
162
|
+
{"hu", ENC_ISO_8859_2, hungarian_ISO_8859_2_create_env, hungarian_ISO_8859_2_close_env, hungarian_ISO_8859_2_stem},
|
114
163
|
{"hu", ENC_UTF_8, hungarian_UTF_8_create_env, hungarian_UTF_8_close_env, hungarian_UTF_8_stem},
|
115
|
-
{"hun",
|
164
|
+
{"hun", ENC_ISO_8859_2, hungarian_ISO_8859_2_create_env, hungarian_ISO_8859_2_close_env, hungarian_ISO_8859_2_stem},
|
116
165
|
{"hun", ENC_UTF_8, hungarian_UTF_8_create_env, hungarian_UTF_8_close_env, hungarian_UTF_8_stem},
|
117
|
-
{"hungarian",
|
166
|
+
{"hungarian", ENC_ISO_8859_2, hungarian_ISO_8859_2_create_env, hungarian_ISO_8859_2_close_env, hungarian_ISO_8859_2_stem},
|
118
167
|
{"hungarian", ENC_UTF_8, hungarian_UTF_8_create_env, hungarian_UTF_8_close_env, hungarian_UTF_8_stem},
|
168
|
+
{"hy", ENC_UTF_8, armenian_UTF_8_create_env, armenian_UTF_8_close_env, armenian_UTF_8_stem},
|
169
|
+
{"hye", ENC_UTF_8, armenian_UTF_8_create_env, armenian_UTF_8_close_env, armenian_UTF_8_stem},
|
170
|
+
{"id", ENC_ISO_8859_1, indonesian_ISO_8859_1_create_env, indonesian_ISO_8859_1_close_env, indonesian_ISO_8859_1_stem},
|
171
|
+
{"id", ENC_UTF_8, indonesian_UTF_8_create_env, indonesian_UTF_8_close_env, indonesian_UTF_8_stem},
|
172
|
+
{"ind", ENC_ISO_8859_1, indonesian_ISO_8859_1_create_env, indonesian_ISO_8859_1_close_env, indonesian_ISO_8859_1_stem},
|
173
|
+
{"ind", ENC_UTF_8, indonesian_UTF_8_create_env, indonesian_UTF_8_close_env, indonesian_UTF_8_stem},
|
174
|
+
{"indonesian", ENC_ISO_8859_1, indonesian_ISO_8859_1_create_env, indonesian_ISO_8859_1_close_env, indonesian_ISO_8859_1_stem},
|
175
|
+
{"indonesian", ENC_UTF_8, indonesian_UTF_8_create_env, indonesian_UTF_8_close_env, indonesian_UTF_8_stem},
|
176
|
+
{"irish", ENC_ISO_8859_1, irish_ISO_8859_1_create_env, irish_ISO_8859_1_close_env, irish_ISO_8859_1_stem},
|
177
|
+
{"irish", ENC_UTF_8, irish_UTF_8_create_env, irish_UTF_8_close_env, irish_UTF_8_stem},
|
119
178
|
{"it", ENC_ISO_8859_1, italian_ISO_8859_1_create_env, italian_ISO_8859_1_close_env, italian_ISO_8859_1_stem},
|
120
179
|
{"it", ENC_UTF_8, italian_UTF_8_create_env, italian_UTF_8_close_env, italian_UTF_8_stem},
|
121
180
|
{"ita", ENC_ISO_8859_1, italian_ISO_8859_1_create_env, italian_ISO_8859_1_close_env, italian_ISO_8859_1_stem},
|
122
181
|
{"ita", ENC_UTF_8, italian_UTF_8_create_env, italian_UTF_8_close_env, italian_UTF_8_stem},
|
123
182
|
{"italian", ENC_ISO_8859_1, italian_ISO_8859_1_create_env, italian_ISO_8859_1_close_env, italian_ISO_8859_1_stem},
|
124
183
|
{"italian", ENC_UTF_8, italian_UTF_8_create_env, italian_UTF_8_close_env, italian_UTF_8_stem},
|
184
|
+
{"lit", ENC_UTF_8, lithuanian_UTF_8_create_env, lithuanian_UTF_8_close_env, lithuanian_UTF_8_stem},
|
185
|
+
{"lithuanian", ENC_UTF_8, lithuanian_UTF_8_create_env, lithuanian_UTF_8_close_env, lithuanian_UTF_8_stem},
|
186
|
+
{"lt", ENC_UTF_8, lithuanian_UTF_8_create_env, lithuanian_UTF_8_close_env, lithuanian_UTF_8_stem},
|
187
|
+
{"ne", ENC_UTF_8, nepali_UTF_8_create_env, nepali_UTF_8_close_env, nepali_UTF_8_stem},
|
188
|
+
{"nep", ENC_UTF_8, nepali_UTF_8_create_env, nepali_UTF_8_close_env, nepali_UTF_8_stem},
|
189
|
+
{"nepali", ENC_UTF_8, nepali_UTF_8_create_env, nepali_UTF_8_close_env, nepali_UTF_8_stem},
|
125
190
|
{"nl", ENC_ISO_8859_1, dutch_ISO_8859_1_create_env, dutch_ISO_8859_1_close_env, dutch_ISO_8859_1_stem},
|
126
191
|
{"nl", ENC_UTF_8, dutch_UTF_8_create_env, dutch_UTF_8_close_env, dutch_UTF_8_stem},
|
127
192
|
{"nld", ENC_ISO_8859_1, dutch_ISO_8859_1_create_env, dutch_ISO_8859_1_close_env, dutch_ISO_8859_1_stem},
|
@@ -154,37 +219,59 @@ static struct stemmer_modules modules[] = {
|
|
154
219
|
{"rus", ENC_UTF_8, russian_UTF_8_create_env, russian_UTF_8_close_env, russian_UTF_8_stem},
|
155
220
|
{"russian", ENC_KOI8_R, russian_KOI8_R_create_env, russian_KOI8_R_close_env, russian_KOI8_R_stem},
|
156
221
|
{"russian", ENC_UTF_8, russian_UTF_8_create_env, russian_UTF_8_close_env, russian_UTF_8_stem},
|
222
|
+
{"serbian", ENC_UTF_8, serbian_UTF_8_create_env, serbian_UTF_8_close_env, serbian_UTF_8_stem},
|
157
223
|
{"spa", ENC_ISO_8859_1, spanish_ISO_8859_1_create_env, spanish_ISO_8859_1_close_env, spanish_ISO_8859_1_stem},
|
158
224
|
{"spa", ENC_UTF_8, spanish_UTF_8_create_env, spanish_UTF_8_close_env, spanish_UTF_8_stem},
|
159
225
|
{"spanish", ENC_ISO_8859_1, spanish_ISO_8859_1_create_env, spanish_ISO_8859_1_close_env, spanish_ISO_8859_1_stem},
|
160
226
|
{"spanish", ENC_UTF_8, spanish_UTF_8_create_env, spanish_UTF_8_close_env, spanish_UTF_8_stem},
|
227
|
+
{"sr", ENC_UTF_8, serbian_UTF_8_create_env, serbian_UTF_8_close_env, serbian_UTF_8_stem},
|
228
|
+
{"srp", ENC_UTF_8, serbian_UTF_8_create_env, serbian_UTF_8_close_env, serbian_UTF_8_stem},
|
161
229
|
{"sv", ENC_ISO_8859_1, swedish_ISO_8859_1_create_env, swedish_ISO_8859_1_close_env, swedish_ISO_8859_1_stem},
|
162
230
|
{"sv", ENC_UTF_8, swedish_UTF_8_create_env, swedish_UTF_8_close_env, swedish_UTF_8_stem},
|
163
231
|
{"swe", ENC_ISO_8859_1, swedish_ISO_8859_1_create_env, swedish_ISO_8859_1_close_env, swedish_ISO_8859_1_stem},
|
164
232
|
{"swe", ENC_UTF_8, swedish_UTF_8_create_env, swedish_UTF_8_close_env, swedish_UTF_8_stem},
|
165
233
|
{"swedish", ENC_ISO_8859_1, swedish_ISO_8859_1_create_env, swedish_ISO_8859_1_close_env, swedish_ISO_8859_1_stem},
|
166
234
|
{"swedish", ENC_UTF_8, swedish_UTF_8_create_env, swedish_UTF_8_close_env, swedish_UTF_8_stem},
|
235
|
+
{"ta", ENC_UTF_8, tamil_UTF_8_create_env, tamil_UTF_8_close_env, tamil_UTF_8_stem},
|
236
|
+
{"tam", ENC_UTF_8, tamil_UTF_8_create_env, tamil_UTF_8_close_env, tamil_UTF_8_stem},
|
237
|
+
{"tamil", ENC_UTF_8, tamil_UTF_8_create_env, tamil_UTF_8_close_env, tamil_UTF_8_stem},
|
167
238
|
{"tr", ENC_UTF_8, turkish_UTF_8_create_env, turkish_UTF_8_close_env, turkish_UTF_8_stem},
|
168
239
|
{"tur", ENC_UTF_8, turkish_UTF_8_create_env, turkish_UTF_8_close_env, turkish_UTF_8_stem},
|
169
240
|
{"turkish", ENC_UTF_8, turkish_UTF_8_create_env, turkish_UTF_8_close_env, turkish_UTF_8_stem},
|
241
|
+
{"yi", ENC_UTF_8, yiddish_UTF_8_create_env, yiddish_UTF_8_close_env, yiddish_UTF_8_stem},
|
242
|
+
{"yid", ENC_UTF_8, yiddish_UTF_8_create_env, yiddish_UTF_8_close_env, yiddish_UTF_8_stem},
|
243
|
+
{"yiddish", ENC_UTF_8, yiddish_UTF_8_create_env, yiddish_UTF_8_close_env, yiddish_UTF_8_stem},
|
170
244
|
{0,ENC_UNKNOWN,0,0,0}
|
171
245
|
};
|
172
246
|
static const char * algorithm_names[] = {
|
173
|
-
"
|
174
|
-
"
|
175
|
-
"
|
176
|
-
"
|
177
|
-
"
|
178
|
-
"
|
179
|
-
"
|
180
|
-
"
|
181
|
-
"
|
182
|
-
"
|
183
|
-
"
|
184
|
-
"
|
185
|
-
"
|
186
|
-
"
|
187
|
-
"
|
188
|
-
"
|
247
|
+
"arabic",
|
248
|
+
"armenian",
|
249
|
+
"basque",
|
250
|
+
"catalan",
|
251
|
+
"danish",
|
252
|
+
"dutch",
|
253
|
+
"english",
|
254
|
+
"finnish",
|
255
|
+
"french",
|
256
|
+
"german",
|
257
|
+
"greek",
|
258
|
+
"hindi",
|
259
|
+
"hungarian",
|
260
|
+
"indonesian",
|
261
|
+
"irish",
|
262
|
+
"italian",
|
263
|
+
"lithuanian",
|
264
|
+
"nepali",
|
265
|
+
"norwegian",
|
266
|
+
"porter",
|
267
|
+
"portuguese",
|
268
|
+
"romanian",
|
269
|
+
"russian",
|
270
|
+
"serbian",
|
271
|
+
"spanish",
|
272
|
+
"swedish",
|
273
|
+
"tamil",
|
274
|
+
"turkish",
|
275
|
+
"yiddish",
|
189
276
|
0
|
190
277
|
};
|
@@ -9,27 +9,40 @@
|
|
9
9
|
# List all the main algorithms for each language, in UTF-8, and also with
|
10
10
|
# the most commonly used encoding.
|
11
11
|
|
12
|
+
arabic UTF_8 arabic,ar,ara
|
13
|
+
armenian UTF_8 armenian,hy,hye,arm
|
14
|
+
basque UTF_8,ISO_8859_1 basque,eu,eus,baq
|
15
|
+
catalan UTF_8,ISO_8859_1 catalan,ca,cat
|
12
16
|
danish UTF_8,ISO_8859_1 danish,da,dan
|
13
17
|
dutch UTF_8,ISO_8859_1 dutch,nl,dut,nld
|
14
18
|
english UTF_8,ISO_8859_1 english,en,eng
|
15
19
|
finnish UTF_8,ISO_8859_1 finnish,fi,fin
|
16
20
|
french UTF_8,ISO_8859_1 french,fr,fre,fra
|
17
21
|
german UTF_8,ISO_8859_1 german,de,ger,deu
|
18
|
-
|
22
|
+
greek UTF_8 greek,el,gre,ell
|
23
|
+
hindi UTF_8 hindi,hi,hin
|
24
|
+
hungarian UTF_8,ISO_8859_2 hungarian,hu,hun
|
25
|
+
indonesian UTF_8,ISO_8859_1 indonesian,id,ind
|
26
|
+
irish UTF_8,ISO_8859_1 irish,ga,gle
|
19
27
|
italian UTF_8,ISO_8859_1 italian,it,ita
|
28
|
+
lithuanian UTF_8 lithuanian,lt,lit
|
29
|
+
nepali UTF_8 nepali,ne,nep
|
20
30
|
norwegian UTF_8,ISO_8859_1 norwegian,no,nor
|
21
31
|
portuguese UTF_8,ISO_8859_1 portuguese,pt,por
|
22
32
|
romanian UTF_8,ISO_8859_2 romanian,ro,rum,ron
|
23
33
|
russian UTF_8,KOI8_R russian,ru,rus
|
34
|
+
serbian UTF_8 serbian,sr,srp
|
24
35
|
spanish UTF_8,ISO_8859_1 spanish,es,esl,spa
|
25
36
|
swedish UTF_8,ISO_8859_1 swedish,sv,swe
|
37
|
+
tamil UTF_8 tamil,ta,tam
|
26
38
|
turkish UTF_8 turkish,tr,tur
|
39
|
+
yiddish UTF_8 yiddish,yi,yid
|
27
40
|
|
28
41
|
# Also include the traditional porter algorithm for english.
|
29
42
|
# The porter algorithm is included in the libstemmer distribution to assist
|
30
43
|
# with backwards compatibility, but for new systems the english algorithm
|
31
44
|
# should be used in preference.
|
32
|
-
porter UTF_8,ISO_8859_1 porter
|
45
|
+
porter UTF_8,ISO_8859_1 porter english
|
33
46
|
|
34
47
|
# Some other stemmers in the snowball project are not included in the standard
|
35
48
|
# distribution. To compile a libstemmer with them in, add them to this list,
|
@@ -39,12 +52,12 @@ porter UTF_8,ISO_8859_1 porter
|
|
39
52
|
# algorithms are:
|
40
53
|
#
|
41
54
|
# german2 - This is a slight modification of the german stemmer.
|
42
|
-
#german2 UTF_8,ISO_8859_1 german2
|
55
|
+
#german2 UTF_8,ISO_8859_1 german2 german
|
43
56
|
#
|
44
57
|
# kraaij_pohlmann - This is a different dutch stemmer.
|
45
|
-
#kraaij_pohlmann UTF_8,ISO_8859_1 kraaij_pohlmann
|
58
|
+
#kraaij_pohlmann UTF_8,ISO_8859_1 kraaij_pohlmann dutch
|
46
59
|
#
|
47
60
|
# lovins - This is an english stemmer, but fairly outdated, and
|
48
61
|
# only really applicable to a restricted type of input text
|
49
62
|
# (keywords in academic publications).
|
50
|
-
#lovins UTF_8,ISO_8859_1 lovins
|
63
|
+
#lovins UTF_8,ISO_8859_1 lovins english
|