stemmer4r 0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/CVS/Entries +5 -0
- data/CVS/Repository +1 -0
- data/CVS/Root +1 -0
- data/LICENSE +20 -0
- data/README +9 -0
- data/ext/CVS/Entries +1 -0
- data/ext/CVS/Repository +1 -0
- data/ext/CVS/Root +1 -0
- data/ext/stemmer4r/CVS/Entries +4 -0
- data/ext/stemmer4r/CVS/Repository +1 -0
- data/ext/stemmer4r/CVS/Root +1 -0
- data/ext/stemmer4r/depend +14 -0
- data/ext/stemmer4r/extconf.rb +8 -0
- data/ext/stemmer4r/libstemmer_c/CVS/Entries +7 -0
- data/ext/stemmer4r/libstemmer_c/CVS/Repository +1 -0
- data/ext/stemmer4r/libstemmer_c/CVS/Root +1 -0
- data/ext/stemmer4r/libstemmer_c/MANIFEST +39 -0
- data/ext/stemmer4r/libstemmer_c/Makefile +5 -0
- data/ext/stemmer4r/libstemmer_c/include/CVS/Entries +2 -0
- data/ext/stemmer4r/libstemmer_c/include/CVS/Repository +1 -0
- data/ext/stemmer4r/libstemmer_c/include/CVS/Root +1 -0
- data/ext/stemmer4r/libstemmer_c/include/libstemmer.h +63 -0
- data/ext/stemmer4r/libstemmer_c/libstemmer/CVS/Entries +3 -0
- data/ext/stemmer4r/libstemmer_c/libstemmer/CVS/Repository +1 -0
- data/ext/stemmer4r/libstemmer_c/libstemmer/CVS/Root +1 -0
- data/ext/stemmer4r/libstemmer_c/libstemmer/libstemmer.c +78 -0
- data/ext/stemmer4r/libstemmer_c/libstemmer/modules.h +96 -0
- data/ext/stemmer4r/libstemmer_c/mkinc.mak +42 -0
- data/ext/stemmer4r/libstemmer_c/runtime/CVS/Entries +5 -0
- data/ext/stemmer4r/libstemmer_c/runtime/CVS/Repository +1 -0
- data/ext/stemmer4r/libstemmer_c/runtime/CVS/Root +1 -0
- data/ext/stemmer4r/libstemmer_c/runtime/api.c +69 -0
- data/ext/stemmer4r/libstemmer_c/runtime/api.h +27 -0
- data/ext/stemmer4r/libstemmer_c/runtime/header.h +56 -0
- data/ext/stemmer4r/libstemmer_c/runtime/utilities.c +403 -0
- data/ext/stemmer4r/libstemmer_c/src_c/CVS/Entries +33 -0
- data/ext/stemmer4r/libstemmer_c/src_c/CVS/Repository +1 -0
- data/ext/stemmer4r/libstemmer_c/src_c/CVS/Root +1 -0
- data/ext/stemmer4r/libstemmer_c/src_c/stem_danish.c +330 -0
- data/ext/stemmer4r/libstemmer_c/src_c/stem_danish.h +16 -0
- data/ext/stemmer4r/libstemmer_c/src_c/stem_dutch.c +635 -0
- data/ext/stemmer4r/libstemmer_c/src_c/stem_dutch.h +16 -0
- data/ext/stemmer4r/libstemmer_c/src_c/stem_english.c +1109 -0
- data/ext/stemmer4r/libstemmer_c/src_c/stem_english.h +16 -0
- data/ext/stemmer4r/libstemmer_c/src_c/stem_finnish.c +792 -0
- data/ext/stemmer4r/libstemmer_c/src_c/stem_finnish.h +16 -0
- data/ext/stemmer4r/libstemmer_c/src_c/stem_french.c +1276 -0
- data/ext/stemmer4r/libstemmer_c/src_c/stem_french.h +16 -0
- data/ext/stemmer4r/libstemmer_c/src_c/stem_german.c +504 -0
- data/ext/stemmer4r/libstemmer_c/src_c/stem_german.h +16 -0
- data/ext/stemmer4r/libstemmer_c/src_c/stem_german2.c +549 -0
- data/ext/stemmer4r/libstemmer_c/src_c/stem_german2.h +16 -0
- data/ext/stemmer4r/libstemmer_c/src_c/stem_italian.c +1087 -0
- data/ext/stemmer4r/libstemmer_c/src_c/stem_italian.h +16 -0
- data/ext/stemmer4r/libstemmer_c/src_c/stem_kraaij_pohlmann.c +1780 -0
- data/ext/stemmer4r/libstemmer_c/src_c/stem_kraaij_pohlmann.h +16 -0
- data/ext/stemmer4r/libstemmer_c/src_c/stem_lovins.c +1752 -0
- data/ext/stemmer4r/libstemmer_c/src_c/stem_lovins.h +16 -0
- data/ext/stemmer4r/libstemmer_c/src_c/stem_norwegian.c +279 -0
- data/ext/stemmer4r/libstemmer_c/src_c/stem_norwegian.h +16 -0
- data/ext/stemmer4r/libstemmer_c/src_c/stem_porter.c +776 -0
- data/ext/stemmer4r/libstemmer_c/src_c/stem_porter.h +16 -0
- data/ext/stemmer4r/libstemmer_c/src_c/stem_portuguese.c +1027 -0
- data/ext/stemmer4r/libstemmer_c/src_c/stem_portuguese.h +16 -0
- data/ext/stemmer4r/libstemmer_c/src_c/stem_russian.c +701 -0
- data/ext/stemmer4r/libstemmer_c/src_c/stem_russian.h +16 -0
- data/ext/stemmer4r/libstemmer_c/src_c/stem_spanish.c +1109 -0
- data/ext/stemmer4r/libstemmer_c/src_c/stem_spanish.h +16 -0
- data/ext/stemmer4r/libstemmer_c/src_c/stem_swedish.c +299 -0
- data/ext/stemmer4r/libstemmer_c/src_c/stem_swedish.h +16 -0
- data/ext/stemmer4r/stemmer4r.c +146 -0
- data/stemmer4r.gemspec +23 -0
- data/test/CVS/Entries +2 -0
- data/test/CVS/Repository +1 -0
- data/test/CVS/Root +1 -0
- data/test/test.rb +31 -0
- data/test/tests/CVS/Entries +12 -0
- data/test/tests/CVS/Repository +1 -0
- data/test/tests/CVS/Root +1 -0
- data/test/tests/da/CVS/Entries +3 -0
- data/test/tests/da/CVS/Repository +1 -0
- data/test/tests/da/CVS/Root +1 -0
- data/test/tests/da/output.txt +23829 -0
- data/test/tests/da/voc.txt +23829 -0
- data/test/tests/de/CVS/Entries +3 -0
- data/test/tests/de/CVS/Repository +1 -0
- data/test/tests/de/CVS/Root +1 -0
- data/test/tests/de/output.txt +35033 -0
- data/test/tests/de/voc.txt +35033 -0
- data/test/tests/en/CVS/Entries +3 -0
- data/test/tests/en/CVS/Repository +1 -0
- data/test/tests/en/CVS/Root +1 -0
- data/test/tests/en/output.txt +29400 -0
- data/test/tests/en/voc.txt +29400 -0
- data/test/tests/es/CVS/Entries +3 -0
- data/test/tests/es/CVS/Repository +1 -0
- data/test/tests/es/CVS/Root +1 -0
- data/test/tests/es/output.txt +28390 -0
- data/test/tests/es/voc.txt +28390 -0
- data/test/tests/fi/CVS/Entries +3 -0
- data/test/tests/fi/CVS/Repository +1 -0
- data/test/tests/fi/CVS/Root +1 -0
- data/test/tests/fi/output.txt +50000 -0
- data/test/tests/fi/voc.txt +50000 -0
- data/test/tests/fr/CVS/Entries +3 -0
- data/test/tests/fr/CVS/Repository +1 -0
- data/test/tests/fr/CVS/Root +1 -0
- data/test/tests/fr/output.txt +20403 -0
- data/test/tests/fr/voc.txt +20403 -0
- data/test/tests/it/CVS/Entries +3 -0
- data/test/tests/it/CVS/Repository +1 -0
- data/test/tests/it/CVS/Root +1 -0
- data/test/tests/it/output.txt +35494 -0
- data/test/tests/it/voc.txt +35494 -0
- data/test/tests/nl/CVS/Entries +3 -0
- data/test/tests/nl/CVS/Repository +1 -0
- data/test/tests/nl/CVS/Root +1 -0
- data/test/tests/nl/output.txt +45669 -0
- data/test/tests/nl/voc.txt +45669 -0
- data/test/tests/no/CVS/Entries +3 -0
- data/test/tests/no/CVS/Repository +1 -0
- data/test/tests/no/CVS/Root +1 -0
- data/test/tests/no/output.txt +20628 -0
- data/test/tests/no/voc.txt +20628 -0
- data/test/tests/pt/CVS/Entries +3 -0
- data/test/tests/pt/CVS/Repository +1 -0
- data/test/tests/pt/CVS/Root +1 -0
- data/test/tests/pt/output.txt +32016 -0
- data/test/tests/pt/voc.txt +32016 -0
- data/test/tests/ru/CVS/Entries +3 -0
- data/test/tests/ru/CVS/Repository +1 -0
- data/test/tests/ru/CVS/Root +1 -0
- data/test/tests/ru/output.txt +49673 -0
- data/test/tests/ru/voc.txt +49673 -0
- data/test/tests/sv/CVS/Entries +3 -0
- data/test/tests/sv/CVS/Repository +1 -0
- data/test/tests/sv/CVS/Root +1 -0
- data/test/tests/sv/output.txt +30623 -0
- data/test/tests/sv/voc.txt +30623 -0
- metadata +221 -0
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
|
|
2
|
+
/* This file was generated automatically by the Snowball to ANSI C compiler */
|
|
3
|
+
|
|
4
|
+
#ifdef __cplusplus
|
|
5
|
+
extern "C" {
|
|
6
|
+
#endif
|
|
7
|
+
|
|
8
|
+
extern struct SN_env * spanish_create_env(void);
|
|
9
|
+
extern void spanish_close_env(struct SN_env * z);
|
|
10
|
+
|
|
11
|
+
extern int spanish_stem(struct SN_env * z);
|
|
12
|
+
|
|
13
|
+
#ifdef __cplusplus
|
|
14
|
+
}
|
|
15
|
+
#endif
|
|
16
|
+
|
|
@@ -0,0 +1,299 @@
|
|
|
1
|
+
|
|
2
|
+
/* This file was generated automatically by the Snowball to ANSI C compiler */
|
|
3
|
+
|
|
4
|
+
#include "../runtime/header.h"
|
|
5
|
+
|
|
6
|
+
extern int swedish_stem(struct SN_env * z);
|
|
7
|
+
static int r_other_suffix(struct SN_env * z);
|
|
8
|
+
static int r_consonant_pair(struct SN_env * z);
|
|
9
|
+
static int r_main_suffix(struct SN_env * z);
|
|
10
|
+
static int r_mark_regions(struct SN_env * z);
|
|
11
|
+
|
|
12
|
+
extern struct SN_env * swedish_create_env(void);
|
|
13
|
+
extern void swedish_close_env(struct SN_env * z);
|
|
14
|
+
|
|
15
|
+
static symbol s_0_0[1] = { 'a' };
|
|
16
|
+
static symbol s_0_1[4] = { 'a', 'r', 'n', 'a' };
|
|
17
|
+
static symbol s_0_2[4] = { 'e', 'r', 'n', 'a' };
|
|
18
|
+
static symbol s_0_3[7] = { 'h', 'e', 't', 'e', 'r', 'n', 'a' };
|
|
19
|
+
static symbol s_0_4[4] = { 'o', 'r', 'n', 'a' };
|
|
20
|
+
static symbol s_0_5[2] = { 'a', 'd' };
|
|
21
|
+
static symbol s_0_6[1] = { 'e' };
|
|
22
|
+
static symbol s_0_7[3] = { 'a', 'd', 'e' };
|
|
23
|
+
static symbol s_0_8[4] = { 'a', 'n', 'd', 'e' };
|
|
24
|
+
static symbol s_0_9[4] = { 'a', 'r', 'n', 'e' };
|
|
25
|
+
static symbol s_0_10[3] = { 'a', 'r', 'e' };
|
|
26
|
+
static symbol s_0_11[4] = { 'a', 's', 't', 'e' };
|
|
27
|
+
static symbol s_0_12[2] = { 'e', 'n' };
|
|
28
|
+
static symbol s_0_13[5] = { 'a', 'n', 'd', 'e', 'n' };
|
|
29
|
+
static symbol s_0_14[4] = { 'a', 'r', 'e', 'n' };
|
|
30
|
+
static symbol s_0_15[5] = { 'h', 'e', 't', 'e', 'n' };
|
|
31
|
+
static symbol s_0_16[3] = { 'e', 'r', 'n' };
|
|
32
|
+
static symbol s_0_17[2] = { 'a', 'r' };
|
|
33
|
+
static symbol s_0_18[2] = { 'e', 'r' };
|
|
34
|
+
static symbol s_0_19[5] = { 'h', 'e', 't', 'e', 'r' };
|
|
35
|
+
static symbol s_0_20[2] = { 'o', 'r' };
|
|
36
|
+
static symbol s_0_21[1] = { 's' };
|
|
37
|
+
static symbol s_0_22[2] = { 'a', 's' };
|
|
38
|
+
static symbol s_0_23[5] = { 'a', 'r', 'n', 'a', 's' };
|
|
39
|
+
static symbol s_0_24[5] = { 'e', 'r', 'n', 'a', 's' };
|
|
40
|
+
static symbol s_0_25[5] = { 'o', 'r', 'n', 'a', 's' };
|
|
41
|
+
static symbol s_0_26[2] = { 'e', 's' };
|
|
42
|
+
static symbol s_0_27[4] = { 'a', 'd', 'e', 's' };
|
|
43
|
+
static symbol s_0_28[5] = { 'a', 'n', 'd', 'e', 's' };
|
|
44
|
+
static symbol s_0_29[3] = { 'e', 'n', 's' };
|
|
45
|
+
static symbol s_0_30[5] = { 'a', 'r', 'e', 'n', 's' };
|
|
46
|
+
static symbol s_0_31[6] = { 'h', 'e', 't', 'e', 'n', 's' };
|
|
47
|
+
static symbol s_0_32[4] = { 'e', 'r', 'n', 's' };
|
|
48
|
+
static symbol s_0_33[2] = { 'a', 't' };
|
|
49
|
+
static symbol s_0_34[5] = { 'a', 'n', 'd', 'e', 't' };
|
|
50
|
+
static symbol s_0_35[3] = { 'h', 'e', 't' };
|
|
51
|
+
static symbol s_0_36[3] = { 'a', 's', 't' };
|
|
52
|
+
|
|
53
|
+
static struct among a_0[37] =
|
|
54
|
+
{
|
|
55
|
+
/* 0 */ { 1, s_0_0, -1, 1, 0},
|
|
56
|
+
/* 1 */ { 4, s_0_1, 0, 1, 0},
|
|
57
|
+
/* 2 */ { 4, s_0_2, 0, 1, 0},
|
|
58
|
+
/* 3 */ { 7, s_0_3, 2, 1, 0},
|
|
59
|
+
/* 4 */ { 4, s_0_4, 0, 1, 0},
|
|
60
|
+
/* 5 */ { 2, s_0_5, -1, 1, 0},
|
|
61
|
+
/* 6 */ { 1, s_0_6, -1, 1, 0},
|
|
62
|
+
/* 7 */ { 3, s_0_7, 6, 1, 0},
|
|
63
|
+
/* 8 */ { 4, s_0_8, 6, 1, 0},
|
|
64
|
+
/* 9 */ { 4, s_0_9, 6, 1, 0},
|
|
65
|
+
/* 10 */ { 3, s_0_10, 6, 1, 0},
|
|
66
|
+
/* 11 */ { 4, s_0_11, 6, 1, 0},
|
|
67
|
+
/* 12 */ { 2, s_0_12, -1, 1, 0},
|
|
68
|
+
/* 13 */ { 5, s_0_13, 12, 1, 0},
|
|
69
|
+
/* 14 */ { 4, s_0_14, 12, 1, 0},
|
|
70
|
+
/* 15 */ { 5, s_0_15, 12, 1, 0},
|
|
71
|
+
/* 16 */ { 3, s_0_16, -1, 1, 0},
|
|
72
|
+
/* 17 */ { 2, s_0_17, -1, 1, 0},
|
|
73
|
+
/* 18 */ { 2, s_0_18, -1, 1, 0},
|
|
74
|
+
/* 19 */ { 5, s_0_19, 18, 1, 0},
|
|
75
|
+
/* 20 */ { 2, s_0_20, -1, 1, 0},
|
|
76
|
+
/* 21 */ { 1, s_0_21, -1, 2, 0},
|
|
77
|
+
/* 22 */ { 2, s_0_22, 21, 1, 0},
|
|
78
|
+
/* 23 */ { 5, s_0_23, 22, 1, 0},
|
|
79
|
+
/* 24 */ { 5, s_0_24, 22, 1, 0},
|
|
80
|
+
/* 25 */ { 5, s_0_25, 22, 1, 0},
|
|
81
|
+
/* 26 */ { 2, s_0_26, 21, 1, 0},
|
|
82
|
+
/* 27 */ { 4, s_0_27, 26, 1, 0},
|
|
83
|
+
/* 28 */ { 5, s_0_28, 26, 1, 0},
|
|
84
|
+
/* 29 */ { 3, s_0_29, 21, 1, 0},
|
|
85
|
+
/* 30 */ { 5, s_0_30, 29, 1, 0},
|
|
86
|
+
/* 31 */ { 6, s_0_31, 29, 1, 0},
|
|
87
|
+
/* 32 */ { 4, s_0_32, 21, 1, 0},
|
|
88
|
+
/* 33 */ { 2, s_0_33, -1, 1, 0},
|
|
89
|
+
/* 34 */ { 5, s_0_34, -1, 1, 0},
|
|
90
|
+
/* 35 */ { 3, s_0_35, -1, 1, 0},
|
|
91
|
+
/* 36 */ { 3, s_0_36, -1, 1, 0}
|
|
92
|
+
};
|
|
93
|
+
|
|
94
|
+
static symbol s_1_0[2] = { 'd', 'd' };
|
|
95
|
+
static symbol s_1_1[2] = { 'g', 'd' };
|
|
96
|
+
static symbol s_1_2[2] = { 'n', 'n' };
|
|
97
|
+
static symbol s_1_3[2] = { 'd', 't' };
|
|
98
|
+
static symbol s_1_4[2] = { 'g', 't' };
|
|
99
|
+
static symbol s_1_5[2] = { 'k', 't' };
|
|
100
|
+
static symbol s_1_6[2] = { 't', 't' };
|
|
101
|
+
|
|
102
|
+
static struct among a_1[7] =
|
|
103
|
+
{
|
|
104
|
+
/* 0 */ { 2, s_1_0, -1, -1, 0},
|
|
105
|
+
/* 1 */ { 2, s_1_1, -1, -1, 0},
|
|
106
|
+
/* 2 */ { 2, s_1_2, -1, -1, 0},
|
|
107
|
+
/* 3 */ { 2, s_1_3, -1, -1, 0},
|
|
108
|
+
/* 4 */ { 2, s_1_4, -1, -1, 0},
|
|
109
|
+
/* 5 */ { 2, s_1_5, -1, -1, 0},
|
|
110
|
+
/* 6 */ { 2, s_1_6, -1, -1, 0}
|
|
111
|
+
};
|
|
112
|
+
|
|
113
|
+
static symbol s_2_0[2] = { 'i', 'g' };
|
|
114
|
+
static symbol s_2_1[3] = { 'l', 'i', 'g' };
|
|
115
|
+
static symbol s_2_2[3] = { 'e', 'l', 's' };
|
|
116
|
+
static symbol s_2_3[5] = { 'f', 'u', 'l', 'l', 't' };
|
|
117
|
+
static symbol s_2_4[4] = { 'l', 246, 's', 't' };
|
|
118
|
+
|
|
119
|
+
static struct among a_2[5] =
|
|
120
|
+
{
|
|
121
|
+
/* 0 */ { 2, s_2_0, -1, 1, 0},
|
|
122
|
+
/* 1 */ { 3, s_2_1, 0, 1, 0},
|
|
123
|
+
/* 2 */ { 3, s_2_2, -1, 1, 0},
|
|
124
|
+
/* 3 */ { 5, s_2_3, -1, 3, 0},
|
|
125
|
+
/* 4 */ { 4, s_2_4, -1, 2, 0}
|
|
126
|
+
};
|
|
127
|
+
|
|
128
|
+
static unsigned char g_v[] = { 17, 65, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 24, 0, 32 };
|
|
129
|
+
|
|
130
|
+
static unsigned char g_s_ending[] = { 119, 127, 149 };
|
|
131
|
+
|
|
132
|
+
static symbol s_0[] = { 'l', 246, 's' };
|
|
133
|
+
static symbol s_1[] = { 'f', 'u', 'l', 'l' };
|
|
134
|
+
|
|
135
|
+
static int r_mark_regions(struct SN_env * z) {
|
|
136
|
+
z->I[0] = z->l;
|
|
137
|
+
while(1) { /* goto, line 30 */
|
|
138
|
+
int c = z->c;
|
|
139
|
+
if (!(in_grouping(z, g_v, 97, 246))) goto lab0;
|
|
140
|
+
z->c = c;
|
|
141
|
+
break;
|
|
142
|
+
lab0:
|
|
143
|
+
z->c = c;
|
|
144
|
+
if (z->c >= z->l) return 0;
|
|
145
|
+
z->c++;
|
|
146
|
+
}
|
|
147
|
+
while(1) { /* gopast, line 30 */
|
|
148
|
+
if (!(out_grouping(z, g_v, 97, 246))) goto lab1;
|
|
149
|
+
break;
|
|
150
|
+
lab1:
|
|
151
|
+
if (z->c >= z->l) return 0;
|
|
152
|
+
z->c++;
|
|
153
|
+
}
|
|
154
|
+
z->I[0] = z->c; /* setmark p1, line 30 */
|
|
155
|
+
/* try, line 31 */
|
|
156
|
+
if (!(z->I[0] < 3)) goto lab2;
|
|
157
|
+
z->I[0] = 3;
|
|
158
|
+
lab2:
|
|
159
|
+
return 1;
|
|
160
|
+
}
|
|
161
|
+
|
|
162
|
+
static int r_main_suffix(struct SN_env * z) {
|
|
163
|
+
int among_var;
|
|
164
|
+
{ int m3; /* setlimit, line 37 */
|
|
165
|
+
int m = z->l - z->c; (void) m;
|
|
166
|
+
if (z->c < z->I[0]) return 0;
|
|
167
|
+
z->c = z->I[0]; /* tomark, line 37 */
|
|
168
|
+
m3 = z->lb; z->lb = z->c;
|
|
169
|
+
z->c = z->l - m;
|
|
170
|
+
z->ket = z->c; /* [, line 37 */
|
|
171
|
+
among_var = find_among_b(z, a_0, 37); /* substring, line 37 */
|
|
172
|
+
if (!(among_var)) { z->lb = m3; return 0; }
|
|
173
|
+
z->bra = z->c; /* ], line 37 */
|
|
174
|
+
z->lb = m3;
|
|
175
|
+
}
|
|
176
|
+
switch(among_var) {
|
|
177
|
+
case 0: return 0;
|
|
178
|
+
case 1:
|
|
179
|
+
{ int ret;
|
|
180
|
+
ret = slice_del(z); /* delete, line 44 */
|
|
181
|
+
if (ret < 0) return ret;
|
|
182
|
+
}
|
|
183
|
+
break;
|
|
184
|
+
case 2:
|
|
185
|
+
if (!(in_grouping_b(z, g_s_ending, 98, 121))) return 0;
|
|
186
|
+
{ int ret;
|
|
187
|
+
ret = slice_del(z); /* delete, line 46 */
|
|
188
|
+
if (ret < 0) return ret;
|
|
189
|
+
}
|
|
190
|
+
break;
|
|
191
|
+
}
|
|
192
|
+
return 1;
|
|
193
|
+
}
|
|
194
|
+
|
|
195
|
+
static int r_consonant_pair(struct SN_env * z) {
|
|
196
|
+
{ int m3; /* setlimit, line 50 */
|
|
197
|
+
int m = z->l - z->c; (void) m;
|
|
198
|
+
if (z->c < z->I[0]) return 0;
|
|
199
|
+
z->c = z->I[0]; /* tomark, line 50 */
|
|
200
|
+
m3 = z->lb; z->lb = z->c;
|
|
201
|
+
z->c = z->l - m;
|
|
202
|
+
{ int m = z->l - z->c; (void) m; /* and, line 52 */
|
|
203
|
+
if (!(find_among_b(z, a_1, 7))) { z->lb = m3; return 0; } /* among, line 51 */
|
|
204
|
+
z->c = z->l - m;
|
|
205
|
+
z->ket = z->c; /* [, line 52 */
|
|
206
|
+
if (z->c <= z->lb) { z->lb = m3; return 0; }
|
|
207
|
+
z->c--; /* next, line 52 */
|
|
208
|
+
z->bra = z->c; /* ], line 52 */
|
|
209
|
+
{ int ret;
|
|
210
|
+
ret = slice_del(z); /* delete, line 52 */
|
|
211
|
+
if (ret < 0) return ret;
|
|
212
|
+
}
|
|
213
|
+
}
|
|
214
|
+
z->lb = m3;
|
|
215
|
+
}
|
|
216
|
+
return 1;
|
|
217
|
+
}
|
|
218
|
+
|
|
219
|
+
static int r_other_suffix(struct SN_env * z) {
|
|
220
|
+
int among_var;
|
|
221
|
+
{ int m3; /* setlimit, line 55 */
|
|
222
|
+
int m = z->l - z->c; (void) m;
|
|
223
|
+
if (z->c < z->I[0]) return 0;
|
|
224
|
+
z->c = z->I[0]; /* tomark, line 55 */
|
|
225
|
+
m3 = z->lb; z->lb = z->c;
|
|
226
|
+
z->c = z->l - m;
|
|
227
|
+
z->ket = z->c; /* [, line 56 */
|
|
228
|
+
among_var = find_among_b(z, a_2, 5); /* substring, line 56 */
|
|
229
|
+
if (!(among_var)) { z->lb = m3; return 0; }
|
|
230
|
+
z->bra = z->c; /* ], line 56 */
|
|
231
|
+
switch(among_var) {
|
|
232
|
+
case 0: { z->lb = m3; return 0; }
|
|
233
|
+
case 1:
|
|
234
|
+
{ int ret;
|
|
235
|
+
ret = slice_del(z); /* delete, line 57 */
|
|
236
|
+
if (ret < 0) return ret;
|
|
237
|
+
}
|
|
238
|
+
break;
|
|
239
|
+
case 2:
|
|
240
|
+
{ int ret;
|
|
241
|
+
ret = slice_from_s(z, 3, s_0); /* <-, line 58 */
|
|
242
|
+
if (ret < 0) return ret;
|
|
243
|
+
}
|
|
244
|
+
break;
|
|
245
|
+
case 3:
|
|
246
|
+
{ int ret;
|
|
247
|
+
ret = slice_from_s(z, 4, s_1); /* <-, line 59 */
|
|
248
|
+
if (ret < 0) return ret;
|
|
249
|
+
}
|
|
250
|
+
break;
|
|
251
|
+
}
|
|
252
|
+
z->lb = m3;
|
|
253
|
+
}
|
|
254
|
+
return 1;
|
|
255
|
+
}
|
|
256
|
+
|
|
257
|
+
/* Entry point of the Swedish stemmer.
 *
 * Runs r_mark_regions forwards, then switches to backward mode and runs
 * r_main_suffix, r_consonant_pair and r_other_suffix in that order. Each
 * step is optional: a step returning 0 is simply skipped over, and the
 * cursor is put back where it was before the step either way.
 *
 * Returns 1 on completion, or the negative value of the first step that
 * reported an error.
 */
extern int swedish_stem(struct SN_env * z) {
    int status;
    int start;
    int from_end;

    /* do, line 66 */
    start = z->c;
    status = r_mark_regions(z); /* call mark_regions, line 66 */
    if (status < 0) return status;
    z->c = start;

    z->lb = z->c; z->c = z->l; /* backwards, line 67 */

    /* do, line 68 */
    from_end = z->l - z->c;
    status = r_main_suffix(z); /* call main_suffix, line 68 */
    if (status < 0) return status;
    z->c = z->l - from_end;

    /* do, line 69 */
    from_end = z->l - z->c;
    status = r_consonant_pair(z); /* call consonant_pair, line 69 */
    if (status < 0) return status;
    z->c = z->l - from_end;

    /* do, line 70 */
    from_end = z->l - z->c;
    status = r_other_suffix(z); /* call other_suffix, line 70 */
    if (status < 0) return status;
    z->c = z->l - from_end;

    z->c = z->lb;
    return 1;
}
|
|
295
|
+
|
|
296
|
+
/* Creates the environment used by swedish_stem by calling
 * SN_create_env(0, 1, 0); the stemmer itself uses a single integer
 * slot, z->I[0]. */
extern struct SN_env * swedish_create_env(void)
{
    struct SN_env * env = SN_create_env(0, 1, 0);
    return env;
}
|
|
297
|
+
|
|
298
|
+
/* Releases an environment obtained from swedish_create_env by handing it
 * to SN_close_env. */
extern void swedish_close_env(struct SN_env * z)
{
    SN_close_env(z);
}
|
|
299
|
+
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
|
|
2
|
+
/* This file was generated automatically by the Snowball to ANSI C compiler */
|
|
3
|
+
|
|
4
|
+
#ifdef __cplusplus
|
|
5
|
+
extern "C" {
|
|
6
|
+
#endif
|
|
7
|
+
|
|
8
|
+
extern struct SN_env * swedish_create_env(void);
|
|
9
|
+
extern void swedish_close_env(struct SN_env * z);
|
|
10
|
+
|
|
11
|
+
extern int swedish_stem(struct SN_env * z);
|
|
12
|
+
|
|
13
|
+
#ifdef __cplusplus
|
|
14
|
+
}
|
|
15
|
+
#endif
|
|
16
|
+
|
|
@@ -0,0 +1,146 @@
|
|
|
1
|
+
/*
|
|
2
|
+
* stemmer4r.c - A Ruby interface to the libstemmer library.
|
|
3
|
+
* http://snowball.tartarus.org/
|
|
4
|
+
*
|
|
5
|
+
* Copyright (c) Fabien POTENCIER 2005
|
|
6
|
+
*
|
|
7
|
+
*/
|
|
8
|
+
|
|
9
|
+
#include "ruby.h"
|
|
10
|
+
#include "libstemmer.h"
|
|
11
|
+
|
|
12
|
+
/*
|
|
13
|
+
* Document-class: Stemmer
|
|
14
|
+
*
|
|
15
|
+
* == Summary
|
|
16
|
+
*
|
|
17
|
+
* Ruby extension for stemming.
|
|
18
|
+
*
|
|
19
|
+
* == Abstract
|
|
20
|
+
*
|
|
21
|
+
* Stemmer is a wrapper class for the <tt>libstemmer</tt> library,
|
|
22
|
+
* the snowball stemming algorithms.
|
|
23
|
+
*
|
|
24
|
+
*/
|
|
25
|
+
|
|
26
|
+
static VALUE rb_cStemmer;
|
|
27
|
+
|
|
28
|
+
/*
 * Stemmer handle wrapped by the Ruby Stemmer object: three function
 * pointers that create, close and run a Snowball environment, plus the
 * environment itself.
 *
 * NOTE(review): this file only passes pointers to this struct to the
 * sb_stemmer_* functions and never reads the members; presumably the layout
 * mirrors the library's own definition of struct sb_stemmer - confirm the
 * two stay in sync.
 */
struct sb_stemmer {
|
|
29
|
+
struct SN_env * (*create)(void);
|
|
30
|
+
void (*close)(struct SN_env *);
|
|
31
|
+
int (*stem)(struct SN_env *);
|
|
32
|
+
|
|
33
|
+
struct SN_env * env;
|
|
34
|
+
};
|
|
35
|
+
|
|
36
|
+
/*
 * GC mark callback handed to Data_Wrap_Struct in stemmer_allocate.
 * Intentionally empty: struct sb_stemmer as declared in this file holds
 * only function pointers and an SN_env pointer, no Ruby VALUEs to mark.
 */
static void
|
|
37
|
+
stemmer_mark(struct sb_stemmer* stemmer)
|
|
38
|
+
{
|
|
39
|
+
}
|
|
40
|
+
|
|
41
|
+
/*
 * GC free callback handed to Data_Wrap_Struct in stemmer_allocate:
 * releases the wrapped stemmer through sb_stemmer_delete.
 */
static void
|
|
42
|
+
stemmer_free(struct sb_stemmer* stemmer)
|
|
43
|
+
{
|
|
44
|
+
sb_stemmer_delete(stemmer);
|
|
45
|
+
}
|
|
46
|
+
|
|
47
|
+
/*
|
|
48
|
+
* Document-method: new
|
|
49
|
+
* call-seq: Stemmer.new(language)
|
|
50
|
+
*
|
|
51
|
+
* Creates new code stemmer for +language+.
|
|
52
|
+
*
|
|
53
|
+
* === Parameters
|
|
54
|
+
*
|
|
55
|
+
* +language+:: stemmer language (language ISO code are supported)
|
|
56
|
+
*
|
|
57
|
+
* Currently available languages:
|
|
58
|
+
*
|
|
59
|
+
* * english (en)
|
|
60
|
+
* * french (fr)
|
|
61
|
+
* * spanish (es)
|
|
62
|
+
* * potuguese (pt)
|
|
63
|
+
* * italian (it)
|
|
64
|
+
* * german (de)
|
|
65
|
+
* * dutch (nl)
|
|
66
|
+
* * swedish (se)
|
|
67
|
+
* * danish (da)
|
|
68
|
+
* * russian (ru)
|
|
69
|
+
* * finnish (fi)
|
|
70
|
+
*
|
|
71
|
+
* === Exceptions
|
|
72
|
+
*
|
|
73
|
+
* ArgumentError:: if we can't find an available stemmer for +language+
|
|
74
|
+
*
|
|
75
|
+
* === Examples
|
|
76
|
+
*
|
|
77
|
+
* stemmer = Stemmer.new('en')
|
|
78
|
+
* puts stemmer.stem('testing')
|
|
79
|
+
*
|
|
80
|
+
* stemmer = Stemmer.new('fr')
|
|
81
|
+
* puts stemmer.stem('�t�')
|
|
82
|
+
*
|
|
83
|
+
*/
|
|
84
|
+
static VALUE
|
|
85
|
+
stemmer_allocate(VALUE klass, VALUE algorithm)
|
|
86
|
+
{
|
|
87
|
+
struct sb_stemmer* stemmer;
|
|
88
|
+
char * calgorithm;
|
|
89
|
+
|
|
90
|
+
Check_Type(algorithm, T_STRING);
|
|
91
|
+
calgorithm = strdup(STR2CSTR(algorithm));
|
|
92
|
+
if (!strcmp(calgorithm, "es"))
|
|
93
|
+
{
|
|
94
|
+
strcpy(calgorithm, "spanish");
|
|
95
|
+
}
|
|
96
|
+
|
|
97
|
+
stemmer = sb_stemmer_new(calgorithm);
|
|
98
|
+
free(calgorithm);
|
|
99
|
+
|
|
100
|
+
if (stemmer == NULL)
|
|
101
|
+
{
|
|
102
|
+
rb_raise(rb_eArgError, "Algorithm '%s' doesn't exist", STR2CSTR(algorithm));
|
|
103
|
+
}
|
|
104
|
+
|
|
105
|
+
return Data_Wrap_Struct(klass, stemmer_mark, stemmer_free, stemmer);
|
|
106
|
+
}
|
|
107
|
+
|
|
108
|
+
/*
|
|
109
|
+
* Document-method: stem
|
|
110
|
+
* call-seq: stem(str)
|
|
111
|
+
*
|
|
112
|
+
* Stems string and returns the result.
|
|
113
|
+
*
|
|
114
|
+
* === Parameters
|
|
115
|
+
*
|
|
116
|
+
* +str+:: string to stem. String must be encoding the 'right' way (iso-8859-1 for french for example).
|
|
117
|
+
*/
|
|
118
|
+
static VALUE
|
|
119
|
+
stemmer_stem(VALUE self, VALUE word)
|
|
120
|
+
{
|
|
121
|
+
VALUE ret;
|
|
122
|
+
sb_symbol *cword;
|
|
123
|
+
const sb_symbol *rcword;
|
|
124
|
+
struct sb_stemmer *stemmer;
|
|
125
|
+
|
|
126
|
+
Check_Type(word, T_STRING);
|
|
127
|
+
|
|
128
|
+
cword = strdup(STR2CSTR(word));
|
|
129
|
+
Data_Get_Struct(self, struct sb_stemmer, stemmer);
|
|
130
|
+
rcword = sb_stemmer_stem(stemmer, cword, RSTRING(word)->len);
|
|
131
|
+
|
|
132
|
+
ret = rb_str_new2(rcword);
|
|
133
|
+
free(cword);
|
|
134
|
+
|
|
135
|
+
return ret;
|
|
136
|
+
}
|
|
137
|
+
|
|
138
|
+
/*
 * Extension entry point: defines the Stemmer class under Object and
 * registers Stemmer.new (stemmer_allocate, 1 argument), Stemmer#stem
 * (stemmer_stem, 1 argument) and Stemmer#list (stemmer_list, no argument).
 *
 * NOTE(review): stemmer_list is not defined or declared anywhere in this
 * file; unless libstemmer.h provides it, the "list" registration below will
 * not compile - confirm, and either add the function or drop the line.
 */
void
|
|
139
|
+
Init_stemmer4r()
|
|
140
|
+
{
|
|
141
|
+
rb_cStemmer = rb_define_class("Stemmer", rb_cObject);
|
|
142
|
+
rb_define_singleton_method(rb_cStemmer, "new", stemmer_allocate, 1);
|
|
143
|
+
rb_define_method(rb_cStemmer, "stem", stemmer_stem, 1);
|
|
144
|
+
rb_define_method(rb_cStemmer, "list", stemmer_list, 0);
|
|
145
|
+
}
|
|
146
|
+
|
data/stemmer4r.gemspec
ADDED
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
require 'rubygems'

# Gem specification for stemmer4r, a Ruby extension wrapping libstemmer.
#
# The packaged file list is everything below the current directory except
# version-control bookkeeping. The project is kept in CVS, so filtering on
# ".svn" alone let every CVS/ directory (Entries, Repository, Root - the
# latter containing the developer's login) ship inside the gem.
spec = Gem::Specification.new do |s|
  s.name = 'stemmer4r'
  s.version = '0.1'
  s.author = "Fabien POTENCIER"
  s.email = "fabien.potencier@gmail.com"
  s.homepage = "http://stemmer4r.rubyforge.org"
  # Legacy attributes are set only when this RubyGems still accepts them, so
  # the spec keeps loading if a later release drops the setters.
  s.rubyforge_project = "stemmer4r" if s.respond_to?(:rubyforge_project=)
  s.summary = <<-EOF
    Stemmer4r is a Ruby extension that wraps the snowball stemmer library (libstemmer).
  EOF
  s.description = <<-EOF
    Stemmer4r is a Ruby extension that wraps the snowball stemmer library (libstemmer).
  EOF
  vcs_dirs = %w[CVS .svn]
  # Compare whole path components so that a file merely containing "CVS" in
  # its name is not dropped by accident.
  s.files = Dir.glob("**/*").reject { |item| !(item.split('/') & vcs_dirs).empty? }
  s.extensions << "ext/stemmer4r/extconf.rb"
  s.require_path = '.'
  s.autorequire = 'stemmer4r' if s.respond_to?(:autorequire=)
  s.has_rdoc = true if s.respond_to?(:has_rdoc=)
  s.extra_rdoc_files = ["README"]
  s.test_files = Dir.glob('test/*.rb')
end
|
data/test/CVS/Entries
ADDED
data/test/CVS/Repository
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
stemmer4r/test
|
data/test/CVS/Root
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
:ext:fabpot@rubyforge.org:/var/cvs/stemmer4r
|
data/test/test.rb
ADDED
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
#!/usr/bin/ruby -w

# RubyGems is optional: when it is available a plain require also finds the
# installed gem, otherwise the locally built extension is loaded. The former
# require_gem call no longer exists in later RubyGems releases and raised
# NoMethodError there, which the LoadError rescue did not catch.
begin
  require 'rubygems'
rescue LoadError
  # No RubyGems on this interpreter; fall through to the plain require.
end
require 'stemmer4r'

require 'test/unit'

# Checks the Stemmer extension against the vocabulary samples stored under
# tests/<language>/: every word of voc.txt must stem to the word on the same
# line of output.txt.
class StemmerTest < Test::Unit::TestCase
  # Language codes that have a tests/<code>/ directory with sample data.
  LANGUAGES = %w[da de nl en es fi fr it no pt ru sv].freeze

  # A known language yields a stemmer; an unknown one raises ArgumentError.
  def test_valid_language
    assert_not_nil(Stemmer.new('fr'))
    assert_nothing_raised { Stemmer.new('fr') }
    assert_raise(ArgumentError) { Stemmer.new('notavalidlanguage') }
  end

  # Runs every sample word through the stemmer of its language.
  def test_stemmer
    dir = File.dirname(__FILE__)
    LANGUAGES.each do |language|
      puts "Testing '#{language}' algorithm..."
      stemmer = Stemmer.new(language)
      # Block form closes both files even when an assertion fails.
      File.open("#{dir}/tests/#{language}/voc.txt") do |input|
        File.open("#{dir}/tests/#{language}/output.txt") do |output|
          input.each_line do |word|
            expected = output.gets
            assert_not_nil(expected, "output.txt for '#{language}' is shorter than voc.txt")
            # chomp, not chomp!: the bang form returns nil when the line has
            # no trailing newline (typically the last line of a file).
            assert_equal(expected.chomp, stemmer.stem(word.chomp))
          end
        end
      end
    end
  end
end
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
stemmer4r/test/tests
|
data/test/tests/CVS/Root
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
:ext:fabpot@rubyforge.org:/var/cvs/stemmer4r
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
stemmer4r/test/tests/da
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
:ext:fabpot@rubyforge.org:/var/cvs/stemmer4r
|