ruby-stemmer 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (85) hide show
  1. data/MIT-LICENSE +21 -0
  2. data/README +79 -0
  3. data/Rakefile +52 -0
  4. data/extconf.rb +14 -0
  5. data/libstemmer_c/MANIFEST +72 -0
  6. data/libstemmer_c/Makefile +9 -0
  7. data/libstemmer_c/README +125 -0
  8. data/libstemmer_c/include/libstemmer.h +79 -0
  9. data/libstemmer_c/libstemmer/libstemmer.c +93 -0
  10. data/libstemmer_c/libstemmer/libstemmer_utf8.c +93 -0
  11. data/libstemmer_c/libstemmer/modules.h +190 -0
  12. data/libstemmer_c/libstemmer/modules.txt +50 -0
  13. data/libstemmer_c/libstemmer/modules_utf8.h +121 -0
  14. data/libstemmer_c/libstemmer/modules_utf8.txt +49 -0
  15. data/libstemmer_c/mkinc.mak +82 -0
  16. data/libstemmer_c/mkinc_utf8.mak +52 -0
  17. data/libstemmer_c/runtime/api.c +66 -0
  18. data/libstemmer_c/runtime/api.h +26 -0
  19. data/libstemmer_c/runtime/header.h +58 -0
  20. data/libstemmer_c/runtime/utilities.c +478 -0
  21. data/libstemmer_c/src_c/stem_ISO_8859_1_danish.c +337 -0
  22. data/libstemmer_c/src_c/stem_ISO_8859_1_danish.h +16 -0
  23. data/libstemmer_c/src_c/stem_ISO_8859_1_dutch.c +624 -0
  24. data/libstemmer_c/src_c/stem_ISO_8859_1_dutch.h +16 -0
  25. data/libstemmer_c/src_c/stem_ISO_8859_1_english.c +1117 -0
  26. data/libstemmer_c/src_c/stem_ISO_8859_1_english.h +16 -0
  27. data/libstemmer_c/src_c/stem_ISO_8859_1_finnish.c +762 -0
  28. data/libstemmer_c/src_c/stem_ISO_8859_1_finnish.h +16 -0
  29. data/libstemmer_c/src_c/stem_ISO_8859_1_french.c +1246 -0
  30. data/libstemmer_c/src_c/stem_ISO_8859_1_french.h +16 -0
  31. data/libstemmer_c/src_c/stem_ISO_8859_1_german.c +503 -0
  32. data/libstemmer_c/src_c/stem_ISO_8859_1_german.h +16 -0
  33. data/libstemmer_c/src_c/stem_ISO_8859_1_hungarian.c +1230 -0
  34. data/libstemmer_c/src_c/stem_ISO_8859_1_hungarian.h +16 -0
  35. data/libstemmer_c/src_c/stem_ISO_8859_1_italian.c +1065 -0
  36. data/libstemmer_c/src_c/stem_ISO_8859_1_italian.h +16 -0
  37. data/libstemmer_c/src_c/stem_ISO_8859_1_norwegian.c +297 -0
  38. data/libstemmer_c/src_c/stem_ISO_8859_1_norwegian.h +16 -0
  39. data/libstemmer_c/src_c/stem_ISO_8859_1_porter.c +749 -0
  40. data/libstemmer_c/src_c/stem_ISO_8859_1_porter.h +16 -0
  41. data/libstemmer_c/src_c/stem_ISO_8859_1_portuguese.c +1017 -0
  42. data/libstemmer_c/src_c/stem_ISO_8859_1_portuguese.h +16 -0
  43. data/libstemmer_c/src_c/stem_ISO_8859_1_spanish.c +1093 -0
  44. data/libstemmer_c/src_c/stem_ISO_8859_1_spanish.h +16 -0
  45. data/libstemmer_c/src_c/stem_ISO_8859_1_swedish.c +307 -0
  46. data/libstemmer_c/src_c/stem_ISO_8859_1_swedish.h +16 -0
  47. data/libstemmer_c/src_c/stem_ISO_8859_2_romanian.c +998 -0
  48. data/libstemmer_c/src_c/stem_ISO_8859_2_romanian.h +16 -0
  49. data/libstemmer_c/src_c/stem_KOI8_R_russian.c +700 -0
  50. data/libstemmer_c/src_c/stem_KOI8_R_russian.h +16 -0
  51. data/libstemmer_c/src_c/stem_UTF_8_danish.c +339 -0
  52. data/libstemmer_c/src_c/stem_UTF_8_danish.h +16 -0
  53. data/libstemmer_c/src_c/stem_UTF_8_dutch.c +634 -0
  54. data/libstemmer_c/src_c/stem_UTF_8_dutch.h +16 -0
  55. data/libstemmer_c/src_c/stem_UTF_8_english.c +1125 -0
  56. data/libstemmer_c/src_c/stem_UTF_8_english.h +16 -0
  57. data/libstemmer_c/src_c/stem_UTF_8_finnish.c +768 -0
  58. data/libstemmer_c/src_c/stem_UTF_8_finnish.h +16 -0
  59. data/libstemmer_c/src_c/stem_UTF_8_french.c +1256 -0
  60. data/libstemmer_c/src_c/stem_UTF_8_french.h +16 -0
  61. data/libstemmer_c/src_c/stem_UTF_8_german.c +509 -0
  62. data/libstemmer_c/src_c/stem_UTF_8_german.h +16 -0
  63. data/libstemmer_c/src_c/stem_UTF_8_hungarian.c +1234 -0
  64. data/libstemmer_c/src_c/stem_UTF_8_hungarian.h +16 -0
  65. data/libstemmer_c/src_c/stem_UTF_8_italian.c +1073 -0
  66. data/libstemmer_c/src_c/stem_UTF_8_italian.h +16 -0
  67. data/libstemmer_c/src_c/stem_UTF_8_norwegian.c +299 -0
  68. data/libstemmer_c/src_c/stem_UTF_8_norwegian.h +16 -0
  69. data/libstemmer_c/src_c/stem_UTF_8_porter.c +755 -0
  70. data/libstemmer_c/src_c/stem_UTF_8_porter.h +16 -0
  71. data/libstemmer_c/src_c/stem_UTF_8_portuguese.c +1023 -0
  72. data/libstemmer_c/src_c/stem_UTF_8_portuguese.h +16 -0
  73. data/libstemmer_c/src_c/stem_UTF_8_romanian.c +1004 -0
  74. data/libstemmer_c/src_c/stem_UTF_8_romanian.h +16 -0
  75. data/libstemmer_c/src_c/stem_UTF_8_russian.c +694 -0
  76. data/libstemmer_c/src_c/stem_UTF_8_russian.h +16 -0
  77. data/libstemmer_c/src_c/stem_UTF_8_spanish.c +1097 -0
  78. data/libstemmer_c/src_c/stem_UTF_8_spanish.h +16 -0
  79. data/libstemmer_c/src_c/stem_UTF_8_swedish.c +309 -0
  80. data/libstemmer_c/src_c/stem_UTF_8_swedish.h +16 -0
  81. data/libstemmer_c/src_c/stem_UTF_8_turkish.c +2205 -0
  82. data/libstemmer_c/src_c/stem_UTF_8_turkish.h +16 -0
  83. data/ruby-stemmer.c +108 -0
  84. data/test.rb +31 -0
  85. metadata +141 -0
@@ -0,0 +1,16 @@
1
+
2
+ /* This file was generated automatically by the Snowball to ANSI C compiler */
3
+
4
+ #ifdef __cplusplus
5
+ extern "C" {
6
+ #endif
7
+ /* Public entry points of the Italian UTF-8 stemmer. Only the declarations
+  * appear in this hunk; the definitions are not part of this view.
+  * NOTE(review): by analogy with the norwegian_UTF_8_* functions defined
+  * further down, create_env/close_env presumably obtain and release the
+  * struct SN_env that the stem function operates on - confirm in the
+  * corresponding .c file. */
8
+ extern struct SN_env * italian_UTF_8_create_env(void);
9
+ extern void italian_UTF_8_close_env(struct SN_env * z);
10
+
11
+ extern int italian_UTF_8_stem(struct SN_env * z);
12
+
13
+ #ifdef __cplusplus
14
+ }
15
+ #endif
16
+
@@ -0,0 +1,299 @@
1
+
2
+ /* This file was generated automatically by the Snowball to ANSI C compiler */
3
+
4
+ #include "../runtime/header.h"
5
+
6
+ #ifdef __cplusplus
7
+ extern "C" {
8
+ #endif
9
+ extern int norwegian_UTF_8_stem(struct SN_env * z);
10
+ #ifdef __cplusplus
11
+ }
12
+ #endif
13
+ static int r_other_suffix(struct SN_env * z);
14
+ static int r_consonant_pair(struct SN_env * z);
15
+ static int r_main_suffix(struct SN_env * z);
16
+ static int r_mark_regions(struct SN_env * z);
17
+ #ifdef __cplusplus
18
+ extern "C" {
19
+ #endif
20
+
21
+
22
+ extern struct SN_env * norwegian_UTF_8_create_env(void);
23
+ extern void norwegian_UTF_8_close_env(struct SN_env * z);
24
+
25
+
26
+ #ifdef __cplusplus
27
+ }
28
+ #endif
29
+ static const symbol s_0_0[1] = { 'a' }; /* s_0_0 .. s_0_28: the suffix strings referenced by the a_0 table below.
+  * Each array is sized to exactly its characters (no terminating NUL), so the
+  * length has to be carried separately - it is the first field of each a_0 entry. */
30
+ static const symbol s_0_1[1] = { 'e' };
31
+ static const symbol s_0_2[3] = { 'e', 'd', 'e' };
32
+ static const symbol s_0_3[4] = { 'a', 'n', 'd', 'e' };
33
+ static const symbol s_0_4[4] = { 'e', 'n', 'd', 'e' };
34
+ static const symbol s_0_5[3] = { 'a', 'n', 'e' };
35
+ static const symbol s_0_6[3] = { 'e', 'n', 'e' };
36
+ static const symbol s_0_7[6] = { 'h', 'e', 't', 'e', 'n', 'e' };
37
+ static const symbol s_0_8[4] = { 'e', 'r', 't', 'e' };
38
+ static const symbol s_0_9[2] = { 'e', 'n' };
39
+ static const symbol s_0_10[5] = { 'h', 'e', 't', 'e', 'n' };
40
+ static const symbol s_0_11[2] = { 'a', 'r' };
41
+ static const symbol s_0_12[2] = { 'e', 'r' };
42
+ static const symbol s_0_13[5] = { 'h', 'e', 't', 'e', 'r' };
43
+ static const symbol s_0_14[1] = { 's' };
44
+ static const symbol s_0_15[2] = { 'a', 's' };
45
+ static const symbol s_0_16[2] = { 'e', 's' };
46
+ static const symbol s_0_17[4] = { 'e', 'd', 'e', 's' };
47
+ static const symbol s_0_18[5] = { 'e', 'n', 'd', 'e', 's' };
48
+ static const symbol s_0_19[4] = { 'e', 'n', 'e', 's' };
49
+ static const symbol s_0_20[7] = { 'h', 'e', 't', 'e', 'n', 'e', 's' };
50
+ static const symbol s_0_21[3] = { 'e', 'n', 's' };
51
+ static const symbol s_0_22[6] = { 'h', 'e', 't', 'e', 'n', 's' };
52
+ static const symbol s_0_23[3] = { 'e', 'r', 's' };
53
+ static const symbol s_0_24[3] = { 'e', 't', 's' };
54
+ static const symbol s_0_25[2] = { 'e', 't' };
55
+ static const symbol s_0_26[3] = { 'h', 'e', 't' };
56
+ static const symbol s_0_27[3] = { 'e', 'r', 't' };
57
+ static const symbol s_0_28[3] = { 'a', 's', 't' };
58
+ /* a_0: the 29 suffixes searched by r_main_suffix through find_among_b(z, a_0, 29).
+  * Reading the initialisers, each entry is
+  *   { string length, string, index of an earlier entry whose string is a
+  *     suffix of this one (or -1), integer code, 0 }.
+  * The codes used here are 1 for most entries, 2 for "s" (entry 14) and 3 for
+  * "erte" / "ert" (entries 8 and 27); r_main_suffix has a switch case for each
+  * of 1, 2 and 3. The entries are listed in order of their reversed strings.
+  * NOTE(review): struct among and find_among_b are declared outside this view
+  * (presumably via ../runtime/header.h). The field meanings above, and the
+  * assumption that find_among_b returns the matched entry's code, are inferred
+  * from the data and the call site - confirm against the runtime sources. */
59
+ static const struct among a_0[29] =
60
+ {
61
+ /* 0 */ { 1, s_0_0, -1, 1, 0},
62
+ /* 1 */ { 1, s_0_1, -1, 1, 0},
63
+ /* 2 */ { 3, s_0_2, 1, 1, 0},
64
+ /* 3 */ { 4, s_0_3, 1, 1, 0},
65
+ /* 4 */ { 4, s_0_4, 1, 1, 0},
66
+ /* 5 */ { 3, s_0_5, 1, 1, 0},
67
+ /* 6 */ { 3, s_0_6, 1, 1, 0},
68
+ /* 7 */ { 6, s_0_7, 6, 1, 0},
69
+ /* 8 */ { 4, s_0_8, 1, 3, 0},
70
+ /* 9 */ { 2, s_0_9, -1, 1, 0},
71
+ /* 10 */ { 5, s_0_10, 9, 1, 0},
72
+ /* 11 */ { 2, s_0_11, -1, 1, 0},
73
+ /* 12 */ { 2, s_0_12, -1, 1, 0},
74
+ /* 13 */ { 5, s_0_13, 12, 1, 0},
75
+ /* 14 */ { 1, s_0_14, -1, 2, 0},
76
+ /* 15 */ { 2, s_0_15, 14, 1, 0},
77
+ /* 16 */ { 2, s_0_16, 14, 1, 0},
78
+ /* 17 */ { 4, s_0_17, 16, 1, 0},
79
+ /* 18 */ { 5, s_0_18, 16, 1, 0},
80
+ /* 19 */ { 4, s_0_19, 16, 1, 0},
81
+ /* 20 */ { 7, s_0_20, 19, 1, 0},
82
+ /* 21 */ { 3, s_0_21, 14, 1, 0},
83
+ /* 22 */ { 6, s_0_22, 21, 1, 0},
84
+ /* 23 */ { 3, s_0_23, 14, 1, 0},
85
+ /* 24 */ { 3, s_0_24, 14, 1, 0},
86
+ /* 25 */ { 2, s_0_25, -1, 1, 0},
87
+ /* 26 */ { 3, s_0_26, 25, 1, 0},
88
+ /* 27 */ { 3, s_0_27, -1, 3, 0},
89
+ /* 28 */ { 3, s_0_28, -1, 1, 0}
90
+ };
91
+ /* s_1_0, s_1_1 and a_1: the two strings "dt" and "vt" searched by
+  * r_consonant_pair through find_among_b(z, a_1, 2). Both entries have no
+  * linked shorter entry (-1) and carry the code -1; the only caller tests the
+  * search result for zero / non-zero and never switches on it.
+  * NOTE(review): the entry layout is inferred from the initialisers; struct
+  * among is declared outside this view - confirm against the runtime header. */
92
+ static const symbol s_1_0[2] = { 'd', 't' };
93
+ static const symbol s_1_1[2] = { 'v', 't' };
94
+
95
+ static const struct among a_1[2] =
96
+ {
97
+ /* 0 */ { 2, s_1_0, -1, -1, 0},
98
+ /* 1 */ { 2, s_1_1, -1, -1, 0}
99
+ };
100
+ /* s_2_0 .. s_2_10 and a_2: the 11 suffixes searched by r_other_suffix through
+  * find_among_b(z, a_2, 11). Every entry carries the code 1, which is the only
+  * non-zero case r_other_suffix handles (it calls slice_del). The third field
+  * is -1 or the index of an earlier entry whose string is a suffix of this
+  * one (for example "hetslov" -> 9 "slov" -> 7 "lov").
+  * NOTE(review): the entry layout is inferred from the initialisers; struct
+  * among is declared outside this view - confirm against the runtime header. */
101
+ static const symbol s_2_0[3] = { 'l', 'e', 'g' };
102
+ static const symbol s_2_1[4] = { 'e', 'l', 'e', 'g' };
103
+ static const symbol s_2_2[2] = { 'i', 'g' };
104
+ static const symbol s_2_3[3] = { 'e', 'i', 'g' };
105
+ static const symbol s_2_4[3] = { 'l', 'i', 'g' };
106
+ static const symbol s_2_5[4] = { 'e', 'l', 'i', 'g' };
107
+ static const symbol s_2_6[3] = { 'e', 'l', 's' };
108
+ static const symbol s_2_7[3] = { 'l', 'o', 'v' };
109
+ static const symbol s_2_8[4] = { 'e', 'l', 'o', 'v' };
110
+ static const symbol s_2_9[4] = { 's', 'l', 'o', 'v' };
111
+ static const symbol s_2_10[7] = { 'h', 'e', 't', 's', 'l', 'o', 'v' };
112
+
113
+ static const struct among a_2[11] =
114
+ {
115
+ /* 0 */ { 3, s_2_0, -1, 1, 0},
116
+ /* 1 */ { 4, s_2_1, 0, 1, 0},
117
+ /* 2 */ { 2, s_2_2, -1, 1, 0},
118
+ /* 3 */ { 3, s_2_3, 2, 1, 0},
119
+ /* 4 */ { 3, s_2_4, 2, 1, 0},
120
+ /* 5 */ { 4, s_2_5, 4, 1, 0},
121
+ /* 6 */ { 3, s_2_6, -1, 1, 0},
122
+ /* 7 */ { 3, s_2_7, -1, 1, 0},
123
+ /* 8 */ { 4, s_2_8, 7, 1, 0},
124
+ /* 9 */ { 4, s_2_9, 7, 1, 0},
125
+ /* 10 */ { 7, s_2_10, 9, 1, 0}
126
+ };
127
+ /* Character-class tables and literal strings used by the routines below.
+  *  - g_v is always passed together with the bounds 97 and 248
+  *    (r_mark_regions, r_main_suffix).
+  *  - g_s_ending is passed with the bounds 98 and 122 (r_main_suffix, case 2).
+  *  - s_0 ("k") is compared with eq_s_b(z, 1, s_0) and s_1 ("er") is the
+  *    replacement given to slice_from_s(z, 2, s_1), both in r_main_suffix.
+  * NOTE(review): the grouping helpers are defined outside this view. If the
+  * arrays are bitmaps in which bit 0 of byte 0 stands for the lower bound,
+  * g_v marks code points 97 101 105 111 117 121 229 230 248 (a e i o u y and
+  * three non-ASCII letters) and g_s_ending marks b c d f g h j l m n o p r t
+  * v y z - confirm against in_grouping_U / in_grouping_b_U. */
128
+ static const unsigned char g_v[] = { 17, 65, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 48, 0, 128 };
129
+
130
+ static const unsigned char g_s_ending[] = { 119, 125, 149, 1 };
131
+
132
+ static const symbol s_0[] = { 'k' };
133
+ static const symbol s_1[] = { 'e', 'r' };
134
+ /* r_mark_regions: fills in the two integer marks kept in z->I[].
+  *  - I[1] (called "x" in the generated comments) is the position returned by
+  *    skip_utf8(z->p, z->c, 0, z->l, +3), taken from the starting cursor.
+  *  - I[0] (called "p1") is the cursor after out_grouping_U and then
+  *    in_grouping_U have been run over g_v; if that is smaller than I[1] it is
+  *    raised to I[1].
+  * Returns 1 when both marks were set. Returns 0 as soon as skip_utf8,
+  * out_grouping_U or in_grouping_U reports a negative result; I[0] then still
+  * holds z->l from the first statement.
+  * The cursor z->c is not restored after the grouping scans; the caller
+  * norwegian_UTF_8_stem saves and restores it around this call.
+  * NOTE(review): skip_utf8 and the grouping helpers are defined outside this
+  * view; the "line N" tags in the comments presumably refer to the Snowball
+  * source this file was generated from. */
135
+ static int r_mark_regions(struct SN_env * z) {
136
+ z->I[0] = z->l; /* default for I[0]; kept if any early return below is taken */
137
+ { int c_test = z->c; /* test, line 30 */
138
+ { int ret = skip_utf8(z->p, z->c, 0, z->l, + 3);
139
+ if (ret < 0) return 0;
140
+ z->c = ret; /* hop, line 30 */
141
+ }
142
+ z->I[1] = z->c; /* setmark x, line 30 */
143
+ z->c = c_test; /* cursor put back: the hop was only needed to obtain I[1] */
144
+ }
145
+ if (out_grouping_U(z, g_v, 97, 248, 1) < 0) return 0; /* goto */ /* grouping v, line 31 */
146
+ { /* gopast */ /* non v, line 31 */
147
+ int ret = in_grouping_U(z, g_v, 97, 248, 1);
148
+ if (ret < 0) return 0;
149
+ z->c += ret; /* advance the cursor by the amount in_grouping_U reported */
150
+ }
151
+ z->I[0] = z->c; /* setmark p1, line 31 */
152
+ /* try, line 32 */
153
+ if (!(z->I[0] < z->I[1])) goto lab0; /* I[0] already >= I[1]: leave it alone */
154
+ z->I[0] = z->I[1]; /* otherwise raise I[0] to I[1] */
155
+ lab0:
156
+ return 1;
157
+ }
158
+ /* r_main_suffix: looks for one of the a_0 suffixes ending at the cursor and
+  * deletes or rewrites it according to the code find_among_b returns.
+  *   code 1 -> slice_del
+  *   code 2 -> slice_del, but only if in_grouping_b_U(z, g_s_ending, ...) returns 0,
+  *             or else eq_s_b(z, 1, s_0) ("k") is non-zero and
+  *             out_grouping_b_U(z, g_v, ...) returns 0
+  *   code 3 -> slice_from_s(z, 2, s_1), i.e. replace with "er"
+  * Returns 0 when the cursor is before I[0], when the last-byte filter or
+  * find_among_b finds nothing, or when the code-2 conditions fail. A negative
+  * result from slice_del / slice_from_s is returned unchanged. Returns 1 otherwise.
+  * During the search z->lb is temporarily replaced by I[0]; every exit from
+  * that section puts the saved value back.
+  * NOTE(review): find_among_b, the grouping helpers, eq_s_b and the slice_*
+  * functions are defined outside this view; their behaviour is described here
+  * only as far as this function's control flow shows it. */
159
+ static int r_main_suffix(struct SN_env * z) {
160
+ int among_var;
161
+ { int mlimit; /* setlimit, line 38 */
162
+ int m1 = z->l - z->c; (void)m1; /* cursor as a distance from z->l, used to put it back below */
163
+ if (z->c < z->I[0]) return 0;
164
+ z->c = z->I[0]; /* tomark, line 38 */
165
+ mlimit = z->lb; z->lb = z->c; /* save the old backward limit and use I[0] instead */
166
+ z->c = z->l - m1;
167
+ z->ket = z->c; /* [, line 38 */
168
+ if (z->c <= z->lb || z->p[z->c - 1] >> 5 != 3 || !((1851426 >> (z->p[z->c - 1] & 0x1f)) & 1)) { z->lb = mlimit; return 0; } /* cheap pre-filter before the table search: the byte before the cursor must lie in 0x60..0x7f and be one of a, e, n, r, s, t (1851426 == 0x1C4022 has bits 1, 5, 14, 18, 19, 20 set) - exactly the final letters that occur in a_0 */
169
+ among_var = find_among_b(z, a_0, 29); /* substring, line 38 */
170
+ if (!(among_var)) { z->lb = mlimit; return 0; }
171
+ z->bra = z->c; /* ], line 38 */
172
+ z->lb = mlimit;
173
+ }
174
+ switch(among_var) {
175
+ case 0: return 0;
176
+ case 1:
177
+ { int ret = slice_del(z); /* delete, line 44 */
178
+ if (ret < 0) return ret;
179
+ }
180
+ break;
181
+ case 2:
182
+ { int m2 = z->l - z->c; (void)m2; /* or, line 46 */
183
+ if (in_grouping_b_U(z, g_s_ending, 98, 122, 0)) goto lab1; /* non-zero: try the second alternative at lab1 */
184
+ goto lab0; /* zero: go straight to the delete at lab0 */
185
+ lab1:
186
+ z->c = z->l - m2; /* rewind to where the first alternative started */
187
+ if (!(eq_s_b(z, 1, s_0))) return 0;
188
+ if (out_grouping_b_U(z, g_v, 97, 248, 0)) return 0;
189
+ }
190
+ lab0:
191
+ { int ret = slice_del(z); /* delete, line 46 */
192
+ if (ret < 0) return ret;
193
+ }
194
+ break;
195
+ case 3:
196
+ { int ret = slice_from_s(z, 2, s_1); /* <-, line 48 */
197
+ if (ret < 0) return ret;
198
+ }
199
+ break;
200
+ }
201
+ return 1;
202
+ }
203
+ /* r_consonant_pair: if one of the a_1 strings ("dt" or "vt") ends at the
+  * cursor, within the I[0] limit, step the cursor once with
+  * skip_utf8(..., -1), set z->bra there and call slice_del.
+  * The search itself is only a test: the cursor is put back to its starting
+  * point afterwards, and the z->bra written during the search is overwritten
+  * before slice_del runs. z->ket keeps the value set before the search, i.e.
+  * the starting cursor.
+  * Returns 0 when the cursor is before I[0], when the last-byte filter or
+  * find_among_b finds nothing, or when skip_utf8 reports a negative result.
+  * A negative result from slice_del is returned unchanged. Returns 1 otherwise.
+  * NOTE(review): with ket at the starting cursor and bra one skip_utf8 step
+  * before it, the deleted slice appears to be the single trailing character -
+  * this depends on slice_del using z->bra..z->ket, which is not shown here. */
204
+ static int r_consonant_pair(struct SN_env * z) {
205
+ { int m_test = z->l - z->c; /* test, line 53 */
206
+ { int mlimit; /* setlimit, line 54 */
207
+ int m1 = z->l - z->c; (void)m1;
208
+ if (z->c < z->I[0]) return 0;
209
+ z->c = z->I[0]; /* tomark, line 54 */
210
+ mlimit = z->lb; z->lb = z->c; /* save the old backward limit and use I[0] instead */
211
+ z->c = z->l - m1;
212
+ z->ket = z->c; /* [, line 54 */
213
+ if (z->c - 1 <= z->lb || z->p[z->c - 1] != 116) { z->lb = mlimit; return 0; } /* cheap pre-filter: more than one position must lie between the limit and the cursor, and the byte before the cursor must be 116 ('t'), the final letter of both a_1 strings */
214
+ if (!(find_among_b(z, a_1, 2))) { z->lb = mlimit; return 0; } /* substring, line 54 */
215
+ z->bra = z->c; /* ], line 54 */
216
+ z->lb = mlimit;
217
+ }
218
+ z->c = z->l - m_test; /* end of test: cursor back at its starting point */
219
+ }
220
+ { int ret = skip_utf8(z->p, z->c, z->lb, 0, -1);
221
+ if (ret < 0) return 0;
222
+ z->c = ret; /* next, line 59 */
223
+ }
224
+ z->bra = z->c; /* ], line 59 */
225
+ { int ret = slice_del(z); /* delete, line 59 */
226
+ if (ret < 0) return ret;
227
+ }
228
+ return 1;
229
+ }
230
+ /* r_other_suffix: looks for one of the a_2 suffixes ending at the cursor,
+  * within the I[0] limit, and calls slice_del when find_among_b returns 1
+  * (the only code present in a_2).
+  * Returns 0 when the cursor is before I[0] or when the last-byte filter or
+  * find_among_b finds nothing. A negative result from slice_del is returned
+  * unchanged. Returns 1 otherwise.
+  * During the search z->lb is temporarily replaced by I[0]; every exit from
+  * that section puts the saved value back.
+  * NOTE(review): find_among_b and slice_del are defined outside this view. */
231
+ static int r_other_suffix(struct SN_env * z) {
232
+ int among_var;
233
+ { int mlimit; /* setlimit, line 63 */
234
+ int m1 = z->l - z->c; (void)m1;
235
+ if (z->c < z->I[0]) return 0;
236
+ z->c = z->I[0]; /* tomark, line 63 */
237
+ mlimit = z->lb; z->lb = z->c; /* save the old backward limit and use I[0] instead */
238
+ z->c = z->l - m1;
239
+ z->ket = z->c; /* [, line 63 */
240
+ if (z->c - 1 <= z->lb || z->p[z->c - 1] >> 5 != 3 || !((4718720 >> (z->p[z->c - 1] & 0x1f)) & 1)) { z->lb = mlimit; return 0; } /* cheap pre-filter before the table search: the byte before the cursor must lie in 0x60..0x7f and be one of g, s, v (4718720 == 0x480080 has bits 7, 19, 22 set) - the final letters that occur in a_2 */
241
+ among_var = find_among_b(z, a_2, 11); /* substring, line 63 */
242
+ if (!(among_var)) { z->lb = mlimit; return 0; }
243
+ z->bra = z->c; /* ], line 63 */
244
+ z->lb = mlimit;
245
+ }
246
+ switch(among_var) {
247
+ case 0: return 0;
248
+ case 1:
249
+ { int ret = slice_del(z); /* delete, line 67 */
250
+ if (ret < 0) return ret;
251
+ }
252
+ break;
253
+ }
254
+ return 1;
255
+ }
256
+ /* norwegian_UTF_8_stem: entry point that runs the four routines above on z.
+  * Order of work:
+  *   1. r_mark_regions, with the cursor saved before and restored after;
+  *   2. switch to backward processing: z->lb takes the current cursor and the
+  *      cursor moves to z->l;
+  *   3. r_main_suffix, r_consonant_pair, r_other_suffix, each with the cursor
+  *      restored (as a distance from z->l) afterwards;
+  *   4. the cursor is set back to z->lb.
+  * A routine returning 0 is simply skipped over; a negative result from any
+  * routine is returned to the caller immediately. Otherwise returns 1.
+  * NOTE(review): the negative results originate from the slice_* helpers,
+  * which are defined outside this view - presumably an error indication,
+  * confirm against the runtime sources. */
257
+ extern int norwegian_UTF_8_stem(struct SN_env * z) {
258
+ { int c1 = z->c; /* do, line 74 */
259
+ { int ret = r_mark_regions(z);
260
+ if (ret == 0) goto lab0; /* call mark_regions, line 74 */
261
+ if (ret < 0) return ret;
262
+ }
263
+ lab0:
264
+ z->c = c1;
265
+ }
266
+ z->lb = z->c; z->c = z->l; /* backwards, line 75 */
267
+
268
+ { int m2 = z->l - z->c; (void)m2; /* do, line 76 */
269
+ { int ret = r_main_suffix(z);
270
+ if (ret == 0) goto lab1; /* call main_suffix, line 76 */
271
+ if (ret < 0) return ret;
272
+ }
273
+ lab1:
274
+ z->c = z->l - m2; /* the cursor is kept as a distance from z->l, since the routine may have edited the text */
275
+ }
276
+ { int m3 = z->l - z->c; (void)m3; /* do, line 77 */
277
+ { int ret = r_consonant_pair(z);
278
+ if (ret == 0) goto lab2; /* call consonant_pair, line 77 */
279
+ if (ret < 0) return ret;
280
+ }
281
+ lab2:
282
+ z->c = z->l - m3;
283
+ }
284
+ { int m4 = z->l - z->c; (void)m4; /* do, line 78 */
285
+ { int ret = r_other_suffix(z);
286
+ if (ret == 0) goto lab3; /* call other_suffix, line 78 */
287
+ if (ret < 0) return ret;
288
+ }
289
+ lab3:
290
+ z->c = z->l - m4;
291
+ }
292
+ z->c = z->lb; /* leave backward processing: cursor back at the saved limit */
293
+ return 1;
294
+ }
295
+ /* norwegian_UTF_8_create_env: returns whatever SN_create_env(0, 2, 0) returns.
+  * NOTE(review): SN_create_env is defined outside this view. The 2 presumably
+  * requests the two integer slots this stemmer uses (z->I[0] and z->I[1]) -
+  * confirm against the runtime API, including what is returned on failure. */
296
+ extern struct SN_env * norwegian_UTF_8_create_env(void) { return SN_create_env(0, 2, 0); }
297
+ /* norwegian_UTF_8_close_env: hands z to SN_close_env(z, 0).
+  * NOTE(review): SN_close_env is defined outside this view; it presumably
+  * releases an environment obtained from norwegian_UTF_8_create_env, and the
+  * 0 mirrors the first argument given to SN_create_env - confirm. */
298
+ extern void norwegian_UTF_8_close_env(struct SN_env * z) { SN_close_env(z, 0); }
299
+
@@ -0,0 +1,16 @@
1
+
2
+ /* This file was generated automatically by the Snowball to ANSI C compiler */
3
+
4
+ #ifdef __cplusplus
5
+ extern "C" {
6
+ #endif
7
+ /* Public entry points of the Norwegian UTF-8 stemmer (declarations only).
+  * norwegian_UTF_8_create_env takes no arguments and returns a struct SN_env
+  * pointer; norwegian_UTF_8_close_env and norwegian_UTF_8_stem both take that
+  * pointer. The matching definitions appear in the preceding .c hunk, where
+  * norwegian_UTF_8_stem returns 1 on completion or a negative value passed up
+  * from a helper. */
8
+ extern struct SN_env * norwegian_UTF_8_create_env(void);
9
+ extern void norwegian_UTF_8_close_env(struct SN_env * z);
10
+
11
+ extern int norwegian_UTF_8_stem(struct SN_env * z);
12
+
13
+ #ifdef __cplusplus
14
+ }
15
+ #endif
16
+