mittens 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/CHANGELOG.md +3 -0
- data/Gemfile +7 -0
- data/LICENSE.txt +30 -0
- data/README.md +62 -0
- data/Rakefile +21 -0
- data/ext/mittens/ext.c +96 -0
- data/ext/mittens/extconf.rb +12 -0
- data/lib/mittens/version.rb +3 -0
- data/lib/mittens.rb +7 -0
- data/mittens.gemspec +22 -0
- data/vendor/snowball/.gitignore +26 -0
- data/vendor/snowball/.travis.yml +112 -0
- data/vendor/snowball/AUTHORS +27 -0
- data/vendor/snowball/CONTRIBUTING.rst +216 -0
- data/vendor/snowball/COPYING +29 -0
- data/vendor/snowball/GNUmakefile +742 -0
- data/vendor/snowball/NEWS +754 -0
- data/vendor/snowball/README.rst +37 -0
- data/vendor/snowball/ada/README.md +74 -0
- data/vendor/snowball/ada/generate/generate.adb +83 -0
- data/vendor/snowball/ada/generate.gpr +21 -0
- data/vendor/snowball/ada/src/stemmer.adb +620 -0
- data/vendor/snowball/ada/src/stemmer.ads +219 -0
- data/vendor/snowball/ada/src/stemwords.adb +70 -0
- data/vendor/snowball/ada/stemmer_config.gpr +83 -0
- data/vendor/snowball/ada/stemwords.gpr +21 -0
- data/vendor/snowball/algorithms/arabic.sbl +558 -0
- data/vendor/snowball/algorithms/armenian.sbl +301 -0
- data/vendor/snowball/algorithms/basque.sbl +149 -0
- data/vendor/snowball/algorithms/catalan.sbl +202 -0
- data/vendor/snowball/algorithms/danish.sbl +93 -0
- data/vendor/snowball/algorithms/dutch.sbl +164 -0
- data/vendor/snowball/algorithms/english.sbl +229 -0
- data/vendor/snowball/algorithms/finnish.sbl +197 -0
- data/vendor/snowball/algorithms/french.sbl +254 -0
- data/vendor/snowball/algorithms/german.sbl +139 -0
- data/vendor/snowball/algorithms/german2.sbl +145 -0
- data/vendor/snowball/algorithms/greek.sbl +701 -0
- data/vendor/snowball/algorithms/hindi.sbl +323 -0
- data/vendor/snowball/algorithms/hungarian.sbl +241 -0
- data/vendor/snowball/algorithms/indonesian.sbl +192 -0
- data/vendor/snowball/algorithms/irish.sbl +149 -0
- data/vendor/snowball/algorithms/italian.sbl +202 -0
- data/vendor/snowball/algorithms/kraaij_pohlmann.sbl +240 -0
- data/vendor/snowball/algorithms/lithuanian.sbl +373 -0
- data/vendor/snowball/algorithms/lovins.sbl +208 -0
- data/vendor/snowball/algorithms/nepali.sbl +92 -0
- data/vendor/snowball/algorithms/norwegian.sbl +80 -0
- data/vendor/snowball/algorithms/porter.sbl +139 -0
- data/vendor/snowball/algorithms/portuguese.sbl +218 -0
- data/vendor/snowball/algorithms/romanian.sbl +236 -0
- data/vendor/snowball/algorithms/russian.sbl +221 -0
- data/vendor/snowball/algorithms/serbian.sbl +2379 -0
- data/vendor/snowball/algorithms/spanish.sbl +230 -0
- data/vendor/snowball/algorithms/swedish.sbl +72 -0
- data/vendor/snowball/algorithms/tamil.sbl +405 -0
- data/vendor/snowball/algorithms/turkish.sbl +470 -0
- data/vendor/snowball/algorithms/yiddish.sbl +460 -0
- data/vendor/snowball/charsets/ISO-8859-2.sbl +98 -0
- data/vendor/snowball/charsets/KOI8-R.sbl +74 -0
- data/vendor/snowball/charsets/cp850.sbl +130 -0
- data/vendor/snowball/compiler/analyser.c +1547 -0
- data/vendor/snowball/compiler/driver.c +615 -0
- data/vendor/snowball/compiler/generator.c +1748 -0
- data/vendor/snowball/compiler/generator_ada.c +1702 -0
- data/vendor/snowball/compiler/generator_csharp.c +1322 -0
- data/vendor/snowball/compiler/generator_go.c +1278 -0
- data/vendor/snowball/compiler/generator_java.c +1313 -0
- data/vendor/snowball/compiler/generator_js.c +1316 -0
- data/vendor/snowball/compiler/generator_pascal.c +1387 -0
- data/vendor/snowball/compiler/generator_python.c +1337 -0
- data/vendor/snowball/compiler/generator_rust.c +1295 -0
- data/vendor/snowball/compiler/header.h +418 -0
- data/vendor/snowball/compiler/space.c +286 -0
- data/vendor/snowball/compiler/syswords.h +86 -0
- data/vendor/snowball/compiler/syswords2.h +13 -0
- data/vendor/snowball/compiler/tokeniser.c +567 -0
- data/vendor/snowball/csharp/.gitignore +8 -0
- data/vendor/snowball/csharp/Snowball/Algorithms/.gitignore +1 -0
- data/vendor/snowball/csharp/Snowball/Among.cs +108 -0
- data/vendor/snowball/csharp/Snowball/AssemblyInfo.cs +36 -0
- data/vendor/snowball/csharp/Snowball/Stemmer.cs +660 -0
- data/vendor/snowball/csharp/Stemwords/App.config +6 -0
- data/vendor/snowball/csharp/Stemwords/Program.cs +114 -0
- data/vendor/snowball/doc/TODO +12 -0
- data/vendor/snowball/doc/libstemmer_c_README +148 -0
- data/vendor/snowball/doc/libstemmer_csharp_README +53 -0
- data/vendor/snowball/doc/libstemmer_java_README +67 -0
- data/vendor/snowball/doc/libstemmer_js_README +48 -0
- data/vendor/snowball/doc/libstemmer_python_README +113 -0
- data/vendor/snowball/examples/stemwords.c +204 -0
- data/vendor/snowball/go/README.md +55 -0
- data/vendor/snowball/go/among.go +16 -0
- data/vendor/snowball/go/env.go +403 -0
- data/vendor/snowball/go/stemwords/generate.go +68 -0
- data/vendor/snowball/go/stemwords/main.go +68 -0
- data/vendor/snowball/go/util.go +34 -0
- data/vendor/snowball/iconv.py +50 -0
- data/vendor/snowball/include/libstemmer.h +78 -0
- data/vendor/snowball/java/org/tartarus/snowball/Among.java +29 -0
- data/vendor/snowball/java/org/tartarus/snowball/SnowballProgram.java +381 -0
- data/vendor/snowball/java/org/tartarus/snowball/SnowballStemmer.java +8 -0
- data/vendor/snowball/java/org/tartarus/snowball/TestApp.java +75 -0
- data/vendor/snowball/javascript/base-stemmer.js +294 -0
- data/vendor/snowball/javascript/stemwords.js +106 -0
- data/vendor/snowball/libstemmer/libstemmer_c.in +96 -0
- data/vendor/snowball/libstemmer/mkalgorithms.pl +90 -0
- data/vendor/snowball/libstemmer/mkmodules.pl +267 -0
- data/vendor/snowball/libstemmer/modules.txt +63 -0
- data/vendor/snowball/libstemmer/test.c +34 -0
- data/vendor/snowball/pascal/.gitignore +4 -0
- data/vendor/snowball/pascal/SnowballProgram.pas +430 -0
- data/vendor/snowball/pascal/generate.pl +23 -0
- data/vendor/snowball/pascal/stemwords-template.dpr +78 -0
- data/vendor/snowball/python/MANIFEST.in +7 -0
- data/vendor/snowball/python/create_init.py +54 -0
- data/vendor/snowball/python/setup.cfg +6 -0
- data/vendor/snowball/python/setup.py +81 -0
- data/vendor/snowball/python/snowballstemmer/among.py +13 -0
- data/vendor/snowball/python/snowballstemmer/basestemmer.py +323 -0
- data/vendor/snowball/python/stemwords.py +101 -0
- data/vendor/snowball/python/testapp.py +28 -0
- data/vendor/snowball/runtime/api.c +58 -0
- data/vendor/snowball/runtime/api.h +32 -0
- data/vendor/snowball/runtime/header.h +61 -0
- data/vendor/snowball/runtime/utilities.c +513 -0
- data/vendor/snowball/rust/Cargo.toml +7 -0
- data/vendor/snowball/rust/build.rs +55 -0
- data/vendor/snowball/rust/rust-pre-1.27-compat.patch +30 -0
- data/vendor/snowball/rust/src/main.rs +102 -0
- data/vendor/snowball/rust/src/snowball/algorithms/mod.rs +2 -0
- data/vendor/snowball/rust/src/snowball/among.rs +6 -0
- data/vendor/snowball/rust/src/snowball/mod.rs +6 -0
- data/vendor/snowball/rust/src/snowball/snowball_env.rs +421 -0
- data/vendor/snowball/tests/stemtest.c +95 -0
- metadata +178 -0
|
@@ -0,0 +1,430 @@
|
|
|
1
|
+
unit SnowballProgram;
|
|
2
|
+
|
|
3
|
+
interface
|
|
4
|
+
|
|
5
|
+
Type
    { Parameterless method returning Boolean; stored in TAmong.Method and
      called by FindAmong/FindAmongBk when an entry's string has matched. }
    TAmongHandler = Function : Boolean of Object;

    { One entry of an "among" lookup table. }
    TAmong = record
        Str    : AnsiString;       // search string
        Index  : Integer;          // index to longest matching substring
        Result : Integer;          // result of the lookup
        Method : TAmongHandler;    // method to use if substring matches
    End;
|
|
15
|
+
|
|
16
|
+
Type
    {$M+}
    { Base class for stemmers: holds the word being processed plus the
      positions used by the matching and slicing helpers below. }
    TSnowballProgram = Class
    Protected
        FCurrent : AnsiString;   // word being processed
        { Positions are 0-based offsets into FCurrent (the string itself is
          indexed from 1): cursor, forward limit, backward limit and the two
          slice markers. }
        FCursor, FLimit, FBkLimit, FBra, FKet : Integer;

        Procedure SetCurrent(Current: AnsiString);

        { Character-class tests at the cursor (forward and backward). }
        Function InGrouping(s : array of char; min, max : Integer) : Boolean;
        Function InGroupingBk(s : array of char; min, max : Integer) : Boolean;
        Function OutGrouping(s : array of char; min, max : Integer) : Boolean;
        Function OutGroupingBk(s : array of char; min, max : Integer) : Boolean;

        { Literal string tests at the cursor (forward and backward). }
        Function EqS(s_size : Integer; s : AnsiString) : Boolean;
        Function EqSBk(s_size : Integer; s : AnsiString) : Boolean;
        Function EqV(s : AnsiString) : Boolean;
        Function EqVBk(s : AnsiString) : Boolean;

        { Table lookups at the cursor (forward and backward). }
        Function FindAmong(v : array of TAmong; v_size : Integer) : Integer;
        Function FindAmongBk(v : array of TAmong; v_size : Integer) : Integer;

        { Operations on the slice FBra..FKet and on arbitrary ranges. }
        Procedure SliceDel;
        Procedure SliceCheck;
        Procedure SliceFrom(s : AnsiString);
        Function ReplaceS(bra, ket : Integer; s : AnsiString) : Integer;
        Procedure Insert(bra, ket : Integer; s : AnsiString);
        Function SliceTo : AnsiString;
        Function AssignTo : AnsiString;

    Public
        { Set & Retrieve current string }
        Property Current: AnsiString Read FCurrent Write SetCurrent;

        { Method subclasses need to implement }
        Function stem : Boolean; Virtual; Abstract;
    End;
|
|
61
|
+
|
|
62
|
+
Implementation
|
|
63
|
+
|
|
64
|
+
Uses Math;
|
|
65
|
+
|
|
66
|
+
{ Load a new word and reset every position: cursor, backward limit and FBra
  go to 0; the forward limit and FKet go to the length of the word. }
Procedure TSnowballProgram.SetCurrent(Current: AnsiString);
Var WordLength : Integer;
Begin
    WordLength := Length(Current);

    FCurrent := Current;

    FBkLimit := 0;
    FCursor  := 0;
    FBra     := 0;

    FLimit := WordLength;
    FKet   := WordLength;
End;
|
|
75
|
+
|
|
76
|
+
{ Forward test: if the character after the cursor has a code in min..max and
  its bit is set in the bitmap s, advance the cursor by one and return True.
  Otherwise leave the cursor alone and return False. }
Function TSnowballProgram.InGrouping(s : array of char; min, max : Integer) : Boolean;
Var code, bit : Integer;
Begin
    Result := False;

    If FCursor < FLimit Then
    Begin
        code := Ord(FCurrent[FCursor + 1]);
        If (code >= min) And (code <= max) Then
        Begin
            // Bit number relative to min: byte (bit Shr 3), bit (bit And 7).
            bit := code - min;
            If (Ord(s[bit Shr 3]) And Ord(1 Shl (bit And $7))) <> 0 Then
            Begin
                Inc(FCursor);
                Result := True;
            End;
        End;
    End;
End;
|
|
91
|
+
|
|
92
|
+
{ Backward test: if the character before the cursor has a code in min..max
  and its bit is set in the bitmap s, move the cursor back by one and return
  True. Otherwise leave the cursor alone and return False. }
Function TSnowballProgram.InGroupingBk(s : array of char; min, max : Integer) : Boolean;
Var code, bit : Integer;
Begin
    Result := False;

    If FCursor > FBkLimit Then
    Begin
        code := Ord(FCurrent[FCursor]);
        If (code >= min) And (code <= max) Then
        Begin
            // Bit number relative to min: byte (bit Shr 3), bit (bit And 7).
            bit := code - min;
            If (Ord(s[bit Shr 3]) And Ord(1 Shl (bit And $7))) <> 0 Then
            Begin
                Dec(FCursor);
                Result := True;
            End;
        End;
    End;
End;
|
|
107
|
+
|
|
108
|
+
{ Forward test, complement of InGrouping: if the character after the cursor
  is outside min..max, or its bit is clear in the bitmap s, advance the cursor
  by one and return True. Returns False at the limit or when the character
  belongs to the grouping. }
Function TSnowballProgram.OutGrouping(s : array of char; min, max : Integer) : Boolean;
Var code : Integer;
    member : Boolean;
Begin
    Result := False;
    If FCursor >= FLimit Then Exit;

    code := Ord(FCurrent[FCursor + 1]);

    // The bitmap is only consulted for codes inside min..max.
    member := (code >= min) And (code <= max);
    If member Then
        member := (Ord(s[(code - min) Shr 3]) And Ord(1 Shl ((code - min) And $7))) <> 0;

    If Not member Then
    Begin
        Inc(FCursor);
        Result := True;
    End;
End;
|
|
130
|
+
|
|
131
|
+
{ Backward test, complement of InGroupingBk: if the character before the
  cursor is outside min..max, or its bit is clear in the bitmap s, move the
  cursor back by one and return True. Returns False at the backward limit or
  when the character belongs to the grouping. }
Function TSnowballProgram.OutGroupingBk(s : array of char; min, max : Integer) : Boolean;
Var code : Integer;
    member : Boolean;
Begin
    Result := False;
    If FCursor <= FBkLimit Then Exit;

    code := Ord(FCurrent[FCursor]);

    // The bitmap is only consulted for codes inside min..max.
    member := (code >= min) And (code <= max);
    If member Then
        member := (Ord(s[(code - min) Shr 3]) And Ord(1 Shl ((code - min) And $7))) <> 0;

    If Not member Then
    Begin
        Dec(FCursor);
        Result := True;
    End;
End;
|
|
153
|
+
|
|
154
|
+
{ Forward literal match: if the first s_size characters of s follow the
  cursor, advance the cursor past them and return True; otherwise return
  False without moving. }
Function TSnowballProgram.EqS(s_size : Integer; s : AnsiString) : Boolean;
Var offset : Integer;
Begin
    Result := False;

    // Not enough characters left before the limit.
    If s_size > (FLimit - FCursor) Then Exit;

    offset := 1;
    While offset <= s_size Do
    Begin
        If s[offset] <> FCurrent[FCursor + offset] Then Exit;
        Inc(offset);
    End;

    Inc(FCursor, s_size);
    Result := True;
End;
|
|
168
|
+
|
|
169
|
+
{ Backward literal match: if the first s_size characters of s end exactly at
  the cursor, move the cursor back to their start and return True; otherwise
  return False without moving. }
Function TSnowballProgram.EqSBk(s_size : Integer; s : AnsiString) : Boolean;
Var offset, start : Integer;
Begin
    Result := False;

    // Not enough characters between the backward limit and the cursor.
    If s_size > (FCursor - FBkLimit) Then Exit;

    start := FCursor - s_size;
    offset := 1;
    While offset <= s_size Do
    Begin
        If s[offset] <> FCurrent[start + offset] Then Exit;
        Inc(offset);
    End;

    FCursor := start;
    Result := True;
End;
|
|
183
|
+
|
|
184
|
+
{ Forward match of the whole string s; delegates to EqS with its full length. }
Function TSnowballProgram.EqV(s : AnsiString) : Boolean;
Var whole : Integer;
Begin
    whole := Length(s);
    Result := EqS(whole, s);
End;
|
|
188
|
+
|
|
189
|
+
{ Backward match of the whole string s; delegates to EqSBk with its full length. }
Function TSnowballProgram.EqVBk(s : AnsiString) : Boolean;
Var whole : Integer;
Begin
    whole := Length(s);
    Result := EqSBk(whole, s);
End;
|
|
193
|
+
|
|
194
|
+
{ Forward table lookup. Binary-searches the first v_size entries of v for the
  text following the cursor, then walks the Index chain from the entry found.
  The first entry on the chain whose whole string matched (and whose Method,
  if assigned, returns True) wins: the cursor is placed after its string and
  its Result is returned. Returns 0 when the chain ends (Index < 0). }
Function TSnowballProgram.FindAmong(v : array of TAmong; v_size : Integer) : Integer;
Var lower_i, upper_i, middle, start, stop : Integer;
    matched_lower, matched_upper, matched, delta, key_len : Integer;
    rechecked, accepted : Boolean;
    entry : TAmong;
Begin
    lower_i := 0;
    upper_i := v_size;

    start := FCursor;
    stop := FLimit;

    // Number of leading characters known to match at each end of the range.
    matched_lower := 0;
    matched_upper := 0;

    rechecked := False;

    Repeat
        middle := lower_i + ((upper_i - lower_i) Shr 1);
        entry := v[middle];
        key_len := Length(entry.Str);

        delta := 0;
        matched := Min(matched_lower, matched_upper);   // smaller

        While matched < key_len Do
        Begin
            If (start + matched) = stop Then
            Begin
                // Input exhausted before the key was: treat input as smaller.
                delta := -1;
                Break;
            End;

            delta := Ord(FCurrent[start + matched + 1]) - Ord(entry.Str[matched + 1]);
            If delta <> 0 Then Break;

            Inc(matched);
        End;

        If delta < 0 Then
        Begin
            upper_i := middle;
            matched_upper := matched;
        End
        Else
        Begin
            lower_i := middle;
            matched_lower := matched;
        End;

        If (upper_i - lower_i) <= 1 Then
        Begin
            // Stop when entry 0 is not the candidate, when there is only one
            // item, or when entry 0 has already had its extra inspection;
            // otherwise go round once more so that entry 0 gets inspected.
            If (lower_i > 0) Or (upper_i = lower_i) Or rechecked Then Break;
            rechecked := True;
        End;
    Until False;

    Repeat
        entry := v[lower_i];
        key_len := Length(entry.Str);

        If matched_lower >= key_len Then
        Begin
            FCursor := start + key_len;
            If Not Assigned(entry.Method) Then
            Begin
                Result := entry.Result;
                Exit;
            End;

            accepted := entry.Method;

            // Set the cursor again after the handler has run.
            FCursor := start + key_len;
            If accepted Then
            Begin
                Result := entry.Result;
                Exit;
            End;
        End;

        lower_i := entry.Index;
    Until lower_i < 0;

    Result := 0;
End;
|
|
285
|
+
|
|
286
|
+
{ Backward table lookup. Binary-searches the first v_size entries of v,
  comparing each entry's string from its last character against the text
  ending at the cursor, then walks the Index chain from the entry found. The
  first entry on the chain whose whole string matched (and whose Method, if
  assigned, returns True) wins: the cursor is placed before its string and
  its Result is returned. Returns 0 when the chain ends (Index < 0). }
Function TSnowballProgram.FindAmongBk(v : array of TAmong; v_size : Integer) : Integer;
Var lower_i, upper_i, middle, start, bound : Integer;
    matched_lower, matched_upper, matched, delta, key_len : Integer;
    rechecked, accepted : Boolean;
    entry : TAmong;
Begin
    lower_i := 0;
    upper_i := v_size;

    start := FCursor;
    bound := FBkLimit;

    // Number of trailing characters known to match at each end of the range.
    matched_lower := 0;
    matched_upper := 0;

    rechecked := False;

    Repeat
        middle := lower_i + ((upper_i - lower_i) Shr 1);
        entry := v[middle];
        key_len := Length(entry.Str);

        delta := 0;
        matched := Min(matched_lower, matched_upper);

        While matched < key_len Do
        Begin
            If (start - matched) = bound Then
            Begin
                // Input exhausted before the key was: treat input as smaller.
                delta := -1;
                Break;
            End;

            // Compare right to left: the matched-th character from the end
            // of the key against the matched-th character before the cursor.
            delta := Ord(FCurrent[start - matched]) - Ord(entry.Str[key_len - matched]);
            If delta <> 0 Then Break;

            Inc(matched);
        End;

        If delta < 0 Then
        Begin
            upper_i := middle;
            matched_upper := matched;
        End
        Else
        Begin
            lower_i := middle;
            matched_lower := matched;
        End;

        If (upper_i - lower_i) <= 1 Then
        Begin
            // Same termination rule as the forward search: allow one extra
            // pass so that entry 0 gets inspected.
            If (lower_i > 0) Or (upper_i = lower_i) Or rechecked Then Break;
            rechecked := True;
        End;
    Until False;

    Repeat
        entry := v[lower_i];
        key_len := Length(entry.Str);

        If matched_lower >= key_len Then
        Begin
            FCursor := start - key_len;
            If Not Assigned(entry.Method) Then
            Begin
                Result := entry.Result;
                Exit;
            End;

            accepted := entry.Method;

            // Set the cursor again after the handler has run.
            FCursor := start - key_len;
            If accepted Then
            Begin
                Result := entry.Result;
                Exit;
            End;
        End;

        lower_i := entry.Index;
    Until lower_i < 0;

    Result := 0;
End;
|
|
372
|
+
|
|
373
|
+
{ Validate the slice markers: requires 0 <= FBra <= FKet <= FLimit <=
  Length(FCurrent). On violation prints a message and halts the program. }
Procedure TSnowballProgram.SliceCheck;
Var consistent : Boolean;
Begin
    consistent := (FBra >= 0) And (FBra <= FKet) And (FKet <= FLimit)
                  And (FLimit <= Length(FCurrent));
    If consistent Then Exit;

    WriteLn('Faulty slice operation.');
    Halt;
End;
|
|
381
|
+
|
|
382
|
+
{ Delete the current slice FBra..FKet, i.e. replace it with the empty string
  after validating the slice markers. }
Procedure TSnowballProgram.SliceDel;
Begin
    SliceCheck;
    ReplaceS(FBra, FKet, '');
End;
|
|
386
|
+
|
|
387
|
+
{ Replace the characters between offsets bra and ket of the current word with
  s. The limit is shifted by the change in length; a cursor at or after ket is
  shifted likewise, and a cursor strictly inside the range is moved to bra.
  Returns the change in length. }
Function TSnowballProgram.ReplaceS(bra, ket : Integer; s : AnsiString) : Integer;
Var removed, adjustment : Integer;
Begin
    removed := ket - bra;
    adjustment := Length(s) - removed;

    // Offsets are 0-based, string positions are 1-based.
    Delete(FCurrent, bra + 1, removed);
    System.Insert(s, FCurrent, bra + 1);

    Inc(FLimit, adjustment);

    If FCursor >= ket Then
        Inc(FCursor, adjustment)
    Else If FCursor > bra Then
        FCursor := bra;

    Result := adjustment;
End;
|
|
404
|
+
|
|
405
|
+
{ Replace the range bra..ket with s via ReplaceS, then shift each slice marker
  that lies at or after bra by the resulting change in length. }
Procedure TSnowballProgram.Insert(bra, ket : Integer; s : AnsiString);
Var delta : Integer;
Begin
    delta := ReplaceS(bra, ket, s);

    If FBra >= bra Then Inc(FBra, delta);
    If FKet >= bra Then Inc(FKet, delta);
End;
|
|
412
|
+
|
|
413
|
+
{ Return a copy of the current slice FBra..FKet after validating the slice
  markers. }
Function TSnowballProgram.SliceTo() : AnsiString;
Var count : Integer;
Begin
    SliceCheck;

    count := FKet - FBra;
    Result := Copy(FCurrent, FBra + 1, count);   // FBra is 0-based
End;
|
|
418
|
+
|
|
419
|
+
{ Replace the current slice FBra..FKet with s after validating the slice
  markers. The length change reported by ReplaceS is not needed here. }
Procedure TSnowballProgram.SliceFrom(s : AnsiString);
Begin
    SliceCheck;
    ReplaceS(FBra, FKet, s);
End;
|
|
424
|
+
|
|
425
|
+
{ Return the first FLimit characters of the current word. }
Function TSnowballProgram.AssignTo() : AnsiString;
Const FirstPosition = 1;
Begin
    Result := Copy(FCurrent, FirstPosition, FLimit);
End;
|
|
429
|
+
|
|
430
|
+
End.
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
#!/usr/bin/env perl
|
|
2
|
+
use strict;
|
|
3
|
+
use warnings;
|
|
4
|
+
|
|
5
|
+
# Generate Pascal stemwords source.
|
|
6
|
+
|
|
7
|
+
# Names substituted for %STEMMER%, in command-line order.
my @stemmer_names = @ARGV;

# Copy STDIN to STDOUT.  Each section delimited by the BEGIN/END TEMPLATE
# marker lines is dropped (markers included) and replaced by one copy of its
# body per stemmer name, with %STEMMER% expanded.
LINE:
while (defined(my $input = <STDIN>)) {
    # Ordinary lines pass straight through.
    unless ($input =~ /\{\s*BEGIN TEMPLATE\s*\}/) {
        print $input;
        next LINE;
    }

    # Collect the section body up to the END marker (or end of input).
    my @body;
    while (defined($input = <STDIN>)) {
        last if $input =~ /\{\s*END TEMPLATE\s*\}/;
        push @body, $input;
    }
    my $template = join '', @body;

    for my $name (@stemmer_names) {
        (my $expanded = $template) =~ s/%STEMMER%/$name/g;
        print $expanded;
    }
}
|
|
@@ -0,0 +1,78 @@
|
|
|
1
|
+
program stemwords;
|
|
2
|
+
|
|
3
|
+
{$ifdef windows}
|
|
4
|
+
{$APPTYPE CONSOLE}
|
|
5
|
+
{$endif}
|
|
6
|
+
|
|
7
|
+
uses
|
|
8
|
+
SnowballProgram,
|
|
9
|
+
{ BEGIN TEMPLATE }
|
|
10
|
+
%STEMMER%Stemmer in '%STEMMER%Stemmer.pas',
|
|
11
|
+
{ END TEMPLATE }
|
|
12
|
+
SysUtils;
|
|
13
|
+
|
|
14
|
+
Var
|
|
15
|
+
Stemmer : TSnowballProgram;
|
|
16
|
+
CurWord : AnsiString;
|
|
17
|
+
i : Integer;
|
|
18
|
+
language : AnsiString;
|
|
19
|
+
|
|
20
|
+
Const
|
|
21
|
+
Delimiters : Set Of Char = [#10, #13];
|
|
22
|
+
|
|
23
|
+
{ Read the next word from standard input into CurWord, stopping at a
  delimiter (#10 or #13), a read error, or end of file. Returns False only
  when the input was already at end of file on entry. }
Function NextWord : Boolean;
Var Ch : Char;
Begin
    CurWord := '';

    Result := Not Eof;
    If Not Result Then Exit;

    Repeat
        Read(Ch);
        If (IOResult <> 0) Or (Ch In Delimiters) Then Break;
        CurWord := CurWord + Ch;
    Until Eof;
End;
|
|
38
|
+
|
|
39
|
+
begin
    // Command line: only "-l <language>" pairs are understood; the default
    // language is english.  Anything else is reported and ends the program.
    language := 'english';
    i := 1;
    while i <= ParamCount do
    begin
        if ParamStr(i) <> '-l' then
        begin
            WriteLn('option '+ParamStr(i)+' unknown');
            Exit;
        end;
        language := ParamStr(i + 1);
        Inc(i, 2);
    end;

    // The marked section below is a template with a %STEMMER% placeholder;
    // the leading "if False then" lets each expanded copy start with "else if".
    if False then
    { BEGIN TEMPLATE }
    else if language = '%STEMMER%' then
        Stemmer := T%STEMMER%Stemmer.Create
    { END TEMPLATE }
    else
    begin
        WriteLn('Stemming language '+language+' unknown');
        Exit;
    end;

    // Stem every word read from standard input, one result per output line.
    Try
        While Not Eof Do
        Begin
            While NextWord Do
            Begin
                Stemmer.Current := CurWord;
                Stemmer.Stem;
                WriteLn(Stemmer.Current);
            End;
        End;
    Finally
        Stemmer.Free;
    End;
end.
|
|
@@ -0,0 +1,54 @@
|
|
|
1
|
+
#!/usr/bin/env python
|
|
2
|
+
|
|
3
|
+
import sys
|
|
4
|
+
import re
|
|
5
|
+
import os
|
|
6
|
+
|
|
7
|
+
# Matches generated stemmer modules such as "english_stemmer.py" and captures
# the language name ("english").
filematch = re.compile(r"(\w+)_stemmer\.py$")

# Source of the generated package __init__.py.  It is filled in with the
# %-operator, so percent signs meant for the generated code are doubled.
INIT_TEMPLATE = '''__all__ = ('algorithms', 'stemmer')

%(imports)s

_languages = {
%(languages)s
}

try:
    import Stemmer
    cext_available = True
except ImportError:
    cext_available = False

def algorithms():
    if cext_available:
        return Stemmer.algorithms()
    else:
        return list(_languages.keys())

def stemmer(lang):
    if cext_available:
        return Stemmer.Stemmer(lang)
    if lang.lower() in _languages:
        return _languages[lang.lower()]()
    else:
        raise KeyError("Stemming algorithm '%%s' not found" %% lang)
'''


def main(argv=None):
    """Write ``__init__.py`` for the generated stemmer package.

    ``argv[1]`` is the folder holding the generated ``*_stemmer.py`` modules;
    ``__init__.py`` is written into that same folder.  ``argv`` defaults to
    ``sys.argv``.

    Compared with the previous version, the generated module now lists
    ``algorithms`` (the function it actually defines) in ``__all__`` instead
    of the undefined name ``language``, and asks the optional ``Stemmer``
    extension for its list via ``Stemmer.algorithms()``.
    """
    if argv is None:
        argv = sys.argv
    python_out_folder = argv[1]

    imports = []
    languages = []
    for pyscript in os.listdir(python_out_folder):
        match = filematch.match(pyscript)
        if match:
            langname = match.group(1)
            names = {'lang': langname, 'title': langname.title()}
            languages.append("    '%(lang)s': %(title)sStemmer," % names)
            imports.append('from .%(lang)s_stemmer import %(title)sStemmer' % names)
    # os.listdir() order is arbitrary; sort so the output is reproducible.
    imports.sort()
    languages.sort()

    src = INIT_TEMPLATE % {'imports': '\n'.join(imports),
                           'languages': '\n'.join(languages)}

    with open(os.path.join(python_out_folder, '__init__.py'), 'w') as out:
        out.write(src)


if __name__ == '__main__':
    main()
|
|
@@ -0,0 +1,81 @@
|
|
|
1
|
+
#!/usr/bin/env python
|
|
2
|
+
|
|
3
|
+
from setuptools import setup
|
|
4
|
+
import re
|
|
5
|
+
|
|
6
|
+
SNOWBALL_VERSION = '2.2.0'

# Total number of stemmers listed in modules.txt (languages plus variants).
n_stemmers = 0

# Stemmer names with no "variant of" column.
langs = []
# Maps the name in the 4th column to the stemmers declared as its variants.
variants = {}
with open('modules.txt') as fp:
    for line in fp:
        # Strip surrounding whitespace (including '\r' from CRLF files)
        # before splitting: a trailing blank would otherwise produce an empty
        # 4th token and the language would be recorded as a variant of ''.
        line = line.strip()
        if not line or line[0] == '#':
            continue
        tokens = re.split(r'\s+', line)
        if len(tokens) < 3:
            print("Bad modules.txt line: " + line)
            continue
        (name, encs, codes) = tokens[:3]
        if len(tokens) > 3:
            variant_of = tokens[3]
            variants.setdefault(variant_of, []).append(name)
        else:
            langs.append(name)
        n_stemmers += 1

desc = 'This package provides ' + str(n_stemmers) + ' stemmers for ' + \
       str(len(langs)) + ' languages generated from Snowball algorithms.'

classifiers = [
    'Development Status :: 5 - Production/Stable',
    'Intended Audience :: Developers',
    'License :: OSI Approved :: BSD License'
]

# Only classifiers listed in https://pypi.org/classifiers/ are allowed
classifiers.extend(
    'Natural Language :: ' + lang.title()
    for lang in langs
    if lang.title() not in ('Armenian', 'Yiddish')
)

classifiers.extend([
    'Operating System :: OS Independent',
    'Programming Language :: Python',
    'Programming Language :: Python :: 2',
    'Programming Language :: Python :: 2.6',
    'Programming Language :: Python :: 2.7',
    'Programming Language :: Python :: 3',
    'Programming Language :: Python :: 3.4',
    'Programming Language :: Python :: 3.5',
    'Programming Language :: Python :: 3.6',
    'Programming Language :: Python :: 3.7',
    'Programming Language :: Python :: 3.8',
    'Programming Language :: Python :: 3.9',
    'Programming Language :: Python :: 3.10',
    'Programming Language :: Python :: Implementation :: CPython',
    'Programming Language :: Python :: Implementation :: PyPy',
    'Topic :: Database',
    'Topic :: Internet :: WWW/HTTP :: Indexing/Search',
    'Topic :: Text Processing :: Indexing',
    'Topic :: Text Processing :: Linguistic'
])

setup(name='snowballstemmer',
      version=SNOWBALL_VERSION,
      description=desc,
      author='Snowball Developers',
      author_email='snowball-discuss@lists.tartarus.org',
      url='https://github.com/snowballstem/snowball',
      keywords="stemmer",
      license="BSD-3-Clause",
      packages=['snowballstemmer'],
      package_dir={"snowballstemmer": "src/snowballstemmer"},
      classifiers=classifiers
      )
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
|
|
2
|
+
class Among(object):
    """One entry of an "among" lookup table.

    @ivar s: search string
    @ivar substring_i: index to longest matching substring
    @ivar result: result of the lookup
    @ivar method: method to use if substring matches (None when absent)
    """

    def __init__(self, s, substring_i, result, method=None):
        self.s = s
        self.substring_i = substring_i
        self.result = result
        self.method = method

    def __repr__(self):
        # Debugging aid only; shows every stored field.
        return '%s(%r, %r, %r, %r)' % (
            type(self).__name__, self.s, self.substring_i, self.result,
            self.method)
|