mittens 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (137) hide show
  1. checksums.yaml +7 -0
  2. data/CHANGELOG.md +3 -0
  3. data/Gemfile +7 -0
  4. data/LICENSE.txt +30 -0
  5. data/README.md +62 -0
  6. data/Rakefile +21 -0
  7. data/ext/mittens/ext.c +96 -0
  8. data/ext/mittens/extconf.rb +12 -0
  9. data/lib/mittens/version.rb +3 -0
  10. data/lib/mittens.rb +7 -0
  11. data/mittens.gemspec +22 -0
  12. data/vendor/snowball/.gitignore +26 -0
  13. data/vendor/snowball/.travis.yml +112 -0
  14. data/vendor/snowball/AUTHORS +27 -0
  15. data/vendor/snowball/CONTRIBUTING.rst +216 -0
  16. data/vendor/snowball/COPYING +29 -0
  17. data/vendor/snowball/GNUmakefile +742 -0
  18. data/vendor/snowball/NEWS +754 -0
  19. data/vendor/snowball/README.rst +37 -0
  20. data/vendor/snowball/ada/README.md +74 -0
  21. data/vendor/snowball/ada/generate/generate.adb +83 -0
  22. data/vendor/snowball/ada/generate.gpr +21 -0
  23. data/vendor/snowball/ada/src/stemmer.adb +620 -0
  24. data/vendor/snowball/ada/src/stemmer.ads +219 -0
  25. data/vendor/snowball/ada/src/stemwords.adb +70 -0
  26. data/vendor/snowball/ada/stemmer_config.gpr +83 -0
  27. data/vendor/snowball/ada/stemwords.gpr +21 -0
  28. data/vendor/snowball/algorithms/arabic.sbl +558 -0
  29. data/vendor/snowball/algorithms/armenian.sbl +301 -0
  30. data/vendor/snowball/algorithms/basque.sbl +149 -0
  31. data/vendor/snowball/algorithms/catalan.sbl +202 -0
  32. data/vendor/snowball/algorithms/danish.sbl +93 -0
  33. data/vendor/snowball/algorithms/dutch.sbl +164 -0
  34. data/vendor/snowball/algorithms/english.sbl +229 -0
  35. data/vendor/snowball/algorithms/finnish.sbl +197 -0
  36. data/vendor/snowball/algorithms/french.sbl +254 -0
  37. data/vendor/snowball/algorithms/german.sbl +139 -0
  38. data/vendor/snowball/algorithms/german2.sbl +145 -0
  39. data/vendor/snowball/algorithms/greek.sbl +701 -0
  40. data/vendor/snowball/algorithms/hindi.sbl +323 -0
  41. data/vendor/snowball/algorithms/hungarian.sbl +241 -0
  42. data/vendor/snowball/algorithms/indonesian.sbl +192 -0
  43. data/vendor/snowball/algorithms/irish.sbl +149 -0
  44. data/vendor/snowball/algorithms/italian.sbl +202 -0
  45. data/vendor/snowball/algorithms/kraaij_pohlmann.sbl +240 -0
  46. data/vendor/snowball/algorithms/lithuanian.sbl +373 -0
  47. data/vendor/snowball/algorithms/lovins.sbl +208 -0
  48. data/vendor/snowball/algorithms/nepali.sbl +92 -0
  49. data/vendor/snowball/algorithms/norwegian.sbl +80 -0
  50. data/vendor/snowball/algorithms/porter.sbl +139 -0
  51. data/vendor/snowball/algorithms/portuguese.sbl +218 -0
  52. data/vendor/snowball/algorithms/romanian.sbl +236 -0
  53. data/vendor/snowball/algorithms/russian.sbl +221 -0
  54. data/vendor/snowball/algorithms/serbian.sbl +2379 -0
  55. data/vendor/snowball/algorithms/spanish.sbl +230 -0
  56. data/vendor/snowball/algorithms/swedish.sbl +72 -0
  57. data/vendor/snowball/algorithms/tamil.sbl +405 -0
  58. data/vendor/snowball/algorithms/turkish.sbl +470 -0
  59. data/vendor/snowball/algorithms/yiddish.sbl +460 -0
  60. data/vendor/snowball/charsets/ISO-8859-2.sbl +98 -0
  61. data/vendor/snowball/charsets/KOI8-R.sbl +74 -0
  62. data/vendor/snowball/charsets/cp850.sbl +130 -0
  63. data/vendor/snowball/compiler/analyser.c +1547 -0
  64. data/vendor/snowball/compiler/driver.c +615 -0
  65. data/vendor/snowball/compiler/generator.c +1748 -0
  66. data/vendor/snowball/compiler/generator_ada.c +1702 -0
  67. data/vendor/snowball/compiler/generator_csharp.c +1322 -0
  68. data/vendor/snowball/compiler/generator_go.c +1278 -0
  69. data/vendor/snowball/compiler/generator_java.c +1313 -0
  70. data/vendor/snowball/compiler/generator_js.c +1316 -0
  71. data/vendor/snowball/compiler/generator_pascal.c +1387 -0
  72. data/vendor/snowball/compiler/generator_python.c +1337 -0
  73. data/vendor/snowball/compiler/generator_rust.c +1295 -0
  74. data/vendor/snowball/compiler/header.h +418 -0
  75. data/vendor/snowball/compiler/space.c +286 -0
  76. data/vendor/snowball/compiler/syswords.h +86 -0
  77. data/vendor/snowball/compiler/syswords2.h +13 -0
  78. data/vendor/snowball/compiler/tokeniser.c +567 -0
  79. data/vendor/snowball/csharp/.gitignore +8 -0
  80. data/vendor/snowball/csharp/Snowball/Algorithms/.gitignore +1 -0
  81. data/vendor/snowball/csharp/Snowball/Among.cs +108 -0
  82. data/vendor/snowball/csharp/Snowball/AssemblyInfo.cs +36 -0
  83. data/vendor/snowball/csharp/Snowball/Stemmer.cs +660 -0
  84. data/vendor/snowball/csharp/Stemwords/App.config +6 -0
  85. data/vendor/snowball/csharp/Stemwords/Program.cs +114 -0
  86. data/vendor/snowball/doc/TODO +12 -0
  87. data/vendor/snowball/doc/libstemmer_c_README +148 -0
  88. data/vendor/snowball/doc/libstemmer_csharp_README +53 -0
  89. data/vendor/snowball/doc/libstemmer_java_README +67 -0
  90. data/vendor/snowball/doc/libstemmer_js_README +48 -0
  91. data/vendor/snowball/doc/libstemmer_python_README +113 -0
  92. data/vendor/snowball/examples/stemwords.c +204 -0
  93. data/vendor/snowball/go/README.md +55 -0
  94. data/vendor/snowball/go/among.go +16 -0
  95. data/vendor/snowball/go/env.go +403 -0
  96. data/vendor/snowball/go/stemwords/generate.go +68 -0
  97. data/vendor/snowball/go/stemwords/main.go +68 -0
  98. data/vendor/snowball/go/util.go +34 -0
  99. data/vendor/snowball/iconv.py +50 -0
  100. data/vendor/snowball/include/libstemmer.h +78 -0
  101. data/vendor/snowball/java/org/tartarus/snowball/Among.java +29 -0
  102. data/vendor/snowball/java/org/tartarus/snowball/SnowballProgram.java +381 -0
  103. data/vendor/snowball/java/org/tartarus/snowball/SnowballStemmer.java +8 -0
  104. data/vendor/snowball/java/org/tartarus/snowball/TestApp.java +75 -0
  105. data/vendor/snowball/javascript/base-stemmer.js +294 -0
  106. data/vendor/snowball/javascript/stemwords.js +106 -0
  107. data/vendor/snowball/libstemmer/libstemmer_c.in +96 -0
  108. data/vendor/snowball/libstemmer/mkalgorithms.pl +90 -0
  109. data/vendor/snowball/libstemmer/mkmodules.pl +267 -0
  110. data/vendor/snowball/libstemmer/modules.txt +63 -0
  111. data/vendor/snowball/libstemmer/test.c +34 -0
  112. data/vendor/snowball/pascal/.gitignore +4 -0
  113. data/vendor/snowball/pascal/SnowballProgram.pas +430 -0
  114. data/vendor/snowball/pascal/generate.pl +23 -0
  115. data/vendor/snowball/pascal/stemwords-template.dpr +78 -0
  116. data/vendor/snowball/python/MANIFEST.in +7 -0
  117. data/vendor/snowball/python/create_init.py +54 -0
  118. data/vendor/snowball/python/setup.cfg +6 -0
  119. data/vendor/snowball/python/setup.py +81 -0
  120. data/vendor/snowball/python/snowballstemmer/among.py +13 -0
  121. data/vendor/snowball/python/snowballstemmer/basestemmer.py +323 -0
  122. data/vendor/snowball/python/stemwords.py +101 -0
  123. data/vendor/snowball/python/testapp.py +28 -0
  124. data/vendor/snowball/runtime/api.c +58 -0
  125. data/vendor/snowball/runtime/api.h +32 -0
  126. data/vendor/snowball/runtime/header.h +61 -0
  127. data/vendor/snowball/runtime/utilities.c +513 -0
  128. data/vendor/snowball/rust/Cargo.toml +7 -0
  129. data/vendor/snowball/rust/build.rs +55 -0
  130. data/vendor/snowball/rust/rust-pre-1.27-compat.patch +30 -0
  131. data/vendor/snowball/rust/src/main.rs +102 -0
  132. data/vendor/snowball/rust/src/snowball/algorithms/mod.rs +2 -0
  133. data/vendor/snowball/rust/src/snowball/among.rs +6 -0
  134. data/vendor/snowball/rust/src/snowball/mod.rs +6 -0
  135. data/vendor/snowball/rust/src/snowball/snowball_env.rs +421 -0
  136. data/vendor/snowball/tests/stemtest.c +95 -0
  137. metadata +178 -0
@@ -0,0 +1,430 @@
1
+ unit SnowballProgram;
2
+
3
+ interface
4
+
5
Type
    { Condition routine attached to an among entry; takes no arguments
      and reports success as a Boolean. }
    TAmongHandler = Function : Boolean of Object;

    { One entry of an among table, as searched by FindAmong / FindAmongBk. }
    TAmong = record
        Str    : AnsiString;     // search string
        Index  : Integer;        // index to longest matching substring
        Result : Integer;        // result of the lookup
        Method : TAmongHandler;  // method to use if substring matches
    End;

    {$M+}
    { Base class holding the string being processed together with the
      cursor, limit and slice state that the primitives below operate on. }
    TSnowballProgram = Class
    Protected
        FCurrent : AnsiString;   // string being processed

        { All positions are 0-based offsets into FCurrent:
            FCursor  - current position
            FLimit   - forward limit
            FBkLimit - backward limit
            FBra     - start of the current slice
            FKet     - end of the current slice }
        FCursor, FLimit, FBkLimit, FBra, FKet : Integer;

        Procedure SetCurrent(Current: AnsiString);

        { Character-class tests against a bitmap grouping }
        Function InGrouping(s : array of char; min, max : Integer) : Boolean;
        Function InGroupingBk(s : array of char; min, max : Integer) : Boolean;
        Function OutGrouping(s : array of char; min, max : Integer) : Boolean;
        Function OutGroupingBk(s : array of char; min, max : Integer) : Boolean;

        { Literal string comparison at the cursor }
        Function EqS(s_size : Integer; s : AnsiString) : Boolean;
        Function EqSBk(s_size : Integer; s : AnsiString) : Boolean;

        Function EqV(s : AnsiString) : Boolean;
        Function EqVBk(s : AnsiString) : Boolean;

        { Among table lookup }
        Function FindAmong(v : array of TAmong; v_size : Integer) : Integer;
        Function FindAmongBk(v : array of TAmong; v_size : Integer) : Integer;

        { Slice manipulation }
        Procedure SliceDel;
        Procedure SliceCheck;
        Procedure SliceFrom(s : AnsiString);

        Function ReplaceS(bra, ket : Integer; s : AnsiString) : Integer;
        Procedure Insert(bra, ket : Integer; s : AnsiString);

        Function SliceTo : AnsiString;
        Function AssignTo : AnsiString;

    Public
        { Set & Retrieve current string }
        Property Current: AnsiString Read FCurrent Write SetCurrent;

        { Method subclasses need to implement }
        Function stem : Boolean; Virtual; Abstract;
    End;
61
+
62
+ Implementation
63
+
64
+ Uses Math;
65
+
66
{ Install a new string and reset cursor, limits and slice markers so that
  they span the whole of it. }
Procedure TSnowballProgram.SetCurrent(Current: AnsiString);
Begin
    FCurrent := Current;

    // Scanning range: from the start of the string up to its length.
    FBkLimit := 0;
    FCursor  := 0;
    FLimit   := Length(FCurrent);

    // The slice initially covers the entire string.
    FBra := 0;
    FKet := FLimit;
End;
75
+
76
{ Forward test: if the character at the cursor lies in [min, max] and its
  bit is set in the bitmap s, advance the cursor by one and return True.
  Otherwise leave the cursor alone and return False. }
Function TSnowballProgram.InGrouping(s : array of char; min, max : Integer) : Boolean;
Var code, offset : Integer;
Begin
    Result := False;

    If FCursor < FLimit Then
    Begin
        // FCursor is 0-based, AnsiString indexing is 1-based.
        code := Ord(FCurrent[FCursor + 1]);

        If (code >= min) And (code <= max) Then
        Begin
            // Bit (code - min) of the bitmap: byte offset Shr 3, bit offset And 7.
            offset := code - min;
            If (Ord(s[offset Shr 3]) And Ord(1 Shl (offset And $7))) <> 0 Then
            Begin
                Inc(FCursor);
                Result := True;
            End;
        End;
    End;
End;
91
+
92
{ Backward test: if the character just before the cursor lies in
  [min, max] and its bit is set in the bitmap s, move the cursor back by
  one and return True. Otherwise leave the cursor alone and return False. }
Function TSnowballProgram.InGroupingBk(s : array of char; min, max : Integer) : Boolean;
Var code, offset : Integer;
Begin
    Result := False;

    If FCursor > FBkLimit Then
    Begin
        // With a 0-based cursor, FCurrent[FCursor] is the preceding character.
        code := Ord(FCurrent[FCursor]);

        If (code >= min) And (code <= max) Then
        Begin
            // Bit (code - min) of the bitmap: byte offset Shr 3, bit offset And 7.
            offset := code - min;
            If (Ord(s[offset Shr 3]) And Ord(1 Shl (offset And $7))) <> 0 Then
            Begin
                Dec(FCursor);
                Result := True;
            End;
        End;
    End;
End;
107
+
108
{ Forward test: if the character at the cursor is outside [min, max], or
  inside the range but with its bit clear in the bitmap s, advance the
  cursor by one and return True. Otherwise return False without moving. }
Function TSnowballProgram.OutGrouping(s : array of char; min, max : Integer) : Boolean;
Var code : Integer;
    excluded : Boolean;
Begin
    Result := False;
    If FCursor >= FLimit Then Exit;

    code := Ord(FCurrent[FCursor + 1]);

    // Anything outside the bitmap's range is not in the grouping.
    excluded := (code < min) Or (code > max);
    If Not excluded Then
    Begin
        // In range: consult bit (code - min) of the bitmap.
        code := code - min;
        excluded := (Ord(s[code Shr 3]) And Ord(1 Shl (code And $7))) = 0;
    End;

    If excluded Then
    Begin
        Inc(FCursor);
        Result := True;
    End;
End;
130
+
131
{ Backward test: if the character just before the cursor is outside
  [min, max], or inside the range but with its bit clear in the bitmap s,
  move the cursor back by one and return True. Otherwise return False
  without moving. }
Function TSnowballProgram.OutGroupingBk(s : array of char; min, max : Integer) : Boolean;
Var code : Integer;
    excluded : Boolean;
Begin
    Result := False;
    If FCursor <= FBkLimit Then Exit;

    code := Ord(FCurrent[FCursor]);

    // Anything outside the bitmap's range is not in the grouping.
    excluded := (code < min) Or (code > max);
    If Not excluded Then
    Begin
        // In range: consult bit (code - min) of the bitmap.
        code := code - min;
        excluded := (Ord(s[code Shr 3]) And Ord(1 Shl (code And $7))) = 0;
    End;

    If excluded Then
    Begin
        Dec(FCursor);
        Result := True;
    End;
End;
153
+
154
{ Forward literal match: if the first s_size characters of s appear at the
  cursor, advance the cursor past them and return True. Otherwise return
  False and leave the cursor unchanged. }
Function TSnowballProgram.EqS(s_size : Integer; s : AnsiString) : Boolean;
Var offset : Integer;
Begin
    Result := False;

    // Not enough characters left before the forward limit.
    If s_size > (FLimit - FCursor) Then Exit;

    offset := 0;
    While offset < s_size Do
    Begin
        Inc(offset);
        If s[offset] <> FCurrent[FCursor + offset] Then Exit;
    End;

    Inc(FCursor, s_size);
    Result := True;
End;
168
+
169
{ Backward literal match: if the first s_size characters of s appear
  immediately before the cursor, move the cursor back over them and return
  True. Otherwise return False and leave the cursor unchanged. }
Function TSnowballProgram.EqSBk(s_size : Integer; s : AnsiString) : Boolean;
Var offset, base : Integer;
Begin
    Result := False;

    // Not enough characters between the backward limit and the cursor.
    If s_size > (FCursor - FBkLimit) Then Exit;

    // 0-based position where the candidate match would start.
    base := FCursor - s_size;

    offset := 0;
    While offset < s_size Do
    Begin
        Inc(offset);
        If s[offset] <> FCurrent[base + offset] Then Exit;
    End;

    FCursor := base;
    Result := True;
End;
183
+
184
{ Forward match of the whole of s at the cursor; see EqS. }
Function TSnowballProgram.EqV(s : AnsiString) : Boolean;
Var whole : Integer;
Begin
    whole := Length(s);
    Result := EqS(whole, s);
End;
188
+
189
{ Backward match of the whole of s ending at the cursor; see EqSBk. }
Function TSnowballProgram.EqVBk(s : AnsiString) : Boolean;
Var whole : Integer;
Begin
    whole := Length(s);
    Result := EqSBk(whole, s);
End;
193
+
194
{ Forward among lookup.

  Phase 1 binary-searches the first v_size entries of v, comparing each
  candidate's Str with the text starting at the cursor. Phase 2 starts at
  the entry the search settled on and follows the Index links until an
  entry is fully matched and accepted (no Method, or Method returns True).

  On success the cursor is placed just after the matched string and the
  entry's Result is returned. If the chain ends (Index < 0) without an
  accepted entry, 0 is returned. }
Function TSnowballProgram.FindAmong(v : array of TAmong; v_size : Integer) : Integer;
Var low_idx, high_idx, mid : Integer;
    start, stop : Integer;
    matched_low, matched_high, matched : Integer;
    delta : Integer;
    first_key_inspected, accepted : Boolean;
    cand : TAmong;
Begin
    low_idx := 0;
    high_idx := v_size;

    start := FCursor;
    stop := FLimit;

    // Number of leading characters already known to match at each bound.
    matched_low := 0;
    matched_high := 0;

    first_key_inspected := False;

    Repeat
        mid := low_idx + ((high_idx - low_idx) Shr 1);
        delta := 0;
        matched := Min(matched_low, matched_high); // smaller
        cand := v[mid];

        // Extend the common prefix one character at a time.
        While matched < Length(cand.Str) Do
        Begin
            // Ran into the forward limit: the text sorts before cand.
            If (start + matched) = stop Then
            Begin
                delta := -1;
                Break;
            End;

            delta := Ord(FCurrent[start + matched + 1]) - Ord(cand.Str[matched + 1]);
            If delta <> 0 Then Break;

            Inc(matched);
        End;

        If delta < 0 Then
        Begin
            high_idx := mid;
            matched_high := matched;
        End
        Else
        Begin
            low_idx := mid;
            matched_low := matched;
        End;

        If (high_idx - low_idx) <= 1 Then
        Begin
            If low_idx > 0 Then Break;         // v->s has been inspected
            If high_idx = low_idx Then Break;  // only one item in v

            // - but now we need to go round once more to get
            // v->s inspected. This looks messy, but is actually
            // the optimal approach.

            If first_key_inspected Then Break;
            first_key_inspected := True;
        End;
    Until False;

    Repeat
        cand := v[low_idx];

        If matched_low >= Length(cand.Str) Then
        Begin
            FCursor := start + Length(cand.Str);

            // An entry without a condition routine is accepted outright.
            accepted := True;
            If Assigned(cand.Method) Then
            Begin
                accepted := cand.Method;
                // Put the cursor back after the match once the routine has run.
                FCursor := start + Length(cand.Str);
            End;

            If accepted Then
            Begin
                Result := cand.Result;
                Exit;
            End;
        End;

        // Fall back to the entry this one links to.
        low_idx := cand.Index;
    Until low_idx < 0;

    Result := 0;
End;
285
+
286
{ Backward among lookup.

  Mirror image of FindAmong: each candidate's Str is compared from its
  last character towards its first against the text ending at the cursor.
  Phase 1 binary-searches the first v_size entries of v. Phase 2 follows
  the Index links until an entry is fully matched and accepted (no
  Method, or Method returns True).

  On success the cursor is placed at the start of the matched string and
  the entry's Result is returned. If the chain ends (Index < 0) without an
  accepted entry, 0 is returned. }
Function TSnowballProgram.FindAmongBk(v : array of TAmong; v_size : Integer) : Integer;
Var low_idx, high_idx, mid : Integer;
    start, floor_pos : Integer;
    matched_low, matched_high, matched : Integer;
    delta : Integer;
    first_key_inspected, accepted : Boolean;
    cand : TAmong;
Begin
    low_idx := 0;
    high_idx := v_size;

    start := FCursor;
    floor_pos := FBkLimit;

    // Number of trailing characters already known to match at each bound.
    matched_low := 0;
    matched_high := 0;

    first_key_inspected := False;

    Repeat
        mid := low_idx + ((high_idx - low_idx) Shr 1);
        delta := 0;
        matched := Min(matched_low, matched_high);
        cand := v[mid];

        // Extend the common suffix one character at a time, right to left.
        While matched < Length(cand.Str) Do
        Begin
            // Ran into the backward limit: the text sorts before cand.
            If (start - matched) = floor_pos Then
            Begin
                delta := -1;
                Break;
            End;

            delta := Ord(FCurrent[start - matched]) - Ord(cand.Str[Length(cand.Str) - matched]);
            If delta <> 0 Then Break;

            Inc(matched);
        End;

        If delta < 0 Then
        Begin
            high_idx := mid;
            matched_high := matched;
        End
        Else
        Begin
            low_idx := mid;
            matched_low := matched;
        End;

        If (high_idx - low_idx) <= 1 Then
        Begin
            If low_idx > 0 Then Break;
            If high_idx = low_idx Then Break;
            // Go round once more so that the first entry gets inspected.
            If first_key_inspected Then Break;
            first_key_inspected := True;
        End;
    Until False;

    Repeat
        cand := v[low_idx];

        If matched_low >= Length(cand.Str) Then
        Begin
            FCursor := start - Length(cand.Str);

            // An entry without a condition routine is accepted outright.
            accepted := True;
            If Assigned(cand.Method) Then
            Begin
                accepted := cand.Method;
                // Put the cursor back before the match once the routine has run.
                FCursor := start - Length(cand.Str);
            End;

            If accepted Then
            Begin
                Result := cand.Result;
                Exit;
            End;
        End;

        // Fall back to the entry this one links to.
        low_idx := cand.Index;
    Until low_idx < 0;

    Result := 0;
End;
372
+
373
{ Validate the slice markers: 0 <= FBra <= FKet <= FLimit <= Length(FCurrent).
  If the ordering is violated, print a message and halt the program. }
Procedure TSnowballProgram.SliceCheck;
Var consistent : Boolean;
Begin
    consistent := (FBra >= 0)
              And (FBra <= FKet)
              And (FKet <= FLimit)
              And (FLimit <= Length(FCurrent));

    If Not consistent Then
    Begin
        WriteLn('Faulty slice operation.');
        Halt;
    End;
End;
381
+
382
{ Delete the current slice [FBra, FKet), i.e. replace it with the empty
  string after validating the slice markers. }
Procedure TSnowballProgram.SliceDel;
Begin
    SliceCheck();
    ReplaceS(FBra, FKet, '');
End;
386
+
387
{ Replace the characters in [bra, ket) of FCurrent with s.

  FLimit is shifted by the change in length. A cursor at or after ket is
  shifted by the same amount; a cursor strictly inside the replaced range
  is moved to bra. Returns the change in length (may be negative). }
Function TSnowballProgram.ReplaceS(bra, ket : Integer; s : AnsiString) : Integer;
Var removed : Integer;
Begin
    removed := ket - bra;
    Result := Length(s) - removed;

    // Positions are 0-based; Delete/Insert take 1-based indices.
    Delete(FCurrent, bra + 1, removed);
    // Qualified so the System routine is used, not this class's Insert method.
    System.Insert(s, FCurrent, bra + 1);

    Inc(FLimit, Result);

    If FCursor >= ket Then
        Inc(FCursor, Result)
    Else If FCursor > bra Then
        FCursor := bra;
End;
404
+
405
{ Replace [bra, ket) with s via ReplaceS, then shift each slice marker
  that sits at or after bra by the resulting change in length. }
Procedure TSnowballProgram.Insert(bra, ket : Integer; s : AnsiString);
Var shift : Integer;
Begin
    shift := ReplaceS(bra, ket, s);

    If FBra >= bra Then Inc(FBra, shift);
    If FKet >= bra Then Inc(FKet, shift);
End;
412
+
413
{ Return a copy of the current slice [FBra, FKet) after validating the
  slice markers. }
Function TSnowballProgram.SliceTo() : AnsiString;
Var first, count : Integer;
Begin
    SliceCheck();

    first := FBra + 1;       // 1-based index of the slice's first character
    count := FKet - FBra;
    Result := Copy(FCurrent, first, count);
End;
418
+
419
{ Replace the current slice [FBra, FKet) with s after validating the
  slice markers. The length change returned by ReplaceS is not used. }
Procedure TSnowballProgram.SliceFrom(s : AnsiString);
Begin
    SliceCheck();
    ReplaceS(FBra, FKet, s);
End;
424
+
425
{ Return a copy of the first FLimit characters of the current string. }
Function TSnowballProgram.AssignTo() : AnsiString;
Var count : Integer;
Begin
    count := FLimit;
    Result := Copy(FCurrent, 1, count);
End;
429
+
430
+ End.
@@ -0,0 +1,23 @@
1
+ #!/usr/bin/env perl
2
+ use strict;
3
+ use warnings;
4
+
5
+ # Generate Pascal stemwords source.
6
+
7
# Names substituted for %STEMMER%, one per command-line argument.
my @stemmers = @ARGV;

# Copy STDIN to STDOUT.  A region enclosed by "{ BEGIN TEMPLATE }" and
# "{ END TEMPLATE }" marker lines is emitted once per stemmer name with
# %STEMMER% replaced; the marker lines themselves are dropped.
LINE: while (defined(my $text = <STDIN>)) {
    unless ($text =~ /\{\s*BEGIN TEMPLATE\s*\}/) {
        print $text;
        next LINE;
    }

    # Gather the template body up to the end marker (or end of input).
    my @chunk;
    while (defined($text = <STDIN>)) {
        last if $text =~ /\{\s*END TEMPLATE\s*\}/;
        push @chunk, $text;
    }
    my $body = join '', @chunk;

    for my $name (@stemmers) {
        (my $expanded = $body) =~ s/%STEMMER%/$name/g;
        print $expanded;
    }
}
@@ -0,0 +1,78 @@
1
+ program stemwords;
2
+
3
+ {$ifdef windows}
4
+ {$APPTYPE CONSOLE}
5
+ {$endif}
6
+
7
+ uses
8
+ SnowballProgram,
9
+ { BEGIN TEMPLATE }
10
+ %STEMMER%Stemmer in '%STEMMER%Stemmer.pas',
11
+ { END TEMPLATE }
12
+ SysUtils;
13
+
14
+ Var
15
+ Stemmer : TSnowballProgram;
16
+ CurWord : AnsiString;
17
+ i : Integer;
18
+ language : AnsiString;
19
+
20
+ Const
21
+ Delimiters : Set Of Char = [#10, #13];
22
+
23
// Read the next word from standard input into CurWord.
// Characters are accumulated until a delimiter, a read error or end of
// input is reached.  Returns False only if input was already exhausted on
// entry; CurWord is then empty.
Function NextWord : Boolean;
Var Ch : Char;
Begin
    Result := Not Eof;
    CurWord := '';

    While Not Eof Do
    Begin
        Read(Ch);
        // Stop on a failed read or at the end of the word.
        If (IOResult <> 0) Or (Ch In Delimiters) Then Break;
        CurWord := CurWord + Ch;
    End;
End;
38
+
39
begin
    // Command line: only "-l <language>" is recognised; the default
    // language is english.
    language := 'english';
    i := 1;
    while i <= ParamCount do
    begin
        if ParamStr(i) <> '-l' then
        begin
            WriteLn('option '+ParamStr(i)+' unknown');
            Exit;
        end;

        // -l consumes the argument that follows it as the language name.
        language := ParamStr(i + 1);
        Inc(i, 2);
    end;

    // Pick the stemmer class for the requested language.  The lines
    // between the two template markers are a template keyed on the
    // stemmer name; "if False then" gives the else-if chain a head.
    if False then
    { BEGIN TEMPLATE }
    else if language = '%STEMMER%' then
        Stemmer := T%STEMMER%Stemmer.Create
    { END TEMPLATE }
    else
    begin
        WriteLn('Stemming language '+language+' unknown');
        Exit;
    end;

    // Stem standard input word by word, writing one result per line.
    Try
        While Not Eof Do
        Begin
            While NextWord Do
            Begin
                Stemmer.Current := CurWord;
                Stemmer.Stem;
                WriteLn(Stemmer.Current);
            End;
        End;
    Finally
        Stemmer.Free;
    End;
end.
@@ -0,0 +1,7 @@
1
+ include *.rst
2
+ include modules.txt
3
+ include setup.*
4
+ recursive-include src *.py
5
+ include MANIFEST.in
6
+ include COPYING
7
+ include NEWS
@@ -0,0 +1,54 @@
1
#!/usr/bin/env python
"""Generate ``__init__.py`` for a folder of generated stemmer modules.

Usage: create_init.py PYTHON_OUT_FOLDER

Every file in the folder named ``<lang>_stemmer.py`` is registered in the
generated module, which exposes ``algorithms()`` and ``stemmer(lang)``.
"""

import os
import re
import sys

# Matches generated stemmer modules such as "english_stemmer.py";
# group 1 is the language name.
filematch = re.compile(r"(\w+)_stemmer\.py$")

# Template of the generated module.  "%%" is a literal percent sign that
# survives the substitution done in build_init_source().
_INIT_TEMPLATE = '''__all__ = ('algorithms', 'stemmer')

%(imports)s

_languages = {
%(languages)s
}

try:
    import Stemmer
    cext_available = True
except ImportError:
    cext_available = False

def algorithms():
    if cext_available:
        return Stemmer.algorithms()
    else:
        return list(_languages.keys())

def stemmer(lang):
    if cext_available:
        return Stemmer.Stemmer(lang)
    if lang.lower() in _languages:
        return _languages[lang.lower()]()
    else:
        raise KeyError("Stemming algorithm '%%s' not found" %% lang)
'''


def build_init_source(filenames):
    """Return the text of ``__init__.py`` for the given file names.

    Names that do not look like ``<lang>_stemmer.py`` are ignored.  Import
    lines and language table entries are emitted in sorted order so the
    output does not depend on directory listing order.
    """
    imports = []
    languages = []
    for pyscript in filenames:
        match = filematch.match(pyscript)
        if not match:
            continue
        names = {'lang': match.group(1), 'title': match.group(1).title()}
        languages.append("    '%(lang)s': %(title)sStemmer," % names)
        imports.append('from .%(lang)s_stemmer import %(title)sStemmer' % names)
    imports.sort()
    languages.sort()
    return _INIT_TEMPLATE % {'imports': '\n'.join(imports),
                             'languages': '\n'.join(languages)}


def main(argv=None):
    """Write ``__init__.py`` into the folder named by ``argv[1]``.

    Returns a process exit status: 0 on success, 1 on a usage error.
    """
    if argv is None:
        argv = sys.argv
    if len(argv) < 2:
        sys.stderr.write('usage: %s PYTHON_OUT_FOLDER\n' % argv[0])
        return 1

    python_out_folder = argv[1]
    src = build_init_source(os.listdir(python_out_folder))
    with open(os.path.join(python_out_folder, '__init__.py'), 'w') as out:
        out.write(src)
    return 0


if __name__ == '__main__':
    sys.exit(main())
@@ -0,0 +1,6 @@
1
+ [metadata]
2
+ long_description = file: README.rst
3
+ long_description_content_type = text/x-rst
4
+
5
+ [bdist_wheel]
6
+ universal=1
@@ -0,0 +1,81 @@
1
+ #!/usr/bin/env python
2
+
3
+ from setuptools import setup
4
+ import re
5
+
6
SNOWBALL_VERSION = '2.2.0'

# Scan modules.txt.  Each usable line has at least three whitespace
# separated fields (name, encodings, codes); an optional fourth field
# names the stemmer this one is a variant of.
langs = []        # stemmers that are not variants of another one
variants = {}     # base stemmer name -> list of its variant names
n_stemmers = 0    # every stemmer, variants included
with open('modules.txt') as fp:
    for raw in fp:
        # Skip blank lines and comments.
        if len(raw) <= 1 or raw.startswith('#'):
            continue
        entry = raw[:-1] if raw.endswith('\n') else raw
        fields = re.split(r'\s+', entry)
        if len(fields) < 3:
            print("Bad modules.txt line: " + entry)
            continue
        name = fields[0]
        if len(fields) > 3:
            variants.setdefault(fields[3], []).append(name)
        else:
            langs.append(name)
        n_stemmers += 1

desc = ('This package provides %d stemmers for %d languages '
        'generated from Snowball algorithms.') % (n_stemmers, len(langs))

# Only classifiers listed in https://pypi.org/classifiers/ are allowed
_unlisted_languages = ('Armenian', 'Yiddish')

classifiers = [
    'Development Status :: 5 - Production/Stable',
    'Intended Audience :: Developers',
    'License :: OSI Approved :: BSD License',
]
classifiers += [
    'Natural Language :: ' + title
    for title in (lang.title() for lang in langs)
    if title not in _unlisted_languages
]
classifiers += [
    'Operating System :: OS Independent',
    'Programming Language :: Python',
    'Programming Language :: Python :: 2',
    'Programming Language :: Python :: 2.6',
    'Programming Language :: Python :: 2.7',
    'Programming Language :: Python :: 3',
    'Programming Language :: Python :: 3.4',
    'Programming Language :: Python :: 3.5',
    'Programming Language :: Python :: 3.6',
    'Programming Language :: Python :: 3.7',
    'Programming Language :: Python :: 3.8',
    'Programming Language :: Python :: 3.9',
    'Programming Language :: Python :: 3.10',
    'Programming Language :: Python :: Implementation :: CPython',
    'Programming Language :: Python :: Implementation :: PyPy',
    'Topic :: Database',
    'Topic :: Internet :: WWW/HTTP :: Indexing/Search',
    'Topic :: Text Processing :: Indexing',
    'Topic :: Text Processing :: Linguistic',
]

setup(
    name='snowballstemmer',
    version=SNOWBALL_VERSION,
    description=desc,
    author='Snowball Developers',
    author_email='snowball-discuss@lists.tartarus.org',
    url='https://github.com/snowballstem/snowball',
    keywords="stemmer",
    license="BSD-3-Clause",
    packages=['snowballstemmer'],
    package_dir={"snowballstemmer": "src/snowballstemmer"},
    classifiers=classifiers,
)
@@ -0,0 +1,13 @@
1
+
2
class Among(object):
    """One entry of an among table.

    Attributes:
        s: search string
        substring_i: index to longest matching substring
        result: result of the lookup
        method: method to use if substring matches (``None`` when the
            entry has no such method)
    """

    def __init__(self, s, substring_i, result, method=None):
        """Store the four fields of the entry unchanged."""
        self.s = s
        self.substring_i = substring_i
        self.result = result
        self.method = method

    def __repr__(self):
        """Return a constructor-style representation for debugging."""
        return '%s(%r, %r, %r, %r)' % (
            type(self).__name__,
            self.s,
            self.substring_i,
            self.result,
            self.method,
        )