mittens 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/CHANGELOG.md +3 -0
- data/Gemfile +7 -0
- data/LICENSE.txt +30 -0
- data/README.md +62 -0
- data/Rakefile +21 -0
- data/ext/mittens/ext.c +96 -0
- data/ext/mittens/extconf.rb +12 -0
- data/lib/mittens/version.rb +3 -0
- data/lib/mittens.rb +7 -0
- data/mittens.gemspec +22 -0
- data/vendor/snowball/.gitignore +26 -0
- data/vendor/snowball/.travis.yml +112 -0
- data/vendor/snowball/AUTHORS +27 -0
- data/vendor/snowball/CONTRIBUTING.rst +216 -0
- data/vendor/snowball/COPYING +29 -0
- data/vendor/snowball/GNUmakefile +742 -0
- data/vendor/snowball/NEWS +754 -0
- data/vendor/snowball/README.rst +37 -0
- data/vendor/snowball/ada/README.md +74 -0
- data/vendor/snowball/ada/generate/generate.adb +83 -0
- data/vendor/snowball/ada/generate.gpr +21 -0
- data/vendor/snowball/ada/src/stemmer.adb +620 -0
- data/vendor/snowball/ada/src/stemmer.ads +219 -0
- data/vendor/snowball/ada/src/stemwords.adb +70 -0
- data/vendor/snowball/ada/stemmer_config.gpr +83 -0
- data/vendor/snowball/ada/stemwords.gpr +21 -0
- data/vendor/snowball/algorithms/arabic.sbl +558 -0
- data/vendor/snowball/algorithms/armenian.sbl +301 -0
- data/vendor/snowball/algorithms/basque.sbl +149 -0
- data/vendor/snowball/algorithms/catalan.sbl +202 -0
- data/vendor/snowball/algorithms/danish.sbl +93 -0
- data/vendor/snowball/algorithms/dutch.sbl +164 -0
- data/vendor/snowball/algorithms/english.sbl +229 -0
- data/vendor/snowball/algorithms/finnish.sbl +197 -0
- data/vendor/snowball/algorithms/french.sbl +254 -0
- data/vendor/snowball/algorithms/german.sbl +139 -0
- data/vendor/snowball/algorithms/german2.sbl +145 -0
- data/vendor/snowball/algorithms/greek.sbl +701 -0
- data/vendor/snowball/algorithms/hindi.sbl +323 -0
- data/vendor/snowball/algorithms/hungarian.sbl +241 -0
- data/vendor/snowball/algorithms/indonesian.sbl +192 -0
- data/vendor/snowball/algorithms/irish.sbl +149 -0
- data/vendor/snowball/algorithms/italian.sbl +202 -0
- data/vendor/snowball/algorithms/kraaij_pohlmann.sbl +240 -0
- data/vendor/snowball/algorithms/lithuanian.sbl +373 -0
- data/vendor/snowball/algorithms/lovins.sbl +208 -0
- data/vendor/snowball/algorithms/nepali.sbl +92 -0
- data/vendor/snowball/algorithms/norwegian.sbl +80 -0
- data/vendor/snowball/algorithms/porter.sbl +139 -0
- data/vendor/snowball/algorithms/portuguese.sbl +218 -0
- data/vendor/snowball/algorithms/romanian.sbl +236 -0
- data/vendor/snowball/algorithms/russian.sbl +221 -0
- data/vendor/snowball/algorithms/serbian.sbl +2379 -0
- data/vendor/snowball/algorithms/spanish.sbl +230 -0
- data/vendor/snowball/algorithms/swedish.sbl +72 -0
- data/vendor/snowball/algorithms/tamil.sbl +405 -0
- data/vendor/snowball/algorithms/turkish.sbl +470 -0
- data/vendor/snowball/algorithms/yiddish.sbl +460 -0
- data/vendor/snowball/charsets/ISO-8859-2.sbl +98 -0
- data/vendor/snowball/charsets/KOI8-R.sbl +74 -0
- data/vendor/snowball/charsets/cp850.sbl +130 -0
- data/vendor/snowball/compiler/analyser.c +1547 -0
- data/vendor/snowball/compiler/driver.c +615 -0
- data/vendor/snowball/compiler/generator.c +1748 -0
- data/vendor/snowball/compiler/generator_ada.c +1702 -0
- data/vendor/snowball/compiler/generator_csharp.c +1322 -0
- data/vendor/snowball/compiler/generator_go.c +1278 -0
- data/vendor/snowball/compiler/generator_java.c +1313 -0
- data/vendor/snowball/compiler/generator_js.c +1316 -0
- data/vendor/snowball/compiler/generator_pascal.c +1387 -0
- data/vendor/snowball/compiler/generator_python.c +1337 -0
- data/vendor/snowball/compiler/generator_rust.c +1295 -0
- data/vendor/snowball/compiler/header.h +418 -0
- data/vendor/snowball/compiler/space.c +286 -0
- data/vendor/snowball/compiler/syswords.h +86 -0
- data/vendor/snowball/compiler/syswords2.h +13 -0
- data/vendor/snowball/compiler/tokeniser.c +567 -0
- data/vendor/snowball/csharp/.gitignore +8 -0
- data/vendor/snowball/csharp/Snowball/Algorithms/.gitignore +1 -0
- data/vendor/snowball/csharp/Snowball/Among.cs +108 -0
- data/vendor/snowball/csharp/Snowball/AssemblyInfo.cs +36 -0
- data/vendor/snowball/csharp/Snowball/Stemmer.cs +660 -0
- data/vendor/snowball/csharp/Stemwords/App.config +6 -0
- data/vendor/snowball/csharp/Stemwords/Program.cs +114 -0
- data/vendor/snowball/doc/TODO +12 -0
- data/vendor/snowball/doc/libstemmer_c_README +148 -0
- data/vendor/snowball/doc/libstemmer_csharp_README +53 -0
- data/vendor/snowball/doc/libstemmer_java_README +67 -0
- data/vendor/snowball/doc/libstemmer_js_README +48 -0
- data/vendor/snowball/doc/libstemmer_python_README +113 -0
- data/vendor/snowball/examples/stemwords.c +204 -0
- data/vendor/snowball/go/README.md +55 -0
- data/vendor/snowball/go/among.go +16 -0
- data/vendor/snowball/go/env.go +403 -0
- data/vendor/snowball/go/stemwords/generate.go +68 -0
- data/vendor/snowball/go/stemwords/main.go +68 -0
- data/vendor/snowball/go/util.go +34 -0
- data/vendor/snowball/iconv.py +50 -0
- data/vendor/snowball/include/libstemmer.h +78 -0
- data/vendor/snowball/java/org/tartarus/snowball/Among.java +29 -0
- data/vendor/snowball/java/org/tartarus/snowball/SnowballProgram.java +381 -0
- data/vendor/snowball/java/org/tartarus/snowball/SnowballStemmer.java +8 -0
- data/vendor/snowball/java/org/tartarus/snowball/TestApp.java +75 -0
- data/vendor/snowball/javascript/base-stemmer.js +294 -0
- data/vendor/snowball/javascript/stemwords.js +106 -0
- data/vendor/snowball/libstemmer/libstemmer_c.in +96 -0
- data/vendor/snowball/libstemmer/mkalgorithms.pl +90 -0
- data/vendor/snowball/libstemmer/mkmodules.pl +267 -0
- data/vendor/snowball/libstemmer/modules.txt +63 -0
- data/vendor/snowball/libstemmer/test.c +34 -0
- data/vendor/snowball/pascal/.gitignore +4 -0
- data/vendor/snowball/pascal/SnowballProgram.pas +430 -0
- data/vendor/snowball/pascal/generate.pl +23 -0
- data/vendor/snowball/pascal/stemwords-template.dpr +78 -0
- data/vendor/snowball/python/MANIFEST.in +7 -0
- data/vendor/snowball/python/create_init.py +54 -0
- data/vendor/snowball/python/setup.cfg +6 -0
- data/vendor/snowball/python/setup.py +81 -0
- data/vendor/snowball/python/snowballstemmer/among.py +13 -0
- data/vendor/snowball/python/snowballstemmer/basestemmer.py +323 -0
- data/vendor/snowball/python/stemwords.py +101 -0
- data/vendor/snowball/python/testapp.py +28 -0
- data/vendor/snowball/runtime/api.c +58 -0
- data/vendor/snowball/runtime/api.h +32 -0
- data/vendor/snowball/runtime/header.h +61 -0
- data/vendor/snowball/runtime/utilities.c +513 -0
- data/vendor/snowball/rust/Cargo.toml +7 -0
- data/vendor/snowball/rust/build.rs +55 -0
- data/vendor/snowball/rust/rust-pre-1.27-compat.patch +30 -0
- data/vendor/snowball/rust/src/main.rs +102 -0
- data/vendor/snowball/rust/src/snowball/algorithms/mod.rs +2 -0
- data/vendor/snowball/rust/src/snowball/among.rs +6 -0
- data/vendor/snowball/rust/src/snowball/mod.rs +6 -0
- data/vendor/snowball/rust/src/snowball/snowball_env.rs +421 -0
- data/vendor/snowball/tests/stemtest.c +95 -0
- metadata +178 -0
|
@@ -0,0 +1,323 @@
|
|
|
1
|
+
def _grouping_contains(table, code, low, high):
    '''
    Tell whether the character code ``code`` is a member of a grouping.

    ``table`` is a sequence of ints read as a bitmap: after subtracting
    ``low``, bit (offset & 7) of table[offset >> 3] marks membership.
    Codes outside the inclusive range [low, high] are never members.
    '''
    if code < low or code > high:
        return False
    offset = code - low
    return (table[offset >> 3] & (1 << (offset & 0x7))) != 0


class BaseStemmer(object):
    '''
    Shared runtime for stemmers: a working string plus cursor/limit/slice
    markers, and the primitive matching and editing operations on them.

    stemWord() relies on a ``_stem`` method that this class does not define.
    '''

    def __init__(self):
        self.set_current("")

    def set_current(self, value):
        '''
        Install ``value`` as the working string and reset every marker:
        cursor and bra to the start, limit and ket to the end.
        '''
        self.current = value
        self.cursor = 0
        length = len(value)
        self.limit = length
        self.limit_backward = 0
        self.bra = 0
        self.ket = length

    def get_current(self):
        '''
        Return the working string.
        '''
        return self.current

    def copy_from(self, other):
        '''
        Copy the working string and all markers from another stemmer.
        '''
        for name in ("current", "cursor", "limit", "limit_backward", "bra", "ket"):
            setattr(self, name, getattr(other, name))

    def in_grouping(self, s, min, max):
        '''
        If the character at the cursor is in the grouping, step over it and
        return True; otherwise leave the cursor alone and return False.
        '''
        if self.cursor >= self.limit:
            return False
        if not _grouping_contains(s, ord(self.current[self.cursor]), min, max):
            return False
        self.cursor += 1
        return True

    def go_in_grouping(self, s, min, max):
        '''
        Advance over grouping members. Return True on reaching a character
        outside the grouping, False if the limit is reached first.
        '''
        while self.cursor < self.limit:
            if not _grouping_contains(s, ord(self.current[self.cursor]), min, max):
                return True
            self.cursor += 1
        return False

    def in_grouping_b(self, s, min, max):
        '''
        Backward form of in_grouping: tests the character before the cursor.
        '''
        if self.cursor <= self.limit_backward:
            return False
        if not _grouping_contains(s, ord(self.current[self.cursor - 1]), min, max):
            return False
        self.cursor -= 1
        return True

    def go_in_grouping_b(self, s, min, max):
        '''
        Backward form of go_in_grouping.
        '''
        while self.cursor > self.limit_backward:
            if not _grouping_contains(s, ord(self.current[self.cursor - 1]), min, max):
                return True
            self.cursor -= 1
        return False

    def out_grouping(self, s, min, max):
        '''
        If the character at the cursor is NOT in the grouping, step over it
        and return True; otherwise return False without moving.
        '''
        if self.cursor >= self.limit:
            return False
        if _grouping_contains(s, ord(self.current[self.cursor]), min, max):
            return False
        self.cursor += 1
        return True

    def go_out_grouping(self, s, min, max):
        '''
        Advance over non-members. Return True on reaching a grouping member,
        False if the limit is reached first.
        '''
        while self.cursor < self.limit:
            if _grouping_contains(s, ord(self.current[self.cursor]), min, max):
                return True
            self.cursor += 1
        return False

    def out_grouping_b(self, s, min, max):
        '''
        Backward form of out_grouping: tests the character before the cursor.
        '''
        if self.cursor <= self.limit_backward:
            return False
        if _grouping_contains(s, ord(self.current[self.cursor - 1]), min, max):
            return False
        self.cursor -= 1
        return True

    def go_out_grouping_b(self, s, min, max):
        '''
        Backward form of go_out_grouping.
        '''
        while self.cursor > self.limit_backward:
            if _grouping_contains(s, ord(self.current[self.cursor - 1]), min, max):
                return True
            self.cursor -= 1
        return False

    def eq_s(self, s):
        '''
        If ``s`` occurs at the cursor (within the limit), move the cursor
        past it and return True; otherwise return False.
        '''
        stop = self.cursor + len(s)
        if stop > self.limit:
            return False
        if self.current[self.cursor:stop] != s:
            return False
        self.cursor = stop
        return True

    def eq_s_b(self, s):
        '''
        If ``s`` ends at the cursor (within the backward limit), move the
        cursor to its start and return True; otherwise return False.
        '''
        start = self.cursor - len(s)
        if start < self.limit_backward:
            return False
        if self.current[start:self.cursor] != s:
            return False
        self.cursor = start
        return True

    def find_among(self, v):
        '''
        Look up the text at the cursor in the table ``v``.

        Each entry provides ``s`` (the key), ``substring_i`` (index of the
        next entry to try, negative for none), ``result`` and ``method``
        (None, or the name of a method on self that must return true for the
        entry to be accepted). A binary search narrows to a candidate, then
        the substring_i chain is followed. Returns the accepted entry's
        result with the cursor just past its key, or 0 if none is accepted.
        '''
        low = 0
        high = len(v)

        origin = self.cursor
        end = self.limit

        matched_low = 0
        matched_high = 0

        first_key_inspected = False

        while True:
            mid = low + ((high - low) >> 1)
            diff = 0
            # Characters already known to match at both bounds need not be
            # compared again.
            matched = min(matched_low, matched_high)
            key = v[mid].s
            while matched < len(key):
                if origin + matched == end:
                    diff = -1
                    break
                diff = ord(self.current[origin + matched]) - ord(key[matched])
                if diff != 0:
                    break
                matched += 1
            if diff < 0:
                high = mid
                matched_high = matched
            else:
                low = mid
                matched_low = matched
            if high - low <= 1:
                # Stop once entry 0 is known to have been inspected, or the
                # table has a single entry; otherwise go round once more so
                # that entry 0 gets compared.
                if low > 0 or high == low or first_key_inspected:
                    break
                first_key_inspected = True

        index = low
        while True:
            entry = v[index]
            if matched_low >= len(entry.s):
                self.cursor = origin + len(entry.s)
                if entry.method is None:
                    return entry.result
                accepted = getattr(self, entry.method)()
                # The condition method may have moved the cursor; restore it.
                self.cursor = origin + len(entry.s)
                if accepted:
                    return entry.result
            index = entry.substring_i
            if index < 0:
                return 0

    def find_among_b(self, v):
        '''
        Backward counterpart of find_among: keys are compared from their last
        character against the text ending at the cursor, and on success the
        cursor is left at the start of the matched key.
        '''
        low = 0
        high = len(v)

        origin = self.cursor
        floor = self.limit_backward

        matched_low = 0
        matched_high = 0

        first_key_inspected = False

        while True:
            mid = low + ((high - low) >> 1)
            diff = 0
            matched = min(matched_low, matched_high)
            key = v[mid].s
            while matched < len(key):
                if origin - matched == floor:
                    diff = -1
                    break
                diff = ord(self.current[origin - 1 - matched]) - ord(key[len(key) - 1 - matched])
                if diff != 0:
                    break
                matched += 1
            if diff < 0:
                high = mid
                matched_high = matched
            else:
                low = mid
                matched_low = matched
            if high - low <= 1:
                if low > 0 or high == low or first_key_inspected:
                    break
                first_key_inspected = True

        index = low
        while True:
            entry = v[index]
            if matched_low >= len(entry.s):
                self.cursor = origin - len(entry.s)
                if entry.method is None:
                    return entry.result
                accepted = getattr(self, entry.method)()
                self.cursor = origin - len(entry.s)
                if accepted:
                    return entry.result
            index = entry.substring_i
            if index < 0:
                return 0

    def replace_s(self, c_bra, c_ket, s):
        '''
        Replace the characters between c_bra and c_ket in self.current by the
        characters of s, keeping limit and cursor consistent.

        @type c_bra int
        @type c_ket int
        @type s: string
        @return the change in length (may be negative)
        '''
        head = self.current[0:c_bra]
        tail = self.current[c_ket:]
        adjustment = len(s) - (c_ket - c_bra)
        self.current = head + s + tail
        self.limit += adjustment
        if self.cursor >= c_ket:
            self.cursor += adjustment
        elif self.cursor > c_bra:
            # The cursor was inside the replaced span: snap it to the start.
            self.cursor = c_bra
        return adjustment

    def slice_check(self):
        '''
        Return True when 0 <= bra <= ket <= limit <= len(current).
        '''
        return 0 <= self.bra <= self.ket <= self.limit <= len(self.current)

    def slice_from(self, s):
        '''
        Replace the current slice (bra..ket) by s. Returns False, changing
        nothing, when the slice markers are inconsistent.

        @type s string
        '''
        if not self.slice_check():
            return False
        self.replace_s(self.bra, self.ket, s)
        return True

    def slice_del(self):
        '''
        Delete the current slice; same return value as slice_from.
        '''
        return self.slice_from("")

    def insert(self, c_bra, c_ket, s):
        '''
        Replace c_bra..c_ket by s and shift bra/ket when they lie at or after
        c_bra.

        @type c_bra int
        @type c_ket int
        @type s: string
        '''
        shift = self.replace_s(c_bra, c_ket, s)
        if c_bra <= self.bra:
            self.bra += shift
        if c_bra <= self.ket:
            self.ket += shift

    def slice_to(self):
        '''
        Return the current slice as a string, or '' when the slice markers
        are inconsistent.
        '''
        if self.slice_check():
            return self.current[self.bra:self.ket]
        return ''

    def assign_to(self):
        '''
        Return the working string up to the limit.
        '''
        return self.current[0:self.limit]

    def stemWord(self, word):
        '''
        Stem a single word: load it, run self._stem(), return the result.
        '''
        self.set_current(word)
        self._stem()
        return self.get_current()

    def stemWords(self, words):
        '''
        Stem each word of an iterable and return the results as a list.
        '''
        stemmed = []
        for word in words:
            stemmed.append(self.stemWord(word))
        return stemmed
|
|
@@ -0,0 +1,101 @@
|
|
|
1
|
+
import sys
|
|
2
|
+
import codecs
|
|
3
|
+
import snowballstemmer
|
|
4
|
+
|
|
5
|
+
def usage():
    '''
    Print the command-line help, with the program name taken from
    sys.argv[0].
    '''
    # NOTE(review): the text says the input file is optional and stdin is
    # used when it is omitted, but main() prints this help unless both -i
    # and -o are given. Confirm which behavior is intended.
    print('''usage: %s [-l <language>] [-i <input file>] [-o <output file>] [-c <character encoding>] [-p[2]] [-h]

The input file consists of a list of words to be stemmed, one per
line. Words should be in lower case, but (for English) A-Z letters
are mapped to their a-z equivalents anyway. If omitted, stdin is
used.

If -c is given, the argument is the character encoding of the input
and output files. If it is omitted, the UTF-8 encoding is used.

If -p is given the output file consists of each word of the input
file followed by \"->\" followed by its stemmed equivalent.
If -p2 is given the output file is a two column layout containing
the input words in the first column and the stemmed equivalents in
the second column.

Otherwise, the output file consists of the stemmed words, one per
line.

-h displays this help''' % sys.argv[0])
|
|
26
|
+
|
|
27
|
+
def main():
    '''
    Parse the command line and either run the stemmer or print the usage.

    Recognised options: -h, -p, -p2, and the value-taking -l (language),
    -i (input file), -o (output file) and -c (character encoding).
    Unrecognised arguments are ignored. The usage text is printed when -h
    is given, when a value-taking option has no value, or when the input or
    output file is missing.
    '''
    argv = sys.argv[1:]
    pretty = 0
    input = ''
    output = ''
    encoding = 'utf_8'
    language = 'English'
    show_help = False
    # No up-front argument count check: "-i IN -o OUT" is only four
    # arguments and is a complete invocation. Missing -i/-o is caught by
    # the final test below.
    while len(argv):
        arg = argv[0]
        argv = argv[1:]
        if arg == '-h':
            show_help = True
            break
        elif arg == "-p":
            pretty = 1
        elif arg == "-p2":
            pretty = 2
        elif arg in ("-l", "-i", "-o", "-c"):
            if len(argv) == 0:
                show_help = True
                break
            value = argv[0]
            # Consume the option's value so it is not re-read as an option.
            argv = argv[1:]
            if arg == "-l":
                language = value
            elif arg == "-i":
                input = value
            elif arg == "-o":
                output = value
            else:
                encoding = value
    if show_help or input == '' or output == '':
        usage()
    else:
        stemming(language, input, output, encoding, pretty)
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
def stemming(lang, input, output, encoding, pretty):
    '''
    Stem every line of the input file and write the results to the output
    file, both opened with the given character encoding.

    lang     -- algorithm name passed to snowballstemmer.stemmer()
    input    -- path of the word list, one word per line
    output   -- path of the file to write
    encoding -- codec name used for both files
    pretty   -- 0: stemmed word only; 1: "word -> stem";
                2: two columns, the stem starting at column 30
    '''
    stemmer = snowballstemmer.stemmer(lang)
    with codecs.open(output, "w", encoding) as outfile:
        with codecs.open(input, "r", encoding) as infile:
            for original in infile.readlines():
                original = original.strip()
                # Convert only ASCII-letters to lowercase, to match C behavior
                original = ''.join((c.lower() if 'A' <= c <= 'Z' else c for c in original))
                stemmed = stemmer.stemWord(original)
                if pretty == 0:
                    if stemmed != "":
                        outfile.write(stemmed)
                elif pretty == 1:
                    # write() takes a single string, so build the line first.
                    outfile.write(original + " -> " + stemmed)
                elif pretty == 2:
                    outfile.write(original)
                    if len(original) < 30:
                        outfile.write(" " * (30 - len(original)))
                    else:
                        # Word too long for the first column: put the stem
                        # on the next line, still aligned at column 30.
                        outfile.write("\n")
                        outfile.write(" " * 30)
                    outfile.write(stemmed)
                outfile.write('\n')
|
|
100
|
+
|
|
101
|
+
# Entry point. There is no __main__ guard, so this also runs on import.
main()
|
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
import sys
|
|
2
|
+
import re
|
|
3
|
+
import snowballstemmer
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
def usage():
    '''Print the one-line synopsis of the test application.'''
    synopsis = 'testapp.py <algorithm> "sentence"...'
    print(synopsis)
|
|
8
|
+
|
|
9
|
+
def main():
    '''
    Stem the words of the sentences given on the command line.

    With two or more arguments the first one names the algorithm and the
    rest are sentences; with a single argument it is a sentence stemmed
    with the 'english' algorithm. Each sentence is split on whitespace,
    '.' and '-', and every non-empty word is lower-cased and printed as
    "word -> stem". With no arguments the usage line is printed.
    '''
    argv = sys.argv
    # sys.argv always holds the script name, so "no arguments" means a
    # length below 2.
    if len(argv) < 2:
        usage()
        return
    algorithm = 'english'
    if len(argv) > 2:
        algorithm = argv[1]
        argv = argv[2:]
    else:
        argv = argv[1:]
    stemmer = snowballstemmer.stemmer(algorithm)
    splitter = re.compile(r"[\s\.-]")
    for arg in argv:
        for word in splitter.split(arg):
            if word == '':
                continue
            original = word.lower()
            print(original + " -> " + stemmer.stemWord(original))
|
|
28
|
+
# Entry point. There is no __main__ guard, so this also runs on import.
main()
|
|
@@ -0,0 +1,58 @@
|
|
|
1
|
+
|
|
2
|
+
#include <stdlib.h> /* for calloc, free */
|
|
3
|
+
#include "header.h"
|
|
4
|
+
|
|
5
|
+
extern struct SN_env * SN_create_env(int S_size, int I_size)
|
|
6
|
+
{
|
|
7
|
+
struct SN_env * z = (struct SN_env *) calloc(1, sizeof(struct SN_env));
|
|
8
|
+
if (z == NULL) return NULL;
|
|
9
|
+
z->p = create_s();
|
|
10
|
+
if (z->p == NULL) goto error;
|
|
11
|
+
if (S_size)
|
|
12
|
+
{
|
|
13
|
+
int i;
|
|
14
|
+
z->S = (symbol * *) calloc(S_size, sizeof(symbol *));
|
|
15
|
+
if (z->S == NULL) goto error;
|
|
16
|
+
|
|
17
|
+
for (i = 0; i < S_size; i++)
|
|
18
|
+
{
|
|
19
|
+
z->S[i] = create_s();
|
|
20
|
+
if (z->S[i] == NULL) goto error;
|
|
21
|
+
}
|
|
22
|
+
}
|
|
23
|
+
|
|
24
|
+
if (I_size)
|
|
25
|
+
{
|
|
26
|
+
z->I = (int *) calloc(I_size, sizeof(int));
|
|
27
|
+
if (z->I == NULL) goto error;
|
|
28
|
+
}
|
|
29
|
+
|
|
30
|
+
return z;
|
|
31
|
+
error:
|
|
32
|
+
SN_close_env(z, S_size);
|
|
33
|
+
return NULL;
|
|
34
|
+
}
|
|
35
|
+
|
|
36
|
+
extern void SN_close_env(struct SN_env * z, int S_size)
|
|
37
|
+
{
|
|
38
|
+
if (z == NULL) return;
|
|
39
|
+
if (S_size)
|
|
40
|
+
{
|
|
41
|
+
int i;
|
|
42
|
+
for (i = 0; i < S_size; i++)
|
|
43
|
+
{
|
|
44
|
+
lose_s(z->S[i]);
|
|
45
|
+
}
|
|
46
|
+
free(z->S);
|
|
47
|
+
}
|
|
48
|
+
free(z->I);
|
|
49
|
+
if (z->p) lose_s(z->p);
|
|
50
|
+
free(z);
|
|
51
|
+
}
|
|
52
|
+
|
|
53
|
+
extern int SN_set_current(struct SN_env * z, int size, const symbol * s)
|
|
54
|
+
{
|
|
55
|
+
int err = replace_s(z, 0, z->l, size, s, NULL);
|
|
56
|
+
z->c = 0;
|
|
57
|
+
return err;
|
|
58
|
+
}
|
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
|
|
2
|
+
typedef unsigned char symbol;
|
|
3
|
+
|
|
4
|
+
/* Or replace 'char' above with 'short' for 16 bit characters.
|
|
5
|
+
|
|
6
|
+
More precisely, replace 'char' with whatever type guarantees the
|
|
7
|
+
character width you need. Note however that sizeof(symbol) should divide
|
|
8
|
+
HEAD, defined in header.h as 2*sizeof(int), without remainder, otherwise
|
|
9
|
+
there is an alignment problem. In the unlikely event of a problem here,
|
|
10
|
+
consult Martin Porter.
|
|
11
|
+
|
|
12
|
+
*/
|
|
13
|
+
|
|
14
|
+
struct SN_env {
|
|
15
|
+
symbol * p;
|
|
16
|
+
int c; int l; int lb; int bra; int ket;
|
|
17
|
+
symbol * * S;
|
|
18
|
+
int * I;
|
|
19
|
+
};
|
|
20
|
+
|
|
21
|
+
#ifdef __cplusplus
|
|
22
|
+
extern "C" {
|
|
23
|
+
#endif
|
|
24
|
+
|
|
25
|
+
extern struct SN_env * SN_create_env(int S_size, int I_size);
|
|
26
|
+
extern void SN_close_env(struct SN_env * z, int S_size);
|
|
27
|
+
|
|
28
|
+
extern int SN_set_current(struct SN_env * z, int size, const symbol * s);
|
|
29
|
+
|
|
30
|
+
#ifdef __cplusplus
|
|
31
|
+
}
|
|
32
|
+
#endif
|
|
@@ -0,0 +1,61 @@
|
|
|
1
|
+
|
|
2
|
+
#include <limits.h>
|
|
3
|
+
|
|
4
|
+
#include "api.h"
|
|
5
|
+
|
|
6
|
+
#define MAXINT INT_MAX
#define MININT INT_MIN

/* Bytes occupied by the two ints stored in front of a symbol string.
   Parenthesized so the macro expands safely inside larger expressions
   (e.g. x / HEAD). */
#define HEAD (2*sizeof(int))

/* Given a pointer p to the first symbol, the int just before it holds the
   size and the int before that holds the capacity. The expansions are fully
   parenthesized and remain usable as lvalues. */
#define SIZE(p)        (((int *)(p))[-1])
#define SET_SIZE(p, n) (((int *)(p))[-1] = (n))
#define CAPACITY(p)    (((int *)(p))[-2])
|
|
14
|
+
|
|
15
|
+
/* One entry of a lookup table searched by find_among() / find_among_b(). */
struct among
{   int s_size;     /* number of chars in string */
    const symbol * s;       /* search string */
    int substring_i;/* index to longest matching substring */
    int result;     /* result of the lookup */
    /* Routine attached to the entry, taking the environment.
       NOTE(review): presumably an optional condition that must succeed for
       the entry to be accepted - confirm against find_among in utilities.c. */
    int (* function)(struct SN_env *);
};
|
|
22
|
+
|
|
23
|
+
extern symbol * create_s(void);
|
|
24
|
+
extern void lose_s(symbol * p);
|
|
25
|
+
|
|
26
|
+
extern int skip_utf8(const symbol * p, int c, int limit, int n);
|
|
27
|
+
|
|
28
|
+
extern int skip_b_utf8(const symbol * p, int c, int limit, int n);
|
|
29
|
+
|
|
30
|
+
extern int in_grouping_U(struct SN_env * z, const unsigned char * s, int min, int max, int repeat);
|
|
31
|
+
extern int in_grouping_b_U(struct SN_env * z, const unsigned char * s, int min, int max, int repeat);
|
|
32
|
+
extern int out_grouping_U(struct SN_env * z, const unsigned char * s, int min, int max, int repeat);
|
|
33
|
+
extern int out_grouping_b_U(struct SN_env * z, const unsigned char * s, int min, int max, int repeat);
|
|
34
|
+
|
|
35
|
+
extern int in_grouping(struct SN_env * z, const unsigned char * s, int min, int max, int repeat);
|
|
36
|
+
extern int in_grouping_b(struct SN_env * z, const unsigned char * s, int min, int max, int repeat);
|
|
37
|
+
extern int out_grouping(struct SN_env * z, const unsigned char * s, int min, int max, int repeat);
|
|
38
|
+
extern int out_grouping_b(struct SN_env * z, const unsigned char * s, int min, int max, int repeat);
|
|
39
|
+
|
|
40
|
+
extern int eq_s(struct SN_env * z, int s_size, const symbol * s);
|
|
41
|
+
extern int eq_s_b(struct SN_env * z, int s_size, const symbol * s);
|
|
42
|
+
extern int eq_v(struct SN_env * z, const symbol * p);
|
|
43
|
+
extern int eq_v_b(struct SN_env * z, const symbol * p);
|
|
44
|
+
|
|
45
|
+
extern int find_among(struct SN_env * z, const struct among * v, int v_size);
|
|
46
|
+
extern int find_among_b(struct SN_env * z, const struct among * v, int v_size);
|
|
47
|
+
|
|
48
|
+
extern int replace_s(struct SN_env * z, int c_bra, int c_ket, int s_size, const symbol * s, int * adjustment);
|
|
49
|
+
extern int slice_from_s(struct SN_env * z, int s_size, const symbol * s);
|
|
50
|
+
extern int slice_from_v(struct SN_env * z, const symbol * p);
|
|
51
|
+
extern int slice_del(struct SN_env * z);
|
|
52
|
+
|
|
53
|
+
extern int insert_s(struct SN_env * z, int bra, int ket, int s_size, const symbol * s);
|
|
54
|
+
extern int insert_v(struct SN_env * z, int bra, int ket, const symbol * p);
|
|
55
|
+
|
|
56
|
+
extern symbol * slice_to(struct SN_env * z, symbol * p);
|
|
57
|
+
extern symbol * assign_to(struct SN_env * z, symbol * p);
|
|
58
|
+
|
|
59
|
+
extern int len_utf8(const symbol * p);
|
|
60
|
+
|
|
61
|
+
extern void debug(struct SN_env * z, int number, int line_count);
|