mittens 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/CHANGELOG.md +3 -0
- data/Gemfile +7 -0
- data/LICENSE.txt +30 -0
- data/README.md +62 -0
- data/Rakefile +21 -0
- data/ext/mittens/ext.c +96 -0
- data/ext/mittens/extconf.rb +12 -0
- data/lib/mittens/version.rb +3 -0
- data/lib/mittens.rb +7 -0
- data/mittens.gemspec +22 -0
- data/vendor/snowball/.gitignore +26 -0
- data/vendor/snowball/.travis.yml +112 -0
- data/vendor/snowball/AUTHORS +27 -0
- data/vendor/snowball/CONTRIBUTING.rst +216 -0
- data/vendor/snowball/COPYING +29 -0
- data/vendor/snowball/GNUmakefile +742 -0
- data/vendor/snowball/NEWS +754 -0
- data/vendor/snowball/README.rst +37 -0
- data/vendor/snowball/ada/README.md +74 -0
- data/vendor/snowball/ada/generate/generate.adb +83 -0
- data/vendor/snowball/ada/generate.gpr +21 -0
- data/vendor/snowball/ada/src/stemmer.adb +620 -0
- data/vendor/snowball/ada/src/stemmer.ads +219 -0
- data/vendor/snowball/ada/src/stemwords.adb +70 -0
- data/vendor/snowball/ada/stemmer_config.gpr +83 -0
- data/vendor/snowball/ada/stemwords.gpr +21 -0
- data/vendor/snowball/algorithms/arabic.sbl +558 -0
- data/vendor/snowball/algorithms/armenian.sbl +301 -0
- data/vendor/snowball/algorithms/basque.sbl +149 -0
- data/vendor/snowball/algorithms/catalan.sbl +202 -0
- data/vendor/snowball/algorithms/danish.sbl +93 -0
- data/vendor/snowball/algorithms/dutch.sbl +164 -0
- data/vendor/snowball/algorithms/english.sbl +229 -0
- data/vendor/snowball/algorithms/finnish.sbl +197 -0
- data/vendor/snowball/algorithms/french.sbl +254 -0
- data/vendor/snowball/algorithms/german.sbl +139 -0
- data/vendor/snowball/algorithms/german2.sbl +145 -0
- data/vendor/snowball/algorithms/greek.sbl +701 -0
- data/vendor/snowball/algorithms/hindi.sbl +323 -0
- data/vendor/snowball/algorithms/hungarian.sbl +241 -0
- data/vendor/snowball/algorithms/indonesian.sbl +192 -0
- data/vendor/snowball/algorithms/irish.sbl +149 -0
- data/vendor/snowball/algorithms/italian.sbl +202 -0
- data/vendor/snowball/algorithms/kraaij_pohlmann.sbl +240 -0
- data/vendor/snowball/algorithms/lithuanian.sbl +373 -0
- data/vendor/snowball/algorithms/lovins.sbl +208 -0
- data/vendor/snowball/algorithms/nepali.sbl +92 -0
- data/vendor/snowball/algorithms/norwegian.sbl +80 -0
- data/vendor/snowball/algorithms/porter.sbl +139 -0
- data/vendor/snowball/algorithms/portuguese.sbl +218 -0
- data/vendor/snowball/algorithms/romanian.sbl +236 -0
- data/vendor/snowball/algorithms/russian.sbl +221 -0
- data/vendor/snowball/algorithms/serbian.sbl +2379 -0
- data/vendor/snowball/algorithms/spanish.sbl +230 -0
- data/vendor/snowball/algorithms/swedish.sbl +72 -0
- data/vendor/snowball/algorithms/tamil.sbl +405 -0
- data/vendor/snowball/algorithms/turkish.sbl +470 -0
- data/vendor/snowball/algorithms/yiddish.sbl +460 -0
- data/vendor/snowball/charsets/ISO-8859-2.sbl +98 -0
- data/vendor/snowball/charsets/KOI8-R.sbl +74 -0
- data/vendor/snowball/charsets/cp850.sbl +130 -0
- data/vendor/snowball/compiler/analyser.c +1547 -0
- data/vendor/snowball/compiler/driver.c +615 -0
- data/vendor/snowball/compiler/generator.c +1748 -0
- data/vendor/snowball/compiler/generator_ada.c +1702 -0
- data/vendor/snowball/compiler/generator_csharp.c +1322 -0
- data/vendor/snowball/compiler/generator_go.c +1278 -0
- data/vendor/snowball/compiler/generator_java.c +1313 -0
- data/vendor/snowball/compiler/generator_js.c +1316 -0
- data/vendor/snowball/compiler/generator_pascal.c +1387 -0
- data/vendor/snowball/compiler/generator_python.c +1337 -0
- data/vendor/snowball/compiler/generator_rust.c +1295 -0
- data/vendor/snowball/compiler/header.h +418 -0
- data/vendor/snowball/compiler/space.c +286 -0
- data/vendor/snowball/compiler/syswords.h +86 -0
- data/vendor/snowball/compiler/syswords2.h +13 -0
- data/vendor/snowball/compiler/tokeniser.c +567 -0
- data/vendor/snowball/csharp/.gitignore +8 -0
- data/vendor/snowball/csharp/Snowball/Algorithms/.gitignore +1 -0
- data/vendor/snowball/csharp/Snowball/Among.cs +108 -0
- data/vendor/snowball/csharp/Snowball/AssemblyInfo.cs +36 -0
- data/vendor/snowball/csharp/Snowball/Stemmer.cs +660 -0
- data/vendor/snowball/csharp/Stemwords/App.config +6 -0
- data/vendor/snowball/csharp/Stemwords/Program.cs +114 -0
- data/vendor/snowball/doc/TODO +12 -0
- data/vendor/snowball/doc/libstemmer_c_README +148 -0
- data/vendor/snowball/doc/libstemmer_csharp_README +53 -0
- data/vendor/snowball/doc/libstemmer_java_README +67 -0
- data/vendor/snowball/doc/libstemmer_js_README +48 -0
- data/vendor/snowball/doc/libstemmer_python_README +113 -0
- data/vendor/snowball/examples/stemwords.c +204 -0
- data/vendor/snowball/go/README.md +55 -0
- data/vendor/snowball/go/among.go +16 -0
- data/vendor/snowball/go/env.go +403 -0
- data/vendor/snowball/go/stemwords/generate.go +68 -0
- data/vendor/snowball/go/stemwords/main.go +68 -0
- data/vendor/snowball/go/util.go +34 -0
- data/vendor/snowball/iconv.py +50 -0
- data/vendor/snowball/include/libstemmer.h +78 -0
- data/vendor/snowball/java/org/tartarus/snowball/Among.java +29 -0
- data/vendor/snowball/java/org/tartarus/snowball/SnowballProgram.java +381 -0
- data/vendor/snowball/java/org/tartarus/snowball/SnowballStemmer.java +8 -0
- data/vendor/snowball/java/org/tartarus/snowball/TestApp.java +75 -0
- data/vendor/snowball/javascript/base-stemmer.js +294 -0
- data/vendor/snowball/javascript/stemwords.js +106 -0
- data/vendor/snowball/libstemmer/libstemmer_c.in +96 -0
- data/vendor/snowball/libstemmer/mkalgorithms.pl +90 -0
- data/vendor/snowball/libstemmer/mkmodules.pl +267 -0
- data/vendor/snowball/libstemmer/modules.txt +63 -0
- data/vendor/snowball/libstemmer/test.c +34 -0
- data/vendor/snowball/pascal/.gitignore +4 -0
- data/vendor/snowball/pascal/SnowballProgram.pas +430 -0
- data/vendor/snowball/pascal/generate.pl +23 -0
- data/vendor/snowball/pascal/stemwords-template.dpr +78 -0
- data/vendor/snowball/python/MANIFEST.in +7 -0
- data/vendor/snowball/python/create_init.py +54 -0
- data/vendor/snowball/python/setup.cfg +6 -0
- data/vendor/snowball/python/setup.py +81 -0
- data/vendor/snowball/python/snowballstemmer/among.py +13 -0
- data/vendor/snowball/python/snowballstemmer/basestemmer.py +323 -0
- data/vendor/snowball/python/stemwords.py +101 -0
- data/vendor/snowball/python/testapp.py +28 -0
- data/vendor/snowball/runtime/api.c +58 -0
- data/vendor/snowball/runtime/api.h +32 -0
- data/vendor/snowball/runtime/header.h +61 -0
- data/vendor/snowball/runtime/utilities.c +513 -0
- data/vendor/snowball/rust/Cargo.toml +7 -0
- data/vendor/snowball/rust/build.rs +55 -0
- data/vendor/snowball/rust/rust-pre-1.27-compat.patch +30 -0
- data/vendor/snowball/rust/src/main.rs +102 -0
- data/vendor/snowball/rust/src/snowball/algorithms/mod.rs +2 -0
- data/vendor/snowball/rust/src/snowball/among.rs +6 -0
- data/vendor/snowball/rust/src/snowball/mod.rs +6 -0
- data/vendor/snowball/rust/src/snowball/snowball_env.rs +421 -0
- data/vendor/snowball/tests/stemtest.c +95 -0
- metadata +178 -0
|
@@ -0,0 +1,421 @@
|
|
|
1
|
+
use std::borrow::Cow;
|
|
2
|
+
use snowball::Among;
|
|
3
|
+
|
|
4
|
+
/// Mutable state for one stemming run: the text being rewritten plus the
/// byte offsets into it that the methods of this type read and update.
#[derive(Debug, Clone)]
pub struct SnowballEnv<'a> {
    /// Text being processed. Borrowed from the caller until a method
    /// replaces it with an owned `String`.
    pub current: Cow<'a, str>,
    /// Current position, as a byte offset into `current`.
    pub cursor: i32,
    /// Forward bound: forward-moving methods refuse to act once
    /// `cursor >= limit`.
    pub limit: i32,
    /// Backward bound: backward-moving methods refuse to act once
    /// `cursor <= limit_backward`.
    pub limit_backward: i32,
    /// Start (byte offset) of the slice used by `slice_from` / `slice_to`.
    pub bra: i32,
    /// End (byte offset, exclusive) of the slice used by `slice_from` /
    /// `slice_to`.
    pub ket: i32,
}
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
impl<'a> SnowballEnv<'a> {
|
|
16
|
+
pub fn create(value: &'a str) -> Self {
|
|
17
|
+
let len = value.len();
|
|
18
|
+
SnowballEnv {
|
|
19
|
+
current: Cow::from(value),
|
|
20
|
+
cursor: 0,
|
|
21
|
+
limit: len as i32,
|
|
22
|
+
limit_backward: 0,
|
|
23
|
+
bra: 0,
|
|
24
|
+
ket: len as i32,
|
|
25
|
+
}
|
|
26
|
+
}
|
|
27
|
+
|
|
28
|
+
pub fn get_current(self) -> Cow<'a, str> {
|
|
29
|
+
self.current
|
|
30
|
+
}
|
|
31
|
+
|
|
32
|
+
pub fn set_current(&mut self, current: &'a str) {
|
|
33
|
+
self.current = Cow::from(current);
|
|
34
|
+
}
|
|
35
|
+
|
|
36
|
+
pub fn set_current_s(&mut self, current: String) {
|
|
37
|
+
self.current = Cow::from(current);
|
|
38
|
+
}
|
|
39
|
+
|
|
40
|
+
fn replace_s(&mut self, bra: i32, ket: i32, s: &str) -> i32 {
|
|
41
|
+
let adjustment = s.len() as i32 - (ket - bra);
|
|
42
|
+
let mut result = String::with_capacity(self.current.len());
|
|
43
|
+
{
|
|
44
|
+
let (lhs, _) = self.current.split_at(bra as usize);
|
|
45
|
+
let (_, rhs) = self.current.split_at(ket as usize);
|
|
46
|
+
result.push_str(lhs);
|
|
47
|
+
result.push_str(s);
|
|
48
|
+
result.push_str(rhs);
|
|
49
|
+
}
|
|
50
|
+
// ... not very nice...
|
|
51
|
+
let new_lim = self.limit + adjustment;
|
|
52
|
+
self.limit = new_lim;
|
|
53
|
+
if self.cursor >= ket {
|
|
54
|
+
let new_cur = self.cursor + adjustment;
|
|
55
|
+
self.cursor = new_cur;
|
|
56
|
+
} else if self.cursor > bra {
|
|
57
|
+
self.cursor = bra
|
|
58
|
+
}
|
|
59
|
+
self.current = Cow::from(result);
|
|
60
|
+
adjustment
|
|
61
|
+
}
|
|
62
|
+
|
|
63
|
+
/// Check if s is after cursor.
|
|
64
|
+
/// If so, move cursor to the end of s
|
|
65
|
+
pub fn eq_s(&mut self, s: &str) -> bool {
|
|
66
|
+
if self.cursor >= self.limit {
|
|
67
|
+
return false;
|
|
68
|
+
}
|
|
69
|
+
if self.current[(self.cursor as usize)..].starts_with(s) {
|
|
70
|
+
self.cursor += s.len() as i32;
|
|
71
|
+
while !self.current.is_char_boundary(self.cursor as usize) {
|
|
72
|
+
self.cursor += 1;
|
|
73
|
+
}
|
|
74
|
+
true
|
|
75
|
+
} else {
|
|
76
|
+
false
|
|
77
|
+
}
|
|
78
|
+
}
|
|
79
|
+
|
|
80
|
+
/// Check if 's' is before cursor
|
|
81
|
+
/// If so, move cursor to the beginning of s
|
|
82
|
+
pub fn eq_s_b(&mut self, s: &str) -> bool {
|
|
83
|
+
if (self.cursor - self.limit_backward) < s.len() as i32 {
|
|
84
|
+
false
|
|
85
|
+
// Check if cursor -s.len is a char boundary. if not well... return false obv
|
|
86
|
+
} else if !self.current.is_char_boundary(self.cursor as usize - s.len()) ||
|
|
87
|
+
!self.current[self.cursor as usize - s.len()..].starts_with(s) {
|
|
88
|
+
false
|
|
89
|
+
} else {
|
|
90
|
+
self.cursor -= s.len() as i32;
|
|
91
|
+
true
|
|
92
|
+
}
|
|
93
|
+
}
|
|
94
|
+
|
|
95
|
+
/// Replace string between `bra` and `ket` with s
|
|
96
|
+
pub fn slice_from(&mut self, s: &str) -> bool {
|
|
97
|
+
let (bra, ket) = (self.bra, self.ket);
|
|
98
|
+
self.replace_s(bra, ket, s);
|
|
99
|
+
true
|
|
100
|
+
}
|
|
101
|
+
|
|
102
|
+
/// Move cursor to next character
|
|
103
|
+
pub fn next_char(&mut self) {
|
|
104
|
+
self.cursor += 1;
|
|
105
|
+
while !self.current.is_char_boundary(self.cursor as usize) {
|
|
106
|
+
self.cursor += 1;
|
|
107
|
+
}
|
|
108
|
+
}
|
|
109
|
+
|
|
110
|
+
/// Move cursor to previous character
|
|
111
|
+
pub fn previous_char(&mut self) {
|
|
112
|
+
self.cursor -= 1;
|
|
113
|
+
while !self.current.is_char_boundary(self.cursor as usize) {
|
|
114
|
+
self.cursor -= 1;
|
|
115
|
+
}
|
|
116
|
+
}
|
|
117
|
+
|
|
118
|
+
pub fn hop(&mut self, mut delta: i32) -> bool {
|
|
119
|
+
let mut res = self.cursor;
|
|
120
|
+
while delta > 0 {
|
|
121
|
+
delta -= 1;
|
|
122
|
+
if res >= self.limit {
|
|
123
|
+
return false;
|
|
124
|
+
}
|
|
125
|
+
res += 1;
|
|
126
|
+
while res < self.limit && !self.current.is_char_boundary(res as usize) {
|
|
127
|
+
res += 1;
|
|
128
|
+
}
|
|
129
|
+
}
|
|
130
|
+
self.cursor = res;
|
|
131
|
+
return true;
|
|
132
|
+
}
|
|
133
|
+
|
|
134
|
+
pub fn hop_checked(&mut self, delta: i32) -> bool {
|
|
135
|
+
return delta >= 0 && self.hop(delta);
|
|
136
|
+
}
|
|
137
|
+
|
|
138
|
+
pub fn hop_back(&mut self, mut delta: i32) -> bool {
|
|
139
|
+
let mut res = self.cursor;
|
|
140
|
+
while delta > 0 {
|
|
141
|
+
delta -= 1;
|
|
142
|
+
if res <= self.limit_backward {
|
|
143
|
+
return false;
|
|
144
|
+
}
|
|
145
|
+
res -= 1;
|
|
146
|
+
while res > self.limit_backward && !self.current.is_char_boundary(res as usize) {
|
|
147
|
+
res -= 1;
|
|
148
|
+
}
|
|
149
|
+
}
|
|
150
|
+
self.cursor = res;
|
|
151
|
+
return true;
|
|
152
|
+
}
|
|
153
|
+
|
|
154
|
+
pub fn hop_back_checked(&mut self, delta: i32) -> bool {
|
|
155
|
+
return delta >= 0 && self.hop_back(delta);
|
|
156
|
+
}
|
|
157
|
+
|
|
158
|
+
// A grouping is represented by a minimum code point, a maximum code point,
|
|
159
|
+
// and a bitfield of which code points in that range are in the grouping.
|
|
160
|
+
// For example, in english.sbl, valid_LI is 'cdeghkmnrt'.
|
|
161
|
+
// The minimum and maximum code points are 99 and 116,
|
|
162
|
+
// so every time one of these grouping functions is called for g_valid_LI,
|
|
163
|
+
// min must be 99 and max must be 116. There are 18 code points within that
|
|
164
|
+
// range (inclusive) so the grouping is represented with 18 bits, plus 6 bits of padding:
|
|
165
|
+
//
|
|
166
|
+
// cdefghij klmnopqr st
|
|
167
|
+
// 11101100 10110001 01000000
|
|
168
|
+
//
|
|
169
|
+
// The first bit is the least significant.
|
|
170
|
+
// Those three bytes become &[0b00110111, 0b10001101, 0b00000010],
|
|
171
|
+
// which is &[55, 141, 2], which is how g_valid_LI is defined in english.rs.
|
|
172
|
+
/// Check if the char the cursor points to is in the grouping
|
|
173
|
+
pub fn in_grouping(&mut self, chars: &[u8], min: u32, max: u32) -> bool {
|
|
174
|
+
if self.cursor >= self.limit {
|
|
175
|
+
return false;
|
|
176
|
+
}
|
|
177
|
+
if let Some(chr) = self.current[self.cursor as usize..].chars().next() {
|
|
178
|
+
let mut ch = chr as u32; //codepoint as integer
|
|
179
|
+
if ch > max || ch < min {
|
|
180
|
+
return false;
|
|
181
|
+
}
|
|
182
|
+
ch -= min;
|
|
183
|
+
if (chars[(ch >> 3) as usize] & (0x1 << (ch & 0x7))) == 0 {
|
|
184
|
+
return false;
|
|
185
|
+
}
|
|
186
|
+
self.next_char();
|
|
187
|
+
return true;
|
|
188
|
+
}
|
|
189
|
+
return false;
|
|
190
|
+
}
|
|
191
|
+
|
|
192
|
+
pub fn in_grouping_b(&mut self, chars: &[u8], min: u32, max: u32) -> bool {
|
|
193
|
+
if self.cursor <= self.limit_backward {
|
|
194
|
+
return false;
|
|
195
|
+
}
|
|
196
|
+
self.previous_char();
|
|
197
|
+
if let Some(chr) = self.current[self.cursor as usize..].chars().next() {
|
|
198
|
+
let mut ch = chr as u32; //codepoint as integer
|
|
199
|
+
self.next_char();
|
|
200
|
+
if ch > max || ch < min {
|
|
201
|
+
return false;
|
|
202
|
+
}
|
|
203
|
+
ch -= min;
|
|
204
|
+
if (chars[(ch >> 3) as usize] & (0x1 << (ch & 0x7))) == 0 {
|
|
205
|
+
return false;
|
|
206
|
+
}
|
|
207
|
+
self.previous_char();
|
|
208
|
+
return true;
|
|
209
|
+
}
|
|
210
|
+
return false;
|
|
211
|
+
}
|
|
212
|
+
|
|
213
|
+
pub fn out_grouping(&mut self, chars: &[u8], min: u32, max: u32) -> bool {
|
|
214
|
+
if self.cursor >= self.limit {
|
|
215
|
+
return false;
|
|
216
|
+
}
|
|
217
|
+
if let Some(chr) = self.current[self.cursor as usize..].chars().next() {
|
|
218
|
+
let mut ch = chr as u32; //codepoint as integer
|
|
219
|
+
if ch > max || ch < min {
|
|
220
|
+
self.next_char();
|
|
221
|
+
return true;
|
|
222
|
+
}
|
|
223
|
+
ch -= min;
|
|
224
|
+
if (chars[(ch >> 3) as usize] & (0x1 << (ch & 0x7))) == 0 {
|
|
225
|
+
self.next_char();
|
|
226
|
+
return true;
|
|
227
|
+
}
|
|
228
|
+
}
|
|
229
|
+
return false;
|
|
230
|
+
}
|
|
231
|
+
|
|
232
|
+
pub fn out_grouping_b(&mut self, chars: &[u8], min: u32, max: u32) -> bool {
|
|
233
|
+
if self.cursor <= self.limit_backward {
|
|
234
|
+
return false;
|
|
235
|
+
}
|
|
236
|
+
self.previous_char();
|
|
237
|
+
if let Some(chr) = self.current[self.cursor as usize..].chars().next() {
|
|
238
|
+
let mut ch = chr as u32; //codepoint as integer
|
|
239
|
+
self.next_char();
|
|
240
|
+
if ch > max || ch < min {
|
|
241
|
+
self.previous_char();
|
|
242
|
+
return true;
|
|
243
|
+
}
|
|
244
|
+
ch -= min;
|
|
245
|
+
if (chars[(ch >> 3) as usize] & (0x1 << (ch & 0x7))) == 0 {
|
|
246
|
+
self.previous_char();
|
|
247
|
+
return true;
|
|
248
|
+
}
|
|
249
|
+
}
|
|
250
|
+
return false;
|
|
251
|
+
|
|
252
|
+
}
|
|
253
|
+
|
|
254
|
+
|
|
255
|
+
/// Helper function that removes the string slice between `bra` and `ket`
|
|
256
|
+
pub fn slice_del(&mut self) -> bool {
|
|
257
|
+
self.slice_from("")
|
|
258
|
+
}
|
|
259
|
+
|
|
260
|
+
pub fn insert(&mut self, bra: i32, ket: i32, s: &str) {
|
|
261
|
+
let adjustment = self.replace_s(bra, ket, s);
|
|
262
|
+
if bra <= self.bra {
|
|
263
|
+
self.bra = self.bra + adjustment;
|
|
264
|
+
}
|
|
265
|
+
if bra <= self.ket {
|
|
266
|
+
self.ket = self.ket + adjustment;
|
|
267
|
+
}
|
|
268
|
+
}
|
|
269
|
+
|
|
270
|
+
pub fn assign_to(&mut self) -> String {
|
|
271
|
+
self.current[0..self.limit as usize].to_string()
|
|
272
|
+
}
|
|
273
|
+
|
|
274
|
+
pub fn slice_to(&mut self) -> String {
|
|
275
|
+
self.current[self.bra as usize..self.ket as usize].to_string()
|
|
276
|
+
}
|
|
277
|
+
|
|
278
|
+
/// Look for one of the strings in `amongs` starting at the cursor and not
/// extending past `limit`.
///
/// Each entry is read as: field 0 the string, field 1 the index of the next
/// entry to try (negative for none), field 2 the value to return, field 3
/// an optional extra condition.
///
/// Phase 1 is a binary search comparing the text bytes with the entries'
/// bytes (presumably `amongs` is sorted by string, as the search relies on
/// ordering; confirm against the table generator). Phase 2 walks from the
/// entry the search settled on along the field-1 links. It returns field 2
/// of the first entry that was matched in full and whose condition, if any,
/// returns `true`. On success the cursor is left just past the matched
/// string. Returns 0 when no entry qualifies.
pub fn find_among<T>(&mut self, amongs: &[Among<T>], context: &mut T) -> i32 {
    let start = self.cursor;
    let end = self.limit;

    let mut lo: i32 = 0;
    let mut hi: i32 = amongs.len() as i32;

    // Number of leading bytes known to match at the lower / upper bound.
    let mut matched_lo = 0i32;
    let mut matched_hi = 0i32;

    let mut probed_first = false;

    loop {
        let mid = lo + ((hi - lo) >> 1);
        let key = amongs[mid as usize].0.as_bytes();
        let key_len = key.len() as i32;
        // Bytes shared with both bounds are shared with everything between.
        let mut matched = std::cmp::min(matched_lo, matched_hi);
        let mut order: i32 = 0;
        while matched < key_len {
            if start + matched == end {
                // Text ran out first: it sorts before this entry.
                order = -1;
                break;
            }
            order = self.current.as_bytes()[(start + matched) as usize] as i32
                - key[matched as usize] as i32;
            if order != 0 {
                break;
            }
            matched += 1;
        }
        if order < 0 {
            hi = mid;
            matched_hi = matched;
        } else {
            lo = mid;
            matched_lo = matched;
        }
        if hi - lo <= 1 {
            // When the search collapses onto entry 0 that entry may not
            // have been compared yet, so allow exactly one more pass.
            if lo > 0 || hi == lo || probed_first {
                break;
            }
            probed_first = true;
        }
    }

    loop {
        let entry = &amongs[lo as usize];
        let entry_len = entry.0.len() as i32;
        if matched_lo >= entry_len {
            self.cursor = start + entry_len;
            match entry.3 {
                None => return entry.2,
                Some(ref condition) => {
                    let accepted = condition(self, context);
                    // The condition may have moved the cursor; restore it.
                    self.cursor = start + entry_len;
                    if accepted {
                        return entry.2;
                    }
                }
            }
        }
        lo = entry.1;
        if lo < 0 {
            return 0;
        }
    }
}
|
|
347
|
+
|
|
348
|
+
/// Backward counterpart of `find_among`: look for one of the strings in
/// `amongs` ending at the cursor and not reaching back past
/// `limit_backward`.
///
/// Each entry is read as: field 0 the string, field 1 the index of the next
/// entry to try (negative for none), field 2 the value to return, field 3
/// an optional extra condition.
///
/// Phase 1 is a binary search comparing bytes from the cursor backwards with
/// each entry's bytes from its end backwards (presumably `amongs` is sorted
/// accordingly; confirm against the table generator). Phase 2 walks from the
/// entry the search settled on along the field-1 links. It returns field 2
/// of the first entry that was matched in full and whose condition, if any,
/// returns `true`. On success the cursor is left at the start of the
/// matched string. Returns 0 when no entry qualifies.
pub fn find_among_b<T>(&mut self, amongs: &[Among<T>], context: &mut T) -> i32 {
    let start = self.cursor;
    let floor = self.limit_backward;

    let mut lo: i32 = 0;
    let mut hi: i32 = amongs.len() as i32;

    // Number of trailing bytes known to match at the lower / upper bound.
    let mut matched_lo = 0i32;
    let mut matched_hi = 0i32;

    let mut probed_first = false;

    loop {
        let mid = lo + ((hi - lo) >> 1);
        let key = amongs[mid as usize].0.as_bytes();
        let key_len = key.len() as i32;
        // Bytes shared with both bounds are shared with everything between.
        let mut matched = std::cmp::min(matched_lo, matched_hi);
        let mut order: i32 = 0;
        // Compare byte by byte from the end of the entry. Testing
        // `matched < key_len` (rather than building a range from
        // `len - matched`) means an entry shorter than `matched` simply
        // yields no comparisons, as in `find_among`, instead of
        // underflowing an unsigned subtraction.
        while matched < key_len {
            if start - matched == floor {
                // Text ran out first: it sorts before this entry.
                order = -1;
                break;
            }
            let text_byte = self.current.as_bytes()[(start - matched - 1) as usize] as i32;
            let key_byte = key[(key_len - 1 - matched) as usize] as i32;
            order = text_byte - key_byte;
            if order != 0 {
                break;
            }
            // Counted in bytes, not characters.
            matched += 1;
        }
        if order < 0 {
            hi = mid;
            matched_hi = matched;
        } else {
            lo = mid;
            matched_lo = matched;
        }
        if hi - lo <= 1 {
            // When the search collapses onto entry 0 that entry may not
            // have been compared yet, so allow exactly one more pass.
            if lo > 0 || hi == lo || probed_first {
                break;
            }
            probed_first = true;
        }
    }

    loop {
        let entry = &amongs[lo as usize];
        let entry_len = entry.0.len() as i32;
        if matched_lo >= entry_len {
            self.cursor = start - entry_len;
            match entry.3 {
                None => return entry.2,
                Some(ref condition) => {
                    let accepted = condition(self, context);
                    // The condition may have moved the cursor; restore it.
                    self.cursor = start - entry_len;
                    if accepted {
                        return entry.2;
                    }
                }
            }
        }
        lo = entry.1;
        if lo < 0 {
            return 0;
        }
    }
}
|
|
421
|
+
}
|
|
@@ -0,0 +1,95 @@
|
|
|
1
|
+
/* This is a simple test program which uses libstemmer to run a fixed table
|
|
2
|
+
* of regression test cases through the stemmers and check their output.
|
|
3
|
+
*/
|
|
4
|
+
|
|
5
|
+
#include <stdio.h>
|
|
6
|
+
#include <stdlib.h>
|
|
7
|
+
#include <string.h> /* for strlen, memcmp */
|
|
8
|
+
|
|
9
|
+
#include "libstemmer.h"
|
|
10
|
+
|
|
11
|
+
/* UTF-8 byte sequences for two four-byte code points used in the test inputs:
 * U+1F618 and U+40079. */
#define EMOJI_FACE_THROWING_A_KISS "\xf0\x9f\x98\x98"
#define U_40079 "\xf1\x80\x81\xb9"

/* Table of regression cases; the list ends at the first entry whose
 * `input` is a null pointer. */
static const struct testcase {
    /* Stemmer to use, or 0 to test with all stemmers */
    const char * language;
    /* Character encoding (can be 0 for UTF-8) */
    const char * charenc;
    /* Input string (0 marks end of list) */
    const char * input;
    /* Expected output string (0 means same as input) */
    const char * expect;
} testcases[] = {
    { .language = "en", .charenc = NULL,
      .input = "a" EMOJI_FACE_THROWING_A_KISS "ing",
      .expect = "a" EMOJI_FACE_THROWING_A_KISS "e" },
    { .language = "en", .charenc = NULL,
      .input = U_40079 "wing", .expect = NULL },
    // The Finnish stemmer used to damage numbers ending with two or more of
    // the same digit: https://github.com/snowballstem/snowball/issues/66
    { .language = NULL, .charenc = NULL, .input = "2000", .expect = NULL },
    { .language = NULL, .charenc = NULL, .input = "999", .expect = NULL },
    { .language = NULL, .charenc = NULL, .input = "1000000000", .expect = NULL },
    // The Danish stemmer used to damage a number at the end of a word:
    // https://github.com/snowballstem/snowball/issues/81
    { .language = NULL, .charenc = NULL, .input = "space1999", .expect = NULL },
    { .language = NULL, .charenc = NULL, .input = "hal9000", .expect = NULL },
    { .language = NULL, .charenc = NULL, .input = "0x0e00", .expect = NULL },
    /* End-of-list marker. */
    { .language = NULL, .charenc = NULL, .input = NULL, .expect = NULL }
};
|
|
39
|
+
|
|
40
|
+
static void
|
|
41
|
+
run_testcase(const char * language, const struct testcase *test)
|
|
42
|
+
{
|
|
43
|
+
const char * charenc = test->charenc;
|
|
44
|
+
const char * input = test->input;
|
|
45
|
+
const char * expect = test->expect;
|
|
46
|
+
struct sb_stemmer * stemmer = sb_stemmer_new(language, charenc);
|
|
47
|
+
const sb_symbol * stemmed;
|
|
48
|
+
int len;
|
|
49
|
+
|
|
50
|
+
if (expect == NULL) expect = input;
|
|
51
|
+
if (stemmer == 0) {
|
|
52
|
+
if (charenc == NULL) {
|
|
53
|
+
fprintf(stderr, "language `%s' not available for stemming\n", language);
|
|
54
|
+
exit(1);
|
|
55
|
+
} else {
|
|
56
|
+
fprintf(stderr, "language `%s' not available for stemming in encoding `%s'\n", language, charenc);
|
|
57
|
+
exit(1);
|
|
58
|
+
}
|
|
59
|
+
}
|
|
60
|
+
stemmed = sb_stemmer_stem(stemmer, (const unsigned char*)input, strlen(input));
|
|
61
|
+
if (stemmed == NULL) {
|
|
62
|
+
fprintf(stderr, "Out of memory");
|
|
63
|
+
exit(1);
|
|
64
|
+
}
|
|
65
|
+
|
|
66
|
+
len = sb_stemmer_length(stemmer);
|
|
67
|
+
if (len != (int)strlen(expect) || memcmp(stemmed, expect, len) != 0) {
|
|
68
|
+
fprintf(stderr, "%s stemmer output for %s was %.*s not %s\n",
|
|
69
|
+
language, input, len, stemmed, expect);
|
|
70
|
+
exit(1);
|
|
71
|
+
}
|
|
72
|
+
sb_stemmer_delete(stemmer);
|
|
73
|
+
}
|
|
74
|
+
|
|
75
|
+
int
|
|
76
|
+
main(int argc, char * argv[])
|
|
77
|
+
{
|
|
78
|
+
const char ** all_languages = sb_stemmer_list();
|
|
79
|
+
const struct testcase * p;
|
|
80
|
+
(void)argc;
|
|
81
|
+
(void)argv;
|
|
82
|
+
for (p = testcases; p->input; ++p) {
|
|
83
|
+
const char * language = p->language;
|
|
84
|
+
if (language) {
|
|
85
|
+
run_testcase(language, p);
|
|
86
|
+
} else {
|
|
87
|
+
const char ** l;
|
|
88
|
+
for (l = all_languages; *l; ++l) {
|
|
89
|
+
run_testcase(*l, p);
|
|
90
|
+
}
|
|
91
|
+
}
|
|
92
|
+
}
|
|
93
|
+
|
|
94
|
+
return 0;
|
|
95
|
+
}
|