mittens 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (137) hide show
  1. checksums.yaml +7 -0
  2. data/CHANGELOG.md +3 -0
  3. data/Gemfile +7 -0
  4. data/LICENSE.txt +30 -0
  5. data/README.md +62 -0
  6. data/Rakefile +21 -0
  7. data/ext/mittens/ext.c +96 -0
  8. data/ext/mittens/extconf.rb +12 -0
  9. data/lib/mittens/version.rb +3 -0
  10. data/lib/mittens.rb +7 -0
  11. data/mittens.gemspec +22 -0
  12. data/vendor/snowball/.gitignore +26 -0
  13. data/vendor/snowball/.travis.yml +112 -0
  14. data/vendor/snowball/AUTHORS +27 -0
  15. data/vendor/snowball/CONTRIBUTING.rst +216 -0
  16. data/vendor/snowball/COPYING +29 -0
  17. data/vendor/snowball/GNUmakefile +742 -0
  18. data/vendor/snowball/NEWS +754 -0
  19. data/vendor/snowball/README.rst +37 -0
  20. data/vendor/snowball/ada/README.md +74 -0
  21. data/vendor/snowball/ada/generate/generate.adb +83 -0
  22. data/vendor/snowball/ada/generate.gpr +21 -0
  23. data/vendor/snowball/ada/src/stemmer.adb +620 -0
  24. data/vendor/snowball/ada/src/stemmer.ads +219 -0
  25. data/vendor/snowball/ada/src/stemwords.adb +70 -0
  26. data/vendor/snowball/ada/stemmer_config.gpr +83 -0
  27. data/vendor/snowball/ada/stemwords.gpr +21 -0
  28. data/vendor/snowball/algorithms/arabic.sbl +558 -0
  29. data/vendor/snowball/algorithms/armenian.sbl +301 -0
  30. data/vendor/snowball/algorithms/basque.sbl +149 -0
  31. data/vendor/snowball/algorithms/catalan.sbl +202 -0
  32. data/vendor/snowball/algorithms/danish.sbl +93 -0
  33. data/vendor/snowball/algorithms/dutch.sbl +164 -0
  34. data/vendor/snowball/algorithms/english.sbl +229 -0
  35. data/vendor/snowball/algorithms/finnish.sbl +197 -0
  36. data/vendor/snowball/algorithms/french.sbl +254 -0
  37. data/vendor/snowball/algorithms/german.sbl +139 -0
  38. data/vendor/snowball/algorithms/german2.sbl +145 -0
  39. data/vendor/snowball/algorithms/greek.sbl +701 -0
  40. data/vendor/snowball/algorithms/hindi.sbl +323 -0
  41. data/vendor/snowball/algorithms/hungarian.sbl +241 -0
  42. data/vendor/snowball/algorithms/indonesian.sbl +192 -0
  43. data/vendor/snowball/algorithms/irish.sbl +149 -0
  44. data/vendor/snowball/algorithms/italian.sbl +202 -0
  45. data/vendor/snowball/algorithms/kraaij_pohlmann.sbl +240 -0
  46. data/vendor/snowball/algorithms/lithuanian.sbl +373 -0
  47. data/vendor/snowball/algorithms/lovins.sbl +208 -0
  48. data/vendor/snowball/algorithms/nepali.sbl +92 -0
  49. data/vendor/snowball/algorithms/norwegian.sbl +80 -0
  50. data/vendor/snowball/algorithms/porter.sbl +139 -0
  51. data/vendor/snowball/algorithms/portuguese.sbl +218 -0
  52. data/vendor/snowball/algorithms/romanian.sbl +236 -0
  53. data/vendor/snowball/algorithms/russian.sbl +221 -0
  54. data/vendor/snowball/algorithms/serbian.sbl +2379 -0
  55. data/vendor/snowball/algorithms/spanish.sbl +230 -0
  56. data/vendor/snowball/algorithms/swedish.sbl +72 -0
  57. data/vendor/snowball/algorithms/tamil.sbl +405 -0
  58. data/vendor/snowball/algorithms/turkish.sbl +470 -0
  59. data/vendor/snowball/algorithms/yiddish.sbl +460 -0
  60. data/vendor/snowball/charsets/ISO-8859-2.sbl +98 -0
  61. data/vendor/snowball/charsets/KOI8-R.sbl +74 -0
  62. data/vendor/snowball/charsets/cp850.sbl +130 -0
  63. data/vendor/snowball/compiler/analyser.c +1547 -0
  64. data/vendor/snowball/compiler/driver.c +615 -0
  65. data/vendor/snowball/compiler/generator.c +1748 -0
  66. data/vendor/snowball/compiler/generator_ada.c +1702 -0
  67. data/vendor/snowball/compiler/generator_csharp.c +1322 -0
  68. data/vendor/snowball/compiler/generator_go.c +1278 -0
  69. data/vendor/snowball/compiler/generator_java.c +1313 -0
  70. data/vendor/snowball/compiler/generator_js.c +1316 -0
  71. data/vendor/snowball/compiler/generator_pascal.c +1387 -0
  72. data/vendor/snowball/compiler/generator_python.c +1337 -0
  73. data/vendor/snowball/compiler/generator_rust.c +1295 -0
  74. data/vendor/snowball/compiler/header.h +418 -0
  75. data/vendor/snowball/compiler/space.c +286 -0
  76. data/vendor/snowball/compiler/syswords.h +86 -0
  77. data/vendor/snowball/compiler/syswords2.h +13 -0
  78. data/vendor/snowball/compiler/tokeniser.c +567 -0
  79. data/vendor/snowball/csharp/.gitignore +8 -0
  80. data/vendor/snowball/csharp/Snowball/Algorithms/.gitignore +1 -0
  81. data/vendor/snowball/csharp/Snowball/Among.cs +108 -0
  82. data/vendor/snowball/csharp/Snowball/AssemblyInfo.cs +36 -0
  83. data/vendor/snowball/csharp/Snowball/Stemmer.cs +660 -0
  84. data/vendor/snowball/csharp/Stemwords/App.config +6 -0
  85. data/vendor/snowball/csharp/Stemwords/Program.cs +114 -0
  86. data/vendor/snowball/doc/TODO +12 -0
  87. data/vendor/snowball/doc/libstemmer_c_README +148 -0
  88. data/vendor/snowball/doc/libstemmer_csharp_README +53 -0
  89. data/vendor/snowball/doc/libstemmer_java_README +67 -0
  90. data/vendor/snowball/doc/libstemmer_js_README +48 -0
  91. data/vendor/snowball/doc/libstemmer_python_README +113 -0
  92. data/vendor/snowball/examples/stemwords.c +204 -0
  93. data/vendor/snowball/go/README.md +55 -0
  94. data/vendor/snowball/go/among.go +16 -0
  95. data/vendor/snowball/go/env.go +403 -0
  96. data/vendor/snowball/go/stemwords/generate.go +68 -0
  97. data/vendor/snowball/go/stemwords/main.go +68 -0
  98. data/vendor/snowball/go/util.go +34 -0
  99. data/vendor/snowball/iconv.py +50 -0
  100. data/vendor/snowball/include/libstemmer.h +78 -0
  101. data/vendor/snowball/java/org/tartarus/snowball/Among.java +29 -0
  102. data/vendor/snowball/java/org/tartarus/snowball/SnowballProgram.java +381 -0
  103. data/vendor/snowball/java/org/tartarus/snowball/SnowballStemmer.java +8 -0
  104. data/vendor/snowball/java/org/tartarus/snowball/TestApp.java +75 -0
  105. data/vendor/snowball/javascript/base-stemmer.js +294 -0
  106. data/vendor/snowball/javascript/stemwords.js +106 -0
  107. data/vendor/snowball/libstemmer/libstemmer_c.in +96 -0
  108. data/vendor/snowball/libstemmer/mkalgorithms.pl +90 -0
  109. data/vendor/snowball/libstemmer/mkmodules.pl +267 -0
  110. data/vendor/snowball/libstemmer/modules.txt +63 -0
  111. data/vendor/snowball/libstemmer/test.c +34 -0
  112. data/vendor/snowball/pascal/.gitignore +4 -0
  113. data/vendor/snowball/pascal/SnowballProgram.pas +430 -0
  114. data/vendor/snowball/pascal/generate.pl +23 -0
  115. data/vendor/snowball/pascal/stemwords-template.dpr +78 -0
  116. data/vendor/snowball/python/MANIFEST.in +7 -0
  117. data/vendor/snowball/python/create_init.py +54 -0
  118. data/vendor/snowball/python/setup.cfg +6 -0
  119. data/vendor/snowball/python/setup.py +81 -0
  120. data/vendor/snowball/python/snowballstemmer/among.py +13 -0
  121. data/vendor/snowball/python/snowballstemmer/basestemmer.py +323 -0
  122. data/vendor/snowball/python/stemwords.py +101 -0
  123. data/vendor/snowball/python/testapp.py +28 -0
  124. data/vendor/snowball/runtime/api.c +58 -0
  125. data/vendor/snowball/runtime/api.h +32 -0
  126. data/vendor/snowball/runtime/header.h +61 -0
  127. data/vendor/snowball/runtime/utilities.c +513 -0
  128. data/vendor/snowball/rust/Cargo.toml +7 -0
  129. data/vendor/snowball/rust/build.rs +55 -0
  130. data/vendor/snowball/rust/rust-pre-1.27-compat.patch +30 -0
  131. data/vendor/snowball/rust/src/main.rs +102 -0
  132. data/vendor/snowball/rust/src/snowball/algorithms/mod.rs +2 -0
  133. data/vendor/snowball/rust/src/snowball/among.rs +6 -0
  134. data/vendor/snowball/rust/src/snowball/mod.rs +6 -0
  135. data/vendor/snowball/rust/src/snowball/snowball_env.rs +421 -0
  136. data/vendor/snowball/tests/stemtest.c +95 -0
  137. metadata +178 -0
@@ -0,0 +1,6 @@
1
+ use snowball::SnowballEnv;
2
+
3
+ pub struct Among<T: 'static>(pub &'static str,
4
+ pub i32,
5
+ pub i32,
6
+ pub Option<&'static (dyn Fn(&mut SnowballEnv, &mut T) -> bool + Sync)>);
@@ -0,0 +1,6 @@
1
+ pub mod algorithms;
2
+ mod among;
3
+ mod snowball_env;
4
+
5
+ pub use snowball::among::Among;
6
+ pub use snowball::snowball_env::SnowballEnv;
@@ -0,0 +1,421 @@
1
+ use std::borrow::Cow;
2
+ use snowball::Among;
3
+
4
+ #[derive(Debug, Clone)]
5
+ pub struct SnowballEnv<'a> {
6
+ pub current: Cow<'a, str>,
7
+ pub cursor: i32,
8
+ pub limit: i32,
9
+ pub limit_backward: i32,
10
+ pub bra: i32,
11
+ pub ket: i32,
12
+ }
13
+
14
+
15
+ impl<'a> SnowballEnv<'a> {
16
+ pub fn create(value: &'a str) -> Self {
17
+ let len = value.len();
18
+ SnowballEnv {
19
+ current: Cow::from(value),
20
+ cursor: 0,
21
+ limit: len as i32,
22
+ limit_backward: 0,
23
+ bra: 0,
24
+ ket: len as i32,
25
+ }
26
+ }
27
+
28
+ pub fn get_current(self) -> Cow<'a, str> {
29
+ self.current
30
+ }
31
+
32
+ pub fn set_current(&mut self, current: &'a str) {
33
+ self.current = Cow::from(current);
34
+ }
35
+
36
+ pub fn set_current_s(&mut self, current: String) {
37
+ self.current = Cow::from(current);
38
+ }
39
+
40
+ fn replace_s(&mut self, bra: i32, ket: i32, s: &str) -> i32 {
41
+ let adjustment = s.len() as i32 - (ket - bra);
42
+ let mut result = String::with_capacity(self.current.len());
43
+ {
44
+ let (lhs, _) = self.current.split_at(bra as usize);
45
+ let (_, rhs) = self.current.split_at(ket as usize);
46
+ result.push_str(lhs);
47
+ result.push_str(s);
48
+ result.push_str(rhs);
49
+ }
50
+ // Adjust limit and cursor by hand for the change in length (not very nice...)
51
+ let new_lim = self.limit + adjustment;
52
+ self.limit = new_lim;
53
+ if self.cursor >= ket {
54
+ let new_cur = self.cursor + adjustment;
55
+ self.cursor = new_cur;
56
+ } else if self.cursor > bra {
57
+ self.cursor = bra
58
+ }
59
+ self.current = Cow::from(result);
60
+ adjustment
61
+ }
62
+
63
+ /// Check if the text starting at the cursor begins with s.
64
+ /// If so, move cursor to the end of s
65
+ pub fn eq_s(&mut self, s: &str) -> bool {
66
+ if self.cursor >= self.limit {
67
+ return false;
68
+ }
69
+ if self.current[(self.cursor as usize)..].starts_with(s) {
70
+ self.cursor += s.len() as i32;
71
+ while !self.current.is_char_boundary(self.cursor as usize) {
72
+ self.cursor += 1;
73
+ }
74
+ true
75
+ } else {
76
+ false
77
+ }
78
+ }
79
+
80
+ /// Check if 's' ends immediately before the cursor.
81
+ /// If so, move cursor to the beginning of s
82
+ pub fn eq_s_b(&mut self, s: &str) -> bool {
83
+ if (self.cursor - self.limit_backward) < s.len() as i32 {
84
+ false
85
+ // Check that cursor - s.len() is a char boundary; if it is not, s cannot match, so return false
86
+ } else if !self.current.is_char_boundary(self.cursor as usize - s.len()) ||
87
+ !self.current[self.cursor as usize - s.len()..].starts_with(s) {
88
+ false
89
+ } else {
90
+ self.cursor -= s.len() as i32;
91
+ true
92
+ }
93
+ }
94
+
95
+ /// Replace string between `bra` and `ket` with s
96
+ pub fn slice_from(&mut self, s: &str) -> bool {
97
+ let (bra, ket) = (self.bra, self.ket);
98
+ self.replace_s(bra, ket, s);
99
+ true
100
+ }
101
+
102
+ /// Move cursor to next character
103
+ pub fn next_char(&mut self) {
104
+ self.cursor += 1;
105
+ while !self.current.is_char_boundary(self.cursor as usize) {
106
+ self.cursor += 1;
107
+ }
108
+ }
109
+
110
+ /// Move cursor to previous character
111
+ pub fn previous_char(&mut self) {
112
+ self.cursor -= 1;
113
+ while !self.current.is_char_boundary(self.cursor as usize) {
114
+ self.cursor -= 1;
115
+ }
116
+ }
117
+
118
+ pub fn hop(&mut self, mut delta: i32) -> bool {
119
+ let mut res = self.cursor;
120
+ while delta > 0 {
121
+ delta -= 1;
122
+ if res >= self.limit {
123
+ return false;
124
+ }
125
+ res += 1;
126
+ while res < self.limit && !self.current.is_char_boundary(res as usize) {
127
+ res += 1;
128
+ }
129
+ }
130
+ self.cursor = res;
131
+ return true;
132
+ }
133
+
134
+ pub fn hop_checked(&mut self, delta: i32) -> bool {
135
+ return delta >= 0 && self.hop(delta);
136
+ }
137
+
138
+ pub fn hop_back(&mut self, mut delta: i32) -> bool {
139
+ let mut res = self.cursor;
140
+ while delta > 0 {
141
+ delta -= 1;
142
+ if res <= self.limit_backward {
143
+ return false;
144
+ }
145
+ res -= 1;
146
+ while res > self.limit_backward && !self.current.is_char_boundary(res as usize) {
147
+ res -= 1;
148
+ }
149
+ }
150
+ self.cursor = res;
151
+ return true;
152
+ }
153
+
154
+ pub fn hop_back_checked(&mut self, delta: i32) -> bool {
155
+ return delta >= 0 && self.hop_back(delta);
156
+ }
157
+
158
+ // A grouping is represented by a minimum code point, a maximum code point,
159
+ // and a bitfield of which code points in that range are in the grouping.
160
+ // For example, in english.sbl, valid_LI is 'cdeghkmnrt'.
161
+ // The minimum and maximum code points are 99 and 116,
162
+ // so every time one of these grouping functions is called for g_valid_LI,
163
+ // min must be 99 and max must be 116. There are 18 code points within that
164
+ // range (inclusive) so the grouping is represented with 18 bits, plus 6 bits of padding:
165
+ //
166
+ // cdefghij klmnopqr st
167
+ // 11101100 10110001 01000000
168
+ //
169
+ // The first bit is the least significant.
170
+ // Those three bytes become &[0b00110111, 0b10001101, 0b00000010],
171
+ // which is &[55, 141, 2], which is how g_valid_LI is defined in english.rs.
172
+ /// Check if the char the cursor points to is in the grouping
173
+ pub fn in_grouping(&mut self, chars: &[u8], min: u32, max: u32) -> bool {
174
+ if self.cursor >= self.limit {
175
+ return false;
176
+ }
177
+ if let Some(chr) = self.current[self.cursor as usize..].chars().next() {
178
+ let mut ch = chr as u32; //codepoint as integer
179
+ if ch > max || ch < min {
180
+ return false;
181
+ }
182
+ ch -= min;
183
+ if (chars[(ch >> 3) as usize] & (0x1 << (ch & 0x7))) == 0 {
184
+ return false;
185
+ }
186
+ self.next_char();
187
+ return true;
188
+ }
189
+ return false;
190
+ }
191
+
192
+ pub fn in_grouping_b(&mut self, chars: &[u8], min: u32, max: u32) -> bool {
193
+ if self.cursor <= self.limit_backward {
194
+ return false;
195
+ }
196
+ self.previous_char();
197
+ if let Some(chr) = self.current[self.cursor as usize..].chars().next() {
198
+ let mut ch = chr as u32; //codepoint as integer
199
+ self.next_char();
200
+ if ch > max || ch < min {
201
+ return false;
202
+ }
203
+ ch -= min;
204
+ if (chars[(ch >> 3) as usize] & (0x1 << (ch & 0x7))) == 0 {
205
+ return false;
206
+ }
207
+ self.previous_char();
208
+ return true;
209
+ }
210
+ return false;
211
+ }
212
+
213
+ pub fn out_grouping(&mut self, chars: &[u8], min: u32, max: u32) -> bool {
214
+ if self.cursor >= self.limit {
215
+ return false;
216
+ }
217
+ if let Some(chr) = self.current[self.cursor as usize..].chars().next() {
218
+ let mut ch = chr as u32; //codepoint as integer
219
+ if ch > max || ch < min {
220
+ self.next_char();
221
+ return true;
222
+ }
223
+ ch -= min;
224
+ if (chars[(ch >> 3) as usize] & (0x1 << (ch & 0x7))) == 0 {
225
+ self.next_char();
226
+ return true;
227
+ }
228
+ }
229
+ return false;
230
+ }
231
+
232
+ pub fn out_grouping_b(&mut self, chars: &[u8], min: u32, max: u32) -> bool {
233
+ if self.cursor <= self.limit_backward {
234
+ return false;
235
+ }
236
+ self.previous_char();
237
+ if let Some(chr) = self.current[self.cursor as usize..].chars().next() {
238
+ let mut ch = chr as u32; //codepoint as integer
239
+ self.next_char();
240
+ if ch > max || ch < min {
241
+ self.previous_char();
242
+ return true;
243
+ }
244
+ ch -= min;
245
+ if (chars[(ch >> 3) as usize] & (0x1 << (ch & 0x7))) == 0 {
246
+ self.previous_char();
247
+ return true;
248
+ }
249
+ }
250
+ return false;
251
+
252
+ }
253
+
254
+
255
+ /// Helper function that removes the string slice between `bra` and `ket`
256
+ pub fn slice_del(&mut self) -> bool {
257
+ self.slice_from("")
258
+ }
259
+
260
+ pub fn insert(&mut self, bra: i32, ket: i32, s: &str) {
261
+ let adjustment = self.replace_s(bra, ket, s);
262
+ if bra <= self.bra {
263
+ self.bra = self.bra + adjustment;
264
+ }
265
+ if bra <= self.ket {
266
+ self.ket = self.ket + adjustment;
267
+ }
268
+ }
269
+
270
+ pub fn assign_to(&mut self) -> String {
271
+ self.current[0..self.limit as usize].to_string()
272
+ }
273
+
274
+ pub fn slice_to(&mut self) -> String {
275
+ self.current[self.bra as usize..self.ket as usize].to_string()
276
+ }
277
+
278
+ pub fn find_among<T>(&mut self, amongs: &[Among<T>], context: &mut T) -> i32 {
279
+ use std::cmp::min;
280
+ let mut i: i32 = 0;
281
+ let mut j: i32 = amongs.len() as i32;
282
+
283
+ let c = self.cursor;
284
+ let l = self.limit;
285
+
286
+ let mut common_i = 0i32;
287
+ let mut common_j = 0i32;
288
+
289
+ let mut first_key_inspected = false;
290
+ loop {
291
+ let k = i + ((j - i) >> 1);
292
+ let mut diff: i32 = 0;
293
+ let mut common = min(common_i, common_j);
294
+ let w = &amongs[k as usize];
295
+ for lvar in common..w.0.len() as i32 {
296
+ if c + common == l {
297
+ diff = -1;
298
+ break;
299
+ }
300
+ diff = self.current.as_bytes()[(c + common) as usize] as i32 - w.0.as_bytes()[lvar as usize] as i32;
301
+ if diff != 0 {
302
+ break;
303
+ }
304
+ common += 1;
305
+ }
306
+ if diff < 0 {
307
+ j = k;
308
+ common_j = common;
309
+ } else {
310
+ i = k;
311
+ common_i = common;
312
+ }
313
+ if j - i <= 1 {
314
+ if i > 0 {
315
+ break;
316
+ }
317
+ if j == i {
318
+ break;
319
+ }
320
+ if first_key_inspected {
321
+ break;
322
+ }
323
+ first_key_inspected = true;
324
+ }
325
+ }
326
+
327
+ loop {
328
+ let w = &amongs[i as usize];
329
+ if common_i >= w.0.len() as i32{
330
+ self.cursor = c + w.0.len() as i32;
331
+ if let Some(ref method) = w.3 {
332
+ let res = method(self, context);
333
+ self.cursor = c + w.0.len() as i32;
334
+ if res {
335
+ return w.2;
336
+ }
337
+ } else {
338
+ return w.2;
339
+ }
340
+ }
341
+ i = w.1;
342
+ if i < 0 {
343
+ return 0;
344
+ }
345
+ }
346
+ }
347
+
348
+ pub fn find_among_b<T>(&mut self, amongs: &[Among<T>], context: &mut T) -> i32 {
349
+ let mut i: i32 = 0;
350
+ let mut j: i32 = amongs.len() as i32;
351
+
352
+ let c = self.cursor;
353
+ let lb = self.limit_backward;
354
+
355
+ let mut common_i = 0i32;
356
+ let mut common_j = 0i32;
357
+
358
+ let mut first_key_inspected = false;
359
+
360
+ loop {
361
+ let k = i + ((j - i) >> 1);
362
+ let mut diff: i32 = 0;
363
+ let mut common = if common_i < common_j {
364
+ common_i
365
+ } else {
366
+ common_j
367
+ };
368
+ let w = &amongs[k as usize];
369
+ for lvar in (0..w.0.len() - common as usize).rev() {
370
+ if c - common == lb {
371
+ diff = -1;
372
+ break;
373
+ }
374
+ diff = self.current.as_bytes()[(c - common - 1) as usize] as i32 - w.0.as_bytes()[lvar] as i32;
375
+ if diff != 0 {
376
+ break;
377
+ }
378
+ // Count the matched bytes: the comparison is byte-wise, so this advances one byte at a time, not one character
379
+ common += 1;
380
+ }
381
+ if diff < 0 {
382
+ j = k;
383
+ common_j = common;
384
+ } else {
385
+ i = k;
386
+ common_i = common;
387
+ }
388
+ if j - i <= 1 {
389
+ if i > 0 {
390
+ break;
391
+ }
392
+ if j == i {
393
+ break;
394
+ }
395
+ if first_key_inspected {
396
+ break;
397
+ }
398
+ first_key_inspected = true;
399
+ }
400
+ }
401
+ loop {
402
+ let w = &amongs[i as usize];
403
+ if common_i >= w.0.len() as i32 {
404
+ self.cursor = c - w.0.len() as i32;
405
+ if let Some(ref method) = w.3 {
406
+ let res = method(self, context);
407
+ self.cursor = c - w.0.len() as i32;
408
+ if res {
409
+ return w.2;
410
+ }
411
+ } else {
412
+ return w.2;
413
+ }
414
+ }
415
+ i = w.1;
416
+ if i < 0 {
417
+ return 0;
418
+ }
419
+ }
420
+ }
421
+ }
@@ -0,0 +1,95 @@
1
+ /* This is a simple program which uses libstemmer to run a fixed list of
2
+ * test cases through the stemmers, exiting with an error on any mismatch.
3
+ */
4
+
5
+ #include <stdio.h>
6
+ #include <stdlib.h>
7
+ #include <string.h> /* for strlen, memcmp */
8
+
9
+ #include "libstemmer.h"
10
+
11
+ #define EMOJI_FACE_THROWING_A_KISS "\xf0\x9f\x98\x98"
12
+ #define U_40079 "\xf1\x80\x81\xb9"
13
+ static const struct testcase {
14
+ /* Stemmer to use, or 0 to test with all stemmers */
15
+ const char * language;
16
+ /* Character encoding (can be 0 for UTF-8) */
17
+ const char * charenc;
18
+ /* Input string (0 marks end of list) */
19
+ const char * input;
20
+ /* Expected output string (0 means same as input) */
21
+ const char * expect;
22
+ } testcases[] = {
23
+ { "en", 0,
24
+ "a" EMOJI_FACE_THROWING_A_KISS "ing",
25
+ "a" EMOJI_FACE_THROWING_A_KISS "e" },
26
+ { "en", 0, U_40079 "wing", 0 },
27
+ // The Finnish stemmer used to damage numbers ending with two or more of
28
+ // the same digit: https://github.com/snowballstem/snowball/issues/66
29
+ { 0, 0, "2000", 0 },
30
+ { 0, 0, "999", 0 },
31
+ { 0, 0, "1000000000", 0 },
32
+ // The Danish stemmer used to damage a number at the end of a word:
33
+ // https://github.com/snowballstem/snowball/issues/81
34
+ { 0, 0, "space1999", 0 },
35
+ { 0, 0, "hal9000", 0 },
36
+ { 0, 0, "0x0e00", 0 },
37
+ { 0, 0, 0, 0 }
38
+ };
39
+
40
+ static void
41
+ run_testcase(const char * language, const struct testcase *test)
42
+ {
43
+ const char * charenc = test->charenc;
44
+ const char * input = test->input;
45
+ const char * expect = test->expect;
46
+ struct sb_stemmer * stemmer = sb_stemmer_new(language, charenc);
47
+ const sb_symbol * stemmed;
48
+ int len;
49
+
50
+ if (expect == NULL) expect = input;
51
+ if (stemmer == 0) {
52
+ if (charenc == NULL) {
53
+ fprintf(stderr, "language `%s' not available for stemming\n", language);
54
+ exit(1);
55
+ } else {
56
+ fprintf(stderr, "language `%s' not available for stemming in encoding `%s'\n", language, charenc);
57
+ exit(1);
58
+ }
59
+ }
60
+ stemmed = sb_stemmer_stem(stemmer, (const unsigned char*)input, strlen(input));
61
+ if (stemmed == NULL) {
62
+ fprintf(stderr, "Out of memory");
63
+ exit(1);
64
+ }
65
+
66
+ len = sb_stemmer_length(stemmer);
67
+ if (len != (int)strlen(expect) || memcmp(stemmed, expect, len) != 0) {
68
+ fprintf(stderr, "%s stemmer output for %s was %.*s not %s\n",
69
+ language, input, len, stemmed, expect);
70
+ exit(1);
71
+ }
72
+ sb_stemmer_delete(stemmer);
73
+ }
74
+
75
+ int
76
+ main(int argc, char * argv[])
77
+ {
78
+ const char ** all_languages = sb_stemmer_list();
79
+ const struct testcase * p;
80
+ (void)argc;
81
+ (void)argv;
82
+ for (p = testcases; p->input; ++p) {
83
+ const char * language = p->language;
84
+ if (language) {
85
+ run_testcase(language, p);
86
+ } else {
87
+ const char ** l;
88
+ for (l = all_languages; *l; ++l) {
89
+ run_testcase(*l, p);
90
+ }
91
+ }
92
+ }
93
+
94
+ return 0;
95
+ }