mittens 0.2.0 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +4 -0
- data/README.md +3 -3
- data/lib/mittens/version.rb +1 -1
- data/vendor/snowball/.github/workflows/ci.yml +216 -0
- data/vendor/snowball/CONTRIBUTING.rst +111 -62
- data/vendor/snowball/GNUmakefile +194 -136
- data/vendor/snowball/NEWS +798 -3
- data/vendor/snowball/README.rst +50 -1
- data/vendor/snowball/ada/src/stemmer.adb +25 -13
- data/vendor/snowball/ada/src/stemmer.ads +9 -9
- data/vendor/snowball/ada/stemmer_config.gpr +7 -7
- data/vendor/snowball/algorithms/basque.sbl +4 -19
- data/vendor/snowball/algorithms/catalan.sbl +2 -9
- data/vendor/snowball/algorithms/danish.sbl +1 -1
- data/vendor/snowball/algorithms/dutch.sbl +284 -122
- data/vendor/snowball/algorithms/dutch_porter.sbl +178 -0
- data/vendor/snowball/algorithms/english.sbl +52 -37
- data/vendor/snowball/algorithms/esperanto.sbl +157 -0
- data/vendor/snowball/algorithms/estonian.sbl +269 -0
- data/vendor/snowball/algorithms/finnish.sbl +2 -3
- data/vendor/snowball/algorithms/french.sbl +42 -16
- data/vendor/snowball/algorithms/german.sbl +35 -14
- data/vendor/snowball/algorithms/greek.sbl +76 -76
- data/vendor/snowball/algorithms/hungarian.sbl +8 -6
- data/vendor/snowball/algorithms/indonesian.sbl +14 -8
- data/vendor/snowball/algorithms/italian.sbl +11 -21
- data/vendor/snowball/algorithms/lithuanian.sbl +36 -37
- data/vendor/snowball/algorithms/lovins.sbl +0 -1
- data/vendor/snowball/algorithms/nepali.sbl +138 -37
- data/vendor/snowball/algorithms/norwegian.sbl +19 -5
- data/vendor/snowball/algorithms/porter.sbl +2 -2
- data/vendor/snowball/algorithms/portuguese.sbl +9 -13
- data/vendor/snowball/algorithms/romanian.sbl +17 -4
- data/vendor/snowball/algorithms/serbian.sbl +467 -468
- data/vendor/snowball/algorithms/spanish.sbl +5 -7
- data/vendor/snowball/algorithms/swedish.sbl +60 -6
- data/vendor/snowball/algorithms/tamil.sbl +207 -176
- data/vendor/snowball/algorithms/turkish.sbl +461 -445
- data/vendor/snowball/algorithms/yiddish.sbl +36 -38
- data/vendor/snowball/compiler/analyser.c +445 -192
- data/vendor/snowball/compiler/driver.c +109 -101
- data/vendor/snowball/compiler/generator.c +853 -464
- data/vendor/snowball/compiler/generator_ada.c +404 -366
- data/vendor/snowball/compiler/generator_csharp.c +297 -260
- data/vendor/snowball/compiler/generator_go.c +323 -254
- data/vendor/snowball/compiler/generator_java.c +326 -252
- data/vendor/snowball/compiler/generator_js.c +362 -252
- data/vendor/snowball/compiler/generator_pascal.c +349 -197
- data/vendor/snowball/compiler/generator_python.c +257 -240
- data/vendor/snowball/compiler/generator_rust.c +423 -251
- data/vendor/snowball/compiler/header.h +117 -71
- data/vendor/snowball/compiler/space.c +137 -68
- data/vendor/snowball/compiler/syswords.h +2 -2
- data/vendor/snowball/compiler/tokeniser.c +125 -107
- data/vendor/snowball/csharp/Snowball/Among.cs +14 -14
- data/vendor/snowball/csharp/Snowball/AssemblyInfo.cs +7 -7
- data/vendor/snowball/csharp/Snowball/Stemmer.cs +57 -37
- data/vendor/snowball/csharp/Stemwords/App.config +2 -2
- data/vendor/snowball/csharp/Stemwords/Program.cs +16 -12
- data/vendor/snowball/doc/libstemmer_c_README +7 -4
- data/vendor/snowball/doc/libstemmer_csharp_README +4 -1
- data/vendor/snowball/doc/libstemmer_java_README +12 -1
- data/vendor/snowball/doc/libstemmer_js_README +6 -4
- data/vendor/snowball/doc/libstemmer_python_README +9 -4
- data/vendor/snowball/examples/stemwords.c +12 -12
- data/vendor/snowball/go/env.go +107 -31
- data/vendor/snowball/go/util.go +0 -4
- data/vendor/snowball/include/libstemmer.h +4 -0
- data/vendor/snowball/java/org/tartarus/snowball/Among.java +32 -15
- data/vendor/snowball/java/org/tartarus/snowball/SnowballProgram.java +347 -261
- data/vendor/snowball/java/org/tartarus/snowball/SnowballStemmer.java +3 -0
- data/vendor/snowball/java/org/tartarus/snowball/TestApp.java +52 -37
- data/vendor/snowball/javascript/base-stemmer.js +186 -2
- data/vendor/snowball/javascript/stemwords.js +3 -6
- data/vendor/snowball/libstemmer/libstemmer_c.in +1 -1
- data/vendor/snowball/libstemmer/mkalgorithms.pl +6 -6
- data/vendor/snowball/libstemmer/mkmodules.pl +2 -2
- data/vendor/snowball/libstemmer/modules.txt +13 -10
- data/vendor/snowball/libstemmer/test.c +1 -1
- data/vendor/snowball/pascal/SnowballProgram.pas +84 -2
- data/vendor/snowball/pascal/generate.pl +13 -13
- data/vendor/snowball/python/create_init.py +4 -1
- data/vendor/snowball/python/setup.cfg +0 -3
- data/vendor/snowball/python/setup.py +8 -3
- data/vendor/snowball/python/snowballstemmer/basestemmer.py +20 -54
- data/vendor/snowball/python/stemwords.py +8 -12
- data/vendor/snowball/runtime/api.c +10 -5
- data/vendor/snowball/runtime/header.h +10 -9
- data/vendor/snowball/runtime/utilities.c +9 -9
- data/vendor/snowball/rust/build.rs +1 -1
- data/vendor/snowball/rust/src/snowball/snowball_env.rs +83 -5
- data/vendor/snowball/tests/stemtest.c +7 -4
- metadata +7 -7
- data/vendor/snowball/.travis.yml +0 -112
- data/vendor/snowball/algorithms/german2.sbl +0 -145
- data/vendor/snowball/algorithms/kraaij_pohlmann.sbl +0 -240
- data/vendor/snowball/compiler/syswords2.h +0 -13
@@ -3,10 +3,10 @@
|
|
3
3
|
// Copyright (c) 2015, Cesar Souza
|
4
4
|
// Copyright (c) 2018, Olly Betts
|
5
5
|
// All rights reserved.
|
6
|
-
//
|
6
|
+
//
|
7
7
|
// Redistribution and use in source and binary forms, with or without
|
8
8
|
// modification, are permitted provided that the following conditions are met:
|
9
|
-
//
|
9
|
+
//
|
10
10
|
// * Redistributions of source code must retain the above copyright notice,
|
11
11
|
// * this list of conditions and the following disclaimer.
|
12
12
|
// * Redistributions in binary form must reproduce the above copyright
|
@@ -15,7 +15,7 @@
|
|
15
15
|
// * Neither the name of the copyright holders nor the names of its contributors
|
16
16
|
// * may be used to endorse or promote products derived from this software
|
17
17
|
// * without specific prior written permission.
|
18
|
-
//
|
18
|
+
//
|
19
19
|
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
20
20
|
// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
21
21
|
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
@@ -36,13 +36,13 @@ namespace Snowball
|
|
36
36
|
/// <summary>
|
37
37
|
/// Class holding current state.
|
38
38
|
/// </summary>
|
39
|
-
///
|
39
|
+
///
|
40
40
|
public class Env
|
41
41
|
{
|
42
42
|
/// <summary>
|
43
43
|
/// Initializes a new instance of the <see cref="Env"/> class.
|
44
44
|
/// </summary>
|
45
|
-
///
|
45
|
+
///
|
46
46
|
protected Env()
|
47
47
|
{
|
48
48
|
}
|
@@ -50,43 +50,43 @@ namespace Snowball
|
|
50
50
|
/// <summary>
|
51
51
|
/// Gets the current string.
|
52
52
|
/// </summary>
|
53
|
-
///
|
53
|
+
///
|
54
54
|
protected StringBuilder current;
|
55
55
|
|
56
56
|
/// <summary>
|
57
57
|
/// Current cursor position.
|
58
58
|
/// </summary>
|
59
|
-
///
|
59
|
+
///
|
60
60
|
protected int cursor;
|
61
61
|
|
62
62
|
/// <summary>
|
63
63
|
/// Forward limit for inspecting the buffer.
|
64
64
|
/// </summary>
|
65
|
-
///
|
65
|
+
///
|
66
66
|
protected int limit;
|
67
67
|
|
68
68
|
/// <summary>
|
69
69
|
/// Backward limit for inspecting the buffer.
|
70
70
|
/// </summary>
|
71
|
-
///
|
71
|
+
///
|
72
72
|
protected int limit_backward;
|
73
73
|
|
74
74
|
/// <summary>
|
75
75
|
/// Starting bracket position.
|
76
76
|
/// </summary>
|
77
|
-
///
|
77
|
+
///
|
78
78
|
protected int bra;
|
79
79
|
|
80
80
|
/// <summary>
|
81
81
|
/// Ending bracket position.
|
82
82
|
/// </summary>
|
83
|
-
///
|
83
|
+
///
|
84
84
|
protected int ket;
|
85
85
|
|
86
86
|
/// <summary>
|
87
87
|
/// Copy another Env object.
|
88
88
|
/// </summary>
|
89
|
-
///
|
89
|
+
///
|
90
90
|
public Env(Env other)
|
91
91
|
{
|
92
92
|
copy_from(other);
|
@@ -95,7 +95,7 @@ namespace Snowball
|
|
95
95
|
/// <summary>
|
96
96
|
/// Copy another Env object.
|
97
97
|
/// </summary>
|
98
|
-
///
|
98
|
+
///
|
99
99
|
protected void copy_from(Env other)
|
100
100
|
{
|
101
101
|
current = other.current;
|
@@ -111,13 +111,13 @@ namespace Snowball
|
|
111
111
|
/// <summary>
|
112
112
|
/// Base class for Snowball's stemmer algorithms.
|
113
113
|
/// </summary>
|
114
|
-
///
|
114
|
+
///
|
115
115
|
public abstract class Stemmer : Env
|
116
116
|
{
|
117
117
|
/// <summary>
|
118
118
|
/// Initializes a new instance of the <see cref="Stemmer"/> class.
|
119
119
|
/// </summary>
|
120
|
-
///
|
120
|
+
///
|
121
121
|
protected Stemmer()
|
122
122
|
{
|
123
123
|
current = new StringBuilder();
|
@@ -128,14 +128,14 @@ namespace Snowball
|
|
128
128
|
/// <summary>
|
129
129
|
/// Calls the stemmer to process the next word.
|
130
130
|
/// </summary>
|
131
|
-
///
|
131
|
+
///
|
132
132
|
protected abstract bool stem();
|
133
133
|
|
134
134
|
|
135
135
|
/// <summary>
|
136
136
|
/// Stems the buffer's contents.
|
137
137
|
/// </summary>
|
138
|
-
///
|
138
|
+
///
|
139
139
|
public bool Stem()
|
140
140
|
{
|
141
141
|
return this.stem();
|
@@ -144,11 +144,11 @@ namespace Snowball
|
|
144
144
|
/// <summary>
|
145
145
|
/// Stems a given word.
|
146
146
|
/// </summary>
|
147
|
-
///
|
147
|
+
///
|
148
148
|
/// <param name="word">The word to be stemmed.</param>
|
149
|
-
///
|
149
|
+
///
|
150
150
|
/// <returns>The stemmed word.</returns>
|
151
|
-
///
|
151
|
+
///
|
152
152
|
public string Stem(string word)
|
153
153
|
{
|
154
154
|
setBufferContents(word);
|
@@ -160,7 +160,7 @@ namespace Snowball
|
|
160
160
|
/// <summary>
|
161
161
|
/// Gets the current processing buffer.
|
162
162
|
/// </summary>
|
163
|
-
///
|
163
|
+
///
|
164
164
|
public StringBuilder Buffer
|
165
165
|
{
|
166
166
|
get { return current; }
|
@@ -171,7 +171,7 @@ namespace Snowball
|
|
171
171
|
/// or the stemmed word, if the stemmer has been
|
172
172
|
/// processed.
|
173
173
|
/// </summary>
|
174
|
-
///
|
174
|
+
///
|
175
175
|
public string Current
|
176
176
|
{
|
177
177
|
get { return current.ToString(); }
|
@@ -192,7 +192,7 @@ namespace Snowball
|
|
192
192
|
|
193
193
|
|
194
194
|
/// <summary>
|
195
|
-
/// Determines whether the current character is
|
195
|
+
/// Determines whether the current character is
|
196
196
|
/// inside a given group of characters <c>s</c>.
|
197
197
|
/// </summary>
|
198
198
|
protected int in_grouping(string s, int min, int max, bool repeat)
|
@@ -217,7 +217,7 @@ namespace Snowball
|
|
217
217
|
}
|
218
218
|
|
219
219
|
/// <summary>
|
220
|
-
/// Determines whether the current character is
|
220
|
+
/// Determines whether the current character is
|
221
221
|
/// inside a given group of characters <c>s</c>.
|
222
222
|
/// </summary>
|
223
223
|
protected int in_grouping_b(string s, int min, int max, bool repeat)
|
@@ -241,7 +241,7 @@ namespace Snowball
|
|
241
241
|
}
|
242
242
|
|
243
243
|
/// <summary>
|
244
|
-
/// Determines whether the current character is
|
244
|
+
/// Determines whether the current character is
|
245
245
|
/// outside a given group of characters <c>s</c>.
|
246
246
|
/// </summary>
|
247
247
|
protected int out_grouping(string s, int min, int max, bool repeat)
|
@@ -272,7 +272,7 @@ namespace Snowball
|
|
272
272
|
}
|
273
273
|
|
274
274
|
/// <summary>
|
275
|
-
/// Determines whether the current character is
|
275
|
+
/// Determines whether the current character is
|
276
276
|
/// outside a given group of characters <c>s</c>.
|
277
277
|
/// </summary>
|
278
278
|
protected int out_grouping_b(string s, int min, int max, bool repeat)
|
@@ -323,6 +323,26 @@ namespace Snowball
|
|
323
323
|
return true;
|
324
324
|
}
|
325
325
|
|
326
|
+
/// <summary>
|
327
|
+
/// Determines if the current buffer contains the
|
328
|
+
/// string s, starting from the current position and
|
329
|
+
/// going forward.
|
330
|
+
/// </summary>
|
331
|
+
protected bool eq_s(StringBuilder s)
|
332
|
+
{
|
333
|
+
if (limit - cursor < s.Length)
|
334
|
+
return false;
|
335
|
+
|
336
|
+
for (int i = 0; i != s.Length; i++)
|
337
|
+
{
|
338
|
+
if (current[cursor + i] != s[i])
|
339
|
+
return false;
|
340
|
+
}
|
341
|
+
|
342
|
+
cursor += s.Length;
|
343
|
+
return true;
|
344
|
+
}
|
345
|
+
|
326
346
|
/// <summary>
|
327
347
|
/// Determines if the current buffer contains the
|
328
348
|
/// string s, starting from the current position and
|
@@ -365,11 +385,11 @@ namespace Snowball
|
|
365
385
|
|
366
386
|
|
367
387
|
/// <summary>
|
368
|
-
/// Searches if the current buffer matches against one of the
|
388
|
+
/// Searches if the current buffer matches against one of the
|
369
389
|
/// amongs, starting from the current cursor position and going
|
370
390
|
/// forward.
|
371
391
|
/// </summary>
|
372
|
-
///
|
392
|
+
///
|
373
393
|
protected int find_among(Among[] v)
|
374
394
|
{
|
375
395
|
int i = 0;
|
@@ -463,11 +483,11 @@ namespace Snowball
|
|
463
483
|
}
|
464
484
|
|
465
485
|
/// <summary>
|
466
|
-
/// Searches if the current buffer matches against one of the
|
486
|
+
/// Searches if the current buffer matches against one of the
|
467
487
|
/// amongs, starting from the current cursor position and going
|
468
488
|
/// backwards.
|
469
489
|
/// </summary>
|
470
|
-
///
|
490
|
+
///
|
471
491
|
protected int find_among_b(Among[] v)
|
472
492
|
{
|
473
493
|
int i = 0;
|
@@ -559,7 +579,7 @@ namespace Snowball
|
|
559
579
|
/// Replaces the characters between <c>c_bra</c>
|
560
580
|
/// and <c>c_ket</c> by the characters in s.
|
561
581
|
/// </summary>
|
562
|
-
///
|
582
|
+
///
|
563
583
|
protected int replace_s(int c_bra, int c_ket, String s)
|
564
584
|
{
|
565
585
|
int adjustment = s.Length - (c_ket - c_bra);
|
@@ -586,7 +606,7 @@ namespace Snowball
|
|
586
606
|
/// <summary>
|
587
607
|
/// Replaces the contents of the bracket with the string s.
|
588
608
|
/// </summary>
|
589
|
-
///
|
609
|
+
///
|
590
610
|
/// <param name="s">The s.</param>
|
591
611
|
protected void slice_from(String s)
|
592
612
|
{
|
@@ -597,7 +617,7 @@ namespace Snowball
|
|
597
617
|
/// <summary>
|
598
618
|
/// Removes the current bracket contents.
|
599
619
|
/// </summary>
|
600
|
-
///
|
620
|
+
///
|
601
621
|
protected void slice_del()
|
602
622
|
{
|
603
623
|
slice_from("");
|
@@ -606,7 +626,7 @@ namespace Snowball
|
|
606
626
|
/// <summary>
|
607
627
|
/// Replaces the contents of the bracket with the string s.
|
608
628
|
/// </summary>
|
609
|
-
///
|
629
|
+
///
|
610
630
|
protected void insert(int c_bra, int c_ket, String s)
|
611
631
|
{
|
612
632
|
int adjustment = replace_s(c_bra, c_ket, s);
|
@@ -617,7 +637,7 @@ namespace Snowball
|
|
617
637
|
/// <summary>
|
618
638
|
/// Replaces the contents of the bracket with the string s.
|
619
639
|
/// </summary>
|
620
|
-
///
|
640
|
+
///
|
621
641
|
protected void insert(int c_bra, int c_ket, StringBuilder s)
|
622
642
|
{
|
623
643
|
int adjustment = replace_s(c_bra, c_ket, s.ToString());
|
@@ -628,7 +648,7 @@ namespace Snowball
|
|
628
648
|
/// <summary>
|
629
649
|
/// Replaces the contents of the bracket with the string s.
|
630
650
|
/// </summary>
|
631
|
-
///
|
651
|
+
///
|
632
652
|
protected void slice_to(StringBuilder s)
|
633
653
|
{
|
634
654
|
slice_check();
|
@@ -638,7 +658,7 @@ namespace Snowball
|
|
638
658
|
/// <summary>
|
639
659
|
/// Replaces the contents of the bracket with the string s.
|
640
660
|
/// </summary>
|
641
|
-
///
|
661
|
+
///
|
642
662
|
protected void assign_to(StringBuilder s)
|
643
663
|
{
|
644
664
|
Replace(s, 0, s.Length, current.ToString(0, limit));
|
@@ -1,11 +1,12 @@
|
|
1
1
|
// Copyright (c) 2001, Dr Martin Porter
|
2
2
|
// Copyright (c) 2002, Richard Boulton
|
3
3
|
// Copyright (c) 2015, Cesar Souza
|
4
|
+
// Copyright (c) 2025, Olly Betts
|
4
5
|
// All rights reserved.
|
5
|
-
//
|
6
|
+
//
|
6
7
|
// Redistribution and use in source and binary forms, with or without
|
7
8
|
// modification, are permitted provided that the following conditions are met:
|
8
|
-
//
|
9
|
+
//
|
9
10
|
// * Redistributions of source code must retain the above copyright notice,
|
10
11
|
// * this list of conditions and the following disclaimer.
|
11
12
|
// * Redistributions in binary form must reproduce the above copyright
|
@@ -14,7 +15,7 @@
|
|
14
15
|
// * Neither the name of the copyright holders nor the names of its contributors
|
15
16
|
// * may be used to endorse or promote products derived from this software
|
16
17
|
// * without specific prior written permission.
|
17
|
-
//
|
18
|
+
//
|
18
19
|
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
19
20
|
// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
20
21
|
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
@@ -32,24 +33,23 @@ namespace Snowball
|
|
32
33
|
using System.IO;
|
33
34
|
using System.Reflection;
|
34
35
|
using System.Linq;
|
35
|
-
using System.Text;
|
36
36
|
|
37
37
|
/// <summary>
|
38
38
|
/// Snowball's Stemmer program.
|
39
39
|
/// </summary>
|
40
|
-
///
|
40
|
+
///
|
41
41
|
public static class Program
|
42
42
|
{
|
43
43
|
|
44
44
|
private static void usage()
|
45
45
|
{
|
46
|
-
Console.WriteLine("Usage: stemwords.exe -l <language> -i <input file> [-o <output file>]");
|
46
|
+
Console.WriteLine("Usage: stemwords.exe -l <language> [-i <input file>] [-o <output file>]");
|
47
47
|
}
|
48
48
|
|
49
49
|
/// <summary>
|
50
50
|
/// Main program entrypoint.
|
51
51
|
/// </summary>
|
52
|
-
///
|
52
|
+
///
|
53
53
|
public static void Main(String[] args)
|
54
54
|
{
|
55
55
|
string language = null;
|
@@ -62,11 +62,11 @@ namespace Snowball
|
|
62
62
|
language = args[i + 1];
|
63
63
|
else if (args[i] == "-i")
|
64
64
|
inputName = args[i + 1];
|
65
|
-
if (args[i] == "-o")
|
65
|
+
else if (args[i] == "-o")
|
66
66
|
outputName = args[i + 1];
|
67
67
|
}
|
68
68
|
|
69
|
-
if (language == null
|
69
|
+
if (language == null)
|
70
70
|
{
|
71
71
|
usage();
|
72
72
|
return;
|
@@ -89,13 +89,17 @@ namespace Snowball
|
|
89
89
|
Console.WriteLine("Using " + stemmer.GetType());
|
90
90
|
|
91
91
|
TextWriter output = System.Console.Out;
|
92
|
-
|
93
92
|
if (outputName != null)
|
94
93
|
output = new StreamWriter(outputName);
|
95
94
|
|
95
|
+
TextReader input = System.Console.In;
|
96
|
+
if (inputName != null)
|
97
|
+
input = new StreamReader(inputName);
|
96
98
|
|
97
|
-
|
99
|
+
while (true)
|
98
100
|
{
|
101
|
+
var line = input.ReadLine();
|
102
|
+
if (line == null) break;
|
99
103
|
var o = stemmer.Stem(line);
|
100
104
|
output.WriteLine(o);
|
101
105
|
}
|
@@ -105,7 +109,7 @@ namespace Snowball
|
|
105
109
|
|
106
110
|
private static bool match(string stemmerName, string language)
|
107
111
|
{
|
108
|
-
string expectedName = language
|
112
|
+
string expectedName = language + "Stemmer";
|
109
113
|
|
110
114
|
return stemmerName.StartsWith(expectedName,
|
111
115
|
StringComparison.CurrentCultureIgnoreCase);
|
@@ -12,7 +12,7 @@ What is Stemming?
|
|
12
12
|
|
13
13
|
Stemming maps different forms of the same word to a common "stem" - for
|
14
14
|
example, the English stemmer maps *connection*, *connections*, *connective*,
|
15
|
-
*connected*, and *connecting* to *connect*. So a
|
15
|
+
*connected*, and *connecting* to *connect*. So a search for *connected*
|
16
16
|
would also find documents which only have the other forms.
|
17
17
|
|
18
18
|
This stem form is often a word itself, but this is not always the case as this
|
@@ -50,6 +50,9 @@ files for the UTF-8 only version of the library.
|
|
50
50
|
Using the library
|
51
51
|
=================
|
52
52
|
|
53
|
+
The stemming algorithms generally expect the input text to use composed accents
|
54
|
+
(Unicode NFC or NFKC) and to have been folded to lower case already.
|
55
|
+
|
53
56
|
The library provides a simple C API. Essentially, a new stemmer can
|
54
57
|
be obtained by using "sb_stemmer_new". "sb_stemmer_stem" is then
|
55
58
|
used to stem a word, "sb_stemmer_length" returns the stemmed
|
@@ -122,11 +125,11 @@ GNU autotool framework (and in particular, automake and autoconf) as follows:
|
|
122
125
|
libstemmer_c subdirectory of the top level directory of the project.
|
123
126
|
|
124
127
|
2) Add a file "Makefile.am" to the unpacked libstemmer_c folder, containing:
|
125
|
-
|
128
|
+
|
126
129
|
noinst_LTLIBRARIES = libstemmer.la
|
127
130
|
include $(srcdir)/mkinc.mak
|
128
131
|
noinst_HEADERS = $(snowball_headers)
|
129
|
-
libstemmer_la_SOURCES = $(snowball_sources)
|
132
|
+
libstemmer_la_SOURCES = $(snowball_sources)
|
130
133
|
|
131
134
|
(You may also need to add other lines to this, for example, if you are using
|
132
135
|
compiler options which are not compatible with compiling the libstemmer
|
@@ -144,5 +147,5 @@ SUBDIRS=libstemmer_c
|
|
144
147
|
<name>_LIBADD = libstemmer_c/libstemmer.la
|
145
148
|
|
146
149
|
(Where <name> is the name of the library or executable which links against
|
147
|
-
libstemmer.)
|
150
|
+
libstemmer.)
|
148
151
|
|
@@ -12,7 +12,7 @@ What is Stemming?
|
|
12
12
|
|
13
13
|
Stemming maps different forms of the same word to a common "stem" - for
|
14
14
|
example, the English stemmer maps *connection*, *connections*, *connective*,
|
15
|
-
*connected*, and *connecting* to *connect*. So a
|
15
|
+
*connected*, and *connecting* to *connect*. So a search for *connected*
|
16
16
|
would also find documents which only have the other forms.
|
17
17
|
|
18
18
|
This stem form is often a word itself, but this is not always the case as this
|
@@ -39,6 +39,9 @@ mcs -target:exe -out:stemwords.exe -r:snowballstemmer.dll csharp/Stemwords/Progr
|
|
39
39
|
Using the library
|
40
40
|
=================
|
41
41
|
|
42
|
+
The stemming algorithms generally expect the input text to use composed accents
|
43
|
+
(Unicode NFC or NFKC) and to have been folded to lower case already.
|
44
|
+
|
42
45
|
There is currently no formal documentation on the use of the C# version
|
43
46
|
of the library. Additionally, its interface is not guaranteed to be
|
44
47
|
stable.
|
@@ -12,7 +12,7 @@ What is Stemming?
|
|
12
12
|
|
13
13
|
Stemming maps different forms of the same word to a common "stem" - for
|
14
14
|
example, the English stemmer maps *connection*, *connections*, *connective*,
|
15
|
-
*connected*, and *connecting* to *connect*. So a
|
15
|
+
*connected*, and *connecting* to *connect*. So a search for *connected*
|
16
16
|
would also find documents which only have the other forms.
|
17
17
|
|
18
18
|
This stem form is often a word itself, but this is not always the case as this
|
@@ -25,6 +25,14 @@ to a root form and/or get a root form which is itself a word then Snowball's
|
|
25
25
|
stemming algorithms likely aren't the right answer.
|
26
26
|
|
27
27
|
|
28
|
+
Requirements
|
29
|
+
============
|
30
|
+
|
31
|
+
The Java code generated by Snowball requires Java >= 7 (since Snowball 3.0.0).
|
32
|
+
Java 7 was released in 2011, and Java 6's EOL was 2013 so we don't expect this
|
33
|
+
to be a problematic requirement.
|
34
|
+
|
35
|
+
|
28
36
|
Compiling the library
|
29
37
|
=====================
|
30
38
|
|
@@ -41,6 +49,9 @@ provides a command line interface to the library.
|
|
41
49
|
Using the library
|
42
50
|
=================
|
43
51
|
|
52
|
+
The stemming algorithms generally expect the input text to use composed accents
|
53
|
+
(Unicode NFC or NFKC) and to have been folded to lower case already.
|
54
|
+
|
44
55
|
There is currently no formal documentation on the use of the Java version
|
45
56
|
of the library. Additionally, its interface is not guaranteed to be
|
46
57
|
stable.
|
@@ -6,7 +6,7 @@ What is Stemming?
|
|
6
6
|
|
7
7
|
Stemming maps different forms of the same word to a common "stem" - for
|
8
8
|
example, the English stemmer maps *connection*, *connections*, *connective*,
|
9
|
-
*connected*, and *connecting* to *connect*. So a
|
9
|
+
*connected*, and *connecting* to *connect*. So a search for *connected*
|
10
10
|
would also find documents which only have the other forms.
|
11
11
|
|
12
12
|
This stem form is often a word itself, but this is not always the case as this
|
@@ -22,16 +22,18 @@ stemming algorithms likely aren't the right answer.
|
|
22
22
|
How to use library
|
23
23
|
------------------
|
24
24
|
|
25
|
+
The stemming algorithms generally expect the input text to use composed accents
|
26
|
+
(Unicode NFC or NFKC) and to have been folded to lower case already.
|
27
|
+
|
25
28
|
You can use each stemming module from JavaScript code - e.g. to use them
|
26
29
|
with node:
|
27
30
|
|
28
31
|
.. code-block:: javascript
|
29
32
|
|
30
|
-
|
31
|
-
const english_stemmer = require('english-stemmer.js');
|
33
|
+
var EnglishStemmer = require('english-stemmer.js');
|
32
34
|
|
33
35
|
var stemmer = new EnglishStemmer();
|
34
|
-
|
36
|
+
console.log(stemmer.stemWord("testing"));
|
35
37
|
|
36
38
|
You'll need to bundle ``base-stemmer.js`` and whichever languages you want
|
37
39
|
stemmers for (e.g. ``english-stemmer.js`` for English).
|
@@ -1,16 +1,18 @@
|
|
1
1
|
Snowball stemming library collection for Python
|
2
2
|
===============================================
|
3
3
|
|
4
|
-
Python 3 (>= 3.3) is supported. We no longer
|
5
|
-
|
6
|
-
|
4
|
+
Python 3 (>= 3.3) is supported. We no longer support Python 2 as the Python
|
5
|
+
developers stopped supporting it at the start of 2020. Snowball 2.1.0 was the
|
6
|
+
last release to officially support Python 2; Snowball 3.0.0 was the last
|
7
|
+
release which had the code to support Python 2, but we were no longer testing
|
8
|
+
it.
|
7
9
|
|
8
10
|
What is Stemming?
|
9
11
|
-----------------
|
10
12
|
|
11
13
|
Stemming maps different forms of the same word to a common "stem" - for
|
12
14
|
example, the English stemmer maps *connection*, *connections*, *connective*,
|
13
|
-
*connected*, and *connecting* to *connect*. So a
|
15
|
+
*connected*, and *connecting* to *connect*. So a search for *connected*
|
14
16
|
would also find documents which only have the other forms.
|
15
17
|
|
16
18
|
This stem form is often a word itself, but this is not always the case as this
|
@@ -25,6 +27,9 @@ stemming algorithms likely aren't the right answer.
|
|
25
27
|
How to use library
|
26
28
|
------------------
|
27
29
|
|
30
|
+
The stemming algorithms generally expect the input text to use composed accents
|
31
|
+
(Unicode NFC or NFKC) and to have been folded to lower case already.
|
32
|
+
|
28
33
|
The ``snowballstemmer`` module has two functions.
|
29
34
|
|
30
35
|
The ``snowballstemmer.algorithms`` function returns a list of available
|
@@ -32,7 +32,7 @@ stem_file(struct sb_stemmer * stemmer, FILE * f_in, FILE * f_out)
|
|
32
32
|
sb_symbol * newb;
|
33
33
|
newb = (sb_symbol *)
|
34
34
|
realloc(b, (lim + INC) * sizeof(sb_symbol));
|
35
|
-
if (newb ==
|
35
|
+
if (newb == NULL) goto error;
|
36
36
|
b = newb;
|
37
37
|
lim = lim + INC;
|
38
38
|
}
|
@@ -50,7 +50,7 @@ stem_file(struct sb_stemmer * stemmer, FILE * f_in, FILE * f_out)
|
|
50
50
|
const sb_symbol * stemmed = sb_stemmer_stem(stemmer, b, i);
|
51
51
|
if (stemmed == NULL)
|
52
52
|
{
|
53
|
-
fprintf(stderr, "Out of memory");
|
53
|
+
fprintf(stderr, "Out of memory or internal error\n");
|
54
54
|
exit(1);
|
55
55
|
}
|
56
56
|
|
@@ -78,7 +78,7 @@ stem_file(struct sb_stemmer * stemmer, FILE * f_in, FILE * f_out)
|
|
78
78
|
}
|
79
79
|
}
|
80
80
|
error:
|
81
|
-
|
81
|
+
free(b);
|
82
82
|
return;
|
83
83
|
}
|
84
84
|
|
@@ -114,8 +114,8 @@ usage(int n)
|
|
114
114
|
int
|
115
115
|
main(int argc, char * argv[])
|
116
116
|
{
|
117
|
-
const char * in =
|
118
|
-
const char * out =
|
117
|
+
const char * in = NULL;
|
118
|
+
const char * out = NULL;
|
119
119
|
FILE * f_in;
|
120
120
|
FILE * f_out;
|
121
121
|
struct sb_stemmer * stemmer;
|
@@ -172,20 +172,20 @@ main(int argc, char * argv[])
|
|
172
172
|
}
|
173
173
|
|
174
174
|
/* prepare the files */
|
175
|
-
f_in = (in ==
|
176
|
-
if (f_in ==
|
175
|
+
f_in = (in == NULL) ? stdin : fopen(in, "r");
|
176
|
+
if (f_in == NULL) {
|
177
177
|
fprintf(stderr, "file %s not found\n", in);
|
178
178
|
exit(1);
|
179
179
|
}
|
180
|
-
f_out = (out ==
|
181
|
-
if (f_out ==
|
180
|
+
f_out = (out == NULL) ? stdout : fopen(out, "w");
|
181
|
+
if (f_out == NULL) {
|
182
182
|
fprintf(stderr, "file %s cannot be opened\n", out);
|
183
183
|
exit(1);
|
184
184
|
}
|
185
185
|
|
186
186
|
/* do the stemming process: */
|
187
187
|
stemmer = sb_stemmer_new(language, charenc);
|
188
|
-
if (stemmer ==
|
188
|
+
if (stemmer == NULL) {
|
189
189
|
if (charenc == NULL) {
|
190
190
|
fprintf(stderr, "language `%s' not available for stemming\n", language);
|
191
191
|
exit(1);
|
@@ -197,8 +197,8 @@ main(int argc, char * argv[])
|
|
197
197
|
stem_file(stemmer, f_in, f_out);
|
198
198
|
sb_stemmer_delete(stemmer);
|
199
199
|
|
200
|
-
if (in !=
|
201
|
-
if (out !=
|
200
|
+
if (in != NULL) (void) fclose(f_in);
|
201
|
+
if (out != NULL) (void) fclose(f_out);
|
202
202
|
|
203
203
|
return 0;
|
204
204
|
}
|