mittens 0.2.0 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (98)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +4 -0
  3. data/README.md +3 -3
  4. data/lib/mittens/version.rb +1 -1
  5. data/vendor/snowball/.github/workflows/ci.yml +216 -0
  6. data/vendor/snowball/CONTRIBUTING.rst +111 -62
  7. data/vendor/snowball/GNUmakefile +194 -136
  8. data/vendor/snowball/NEWS +798 -3
  9. data/vendor/snowball/README.rst +50 -1
  10. data/vendor/snowball/ada/src/stemmer.adb +25 -13
  11. data/vendor/snowball/ada/src/stemmer.ads +9 -9
  12. data/vendor/snowball/ada/stemmer_config.gpr +7 -7
  13. data/vendor/snowball/algorithms/basque.sbl +4 -19
  14. data/vendor/snowball/algorithms/catalan.sbl +2 -9
  15. data/vendor/snowball/algorithms/danish.sbl +1 -1
  16. data/vendor/snowball/algorithms/dutch.sbl +284 -122
  17. data/vendor/snowball/algorithms/dutch_porter.sbl +178 -0
  18. data/vendor/snowball/algorithms/english.sbl +52 -37
  19. data/vendor/snowball/algorithms/esperanto.sbl +157 -0
  20. data/vendor/snowball/algorithms/estonian.sbl +269 -0
  21. data/vendor/snowball/algorithms/finnish.sbl +2 -3
  22. data/vendor/snowball/algorithms/french.sbl +42 -16
  23. data/vendor/snowball/algorithms/german.sbl +35 -14
  24. data/vendor/snowball/algorithms/greek.sbl +76 -76
  25. data/vendor/snowball/algorithms/hungarian.sbl +8 -6
  26. data/vendor/snowball/algorithms/indonesian.sbl +14 -8
  27. data/vendor/snowball/algorithms/italian.sbl +11 -21
  28. data/vendor/snowball/algorithms/lithuanian.sbl +36 -37
  29. data/vendor/snowball/algorithms/lovins.sbl +0 -1
  30. data/vendor/snowball/algorithms/nepali.sbl +138 -37
  31. data/vendor/snowball/algorithms/norwegian.sbl +19 -5
  32. data/vendor/snowball/algorithms/porter.sbl +2 -2
  33. data/vendor/snowball/algorithms/portuguese.sbl +9 -13
  34. data/vendor/snowball/algorithms/romanian.sbl +17 -4
  35. data/vendor/snowball/algorithms/serbian.sbl +467 -468
  36. data/vendor/snowball/algorithms/spanish.sbl +5 -7
  37. data/vendor/snowball/algorithms/swedish.sbl +60 -6
  38. data/vendor/snowball/algorithms/tamil.sbl +207 -176
  39. data/vendor/snowball/algorithms/turkish.sbl +461 -445
  40. data/vendor/snowball/algorithms/yiddish.sbl +36 -38
  41. data/vendor/snowball/compiler/analyser.c +445 -192
  42. data/vendor/snowball/compiler/driver.c +109 -101
  43. data/vendor/snowball/compiler/generator.c +853 -464
  44. data/vendor/snowball/compiler/generator_ada.c +404 -366
  45. data/vendor/snowball/compiler/generator_csharp.c +297 -260
  46. data/vendor/snowball/compiler/generator_go.c +323 -254
  47. data/vendor/snowball/compiler/generator_java.c +326 -252
  48. data/vendor/snowball/compiler/generator_js.c +362 -252
  49. data/vendor/snowball/compiler/generator_pascal.c +349 -197
  50. data/vendor/snowball/compiler/generator_python.c +257 -240
  51. data/vendor/snowball/compiler/generator_rust.c +423 -251
  52. data/vendor/snowball/compiler/header.h +117 -71
  53. data/vendor/snowball/compiler/space.c +137 -68
  54. data/vendor/snowball/compiler/syswords.h +2 -2
  55. data/vendor/snowball/compiler/tokeniser.c +125 -107
  56. data/vendor/snowball/csharp/Snowball/Among.cs +14 -14
  57. data/vendor/snowball/csharp/Snowball/AssemblyInfo.cs +7 -7
  58. data/vendor/snowball/csharp/Snowball/Stemmer.cs +57 -37
  59. data/vendor/snowball/csharp/Stemwords/App.config +2 -2
  60. data/vendor/snowball/csharp/Stemwords/Program.cs +16 -12
  61. data/vendor/snowball/doc/libstemmer_c_README +7 -4
  62. data/vendor/snowball/doc/libstemmer_csharp_README +4 -1
  63. data/vendor/snowball/doc/libstemmer_java_README +12 -1
  64. data/vendor/snowball/doc/libstemmer_js_README +6 -4
  65. data/vendor/snowball/doc/libstemmer_python_README +9 -4
  66. data/vendor/snowball/examples/stemwords.c +12 -12
  67. data/vendor/snowball/go/env.go +107 -31
  68. data/vendor/snowball/go/util.go +0 -4
  69. data/vendor/snowball/include/libstemmer.h +4 -0
  70. data/vendor/snowball/java/org/tartarus/snowball/Among.java +32 -15
  71. data/vendor/snowball/java/org/tartarus/snowball/SnowballProgram.java +347 -261
  72. data/vendor/snowball/java/org/tartarus/snowball/SnowballStemmer.java +3 -0
  73. data/vendor/snowball/java/org/tartarus/snowball/TestApp.java +52 -37
  74. data/vendor/snowball/javascript/base-stemmer.js +186 -2
  75. data/vendor/snowball/javascript/stemwords.js +3 -6
  76. data/vendor/snowball/libstemmer/libstemmer_c.in +1 -1
  77. data/vendor/snowball/libstemmer/mkalgorithms.pl +6 -6
  78. data/vendor/snowball/libstemmer/mkmodules.pl +2 -2
  79. data/vendor/snowball/libstemmer/modules.txt +13 -10
  80. data/vendor/snowball/libstemmer/test.c +1 -1
  81. data/vendor/snowball/pascal/SnowballProgram.pas +84 -2
  82. data/vendor/snowball/pascal/generate.pl +13 -13
  83. data/vendor/snowball/python/create_init.py +4 -1
  84. data/vendor/snowball/python/setup.cfg +0 -3
  85. data/vendor/snowball/python/setup.py +8 -3
  86. data/vendor/snowball/python/snowballstemmer/basestemmer.py +20 -54
  87. data/vendor/snowball/python/stemwords.py +8 -12
  88. data/vendor/snowball/runtime/api.c +10 -5
  89. data/vendor/snowball/runtime/header.h +10 -9
  90. data/vendor/snowball/runtime/utilities.c +9 -9
  91. data/vendor/snowball/rust/build.rs +1 -1
  92. data/vendor/snowball/rust/src/snowball/snowball_env.rs +83 -5
  93. data/vendor/snowball/tests/stemtest.c +7 -4
  94. metadata +7 -7
  95. data/vendor/snowball/.travis.yml +0 -112
  96. data/vendor/snowball/algorithms/german2.sbl +0 -145
  97. data/vendor/snowball/algorithms/kraaij_pohlmann.sbl +0 -240
  98. data/vendor/snowball/compiler/syswords2.h +0 -13
@@ -3,10 +3,10 @@
3
3
  // Copyright (c) 2015, Cesar Souza
4
4
  // Copyright (c) 2018, Olly Betts
5
5
  // All rights reserved.
6
- //
6
+ //
7
7
  // Redistribution and use in source and binary forms, with or without
8
8
  // modification, are permitted provided that the following conditions are met:
9
- //
9
+ //
10
10
  // * Redistributions of source code must retain the above copyright notice,
11
11
  // * this list of conditions and the following disclaimer.
12
12
  // * Redistributions in binary form must reproduce the above copyright
@@ -15,7 +15,7 @@
15
15
  // * Neither the name of the copyright holders nor the names of its contributors
16
16
  // * may be used to endorse or promote products derived from this software
17
17
  // * without specific prior written permission.
18
- //
18
+ //
19
19
  // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20
20
  // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21
21
  // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
@@ -36,13 +36,13 @@ namespace Snowball
36
36
  /// <summary>
37
37
  /// Class holding current state.
38
38
  /// </summary>
39
- ///
39
+ ///
40
40
  public class Env
41
41
  {
42
42
  /// <summary>
43
43
  /// Initializes a new instance of the <see cref="Env"/> class.
44
44
  /// </summary>
45
- ///
45
+ ///
46
46
  protected Env()
47
47
  {
48
48
  }
@@ -50,43 +50,43 @@ namespace Snowball
50
50
  /// <summary>
51
51
  /// Gets the current string.
52
52
  /// </summary>
53
- ///
53
+ ///
54
54
  protected StringBuilder current;
55
55
 
56
56
  /// <summary>
57
57
  /// Current cursor position.
58
58
  /// </summary>
59
- ///
59
+ ///
60
60
  protected int cursor;
61
61
 
62
62
  /// <summary>
63
63
  /// Forward limit for inspecting the buffer.
64
64
  /// </summary>
65
- ///
65
+ ///
66
66
  protected int limit;
67
67
 
68
68
  /// <summary>
69
69
  /// Backward limit for inspecting the buffer.
70
70
  /// </summary>
71
- ///
71
+ ///
72
72
  protected int limit_backward;
73
73
 
74
74
  /// <summary>
75
75
  /// Starting bracket position.
76
76
  /// </summary>
77
- ///
77
+ ///
78
78
  protected int bra;
79
79
 
80
80
  /// <summary>
81
81
  /// Ending bracket position.
82
82
  /// </summary>
83
- ///
83
+ ///
84
84
  protected int ket;
85
85
 
86
86
  /// <summary>
87
87
  /// Copy another Env object.
88
88
  /// </summary>
89
- ///
89
+ ///
90
90
  public Env(Env other)
91
91
  {
92
92
  copy_from(other);
@@ -95,7 +95,7 @@ namespace Snowball
95
95
  /// <summary>
96
96
  /// Copy another Env object.
97
97
  /// </summary>
98
- ///
98
+ ///
99
99
  protected void copy_from(Env other)
100
100
  {
101
101
  current = other.current;
@@ -111,13 +111,13 @@ namespace Snowball
111
111
  /// <summary>
112
112
  /// Base class for Snowball's stemmer algorithms.
113
113
  /// </summary>
114
- ///
114
+ ///
115
115
  public abstract class Stemmer : Env
116
116
  {
117
117
  /// <summary>
118
118
  /// Initializes a new instance of the <see cref="Stemmer"/> class.
119
119
  /// </summary>
120
- ///
120
+ ///
121
121
  protected Stemmer()
122
122
  {
123
123
  current = new StringBuilder();
@@ -128,14 +128,14 @@ namespace Snowball
128
128
  /// <summary>
129
129
  /// Calls the stemmer to process the next word.
130
130
  /// </summary>
131
- ///
131
+ ///
132
132
  protected abstract bool stem();
133
133
 
134
134
 
135
135
  /// <summary>
136
136
  /// Stems the buffer's contents.
137
137
  /// </summary>
138
- ///
138
+ ///
139
139
  public bool Stem()
140
140
  {
141
141
  return this.stem();
@@ -144,11 +144,11 @@ namespace Snowball
144
144
  /// <summary>
145
145
  /// Stems a given word.
146
146
  /// </summary>
147
- ///
147
+ ///
148
148
  /// <param name="word">The word to be stemmed.</param>
149
- ///
149
+ ///
150
150
  /// <returns>The stemmed word.</returns>
151
- ///
151
+ ///
152
152
  public string Stem(string word)
153
153
  {
154
154
  setBufferContents(word);
@@ -160,7 +160,7 @@ namespace Snowball
160
160
  /// <summary>
161
161
  /// Gets the current processing buffer.
162
162
  /// </summary>
163
- ///
163
+ ///
164
164
  public StringBuilder Buffer
165
165
  {
166
166
  get { return current; }
@@ -171,7 +171,7 @@ namespace Snowball
171
171
  /// or the stemmed word, if the stemmer has been
172
172
  /// processed.
173
173
  /// </summary>
174
- ///
174
+ ///
175
175
  public string Current
176
176
  {
177
177
  get { return current.ToString(); }
@@ -192,7 +192,7 @@ namespace Snowball
192
192
 
193
193
 
194
194
  /// <summary>
195
- /// Determines whether the current character is
195
+ /// Determines whether the current character is
196
196
  /// inside a given group of characters <c>s</c>.
197
197
  /// </summary>
198
198
  protected int in_grouping(string s, int min, int max, bool repeat)
@@ -217,7 +217,7 @@ namespace Snowball
217
217
  }
218
218
 
219
219
  /// <summary>
220
- /// Determines whether the current character is
220
+ /// Determines whether the current character is
221
221
  /// inside a given group of characters <c>s</c>.
222
222
  /// </summary>
223
223
  protected int in_grouping_b(string s, int min, int max, bool repeat)
@@ -241,7 +241,7 @@ namespace Snowball
241
241
  }
242
242
 
243
243
  /// <summary>
244
- /// Determines whether the current character is
244
+ /// Determines whether the current character is
245
245
  /// outside a given group of characters <c>s</c>.
246
246
  /// </summary>
247
247
  protected int out_grouping(string s, int min, int max, bool repeat)
@@ -272,7 +272,7 @@ namespace Snowball
272
272
  }
273
273
 
274
274
  /// <summary>
275
- /// Determines whether the current character is
275
+ /// Determines whether the current character is
276
276
  /// outside a given group of characters <c>s</c>.
277
277
  /// </summary>
278
278
  protected int out_grouping_b(string s, int min, int max, bool repeat)
@@ -323,6 +323,26 @@ namespace Snowball
323
323
  return true;
324
324
  }
325
325
 
326
+ /// <summary>
327
+ /// Determines if the current buffer contains the
328
+ /// string s, starting from the current position and
329
+ /// going forward.
330
+ /// </summary>
331
+ protected bool eq_s(StringBuilder s)
332
+ {
333
+ if (limit - cursor < s.Length)
334
+ return false;
335
+
336
+ for (int i = 0; i != s.Length; i++)
337
+ {
338
+ if (current[cursor + i] != s[i])
339
+ return false;
340
+ }
341
+
342
+ cursor += s.Length;
343
+ return true;
344
+ }
345
+
326
346
  /// <summary>
327
347
  /// Determines if the current buffer contains the
328
348
  /// string s, starting from the current position and
@@ -365,11 +385,11 @@ namespace Snowball
365
385
 
366
386
 
367
387
  /// <summary>
368
- /// Searches if the current buffer matches against one of the
388
+ /// Searches if the current buffer matches against one of the
369
389
  /// amongs, starting from the current cursor position and going
370
390
  /// forward.
371
391
  /// </summary>
372
- ///
392
+ ///
373
393
  protected int find_among(Among[] v)
374
394
  {
375
395
  int i = 0;
@@ -463,11 +483,11 @@ namespace Snowball
463
483
  }
464
484
 
465
485
  /// <summary>
466
- /// Searches if the current buffer matches against one of the
486
+ /// Searches if the current buffer matches against one of the
467
487
  /// amongs, starting from the current cursor position and going
468
488
  /// backwards.
469
489
  /// </summary>
470
- ///
490
+ ///
471
491
  protected int find_among_b(Among[] v)
472
492
  {
473
493
  int i = 0;
@@ -559,7 +579,7 @@ namespace Snowball
559
579
  /// Replaces the characters between <c>c_bra</c>
560
580
  /// and <c>c_ket</c> by the characters in s.
561
581
  /// </summary>
562
- ///
582
+ ///
563
583
  protected int replace_s(int c_bra, int c_ket, String s)
564
584
  {
565
585
  int adjustment = s.Length - (c_ket - c_bra);
@@ -586,7 +606,7 @@ namespace Snowball
586
606
  /// <summary>
587
607
  /// Replaces the contents of the bracket with the string s.
588
608
  /// </summary>
589
- ///
609
+ ///
590
610
  /// <param name="s">The s.</param>
591
611
  protected void slice_from(String s)
592
612
  {
@@ -597,7 +617,7 @@ namespace Snowball
597
617
  /// <summary>
598
618
  /// Removes the current bracket contents.
599
619
  /// </summary>
600
- ///
620
+ ///
601
621
  protected void slice_del()
602
622
  {
603
623
  slice_from("");
@@ -606,7 +626,7 @@ namespace Snowball
606
626
  /// <summary>
607
627
  /// Replaces the contents of the bracket with the string s.
608
628
  /// </summary>
609
- ///
629
+ ///
610
630
  protected void insert(int c_bra, int c_ket, String s)
611
631
  {
612
632
  int adjustment = replace_s(c_bra, c_ket, s);
@@ -617,7 +637,7 @@ namespace Snowball
617
637
  /// <summary>
618
638
  /// Replaces the contents of the bracket with the string s.
619
639
  /// </summary>
620
- ///
640
+ ///
621
641
  protected void insert(int c_bra, int c_ket, StringBuilder s)
622
642
  {
623
643
  int adjustment = replace_s(c_bra, c_ket, s.ToString());
@@ -628,7 +648,7 @@ namespace Snowball
628
648
  /// <summary>
629
649
  /// Replaces the contents of the bracket with the string s.
630
650
  /// </summary>
631
- ///
651
+ ///
632
652
  protected void slice_to(StringBuilder s)
633
653
  {
634
654
  slice_check();
@@ -638,7 +658,7 @@ namespace Snowball
638
658
  /// <summary>
639
659
  /// Replaces the contents of the bracket with the string s.
640
660
  /// </summary>
641
- ///
661
+ ///
642
662
  protected void assign_to(StringBuilder s)
643
663
  {
644
664
  Replace(s, 0, s.Length, current.ToString(0, limit));
@@ -1,6 +1,6 @@
1
1
  <?xml version="1.0" encoding="utf-8" ?>
2
2
  <configuration>
3
- <startup>
3
+ <startup>
4
4
  <supportedRuntime version="v4.0" sku=".NETFramework,Version=v4.5" />
5
5
  </startup>
6
- </configuration>
6
+ </configuration>
@@ -1,11 +1,12 @@
1
1
  // Copyright (c) 2001, Dr Martin Porter
2
2
  // Copyright (c) 2002, Richard Boulton
3
3
  // Copyright (c) 2015, Cesar Souza
4
+ // Copyright (c) 2025, Olly Betts
4
5
  // All rights reserved.
5
- //
6
+ //
6
7
  // Redistribution and use in source and binary forms, with or without
7
8
  // modification, are permitted provided that the following conditions are met:
8
- //
9
+ //
9
10
  // * Redistributions of source code must retain the above copyright notice,
10
11
  // * this list of conditions and the following disclaimer.
11
12
  // * Redistributions in binary form must reproduce the above copyright
@@ -14,7 +15,7 @@
14
15
  // * Neither the name of the copyright holders nor the names of its contributors
15
16
  // * may be used to endorse or promote products derived from this software
16
17
  // * without specific prior written permission.
17
- //
18
+ //
18
19
  // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
19
20
  // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20
21
  // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
@@ -32,24 +33,23 @@ namespace Snowball
32
33
  using System.IO;
33
34
  using System.Reflection;
34
35
  using System.Linq;
35
- using System.Text;
36
36
 
37
37
  /// <summary>
38
38
  /// Snowball's Stemmer program.
39
39
  /// </summary>
40
- ///
40
+ ///
41
41
  public static class Program
42
42
  {
43
43
 
44
44
  private static void usage()
45
45
  {
46
- Console.WriteLine("Usage: stemwords.exe -l <language> -i <input file> [-o <output file>]");
46
+ Console.WriteLine("Usage: stemwords.exe -l <language> [-i <input file>] [-o <output file>]");
47
47
  }
48
48
 
49
49
  /// <summary>
50
50
  /// Main program entrypoint.
51
51
  /// </summary>
52
- ///
52
+ ///
53
53
  public static void Main(String[] args)
54
54
  {
55
55
  string language = null;
@@ -62,11 +62,11 @@ namespace Snowball
62
62
  language = args[i + 1];
63
63
  else if (args[i] == "-i")
64
64
  inputName = args[i + 1];
65
- if (args[i] == "-o")
65
+ else if (args[i] == "-o")
66
66
  outputName = args[i + 1];
67
67
  }
68
68
 
69
- if (language == null || inputName == null)
69
+ if (language == null)
70
70
  {
71
71
  usage();
72
72
  return;
@@ -89,13 +89,17 @@ namespace Snowball
89
89
  Console.WriteLine("Using " + stemmer.GetType());
90
90
 
91
91
  TextWriter output = System.Console.Out;
92
-
93
92
  if (outputName != null)
94
93
  output = new StreamWriter(outputName);
95
94
 
95
+ TextReader input = System.Console.In;
96
+ if (inputName != null)
97
+ input = new StreamReader(inputName);
96
98
 
97
- foreach (var line in File.ReadAllLines(inputName))
99
+ while (true)
98
100
  {
101
+ var line = input.ReadLine();
102
+ if (line == null) break;
99
103
  var o = stemmer.Stem(line);
100
104
  output.WriteLine(o);
101
105
  }
@@ -105,7 +109,7 @@ namespace Snowball
105
109
 
106
110
  private static bool match(string stemmerName, string language)
107
111
  {
108
- string expectedName = language.Replace("_", "") + "Stemmer";
112
+ string expectedName = language + "Stemmer";
109
113
 
110
114
  return stemmerName.StartsWith(expectedName,
111
115
  StringComparison.CurrentCultureIgnoreCase);
@@ -12,7 +12,7 @@ What is Stemming?
12
12
 
13
13
  Stemming maps different forms of the same word to a common "stem" - for
14
14
  example, the English stemmer maps *connection*, *connections*, *connective*,
15
- *connected*, and *connecting* to *connect*. So a searching for *connected*
15
+ *connected*, and *connecting* to *connect*. So a search for *connected*
16
16
  would also find documents which only have the other forms.
17
17
 
18
18
  This stem form is often a word itself, but this is not always the case as this
@@ -50,6 +50,9 @@ files for the UTF-8 only version of the library.
50
50
  Using the library
51
51
  =================
52
52
 
53
+ The stemming algorithms generally expect the input text to use composed accents
54
+ (Unicode NFC or NFKC) and to have been folded to lower case already.
55
+
53
56
  The library provides a simple C API. Essentially, a new stemmer can
54
57
  be obtained by using "sb_stemmer_new". "sb_stemmer_stem" is then
55
58
  used to stem a word, "sb_stemmer_length" returns the stemmed
@@ -122,11 +125,11 @@ GNU autotool framework (and in particular, automake and autoconf) as follows:
122
125
  libstemmer_c subdirectory of the top level directory of the project.
123
126
 
124
127
  2) Add a file "Makefile.am" to the unpacked libstemmer_c folder, containing:
125
-
128
+
126
129
  noinst_LTLIBRARIES = libstemmer.la
127
130
  include $(srcdir)/mkinc.mak
128
131
  noinst_HEADERS = $(snowball_headers)
129
- libstemmer_la_SOURCES = $(snowball_sources)
132
+ libstemmer_la_SOURCES = $(snowball_sources)
130
133
 
131
134
  (You may also need to add other lines to this, for example, if you are using
132
135
  compiler options which are not compatible with compiling the libstemmer
@@ -144,5 +147,5 @@ SUBDIRS=libstemmer_c
144
147
  <name>_LIBADD = libstemmer_c/libstemmer.la
145
148
 
146
149
  (Where <name> is the name of the library or executable which links against
147
- libstemmer.)
150
+ libstemmer.)
148
151
 
@@ -12,7 +12,7 @@ What is Stemming?
12
12
 
13
13
  Stemming maps different forms of the same word to a common "stem" - for
14
14
  example, the English stemmer maps *connection*, *connections*, *connective*,
15
- *connected*, and *connecting* to *connect*. So a searching for *connected*
15
+ *connected*, and *connecting* to *connect*. So a search for *connected*
16
16
  would also find documents which only have the other forms.
17
17
 
18
18
  This stem form is often a word itself, but this is not always the case as this
@@ -39,6 +39,9 @@ mcs -target:exe -out:stemwords.exe -r:snowballstemmer.dll csharp/Stemwords/Progr
39
39
  Using the library
40
40
  =================
41
41
 
42
+ The stemming algorithms generally expect the input text to use composed accents
43
+ (Unicode NFC or NFKC) and to have been folded to lower case already.
44
+
42
45
  There is currently no formal documentation on the use of the C# version
43
46
  of the library. Additionally, its interface is not guaranteed to be
44
47
  stable.
@@ -12,7 +12,7 @@ What is Stemming?
12
12
 
13
13
  Stemming maps different forms of the same word to a common "stem" - for
14
14
  example, the English stemmer maps *connection*, *connections*, *connective*,
15
- *connected*, and *connecting* to *connect*. So a searching for *connected*
15
+ *connected*, and *connecting* to *connect*. So a search for *connected*
16
16
  would also find documents which only have the other forms.
17
17
 
18
18
  This stem form is often a word itself, but this is not always the case as this
@@ -25,6 +25,14 @@ to a root form and/or get a root form which is itself a word then Snowball's
25
25
  stemming algorithms likely aren't the right answer.
26
26
 
27
27
 
28
+ Requirements
29
+ ============
30
+
31
+ The Java code generated by Snowball requires Java >= 7 (since Snowball 3.0.0).
32
+ Java 7 was released in 2011, and Java 6's EOL was 2013 so we don't expect this
33
+ to be a problematic requirement.
34
+
35
+
28
36
  Compiling the library
29
37
  =====================
30
38
 
@@ -41,6 +49,9 @@ provides a command line interface to the library.
41
49
  Using the library
42
50
  =================
43
51
 
52
+ The stemming algorithms generally expect the input text to use composed accents
53
+ (Unicode NFC or NFKC) and to have been folded to lower case already.
54
+
44
55
  There is currently no formal documentation on the use of the Java version
45
56
  of the library. Additionally, its interface is not guaranteed to be
46
57
  stable.
@@ -6,7 +6,7 @@ What is Stemming?
6
6
 
7
7
  Stemming maps different forms of the same word to a common "stem" - for
8
8
  example, the English stemmer maps *connection*, *connections*, *connective*,
9
- *connected*, and *connecting* to *connect*. So a searching for *connected*
9
+ *connected*, and *connecting* to *connect*. So a search for *connected*
10
10
  would also find documents which only have the other forms.
11
11
 
12
12
  This stem form is often a word itself, but this is not always the case as this
@@ -22,16 +22,18 @@ stemming algorithms likely aren't the right answer.
22
22
  How to use library
23
23
  ------------------
24
24
 
25
+ The stemming algorithms generally expect the input text to use composed accents
26
+ (Unicode NFC or NFKC) and to have been folded to lower case already.
27
+
25
28
  You can use each stemming module from JavaScript code - e.g. to use them
26
29
  with node:
27
30
 
28
31
  .. code-block:: javascript
29
32
 
30
- const stemmer = require('base-stemmer.js');
31
- const english_stemmer = require('english-stemmer.js');
33
+ var EnglishStemmer = require('english-stemmer.js');
32
34
 
33
35
  var stemmer = new EnglishStemmer();
34
- alert(stemmer.stemWord("testing"));
36
+ console.log(stemmer.stemWord("testing"));
35
37
 
36
38
  You'll need to bundle ``base-stemmer.js`` and whichever languages you want
37
39
  stemmers for (e.g. ``english-stemmer.js`` for English).
@@ -1,16 +1,18 @@
1
1
  Snowball stemming library collection for Python
2
2
  ===============================================
3
3
 
4
- Python 3 (>= 3.3) is supported. We no longer actively support Python 2 as
5
- the Python developers stopped supporting it at the start of 2020. Snowball
6
- 2.1.0 was the last release to officially support Python 2.
4
+ Python 3 (>= 3.3) is supported. We no longer support Python 2 as the Python
5
+ developers stopped supporting it at the start of 2020. Snowball 2.1.0 was the
6
+ last release to officially support Python 2; Snowball 3.0.0 was the last
7
+ release which had the code to support Python 2, but we were no longer testing
8
+ it.
7
9
 
8
10
  What is Stemming?
9
11
  -----------------
10
12
 
11
13
  Stemming maps different forms of the same word to a common "stem" - for
12
14
  example, the English stemmer maps *connection*, *connections*, *connective*,
13
- *connected*, and *connecting* to *connect*. So a searching for *connected*
15
+ *connected*, and *connecting* to *connect*. So a search for *connected*
14
16
  would also find documents which only have the other forms.
15
17
 
16
18
  This stem form is often a word itself, but this is not always the case as this
@@ -25,6 +27,9 @@ stemming algorithms likely aren't the right answer.
25
27
  How to use library
26
28
  ------------------
27
29
 
30
+ The stemming algorithms generally expect the input text to use composed accents
31
+ (Unicode NFC or NFKC) and to have been folded to lower case already.
32
+
28
33
  The ``snowballstemmer`` module has two functions.
29
34
 
30
35
  The ``snowballstemmer.algorithms`` function returns a list of available
@@ -32,7 +32,7 @@ stem_file(struct sb_stemmer * stemmer, FILE * f_in, FILE * f_out)
32
32
  sb_symbol * newb;
33
33
  newb = (sb_symbol *)
34
34
  realloc(b, (lim + INC) * sizeof(sb_symbol));
35
- if (newb == 0) goto error;
35
+ if (newb == NULL) goto error;
36
36
  b = newb;
37
37
  lim = lim + INC;
38
38
  }
@@ -50,7 +50,7 @@ stem_file(struct sb_stemmer * stemmer, FILE * f_in, FILE * f_out)
50
50
  const sb_symbol * stemmed = sb_stemmer_stem(stemmer, b, i);
51
51
  if (stemmed == NULL)
52
52
  {
53
- fprintf(stderr, "Out of memory");
53
+ fprintf(stderr, "Out of memory or internal error\n");
54
54
  exit(1);
55
55
  }
56
56
 
@@ -78,7 +78,7 @@ stem_file(struct sb_stemmer * stemmer, FILE * f_in, FILE * f_out)
78
78
  }
79
79
  }
80
80
  error:
81
- if (b != 0) free(b);
81
+ free(b);
82
82
  return;
83
83
  }
84
84
 
@@ -114,8 +114,8 @@ usage(int n)
114
114
  int
115
115
  main(int argc, char * argv[])
116
116
  {
117
- const char * in = 0;
118
- const char * out = 0;
117
+ const char * in = NULL;
118
+ const char * out = NULL;
119
119
  FILE * f_in;
120
120
  FILE * f_out;
121
121
  struct sb_stemmer * stemmer;
@@ -172,20 +172,20 @@ main(int argc, char * argv[])
172
172
  }
173
173
 
174
174
  /* prepare the files */
175
- f_in = (in == 0) ? stdin : fopen(in, "r");
176
- if (f_in == 0) {
175
+ f_in = (in == NULL) ? stdin : fopen(in, "r");
176
+ if (f_in == NULL) {
177
177
  fprintf(stderr, "file %s not found\n", in);
178
178
  exit(1);
179
179
  }
180
- f_out = (out == 0) ? stdout : fopen(out, "w");
181
- if (f_out == 0) {
180
+ f_out = (out == NULL) ? stdout : fopen(out, "w");
181
+ if (f_out == NULL) {
182
182
  fprintf(stderr, "file %s cannot be opened\n", out);
183
183
  exit(1);
184
184
  }
185
185
 
186
186
  /* do the stemming process: */
187
187
  stemmer = sb_stemmer_new(language, charenc);
188
- if (stemmer == 0) {
188
+ if (stemmer == NULL) {
189
189
  if (charenc == NULL) {
190
190
  fprintf(stderr, "language `%s' not available for stemming\n", language);
191
191
  exit(1);
@@ -197,8 +197,8 @@ main(int argc, char * argv[])
197
197
  stem_file(stemmer, f_in, f_out);
198
198
  sb_stemmer_delete(stemmer);
199
199
 
200
- if (in != 0) (void) fclose(f_in);
201
- if (out != 0) (void) fclose(f_out);
200
+ if (in != NULL) (void) fclose(f_in);
201
+ if (out != NULL) (void) fclose(f_out);
202
202
 
203
203
  return 0;
204
204
  }