mittens 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (137) hide show
  1. checksums.yaml +7 -0
  2. data/CHANGELOG.md +3 -0
  3. data/Gemfile +7 -0
  4. data/LICENSE.txt +30 -0
  5. data/README.md +62 -0
  6. data/Rakefile +21 -0
  7. data/ext/mittens/ext.c +96 -0
  8. data/ext/mittens/extconf.rb +12 -0
  9. data/lib/mittens/version.rb +3 -0
  10. data/lib/mittens.rb +7 -0
  11. data/mittens.gemspec +22 -0
  12. data/vendor/snowball/.gitignore +26 -0
  13. data/vendor/snowball/.travis.yml +112 -0
  14. data/vendor/snowball/AUTHORS +27 -0
  15. data/vendor/snowball/CONTRIBUTING.rst +216 -0
  16. data/vendor/snowball/COPYING +29 -0
  17. data/vendor/snowball/GNUmakefile +742 -0
  18. data/vendor/snowball/NEWS +754 -0
  19. data/vendor/snowball/README.rst +37 -0
  20. data/vendor/snowball/ada/README.md +74 -0
  21. data/vendor/snowball/ada/generate/generate.adb +83 -0
  22. data/vendor/snowball/ada/generate.gpr +21 -0
  23. data/vendor/snowball/ada/src/stemmer.adb +620 -0
  24. data/vendor/snowball/ada/src/stemmer.ads +219 -0
  25. data/vendor/snowball/ada/src/stemwords.adb +70 -0
  26. data/vendor/snowball/ada/stemmer_config.gpr +83 -0
  27. data/vendor/snowball/ada/stemwords.gpr +21 -0
  28. data/vendor/snowball/algorithms/arabic.sbl +558 -0
  29. data/vendor/snowball/algorithms/armenian.sbl +301 -0
  30. data/vendor/snowball/algorithms/basque.sbl +149 -0
  31. data/vendor/snowball/algorithms/catalan.sbl +202 -0
  32. data/vendor/snowball/algorithms/danish.sbl +93 -0
  33. data/vendor/snowball/algorithms/dutch.sbl +164 -0
  34. data/vendor/snowball/algorithms/english.sbl +229 -0
  35. data/vendor/snowball/algorithms/finnish.sbl +197 -0
  36. data/vendor/snowball/algorithms/french.sbl +254 -0
  37. data/vendor/snowball/algorithms/german.sbl +139 -0
  38. data/vendor/snowball/algorithms/german2.sbl +145 -0
  39. data/vendor/snowball/algorithms/greek.sbl +701 -0
  40. data/vendor/snowball/algorithms/hindi.sbl +323 -0
  41. data/vendor/snowball/algorithms/hungarian.sbl +241 -0
  42. data/vendor/snowball/algorithms/indonesian.sbl +192 -0
  43. data/vendor/snowball/algorithms/irish.sbl +149 -0
  44. data/vendor/snowball/algorithms/italian.sbl +202 -0
  45. data/vendor/snowball/algorithms/kraaij_pohlmann.sbl +240 -0
  46. data/vendor/snowball/algorithms/lithuanian.sbl +373 -0
  47. data/vendor/snowball/algorithms/lovins.sbl +208 -0
  48. data/vendor/snowball/algorithms/nepali.sbl +92 -0
  49. data/vendor/snowball/algorithms/norwegian.sbl +80 -0
  50. data/vendor/snowball/algorithms/porter.sbl +139 -0
  51. data/vendor/snowball/algorithms/portuguese.sbl +218 -0
  52. data/vendor/snowball/algorithms/romanian.sbl +236 -0
  53. data/vendor/snowball/algorithms/russian.sbl +221 -0
  54. data/vendor/snowball/algorithms/serbian.sbl +2379 -0
  55. data/vendor/snowball/algorithms/spanish.sbl +230 -0
  56. data/vendor/snowball/algorithms/swedish.sbl +72 -0
  57. data/vendor/snowball/algorithms/tamil.sbl +405 -0
  58. data/vendor/snowball/algorithms/turkish.sbl +470 -0
  59. data/vendor/snowball/algorithms/yiddish.sbl +460 -0
  60. data/vendor/snowball/charsets/ISO-8859-2.sbl +98 -0
  61. data/vendor/snowball/charsets/KOI8-R.sbl +74 -0
  62. data/vendor/snowball/charsets/cp850.sbl +130 -0
  63. data/vendor/snowball/compiler/analyser.c +1547 -0
  64. data/vendor/snowball/compiler/driver.c +615 -0
  65. data/vendor/snowball/compiler/generator.c +1748 -0
  66. data/vendor/snowball/compiler/generator_ada.c +1702 -0
  67. data/vendor/snowball/compiler/generator_csharp.c +1322 -0
  68. data/vendor/snowball/compiler/generator_go.c +1278 -0
  69. data/vendor/snowball/compiler/generator_java.c +1313 -0
  70. data/vendor/snowball/compiler/generator_js.c +1316 -0
  71. data/vendor/snowball/compiler/generator_pascal.c +1387 -0
  72. data/vendor/snowball/compiler/generator_python.c +1337 -0
  73. data/vendor/snowball/compiler/generator_rust.c +1295 -0
  74. data/vendor/snowball/compiler/header.h +418 -0
  75. data/vendor/snowball/compiler/space.c +286 -0
  76. data/vendor/snowball/compiler/syswords.h +86 -0
  77. data/vendor/snowball/compiler/syswords2.h +13 -0
  78. data/vendor/snowball/compiler/tokeniser.c +567 -0
  79. data/vendor/snowball/csharp/.gitignore +8 -0
  80. data/vendor/snowball/csharp/Snowball/Algorithms/.gitignore +1 -0
  81. data/vendor/snowball/csharp/Snowball/Among.cs +108 -0
  82. data/vendor/snowball/csharp/Snowball/AssemblyInfo.cs +36 -0
  83. data/vendor/snowball/csharp/Snowball/Stemmer.cs +660 -0
  84. data/vendor/snowball/csharp/Stemwords/App.config +6 -0
  85. data/vendor/snowball/csharp/Stemwords/Program.cs +114 -0
  86. data/vendor/snowball/doc/TODO +12 -0
  87. data/vendor/snowball/doc/libstemmer_c_README +148 -0
  88. data/vendor/snowball/doc/libstemmer_csharp_README +53 -0
  89. data/vendor/snowball/doc/libstemmer_java_README +67 -0
  90. data/vendor/snowball/doc/libstemmer_js_README +48 -0
  91. data/vendor/snowball/doc/libstemmer_python_README +113 -0
  92. data/vendor/snowball/examples/stemwords.c +204 -0
  93. data/vendor/snowball/go/README.md +55 -0
  94. data/vendor/snowball/go/among.go +16 -0
  95. data/vendor/snowball/go/env.go +403 -0
  96. data/vendor/snowball/go/stemwords/generate.go +68 -0
  97. data/vendor/snowball/go/stemwords/main.go +68 -0
  98. data/vendor/snowball/go/util.go +34 -0
  99. data/vendor/snowball/iconv.py +50 -0
  100. data/vendor/snowball/include/libstemmer.h +78 -0
  101. data/vendor/snowball/java/org/tartarus/snowball/Among.java +29 -0
  102. data/vendor/snowball/java/org/tartarus/snowball/SnowballProgram.java +381 -0
  103. data/vendor/snowball/java/org/tartarus/snowball/SnowballStemmer.java +8 -0
  104. data/vendor/snowball/java/org/tartarus/snowball/TestApp.java +75 -0
  105. data/vendor/snowball/javascript/base-stemmer.js +294 -0
  106. data/vendor/snowball/javascript/stemwords.js +106 -0
  107. data/vendor/snowball/libstemmer/libstemmer_c.in +96 -0
  108. data/vendor/snowball/libstemmer/mkalgorithms.pl +90 -0
  109. data/vendor/snowball/libstemmer/mkmodules.pl +267 -0
  110. data/vendor/snowball/libstemmer/modules.txt +63 -0
  111. data/vendor/snowball/libstemmer/test.c +34 -0
  112. data/vendor/snowball/pascal/.gitignore +4 -0
  113. data/vendor/snowball/pascal/SnowballProgram.pas +430 -0
  114. data/vendor/snowball/pascal/generate.pl +23 -0
  115. data/vendor/snowball/pascal/stemwords-template.dpr +78 -0
  116. data/vendor/snowball/python/MANIFEST.in +7 -0
  117. data/vendor/snowball/python/create_init.py +54 -0
  118. data/vendor/snowball/python/setup.cfg +6 -0
  119. data/vendor/snowball/python/setup.py +81 -0
  120. data/vendor/snowball/python/snowballstemmer/among.py +13 -0
  121. data/vendor/snowball/python/snowballstemmer/basestemmer.py +323 -0
  122. data/vendor/snowball/python/stemwords.py +101 -0
  123. data/vendor/snowball/python/testapp.py +28 -0
  124. data/vendor/snowball/runtime/api.c +58 -0
  125. data/vendor/snowball/runtime/api.h +32 -0
  126. data/vendor/snowball/runtime/header.h +61 -0
  127. data/vendor/snowball/runtime/utilities.c +513 -0
  128. data/vendor/snowball/rust/Cargo.toml +7 -0
  129. data/vendor/snowball/rust/build.rs +55 -0
  130. data/vendor/snowball/rust/rust-pre-1.27-compat.patch +30 -0
  131. data/vendor/snowball/rust/src/main.rs +102 -0
  132. data/vendor/snowball/rust/src/snowball/algorithms/mod.rs +2 -0
  133. data/vendor/snowball/rust/src/snowball/among.rs +6 -0
  134. data/vendor/snowball/rust/src/snowball/mod.rs +6 -0
  135. data/vendor/snowball/rust/src/snowball/snowball_env.rs +421 -0
  136. data/vendor/snowball/tests/stemtest.c +95 -0
  137. metadata +178 -0
@@ -0,0 +1,754 @@
1
+ Snowball 2.2.0 (2021-11-10)
2
+ ===========================
3
+
4
+ New Code Generators
5
+ -------------------
6
+
7
+ * Add Ada generator from Stephane Carrez (#135).
8
+
9
+ Javascript
10
+ ----------
11
+
12
+ * Fix generated code to use integer division rather than floating point
13
+ division.
14
+
15
+ Noted by David Corbett.
16
+
17
+ Pascal
18
+ ------
19
+
20
+ * Fix code generated for division. Previously real division was used and the
21
+ generated code would fail to compile with an "Incompatible types" error.
22
+
23
+ Noted by David Corbett.
24
+
25
+ * Fix code generated for Snowball's `minint` and `maxint` constants.
26
+
27
+ Python
28
+ ------
29
+
30
+ * Python 2 is no longer actively supported, as proposed on the mailing list:
31
+ https://lists.tartarus.org/pipermail/snowball-discuss/2021-August/001721.html
32
+
33
+ * Fix code generated for division. Previously the Python code we generated
34
+ used integer division but rounded negative fractions towards negative
35
+ infinity rather than zero under Python 2, and under Python 3 used floating
36
+ point division.
37
+
38
+ Noted by David Corbett.
39
+
40
+ Code Quality Improvements
41
+ -------------------------
42
+
43
+ * C#: An `among` without functions is now generated as `static` and groupings
44
+ are now generated as constant. Patches from James Turner in #146 and #147.
45
+
46
+ Code generation improvements
47
+ ----------------------------
48
+
49
+ * General:
50
+
51
+ + Constant numeric subexpressions and constant numeric tests are now
52
+ evaluated at Snowball compile time.
53
+
54
+ Behavioural changes to existing algorithms
55
+ ------------------------------------------
56
+
57
+ * german2: Fix handling of `qu` to match algorithm description. Previously
58
+ the implementation erroneously did `skip 2` after `qu`. We suspect this was
59
+ intended to skip the `qu` but that's already been done by the substring/among
60
+ matching, so it actually skips an extra two characters.
61
+
62
+ The implementation has always differed in this way, but there's no good
63
+ reason to skip two extra characters here so overall it seems best to change
64
+ the code to match the description. This change only affects the stemming of
65
+ a single word in the sample vocabulary - `quae` which seems to actually be
66
+ Latin rather than German.
67
+
68
+ Optimisations to existing algorithms
69
+ ------------------------------------
70
+
71
+ * arabic: Handle exception cases in the among they're exceptions to.
72
+
73
+ * greek: Remove unused slice setting, handle exception cases in the among
74
+ they're exceptions to, and turn `substring ... among ... or substring ...
75
+ among ...` into a single `substring ... among ...` in cases where it is
76
+ trivial to do so.
77
+
78
+ * hindi: Eliminate the need for variable `p`.
79
+
80
+ * irish: Minor optimisation in setting `pV` and `p1`.
81
+
82
+ * yiddish: Make use of `among` more.
83
+
84
+ Compiler
85
+ --------
86
+
87
+ * Fix handling of `len` and `lenof` being declared as names.
88
+
89
+ For compatibility with programs written for older Snowball versions
90
+ len and lenof stop being tokens if declared as names. However this
91
+ code didn't work correctly if the tokeniser's name buffer needed to
92
+ be enlarged to hold the token name (i.e. 3 or 5 elements respectively).
93
+
94
+ * Report a clearer error if `=` is used instead of `==` in an integer test.
95
+
96
+ * Replace a single entry command list with its contents in the internal syntax
97
+ tree. This puts things in a more canonical form, which helps subsequent
98
+ optimisations.
99
+
100
+ Build system
101
+ ------------
102
+
103
+ * Support building on Microsoft Windows (using mingw+msys or a similar
104
+ Unix-like environment). Patch from Jannick in #129.
105
+
106
+ * Split out INCLUDES from CPPFLAGS so that CPPFLAGS can now be overridden by
107
+ the user if required. Fixes #148, reported by Dominique Leuenberger.
108
+
109
+ * Regenerate algorithms.mk only when needed rather than on every `make` run.
110
+
111
+ libstemmer
112
+ ----------
113
+
114
+ * The libstemmer static library now has a `.a` extension, rather than `.o`.
115
+ Patch from Michal Vasilek in #150.
116
+
117
+ Testsuite
118
+ ---------
119
+
120
+ * stemtest: Test that numbers and numeric codes aren't damaged by any of the
121
+ algorithms. Regression test for #66. Fixes #81.
122
+
123
+ * ada: Fix ada tests to fail if output differs. There was an extra `| head
124
+ -300` compared to other languages, which meant that the exit code of `diff`
125
+ was ignored. It seems more helpful (and is more consistent) not to limit how
126
+ many differences are shown so just drop this addition.
127
+
128
+ * go: Stop thinning testdata. It looks like we were only doing so because the test
129
+ harness code was based on that for rust, which was based on that for
130
+ javascript, which was only thinning because it was reading everything into
131
+ memory and the larger vocabulary lists were resulting in out of memory
132
+ issues.
133
+
134
+ * javascript: Speed up stemwords.js. Process input line-by-line rather than
135
+ reading the whole file into memory, splitting, iterating, and creating an
136
+ array with all the output, joining and writing out a single huge string.
137
+ This also means we can stop thinning the test data for javascript, which we
138
+ were only doing because the huge arabic test data file was causing out of
139
+ memory errors. Also drop the -p option, which isn't useful here and
140
+ complicates the code.
141
+
142
+ * rust: Turn on optimisation in the makefile rather than the CI config. This
143
+ makes the tests run in about 1/5 of the time and there's really no reason to
144
+ be thinning the testdata for rust.
145
+
146
+ Documentation
147
+ -------------
148
+
149
+ * CONTRIBUTING.rst: Improve documentation for adding a new stemming algorithm.
150
+
151
+ * Improve wording of Python docs.
152
+
153
+ Snowball 2.1.0 (2021-01-21)
154
+ ===========================
155
+
156
+ C/C++
157
+ -----
158
+
159
+ * Fix decoding of 4-byte UTF-8 sequences in `grouping` checks. This bug
160
+ affected Unicode codepoints U+40000 to U+7FFFF and U+C0000 to U+FFFFF and
161
+ doesn't affect any of the stemming algorithms we currently ship (#138,
162
+ reported by Stephane Carrez).
163
+
164
+ Python
165
+ ------
166
+
167
+ * Fix snowballstemmer.algorithms() method (#132, reported by kkaiser).
168
+
169
+ * Update code to generate trove language classifiers for PyPI. All the
170
+ natural languages we previously had stemmers for have now been added to
171
+ PyPI's list, but Armenian and Yiddish aren't on it. Patch from Dmitry
172
+ Shachnev.
173
+
174
+ Code Quality Improvements
175
+ -------------------------
176
+
177
+ * Suppress GCC warning in compiler code.
178
+
179
+ * Use `const` pointers more in C runtime.
180
+
181
+ * Only use spaces for indentation in javascript code. Change proposed by Emily
182
+ Marigold Klassen in #123, and seems to be the modern Javascript norm.
183
+
184
+ New Snowball Language Features
185
+ ------------------------------
186
+
187
+ * `lenof` and `sizeof` can now be applied to a literal string, which can be
188
+ useful if you want to do calculations on cursor values.
189
+
190
+ This change actually simplifies the language a little, since you can now use
191
+ a literal string in any read-only context which accepts a string variable.
192
+
193
+ Code generation improvements
194
+ ----------------------------
195
+
196
+ * General:
197
+
198
+ + Fix bugs in the code generated to handle failure of `goto`, `gopast` or
199
+ `try` inside `setlimit` or string-`$`. This affected all languages (though
200
+ the issue with `try` wasn't present for C). These bugs don't affect any of
201
+ the stemming algorithms we currently ship. Reported by Stefan Petkovic on
202
+ snowball-discuss.
203
+
204
+ + Change `hop` with a negative argument to work as documented. The manual
205
+ says a negative argument to hop will raise signal f, but the implementation
206
+ for all languages was actually to move the cursor in the opposite direction
207
+ to `hop` with a positive argument. The implemented behaviour is
208
+ problematic as it allows invalidating implicitly saved cursor values by
209
+ modifying the string outside the current region, so we've decided it's best
210
+ to fix the implementation to match the documentation.
211
+
212
+ The only Snowball code we're aware of which relies on this was the original
213
+ version of the new Yiddish stemming algorithm, which has been updated not
214
+ to rely on this.
215
+
216
+ The compiler now issues a warning for `hop` with a constant negative
217
+ argument (internally now converted to `false`), and for `hop` with a
218
+ constant zero argument (internally now converted to `true`).
219
+
220
+ + Canonicalise `among` actions equivalent to `()` such as `(true)` which
221
+ previously resulted in an extra case in the among, and for Python
222
+ we'd generate invalid Python code (`if` or `elif` with an empty body).
223
+ Bug revealed by Assaf Urieli's Yiddish stemmer in #137.
224
+
225
+ + Eliminate variables whose values are never used - they no longer have
226
+ corresponding member variables, etc, and no code is generated for any
227
+ assignments to them.
228
+
229
+ + Don't generate anything for an unused `grouping`.
230
+
231
+ + Stop warning "grouping X defined but not used" for a `grouping` which is
232
+ only used to define another `grouping`.
233
+
234
+ * C/C++:
235
+
236
+ + Store booleans in same array as integers. This means each boolean is
237
+ stored as an int instead of an unsigned char which means 4 bytes instead of
238
+ 1, but we save a pointer (4 or 8 bytes) in struct SN_env which is a win for
239
+ all the current stemmers. For an algorithm which uses both integers and
240
+ booleans, we also save the overhead of allocating a block on the heap, and
241
+ potentially improve data locality.
242
+
243
+ + Eliminate duplicate generated C comment for sliceto.
244
+
245
+ * Pascal:
246
+
247
+ + Avoid generating unused variables. The Pascal code generated for the
248
+ stemmers we ship is now warning free (tested with fpc 3.2.0).
249
+
250
+ * Python:
251
+
252
+ + End `if`-chain with `else` where possible, avoiding a redundant test
253
+ of the variable being switched on. This optimisation kicks in for an
254
+ `among` where all cases have commands. This change seems to speed up `make
255
+ check_python_arabic` by a few percent.
256
+
257
+ New stemming algorithms
258
+ -----------------------
259
+
260
+ * Add Serbian stemmer from stef4np (#113).
261
+
262
+ * Add Yiddish stemmer from Assaf Urieli (#137).
263
+
264
+ * Add Armenian stemmer from Astghik Mkrtchyan. It's been on the website for
265
+ over a decade, and included in Xapian for over 9 years without any negative
266
+ feedback.
267
+
268
+ Optimisations to existing algorithms
269
+ ------------------------------------
270
+
271
+ * kraaij_pohlmann: Use `$v = limit` instead of `do (tolimit setmark v)` since
272
+ this generates simpler code, and also matches the code other algorithm
273
+ implementations use.
274
+
275
+ Probably for languages like C with optimising compilers the compiler
276
+ will generate equivalent code anyway, but e.g. for Python this should be
277
+ an improvement.
278
+
279
+ Code clarity improvements to existing algorithms
280
+ ------------------------------------------------
281
+
282
+ * hindi.sbl: Fix comment typo.
283
+
284
+ Compiler
285
+ --------
286
+
287
+ * Don't count `$x = x + 1` as initialising or using `x`, so it's now handled
288
+ like `$x += 1` already is.
289
+
290
+ * Comments are now only included in the generated code if command line option
291
+ -comments is specified.
292
+
293
+ The comments in the generated code are useful if you're trying to debug the
294
+ compiler, and perhaps also if you are trying to debug your Snowball code, but
295
+ for everyone else they just bloat the code which as the number of languages
296
+ we support grows becomes more of an issue.
297
+
298
+ * `-parentclassname` is not only for java and csharp so don't disable it if
299
+ those backends are disabled.
300
+
301
+ * `-syntax` now reports the value for each numeric literal.
302
+
303
+ * Report location for excessive get nesting error.
304
+
305
+ * Internally the compiler now represents negated literal numbers as a simple
306
+ `c_number` rather than `c_neg` applied to a `c_number` with a positive value.
307
+ This simplifies optimisations that want to check for a constant numeric
308
+ expression.
309
+
310
+ Build system
311
+ ------------
312
+
313
+ * Link binaries with LDFLAGS if it's set, which is needed for some platforms
314
+ (e.g. OpenEmbedded). Patch from Andreas Müller (#120).
315
+
316
+ * Add missing dependencies of algorithms.go rule.
317
+
318
+ Testsuite
319
+ ---------
320
+
321
+ * C: Add stemtest for low-level regression tests.
322
+
323
+ Documentation
324
+ -------------
325
+
326
+ * Document a C99 compiler as a requirement for building the snowball compiler
327
+ (but the C code it generates should still work with any ISO C compiler).
328
+
329
+ A few declarations mixed with code crept in some time ago (which nobody's
330
+ complained about), so this is really just formally documenting a requirement
331
+ which already existed.
332
+
333
+ * README: Explain what Snowball is and what Stemming is (#131, reported by Sean
334
+ Kelly).
335
+
336
+ * CONTRIBUTING.rst: Expand section on adding a new generator.
337
+
338
+ * For Python snowballstemmer module include global NEWS instead of
339
+ Python-specific CHANGES.rst and use README.rst as the long description.
340
+ Patch from Dmitry Shachnev (#119).
341
+
342
+ * COPYING: Update and incorporate Python backend licensing information which
343
+ was previously in a separate file.
344
+
345
+ Snowball 2.0.0 (2019-10-02)
346
+ ===========================
347
+
348
+ C/C++
349
+ -----
350
+
351
+ * Fully handle 4-byte UTF-8 sequences. Previously `hop` and `next` handled
352
+ sequences of any length, but commands which look at the character value only
353
+ handled sequences up to length 3. Fixes #89.
354
+
355
+ * Fix handling of a 3-byte UTF-8 sequence in a grouping in `backwardmode`.
356
+
357
+ Java
358
+ ----
359
+
360
+ * TestApp.java:
361
+
362
+ - Always use UTF-8 for I/O. Patch from David Corbett (#80).
363
+
364
+ - Allow reading input from stdin.
365
+
366
+ - Remove rather pointless "stem n times" feature.
367
+
368
+ - Only lower case ASCII to match stemwords.c.
369
+
370
+ - Stem empty lines too to match stemwords.c.
371
+
372
+ Code Quality Improvements
373
+ -------------------------
374
+
375
+ * Fix various warnings from newer compilers.
376
+
377
+ * Improve use of `const`.
378
+
379
+ * Share common functions between compiler backends rather than having multiple
380
+ copies of the same code.
381
+
382
+ * Assorted code clean-up.
383
+
384
+ * Initialise line_labelled member of struct generator to 0. Previously we were
385
+ invoking undefined behaviour, though in practice it'll be zero initialised on
386
+ most platforms.
387
+
388
+ New Code Generators
389
+ -------------------
390
+
391
+ * Add Python generator (#24). Originally written by Yoshiki Shibukawa, with
392
+ additional updates by Dmitry Shachnev.
393
+
394
+ * Add Javascript generator. Based on JSX generator (#26) written by Yoshiki
395
+ Shibukawa.
396
+
397
+ * Add Rust generator from Jakob Demler (#51).
398
+
399
+ * Add Go generator from Marty Schoch (#57).
400
+
401
+ * Add C# generator. Based on patch from Cesar Souza (#16, #17).
402
+
403
+ * Add Pascal generator. Based on Delphi backend from stemming.zip file on old
404
+ website (#75).
405
+
406
+ New Snowball Language Features
407
+ ------------------------------
408
+
409
+ * Add `len` and `lenof` to measure Unicode length. These are similar to `size`
410
+ and `sizeof` (respectively), but `size` and `sizeof` return the length in
411
+ bytes under `-utf8`, whereas these new commands give the same result whether
412
+ using `-utf8`, `-widechars` or neither (but under `-utf8` they are O(n) in
413
+ the length of the string). For compatibility with existing code which might
414
+ use these as variable or function names, they stop being treated as tokens if
415
+ declared to be a variable or function.
416
+
417
+ * New `{U+1234}` stringdef notation for Unicode codepoints.
418
+
419
+ * More versatile integer tests. Now you can compare any two arithmetic
420
+ expressions with a relational operator in parentheses after the `$`, so for
421
+ example `$(len > 3)` can now be used when previously a temporary variable was
422
+ required: `$tmp = len $tmp > 3`
423
+
424
+ Code generation improvements
425
+ ----------------------------
426
+
427
+ * General:
428
+
429
+ + Avoid unnecessarily saving and restoring of the cursor for more commands -
430
+ `atlimit`, `do`, `set` and `unset` all leave the cursor alone or always
431
+ restore its value, and for C `booltest` (which other languages already
432
+ handled).
433
+
434
+ + Special case handling for `setlimit tomark AE`. All uses of setlimit in
435
+ the current stemmers we ship follow this pattern, and by special-casing we
436
+ can avoid having to save and restore the cursor (#74).
437
+
438
+ + Merge duplicate actions in the same `among`. This reduces the size of the
439
+ switch/if-chain in the generated code which dispatch the among for many of
440
+ the stemmers.
441
+
442
+ + Generate simpler code for `among`. We always check for a zero return value
443
+ when we call the among, so there's no point also checking for that in the
444
+ switch/if-chain. We can also avoid the switch/if-chain entirely when
445
+ there's only one possible outcome (besides the zero return).
446
+
447
+ + Optimise code generated for `do <function call>`. This speeds up "make
448
+ check_python" by about 2%, and should speed up other interpreted languages
449
+ too (#110).
450
+
451
+ + Generate more and better comments referencing snowball source.
452
+
453
+ + Add homepage URL and compiler version as comments in generated files.
454
+
455
+ * C/C++:
456
+
457
+ + Fix `size` and `sizeof` to not report one too high (reported by Assem
458
+ Chelli in #32).
459
+
460
+ + If signal `f` from a function call would lead to return from the current
461
+ function then handle this and bailing out on an error together with a
462
+ simple `if (ret <= 0) return ret;`
463
+
464
+ + Inline testing for single character literals.
465
+
466
+ + Avoid generating `|| 0` in a corner case - this can result in a compiler
467
+ warning when building the generated code.
468
+
469
+ + Implement `insert_v()` in terms of `insert_s()`.
470
+
471
+ + Add conditional `extern "C"` so `runtime/api.h` can be included from C++
472
+ code. Closes #90, reported by vvarma.
473
+
474
+ * Java:
475
+
476
+ + Fix functions in `among` to work in Java. We seem to need to make the
477
+ methods called from among `public` instead of `private`, and to call them
478
+ on `this` instead of the `methodObject` (which is cleaner anyway). No
479
+ revision in version control seems to generate working code for this case,
480
+ but Richard says it definitely used to work - possibly older JVMs failed to
481
+ correctly enforce the access controls when methods were invoked by
482
+ reflection.
483
+
484
+ + Code after handling `f` by returning from the current function is
485
+ unreachable too.
486
+
487
+ + Previously we incorrectly decided that code after an `or` was
488
+ unreachable in certain cases. None of the current stemmers in the
489
+ distribution triggered this, but Martin Porter's snowball version
490
+ of the Schinke Latin stemmer does. Fixes #58, reported by Alexander
491
+ Myltsev.
492
+
493
+ + The reachability logic was failing to consider reachability from
494
+ the final command in an `or`. Fixes #82, reported by David Corbett.
495
+
496
+ + Fix `maxint` and `minint`. Patch from David Corbett in #31.
497
+
498
+ + Fix `$` on strings. The previous generated code was just wrong. This
499
+ doesn't affect any of the included algorithms, but for example breaks
500
+ Martin Porter's snowball implementation of Schinke's Latin Stemmer.
501
+ Issue noted by Jakob Demler while working on the Rust backend in #51,
502
+ and reported in the Schinke's Latin Stemmer by Alexander Myltsev
503
+ in #58.
504
+
505
+ + Make SnowballProgram objects serializable. Patch from Oleg Smirnov in #43.
506
+
507
+ + Eliminate range-check implementation for groupings. This was removed from
508
+ the C generator 10 years earlier, isn't used for any of the existing
509
+ algorithms, and it doesn't seem likely it would be - the grouping would
510
+ have to consist entirely of a contiguous block of Unicode code-points.
511
+
512
+ + Simplify code generated for `repeat` and `atleast`.
513
+
514
+ + Eliminate unused return values and variables from runtime functions.
515
+
516
+ + Only import the `among` and `SnowballProgram` classes if they're actually
517
+ used.
518
+
519
+ + Only generate `copy_from()` method if it's used.
520
+
521
+ + Merge runtime functions `eq_s` and `eq_v`.
522
+
523
+ + Java arrays know their own length so stop storing it separately.
524
+
525
+ + Escape char 127 (DEL) in generated Java code. It's unlikely that this
526
+ character would actually be used in a real stemmer, so this was more of a
527
+ theoretical bug.
528
+
529
+ + Drop unused import of InvocationTargetException from SnowballStemmer.
530
+ Reported by GerritDeMeulder in #72.
531
+
532
+ + Fix lint check issues in generated Java code. The stemmer classes are only
533
+ referenced in the example app via reflection, so add
534
+ @SuppressWarnings("unused") for them. The stemmer classes override
535
+ equals() and hashCode() methods from the standard java Object class, so
536
+ mark these with @Override. Both suggested by GerritDeMeulder in #72.
537
+
538
+ + Declare Java variables at point of use in generated code. Putting all
539
+ declarations at the top of the function was adding unnecessary complexity
540
+ to the Java generator code for no benefit.
541
+
542
+ + Improve formatting of generated code.
543
+
544
+ New stemming algorithms
545
+ -----------------------
546
+
547
+ * Add Tamil stemmer from Damodharan Rajalingam (#2, #3).
548
+
549
+ * Add Arabic stemmer from Assem Chelli (#32, #50).
550
+
551
+ * Add Irish stemmer from Jim O'Regan (#48).
552
+
553
+ * Add Nepali stemmer from Arthur Zakirov (#70).
554
+
555
+ * Add Indonesian stemmer from Olly Betts (#71).
556
+
557
+ * Add Hindi stemmer from Olly Betts (#73). Thanks to David Corbett for review.
558
+
559
+ * Add Lithuanian stemmer from Dainius Jocas (#22, #76).
560
+
561
+ * Add Greek stemmer from Oleg Smirnov (#44).
562
+
563
+ * Add Catalan and Basque stemmers from Israel Olalla (#104).
564
+
565
+ Behavioural changes to existing algorithms
566
+ ------------------------------------------
567
+
568
+ * Portuguese:
569
+
570
+ + Replace incorrect Spanish suffixes by Portuguese suffixes (#1).
571
+
572
+ * French:
573
+
574
+ + The MSDOS CP850 version of the French algorithm was missing changes present
575
+ in the ISO8859-1 and Unicode versions. There's now a single version of
576
+ each algorithm which was based on the Unicode version.
577
+
578
+ + Recognize French suffixes even when they begin with diaereses. Patch from
579
+ David Corbett in #78.
580
+
581
+ * Russian:
582
+
583
+ + We now normalise 'ё' to 'е' before stemming. The documentation has long
584
+ said "we assume ['ё'] is mapped into ['е']" but it's more convenient for
585
+ the stemmer to actually perform this normalisation. This change has no
586
+ effect if the caller is already normalising as we recommend. It's a change
587
+ in behaviour if they aren't, but 'ё' occurs rarely (there are currently no
588
+ instances in our test vocabulary) and this improves behaviour when it does
589
+ occur. Patch from Eugene Mirotin (#65, #68).
590
+
591
+ * Finnish:
592
+
593
+ + Adjust the Finnish algorithm not to mangle numbers. This change also
594
+ means it tends to leave foreign words alone. Fixes #66.
595
+
596
+ * Danish:
597
+
598
+ + Adjust Danish algorithm not to mangle alphanumeric codes. In particular
599
+ alphanumeric codes ending in a double digit (e.g. 0x0e00, hal9000,
600
+ space1999) are no longer mangled. See #81.
601
+
602
+ Optimisations to existing algorithms
603
+ ------------------------------------
604
+
605
+ * Turkish:
606
+
607
+ + Simplify uses of `test` in stemmer code.
608
+
609
+ + Check for 'ad' or 'soyad' more efficiently, and without needing the
610
+ strlen variable. This speeds up "make check_utf8_turkish" by 11%
611
+ on x86 Linux.
612
+
613
+ * Kraaij-Pohlmann:
614
+
615
+ Eliminate variable x - `$p1 <= cursor` is simpler and a little more efficient
616
+ than `setmark x $x >= p1`.
617
+
618
+ Code clarity improvements to existing algorithms
619
+ ------------------------------------------------
620
+
621
+ * Turkish:
622
+
623
+ Use `,` for cedilla to match the conventions used in other stemmers.
624
+
625
+ * Kraaij-Pohlmann:
626
+
627
+ + Avoid cryptic `[among ( (])` ... `)` construct - instead use the same
628
+ `[substring] among (` ... `)` construct we do in other stemmers.
629
+
630
+ Compiler
631
+ --------
632
+
633
+ * Support conventional --help and --version options.
634
+
635
+ * Warn if -r or -ep used with backend other than C/C++.
636
+
637
+ * Warn if encoding command line options are specified when generating code in a
638
+ language with a fixed encoding.
639
+
640
+ * The default classname is now set based on the output filename, so `-n` is now
641
+ often no longer needed. Fixes #64.
642
+
643
+ * Avoid potential one byte buffer over-read when parsing snowball code.
644
+
645
+ * Avoid comparing with uninitialised array element during compilation.
646
+
647
+ * Improve `-syntax` output for `setlimit L for C`.
648
+
649
+ * Optimise away double negation so generators don't have to worry about
650
+ generating `--` (decrement operator in many languages). Fixes #52, reported
651
+ by David Corbett.
652
+
653
+ * Improved compiler error and warning messages:
654
+
655
+ - We now report FILE:LINE: before each diagnostic message.
656
+
657
+ - Improve warnings for unused declarations/definitions.
658
+
659
+ - Warn for variables which are used, but either never initialised
660
+ or never read.
661
+
662
+ - Flag non-ASCII literal strings. This is an error for wide Unicode, but
663
+ only a warning for single-byte and UTF-8 which work so long as the source
664
+ encoding matches the encoding used in the generated stemmer code.
665
+
666
+ - Improve error recovery after an undeclared `define`. We now sniff the
667
+ token after the identifier and if it is `as` we parse as a routine,
668
+ otherwise we parse as a grouping. Previously we always just assumed it was
669
+ a routine, which gave a confusing second error if it was a grouping.
670
+
671
+ - Improve error recovery after an unexpected token in `among`. Previously
672
+ we acted as if the unexpected token closed the `among` (this probably
673
+ wasn't intended but just a missing `break;` in a switch statement). Now we
674
+ issue an error and try the next token.
675
+
676
+ * Report error instead of silently truncating character values (e.g. `hex 123`
677
+ previously silently became byte 0x23 which is `#` rather than a
678
+ g-with-cedilla).
679
+
680
+ * Enlarge the initial input buffer size to 8192 bytes and double each time we
681
+ hit the end. Snowball programs are typically a few KB in size (with the
682
+ current largest we ship being the Greek stemmer at 27KB) so the previous
683
+ approach of starting with a 10 byte input buffer and increasing its size by
684
+ 50% plus 40 bytes each time it filled was inefficient, needing up to 15
685
+ reallocations to load greek.sbl.
686
+
687
+ * Identify variables only used by one `routine`/`external`. This information
688
+ isn't yet used, but such variables which are also always written to before
689
+ being read can be emitted as local variables in most target languages.
690
+
691
+ * We now allow multiple source files on command line, and allow them to be
692
+ after (or even interspersed with) options to better match modern Unix
693
+ conventions. Support for multiple source files allows specifying a single
694
+ byte character set mapping via a source file of `stringdef`.
695
+
696
+ * Avoid infinite recursion in compiler when optimising a recursive snowball
697
+ function. Recursive functions aren't typical in snowball programs, but
698
+ the compiler shouldn't crash for any input, especially not a valid one.
699
+ We now simply limit how deep the compiler will recurse and make the
700
+ pessimistic assumption in the unlikely event we hit this limit.
701
+
702
+ Build system
703
+ ------------
704
+
705
+ * `make clean` in C libstemmer_c distribution now removes `examples/*.o`.
706
+ (#59)
707
+
708
+ * Fix all the places which previously had to have a list of stemmers to work
709
+ dynamically or be generated, so now only modules.txt needs updating to add
710
+ a new stemmer.
711
+
712
+ * Add check_java make target which runs tests for java.
713
+
714
+ * Support gzipped test data (the uncompressed arabic test data is too big for
715
+ github).
716
+
717
+ * GNUmakefile: Drop useless `-eprefix` and `-r` options from snowball
718
+ invocations for Java - these are only meaningful when generating C code.
719
+
720
+ * Pass CFLAGS when linking which matches convention (e.g. automake does it) and
721
+ facilitates use of tools such as ASan. Fixes #84, reported by Thomas
722
+ Pointhuber.
723
+
724
+ * Add CI builds with -std=c90 to check compiler and generated code are C90
725
+ (#54)
726
+
727
+ libstemmer
728
+ ----------
729
+
730
+ * Split out CPPFLAGS from CFLAGS and use CFLAGS when linking stemwords.
731
+
732
+ * Add -O2 to CFLAGS.
733
+
734
+ * Make generated tables of encodings and modules const.
735
+
736
+ * Fix clang static analyzer memory leak warning (in practice this code path
737
+ can never actually be taken). Patch from Patrick O. Perry (#56)
738
+
739
+ Documentation
740
+ -------------
741
+
742
+ * Added copyright and licensing details (#10).
743
+
744
+ * Document that libstemmer supports ISO_8859_2 encoding. Currently hungarian
745
+ and romanian are available in ISO_8859_2.
746
+
747
+ * Remove documentation falsely claiming that libstemmer supports CP850
748
+ encoding.
749
+
750
+ * CONTRIBUTING.rst: Add guidance for contributing new stemming algorithms and
751
+ new language backends.
752
+
753
+ * Overhaul libstemmer_python_README. Most notably, replace the benchmark data
754
+ which was very out of date.