mittens 0.2.0 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (98)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +4 -0
  3. data/README.md +3 -3
  4. data/lib/mittens/version.rb +1 -1
  5. data/vendor/snowball/.github/workflows/ci.yml +216 -0
  6. data/vendor/snowball/CONTRIBUTING.rst +111 -62
  7. data/vendor/snowball/GNUmakefile +194 -136
  8. data/vendor/snowball/NEWS +798 -3
  9. data/vendor/snowball/README.rst +50 -1
  10. data/vendor/snowball/ada/src/stemmer.adb +25 -13
  11. data/vendor/snowball/ada/src/stemmer.ads +9 -9
  12. data/vendor/snowball/ada/stemmer_config.gpr +7 -7
  13. data/vendor/snowball/algorithms/basque.sbl +4 -19
  14. data/vendor/snowball/algorithms/catalan.sbl +2 -9
  15. data/vendor/snowball/algorithms/danish.sbl +1 -1
  16. data/vendor/snowball/algorithms/dutch.sbl +284 -122
  17. data/vendor/snowball/algorithms/dutch_porter.sbl +178 -0
  18. data/vendor/snowball/algorithms/english.sbl +52 -37
  19. data/vendor/snowball/algorithms/esperanto.sbl +157 -0
  20. data/vendor/snowball/algorithms/estonian.sbl +269 -0
  21. data/vendor/snowball/algorithms/finnish.sbl +2 -3
  22. data/vendor/snowball/algorithms/french.sbl +42 -16
  23. data/vendor/snowball/algorithms/german.sbl +35 -14
  24. data/vendor/snowball/algorithms/greek.sbl +76 -76
  25. data/vendor/snowball/algorithms/hungarian.sbl +8 -6
  26. data/vendor/snowball/algorithms/indonesian.sbl +14 -8
  27. data/vendor/snowball/algorithms/italian.sbl +11 -21
  28. data/vendor/snowball/algorithms/lithuanian.sbl +36 -37
  29. data/vendor/snowball/algorithms/lovins.sbl +0 -1
  30. data/vendor/snowball/algorithms/nepali.sbl +138 -37
  31. data/vendor/snowball/algorithms/norwegian.sbl +19 -5
  32. data/vendor/snowball/algorithms/porter.sbl +2 -2
  33. data/vendor/snowball/algorithms/portuguese.sbl +9 -13
  34. data/vendor/snowball/algorithms/romanian.sbl +17 -4
  35. data/vendor/snowball/algorithms/serbian.sbl +467 -468
  36. data/vendor/snowball/algorithms/spanish.sbl +5 -7
  37. data/vendor/snowball/algorithms/swedish.sbl +60 -6
  38. data/vendor/snowball/algorithms/tamil.sbl +207 -176
  39. data/vendor/snowball/algorithms/turkish.sbl +461 -445
  40. data/vendor/snowball/algorithms/yiddish.sbl +36 -38
  41. data/vendor/snowball/compiler/analyser.c +445 -192
  42. data/vendor/snowball/compiler/driver.c +109 -101
  43. data/vendor/snowball/compiler/generator.c +853 -464
  44. data/vendor/snowball/compiler/generator_ada.c +404 -366
  45. data/vendor/snowball/compiler/generator_csharp.c +297 -260
  46. data/vendor/snowball/compiler/generator_go.c +323 -254
  47. data/vendor/snowball/compiler/generator_java.c +326 -252
  48. data/vendor/snowball/compiler/generator_js.c +362 -252
  49. data/vendor/snowball/compiler/generator_pascal.c +349 -197
  50. data/vendor/snowball/compiler/generator_python.c +257 -240
  51. data/vendor/snowball/compiler/generator_rust.c +423 -251
  52. data/vendor/snowball/compiler/header.h +117 -71
  53. data/vendor/snowball/compiler/space.c +137 -68
  54. data/vendor/snowball/compiler/syswords.h +2 -2
  55. data/vendor/snowball/compiler/tokeniser.c +125 -107
  56. data/vendor/snowball/csharp/Snowball/Among.cs +14 -14
  57. data/vendor/snowball/csharp/Snowball/AssemblyInfo.cs +7 -7
  58. data/vendor/snowball/csharp/Snowball/Stemmer.cs +57 -37
  59. data/vendor/snowball/csharp/Stemwords/App.config +2 -2
  60. data/vendor/snowball/csharp/Stemwords/Program.cs +16 -12
  61. data/vendor/snowball/doc/libstemmer_c_README +7 -4
  62. data/vendor/snowball/doc/libstemmer_csharp_README +4 -1
  63. data/vendor/snowball/doc/libstemmer_java_README +12 -1
  64. data/vendor/snowball/doc/libstemmer_js_README +6 -4
  65. data/vendor/snowball/doc/libstemmer_python_README +9 -4
  66. data/vendor/snowball/examples/stemwords.c +12 -12
  67. data/vendor/snowball/go/env.go +107 -31
  68. data/vendor/snowball/go/util.go +0 -4
  69. data/vendor/snowball/include/libstemmer.h +4 -0
  70. data/vendor/snowball/java/org/tartarus/snowball/Among.java +32 -15
  71. data/vendor/snowball/java/org/tartarus/snowball/SnowballProgram.java +347 -261
  72. data/vendor/snowball/java/org/tartarus/snowball/SnowballStemmer.java +3 -0
  73. data/vendor/snowball/java/org/tartarus/snowball/TestApp.java +52 -37
  74. data/vendor/snowball/javascript/base-stemmer.js +186 -2
  75. data/vendor/snowball/javascript/stemwords.js +3 -6
  76. data/vendor/snowball/libstemmer/libstemmer_c.in +1 -1
  77. data/vendor/snowball/libstemmer/mkalgorithms.pl +6 -6
  78. data/vendor/snowball/libstemmer/mkmodules.pl +2 -2
  79. data/vendor/snowball/libstemmer/modules.txt +13 -10
  80. data/vendor/snowball/libstemmer/test.c +1 -1
  81. data/vendor/snowball/pascal/SnowballProgram.pas +84 -2
  82. data/vendor/snowball/pascal/generate.pl +13 -13
  83. data/vendor/snowball/python/create_init.py +4 -1
  84. data/vendor/snowball/python/setup.cfg +0 -3
  85. data/vendor/snowball/python/setup.py +8 -3
  86. data/vendor/snowball/python/snowballstemmer/basestemmer.py +20 -54
  87. data/vendor/snowball/python/stemwords.py +8 -12
  88. data/vendor/snowball/runtime/api.c +10 -5
  89. data/vendor/snowball/runtime/header.h +10 -9
  90. data/vendor/snowball/runtime/utilities.c +9 -9
  91. data/vendor/snowball/rust/build.rs +1 -1
  92. data/vendor/snowball/rust/src/snowball/snowball_env.rs +83 -5
  93. data/vendor/snowball/tests/stemtest.c +7 -4
  94. metadata +7 -7
  95. data/vendor/snowball/.travis.yml +0 -112
  96. data/vendor/snowball/algorithms/german2.sbl +0 -145
  97. data/vendor/snowball/algorithms/kraaij_pohlmann.sbl +0 -240
  98. data/vendor/snowball/compiler/syswords2.h +0 -13
@@ -62,9 +62,9 @@ backwardmode (
62
62
  // prefix not in {ke, peng, per}
63
63
  define SUFFIX_KAN_OK as (
64
64
  // On page 29, the example "kompas Q.31" says "Both Nazief and Porter
65
- // stemmer converted the word peledakan (blast, explotion) to ledak (to
66
- // blast, to explode)". However, the algorithm as described doesn't
67
- // behave in this way - grammatically the prefix pe- occurs as a
65
+ // stemmer converted the word peledakan (blast, explotion [sic]) to
66
+ // ledak (to blast, to explode)". However, the algorithm as described
67
+ // doesn't behave in this way - grammatically the prefix pe- occurs as a
68
68
  // variation of both the first-order derivational prefix peng- and the
69
69
  // second-order derivational prefix per-, but table 2.5 doesn't include
70
70
  // "pe", only table 2.6 does, so "peledakan" is handled (incorrectly)
@@ -104,13 +104,19 @@ backwardmode (
104
104
  //
105
105
  // Elsewhere the paper defines V... as meaning "the stem starts with
106
106
  // a vowel" and K... as meaning "the stem starts with a consonant".
107
+ // The meaning of | isn't actually defined, but clearly means
108
+ // alternation.
107
109
  //
108
- // In other places where it says X|Y... it seems the | binds more
110
+ // However nowhere is the precedence of | vs ... defined, and there
111
+ // isn't a standard precedence we could reasonably assume. In other
112
+ // places where the paper says X|Y... it seems the | binds more
109
113
  // tightly, so it's (V|K)...cᵢcⱼ not V|(K...cᵢcⱼ). That seems a bit
110
114
  // odd as the first letter must be either a vowel or a consonant, so
111
- // that really just means "ends cᵢcⱼ". However, nowhere in the paper
112
- // uses or defines a notation such as ...X, which may explain this
113
- // seemingly redundant way of specifying this.
115
+ // that really just means "ends cᵢcⱼ" (and has at least one letter
116
+ // before cᵢ but we only call SUFFIX_I_OK if $measure > 2 which
117
+ // ensures that part). However, nowhere in the paper uses or defines
118
+ // a notation such as ...X, which may explain this seemingly redundant
119
+ // way of specifying this.
114
120
  //
115
121
  // The conditions elsewhere on prefix removal (e.g. V...) are clearly
116
122
  // on the stem left after the prefix is removed. None of the other
@@ -118,7 +124,7 @@ backwardmode (
118
124
  // consistency with the prefix rules we might expect that the cᵢcⱼ
119
125
  // test is on what's left *after* removing the "i" suffix.
120
126
  //
121
- // However, studying Indonesian wordlists and discussion with a native
127
+ // Studying Indonesian wordlists and discussion with a native
122
128
  // speaker leads us to conclude that the purpose of this check is to
123
129
  // protect words of foreign origin (e.g. "televisi", "organisasi",
124
130
  // "komunikasi") from stemming, and the common feature of these is
@@ -1,6 +1,5 @@
1
1
 
2
2
  routines (
3
- exceptions
4
3
  prelude postlude mark_regions
5
4
  RV R1 R2
6
5
  attached_pronoun
@@ -59,6 +58,8 @@ define mark_regions as (
59
58
  do (
60
59
  ( v (non-v gopast v) or (v gopast non-v) )
61
60
  or
61
+ 'divan' // Otherwise "divano" stems to "div" and collides with "diva".
62
+ or
62
63
  ( non-v (non-v gopast v) or (v next) )
63
64
  setmark pV
64
65
  )
@@ -108,7 +109,7 @@ backwardmode (
108
109
  'ismi' 'abile' 'abili' 'ibile' 'ibili' 'ista' 'iste' 'isti'
109
110
  'ist{a`}' 'ist{e`}' 'ist{i`}' 'oso' 'osi' 'osa' 'ose' 'mente'
110
111
  'atrice' 'atrici'
111
- 'ante' 'anti' // Note 1
112
+ 'ante' 'anti'
112
113
  ( R2 delete )
113
114
  'azione' 'azioni' 'atore' 'atori'
114
115
  ( R2 delete
@@ -179,24 +180,13 @@ backwardmode (
179
180
  )
180
181
  )
181
182
 
182
- define exceptions as (
183
- ['divano' atlimit ] <- 'divan' // Otherwise "divano" stems to "div" and collides with "diva"
184
- )
185
-
186
183
  define stem as (
187
- exceptions or (
188
- do prelude
189
- do mark_regions
190
- backwards (
191
- do attached_pronoun
192
- do (standard_suffix or verb_suffix)
193
- do vowel_suffix
194
- )
195
- do postlude
196
- )
184
+ do prelude
185
+ do mark_regions
186
+ backwards (
187
+ do attached_pronoun
188
+ do (standard_suffix or verb_suffix)
189
+ do vowel_suffix
190
+ )
191
+ do postlude
197
192
  )
198
-
199
- /*
200
- Note 1: additions of 15 Jun 2005
201
- */
202
-
@@ -5,10 +5,10 @@ stringescapes { }
5
5
 
6
6
  /* Special characters in Unicode Latin Extended-A */
7
7
  // ' nosine
8
- stringdef a' '{U+0105}' // ą a + ogonek
9
- stringdef e' '{U+0119}' // ę e + ogonek
10
- stringdef i' '{U+012F}' // į i + ogonek
11
- stringdef u' '{U+0173}' // ų u + ogonek
8
+ stringdef ak '{U+0105}' // ą a + ogonek
9
+ stringdef ek '{U+0119}' // ę e + ogonek
10
+ stringdef ik '{U+012F}' // į i + ogonek
11
+ stringdef uk '{U+0173}' // ų u + ogonek
12
12
 
13
13
  // . taskas
14
14
  stringdef e. '{U+0117}' // ė e + dot
@@ -16,10 +16,10 @@ stringdef e. '{U+0117}' // ė e + dot
16
16
  // - ilgoji
17
17
  stringdef u- '{U+016B}' // ū u + macron
18
18
 
19
- // * varnele
20
- stringdef c* '{U+010D}' // č c + caron (haček)
21
- stringdef s* '{U+0161}' // š s + caron (haček)
22
- stringdef z* '{U+017E}' // ž z + caron (haček)
19
+ // v varnele
20
+ stringdef cv '{U+010D}' // č c + caron (haček)
21
+ stringdef sv '{U+0161}' // š s + caron (haček)
22
+ stringdef zv '{U+017E}' // ž z + caron (haček)
23
23
 
24
24
  // [C](VC)^m[V|C]
25
25
  // definitions of variables for
@@ -31,31 +31,30 @@ integers ( p1 )
31
31
  groupings ( v )
32
32
 
33
33
  // v - all lithuanian vowels
34
- define v 'aeiyou{a'}{e'}{i'}{u'}{e.}{u-}'
34
+ define v 'aeiyou{ak}{ek}{ik}{uk}{e.}{u-}'
35
35
 
36
36
  // all lithuanian stemmer routines: 4 steps
37
37
  routines (
38
- step2 R1 step1 fix_chdz fix_gd fix_conflicts
38
+ step2 step1 fix_chdz fix_gd fix_conflicts
39
39
  )
40
40
 
41
41
  backwardmode (
42
42
 
43
- define R1 as $p1 <= cursor
44
43
  define step1 as (
45
- setlimit tomark p1 for ([substring]) R1 among (
44
+ setlimit tomark p1 for ([substring]) among (
46
45
  // Daiktavardžiai (Nouns)
47
46
  // I linksniuotė (declension I)
48
47
  'as' 'ias' 'is' 'ys' // vyras, kelias, brolis, gaidys
49
48
  'o' 'io' // vyro, kelio
50
49
  'ui' 'iui' // vyrui, keliui
51
- '{a'}' 'i{a'}' '{i'}' // vyrą, kelią, brolį
50
+ '{ak}' 'i{ak}' '{ik}' // vyrą, kelią, brolį
52
51
  'u' 'iu' // vyru, keliu
53
52
  'e' 'yje' // vyre, kelyje
54
53
  'y' 'au' 'i' // kely, brolau, broli,
55
54
  'an' // nusižengiman
56
55
 
57
56
  'ai' 'iai' // vyrai, keliai
58
- '{u'}' 'i{u'}' // vyrų, kelių
57
+ '{uk}' 'i{uk}' // vyrų, kelių
59
58
  'ams' 'am' // vyrams, vyram
60
59
  'iams' 'iam' // broliams, broliam
61
60
  'us' 'ius' // vyrus, brolius
@@ -65,9 +64,9 @@ backwardmode (
65
64
  'ysna' // žutysna
66
65
 
67
66
  'asis' 'aisi' // sukimasis, sukimaisi
68
- 'osi' '{u'}si' // sukimosi, sukimųsi
67
+ 'osi' '{uk}si' // sukimosi, sukimųsi
69
68
  'uisi' // sukimuisi
70
- '{a'}si' // sukimąsi
69
+ '{ak}si' // sukimąsi
71
70
  'usi' // sukimusi
72
71
  'esi' // sukimesi
73
72
 
@@ -89,7 +88,7 @@ backwardmode (
89
88
  '{e.}' // gervė
90
89
  '{e.}s' // gervės
91
90
  'ei' // gervei
92
- '{e'}' // gervę
91
+ '{ek}' // gervę
93
92
  '{e.}j' '{e.}je' // gervėj, gervėje
94
93
  '{e.}ms' // gervėms
95
94
  'es' // gerves
@@ -114,14 +113,14 @@ backwardmode (
114
113
  // V linksniuote (declension V)
115
114
  'ies' 'ens' 'enio' 'ers' // avies, vandens, sesers
116
115
  'eniui' 'eriai' // vandeniui, eriai
117
- 'en{i'}' 'er{i'}' // vandenį, seserį
116
+ 'en{ik}' 'er{ik}' // vandenį, seserį
118
117
  'imi' 'eniu' 'erimi' 'eria' // avimi, vandeniu, seserimi, seseria
119
118
  'enyje' 'eryje' // vandenyje, seseryje
120
119
  'ie' 'enie' 'erie' // avie, vandenie, seserie
121
120
 
122
121
  'enys' 'erys' // vandenys, seserys
123
- // 'en{u'}' konfliktas su 'žandenų' 'antenų'
124
- 'er{u'}' // seserų
122
+ // 'en{uk}' konfliktas su 'žandenų' 'antenų'
123
+ 'er{uk}' // seserų
125
124
  'ims' 'enims' 'erims' // avims, vandemins, seserims
126
125
  'enis' // vandenis
127
126
  'imis' // žebenkštimis
@@ -184,11 +183,11 @@ backwardmode (
184
183
  'sit' 'site' // gersit, gersite
185
184
 
186
185
  // tariamoji nuosaka (subjunctive mood)
187
- '{c*}iau' '{c*}iausi' // dirbčiau
186
+ '{cv}iau' '{cv}iausi' // dirbčiau
188
187
  'tum' 'tumei' // dirbtum, dirbtumei
189
188
  'tumeis' 'tumeisi' // mokytumeis, mokytumeisi
190
- // 't{u'}' nes blogai batutų -> batų
191
- 't{u'}si' // mokytųsi
189
+ // 't{uk}' nes blogai batutų -> batų
190
+ 't{uk}si' // mokytųsi
192
191
  // 'tume' konfliktas su 'šventume'
193
192
  'tum{e.}m' // dirbtumėm
194
193
  'tum{e.}me' // dirbtumėme
@@ -219,8 +218,8 @@ backwardmode (
219
218
  // 'tis' konfliktas, nes rytme-tis -> rytme
220
219
 
221
220
  // dalyviai (participles)
222
- '{a'}s' 'i{a'}s' '{i'}s' // dirbąs, žaidžiąs, gulįs
223
- 't{u'}s' // suktųs -> suk
221
+ '{ak}s' 'i{ak}s' '{ik}s' // dirbąs, žaidžiąs, gulįs
222
+ 't{uk}s' // suktųs -> suk
224
223
  'sim{e.}s' // suksimės
225
224
  'sit{e.}s' // suksitės
226
225
  'kite' // supkite
@@ -236,7 +235,7 @@ backwardmode (
236
235
  // budvardziu priesagos (Adjective suffixes)
237
236
  // 'in' // konfliktas su 'augintinis' ir 'akiniais' // lauk-in-is
238
237
  'ing' // tvark-ing-as
239
- 'i{s*}k' // lenk-išk-as
238
+ 'i{sv}k' // lenk-išk-as
240
239
  '{e.}t' // dem-ėt-as
241
240
  'ot' // garban-ot-as
242
241
  'uot' 'iuot' // lang-uot-as, akin-iuot-as
@@ -247,15 +246,15 @@ backwardmode (
247
246
  'iul' // maž-ul-is
248
247
  '{e.}l' // maž-ėl-is
249
248
  'yl' // maž-yl-is
250
- 'u{c*}iuk' // maž-učiuk-as
249
+ 'u{cv}iuk' // maž-učiuk-as
251
250
  'uliuk' // maž-uliuk-as
252
251
  'ut{e.}ait' // maž-utėlait-is
253
252
  'ok' // did-ok-as
254
253
  'iok' // višč-iok-as
255
- 'sv' '{s*}v' 'zgan' // sal-sv-as, pilk-šv-as, bal-zgan-as
254
+ 'sv' '{sv}v' 'zgan' // sal-sv-as, pilk-šv-as, bal-zgan-as
256
255
  'op' 'iop' // dvej-op-as, viener-iop-as
257
256
  'ain' // apval-ain-as
258
- 'yk{s*}t' 'yk{s*}{c*}' // ten-ykšt-is, vakar-ykšč-ias
257
+ 'yk{sv}t' 'yk{sv}{cv}' // ten-ykšt-is, vakar-ykšč-ias
259
258
 
260
259
  // laisniai
261
260
  'esn' // did-esn-is
@@ -266,17 +265,17 @@ backwardmode (
266
265
  'ias' // žaliasis
267
266
  'oj' 'ioj' // gerojo, žaliojo
268
267
  'aj' 'iaj' // gerajam, žaliajam
269
- '{a'}j' 'i{a'}j' // garąjį, žaliąjį
268
+ '{ak}j' 'i{ak}j' // garąjį, žaliąjį
270
269
  'uoj' 'iuoj' // geruoju, žaliuoju
271
270
  'iej' // gerieji
272
- '{u'}j' 'i{u'}j' // gerųjų, žaliųjų
271
+ '{uk}j' 'i{uk}j' // gerųjų, žaliųjų
273
272
  'ies' // geriesiems
274
273
  'uos' 'iuos' // geruosius, žaliuosius
275
274
  'ais' 'iais' // geraisiais, žaliaisiais
276
275
 
277
276
  // moteriska gimine (Female gender)
278
277
  'os' 'ios' // gerosios, žaliosios
279
- '{a'}s' 'i{a'}s' // gerąsios, žaliąsias
278
+ '{ak}s' 'i{ak}s' // gerąsios, žaliąsias
280
279
 
281
280
  // būtasis dažninis laikas (frequentative past tense)
282
281
  'dav' // ei-dav-o
@@ -285,9 +284,9 @@ backwardmode (
285
284
  'ant' 'iant'
286
285
  'int' // tur-int-is
287
286
  '{e.}j' // tur-ėj-o
288
- '{e'}' //
289
- '{e.}j{e'}'
290
- '{e'}s' // dirb-ęs-is
287
+ '{ek}' //
288
+ '{e.}j{ek}'
289
+ '{ek}s' // dirb-ęs-is
291
290
 
292
291
  'siant' // dirb-siant
293
292
 
@@ -336,8 +335,8 @@ backwardmode (
336
335
 
337
336
  define fix_chdz as (
338
337
  [substring] among (
339
- '{c*}' (<-'t')
340
- 'd{z*}' (<-'d')
338
+ '{cv}' (<-'t')
339
+ 'd{zv}' (<-'d')
341
340
  )
342
341
  )
343
342
 
@@ -205,4 +205,3 @@ define stem as (
205
205
  do respell
206
206
  )
207
207
  )
208
-
@@ -6,10 +6,9 @@
6
6
  */
7
7
 
8
8
  routines (
9
- remove_category_1
10
- check_category_2
11
- remove_category_2
12
- remove_category_3
9
+ remove_category_1
10
+ remove_category_2
11
+ remove_category_3
13
12
  )
14
13
 
15
14
  stringescapes {}
@@ -49,44 +48,146 @@ stringdef dvsau '{U+094C}' // DEVANAGARI_VOWEL_SIGN_AU
49
48
  stringdef dsv '{U+094D}' // DEVANAGARI_SIGN_VIRAMA
50
49
 
51
50
  externals ( stem )
52
- backwardmode (
53
- define remove_category_1 as(
54
- [substring] among (
55
- '{dlm}{dvsaa}{dlr}{dsv}{dlpha}{dlta}' '{dld}{dsv}{dlv}{dvsaa}{dlr}{dvsaa}' '{dls}{dsc}{dlg}{dvsai}' '{dls}{dsa}{dlg}'
56
- '{dls}{dsc}{dlg}' '{dll}{dvsaa}{dli}' '{dll}{dvsaa}{dlii}' '{dlpa}{dlc}{dvsi}'
57
- '{dll}{dvse}' '{dlr}{dlta}' '{dlm}{dvsai}' '{dlm}{dvsaa}'
58
- (delete)
59
- '{dlka}{dvso}' '{dlka}{dvsaa}' '{dlka}{dvsi}' '{dlka}{dvsii}' '{dlka}{dvsai}'(('{dle}' or '{dvse}' ()) or delete)
60
- )
61
- )
62
51
 
63
- define check_category_2 as(
64
- [substring] among(
65
- '{dsc}' '{dsa}' '{dvsai}'
66
- )
67
- )
52
+ backwardmode (
53
+ define remove_category_1 as(
54
+ [substring] among (
55
+ '{dlm}{dvsaa}{dlr}{dsv}{dlpha}{dlta}'
56
+ '{dld}{dsv}{dlv}{dvsaa}{dlr}{dvsaa}'
57
+ '{dls}{dsc}{dlg}{dvsai}'
58
+ '{dls}{dsa}{dlg}'
59
+ '{dls}{dsc}{dlg}'
60
+ '{dll}{dvsaa}{dli}'
61
+ '{dll}{dvsaa}{dlii}'
62
+ '{dlpa}{dlc}{dvsi}'
63
+ '{dll}{dvse}'
64
+ '{dlr}{dlta}'
65
+ '{dlm}{dvsai}'
66
+ '{dlm}{dvsaa}'
67
+ (delete)
68
+ '{dlka}{dvso}'
69
+ '{dlka}{dvsaa}'
70
+ '{dlka}{dvsi}'
71
+ '{dlka}{dvsii}'
72
+ '{dlka}{dvsai}'
73
+ ('{dle}' or '{dvse}' or delete)
74
+ )
75
+ )
68
76
 
69
- define remove_category_2 as (
70
- [substring] among(
71
- '{dsc}' '{dsa}' ('{dly}{dvsau}' or '{dlc}{dvsau}' or '{dln}{dvsau}' or '{dltha}{dvse}' delete)
72
- '{dvsai}' ('{dlta}{dsv}{dlr}' delete)
73
- )
74
- )
77
+ define remove_category_2 as (
78
+ [substring] among(
79
+ '{dsc}' '{dsa}'
80
+ ('{dly}{dvsau}' or '{dlc}{dvsau}' or '{dln}{dvsau}' or '{dltha}{dvse}' delete)
81
+ '{dvsai}'
82
+ ('{dlta}{dsv}{dlr}' delete)
83
+ )
84
+ )
75
85
 
76
- define remove_category_3 as(
77
- [substring] among(
78
- '{dltha}{dvsi}{dli}{dls}{dsv}' '{dlh}{dvsu}{dln}{dvse}{dlc}' '{dlh}{dvsu}{dln}{dsv}{dlc}' '{dln}{dvse}{dlc}{dls}{dsv}' '{dln}{dvse}{dlc}{dln}{dsv}' '{dli}{dle}{dlka}{dvsii}' '{dli}{dle}{dlka}{dvsaa}' '{dli}{dle}{dlka}{dvso}' '{dvsi}{dle}{dlka}{dvsii}' '{dvsi}{dle}{dlka}{dvsaa}' '{dvsi}{dle}{dlka}{dvso}' '{dli}{dlc}{dln}{dsv}' '{dvsi}{dlc}{dln}{dsv}' '{dli}{dlc}{dls}{dsv}' '{dvsi}{dlc}{dls}{dsv}' '{dle}{dlc}{dln}{dsv}' '{dvse}{dlc}{dln}{dsv}' '{dle}{dlc}{dls}{dsv}' '{dvse}{dlc}{dls}{dsv}' '{dlc}{dvsi}{dln}{dsv}' '{dlc}{dvse}{dls}{dsv}' '{dlc}{dsv}{dly}{dvsau}' '{dltha}{dvsi}{dln}{dsv}' '{dltha}{dvsi}{dly}{dvso}' '{dltha}{dvsi}{dly}{dvsau}' '{dltha}{dvsi}{dls}{dsv}' '{dltha}{dsv}{dly}{dvso}' '{dltha}{dsv}{dly}{dvsau}' '{dld}{dvsi}{dly}{dvso}' '{dld}{dvse}{dlkha}{dvsi}' '{dld}{dvse}{dlkha}{dvsii}' '{dll}{dvsaa}{dln}{dsv}' '{dlm}{dvsaa}{dltha}{dvsi}' '{dln}{dvse}{dlka}{dvsai}' '{dln}{dvse}{dlka}{dvsaa}' '{dln}{dvse}{dlka}{dvso}' '{dln}{dvse}{dlc}{dvsau}' '{dlh}{dvso}{dls}{dsv}' '{dli}{dln}{dsv}{dlc}' '{dvsi}{dln}{dsv}{dlc}' '{dln}{dvse}{dlc}{dvsu}' '{dli}{dlc}{dvsau}' '{dvsi}{dlc}{dvsau}' '{dli}{dls}{dsv}' '{dvsi}{dls}{dsv}' '{dvsi}{dly}{dvso}' '{dli}{dly}{dvso}' '{dle}{dlka}{dvsaa}' '{dvse}{dlka}{dvsaa}' '{dle}{dlka}{dvsii}' '{dvse}{dlka}{dvsii}' '{dle}{dlka}{dvsai}' '{dvse}{dlka}{dvsai}' '{dle}{dlka}{dvso}' '{dvse}{dlka}{dvso}' '{dle}{dlc}{dvsu}' '{dvse}{dlc}{dvsu}' '{dle}{dlc}{dvsau}' '{dvse}{dlc}{dvsau}' '{dlc}{dln}{dsv}' '{dlc}{dls}{dsv}' '{dltha}{dvsi}{dle}' '{dlpa}{dlr}{dsv}' '{dlb}{dly}{dvso}' '{dlh}{dlr}{dvsu}' '{dlh}{dlr}{dvsuu}' '{dvsi}{dld}{dvsaa}' '{dli}{dld}{dvsaa}' '{dvsi}{dld}{dvso}' '{dli}{dld}{dvso}' '{dvsi}{dld}{dvsai}' '{dli}{dld}{dvsai}' '{dln}{dvse}{dlc}' '{dli}{dlc}' '{dvsi}{dlc}' '{dle}{dlc}' '{dvse}{dlc}' '{dlc}{dvsu}' '{dlc}{dvse}' '{dlc}{dvsau}' '{dltha}{dvsii}' '{dltha}{dvse}' '{dld}{dvsaa}' '{dld}{dvsii}' '{dld}{dvsai}' '{dld}{dvso}' '{dln}{dvsu}' '{dln}{dvse}' '{dly}{dvso}' '{dly}{dvsau}' '{dlc}'
79
- (delete)
80
- )
81
- )
86
+ define remove_category_3 as(
87
+ [substring] among(
88
+ '{dltha}{dvsi}{dli}{dls}{dsv}'
89
+ '{dlh}{dvsu}{dln}{dvse}{dlc}'
90
+ '{dlh}{dvsu}{dln}{dsv}{dlc}'
91
+ '{dln}{dvse}{dlc}{dls}{dsv}'
92
+ '{dln}{dvse}{dlc}{dln}{dsv}'
93
+ '{dli}{dle}{dlka}{dvsii}'
94
+ '{dli}{dle}{dlka}{dvsaa}'
95
+ '{dli}{dle}{dlka}{dvso}'
96
+ '{dvsi}{dle}{dlka}{dvsii}'
97
+ '{dvsi}{dle}{dlka}{dvsaa}'
98
+ '{dvsi}{dle}{dlka}{dvso}'
99
+ '{dli}{dlc}{dln}{dsv}'
100
+ '{dvsi}{dlc}{dln}{dsv}'
101
+ '{dli}{dlc}{dls}{dsv}'
102
+ '{dvsi}{dlc}{dls}{dsv}'
103
+ '{dle}{dlc}{dln}{dsv}'
104
+ '{dvse}{dlc}{dln}{dsv}'
105
+ '{dle}{dlc}{dls}{dsv}'
106
+ '{dvse}{dlc}{dls}{dsv}'
107
+ '{dlc}{dvsi}{dln}{dsv}'
108
+ '{dlc}{dvse}{dls}{dsv}'
109
+ '{dlc}{dsv}{dly}{dvsau}'
110
+ '{dltha}{dvsi}{dln}{dsv}'
111
+ '{dltha}{dvsi}{dly}{dvso}'
112
+ '{dltha}{dvsi}{dly}{dvsau}'
113
+ '{dltha}{dvsi}{dls}{dsv}'
114
+ '{dltha}{dsv}{dly}{dvso}'
115
+ '{dltha}{dsv}{dly}{dvsau}'
116
+ '{dld}{dvsi}{dly}{dvso}'
117
+ '{dld}{dvse}{dlkha}{dvsi}'
118
+ '{dld}{dvse}{dlkha}{dvsii}'
119
+ '{dll}{dvsaa}{dln}{dsv}'
120
+ '{dlm}{dvsaa}{dltha}{dvsi}'
121
+ '{dln}{dvse}{dlka}{dvsai}'
122
+ '{dln}{dvse}{dlka}{dvsaa}'
123
+ '{dln}{dvse}{dlka}{dvso}'
124
+ '{dln}{dvse}{dlc}{dvsau}'
125
+ '{dlh}{dvso}{dls}{dsv}'
126
+ '{dli}{dln}{dsv}{dlc}'
127
+ '{dvsi}{dln}{dsv}{dlc}'
128
+ '{dln}{dvse}{dlc}{dvsu}'
129
+ '{dli}{dlc}{dvsau}'
130
+ '{dvsi}{dlc}{dvsau}'
131
+ '{dli}{dls}{dsv}'
132
+ '{dvsi}{dls}{dsv}'
133
+ '{dvsi}{dly}{dvso}'
134
+ '{dli}{dly}{dvso}'
135
+ '{dle}{dlka}{dvsaa}'
136
+ '{dvse}{dlka}{dvsaa}'
137
+ '{dle}{dlka}{dvsii}'
138
+ '{dvse}{dlka}{dvsii}'
139
+ '{dle}{dlka}{dvsai}'
140
+ '{dvse}{dlka}{dvsai}'
141
+ '{dle}{dlka}{dvso}'
142
+ '{dvse}{dlka}{dvso}'
143
+ '{dle}{dlc}{dvsu}'
144
+ '{dvse}{dlc}{dvsu}'
145
+ '{dle}{dlc}{dvsau}'
146
+ '{dvse}{dlc}{dvsau}'
147
+ '{dlc}{dln}{dsv}'
148
+ '{dlc}{dls}{dsv}'
149
+ '{dltha}{dvsi}{dle}'
150
+ '{dlpa}{dlr}{dsv}'
151
+ '{dlb}{dly}{dvso}'
152
+ '{dlh}{dlr}{dvsu}'
153
+ '{dlh}{dlr}{dvsuu}'
154
+ '{dvsi}{dld}{dvsaa}'
155
+ '{dli}{dld}{dvsaa}'
156
+ '{dvsi}{dld}{dvso}'
157
+ '{dli}{dld}{dvso}'
158
+ '{dvsi}{dld}{dvsai}'
159
+ '{dli}{dld}{dvsai}'
160
+ '{dln}{dvse}{dlc}'
161
+ '{dli}{dlc}'
162
+ '{dvsi}{dlc}'
163
+ '{dle}{dlc}'
164
+ '{dvse}{dlc}'
165
+ '{dlc}{dvsu}'
166
+ '{dlc}{dvse}'
167
+ '{dlc}{dvsau}'
168
+ '{dltha}{dvsii}'
169
+ '{dltha}{dvse}'
170
+ '{dld}{dvsaa}'
171
+ '{dld}{dvsii}'
172
+ '{dld}{dvsai}'
173
+ '{dld}{dvso}'
174
+ '{dln}{dvsu}'
175
+ '{dln}{dvse}'
176
+ '{dly}{dvso}'
177
+ '{dly}{dvsau}'
178
+ '{dlc}'
179
+ (delete)
180
+ )
181
+ )
82
182
 
83
183
  )
84
184
 
85
185
  define stem as (
86
- backwards (
87
- do remove_category_1
88
- do (
89
- repeat (do (check_category_2 and remove_category_2) remove_category_3)
90
- )
91
- )
186
+ backwards (
187
+ do remove_category_1
188
+ repeat (
189
+ do remove_category_2
190
+ remove_category_3
191
+ )
192
+ )
92
193
  )
@@ -17,18 +17,22 @@ stringescapes {}
17
17
 
18
18
  stringdef ae '{U+00E6}'
19
19
  stringdef ao '{U+00E5}'
20
+ stringdef e^ '{U+00EA}' // e-circumflex
21
+ stringdef o` '{U+00F2}' // o-grave
22
+ stringdef o' '{U+00F3}' // o-acute
23
+ stringdef o^ '{U+00F4}' // o-circumflex
20
24
  stringdef o/ '{U+00F8}'
21
25
 
22
- define v 'aeiouy{ae}{ao}{o/}'
26
+ define v 'ae{e^}io{o`}{o'}{o^}uy{ae}{ao}{o/}'
23
27
 
24
- define s_ending 'bcdfghjlmnoprtvyz'
28
+ define s_ending 'bcdfghjlmnoptvyz'
25
29
 
26
30
  define mark_regions as (
27
31
 
28
32
  $p1 = limit
29
33
 
30
34
  test ( hop 3 setmark x )
31
- goto v gopast non-v setmark p1
35
+ gopast v gopast non-v setmark p1
32
36
  try ( $p1 < x $p1 = x )
33
37
  )
34
38
 
@@ -40,10 +44,20 @@ backwardmode (
40
44
 
41
45
  'a' 'e' 'ede' 'ande' 'ende' 'ane' 'ene' 'hetene' 'en' 'heten' 'ar'
42
46
  'er' 'heter' 'as' 'es' 'edes' 'endes' 'enes' 'hetenes' 'ens'
43
- 'hetens' 'ers' 'ets' 'et' 'het' 'ast'
47
+ 'hetens' 'ets' 'et' 'het' 'ast'
44
48
  (delete)
49
+ 'ers'
50
+ (
51
+ among (
52
+ 'amm' 'ast' 'ind' 'kap' 'kk' 'lt' 'nk' 'omm' 'pp' 'v'
53
+ '{o/}st'
54
+ ()
55
+ 'giv' 'hav' 'skap' ''
56
+ (delete)
57
+ )
58
+ )
45
59
  's'
46
- (s_ending or ('k' non-v) delete)
60
+ (s_ending or ('r' not 'e') or ('k' non-v) delete)
47
61
  'erte' 'ert'
48
62
  (<-'er')
49
63
  )
@@ -38,11 +38,11 @@ backwardmode (
38
38
  test gopast v delete
39
39
  test substring among(
40
40
  'at' 'bl' 'iz'
41
- (<+ 'e')
41
+ (insert 'e')
42
42
  'bb' 'dd' 'ff' 'gg' 'mm' 'nn' 'pp' 'rr' 'tt'
43
43
  // ignoring double c, h, j, k, q, v, w, and x
44
44
  ([next] delete)
45
- '' (atmark p1 test shortv <+ 'e')
45
+ '' (atmark p1 test shortv insert 'e')
46
46
  )
47
47
  )
48
48
  )
@@ -25,7 +25,7 @@ stringdef i' '{U+00ED}' // i-acute
25
25
  stringdef o^ '{U+00F4}' // o-circumflex
26
26
  stringdef o' '{U+00F3}' // o-acute
27
27
  stringdef u' '{U+00FA}' // u-acute
28
- stringdef c, '{U+00E7}' // c-cedilla
28
+ stringdef cc '{U+00E7}' // c-cedilla
29
29
 
30
30
  stringdef a~ '{U+00E3}' // a-tilde
31
31
  stringdef o~ '{U+00F5}' // o-tilde
@@ -38,7 +38,7 @@ define prelude as repeat (
38
38
  '{a~}' (<- 'a~')
39
39
  '{o~}' (<- 'o~')
40
40
  '' (next)
41
- ) //or next
41
+ )
42
42
  )
43
43
 
44
44
  define mark_regions as (
@@ -64,7 +64,7 @@ define postlude as repeat (
64
64
  'a~' (<- '{a~}')
65
65
  'o~' (<- '{o~}')
66
66
  '' (next)
67
- ) //or next
67
+ )
68
68
  )
69
69
 
70
70
  backwardmode (
@@ -86,9 +86,9 @@ backwardmode (
86
86
  'amento' 'amentos'
87
87
  'imento' 'imentos'
88
88
 
89
- 'adora' 'ador' 'a{c,}a~o'
90
- 'adoras' 'adores' 'a{c,}o~es' // no -ic test
91
- 'ante' 'antes' '{a^}ncia' // Note 1
89
+ 'adora' 'ador' 'a{cc}a~o'
90
+ 'adoras' 'adores' 'a{cc}o~es' // no -ic test
91
+ 'ante' 'antes' '{a^}ncia'
92
92
  (
93
93
  R2 delete
94
94
  )
@@ -97,7 +97,7 @@ backwardmode (
97
97
  (
98
98
  R2 <- 'log'
99
99
  )
100
- 'u{c,}a~o' 'u{c,}o~es'
100
+ 'u{cc}a~o' 'u{cc}o~es'
101
101
  (
102
102
  R2 <- 'u'
103
103
  )
@@ -122,7 +122,7 @@ backwardmode (
122
122
  R2 delete
123
123
  try (
124
124
  [substring] among(
125
- 'ante' // Note 1
125
+ 'ante'
126
126
  'avel'
127
127
  '{i'}vel' (R2 delete)
128
128
  )
@@ -193,7 +193,7 @@ backwardmode (
193
193
  'e' '{e'}' '{e^}'
194
194
  ( RV delete [('u'] test 'g') or
195
195
  ('i'] test 'c') RV delete )
196
- '{c,}' (<-'c')
196
+ '{cc}' (<-'c')
197
197
  )
198
198
  )
199
199
  )
@@ -212,7 +212,3 @@ define stem as (
212
212
  )
213
213
  do postlude
214
214
  )
215
-
216
- /*
217
- Note 1: additions of 15 Jun 2005
218
- */