mittens 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (137) hide show
  1. checksums.yaml +7 -0
  2. data/CHANGELOG.md +3 -0
  3. data/Gemfile +7 -0
  4. data/LICENSE.txt +30 -0
  5. data/README.md +62 -0
  6. data/Rakefile +21 -0
  7. data/ext/mittens/ext.c +96 -0
  8. data/ext/mittens/extconf.rb +12 -0
  9. data/lib/mittens/version.rb +3 -0
  10. data/lib/mittens.rb +7 -0
  11. data/mittens.gemspec +22 -0
  12. data/vendor/snowball/.gitignore +26 -0
  13. data/vendor/snowball/.travis.yml +112 -0
  14. data/vendor/snowball/AUTHORS +27 -0
  15. data/vendor/snowball/CONTRIBUTING.rst +216 -0
  16. data/vendor/snowball/COPYING +29 -0
  17. data/vendor/snowball/GNUmakefile +742 -0
  18. data/vendor/snowball/NEWS +754 -0
  19. data/vendor/snowball/README.rst +37 -0
  20. data/vendor/snowball/ada/README.md +74 -0
  21. data/vendor/snowball/ada/generate/generate.adb +83 -0
  22. data/vendor/snowball/ada/generate.gpr +21 -0
  23. data/vendor/snowball/ada/src/stemmer.adb +620 -0
  24. data/vendor/snowball/ada/src/stemmer.ads +219 -0
  25. data/vendor/snowball/ada/src/stemwords.adb +70 -0
  26. data/vendor/snowball/ada/stemmer_config.gpr +83 -0
  27. data/vendor/snowball/ada/stemwords.gpr +21 -0
  28. data/vendor/snowball/algorithms/arabic.sbl +558 -0
  29. data/vendor/snowball/algorithms/armenian.sbl +301 -0
  30. data/vendor/snowball/algorithms/basque.sbl +149 -0
  31. data/vendor/snowball/algorithms/catalan.sbl +202 -0
  32. data/vendor/snowball/algorithms/danish.sbl +93 -0
  33. data/vendor/snowball/algorithms/dutch.sbl +164 -0
  34. data/vendor/snowball/algorithms/english.sbl +229 -0
  35. data/vendor/snowball/algorithms/finnish.sbl +197 -0
  36. data/vendor/snowball/algorithms/french.sbl +254 -0
  37. data/vendor/snowball/algorithms/german.sbl +139 -0
  38. data/vendor/snowball/algorithms/german2.sbl +145 -0
  39. data/vendor/snowball/algorithms/greek.sbl +701 -0
  40. data/vendor/snowball/algorithms/hindi.sbl +323 -0
  41. data/vendor/snowball/algorithms/hungarian.sbl +241 -0
  42. data/vendor/snowball/algorithms/indonesian.sbl +192 -0
  43. data/vendor/snowball/algorithms/irish.sbl +149 -0
  44. data/vendor/snowball/algorithms/italian.sbl +202 -0
  45. data/vendor/snowball/algorithms/kraaij_pohlmann.sbl +240 -0
  46. data/vendor/snowball/algorithms/lithuanian.sbl +373 -0
  47. data/vendor/snowball/algorithms/lovins.sbl +208 -0
  48. data/vendor/snowball/algorithms/nepali.sbl +92 -0
  49. data/vendor/snowball/algorithms/norwegian.sbl +80 -0
  50. data/vendor/snowball/algorithms/porter.sbl +139 -0
  51. data/vendor/snowball/algorithms/portuguese.sbl +218 -0
  52. data/vendor/snowball/algorithms/romanian.sbl +236 -0
  53. data/vendor/snowball/algorithms/russian.sbl +221 -0
  54. data/vendor/snowball/algorithms/serbian.sbl +2379 -0
  55. data/vendor/snowball/algorithms/spanish.sbl +230 -0
  56. data/vendor/snowball/algorithms/swedish.sbl +72 -0
  57. data/vendor/snowball/algorithms/tamil.sbl +405 -0
  58. data/vendor/snowball/algorithms/turkish.sbl +470 -0
  59. data/vendor/snowball/algorithms/yiddish.sbl +460 -0
  60. data/vendor/snowball/charsets/ISO-8859-2.sbl +98 -0
  61. data/vendor/snowball/charsets/KOI8-R.sbl +74 -0
  62. data/vendor/snowball/charsets/cp850.sbl +130 -0
  63. data/vendor/snowball/compiler/analyser.c +1547 -0
  64. data/vendor/snowball/compiler/driver.c +615 -0
  65. data/vendor/snowball/compiler/generator.c +1748 -0
  66. data/vendor/snowball/compiler/generator_ada.c +1702 -0
  67. data/vendor/snowball/compiler/generator_csharp.c +1322 -0
  68. data/vendor/snowball/compiler/generator_go.c +1278 -0
  69. data/vendor/snowball/compiler/generator_java.c +1313 -0
  70. data/vendor/snowball/compiler/generator_js.c +1316 -0
  71. data/vendor/snowball/compiler/generator_pascal.c +1387 -0
  72. data/vendor/snowball/compiler/generator_python.c +1337 -0
  73. data/vendor/snowball/compiler/generator_rust.c +1295 -0
  74. data/vendor/snowball/compiler/header.h +418 -0
  75. data/vendor/snowball/compiler/space.c +286 -0
  76. data/vendor/snowball/compiler/syswords.h +86 -0
  77. data/vendor/snowball/compiler/syswords2.h +13 -0
  78. data/vendor/snowball/compiler/tokeniser.c +567 -0
  79. data/vendor/snowball/csharp/.gitignore +8 -0
  80. data/vendor/snowball/csharp/Snowball/Algorithms/.gitignore +1 -0
  81. data/vendor/snowball/csharp/Snowball/Among.cs +108 -0
  82. data/vendor/snowball/csharp/Snowball/AssemblyInfo.cs +36 -0
  83. data/vendor/snowball/csharp/Snowball/Stemmer.cs +660 -0
  84. data/vendor/snowball/csharp/Stemwords/App.config +6 -0
  85. data/vendor/snowball/csharp/Stemwords/Program.cs +114 -0
  86. data/vendor/snowball/doc/TODO +12 -0
  87. data/vendor/snowball/doc/libstemmer_c_README +148 -0
  88. data/vendor/snowball/doc/libstemmer_csharp_README +53 -0
  89. data/vendor/snowball/doc/libstemmer_java_README +67 -0
  90. data/vendor/snowball/doc/libstemmer_js_README +48 -0
  91. data/vendor/snowball/doc/libstemmer_python_README +113 -0
  92. data/vendor/snowball/examples/stemwords.c +204 -0
  93. data/vendor/snowball/go/README.md +55 -0
  94. data/vendor/snowball/go/among.go +16 -0
  95. data/vendor/snowball/go/env.go +403 -0
  96. data/vendor/snowball/go/stemwords/generate.go +68 -0
  97. data/vendor/snowball/go/stemwords/main.go +68 -0
  98. data/vendor/snowball/go/util.go +34 -0
  99. data/vendor/snowball/iconv.py +50 -0
  100. data/vendor/snowball/include/libstemmer.h +78 -0
  101. data/vendor/snowball/java/org/tartarus/snowball/Among.java +29 -0
  102. data/vendor/snowball/java/org/tartarus/snowball/SnowballProgram.java +381 -0
  103. data/vendor/snowball/java/org/tartarus/snowball/SnowballStemmer.java +8 -0
  104. data/vendor/snowball/java/org/tartarus/snowball/TestApp.java +75 -0
  105. data/vendor/snowball/javascript/base-stemmer.js +294 -0
  106. data/vendor/snowball/javascript/stemwords.js +106 -0
  107. data/vendor/snowball/libstemmer/libstemmer_c.in +96 -0
  108. data/vendor/snowball/libstemmer/mkalgorithms.pl +90 -0
  109. data/vendor/snowball/libstemmer/mkmodules.pl +267 -0
  110. data/vendor/snowball/libstemmer/modules.txt +63 -0
  111. data/vendor/snowball/libstemmer/test.c +34 -0
  112. data/vendor/snowball/pascal/.gitignore +4 -0
  113. data/vendor/snowball/pascal/SnowballProgram.pas +430 -0
  114. data/vendor/snowball/pascal/generate.pl +23 -0
  115. data/vendor/snowball/pascal/stemwords-template.dpr +78 -0
  116. data/vendor/snowball/python/MANIFEST.in +7 -0
  117. data/vendor/snowball/python/create_init.py +54 -0
  118. data/vendor/snowball/python/setup.cfg +6 -0
  119. data/vendor/snowball/python/setup.py +81 -0
  120. data/vendor/snowball/python/snowballstemmer/among.py +13 -0
  121. data/vendor/snowball/python/snowballstemmer/basestemmer.py +323 -0
  122. data/vendor/snowball/python/stemwords.py +101 -0
  123. data/vendor/snowball/python/testapp.py +28 -0
  124. data/vendor/snowball/runtime/api.c +58 -0
  125. data/vendor/snowball/runtime/api.h +32 -0
  126. data/vendor/snowball/runtime/header.h +61 -0
  127. data/vendor/snowball/runtime/utilities.c +513 -0
  128. data/vendor/snowball/rust/Cargo.toml +7 -0
  129. data/vendor/snowball/rust/build.rs +55 -0
  130. data/vendor/snowball/rust/rust-pre-1.27-compat.patch +30 -0
  131. data/vendor/snowball/rust/src/main.rs +102 -0
  132. data/vendor/snowball/rust/src/snowball/algorithms/mod.rs +2 -0
  133. data/vendor/snowball/rust/src/snowball/among.rs +6 -0
  134. data/vendor/snowball/rust/src/snowball/mod.rs +6 -0
  135. data/vendor/snowball/rust/src/snowball/snowball_env.rs +421 -0
  136. data/vendor/snowball/tests/stemtest.c +95 -0
  137. metadata +178 -0
@@ -0,0 +1,460 @@
1
+ /* *******************************************
2
+ * Stemmer for Yiddish language in YIVO script
3
+ *
4
+ * Author: Assaf Urieli
5
+ * Emails: assaf.urieli at gmail.com
6
+ * Version: 0.1 (15.05.2020)
7
+ *
8
+ ********************************************* */
9
+
10
+ routines (
11
+ prelude
12
+ mark_regions
13
+ R1
14
+ R1plus3
15
+ standard_suffix
16
+ )
17
+
18
+ externals ( stem )
19
+
20
+ integers ( p1 x )
21
+
22
+ groupings ( vowel niked alefBeys consonant )
23
+
24
+ stringescapes {}
25
+
26
+ // AlefBeys
27
+ stringdef Alef '{U+05D0}'
28
+ stringdef Beys '{U+05D1}'
29
+ stringdef Giml '{U+05D2}'
30
+ stringdef Dalet '{U+05D3}'
31
+ stringdef Hey '{U+05D4}'
32
+ stringdef Vov '{U+05D5}'
33
+ stringdef Zayen '{U+05D6}'
34
+ stringdef Khes '{U+05D7}'
35
+ stringdef Tes '{U+05D8}'
36
+ stringdef Yud '{U+05D9}'
37
+ stringdef LangerKhof '{U+05DA}'
38
+ stringdef Khof '{U+05DB}'
39
+ stringdef Lamed '{U+05DC}'
40
+ stringdef ShlosMem '{U+05DD}'
41
+ stringdef Mem '{U+05DE}'
42
+ stringdef LangerNun '{U+05DF}'
43
+ stringdef Nun '{U+05E0}'
44
+ stringdef Samekh '{U+05E1}'
45
+ stringdef Ayen '{U+05E2}'
46
+ stringdef LangerFey '{U+05E3}'
47
+ stringdef Fey '{U+05E4}'
48
+ stringdef LangerTsadek '{U+05E5}'
49
+ stringdef Tsadek '{U+05E6}'
50
+ stringdef Kuf '{U+05E7}'
51
+ stringdef Reysh '{U+05E8}'
52
+ stringdef Shin '{U+05E9}'
53
+ stringdef Sof '{U+05EA}'
54
+ stringdef TsveyVovn '{U+05F0}'
55
+ stringdef VovYud '{U+05F1}'
56
+ stringdef TsveyYudn '{U+05F2}'
57
+
58
+ // Niked
59
+ stringdef Shvo '{U+05B0}'
60
+ stringdef Khirik '{U+05B4}'
61
+ stringdef Tseyre '{U+05B5}'
62
+ stringdef Segl '{U+05B6}'
63
+ stringdef ReducedSegl '{U+05B1}'
64
+ stringdef Pasekh '{U+05B7}'
65
+ stringdef ReducedPasekh '{U+05B2}'
66
+ stringdef Komets '{U+05B8}'
67
+ stringdef ReducedKomets '{U+05B3}'
68
+ stringdef Rafe '{U+05BF}'
69
+ stringdef SinDot '{U+05C2}'
70
+ stringdef ShinDot '{U+05C1}'
71
+ stringdef Khoylm '{U+05B9}'
72
+ stringdef Melupm '{U+05BC}'
73
+ stringdef Kubuts '{U+05BB}'
74
+
75
+ // Groupings
76
+ define niked '{Shvo}{Khirik}{Tseyre}{Segl}{ReducedSegl}{Pasekh}{ReducedPasekh}{Komets}{ReducedKomets}{SinDot}{ShinDot}{Khoylm}{Melupm}{Kubuts}{Rafe}'
77
+ define alefBeys '{Alef}{Beys}{Giml}{Dalet}{Hey}{Vov}{Zayen}{Khes}{Tes}{Yud}{LangerKhof}{Khof}{Lamed}{ShlosMem}{Mem}{LangerNun}{Nun}{Samekh}{Ayen}{LangerFey}{Fey}{LangerTsadek}{Tsadek}{Kuf}{Reysh}{Shin}{Sof}{TsveyVovn}{VovYud}{TsveyYudn}'
78
+ define vowel '{Alef}{Vov}{Yud}{Ayen}{VovYud}{TsveyYudn}'
79
+ define consonant alefBeys - vowel
80
+
81
+ define prelude as (
82
+ do (
83
+ repeat goto (
84
+ [substring] among (
85
+ '{Vov}{Vov}' ( not '{Melupm}' <- '{TsveyVovn}' )
86
+ '{Vov}{Yud}' ( not '{Khirik}' <- '{VovYud}' )
87
+ '{Yud}{Yud}' ( not '{Khirik}' <- '{TsveyYudn}' )
88
+ '{LangerKhof}' ( <- '{Khof}')
89
+ '{ShlosMem}' ( <- '{Mem}' )
90
+ '{LangerNun}' ( <- '{Nun}' )
91
+ '{LangerFey}' ( <- '{Fey}' )
92
+ '{LangerTsadek}' ( <- '{Tsadek}' )
93
+ )
94
+ )
95
+ )
96
+
97
+ do (repeat goto ( [niked] delete ))
98
+ )
99
+
100
+ define mark_regions as (
101
+ $p1 = limit
102
+
103
+ (
104
+ try (
105
+ // Replace past participle ge- at start of word
106
+ // Unless word starts with gelt- or gebn-
107
+ ['{Giml}{Ayen}']
108
+ not ('{Lamed}{Tes}' or '{Beys}{Nun}') <- 'GE'
109
+ )
110
+
111
+ try (
112
+ // skip verbal prefix
113
+ among(
114
+ // Free stressed: Adurkh-, Durkh-, Ahin-, Aher-, Avek-, Mit-, Antkegn-, Akegn-, Anider-, Arop-, Aroys-, Aroyf-, Arum-, Arayn-, Arunter-, Ariber-, Nokh-, Farbay-, Aheym-, Afir-, Faroys-, Funander-, Tsuzamen-, Tsunoyf-, Tsurik-
115
+ '{Alef}{Dalet}{Vov}{Reysh}{Khof}' '{Dalet}{Vov}{Reysh}{Khof}' '{Alef}{Hey}{Yud}{Nun}' '{Alef}{Hey}{Ayen}{Reysh}' '{Alef}{TsveyVovn}{Ayen}{Kuf}' '{Mem}{Yud}{Tes}' '{Alef}{Nun}{Tes}{Kuf}{Ayen}{Giml}{Nun}' '{Alef}{Kuf}{Ayen}{Giml}{Nun}' '{Alef}{Nun}{Yud}{Dalet}{Ayen}{Reysh}' '{Alef}{Reysh}{Alef}{Fey}' '{Alef}{Reysh}{VovYud}{Samekh}' '{Alef}{Reysh}{VovYud}{Fey}' '{Alef}{Reysh}{Vov}{Mem}' '{Alef}{Reysh}{TsveyYudn}{Nun}' '{Alef}{Reysh}{Vov}{Nun}{Tes}{Ayen}{Reysh}' '{Alef}{Reysh}{Yud}{Beys}{Ayen}{Reysh}' '{Nun}{Alef}{Khof}' '{Fey}{Alef}{Reysh}{Beys}{TsveyYudn}' '{Alef}{Hey}{TsveyYudn}{Mem}' '{Alef}{Fey}{Yud}{Reysh}' '{Fey}{Alef}{Reysh}{VovYud}{Samekh}' '{Fey}{Vov}{Nun}{Alef}{Nun}{Dalet}{Ayen}{Reysh}' '{Tsadek}{Vov}{Zayen}{Alef}{Mem}{Ayen}{Nun}' '{Tsadek}{Vov}{Nun}{VovYud}{Fey}' '{Tsadek}{Vov}{Reysh}{Yud}{Kuf}'
116
+
117
+ // Stressed: Oys-, Oyf-, Um-, Unter-, Iber-, Ayn-, On-, Op-, Bay-, For-, Tsu-.
118
+ '{Alef}{VovYud}{Samekh}' '{Alef}{VovYud}{Fey}' '{Alef}{Vov}{Mem}' '{Alef}{Vov}{Nun}{Tes}{Ayen}{Reysh}' '{Alef}{Yud}{Beys}{Ayen}{Reysh}' '{Alef}{TsveyYudn}{Nun}' '{Alef}{Nun}' '{Alef}{Fey}' '{Beys}{TsveyYudn}' '{Fey}{Alef}{Reysh}' '{Tsadek}{Vov}'
119
+
120
+ // Unstressed: Ant-, Ba-, Der-, Tse-. Far- already covered by For-. Ge- comes later.
121
+ '{Alef}{Nun}{Tes}' '{Beys}{Alef}' '{Dalet}{Ayen}{Reysh}' '{Tsadek}{Ayen}'
122
+
123
+ // If the verbal prefix is followed by Tsu- or Ge-, replace that Tsu-/Ge- with the marker 'TSU' or 'GE'
124
+ (
125
+ // Don't mark the TSU- prefix inside verbs like "oys-tsugn"
126
+ test (('{Tsadek}{Vov}{Giml}{Nun}' or '{Tsadek}{Vov}{Kuf}{Tes}' or '{Tsadek}{Vov}{Kuf}{Nun}') atlimit)
127
+ or
128
+ // Don't mark the GE- prefix inside verbs like "avek-gebn"
129
+ test ('{Giml}{Ayen}{Beys}{Nun}')
130
+ or
131
+ ( ['{Giml}{Ayen}'] <- 'GE')
132
+ or
133
+ (['{Tsadek}{Vov}'] <- 'TSU')
134
+ )
135
+ )
136
+ )
137
+
138
+ test(hop 3 setmark x)
139
+
140
+ // We want to allow three-consonant Hebrew roots.
141
+ // To this end, we skip three-consonant combinations that exist in non-Hebraic Yiddish.
142
+ try (
143
+ among(
144
+ '{Shin}{Fey}{Reysh}' '{Shin}{Tes}{Reysh}' '{Shin}{Tes}{Shin}' '{Dalet}{Zayen}{Shin}'
145
+ ( true )
146
+ )
147
+ )
148
+
149
+ // Either 3 consonants or the first non-vowel after a vowel
150
+ (
151
+ not (consonant consonant consonant setmark p1)
152
+ goto vowel repeat vowel setmark p1
153
+ )
154
+ try($p1 < x $p1 = x) // at least 3 past the prefix
155
+ )
156
+
157
+ )
158
+
159
+ backwardmode (
160
+ define R1 as $p1 <= cursor
161
+ // Like R1, but also allows the cursor to be outside R1 by the width of Giml Yud Samekh
162
+ define R1plus3 as $p1 <= cursor + sizeof '{Giml}{Yud}{Samekh}'
163
+
164
+ define standard_suffix as (
165
+ do (
166
+ [substring] among(
167
+ // Plural/adjective endings: -er, -ers, -e, -n, -s, -en, -ns, -eners, -em, -es
168
+ '{Ayen}{Reysh}{Samekh}' '{Ayen}{Nun}' '{Nun}{Samekh}' '{Ayen}{Nun}{Ayen}{Reysh}{Samekh}' '{Ayen}{Samekh}' '{Ayen}' '{Nun}' '{Samekh}' '{Ayen}{Mem}' '{Ayen}{Reysh}'
169
+ ( R1 delete )
170
+
171
+ // Exception: don't delete noun endings -ie, like "agitatsie"
172
+ '{Yud}{Ayen}'
173
+ ( true )
174
+
175
+ // -ies => ie
176
+ '{Yud}{Ayen}{Samekh}'
177
+ ( R1 <- '{Yud}{Ayen}' )
178
+
179
+ // Plural/adjective endings: -enem, -ener, -ene, -ens
180
+ '{Ayen}{Nun}{Ayen}' '{Ayen}{Nun}{Ayen}{Mem}' '{Ayen}{Nun}{Ayen}{Reysh}' '{Ayen}{Nun}{Samekh}'
181
+ (R1 delete
182
+ [substring] among (
183
+ // -gegangen => -gey
184
+ '{Giml}{Alef}{Nun}{Giml}' (<- '{Giml}{TsveyYudn}')
185
+ // -genumen => -nem
186
+ '{Nun}{Vov}{Mem}' (<- '{Nun}{Ayen}{Mem}')
187
+ // -gemiten => -mayd
188
+ '{Mem}{Yud}{Tes}' (<- '{Mem}{TsveyYudn}{Dalet}')
189
+ // -gebiten => -bayt
190
+ '{Beys}{Yud}{Tes}' (<- '{Beys}{TsveyYudn}{Tes}')
191
+ // -gebisen => -bays
192
+ '{Beys}{Yud}{Samekh}' (<- '{Beys}{TsveyYudn}{Samekh}')
193
+ // -gevizen => -vayz
194
+ '{TsveyVovn}{Yud}{Zayen}' (<- '{TsveyVovn}{TsveyYudn}{Zayen}')
195
+ // -getriben => -trayb
196
+ '{Tes}{Reysh}{Yud}{Beys}' (<- '{Tes}{Reysh}{TsveyYudn}{Beys}')
197
+ // -geliten => -layt
198
+ '{Lamed}{Yud}{Tes}' (<- '{Lamed}{TsveyYudn}{Tes}')
199
+ // -gekliben => -klayb
200
+ '{Kuf}{Lamed}{Yud}{Beys}' (<- '{Kuf}{Lamed}{TsveyYudn}{Beys}')
201
+ // -geriben => -rayb
202
+ '{Reysh}{Yud}{Beys}' (<- '{Reysh}{TsveyYudn}{Beys}')
203
+ // -gerisen => -rays
204
+ '{Reysh}{Yud}{Samekh}' (<- '{Reysh}{TsveyYudn}{Samekh}')
205
+ // -geshvigen => -shvayg
206
+ '{Shin}{TsveyVovn}{Yud}{Giml}' (<- '{Shin}{TsveyVovn}{TsveyYudn}{Giml}')
207
+ // -geshmisen => -shmays
208
+ '{Shin}{Mem}{Yud}{Samekh}' (<- '{Shin}{Mem}{TsveyYudn}{Samekh}')
209
+ // -geshniten => -shnayd
210
+ '{Shin}{Nun}{Yud}{Tes}' (<- '{Shin}{Nun}{TsveyYudn}{Dalet}')
211
+ // -geshriben => -shrayb
212
+ '{Shin}{Reysh}{Yud}{Beys}' (<- '{Shin}{Reysh}{TsveyYudn}{Beys}')
213
+ // -gebunden => -bind
214
+ '{Beys}{Vov}{Nun}{Dalet}' (<- '{Beys}{Yud}{Nun}{Dalet}')
215
+ // -gevuntshn => -vintsh
216
+ '{TsveyVovn}{Vov}{Tes}{Shin}' (<- '{TsveyVovn}{Yud}{Tes}{Shin}')
217
+ // -gezungen => -zing
218
+ '{Zayen}{Vov}{Nun}{Giml}' (<- '{Zayen}{Yud}{Nun}{Giml}')
219
+ // -getrunken => -trink
220
+ '{Tes}{Reysh}{Vov}{Nun}{Kuf}' (<- '{Tes}{Reysh}{Yud}{Nun}{Kuf}')
221
+ // -getsvungen => -tsving
222
+ '{Tsadek}{TsveyVovn}{Vov}{Nun}{Giml}' (<- '{Tsadek}{TsveyVovn}{Yud}{Nun}{Giml}')
223
+ // -geshlungen => -shling
224
+ '{Shin}{Lamed}{Vov}{Nun}{Giml}' (<- '{Shin}{Lamed}{Yud}{Nun}{Giml}')
225
+ // -geboygen => -beyg
226
+ '{Beys}{VovYud}{Giml}' (<- '{Beys}{TsveyYudn}{Giml}')
227
+ // -gehoyben => -heyb
228
+ '{Hey}{VovYud}{Beys}' (<- '{Hey}{TsveyYudn}{Beys}')
229
+ // -farloyren => -farlir
230
+ '{Fey}{Alef}{Reysh}{Lamed}{VovYud}{Reysh}' (<- '{Fey}{Alef}{Reysh}{Lamed}{Yud}{Reysh}')
231
+ // -shtanen => -shtey
232
+ '{Shin}{Tes}{Alef}{Nun}' (<- '{Shin}{Tes}{TsveyYudn}')
233
+ // -geshvoyrn => -shver
234
+ '{Shin}{TsveyVovn}{VovYud}{Reysh}' (<- '{Shin}{TsveyVovn}{Ayen}{Reysh}')
235
+ )
236
+ )
237
+
238
+ // Verb/past participle ending: -t
239
+ '{Tes}'
240
+ ( R1 delete )
241
+
242
+ // As well as noun/adjectives ending in -tn, -te, -ter, -ts so that the "-t" doesn't differentiate
243
+ // Similarly for past participles: -tns, -tene, -tenem, -tener
244
+ // If the Tes was before R1, we try to perform the same action while leaving the Tes in place
245
+ '{Tes}{Nun}' '{Tes}{Ayen}' '{Tes}{Ayen}{Reysh}' '{Tes}{Samekh}'
246
+ '{Tes}{Nun}{Samekh}' '{Tes}{Ayen}{Nun}{Ayen}' '{Tes}{Ayen}{Nun}{Ayen}{Mem}' '{Tes}{Ayen}{Nun}{Ayen}{Reysh}'
247
+ ( ((R1 delete) or ( <- '{Tes}'))
248
+ // -(ge)brakht => -breng
249
+ ['{Beys}{Reysh}{Alef}{Khof}' try '{Giml}{Ayen}'] <- '{Beys}{Reysh}{Ayen}{Nun}{Giml}'
250
+ )
251
+
252
+ // Past participles: -et, -etn, -ets, -ete, -eter
253
+ '{Ayen}{Tes}' '{Ayen}{Tes}{Nun}' '{Ayen}{Tes}{Samekh}' '{Ayen}{Tes}{Ayen}' '{Ayen}{Tes}{Ayen}{Reysh}'
254
+ ( R1 delete )
255
+
256
+ // -geyn shortened to -gey
257
+ '{Giml}{TsveyYudn}{Nun}'
258
+ ( <- '{Giml}{TsveyYudn}')
259
+
260
+ // ##################### Long list of irregular past participles
261
+ // -(ge)gangen (shortened to -gangen after prefixes) => -gey
262
+ '{Giml}{Alef}{Nun}{Giml}{Ayen}{Nun}'
263
+ ( <- '{Giml}{TsveyYudn}' )
264
+
265
+ // -(ge)numen (shortened to -numen after prefixes) => -nem
266
+ '{Nun}{Vov}{Mem}{Ayen}{Nun}'
267
+ (<- '{Nun}{Ayen}{Mem}' )
268
+
269
+ // -(ge)shribn (shortened to -shribn after prefixes) => -shrayb
270
+ '{Shin}{Reysh}{Yud}{Beys}{Nun}'
271
+ (<- '{Shin}{Reysh}{TsveyYudn}{Beys}' )
272
+
273
+ // -gemiten => -mayd
274
+ 'GE{Mem}{Yud}{Tes}{Nun}'
275
+ (<- '{Mem}{TsveyYudn}{Dalet}')
276
+
277
+ // -gebiten => -bayt
278
+ 'GE{Beys}{Yud}{Tes}{Nun}'
279
+ (<- '{Beys}{TsveyYudn}{Tes}')
280
+
281
+ // -gebisen => -bays
282
+ 'GE{Beys}{Yud}{Samekh}{Nun}'
283
+ ( <- '{Beys}{TsveyYudn}{Samekh}')
284
+
285
+ // -gevizen => -vayz
286
+ '{TsveyVovn}{Yud}{Zayen}{Nun}'
287
+ ( <- '{TsveyVovn}{TsveyYudn}{Zayen}')
288
+
289
+ // -getriben => -trayb
290
+ '{Tes}{Reysh}{Yud}{Beys}{Nun}'
291
+ ( <- '{Tes}{Reysh}{TsveyYudn}{Beys}')
292
+
293
+ // -geliten => -layt
294
+ 'GE{Lamed}{Yud}{Tes}{Nun}'
295
+ ( <- '{Lamed}{TsveyYudn}{Tes}')
296
+
297
+ // -gekliben => -klayb
298
+ '{Kuf}{Lamed}{Yud}{Beys}{Nun}'
299
+ ( <- '{Kuf}{Lamed}{TsveyYudn}{Beys}')
300
+
301
+ // -geriben => -rayb
302
+ '{Reysh}{Yud}{Beys}{Nun}'
303
+ ( <- '{Reysh}{TsveyYudn}{Beys}')
304
+
305
+ // -gerisen => -rays
306
+ 'GE{Reysh}{Yud}{Samekh}{Nun}'
307
+ ( <- '{Reysh}{TsveyYudn}{Samekh}')
308
+
309
+ // -geshvigen => -shvayg
310
+ '{Shin}{TsveyVovn}{Yud}{Giml}{Nun}'
311
+ ( <- '{Shin}{TsveyVovn}{TsveyYudn}{Giml}')
312
+
313
+ // -geshmisen => -shmays
314
+ '{Shin}{Mem}{Yud}{Samekh}{Nun}'
315
+ ( <- '{Shin}{Mem}{TsveyYudn}{Samekh}')
316
+
317
+ // -geshniten => -shnayd
318
+ '{Shin}{Nun}{Yud}{Tes}{Nun}'
319
+ ( <- '{Shin}{Nun}{TsveyYudn}{Dalet}')
320
+
321
+ // -gebunden => -bind
322
+ '{Beys}{Vov}{Nun}{Dalet}{Nun}'
323
+ ( <- '{Beys}{Yud}{Nun}{Dalet}')
324
+
325
+ // -gevuntshn => -vintsh
326
+ '{TsveyVovn}{Vov}{Tes}{Shin}{Nun}'
327
+ ( <- '{TsveyVovn}{Yud}{Tes}{Shin}')
328
+
329
+ // -gezungen => -zing
330
+ '{Zayen}{Vov}{Nun}{Giml}{Nun}'
331
+ ( <- '{Zayen}{Yud}{Nun}{Giml}')
332
+
333
+ // -getrunken => -trink
334
+ '{Tes}{Reysh}{Vov}{Nun}{Kuf}{Nun}'
335
+ ( <- '{Tes}{Reysh}{Yud}{Nun}{Kuf}')
336
+
337
+ // -getsvungen => -tsving
338
+ '{Tsadek}{TsveyVovn}{Vov}{Nun}{Giml}{Nun}'
339
+ ( <- '{Tsadek}{TsveyVovn}{Yud}{Nun}{Giml}')
340
+
341
+ // -geshlungen => -shling
342
+ '{Shin}{Lamed}{Vov}{Nun}{Giml}{Nun}'
343
+ ( <- '{Shin}{Lamed}{Yud}{Nun}{Giml}')
344
+
345
+ // -geboygen => -beyg
346
+ '{Beys}{VovYud}{Giml}{Nun}'
347
+ ( <- '{Beys}{TsveyYudn}{Giml}')
348
+
349
+ // -gehoyben => -heyb
350
+ '{Hey}{VovYud}{Beys}{Nun}'
351
+ ( <- '{Hey}{TsveyYudn}{Beys}')
352
+
353
+ // -farloyren => -farlir
354
+ '{Fey}{Alef}{Reysh}{Lamed}{VovYud}{Reysh}{Nun}'
355
+ ( <- '{Fey}{Alef}{Reysh}{Lamed}{Yud}{Reysh}')
356
+
357
+ // -shtanen => -shtey
358
+ '{Shin}{Tes}{Alef}{Nun}{Ayen}{Nun}'
359
+ ( <- '{Shin}{Tes}{TsveyYudn}')
360
+
361
+ // -geshvoyrn => -shver
362
+ '{Shin}{TsveyVovn}{VovYud}{Reysh}{Nun}'
363
+ ( <- '{Shin}{TsveyVovn}{Ayen}{Reysh}')
364
+
365
+ // -(ge)brakht (shortened to -brakht after prefixes) => -breng
366
+ '{Beys}{Reysh}{Alef}{Khof}{Tes}'
367
+ (<- '{Beys}{Reysh}{Ayen}{Nun}{Giml}' )
368
+
369
+ // ###### End of irregular past participles
370
+
371
+ // Noun endings: -ung, -hayt, -kayt, -ikayt, -shaft
372
+ '{Vov}{Nun}{Giml}' '{Hey}{TsveyYudn}{Tes}' '{Kuf}{TsveyYudn}{Tes}' '{Yud}{Kuf}{TsveyYudn}{Tes}' '{Shin}{Alef}{Fey}{Tes}'
373
+ ( R1 delete )
374
+
375
+ // Noun endings: -izm, -izmen
376
+ '{Yud}{Zayen}{Mem}' '{Yud}{Zayen}{Mem}{Ayen}{Nun}'
377
+ ( R1 delete )
378
+
379
+ // Plural ending: -im
380
+ '{Yud}{Mem}'
381
+ ( R1 delete )
382
+
383
+ // Plural ending: -os (Hebraic), replace with -h
384
+ '{Vov}{Sof}'
385
+ ( R1 <- '{Hey}' )
386
+
387
+ // Diminutive endings: -elekh, -ele, -lekh, -eles, -elen
388
+ '{Ayen}{Lamed}{Ayen}{Khof}' '{Ayen}{Lamed}{Ayen}' '{Lamed}{Ayen}{Khof}' '{Ayen}{Lamed}{Ayen}{Samekh}' '{Ayen}{Lamed}{Ayen}{Nun}'
389
+ ( R1 delete )
390
+
391
+ // Noun ending: -ist
392
+ '{Yud}{Samekh}{Tes}'
393
+ (
394
+ // Exceptions: -gist, -shist
395
+ ( ('{Giml}' or '{Shin}') try (R1plus3 <- '{Yud}{Samekh}') )
396
+ or
397
+ ( R1 delete )
398
+ )
399
+
400
+ // Noun ending: -istn
401
+ '{Yud}{Samekh}{Tes}{Nun}'
402
+ ( R1 delete )
403
+
404
+ // Verb ending: -stu
405
+ '{Samekh}{Tes}{Vov}'
406
+ ( R1 delete )
407
+
408
+ // Superlative ending: -ster, -ste, -stn
409
+ '{Samekh}{Tes}{Ayen}{Reysh}' '{Samekh}{Tes}{Ayen}' '{Samekh}{Tes}{Nun}'
410
+ ( R1 delete )
411
+
412
+ // Ambiguous verb ending: -st
413
+ '{Samekh}{Tes}'
414
+ ( R1 delete )
415
+ )
416
+ )
417
+
418
+ do (
419
+ [substring] among(
420
+ // Noun endings: -ung, -hayt, -kayt, -ikayt, -shaft
421
+ '{Vov}{Nun}{Giml}' '{Hey}{TsveyYudn}{Tes}' '{Kuf}{TsveyYudn}{Tes}' '{Yud}{Kuf}{TsveyYudn}{Tes}' '{Shin}{Alef}{Fey}{Tes}'
422
+ ( R1 delete )
423
+
424
+ // Diminutive endings: -l
425
+ '{Lamed}'
426
+ ( R1 consonant delete )
427
+ )
428
+ )
429
+
430
+ do (
431
+ [substring] among(
432
+ // Adjective endings: -ig, -ik, -ish, -nik, -dik
433
+ '{Yud}{Giml}' '{Yud}{Kuf}' '{Yud}{Shin}' '{Nun}{Yud}{Kuf}' '{Dalet}{Yud}{Kuf}'
434
+ ( R1 delete )
435
+
436
+ // Exceptions to above: -blik, -glik
437
+ '{Beys}{Lamed}{Yud}{Kuf}' '{Giml}{Lamed}{Yud}{Kuf}'
438
+ ( true )
439
+
440
+ // Present participle endings: -ndik
441
+ '{Nun}{Dalet}{Yud}{Kuf}'
442
+ ( R1 delete )
443
+
444
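+ // Present participle ending -endik: the intended rule is to delete it if after a -ng, -nk, -n, -m, consonant+l, or vowel,
445
+ // and otherwise to delete just the -ndik part. NOTE: the action below does not make that distinction; it deletes the whole -endik ending whenever it lies in R1.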
446
+ '{Ayen}{Nun}{Dalet}{Yud}{Kuf}'
447
+ ( R1 delete )
448
+ )
449
+ )
450
+
451
+ do (repeat goto ( ['GE' or 'TSU'] delete ))
452
+ )
453
+ )
454
+
455
+ define stem as (
456
+ do prelude
457
+ do mark_regions
458
+ backwards
459
+ do standard_suffix
460
+ )
@@ -0,0 +1,98 @@
1
+ // ISO-8859-2 character mappings.
2
+
3
+ stringdef U+00A0 hex 'A0'
4
+ stringdef U+0104 hex 'A1'
5
+ stringdef U+02D8 hex 'A2'
6
+ stringdef U+0141 hex 'A3'
7
+ stringdef U+00A4 hex 'A4'
8
+ stringdef U+013D hex 'A5'
9
+ stringdef U+015A hex 'A6'
10
+ stringdef U+00A7 hex 'A7'
11
+ stringdef U+00A8 hex 'A8'
12
+ stringdef U+0160 hex 'A9'
13
+ stringdef U+015E hex 'AA'
14
+ stringdef U+0164 hex 'AB'
15
+ stringdef U+0179 hex 'AC'
16
+ stringdef U+00AD hex 'AD'
17
+ stringdef U+017D hex 'AE'
18
+ stringdef U+017B hex 'AF'
19
+ stringdef U+00B0 hex 'B0'
20
+ stringdef U+0105 hex 'B1'
21
+ stringdef U+02DB hex 'B2'
22
+ stringdef U+0142 hex 'B3'
23
+ stringdef U+00B4 hex 'B4'
24
+ stringdef U+013E hex 'B5'
25
+ stringdef U+015B hex 'B6'
26
+ stringdef U+02C7 hex 'B7'
27
+ stringdef U+00B8 hex 'B8'
28
+ stringdef U+0161 hex 'B9'
29
+ stringdef U+015F hex 'BA'
30
+ stringdef U+0165 hex 'BB'
31
+ stringdef U+017A hex 'BC'
32
+ stringdef U+02DD hex 'BD'
33
+ stringdef U+017E hex 'BE'
34
+ stringdef U+017C hex 'BF'
35
+ stringdef U+0154 hex 'C0'
36
+ stringdef U+00C1 hex 'C1'
37
+ stringdef U+00C2 hex 'C2'
38
+ stringdef U+0102 hex 'C3'
39
+ stringdef U+00C4 hex 'C4'
40
+ stringdef U+0139 hex 'C5'
41
+ stringdef U+0106 hex 'C6'
42
+ stringdef U+00C7 hex 'C7'
43
+ stringdef U+010C hex 'C8'
44
+ stringdef U+00C9 hex 'C9'
45
+ stringdef U+0118 hex 'CA'
46
+ stringdef U+00CB hex 'CB'
47
+ stringdef U+011A hex 'CC'
48
+ stringdef U+00CD hex 'CD'
49
+ stringdef U+00CE hex 'CE'
50
+ stringdef U+010E hex 'CF'
51
+ stringdef U+0110 hex 'D0'
52
+ stringdef U+0143 hex 'D1'
53
+ stringdef U+0147 hex 'D2'
54
+ stringdef U+00D3 hex 'D3'
55
+ stringdef U+00D4 hex 'D4'
56
+ stringdef U+0150 hex 'D5'
57
+ stringdef U+00D6 hex 'D6'
58
+ stringdef U+00D7 hex 'D7'
59
+ stringdef U+0158 hex 'D8'
60
+ stringdef U+016E hex 'D9'
61
+ stringdef U+00DA hex 'DA'
62
+ stringdef U+0170 hex 'DB'
63
+ stringdef U+00DC hex 'DC'
64
+ stringdef U+00DD hex 'DD'
65
+ stringdef U+0162 hex 'DE'
66
+ stringdef U+00DF hex 'DF'
67
+ stringdef U+0155 hex 'E0'
68
+ stringdef U+00E1 hex 'E1'
69
+ stringdef U+00E2 hex 'E2'
70
+ stringdef U+0103 hex 'E3'
71
+ stringdef U+00E4 hex 'E4'
72
+ stringdef U+013A hex 'E5'
73
+ stringdef U+0107 hex 'E6'
74
+ stringdef U+00E7 hex 'E7'
75
+ stringdef U+010D hex 'E8'
76
+ stringdef U+00E9 hex 'E9'
77
+ stringdef U+0119 hex 'EA'
78
+ stringdef U+00EB hex 'EB'
79
+ stringdef U+011B hex 'EC'
80
+ stringdef U+00ED hex 'ED'
81
+ stringdef U+00EE hex 'EE'
82
+ stringdef U+010F hex 'EF'
83
+ stringdef U+0111 hex 'F0'
84
+ stringdef U+0144 hex 'F1'
85
+ stringdef U+0148 hex 'F2'
86
+ stringdef U+00F3 hex 'F3'
87
+ stringdef U+00F4 hex 'F4'
88
+ stringdef U+0151 hex 'F5'
89
+ stringdef U+00F6 hex 'F6'
90
+ stringdef U+00F7 hex 'F7'
91
+ stringdef U+0159 hex 'F8'
92
+ stringdef U+016F hex 'F9'
93
+ stringdef U+00FA hex 'FA'
94
+ stringdef U+0171 hex 'FB'
95
+ stringdef U+00FC hex 'FC'
96
+ stringdef U+00FD hex 'FD'
97
+ stringdef U+0163 hex 'FE'
98
+ stringdef U+02D9 hex 'FF'
@@ -0,0 +1,74 @@
1
+ // KOI8-R character mappings.
2
+
3
+ stringdef U+00A0 hex '9A'
4
+ stringdef U+00A9 hex 'BF'
5
+ stringdef U+00B0 hex '9C'
6
+ stringdef U+00B2 hex '9D'
7
+ stringdef U+00B7 hex '9E'
8
+ stringdef U+00F7 hex '9F'
9
+ stringdef U+0401 hex 'B3'
10
+ stringdef U+0410 hex 'E1'
11
+ stringdef U+0411 hex 'E2'
12
+ stringdef U+0412 hex 'F7'
13
+ stringdef U+0413 hex 'E7'
14
+ stringdef U+0414 hex 'E4'
15
+ stringdef U+0415 hex 'E5'
16
+ stringdef U+0416 hex 'F6'
17
+ stringdef U+0417 hex 'FA'
18
+ stringdef U+0418 hex 'E9'
19
+ stringdef U+0419 hex 'EA'
20
+ stringdef U+041A hex 'EB'
21
+ stringdef U+041B hex 'EC'
22
+ stringdef U+041C hex 'ED'
23
+ stringdef U+041D hex 'EE'
24
+ stringdef U+041E hex 'EF'
25
+ stringdef U+041F hex 'F0'
26
+ stringdef U+0420 hex 'F2'
27
+ stringdef U+0421 hex 'F3'
28
+ stringdef U+0422 hex 'F4'
29
+ stringdef U+0423 hex 'F5'
30
+ stringdef U+0424 hex 'E6'
31
+ stringdef U+0425 hex 'E8'
32
+ stringdef U+0426 hex 'E3'
33
+ stringdef U+0427 hex 'FE'
34
+ stringdef U+0428 hex 'FB'
35
+ stringdef U+0429 hex 'FD'
36
+ stringdef U+042A hex 'FF'
37
+ stringdef U+042B hex 'F9'
38
+ stringdef U+042C hex 'F8'
39
+ stringdef U+042D hex 'FC'
40
+ stringdef U+042E hex 'E0'
41
+ stringdef U+042F hex 'F1'
42
+ stringdef U+0430 hex 'C1'
43
+ stringdef U+0431 hex 'C2'
44
+ stringdef U+0432 hex 'D7'
45
+ stringdef U+0433 hex 'C7'
46
+ stringdef U+0434 hex 'C4'
47
+ stringdef U+0435 hex 'C5'
48
+ stringdef U+0436 hex 'D6'
49
+ stringdef U+0437 hex 'DA'
50
+ stringdef U+0438 hex 'C9'
51
+ stringdef U+0439 hex 'CA'
52
+ stringdef U+043A hex 'CB'
53
+ stringdef U+043B hex 'CC'
54
+ stringdef U+043C hex 'CD'
55
+ stringdef U+043D hex 'CE'
56
+ stringdef U+043E hex 'CF'
57
+ stringdef U+043F hex 'D0'
58
+ stringdef U+0440 hex 'D2'
59
+ stringdef U+0441 hex 'D3'
60
+ stringdef U+0442 hex 'D4'
61
+ stringdef U+0443 hex 'D5'
62
+ stringdef U+0444 hex 'C6'
63
+ stringdef U+0445 hex 'C8'
64
+ stringdef U+0446 hex 'C3'
65
+ stringdef U+0447 hex 'DE'
66
+ stringdef U+0448 hex 'DB'
67
+ stringdef U+0449 hex 'DD'
68
+ stringdef U+044A hex 'DF'
69
+ stringdef U+044B hex 'D9'
70
+ stringdef U+044C hex 'D8'
71
+ stringdef U+044D hex 'DC'
72
+ stringdef U+044E hex 'C0'
73
+ stringdef U+044F hex 'D1'
74
+ stringdef U+0451 hex 'A3'