mittens 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (137) hide show
  1. checksums.yaml +7 -0
  2. data/CHANGELOG.md +3 -0
  3. data/Gemfile +7 -0
  4. data/LICENSE.txt +30 -0
  5. data/README.md +62 -0
  6. data/Rakefile +21 -0
  7. data/ext/mittens/ext.c +96 -0
  8. data/ext/mittens/extconf.rb +12 -0
  9. data/lib/mittens/version.rb +3 -0
  10. data/lib/mittens.rb +7 -0
  11. data/mittens.gemspec +22 -0
  12. data/vendor/snowball/.gitignore +26 -0
  13. data/vendor/snowball/.travis.yml +112 -0
  14. data/vendor/snowball/AUTHORS +27 -0
  15. data/vendor/snowball/CONTRIBUTING.rst +216 -0
  16. data/vendor/snowball/COPYING +29 -0
  17. data/vendor/snowball/GNUmakefile +742 -0
  18. data/vendor/snowball/NEWS +754 -0
  19. data/vendor/snowball/README.rst +37 -0
  20. data/vendor/snowball/ada/README.md +74 -0
  21. data/vendor/snowball/ada/generate/generate.adb +83 -0
  22. data/vendor/snowball/ada/generate.gpr +21 -0
  23. data/vendor/snowball/ada/src/stemmer.adb +620 -0
  24. data/vendor/snowball/ada/src/stemmer.ads +219 -0
  25. data/vendor/snowball/ada/src/stemwords.adb +70 -0
  26. data/vendor/snowball/ada/stemmer_config.gpr +83 -0
  27. data/vendor/snowball/ada/stemwords.gpr +21 -0
  28. data/vendor/snowball/algorithms/arabic.sbl +558 -0
  29. data/vendor/snowball/algorithms/armenian.sbl +301 -0
  30. data/vendor/snowball/algorithms/basque.sbl +149 -0
  31. data/vendor/snowball/algorithms/catalan.sbl +202 -0
  32. data/vendor/snowball/algorithms/danish.sbl +93 -0
  33. data/vendor/snowball/algorithms/dutch.sbl +164 -0
  34. data/vendor/snowball/algorithms/english.sbl +229 -0
  35. data/vendor/snowball/algorithms/finnish.sbl +197 -0
  36. data/vendor/snowball/algorithms/french.sbl +254 -0
  37. data/vendor/snowball/algorithms/german.sbl +139 -0
  38. data/vendor/snowball/algorithms/german2.sbl +145 -0
  39. data/vendor/snowball/algorithms/greek.sbl +701 -0
  40. data/vendor/snowball/algorithms/hindi.sbl +323 -0
  41. data/vendor/snowball/algorithms/hungarian.sbl +241 -0
  42. data/vendor/snowball/algorithms/indonesian.sbl +192 -0
  43. data/vendor/snowball/algorithms/irish.sbl +149 -0
  44. data/vendor/snowball/algorithms/italian.sbl +202 -0
  45. data/vendor/snowball/algorithms/kraaij_pohlmann.sbl +240 -0
  46. data/vendor/snowball/algorithms/lithuanian.sbl +373 -0
  47. data/vendor/snowball/algorithms/lovins.sbl +208 -0
  48. data/vendor/snowball/algorithms/nepali.sbl +92 -0
  49. data/vendor/snowball/algorithms/norwegian.sbl +80 -0
  50. data/vendor/snowball/algorithms/porter.sbl +139 -0
  51. data/vendor/snowball/algorithms/portuguese.sbl +218 -0
  52. data/vendor/snowball/algorithms/romanian.sbl +236 -0
  53. data/vendor/snowball/algorithms/russian.sbl +221 -0
  54. data/vendor/snowball/algorithms/serbian.sbl +2379 -0
  55. data/vendor/snowball/algorithms/spanish.sbl +230 -0
  56. data/vendor/snowball/algorithms/swedish.sbl +72 -0
  57. data/vendor/snowball/algorithms/tamil.sbl +405 -0
  58. data/vendor/snowball/algorithms/turkish.sbl +470 -0
  59. data/vendor/snowball/algorithms/yiddish.sbl +460 -0
  60. data/vendor/snowball/charsets/ISO-8859-2.sbl +98 -0
  61. data/vendor/snowball/charsets/KOI8-R.sbl +74 -0
  62. data/vendor/snowball/charsets/cp850.sbl +130 -0
  63. data/vendor/snowball/compiler/analyser.c +1547 -0
  64. data/vendor/snowball/compiler/driver.c +615 -0
  65. data/vendor/snowball/compiler/generator.c +1748 -0
  66. data/vendor/snowball/compiler/generator_ada.c +1702 -0
  67. data/vendor/snowball/compiler/generator_csharp.c +1322 -0
  68. data/vendor/snowball/compiler/generator_go.c +1278 -0
  69. data/vendor/snowball/compiler/generator_java.c +1313 -0
  70. data/vendor/snowball/compiler/generator_js.c +1316 -0
  71. data/vendor/snowball/compiler/generator_pascal.c +1387 -0
  72. data/vendor/snowball/compiler/generator_python.c +1337 -0
  73. data/vendor/snowball/compiler/generator_rust.c +1295 -0
  74. data/vendor/snowball/compiler/header.h +418 -0
  75. data/vendor/snowball/compiler/space.c +286 -0
  76. data/vendor/snowball/compiler/syswords.h +86 -0
  77. data/vendor/snowball/compiler/syswords2.h +13 -0
  78. data/vendor/snowball/compiler/tokeniser.c +567 -0
  79. data/vendor/snowball/csharp/.gitignore +8 -0
  80. data/vendor/snowball/csharp/Snowball/Algorithms/.gitignore +1 -0
  81. data/vendor/snowball/csharp/Snowball/Among.cs +108 -0
  82. data/vendor/snowball/csharp/Snowball/AssemblyInfo.cs +36 -0
  83. data/vendor/snowball/csharp/Snowball/Stemmer.cs +660 -0
  84. data/vendor/snowball/csharp/Stemwords/App.config +6 -0
  85. data/vendor/snowball/csharp/Stemwords/Program.cs +114 -0
  86. data/vendor/snowball/doc/TODO +12 -0
  87. data/vendor/snowball/doc/libstemmer_c_README +148 -0
  88. data/vendor/snowball/doc/libstemmer_csharp_README +53 -0
  89. data/vendor/snowball/doc/libstemmer_java_README +67 -0
  90. data/vendor/snowball/doc/libstemmer_js_README +48 -0
  91. data/vendor/snowball/doc/libstemmer_python_README +113 -0
  92. data/vendor/snowball/examples/stemwords.c +204 -0
  93. data/vendor/snowball/go/README.md +55 -0
  94. data/vendor/snowball/go/among.go +16 -0
  95. data/vendor/snowball/go/env.go +403 -0
  96. data/vendor/snowball/go/stemwords/generate.go +68 -0
  97. data/vendor/snowball/go/stemwords/main.go +68 -0
  98. data/vendor/snowball/go/util.go +34 -0
  99. data/vendor/snowball/iconv.py +50 -0
  100. data/vendor/snowball/include/libstemmer.h +78 -0
  101. data/vendor/snowball/java/org/tartarus/snowball/Among.java +29 -0
  102. data/vendor/snowball/java/org/tartarus/snowball/SnowballProgram.java +381 -0
  103. data/vendor/snowball/java/org/tartarus/snowball/SnowballStemmer.java +8 -0
  104. data/vendor/snowball/java/org/tartarus/snowball/TestApp.java +75 -0
  105. data/vendor/snowball/javascript/base-stemmer.js +294 -0
  106. data/vendor/snowball/javascript/stemwords.js +106 -0
  107. data/vendor/snowball/libstemmer/libstemmer_c.in +96 -0
  108. data/vendor/snowball/libstemmer/mkalgorithms.pl +90 -0
  109. data/vendor/snowball/libstemmer/mkmodules.pl +267 -0
  110. data/vendor/snowball/libstemmer/modules.txt +63 -0
  111. data/vendor/snowball/libstemmer/test.c +34 -0
  112. data/vendor/snowball/pascal/.gitignore +4 -0
  113. data/vendor/snowball/pascal/SnowballProgram.pas +430 -0
  114. data/vendor/snowball/pascal/generate.pl +23 -0
  115. data/vendor/snowball/pascal/stemwords-template.dpr +78 -0
  116. data/vendor/snowball/python/MANIFEST.in +7 -0
  117. data/vendor/snowball/python/create_init.py +54 -0
  118. data/vendor/snowball/python/setup.cfg +6 -0
  119. data/vendor/snowball/python/setup.py +81 -0
  120. data/vendor/snowball/python/snowballstemmer/among.py +13 -0
  121. data/vendor/snowball/python/snowballstemmer/basestemmer.py +323 -0
  122. data/vendor/snowball/python/stemwords.py +101 -0
  123. data/vendor/snowball/python/testapp.py +28 -0
  124. data/vendor/snowball/runtime/api.c +58 -0
  125. data/vendor/snowball/runtime/api.h +32 -0
  126. data/vendor/snowball/runtime/header.h +61 -0
  127. data/vendor/snowball/runtime/utilities.c +513 -0
  128. data/vendor/snowball/rust/Cargo.toml +7 -0
  129. data/vendor/snowball/rust/build.rs +55 -0
  130. data/vendor/snowball/rust/rust-pre-1.27-compat.patch +30 -0
  131. data/vendor/snowball/rust/src/main.rs +102 -0
  132. data/vendor/snowball/rust/src/snowball/algorithms/mod.rs +2 -0
  133. data/vendor/snowball/rust/src/snowball/among.rs +6 -0
  134. data/vendor/snowball/rust/src/snowball/mod.rs +6 -0
  135. data/vendor/snowball/rust/src/snowball/snowball_env.rs +421 -0
  136. data/vendor/snowball/tests/stemtest.c +95 -0
  137. metadata +178 -0
@@ -0,0 +1,460 @@
1
+ /* *******************************************
2
+ * Stemmer for Yiddish language in YIVO script
3
+ *
4
+ * Author: Assaf Urieli
5
+ * Emails: assaf.urieli at gmail.com
6
+ * Version: 0.1 (15.05.2020)
7
+ *
8
+ ********************************************* */
9
+
10
+ routines (
11
+ prelude
12
+ mark_regions
13
+ R1
14
+ R1plus3
15
+ standard_suffix
16
+ )
17
+
18
+ externals ( stem )
19
+
20
+ integers ( p1 x )
21
+
22
+ groupings ( vowel niked alefBeys consonant )
23
+
24
+ stringescapes {}
25
+
26
+ // AlefBeys
27
+ stringdef Alef '{U+05D0}'
28
+ stringdef Beys '{U+05D1}'
29
+ stringdef Giml '{U+05D2}'
30
+ stringdef Dalet '{U+05D3}'
31
+ stringdef Hey '{U+05D4}'
32
+ stringdef Vov '{U+05D5}'
33
+ stringdef Zayen '{U+05D6}'
34
+ stringdef Khes '{U+05D7}'
35
+ stringdef Tes '{U+05D8}'
36
+ stringdef Yud '{U+05D9}'
37
+ stringdef LangerKhof '{U+05DA}'
38
+ stringdef Khof '{U+05DB}'
39
+ stringdef Lamed '{U+05DC}'
40
+ stringdef ShlosMem '{U+05DD}'
41
+ stringdef Mem '{U+05DE}'
42
+ stringdef LangerNun '{U+05DF}'
43
+ stringdef Nun '{U+05E0}'
44
+ stringdef Samekh '{U+05E1}'
45
+ stringdef Ayen '{U+05E2}'
46
+ stringdef LangerFey '{U+05E3}'
47
+ stringdef Fey '{U+05E4}'
48
+ stringdef LangerTsadek '{U+05E5}'
49
+ stringdef Tsadek '{U+05E6}'
50
+ stringdef Kuf '{U+05E7}'
51
+ stringdef Reysh '{U+05E8}'
52
+ stringdef Shin '{U+05E9}'
53
+ stringdef Sof '{U+05EA}'
54
+ stringdef TsveyVovn '{U+05F0}'
55
+ stringdef VovYud '{U+05F1}'
56
+ stringdef TsveyYudn '{U+05F2}'
57
+
58
+ // Niked
59
+ stringdef Shvo '{U+05B0}'
60
+ stringdef Khirik '{U+05B4}'
61
+ stringdef Tseyre '{U+05B5}'
62
+ stringdef Segl '{U+05B6}'
63
+ stringdef ReducedSegl '{U+05B1}'
64
+ stringdef Pasekh '{U+05B7}'
65
+ stringdef ReducedPasekh '{U+05B2}'
66
+ stringdef Komets '{U+05B8}'
67
+ stringdef ReducedKomets '{U+05B3}'
68
+ stringdef Rafe '{U+05BF}'
69
+ stringdef SinDot '{U+05C2}'
70
+ stringdef ShinDot '{U+05C1}'
71
+ stringdef Khoylm '{U+05B9}'
72
+ stringdef Melupm '{U+05BC}'
73
+ stringdef Kubuts '{U+05BB}'
74
+
75
+ // Groupings
76
+ define niked '{Shvo}{Khirik}{Tseyre}{Segl}{ReducedSegl}{Pasekh}{ReducedPasekh}{Komets}{ReducedKomets}{SinDot}{ShinDot}{Khoylm}{Melupm}{Kubuts}{Rafe}'
77
+ define alefBeys '{Alef}{Beys}{Giml}{Dalet}{Hey}{Vov}{Zayen}{Khes}{Tes}{Yud}{LangerKhof}{Khof}{Lamed}{ShlosMem}{Mem}{LangerNun}{Nun}{Samekh}{Ayen}{LangerFey}{Fey}{LangerTsadek}{Tsadek}{Kuf}{Reysh}{Shin}{Sof}{TsveyVovn}{VovYud}{TsveyYudn}'
78
+ define vowel '{Alef}{Vov}{Yud}{Ayen}{VovYud}{TsveyYudn}'
79
+ define consonant alefBeys - vowel
80
+
81
+ define prelude as ( // normalises the spelling of the word before regions are marked
82
+ do (
83
+ repeat goto ( // scan forward, rewriting every occurrence of the patterns below
84
+ [substring] among (
85
+ '{Vov}{Vov}' ( not '{Melupm}' <- '{TsveyVovn}' ) // two Vov -> TsveyVovn, unless a Melupm follows
86
+ '{Vov}{Yud}' ( not '{Khirik}' <- '{VovYud}' ) // Vov + Yud -> VovYud, unless a Khirik follows
87
+ '{Yud}{Yud}' ( not '{Khirik}' <- '{TsveyYudn}' ) // two Yud -> TsveyYudn, unless a Khirik follows
88
+ '{LangerKhof}' ( <- '{Khof}') // U+05DA -> U+05DB
89
+ '{ShlosMem}' ( <- '{Mem}' ) // U+05DD -> U+05DE
90
+ '{LangerNun}' ( <- '{Nun}' ) // U+05DF -> U+05E0
91
+ '{LangerFey}' ( <- '{Fey}' ) // U+05E3 -> U+05E4
92
+ '{LangerTsadek}' ( <- '{Tsadek}' ) // U+05E5 -> U+05E6
93
+ )
94
+ )
95
+ )
96
+
97
+ do (repeat goto ( [niked] delete )) // then remove every character that belongs to the niked grouping
98
+ )
99
+
100
+ define mark_regions as ( // sets p1 and rewrites GE-/TSU- prefixes to the ASCII markers 'GE' / 'TSU'
101
+ $p1 = limit // p1 starts at limit; it is moved below when a suitable position is found
102
+
103
+ (
104
+ try (
105
+ // Replace past participle ge- at start of word
106
+ // Unless word starts with gelt- or gebn-
107
+ ['{Giml}{Ayen}']
108
+ not ('{Lamed}{Tes}' or '{Beys}{Nun}') <- 'GE' // 'GE' is a placeholder that standard_suffix deletes in its last step
109
+ )
110
+
111
+ try (
112
+ // skip verbal prefix
113
+ among(
114
+ // Free stressed: Adurkh-, Durkh-, Ahin-, Aher-, Avek-, Mit-, Antkegn-, Akegn-, Anider-, Arop-, Aroys-, Aroyf-, Arum-, Arayn-, Arunter-, Ariber-, Nokh-, Farbay-, Aheym-, Afir-, Faroys-, Funander-, Tsuzamen-, Tsunoyf-, Tsurik-
115
+ '{Alef}{Dalet}{Vov}{Reysh}{Khof}' '{Dalet}{Vov}{Reysh}{Khof}' '{Alef}{Hey}{Yud}{Nun}' '{Alef}{Hey}{Ayen}{Reysh}' '{Alef}{TsveyVovn}{Ayen}{Kuf}' '{Mem}{Yud}{Tes}' '{Alef}{Nun}{Tes}{Kuf}{Ayen}{Giml}{Nun}' '{Alef}{Kuf}{Ayen}{Giml}{Nun}' '{Alef}{Nun}{Yud}{Dalet}{Ayen}{Reysh}' '{Alef}{Reysh}{Alef}{Fey}' '{Alef}{Reysh}{VovYud}{Samekh}' '{Alef}{Reysh}{VovYud}{Fey}' '{Alef}{Reysh}{Vov}{Mem}' '{Alef}{Reysh}{TsveyYudn}{Nun}' '{Alef}{Reysh}{Vov}{Nun}{Tes}{Ayen}{Reysh}' '{Alef}{Reysh}{Yud}{Beys}{Ayen}{Reysh}' '{Nun}{Alef}{Khof}' '{Fey}{Alef}{Reysh}{Beys}{TsveyYudn}' '{Alef}{Hey}{TsveyYudn}{Mem}' '{Alef}{Fey}{Yud}{Reysh}' '{Fey}{Alef}{Reysh}{VovYud}{Samekh}' '{Fey}{Vov}{Nun}{Alef}{Nun}{Dalet}{Ayen}{Reysh}' '{Tsadek}{Vov}{Zayen}{Alef}{Mem}{Ayen}{Nun}' '{Tsadek}{Vov}{Nun}{VovYud}{Fey}' '{Tsadek}{Vov}{Reysh}{Yud}{Kuf}'
116
+
117
+ // Stressed: Oys-, Oyf-, Um-, Unter-, Iber-, Ayn-, On-, Op-, Bay-, For-, Tsu-.
118
+ '{Alef}{VovYud}{Samekh}' '{Alef}{VovYud}{Fey}' '{Alef}{Vov}{Mem}' '{Alef}{Vov}{Nun}{Tes}{Ayen}{Reysh}' '{Alef}{Yud}{Beys}{Ayen}{Reysh}' '{Alef}{TsveyYudn}{Nun}' '{Alef}{Nun}' '{Alef}{Fey}' '{Beys}{TsveyYudn}' '{Fey}{Alef}{Reysh}' '{Tsadek}{Vov}'
119
+
120
+ // Unstressed: Ant-, Ba-, Der-, Tse-. Far- already covered by For-. Ge- comes later.
121
+ '{Alef}{Nun}{Tes}' '{Beys}{Alef}' '{Dalet}{Ayen}{Reysh}' '{Tsadek}{Ayen}'
122
+
123
+ // If verbal prefix followed by Tsu- or Ge-, replace it
124
+ (
125
+ // Don't mark the TSU- prefix inside verbs like "oys-tsugn"
126
+ test (('{Tsadek}{Vov}{Giml}{Nun}' or '{Tsadek}{Vov}{Kuf}{Tes}' or '{Tsadek}{Vov}{Kuf}{Nun}') atlimit)
127
+ or
128
+ // Don't mark the GE- prefix inside verbs like "avek-gebn"
129
+ test ('{Giml}{Ayen}{Beys}{Nun}')
130
+ or
131
+ ( ['{Giml}{Ayen}'] <- 'GE') // same ASCII placeholder as for a word-initial ge-
132
+ or
133
+ (['{Tsadek}{Vov}'] <- 'TSU') // ASCII placeholder, also deleted at the end of standard_suffix
134
+ )
135
+ )
136
+ )
137
+
138
+ test(hop 3 setmark x) // x = position three characters ahead; used for the minimum on p1 below
139
+
140
+ // We want to allow three-consonant Hebrew roots.
141
+ // To this end, we skip three-consonant combinations that exist in non-Hebraic Yiddish.
142
+ try (
143
+ among(
144
+ '{Shin}{Fey}{Reysh}' '{Shin}{Tes}{Reysh}' '{Shin}{Tes}{Shin}' '{Dalet}{Zayen}{Shin}'
145
+ ( true ) // no action: matching only moves the cursor past the cluster
146
+ )
147
+ )
148
+
149
+ // Either 3 consonants or the first non-vowel after a vowel
150
+ (
151
+ not (consonant consonant consonant setmark p1)
152
+ goto vowel repeat vowel setmark p1
153
+ )
154
+ try($p1 < x $p1 = x) // at least 3 past the prefix
155
+ )
156
+
157
+ )
158
+
159
+ backwardmode (
160
+ define R1 as $p1 <= cursor // succeeds when the cursor is at or beyond p1 (p1 is set by mark_regions)
161
+ // Like R1, but also allows the cursor to be outside R1 by the width of Giml Yud Samekh
162
+ define R1plus3 as $p1 <= cursor + sizeof '{Giml}{Yud}{Samekh}' // used only by the -gist/-shist exception in standard_suffix
163
+
164
+ define standard_suffix as (
165
+ do (
166
+ [substring] among(
167
+ // Plural/adjective endings: -ers, -en, -ns, -eners, -es, -e, -n, -s, -em, -er
168
+ '{Ayen}{Reysh}{Samekh}' '{Ayen}{Nun}' '{Nun}{Samekh}' '{Ayen}{Nun}{Ayen}{Reysh}{Samekh}' '{Ayen}{Samekh}' '{Ayen}' '{Nun}' '{Samekh}' '{Ayen}{Mem}' '{Ayen}{Reysh}'
169
+ ( R1 delete )
170
+
171
+ // Exception: don't delete noun endings -ie, like "agitatsie"
172
+ '{Yud}{Ayen}'
173
+ ( true )
174
+
175
+ // -ies => ie
176
+ '{Yud}{Ayen}{Samekh}'
177
+ ( R1 <- '{Yud}{Ayen}' )
178
+
179
+ // Plural/adjective endings: -enem, -ener, -ene, -ens
180
+ '{Ayen}{Nun}{Ayen}' '{Ayen}{Nun}{Ayen}{Mem}' '{Ayen}{Nun}{Ayen}{Reysh}' '{Ayen}{Nun}{Samekh}'
181
+ (R1 delete
182
+ [substring] among (
183
+ // -gegangen => -gey
184
+ '{Giml}{Alef}{Nun}{Giml}' (<- '{Giml}{TsveyYudn}')
185
+ // -genumen => -nem
186
+ '{Nun}{Vov}{Mem}' (<- '{Nun}{Ayen}{Mem}')
187
+ // -gemiten => -mayd
188
+ '{Mem}{Yud}{Tes}' (<- '{Mem}{TsveyYudn}{Dalet}')
189
+ // -gebiten => -bayt
190
+ '{Beys}{Yud}{Tes}' (<- '{Beys}{TsveyYudn}{Tes}')
191
+ // -gebisen => -bays
192
+ '{Beys}{Yud}{Samekh}' (<- '{Beys}{TsveyYudn}{Samekh}')
193
+ // -gevizen => -vayz
194
+ '{TsveyVovn}{Yud}{Zayen}' (<- '{TsveyVovn}{TsveyYudn}{Zayen}')
195
+ // -getriben => -trayb
196
+ '{Tes}{Reysh}{Yud}{Beys}' (<- '{Tes}{Reysh}{TsveyYudn}{Beys}')
197
+ // -geliten => -layt
198
+ '{Lamed}{Yud}{Tes}' (<- '{Lamed}{TsveyYudn}{Tes}')
199
+ // -gekliben => -klayb
200
+ '{Kuf}{Lamed}{Yud}{Beys}' (<- '{Kuf}{Lamed}{TsveyYudn}{Beys}')
201
+ // -geriben => -rayb
202
+ '{Reysh}{Yud}{Beys}' (<- '{Reysh}{TsveyYudn}{Beys}')
203
+ // -gerisen => -rays
204
+ '{Reysh}{Yud}{Samekh}' (<- '{Reysh}{TsveyYudn}{Samekh}')
205
+ // -geshvigen => -shvayg
206
+ '{Shin}{TsveyVovn}{Yud}{Giml}' (<- '{Shin}{TsveyVovn}{TsveyYudn}{Giml}')
207
+ // -geshmisen => -shmays
208
+ '{Shin}{Mem}{Yud}{Samekh}' (<- '{Shin}{Mem}{TsveyYudn}{Samekh}')
209
+ // -geshniten => -shnayd
210
+ '{Shin}{Nun}{Yud}{Tes}' (<- '{Shin}{Nun}{TsveyYudn}{Dalet}')
211
+ // -geshriben => -shrayb
212
+ '{Shin}{Reysh}{Yud}{Beys}' (<- '{Shin}{Reysh}{TsveyYudn}{Beys}')
213
+ // -gebunden => -bind
214
+ '{Beys}{Vov}{Nun}{Dalet}' (<- '{Beys}{Yud}{Nun}{Dalet}')
215
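+ // -gevuntshn => -vintsh (NOTE(review): the pattern and replacement below have no {Nun} before {Tes}{Shin}; confirm this is intended)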
216
+ '{TsveyVovn}{Vov}{Tes}{Shin}' (<- '{TsveyVovn}{Yud}{Tes}{Shin}')
217
+ // -gezungen => -zing
218
+ '{Zayen}{Vov}{Nun}{Giml}' (<- '{Zayen}{Yud}{Nun}{Giml}')
219
+ // -getrunken => -trink
220
+ '{Tes}{Reysh}{Vov}{Nun}{Kuf}' (<- '{Tes}{Reysh}{Yud}{Nun}{Kuf}')
221
+ // -getsvungen => -tsving
222
+ '{Tsadek}{TsveyVovn}{Vov}{Nun}{Giml}' (<- '{Tsadek}{TsveyVovn}{Yud}{Nun}{Giml}')
223
+ // -geshlungen => -shling
224
+ '{Shin}{Lamed}{Vov}{Nun}{Giml}' (<- '{Shin}{Lamed}{Yud}{Nun}{Giml}')
225
+ // -geboygen => -beyg
226
+ '{Beys}{VovYud}{Giml}' (<- '{Beys}{TsveyYudn}{Giml}')
227
+ // -gehoyben => -heyb
228
+ '{Hey}{VovYud}{Beys}' (<- '{Hey}{TsveyYudn}{Beys}')
229
+ // -farloyren => -farlir
230
+ '{Fey}{Alef}{Reysh}{Lamed}{VovYud}{Reysh}' (<- '{Fey}{Alef}{Reysh}{Lamed}{Yud}{Reysh}')
231
+ // -shtanen => -shtey
232
+ '{Shin}{Tes}{Alef}{Nun}' (<- '{Shin}{Tes}{TsveyYudn}')
233
+ // -geshvoyrn => -shver
234
+ '{Shin}{TsveyVovn}{VovYud}{Reysh}' (<- '{Shin}{TsveyVovn}{Ayen}{Reysh}')
235
+ )
236
+ )
237
+
238
+ // Verb/past participle ending: -t
239
+ '{Tes}'
240
+ ( R1 delete )
241
+
242
+ // As well as noun/adjectives ending in -tn, -te, -ter, -ts so that the "-t" doesn't differentiate
243
+ // Similarly for past participles: -tns, -tene, -tenem, -tener
244
+ // If the Tes was before R1, we try to perform the same action while leaving the Tes in place
245
+ '{Tes}{Nun}' '{Tes}{Ayen}' '{Tes}{Ayen}{Reysh}' '{Tes}{Samekh}'
246
+ '{Tes}{Nun}{Samekh}' '{Tes}{Ayen}{Nun}{Ayen}' '{Tes}{Ayen}{Nun}{Ayen}{Mem}' '{Tes}{Ayen}{Nun}{Ayen}{Reysh}'
247
+ ( ((R1 delete) or ( <- '{Tes}'))
248
+ // -(ge)brakht => -breng
249
+ ['{Beys}{Reysh}{Alef}{Khof}' try '{Giml}{Ayen}'] <- '{Beys}{Reysh}{Ayen}{Nun}{Giml}'
250
+ )
251
+
252
+ // Past participles: -et, -etn, -ets, -ete, -eter
253
+ '{Ayen}{Tes}' '{Ayen}{Tes}{Nun}' '{Ayen}{Tes}{Samekh}' '{Ayen}{Tes}{Ayen}' '{Ayen}{Tes}{Ayen}{Reysh}'
254
+ ( R1 delete )
255
+
256
+ // -geyn shortened to -gey
257
+ '{Giml}{TsveyYudn}{Nun}'
258
+ ( <- '{Giml}{TsveyYudn}')
259
+
260
+ // ##################### Long list of irregular past participles
261
+ // -(ge)gangen (shortened to -gangen after prefixes) => -gey
262
+ '{Giml}{Alef}{Nun}{Giml}{Ayen}{Nun}'
263
+ ( <- '{Giml}{TsveyYudn}' )
264
+
265
+ // -(ge)numen (shortened to -numen after prefixes) => -nem
266
+ '{Nun}{Vov}{Mem}{Ayen}{Nun}'
267
+ (<- '{Nun}{Ayen}{Mem}' )
268
+
269
+ // -(ge)shribn (shortened to -shribn after prefixes) => -shrayb
270
+ '{Shin}{Reysh}{Yud}{Beys}{Nun}'
271
+ (<- '{Shin}{Reysh}{TsveyYudn}{Beys}' )
272
+
273
+ // -gemiten => -mayd
274
+ 'GE{Mem}{Yud}{Tes}{Nun}'
275
+ (<- '{Mem}{TsveyYudn}{Dalet}')
276
+
277
+ // -gebiten => -bayt
278
+ 'GE{Beys}{Yud}{Tes}{Nun}'
279
+ (<- '{Beys}{TsveyYudn}{Tes}')
280
+
281
+ // -gebisen => -bays
282
+ 'GE{Beys}{Yud}{Samekh}{Nun}'
283
+ ( <- '{Beys}{TsveyYudn}{Samekh}')
284
+
285
+ // -gevizen => -vayz
286
+ '{TsveyVovn}{Yud}{Zayen}{Nun}'
287
+ ( <- '{TsveyVovn}{TsveyYudn}{Zayen}')
288
+
289
+ // -getriben => -trayb
290
+ '{Tes}{Reysh}{Yud}{Beys}{Nun}'
291
+ ( <- '{Tes}{Reysh}{TsveyYudn}{Beys}')
292
+
293
+ // -geliten => -layt
294
+ 'GE{Lamed}{Yud}{Tes}{Nun}'
295
+ ( <- '{Lamed}{TsveyYudn}{Tes}')
296
+
297
+ // -gekliben => -klayb
298
+ '{Kuf}{Lamed}{Yud}{Beys}{Nun}'
299
+ ( <- '{Kuf}{Lamed}{TsveyYudn}{Beys}')
300
+
301
+ // -geriben => -rayb
302
+ '{Reysh}{Yud}{Beys}{Nun}'
303
+ ( <- '{Reysh}{TsveyYudn}{Beys}')
304
+
305
+ // -gerisen => -rays
306
+ 'GE{Reysh}{Yud}{Samekh}{Nun}'
307
+ ( <- '{Reysh}{TsveyYudn}{Samekh}')
308
+
309
+ // -geshvigen => -shvayg
310
+ '{Shin}{TsveyVovn}{Yud}{Giml}{Nun}'
311
+ ( <- '{Shin}{TsveyVovn}{TsveyYudn}{Giml}')
312
+
313
+ // -geshmisen => -shmays
314
+ '{Shin}{Mem}{Yud}{Samekh}{Nun}'
315
+ ( <- '{Shin}{Mem}{TsveyYudn}{Samekh}')
316
+
317
+ // -geshniten => -shnayd
318
+ '{Shin}{Nun}{Yud}{Tes}{Nun}'
319
+ ( <- '{Shin}{Nun}{TsveyYudn}{Dalet}')
320
+
321
+ // -gebunden => -bind
322
+ '{Beys}{Vov}{Nun}{Dalet}{Nun}'
323
+ ( <- '{Beys}{Yud}{Nun}{Dalet}')
324
+
325
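+ // -gevuntshn => -vintsh (NOTE(review): the pattern and replacement below have no {Nun} before {Tes}{Shin}; confirm this is intended)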
326
+ '{TsveyVovn}{Vov}{Tes}{Shin}{Nun}'
327
+ ( <- '{TsveyVovn}{Yud}{Tes}{Shin}')
328
+
329
+ // -gezungen => -zing
330
+ '{Zayen}{Vov}{Nun}{Giml}{Nun}'
331
+ ( <- '{Zayen}{Yud}{Nun}{Giml}')
332
+
333
+ // -getrunken => -trink
334
+ '{Tes}{Reysh}{Vov}{Nun}{Kuf}{Nun}'
335
+ ( <- '{Tes}{Reysh}{Yud}{Nun}{Kuf}')
336
+
337
+ // -getsvungen => -tsving
338
+ '{Tsadek}{TsveyVovn}{Vov}{Nun}{Giml}{Nun}'
339
+ ( <- '{Tsadek}{TsveyVovn}{Yud}{Nun}{Giml}')
340
+
341
+ // -geshlungen => -shling
342
+ '{Shin}{Lamed}{Vov}{Nun}{Giml}{Nun}'
343
+ ( <- '{Shin}{Lamed}{Yud}{Nun}{Giml}')
344
+
345
+ // -geboygen => -beyg
346
+ '{Beys}{VovYud}{Giml}{Nun}'
347
+ ( <- '{Beys}{TsveyYudn}{Giml}')
348
+
349
+ // -gehoyben => -heyb
350
+ '{Hey}{VovYud}{Beys}{Nun}'
351
+ ( <- '{Hey}{TsveyYudn}{Beys}')
352
+
353
+ // -farloyren => -farlir
354
+ '{Fey}{Alef}{Reysh}{Lamed}{VovYud}{Reysh}{Nun}'
355
+ ( <- '{Fey}{Alef}{Reysh}{Lamed}{Yud}{Reysh}')
356
+
357
+ // -shtanen => -shtey
358
+ '{Shin}{Tes}{Alef}{Nun}{Ayen}{Nun}'
359
+ ( <- '{Shin}{Tes}{TsveyYudn}')
360
+
361
+ // -geshvoyrn => -shver
362
+ '{Shin}{TsveyVovn}{VovYud}{Reysh}{Nun}'
363
+ ( <- '{Shin}{TsveyVovn}{Ayen}{Reysh}')
364
+
365
+ // -(ge)brakht (shortened to -brakht after prefixes) => -breng
366
+ '{Beys}{Reysh}{Alef}{Khof}{Tes}'
367
+ (<- '{Beys}{Reysh}{Ayen}{Nun}{Giml}' )
368
+
369
+ // ###### End of irregular past participles
370
+
371
+ // Noun endings: -ung, -hayt, -kayt, -ikayt, -shaft
372
+ '{Vov}{Nun}{Giml}' '{Hey}{TsveyYudn}{Tes}' '{Kuf}{TsveyYudn}{Tes}' '{Yud}{Kuf}{TsveyYudn}{Tes}' '{Shin}{Alef}{Fey}{Tes}'
373
+ ( R1 delete )
374
+
375
+ // Noun endings: -izm, -izmen
376
+ '{Yud}{Zayen}{Mem}' '{Yud}{Zayen}{Mem}{Ayen}{Nun}'
377
+ ( R1 delete )
378
+
379
+ // Plural ending: -im
380
+ '{Yud}{Mem}'
381
+ ( R1 delete )
382
+
383
+ // Plural ending: -os (Hebraic), replace with -h
384
+ '{Vov}{Sof}'
385
+ ( R1 <- '{Hey}' )
386
+
387
+ // Diminutive endings: -elekh, -ele, -lekh, -eles, -elen
388
+ '{Ayen}{Lamed}{Ayen}{Khof}' '{Ayen}{Lamed}{Ayen}' '{Lamed}{Ayen}{Khof}' '{Ayen}{Lamed}{Ayen}{Samekh}' '{Ayen}{Lamed}{Ayen}{Nun}'
389
+ ( R1 delete )
390
+
391
+ // Noun ending: -ist
392
+ '{Yud}{Samekh}{Tes}'
393
+ (
394
+ // Exceptions: -gist, -shist
395
+ ( ('{Giml}' or '{Shin}') try (R1plus3 <- '{Yud}{Samekh}') )
396
+ or
397
+ ( R1 delete )
398
+ )
399
+
400
+ // Noun ending: -istn
401
+ '{Yud}{Samekh}{Tes}{Nun}'
402
+ ( R1 delete )
403
+
404
+ // Verb ending: -stu
405
+ '{Samekh}{Tes}{Vov}'
406
+ ( R1 delete )
407
+
408
+ // Superlative ending: -ster, -ste, -stn
409
+ '{Samekh}{Tes}{Ayen}{Reysh}' '{Samekh}{Tes}{Ayen}' '{Samekh}{Tes}{Nun}'
410
+ ( R1 delete )
411
+
412
+ // Ambiguous verb ending: -st
413
+ '{Samekh}{Tes}'
414
+ ( R1 delete )
415
+ )
416
+ )
417
+
418
+ do (
419
+ [substring] among(
420
+ // Noun endings: -ung, -hayt, -kayt, -ikayt, -shaft
421
+ '{Vov}{Nun}{Giml}' '{Hey}{TsveyYudn}{Tes}' '{Kuf}{TsveyYudn}{Tes}' '{Yud}{Kuf}{TsveyYudn}{Tes}' '{Shin}{Alef}{Fey}{Tes}'
422
+ ( R1 delete )
423
+
424
+ // Diminutive endings: -l
425
+ '{Lamed}'
426
+ ( R1 consonant delete )
427
+ )
428
+ )
429
+
430
+ do (
431
+ [substring] among(
432
+ // Adjective endings: -ig, -ik, -ish, -nik, -dik
433
+ '{Yud}{Giml}' '{Yud}{Kuf}' '{Yud}{Shin}' '{Nun}{Yud}{Kuf}' '{Dalet}{Yud}{Kuf}'
434
+ ( R1 delete )
435
+
436
+ // Exceptions to above: -blik, -glik
437
+ '{Beys}{Lamed}{Yud}{Kuf}' '{Giml}{Lamed}{Yud}{Kuf}'
438
+ ( true )
439
+
440
+ // Present participle endings: -ndik
441
+ '{Nun}{Dalet}{Yud}{Kuf}'
442
+ ( R1 delete )
443
+
444
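+ // Present participle ending -endik: the rule below deletes the whole ending whenever it lies in R1.
445
+ // NOTE(review): the intended rule (delete -endik only after -ng, -nk, -n, -m, consonant+l, or a vowel; otherwise delete just the -ndik part) is not implemented here.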
446
+ '{Ayen}{Nun}{Dalet}{Yud}{Kuf}'
447
+ ( R1 delete )
448
+ )
449
+ )
450
+
451
+ do (repeat goto ( ['GE' or 'TSU'] delete ))
452
+ )
453
+ )
454
+
455
+ define stem as ( // the routine listed under `externals`; runs the three passes in order
456
+ do prelude // rewrite letter pairs and the Langer/Shlos letter variants, strip niked
457
+ do mark_regions // set p1 and tag GE-/TSU- prefixes
458
+ backwards // applies to the suffix pass on the next line
459
+ do standard_suffix // strip or rewrite endings, then delete the 'GE'/'TSU' markers
460
+ )
@@ -0,0 +1,98 @@
1
+ // ISO-8859-2 character mappings.
2
+
3
+ stringdef U+00A0 hex 'A0'
4
+ stringdef U+0104 hex 'A1'
5
+ stringdef U+02D8 hex 'A2'
6
+ stringdef U+0141 hex 'A3'
7
+ stringdef U+00A4 hex 'A4'
8
+ stringdef U+013D hex 'A5'
9
+ stringdef U+015A hex 'A6'
10
+ stringdef U+00A7 hex 'A7'
11
+ stringdef U+00A8 hex 'A8'
12
+ stringdef U+0160 hex 'A9'
13
+ stringdef U+015E hex 'AA'
14
+ stringdef U+0164 hex 'AB'
15
+ stringdef U+0179 hex 'AC'
16
+ stringdef U+00AD hex 'AD'
17
+ stringdef U+017D hex 'AE'
18
+ stringdef U+017B hex 'AF'
19
+ stringdef U+00B0 hex 'B0'
20
+ stringdef U+0105 hex 'B1'
21
+ stringdef U+02DB hex 'B2'
22
+ stringdef U+0142 hex 'B3'
23
+ stringdef U+00B4 hex 'B4'
24
+ stringdef U+013E hex 'B5'
25
+ stringdef U+015B hex 'B6'
26
+ stringdef U+02C7 hex 'B7'
27
+ stringdef U+00B8 hex 'B8'
28
+ stringdef U+0161 hex 'B9'
29
+ stringdef U+015F hex 'BA'
30
+ stringdef U+0165 hex 'BB'
31
+ stringdef U+017A hex 'BC'
32
+ stringdef U+02DD hex 'BD'
33
+ stringdef U+017E hex 'BE'
34
+ stringdef U+017C hex 'BF'
35
+ stringdef U+0154 hex 'C0'
36
+ stringdef U+00C1 hex 'C1'
37
+ stringdef U+00C2 hex 'C2'
38
+ stringdef U+0102 hex 'C3'
39
+ stringdef U+00C4 hex 'C4'
40
+ stringdef U+0139 hex 'C5'
41
+ stringdef U+0106 hex 'C6'
42
+ stringdef U+00C7 hex 'C7'
43
+ stringdef U+010C hex 'C8'
44
+ stringdef U+00C9 hex 'C9'
45
+ stringdef U+0118 hex 'CA'
46
+ stringdef U+00CB hex 'CB'
47
+ stringdef U+011A hex 'CC'
48
+ stringdef U+00CD hex 'CD'
49
+ stringdef U+00CE hex 'CE'
50
+ stringdef U+010E hex 'CF'
51
+ stringdef U+0110 hex 'D0'
52
+ stringdef U+0143 hex 'D1'
53
+ stringdef U+0147 hex 'D2'
54
+ stringdef U+00D3 hex 'D3'
55
+ stringdef U+00D4 hex 'D4'
56
+ stringdef U+0150 hex 'D5'
57
+ stringdef U+00D6 hex 'D6'
58
+ stringdef U+00D7 hex 'D7'
59
+ stringdef U+0158 hex 'D8'
60
+ stringdef U+016E hex 'D9'
61
+ stringdef U+00DA hex 'DA'
62
+ stringdef U+0170 hex 'DB'
63
+ stringdef U+00DC hex 'DC'
64
+ stringdef U+00DD hex 'DD'
65
+ stringdef U+0162 hex 'DE'
66
+ stringdef U+00DF hex 'DF'
67
+ stringdef U+0155 hex 'E0'
68
+ stringdef U+00E1 hex 'E1'
69
+ stringdef U+00E2 hex 'E2'
70
+ stringdef U+0103 hex 'E3'
71
+ stringdef U+00E4 hex 'E4'
72
+ stringdef U+013A hex 'E5'
73
+ stringdef U+0107 hex 'E6'
74
+ stringdef U+00E7 hex 'E7'
75
+ stringdef U+010D hex 'E8'
76
+ stringdef U+00E9 hex 'E9'
77
+ stringdef U+0119 hex 'EA'
78
+ stringdef U+00EB hex 'EB'
79
+ stringdef U+011B hex 'EC'
80
+ stringdef U+00ED hex 'ED'
81
+ stringdef U+00EE hex 'EE'
82
+ stringdef U+010F hex 'EF'
83
+ stringdef U+0111 hex 'F0'
84
+ stringdef U+0144 hex 'F1'
85
+ stringdef U+0148 hex 'F2'
86
+ stringdef U+00F3 hex 'F3'
87
+ stringdef U+00F4 hex 'F4'
88
+ stringdef U+0151 hex 'F5'
89
+ stringdef U+00F6 hex 'F6'
90
+ stringdef U+00F7 hex 'F7'
91
+ stringdef U+0159 hex 'F8'
92
+ stringdef U+016F hex 'F9'
93
+ stringdef U+00FA hex 'FA'
94
+ stringdef U+0171 hex 'FB'
95
+ stringdef U+00FC hex 'FC'
96
+ stringdef U+00FD hex 'FD'
97
+ stringdef U+0163 hex 'FE'
98
+ stringdef U+02D9 hex 'FF'
@@ -0,0 +1,74 @@
1
+ // KOI8-R character mappings.
2
+
3
+ stringdef U+00A0 hex '9A'
4
+ stringdef U+00A9 hex 'BF'
5
+ stringdef U+00B0 hex '9C'
6
+ stringdef U+00B2 hex '9D'
7
+ stringdef U+00B7 hex '9E'
8
+ stringdef U+00F7 hex '9F'
9
+ stringdef U+0401 hex 'B3'
10
+ stringdef U+0410 hex 'E1'
11
+ stringdef U+0411 hex 'E2'
12
+ stringdef U+0412 hex 'F7'
13
+ stringdef U+0413 hex 'E7'
14
+ stringdef U+0414 hex 'E4'
15
+ stringdef U+0415 hex 'E5'
16
+ stringdef U+0416 hex 'F6'
17
+ stringdef U+0417 hex 'FA'
18
+ stringdef U+0418 hex 'E9'
19
+ stringdef U+0419 hex 'EA'
20
+ stringdef U+041A hex 'EB'
21
+ stringdef U+041B hex 'EC'
22
+ stringdef U+041C hex 'ED'
23
+ stringdef U+041D hex 'EE'
24
+ stringdef U+041E hex 'EF'
25
+ stringdef U+041F hex 'F0'
26
+ stringdef U+0420 hex 'F2'
27
+ stringdef U+0421 hex 'F3'
28
+ stringdef U+0422 hex 'F4'
29
+ stringdef U+0423 hex 'F5'
30
+ stringdef U+0424 hex 'E6'
31
+ stringdef U+0425 hex 'E8'
32
+ stringdef U+0426 hex 'E3'
33
+ stringdef U+0427 hex 'FE'
34
+ stringdef U+0428 hex 'FB'
35
+ stringdef U+0429 hex 'FD'
36
+ stringdef U+042A hex 'FF'
37
+ stringdef U+042B hex 'F9'
38
+ stringdef U+042C hex 'F8'
39
+ stringdef U+042D hex 'FC'
40
+ stringdef U+042E hex 'E0'
41
+ stringdef U+042F hex 'F1'
42
+ stringdef U+0430 hex 'C1'
43
+ stringdef U+0431 hex 'C2'
44
+ stringdef U+0432 hex 'D7'
45
+ stringdef U+0433 hex 'C7'
46
+ stringdef U+0434 hex 'C4'
47
+ stringdef U+0435 hex 'C5'
48
+ stringdef U+0436 hex 'D6'
49
+ stringdef U+0437 hex 'DA'
50
+ stringdef U+0438 hex 'C9'
51
+ stringdef U+0439 hex 'CA'
52
+ stringdef U+043A hex 'CB'
53
+ stringdef U+043B hex 'CC'
54
+ stringdef U+043C hex 'CD'
55
+ stringdef U+043D hex 'CE'
56
+ stringdef U+043E hex 'CF'
57
+ stringdef U+043F hex 'D0'
58
+ stringdef U+0440 hex 'D2'
59
+ stringdef U+0441 hex 'D3'
60
+ stringdef U+0442 hex 'D4'
61
+ stringdef U+0443 hex 'D5'
62
+ stringdef U+0444 hex 'C6'
63
+ stringdef U+0445 hex 'C8'
64
+ stringdef U+0446 hex 'C3'
65
+ stringdef U+0447 hex 'DE'
66
+ stringdef U+0448 hex 'DB'
67
+ stringdef U+0449 hex 'DD'
68
+ stringdef U+044A hex 'DF'
69
+ stringdef U+044B hex 'D9'
70
+ stringdef U+044C hex 'D8'
71
+ stringdef U+044D hex 'DC'
72
+ stringdef U+044E hex 'C0'
73
+ stringdef U+044F hex 'D1'
74
+ stringdef U+0451 hex 'A3'