mittens 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (137)
  1. checksums.yaml +7 -0
  2. data/CHANGELOG.md +3 -0
  3. data/Gemfile +7 -0
  4. data/LICENSE.txt +30 -0
  5. data/README.md +62 -0
  6. data/Rakefile +21 -0
  7. data/ext/mittens/ext.c +96 -0
  8. data/ext/mittens/extconf.rb +12 -0
  9. data/lib/mittens/version.rb +3 -0
  10. data/lib/mittens.rb +7 -0
  11. data/mittens.gemspec +22 -0
  12. data/vendor/snowball/.gitignore +26 -0
  13. data/vendor/snowball/.travis.yml +112 -0
  14. data/vendor/snowball/AUTHORS +27 -0
  15. data/vendor/snowball/CONTRIBUTING.rst +216 -0
  16. data/vendor/snowball/COPYING +29 -0
  17. data/vendor/snowball/GNUmakefile +742 -0
  18. data/vendor/snowball/NEWS +754 -0
  19. data/vendor/snowball/README.rst +37 -0
  20. data/vendor/snowball/ada/README.md +74 -0
  21. data/vendor/snowball/ada/generate/generate.adb +83 -0
  22. data/vendor/snowball/ada/generate.gpr +21 -0
  23. data/vendor/snowball/ada/src/stemmer.adb +620 -0
  24. data/vendor/snowball/ada/src/stemmer.ads +219 -0
  25. data/vendor/snowball/ada/src/stemwords.adb +70 -0
  26. data/vendor/snowball/ada/stemmer_config.gpr +83 -0
  27. data/vendor/snowball/ada/stemwords.gpr +21 -0
  28. data/vendor/snowball/algorithms/arabic.sbl +558 -0
  29. data/vendor/snowball/algorithms/armenian.sbl +301 -0
  30. data/vendor/snowball/algorithms/basque.sbl +149 -0
  31. data/vendor/snowball/algorithms/catalan.sbl +202 -0
  32. data/vendor/snowball/algorithms/danish.sbl +93 -0
  33. data/vendor/snowball/algorithms/dutch.sbl +164 -0
  34. data/vendor/snowball/algorithms/english.sbl +229 -0
  35. data/vendor/snowball/algorithms/finnish.sbl +197 -0
  36. data/vendor/snowball/algorithms/french.sbl +254 -0
  37. data/vendor/snowball/algorithms/german.sbl +139 -0
  38. data/vendor/snowball/algorithms/german2.sbl +145 -0
  39. data/vendor/snowball/algorithms/greek.sbl +701 -0
  40. data/vendor/snowball/algorithms/hindi.sbl +323 -0
  41. data/vendor/snowball/algorithms/hungarian.sbl +241 -0
  42. data/vendor/snowball/algorithms/indonesian.sbl +192 -0
  43. data/vendor/snowball/algorithms/irish.sbl +149 -0
  44. data/vendor/snowball/algorithms/italian.sbl +202 -0
  45. data/vendor/snowball/algorithms/kraaij_pohlmann.sbl +240 -0
  46. data/vendor/snowball/algorithms/lithuanian.sbl +373 -0
  47. data/vendor/snowball/algorithms/lovins.sbl +208 -0
  48. data/vendor/snowball/algorithms/nepali.sbl +92 -0
  49. data/vendor/snowball/algorithms/norwegian.sbl +80 -0
  50. data/vendor/snowball/algorithms/porter.sbl +139 -0
  51. data/vendor/snowball/algorithms/portuguese.sbl +218 -0
  52. data/vendor/snowball/algorithms/romanian.sbl +236 -0
  53. data/vendor/snowball/algorithms/russian.sbl +221 -0
  54. data/vendor/snowball/algorithms/serbian.sbl +2379 -0
  55. data/vendor/snowball/algorithms/spanish.sbl +230 -0
  56. data/vendor/snowball/algorithms/swedish.sbl +72 -0
  57. data/vendor/snowball/algorithms/tamil.sbl +405 -0
  58. data/vendor/snowball/algorithms/turkish.sbl +470 -0
  59. data/vendor/snowball/algorithms/yiddish.sbl +460 -0
  60. data/vendor/snowball/charsets/ISO-8859-2.sbl +98 -0
  61. data/vendor/snowball/charsets/KOI8-R.sbl +74 -0
  62. data/vendor/snowball/charsets/cp850.sbl +130 -0
  63. data/vendor/snowball/compiler/analyser.c +1547 -0
  64. data/vendor/snowball/compiler/driver.c +615 -0
  65. data/vendor/snowball/compiler/generator.c +1748 -0
  66. data/vendor/snowball/compiler/generator_ada.c +1702 -0
  67. data/vendor/snowball/compiler/generator_csharp.c +1322 -0
  68. data/vendor/snowball/compiler/generator_go.c +1278 -0
  69. data/vendor/snowball/compiler/generator_java.c +1313 -0
  70. data/vendor/snowball/compiler/generator_js.c +1316 -0
  71. data/vendor/snowball/compiler/generator_pascal.c +1387 -0
  72. data/vendor/snowball/compiler/generator_python.c +1337 -0
  73. data/vendor/snowball/compiler/generator_rust.c +1295 -0
  74. data/vendor/snowball/compiler/header.h +418 -0
  75. data/vendor/snowball/compiler/space.c +286 -0
  76. data/vendor/snowball/compiler/syswords.h +86 -0
  77. data/vendor/snowball/compiler/syswords2.h +13 -0
  78. data/vendor/snowball/compiler/tokeniser.c +567 -0
  79. data/vendor/snowball/csharp/.gitignore +8 -0
  80. data/vendor/snowball/csharp/Snowball/Algorithms/.gitignore +1 -0
  81. data/vendor/snowball/csharp/Snowball/Among.cs +108 -0
  82. data/vendor/snowball/csharp/Snowball/AssemblyInfo.cs +36 -0
  83. data/vendor/snowball/csharp/Snowball/Stemmer.cs +660 -0
  84. data/vendor/snowball/csharp/Stemwords/App.config +6 -0
  85. data/vendor/snowball/csharp/Stemwords/Program.cs +114 -0
  86. data/vendor/snowball/doc/TODO +12 -0
  87. data/vendor/snowball/doc/libstemmer_c_README +148 -0
  88. data/vendor/snowball/doc/libstemmer_csharp_README +53 -0
  89. data/vendor/snowball/doc/libstemmer_java_README +67 -0
  90. data/vendor/snowball/doc/libstemmer_js_README +48 -0
  91. data/vendor/snowball/doc/libstemmer_python_README +113 -0
  92. data/vendor/snowball/examples/stemwords.c +204 -0
  93. data/vendor/snowball/go/README.md +55 -0
  94. data/vendor/snowball/go/among.go +16 -0
  95. data/vendor/snowball/go/env.go +403 -0
  96. data/vendor/snowball/go/stemwords/generate.go +68 -0
  97. data/vendor/snowball/go/stemwords/main.go +68 -0
  98. data/vendor/snowball/go/util.go +34 -0
  99. data/vendor/snowball/iconv.py +50 -0
  100. data/vendor/snowball/include/libstemmer.h +78 -0
  101. data/vendor/snowball/java/org/tartarus/snowball/Among.java +29 -0
  102. data/vendor/snowball/java/org/tartarus/snowball/SnowballProgram.java +381 -0
  103. data/vendor/snowball/java/org/tartarus/snowball/SnowballStemmer.java +8 -0
  104. data/vendor/snowball/java/org/tartarus/snowball/TestApp.java +75 -0
  105. data/vendor/snowball/javascript/base-stemmer.js +294 -0
  106. data/vendor/snowball/javascript/stemwords.js +106 -0
  107. data/vendor/snowball/libstemmer/libstemmer_c.in +96 -0
  108. data/vendor/snowball/libstemmer/mkalgorithms.pl +90 -0
  109. data/vendor/snowball/libstemmer/mkmodules.pl +267 -0
  110. data/vendor/snowball/libstemmer/modules.txt +63 -0
  111. data/vendor/snowball/libstemmer/test.c +34 -0
  112. data/vendor/snowball/pascal/.gitignore +4 -0
  113. data/vendor/snowball/pascal/SnowballProgram.pas +430 -0
  114. data/vendor/snowball/pascal/generate.pl +23 -0
  115. data/vendor/snowball/pascal/stemwords-template.dpr +78 -0
  116. data/vendor/snowball/python/MANIFEST.in +7 -0
  117. data/vendor/snowball/python/create_init.py +54 -0
  118. data/vendor/snowball/python/setup.cfg +6 -0
  119. data/vendor/snowball/python/setup.py +81 -0
  120. data/vendor/snowball/python/snowballstemmer/among.py +13 -0
  121. data/vendor/snowball/python/snowballstemmer/basestemmer.py +323 -0
  122. data/vendor/snowball/python/stemwords.py +101 -0
  123. data/vendor/snowball/python/testapp.py +28 -0
  124. data/vendor/snowball/runtime/api.c +58 -0
  125. data/vendor/snowball/runtime/api.h +32 -0
  126. data/vendor/snowball/runtime/header.h +61 -0
  127. data/vendor/snowball/runtime/utilities.c +513 -0
  128. data/vendor/snowball/rust/Cargo.toml +7 -0
  129. data/vendor/snowball/rust/build.rs +55 -0
  130. data/vendor/snowball/rust/rust-pre-1.27-compat.patch +30 -0
  131. data/vendor/snowball/rust/src/main.rs +102 -0
  132. data/vendor/snowball/rust/src/snowball/algorithms/mod.rs +2 -0
  133. data/vendor/snowball/rust/src/snowball/among.rs +6 -0
  134. data/vendor/snowball/rust/src/snowball/mod.rs +6 -0
  135. data/vendor/snowball/rust/src/snowball/snowball_env.rs +421 -0
  136. data/vendor/snowball/tests/stemtest.c +95 -0
  137. metadata +178 -0
@@ -0,0 +1,208 @@
1
+
2
+ stringescapes {}
3
+
4
+ routines (
5
+ A B C D E F G H I J K L M N O P Q R S T U V W X Y Z AA BB CC
6
+
7
+ endings
8
+
9
+ undouble respell
10
+ )
11
+
12
+ externals ( stem )
13
+
14
+ backwardmode (
15
+
16
+ /* Lovins' conditions A, B ... CC, as given in her Appendix B, where
17
+ a test for a two letter prefix ('test hop 2') is implicitly
18
+ assumed. Note that 'e' next 'u' corresponds to her u*e because
19
+ Snowball is scanning backwards. */
20
+
21
+ define A as ( hop 2 ) // at least 2 letters must precede the ending
22
+ define B as ( hop 3 ) // at least 3 letters must precede the ending
23
+ define C as ( hop 4 ) // at least 4 letters must precede the ending
24
+ define D as ( hop 5 ) // at least 5 letters must precede the ending
25
+ define E as ( test hop 2 not 'e' ) // >= 2 letters, ending not directly after 'e'
26
+ define F as ( test hop 3 not 'e' ) // >= 3 letters, ending not directly after 'e'
27
+ define G as ( test hop 3 'f' ) // >= 3 letters, ending only directly after 'f'
28
+ define H as ( test hop 2 't' or 'll' ) // only after 't' or 'll'
29
+ define I as ( test hop 2 not 'o' not 'e' ) // not after 'o' or 'e'
30
+ define J as ( test hop 2 not 'a' not 'e' ) // not after 'a' or 'e'
31
+ define K as ( test hop 3 'l' or 'i' or ('e' next 'u') ) // >= 3 letters; only after 'l', 'i', or u + any letter + e
32
+ define L as ( test hop 2 not 'u' not 'x' not ('s' not 'o') ) // not after 'u', 'x', or an 's' that does not itself follow 'o'
33
+ define M as ( test hop 2 not 'a' not 'c' not 'e' not 'm' ) // not after 'a', 'c', 'e' or 'm'
34
+ define N as ( test hop 3 ( hop 2 not 's' or hop 2 ) ) // >= 3 letters; >= 4 when the third letter from the end is 's'
35
+ define O as ( test hop 2 'l' or 'i' ) // only after 'l' or 'i'
36
+ define P as ( test hop 2 not 'c' ) // not after 'c'
37
+ define Q as ( test hop 2 test hop 3 not 'l' not 'n' ) // >= 3 letters, not after 'l' or 'n'
38
+ define R as ( test hop 2 'n' or 'r' ) // only after 'n' or 'r'
39
+ define S as ( test hop 2 'dr' or ('t' not 't') ) // only after 'dr', or after a 't' that does not itself follow 't'
40
+ define T as ( test hop 2 's' or ('t' not 'o') ) // only after 's', or after a 't' that does not itself follow 'o'
41
+ define U as ( test hop 2 'l' or 'm' or 'n' or 'r' ) // only after 'l', 'm', 'n' or 'r'
42
+ define V as ( test hop 2 'c' ) // only after 'c'
43
+ define W as ( test hop 2 not 's' not 'u' ) // not after 's' or 'u'
44
+ define X as ( test hop 2 'l' or 'i' or ('e' next 'u') ) // only after 'l', 'i', or u + any letter + e
45
+ define Y as ( test hop 2 'in' ) // only after 'in'
46
+ define Z as ( test hop 2 not 'f' ) // not after 'f'
47
+ define AA as ( test hop 2 among ( 'd' 'f' 'ph' 'th' 'l' 'er' 'or'
48
+ 'es' 't' ) ) // AA: only after one of the letter groups listed in this among
49
+ define BB as ( test hop 3 not 'met' not 'ryst' ) // >= 3 letters, not after 'met' or 'ryst'
50
+ define CC as ( test hop 2 'l' ) // only after 'l'
51
+
52
+
53
+ /* The system of endings, as given in Appendix A. */
54
+
55
+ define endings as ( // delete the longest listed ending whose condition routine (A..CC) succeeds
56
+ [substring] among( // each string is followed by the name of the routine that must succeed for it to match
57
+ 'alistically' B 'arizability' A 'izationally' B
58
+
59
+ 'antialness' A 'arisations' A 'arizations' A 'entialness' A
60
+
61
+ 'allically' C 'antaneous' A 'antiality' A 'arisation' A
62
+ 'arization' A 'ationally' B 'ativeness' A 'eableness' E
63
+ 'entations' A 'entiality' A 'entialize' A 'entiation' A
64
+ 'ionalness' A 'istically' A 'itousness' A 'izability' A
65
+ 'izational' A
66
+
67
+ 'ableness' A 'arizable' A 'entation' A 'entially' A
68
+ 'eousness' A 'ibleness' A 'icalness' A 'ionalism' A
69
+ 'ionality' A 'ionalize' A 'iousness' A 'izations' A
70
+ 'lessness' A
71
+
72
+ 'ability' A 'aically' A 'alistic' B 'alities' A
73
+ 'ariness' E 'aristic' A 'arizing' A 'ateness' A
74
+ 'atingly' A 'ational' B 'atively' A 'ativism' A
75
+ 'elihood' E 'encible' A 'entally' A 'entials' A
76
+ 'entiate' A 'entness' A 'fulness' A 'ibility' A
77
+ 'icalism' A 'icalist' A 'icality' A 'icalize' A
78
+ 'ication' G 'icianry' A 'ination' A 'ingness' A
79
+ 'ionally' A 'isation' A 'ishness' A 'istical' A
80
+ 'iteness' A 'iveness' A 'ivistic' A 'ivities' A
81
+ 'ization' F 'izement' A 'oidally' A 'ousness' A
82
+
83
+ 'aceous' A 'acious' B 'action' G 'alness' A
84
+ 'ancial' A 'ancies' A 'ancing' B 'ariser' A
85
+ 'arized' A 'arizer' A 'atable' A 'ations' B
86
+ 'atives' A 'eature' Z 'efully' A 'encies' A
87
+ 'encing' A 'ential' A 'enting' C 'entist' A
88
+ 'eously' A 'ialist' A 'iality' A 'ialize' A
89
+ 'ically' A 'icance' A 'icians' A 'icists' A
90
+ 'ifully' A 'ionals' A 'ionate' D 'ioning' A
91
+ 'ionist' A 'iously' A 'istics' A 'izable' E
92
+ 'lessly' A 'nesses' A 'oidism' A
93
+
94
+ 'acies' A 'acity' A 'aging' B 'aical' A
95
+ 'alist' A 'alism' B 'ality' A 'alize' A
96
+ 'allic'BB 'anced' B 'ances' B 'antic' C
97
+ 'arial' A 'aries' A 'arily' A 'arity' B
98
+ 'arize' A 'aroid' A 'ately' A 'ating' I
99
+ 'ation' B 'ative' A 'ators' A 'atory' A
100
+ 'ature' E 'early' Y 'ehood' A 'eless' A
101
+ 'elity' A 'ement' A 'enced' A 'ences' A
102
+ 'eness' E 'ening' E 'ental' A 'ented' C
103
+ 'ently' A 'fully' A 'ially' A 'icant' A
104
+ 'ician' A 'icide' A 'icism' A 'icist' A
105
+ 'icity' A 'idine' I 'iedly' A 'ihood' A
106
+ 'inate' A 'iness' A 'ingly' B 'inism' J
107
+ 'inity'CC 'ional' A 'ioned' A 'ished' A
108
+ 'istic' A 'ities' A 'itous' A 'ively' A
109
+ 'ivity' A 'izers' F 'izing' F 'oidal' A
110
+ 'oides' A 'otide' A 'ously' A
111
+
112
+ 'able' A 'ably' A 'ages' B 'ally' B
113
+ 'ance' B 'ancy' B 'ants' B 'aric' A
114
+ 'arly' K 'ated' I 'ates' A 'atic' B
115
+ 'ator' A 'ealy' Y 'edly' E 'eful' A
116
+ 'eity' A 'ence' A 'ency' A 'ened' E
117
+ 'enly' E 'eous' A 'hood' A 'ials' A
118
+ 'ians' A 'ible' A 'ibly' A 'ical' A
119
+ 'ides' L 'iers' A 'iful' A 'ines' M
120
+ 'ings' N 'ions' B 'ious' A 'isms' B
121
+ 'ists' A 'itic' H 'ized' F 'izer' F
122
+ 'less' A 'lily' A 'ness' A 'ogen' A
123
+ 'ward' A 'wise' A 'ying' B 'yish' A
124
+
125
+ 'acy' A 'age' B 'aic' A 'als'BB
126
+ 'ant' B 'ars' O 'ary' F 'ata' A
127
+ 'ate' A 'eal' Y 'ear' Y 'ely' E
128
+ 'ene' E 'ent' C 'ery' E 'ese' A
129
+ 'ful' A 'ial' A 'ian' A 'ics' A
130
+ 'ide' L 'ied' A 'ier' A 'ies' P
131
+ 'ily' A 'ine' M 'ing' N 'ion' Q
132
+ 'ish' C 'ism' B 'ist' A 'ite'AA
133
+ 'ity' A 'ium' A 'ive' A 'ize' F
134
+ 'oid' A 'one' R 'ous' A
135
+
136
+ 'ae' A 'al'BB 'ar' X 'as' B
137
+ 'ed' E 'en' F 'es' E 'ia' A
138
+ 'ic' A 'is' A 'ly' B 'on' S
139
+ 'or' T 'um' U 'us' V 'yl' R
140
+ '{'}s' A 's{'}' A
141
+
142
+ 'a' A 'e' A 'i' A 'o' A
143
+ 's' W 'y' B
144
+
145
+ (delete) // the one action shared by every ending listed above
146
+ )
147
+ )
148
+
149
+ /* Undoubling is rule 1 of appendix C. */
150
+
151
+ define undouble as ( // if the word ends in one of the doubled letters below, drop the final letter
152
+ test substring among ('bb' 'dd' 'gg' 'll' 'mm' 'nn' 'pp' 'rr' 'ss'
153
+ 'tt') // 'test' restores the cursor, so nothing is consumed by the match
154
+ [next] delete // slice exactly one character (the last letter) and remove it
155
+ )
156
+
157
+ /* The other appendix C rules can be done together. */
158
+
159
+ define respell as ( // rewrite the longest matching stem ending with its replacement
160
+ [substring] among (
161
+ 'iev' (<-'ief')
162
+ 'uct' (<-'uc')
163
+ 'umpt' (<-'um')
164
+ 'rpt' (<-'rb')
165
+ 'urs' (<-'ur')
166
+ 'istr' (<-'ister')
167
+ 'metr' (<-'meter')
168
+ 'olv' (<-'olut')
169
+ 'ul' (not 'a' not 'i' not 'o' <-'l') // only when 'ul' is not preceded by a, i or o
170
+ 'bex' (<-'bic')
171
+ 'dex' (<-'dic')
172
+ 'pex' (<-'pic')
173
+ 'tex' (<-'tic')
174
+ 'ax' (<-'ac')
175
+ 'ex' (<-'ec')
176
+ 'ix' (<-'ic')
177
+ 'lux' (<-'luc')
178
+ 'uad' (<-'uas')
179
+ 'vad' (<-'vas')
180
+ 'cid' (<-'cis')
181
+ 'lid' (<-'lis')
182
+ 'erid' (<-'eris')
183
+ 'pand' (<-'pans')
184
+ 'end' (not 's' <-'ens') // only when 'end' is not preceded by s
185
+ 'ond' (<-'ons')
186
+ 'lud' (<-'lus')
187
+ 'rud' (<-'rus')
188
+ 'her' (not 'p' not 't' <-'hes') // only when 'her' is not preceded by p or t
189
+ 'mit' (<-'mis')
190
+ 'ent' (not 'm' <-'ens') // only when 'ent' is not preceded by m
191
+ /* 'ent' was 'end' in the 1968 paper - a typo. */
192
+ 'ert' (<-'ers')
193
+ 'et' (not 'n' <-'es') // only when 'et' is not preceded by n
194
+ 'yt' (<-'ys')
195
+ 'yz' (<-'ys')
196
+ )
197
+ )
198
+ )
199
+
200
+ define stem as ( // external entry point: run the three passes from the end of the word
201
+
202
+ backwards (
203
+ do endings // 'do' ignores failure, so each pass is optional
204
+ do undouble
205
+ do respell
206
+ )
207
+ )
208
+
@@ -0,0 +1,92 @@
1
+ /*
2
+ * Authors:
3
+ * - Ingroj Shrestha <ing.stha@gmail.com>, Nepali NLP Group
4
+ * - Oleg Bartunov <obartunov@gmail.com>, Postgres Professional Ltd.
5
+ * - Shreeya Singh Dhakal, Nepali NLP Group
6
+ */
7
+
8
+ routines (
9
+ remove_category_1
10
+ check_category_2
11
+ remove_category_2
12
+ remove_category_3
13
+ )
14
+
15
+ stringescapes {}
16
+
17
+ stringdef dsc '{U+0901}' // DEVANAGARI_SIGN_CANDRABINDU
18
+ stringdef dsa '{U+0902}' // DEVANAGARI_SIGN_ANUSVARA
19
+ stringdef dli '{U+0907}' // DEVANAGARI_LETTER_I
20
+ stringdef dlii '{U+0908}' // DEVANAGARI_LETTER_II
21
+ stringdef dle '{U+090F}' // DEVANAGARI_LETTER_E
22
+ stringdef dlka '{U+0915}' // DEVANAGARI_LETTER_KA
23
+ stringdef dlkha '{U+0916}' // DEVANAGARI_LETTER_KHA
24
+ stringdef dlg '{U+0917}' // DEVANAGARI_LETTER_GA
25
+ stringdef dlc '{U+091B}' // DEVANAGARI_LETTER_CHA
26
+ stringdef dlta '{U+0924}' // DEVANAGARI_LETTER_TA
27
+ stringdef dltha '{U+0925}' // DEVANAGARI_LETTER_THA
28
+ stringdef dld '{U+0926}' // DEVANAGARI_LETTER_DA
29
+ stringdef dln '{U+0928}' // DEVANAGARI_LETTER_NA
30
+ stringdef dlpa '{U+092A}' // DEVANAGARI_LETTER_PA
31
+ stringdef dlpha '{U+092B}' // DEVANAGARI_LETTER_PHA
32
+ stringdef dlb '{U+092D}' // DEVANAGARI_LETTER_BHA
33
+ stringdef dlm '{U+092E}' // DEVANAGARI_LETTER_MA
34
+ stringdef dly '{U+092F}' // DEVANAGARI_LETTER_YA
35
+ stringdef dlr '{U+0930}' // DEVANAGARI_LETTER_RA
36
+ stringdef dll '{U+0932}' // DEVANAGARI_LETTER_LA
37
+ stringdef dlv '{U+0935}' // DEVANAGARI_LETTER_VA
38
+ stringdef dls '{U+0938}' // DEVANAGARI_LETTER_SA
39
+ stringdef dlh '{U+0939}' // DEVANAGARI_LETTER_HA
40
+ stringdef dvsaa '{U+093E}' // DEVANAGARI_VOWEL_SIGN_AA
41
+ stringdef dvsi '{U+093F}' // DEVANAGARI_VOWEL_SIGN_I
42
+ stringdef dvsii '{U+0940}' // DEVANAGARI_VOWEL_SIGN_II
43
+ stringdef dvsu '{U+0941}' // DEVANAGARI_VOWEL_SIGN_U
44
+ stringdef dvsuu '{U+0942}' // DEVANAGARI_VOWEL_SIGN_UU
45
+ stringdef dvse '{U+0947}' // DEVANAGARI_VOWEL_SIGN_E
46
+ stringdef dvsai '{U+0948}' // DEVANAGARI_VOWEL_SIGN_AI
47
+ stringdef dvso '{U+094B}' // DEVANAGARI_VOWEL_SIGN_O
48
+ stringdef dvsau '{U+094C}' // DEVANAGARI_VOWEL_SIGN_AU
49
+ stringdef dsv '{U+094D}' // DEVANAGARI_SIGN_VIRAMA
50
+
51
+ externals ( stem )
52
+ backwardmode (
53
+ define remove_category_1 as( // strip one of the suffixes listed below from the end of the word
54
+ [substring] among (
55
+ '{dlm}{dvsaa}{dlr}{dsv}{dlpha}{dlta}' '{dld}{dsv}{dlv}{dvsaa}{dlr}{dvsaa}' '{dls}{dsc}{dlg}{dvsai}' '{dls}{dsa}{dlg}'
56
+ '{dls}{dsc}{dlg}' '{dll}{dvsaa}{dli}' '{dll}{dvsaa}{dlii}' '{dlpa}{dlc}{dvsi}'
57
+ '{dll}{dvse}' '{dlr}{dlta}' '{dlm}{dvsai}' '{dlm}{dvsaa}'
58
+ (delete) // the suffixes above are always deleted
59
+ '{dlka}{dvso}' '{dlka}{dvsaa}' '{dlka}{dvsi}' '{dlka}{dvsii}' '{dlka}{dvsai}'(('{dle}' or '{dvse}' ()) or delete) // KA suffixes: left in place when preceded by letter E or vowel sign E, otherwise deleted
60
+ )
61
+ )
62
+
63
+ define check_category_2 as( // succeeds when the word ends in candrabindu, anusvara or vowel sign AI; changes nothing
64
+ [substring] among(
65
+ '{dsc}' '{dsa}' '{dvsai}'
66
+ )
67
+ )
68
+
69
+ define remove_category_2 as ( // delete a final sign only when the text before it matches one of the listed contexts
70
+ [substring] among(
71
+ '{dsc}' '{dsa}' ('{dly}{dvsau}' or '{dlc}{dvsau}' or '{dln}{dvsau}' or '{dltha}{dvse}' delete) // candrabindu/anusvara: removed only after YA+AU, CHA+AU, NA+AU or THA+E
72
+ '{dvsai}' ('{dlta}{dsv}{dlr}' delete) // vowel sign AI: removed only after TA + virama + RA
73
+ )
74
+ )
75
+
76
+ define remove_category_3 as( // delete the longest suffix from the list below found at the end of the word
77
+ [substring] among(
78
+ '{dltha}{dvsi}{dli}{dls}{dsv}' '{dlh}{dvsu}{dln}{dvse}{dlc}' '{dlh}{dvsu}{dln}{dsv}{dlc}' '{dln}{dvse}{dlc}{dls}{dsv}' '{dln}{dvse}{dlc}{dln}{dsv}' '{dli}{dle}{dlka}{dvsii}' '{dli}{dle}{dlka}{dvsaa}' '{dli}{dle}{dlka}{dvso}' '{dvsi}{dle}{dlka}{dvsii}' '{dvsi}{dle}{dlka}{dvsaa}' '{dvsi}{dle}{dlka}{dvso}' '{dli}{dlc}{dln}{dsv}' '{dvsi}{dlc}{dln}{dsv}' '{dli}{dlc}{dls}{dsv}' '{dvsi}{dlc}{dls}{dsv}' '{dle}{dlc}{dln}{dsv}' '{dvse}{dlc}{dln}{dsv}' '{dle}{dlc}{dls}{dsv}' '{dvse}{dlc}{dls}{dsv}' '{dlc}{dvsi}{dln}{dsv}' '{dlc}{dvse}{dls}{dsv}' '{dlc}{dsv}{dly}{dvsau}' '{dltha}{dvsi}{dln}{dsv}' '{dltha}{dvsi}{dly}{dvso}' '{dltha}{dvsi}{dly}{dvsau}' '{dltha}{dvsi}{dls}{dsv}' '{dltha}{dsv}{dly}{dvso}' '{dltha}{dsv}{dly}{dvsau}' '{dld}{dvsi}{dly}{dvso}' '{dld}{dvse}{dlkha}{dvsi}' '{dld}{dvse}{dlkha}{dvsii}' '{dll}{dvsaa}{dln}{dsv}' '{dlm}{dvsaa}{dltha}{dvsi}' '{dln}{dvse}{dlka}{dvsai}' '{dln}{dvse}{dlka}{dvsaa}' '{dln}{dvse}{dlka}{dvso}' '{dln}{dvse}{dlc}{dvsau}' '{dlh}{dvso}{dls}{dsv}' '{dli}{dln}{dsv}{dlc}' '{dvsi}{dln}{dsv}{dlc}' '{dln}{dvse}{dlc}{dvsu}' '{dli}{dlc}{dvsau}' '{dvsi}{dlc}{dvsau}' '{dli}{dls}{dsv}' '{dvsi}{dls}{dsv}' '{dvsi}{dly}{dvso}' '{dli}{dly}{dvso}' '{dle}{dlka}{dvsaa}' '{dvse}{dlka}{dvsaa}' '{dle}{dlka}{dvsii}' '{dvse}{dlka}{dvsii}' '{dle}{dlka}{dvsai}' '{dvse}{dlka}{dvsai}' '{dle}{dlka}{dvso}' '{dvse}{dlka}{dvso}' '{dle}{dlc}{dvsu}' '{dvse}{dlc}{dvsu}' '{dle}{dlc}{dvsau}' '{dvse}{dlc}{dvsau}' '{dlc}{dln}{dsv}' '{dlc}{dls}{dsv}' '{dltha}{dvsi}{dle}' '{dlpa}{dlr}{dsv}' '{dlb}{dly}{dvso}' '{dlh}{dlr}{dvsu}' '{dlh}{dlr}{dvsuu}' '{dvsi}{dld}{dvsaa}' '{dli}{dld}{dvsaa}' '{dvsi}{dld}{dvso}' '{dli}{dld}{dvso}' '{dvsi}{dld}{dvsai}' '{dli}{dld}{dvsai}' '{dln}{dvse}{dlc}' '{dli}{dlc}' '{dvsi}{dlc}' '{dle}{dlc}' '{dvse}{dlc}' '{dlc}{dvsu}' '{dlc}{dvse}' '{dlc}{dvsau}' '{dltha}{dvsii}' '{dltha}{dvse}' '{dld}{dvsaa}' '{dld}{dvsii}' '{dld}{dvsai}' '{dld}{dvso}' '{dln}{dvsu}' '{dln}{dvse}' '{dly}{dvso}' '{dly}{dvsau}' '{dlc}'
79
+ (delete) // single action shared by every suffix in the list
80
+ )
81
+ )
82
+
83
+ )
84
+
85
+ define stem as ( // external entry point; all work is done scanning from the end of the word
86
+ backwards (
87
+ do remove_category_1 // optional first pass: 'do' ignores failure
88
+ do (
89
+ repeat (do (check_category_2 and remove_category_2) remove_category_3) // optional category-2 strip, then category 3; the loop ends when remove_category_3 fails
90
+ )
91
+ )
92
+ )
@@ -0,0 +1,80 @@
1
+ routines (
2
+ mark_regions
3
+ main_suffix
4
+ consonant_pair
5
+ other_suffix
6
+ )
7
+
8
+ externals ( stem )
9
+
10
+ integers ( p1 x )
11
+
12
+ groupings ( v s_ending )
13
+
14
+ stringescapes {}
15
+
16
+ /* special characters */
17
+
18
+ stringdef ae '{U+00E6}'
19
+ stringdef ao '{U+00E5}'
20
+ stringdef o/ '{U+00F8}'
21
+
22
+ define v 'aeiouy{ae}{ao}{o/}'
23
+
24
+ define s_ending 'bcdfghjlmnoprtvyz'
25
+
26
+ define mark_regions as ( // compute p1, the start of the region in which suffixes may be removed
27
+
28
+ $p1 = limit // default: region is empty (p1 at the end of the word)
29
+
30
+ test ( hop 3 setmark x ) // x = position after the first 3 characters; the routine fails here if the word is shorter
31
+ goto v gopast non-v setmark p1 // p1 = just past the first non-vowel that follows a vowel
32
+ try ( $p1 < x $p1 = x ) // never let p1 fall before x
33
+ )
34
+
35
+ backwardmode (
36
+
37
+ define main_suffix as ( // handle the longest listed suffix found at or after p1
38
+ setlimit tomark p1 for ([substring]) // the suffix search may not reach back beyond p1
39
+ among(
40
+
41
+ 'a' 'e' 'ede' 'ande' 'ende' 'ane' 'ene' 'hetene' 'en' 'heten' 'ar'
42
+ 'er' 'heter' 'as' 'es' 'edes' 'endes' 'enes' 'hetenes' 'ens'
43
+ 'hetens' 'ers' 'ets' 'et' 'het' 'ast'
44
+ (delete) // all suffixes above are simply removed
45
+ 's'
46
+ (s_ending or ('k' non-v) delete) // 's' is removed only after an s_ending letter, or after 'k' preceded by a non-vowel
47
+ 'erte' 'ert'
48
+ (<-'er') // 'erte' / 'ert' are shortened to 'er'
49
+ )
50
+ )
51
+
52
+ define consonant_pair as ( // if the word ends in 'dt' or 'vt' (found at or after p1), drop the final 't'
53
+ test ( // 'test' restores the cursor to the end of the word after the match
54
+ setlimit tomark p1 for ([substring])
55
+ among(
56
+ 'dt' 'vt'
57
+ )
58
+ )
59
+ next] delete // intentional: reuses the '[' left by [substring] above, so the slice covers only the last letter
60
+ )
61
+
62
+ define other_suffix as ( // delete the longest listed suffix found at or after p1
63
+ setlimit tomark p1 for ([substring]) // the suffix search may not reach back beyond p1
64
+ among(
65
+ 'leg' 'eleg' 'ig' 'eig' 'lig' 'elig' 'els' 'lov' 'elov' 'slov'
66
+ 'hetslov'
67
+ (delete)
68
+ )
69
+ )
70
+ )
71
+
72
+ define stem as ( // external entry point
73
+
74
+ do mark_regions // forward pass: set p1 ('do' ignores failure, leaving p1 at its default)
75
+ backwards (
76
+ do main_suffix // each step is optional; a failing step leaves the word unchanged
77
+ do consonant_pair
78
+ do other_suffix
79
+ )
80
+ )
@@ -0,0 +1,139 @@
1
+ integers ( p1 p2 )
2
+ booleans ( Y_found )
3
+
4
+ routines (
5
+ shortv
6
+ R1 R2
7
+ Step_1a Step_1b Step_1c Step_2 Step_3 Step_4 Step_5a Step_5b
8
+ )
9
+
10
+ externals ( stem )
11
+
12
+ groupings ( v v_WXY )
13
+
14
+ define v 'aeiouy'
15
+ define v_WXY v + 'wxY'
16
+
17
+ backwardmode (
18
+
19
+ define shortv as ( non-v_WXY v non-v ) // scanning backwards: a letter outside v_WXY (vowels plus w, x, Y), then a vowel, then a non-vowel
20
+
21
+ define R1 as $p1 <= cursor // succeeds when the cursor is at or after mark p1
22
+ define R2 as $p2 <= cursor // succeeds when the cursor is at or after mark p2
23
+
24
+ define Step_1a as ( // plural endings: rewrite or drop the longest match
25
+ [substring] among (
26
+ 'sses' (<-'ss')
27
+ 'ies' (<-'i')
28
+ 'ss' () // matched so that the 's' rule below does not fire; word left unchanged
29
+ 's' (delete)
30
+ )
31
+ )
32
+
33
+ define Step_1b as ( // handle the endings 'eed', 'ed' and 'ing'
34
+ [substring] among (
35
+ 'eed' (R1 <-'ee') // only when the suffix lies in R1
36
+ 'ed'
37
+ 'ing' ( // 'ed' and 'ing' share this action
38
+ test gopast v delete // remove the suffix only if a vowel occurs before it
39
+ test substring among( // then tidy up what is left, without moving the cursor
40
+ 'at' 'bl' 'iz'
41
+ (<+ 'e') // after these endings, append 'e'
42
+ 'bb' 'dd' 'ff' 'gg' 'mm' 'nn' 'pp' 'rr' 'tt'
43
+ // ignoring double c, h, j, k, q, v, w, and x
44
+ ([next] delete) // doubled letter: drop the last one
45
+ '' (atmark p1 test shortv <+ 'e') // fallback (empty string): append 'e' when the cursor is at p1 and shortv matches
46
+ )
47
+ )
48
+ )
49
+ )
50
+
51
+ define Step_1c as ( // final 'y' or 'Y' becomes 'i' when a vowel occurs earlier in the word
52
+ ['y' or 'Y']
53
+ gopast v // fails (leaving the word unchanged) if no vowel precedes
54
+ <-'i'
55
+ )
56
+
57
+ define Step_2 as ( // replace the longest listed suffix, provided it lies in R1
58
+ [substring] R1 among ( // R1 is checked after the suffix is found and before its action runs
59
+ 'tional' (<-'tion')
60
+ 'enci' (<-'ence')
61
+ 'anci' (<-'ance')
62
+ 'abli' (<-'able')
63
+ 'entli' (<-'ent')
64
+ 'eli' (<-'e')
65
+ 'izer' 'ization'
66
+ (<-'ize')
67
+ 'ational' 'ation' 'ator'
68
+ (<-'ate')
69
+ 'alli' (<-'al')
70
+ 'alism' 'aliti'
71
+ (<-'al')
72
+ 'fulness' (<-'ful')
73
+ 'ousli' 'ousness'
74
+ (<-'ous')
75
+ 'iveness' 'iviti'
76
+ (<-'ive')
77
+ 'biliti' (<-'ble')
78
+ )
79
+ )
80
+
81
+ define Step_3 as ( // replace or delete the longest listed suffix, provided it lies in R1
82
+ [substring] R1 among (
83
+ 'alize' (<-'al')
84
+ 'icate' 'iciti' 'ical'
85
+ (<-'ic')
86
+ 'ative' 'ful' 'ness'
87
+ (delete)
88
+ )
89
+ )
90
+
91
+ define Step_4 as ( // delete the longest listed suffix, provided it lies in R2
92
+ [substring] R2 among (
93
+ 'al' 'ance' 'ence' 'er' 'ic' 'able' 'ible' 'ant' 'ement'
94
+ 'ment' 'ent' 'ou' 'ism' 'ate' 'iti' 'ous' 'ive' 'ize'
95
+ (delete)
96
+ 'ion' ('s' or 't' delete) // 'ion' is removed only when preceded by 's' or 't'
97
+ )
98
+ )
99
+
100
+ define Step_5a as ( // drop a final 'e' when it is in R2, or in R1 and not preceded by the shortv pattern
100
+ ['e']
101
+ R2 or (R1 not shortv)
102
+ delete
103
+ )
105
+
106
+ define Step_5b as ( // drop a final 'l' when it is in R2 and preceded by another 'l'
106
+ ['l']
107
+ R2 'l'
108
+ delete
109
+ )
111
+ )
112
+
113
+ define stem as ( // external entry point
114
+
115
+ unset Y_found
116
+ do ( ['y'] <-'Y' set Y_found) // a 'y' at the very start of the word is marked as 'Y'
117
+ do repeat(goto (v ['y']) <-'Y' set Y_found) // every 'y' directly after a vowel is marked as 'Y'
118
+
119
+ $p1 = limit // defaults: both regions empty
120
+ $p2 = limit
121
+ do(
122
+ gopast v gopast non-v setmark p1 // p1 = just past the first non-vowel that follows a vowel
123
+ gopast v gopast non-v setmark p2 // p2 = the same rule applied again, starting from p1
124
+ )
125
+
126
+ backwards ( // each step is optional: 'do' ignores failure
127
+ do Step_1a
128
+ do Step_1b
129
+ do Step_1c
130
+ do Step_2
131
+ do Step_3
132
+ do Step_4
133
+ do Step_5a
134
+ do Step_5b
135
+ )
136
+
137
+ do(Y_found repeat(goto (['Y']) <-'y')) // if any 'Y' marker was introduced, turn every 'Y' back into 'y'
138
+
139
+ )