mittens 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (137) hide show
  1. checksums.yaml +7 -0
  2. data/CHANGELOG.md +3 -0
  3. data/Gemfile +7 -0
  4. data/LICENSE.txt +30 -0
  5. data/README.md +62 -0
  6. data/Rakefile +21 -0
  7. data/ext/mittens/ext.c +96 -0
  8. data/ext/mittens/extconf.rb +12 -0
  9. data/lib/mittens/version.rb +3 -0
  10. data/lib/mittens.rb +7 -0
  11. data/mittens.gemspec +22 -0
  12. data/vendor/snowball/.gitignore +26 -0
  13. data/vendor/snowball/.travis.yml +112 -0
  14. data/vendor/snowball/AUTHORS +27 -0
  15. data/vendor/snowball/CONTRIBUTING.rst +216 -0
  16. data/vendor/snowball/COPYING +29 -0
  17. data/vendor/snowball/GNUmakefile +742 -0
  18. data/vendor/snowball/NEWS +754 -0
  19. data/vendor/snowball/README.rst +37 -0
  20. data/vendor/snowball/ada/README.md +74 -0
  21. data/vendor/snowball/ada/generate/generate.adb +83 -0
  22. data/vendor/snowball/ada/generate.gpr +21 -0
  23. data/vendor/snowball/ada/src/stemmer.adb +620 -0
  24. data/vendor/snowball/ada/src/stemmer.ads +219 -0
  25. data/vendor/snowball/ada/src/stemwords.adb +70 -0
  26. data/vendor/snowball/ada/stemmer_config.gpr +83 -0
  27. data/vendor/snowball/ada/stemwords.gpr +21 -0
  28. data/vendor/snowball/algorithms/arabic.sbl +558 -0
  29. data/vendor/snowball/algorithms/armenian.sbl +301 -0
  30. data/vendor/snowball/algorithms/basque.sbl +149 -0
  31. data/vendor/snowball/algorithms/catalan.sbl +202 -0
  32. data/vendor/snowball/algorithms/danish.sbl +93 -0
  33. data/vendor/snowball/algorithms/dutch.sbl +164 -0
  34. data/vendor/snowball/algorithms/english.sbl +229 -0
  35. data/vendor/snowball/algorithms/finnish.sbl +197 -0
  36. data/vendor/snowball/algorithms/french.sbl +254 -0
  37. data/vendor/snowball/algorithms/german.sbl +139 -0
  38. data/vendor/snowball/algorithms/german2.sbl +145 -0
  39. data/vendor/snowball/algorithms/greek.sbl +701 -0
  40. data/vendor/snowball/algorithms/hindi.sbl +323 -0
  41. data/vendor/snowball/algorithms/hungarian.sbl +241 -0
  42. data/vendor/snowball/algorithms/indonesian.sbl +192 -0
  43. data/vendor/snowball/algorithms/irish.sbl +149 -0
  44. data/vendor/snowball/algorithms/italian.sbl +202 -0
  45. data/vendor/snowball/algorithms/kraaij_pohlmann.sbl +240 -0
  46. data/vendor/snowball/algorithms/lithuanian.sbl +373 -0
  47. data/vendor/snowball/algorithms/lovins.sbl +208 -0
  48. data/vendor/snowball/algorithms/nepali.sbl +92 -0
  49. data/vendor/snowball/algorithms/norwegian.sbl +80 -0
  50. data/vendor/snowball/algorithms/porter.sbl +139 -0
  51. data/vendor/snowball/algorithms/portuguese.sbl +218 -0
  52. data/vendor/snowball/algorithms/romanian.sbl +236 -0
  53. data/vendor/snowball/algorithms/russian.sbl +221 -0
  54. data/vendor/snowball/algorithms/serbian.sbl +2379 -0
  55. data/vendor/snowball/algorithms/spanish.sbl +230 -0
  56. data/vendor/snowball/algorithms/swedish.sbl +72 -0
  57. data/vendor/snowball/algorithms/tamil.sbl +405 -0
  58. data/vendor/snowball/algorithms/turkish.sbl +470 -0
  59. data/vendor/snowball/algorithms/yiddish.sbl +460 -0
  60. data/vendor/snowball/charsets/ISO-8859-2.sbl +98 -0
  61. data/vendor/snowball/charsets/KOI8-R.sbl +74 -0
  62. data/vendor/snowball/charsets/cp850.sbl +130 -0
  63. data/vendor/snowball/compiler/analyser.c +1547 -0
  64. data/vendor/snowball/compiler/driver.c +615 -0
  65. data/vendor/snowball/compiler/generator.c +1748 -0
  66. data/vendor/snowball/compiler/generator_ada.c +1702 -0
  67. data/vendor/snowball/compiler/generator_csharp.c +1322 -0
  68. data/vendor/snowball/compiler/generator_go.c +1278 -0
  69. data/vendor/snowball/compiler/generator_java.c +1313 -0
  70. data/vendor/snowball/compiler/generator_js.c +1316 -0
  71. data/vendor/snowball/compiler/generator_pascal.c +1387 -0
  72. data/vendor/snowball/compiler/generator_python.c +1337 -0
  73. data/vendor/snowball/compiler/generator_rust.c +1295 -0
  74. data/vendor/snowball/compiler/header.h +418 -0
  75. data/vendor/snowball/compiler/space.c +286 -0
  76. data/vendor/snowball/compiler/syswords.h +86 -0
  77. data/vendor/snowball/compiler/syswords2.h +13 -0
  78. data/vendor/snowball/compiler/tokeniser.c +567 -0
  79. data/vendor/snowball/csharp/.gitignore +8 -0
  80. data/vendor/snowball/csharp/Snowball/Algorithms/.gitignore +1 -0
  81. data/vendor/snowball/csharp/Snowball/Among.cs +108 -0
  82. data/vendor/snowball/csharp/Snowball/AssemblyInfo.cs +36 -0
  83. data/vendor/snowball/csharp/Snowball/Stemmer.cs +660 -0
  84. data/vendor/snowball/csharp/Stemwords/App.config +6 -0
  85. data/vendor/snowball/csharp/Stemwords/Program.cs +114 -0
  86. data/vendor/snowball/doc/TODO +12 -0
  87. data/vendor/snowball/doc/libstemmer_c_README +148 -0
  88. data/vendor/snowball/doc/libstemmer_csharp_README +53 -0
  89. data/vendor/snowball/doc/libstemmer_java_README +67 -0
  90. data/vendor/snowball/doc/libstemmer_js_README +48 -0
  91. data/vendor/snowball/doc/libstemmer_python_README +113 -0
  92. data/vendor/snowball/examples/stemwords.c +204 -0
  93. data/vendor/snowball/go/README.md +55 -0
  94. data/vendor/snowball/go/among.go +16 -0
  95. data/vendor/snowball/go/env.go +403 -0
  96. data/vendor/snowball/go/stemwords/generate.go +68 -0
  97. data/vendor/snowball/go/stemwords/main.go +68 -0
  98. data/vendor/snowball/go/util.go +34 -0
  99. data/vendor/snowball/iconv.py +50 -0
  100. data/vendor/snowball/include/libstemmer.h +78 -0
  101. data/vendor/snowball/java/org/tartarus/snowball/Among.java +29 -0
  102. data/vendor/snowball/java/org/tartarus/snowball/SnowballProgram.java +381 -0
  103. data/vendor/snowball/java/org/tartarus/snowball/SnowballStemmer.java +8 -0
  104. data/vendor/snowball/java/org/tartarus/snowball/TestApp.java +75 -0
  105. data/vendor/snowball/javascript/base-stemmer.js +294 -0
  106. data/vendor/snowball/javascript/stemwords.js +106 -0
  107. data/vendor/snowball/libstemmer/libstemmer_c.in +96 -0
  108. data/vendor/snowball/libstemmer/mkalgorithms.pl +90 -0
  109. data/vendor/snowball/libstemmer/mkmodules.pl +267 -0
  110. data/vendor/snowball/libstemmer/modules.txt +63 -0
  111. data/vendor/snowball/libstemmer/test.c +34 -0
  112. data/vendor/snowball/pascal/.gitignore +4 -0
  113. data/vendor/snowball/pascal/SnowballProgram.pas +430 -0
  114. data/vendor/snowball/pascal/generate.pl +23 -0
  115. data/vendor/snowball/pascal/stemwords-template.dpr +78 -0
  116. data/vendor/snowball/python/MANIFEST.in +7 -0
  117. data/vendor/snowball/python/create_init.py +54 -0
  118. data/vendor/snowball/python/setup.cfg +6 -0
  119. data/vendor/snowball/python/setup.py +81 -0
  120. data/vendor/snowball/python/snowballstemmer/among.py +13 -0
  121. data/vendor/snowball/python/snowballstemmer/basestemmer.py +323 -0
  122. data/vendor/snowball/python/stemwords.py +101 -0
  123. data/vendor/snowball/python/testapp.py +28 -0
  124. data/vendor/snowball/runtime/api.c +58 -0
  125. data/vendor/snowball/runtime/api.h +32 -0
  126. data/vendor/snowball/runtime/header.h +61 -0
  127. data/vendor/snowball/runtime/utilities.c +513 -0
  128. data/vendor/snowball/rust/Cargo.toml +7 -0
  129. data/vendor/snowball/rust/build.rs +55 -0
  130. data/vendor/snowball/rust/rust-pre-1.27-compat.patch +30 -0
  131. data/vendor/snowball/rust/src/main.rs +102 -0
  132. data/vendor/snowball/rust/src/snowball/algorithms/mod.rs +2 -0
  133. data/vendor/snowball/rust/src/snowball/among.rs +6 -0
  134. data/vendor/snowball/rust/src/snowball/mod.rs +6 -0
  135. data/vendor/snowball/rust/src/snowball/snowball_env.rs +421 -0
  136. data/vendor/snowball/tests/stemtest.c +95 -0
  137. metadata +178 -0
@@ -0,0 +1,208 @@
1
+
2
+ stringescapes {}
3
+
4
+ routines (
5
+ A B C D E F G H I J K L M N O P Q R S T U V W X Y Z AA BB CC
6
+
7
+ endings
8
+
9
+ undouble respell
10
+ )
11
+
12
+ externals ( stem )
13
+
14
+ backwardmode (
15
+
16
+ /* Lovins' conditions A, B ... CC, as given in her Appendix B, where
17
+ a test for a two letter prefix ('test hop 2') is implicitly
18
+ assumed. Note that 'e' next 'u' corresponds to her u*e because
19
+ Snowball is scanning backwards. */
20
+
21
+ define A as ( hop 2 ) // A-D: the only requirement is a minimum number of characters before the ending
22
+ define B as ( hop 3 )
23
+ define C as ( hop 4 )
24
+ define D as ( hop 5 )
25
+ define E as ( test hop 2 not 'e' ) // ending must not directly follow 'e'
26
+ define F as ( test hop 3 not 'e' ) // as E, but at least 3 characters must precede the ending
27
+ define G as ( test hop 3 'f' ) // ending must directly follow 'f'
28
+ define H as ( test hop 2 't' or 'll' )
29
+ define I as ( test hop 2 not 'o' not 'e' )
30
+ define J as ( test hop 2 not 'a' not 'e' )
31
+ define K as ( test hop 3 'l' or 'i' or ('e' next 'u') ) // last alternative: 'e', any one character, then 'u', matched right to left
32
+ define L as ( test hop 2 not 'u' not 'x' not ('s' not 'o') ) // 's' is rejected only when it is not itself preceded by 'o'
33
+ define M as ( test hop 2 not 'a' not 'c' not 'e' not 'm' )
34
+ define N as ( test hop 3 ( hop 2 not 's' or hop 2 ) ) // needs 3 characters, or 4 when the third character from the end is 's'
35
+ define O as ( test hop 2 'l' or 'i' )
36
+ define P as ( test hop 2 not 'c' )
37
+ define Q as ( test hop 2 test hop 3 not 'l' not 'n' )
38
+ define R as ( test hop 2 'n' or 'r' )
39
+ define S as ( test hop 2 'dr' or ('t' not 't') ) // after 'dr', or after a 't' that is not preceded by another 't'
40
+ define T as ( test hop 2 's' or ('t' not 'o') ) // after 's', or after a 't' that is not preceded by 'o'
41
+ define U as ( test hop 2 'l' or 'm' or 'n' or 'r' )
42
+ define V as ( test hop 2 'c' )
43
+ define W as ( test hop 2 not 's' not 'u' )
44
+ define X as ( test hop 2 'l' or 'i' or ('e' next 'u') )
45
+ define Y as ( test hop 2 'in' )
46
+ define Z as ( test hop 2 not 'f' )
47
+ define AA as ( test hop 2 among ( 'd' 'f' 'ph' 'th' 'l' 'er' 'or' // ending must follow one of the listed letter groups
48
+ 'es' 't' ) )
49
+ define BB as ( test hop 3 not 'met' not 'ryst' )
50
+ define CC as ( test hop 2 'l' )
51
+
52
+
53
+ /* The system of endings, as given in Appendix A. */
54
+
55
+ define endings as ( // Appendix A table: each ending is paired with the condition routine (A..CC) that must succeed for it to be removed
56
+ [substring] among(
57
+ 'alistically' B 'arizability' A 'izationally' B
58
+
59
+ 'antialness' A 'arisations' A 'arizations' A 'entialness' A
60
+
61
+ 'allically' C 'antaneous' A 'antiality' A 'arisation' A
62
+ 'arization' A 'ationally' B 'ativeness' A 'eableness' E
63
+ 'entations' A 'entiality' A 'entialize' A 'entiation' A
64
+ 'ionalness' A 'istically' A 'itousness' A 'izability' A
65
+ 'izational' A
66
+
67
+ 'ableness' A 'arizable' A 'entation' A 'entially' A
68
+ 'eousness' A 'ibleness' A 'icalness' A 'ionalism' A
69
+ 'ionality' A 'ionalize' A 'iousness' A 'izations' A
70
+ 'lessness' A
71
+
72
+ 'ability' A 'aically' A 'alistic' B 'alities' A
73
+ 'ariness' E 'aristic' A 'arizing' A 'ateness' A
74
+ 'atingly' A 'ational' B 'atively' A 'ativism' A
75
+ 'elihood' E 'encible' A 'entally' A 'entials' A
76
+ 'entiate' A 'entness' A 'fulness' A 'ibility' A
77
+ 'icalism' A 'icalist' A 'icality' A 'icalize' A
78
+ 'ication' G 'icianry' A 'ination' A 'ingness' A
79
+ 'ionally' A 'isation' A 'ishness' A 'istical' A
80
+ 'iteness' A 'iveness' A 'ivistic' A 'ivities' A
81
+ 'ization' F 'izement' A 'oidally' A 'ousness' A
82
+
83
+ 'aceous' A 'acious' B 'action' G 'alness' A
84
+ 'ancial' A 'ancies' A 'ancing' B 'ariser' A
85
+ 'arized' A 'arizer' A 'atable' A 'ations' B
86
+ 'atives' A 'eature' Z 'efully' A 'encies' A
87
+ 'encing' A 'ential' A 'enting' C 'entist' A
88
+ 'eously' A 'ialist' A 'iality' A 'ialize' A
89
+ 'ically' A 'icance' A 'icians' A 'icists' A
90
+ 'ifully' A 'ionals' A 'ionate' D 'ioning' A
91
+ 'ionist' A 'iously' A 'istics' A 'izable' E
92
+ 'lessly' A 'nesses' A 'oidism' A
93
+
94
+ 'acies' A 'acity' A 'aging' B 'aical' A
95
+ 'alist' A 'alism' B 'ality' A 'alize' A
96
+ 'allic'BB 'anced' B 'ances' B 'antic' C
97
+ 'arial' A 'aries' A 'arily' A 'arity' B
98
+ 'arize' A 'aroid' A 'ately' A 'ating' I
99
+ 'ation' B 'ative' A 'ators' A 'atory' A
100
+ 'ature' E 'early' Y 'ehood' A 'eless' A
101
+ 'elity' A 'ement' A 'enced' A 'ences' A
102
+ 'eness' E 'ening' E 'ental' A 'ented' C
103
+ 'ently' A 'fully' A 'ially' A 'icant' A
104
+ 'ician' A 'icide' A 'icism' A 'icist' A
105
+ 'icity' A 'idine' I 'iedly' A 'ihood' A
106
+ 'inate' A 'iness' A 'ingly' B 'inism' J
107
+ 'inity'CC 'ional' A 'ioned' A 'ished' A
108
+ 'istic' A 'ities' A 'itous' A 'ively' A
109
+ 'ivity' A 'izers' F 'izing' F 'oidal' A
110
+ 'oides' A 'otide' A 'ously' A
111
+
112
+ 'able' A 'ably' A 'ages' B 'ally' B
113
+ 'ance' B 'ancy' B 'ants' B 'aric' A
114
+ 'arly' K 'ated' I 'ates' A 'atic' B
115
+ 'ator' A 'ealy' Y 'edly' E 'eful' A
116
+ 'eity' A 'ence' A 'ency' A 'ened' E
117
+ 'enly' E 'eous' A 'hood' A 'ials' A
118
+ 'ians' A 'ible' A 'ibly' A 'ical' A
119
+ 'ides' L 'iers' A 'iful' A 'ines' M
120
+ 'ings' N 'ions' B 'ious' A 'isms' B
121
+ 'ists' A 'itic' H 'ized' F 'izer' F
122
+ 'less' A 'lily' A 'ness' A 'ogen' A
123
+ 'ward' A 'wise' A 'ying' B 'yish' A
124
+
125
+ 'acy' A 'age' B 'aic' A 'als'BB
126
+ 'ant' B 'ars' O 'ary' F 'ata' A
127
+ 'ate' A 'eal' Y 'ear' Y 'ely' E
128
+ 'ene' E 'ent' C 'ery' E 'ese' A
129
+ 'ful' A 'ial' A 'ian' A 'ics' A
130
+ 'ide' L 'ied' A 'ier' A 'ies' P
131
+ 'ily' A 'ine' M 'ing' N 'ion' Q
132
+ 'ish' C 'ism' B 'ist' A 'ite'AA
133
+ 'ity' A 'ium' A 'ive' A 'ize' F
134
+ 'oid' A 'one' R 'ous' A
135
+
136
+ 'ae' A 'al'BB 'ar' X 'as' B
137
+ 'ed' E 'en' F 'es' E 'ia' A
138
+ 'ic' A 'is' A 'ly' B 'on' S
139
+ 'or' T 'um' U 'us' V 'yl' R
140
+ '{'}s' A 's{'}' A
141
+
142
+ 'a' A 'e' A 'i' A 'o' A
143
+ 's' W 'y' B
144
+
145
+ (delete) // single shared action: remove the slice bracketed by [substring]
146
+ )
147
+ )
148
+
149
+ /* Undoubling is rule 1 of appendix C. */
150
+
151
+ define undouble as ( // drops one letter when the word ends in one of the listed doubled consonants
152
+ test substring among ('bb' 'dd' 'gg' 'll' 'mm' 'nn' 'pp' 'rr' 'ss'
153
+ 'tt')
154
+ [next] delete // the test left the cursor at the end, so the slice covers only the final character
155
+ )
156
+
157
+ /* The other appendix C rules can be done together. */
158
+
159
+ define respell as ( // replaces the matched stem ending with the spelling given in its action
160
+ [substring] among (
161
+ 'iev' (<-'ief')
162
+ 'uct' (<-'uc')
163
+ 'umpt' (<-'um')
164
+ 'rpt' (<-'rb')
165
+ 'urs' (<-'ur')
166
+ 'istr' (<-'ister')
167
+ 'metr' (<-'meter')
168
+ 'olv' (<-'olut')
169
+ 'ul' (not 'a' not 'i' not 'o' <-'l') // skipped when the character before 'ul' is a, i or o
170
+ 'bex' (<-'bic')
171
+ 'dex' (<-'dic')
172
+ 'pex' (<-'pic')
173
+ 'tex' (<-'tic')
174
+ 'ax' (<-'ac')
175
+ 'ex' (<-'ec')
176
+ 'ix' (<-'ic')
177
+ 'lux' (<-'luc')
178
+ 'uad' (<-'uas')
179
+ 'vad' (<-'vas')
180
+ 'cid' (<-'cis')
181
+ 'lid' (<-'lis')
182
+ 'erid' (<-'eris')
183
+ 'pand' (<-'pans')
184
+ 'end' (not 's' <-'ens') // skipped when preceded by 's'
185
+ 'ond' (<-'ons')
186
+ 'lud' (<-'lus')
187
+ 'rud' (<-'rus')
188
+ 'her' (not 'p' not 't' <-'hes') // skipped when preceded by 'p' or 't'
189
+ 'mit' (<-'mis')
190
+ 'ent' (not 'm' <-'ens') // skipped when preceded by 'm'
191
+ /* 'ent' was 'end' in the 1968 paper - a typo. */
192
+ 'ert' (<-'ers')
193
+ 'et' (not 'n' <-'es') // skipped when preceded by 'n'
194
+ 'yt' (<-'ys')
195
+ 'yz' (<-'ys')
196
+ )
197
+ )
198
+ )
199
+
200
+ define stem as ( // the routine declared in externals: runs the three passes in order
201
+
202
+ backwards ( // all three routines are defined in backwardmode, so they work from the end of the word
203
+ do endings
204
+ do undouble
205
+ do respell
206
+ )
207
+ )
208
+
@@ -0,0 +1,92 @@
1
+ /*
2
+ * Authors:
3
+ * - Ingroj Shrestha <ing.stha@gmail.com>, Nepali NLP Group
4
+ * - Oleg Bartunov <obartunov@gmail.com>, Postgres Professional Ltd.
5
+ * - Shreeya Singh Dhakal, Nepali NLP Group
6
+ */
7
+
8
+ routines (
9
+ remove_category_1
10
+ check_category_2
11
+ remove_category_2
12
+ remove_category_3
13
+ )
14
+
15
+ stringescapes {}
16
+
17
+ stringdef dsc '{U+0901}' // DEVANAGARI_SIGN_CANDRABINDU
18
+ stringdef dsa '{U+0902}' // DEVANAGARI_SIGN_ANUSVARA
19
+ stringdef dli '{U+0907}' // DEVANAGARI_LETTER_I
20
+ stringdef dlii '{U+0908}' // DEVANAGARI_LETTER_II
21
+ stringdef dle '{U+090F}' // DEVANAGARI_LETTER_E
22
+ stringdef dlka '{U+0915}' // DEVANAGARI_LETTER_KA
23
+ stringdef dlkha '{U+0916}' // DEVANAGARI_LETTER_KHA
24
+ stringdef dlg '{U+0917}' // DEVANAGARI_LETTER_GA
25
+ stringdef dlc '{U+091B}' // DEVANAGARI_LETTER_CHA
26
+ stringdef dlta '{U+0924}' // DEVANAGARI_LETTER_TA
27
+ stringdef dltha '{U+0925}' // DEVANAGARI_LETTER_THA
28
+ stringdef dld '{U+0926}' // DEVANAGARI_LETTER_DA
29
+ stringdef dln '{U+0928}' // DEVANAGARI_LETTER_NA
30
+ stringdef dlpa '{U+092A}' // DEVANAGARI_LETTER_PA
31
+ stringdef dlpha '{U+092B}' // DEVANAGARI_LETTER_PHA
32
+ stringdef dlb '{U+092D}' // DEVANAGARI_LETTER_BHA
33
+ stringdef dlm '{U+092E}' // DEVANAGARI_LETTER_MA
34
+ stringdef dly '{U+092F}' // DEVANAGARI_LETTER_YA
35
+ stringdef dlr '{U+0930}' // DEVANAGARI_LETTER_RA
36
+ stringdef dll '{U+0932}' // DEVANAGARI_LETTER_LA
37
+ stringdef dlv '{U+0935}' // DEVANAGARI_LETTER_VA
38
+ stringdef dls '{U+0938}' // DEVANAGARI_LETTER_SA
39
+ stringdef dlh '{U+0939}' // DEVANAGARI_LETTER_HA
40
+ stringdef dvsaa '{U+093E}' // DEVANAGARI_VOWEL_SIGN_AA
41
+ stringdef dvsi '{U+093F}' // DEVANAGARI_VOWEL_SIGN_I
42
+ stringdef dvsii '{U+0940}' // DEVANAGARI_VOWEL_SIGN_II
43
+ stringdef dvsu '{U+0941}' // DEVANAGARI_VOWEL_SIGN_U
44
+ stringdef dvsuu '{U+0942}' // DEVANAGARI_VOWEL_SIGN_UU
45
+ stringdef dvse '{U+0947}' // DEVANAGARI_VOWEL_SIGN_E
46
+ stringdef dvsai '{U+0948}' // DEVANAGARI_VOWEL_SIGN_AI
47
+ stringdef dvso '{U+094B}' // DEVANAGARI_VOWEL_SIGN_O
48
+ stringdef dvsau '{U+094C}' // DEVANAGARI_VOWEL_SIGN_AU
49
+ stringdef dsv '{U+094D}' // DEVANAGARI_SIGN_VIRAMA
50
+
51
+ externals ( stem )
52
+ backwardmode (
53
+ define remove_category_1 as( // strips one suffix from the list below; the {dlka}-initial ones are conditional
54
+ [substring] among (
55
+ '{dlm}{dvsaa}{dlr}{dsv}{dlpha}{dlta}' '{dld}{dsv}{dlv}{dvsaa}{dlr}{dvsaa}' '{dls}{dsc}{dlg}{dvsai}' '{dls}{dsa}{dlg}'
56
+ '{dls}{dsc}{dlg}' '{dll}{dvsaa}{dli}' '{dll}{dvsaa}{dlii}' '{dlpa}{dlc}{dvsi}'
57
+ '{dll}{dvse}' '{dlr}{dlta}' '{dlm}{dvsai}' '{dlm}{dvsaa}'
58
+ (delete)
59
+ '{dlka}{dvso}' '{dlka}{dvsaa}' '{dlka}{dvsi}' '{dlka}{dvsii}' '{dlka}{dvsai}'(('{dle}' or '{dvse}' ()) or delete) // left in place when preceded by {dle} or {dvse}; deleted otherwise
60
+ )
61
+ )
62
+
63
+ define check_category_2 as( // succeeds when the text at the cursor ends in candrabindu, anusvara or vowel sign AI
64
+ [substring] among(
65
+ '{dsc}' '{dsa}' '{dvsai}'
66
+ )
67
+ )
68
+
69
+ define remove_category_2 as ( // deletes the matched sign, but only when a listed sequence precedes it
70
+ [substring] among(
71
+ '{dsc}' '{dsa}' ('{dly}{dvsau}' or '{dlc}{dvsau}' or '{dln}{dvsau}' or '{dltha}{dvse}' delete) // the preceding sequence is only tested; the slice removed is the sign itself
72
+ '{dvsai}' ('{dlta}{dsv}{dlr}' delete)
73
+ )
74
+ )
75
+
76
+ define remove_category_3 as( // unconditionally deletes one matching suffix from the list below
77
+ [substring] among(
78
+ '{dltha}{dvsi}{dli}{dls}{dsv}' '{dlh}{dvsu}{dln}{dvse}{dlc}' '{dlh}{dvsu}{dln}{dsv}{dlc}' '{dln}{dvse}{dlc}{dls}{dsv}' '{dln}{dvse}{dlc}{dln}{dsv}' '{dli}{dle}{dlka}{dvsii}' '{dli}{dle}{dlka}{dvsaa}' '{dli}{dle}{dlka}{dvso}' '{dvsi}{dle}{dlka}{dvsii}' '{dvsi}{dle}{dlka}{dvsaa}' '{dvsi}{dle}{dlka}{dvso}' '{dli}{dlc}{dln}{dsv}' '{dvsi}{dlc}{dln}{dsv}' '{dli}{dlc}{dls}{dsv}' '{dvsi}{dlc}{dls}{dsv}' '{dle}{dlc}{dln}{dsv}' '{dvse}{dlc}{dln}{dsv}' '{dle}{dlc}{dls}{dsv}' '{dvse}{dlc}{dls}{dsv}' '{dlc}{dvsi}{dln}{dsv}' '{dlc}{dvse}{dls}{dsv}' '{dlc}{dsv}{dly}{dvsau}' '{dltha}{dvsi}{dln}{dsv}' '{dltha}{dvsi}{dly}{dvso}' '{dltha}{dvsi}{dly}{dvsau}' '{dltha}{dvsi}{dls}{dsv}' '{dltha}{dsv}{dly}{dvso}' '{dltha}{dsv}{dly}{dvsau}' '{dld}{dvsi}{dly}{dvso}' '{dld}{dvse}{dlkha}{dvsi}' '{dld}{dvse}{dlkha}{dvsii}' '{dll}{dvsaa}{dln}{dsv}' '{dlm}{dvsaa}{dltha}{dvsi}' '{dln}{dvse}{dlka}{dvsai}' '{dln}{dvse}{dlka}{dvsaa}' '{dln}{dvse}{dlka}{dvso}' '{dln}{dvse}{dlc}{dvsau}' '{dlh}{dvso}{dls}{dsv}' '{dli}{dln}{dsv}{dlc}' '{dvsi}{dln}{dsv}{dlc}' '{dln}{dvse}{dlc}{dvsu}' '{dli}{dlc}{dvsau}' '{dvsi}{dlc}{dvsau}' '{dli}{dls}{dsv}' '{dvsi}{dls}{dsv}' '{dvsi}{dly}{dvso}' '{dli}{dly}{dvso}' '{dle}{dlka}{dvsaa}' '{dvse}{dlka}{dvsaa}' '{dle}{dlka}{dvsii}' '{dvse}{dlka}{dvsii}' '{dle}{dlka}{dvsai}' '{dvse}{dlka}{dvsai}' '{dle}{dlka}{dvso}' '{dvse}{dlka}{dvso}' '{dle}{dlc}{dvsu}' '{dvse}{dlc}{dvsu}' '{dle}{dlc}{dvsau}' '{dvse}{dlc}{dvsau}' '{dlc}{dln}{dsv}' '{dlc}{dls}{dsv}' '{dltha}{dvsi}{dle}' '{dlpa}{dlr}{dsv}' '{dlb}{dly}{dvso}' '{dlh}{dlr}{dvsu}' '{dlh}{dlr}{dvsuu}' '{dvsi}{dld}{dvsaa}' '{dli}{dld}{dvsaa}' '{dvsi}{dld}{dvso}' '{dli}{dld}{dvso}' '{dvsi}{dld}{dvsai}' '{dli}{dld}{dvsai}' '{dln}{dvse}{dlc}' '{dli}{dlc}' '{dvsi}{dlc}' '{dle}{dlc}' '{dvse}{dlc}' '{dlc}{dvsu}' '{dlc}{dvse}' '{dlc}{dvsau}' '{dltha}{dvsii}' '{dltha}{dvse}' '{dld}{dvsaa}' '{dld}{dvsii}' '{dld}{dvsai}' '{dld}{dvso}' '{dln}{dvsu}' '{dln}{dvse}' '{dly}{dvso}' '{dly}{dvsau}' '{dlc}'
79
+ (delete)
80
+ )
81
+ )
82
+
83
+ )
84
+
85
+ define stem as ( // the routine declared in externals
86
+ backwards (
87
+ do remove_category_1
88
+ do (
89
+ repeat (do (check_category_2 and remove_category_2) remove_category_3) // optional category 2 removal, then category 3; loops until category 3 stops matching
90
+ )
91
+ )
92
+ )
@@ -0,0 +1,80 @@
1
+ routines (
2
+ mark_regions
3
+ main_suffix
4
+ consonant_pair
5
+ other_suffix
6
+ )
7
+
8
+ externals ( stem )
9
+
10
+ integers ( p1 x )
11
+
12
+ groupings ( v s_ending )
13
+
14
+ stringescapes {}
15
+
16
+ /* special characters */
17
+
18
+ stringdef ae '{U+00E6}'
19
+ stringdef ao '{U+00E5}'
20
+ stringdef o/ '{U+00F8}'
21
+
22
+ define v 'aeiouy{ae}{ao}{o/}'
23
+
24
+ define s_ending 'bcdfghjlmnoprtvyz'
25
+
26
+ define mark_regions as ( // computes p1, the left boundary used by the suffix routines
27
+
28
+ $p1 = limit // default used when no vowel/non-vowel pair is found below
29
+
30
+ test ( hop 3 setmark x ) // x = position after the first 3 characters; cursor is restored afterwards
31
+ goto v gopast non-v setmark p1 // p1 = just past the first non-vowel that comes after the first vowel
32
+ try ( $p1 < x $p1 = x ) // p1 is never allowed to lie left of x
33
+ )
34
+
35
+ backwardmode (
36
+
37
+ define main_suffix as ( // handles one main suffix, matched only within the region bounded by p1
38
+ setlimit tomark p1 for ([substring]) // the match may not extend left of p1
39
+ among(
40
+
41
+ 'a' 'e' 'ede' 'ande' 'ende' 'ane' 'ene' 'hetene' 'en' 'heten' 'ar'
42
+ 'er' 'heter' 'as' 'es' 'edes' 'endes' 'enes' 'hetenes' 'ens'
43
+ 'hetens' 'ers' 'ets' 'et' 'het' 'ast'
44
+ (delete)
45
+ 's'
46
+ (s_ending or ('k' non-v) delete) // 's' goes only after an s_ending letter, or after 'k' preceded by a non-vowel
47
+ 'erte' 'ert'
48
+ (<-'er')
49
+ )
50
+ )
51
+
52
+ define consonant_pair as ( // when the word ends in 'dt' or 'vt' (within the p1 region), removes the final letter
53
+ test (
54
+ setlimit tomark p1 for ([substring])
55
+ among(
56
+ 'dt' 'vt'
57
+ )
58
+ )
59
+ next] delete // not a typo: the slice was opened by [ inside the test and is closed here one character in
60
+ )
61
+
62
+ define other_suffix as ( // deletes one of the listed suffixes when it lies within the region bounded by p1
63
+ setlimit tomark p1 for ([substring])
64
+ among(
65
+ 'leg' 'eleg' 'ig' 'eig' 'lig' 'elig' 'els' 'lov' 'elov' 'slov'
66
+ 'hetslov'
67
+ (delete)
68
+ )
69
+ )
70
+ )
71
+
72
+ define stem as ( // the routine declared in externals
73
+
74
+ do mark_regions // forward pass: sets p1 before any suffix work
75
+ backwards ( // the suffix routines are defined in backwardmode
76
+ do main_suffix
77
+ do consonant_pair
78
+ do other_suffix
79
+ )
80
+ )
@@ -0,0 +1,139 @@
1
+ integers ( p1 p2 )
2
+ booleans ( Y_found )
3
+
4
+ routines (
5
+ shortv
6
+ R1 R2
7
+ Step_1a Step_1b Step_1c Step_2 Step_3 Step_4 Step_5a Step_5b
8
+ )
9
+
10
+ externals ( stem )
11
+
12
+ groupings ( v v_WXY )
13
+
14
+ define v 'aeiouy'
15
+ define v_WXY v + 'wxY'
16
+
17
+ backwardmode (
18
+
19
+ define shortv as ( non-v_WXY v non-v ) // right to left: a non-vowel other than w, x, Y; then a vowel; then a non-vowel
20
+
21
+ define R1 as $p1 <= cursor // true when the cursor is at or to the right of mark p1
22
+ define R2 as $p2 <= cursor // same test against mark p2
23
+
24
+ define Step_1a as ( // handles the -s family of endings
25
+ [substring] among (
26
+ 'sses' (<-'ss')
27
+ 'ies' (<-'i')
28
+ 'ss' () // matched so that the shorter 's' rule does not fire; nothing is changed
29
+ 's' (delete)
30
+ )
31
+ )
32
+
33
+ define Step_1b as ( // handles 'eed', 'ed' and 'ing', with a tidy-up pass after removing 'ed'/'ing'
34
+ [substring] among (
35
+ 'eed' (R1 <-'ee')
36
+ 'ed'
37
+ 'ing' (
38
+ test gopast v delete // delete only if a vowel occurs somewhere before the ending
39
+ test substring among(
40
+ 'at' 'bl' 'iz'
41
+ (<+ 'e')
42
+ 'bb' 'dd' 'ff' 'gg' 'mm' 'nn' 'pp' 'rr' 'tt'
43
+ // ignoring double c, h, j, k, q, v, w, and x
44
+ ([next] delete)
45
+ '' (atmark p1 test shortv <+ 'e') // fallback: the empty string always matches; insert 'e' when the cursor is at p1 and shortv holds
46
+ )
47
+ )
48
+ )
49
+ )
50
+
51
+ define Step_1c as ( // final 'y' or 'Y' becomes 'i' when a vowel occurs before it
52
+ ['y' or 'Y']
53
+ gopast v
54
+ <-'i'
55
+ )
56
+
57
+ define Step_2 as ( // rewrites the listed suffixes to shorter forms
58
+ [substring] R1 among ( // R1 is checked after the match, with the cursor at the start of the suffix
59
+ 'tional' (<-'tion')
60
+ 'enci' (<-'ence')
61
+ 'anci' (<-'ance')
62
+ 'abli' (<-'able')
63
+ 'entli' (<-'ent')
64
+ 'eli' (<-'e')
65
+ 'izer' 'ization'
66
+ (<-'ize')
67
+ 'ational' 'ation' 'ator'
68
+ (<-'ate')
69
+ 'alli' (<-'al')
70
+ 'alism' 'aliti'
71
+ (<-'al')
72
+ 'fulness' (<-'ful')
73
+ 'ousli' 'ousness'
74
+ (<-'ous')
75
+ 'iveness' 'iviti'
76
+ (<-'ive')
77
+ 'biliti' (<-'ble')
78
+ )
79
+ )
80
+
81
+ define Step_3 as ( // shortens or removes the listed suffixes, subject to the R1 check
82
+ [substring] R1 among (
83
+ 'alize' (<-'al')
84
+ 'icate' 'iciti' 'ical'
85
+ (<-'ic')
86
+ 'ative' 'ful' 'ness'
87
+ (delete)
88
+ )
89
+ )
90
+
91
+ define Step_4 as ( // deletes the listed suffixes, subject to the R2 check
92
+ [substring] R2 among (
93
+ 'al' 'ance' 'ence' 'er' 'ic' 'able' 'ible' 'ant' 'ement'
94
+ 'ment' 'ent' 'ou' 'ism' 'ate' 'iti' 'ous' 'ive' 'ize'
95
+ (delete)
96
+ 'ion' ('s' or 't' delete) // 'ion' is deleted only when preceded by 's' or 't'
97
+ )
98
+ )
99
+
100
+ define Step_5a as ( // removes a final 'e' under the R2 / R1 conditions below
100
+ ['e']
101
+ R2 or (R1 not shortv) // either the R2 check passes, or the R1 check passes and shortv does not match before the 'e'
102
+ delete
103
+ )
105
+
106
+ define Step_5b as ( // removes a final 'l' when the R2 check passes and another 'l' precedes it
106
+ ['l']
107
+ R2 'l'
108
+ delete
109
+ )
111
+ )
112
+
113
+ define stem as ( // the routine declared in externals
114
+
115
+ unset Y_found
116
+ do ( ['y'] <-'Y' set Y_found) // a 'y' at the very start of the word is rewritten as 'Y'
117
+ do repeat(goto (v ['y']) <-'Y' set Y_found) // every 'y' directly after a vowel is rewritten as 'Y'
118
+
119
+ $p1 = limit // defaults used when the marks below cannot be set
120
+ $p2 = limit
121
+ do(
122
+ gopast v gopast non-v setmark p1 // p1 = just past the first non-vowel that follows a vowel
123
+ gopast v gopast non-v setmark p2 // p2 = the same pattern again, starting from p1
124
+ )
125
+
126
+ backwards ( // each step is wrapped in do, so a step that fails does not stop the later ones
127
+ do Step_1a
128
+ do Step_1b
129
+ do Step_1c
130
+ do Step_2
131
+ do Step_3
132
+ do Step_4
133
+ do Step_5a
134
+ do Step_5b
135
+ )
136
+
137
+ do(Y_found repeat(goto (['Y']) <-'y')) // if any 'Y' was introduced above, turn every remaining 'Y' back into 'y'
138
+
139
+ )