mittens 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (137) hide show
  1. checksums.yaml +7 -0
  2. data/CHANGELOG.md +3 -0
  3. data/Gemfile +7 -0
  4. data/LICENSE.txt +30 -0
  5. data/README.md +62 -0
  6. data/Rakefile +21 -0
  7. data/ext/mittens/ext.c +96 -0
  8. data/ext/mittens/extconf.rb +12 -0
  9. data/lib/mittens/version.rb +3 -0
  10. data/lib/mittens.rb +7 -0
  11. data/mittens.gemspec +22 -0
  12. data/vendor/snowball/.gitignore +26 -0
  13. data/vendor/snowball/.travis.yml +112 -0
  14. data/vendor/snowball/AUTHORS +27 -0
  15. data/vendor/snowball/CONTRIBUTING.rst +216 -0
  16. data/vendor/snowball/COPYING +29 -0
  17. data/vendor/snowball/GNUmakefile +742 -0
  18. data/vendor/snowball/NEWS +754 -0
  19. data/vendor/snowball/README.rst +37 -0
  20. data/vendor/snowball/ada/README.md +74 -0
  21. data/vendor/snowball/ada/generate/generate.adb +83 -0
  22. data/vendor/snowball/ada/generate.gpr +21 -0
  23. data/vendor/snowball/ada/src/stemmer.adb +620 -0
  24. data/vendor/snowball/ada/src/stemmer.ads +219 -0
  25. data/vendor/snowball/ada/src/stemwords.adb +70 -0
  26. data/vendor/snowball/ada/stemmer_config.gpr +83 -0
  27. data/vendor/snowball/ada/stemwords.gpr +21 -0
  28. data/vendor/snowball/algorithms/arabic.sbl +558 -0
  29. data/vendor/snowball/algorithms/armenian.sbl +301 -0
  30. data/vendor/snowball/algorithms/basque.sbl +149 -0
  31. data/vendor/snowball/algorithms/catalan.sbl +202 -0
  32. data/vendor/snowball/algorithms/danish.sbl +93 -0
  33. data/vendor/snowball/algorithms/dutch.sbl +164 -0
  34. data/vendor/snowball/algorithms/english.sbl +229 -0
  35. data/vendor/snowball/algorithms/finnish.sbl +197 -0
  36. data/vendor/snowball/algorithms/french.sbl +254 -0
  37. data/vendor/snowball/algorithms/german.sbl +139 -0
  38. data/vendor/snowball/algorithms/german2.sbl +145 -0
  39. data/vendor/snowball/algorithms/greek.sbl +701 -0
  40. data/vendor/snowball/algorithms/hindi.sbl +323 -0
  41. data/vendor/snowball/algorithms/hungarian.sbl +241 -0
  42. data/vendor/snowball/algorithms/indonesian.sbl +192 -0
  43. data/vendor/snowball/algorithms/irish.sbl +149 -0
  44. data/vendor/snowball/algorithms/italian.sbl +202 -0
  45. data/vendor/snowball/algorithms/kraaij_pohlmann.sbl +240 -0
  46. data/vendor/snowball/algorithms/lithuanian.sbl +373 -0
  47. data/vendor/snowball/algorithms/lovins.sbl +208 -0
  48. data/vendor/snowball/algorithms/nepali.sbl +92 -0
  49. data/vendor/snowball/algorithms/norwegian.sbl +80 -0
  50. data/vendor/snowball/algorithms/porter.sbl +139 -0
  51. data/vendor/snowball/algorithms/portuguese.sbl +218 -0
  52. data/vendor/snowball/algorithms/romanian.sbl +236 -0
  53. data/vendor/snowball/algorithms/russian.sbl +221 -0
  54. data/vendor/snowball/algorithms/serbian.sbl +2379 -0
  55. data/vendor/snowball/algorithms/spanish.sbl +230 -0
  56. data/vendor/snowball/algorithms/swedish.sbl +72 -0
  57. data/vendor/snowball/algorithms/tamil.sbl +405 -0
  58. data/vendor/snowball/algorithms/turkish.sbl +470 -0
  59. data/vendor/snowball/algorithms/yiddish.sbl +460 -0
  60. data/vendor/snowball/charsets/ISO-8859-2.sbl +98 -0
  61. data/vendor/snowball/charsets/KOI8-R.sbl +74 -0
  62. data/vendor/snowball/charsets/cp850.sbl +130 -0
  63. data/vendor/snowball/compiler/analyser.c +1547 -0
  64. data/vendor/snowball/compiler/driver.c +615 -0
  65. data/vendor/snowball/compiler/generator.c +1748 -0
  66. data/vendor/snowball/compiler/generator_ada.c +1702 -0
  67. data/vendor/snowball/compiler/generator_csharp.c +1322 -0
  68. data/vendor/snowball/compiler/generator_go.c +1278 -0
  69. data/vendor/snowball/compiler/generator_java.c +1313 -0
  70. data/vendor/snowball/compiler/generator_js.c +1316 -0
  71. data/vendor/snowball/compiler/generator_pascal.c +1387 -0
  72. data/vendor/snowball/compiler/generator_python.c +1337 -0
  73. data/vendor/snowball/compiler/generator_rust.c +1295 -0
  74. data/vendor/snowball/compiler/header.h +418 -0
  75. data/vendor/snowball/compiler/space.c +286 -0
  76. data/vendor/snowball/compiler/syswords.h +86 -0
  77. data/vendor/snowball/compiler/syswords2.h +13 -0
  78. data/vendor/snowball/compiler/tokeniser.c +567 -0
  79. data/vendor/snowball/csharp/.gitignore +8 -0
  80. data/vendor/snowball/csharp/Snowball/Algorithms/.gitignore +1 -0
  81. data/vendor/snowball/csharp/Snowball/Among.cs +108 -0
  82. data/vendor/snowball/csharp/Snowball/AssemblyInfo.cs +36 -0
  83. data/vendor/snowball/csharp/Snowball/Stemmer.cs +660 -0
  84. data/vendor/snowball/csharp/Stemwords/App.config +6 -0
  85. data/vendor/snowball/csharp/Stemwords/Program.cs +114 -0
  86. data/vendor/snowball/doc/TODO +12 -0
  87. data/vendor/snowball/doc/libstemmer_c_README +148 -0
  88. data/vendor/snowball/doc/libstemmer_csharp_README +53 -0
  89. data/vendor/snowball/doc/libstemmer_java_README +67 -0
  90. data/vendor/snowball/doc/libstemmer_js_README +48 -0
  91. data/vendor/snowball/doc/libstemmer_python_README +113 -0
  92. data/vendor/snowball/examples/stemwords.c +204 -0
  93. data/vendor/snowball/go/README.md +55 -0
  94. data/vendor/snowball/go/among.go +16 -0
  95. data/vendor/snowball/go/env.go +403 -0
  96. data/vendor/snowball/go/stemwords/generate.go +68 -0
  97. data/vendor/snowball/go/stemwords/main.go +68 -0
  98. data/vendor/snowball/go/util.go +34 -0
  99. data/vendor/snowball/iconv.py +50 -0
  100. data/vendor/snowball/include/libstemmer.h +78 -0
  101. data/vendor/snowball/java/org/tartarus/snowball/Among.java +29 -0
  102. data/vendor/snowball/java/org/tartarus/snowball/SnowballProgram.java +381 -0
  103. data/vendor/snowball/java/org/tartarus/snowball/SnowballStemmer.java +8 -0
  104. data/vendor/snowball/java/org/tartarus/snowball/TestApp.java +75 -0
  105. data/vendor/snowball/javascript/base-stemmer.js +294 -0
  106. data/vendor/snowball/javascript/stemwords.js +106 -0
  107. data/vendor/snowball/libstemmer/libstemmer_c.in +96 -0
  108. data/vendor/snowball/libstemmer/mkalgorithms.pl +90 -0
  109. data/vendor/snowball/libstemmer/mkmodules.pl +267 -0
  110. data/vendor/snowball/libstemmer/modules.txt +63 -0
  111. data/vendor/snowball/libstemmer/test.c +34 -0
  112. data/vendor/snowball/pascal/.gitignore +4 -0
  113. data/vendor/snowball/pascal/SnowballProgram.pas +430 -0
  114. data/vendor/snowball/pascal/generate.pl +23 -0
  115. data/vendor/snowball/pascal/stemwords-template.dpr +78 -0
  116. data/vendor/snowball/python/MANIFEST.in +7 -0
  117. data/vendor/snowball/python/create_init.py +54 -0
  118. data/vendor/snowball/python/setup.cfg +6 -0
  119. data/vendor/snowball/python/setup.py +81 -0
  120. data/vendor/snowball/python/snowballstemmer/among.py +13 -0
  121. data/vendor/snowball/python/snowballstemmer/basestemmer.py +323 -0
  122. data/vendor/snowball/python/stemwords.py +101 -0
  123. data/vendor/snowball/python/testapp.py +28 -0
  124. data/vendor/snowball/runtime/api.c +58 -0
  125. data/vendor/snowball/runtime/api.h +32 -0
  126. data/vendor/snowball/runtime/header.h +61 -0
  127. data/vendor/snowball/runtime/utilities.c +513 -0
  128. data/vendor/snowball/rust/Cargo.toml +7 -0
  129. data/vendor/snowball/rust/build.rs +55 -0
  130. data/vendor/snowball/rust/rust-pre-1.27-compat.patch +30 -0
  131. data/vendor/snowball/rust/src/main.rs +102 -0
  132. data/vendor/snowball/rust/src/snowball/algorithms/mod.rs +2 -0
  133. data/vendor/snowball/rust/src/snowball/among.rs +6 -0
  134. data/vendor/snowball/rust/src/snowball/mod.rs +6 -0
  135. data/vendor/snowball/rust/src/snowball/snowball_env.rs +421 -0
  136. data/vendor/snowball/tests/stemtest.c +95 -0
  137. metadata +178 -0
@@ -0,0 +1,192 @@
1
+ // An implementation of the "Porter Stemmer for Bahasa Indonesia" from:
2
+ // http://www.illc.uva.nl/Research/Publications/Reports/MoL-2003-02.text.pdf
3
+
4
+ integers (
5
+ // The paper defines measure as the number of vowels in the word. We
6
+ // count this initially, then adjust the count each time we remove a
7
+ // prefix or suffix.
8
+ measure
9
+
10
+ // Numeric code for the type of prefix removed:
11
+ //
12
+ // 0 other/none
13
+ // 1 'di' or 'meng' or 'ter'
14
+ // 2 'per'
15
+ // 3 'ke' or 'peng'
16
+ // 4 'ber'
17
+ //
18
+ // Some of these have variant forms, so e.g. "meng" includes "men", "me",
19
+ // "meny", "mem".
20
+ //
21
+ // Note that the value of prefix is only used in remove_suffix (and
22
+ // routines it calls) so we don't need to worry about
23
+ // remove_second_order_prefix overwriting a value of prefix set by
24
+ // remove_first_order_prefix since remove_suffix gets called between
25
+ // the two.
26
+ prefix
27
+ )
28
+
29
+ groupings ( vowel )
30
+
31
+ routines (
32
+ remove_particle
33
+ remove_possessive_pronoun
34
+ remove_first_order_prefix
35
+ remove_second_order_prefix
36
+ remove_suffix
37
+ KER
38
+ SUFFIX_KAN_OK
39
+ SUFFIX_AN_OK
40
+ SUFFIX_I_OK
41
+ VOWEL
42
+ )
43
+
44
+ externals ( stem )
45
+
46
+ stringescapes {}
47
+
48
+ backwardmode (
49
+
50
+ define remove_particle as ( // backward-mode routine: strips one trailing particle
51
+ [substring] among (
52
+ 'kah' 'lah' 'pun' (delete $measure-=1) // each particle holds one vowel, so measure drops by one
53
+ )
54
+ )
55
+
56
+ define remove_possessive_pronoun as ( // backward-mode routine: strips one trailing possessive pronoun
57
+ [substring] among (
58
+ 'ku' 'mu' 'nya' (delete $measure-=1) // each ending holds one vowel, so measure drops by one
59
+ )
60
+ )
61
+
62
+ // prefix not in {ke, peng, per}
63
+ define SUFFIX_KAN_OK as ( // guard that remove_suffix pairs with the suffix 'kan'
64
+ // On page 29, the example "kompas Q.31" says "Both Nazief and Porter
65
+ // stemmer converted the word peledakan (blast, explotion) to ledak (to
66
+ // blast, to explode)". However, the algorithm as described doesn't
67
+ // behave in this way - grammatically the prefix pe- occurs as a
68
+ // variation of both the first-order derivational prefix peng- and the
69
+ // second-order derivational prefix per-, but table 2.5 doesn't include
70
+ // "pe", only table 2.6 does, so "peledakan" is handled (incorrectly)
71
+ // as having prefix "per" not "peng", and so we remove derivational
72
+ // suffix "kan" rather than "an" to give stem leda. (Porter-style
73
+ // stemmers remove the longest suffix they can amongst those available,
74
+ // which this paper notes in the last paragraph on page 15).
75
+ //
76
+ // We resolve this by amending the condition on suffix "kan" to
77
+ // "prefix ∉ {ke, peng, per}", which seems to make the stemmer's
78
+ // behaviour match all the examples in the paper except for one:
79
+ // "perbaikan" is shown in table 3.4 as stemming to "bai", but with
80
+ // this change it now stems to "baik". The table notes that "baik" is
81
+ // the actual root so this deviation is an improvement. In a sample
82
+ // vocabulary derived from the most common words in id.wikipedia.org,
83
+ // this change only affects 0.12% of words (76 out of 64,587, including
84
+ // "peledakan" and "perbaikan").
85
+ $prefix != 3 and $prefix != 2 // prefix code 3 is 'ke'/'peng' and code 2 is 'per'; either one blocks removal of 'kan'
86
+ )
87
+
88
+ // prefix not in {di, meng, ter}
89
+ define SUFFIX_AN_OK as ( $prefix != 1 ) // guard that remove_suffix pairs with the suffix 'an'; prefix code 1 is 'di', 'meng' or 'ter'
90
+
91
+ define SUFFIX_I_OK as ( // guard that remove_suffix pairs with the suffix 'i'
92
+ // prefix not in {ke, peng, ber}
93
+ $prefix <= 2 // i.e. the recorded prefix code is neither 3 ('ke'/'peng') nor 4 ('ber')
94
+
95
+ // The rest of the condition from the paper is:
96
+ // V|K...c₁c₁, c₁ ≠ s, c₂ ≠ i
97
+ //
98
+ // The meaning of this is unclear in several ways, and none of the
99
+ // examples given of the stemmer's behaviour in the paper help to
100
+ // resolve these issues.
101
+ //
102
+ // Notice that c₂ isn't actually used - the most obvious explanation
103
+ // seems to be that "c₁c₁" should read "c₁c₂", or maybe "c₂c₁".
104
+ //
105
+ // Elsewhere the paper defines V... as meaning "the stem starts with
106
+ // a vowel" and K... as meaning "the stem starts with a consonant".
107
+ //
108
+ // In other places where it says X|Y... it seems the | binds more
109
+ // tightly, so it's (V|K)...cᵢcⱼ not V|(K...cᵢcⱼ). That seems a bit
110
+ // odd as the first letter must be either a vowel or a consonant, so
111
+ // that really just means "ends cᵢcⱼ". However, nowhere in the paper
112
+ // uses or defines a notation such as ...X, which may explain this
113
+ // seemingly redundant way of specifying this.
114
+ //
115
+ // The conditions elsewhere on prefix removal (e.g. V...) are clearly
116
+ // on the stem left after the prefix is removed. None of the other
117
+ // rules for suffix removal have conditions on the stem, but for
118
+ // consistency with the prefix rules we might expect that the cᵢcⱼ
119
+ // test is on what's left *after* removing the "i" suffix.
120
+ //
121
+ // However, studying Indonesian wordlists and discussion with a native
122
+ // speaker leads us to conclude that the purpose of this check is to
123
+ // protect words of foreign origin (e.g. "televisi", "organisasi",
124
+ // "komunikasi") from stemming, and the common feature of these is
125
+ // that the word ends "-si", so we conclude that the condition here
126
+ // should be read as "word does not end -si", and this is what we
127
+ // have implemented.
128
+ not 's' // runs in backward mode after the 'i' has been matched, so it inspects the letter just before that 'i'
129
+ )
130
+
131
+ define remove_suffix as ( // backward-mode routine: strips one of the suffixes kan / an / i when its guard routine succeeds
132
+ [substring] among (
133
+ 'kan' SUFFIX_KAN_OK 'an' SUFFIX_AN_OK 'i' SUFFIX_I_OK // each suffix is followed by the routine that must succeed for it to count as a match
134
+ (delete $measure-=1) // shared action: delete the suffix; each one holds a single vowel, so measure drops by one
135
+ )
136
+ )
137
+ )
138
+
139
+ define vowel 'aeiou' // grouping: the five vowel letters counted by measure
140
+
141
+ define VOWEL as ( vowel ) // routine form of the grouping, used as a match condition for 'meny' and 'peny'
142
+
143
+ define KER as ( non-vowel 'er' ) // one character outside the vowel grouping followed by the letters er; used as the condition on prefix 'be'
144
+
145
+ define remove_first_order_prefix as ( // forward-mode routine: strips a first-order prefix and records its numeric code in prefix
146
+ [substring] among (
147
+ 'di' 'meng' 'men' 'me' 'ter' (delete $prefix=1 $measure-=1) // prefix code 1; one vowel removed
148
+ 'ke' 'peng' 'pen' (delete $prefix=3 $measure-=1) // prefix code 3; one vowel removed
149
+ 'meny' VOWEL ($prefix=1 <-'s' $measure-=1) // matches only when VOWEL succeeds after it; meny is replaced by s
150
+ 'peny' VOWEL ($prefix=3 <-'s' $measure-=1) // matches only when VOWEL succeeds after it; peny is replaced by s
151
+ 'mem' ($prefix=1 $measure-=1 vowel and <-'p' or delete) // before a vowel mem becomes p, otherwise it is deleted
152
+ 'pem' ($prefix=3 $measure-=1 vowel and <-'p' or delete) // before a vowel pem becomes p, otherwise it is deleted
153
+ )
154
+ )
155
+
156
+ define remove_second_order_prefix as ( // forward-mode routine: strips a second-order prefix
157
+ // The paper has the condition on removal of prefix "bel" and "pel" as
158
+ // just "ajar" not "ajar..." but it seems that the latter must be what
159
+ // is intended so that e.g. "pelajaran" stems to "ajar" not "lajar".
160
+ // This change only affects a very small number of words (11 out of
161
+ // 64,587) and only for the better.
162
+ [substring] among (
163
+ 'per' 'pe' (delete $prefix=2 $measure-=1) // prefix code 2; one vowel removed
164
+ 'pelajar' (<-'ajar' $measure-=1) // rewritten to ajar; note that prefix is not assigned by this rule
165
+ 'ber' (delete $prefix=4 $measure-=1) // prefix code 4; one vowel removed
166
+ 'belajar' (<-'ajar' $prefix=4 $measure-=1) // rewritten to ajar with prefix code 4
167
+ 'be' KER (delete $prefix=4 $measure-=1) // be is removed only when KER succeeds on the text after it
168
+ )
169
+ )
170
+
171
+ define stem as ( // external entry point of the Indonesian stemmer
172
+ $measure = 0
173
+ do ( repeat ( gopast vowel $measure+=1 ) ) // count the vowels in the word; do restores the cursor afterwards
174
+ $measure > 2 // the routine fails here unless measure is greater than 2
175
+ $prefix = 0 // code 0 means no prefix (or another kind) has been removed
176
+ backwards (
177
+ do remove_particle
178
+ $measure > 2 // re-check after the particle step
179
+ do remove_possessive_pronoun
180
+ )
181
+ $measure > 2 // re-check after the pronoun step
182
+ test ( // first alternative: first-order prefix, then suffix, then second-order prefix
183
+ remove_first_order_prefix
184
+ do (
185
+ test ($measure > 2 backwards remove_suffix)
186
+ $measure > 2 remove_second_order_prefix
187
+ )
188
+ ) or ( // taken when no first-order prefix was removed: second-order prefix, then suffix
189
+ do remove_second_order_prefix
190
+ do ($measure > 2 backwards remove_suffix)
191
+ )
192
+ )
@@ -0,0 +1,149 @@
1
+ routines (
2
+ R1 R2 RV
3
+ initial_morph
4
+ mark_regions
5
+ noun_sfx
6
+ deriv
7
+ verb_sfx
8
+ )
9
+
10
+ externals ( stem )
11
+
12
+ integers ( pV p1 p2 )
13
+
14
+ groupings ( v )
15
+
16
+ stringescapes {}
17
+
18
+ /* Accented characters: the five acute vowels, each also listed in grouping v below */
19
+
20
+ stringdef a' '{U+00E1}' // a-acute
21
+ stringdef e' '{U+00E9}' // e-acute
22
+ stringdef i' '{U+00ED}' // i-acute
23
+ stringdef o' '{U+00F3}' // o-acute
24
+ stringdef u' '{U+00FA}' // u-acute
25
+
26
+ define v 'aeiou{a'}{e'}{i'}{o'}{u'}'
27
+
28
+ define mark_regions as ( // sets the marks pV, p1 and p2 that the RV, R1 and R2 tests compare against
29
+
30
+ $pV = limit
31
+ $p1 = limit
32
+ $p2 = limit // defaults: all three start at the end of the word
33
+
34
+ do ( // any failing step leaves the remaining marks at their defaults
35
+ gopast v setmark pV // pV: just after the first vowel
36
+ gopast non-v setmark p1 // p1: just after the next non-vowel
37
+ gopast v gopast non-v setmark p2 // p2: after one further vowel and then a non-vowel
38
+ )
39
+ )
40
+
41
+ define initial_morph as ( // forward-mode routine, run first by stem: rewrites or removes one of the listed word-initial letter groups
42
+ [substring] among (
43
+ 'h-' 'n-' 't-' //nAthair -> n-athair, but alone are problematic
44
+ (delete)
45
+
46
+ // verbs
47
+ 'd{'}'
48
+ (delete)
49
+ 'd{'}fh'
50
+ (<- 'f')
51
+ // other contractions
52
+ 'm{'}' 'b{'}'
53
+ (delete)
54
+
55
+ 'sh'
56
+ (<- 's')
57
+
58
+ 'mb' // from here down to 'dt', each cluster is replaced by its final letter
59
+ (<- 'b')
60
+ 'gc'
61
+ (<- 'c')
62
+ 'nd'
63
+ (<- 'd')
64
+ 'bhf'
65
+ (<- 'f')
66
+ 'ng'
67
+ (<- 'g')
68
+ 'bp'
69
+ (<- 'p')
70
+ 'ts'
71
+ (<- 's')
72
+ 'dt'
73
+ (<- 't')
74
+
75
+ // Lenition: each pair below loses the h that follows the initial consonant
76
+ 'bh'
77
+ (<- 'b')
78
+ 'ch'
79
+ (<- 'c')
80
+ 'dh'
81
+ (<- 'd')
82
+ 'fh'
83
+ (<- 'f')
84
+ 'gh'
85
+ (<- 'g')
86
+ 'mh'
87
+ (<- 'm')
88
+ 'ph'
89
+ (<- 'p')
90
+ 'th'
91
+ (<- 't')
92
+ )
93
+ )
94
+
95
+ backwardmode (
96
+
97
+ define RV as $pV <= cursor // succeeds when the cursor is at or after mark pV
98
+ define R1 as $p1 <= cursor // succeeds when the cursor is at or after mark p1
99
+ define R2 as $p2 <= cursor // succeeds when the cursor is at or after mark p2
100
+
101
+ define noun_sfx as ( // backward-mode routine: deletes one of the listed endings when it starts inside the required region
102
+ [substring] among (
103
+ 'amh' 'eamh' 'abh' 'eabh'
104
+ 'aibh' 'ibh' 'aimh' 'imh'
105
+ 'a{i'}ocht' '{i'}ocht' 'a{i'}ochta' '{i'}ochta'
106
+ (R1 delete) // the endings listed above are deleted only if R1 holds
107
+ 'ire' 'ir{i'}' 'aire' 'air{i'}'
108
+ (R2 delete) // these four are deleted only if R2 holds
109
+ )
110
+ )
111
+ define deriv as ( // backward-mode routine: only the first group is region-checked (R2); the remaining groups are unconditional replacements
112
+ [substring] among (
113
+ 'acht' 'eacht' 'ach' 'each' 'eacht{u'}il' 'eachta' 'acht{u'}il' 'achta'
114
+ (R2 delete) //siopadóireacht -> siopadóir but not poblacht -> pobl
115
+ 'arcacht' 'arcachta{i'}' 'arcachta'
116
+ (<- 'arc') // monarcacht -> monarc
117
+ 'gineach' 'gineas' 'ginis'
118
+ (<- 'gin') // all three forms are reduced to gin
119
+ 'grafa{i'}och' 'grafa{i'}ocht' 'grafa{i'}ochta' 'grafa{i'}ochta{i'}'
120
+ (<- 'graf') // all four forms are reduced to graf
121
+ 'paite' 'patach' 'pataigh' 'patacha'
122
+ (<- 'paite') // all four forms are normalised to paite
123
+ '{o'}ideach' '{o'}ideacha' '{o'}idigh'
124
+ (<- '{o'}id')
125
+ )
126
+ )
127
+ define verb_sfx as ( // backward-mode routine: deletes one of the listed verb endings when it starts inside the required region
128
+ [substring] among (
129
+ 'imid' 'aimid' '{i'}mid' 'a{i'}mid'
130
+ 'faidh' 'fidh'
131
+ (RV delete) // the endings listed above are deleted only if RV holds
132
+ 'ain'
133
+ 'eadh' 'adh'
134
+ '{a'}il'
135
+ 'tear' 'tar'
136
+ (R1 delete) // this second group is deleted only if R1 holds
137
+ )
138
+ )
139
+ )
140
+
141
+ define stem as ( // external entry point of the Irish stemmer; every step is wrapped in do, so a failing step does not stop the later ones
142
+ do initial_morph // runs before mark_regions, so the regions are computed on the already rewritten word start
143
+ do mark_regions
144
+ backwards ( // the three suffix routines work from the end of the word
145
+ do noun_sfx
146
+ do deriv
147
+ do verb_sfx
148
+ )
149
+ )
@@ -0,0 +1,202 @@
1
+
2
+ routines (
3
+ exceptions
4
+ prelude postlude mark_regions
5
+ RV R1 R2
6
+ attached_pronoun
7
+ standard_suffix
8
+ verb_suffix
9
+ vowel_suffix
10
+ )
11
+
12
+ externals ( stem )
13
+
14
+ integers ( pV p1 p2 )
15
+
16
+ groupings ( v AEIO CG )
17
+
18
+ stringescapes {}
19
+
20
+ /* special characters: acute and grave forms of the five vowels */
21
+
22
+ stringdef a' '{U+00E1}'
23
+ stringdef a` '{U+00E0}'
24
+ stringdef e' '{U+00E9}'
25
+ stringdef e` '{U+00E8}'
26
+ stringdef i' '{U+00ED}'
27
+ stringdef i` '{U+00EC}'
28
+ stringdef o' '{U+00F3}'
29
+ stringdef o` '{U+00F2}'
30
+ stringdef u' '{U+00FA}'
31
+ stringdef u` '{U+00F9}'
32
+
33
+ define v 'aeiou{a`}{e`}{i`}{o`}{u`}' // vowels: plain and grave-accented only; prelude rewrites the acute forms to grave
34
+
35
+ define prelude as ( // forward pass: rewrites acute accents to grave, then upper-cases certain u and i (postlude lower-cases them again)
36
+ test repeat ( // first scan over the word; test puts the cursor back afterwards
37
+ [substring] among(
38
+ '{a'}' (<- '{a`}')
39
+ '{e'}' (<- '{e`}')
40
+ '{i'}' (<- '{i`}')
41
+ '{o'}' (<- '{o`}')
42
+ '{u'}' (<- '{u`}')
43
+ 'qu' (<- 'qU') // the u after q becomes U, which is outside grouping v
44
+ '' (next) // nothing to rewrite here: step over one character
45
+ )
46
+ )
47
+ repeat goto ( // second scan
48
+ v [ ('u' ] v <- 'U') or // a u with a vowel on each side becomes U
49
+ ('i' ] v <- 'I') // an i with a vowel on each side becomes I
50
+ )
51
+ )
52
+
53
+ define mark_regions as ( // sets the marks pV, p1 and p2 that the RV, R1 and R2 tests compare against
54
+
55
+ $pV = limit
56
+ $p1 = limit
57
+ $p2 = limit // defaults: all three start at the end of the word
58
+
59
+ do ( // pV; left at its default if no alternative below succeeds
60
+ ( v (non-v gopast v) or (v gopast non-v) ) // word starts with a vowel: vowel + non-vowel goes past the next vowel, vowel + vowel goes past the next non-vowel
61
+ or
62
+ ( non-v (non-v gopast v) or (v next) ) // word starts with a non-vowel: two non-vowels go past the next vowel, non-vowel + vowel moves on one more character
63
+ setmark pV
64
+ )
65
+ do ( // p1 and p2; a failing step leaves the remaining marks at their defaults
66
+ gopast v gopast non-v setmark p1 // p1: after the first non-vowel that follows a vowel
67
+ gopast v gopast non-v setmark p2 // p2: the same rule applied again, starting from p1
68
+ )
69
+ )
70
+
71
+ define postlude as repeat ( // forward pass that turns every upper-case I and U back into lower case
72
+
73
+ [substring] among(
74
+ 'I' (<- 'i')
75
+ 'U' (<- 'u')
76
+ '' (next) // any other character: step over it
77
+ )
78
+
79
+ )
80
+
81
+ backwardmode (
82
+
83
+ define RV as $pV <= cursor // succeeds when the cursor is at or after mark pV
84
+ define R1 as $p1 <= cursor // succeeds when the cursor is at or after mark p1
85
+ define R2 as $p2 <= cursor // succeeds when the cursor is at or after mark p2
86
+
87
+ define attached_pronoun as ( // backward-mode routine: handles a pronoun attached after ando/endo or ar/er/ir
88
+ [substring] among( // this first among only marks the pronoun as the slice; it has no actions of its own
89
+ 'ci' 'gli' 'la' 'le' 'li' 'lo'
90
+ 'mi' 'ne' 'si' 'ti' 'vi'
91
+ // the compound forms are:
92
+ 'sene' 'gliela' 'gliele' 'glieli' 'glielo' 'gliene'
93
+ 'mela' 'mele' 'meli' 'melo' 'mene'
94
+ 'tela' 'tele' 'teli' 'telo' 'tene'
95
+ 'cela' 'cele' 'celi' 'celo' 'cene'
96
+ 'vela' 'vele' 'veli' 'velo' 'vene'
97
+ )
98
+ among( (RV) // RV is tested for every entry below, on the ending found before the pronoun
99
+ 'ando' 'endo' (delete) // the pronoun (the marked slice) is deleted
100
+ 'ar' 'er' 'ir' (<- 'e') // the pronoun (the marked slice) is replaced by e
101
+ )
102
+ )
103
+
104
+ define standard_suffix as ( // backward-mode routine: removes or rewrites the longest matching suffix, subject to a region test
105
+ [substring] among(
106
+
107
+ 'anza' 'anze' 'ico' 'ici' 'ica' 'ice' 'iche' 'ichi' 'ismo'
108
+ 'ismi' 'abile' 'abili' 'ibile' 'ibili' 'ista' 'iste' 'isti'
109
+ 'ist{a`}' 'ist{e`}' 'ist{i`}' 'oso' 'osi' 'osa' 'ose' 'mente'
110
+ 'atrice' 'atrici'
111
+ 'ante' 'anti' // Note 1
112
+ ( R2 delete ) // plain removal when the suffix starts inside R2
113
+ 'azione' 'azioni' 'atore' 'atori'
114
+ ( R2 delete
115
+ try ( ['ic'] R2 delete ) // a preceding ic is removed too if it also lies in R2
116
+ )
117
+ 'logia' 'logie'
118
+ ( R2 <- 'log' )
119
+ 'uzione' 'uzioni' 'usione' 'usioni'
120
+ ( R2 <- 'u' )
121
+ 'enza' 'enze'
122
+ ( R2 <- 'ente' )
123
+ 'amento' 'amenti' 'imento' 'imenti'
124
+ ( RV delete ) // these four are tested against RV rather than R2
125
+ 'amente' (
126
+ R1 delete // amente itself only needs R1
127
+ try (
128
+ [substring] R2 delete among( // a preceding iv / os / ic / abil in R2 is removed as well
129
+ 'iv' ( ['at'] R2 delete ) // after iv, a preceding at in R2 is removed too
130
+ 'os' 'ic' 'abil'
131
+ )
132
+ )
133
+ )
134
+ 'it{a`}' (
135
+ R2 delete
136
+ try (
137
+ [substring] among(
138
+ 'abil' 'ic' 'iv' (R2 delete) // a preceding abil / ic / iv is removed if it lies in R2
139
+ )
140
+ )
141
+ )
142
+ 'ivo' 'ivi' 'iva' 'ive' (
143
+ R2 delete
144
+ try ( ['at'] R2 delete ['ic'] R2 delete ) // then a preceding at, and after that a preceding ic, each only if in R2
145
+ )
146
+ )
147
+ )
148
+
149
+ define verb_suffix as setlimit tomark pV for ( // backward-mode routine; the limit is moved to pV, so matching cannot reach text before that mark
150
+ [substring] among(
151
+ 'ammo' 'ando' 'ano' 'are' 'arono' 'asse' 'assero' 'assi'
152
+ 'assimo' 'ata' 'ate' 'ati' 'ato' 'ava' 'avamo' 'avano' 'avate'
153
+ 'avi' 'avo' 'emmo' 'enda' 'ende' 'endi' 'endo' 'er{a`}' 'erai'
154
+ 'eranno' 'ere' 'erebbe' 'erebbero' 'erei' 'eremmo' 'eremo'
155
+ 'ereste' 'eresti' 'erete' 'er{o`}' 'erono' 'essero' 'ete'
156
+ 'eva' 'evamo' 'evano' 'evate' 'evi' 'evo' 'Yamo' 'iamo' 'immo' // NOTE(review): nothing in this file produces an upper-case Y, so the Yamo entry looks unreachable for lower-case input - confirm intent
157
+ 'ir{a`}' 'irai' 'iranno' 'ire' 'irebbe' 'irebbero' 'irei'
158
+ 'iremmo' 'iremo' 'ireste' 'iresti' 'irete' 'ir{o`}' 'irono'
159
+ 'isca' 'iscano' 'isce' 'isci' 'isco' 'iscono' 'issero' 'ita'
160
+ 'ite' 'iti' 'ito' 'iva' 'ivamo' 'ivano' 'ivate' 'ivi' 'ivo'
161
+ 'ono' 'uta' 'ute' 'uti' 'uto'
162
+
163
+ 'ar' 'ir' // but 'er' is problematical
164
+ (delete) // single shared action: every ending in the list is simply deleted
165
+ )
166
+ )
167
+
168
+ define AEIO 'aeio{a`}{e`}{i`}{o`}' // grouping: the final vowels that vowel_suffix may delete (no u)
169
+ define CG 'cg' // grouping: the letters that must precede an h removed by vowel_suffix
170
+
171
+ define vowel_suffix as ( // backward-mode routine: trims a final vowel and a final h after c or g; both steps are optional
172
+ try (
173
+ [AEIO] RV delete // a final letter from AEIO is deleted if it lies in RV
174
+ ['i'] RV delete // then a preceding i is deleted too, if it lies in RV
175
+ )
176
+ try (
177
+ ['h'] CG RV delete // a final h is deleted when the letter before it is c or g and the RV test holds
178
+ )
179
+ )
180
+ )
181
+
182
+ define exceptions as ( // whole-word special case tried by stem before the main algorithm; atlimit makes sure the entire word matched
183
+ ['divano' atlimit ] <- 'divan' // Otherwise "divano" stems to "div" and collides with "diva"
184
+ )
185
+
186
+ define stem as ( // external entry point of the Italian stemmer
187
+ exceptions or ( // when an exception matched, none of the steps below run
188
+ do prelude
189
+ do mark_regions // computed after prelude, so the upper-cased U and I count as non-vowels
190
+ backwards (
191
+ do attached_pronoun
192
+ do (standard_suffix or verb_suffix) // verb_suffix is tried only when standard_suffix fails
193
+ do vowel_suffix
194
+ )
195
+ do postlude // restores the letters that prelude upper-cased
196
+ )
197
+ )
198
+
199
+ /*
200
+ Note 1: additions of 15 Jun 2005
201
+ */
202
+