mittens 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/CHANGELOG.md +3 -0
- data/Gemfile +7 -0
- data/LICENSE.txt +30 -0
- data/README.md +62 -0
- data/Rakefile +21 -0
- data/ext/mittens/ext.c +96 -0
- data/ext/mittens/extconf.rb +12 -0
- data/lib/mittens/version.rb +3 -0
- data/lib/mittens.rb +7 -0
- data/mittens.gemspec +22 -0
- data/vendor/snowball/.gitignore +26 -0
- data/vendor/snowball/.travis.yml +112 -0
- data/vendor/snowball/AUTHORS +27 -0
- data/vendor/snowball/CONTRIBUTING.rst +216 -0
- data/vendor/snowball/COPYING +29 -0
- data/vendor/snowball/GNUmakefile +742 -0
- data/vendor/snowball/NEWS +754 -0
- data/vendor/snowball/README.rst +37 -0
- data/vendor/snowball/ada/README.md +74 -0
- data/vendor/snowball/ada/generate/generate.adb +83 -0
- data/vendor/snowball/ada/generate.gpr +21 -0
- data/vendor/snowball/ada/src/stemmer.adb +620 -0
- data/vendor/snowball/ada/src/stemmer.ads +219 -0
- data/vendor/snowball/ada/src/stemwords.adb +70 -0
- data/vendor/snowball/ada/stemmer_config.gpr +83 -0
- data/vendor/snowball/ada/stemwords.gpr +21 -0
- data/vendor/snowball/algorithms/arabic.sbl +558 -0
- data/vendor/snowball/algorithms/armenian.sbl +301 -0
- data/vendor/snowball/algorithms/basque.sbl +149 -0
- data/vendor/snowball/algorithms/catalan.sbl +202 -0
- data/vendor/snowball/algorithms/danish.sbl +93 -0
- data/vendor/snowball/algorithms/dutch.sbl +164 -0
- data/vendor/snowball/algorithms/english.sbl +229 -0
- data/vendor/snowball/algorithms/finnish.sbl +197 -0
- data/vendor/snowball/algorithms/french.sbl +254 -0
- data/vendor/snowball/algorithms/german.sbl +139 -0
- data/vendor/snowball/algorithms/german2.sbl +145 -0
- data/vendor/snowball/algorithms/greek.sbl +701 -0
- data/vendor/snowball/algorithms/hindi.sbl +323 -0
- data/vendor/snowball/algorithms/hungarian.sbl +241 -0
- data/vendor/snowball/algorithms/indonesian.sbl +192 -0
- data/vendor/snowball/algorithms/irish.sbl +149 -0
- data/vendor/snowball/algorithms/italian.sbl +202 -0
- data/vendor/snowball/algorithms/kraaij_pohlmann.sbl +240 -0
- data/vendor/snowball/algorithms/lithuanian.sbl +373 -0
- data/vendor/snowball/algorithms/lovins.sbl +208 -0
- data/vendor/snowball/algorithms/nepali.sbl +92 -0
- data/vendor/snowball/algorithms/norwegian.sbl +80 -0
- data/vendor/snowball/algorithms/porter.sbl +139 -0
- data/vendor/snowball/algorithms/portuguese.sbl +218 -0
- data/vendor/snowball/algorithms/romanian.sbl +236 -0
- data/vendor/snowball/algorithms/russian.sbl +221 -0
- data/vendor/snowball/algorithms/serbian.sbl +2379 -0
- data/vendor/snowball/algorithms/spanish.sbl +230 -0
- data/vendor/snowball/algorithms/swedish.sbl +72 -0
- data/vendor/snowball/algorithms/tamil.sbl +405 -0
- data/vendor/snowball/algorithms/turkish.sbl +470 -0
- data/vendor/snowball/algorithms/yiddish.sbl +460 -0
- data/vendor/snowball/charsets/ISO-8859-2.sbl +98 -0
- data/vendor/snowball/charsets/KOI8-R.sbl +74 -0
- data/vendor/snowball/charsets/cp850.sbl +130 -0
- data/vendor/snowball/compiler/analyser.c +1547 -0
- data/vendor/snowball/compiler/driver.c +615 -0
- data/vendor/snowball/compiler/generator.c +1748 -0
- data/vendor/snowball/compiler/generator_ada.c +1702 -0
- data/vendor/snowball/compiler/generator_csharp.c +1322 -0
- data/vendor/snowball/compiler/generator_go.c +1278 -0
- data/vendor/snowball/compiler/generator_java.c +1313 -0
- data/vendor/snowball/compiler/generator_js.c +1316 -0
- data/vendor/snowball/compiler/generator_pascal.c +1387 -0
- data/vendor/snowball/compiler/generator_python.c +1337 -0
- data/vendor/snowball/compiler/generator_rust.c +1295 -0
- data/vendor/snowball/compiler/header.h +418 -0
- data/vendor/snowball/compiler/space.c +286 -0
- data/vendor/snowball/compiler/syswords.h +86 -0
- data/vendor/snowball/compiler/syswords2.h +13 -0
- data/vendor/snowball/compiler/tokeniser.c +567 -0
- data/vendor/snowball/csharp/.gitignore +8 -0
- data/vendor/snowball/csharp/Snowball/Algorithms/.gitignore +1 -0
- data/vendor/snowball/csharp/Snowball/Among.cs +108 -0
- data/vendor/snowball/csharp/Snowball/AssemblyInfo.cs +36 -0
- data/vendor/snowball/csharp/Snowball/Stemmer.cs +660 -0
- data/vendor/snowball/csharp/Stemwords/App.config +6 -0
- data/vendor/snowball/csharp/Stemwords/Program.cs +114 -0
- data/vendor/snowball/doc/TODO +12 -0
- data/vendor/snowball/doc/libstemmer_c_README +148 -0
- data/vendor/snowball/doc/libstemmer_csharp_README +53 -0
- data/vendor/snowball/doc/libstemmer_java_README +67 -0
- data/vendor/snowball/doc/libstemmer_js_README +48 -0
- data/vendor/snowball/doc/libstemmer_python_README +113 -0
- data/vendor/snowball/examples/stemwords.c +204 -0
- data/vendor/snowball/go/README.md +55 -0
- data/vendor/snowball/go/among.go +16 -0
- data/vendor/snowball/go/env.go +403 -0
- data/vendor/snowball/go/stemwords/generate.go +68 -0
- data/vendor/snowball/go/stemwords/main.go +68 -0
- data/vendor/snowball/go/util.go +34 -0
- data/vendor/snowball/iconv.py +50 -0
- data/vendor/snowball/include/libstemmer.h +78 -0
- data/vendor/snowball/java/org/tartarus/snowball/Among.java +29 -0
- data/vendor/snowball/java/org/tartarus/snowball/SnowballProgram.java +381 -0
- data/vendor/snowball/java/org/tartarus/snowball/SnowballStemmer.java +8 -0
- data/vendor/snowball/java/org/tartarus/snowball/TestApp.java +75 -0
- data/vendor/snowball/javascript/base-stemmer.js +294 -0
- data/vendor/snowball/javascript/stemwords.js +106 -0
- data/vendor/snowball/libstemmer/libstemmer_c.in +96 -0
- data/vendor/snowball/libstemmer/mkalgorithms.pl +90 -0
- data/vendor/snowball/libstemmer/mkmodules.pl +267 -0
- data/vendor/snowball/libstemmer/modules.txt +63 -0
- data/vendor/snowball/libstemmer/test.c +34 -0
- data/vendor/snowball/pascal/.gitignore +4 -0
- data/vendor/snowball/pascal/SnowballProgram.pas +430 -0
- data/vendor/snowball/pascal/generate.pl +23 -0
- data/vendor/snowball/pascal/stemwords-template.dpr +78 -0
- data/vendor/snowball/python/MANIFEST.in +7 -0
- data/vendor/snowball/python/create_init.py +54 -0
- data/vendor/snowball/python/setup.cfg +6 -0
- data/vendor/snowball/python/setup.py +81 -0
- data/vendor/snowball/python/snowballstemmer/among.py +13 -0
- data/vendor/snowball/python/snowballstemmer/basestemmer.py +323 -0
- data/vendor/snowball/python/stemwords.py +101 -0
- data/vendor/snowball/python/testapp.py +28 -0
- data/vendor/snowball/runtime/api.c +58 -0
- data/vendor/snowball/runtime/api.h +32 -0
- data/vendor/snowball/runtime/header.h +61 -0
- data/vendor/snowball/runtime/utilities.c +513 -0
- data/vendor/snowball/rust/Cargo.toml +7 -0
- data/vendor/snowball/rust/build.rs +55 -0
- data/vendor/snowball/rust/rust-pre-1.27-compat.patch +30 -0
- data/vendor/snowball/rust/src/main.rs +102 -0
- data/vendor/snowball/rust/src/snowball/algorithms/mod.rs +2 -0
- data/vendor/snowball/rust/src/snowball/among.rs +6 -0
- data/vendor/snowball/rust/src/snowball/mod.rs +6 -0
- data/vendor/snowball/rust/src/snowball/snowball_env.rs +421 -0
- data/vendor/snowball/tests/stemtest.c +95 -0
- metadata +178 -0
|
@@ -0,0 +1,192 @@
|
|
|
1
|
+
// An implementation of the "Porter Stemmer for Bahasa Indonesia" from:
|
|
2
|
+
// http://www.illc.uva.nl/Research/Publications/Reports/MoL-2003-02.text.pdf
|
|
3
|
+
|
|
4
|
+
integers (
|
|
5
|
+
// The paper defines measure as the number of vowels in the word. We
|
|
6
|
+
// count this initially, then adjust the count each time we remove a
|
|
7
|
+
// prefix or suffix.
|
|
8
|
+
measure
|
|
9
|
+
|
|
10
|
+
// Numeric code for the type of prefix removed:
|
|
11
|
+
//
|
|
12
|
+
// 0 other/none
|
|
13
|
+
// 1 'di' or 'meng' or 'ter'
|
|
14
|
+
// 2 'per'
|
|
15
|
+
// 3 'ke' or 'peng'
|
|
16
|
+
// 4 'ber'
|
|
17
|
+
//
|
|
18
|
+
// Some of these have variant forms, so e.g. "meng" includes "men", "me",
|
|
19
|
+
// "meny", "mem".
|
|
20
|
+
//
|
|
21
|
+
// Note that the value of prefix is only used in remove_suffix (and
|
|
22
|
+
// routines it calls) so we don't need to worry about
|
|
23
|
+
// remove_second_order_prefix overwriting a value of prefix set by
|
|
24
|
+
// remove_first_order_prefix since remove_suffix gets called between
|
|
25
|
+
// the two.
|
|
26
|
+
prefix
|
|
27
|
+
)
|
|
28
|
+
|
|
29
|
+
groupings ( vowel )
|
|
30
|
+
|
|
31
|
+
routines (
|
|
32
|
+
remove_particle
|
|
33
|
+
remove_possessive_pronoun
|
|
34
|
+
remove_first_order_prefix
|
|
35
|
+
remove_second_order_prefix
|
|
36
|
+
remove_suffix
|
|
37
|
+
KER
|
|
38
|
+
SUFFIX_KAN_OK
|
|
39
|
+
SUFFIX_AN_OK
|
|
40
|
+
SUFFIX_I_OK
|
|
41
|
+
VOWEL
|
|
42
|
+
)
|
|
43
|
+
|
|
44
|
+
externals ( stem )
|
|
45
|
+
|
|
46
|
+
stringescapes {}
|
|
47
|
+
|
|
48
|
+
backwardmode (
|
|
49
|
+
|
|
50
|
+
define remove_particle as (
|
|
51
|
+
[substring] among (
|
|
52
|
+
'kah' 'lah' 'pun' (delete $measure-=1)
|
|
53
|
+
)
|
|
54
|
+
)
|
|
55
|
+
|
|
56
|
+
define remove_possessive_pronoun as (
|
|
57
|
+
[substring] among (
|
|
58
|
+
'ku' 'mu' 'nya' (delete $measure-=1)
|
|
59
|
+
)
|
|
60
|
+
)
|
|
61
|
+
|
|
62
|
+
// prefix not in {ke, peng, per}
|
|
63
|
+
define SUFFIX_KAN_OK as (
|
|
64
|
+
// On page 29, the example "kompas Q.31" says "Both Nazief and Porter
|
|
65
|
+
// stemmer converted the word peledakan (blast, explotion) to ledak (to
|
|
66
|
+
// blast, to explode)". However, the algorithm as described doesn't
|
|
67
|
+
// behave in this way - grammatically the prefix pe- occurs as a
|
|
68
|
+
// variation of both the first-order derivational prefix peng- and the
|
|
69
|
+
// second-order derivational prefix per-, but table 2.5 doesn't include
|
|
70
|
+
// "pe", only table 2.6 does, so "peledakan" is handled (incorrectly)
|
|
71
|
+
// as having prefix "per" not "peng", and so we remove derivational
|
|
72
|
+
// suffix "kan" rather than "an" to give stem leda. (Porter-style
|
|
73
|
+
// stemmers remove the longest suffix they can amongst those available,
|
|
74
|
+
// which this paper notes in the last paragraph on page 15).
|
|
75
|
+
//
|
|
76
|
+
// We resolve this by amending the condition on suffix "kan" to
|
|
77
|
+
// "prefix ∉ {ke, peng, per}", which seems to make the stemmer's
|
|
78
|
+
// behaviour match all the examples in the paper except for one:
|
|
79
|
+
// "perbaikan" is shown in table 3.4 as stemming to "bai", but with
|
|
80
|
+
// this change it now stems to "baik". The table notes that "baik" is
|
|
81
|
+
// the actual root so this deviation is an improvement. In a sample
|
|
82
|
+
// vocabulary derived from the most common words in id.wikipedia.org,
|
|
83
|
+
// this change only affects 0.12% of words (76 out of 64,587, including
|
|
84
|
+
// "peledakan" and "perbaikan").
|
|
85
|
+
$prefix != 3 and $prefix != 2
|
|
86
|
+
)
|
|
87
|
+
|
|
88
|
+
// prefix not in {di, meng, ter}
|
|
89
|
+
define SUFFIX_AN_OK as ( $prefix != 1 )
|
|
90
|
+
|
|
91
|
+
define SUFFIX_I_OK as (
|
|
92
|
+
// prefix not in {ke, peng, ber}
|
|
93
|
+
$prefix <= 2
|
|
94
|
+
|
|
95
|
+
// The rest of the condition from the paper is:
|
|
96
|
+
// V|K...c₁c₁, c₁ ≠ s, c₂ ≠ i
|
|
97
|
+
//
|
|
98
|
+
// The meaning of this is unclear in several ways, and none of the
|
|
99
|
+
// examples given of the stemmer's behaviour in the paper help to
|
|
100
|
+
// resolve these issues.
|
|
101
|
+
//
|
|
102
|
+
// Notice that c₂ isn't actually used - the most obvious explanation
|
|
103
|
+
// seems to be that "c₁c₁" should read "c₁c₂", or maybe "c₂c₁".
|
|
104
|
+
//
|
|
105
|
+
// Elsewhere the paper defines V... as meaning "the stem starts with
|
|
106
|
+
// a vowel" and K... as meaning "the stem starts with a consonant".
|
|
107
|
+
//
|
|
108
|
+
// In other places where it says X|Y... it seems the | binds more
|
|
109
|
+
// tightly, so it's (V|K)...cᵢcⱼ not V|(K...cᵢcⱼ). That seems a bit
|
|
110
|
+
// odd as the first letter must be either a vowel or a consonant, so
|
|
111
|
+
// that really just means "ends cᵢcⱼ". However, nowhere in the paper
|
|
112
|
+
// uses or defines a notation such as ...X, which may explain this
|
|
113
|
+
// seemingly redundant way of specifying this.
|
|
114
|
+
//
|
|
115
|
+
// The conditions elsewhere on prefix removal (e.g. V...) are clearly
|
|
116
|
+
// on the stem left after the prefix is removed. None of the other
|
|
117
|
+
// rules for suffix removal have conditions on the stem, but for
|
|
118
|
+
// consistency with the prefix rules we might expect that the cᵢcⱼ
|
|
119
|
+
// test is on what's left *after* removing the "i" suffix.
|
|
120
|
+
//
|
|
121
|
+
// However, studying Indonesian wordlists and discussion with a native
|
|
122
|
+
// speaker leads us to conclude that the purpose of this check is to
|
|
123
|
+
// protect words of foreign origin (e.g. "televisi", "organisasi",
|
|
124
|
+
// "komunikasi") from stemming, and the common feature of these is
|
|
125
|
+
// that the word ends "-si", so we conclude that the condition here
|
|
126
|
+
// should be read as "word does not end -si", and this is what we
|
|
127
|
+
// have implemented.
|
|
128
|
+
not 's'
|
|
129
|
+
)
|
|
130
|
+
|
|
131
|
+
define remove_suffix as (
|
|
132
|
+
[substring] among (
|
|
133
|
+
'kan' SUFFIX_KAN_OK 'an' SUFFIX_AN_OK 'i' SUFFIX_I_OK
|
|
134
|
+
(delete $measure-=1)
|
|
135
|
+
)
|
|
136
|
+
)
|
|
137
|
+
)
|
|
138
|
+
|
|
139
|
+
define vowel 'aeiou'
|
|
140
|
+
|
|
141
|
+
define VOWEL as ( vowel )
|
|
142
|
+
|
|
143
|
+
define KER as ( non-vowel 'er' )
|
|
144
|
+
|
|
145
|
+
define remove_first_order_prefix as (
|
|
146
|
+
[substring] among (
|
|
147
|
+
'di' 'meng' 'men' 'me' 'ter' (delete $prefix=1 $measure-=1)
|
|
148
|
+
'ke' 'peng' 'pen' (delete $prefix=3 $measure-=1)
|
|
149
|
+
'meny' VOWEL ($prefix=1 <-'s' $measure-=1)
|
|
150
|
+
'peny' VOWEL ($prefix=3 <-'s' $measure-=1)
|
|
151
|
+
'mem' ($prefix=1 $measure-=1 vowel and <-'p' or delete)
|
|
152
|
+
'pem' ($prefix=3 $measure-=1 vowel and <-'p' or delete)
|
|
153
|
+
)
|
|
154
|
+
)
|
|
155
|
+
|
|
156
|
+
define remove_second_order_prefix as (
|
|
157
|
+
// The paper has the condition on removal of prefix "bel" and "pel" as
|
|
158
|
+
// just "ajar" not "ajar..." but it seems that the latter must be what
|
|
159
|
+
// is intended so that e.g. "pelajaran" stems to "ajar" not "lajar".
|
|
160
|
+
// This change only affects a very small number of words (11 out of
|
|
161
|
+
// 64,587) and only for the better.
|
|
162
|
+
[substring] among (
|
|
163
|
+
'per' 'pe' (delete $prefix=2 $measure-=1)
|
|
164
|
+
'pelajar' (<-'ajar' $measure-=1)
|
|
165
|
+
'ber' (delete $prefix=4 $measure-=1)
|
|
166
|
+
'belajar' (<-'ajar' $prefix=4 $measure-=1)
|
|
167
|
+
'be' KER (delete $prefix=4 $measure-=1)
|
|
168
|
+
)
|
|
169
|
+
)
|
|
170
|
+
|
|
171
|
+
define stem as (
|
|
172
|
+
$measure = 0
|
|
173
|
+
do ( repeat ( gopast vowel $measure+=1 ) )
|
|
174
|
+
$measure > 2
|
|
175
|
+
$prefix = 0
|
|
176
|
+
backwards (
|
|
177
|
+
do remove_particle
|
|
178
|
+
$measure > 2
|
|
179
|
+
do remove_possessive_pronoun
|
|
180
|
+
)
|
|
181
|
+
$measure > 2
|
|
182
|
+
test (
|
|
183
|
+
remove_first_order_prefix
|
|
184
|
+
do (
|
|
185
|
+
test ($measure > 2 backwards remove_suffix)
|
|
186
|
+
$measure > 2 remove_second_order_prefix
|
|
187
|
+
)
|
|
188
|
+
) or (
|
|
189
|
+
do remove_second_order_prefix
|
|
190
|
+
do ($measure > 2 backwards remove_suffix)
|
|
191
|
+
)
|
|
192
|
+
)
|
|
@@ -0,0 +1,149 @@
|
|
|
1
|
+
routines (
|
|
2
|
+
R1 R2 RV
|
|
3
|
+
initial_morph
|
|
4
|
+
mark_regions
|
|
5
|
+
noun_sfx
|
|
6
|
+
deriv
|
|
7
|
+
verb_sfx
|
|
8
|
+
)
|
|
9
|
+
|
|
10
|
+
externals ( stem )
|
|
11
|
+
|
|
12
|
+
integers ( pV p1 p2 )
|
|
13
|
+
|
|
14
|
+
groupings ( v )
|
|
15
|
+
|
|
16
|
+
stringescapes {}
|
|
17
|
+
|
|
18
|
+
/* Accented characters */
|
|
19
|
+
|
|
20
|
+
stringdef a' '{U+00E1}' // a-acute
|
|
21
|
+
stringdef e' '{U+00E9}' // e-acute
|
|
22
|
+
stringdef i' '{U+00ED}' // i-acute
|
|
23
|
+
stringdef o' '{U+00F3}' // o-acute
|
|
24
|
+
stringdef u' '{U+00FA}' // u-acute
|
|
25
|
+
|
|
26
|
+
define v 'aeiou{a'}{e'}{i'}{o'}{u'}'
|
|
27
|
+
|
|
28
|
+
define mark_regions as (
|
|
29
|
+
|
|
30
|
+
$pV = limit
|
|
31
|
+
$p1 = limit
|
|
32
|
+
$p2 = limit // defaults
|
|
33
|
+
|
|
34
|
+
do (
|
|
35
|
+
gopast v setmark pV
|
|
36
|
+
gopast non-v setmark p1
|
|
37
|
+
gopast v gopast non-v setmark p2
|
|
38
|
+
)
|
|
39
|
+
)
|
|
40
|
+
|
|
41
|
+
define initial_morph as (
|
|
42
|
+
[substring] among (
|
|
43
|
+
'h-' 'n-' 't-' //nAthair -> n-athair, but alone are problematic
|
|
44
|
+
(delete)
|
|
45
|
+
|
|
46
|
+
// verbs
|
|
47
|
+
'd{'}'
|
|
48
|
+
(delete)
|
|
49
|
+
'd{'}fh'
|
|
50
|
+
(<- 'f')
|
|
51
|
+
// other contractions
|
|
52
|
+
'm{'}' 'b{'}'
|
|
53
|
+
(delete)
|
|
54
|
+
|
|
55
|
+
'sh'
|
|
56
|
+
(<- 's')
|
|
57
|
+
|
|
58
|
+
'mb'
|
|
59
|
+
(<- 'b')
|
|
60
|
+
'gc'
|
|
61
|
+
(<- 'c')
|
|
62
|
+
'nd'
|
|
63
|
+
(<- 'd')
|
|
64
|
+
'bhf'
|
|
65
|
+
(<- 'f')
|
|
66
|
+
'ng'
|
|
67
|
+
(<- 'g')
|
|
68
|
+
'bp'
|
|
69
|
+
(<- 'p')
|
|
70
|
+
'ts'
|
|
71
|
+
(<- 's')
|
|
72
|
+
'dt'
|
|
73
|
+
(<- 't')
|
|
74
|
+
|
|
75
|
+
// Lenition
|
|
76
|
+
'bh'
|
|
77
|
+
(<- 'b')
|
|
78
|
+
'ch'
|
|
79
|
+
(<- 'c')
|
|
80
|
+
'dh'
|
|
81
|
+
(<- 'd')
|
|
82
|
+
'fh'
|
|
83
|
+
(<- 'f')
|
|
84
|
+
'gh'
|
|
85
|
+
(<- 'g')
|
|
86
|
+
'mh'
|
|
87
|
+
(<- 'm')
|
|
88
|
+
'ph'
|
|
89
|
+
(<- 'p')
|
|
90
|
+
'th'
|
|
91
|
+
(<- 't')
|
|
92
|
+
)
|
|
93
|
+
)
|
|
94
|
+
|
|
95
|
+
backwardmode (
|
|
96
|
+
|
|
97
|
+
define RV as $pV <= cursor
|
|
98
|
+
define R1 as $p1 <= cursor
|
|
99
|
+
define R2 as $p2 <= cursor
|
|
100
|
+
|
|
101
|
+
define noun_sfx as (
|
|
102
|
+
[substring] among (
|
|
103
|
+
'amh' 'eamh' 'abh' 'eabh'
|
|
104
|
+
'aibh' 'ibh' 'aimh' 'imh'
|
|
105
|
+
'a{i'}ocht' '{i'}ocht' 'a{i'}ochta' '{i'}ochta'
|
|
106
|
+
(R1 delete)
|
|
107
|
+
'ire' 'ir{i'}' 'aire' 'air{i'}'
|
|
108
|
+
(R2 delete)
|
|
109
|
+
)
|
|
110
|
+
)
|
|
111
|
+
define deriv as (
|
|
112
|
+
[substring] among (
|
|
113
|
+
'acht' 'eacht' 'ach' 'each' 'eacht{u'}il' 'eachta' 'acht{u'}il' 'achta'
|
|
114
|
+
(R2 delete) //siopadóireacht -> siopadóir but not poblacht -> pobl
|
|
115
|
+
'arcacht' 'arcachta{i'}' 'arcachta'
|
|
116
|
+
(<- 'arc') // monarcacht -> monarc
|
|
117
|
+
'gineach' 'gineas' 'ginis'
|
|
118
|
+
(<- 'gin')
|
|
119
|
+
'grafa{i'}och' 'grafa{i'}ocht' 'grafa{i'}ochta' 'grafa{i'}ochta{i'}'
|
|
120
|
+
(<- 'graf')
|
|
121
|
+
'paite' 'patach' 'pataigh' 'patacha'
|
|
122
|
+
(<- 'paite')
|
|
123
|
+
'{o'}ideach' '{o'}ideacha' '{o'}idigh'
|
|
124
|
+
(<- '{o'}id')
|
|
125
|
+
)
|
|
126
|
+
)
|
|
127
|
+
define verb_sfx as (
|
|
128
|
+
[substring] among (
|
|
129
|
+
'imid' 'aimid' '{i'}mid' 'a{i'}mid'
|
|
130
|
+
'faidh' 'fidh'
|
|
131
|
+
(RV delete)
|
|
132
|
+
'ain'
|
|
133
|
+
'eadh' 'adh'
|
|
134
|
+
'{a'}il'
|
|
135
|
+
'tear' 'tar'
|
|
136
|
+
(R1 delete)
|
|
137
|
+
)
|
|
138
|
+
)
|
|
139
|
+
)
|
|
140
|
+
|
|
141
|
+
define stem as (
|
|
142
|
+
do initial_morph
|
|
143
|
+
do mark_regions
|
|
144
|
+
backwards (
|
|
145
|
+
do noun_sfx
|
|
146
|
+
do deriv
|
|
147
|
+
do verb_sfx
|
|
148
|
+
)
|
|
149
|
+
)
|
|
@@ -0,0 +1,202 @@
|
|
|
1
|
+
|
|
2
|
+
routines (
|
|
3
|
+
exceptions
|
|
4
|
+
prelude postlude mark_regions
|
|
5
|
+
RV R1 R2
|
|
6
|
+
attached_pronoun
|
|
7
|
+
standard_suffix
|
|
8
|
+
verb_suffix
|
|
9
|
+
vowel_suffix
|
|
10
|
+
)
|
|
11
|
+
|
|
12
|
+
externals ( stem )
|
|
13
|
+
|
|
14
|
+
integers ( pV p1 p2 )
|
|
15
|
+
|
|
16
|
+
groupings ( v AEIO CG )
|
|
17
|
+
|
|
18
|
+
stringescapes {}
|
|
19
|
+
|
|
20
|
+
/* special characters */
|
|
21
|
+
|
|
22
|
+
stringdef a' '{U+00E1}'
|
|
23
|
+
stringdef a` '{U+00E0}'
|
|
24
|
+
stringdef e' '{U+00E9}'
|
|
25
|
+
stringdef e` '{U+00E8}'
|
|
26
|
+
stringdef i' '{U+00ED}'
|
|
27
|
+
stringdef i` '{U+00EC}'
|
|
28
|
+
stringdef o' '{U+00F3}'
|
|
29
|
+
stringdef o` '{U+00F2}'
|
|
30
|
+
stringdef u' '{U+00FA}'
|
|
31
|
+
stringdef u` '{U+00F9}'
|
|
32
|
+
|
|
33
|
+
define v 'aeiou{a`}{e`}{i`}{o`}{u`}'
|
|
34
|
+
|
|
35
|
+
define prelude as (
|
|
36
|
+
test repeat (
|
|
37
|
+
[substring] among(
|
|
38
|
+
'{a'}' (<- '{a`}')
|
|
39
|
+
'{e'}' (<- '{e`}')
|
|
40
|
+
'{i'}' (<- '{i`}')
|
|
41
|
+
'{o'}' (<- '{o`}')
|
|
42
|
+
'{u'}' (<- '{u`}')
|
|
43
|
+
'qu' (<- 'qU')
|
|
44
|
+
'' (next)
|
|
45
|
+
)
|
|
46
|
+
)
|
|
47
|
+
repeat goto (
|
|
48
|
+
v [ ('u' ] v <- 'U') or
|
|
49
|
+
('i' ] v <- 'I')
|
|
50
|
+
)
|
|
51
|
+
)
|
|
52
|
+
|
|
53
|
+
define mark_regions as (
|
|
54
|
+
|
|
55
|
+
$pV = limit
|
|
56
|
+
$p1 = limit
|
|
57
|
+
$p2 = limit // defaults
|
|
58
|
+
|
|
59
|
+
do (
|
|
60
|
+
( v (non-v gopast v) or (v gopast non-v) )
|
|
61
|
+
or
|
|
62
|
+
( non-v (non-v gopast v) or (v next) )
|
|
63
|
+
setmark pV
|
|
64
|
+
)
|
|
65
|
+
do (
|
|
66
|
+
gopast v gopast non-v setmark p1
|
|
67
|
+
gopast v gopast non-v setmark p2
|
|
68
|
+
)
|
|
69
|
+
)
|
|
70
|
+
|
|
71
|
+
define postlude as repeat (
|
|
72
|
+
|
|
73
|
+
[substring] among(
|
|
74
|
+
'I' (<- 'i')
|
|
75
|
+
'U' (<- 'u')
|
|
76
|
+
'' (next)
|
|
77
|
+
)
|
|
78
|
+
|
|
79
|
+
)
|
|
80
|
+
|
|
81
|
+
backwardmode (
|
|
82
|
+
|
|
83
|
+
define RV as $pV <= cursor
|
|
84
|
+
define R1 as $p1 <= cursor
|
|
85
|
+
define R2 as $p2 <= cursor
|
|
86
|
+
|
|
87
|
+
define attached_pronoun as (
|
|
88
|
+
[substring] among(
|
|
89
|
+
'ci' 'gli' 'la' 'le' 'li' 'lo'
|
|
90
|
+
'mi' 'ne' 'si' 'ti' 'vi'
|
|
91
|
+
// the compound forms are:
|
|
92
|
+
'sene' 'gliela' 'gliele' 'glieli' 'glielo' 'gliene'
|
|
93
|
+
'mela' 'mele' 'meli' 'melo' 'mene'
|
|
94
|
+
'tela' 'tele' 'teli' 'telo' 'tene'
|
|
95
|
+
'cela' 'cele' 'celi' 'celo' 'cene'
|
|
96
|
+
'vela' 'vele' 'veli' 'velo' 'vene'
|
|
97
|
+
)
|
|
98
|
+
among( (RV)
|
|
99
|
+
'ando' 'endo' (delete)
|
|
100
|
+
'ar' 'er' 'ir' (<- 'e')
|
|
101
|
+
)
|
|
102
|
+
)
|
|
103
|
+
|
|
104
|
+
define standard_suffix as (
|
|
105
|
+
[substring] among(
|
|
106
|
+
|
|
107
|
+
'anza' 'anze' 'ico' 'ici' 'ica' 'ice' 'iche' 'ichi' 'ismo'
|
|
108
|
+
'ismi' 'abile' 'abili' 'ibile' 'ibili' 'ista' 'iste' 'isti'
|
|
109
|
+
'ist{a`}' 'ist{e`}' 'ist{i`}' 'oso' 'osi' 'osa' 'ose' 'mente'
|
|
110
|
+
'atrice' 'atrici'
|
|
111
|
+
'ante' 'anti' // Note 1
|
|
112
|
+
( R2 delete )
|
|
113
|
+
'azione' 'azioni' 'atore' 'atori'
|
|
114
|
+
( R2 delete
|
|
115
|
+
try ( ['ic'] R2 delete )
|
|
116
|
+
)
|
|
117
|
+
'logia' 'logie'
|
|
118
|
+
( R2 <- 'log' )
|
|
119
|
+
'uzione' 'uzioni' 'usione' 'usioni'
|
|
120
|
+
( R2 <- 'u' )
|
|
121
|
+
'enza' 'enze'
|
|
122
|
+
( R2 <- 'ente' )
|
|
123
|
+
'amento' 'amenti' 'imento' 'imenti'
|
|
124
|
+
( RV delete )
|
|
125
|
+
'amente' (
|
|
126
|
+
R1 delete
|
|
127
|
+
try (
|
|
128
|
+
[substring] R2 delete among(
|
|
129
|
+
'iv' ( ['at'] R2 delete )
|
|
130
|
+
'os' 'ic' 'abil'
|
|
131
|
+
)
|
|
132
|
+
)
|
|
133
|
+
)
|
|
134
|
+
'it{a`}' (
|
|
135
|
+
R2 delete
|
|
136
|
+
try (
|
|
137
|
+
[substring] among(
|
|
138
|
+
'abil' 'ic' 'iv' (R2 delete)
|
|
139
|
+
)
|
|
140
|
+
)
|
|
141
|
+
)
|
|
142
|
+
'ivo' 'ivi' 'iva' 'ive' (
|
|
143
|
+
R2 delete
|
|
144
|
+
try ( ['at'] R2 delete ['ic'] R2 delete )
|
|
145
|
+
)
|
|
146
|
+
)
|
|
147
|
+
)
|
|
148
|
+
|
|
149
|
+
define verb_suffix as setlimit tomark pV for (
|
|
150
|
+
[substring] among(
|
|
151
|
+
'ammo' 'ando' 'ano' 'are' 'arono' 'asse' 'assero' 'assi'
|
|
152
|
+
'assimo' 'ata' 'ate' 'ati' 'ato' 'ava' 'avamo' 'avano' 'avate'
|
|
153
|
+
'avi' 'avo' 'emmo' 'enda' 'ende' 'endi' 'endo' 'er{a`}' 'erai'
|
|
154
|
+
'eranno' 'ere' 'erebbe' 'erebbero' 'erei' 'eremmo' 'eremo'
|
|
155
|
+
'ereste' 'eresti' 'erete' 'er{o`}' 'erono' 'essero' 'ete'
|
|
156
|
+
'eva' 'evamo' 'evano' 'evate' 'evi' 'evo' 'Yamo' 'iamo' 'immo'
|
|
157
|
+
'ir{a`}' 'irai' 'iranno' 'ire' 'irebbe' 'irebbero' 'irei'
|
|
158
|
+
'iremmo' 'iremo' 'ireste' 'iresti' 'irete' 'ir{o`}' 'irono'
|
|
159
|
+
'isca' 'iscano' 'isce' 'isci' 'isco' 'iscono' 'issero' 'ita'
|
|
160
|
+
'ite' 'iti' 'ito' 'iva' 'ivamo' 'ivano' 'ivate' 'ivi' 'ivo'
|
|
161
|
+
'ono' 'uta' 'ute' 'uti' 'uto'
|
|
162
|
+
|
|
163
|
+
'ar' 'ir' // but 'er' is problematical
|
|
164
|
+
(delete)
|
|
165
|
+
)
|
|
166
|
+
)
|
|
167
|
+
|
|
168
|
+
define AEIO 'aeio{a`}{e`}{i`}{o`}'
|
|
169
|
+
define CG 'cg'
|
|
170
|
+
|
|
171
|
+
define vowel_suffix as (
|
|
172
|
+
try (
|
|
173
|
+
[AEIO] RV delete
|
|
174
|
+
['i'] RV delete
|
|
175
|
+
)
|
|
176
|
+
try (
|
|
177
|
+
['h'] CG RV delete
|
|
178
|
+
)
|
|
179
|
+
)
|
|
180
|
+
)
|
|
181
|
+
|
|
182
|
+
define exceptions as (
|
|
183
|
+
['divano' atlimit ] <- 'divan' // Otherwise "divano" stems to "div" and collides with "diva"
|
|
184
|
+
)
|
|
185
|
+
|
|
186
|
+
define stem as (
|
|
187
|
+
exceptions or (
|
|
188
|
+
do prelude
|
|
189
|
+
do mark_regions
|
|
190
|
+
backwards (
|
|
191
|
+
do attached_pronoun
|
|
192
|
+
do (standard_suffix or verb_suffix)
|
|
193
|
+
do vowel_suffix
|
|
194
|
+
)
|
|
195
|
+
do postlude
|
|
196
|
+
)
|
|
197
|
+
)
|
|
198
|
+
|
|
199
|
+
/*
|
|
200
|
+
Note 1: additions of 15 Jun 2005
|
|
201
|
+
*/
|
|
202
|
+
|