mittens 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/CHANGELOG.md +3 -0
- data/Gemfile +7 -0
- data/LICENSE.txt +30 -0
- data/README.md +62 -0
- data/Rakefile +21 -0
- data/ext/mittens/ext.c +96 -0
- data/ext/mittens/extconf.rb +12 -0
- data/lib/mittens/version.rb +3 -0
- data/lib/mittens.rb +7 -0
- data/mittens.gemspec +22 -0
- data/vendor/snowball/.gitignore +26 -0
- data/vendor/snowball/.travis.yml +112 -0
- data/vendor/snowball/AUTHORS +27 -0
- data/vendor/snowball/CONTRIBUTING.rst +216 -0
- data/vendor/snowball/COPYING +29 -0
- data/vendor/snowball/GNUmakefile +742 -0
- data/vendor/snowball/NEWS +754 -0
- data/vendor/snowball/README.rst +37 -0
- data/vendor/snowball/ada/README.md +74 -0
- data/vendor/snowball/ada/generate/generate.adb +83 -0
- data/vendor/snowball/ada/generate.gpr +21 -0
- data/vendor/snowball/ada/src/stemmer.adb +620 -0
- data/vendor/snowball/ada/src/stemmer.ads +219 -0
- data/vendor/snowball/ada/src/stemwords.adb +70 -0
- data/vendor/snowball/ada/stemmer_config.gpr +83 -0
- data/vendor/snowball/ada/stemwords.gpr +21 -0
- data/vendor/snowball/algorithms/arabic.sbl +558 -0
- data/vendor/snowball/algorithms/armenian.sbl +301 -0
- data/vendor/snowball/algorithms/basque.sbl +149 -0
- data/vendor/snowball/algorithms/catalan.sbl +202 -0
- data/vendor/snowball/algorithms/danish.sbl +93 -0
- data/vendor/snowball/algorithms/dutch.sbl +164 -0
- data/vendor/snowball/algorithms/english.sbl +229 -0
- data/vendor/snowball/algorithms/finnish.sbl +197 -0
- data/vendor/snowball/algorithms/french.sbl +254 -0
- data/vendor/snowball/algorithms/german.sbl +139 -0
- data/vendor/snowball/algorithms/german2.sbl +145 -0
- data/vendor/snowball/algorithms/greek.sbl +701 -0
- data/vendor/snowball/algorithms/hindi.sbl +323 -0
- data/vendor/snowball/algorithms/hungarian.sbl +241 -0
- data/vendor/snowball/algorithms/indonesian.sbl +192 -0
- data/vendor/snowball/algorithms/irish.sbl +149 -0
- data/vendor/snowball/algorithms/italian.sbl +202 -0
- data/vendor/snowball/algorithms/kraaij_pohlmann.sbl +240 -0
- data/vendor/snowball/algorithms/lithuanian.sbl +373 -0
- data/vendor/snowball/algorithms/lovins.sbl +208 -0
- data/vendor/snowball/algorithms/nepali.sbl +92 -0
- data/vendor/snowball/algorithms/norwegian.sbl +80 -0
- data/vendor/snowball/algorithms/porter.sbl +139 -0
- data/vendor/snowball/algorithms/portuguese.sbl +218 -0
- data/vendor/snowball/algorithms/romanian.sbl +236 -0
- data/vendor/snowball/algorithms/russian.sbl +221 -0
- data/vendor/snowball/algorithms/serbian.sbl +2379 -0
- data/vendor/snowball/algorithms/spanish.sbl +230 -0
- data/vendor/snowball/algorithms/swedish.sbl +72 -0
- data/vendor/snowball/algorithms/tamil.sbl +405 -0
- data/vendor/snowball/algorithms/turkish.sbl +470 -0
- data/vendor/snowball/algorithms/yiddish.sbl +460 -0
- data/vendor/snowball/charsets/ISO-8859-2.sbl +98 -0
- data/vendor/snowball/charsets/KOI8-R.sbl +74 -0
- data/vendor/snowball/charsets/cp850.sbl +130 -0
- data/vendor/snowball/compiler/analyser.c +1547 -0
- data/vendor/snowball/compiler/driver.c +615 -0
- data/vendor/snowball/compiler/generator.c +1748 -0
- data/vendor/snowball/compiler/generator_ada.c +1702 -0
- data/vendor/snowball/compiler/generator_csharp.c +1322 -0
- data/vendor/snowball/compiler/generator_go.c +1278 -0
- data/vendor/snowball/compiler/generator_java.c +1313 -0
- data/vendor/snowball/compiler/generator_js.c +1316 -0
- data/vendor/snowball/compiler/generator_pascal.c +1387 -0
- data/vendor/snowball/compiler/generator_python.c +1337 -0
- data/vendor/snowball/compiler/generator_rust.c +1295 -0
- data/vendor/snowball/compiler/header.h +418 -0
- data/vendor/snowball/compiler/space.c +286 -0
- data/vendor/snowball/compiler/syswords.h +86 -0
- data/vendor/snowball/compiler/syswords2.h +13 -0
- data/vendor/snowball/compiler/tokeniser.c +567 -0
- data/vendor/snowball/csharp/.gitignore +8 -0
- data/vendor/snowball/csharp/Snowball/Algorithms/.gitignore +1 -0
- data/vendor/snowball/csharp/Snowball/Among.cs +108 -0
- data/vendor/snowball/csharp/Snowball/AssemblyInfo.cs +36 -0
- data/vendor/snowball/csharp/Snowball/Stemmer.cs +660 -0
- data/vendor/snowball/csharp/Stemwords/App.config +6 -0
- data/vendor/snowball/csharp/Stemwords/Program.cs +114 -0
- data/vendor/snowball/doc/TODO +12 -0
- data/vendor/snowball/doc/libstemmer_c_README +148 -0
- data/vendor/snowball/doc/libstemmer_csharp_README +53 -0
- data/vendor/snowball/doc/libstemmer_java_README +67 -0
- data/vendor/snowball/doc/libstemmer_js_README +48 -0
- data/vendor/snowball/doc/libstemmer_python_README +113 -0
- data/vendor/snowball/examples/stemwords.c +204 -0
- data/vendor/snowball/go/README.md +55 -0
- data/vendor/snowball/go/among.go +16 -0
- data/vendor/snowball/go/env.go +403 -0
- data/vendor/snowball/go/stemwords/generate.go +68 -0
- data/vendor/snowball/go/stemwords/main.go +68 -0
- data/vendor/snowball/go/util.go +34 -0
- data/vendor/snowball/iconv.py +50 -0
- data/vendor/snowball/include/libstemmer.h +78 -0
- data/vendor/snowball/java/org/tartarus/snowball/Among.java +29 -0
- data/vendor/snowball/java/org/tartarus/snowball/SnowballProgram.java +381 -0
- data/vendor/snowball/java/org/tartarus/snowball/SnowballStemmer.java +8 -0
- data/vendor/snowball/java/org/tartarus/snowball/TestApp.java +75 -0
- data/vendor/snowball/javascript/base-stemmer.js +294 -0
- data/vendor/snowball/javascript/stemwords.js +106 -0
- data/vendor/snowball/libstemmer/libstemmer_c.in +96 -0
- data/vendor/snowball/libstemmer/mkalgorithms.pl +90 -0
- data/vendor/snowball/libstemmer/mkmodules.pl +267 -0
- data/vendor/snowball/libstemmer/modules.txt +63 -0
- data/vendor/snowball/libstemmer/test.c +34 -0
- data/vendor/snowball/pascal/.gitignore +4 -0
- data/vendor/snowball/pascal/SnowballProgram.pas +430 -0
- data/vendor/snowball/pascal/generate.pl +23 -0
- data/vendor/snowball/pascal/stemwords-template.dpr +78 -0
- data/vendor/snowball/python/MANIFEST.in +7 -0
- data/vendor/snowball/python/create_init.py +54 -0
- data/vendor/snowball/python/setup.cfg +6 -0
- data/vendor/snowball/python/setup.py +81 -0
- data/vendor/snowball/python/snowballstemmer/among.py +13 -0
- data/vendor/snowball/python/snowballstemmer/basestemmer.py +323 -0
- data/vendor/snowball/python/stemwords.py +101 -0
- data/vendor/snowball/python/testapp.py +28 -0
- data/vendor/snowball/runtime/api.c +58 -0
- data/vendor/snowball/runtime/api.h +32 -0
- data/vendor/snowball/runtime/header.h +61 -0
- data/vendor/snowball/runtime/utilities.c +513 -0
- data/vendor/snowball/rust/Cargo.toml +7 -0
- data/vendor/snowball/rust/build.rs +55 -0
- data/vendor/snowball/rust/rust-pre-1.27-compat.patch +30 -0
- data/vendor/snowball/rust/src/main.rs +102 -0
- data/vendor/snowball/rust/src/snowball/algorithms/mod.rs +2 -0
- data/vendor/snowball/rust/src/snowball/among.rs +6 -0
- data/vendor/snowball/rust/src/snowball/mod.rs +6 -0
- data/vendor/snowball/rust/src/snowball/snowball_env.rs +421 -0
- data/vendor/snowball/tests/stemtest.c +95 -0
- metadata +178 -0
|
@@ -0,0 +1,93 @@
|
|
|
1
|
+
routines (
|
|
2
|
+
mark_regions
|
|
3
|
+
main_suffix
|
|
4
|
+
consonant_pair
|
|
5
|
+
other_suffix
|
|
6
|
+
undouble
|
|
7
|
+
)
|
|
8
|
+
|
|
9
|
+
externals ( stem )
|
|
10
|
+
|
|
11
|
+
strings ( ch )
|
|
12
|
+
|
|
13
|
+
integers ( p1 x )
|
|
14
|
+
|
|
15
|
+
groupings ( c v s_ending )
|
|
16
|
+
|
|
17
|
+
stringescapes {}
|
|
18
|
+
|
|
19
|
+
/* special characters */
|
|
20
|
+
|
|
21
|
+
stringdef ae '{U+00E6}'
|
|
22
|
+
stringdef ao '{U+00E5}'
|
|
23
|
+
stringdef o/ '{U+00F8}'
|
|
24
|
+
|
|
25
|
+
// Grouping c: the consonant letters; undouble uses it to pick up the final consonant.
define c 'bcdfghjklmnpqrstvwxz'
|
|
26
|
+
|
|
27
|
+
// Grouping v: the vowels, including the three letters declared with stringdef
// ({ae}, {ao} and {o/}).
define v 'aeiouy{ae}{ao}{o/}'
|
|
28
|
+
|
|
29
|
+
// Grouping s_ending: the letters that must stand before a final 's' for
// main_suffix to delete that 's'.
define s_ending 'abcdfghjklmnoprtvyz{ao}'
|
|
30
|
+
|
|
31
|
+
// mark_regions: computes p1, the position from which suffixes may be removed.
// p1 starts at the end of the word (limit). x is set 3 characters into the word;
// if the word is shorter than that, the routine fails and p1 stays at limit.
// p1 is then placed just past the first non-vowel that follows a vowel, and is
// raised to x if it would otherwise lie before x.
define mark_regions as (
|
|
32
|
+
|
|
33
|
+
$p1 = limit
|
|
34
|
+
|
|
35
|
+
test ( hop 3 setmark x )
|
|
36
|
+
goto v gopast non-v setmark p1
|
|
37
|
+
try ( $p1 < x $p1 = x )
|
|
38
|
+
)
|
|
39
|
+
|
|
40
|
+
backwardmode (
|
|
41
|
+
|
|
42
|
+
// main_suffix: looks, no further back than p1, for one of the listed endings at
// the end of the word. The first group of endings is simply deleted. A bare
// final 's' is deleted only when the letter before it is in s_ending.
define main_suffix as (
|
|
43
|
+
setlimit tomark p1 for ([substring])
|
|
44
|
+
among(
|
|
45
|
+
|
|
46
|
+
'hed' 'ethed' 'ered' 'e' 'erede' 'ende' 'erende' 'ene' 'erne' 'ere'
|
|
47
|
+
'en' 'heden' 'eren' 'er' 'heder' 'erer' 'heds' 'es' 'endes'
|
|
48
|
+
'erendes' 'enes' 'ernes' 'eres' 'ens' 'hedens' 'erens' 'ers' 'ets'
|
|
49
|
+
'erets' 'et' 'eret'
|
|
50
|
+
(delete)
|
|
51
|
+
's'
|
|
52
|
+
(s_ending delete)
|
|
53
|
+
)
|
|
54
|
+
)
|
|
55
|
+
|
|
56
|
+
// consonant_pair: if the word ends (no further back than p1) in gd, dt, gt or kt,
// deletes the final letter. The match runs inside test, so the cursor is restored
// to the end of the word afterwards; "next]" then closes the slice one character
// back, so delete removes only the last letter of the pair.
define consonant_pair as (
|
|
57
|
+
test (
|
|
58
|
+
setlimit tomark p1 for ([substring])
|
|
59
|
+
among(
|
|
60
|
+
'gd' // significant in the call from other_suffix
|
|
61
|
+
'dt' 'gt' 'kt'
|
|
62
|
+
)
|
|
63
|
+
)
|
|
64
|
+
next] delete
|
|
65
|
+
)
|
|
66
|
+
|
|
67
|
+
// other_suffix: first, as an optional step (do), removes a final 'st' when it is
// preceded by 'ig'. Then, no further back than p1, it looks for one of the listed
// endings: ig, lig, elig and els are deleted and consonant_pair is tried again on
// the result; the ending l{o/}st is replaced by l{o/}s.
define other_suffix as (
|
|
68
|
+
do ( ['st'] 'ig' delete )
|
|
69
|
+
setlimit tomark p1 for ([substring])
|
|
70
|
+
among(
|
|
71
|
+
'ig' 'lig' 'elig' 'els'
|
|
72
|
+
(delete do consonant_pair)
|
|
73
|
+
'l{o/}st'
|
|
74
|
+
(<-'l{o/}s')
|
|
75
|
+
)
|
|
76
|
+
)
|
|
77
|
+
// undouble: takes the final letter, if it is a consonant from grouping c lying no
// further back than p1, as the slice and copies it into the string ch. If the
// same letter also stands immediately before it, the final one is deleted.
define undouble as (
|
|
78
|
+
setlimit tomark p1 for ([c] ->ch)
|
|
79
|
+
ch
|
|
80
|
+
delete
|
|
81
|
+
)
|
|
82
|
+
)
|
|
83
|
+
|
|
84
|
+
// stem: the external entry point. It computes p1 while scanning forwards, then
// runs the four suffix routines in backward mode, in the order given. Every step
// is wrapped in do, so a step that fails does not stop the following steps.
define stem as (
|
|
85
|
+
|
|
86
|
+
do mark_regions
|
|
87
|
+
backwards (
|
|
88
|
+
do main_suffix
|
|
89
|
+
do consonant_pair
|
|
90
|
+
do other_suffix
|
|
91
|
+
do undouble
|
|
92
|
+
)
|
|
93
|
+
)
|
|
@@ -0,0 +1,164 @@
|
|
|
1
|
+
routines (
|
|
2
|
+
prelude postlude
|
|
3
|
+
e_ending
|
|
4
|
+
en_ending
|
|
5
|
+
mark_regions
|
|
6
|
+
R1 R2
|
|
7
|
+
undouble
|
|
8
|
+
standard_suffix
|
|
9
|
+
)
|
|
10
|
+
|
|
11
|
+
externals ( stem )
|
|
12
|
+
|
|
13
|
+
booleans ( e_found )
|
|
14
|
+
|
|
15
|
+
integers ( p1 p2 )
|
|
16
|
+
|
|
17
|
+
groupings ( v v_I v_j )
|
|
18
|
+
|
|
19
|
+
stringescapes {}
|
|
20
|
+
|
|
21
|
+
/* special characters */
|
|
22
|
+
|
|
23
|
+
stringdef a" '{U+00E4}'
|
|
24
|
+
stringdef e" '{U+00EB}'
|
|
25
|
+
stringdef i" '{U+00EF}'
|
|
26
|
+
stringdef o" '{U+00F6}'
|
|
27
|
+
stringdef u" '{U+00FC}'
|
|
28
|
+
|
|
29
|
+
stringdef a' '{U+00E1}'
|
|
30
|
+
stringdef e' '{U+00E9}'
|
|
31
|
+
stringdef i' '{U+00ED}'
|
|
32
|
+
stringdef o' '{U+00F3}'
|
|
33
|
+
stringdef u' '{U+00FA}'
|
|
34
|
+
|
|
35
|
+
stringdef e` '{U+00E8}'
|
|
36
|
+
|
|
37
|
+
// Grouping v: the vowels, including the letter declared by the e-grave stringdef.
// v_I and v_j extend v with the marker 'I' (written by prelude) and with 'j'.
define v 'aeiouy{e`}'
|
|
38
|
+
define v_I v + 'I'
|
|
39
|
+
define v_j v + 'j'
|
|
40
|
+
|
|
41
|
+
// prelude: normalises the word before the regions are marked.
// Pass 1 runs under test, so the cursor returns to the start afterwards: it walks
// the word replacing the two accented forms of a, e, i, o and u declared with
// stringdef by the plain vowel, and steps over every other character.
// Pass 2: an initial y becomes Y; then, throughout the word, an i standing
// between two vowels becomes I and a y following a vowel becomes Y. The
// upper-case forms are not members of grouping v.
define prelude as (
|
|
42
|
+
test repeat (
|
|
43
|
+
[substring] among(
|
|
44
|
+
'{a"}' '{a'}'
|
|
45
|
+
(<- 'a')
|
|
46
|
+
'{e"}' '{e'}'
|
|
47
|
+
(<- 'e')
|
|
48
|
+
'{i"}' '{i'}'
|
|
49
|
+
(<- 'i')
|
|
50
|
+
'{o"}' '{o'}'
|
|
51
|
+
(<- 'o')
|
|
52
|
+
'{u"}' '{u'}'
|
|
53
|
+
(<- 'u')
|
|
54
|
+
'' (next)
|
|
55
|
+
) //or next
|
|
56
|
+
)
|
|
57
|
+
try(['y'] <- 'Y')
|
|
58
|
+
repeat goto (
|
|
59
|
+
v [('i'] v <- 'I') or
|
|
60
|
+
('y'] <- 'Y')
|
|
61
|
+
)
|
|
62
|
+
)
|
|
63
|
+
|
|
64
|
+
// mark_regions: computes p1 and p2, both of which start at the end of the word.
// p1 is set just past the first non-vowel that follows a vowel and is then raised
// to 3 if it is smaller. p2 is found by applying the same vowel / non-vowel scan
// again, continuing from the cursor position at which p1 was first set.
// If a scan fails, the routine fails and the remaining marks keep the value limit.
define mark_regions as (
|
|
65
|
+
|
|
66
|
+
$p1 = limit
|
|
67
|
+
$p2 = limit
|
|
68
|
+
|
|
69
|
+
gopast v gopast non-v setmark p1
|
|
70
|
+
try($p1 < 3 $p1 = 3) // at least 3
|
|
71
|
+
gopast v gopast non-v setmark p2
|
|
72
|
+
|
|
73
|
+
)
|
|
74
|
+
|
|
75
|
+
// postlude: undoes the marking done by prelude. It walks the word forwards,
// turning Y back into y and I back into i and stepping over every other
// character; the repeat stops when next can no longer advance.
define postlude as repeat (
|
|
76
|
+
|
|
77
|
+
[substring] among(
|
|
78
|
+
'Y' (<- 'y')
|
|
79
|
+
'I' (<- 'i')
|
|
80
|
+
'' (next)
|
|
81
|
+
) //or next
|
|
82
|
+
|
|
83
|
+
)
|
|
84
|
+
|
|
85
|
+
backwardmode (
|
|
86
|
+
|
|
87
|
+
// R1: succeeds when the cursor is at or after the mark p1.
define R1 as $p1 <= cursor
|
|
88
|
+
// R2: succeeds when the cursor is at or after the mark p2.
define R2 as $p2 <= cursor
|
|
89
|
+
|
|
90
|
+
// undouble: if the text before the cursor ends in kk, dd or tt (checked under
// test, so the cursor does not move), deletes the last of the two letters.
define undouble as (
|
|
91
|
+
test among('kk' 'dd' 'tt') [next] delete
|
|
92
|
+
)
|
|
93
|
+
|
|
94
|
+
// e_ending: clears e_found, then deletes a final 'e' provided that the e lies in
// R1 and is preceded by a non-vowel. When the deletion happens, e_found is set
// and undouble is applied to the shortened word.
define e_ending as (
|
|
95
|
+
unset e_found
|
|
96
|
+
['e'] R1 test non-v delete
|
|
97
|
+
set e_found
|
|
98
|
+
undouble
|
|
99
|
+
)
|
|
100
|
+
|
|
101
|
+
// en_ending: deletes the slice already marked by the caller, provided that the
// cursor is in R1, the preceding letter is a non-vowel, and the preceding text is
// not 'gem'. It then applies undouble.
define en_ending as (
|
|
102
|
+
R1 non-v and not 'gem' delete
|
|
103
|
+
undouble
|
|
104
|
+
)
|
|
105
|
+
|
|
106
|
+
// standard_suffix: the main suffix-removal routine, made of five optional (do)
// steps run in order.
// 1. Ending heden: replaced by heid when in R1. Endings en / ene: handled by
//    en_ending. Endings s / se: deleted when in R1 and preceded by a character
//    that is not in v_j.
// 2. e_ending.
// 3. Ending heid: deleted when in R2 and not preceded by 'c'; if 'en' then ends
//    the word, en_ending is applied to it.
// 4. Endings end / ing: deleted when in R2, after which either a final 'ig'
//    (in R2, not preceded by 'e') is deleted or undouble is applied.
//    Ending ig: deleted when in R2 and not preceded by 'e'.
//    Ending lijk: deleted when in R2, followed by e_ending.
//    Ending baar: deleted when in R2.
//    Ending bar: deleted when in R2 and e_found is set.
// 5. If the word ends in a character not in v_I, preceded by aa, ee, oo or uu,
//    preceded in turn by a non-vowel, one letter of the doubled vowel is deleted.
define standard_suffix as (
|
|
107
|
+
do (
|
|
108
|
+
[substring] among(
|
|
109
|
+
'heden'
|
|
110
|
+
( R1 <- 'heid'
|
|
111
|
+
)
|
|
112
|
+
'en' 'ene'
|
|
113
|
+
( en_ending
|
|
114
|
+
)
|
|
115
|
+
's' 'se'
|
|
116
|
+
( R1 non-v_j delete
|
|
117
|
+
)
|
|
118
|
+
)
|
|
119
|
+
)
|
|
120
|
+
do e_ending
|
|
121
|
+
|
|
122
|
+
do ( ['heid'] R2 not 'c' delete
|
|
123
|
+
['en'] en_ending
|
|
124
|
+
)
|
|
125
|
+
|
|
126
|
+
do (
|
|
127
|
+
[substring] among(
|
|
128
|
+
'end' 'ing'
|
|
129
|
+
( R2 delete
|
|
130
|
+
(['ig'] R2 not 'e' delete) or undouble
|
|
131
|
+
)
|
|
132
|
+
'ig'
|
|
133
|
+
( R2 not 'e' delete
|
|
134
|
+
)
|
|
135
|
+
'lijk'
|
|
136
|
+
( R2 delete e_ending
|
|
137
|
+
)
|
|
138
|
+
'baar'
|
|
139
|
+
( R2 delete
|
|
140
|
+
)
|
|
141
|
+
'bar'
|
|
142
|
+
( R2 e_found delete
|
|
143
|
+
)
|
|
144
|
+
)
|
|
145
|
+
)
|
|
146
|
+
do (
|
|
147
|
+
non-v_I
|
|
148
|
+
test (
|
|
149
|
+
among ('aa' 'ee' 'oo' 'uu')
|
|
150
|
+
non-v
|
|
151
|
+
)
|
|
152
|
+
[next] delete
|
|
153
|
+
)
|
|
154
|
+
)
|
|
155
|
+
)
|
|
156
|
+
|
|
157
|
+
// stem: the external entry point. prelude and mark_regions run forwards,
// standard_suffix runs in backward mode (backwards applies to the single command
// that follows it), and postlude runs forwards again. Every step is wrapped in
// do, so a failing step does not stop the ones after it.
define stem as (
|
|
158
|
+
|
|
159
|
+
do prelude
|
|
160
|
+
do mark_regions
|
|
161
|
+
backwards
|
|
162
|
+
do standard_suffix
|
|
163
|
+
do postlude
|
|
164
|
+
)
|
|
@@ -0,0 +1,229 @@
|
|
|
1
|
+
integers ( p1 p2 )
|
|
2
|
+
booleans ( Y_found )
|
|
3
|
+
|
|
4
|
+
routines (
|
|
5
|
+
prelude postlude
|
|
6
|
+
mark_regions
|
|
7
|
+
shortv
|
|
8
|
+
R1 R2
|
|
9
|
+
Step_1a Step_1b Step_1c Step_2 Step_3 Step_4 Step_5
|
|
10
|
+
exception1
|
|
11
|
+
exception2
|
|
12
|
+
)
|
|
13
|
+
|
|
14
|
+
externals ( stem )
|
|
15
|
+
|
|
16
|
+
groupings ( v v_WXY valid_LI )
|
|
17
|
+
|
|
18
|
+
stringescapes {}
|
|
19
|
+
|
|
20
|
+
// Grouping v: the vowels. v_WXY adds w, x and the marker Y; shortv uses it.
// valid_LI: the letters that must precede 'li' for Step_2 to delete that 'li'.
define v 'aeiouy'
|
|
21
|
+
define v_WXY v + 'wxY'
|
|
22
|
+
|
|
23
|
+
define valid_LI 'cdeghkmnrt'
|
|
24
|
+
|
|
25
|
+
// prelude: clears Y_found, then performs three optional (do) steps:
// deletes a leading {'} character, turns an initial y into Y, and turns every y
// that directly follows a vowel into Y. Y_found is set whenever a Y is written.
define prelude as (
|
|
26
|
+
unset Y_found
|
|
27
|
+
do ( ['{'}'] delete)
|
|
28
|
+
do ( ['y'] <-'Y' set Y_found)
|
|
29
|
+
do repeat(goto (v ['y']) <-'Y' set Y_found)
|
|
30
|
+
)
|
|
31
|
+
|
|
32
|
+
// mark_regions: computes p1 and p2, both of which start at the end of the word.
// If the word begins with gener, commun or arsen, p1 is set right after that
// prefix; otherwise p1 is set just past the first non-vowel that follows a vowel.
// p2 is set by running the same vowel / non-vowel scan again from p1.
// Everything is wrapped in do, so a failed scan leaves the remaining marks at limit.
define mark_regions as (
|
|
33
|
+
$p1 = limit
|
|
34
|
+
$p2 = limit
|
|
35
|
+
do(
|
|
36
|
+
among (
|
|
37
|
+
'gener'
|
|
38
|
+
'commun' // added May 2005
|
|
39
|
+
'arsen' // added Nov 2006 (arsenic/arsenal)
|
|
40
|
+
// ... extensions possible here ...
|
|
41
|
+
) or (gopast v gopast non-v)
|
|
42
|
+
setmark p1
|
|
43
|
+
gopast v gopast non-v setmark p2
|
|
44
|
+
)
|
|
45
|
+
)
|
|
46
|
+
|
|
47
|
+
backwardmode (
|
|
48
|
+
|
|
49
|
+
// shortv: a backward-mode test on the letters before the cursor. Reading
// backwards, it succeeds for a character not in v_WXY, then a vowel, then a
// non-vowel; or for a non-vowel, then a vowel, with nothing before the vowel.
define shortv as (
|
|
50
|
+
( non-v_WXY v non-v )
|
|
51
|
+
or
|
|
52
|
+
( non-v v atlimit )
|
|
53
|
+
)
|
|
54
|
+
|
|
55
|
+
// R1: succeeds when the cursor is at or after the mark p1.
define R1 as $p1 <= cursor
|
|
56
|
+
// R2: succeeds when the cursor is at or after the mark p2.
define R2 as $p2 <= cursor
|
|
57
|
+
|
|
58
|
+
// Step_1a: first optionally (try) deletes a trailing {'}, {'}s or {'}s{'}.
// Then it looks for one of these endings:
//   sses      -> replaced by ss
//   ied, ies  -> replaced by i when at least two characters precede the ending
//                (hop 2 succeeds), otherwise by ie
//   s         -> deleted when, after skipping the letter directly before the s,
//                a vowel is found somewhere earlier in the word
//   us, ss    -> listed without an action, so words with these endings are left
//                unchanged instead of falling under the 's' rule
define Step_1a as (
|
|
59
|
+
try (
|
|
60
|
+
[substring] among (
|
|
61
|
+
'{'}' '{'}s' '{'}s{'}'
|
|
62
|
+
(delete)
|
|
63
|
+
)
|
|
64
|
+
)
|
|
65
|
+
[substring] among (
|
|
66
|
+
'sses' (<-'ss')
|
|
67
|
+
'ied' 'ies'
|
|
68
|
+
((hop 2 <-'i') or <-'ie')
|
|
69
|
+
's' (next gopast v delete)
|
|
70
|
+
'us' 'ss'
|
|
71
|
+
)
|
|
72
|
+
)
|
|
73
|
+
|
|
74
|
+
// Step_1b: handles the endings eed, eedly, ed, edly, ing and ingly.
//   eed, eedly -> replaced by ee when the ending lies in R1.
//   ed, edly, ing, ingly -> deleted when a vowel occurs earlier in the word.
//     After the deletion the new word end is inspected without moving the cursor:
//       at, bl, iz           -> an 'e' is inserted at the end
//       bb dd ff gg mm nn pp rr tt -> the final letter is deleted
//       anything else        -> an 'e' is inserted when the cursor is at p1 and
//                               shortv succeeds
define Step_1b as (
|
|
75
|
+
[substring] among (
|
|
76
|
+
'eed' 'eedly'
|
|
77
|
+
(R1 <-'ee')
|
|
78
|
+
'ed' 'edly' 'ing' 'ingly'
|
|
79
|
+
(
|
|
80
|
+
test gopast v delete
|
|
81
|
+
test substring among(
|
|
82
|
+
'at' 'bl' 'iz'
|
|
83
|
+
(<+ 'e')
|
|
84
|
+
'bb' 'dd' 'ff' 'gg' 'mm' 'nn' 'pp' 'rr' 'tt'
|
|
85
|
+
// ignoring double c, h, j, k, q, v, w, and x
|
|
86
|
+
([next] delete)
|
|
87
|
+
'' (atmark p1 test shortv <+ 'e')
|
|
88
|
+
)
|
|
89
|
+
)
|
|
90
|
+
)
|
|
91
|
+
)
|
|
92
|
+
|
|
93
|
+
// Step_1c: replaces a final y or Y by i when the letter before it is a non-vowel
// and that non-vowel is not the first letter of the word (not atlimit).
define Step_1c as (
|
|
94
|
+
['y' or 'Y']
|
|
95
|
+
non-v not atlimit
|
|
96
|
+
<-'i'
|
|
97
|
+
)
|
|
98
|
+
|
|
99
|
+
// Step_2: finds one of the listed endings at the end of the word, requires the
// ending to lie in R1, and replaces it with the string given beside it.
// Two entries carry an extra condition: ogi becomes og only when preceded by
// 'l', and li is deleted only when preceded by a letter from valid_LI.
define Step_2 as (
|
|
100
|
+
[substring] R1 among (
|
|
101
|
+
'tional' (<-'tion')
|
|
102
|
+
'enci' (<-'ence')
|
|
103
|
+
'anci' (<-'ance')
|
|
104
|
+
'abli' (<-'able')
|
|
105
|
+
'entli' (<-'ent')
|
|
106
|
+
'izer' 'ization'
|
|
107
|
+
(<-'ize')
|
|
108
|
+
'ational' 'ation' 'ator'
|
|
109
|
+
(<-'ate')
|
|
110
|
+
'alism' 'aliti' 'alli'
|
|
111
|
+
(<-'al')
|
|
112
|
+
'fulness' (<-'ful')
|
|
113
|
+
'ousli' 'ousness'
|
|
114
|
+
(<-'ous')
|
|
115
|
+
'iveness' 'iviti'
|
|
116
|
+
(<-'ive')
|
|
117
|
+
'biliti' 'bli'
|
|
118
|
+
(<-'ble')
|
|
119
|
+
'ogi' ('l' <-'og')
|
|
120
|
+
'fulli' (<-'ful')
|
|
121
|
+
'lessli' (<-'less')
|
|
122
|
+
'li' (valid_LI delete)
|
|
123
|
+
)
|
|
124
|
+
)
|
|
125
|
+
|
|
126
|
+
// Step_3: finds one of the listed endings at the end of the word, requires the
// ending to lie in R1, and replaces or deletes it as shown beside it. The ending
// ative is deleted only when it also lies in R2.
define Step_3 as (
|
|
127
|
+
[substring] R1 among (
|
|
128
|
+
'tional' (<- 'tion')
|
|
129
|
+
'ational' (<- 'ate')
|
|
130
|
+
'alize' (<-'al')
|
|
131
|
+
'icate' 'iciti' 'ical'
|
|
132
|
+
(<-'ic')
|
|
133
|
+
'ful' 'ness'
|
|
134
|
+
(delete)
|
|
135
|
+
'ative'
|
|
136
|
+
(R2 delete) // 'R2' added Dec 2001
|
|
137
|
+
)
|
|
138
|
+
)
|
|
139
|
+
|
|
140
|
+
// Step_4: finds one of the listed endings at the end of the word, requires the
// ending to lie in R2, and deletes it. The ending ion is deleted only when it is
// preceded by 's' or 't'.
define Step_4 as (
|
|
141
|
+
[substring] R2 among (
|
|
142
|
+
'al' 'ance' 'ence' 'er' 'ic' 'able' 'ible' 'ant' 'ement'
|
|
143
|
+
'ment' 'ent' 'ism' 'ate' 'iti' 'ous' 'ive' 'ize'
|
|
144
|
+
(delete)
|
|
145
|
+
'ion' ('s' or 't' delete)
|
|
146
|
+
)
|
|
147
|
+
)
|
|
148
|
+
|
|
149
|
+
// Step_5: handles a final e or l.
//   e -> deleted when it lies in R2, or when it lies in R1 and shortv does not
//        match the letters before it.
//   l -> deleted when it lies in R2 and is preceded by another 'l'.
define Step_5 as (
|
|
150
|
+
[substring] among (
|
|
151
|
+
'e' (R2 or (R1 not shortv) delete)
|
|
152
|
+
'l' (R2 'l' delete)
|
|
153
|
+
)
|
|
154
|
+
)
|
|
155
|
+
|
|
156
|
+
// exception2: a backward-mode test that succeeds when the whole current word is
// one of the listed words (the match must reach the start of the word, atlimit).
// It makes no change itself; stem uses its success to skip the remaining steps.
define exception2 as (
|
|
157
|
+
|
|
158
|
+
[substring] atlimit among(
|
|
159
|
+
'inning' 'outing' 'canning' 'herring' 'earring'
|
|
160
|
+
'proceed' 'exceed' 'succeed'
|
|
161
|
+
|
|
162
|
+
// ... extensions possible here ...
|
|
163
|
+
|
|
164
|
+
)
|
|
165
|
+
)
|
|
166
|
+
)
|
|
167
|
+
|
|
168
|
+
// exception1: a forward-mode check that succeeds when the whole input word is one
// of the listed words (the match must reach the end of the word, atlimit).
// Entries with an action are replaced by the string shown; entries without an
// action are left exactly as they are. stem tries this routine first, so on
// success none of the ordinary steps run.
define exception1 as (
|
|
169
|
+
|
|
170
|
+
[substring] atlimit among(
|
|
171
|
+
|
|
172
|
+
/* special changes: */
|
|
173
|
+
|
|
174
|
+
'skis' (<-'ski')
|
|
175
|
+
'skies' (<-'sky')
|
|
176
|
+
'dying' (<-'die')
|
|
177
|
+
'lying' (<-'lie')
|
|
178
|
+
'tying' (<-'tie')
|
|
179
|
+
|
|
180
|
+
/* special -LY cases */
|
|
181
|
+
|
|
182
|
+
'idly' (<-'idl')
|
|
183
|
+
'gently' (<-'gentl')
|
|
184
|
+
'ugly' (<-'ugli')
|
|
185
|
+
'early' (<-'earli')
|
|
186
|
+
'only' (<-'onli')
|
|
187
|
+
'singly' (<-'singl')
|
|
188
|
+
|
|
189
|
+
// ... extensions possible here ...
|
|
190
|
+
|
|
191
|
+
/* invariant forms: */
|
|
192
|
+
|
|
193
|
+
'sky'
|
|
194
|
+
'news'
|
|
195
|
+
'howe'
|
|
196
|
+
|
|
197
|
+
'atlas' 'cosmos' 'bias' 'andes' // not plural forms
|
|
198
|
+
|
|
199
|
+
// ... extensions possible here ...
|
|
200
|
+
)
|
|
201
|
+
)
|
|
202
|
+
|
|
203
|
+
// postlude: when prelude wrote at least one Y (Y_found is set), turns every Y in
// the word back into y.
define postlude as (Y_found repeat(goto (['Y']) <-'y'))
|
|
204
|
+
|
|
205
|
+
// stem: the external entry point. Three alternatives are tried in order:
//   1. exception1 handles the word completely;
//   2. the word has fewer than 3 characters (hop 3 fails) and is left unchanged;
//   3. the full algorithm: prelude and mark_regions forwards, then in backward
//      mode Step_1a followed by either exception2 (which stops further changes)
//      or Step_1b .. Step_5, and finally postlude forwards.
// The individual steps are wrapped in do, so a failing step does not stop the rest.
define stem as (
|
|
206
|
+
|
|
207
|
+
exception1 or
|
|
208
|
+
not hop 3 or (
|
|
209
|
+
do prelude
|
|
210
|
+
do mark_regions
|
|
211
|
+
backwards (
|
|
212
|
+
|
|
213
|
+
do Step_1a
|
|
214
|
+
|
|
215
|
+
exception2 or (
|
|
216
|
+
|
|
217
|
+
do Step_1b
|
|
218
|
+
do Step_1c
|
|
219
|
+
|
|
220
|
+
do Step_2
|
|
221
|
+
do Step_3
|
|
222
|
+
do Step_4
|
|
223
|
+
|
|
224
|
+
do Step_5
|
|
225
|
+
)
|
|
226
|
+
)
|
|
227
|
+
do postlude
|
|
228
|
+
)
|
|
229
|
+
)
|
|
@@ -0,0 +1,197 @@
|
|
|
1
|
+
|
|
2
|
+
/* Finnish stemmer.
|
|
3
|
+
|
|
4
|
+
Numbers in square brackets refer to the sections in
|
|
5
|
+
Fred Karlsson, Finnish: An Essential Grammar. Routledge, 1999
|
|
6
|
+
ISBN 0-415-20705-3
|
|
7
|
+
|
|
8
|
+
*/
|
|
9
|
+
|
|
10
|
+
routines (
|
|
11
|
+
mark_regions
|
|
12
|
+
R2
|
|
13
|
+
particle_etc possessive
|
|
14
|
+
LONG VI
|
|
15
|
+
case_ending
|
|
16
|
+
i_plural
|
|
17
|
+
t_plural
|
|
18
|
+
other_endings
|
|
19
|
+
tidy
|
|
20
|
+
)
|
|
21
|
+
|
|
22
|
+
externals ( stem )
|
|
23
|
+
|
|
24
|
+
integers ( p1 p2 )
|
|
25
|
+
strings ( x )
|
|
26
|
+
booleans ( ending_removed )
|
|
27
|
+
groupings ( AEI C V1 V2 particle_end )
|
|
28
|
+
|
|
29
|
+
stringescapes {}
|
|
30
|
+
|
|
31
|
+
/* special characters */
|
|
32
|
+
|
|
33
|
+
stringdef a" '{U+00E4}'
|
|
34
|
+
stringdef o" '{U+00F6}'
|
|
35
|
+
|
|
36
|
+
// Groupings used by the routines below:
//   AEI          - the final vowels that tidy may remove after a consonant
//   C            - the consonants
//   V1           - the vowels including y; used for region marking
//   V2           - the vowels without y; used by VI
//   particle_end - V1 plus n and t; the letters allowed before a particle
define AEI 'a{a"}ei'
|
|
37
|
+
define C 'bcdfghjklmnpqrstvwxz'
|
|
38
|
+
define V1 'aeiouy{a"}{o"}'
|
|
39
|
+
define V2 'aeiou{a"}{o"}'
|
|
40
|
+
define particle_end V1 + 'nt'
|
|
41
|
+
|
|
42
|
+
// mark_regions: computes p1 and p2, both of which start at the end of the word.
// p1 is set just past the first non-vowel that follows a V1 vowel; p2 is set by
// running the same scan again from p1. If a scan fails, the routine fails and the
// marks not yet set keep the value limit.
define mark_regions as (
|
|
43
|
+
|
|
44
|
+
$p1 = limit
|
|
45
|
+
$p2 = limit
|
|
46
|
+
|
|
47
|
+
goto V1 gopast non-V1 setmark p1
|
|
48
|
+
goto V1 gopast non-V1 setmark p2
|
|
49
|
+
)
|
|
50
|
+
|
|
51
|
+
backwardmode (
|
|
52
|
+
|
|
53
|
+
// R2: succeeds when the cursor is at or after the mark p2.
define R2 as $p2 <= cursor
|
|
54
|
+
|
|
55
|
+
// particle_etc: looks, no further back than p1, for one of the listed endings.
// The particle endings require the preceding letter to be in particle_end; the
// adverb ending sti requires the ending to lie in R2. When the condition holds,
// the ending is deleted.
define particle_etc as (
|
|
56
|
+
setlimit tomark p1 for ([substring])
|
|
57
|
+
among(
|
|
58
|
+
'kin'
|
|
59
|
+
'kaan' 'k{a"}{a"}n'
|
|
60
|
+
'ko' 'k{o"}'
|
|
61
|
+
'han' 'h{a"}n'
|
|
62
|
+
'pa' 'p{a"}' // Particles [91]
|
|
63
|
+
(particle_end)
|
|
64
|
+
'sti' // Adverb [87]
|
|
65
|
+
(R2)
|
|
66
|
+
)
|
|
67
|
+
delete
|
|
68
|
+
)
|
|
69
|
+
// possessive: looks, no further back than p1, for a possessive ending.
//   si  -> deleted unless preceded by 'k'
//   ni  -> deleted; if 'kse' then ends the word it is rewritten as 'ksi'
//   nsa, ns{a"}, mme, nne -> deleted
//   an, {a"}n, en -> deleted only when preceded by one of the case endings
//                    listed in the inner among
define possessive as ( // [36]
|
|
70
|
+
setlimit tomark p1 for ([substring])
|
|
71
|
+
among(
|
|
72
|
+
'si'
|
|
73
|
+
(not 'k' delete) // take 'ksi' as the Translative case
|
|
74
|
+
'ni'
|
|
75
|
+
(delete ['kse'] <- 'ksi') // kseni = ksi + ni
|
|
76
|
+
'nsa' 'ns{a"}'
|
|
77
|
+
'mme'
|
|
78
|
+
'nne'
|
|
79
|
+
(delete)
|
|
80
|
+
/* Now for Vn possessives after case endings: [36] */
|
|
81
|
+
'an'
|
|
82
|
+
(among('ta' 'ssa' 'sta' 'lla' 'lta' 'na') delete)
|
|
83
|
+
'{a"}n'
|
|
84
|
+
(among('t{a"}' 'ss{a"}' 'st{a"}'
|
|
85
|
+
'll{a"}' 'lt{a"}' 'n{a"}') delete)
|
|
86
|
+
'en'
|
|
87
|
+
(among('lle' 'ine') delete)
|
|
88
|
+
)
|
|
89
|
+
)
|
|
90
|
+
|
|
91
|
+
// LONG: matches a doubled (long) vowel directly before the cursor.
define LONG as
|
|
92
|
+
among('aa' 'ee' 'ii' 'oo' 'uu' '{a"}{a"}' '{o"}{o"}')
|
|
93
|
+
|
|
94
|
+
// VI: in backward mode, matches an 'i' directly before the cursor with a V2
// vowel standing before that 'i'.
define VI as ('i' V2)
|
|
95
|
+
|
|
96
|
+
// case_ending: looks, no further back than p1, for one of the listed case
// endings, checks the condition attached to it against the letters before the
// ending, and on success deletes the ending and sets ending_removed.
// Conditions visible below:
//   han/hen/hin/hon/h{a"}n/h{o"}n - the vowel of the ending must also precede it
//   siin, den, tten               - VI must match before the ending
//   seen                          - LONG must match before the ending
//   n                             - no condition; when a long vowel or 'ie'
//                                   precedes it, the slice to delete is widened
//                                   by one character
//   a, {a"}                       - preceded by a V1 vowel with a consonant
//                                   before that vowel
//   tta, tt{a"}                   - preceded by 'e'
//   all remaining endings         - no condition
define case_ending as (
|
|
97
|
+
setlimit tomark p1 for ([substring])
|
|
98
|
+
among(
|
|
99
|
+
'han' ('a') //-.
|
|
100
|
+
'hen' ('e') // |
|
|
101
|
+
'hin' ('i') // |
|
|
102
|
+
'hon' ('o') // |
|
|
103
|
+
'h{a"}n' ('{a"}') // Illative [43]
|
|
104
|
+
'h{o"}n' ('{o"}') // |
|
|
105
|
+
'siin' VI // |
|
|
106
|
+
'seen' LONG //-'
|
|
107
|
+
|
|
108
|
+
'den' VI
|
|
109
|
+
'tten' VI // Genitive plurals [34]
|
|
110
|
+
()
|
|
111
|
+
'n' // Genitive or Illative
|
|
112
|
+
( try ( LONG // Illative
|
|
113
|
+
or 'ie' // Genitive
|
|
114
|
+
and next ]
|
|
115
|
+
)
|
|
116
|
+
/* otherwise Genitive */
|
|
117
|
+
)
|
|
118
|
+
|
|
119
|
+
'a' '{a"}' //-.
|
|
120
|
+
(V1 C) // |
|
|
121
|
+
'tta' 'tt{a"}' // Partitive [32]
|
|
122
|
+
('e') // |
|
|
123
|
+
'ta' 't{a"}' //-'
|
|
124
|
+
|
|
125
|
+
'ssa' 'ss{a"}' // Inessive [41]
|
|
126
|
+
'sta' 'st{a"}' // Elative [42]
|
|
127
|
+
|
|
128
|
+
'lla' 'll{a"}' // Adessive [44]
|
|
129
|
+
'lta' 'lt{a"}' // Ablative [51]
|
|
130
|
+
'lle' // Allative [46]
|
|
131
|
+
'na' 'n{a"}' // Essive [49]
|
|
132
|
+
'ksi' // Translative[50]
|
|
133
|
+
'ine' // Comitative [51]
|
|
134
|
+
|
|
135
|
+
/* Abessive and Instructive are too rare for
|
|
136
|
+
inclusion [51] */
|
|
137
|
+
|
|
138
|
+
)
|
|
139
|
+
delete
|
|
140
|
+
set ending_removed
|
|
141
|
+
)
|
|
142
|
+
// other_endings: looks, no further back than p2, for one of the listed endings
// and deletes it. The first six endings (mpi .. mm{a"}) are not removed when
// they are preceded by 'po'.
define other_endings as (
|
|
143
|
+
setlimit tomark p2 for ([substring])
|
|
144
|
+
among(
|
|
145
|
+
'mpi' 'mpa' 'mp{a"}'
|
|
146
|
+
'mmi' 'mma' 'mm{a"}' // Comparative forms [85]
|
|
147
|
+
(not 'po') //-improves things
|
|
148
|
+
'impi' 'impa' 'imp{a"}'
|
|
149
|
+
'immi' 'imma' 'imm{a"}' // Superlative forms [86]
|
|
150
|
+
'eja' 'ej{a"}' // indicates agent [93.1B]
|
|
151
|
+
)
|
|
152
|
+
delete
|
|
153
|
+
)
|
|
154
|
+
// i_plural: deletes a final 'i' or 'j' lying no further back than p1.
define i_plural as ( // [26]
|
|
155
|
+
setlimit tomark p1 for ([substring])
|
|
156
|
+
among(
|
|
157
|
+
'i' 'j'
|
|
158
|
+
)
|
|
159
|
+
delete
|
|
160
|
+
)
|
|
161
|
+
// t_plural: deletes a final 't' lying no further back than p1 when it is
// preceded by a V1 vowel. After that, no further back than p2, it deletes the
// ending mma (unless preceded by 'po') or imma.
define t_plural as ( // [26]
|
|
162
|
+
setlimit tomark p1 for (
|
|
163
|
+
['t'] test V1
|
|
164
|
+
delete
|
|
165
|
+
)
|
|
166
|
+
setlimit tomark p2 for ([substring])
|
|
167
|
+
among(
|
|
168
|
+
'mma' (not 'po') //-mmat endings
|
|
169
|
+
'imma' //-immat endings
|
|
170
|
+
)
|
|
171
|
+
delete
|
|
172
|
+
)
|
|
173
|
+
// tidy: final clean-up of the stem.
// Inside the p1 limit, four optional (do) steps run in order: a final long vowel
// loses one letter; a final letter from AEI is removed when a consonant precedes
// it; a final 'j' is removed when 'o' or 'u' precedes it; a final 'o' is removed
// when 'j' precedes it.
// Afterwards, without the p1 limit, the cursor moves back to the nearest letter
// that is not a V1 vowel; if that letter is a consonant and the same consonant
// stands before it, one of the two is deleted.
define tidy as (
|
|
174
|
+
setlimit tomark p1 for (
|
|
175
|
+
do ( LONG and ([next] delete ) ) // undouble vowel
|
|
176
|
+
do ( [AEI] C delete ) // remove trailing a, a", e, i
|
|
177
|
+
do ( ['j'] 'o' or 'u' delete )
|
|
178
|
+
do ( ['o'] 'j' delete )
|
|
179
|
+
)
|
|
180
|
+
goto non-V1 [C] -> x x delete // undouble consonant
|
|
181
|
+
)
|
|
182
|
+
)
|
|
183
|
+
|
|
184
|
+
// stem: the external entry point. It marks the regions forwards, clears
// ending_removed, and then, in backward mode, runs particle_etc, possessive,
// case_ending and other_endings. Next it runs i_plural when case_ending removed
// an ending (ending_removed is set), otherwise t_plural. tidy runs last.
// The steps are wrapped in do, so a failing step does not stop the ones after it.
define stem as (
|
|
185
|
+
|
|
186
|
+
do mark_regions
|
|
187
|
+
unset ending_removed
|
|
188
|
+
backwards (
|
|
189
|
+
do particle_etc
|
|
190
|
+
do possessive
|
|
191
|
+
do case_ending
|
|
192
|
+
do other_endings
|
|
193
|
+
(ending_removed do i_plural) or do t_plural
|
|
194
|
+
do tidy
|
|
195
|
+
)
|
|
196
|
+
)
|
|
197
|
+
|