czech-stemmer 0.0.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.document +5 -0
- data/Gemfile +9 -0
- data/Gemfile.lock +80 -0
- data/LICENSE.txt +20 -0
- data/README.markdown +10 -0
- data/Rakefile +51 -0
- data/VERSION +1 -0
- data/czech-stemmer.gemspec +66 -0
- data/lib/czech-stemmer.rb +125 -0
- data/test/CzechStemmer.java +173 -0
- data/test/TestCzechStemmer.java +300 -0
- data/test/TestCzechStemmer.java.txt +300 -0
- data/test/helper.rb +2 -0
- data/test/java_test_converter.bash +7 -0
- data/test/test_czech-stemmer.rb +221 -0
- metadata +130 -0
@@ -0,0 +1,300 @@
|
|
1
|
+
package org.apache.lucene.analysis.cz;
|
2
|
+
|
3
|
+
/*
|
4
|
+
* Licensed to the Apache Software Foundation (ASF) under one or more
|
5
|
+
* contributor license agreements. See the NOTICE file distributed with
|
6
|
+
* this work for additional information regarding copyright ownership.
|
7
|
+
* The ASF licenses this file to You under the Apache License, Version 2.0
|
8
|
+
* (the "License"); you may not use this file except in compliance with
|
9
|
+
* the License. You may obtain a copy of the License at
|
10
|
+
*
|
11
|
+
* http://www.apache.org/licenses/LICENSE-2.0
|
12
|
+
*
|
13
|
+
* Unless required by applicable law or agreed to in writing, software
|
14
|
+
* distributed under the License is distributed on an "AS IS" BASIS,
|
15
|
+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
16
|
+
* See the License for the specific language governing permissions and
|
17
|
+
* limitations under the License.
|
18
|
+
*/
|
19
|
+
|
20
|
+
import java.io.IOException;
|
21
|
+
import java.io.Reader;
|
22
|
+
import java.io.StringReader;
|
23
|
+
|
24
|
+
import org.apache.lucene.analysis.Analyzer;
|
25
|
+
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
26
|
+
import org.apache.lucene.analysis.MockTokenizer;
|
27
|
+
import org.apache.lucene.analysis.Tokenizer;
|
28
|
+
import org.apache.lucene.analysis.core.KeywordTokenizer;
|
29
|
+
import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
|
30
|
+
import org.apache.lucene.analysis.util.CharArraySet;
|
31
|
+
|
32
|
+
/**
|
33
|
+
* Test the Czech Stemmer.
|
34
|
+
*
|
35
|
+
* Note: it's algorithmic, so some stems are nonsense
|
36
|
+
*
|
37
|
+
*/
|
38
|
+
public class TestCzechStemmer extends BaseTokenStreamTestCase {
|
39
|
+
|
40
|
+
/**
|
41
|
+
* Test showing how masculine noun forms conflate
|
42
|
+
*/
|
43
|
+
public void testMasculineNouns() throws IOException {
|
44
|
+
CzechAnalyzer cz = new CzechAnalyzer(TEST_VERSION_CURRENT);
|
45
|
+
|
46
|
+
/* animate ending with a hard consonant */
|
47
|
+
assertAnalyzesTo(cz, "pán", new String[] { "pán" });
|
48
|
+
assertAnalyzesTo(cz, "páni", new String[] { "pán" });
|
49
|
+
assertAnalyzesTo(cz, "pánové", new String[] { "pán" });
|
50
|
+
assertAnalyzesTo(cz, "pána", new String[] { "pán" });
|
51
|
+
assertAnalyzesTo(cz, "pánů", new String[] { "pán" });
|
52
|
+
assertAnalyzesTo(cz, "pánovi", new String[] { "pán" });
|
53
|
+
assertAnalyzesTo(cz, "pánům", new String[] { "pán" });
|
54
|
+
assertAnalyzesTo(cz, "pány", new String[] { "pán" });
|
55
|
+
assertAnalyzesTo(cz, "páne", new String[] { "pán" });
|
56
|
+
assertAnalyzesTo(cz, "pánech", new String[] { "pán" });
|
57
|
+
assertAnalyzesTo(cz, "pánem", new String[] { "pán" });
|
58
|
+
|
59
|
+
/* inanimate ending with hard consonant */
|
60
|
+
assertAnalyzesTo(cz, "hrad", new String[] { "hrad" });
|
61
|
+
assertAnalyzesTo(cz, "hradu", new String[] { "hrad" });
|
62
|
+
assertAnalyzesTo(cz, "hrade", new String[] { "hrad" });
|
63
|
+
assertAnalyzesTo(cz, "hradem", new String[] { "hrad" });
|
64
|
+
assertAnalyzesTo(cz, "hrady", new String[] { "hrad" });
|
65
|
+
assertAnalyzesTo(cz, "hradech", new String[] { "hrad" });
|
66
|
+
assertAnalyzesTo(cz, "hradům", new String[] { "hrad" });
|
67
|
+
assertAnalyzesTo(cz, "hradů", new String[] { "hrad" });
|
68
|
+
|
69
|
+
/* animate ending with a soft consonant */
|
70
|
+
assertAnalyzesTo(cz, "muž", new String[] { "muh" });
|
71
|
+
assertAnalyzesTo(cz, "muži", new String[] { "muh" });
|
72
|
+
assertAnalyzesTo(cz, "muže", new String[] { "muh" });
|
73
|
+
assertAnalyzesTo(cz, "mužů", new String[] { "muh" });
|
74
|
+
assertAnalyzesTo(cz, "mužům", new String[] { "muh" });
|
75
|
+
assertAnalyzesTo(cz, "mužích", new String[] { "muh" });
|
76
|
+
assertAnalyzesTo(cz, "mužem", new String[] { "muh" });
|
77
|
+
|
78
|
+
/* inanimate ending with a soft consonant */
|
79
|
+
assertAnalyzesTo(cz, "stroj", new String[] { "stroj" });
|
80
|
+
assertAnalyzesTo(cz, "stroje", new String[] { "stroj" });
|
81
|
+
assertAnalyzesTo(cz, "strojů", new String[] { "stroj" });
|
82
|
+
assertAnalyzesTo(cz, "stroji", new String[] { "stroj" });
|
83
|
+
assertAnalyzesTo(cz, "strojům", new String[] { "stroj" });
|
84
|
+
assertAnalyzesTo(cz, "strojích", new String[] { "stroj" });
|
85
|
+
assertAnalyzesTo(cz, "strojem", new String[] { "stroj" });
|
86
|
+
|
87
|
+
/* ending with a */
|
88
|
+
assertAnalyzesTo(cz, "předseda", new String[] { "předsd" });
|
89
|
+
assertAnalyzesTo(cz, "předsedové", new String[] { "předsd" });
|
90
|
+
assertAnalyzesTo(cz, "předsedy", new String[] { "předsd" });
|
91
|
+
assertAnalyzesTo(cz, "předsedů", new String[] { "předsd" });
|
92
|
+
assertAnalyzesTo(cz, "předsedovi", new String[] { "předsd" });
|
93
|
+
assertAnalyzesTo(cz, "předsedům", new String[] { "předsd" });
|
94
|
+
assertAnalyzesTo(cz, "předsedu", new String[] { "předsd" });
|
95
|
+
assertAnalyzesTo(cz, "předsedo", new String[] { "předsd" });
|
96
|
+
assertAnalyzesTo(cz, "předsedech", new String[] { "předsd" });
|
97
|
+
assertAnalyzesTo(cz, "předsedou", new String[] { "předsd" });
|
98
|
+
|
99
|
+
/* ending with e */
|
100
|
+
assertAnalyzesTo(cz, "soudce", new String[] { "soudk" });
|
101
|
+
assertAnalyzesTo(cz, "soudci", new String[] { "soudk" });
|
102
|
+
assertAnalyzesTo(cz, "soudců", new String[] { "soudk" });
|
103
|
+
assertAnalyzesTo(cz, "soudcům", new String[] { "soudk" });
|
104
|
+
assertAnalyzesTo(cz, "soudcích", new String[] { "soudk" });
|
105
|
+
assertAnalyzesTo(cz, "soudcem", new String[] { "soudk" });
|
106
|
+
}
|
107
|
+
|
108
|
+
/**
|
109
|
+
* Test showing how feminine noun forms conflate
|
110
|
+
*/
|
111
|
+
public void testFeminineNouns() throws IOException {
|
112
|
+
CzechAnalyzer cz = new CzechAnalyzer(TEST_VERSION_CURRENT);
|
113
|
+
|
114
|
+
/* ending with hard consonant */
|
115
|
+
assertAnalyzesTo(cz, "kost", new String[] { "kost" });
|
116
|
+
assertAnalyzesTo(cz, "kosti", new String[] { "kost" });
|
117
|
+
assertAnalyzesTo(cz, "kostí", new String[] { "kost" });
|
118
|
+
assertAnalyzesTo(cz, "kostem", new String[] { "kost" });
|
119
|
+
assertAnalyzesTo(cz, "kostech", new String[] { "kost" });
|
120
|
+
assertAnalyzesTo(cz, "kostmi", new String[] { "kost" });
|
121
|
+
|
122
|
+
/* ending with a soft consonant */
|
123
|
+
// note: in this example the singular nominative and singular accusative don't conflate with the rest
|
124
|
+
assertAnalyzesTo(cz, "píseň", new String[] { "písň" });
|
125
|
+
assertAnalyzesTo(cz, "písně", new String[] { "písn" });
|
126
|
+
assertAnalyzesTo(cz, "písni", new String[] { "písn" });
|
127
|
+
assertAnalyzesTo(cz, "písněmi", new String[] { "písn" });
|
128
|
+
assertAnalyzesTo(cz, "písních", new String[] { "písn" });
|
129
|
+
assertAnalyzesTo(cz, "písním", new String[] { "písn" });
|
130
|
+
|
131
|
+
/* ending with e */
|
132
|
+
assertAnalyzesTo(cz, "růže", new String[] { "růh" });
|
133
|
+
assertAnalyzesTo(cz, "růží", new String[] { "růh" });
|
134
|
+
assertAnalyzesTo(cz, "růžím", new String[] { "růh" });
|
135
|
+
assertAnalyzesTo(cz, "růžích", new String[] { "růh" });
|
136
|
+
assertAnalyzesTo(cz, "růžemi", new String[] { "růh" });
|
137
|
+
assertAnalyzesTo(cz, "růži", new String[] { "růh" });
|
138
|
+
|
139
|
+
/* ending with a */
|
140
|
+
assertAnalyzesTo(cz, "žena", new String[] { "žn" });
|
141
|
+
assertAnalyzesTo(cz, "ženy", new String[] { "žn" });
|
142
|
+
assertAnalyzesTo(cz, "žen", new String[] { "žn" });
|
143
|
+
assertAnalyzesTo(cz, "ženě", new String[] { "žn" });
|
144
|
+
assertAnalyzesTo(cz, "ženám", new String[] { "žn" });
|
145
|
+
assertAnalyzesTo(cz, "ženu", new String[] { "žn" });
|
146
|
+
assertAnalyzesTo(cz, "ženo", new String[] { "žn" });
|
147
|
+
assertAnalyzesTo(cz, "ženách", new String[] { "žn" });
|
148
|
+
assertAnalyzesTo(cz, "ženou", new String[] { "žn" });
|
149
|
+
assertAnalyzesTo(cz, "ženami", new String[] { "žn" });
|
150
|
+
}
|
151
|
+
|
152
|
+
/**
|
153
|
+
* Test showing how neuter noun forms conflate
|
154
|
+
*/
|
155
|
+
public void testNeuterNouns() throws IOException {
|
156
|
+
CzechAnalyzer cz = new CzechAnalyzer(TEST_VERSION_CURRENT);
|
157
|
+
|
158
|
+
/* ending with o */
|
159
|
+
assertAnalyzesTo(cz, "město", new String[] { "měst" });
|
160
|
+
assertAnalyzesTo(cz, "města", new String[] { "měst" });
|
161
|
+
assertAnalyzesTo(cz, "měst", new String[] { "měst" });
|
162
|
+
assertAnalyzesTo(cz, "městu", new String[] { "měst" });
|
163
|
+
assertAnalyzesTo(cz, "městům", new String[] { "měst" });
|
164
|
+
assertAnalyzesTo(cz, "městě", new String[] { "měst" });
|
165
|
+
assertAnalyzesTo(cz, "městech", new String[] { "měst" });
|
166
|
+
assertAnalyzesTo(cz, "městem", new String[] { "měst" });
|
167
|
+
assertAnalyzesTo(cz, "městy", new String[] { "měst" });
|
168
|
+
|
169
|
+
/* ending with e */
|
170
|
+
assertAnalyzesTo(cz, "moře", new String[] { "moř" });
|
171
|
+
assertAnalyzesTo(cz, "moří", new String[] { "moř" });
|
172
|
+
assertAnalyzesTo(cz, "mořím", new String[] { "moř" });
|
173
|
+
assertAnalyzesTo(cz, "moři", new String[] { "moř" });
|
174
|
+
assertAnalyzesTo(cz, "mořích", new String[] { "moř" });
|
175
|
+
assertAnalyzesTo(cz, "mořem", new String[] { "moř" });
|
176
|
+
|
177
|
+
/* ending with ě */
|
178
|
+
assertAnalyzesTo(cz, "kuře", new String[] { "kuř" });
|
179
|
+
assertAnalyzesTo(cz, "kuřata", new String[] { "kuř" });
|
180
|
+
assertAnalyzesTo(cz, "kuřete", new String[] { "kuř" });
|
181
|
+
assertAnalyzesTo(cz, "kuřat", new String[] { "kuř" });
|
182
|
+
assertAnalyzesTo(cz, "kuřeti", new String[] { "kuř" });
|
183
|
+
assertAnalyzesTo(cz, "kuřatům", new String[] { "kuř" });
|
184
|
+
assertAnalyzesTo(cz, "kuřatech", new String[] { "kuř" });
|
185
|
+
assertAnalyzesTo(cz, "kuřetem", new String[] { "kuř" });
|
186
|
+
assertAnalyzesTo(cz, "kuřaty", new String[] { "kuř" });
|
187
|
+
|
188
|
+
/* ending with í */
|
189
|
+
assertAnalyzesTo(cz, "stavení", new String[] { "stavn" });
|
190
|
+
assertAnalyzesTo(cz, "stavením", new String[] { "stavn" });
|
191
|
+
assertAnalyzesTo(cz, "staveních", new String[] { "stavn" });
|
192
|
+
assertAnalyzesTo(cz, "staveními", new String[] { "stavn" });
|
193
|
+
}
|
194
|
+
|
195
|
+
/**
|
196
|
+
* Test showing how adjectival forms conflate
|
197
|
+
*/
|
198
|
+
public void testAdjectives() throws IOException {
|
199
|
+
CzechAnalyzer cz = new CzechAnalyzer(TEST_VERSION_CURRENT);
|
200
|
+
|
201
|
+
/* ending with ý/á/é */
|
202
|
+
assertAnalyzesTo(cz, "mladý", new String[] { "mlad" });
|
203
|
+
assertAnalyzesTo(cz, "mladí", new String[] { "mlad" });
|
204
|
+
assertAnalyzesTo(cz, "mladého", new String[] { "mlad" });
|
205
|
+
assertAnalyzesTo(cz, "mladých", new String[] { "mlad" });
|
206
|
+
assertAnalyzesTo(cz, "mladému", new String[] { "mlad" });
|
207
|
+
assertAnalyzesTo(cz, "mladým", new String[] { "mlad" });
|
208
|
+
assertAnalyzesTo(cz, "mladé", new String[] { "mlad" });
|
209
|
+
assertAnalyzesTo(cz, "mladém", new String[] { "mlad" });
|
210
|
+
assertAnalyzesTo(cz, "mladými", new String[] { "mlad" });
|
211
|
+
assertAnalyzesTo(cz, "mladá", new String[] { "mlad" });
|
212
|
+
assertAnalyzesTo(cz, "mladou", new String[] { "mlad" });
|
213
|
+
|
214
|
+
/* ending with í */
|
215
|
+
assertAnalyzesTo(cz, "jarní", new String[] { "jarn" });
|
216
|
+
assertAnalyzesTo(cz, "jarního", new String[] { "jarn" });
|
217
|
+
assertAnalyzesTo(cz, "jarních", new String[] { "jarn" });
|
218
|
+
assertAnalyzesTo(cz, "jarnímu", new String[] { "jarn" });
|
219
|
+
assertAnalyzesTo(cz, "jarním", new String[] { "jarn" });
|
220
|
+
assertAnalyzesTo(cz, "jarními", new String[] { "jarn" });
|
221
|
+
}
|
222
|
+
|
223
|
+
/**
|
224
|
+
* Test some possessive suffixes
|
225
|
+
*/
|
226
|
+
public void testPossessive() throws IOException {
|
227
|
+
CzechAnalyzer cz = new CzechAnalyzer(TEST_VERSION_CURRENT);
|
228
|
+
assertAnalyzesTo(cz, "Karlův", new String[] { "karl" });
|
229
|
+
assertAnalyzesTo(cz, "jazykový", new String[] { "jazyk" });
|
230
|
+
}
|
231
|
+
|
232
|
+
/**
|
233
|
+
* Test some exceptional rules, implemented as rewrites.
|
234
|
+
*/
|
235
|
+
public void testExceptions() throws IOException {
|
236
|
+
CzechAnalyzer cz = new CzechAnalyzer(TEST_VERSION_CURRENT);
|
237
|
+
|
238
|
+
/* rewrite of št -> sk */
|
239
|
+
assertAnalyzesTo(cz, "český", new String[] { "česk" });
|
240
|
+
assertAnalyzesTo(cz, "čeští", new String[] { "česk" });
|
241
|
+
|
242
|
+
/* rewrite of čt -> ck */
|
243
|
+
assertAnalyzesTo(cz, "anglický", new String[] { "anglick" });
|
244
|
+
assertAnalyzesTo(cz, "angličtí", new String[] { "anglick" });
|
245
|
+
|
246
|
+
/* rewrite of z -> h */
|
247
|
+
assertAnalyzesTo(cz, "kniha", new String[] { "knih" });
|
248
|
+
assertAnalyzesTo(cz, "knize", new String[] { "knih" });
|
249
|
+
|
250
|
+
/* rewrite of ž -> h */
|
251
|
+
assertAnalyzesTo(cz, "mazat", new String[] { "mah" });
|
252
|
+
assertAnalyzesTo(cz, "mažu", new String[] { "mah" });
|
253
|
+
|
254
|
+
/* rewrite of c -> k */
|
255
|
+
assertAnalyzesTo(cz, "kluk", new String[] { "kluk" });
|
256
|
+
assertAnalyzesTo(cz, "kluci", new String[] { "kluk" });
|
257
|
+
assertAnalyzesTo(cz, "klucích", new String[] { "kluk" });
|
258
|
+
|
259
|
+
/* rewrite of č -> k */
|
260
|
+
assertAnalyzesTo(cz, "hezký", new String[] { "hezk" });
|
261
|
+
assertAnalyzesTo(cz, "hezčí", new String[] { "hezk" });
|
262
|
+
|
263
|
+
/* rewrite of *ů* -> *o* */
|
264
|
+
assertAnalyzesTo(cz, "hůl", new String[] { "hol" });
|
265
|
+
assertAnalyzesTo(cz, "hole", new String[] { "hol" });
|
266
|
+
|
267
|
+
/* rewrite of e* -> * */
|
268
|
+
assertAnalyzesTo(cz, "deska", new String[] { "desk" });
|
269
|
+
assertAnalyzesTo(cz, "desek", new String[] { "desk" });
|
270
|
+
}
|
271
|
+
|
272
|
+
/**
|
273
|
+
* Test that very short words are not stemmed.
|
274
|
+
*/
|
275
|
+
public void testDontStem() throws IOException {
|
276
|
+
CzechAnalyzer cz = new CzechAnalyzer(TEST_VERSION_CURRENT);
|
277
|
+
assertAnalyzesTo(cz, "e", new String[] { "e" });
|
278
|
+
assertAnalyzesTo(cz, "zi", new String[] { "zi" });
|
279
|
+
}
|
280
|
+
|
281
|
+
public void testWithKeywordAttribute() throws IOException {
|
282
|
+
CharArraySet set = new CharArraySet(TEST_VERSION_CURRENT, 1, true);
|
283
|
+
set.add("hole");
|
284
|
+
CzechStemFilter filter = new CzechStemFilter(new SetKeywordMarkerFilter(
|
285
|
+
new MockTokenizer(new StringReader("hole desek"), MockTokenizer.WHITESPACE, false), set));
|
286
|
+
assertTokenStreamContents(filter, new String[] { "hole", "desk" });
|
287
|
+
}
|
288
|
+
|
289
|
+
public void testEmptyTerm() throws IOException {
|
290
|
+
Analyzer a = new Analyzer() {
|
291
|
+
@Override
|
292
|
+
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
|
293
|
+
Tokenizer tokenizer = new KeywordTokenizer(reader);
|
294
|
+
return new TokenStreamComponents(tokenizer, new CzechStemFilter(tokenizer));
|
295
|
+
}
|
296
|
+
};
|
297
|
+
checkOneTerm(a, "", "");
|
298
|
+
}
|
299
|
+
|
300
|
+
}
|
@@ -0,0 +1,300 @@
|
|
1
|
+
package org.apache.lucene.analysis.cz;
|
2
|
+
|
3
|
+
#
|
4
|
+
* Licensed to the Apache Software Foundation (ASF) under one or more
|
5
|
+
* contributor license agreements. See the NOTICE file distributed with
|
6
|
+
* this work for additional information regarding copyright ownership.
|
7
|
+
* The ASF licenses this file to You under the Apache License, Version 2.0
|
8
|
+
* (the "License"); you may not use this file except in compliance with
|
9
|
+
* the License. You may obtain a copy of the License at
|
10
|
+
*
|
11
|
+
* http://www.apache.org/licenses/LICENSE-2.0
|
12
|
+
*
|
13
|
+
* Unless required by applicable law or agreed to in writing, software
|
14
|
+
* distributed under the License is distributed on an "AS IS" BASIS,
|
15
|
+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
16
|
+
* See the License for the specific language governing permissions and
|
17
|
+
* limitations under the License.
|
18
|
+
|
19
|
+
|
20
|
+
import java.io.IOException;
|
21
|
+
import java.io.Reader;
|
22
|
+
import java.io.StringReader;
|
23
|
+
|
24
|
+
import org.apache.lucene.analysis.Analyzer;
|
25
|
+
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
26
|
+
import org.apache.lucene.analysis.MockTokenizer;
|
27
|
+
import org.apache.lucene.analysis.Tokenizer;
|
28
|
+
import org.apache.lucene.analysis.core.KeywordTokenizer;
|
29
|
+
import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
|
30
|
+
import org.apache.lucene.analysis.util.CharArraySet;
|
31
|
+
|
32
|
+
#*
|
33
|
+
* Test the Czech Stemmer.
|
34
|
+
*
|
35
|
+
* Note: it's algorithmic, so some stems are nonsense
|
36
|
+
*
|
37
|
+
|
38
|
+
public class TestCzechStemmer extends BaseTokenStreamTestCase {
|
39
|
+
|
40
|
+
#*
|
41
|
+
* Test showing how masculine noun forms conflate
|
42
|
+
|
43
|
+
public void testMasculineNouns() throws IOException {
|
44
|
+
CzechAnalyzer cz = new CzechAnalyzer(TEST_VERSION_CURRENT);
|
45
|
+
|
46
|
+
# animate ending with a hard consonant
|
47
|
+
assert_analyzes_to( "pán", "pán" )
|
48
|
+
assert_analyzes_to( "páni", "pán" )
|
49
|
+
assert_analyzes_to( "pánové", "pán" )
|
50
|
+
assert_analyzes_to( "pána", "pán" )
|
51
|
+
assert_analyzes_to( "pánů", "pán" )
|
52
|
+
assert_analyzes_to( "pánovi", "pán" )
|
53
|
+
assert_analyzes_to( "pánům", "pán" )
|
54
|
+
assert_analyzes_to( "pány", "pán" )
|
55
|
+
assert_analyzes_to( "páne", "pán" )
|
56
|
+
assert_analyzes_to( "pánech", "pán" )
|
57
|
+
assert_analyzes_to( "pánem", "pán" )
|
58
|
+
|
59
|
+
# inanimate ending with hard consonant
|
60
|
+
assert_analyzes_to( "hrad", "hrad" )
|
61
|
+
assert_analyzes_to( "hradu", "hrad" )
|
62
|
+
assert_analyzes_to( "hrade", "hrad" )
|
63
|
+
assert_analyzes_to( "hradem", "hrad" )
|
64
|
+
assert_analyzes_to( "hrady", "hrad" )
|
65
|
+
assert_analyzes_to( "hradech", "hrad" )
|
66
|
+
assert_analyzes_to( "hradům", "hrad" )
|
67
|
+
assert_analyzes_to( "hradů", "hrad" )
|
68
|
+
|
69
|
+
# animate ending with a soft consonant
|
70
|
+
assert_analyzes_to( "muž", "muh" )
|
71
|
+
assert_analyzes_to( "muži", "muh" )
|
72
|
+
assert_analyzes_to( "muže", "muh" )
|
73
|
+
assert_analyzes_to( "mužů", "muh" )
|
74
|
+
assert_analyzes_to( "mužům", "muh" )
|
75
|
+
assert_analyzes_to( "mužích", "muh" )
|
76
|
+
assert_analyzes_to( "mužem", "muh" )
|
77
|
+
|
78
|
+
# inanimate ending with a soft consonant
|
79
|
+
assert_analyzes_to( "stroj", "stroj" )
|
80
|
+
assert_analyzes_to( "stroje", "stroj" )
|
81
|
+
assert_analyzes_to( "strojů", "stroj" )
|
82
|
+
assert_analyzes_to( "stroji", "stroj" )
|
83
|
+
assert_analyzes_to( "strojům", "stroj" )
|
84
|
+
assert_analyzes_to( "strojích", "stroj" )
|
85
|
+
assert_analyzes_to( "strojem", "stroj" )
|
86
|
+
|
87
|
+
# ending with a
|
88
|
+
assert_analyzes_to( "předseda", "předsd" )
|
89
|
+
assert_analyzes_to( "předsedové", "předsd" )
|
90
|
+
assert_analyzes_to( "předsedy", "předsd" )
|
91
|
+
assert_analyzes_to( "předsedů", "předsd" )
|
92
|
+
assert_analyzes_to( "předsedovi", "předsd" )
|
93
|
+
assert_analyzes_to( "předsedům", "předsd" )
|
94
|
+
assert_analyzes_to( "předsedu", "předsd" )
|
95
|
+
assert_analyzes_to( "předsedo", "předsd" )
|
96
|
+
assert_analyzes_to( "předsedech", "předsd" )
|
97
|
+
assert_analyzes_to( "předsedou", "předsd" )
|
98
|
+
|
99
|
+
# ending with e
|
100
|
+
assert_analyzes_to( "soudce", "soudk" )
|
101
|
+
assert_analyzes_to( "soudci", "soudk" )
|
102
|
+
assert_analyzes_to( "soudců", "soudk" )
|
103
|
+
assert_analyzes_to( "soudcům", "soudk" )
|
104
|
+
assert_analyzes_to( "soudcích", "soudk" )
|
105
|
+
assert_analyzes_to( "soudcem", "soudk" )
|
106
|
+
}
|
107
|
+
|
108
|
+
#*
|
109
|
+
* Test showing how feminine noun forms conflate
|
110
|
+
|
111
|
+
public void testFeminineNouns() throws IOException {
|
112
|
+
CzechAnalyzer cz = new CzechAnalyzer(TEST_VERSION_CURRENT);
|
113
|
+
|
114
|
+
# ending with hard consonant
|
115
|
+
assert_analyzes_to( "kost", "kost" )
|
116
|
+
assert_analyzes_to( "kosti", "kost" )
|
117
|
+
assert_analyzes_to( "kostí", "kost" )
|
118
|
+
assert_analyzes_to( "kostem", "kost" )
|
119
|
+
assert_analyzes_to( "kostech", "kost" )
|
120
|
+
assert_analyzes_to( "kostmi", "kost" )
|
121
|
+
|
122
|
+
# ending with a soft consonant
|
123
|
+
// note: in this example the singular nominative and singular accusative don't conflate with the rest
|
124
|
+
assert_analyzes_to( "píseň", "písň" )
|
125
|
+
assert_analyzes_to( "písně", "písn" )
|
126
|
+
assert_analyzes_to( "písni", "písn" )
|
127
|
+
assert_analyzes_to( "písněmi", "písn" )
|
128
|
+
assert_analyzes_to( "písních", "písn" )
|
129
|
+
assert_analyzes_to( "písním", "písn" )
|
130
|
+
|
131
|
+
# ending with e
|
132
|
+
assert_analyzes_to( "růže", "růh" )
|
133
|
+
assert_analyzes_to( "růží", "růh" )
|
134
|
+
assert_analyzes_to( "růžím", "růh" )
|
135
|
+
assert_analyzes_to( "růžích", "růh" )
|
136
|
+
assert_analyzes_to( "růžemi", "růh" )
|
137
|
+
assert_analyzes_to( "růži", "růh" )
|
138
|
+
|
139
|
+
# ending with a
|
140
|
+
assert_analyzes_to( "žena", "žn" )
|
141
|
+
assert_analyzes_to( "ženy", "žn" )
|
142
|
+
assert_analyzes_to( "žen", "žn" )
|
143
|
+
assert_analyzes_to( "ženě", "žn" )
|
144
|
+
assert_analyzes_to( "ženám", "žn" )
|
145
|
+
assert_analyzes_to( "ženu", "žn" )
|
146
|
+
assert_analyzes_to( "ženo", "žn" )
|
147
|
+
assert_analyzes_to( "ženách", "žn" )
|
148
|
+
assert_analyzes_to( "ženou", "žn" )
|
149
|
+
assert_analyzes_to( "ženami", "žn" )
|
150
|
+
}
|
151
|
+
|
152
|
+
#*
|
153
|
+
* Test showing how neuter noun forms conflate
|
154
|
+
|
155
|
+
public void testNeuterNouns() throws IOException {
|
156
|
+
CzechAnalyzer cz = new CzechAnalyzer(TEST_VERSION_CURRENT);
|
157
|
+
|
158
|
+
# ending with o
|
159
|
+
assert_analyzes_to( "město", "měst" )
|
160
|
+
assert_analyzes_to( "města", "měst" )
|
161
|
+
assert_analyzes_to( "měst", "měst" )
|
162
|
+
assert_analyzes_to( "městu", "měst" )
|
163
|
+
assert_analyzes_to( "městům", "měst" )
|
164
|
+
assert_analyzes_to( "městě", "měst" )
|
165
|
+
assert_analyzes_to( "městech", "měst" )
|
166
|
+
assert_analyzes_to( "městem", "měst" )
|
167
|
+
assert_analyzes_to( "městy", "měst" )
|
168
|
+
|
169
|
+
# ending with e
|
170
|
+
assert_analyzes_to( "moře", "moř" )
|
171
|
+
assert_analyzes_to( "moří", "moř" )
|
172
|
+
assert_analyzes_to( "mořím", "moř" )
|
173
|
+
assert_analyzes_to( "moři", "moř" )
|
174
|
+
assert_analyzes_to( "mořích", "moř" )
|
175
|
+
assert_analyzes_to( "mořem", "moř" )
|
176
|
+
|
177
|
+
# ending with ě
|
178
|
+
assert_analyzes_to( "kuře", "kuř" )
|
179
|
+
assert_analyzes_to( "kuřata", "kuř" )
|
180
|
+
assert_analyzes_to( "kuřete", "kuř" )
|
181
|
+
assert_analyzes_to( "kuřat", "kuř" )
|
182
|
+
assert_analyzes_to( "kuřeti", "kuř" )
|
183
|
+
assert_analyzes_to( "kuřatům", "kuř" )
|
184
|
+
assert_analyzes_to( "kuřatech", "kuř" )
|
185
|
+
assert_analyzes_to( "kuřetem", "kuř" )
|
186
|
+
assert_analyzes_to( "kuřaty", "kuř" )
|
187
|
+
|
188
|
+
# ending with í
|
189
|
+
assert_analyzes_to( "stavení", "stavn" )
|
190
|
+
assert_analyzes_to( "stavením", "stavn" )
|
191
|
+
assert_analyzes_to( "staveních", "stavn" )
|
192
|
+
assert_analyzes_to( "staveními", "stavn" )
|
193
|
+
}
|
194
|
+
|
195
|
+
#*
|
196
|
+
* Test showing how adjectival forms conflate
|
197
|
+
|
198
|
+
public void testAdjectives() throws IOException {
|
199
|
+
CzechAnalyzer cz = new CzechAnalyzer(TEST_VERSION_CURRENT);
|
200
|
+
|
201
|
+
# ending with ý/á/é
|
202
|
+
assert_analyzes_to( "mladý", "mlad" )
|
203
|
+
assert_analyzes_to( "mladí", "mlad" )
|
204
|
+
assert_analyzes_to( "mladého", "mlad" )
|
205
|
+
assert_analyzes_to( "mladých", "mlad" )
|
206
|
+
assert_analyzes_to( "mladému", "mlad" )
|
207
|
+
assert_analyzes_to( "mladým", "mlad" )
|
208
|
+
assert_analyzes_to( "mladé", "mlad" )
|
209
|
+
assert_analyzes_to( "mladém", "mlad" )
|
210
|
+
assert_analyzes_to( "mladými", "mlad" )
|
211
|
+
assert_analyzes_to( "mladá", "mlad" )
|
212
|
+
assert_analyzes_to( "mladou", "mlad" )
|
213
|
+
|
214
|
+
# ending with í
|
215
|
+
assert_analyzes_to( "jarní", "jarn" )
|
216
|
+
assert_analyzes_to( "jarního", "jarn" )
|
217
|
+
assert_analyzes_to( "jarních", "jarn" )
|
218
|
+
assert_analyzes_to( "jarnímu", "jarn" )
|
219
|
+
assert_analyzes_to( "jarním", "jarn" )
|
220
|
+
assert_analyzes_to( "jarními", "jarn" )
|
221
|
+
}
|
222
|
+
|
223
|
+
#*
|
224
|
+
* Test some possessive suffixes
|
225
|
+
|
226
|
+
public void testPossessive() throws IOException {
|
227
|
+
CzechAnalyzer cz = new CzechAnalyzer(TEST_VERSION_CURRENT);
|
228
|
+
assert_analyzes_to( "Karlův", "karl" )
|
229
|
+
assert_analyzes_to( "jazykový", "jazyk" )
|
230
|
+
}
|
231
|
+
|
232
|
+
#*
|
233
|
+
* Test some exceptional rules, implemented as rewrites.
|
234
|
+
|
235
|
+
public void testExceptions() throws IOException {
|
236
|
+
CzechAnalyzer cz = new CzechAnalyzer(TEST_VERSION_CURRENT);
|
237
|
+
|
238
|
+
# rewrite of št -> sk
|
239
|
+
assert_analyzes_to( "český", "česk" )
|
240
|
+
assert_analyzes_to( "čeští", "česk" )
|
241
|
+
|
242
|
+
# rewrite of čt -> ck
|
243
|
+
assert_analyzes_to( "anglický", "anglick" )
|
244
|
+
assert_analyzes_to( "angličtí", "anglick" )
|
245
|
+
|
246
|
+
# rewrite of z -> h
|
247
|
+
assert_analyzes_to( "kniha", "knih" )
|
248
|
+
assert_analyzes_to( "knize", "knih" )
|
249
|
+
|
250
|
+
# rewrite of ž -> h
|
251
|
+
assert_analyzes_to( "mazat", "mah" )
|
252
|
+
assert_analyzes_to( "mažu", "mah" )
|
253
|
+
|
254
|
+
# rewrite of c -> k
|
255
|
+
assert_analyzes_to( "kluk", "kluk" )
|
256
|
+
assert_analyzes_to( "kluci", "kluk" )
|
257
|
+
assert_analyzes_to( "klucích", "kluk" )
|
258
|
+
|
259
|
+
# rewrite of č -> k
|
260
|
+
assert_analyzes_to( "hezký", "hezk" )
|
261
|
+
assert_analyzes_to( "hezčí", "hezk" )
|
262
|
+
|
263
|
+
# rewrite of *ů* -> *o*
|
264
|
+
assert_analyzes_to( "hůl", "hol" )
|
265
|
+
assert_analyzes_to( "hole", "hol" )
|
266
|
+
|
267
|
+
# rewrite of e* -> *
|
268
|
+
assert_analyzes_to( "deska", "desk" )
|
269
|
+
assert_analyzes_to( "desek", "desk" )
|
270
|
+
}
|
271
|
+
|
272
|
+
#*
|
273
|
+
* Test that very short words are not stemmed.
|
274
|
+
|
275
|
+
public void testDontStem() throws IOException {
|
276
|
+
CzechAnalyzer cz = new CzechAnalyzer(TEST_VERSION_CURRENT);
|
277
|
+
assert_analyzes_to( "e", "e" )
|
278
|
+
assert_analyzes_to( "zi", "zi" )
|
279
|
+
}
|
280
|
+
|
281
|
+
public void testWithKeywordAttribute() throws IOException {
|
282
|
+
CharArraySet set = new CharArraySet(TEST_VERSION_CURRENT, 1, true);
|
283
|
+
set.add("hole");
|
284
|
+
CzechStemFilter filter = new CzechStemFilter(new SetKeywordMarkerFilter(
|
285
|
+
new MockTokenizer(new StringReader("hole desek"), MockTokenizer.WHITESPACE, false), set));
|
286
|
+
assertTokenStreamContents(filter, "hole", "desk" )
|
287
|
+
}
|
288
|
+
|
289
|
+
public void testEmptyTerm() throws IOException {
|
290
|
+
Analyzer a = new Analyzer() {
|
291
|
+
@Override
|
292
|
+
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
|
293
|
+
Tokenizer tokenizer = new KeywordTokenizer(reader);
|
294
|
+
return new TokenStreamComponents(tokenizer, new CzechStemFilter(tokenizer));
|
295
|
+
}
|
296
|
+
};
|
297
|
+
checkOneTerm(a, "", "");
|
298
|
+
}
|
299
|
+
|
300
|
+
}
|
data/test/helper.rb
ADDED