czech-stemmer 0.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.document +5 -0
- data/Gemfile +9 -0
- data/Gemfile.lock +80 -0
- data/LICENSE.txt +20 -0
- data/README.markdown +10 -0
- data/Rakefile +51 -0
- data/VERSION +1 -0
- data/czech-stemmer.gemspec +66 -0
- data/lib/czech-stemmer.rb +125 -0
- data/test/CzechStemmer.java +173 -0
- data/test/TestCzechStemmer.java +300 -0
- data/test/TestCzechStemmer.java.txt +300 -0
- data/test/helper.rb +2 -0
- data/test/java_test_converter.bash +7 -0
- data/test/test_czech-stemmer.rb +221 -0
- metadata +130 -0
@@ -0,0 +1,300 @@
|
|
1
|
+
package org.apache.lucene.analysis.cz;
|
2
|
+
|
3
|
+
/*
|
4
|
+
* Licensed to the Apache Software Foundation (ASF) under one or more
|
5
|
+
* contributor license agreements. See the NOTICE file distributed with
|
6
|
+
* this work for additional information regarding copyright ownership.
|
7
|
+
* The ASF licenses this file to You under the Apache License, Version 2.0
|
8
|
+
* (the "License"); you may not use this file except in compliance with
|
9
|
+
* the License. You may obtain a copy of the License at
|
10
|
+
*
|
11
|
+
* http://www.apache.org/licenses/LICENSE-2.0
|
12
|
+
*
|
13
|
+
* Unless required by applicable law or agreed to in writing, software
|
14
|
+
* distributed under the License is distributed on an "AS IS" BASIS,
|
15
|
+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
16
|
+
* See the License for the specific language governing permissions and
|
17
|
+
* limitations under the License.
|
18
|
+
*/
|
19
|
+
|
20
|
+
import java.io.IOException;
|
21
|
+
import java.io.Reader;
|
22
|
+
import java.io.StringReader;
|
23
|
+
|
24
|
+
import org.apache.lucene.analysis.Analyzer;
|
25
|
+
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
26
|
+
import org.apache.lucene.analysis.MockTokenizer;
|
27
|
+
import org.apache.lucene.analysis.Tokenizer;
|
28
|
+
import org.apache.lucene.analysis.core.KeywordTokenizer;
|
29
|
+
import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
|
30
|
+
import org.apache.lucene.analysis.util.CharArraySet;
|
31
|
+
|
32
|
+
/**
|
33
|
+
* Test the Czech Stemmer.
|
34
|
+
*
|
35
|
+
* Note: it's algorithmic, so some stems are nonsense
|
36
|
+
*
|
37
|
+
*/
|
38
|
+
public class TestCzechStemmer extends BaseTokenStreamTestCase {
|
39
|
+
|
40
|
+
/**
|
41
|
+
* Test showing how masculine noun forms conflate
|
42
|
+
*/
|
43
|
+
public void testMasculineNouns() throws IOException {
|
44
|
+
CzechAnalyzer cz = new CzechAnalyzer(TEST_VERSION_CURRENT);
|
45
|
+
|
46
|
+
/* animate ending with a hard consonant */
|
47
|
+
assertAnalyzesTo(cz, "pán", new String[] { "pán" });
|
48
|
+
assertAnalyzesTo(cz, "páni", new String[] { "pán" });
|
49
|
+
assertAnalyzesTo(cz, "pánové", new String[] { "pán" });
|
50
|
+
assertAnalyzesTo(cz, "pána", new String[] { "pán" });
|
51
|
+
assertAnalyzesTo(cz, "pánů", new String[] { "pán" });
|
52
|
+
assertAnalyzesTo(cz, "pánovi", new String[] { "pán" });
|
53
|
+
assertAnalyzesTo(cz, "pánům", new String[] { "pán" });
|
54
|
+
assertAnalyzesTo(cz, "pány", new String[] { "pán" });
|
55
|
+
assertAnalyzesTo(cz, "páne", new String[] { "pán" });
|
56
|
+
assertAnalyzesTo(cz, "pánech", new String[] { "pán" });
|
57
|
+
assertAnalyzesTo(cz, "pánem", new String[] { "pán" });
|
58
|
+
|
59
|
+
/* inanimate ending with hard consonant */
|
60
|
+
assertAnalyzesTo(cz, "hrad", new String[] { "hrad" });
|
61
|
+
assertAnalyzesTo(cz, "hradu", new String[] { "hrad" });
|
62
|
+
assertAnalyzesTo(cz, "hrade", new String[] { "hrad" });
|
63
|
+
assertAnalyzesTo(cz, "hradem", new String[] { "hrad" });
|
64
|
+
assertAnalyzesTo(cz, "hrady", new String[] { "hrad" });
|
65
|
+
assertAnalyzesTo(cz, "hradech", new String[] { "hrad" });
|
66
|
+
assertAnalyzesTo(cz, "hradům", new String[] { "hrad" });
|
67
|
+
assertAnalyzesTo(cz, "hradů", new String[] { "hrad" });
|
68
|
+
|
69
|
+
/* animate ending with a soft consonant */
|
70
|
+
assertAnalyzesTo(cz, "muž", new String[] { "muh" });
|
71
|
+
assertAnalyzesTo(cz, "muži", new String[] { "muh" });
|
72
|
+
assertAnalyzesTo(cz, "muže", new String[] { "muh" });
|
73
|
+
assertAnalyzesTo(cz, "mužů", new String[] { "muh" });
|
74
|
+
assertAnalyzesTo(cz, "mužům", new String[] { "muh" });
|
75
|
+
assertAnalyzesTo(cz, "mužích", new String[] { "muh" });
|
76
|
+
assertAnalyzesTo(cz, "mužem", new String[] { "muh" });
|
77
|
+
|
78
|
+
/* inanimate ending with a soft consonant */
|
79
|
+
assertAnalyzesTo(cz, "stroj", new String[] { "stroj" });
|
80
|
+
assertAnalyzesTo(cz, "stroje", new String[] { "stroj" });
|
81
|
+
assertAnalyzesTo(cz, "strojů", new String[] { "stroj" });
|
82
|
+
assertAnalyzesTo(cz, "stroji", new String[] { "stroj" });
|
83
|
+
assertAnalyzesTo(cz, "strojům", new String[] { "stroj" });
|
84
|
+
assertAnalyzesTo(cz, "strojích", new String[] { "stroj" });
|
85
|
+
assertAnalyzesTo(cz, "strojem", new String[] { "stroj" });
|
86
|
+
|
87
|
+
/* ending with a */
|
88
|
+
assertAnalyzesTo(cz, "předseda", new String[] { "předsd" });
|
89
|
+
assertAnalyzesTo(cz, "předsedové", new String[] { "předsd" });
|
90
|
+
assertAnalyzesTo(cz, "předsedy", new String[] { "předsd" });
|
91
|
+
assertAnalyzesTo(cz, "předsedů", new String[] { "předsd" });
|
92
|
+
assertAnalyzesTo(cz, "předsedovi", new String[] { "předsd" });
|
93
|
+
assertAnalyzesTo(cz, "předsedům", new String[] { "předsd" });
|
94
|
+
assertAnalyzesTo(cz, "předsedu", new String[] { "předsd" });
|
95
|
+
assertAnalyzesTo(cz, "předsedo", new String[] { "předsd" });
|
96
|
+
assertAnalyzesTo(cz, "předsedech", new String[] { "předsd" });
|
97
|
+
assertAnalyzesTo(cz, "předsedou", new String[] { "předsd" });
|
98
|
+
|
99
|
+
/* ending with e */
|
100
|
+
assertAnalyzesTo(cz, "soudce", new String[] { "soudk" });
|
101
|
+
assertAnalyzesTo(cz, "soudci", new String[] { "soudk" });
|
102
|
+
assertAnalyzesTo(cz, "soudců", new String[] { "soudk" });
|
103
|
+
assertAnalyzesTo(cz, "soudcům", new String[] { "soudk" });
|
104
|
+
assertAnalyzesTo(cz, "soudcích", new String[] { "soudk" });
|
105
|
+
assertAnalyzesTo(cz, "soudcem", new String[] { "soudk" });
|
106
|
+
}
|
107
|
+
|
108
|
+
/**
|
109
|
+
* Test showing how feminine noun forms conflate
|
110
|
+
*/
|
111
|
+
public void testFeminineNouns() throws IOException {
|
112
|
+
CzechAnalyzer cz = new CzechAnalyzer(TEST_VERSION_CURRENT);
|
113
|
+
|
114
|
+
/* ending with hard consonant */
|
115
|
+
assertAnalyzesTo(cz, "kost", new String[] { "kost" });
|
116
|
+
assertAnalyzesTo(cz, "kosti", new String[] { "kost" });
|
117
|
+
assertAnalyzesTo(cz, "kostí", new String[] { "kost" });
|
118
|
+
assertAnalyzesTo(cz, "kostem", new String[] { "kost" });
|
119
|
+
assertAnalyzesTo(cz, "kostech", new String[] { "kost" });
|
120
|
+
assertAnalyzesTo(cz, "kostmi", new String[] { "kost" });
|
121
|
+
|
122
|
+
/* ending with a soft consonant */
|
123
|
+
// note: in this example the singular nominative and singular accusative don't conflate with the rest
|
124
|
+
assertAnalyzesTo(cz, "píseň", new String[] { "písň" });
|
125
|
+
assertAnalyzesTo(cz, "písně", new String[] { "písn" });
|
126
|
+
assertAnalyzesTo(cz, "písni", new String[] { "písn" });
|
127
|
+
assertAnalyzesTo(cz, "písněmi", new String[] { "písn" });
|
128
|
+
assertAnalyzesTo(cz, "písních", new String[] { "písn" });
|
129
|
+
assertAnalyzesTo(cz, "písním", new String[] { "písn" });
|
130
|
+
|
131
|
+
/* ending with e */
|
132
|
+
assertAnalyzesTo(cz, "růže", new String[] { "růh" });
|
133
|
+
assertAnalyzesTo(cz, "růží", new String[] { "růh" });
|
134
|
+
assertAnalyzesTo(cz, "růžím", new String[] { "růh" });
|
135
|
+
assertAnalyzesTo(cz, "růžích", new String[] { "růh" });
|
136
|
+
assertAnalyzesTo(cz, "růžemi", new String[] { "růh" });
|
137
|
+
assertAnalyzesTo(cz, "růži", new String[] { "růh" });
|
138
|
+
|
139
|
+
/* ending with a */
|
140
|
+
assertAnalyzesTo(cz, "žena", new String[] { "žn" });
|
141
|
+
assertAnalyzesTo(cz, "ženy", new String[] { "žn" });
|
142
|
+
assertAnalyzesTo(cz, "žen", new String[] { "žn" });
|
143
|
+
assertAnalyzesTo(cz, "ženě", new String[] { "žn" });
|
144
|
+
assertAnalyzesTo(cz, "ženám", new String[] { "žn" });
|
145
|
+
assertAnalyzesTo(cz, "ženu", new String[] { "žn" });
|
146
|
+
assertAnalyzesTo(cz, "ženo", new String[] { "žn" });
|
147
|
+
assertAnalyzesTo(cz, "ženách", new String[] { "žn" });
|
148
|
+
assertAnalyzesTo(cz, "ženou", new String[] { "žn" });
|
149
|
+
assertAnalyzesTo(cz, "ženami", new String[] { "žn" });
|
150
|
+
}
|
151
|
+
|
152
|
+
/**
|
153
|
+
* Test showing how neuter noun forms conflate
|
154
|
+
*/
|
155
|
+
public void testNeuterNouns() throws IOException {
|
156
|
+
CzechAnalyzer cz = new CzechAnalyzer(TEST_VERSION_CURRENT);
|
157
|
+
|
158
|
+
/* ending with o */
|
159
|
+
assertAnalyzesTo(cz, "město", new String[] { "měst" });
|
160
|
+
assertAnalyzesTo(cz, "města", new String[] { "měst" });
|
161
|
+
assertAnalyzesTo(cz, "měst", new String[] { "měst" });
|
162
|
+
assertAnalyzesTo(cz, "městu", new String[] { "měst" });
|
163
|
+
assertAnalyzesTo(cz, "městům", new String[] { "měst" });
|
164
|
+
assertAnalyzesTo(cz, "městě", new String[] { "měst" });
|
165
|
+
assertAnalyzesTo(cz, "městech", new String[] { "měst" });
|
166
|
+
assertAnalyzesTo(cz, "městem", new String[] { "měst" });
|
167
|
+
assertAnalyzesTo(cz, "městy", new String[] { "měst" });
|
168
|
+
|
169
|
+
/* ending with e */
|
170
|
+
assertAnalyzesTo(cz, "moře", new String[] { "moř" });
|
171
|
+
assertAnalyzesTo(cz, "moří", new String[] { "moř" });
|
172
|
+
assertAnalyzesTo(cz, "mořím", new String[] { "moř" });
|
173
|
+
assertAnalyzesTo(cz, "moři", new String[] { "moř" });
|
174
|
+
assertAnalyzesTo(cz, "mořích", new String[] { "moř" });
|
175
|
+
assertAnalyzesTo(cz, "mořem", new String[] { "moř" });
|
176
|
+
|
177
|
+
/* ending with ě */
|
178
|
+
assertAnalyzesTo(cz, "kuře", new String[] { "kuř" });
|
179
|
+
assertAnalyzesTo(cz, "kuřata", new String[] { "kuř" });
|
180
|
+
assertAnalyzesTo(cz, "kuřete", new String[] { "kuř" });
|
181
|
+
assertAnalyzesTo(cz, "kuřat", new String[] { "kuř" });
|
182
|
+
assertAnalyzesTo(cz, "kuřeti", new String[] { "kuř" });
|
183
|
+
assertAnalyzesTo(cz, "kuřatům", new String[] { "kuř" });
|
184
|
+
assertAnalyzesTo(cz, "kuřatech", new String[] { "kuř" });
|
185
|
+
assertAnalyzesTo(cz, "kuřetem", new String[] { "kuř" });
|
186
|
+
assertAnalyzesTo(cz, "kuřaty", new String[] { "kuř" });
|
187
|
+
|
188
|
+
/* ending with í */
|
189
|
+
assertAnalyzesTo(cz, "stavení", new String[] { "stavn" });
|
190
|
+
assertAnalyzesTo(cz, "stavením", new String[] { "stavn" });
|
191
|
+
assertAnalyzesTo(cz, "staveních", new String[] { "stavn" });
|
192
|
+
assertAnalyzesTo(cz, "staveními", new String[] { "stavn" });
|
193
|
+
}
|
194
|
+
|
195
|
+
/**
|
196
|
+
* Test showing how adjectival forms conflate
|
197
|
+
*/
|
198
|
+
public void testAdjectives() throws IOException {
|
199
|
+
CzechAnalyzer cz = new CzechAnalyzer(TEST_VERSION_CURRENT);
|
200
|
+
|
201
|
+
/* ending with ý/á/é */
|
202
|
+
assertAnalyzesTo(cz, "mladý", new String[] { "mlad" });
|
203
|
+
assertAnalyzesTo(cz, "mladí", new String[] { "mlad" });
|
204
|
+
assertAnalyzesTo(cz, "mladého", new String[] { "mlad" });
|
205
|
+
assertAnalyzesTo(cz, "mladých", new String[] { "mlad" });
|
206
|
+
assertAnalyzesTo(cz, "mladému", new String[] { "mlad" });
|
207
|
+
assertAnalyzesTo(cz, "mladým", new String[] { "mlad" });
|
208
|
+
assertAnalyzesTo(cz, "mladé", new String[] { "mlad" });
|
209
|
+
assertAnalyzesTo(cz, "mladém", new String[] { "mlad" });
|
210
|
+
assertAnalyzesTo(cz, "mladými", new String[] { "mlad" });
|
211
|
+
assertAnalyzesTo(cz, "mladá", new String[] { "mlad" });
|
212
|
+
assertAnalyzesTo(cz, "mladou", new String[] { "mlad" });
|
213
|
+
|
214
|
+
/* ending with í */
|
215
|
+
assertAnalyzesTo(cz, "jarní", new String[] { "jarn" });
|
216
|
+
assertAnalyzesTo(cz, "jarního", new String[] { "jarn" });
|
217
|
+
assertAnalyzesTo(cz, "jarních", new String[] { "jarn" });
|
218
|
+
assertAnalyzesTo(cz, "jarnímu", new String[] { "jarn" });
|
219
|
+
assertAnalyzesTo(cz, "jarním", new String[] { "jarn" });
|
220
|
+
assertAnalyzesTo(cz, "jarními", new String[] { "jarn" });
|
221
|
+
}
|
222
|
+
|
223
|
+
/**
|
224
|
+
* Test some possessive suffixes
|
225
|
+
*/
|
226
|
+
public void testPossessive() throws IOException {
|
227
|
+
CzechAnalyzer cz = new CzechAnalyzer(TEST_VERSION_CURRENT);
|
228
|
+
assertAnalyzesTo(cz, "Karlův", new String[] { "karl" });
|
229
|
+
assertAnalyzesTo(cz, "jazykový", new String[] { "jazyk" });
|
230
|
+
}
|
231
|
+
|
232
|
+
/**
|
233
|
+
* Test some exceptional rules, implemented as rewrites.
|
234
|
+
*/
|
235
|
+
public void testExceptions() throws IOException {
|
236
|
+
CzechAnalyzer cz = new CzechAnalyzer(TEST_VERSION_CURRENT);
|
237
|
+
|
238
|
+
/* rewrite of št -> sk */
|
239
|
+
assertAnalyzesTo(cz, "český", new String[] { "česk" });
|
240
|
+
assertAnalyzesTo(cz, "čeští", new String[] { "česk" });
|
241
|
+
|
242
|
+
/* rewrite of čt -> ck */
|
243
|
+
assertAnalyzesTo(cz, "anglický", new String[] { "anglick" });
|
244
|
+
assertAnalyzesTo(cz, "angličtí", new String[] { "anglick" });
|
245
|
+
|
246
|
+
/* rewrite of z -> h */
|
247
|
+
assertAnalyzesTo(cz, "kniha", new String[] { "knih" });
|
248
|
+
assertAnalyzesTo(cz, "knize", new String[] { "knih" });
|
249
|
+
|
250
|
+
/* rewrite of ž -> h */
|
251
|
+
assertAnalyzesTo(cz, "mazat", new String[] { "mah" });
|
252
|
+
assertAnalyzesTo(cz, "mažu", new String[] { "mah" });
|
253
|
+
|
254
|
+
/* rewrite of c -> k */
|
255
|
+
assertAnalyzesTo(cz, "kluk", new String[] { "kluk" });
|
256
|
+
assertAnalyzesTo(cz, "kluci", new String[] { "kluk" });
|
257
|
+
assertAnalyzesTo(cz, "klucích", new String[] { "kluk" });
|
258
|
+
|
259
|
+
/* rewrite of č -> k */
|
260
|
+
assertAnalyzesTo(cz, "hezký", new String[] { "hezk" });
|
261
|
+
assertAnalyzesTo(cz, "hezčí", new String[] { "hezk" });
|
262
|
+
|
263
|
+
/* rewrite of *ů* -> *o* */
|
264
|
+
assertAnalyzesTo(cz, "hůl", new String[] { "hol" });
|
265
|
+
assertAnalyzesTo(cz, "hole", new String[] { "hol" });
|
266
|
+
|
267
|
+
/* rewrite of e* -> * */
|
268
|
+
assertAnalyzesTo(cz, "deska", new String[] { "desk" });
|
269
|
+
assertAnalyzesTo(cz, "desek", new String[] { "desk" });
|
270
|
+
}
|
271
|
+
|
272
|
+
/**
|
273
|
+
* Test that very short words are not stemmed.
|
274
|
+
*/
|
275
|
+
public void testDontStem() throws IOException {
|
276
|
+
CzechAnalyzer cz = new CzechAnalyzer(TEST_VERSION_CURRENT);
|
277
|
+
assertAnalyzesTo(cz, "e", new String[] { "e" });
|
278
|
+
assertAnalyzesTo(cz, "zi", new String[] { "zi" });
|
279
|
+
}
|
280
|
+
|
281
|
+
public void testWithKeywordAttribute() throws IOException {
|
282
|
+
CharArraySet set = new CharArraySet(TEST_VERSION_CURRENT, 1, true);
|
283
|
+
set.add("hole");
|
284
|
+
CzechStemFilter filter = new CzechStemFilter(new SetKeywordMarkerFilter(
|
285
|
+
new MockTokenizer(new StringReader("hole desek"), MockTokenizer.WHITESPACE, false), set));
|
286
|
+
assertTokenStreamContents(filter, new String[] { "hole", "desk" });
|
287
|
+
}
|
288
|
+
|
289
|
+
public void testEmptyTerm() throws IOException {
|
290
|
+
Analyzer a = new Analyzer() {
|
291
|
+
@Override
|
292
|
+
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
|
293
|
+
Tokenizer tokenizer = new KeywordTokenizer(reader);
|
294
|
+
return new TokenStreamComponents(tokenizer, new CzechStemFilter(tokenizer));
|
295
|
+
}
|
296
|
+
};
|
297
|
+
checkOneTerm(a, "", "");
|
298
|
+
}
|
299
|
+
|
300
|
+
}
|
@@ -0,0 +1,300 @@
|
|
1
|
+
package org.apache.lucene.analysis.cz;
|
2
|
+
|
3
|
+
#
|
4
|
+
* Licensed to the Apache Software Foundation (ASF) under one or more
|
5
|
+
* contributor license agreements. See the NOTICE file distributed with
|
6
|
+
* this work for additional information regarding copyright ownership.
|
7
|
+
* The ASF licenses this file to You under the Apache License, Version 2.0
|
8
|
+
* (the "License"); you may not use this file except in compliance with
|
9
|
+
* the License. You may obtain a copy of the License at
|
10
|
+
*
|
11
|
+
* http://www.apache.org/licenses/LICENSE-2.0
|
12
|
+
*
|
13
|
+
* Unless required by applicable law or agreed to in writing, software
|
14
|
+
* distributed under the License is distributed on an "AS IS" BASIS,
|
15
|
+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
16
|
+
* See the License for the specific language governing permissions and
|
17
|
+
* limitations under the License.
|
18
|
+
|
19
|
+
|
20
|
+
import java.io.IOException;
|
21
|
+
import java.io.Reader;
|
22
|
+
import java.io.StringReader;
|
23
|
+
|
24
|
+
import org.apache.lucene.analysis.Analyzer;
|
25
|
+
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
26
|
+
import org.apache.lucene.analysis.MockTokenizer;
|
27
|
+
import org.apache.lucene.analysis.Tokenizer;
|
28
|
+
import org.apache.lucene.analysis.core.KeywordTokenizer;
|
29
|
+
import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
|
30
|
+
import org.apache.lucene.analysis.util.CharArraySet;
|
31
|
+
|
32
|
+
#*
|
33
|
+
* Test the Czech Stemmer.
|
34
|
+
*
|
35
|
+
* Note: it's algorithmic, so some stems are nonsense
|
36
|
+
*
|
37
|
+
|
38
|
+
public class TestCzechStemmer extends BaseTokenStreamTestCase {
|
39
|
+
|
40
|
+
#*
|
41
|
+
* Test showing how masculine noun forms conflate
|
42
|
+
|
43
|
+
public void testMasculineNouns() throws IOException {
|
44
|
+
CzechAnalyzer cz = new CzechAnalyzer(TEST_VERSION_CURRENT);
|
45
|
+
|
46
|
+
# animate ending with a hard consonant
|
47
|
+
assert_analyzes_to( "pán", "pán" )
|
48
|
+
assert_analyzes_to( "páni", "pán" )
|
49
|
+
assert_analyzes_to( "pánové", "pán" )
|
50
|
+
assert_analyzes_to( "pána", "pán" )
|
51
|
+
assert_analyzes_to( "pánů", "pán" )
|
52
|
+
assert_analyzes_to( "pánovi", "pán" )
|
53
|
+
assert_analyzes_to( "pánům", "pán" )
|
54
|
+
assert_analyzes_to( "pány", "pán" )
|
55
|
+
assert_analyzes_to( "páne", "pán" )
|
56
|
+
assert_analyzes_to( "pánech", "pán" )
|
57
|
+
assert_analyzes_to( "pánem", "pán" )
|
58
|
+
|
59
|
+
# inanimate ending with hard consonant
|
60
|
+
assert_analyzes_to( "hrad", "hrad" )
|
61
|
+
assert_analyzes_to( "hradu", "hrad" )
|
62
|
+
assert_analyzes_to( "hrade", "hrad" )
|
63
|
+
assert_analyzes_to( "hradem", "hrad" )
|
64
|
+
assert_analyzes_to( "hrady", "hrad" )
|
65
|
+
assert_analyzes_to( "hradech", "hrad" )
|
66
|
+
assert_analyzes_to( "hradům", "hrad" )
|
67
|
+
assert_analyzes_to( "hradů", "hrad" )
|
68
|
+
|
69
|
+
# animate ending with a soft consonant
|
70
|
+
assert_analyzes_to( "muž", "muh" )
|
71
|
+
assert_analyzes_to( "muži", "muh" )
|
72
|
+
assert_analyzes_to( "muže", "muh" )
|
73
|
+
assert_analyzes_to( "mužů", "muh" )
|
74
|
+
assert_analyzes_to( "mužům", "muh" )
|
75
|
+
assert_analyzes_to( "mužích", "muh" )
|
76
|
+
assert_analyzes_to( "mužem", "muh" )
|
77
|
+
|
78
|
+
# inanimate ending with a soft consonant
|
79
|
+
assert_analyzes_to( "stroj", "stroj" )
|
80
|
+
assert_analyzes_to( "stroje", "stroj" )
|
81
|
+
assert_analyzes_to( "strojů", "stroj" )
|
82
|
+
assert_analyzes_to( "stroji", "stroj" )
|
83
|
+
assert_analyzes_to( "strojům", "stroj" )
|
84
|
+
assert_analyzes_to( "strojích", "stroj" )
|
85
|
+
assert_analyzes_to( "strojem", "stroj" )
|
86
|
+
|
87
|
+
# ending with a
|
88
|
+
assert_analyzes_to( "předseda", "předsd" )
|
89
|
+
assert_analyzes_to( "předsedové", "předsd" )
|
90
|
+
assert_analyzes_to( "předsedy", "předsd" )
|
91
|
+
assert_analyzes_to( "předsedů", "předsd" )
|
92
|
+
assert_analyzes_to( "předsedovi", "předsd" )
|
93
|
+
assert_analyzes_to( "předsedům", "předsd" )
|
94
|
+
assert_analyzes_to( "předsedu", "předsd" )
|
95
|
+
assert_analyzes_to( "předsedo", "předsd" )
|
96
|
+
assert_analyzes_to( "předsedech", "předsd" )
|
97
|
+
assert_analyzes_to( "předsedou", "předsd" )
|
98
|
+
|
99
|
+
# ending with e
|
100
|
+
assert_analyzes_to( "soudce", "soudk" )
|
101
|
+
assert_analyzes_to( "soudci", "soudk" )
|
102
|
+
assert_analyzes_to( "soudců", "soudk" )
|
103
|
+
assert_analyzes_to( "soudcům", "soudk" )
|
104
|
+
assert_analyzes_to( "soudcích", "soudk" )
|
105
|
+
assert_analyzes_to( "soudcem", "soudk" )
|
106
|
+
}
|
107
|
+
|
108
|
+
#*
|
109
|
+
* Test showing how feminine noun forms conflate
|
110
|
+
|
111
|
+
public void testFeminineNouns() throws IOException {
|
112
|
+
CzechAnalyzer cz = new CzechAnalyzer(TEST_VERSION_CURRENT);
|
113
|
+
|
114
|
+
# ending with hard consonant
|
115
|
+
assert_analyzes_to( "kost", "kost" )
|
116
|
+
assert_analyzes_to( "kosti", "kost" )
|
117
|
+
assert_analyzes_to( "kostí", "kost" )
|
118
|
+
assert_analyzes_to( "kostem", "kost" )
|
119
|
+
assert_analyzes_to( "kostech", "kost" )
|
120
|
+
assert_analyzes_to( "kostmi", "kost" )
|
121
|
+
|
122
|
+
# ending with a soft consonant
|
123
|
+
// note: in this example the singular nominative and singular accusative don't conflate with the rest
|
124
|
+
assert_analyzes_to( "píseň", "písň" )
|
125
|
+
assert_analyzes_to( "písně", "písn" )
|
126
|
+
assert_analyzes_to( "písni", "písn" )
|
127
|
+
assert_analyzes_to( "písněmi", "písn" )
|
128
|
+
assert_analyzes_to( "písních", "písn" )
|
129
|
+
assert_analyzes_to( "písním", "písn" )
|
130
|
+
|
131
|
+
# ending with e
|
132
|
+
assert_analyzes_to( "růže", "růh" )
|
133
|
+
assert_analyzes_to( "růží", "růh" )
|
134
|
+
assert_analyzes_to( "růžím", "růh" )
|
135
|
+
assert_analyzes_to( "růžích", "růh" )
|
136
|
+
assert_analyzes_to( "růžemi", "růh" )
|
137
|
+
assert_analyzes_to( "růži", "růh" )
|
138
|
+
|
139
|
+
# ending with a
|
140
|
+
assert_analyzes_to( "žena", "žn" )
|
141
|
+
assert_analyzes_to( "ženy", "žn" )
|
142
|
+
assert_analyzes_to( "žen", "žn" )
|
143
|
+
assert_analyzes_to( "ženě", "žn" )
|
144
|
+
assert_analyzes_to( "ženám", "žn" )
|
145
|
+
assert_analyzes_to( "ženu", "žn" )
|
146
|
+
assert_analyzes_to( "ženo", "žn" )
|
147
|
+
assert_analyzes_to( "ženách", "žn" )
|
148
|
+
assert_analyzes_to( "ženou", "žn" )
|
149
|
+
assert_analyzes_to( "ženami", "žn" )
|
150
|
+
}
|
151
|
+
|
152
|
+
#*
|
153
|
+
* Test showing how neuter noun forms conflate
|
154
|
+
|
155
|
+
public void testNeuterNouns() throws IOException {
|
156
|
+
CzechAnalyzer cz = new CzechAnalyzer(TEST_VERSION_CURRENT);
|
157
|
+
|
158
|
+
# ending with o
|
159
|
+
assert_analyzes_to( "město", "měst" )
|
160
|
+
assert_analyzes_to( "města", "měst" )
|
161
|
+
assert_analyzes_to( "měst", "měst" )
|
162
|
+
assert_analyzes_to( "městu", "měst" )
|
163
|
+
assert_analyzes_to( "městům", "měst" )
|
164
|
+
assert_analyzes_to( "městě", "měst" )
|
165
|
+
assert_analyzes_to( "městech", "měst" )
|
166
|
+
assert_analyzes_to( "městem", "měst" )
|
167
|
+
assert_analyzes_to( "městy", "měst" )
|
168
|
+
|
169
|
+
# ending with e
|
170
|
+
assert_analyzes_to( "moře", "moř" )
|
171
|
+
assert_analyzes_to( "moří", "moř" )
|
172
|
+
assert_analyzes_to( "mořím", "moř" )
|
173
|
+
assert_analyzes_to( "moři", "moř" )
|
174
|
+
assert_analyzes_to( "mořích", "moř" )
|
175
|
+
assert_analyzes_to( "mořem", "moř" )
|
176
|
+
|
177
|
+
# ending with ě
|
178
|
+
assert_analyzes_to( "kuře", "kuř" )
|
179
|
+
assert_analyzes_to( "kuřata", "kuř" )
|
180
|
+
assert_analyzes_to( "kuřete", "kuř" )
|
181
|
+
assert_analyzes_to( "kuřat", "kuř" )
|
182
|
+
assert_analyzes_to( "kuřeti", "kuř" )
|
183
|
+
assert_analyzes_to( "kuřatům", "kuř" )
|
184
|
+
assert_analyzes_to( "kuřatech", "kuř" )
|
185
|
+
assert_analyzes_to( "kuřetem", "kuř" )
|
186
|
+
assert_analyzes_to( "kuřaty", "kuř" )
|
187
|
+
|
188
|
+
# ending with í
|
189
|
+
assert_analyzes_to( "stavení", "stavn" )
|
190
|
+
assert_analyzes_to( "stavením", "stavn" )
|
191
|
+
assert_analyzes_to( "staveních", "stavn" )
|
192
|
+
assert_analyzes_to( "staveními", "stavn" )
|
193
|
+
}
|
194
|
+
|
195
|
+
#*
|
196
|
+
* Test showing how adjectival forms conflate
|
197
|
+
|
198
|
+
public void testAdjectives() throws IOException {
|
199
|
+
CzechAnalyzer cz = new CzechAnalyzer(TEST_VERSION_CURRENT);
|
200
|
+
|
201
|
+
# ending with ý/á/é
|
202
|
+
assert_analyzes_to( "mladý", "mlad" )
|
203
|
+
assert_analyzes_to( "mladí", "mlad" )
|
204
|
+
assert_analyzes_to( "mladého", "mlad" )
|
205
|
+
assert_analyzes_to( "mladých", "mlad" )
|
206
|
+
assert_analyzes_to( "mladému", "mlad" )
|
207
|
+
assert_analyzes_to( "mladým", "mlad" )
|
208
|
+
assert_analyzes_to( "mladé", "mlad" )
|
209
|
+
assert_analyzes_to( "mladém", "mlad" )
|
210
|
+
assert_analyzes_to( "mladými", "mlad" )
|
211
|
+
assert_analyzes_to( "mladá", "mlad" )
|
212
|
+
assert_analyzes_to( "mladou", "mlad" )
|
213
|
+
|
214
|
+
# ending with í
|
215
|
+
assert_analyzes_to( "jarní", "jarn" )
|
216
|
+
assert_analyzes_to( "jarního", "jarn" )
|
217
|
+
assert_analyzes_to( "jarních", "jarn" )
|
218
|
+
assert_analyzes_to( "jarnímu", "jarn" )
|
219
|
+
assert_analyzes_to( "jarním", "jarn" )
|
220
|
+
assert_analyzes_to( "jarními", "jarn" )
|
221
|
+
}
|
222
|
+
|
223
|
+
#*
|
224
|
+
* Test some possessive suffixes
|
225
|
+
|
226
|
+
public void testPossessive() throws IOException {
|
227
|
+
CzechAnalyzer cz = new CzechAnalyzer(TEST_VERSION_CURRENT);
|
228
|
+
assert_analyzes_to( "Karlův", "karl" )
|
229
|
+
assert_analyzes_to( "jazykový", "jazyk" )
|
230
|
+
}
|
231
|
+
|
232
|
+
#*
|
233
|
+
* Test some exceptional rules, implemented as rewrites.
|
234
|
+
|
235
|
+
public void testExceptions() throws IOException {
|
236
|
+
CzechAnalyzer cz = new CzechAnalyzer(TEST_VERSION_CURRENT);
|
237
|
+
|
238
|
+
# rewrite of št -> sk
|
239
|
+
assert_analyzes_to( "český", "česk" )
|
240
|
+
assert_analyzes_to( "čeští", "česk" )
|
241
|
+
|
242
|
+
# rewrite of čt -> ck
|
243
|
+
assert_analyzes_to( "anglický", "anglick" )
|
244
|
+
assert_analyzes_to( "angličtí", "anglick" )
|
245
|
+
|
246
|
+
# rewrite of z -> h
|
247
|
+
assert_analyzes_to( "kniha", "knih" )
|
248
|
+
assert_analyzes_to( "knize", "knih" )
|
249
|
+
|
250
|
+
# rewrite of ž -> h
|
251
|
+
assert_analyzes_to( "mazat", "mah" )
|
252
|
+
assert_analyzes_to( "mažu", "mah" )
|
253
|
+
|
254
|
+
# rewrite of c -> k
|
255
|
+
assert_analyzes_to( "kluk", "kluk" )
|
256
|
+
assert_analyzes_to( "kluci", "kluk" )
|
257
|
+
assert_analyzes_to( "klucích", "kluk" )
|
258
|
+
|
259
|
+
# rewrite of č -> k
|
260
|
+
assert_analyzes_to( "hezký", "hezk" )
|
261
|
+
assert_analyzes_to( "hezčí", "hezk" )
|
262
|
+
|
263
|
+
# rewrite of *ů* -> *o*
|
264
|
+
assert_analyzes_to( "hůl", "hol" )
|
265
|
+
assert_analyzes_to( "hole", "hol" )
|
266
|
+
|
267
|
+
# rewrite of e* -> *
|
268
|
+
assert_analyzes_to( "deska", "desk" )
|
269
|
+
assert_analyzes_to( "desek", "desk" )
|
270
|
+
}
|
271
|
+
|
272
|
+
#*
|
273
|
+
* Test that very short words are not stemmed.
|
274
|
+
|
275
|
+
public void testDontStem() throws IOException {
|
276
|
+
CzechAnalyzer cz = new CzechAnalyzer(TEST_VERSION_CURRENT);
|
277
|
+
assert_analyzes_to( "e", "e" )
|
278
|
+
assert_analyzes_to( "zi", "zi" )
|
279
|
+
}
|
280
|
+
|
281
|
+
public void testWithKeywordAttribute() throws IOException {
|
282
|
+
CharArraySet set = new CharArraySet(TEST_VERSION_CURRENT, 1, true);
|
283
|
+
set.add("hole");
|
284
|
+
CzechStemFilter filter = new CzechStemFilter(new SetKeywordMarkerFilter(
|
285
|
+
new MockTokenizer(new StringReader("hole desek"), MockTokenizer.WHITESPACE, false), set));
|
286
|
+
assertTokenStreamContents(filter, "hole", "desk" )
|
287
|
+
}
|
288
|
+
|
289
|
+
public void testEmptyTerm() throws IOException {
|
290
|
+
Analyzer a = new Analyzer() {
|
291
|
+
@Override
|
292
|
+
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
|
293
|
+
Tokenizer tokenizer = new KeywordTokenizer(reader);
|
294
|
+
return new TokenStreamComponents(tokenizer, new CzechStemFilter(tokenizer));
|
295
|
+
}
|
296
|
+
};
|
297
|
+
checkOneTerm(a, "", "");
|
298
|
+
}
|
299
|
+
|
300
|
+
}
|
data/test/helper.rb
ADDED