czech-stemmer 0.0.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.document +5 -0
- data/Gemfile +9 -0
- data/Gemfile.lock +80 -0
- data/LICENSE.txt +20 -0
- data/README.markdown +10 -0
- data/Rakefile +51 -0
- data/VERSION +1 -0
- data/czech-stemmer.gemspec +66 -0
- data/lib/czech-stemmer.rb +125 -0
- data/test/CzechStemmer.java +173 -0
- data/test/TestCzechStemmer.java +300 -0
- data/test/TestCzechStemmer.java.txt +300 -0
- data/test/helper.rb +2 -0
- data/test/java_test_converter.bash +7 -0
- data/test/test_czech-stemmer.rb +221 -0
- metadata +130 -0
@@ -0,0 +1,221 @@
|
|
1
|
+
require 'helper'
|
2
|
+
|
3
|
+
|
4
|
+
class TestStemmer < Minitest::Test
|
5
|
+
|
6
|
+
def test_masculine_nouns
|
7
|
+
# animate ending with a hard consonant
|
8
|
+
assert_analyzes_to( "pán", "pán" )
|
9
|
+
assert_analyzes_to( "páni", "pán" )
|
10
|
+
assert_analyzes_to( "pánové", "pán" )
|
11
|
+
assert_analyzes_to( "pána", "pán" )
|
12
|
+
assert_analyzes_to( "pánů", "pán" )
|
13
|
+
assert_analyzes_to( "pánovi", "pán" )
|
14
|
+
assert_analyzes_to( "pánům", "pán" )
|
15
|
+
assert_analyzes_to( "pány", "pán" )
|
16
|
+
assert_analyzes_to( "páne", "pán" )
|
17
|
+
assert_analyzes_to( "pánech", "pán" )
|
18
|
+
assert_analyzes_to( "pánem", "pán" )
|
19
|
+
|
20
|
+
# inanimate ending with hard consonant
|
21
|
+
assert_analyzes_to( "hrad", "hrad" )
|
22
|
+
assert_analyzes_to( "hradu", "hrad" )
|
23
|
+
assert_analyzes_to( "hrade", "hrad" )
|
24
|
+
assert_analyzes_to( "hradem", "hrad" )
|
25
|
+
assert_analyzes_to( "hrady", "hrad" )
|
26
|
+
assert_analyzes_to( "hradech", "hrad" )
|
27
|
+
assert_analyzes_to( "hradům", "hrad" )
|
28
|
+
assert_analyzes_to( "hradů", "hrad" )
|
29
|
+
|
30
|
+
# animate ending with a soft consonant
|
31
|
+
assert_analyzes_to( "muž", "muh" )
|
32
|
+
assert_analyzes_to( "muži", "muh" )
|
33
|
+
assert_analyzes_to( "muže", "muh" )
|
34
|
+
assert_analyzes_to( "mužů", "muh" )
|
35
|
+
assert_analyzes_to( "mužům", "muh" )
|
36
|
+
assert_analyzes_to( "mužích", "muh" )
|
37
|
+
assert_analyzes_to( "mužem", "muh" )
|
38
|
+
|
39
|
+
# inanimate ending with a soft consonant
|
40
|
+
assert_analyzes_to( "stroj", "stroj" )
|
41
|
+
assert_analyzes_to( "stroje", "stroj" )
|
42
|
+
assert_analyzes_to( "strojů", "stroj" )
|
43
|
+
assert_analyzes_to( "stroji", "stroj" )
|
44
|
+
assert_analyzes_to( "strojům", "stroj" )
|
45
|
+
assert_analyzes_to( "strojích", "stroj" )
|
46
|
+
assert_analyzes_to( "strojem", "stroj" )
|
47
|
+
|
48
|
+
# ending with a
|
49
|
+
assert_analyzes_to( "předseda", "předsd" )
|
50
|
+
assert_analyzes_to( "předsedové", "předsd" )
|
51
|
+
assert_analyzes_to( "předsedy", "předsd" )
|
52
|
+
assert_analyzes_to( "předsedů", "předsd" )
|
53
|
+
assert_analyzes_to( "předsedovi", "předsd" )
|
54
|
+
assert_analyzes_to( "předsedům", "předsd" )
|
55
|
+
assert_analyzes_to( "předsedu", "předsd" )
|
56
|
+
assert_analyzes_to( "předsedo", "předsd" )
|
57
|
+
assert_analyzes_to( "předsedech", "předsd" )
|
58
|
+
assert_analyzes_to( "předsedou", "předsd" )
|
59
|
+
|
60
|
+
# ending with e
|
61
|
+
assert_analyzes_to( "soudce", "soudk" )
|
62
|
+
assert_analyzes_to( "soudci", "soudk" )
|
63
|
+
assert_analyzes_to( "soudců", "soudk" )
|
64
|
+
assert_analyzes_to( "soudcům", "soudk" )
|
65
|
+
assert_analyzes_to( "soudcích", "soudk" )
|
66
|
+
assert_analyzes_to( "soudcem", "soudk" )
|
67
|
+
end
|
68
|
+
|
69
|
+
def test_feminine_nouns
|
70
|
+
# ending with hard consonant
|
71
|
+
assert_analyzes_to( "kost", "kost" )
|
72
|
+
assert_analyzes_to( "kosti", "kost" )
|
73
|
+
assert_analyzes_to( "kostí", "kost" )
|
74
|
+
assert_analyzes_to( "kostem", "kost" )
|
75
|
+
assert_analyzes_to( "kostech", "kost" )
|
76
|
+
assert_analyzes_to( "kostmi", "kost" )
|
77
|
+
|
78
|
+
# ending with a soft consonant
|
79
|
+
# note: in this example sing nom. and sing acc. don't conflate w/ the rest
|
80
|
+
assert_analyzes_to( "píseň", "písň" )
|
81
|
+
assert_analyzes_to( "písně", "písn" )
|
82
|
+
assert_analyzes_to( "písni", "písn" )
|
83
|
+
assert_analyzes_to( "písněmi", "písn" )
|
84
|
+
assert_analyzes_to( "písních", "písn" )
|
85
|
+
assert_analyzes_to( "písním", "písn" )
|
86
|
+
|
87
|
+
# ending with e
|
88
|
+
assert_analyzes_to( "růže", "růh" )
|
89
|
+
assert_analyzes_to( "růží", "růh" )
|
90
|
+
assert_analyzes_to( "růžím", "růh" )
|
91
|
+
assert_analyzes_to( "růžích", "růh" )
|
92
|
+
assert_analyzes_to( "růžemi", "růh" )
|
93
|
+
assert_analyzes_to( "růži", "růh" )
|
94
|
+
|
95
|
+
# ending with a
|
96
|
+
assert_analyzes_to( "žena", "žn" )
|
97
|
+
assert_analyzes_to( "ženy", "žn" )
|
98
|
+
assert_analyzes_to( "žen", "žn" )
|
99
|
+
assert_analyzes_to( "ženě", "žn" )
|
100
|
+
assert_analyzes_to( "ženám", "žn" )
|
101
|
+
assert_analyzes_to( "ženu", "žn" )
|
102
|
+
assert_analyzes_to( "ženo", "žn" )
|
103
|
+
assert_analyzes_to( "ženách", "žn" )
|
104
|
+
assert_analyzes_to( "ženou", "žn" )
|
105
|
+
assert_analyzes_to( "ženami", "žn" )
|
106
|
+
|
107
|
+
end
|
108
|
+
|
109
|
+
def test_neuter_nouns
|
110
|
+
# ending with o
|
111
|
+
assert_analyzes_to( "město", "měst" )
|
112
|
+
assert_analyzes_to( "města", "měst" )
|
113
|
+
assert_analyzes_to( "měst", "měst" )
|
114
|
+
assert_analyzes_to( "městu", "měst" )
|
115
|
+
assert_analyzes_to( "městům", "měst" )
|
116
|
+
assert_analyzes_to( "městě", "měst" )
|
117
|
+
assert_analyzes_to( "městech", "měst" )
|
118
|
+
assert_analyzes_to( "městem", "měst" )
|
119
|
+
assert_analyzes_to( "městy", "měst" )
|
120
|
+
|
121
|
+
# ending with e
|
122
|
+
assert_analyzes_to( "moře", "moř" )
|
123
|
+
assert_analyzes_to( "moří", "moř" )
|
124
|
+
assert_analyzes_to( "mořím", "moř" )
|
125
|
+
assert_analyzes_to( "moři", "moř" )
|
126
|
+
assert_analyzes_to( "mořích", "moř" )
|
127
|
+
assert_analyzes_to( "mořem", "moř" )
|
128
|
+
|
129
|
+
# ending with ě
|
130
|
+
assert_analyzes_to( "kuře", "kuř" )
|
131
|
+
assert_analyzes_to( "kuřata", "kuř" )
|
132
|
+
assert_analyzes_to( "kuřete", "kuř" )
|
133
|
+
assert_analyzes_to( "kuřat", "kuř" )
|
134
|
+
assert_analyzes_to( "kuřeti", "kuř" )
|
135
|
+
assert_analyzes_to( "kuřatům", "kuř" )
|
136
|
+
assert_analyzes_to( "kuřatech", "kuř" )
|
137
|
+
assert_analyzes_to( "kuřetem", "kuř" )
|
138
|
+
assert_analyzes_to( "kuřaty", "kuř" )
|
139
|
+
|
140
|
+
# ending with í
|
141
|
+
assert_analyzes_to( "stavení", "stavn" )
|
142
|
+
assert_analyzes_to( "stavením", "stavn" )
|
143
|
+
assert_analyzes_to( "staveních", "stavn" )
|
144
|
+
assert_analyzes_to( "staveními", "stavn" )
|
145
|
+
|
146
|
+
end
|
147
|
+
|
148
|
+
def test_adjectives
|
149
|
+
# ending with ý/á/é
|
150
|
+
assert_analyzes_to( "mladý", "mlad" )
|
151
|
+
assert_analyzes_to( "mladí", "mlad" )
|
152
|
+
assert_analyzes_to( "mladého", "mlad" )
|
153
|
+
assert_analyzes_to( "mladých", "mlad" )
|
154
|
+
assert_analyzes_to( "mladému", "mlad" )
|
155
|
+
assert_analyzes_to( "mladým", "mlad" )
|
156
|
+
assert_analyzes_to( "mladé", "mlad" )
|
157
|
+
assert_analyzes_to( "mladém", "mlad" )
|
158
|
+
assert_analyzes_to( "mladými", "mlad" )
|
159
|
+
assert_analyzes_to( "mladá", "mlad" )
|
160
|
+
assert_analyzes_to( "mladou", "mlad" )
|
161
|
+
|
162
|
+
# ending with í
|
163
|
+
assert_analyzes_to( "jarní", "jarn" )
|
164
|
+
assert_analyzes_to( "jarního", "jarn" )
|
165
|
+
assert_analyzes_to( "jarních", "jarn" )
|
166
|
+
assert_analyzes_to( "jarnímu", "jarn" )
|
167
|
+
assert_analyzes_to( "jarním", "jarn" )
|
168
|
+
assert_analyzes_to( "jarními", "jarn" )
|
169
|
+
end
|
170
|
+
|
171
|
+
def test_possessive
|
172
|
+
# Lucene test case uses "Karlův"
|
173
|
+
assert_analyzes_to( "karlův", "karl" )
|
174
|
+
assert_analyzes_to( "jazykový", "jazyk" )
|
175
|
+
end
|
176
|
+
|
177
|
+
def test_exceptions
|
178
|
+
# rewrite of št -> sk
|
179
|
+
assert_analyzes_to( "český", "česk" )
|
180
|
+
assert_analyzes_to( "čeští", "česk" )
|
181
|
+
|
182
|
+
# rewrite of čt -> ck
|
183
|
+
assert_analyzes_to( "anglický", "anglick" )
|
184
|
+
assert_analyzes_to( "angličtí", "anglick" )
|
185
|
+
|
186
|
+
# rewrite of z -> h
|
187
|
+
assert_analyzes_to( "kniha", "knih" )
|
188
|
+
assert_analyzes_to( "knize", "knih" )
|
189
|
+
|
190
|
+
# rewrite of ž -> h
|
191
|
+
assert_analyzes_to( "mazat", "mah" )
|
192
|
+
assert_analyzes_to( "mažu", "mah" )
|
193
|
+
|
194
|
+
# rewrite of c -> k
|
195
|
+
assert_analyzes_to( "kluk", "kluk" )
|
196
|
+
assert_analyzes_to( "kluci", "kluk" )
|
197
|
+
assert_analyzes_to( "klucích", "kluk" )
|
198
|
+
|
199
|
+
# rewrite of č -> k
|
200
|
+
assert_analyzes_to( "hezký", "hezk" )
|
201
|
+
assert_analyzes_to( "hezčí", "hezk" )
|
202
|
+
|
203
|
+
# rewrite of *ů* -> *o*
|
204
|
+
assert_analyzes_to( "hůl", "hol" )
|
205
|
+
assert_analyzes_to( "hole", "hol" )
|
206
|
+
|
207
|
+
# rewrite of e* -> *
|
208
|
+
assert_analyzes_to( "deska", "desk" )
|
209
|
+
assert_analyzes_to( "desek", "desk" )
|
210
|
+
end
|
211
|
+
|
212
|
+
def test_dont_stem
|
213
|
+
assert_analyzes_to( "e", "e" )
|
214
|
+
assert_analyzes_to( "zi", "zi" )
|
215
|
+
end
|
216
|
+
|
217
|
+
def assert_analyzes_to word, stem
|
218
|
+
assert_equal(stem, CzechStemmer.stem(word))
|
219
|
+
end
|
220
|
+
|
221
|
+
end
|
metadata
ADDED
@@ -0,0 +1,130 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: czech-stemmer
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.0
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Ondrej Odchazel
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
date: 2014-06-24 00:00:00.000000000 Z
|
12
|
+
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: shoulda
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - '>='
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: '0'
|
20
|
+
type: :development
|
21
|
+
prerelease: false
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - '>='
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: '0'
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: rdoc
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - '>='
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: '0'
|
34
|
+
type: :development
|
35
|
+
prerelease: false
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - '>='
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: '0'
|
41
|
+
- !ruby/object:Gem::Dependency
|
42
|
+
name: bundler
|
43
|
+
requirement: !ruby/object:Gem::Requirement
|
44
|
+
requirements:
|
45
|
+
- - ~>
|
46
|
+
- !ruby/object:Gem::Version
|
47
|
+
version: '1.0'
|
48
|
+
type: :development
|
49
|
+
prerelease: false
|
50
|
+
version_requirements: !ruby/object:Gem::Requirement
|
51
|
+
requirements:
|
52
|
+
- - ~>
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: '1.0'
|
55
|
+
- !ruby/object:Gem::Dependency
|
56
|
+
name: jeweler
|
57
|
+
requirement: !ruby/object:Gem::Requirement
|
58
|
+
requirements:
|
59
|
+
- - ~>
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: 2.0.1
|
62
|
+
type: :development
|
63
|
+
prerelease: false
|
64
|
+
version_requirements: !ruby/object:Gem::Requirement
|
65
|
+
requirements:
|
66
|
+
- - ~>
|
67
|
+
- !ruby/object:Gem::Version
|
68
|
+
version: 2.0.1
|
69
|
+
- !ruby/object:Gem::Dependency
|
70
|
+
name: simplecov
|
71
|
+
requirement: !ruby/object:Gem::Requirement
|
72
|
+
requirements:
|
73
|
+
- - '>='
|
74
|
+
- !ruby/object:Gem::Version
|
75
|
+
version: '0'
|
76
|
+
type: :development
|
77
|
+
prerelease: false
|
78
|
+
version_requirements: !ruby/object:Gem::Requirement
|
79
|
+
requirements:
|
80
|
+
- - '>='
|
81
|
+
- !ruby/object:Gem::Version
|
82
|
+
version: '0'
|
83
|
+
description: Based on Lucene implementation
|
84
|
+
email: hypertornado@gmail.com
|
85
|
+
executables: []
|
86
|
+
extensions: []
|
87
|
+
extra_rdoc_files:
|
88
|
+
- LICENSE.txt
|
89
|
+
- README.markdown
|
90
|
+
files:
|
91
|
+
- .document
|
92
|
+
- Gemfile
|
93
|
+
- Gemfile.lock
|
94
|
+
- LICENSE.txt
|
95
|
+
- README.markdown
|
96
|
+
- Rakefile
|
97
|
+
- VERSION
|
98
|
+
- czech-stemmer.gemspec
|
99
|
+
- lib/czech-stemmer.rb
|
100
|
+
- test/CzechStemmer.java
|
101
|
+
- test/TestCzechStemmer.java
|
102
|
+
- test/TestCzechStemmer.java.txt
|
103
|
+
- test/helper.rb
|
104
|
+
- test/java_test_converter.bash
|
105
|
+
- test/test_czech-stemmer.rb
|
106
|
+
homepage: http://github.com/hypertornado/czech-stemmer
|
107
|
+
licenses:
|
108
|
+
- MIT
|
109
|
+
metadata: {}
|
110
|
+
post_install_message:
|
111
|
+
rdoc_options: []
|
112
|
+
require_paths:
|
113
|
+
- lib
|
114
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
115
|
+
requirements:
|
116
|
+
- - '>='
|
117
|
+
- !ruby/object:Gem::Version
|
118
|
+
version: '0'
|
119
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
120
|
+
requirements:
|
121
|
+
- - '>='
|
122
|
+
- !ruby/object:Gem::Version
|
123
|
+
version: '0'
|
124
|
+
requirements: []
|
125
|
+
rubyforge_project:
|
126
|
+
rubygems_version: 2.0.14
|
127
|
+
signing_key:
|
128
|
+
specification_version: 4
|
129
|
+
summary: Ruby port of czech stemmer in Lucene
|
130
|
+
test_files: []
|