czech-stemmer 0.0.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,221 @@
1
+ require 'helper'
2
+
3
+
4
+ class TestStemmer < Minitest::Test
5
+
6
+ def test_masculine_nouns
7
+ # animate ending with a hard consonant
8
+ assert_analyzes_to( "pán", "pán" )
9
+ assert_analyzes_to( "páni", "pán" )
10
+ assert_analyzes_to( "pánové", "pán" )
11
+ assert_analyzes_to( "pána", "pán" )
12
+ assert_analyzes_to( "pánů", "pán" )
13
+ assert_analyzes_to( "pánovi", "pán" )
14
+ assert_analyzes_to( "pánům", "pán" )
15
+ assert_analyzes_to( "pány", "pán" )
16
+ assert_analyzes_to( "páne", "pán" )
17
+ assert_analyzes_to( "pánech", "pán" )
18
+ assert_analyzes_to( "pánem", "pán" )
19
+
20
+ # inanimate ending with hard consonant
21
+ assert_analyzes_to( "hrad", "hrad" )
22
+ assert_analyzes_to( "hradu", "hrad" )
23
+ assert_analyzes_to( "hrade", "hrad" )
24
+ assert_analyzes_to( "hradem", "hrad" )
25
+ assert_analyzes_to( "hrady", "hrad" )
26
+ assert_analyzes_to( "hradech", "hrad" )
27
+ assert_analyzes_to( "hradům", "hrad" )
28
+ assert_analyzes_to( "hradů", "hrad" )
29
+
30
+ # animate ending with a soft consonant
31
+ assert_analyzes_to( "muž", "muh" )
32
+ assert_analyzes_to( "muži", "muh" )
33
+ assert_analyzes_to( "muže", "muh" )
34
+ assert_analyzes_to( "mužů", "muh" )
35
+ assert_analyzes_to( "mužům", "muh" )
36
+ assert_analyzes_to( "mužích", "muh" )
37
+ assert_analyzes_to( "mužem", "muh" )
38
+
39
+ # inanimate ending with a soft consonant
40
+ assert_analyzes_to( "stroj", "stroj" )
41
+ assert_analyzes_to( "stroje", "stroj" )
42
+ assert_analyzes_to( "strojů", "stroj" )
43
+ assert_analyzes_to( "stroji", "stroj" )
44
+ assert_analyzes_to( "strojům", "stroj" )
45
+ assert_analyzes_to( "strojích", "stroj" )
46
+ assert_analyzes_to( "strojem", "stroj" )
47
+
48
+ # ending with a
49
+ assert_analyzes_to( "předseda", "předsd" )
50
+ assert_analyzes_to( "předsedové", "předsd" )
51
+ assert_analyzes_to( "předsedy", "předsd" )
52
+ assert_analyzes_to( "předsedů", "předsd" )
53
+ assert_analyzes_to( "předsedovi", "předsd" )
54
+ assert_analyzes_to( "předsedům", "předsd" )
55
+ assert_analyzes_to( "předsedu", "předsd" )
56
+ assert_analyzes_to( "předsedo", "předsd" )
57
+ assert_analyzes_to( "předsedech", "předsd" )
58
+ assert_analyzes_to( "předsedou", "předsd" )
59
+
60
+ # ending with e
61
+ assert_analyzes_to( "soudce", "soudk" )
62
+ assert_analyzes_to( "soudci", "soudk" )
63
+ assert_analyzes_to( "soudců", "soudk" )
64
+ assert_analyzes_to( "soudcům", "soudk" )
65
+ assert_analyzes_to( "soudcích", "soudk" )
66
+ assert_analyzes_to( "soudcem", "soudk" )
67
+ end
68
+
69
+ def test_feminine_nouns
70
+ # ending with hard consonant
71
+ assert_analyzes_to( "kost", "kost" )
72
+ assert_analyzes_to( "kosti", "kost" )
73
+ assert_analyzes_to( "kostí", "kost" )
74
+ assert_analyzes_to( "kostem", "kost" )
75
+ assert_analyzes_to( "kostech", "kost" )
76
+ assert_analyzes_to( "kostmi", "kost" )
77
+
78
+ # ending with a soft consonant
79
+ # note: in this example sing nom. and sing acc. don't conflate w/ the rest
80
+ assert_analyzes_to( "píseň", "písň" )
81
+ assert_analyzes_to( "písně", "písn" )
82
+ assert_analyzes_to( "písni", "písn" )
83
+ assert_analyzes_to( "písněmi", "písn" )
84
+ assert_analyzes_to( "písních", "písn" )
85
+ assert_analyzes_to( "písním", "písn" )
86
+
87
+ # ending with e
88
+ assert_analyzes_to( "růže", "růh" )
89
+ assert_analyzes_to( "růží", "růh" )
90
+ assert_analyzes_to( "růžím", "růh" )
91
+ assert_analyzes_to( "růžích", "růh" )
92
+ assert_analyzes_to( "růžemi", "růh" )
93
+ assert_analyzes_to( "růži", "růh" )
94
+
95
+ # ending with a
96
+ assert_analyzes_to( "žena", "žn" )
97
+ assert_analyzes_to( "ženy", "žn" )
98
+ assert_analyzes_to( "žen", "žn" )
99
+ assert_analyzes_to( "ženě", "žn" )
100
+ assert_analyzes_to( "ženám", "žn" )
101
+ assert_analyzes_to( "ženu", "žn" )
102
+ assert_analyzes_to( "ženo", "žn" )
103
+ assert_analyzes_to( "ženách", "žn" )
104
+ assert_analyzes_to( "ženou", "žn" )
105
+ assert_analyzes_to( "ženami", "žn" )
106
+
107
+ end
108
+
109
+ def test_neuter_nouns
110
+ # ending with o
111
+ assert_analyzes_to( "město", "měst" )
112
+ assert_analyzes_to( "města", "měst" )
113
+ assert_analyzes_to( "měst", "měst" )
114
+ assert_analyzes_to( "městu", "měst" )
115
+ assert_analyzes_to( "městům", "měst" )
116
+ assert_analyzes_to( "městě", "měst" )
117
+ assert_analyzes_to( "městech", "měst" )
118
+ assert_analyzes_to( "městem", "měst" )
119
+ assert_analyzes_to( "městy", "měst" )
120
+
121
+ # ending with e
122
+ assert_analyzes_to( "moře", "moř" )
123
+ assert_analyzes_to( "moří", "moř" )
124
+ assert_analyzes_to( "mořím", "moř" )
125
+ assert_analyzes_to( "moři", "moř" )
126
+ assert_analyzes_to( "mořích", "moř" )
127
+ assert_analyzes_to( "mořem", "moř" )
128
+
129
+ # ending with ě
130
+ assert_analyzes_to( "kuře", "kuř" )
131
+ assert_analyzes_to( "kuřata", "kuř" )
132
+ assert_analyzes_to( "kuřete", "kuř" )
133
+ assert_analyzes_to( "kuřat", "kuř" )
134
+ assert_analyzes_to( "kuřeti", "kuř" )
135
+ assert_analyzes_to( "kuřatům", "kuř" )
136
+ assert_analyzes_to( "kuřatech", "kuř" )
137
+ assert_analyzes_to( "kuřetem", "kuř" )
138
+ assert_analyzes_to( "kuřaty", "kuř" )
139
+
140
+ # ending with í
141
+ assert_analyzes_to( "stavení", "stavn" )
142
+ assert_analyzes_to( "stavením", "stavn" )
143
+ assert_analyzes_to( "staveních", "stavn" )
144
+ assert_analyzes_to( "staveními", "stavn" )
145
+
146
+ end
147
+
148
+ def test_adjectives
149
+ # ending with ý/á/é
150
+ assert_analyzes_to( "mladý", "mlad" )
151
+ assert_analyzes_to( "mladí", "mlad" )
152
+ assert_analyzes_to( "mladého", "mlad" )
153
+ assert_analyzes_to( "mladých", "mlad" )
154
+ assert_analyzes_to( "mladému", "mlad" )
155
+ assert_analyzes_to( "mladým", "mlad" )
156
+ assert_analyzes_to( "mladé", "mlad" )
157
+ assert_analyzes_to( "mladém", "mlad" )
158
+ assert_analyzes_to( "mladými", "mlad" )
159
+ assert_analyzes_to( "mladá", "mlad" )
160
+ assert_analyzes_to( "mladou", "mlad" )
161
+
162
+ # ending with í
163
+ assert_analyzes_to( "jarní", "jarn" )
164
+ assert_analyzes_to( "jarního", "jarn" )
165
+ assert_analyzes_to( "jarních", "jarn" )
166
+ assert_analyzes_to( "jarnímu", "jarn" )
167
+ assert_analyzes_to( "jarním", "jarn" )
168
+ assert_analyzes_to( "jarními", "jarn" )
169
+ end
170
+
171
+ def test_possessive
172
+ #lucene test case uses "Karlův"
173
+ assert_analyzes_to( "karlův", "karl" )
174
+ assert_analyzes_to( "jazykový", "jazyk" )
175
+ end
176
+
177
+ def test_exceptions
178
+ # rewrite of št -> sk
179
+ assert_analyzes_to( "český", "česk" )
180
+ assert_analyzes_to( "čeští", "česk" )
181
+
182
+ # rewrite of čt -> ck
183
+ assert_analyzes_to( "anglický", "anglick" )
184
+ assert_analyzes_to( "angličtí", "anglick" )
185
+
186
+ # rewrite of z -> h
187
+ assert_analyzes_to( "kniha", "knih" )
188
+ assert_analyzes_to( "knize", "knih" )
189
+
190
+ # rewrite of ž -> h
191
+ assert_analyzes_to( "mazat", "mah" )
192
+ assert_analyzes_to( "mažu", "mah" )
193
+
194
+ # rewrite of c -> k
195
+ assert_analyzes_to( "kluk", "kluk" )
196
+ assert_analyzes_to( "kluci", "kluk" )
197
+ assert_analyzes_to( "klucích", "kluk" )
198
+
199
+ # rewrite of č -> k
200
+ assert_analyzes_to( "hezký", "hezk" )
201
+ assert_analyzes_to( "hezčí", "hezk" )
202
+
203
+ # rewrite of *ů* -> *o*
204
+ assert_analyzes_to( "hůl", "hol" )
205
+ assert_analyzes_to( "hole", "hol" )
206
+
207
+ # rewrite of e* -> *
208
+ assert_analyzes_to( "deska", "desk" )
209
+ assert_analyzes_to( "desek", "desk" )
210
+ end
211
+
212
+ def test_dont_stem
213
+ assert_analyzes_to( "e", "e" )
214
+ assert_analyzes_to( "zi", "zi" )
215
+ end
216
+
217
+ def assert_analyzes_to word, stem
218
+ assert_equal(stem, CzechStemmer.stem(word))
219
+ end
220
+
221
+ end
metadata ADDED
@@ -0,0 +1,130 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: czech-stemmer
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.0
5
+ platform: ruby
6
+ authors:
7
+ - Ondrej Odchazel
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2014-06-24 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: shoulda
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - '>='
18
+ - !ruby/object:Gem::Version
19
+ version: '0'
20
+ type: :development
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - '>='
25
+ - !ruby/object:Gem::Version
26
+ version: '0'
27
+ - !ruby/object:Gem::Dependency
28
+ name: rdoc
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - '>='
32
+ - !ruby/object:Gem::Version
33
+ version: '0'
34
+ type: :development
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - '>='
39
+ - !ruby/object:Gem::Version
40
+ version: '0'
41
+ - !ruby/object:Gem::Dependency
42
+ name: bundler
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - ~>
46
+ - !ruby/object:Gem::Version
47
+ version: '1.0'
48
+ type: :development
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - ~>
53
+ - !ruby/object:Gem::Version
54
+ version: '1.0'
55
+ - !ruby/object:Gem::Dependency
56
+ name: jeweler
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - ~>
60
+ - !ruby/object:Gem::Version
61
+ version: 2.0.1
62
+ type: :development
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - ~>
67
+ - !ruby/object:Gem::Version
68
+ version: 2.0.1
69
+ - !ruby/object:Gem::Dependency
70
+ name: simplecov
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - '>='
74
+ - !ruby/object:Gem::Version
75
+ version: '0'
76
+ type: :development
77
+ prerelease: false
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - '>='
81
+ - !ruby/object:Gem::Version
82
+ version: '0'
83
+ description: Based on the Lucene implementation
84
+ email: hypertornado@gmail.com
85
+ executables: []
86
+ extensions: []
87
+ extra_rdoc_files:
88
+ - LICENSE.txt
89
+ - README.markdown
90
+ files:
91
+ - .document
92
+ - Gemfile
93
+ - Gemfile.lock
94
+ - LICENSE.txt
95
+ - README.markdown
96
+ - Rakefile
97
+ - VERSION
98
+ - czech-stemmer.gemspec
99
+ - lib/czech-stemmer.rb
100
+ - test/CzechStemmer.java
101
+ - test/TestCzechStemmer.java
102
+ - test/TestCzechStemmer.java.txt
103
+ - test/helper.rb
104
+ - test/java_test_converter.bash
105
+ - test/test_czech-stemmer.rb
106
+ homepage: http://github.com/hypertornado/czech-stemmer
107
+ licenses:
108
+ - MIT
109
+ metadata: {}
110
+ post_install_message:
111
+ rdoc_options: []
112
+ require_paths:
113
+ - lib
114
+ required_ruby_version: !ruby/object:Gem::Requirement
115
+ requirements:
116
+ - - '>='
117
+ - !ruby/object:Gem::Version
118
+ version: '0'
119
+ required_rubygems_version: !ruby/object:Gem::Requirement
120
+ requirements:
121
+ - - '>='
122
+ - !ruby/object:Gem::Version
123
+ version: '0'
124
+ requirements: []
125
+ rubyforge_project:
126
+ rubygems_version: 2.0.14
127
+ signing_key:
128
+ specification_version: 4
129
+ summary: Ruby port of the Czech stemmer in Lucene
130
+ test_files: []