czech-stemmer 0.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,221 @@
1
+ require 'helper'
2
+
3
+
4
+ class TestStemmer < Minitest::Test
5
+
6
+ def test_masculine_nouns
7
+ # animate ending with a hard consonant
8
+ assert_analyzes_to( "pán", "pán" )
9
+ assert_analyzes_to( "páni", "pán" )
10
+ assert_analyzes_to( "pánové", "pán" )
11
+ assert_analyzes_to( "pána", "pán" )
12
+ assert_analyzes_to( "pánů", "pán" )
13
+ assert_analyzes_to( "pánovi", "pán" )
14
+ assert_analyzes_to( "pánům", "pán" )
15
+ assert_analyzes_to( "pány", "pán" )
16
+ assert_analyzes_to( "páne", "pán" )
17
+ assert_analyzes_to( "pánech", "pán" )
18
+ assert_analyzes_to( "pánem", "pán" )
19
+
20
+ # inanimate ending with hard consonant
21
+ assert_analyzes_to( "hrad", "hrad" )
22
+ assert_analyzes_to( "hradu", "hrad" )
23
+ assert_analyzes_to( "hrade", "hrad" )
24
+ assert_analyzes_to( "hradem", "hrad" )
25
+ assert_analyzes_to( "hrady", "hrad" )
26
+ assert_analyzes_to( "hradech", "hrad" )
27
+ assert_analyzes_to( "hradům", "hrad" )
28
+ assert_analyzes_to( "hradů", "hrad" )
29
+
30
+ # animate ending with a soft consonant
31
+ assert_analyzes_to( "muž", "muh" )
32
+ assert_analyzes_to( "muži", "muh" )
33
+ assert_analyzes_to( "muže", "muh" )
34
+ assert_analyzes_to( "mužů", "muh" )
35
+ assert_analyzes_to( "mužům", "muh" )
36
+ assert_analyzes_to( "mužích", "muh" )
37
+ assert_analyzes_to( "mužem", "muh" )
38
+
39
+ # inanimate ending with a soft consonant
40
+ assert_analyzes_to( "stroj", "stroj" )
41
+ assert_analyzes_to( "stroje", "stroj" )
42
+ assert_analyzes_to( "strojů", "stroj" )
43
+ assert_analyzes_to( "stroji", "stroj" )
44
+ assert_analyzes_to( "strojům", "stroj" )
45
+ assert_analyzes_to( "strojích", "stroj" )
46
+ assert_analyzes_to( "strojem", "stroj" )
47
+
48
+ # ending with a
49
+ assert_analyzes_to( "předseda", "předsd" )
50
+ assert_analyzes_to( "předsedové", "předsd" )
51
+ assert_analyzes_to( "předsedy", "předsd" )
52
+ assert_analyzes_to( "předsedů", "předsd" )
53
+ assert_analyzes_to( "předsedovi", "předsd" )
54
+ assert_analyzes_to( "předsedům", "předsd" )
55
+ assert_analyzes_to( "předsedu", "předsd" )
56
+ assert_analyzes_to( "předsedo", "předsd" )
57
+ assert_analyzes_to( "předsedech", "předsd" )
58
+ assert_analyzes_to( "předsedou", "předsd" )
59
+
60
+ # ending with e
61
+ assert_analyzes_to( "soudce", "soudk" )
62
+ assert_analyzes_to( "soudci", "soudk" )
63
+ assert_analyzes_to( "soudců", "soudk" )
64
+ assert_analyzes_to( "soudcům", "soudk" )
65
+ assert_analyzes_to( "soudcích", "soudk" )
66
+ assert_analyzes_to( "soudcem", "soudk" )
67
+ end
68
+
69
+ def test_feminine_nouns
70
+ # ending with hard consonant
71
+ assert_analyzes_to( "kost", "kost" )
72
+ assert_analyzes_to( "kosti", "kost" )
73
+ assert_analyzes_to( "kostí", "kost" )
74
+ assert_analyzes_to( "kostem", "kost" )
75
+ assert_analyzes_to( "kostech", "kost" )
76
+ assert_analyzes_to( "kostmi", "kost" )
77
+
78
+ # ending with a soft consonant
79
+ # note: in this example the singular nominative and singular accusative do not conflate with the other forms
80
+ assert_analyzes_to( "píseň", "písň" )
81
+ assert_analyzes_to( "písně", "písn" )
82
+ assert_analyzes_to( "písni", "písn" )
83
+ assert_analyzes_to( "písněmi", "písn" )
84
+ assert_analyzes_to( "písních", "písn" )
85
+ assert_analyzes_to( "písním", "písn" )
86
+
87
+ # ending with e
88
+ assert_analyzes_to( "růže", "růh" )
89
+ assert_analyzes_to( "růží", "růh" )
90
+ assert_analyzes_to( "růžím", "růh" )
91
+ assert_analyzes_to( "růžích", "růh" )
92
+ assert_analyzes_to( "růžemi", "růh" )
93
+ assert_analyzes_to( "růži", "růh" )
94
+
95
+ # ending with a
96
+ assert_analyzes_to( "žena", "žn" )
97
+ assert_analyzes_to( "ženy", "žn" )
98
+ assert_analyzes_to( "žen", "žn" )
99
+ assert_analyzes_to( "ženě", "žn" )
100
+ assert_analyzes_to( "ženám", "žn" )
101
+ assert_analyzes_to( "ženu", "žn" )
102
+ assert_analyzes_to( "ženo", "žn" )
103
+ assert_analyzes_to( "ženách", "žn" )
104
+ assert_analyzes_to( "ženou", "žn" )
105
+ assert_analyzes_to( "ženami", "žn" )
106
+
107
+ end
108
+
109
+ def test_neuter_nouns
110
+ # ending with o
111
+ assert_analyzes_to( "město", "měst" )
112
+ assert_analyzes_to( "města", "měst" )
113
+ assert_analyzes_to( "měst", "měst" )
114
+ assert_analyzes_to( "městu", "měst" )
115
+ assert_analyzes_to( "městům", "měst" )
116
+ assert_analyzes_to( "městě", "měst" )
117
+ assert_analyzes_to( "městech", "měst" )
118
+ assert_analyzes_to( "městem", "měst" )
119
+ assert_analyzes_to( "městy", "měst" )
120
+
121
+ # ending with e
122
+ assert_analyzes_to( "moře", "moř" )
123
+ assert_analyzes_to( "moří", "moř" )
124
+ assert_analyzes_to( "mořím", "moř" )
125
+ assert_analyzes_to( "moři", "moř" )
126
+ assert_analyzes_to( "mořích", "moř" )
127
+ assert_analyzes_to( "mořem", "moř" )
128
+
129
+ # ending with ě
130
+ assert_analyzes_to( "kuře", "kuř" )
131
+ assert_analyzes_to( "kuřata", "kuř" )
132
+ assert_analyzes_to( "kuřete", "kuř" )
133
+ assert_analyzes_to( "kuřat", "kuř" )
134
+ assert_analyzes_to( "kuřeti", "kuř" )
135
+ assert_analyzes_to( "kuřatům", "kuř" )
136
+ assert_analyzes_to( "kuřatech", "kuř" )
137
+ assert_analyzes_to( "kuřetem", "kuř" )
138
+ assert_analyzes_to( "kuřaty", "kuř" )
139
+
140
+ # ending with í
141
+ assert_analyzes_to( "stavení", "stavn" )
142
+ assert_analyzes_to( "stavením", "stavn" )
143
+ assert_analyzes_to( "staveních", "stavn" )
144
+ assert_analyzes_to( "staveními", "stavn" )
145
+
146
+ end
147
+
148
+ def test_adjectives
149
+ # ending with ý/á/é
150
+ assert_analyzes_to( "mladý", "mlad" )
151
+ assert_analyzes_to( "mladí", "mlad" )
152
+ assert_analyzes_to( "mladého", "mlad" )
153
+ assert_analyzes_to( "mladých", "mlad" )
154
+ assert_analyzes_to( "mladému", "mlad" )
155
+ assert_analyzes_to( "mladým", "mlad" )
156
+ assert_analyzes_to( "mladé", "mlad" )
157
+ assert_analyzes_to( "mladém", "mlad" )
158
+ assert_analyzes_to( "mladými", "mlad" )
159
+ assert_analyzes_to( "mladá", "mlad" )
160
+ assert_analyzes_to( "mladou", "mlad" )
161
+
162
+ # ending with í
163
+ assert_analyzes_to( "jarní", "jarn" )
164
+ assert_analyzes_to( "jarního", "jarn" )
165
+ assert_analyzes_to( "jarních", "jarn" )
166
+ assert_analyzes_to( "jarnímu", "jarn" )
167
+ assert_analyzes_to( "jarním", "jarn" )
168
+ assert_analyzes_to( "jarními", "jarn" )
169
+ end
170
+
171
+ def test_possessive
172
+ # the Lucene test case uses "Karlův"; the lowercase form is used here
173
+ assert_analyzes_to( "karlův", "karl" )
174
+ assert_analyzes_to( "jazykový", "jazyk" )
175
+ end
176
+
177
+ def test_exceptions
178
+ # rewrite of št -> sk
179
+ assert_analyzes_to( "český", "česk" )
180
+ assert_analyzes_to( "čeští", "česk" )
181
+
182
+ # rewrite of čt -> ck
183
+ assert_analyzes_to( "anglický", "anglick" )
184
+ assert_analyzes_to( "angličtí", "anglick" )
185
+
186
+ # rewrite of z -> h
187
+ assert_analyzes_to( "kniha", "knih" )
188
+ assert_analyzes_to( "knize", "knih" )
189
+
190
+ # rewrite of ž -> h
191
+ assert_analyzes_to( "mazat", "mah" )
192
+ assert_analyzes_to( "mažu", "mah" )
193
+
194
+ # rewrite of c -> k
195
+ assert_analyzes_to( "kluk", "kluk" )
196
+ assert_analyzes_to( "kluci", "kluk" )
197
+ assert_analyzes_to( "klucích", "kluk" )
198
+
199
+ # rewrite of č -> k
200
+ assert_analyzes_to( "hezký", "hezk" )
201
+ assert_analyzes_to( "hezčí", "hezk" )
202
+
203
+ # rewrite of *ů* -> *o*
204
+ assert_analyzes_to( "hůl", "hol" )
205
+ assert_analyzes_to( "hole", "hol" )
206
+
207
+ # rewrite of e* -> *
208
+ assert_analyzes_to( "deska", "desk" )
209
+ assert_analyzes_to( "desek", "desk" )
210
+ end
211
+
212
+ def test_dont_stem
213
+ assert_analyzes_to( "e", "e" )
214
+ assert_analyzes_to( "zi", "zi" )
215
+ end
216
+
217
+ def assert_analyzes_to word, stem
218
+ assert_equal(stem, CzechStemmer.stem(word))
219
+ end
220
+
221
+ end
metadata ADDED
@@ -0,0 +1,130 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: czech-stemmer
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.0
5
+ platform: ruby
6
+ authors:
7
+ - Ondrej Odchazel
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2014-06-24 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: shoulda
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - '>='
18
+ - !ruby/object:Gem::Version
19
+ version: '0'
20
+ type: :development
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - '>='
25
+ - !ruby/object:Gem::Version
26
+ version: '0'
27
+ - !ruby/object:Gem::Dependency
28
+ name: rdoc
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - '>='
32
+ - !ruby/object:Gem::Version
33
+ version: '0'
34
+ type: :development
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - '>='
39
+ - !ruby/object:Gem::Version
40
+ version: '0'
41
+ - !ruby/object:Gem::Dependency
42
+ name: bundler
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - ~>
46
+ - !ruby/object:Gem::Version
47
+ version: '1.0'
48
+ type: :development
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - ~>
53
+ - !ruby/object:Gem::Version
54
+ version: '1.0'
55
+ - !ruby/object:Gem::Dependency
56
+ name: jeweler
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - ~>
60
+ - !ruby/object:Gem::Version
61
+ version: 2.0.1
62
+ type: :development
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - ~>
67
+ - !ruby/object:Gem::Version
68
+ version: 2.0.1
69
+ - !ruby/object:Gem::Dependency
70
+ name: simplecov
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - '>='
74
+ - !ruby/object:Gem::Version
75
+ version: '0'
76
+ type: :development
77
+ prerelease: false
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - '>='
81
+ - !ruby/object:Gem::Version
82
+ version: '0'
83
+ description: Based on Lucene implementation
84
+ email: hypertornado@gmail.com
85
+ executables: []
86
+ extensions: []
87
+ extra_rdoc_files:
88
+ - LICENSE.txt
89
+ - README.markdown
90
+ files:
91
+ - .document
92
+ - Gemfile
93
+ - Gemfile.lock
94
+ - LICENSE.txt
95
+ - README.markdown
96
+ - Rakefile
97
+ - VERSION
98
+ - czech-stemmer.gemspec
99
+ - lib/czech-stemmer.rb
100
+ - test/CzechStemmer.java
101
+ - test/TestCzechStemmer.java
102
+ - test/TestCzechStemmer.java.txt
103
+ - test/helper.rb
104
+ - test/java_test_converter.bash
105
+ - test/test_czech-stemmer.rb
106
+ homepage: http://github.com/hypertornado/czech-stemmer
107
+ licenses:
108
+ - MIT
109
+ metadata: {}
110
+ post_install_message:
111
+ rdoc_options: []
112
+ require_paths:
113
+ - lib
114
+ required_ruby_version: !ruby/object:Gem::Requirement
115
+ requirements:
116
+ - - '>='
117
+ - !ruby/object:Gem::Version
118
+ version: '0'
119
+ required_rubygems_version: !ruby/object:Gem::Requirement
120
+ requirements:
121
+ - - '>='
122
+ - !ruby/object:Gem::Version
123
+ version: '0'
124
+ requirements: []
125
+ rubyforge_project:
126
+ rubygems_version: 2.0.14
127
+ signing_key:
128
+ specification_version: 4
129
+ summary: Ruby port of the Czech stemmer in Lucene
130
+ test_files: []