czech-stemmer 0.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.document +5 -0
- data/Gemfile +9 -0
- data/Gemfile.lock +80 -0
- data/LICENSE.txt +20 -0
- data/README.markdown +10 -0
- data/Rakefile +51 -0
- data/VERSION +1 -0
- data/czech-stemmer.gemspec +66 -0
- data/lib/czech-stemmer.rb +125 -0
- data/test/CzechStemmer.java +173 -0
- data/test/TestCzechStemmer.java +300 -0
- data/test/TestCzechStemmer.java.txt +300 -0
- data/test/helper.rb +2 -0
- data/test/java_test_converter.bash +7 -0
- data/test/test_czech-stemmer.rb +221 -0
- metadata +130 -0
@@ -0,0 +1,221 @@
|
|
1
|
+
require 'helper'
|
2
|
+
|
3
|
+
|
4
|
+
# Regression tests for CzechStemmer.stem.
#
# Each assertion pairs an inflected Czech word with the stem the stemmer is
# expected to return. The expected values are the stemmer's normalized output,
# not dictionary lemmas (for example "muž" is expected to yield "muh" and
# "předseda" is expected to yield "předsd").
class TestStemmer < Minitest::Test
  # Declension paradigms of masculine nouns.
  def test_masculine_nouns
    # animate ending with a hard consonant
    assert_analyzes_to("pán", "pán")
    assert_analyzes_to("páni", "pán")
    assert_analyzes_to("pánové", "pán")
    assert_analyzes_to("pána", "pán")
    assert_analyzes_to("pánů", "pán")
    assert_analyzes_to("pánovi", "pán")
    assert_analyzes_to("pánům", "pán")
    assert_analyzes_to("pány", "pán")
    assert_analyzes_to("páne", "pán")
    assert_analyzes_to("pánech", "pán")
    assert_analyzes_to("pánem", "pán")

    # inanimate ending with hard consonant
    assert_analyzes_to("hrad", "hrad")
    assert_analyzes_to("hradu", "hrad")
    assert_analyzes_to("hrade", "hrad")
    assert_analyzes_to("hradem", "hrad")
    assert_analyzes_to("hrady", "hrad")
    assert_analyzes_to("hradech", "hrad")
    assert_analyzes_to("hradům", "hrad")
    assert_analyzes_to("hradů", "hrad")

    # animate ending with a soft consonant
    assert_analyzes_to("muž", "muh")
    assert_analyzes_to("muži", "muh")
    assert_analyzes_to("muže", "muh")
    assert_analyzes_to("mužů", "muh")
    assert_analyzes_to("mužům", "muh")
    assert_analyzes_to("mužích", "muh")
    assert_analyzes_to("mužem", "muh")

    # inanimate ending with a soft consonant
    assert_analyzes_to("stroj", "stroj")
    assert_analyzes_to("stroje", "stroj")
    assert_analyzes_to("strojů", "stroj")
    assert_analyzes_to("stroji", "stroj")
    assert_analyzes_to("strojům", "stroj")
    assert_analyzes_to("strojích", "stroj")
    assert_analyzes_to("strojem", "stroj")

    # ending with a
    assert_analyzes_to("předseda", "předsd")
    assert_analyzes_to("předsedové", "předsd")
    assert_analyzes_to("předsedy", "předsd")
    assert_analyzes_to("předsedů", "předsd")
    assert_analyzes_to("předsedovi", "předsd")
    assert_analyzes_to("předsedům", "předsd")
    assert_analyzes_to("předsedu", "předsd")
    assert_analyzes_to("předsedo", "předsd")
    assert_analyzes_to("předsedech", "předsd")
    assert_analyzes_to("předsedou", "předsd")

    # ending with e
    assert_analyzes_to("soudce", "soudk")
    assert_analyzes_to("soudci", "soudk")
    assert_analyzes_to("soudců", "soudk")
    assert_analyzes_to("soudcům", "soudk")
    assert_analyzes_to("soudcích", "soudk")
    assert_analyzes_to("soudcem", "soudk")
  end

  # Declension paradigms of feminine nouns.
  def test_feminine_nouns
    # ending with hard consonant
    assert_analyzes_to("kost", "kost")
    assert_analyzes_to("kosti", "kost")
    assert_analyzes_to("kostí", "kost")
    assert_analyzes_to("kostem", "kost")
    assert_analyzes_to("kostech", "kost")
    assert_analyzes_to("kostmi", "kost")

    # ending with a soft consonant
    # note: in this example sing nom. and sing acc. don't conflate w/ the rest
    assert_analyzes_to("píseň", "písň")
    assert_analyzes_to("písně", "písn")
    assert_analyzes_to("písni", "písn")
    assert_analyzes_to("písněmi", "písn")
    assert_analyzes_to("písních", "písn")
    assert_analyzes_to("písním", "písn")

    # ending with e
    assert_analyzes_to("růže", "růh")
    assert_analyzes_to("růží", "růh")
    assert_analyzes_to("růžím", "růh")
    assert_analyzes_to("růžích", "růh")
    assert_analyzes_to("růžemi", "růh")
    assert_analyzes_to("růži", "růh")

    # ending with a
    assert_analyzes_to("žena", "žn")
    assert_analyzes_to("ženy", "žn")
    assert_analyzes_to("žen", "žn")
    assert_analyzes_to("ženě", "žn")
    assert_analyzes_to("ženám", "žn")
    assert_analyzes_to("ženu", "žn")
    assert_analyzes_to("ženo", "žn")
    assert_analyzes_to("ženách", "žn")
    assert_analyzes_to("ženou", "žn")
    assert_analyzes_to("ženami", "žn")
  end

  # Declension paradigms of neuter nouns.
  def test_neuter_nouns
    # ending with o
    assert_analyzes_to("město", "měst")
    assert_analyzes_to("města", "měst")
    assert_analyzes_to("měst", "měst")
    assert_analyzes_to("městu", "měst")
    assert_analyzes_to("městům", "měst")
    assert_analyzes_to("městě", "měst")
    assert_analyzes_to("městech", "měst")
    assert_analyzes_to("městem", "měst")
    assert_analyzes_to("městy", "měst")

    # ending with e
    assert_analyzes_to("moře", "moř")
    assert_analyzes_to("moří", "moř")
    assert_analyzes_to("mořím", "moř")
    assert_analyzes_to("moři", "moř")
    assert_analyzes_to("mořích", "moř")
    assert_analyzes_to("mořem", "moř")

    # ending with ě
    assert_analyzes_to("kuře", "kuř")
    assert_analyzes_to("kuřata", "kuř")
    assert_analyzes_to("kuřete", "kuř")
    assert_analyzes_to("kuřat", "kuř")
    assert_analyzes_to("kuřeti", "kuř")
    assert_analyzes_to("kuřatům", "kuř")
    assert_analyzes_to("kuřatech", "kuř")
    assert_analyzes_to("kuřetem", "kuř")
    assert_analyzes_to("kuřaty", "kuř")

    # ending with í
    assert_analyzes_to("stavení", "stavn")
    assert_analyzes_to("stavením", "stavn")
    assert_analyzes_to("staveních", "stavn")
    assert_analyzes_to("staveními", "stavn")
  end

  # Declension paradigms of adjectives.
  def test_adjectives
    # ending with ý/á/é
    assert_analyzes_to("mladý", "mlad")
    assert_analyzes_to("mladí", "mlad")
    assert_analyzes_to("mladého", "mlad")
    assert_analyzes_to("mladých", "mlad")
    assert_analyzes_to("mladému", "mlad")
    assert_analyzes_to("mladým", "mlad")
    assert_analyzes_to("mladé", "mlad")
    assert_analyzes_to("mladém", "mlad")
    assert_analyzes_to("mladými", "mlad")
    assert_analyzes_to("mladá", "mlad")
    assert_analyzes_to("mladou", "mlad")

    # ending with í
    assert_analyzes_to("jarní", "jarn")
    assert_analyzes_to("jarního", "jarn")
    assert_analyzes_to("jarních", "jarn")
    assert_analyzes_to("jarnímu", "jarn")
    assert_analyzes_to("jarním", "jarn")
    assert_analyzes_to("jarními", "jarn")
  end

  # Possessive forms.
  def test_possessive
    # The Lucene test case uses the capitalized "Karlův"; the lowercase form
    # is used as input here.
    assert_analyzes_to("karlův", "karl")
    assert_analyzes_to("jazykový", "jazyk")
  end

  # Word pairs whose expected stems only coincide after a consonant or vowel
  # rewrite.
  def test_exceptions
    # rewrite of št -> sk
    assert_analyzes_to("český", "česk")
    assert_analyzes_to("čeští", "česk")

    # rewrite of čt -> ck
    assert_analyzes_to("anglický", "anglick")
    assert_analyzes_to("angličtí", "anglick")

    # rewrite of z -> h
    assert_analyzes_to("kniha", "knih")
    assert_analyzes_to("knize", "knih")

    # rewrite of ž -> h
    assert_analyzes_to("mazat", "mah")
    assert_analyzes_to("mažu", "mah")

    # rewrite of c -> k
    assert_analyzes_to("kluk", "kluk")
    assert_analyzes_to("kluci", "kluk")
    assert_analyzes_to("klucích", "kluk")

    # rewrite of č -> k
    assert_analyzes_to("hezký", "hezk")
    assert_analyzes_to("hezčí", "hezk")

    # rewrite of *ů* -> *o*
    assert_analyzes_to("hůl", "hol")
    assert_analyzes_to("hole", "hol")

    # rewrite of e* -> *
    assert_analyzes_to("deska", "desk")
    assert_analyzes_to("desek", "desk")
  end

  # Very short inputs are expected to come back unchanged.
  def test_dont_stem
    assert_analyzes_to("e", "e")
    assert_analyzes_to("zi", "zi")
  end

  # Asserts that CzechStemmer.stem(word) equals the expected stem.
  #
  # word - the String passed to the stemmer.
  # stem - the String the stemmer is expected to return.
  #
  # The failure message names the input word so that a failing check can be
  # identified without looking up the assertion's line.
  def assert_analyzes_to(word, stem)
    assert_equal(stem, CzechStemmer.stem(word), "unexpected stem for #{word.inspect}")
  end
end
|
metadata
ADDED
@@ -0,0 +1,130 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: czech-stemmer
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.0
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Ondrej Odchazel
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
date: 2014-06-24 00:00:00.000000000 Z
|
12
|
+
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: shoulda
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - '>='
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: '0'
|
20
|
+
type: :development
|
21
|
+
prerelease: false
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - '>='
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: '0'
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: rdoc
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - '>='
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: '0'
|
34
|
+
type: :development
|
35
|
+
prerelease: false
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - '>='
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: '0'
|
41
|
+
- !ruby/object:Gem::Dependency
|
42
|
+
name: bundler
|
43
|
+
requirement: !ruby/object:Gem::Requirement
|
44
|
+
requirements:
|
45
|
+
- - ~>
|
46
|
+
- !ruby/object:Gem::Version
|
47
|
+
version: '1.0'
|
48
|
+
type: :development
|
49
|
+
prerelease: false
|
50
|
+
version_requirements: !ruby/object:Gem::Requirement
|
51
|
+
requirements:
|
52
|
+
- - ~>
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: '1.0'
|
55
|
+
- !ruby/object:Gem::Dependency
|
56
|
+
name: jeweler
|
57
|
+
requirement: !ruby/object:Gem::Requirement
|
58
|
+
requirements:
|
59
|
+
- - ~>
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: 2.0.1
|
62
|
+
type: :development
|
63
|
+
prerelease: false
|
64
|
+
version_requirements: !ruby/object:Gem::Requirement
|
65
|
+
requirements:
|
66
|
+
- - ~>
|
67
|
+
- !ruby/object:Gem::Version
|
68
|
+
version: 2.0.1
|
69
|
+
- !ruby/object:Gem::Dependency
|
70
|
+
name: simplecov
|
71
|
+
requirement: !ruby/object:Gem::Requirement
|
72
|
+
requirements:
|
73
|
+
- - '>='
|
74
|
+
- !ruby/object:Gem::Version
|
75
|
+
version: '0'
|
76
|
+
type: :development
|
77
|
+
prerelease: false
|
78
|
+
version_requirements: !ruby/object:Gem::Requirement
|
79
|
+
requirements:
|
80
|
+
- - '>='
|
81
|
+
- !ruby/object:Gem::Version
|
82
|
+
version: '0'
|
83
|
+
description: Based on Lucene implementation
|
84
|
+
email: hypertornado@gmail.com
|
85
|
+
executables: []
|
86
|
+
extensions: []
|
87
|
+
extra_rdoc_files:
|
88
|
+
- LICENSE.txt
|
89
|
+
- README.markdown
|
90
|
+
files:
|
91
|
+
- .document
|
92
|
+
- Gemfile
|
93
|
+
- Gemfile.lock
|
94
|
+
- LICENSE.txt
|
95
|
+
- README.markdown
|
96
|
+
- Rakefile
|
97
|
+
- VERSION
|
98
|
+
- czech-stemmer.gemspec
|
99
|
+
- lib/czech-stemmer.rb
|
100
|
+
- test/CzechStemmer.java
|
101
|
+
- test/TestCzechStemmer.java
|
102
|
+
- test/TestCzechStemmer.java.txt
|
103
|
+
- test/helper.rb
|
104
|
+
- test/java_test_converter.bash
|
105
|
+
- test/test_czech-stemmer.rb
|
106
|
+
homepage: http://github.com/hypertornado/czech-stemmer
|
107
|
+
licenses:
|
108
|
+
- MIT
|
109
|
+
metadata: {}
|
110
|
+
post_install_message:
|
111
|
+
rdoc_options: []
|
112
|
+
require_paths:
|
113
|
+
- lib
|
114
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
115
|
+
requirements:
|
116
|
+
- - '>='
|
117
|
+
- !ruby/object:Gem::Version
|
118
|
+
version: '0'
|
119
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
120
|
+
requirements:
|
121
|
+
- - '>='
|
122
|
+
- !ruby/object:Gem::Version
|
123
|
+
version: '0'
|
124
|
+
requirements: []
|
125
|
+
rubyforge_project:
|
126
|
+
rubygems_version: 2.0.14
|
127
|
+
signing_key:
|
128
|
+
specification_version: 4
|
129
|
+
summary: Ruby port of czech stemmer in Lucene
|
130
|
+
test_files: []
|