czech-stemmer 0.0.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.document +5 -0
- data/Gemfile +9 -0
- data/Gemfile.lock +80 -0
- data/LICENSE.txt +20 -0
- data/README.markdown +10 -0
- data/Rakefile +51 -0
- data/VERSION +1 -0
- data/czech-stemmer.gemspec +66 -0
- data/lib/czech-stemmer.rb +125 -0
- data/test/CzechStemmer.java +173 -0
- data/test/TestCzechStemmer.java +300 -0
- data/test/TestCzechStemmer.java.txt +300 -0
- data/test/helper.rb +2 -0
- data/test/java_test_converter.bash +7 -0
- data/test/test_czech-stemmer.rb +221 -0
- metadata +130 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: 9ea7bcfa41da627a5df410f7c640b1020b1eaa49
|
4
|
+
data.tar.gz: d56383fc4cfb27fafcb517c5ef6f1ccc3a4ab43b
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 3b961a0aecdb79bb04e4b9f2fc97d10852dd2fe3eaf39f5b117e69f08602fd0841d2850a42e60d45687613c1ba90602960a5e67fa737bebded7a5cfd10f779ca
|
7
|
+
data.tar.gz: 6688dc4eae8a91c1af12cb2c0bad5e7907af9f4140718c22a4038c11b51555172bb85f54b78c04efaba9fae8cba5ba6ad56f9f84f66978adcc2faea0d697fea8
|
data/.document
ADDED
data/Gemfile
ADDED
data/Gemfile.lock
ADDED
@@ -0,0 +1,80 @@
|
|
1
|
+
GEM
|
2
|
+
remote: http://rubygems.org/
|
3
|
+
specs:
|
4
|
+
activesupport (4.1.1)
|
5
|
+
i18n (~> 0.6, >= 0.6.9)
|
6
|
+
json (~> 1.7, >= 1.7.7)
|
7
|
+
minitest (~> 5.1)
|
8
|
+
thread_safe (~> 0.1)
|
9
|
+
tzinfo (~> 1.1)
|
10
|
+
addressable (2.3.6)
|
11
|
+
builder (3.2.2)
|
12
|
+
descendants_tracker (0.0.4)
|
13
|
+
thread_safe (~> 0.3, >= 0.3.1)
|
14
|
+
docile (1.1.5)
|
15
|
+
faraday (0.9.0)
|
16
|
+
multipart-post (>= 1.2, < 3)
|
17
|
+
git (1.2.7)
|
18
|
+
github_api (0.11.3)
|
19
|
+
addressable (~> 2.3)
|
20
|
+
descendants_tracker (~> 0.0.1)
|
21
|
+
faraday (~> 0.8, < 0.10)
|
22
|
+
hashie (>= 1.2)
|
23
|
+
multi_json (>= 1.7.5, < 2.0)
|
24
|
+
nokogiri (~> 1.6.0)
|
25
|
+
oauth2
|
26
|
+
hashie (3.0.0)
|
27
|
+
highline (1.6.21)
|
28
|
+
i18n (0.6.9)
|
29
|
+
jeweler (2.0.1)
|
30
|
+
builder
|
31
|
+
bundler (>= 1.0)
|
32
|
+
git (>= 1.2.5)
|
33
|
+
github_api
|
34
|
+
highline (>= 1.6.15)
|
35
|
+
nokogiri (>= 1.5.10)
|
36
|
+
rake
|
37
|
+
rdoc
|
38
|
+
json (1.8.1)
|
39
|
+
jwt (1.0.0)
|
40
|
+
mini_portile (0.6.0)
|
41
|
+
minitest (5.3.5)
|
42
|
+
multi_json (1.10.1)
|
43
|
+
multi_xml (0.5.5)
|
44
|
+
multipart-post (2.0.0)
|
45
|
+
nokogiri (1.6.2.1)
|
46
|
+
mini_portile (= 0.6.0)
|
47
|
+
oauth2 (0.9.4)
|
48
|
+
faraday (>= 0.8, < 0.10)
|
49
|
+
jwt (~> 1.0)
|
50
|
+
multi_json (~> 1.3)
|
51
|
+
multi_xml (~> 0.5)
|
52
|
+
rack (~> 1.2)
|
53
|
+
rack (1.5.2)
|
54
|
+
rake (10.3.2)
|
55
|
+
rdoc (4.1.1)
|
56
|
+
json (~> 1.4)
|
57
|
+
shoulda (3.5.0)
|
58
|
+
shoulda-context (~> 1.0, >= 1.0.1)
|
59
|
+
shoulda-matchers (>= 1.4.1, < 3.0)
|
60
|
+
shoulda-context (1.2.1)
|
61
|
+
shoulda-matchers (2.6.1)
|
62
|
+
activesupport (>= 3.0.0)
|
63
|
+
simplecov (0.8.2)
|
64
|
+
docile (~> 1.1.0)
|
65
|
+
multi_json
|
66
|
+
simplecov-html (~> 0.8.0)
|
67
|
+
simplecov-html (0.8.0)
|
68
|
+
thread_safe (0.3.4)
|
69
|
+
tzinfo (1.2.1)
|
70
|
+
thread_safe (~> 0.1)
|
71
|
+
|
72
|
+
PLATFORMS
|
73
|
+
ruby
|
74
|
+
|
75
|
+
DEPENDENCIES
|
76
|
+
bundler (~> 1.0)
|
77
|
+
jeweler (~> 2.0.1)
|
78
|
+
rdoc
|
79
|
+
shoulda
|
80
|
+
simplecov
|
data/LICENSE.txt
ADDED
@@ -0,0 +1,20 @@
|
|
1
|
+
Copyright (c) 2014 Ondrej Odchazel
|
2
|
+
|
3
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
4
|
+
a copy of this software and associated documentation files (the
|
5
|
+
"Software"), to deal in the Software without restriction, including
|
6
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
7
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
8
|
+
permit persons to whom the Software is furnished to do so, subject to
|
9
|
+
the following conditions:
|
10
|
+
|
11
|
+
The above copyright notice and this permission notice shall be
|
12
|
+
included in all copies or substantial portions of the Software.
|
13
|
+
|
14
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
15
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
16
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
17
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
18
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
19
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
20
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README.markdown
ADDED
data/Rakefile
ADDED
@@ -0,0 +1,51 @@
|
|
1
|
+
# encoding: utf-8

# Build script for the czech-stemmer gem: sets up Bundler, declares the gem
# metadata through Jeweler, and defines the test, coverage and rdoc tasks.

require 'rubygems'
require 'bundler'

# Load the :default and :development gem groups; on failure print the Bundler
# error plus a hint and exit with Bundler's own status code.
begin
  Bundler.setup(:default, :development)
rescue Bundler::BundlerError => e
  $stderr.puts e.message
  $stderr.puts "Run `bundle install` to install missing gems"
  exit e.status_code
end
require 'rake'

require 'jeweler'
Jeweler::Tasks.new do |gem|
  # gem is a Gem::Specification... see http://guides.rubygems.org/specification-reference/ for more options
  gem.name = "czech-stemmer"
  gem.homepage = "http://github.com/hypertornado/czech-stemmer"
  gem.license = "MIT"
  gem.summary = %Q{Ruby port of czech stemmer in Lucene}
  gem.description = %Q{Based on Lucene implementation}
  gem.email = "hypertornado@gmail.com"
  gem.authors = ["Ondrej Odchazel"]
  # dependencies defined in Gemfile
end
Jeweler::RubygemsDotOrgTasks.new

# `rake test` runs every test/**/test_*.rb with lib/ and test/ on the load path.
require 'rake/testtask'
Rake::TestTask.new(:test) do |test|
  test.libs << 'lib' << 'test'
  test.pattern = 'test/**/test_*.rb'
  test.verbose = true
end

desc "Code coverage detail"
task :simplecov do
  # Sets COVERAGE in the environment before executing the test task.
  ENV['COVERAGE'] = "true"
  Rake::Task['test'].execute
end

task :default => :test

require 'rdoc/task'
Rake::RDocTask.new do |rdoc|
  # Strip surrounding whitespace (e.g. a trailing newline) so it does not end
  # up inside the generated documentation title.
  version = File.exist?('VERSION') ? File.read('VERSION').strip : ""

  rdoc.rdoc_dir = 'rdoc'
  rdoc.title = "czech-stemmer #{version}"
  rdoc.rdoc_files.include('README*')
  rdoc.rdoc_files.include('lib/**/*.rb')
end
|
data/VERSION
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
0.0.0
|
@@ -0,0 +1,66 @@
|
|
1
|
+
# Generated by jeweler
# DO NOT EDIT THIS FILE DIRECTLY
# Instead, edit Jeweler::Tasks in Rakefile, and run 'rake gemspec'
# -*- encoding: utf-8 -*-

Gem::Specification.new do |s|
  s.name = "czech-stemmer"
  s.version = "0.0.0"

  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
  s.authors = ["Ondrej Odchazel"]
  s.date = "2014-06-24"
  s.description = "Based on Lucene implementation"
  s.email = "hypertornado@gmail.com"
  s.extra_rdoc_files = [
    "LICENSE.txt",
    "README.markdown"
  ]
  s.files = [
    ".document",
    "Gemfile",
    "Gemfile.lock",
    "LICENSE.txt",
    "README.markdown",
    "Rakefile",
    "VERSION",
    "czech-stemmer.gemspec",
    "lib/czech-stemmer.rb",
    "test/CzechStemmer.java",
    "test/TestCzechStemmer.java",
    "test/TestCzechStemmer.java.txt",
    "test/helper.rb",
    "test/java_test_converter.bash",
    "test/test_czech-stemmer.rb"
  ]
  s.homepage = "http://github.com/hypertornado/czech-stemmer"
  s.licenses = ["MIT"]
  s.require_paths = ["lib"]
  s.rubygems_version = "2.0.14"
  s.summary = "Ruby port of czech stemmer in Lucene"

  # The same five gems are declared in every branch; only the newest RubyGems
  # branch (specification_version available and Gem::VERSION >= 1.2.0) marks
  # them as development dependencies.
  if s.respond_to? :specification_version then
    s.specification_version = 4

    if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
      s.add_development_dependency(%q<shoulda>, [">= 0"])
      s.add_development_dependency(%q<rdoc>, [">= 0"])
      s.add_development_dependency(%q<bundler>, ["~> 1.0"])
      s.add_development_dependency(%q<jeweler>, ["~> 2.0.1"])
      s.add_development_dependency(%q<simplecov>, [">= 0"])
    else
      s.add_dependency(%q<shoulda>, [">= 0"])
      s.add_dependency(%q<rdoc>, [">= 0"])
      s.add_dependency(%q<bundler>, ["~> 1.0"])
      s.add_dependency(%q<jeweler>, ["~> 2.0.1"])
      s.add_dependency(%q<simplecov>, [">= 0"])
    end
  else
    s.add_dependency(%q<shoulda>, [">= 0"])
    s.add_dependency(%q<rdoc>, [">= 0"])
    s.add_dependency(%q<bundler>, ["~> 1.0"])
    s.add_dependency(%q<jeweler>, ["~> 2.0.1"])
    s.add_dependency(%q<simplecov>, [">= 0"])
  end
end
|
66
|
+
|
@@ -0,0 +1,125 @@
|
|
1
|
+
# encoding: utf-8
# (This file contains non-ASCII string literals; Ruby 1.9 needs the encoding
# declared explicitly.)

# Light stemmer for Czech words.
#
# Stemming runs three steps in order: strip a grammatical case ending, strip
# a possessive ending, then normalize the last characters of what remains.
# Nothing here changes letter case, so callers decide whether to downcase
# their input first.
class CzechStemmer
  # Case-ending rules in the order they are tried. Each entry is
  # [length the word must exceed, suffixes]; all suffixes in one entry have
  # the same length, which is the number of characters removed on a match.
  CASE_ENDINGS = [
    [7, %w[atech].freeze],
    [6, %w[ětem etem atům].freeze],
    [5, %w[ech ich ích ého ěmi emi ému ěte ete ěti eti ího iho ími ímu imu
           ách ata aty ých ama ami ové ovi ými].freeze],
    [4, %w[em es ém ím ům at ám os us ým mi ou].freeze],
    [3, %w[a e i o u ů y á é í ý ě].freeze]
  ].freeze

  # Two-character possessive endings removed from words longer than 5 chars.
  POSSESSIVE_ENDINGS = %w[ov in ův].freeze

  # Stems a single word.
  #
  # @param word [String] the word to stem
  # @return [String] the stem; the same object as +word+ when no rule applies
  def self.stem(word)
    stripped = remove_possessives(remove_case(word))
    # Normalization is skipped for an empty intermediate result.
    stripped.empty? ? stripped : normalize(stripped)
  end

  # Removes the first matching case ending from +word+.
  #
  # @param word [String]
  # @return [String] +word+ without the ending, or +word+ itself if none matched
  def self.remove_case(word)
    len = word.size

    CASE_ENDINGS.each do |min_len, suffixes|
      next unless len > min_len && word.end_with?(*suffixes)
      return word[0...(len - suffixes.first.size)]
    end

    word
  end

  # Removes a possessive ending ("ov", "in", "ův") from words longer than
  # five characters.
  #
  # @param word [String]
  # @return [String] +word+ without the ending, or +word+ itself if none matched
  def self.remove_possessives(word)
    return word unless word.size > 5 && word.end_with?(*POSSESSIVE_ENDINGS)

    word[0..-3]
  end

  # Rewrites the tail of +word+; only the first applicable rule is used:
  #   čt -> ck, št -> sk, c/č -> k, z/ž -> h,
  #   an "e" in second-to-last position is dropped,
  #   a "ů" in second-to-last position becomes "o".
  #
  # @param word [String]
  # @return [String] the normalized word, or +word+ itself if no rule applies
  def self.normalize(word)
    return word[0..-3] + "ck" if word.end_with?("čt")
    return word[0..-3] + "sk" if word.end_with?("št")
    return word[0..-2] + "k" if word.end_with?("c", "č")
    return word[0..-2] + "h" if word.end_with?("z", "ž")
    return word[0..-3] + word[-1, 1] if word.size > 1 && word[-2, 1] == "e"
    return word[0..-3] + "o" + word[-1, 1] if word.size > 2 && word[-2, 1] == "ů"

    word
  end
end
|
@@ -0,0 +1,173 @@
|
|
1
|
+
package org.apache.lucene.analysis.cz;
|
2
|
+
|
3
|
+
/*
|
4
|
+
* Licensed to the Apache Software Foundation (ASF) under one or more
|
5
|
+
* contributor license agreements. See the NOTICE file distributed with
|
6
|
+
* this work for additional information regarding copyright ownership.
|
7
|
+
* The ASF licenses this file to You under the Apache License, Version 2.0
|
8
|
+
* (the "License"); you may not use this file except in compliance with
|
9
|
+
* the License. You may obtain a copy of the License at
|
10
|
+
*
|
11
|
+
* http://www.apache.org/licenses/LICENSE-2.0
|
12
|
+
*
|
13
|
+
* Unless required by applicable law or agreed to in writing, software
|
14
|
+
* distributed under the License is distributed on an "AS IS" BASIS,
|
15
|
+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
16
|
+
* See the License for the specific language governing permissions and
|
17
|
+
* limitations under the License.
|
18
|
+
*/
|
19
|
+
|
20
|
+
import static org.apache.lucene.analysis.util.StemmerUtil.*;
|
21
|
+
|
22
|
+
/**
|
23
|
+
* Light Stemmer for Czech.
|
24
|
+
* <p>
|
25
|
+
* Implements the algorithm described in:
|
26
|
+
* <i>
|
27
|
+
* Indexing and stemming approaches for the Czech language
|
28
|
+
* </i>
|
29
|
+
* http://portal.acm.org/citation.cfm?id=1598600
|
30
|
+
* </p>
|
31
|
+
*/
|
32
|
+
public class CzechStemmer {
|
33
|
+
|
34
|
+
/**
|
35
|
+
* Stem an input buffer of Czech text.
|
36
|
+
*
|
37
|
+
* @param s input buffer
|
38
|
+
* @param len length of input buffer
|
39
|
+
* @return length of input buffer after normalization
|
40
|
+
*
|
41
|
+
* <p><b>NOTE</b>: Input is expected to be in lowercase,
|
42
|
+
* but with diacritical marks</p>
|
43
|
+
*/
|
44
|
+
public int stem(char s[], int len) {
|
45
|
+
len = removeCase(s, len);
|
46
|
+
len = removePossessives(s, len);
|
47
|
+
if (len > 0) {
|
48
|
+
len = normalize(s, len);
|
49
|
+
}
|
50
|
+
return len;
|
51
|
+
}
|
52
|
+
|
53
|
+
private int removeCase(char s[], int len) {
|
54
|
+
if (len > 7 && endsWith(s, len, "atech"))
|
55
|
+
return len - 5;
|
56
|
+
|
57
|
+
if (len > 6 &&
|
58
|
+
(endsWith(s, len,"ětem") ||
|
59
|
+
endsWith(s, len,"etem") ||
|
60
|
+
endsWith(s, len,"atům")))
|
61
|
+
return len - 4;
|
62
|
+
|
63
|
+
if (len > 5 &&
|
64
|
+
(endsWith(s, len, "ech") ||
|
65
|
+
endsWith(s, len, "ich") ||
|
66
|
+
endsWith(s, len, "ích") ||
|
67
|
+
endsWith(s, len, "ého") ||
|
68
|
+
endsWith(s, len, "ěmi") ||
|
69
|
+
endsWith(s, len, "emi") ||
|
70
|
+
endsWith(s, len, "ému") ||
|
71
|
+
endsWith(s, len, "ěte") ||
|
72
|
+
endsWith(s, len, "ete") ||
|
73
|
+
endsWith(s, len, "ěti") ||
|
74
|
+
endsWith(s, len, "eti") ||
|
75
|
+
endsWith(s, len, "ího") ||
|
76
|
+
endsWith(s, len, "iho") ||
|
77
|
+
endsWith(s, len, "ími") ||
|
78
|
+
endsWith(s, len, "ímu") ||
|
79
|
+
endsWith(s, len, "imu") ||
|
80
|
+
endsWith(s, len, "ách") ||
|
81
|
+
endsWith(s, len, "ata") ||
|
82
|
+
endsWith(s, len, "aty") ||
|
83
|
+
endsWith(s, len, "ých") ||
|
84
|
+
endsWith(s, len, "ama") ||
|
85
|
+
endsWith(s, len, "ami") ||
|
86
|
+
endsWith(s, len, "ové") ||
|
87
|
+
endsWith(s, len, "ovi") ||
|
88
|
+
endsWith(s, len, "ými")))
|
89
|
+
return len - 3;
|
90
|
+
|
91
|
+
if (len > 4 &&
|
92
|
+
(endsWith(s, len, "em") ||
|
93
|
+
endsWith(s, len, "es") ||
|
94
|
+
endsWith(s, len, "ém") ||
|
95
|
+
endsWith(s, len, "ím") ||
|
96
|
+
endsWith(s, len, "ům") ||
|
97
|
+
endsWith(s, len, "at") ||
|
98
|
+
endsWith(s, len, "ám") ||
|
99
|
+
endsWith(s, len, "os") ||
|
100
|
+
endsWith(s, len, "us") ||
|
101
|
+
endsWith(s, len, "ým") ||
|
102
|
+
endsWith(s, len, "mi") ||
|
103
|
+
endsWith(s, len, "ou")))
|
104
|
+
return len - 2;
|
105
|
+
|
106
|
+
if (len > 3) {
|
107
|
+
switch (s[len - 1]) {
|
108
|
+
case 'a':
|
109
|
+
case 'e':
|
110
|
+
case 'i':
|
111
|
+
case 'o':
|
112
|
+
case 'u':
|
113
|
+
case 'ů':
|
114
|
+
case 'y':
|
115
|
+
case 'á':
|
116
|
+
case 'é':
|
117
|
+
case 'í':
|
118
|
+
case 'ý':
|
119
|
+
case 'ě':
|
120
|
+
return len - 1;
|
121
|
+
}
|
122
|
+
}
|
123
|
+
|
124
|
+
return len;
|
125
|
+
}
|
126
|
+
|
127
|
+
private int removePossessives(char s[], int len) {
|
128
|
+
if (len > 5 &&
|
129
|
+
(endsWith(s, len, "ov") ||
|
130
|
+
endsWith(s, len, "in") ||
|
131
|
+
endsWith(s, len, "ův")))
|
132
|
+
return len - 2;
|
133
|
+
|
134
|
+
return len;
|
135
|
+
}
|
136
|
+
|
137
|
+
private int normalize(char s[], int len) {
|
138
|
+
if (endsWith(s, len, "čt")) { // čt -> ck
|
139
|
+
s[len - 2] = 'c';
|
140
|
+
s[len - 1] = 'k';
|
141
|
+
return len;
|
142
|
+
}
|
143
|
+
|
144
|
+
if (endsWith(s, len, "št")) { // št -> sk
|
145
|
+
s[len - 2] = 's';
|
146
|
+
s[len - 1] = 'k';
|
147
|
+
return len;
|
148
|
+
}
|
149
|
+
|
150
|
+
switch(s[len - 1]) {
|
151
|
+
case 'c': // [cč] -> k
|
152
|
+
case 'č':
|
153
|
+
s[len - 1] = 'k';
|
154
|
+
return len;
|
155
|
+
case 'z': // [zž] -> h
|
156
|
+
case 'ž':
|
157
|
+
s[len - 1] = 'h';
|
158
|
+
return len;
|
159
|
+
}
|
160
|
+
|
161
|
+
if (len > 1 && s[len - 2] == 'e') {
|
162
|
+
s[len - 2] = s[len - 1]; // e* > *
|
163
|
+
return len - 1;
|
164
|
+
}
|
165
|
+
|
166
|
+
if (len > 2 && s[len - 2] == 'ů') {
|
167
|
+
s[len - 2] = 'o'; // *ů* -> *o*
|
168
|
+
return len;
|
169
|
+
}
|
170
|
+
|
171
|
+
return len;
|
172
|
+
}
|
173
|
+
}
|