czech-stemmer 0.0.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 9ea7bcfa41da627a5df410f7c640b1020b1eaa49
4
+ data.tar.gz: d56383fc4cfb27fafcb517c5ef6f1ccc3a4ab43b
5
+ SHA512:
6
+ metadata.gz: 3b961a0aecdb79bb04e4b9f2fc97d10852dd2fe3eaf39f5b117e69f08602fd0841d2850a42e60d45687613c1ba90602960a5e67fa737bebded7a5cfd10f779ca
7
+ data.tar.gz: 6688dc4eae8a91c1af12cb2c0bad5e7907af9f4140718c22a4038c11b51555172bb85f54b78c04efaba9fae8cba5ba6ad56f9f84f66978adcc2faea0d697fea8
data/.document ADDED
@@ -0,0 +1,5 @@
1
+ lib/**/*.rb
2
+ bin/*
3
+ -
4
+ features/**/*.feature
5
+ LICENSE.txt
data/Gemfile ADDED
@@ -0,0 +1,9 @@
# Gemfile for czech-stemmer.
# The gem itself has no runtime dependencies; everything below is only
# needed to build, test and document it.

# Fetch gems over TLS so downloads cannot be tampered with in transit.
source "https://rubygems.org"

group :development do
  gem "shoulda", ">= 0"
  gem "rdoc"
  gem "bundler", "~> 1.0"
  gem "jeweler", "~> 2.0.1"
  gem "simplecov", ">= 0"
end
data/Gemfile.lock ADDED
@@ -0,0 +1,80 @@
1
+ GEM
2
+ remote: http://rubygems.org/
3
+ specs:
4
+ activesupport (4.1.1)
5
+ i18n (~> 0.6, >= 0.6.9)
6
+ json (~> 1.7, >= 1.7.7)
7
+ minitest (~> 5.1)
8
+ thread_safe (~> 0.1)
9
+ tzinfo (~> 1.1)
10
+ addressable (2.3.6)
11
+ builder (3.2.2)
12
+ descendants_tracker (0.0.4)
13
+ thread_safe (~> 0.3, >= 0.3.1)
14
+ docile (1.1.5)
15
+ faraday (0.9.0)
16
+ multipart-post (>= 1.2, < 3)
17
+ git (1.2.7)
18
+ github_api (0.11.3)
19
+ addressable (~> 2.3)
20
+ descendants_tracker (~> 0.0.1)
21
+ faraday (~> 0.8, < 0.10)
22
+ hashie (>= 1.2)
23
+ multi_json (>= 1.7.5, < 2.0)
24
+ nokogiri (~> 1.6.0)
25
+ oauth2
26
+ hashie (3.0.0)
27
+ highline (1.6.21)
28
+ i18n (0.6.9)
29
+ jeweler (2.0.1)
30
+ builder
31
+ bundler (>= 1.0)
32
+ git (>= 1.2.5)
33
+ github_api
34
+ highline (>= 1.6.15)
35
+ nokogiri (>= 1.5.10)
36
+ rake
37
+ rdoc
38
+ json (1.8.1)
39
+ jwt (1.0.0)
40
+ mini_portile (0.6.0)
41
+ minitest (5.3.5)
42
+ multi_json (1.10.1)
43
+ multi_xml (0.5.5)
44
+ multipart-post (2.0.0)
45
+ nokogiri (1.6.2.1)
46
+ mini_portile (= 0.6.0)
47
+ oauth2 (0.9.4)
48
+ faraday (>= 0.8, < 0.10)
49
+ jwt (~> 1.0)
50
+ multi_json (~> 1.3)
51
+ multi_xml (~> 0.5)
52
+ rack (~> 1.2)
53
+ rack (1.5.2)
54
+ rake (10.3.2)
55
+ rdoc (4.1.1)
56
+ json (~> 1.4)
57
+ shoulda (3.5.0)
58
+ shoulda-context (~> 1.0, >= 1.0.1)
59
+ shoulda-matchers (>= 1.4.1, < 3.0)
60
+ shoulda-context (1.2.1)
61
+ shoulda-matchers (2.6.1)
62
+ activesupport (>= 3.0.0)
63
+ simplecov (0.8.2)
64
+ docile (~> 1.1.0)
65
+ multi_json
66
+ simplecov-html (~> 0.8.0)
67
+ simplecov-html (0.8.0)
68
+ thread_safe (0.3.4)
69
+ tzinfo (1.2.1)
70
+ thread_safe (~> 0.1)
71
+
72
+ PLATFORMS
73
+ ruby
74
+
75
+ DEPENDENCIES
76
+ bundler (~> 1.0)
77
+ jeweler (~> 2.0.1)
78
+ rdoc
79
+ shoulda
80
+ simplecov
data/LICENSE.txt ADDED
@@ -0,0 +1,20 @@
1
+ Copyright (c) 2014 Ondrej Odchazel
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining
4
+ a copy of this software and associated documentation files (the
5
+ "Software"), to deal in the Software without restriction, including
6
+ without limitation the rights to use, copy, modify, merge, publish,
7
+ distribute, sublicense, and/or sell copies of the Software, and to
8
+ permit persons to whom the Software is furnished to do so, subject to
9
+ the following conditions:
10
+
11
+ The above copyright notice and this permission notice shall be
12
+ included in all copies or substantial portions of the Software.
13
+
14
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
17
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
18
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
19
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
20
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.markdown ADDED
@@ -0,0 +1,10 @@
1
+ # czech-stemmer
2
+
3
+ Czech stemmer is a Ruby port of the CzechStemmer from Lucene.
4
+
5
+ ## Usage
6
+
7
+ ## Copyright
8
+
9
+ Copyright (c) 2014 Ondrej Odchazel. See LICENSE.txt for further details.
10
+
data/Rakefile ADDED
@@ -0,0 +1,51 @@
# encoding: utf-8

# Rakefile for czech-stemmer: gem packaging (jeweler), tests, coverage and RDoc.

require 'rubygems'
require 'bundler'
begin
  # Load the default and development gem groups declared in the Gemfile.
  Bundler.setup(:default, :development)
rescue Bundler::BundlerError => e
  $stderr.puts e.message
  $stderr.puts "Run `bundle install` to install missing gems"
  exit e.status_code
end
require 'rake'

require 'jeweler'
Jeweler::Tasks.new do |gem|
  # gem is a Gem::Specification... see http://guides.rubygems.org/specification-reference/ for more options
  gem.name = "czech-stemmer"
  gem.homepage = "http://github.com/hypertornado/czech-stemmer"
  gem.license = "MIT"
  gem.summary = %Q{Ruby port of czech stemmer in Lucene}
  gem.description = %Q{Based on Lucene implementation}
  gem.email = "hypertornado@gmail.com"
  gem.authors = ["Ondrej Odchazel"]
  # dependencies defined in Gemfile
end
Jeweler::RubygemsDotOrgTasks.new

require 'rake/testtask'
Rake::TestTask.new(:test) do |test|
  test.libs << 'lib' << 'test'
  test.pattern = 'test/**/test_*.rb'
  test.verbose = true
end

desc "Code coverage detail"
task :simplecov do
  # The COVERAGE environment variable is set before the test task runs.
  ENV['COVERAGE'] = "true"
  Rake::Task['test'].execute
end

task :default => :test

require 'rdoc/task'
Rake::RDocTask.new do |rdoc|
  # Strip so a trailing newline in VERSION does not leak into the title.
  version = File.exist?('VERSION') ? File.read('VERSION').strip : ""

  rdoc.rdoc_dir = 'rdoc'
  rdoc.title = "czech-stemmer #{version}"
  rdoc.rdoc_files.include('README*')
  rdoc.rdoc_files.include('lib/**/*.rb')
end
data/VERSION ADDED
@@ -0,0 +1 @@
1
+ 0.0.0
@@ -0,0 +1,66 @@
# Generated by jeweler
# DO NOT EDIT THIS FILE DIRECTLY
# Instead, edit Jeweler::Tasks in Rakefile, and run 'rake gemspec'
# -*- encoding: utf-8 -*-
#
# NOTE(review): the description typo ("pn" -> "on") is corrected here because
# this string is what gets published. The same correction must be made in
# Jeweler::Tasks in the Rakefile, otherwise the next `rake gemspec` reverts it.

Gem::Specification.new do |s|
  s.name = "czech-stemmer"
  s.version = "0.0.0"

  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
  s.authors = ["Ondrej Odchazel"]
  s.date = "2014-06-24"
  s.description = "Based on Lucene implementation"
  s.email = "hypertornado@gmail.com"
  s.extra_rdoc_files = [
    "LICENSE.txt",
    "README.markdown"
  ]
  s.files = [
    ".document",
    "Gemfile",
    "Gemfile.lock",
    "LICENSE.txt",
    "README.markdown",
    "Rakefile",
    "VERSION",
    "czech-stemmer.gemspec",
    "lib/czech-stemmer.rb",
    "test/CzechStemmer.java",
    "test/TestCzechStemmer.java",
    "test/TestCzechStemmer.java.txt",
    "test/helper.rb",
    "test/java_test_converter.bash",
    "test/test_czech-stemmer.rb"
  ]
  s.homepage = "http://github.com/hypertornado/czech-stemmer"
  s.licenses = ["MIT"]
  s.require_paths = ["lib"]
  s.rubygems_version = "2.0.14"
  s.summary = "Ruby port of czech stemmer in Lucene"

  # Only the first branch registers these as development dependencies; the
  # two fallback branches register the same gems via add_dependency.
  if s.respond_to? :specification_version then
    s.specification_version = 4

    if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
      s.add_development_dependency(%q<shoulda>, [">= 0"])
      s.add_development_dependency(%q<rdoc>, [">= 0"])
      s.add_development_dependency(%q<bundler>, ["~> 1.0"])
      s.add_development_dependency(%q<jeweler>, ["~> 2.0.1"])
      s.add_development_dependency(%q<simplecov>, [">= 0"])
    else
      s.add_dependency(%q<shoulda>, [">= 0"])
      s.add_dependency(%q<rdoc>, [">= 0"])
      s.add_dependency(%q<bundler>, ["~> 1.0"])
      s.add_dependency(%q<jeweler>, ["~> 2.0.1"])
      s.add_dependency(%q<simplecov>, [">= 0"])
    end
  else
    s.add_dependency(%q<shoulda>, [">= 0"])
    s.add_dependency(%q<rdoc>, [">= 0"])
    s.add_dependency(%q<bundler>, ["~> 1.0"])
    s.add_dependency(%q<jeweler>, ["~> 2.0.1"])
    s.add_dependency(%q<simplecov>, [">= 0"])
  end
end
66
+
@@ -0,0 +1,125 @@
# encoding: utf-8
# frozen_string_literal: true

# Light stemmer for Czech: strips case endings and possessive suffixes, then
# normalizes the final consonant of the remaining stem.
#
# All methods are stateless class methods. They never mutate their argument;
# when nothing is stripped the very same String object is returned.
# Length thresholds are counted in characters (String#size).
class CzechStemmer
  # Case endings removed by remove_case, grouped by suffix length.
  CASE_SUFFIXES_4 = %w[ětem etem atům].freeze
  CASE_SUFFIXES_3 = %w[
    ech ich ích ého ěmi emi ému ěte ete ěti eti ího iho
    ími ímu imu ách ata aty ých ama ami ové ovi ými
  ].freeze
  CASE_SUFFIXES_2 = %w[em es ém ím ům at ám os us ým mi ou].freeze

  # Single trailing letters removed by remove_case as a last resort.
  CASE_VOWELS = %w[a e i o u ů y á é í ý ě].freeze

  # Possessive endings removed by remove_possessives.
  POSSESSIVE_SUFFIXES = %w[ov in ův].freeze

  # Stems a single word.
  #
  # @param word [String] the word to stem
  # @return [String] the stem; an empty intermediate result is returned as-is
  #   without being normalized
  def self.stem(word)
    stem = remove_possessives(remove_case(word))
    stem.empty? ? stem : normalize(stem)
  end

  # Removes at most one case ending. Longer suffixes are tried first, and each
  # suffix length only applies to words longer than its threshold.
  #
  # @param word [String]
  # @return [String] the word without the matched ending, or the word itself
  def self.remove_case(word)
    len = word.size

    return word[0..-6] if len > 7 && word.end_with?("atech")
    return word[0..-5] if len > 6 && word.end_with?(*CASE_SUFFIXES_4)
    return word[0..-4] if len > 5 && word.end_with?(*CASE_SUFFIXES_3)
    return word[0..-3] if len > 4 && word.end_with?(*CASE_SUFFIXES_2)
    return word[0..-2] if len > 3 && CASE_VOWELS.include?(word[-1, 1])

    word
  end

  # Removes a possessive ending ("ov", "in", "ův") from words longer than
  # five characters.
  #
  # @param word [String]
  # @return [String] the word without the possessive ending, or the word itself
  def self.remove_possessives(word)
    return word[0..-3] if word.size > 5 && word.end_with?(*POSSESSIVE_SUFFIXES)

    word
  end

  # Normalizes the end of a stem. Rules are tried in order and only the first
  # matching one is applied:
  #
  #   čt -> ck, št -> sk, c/č -> k, z/ž -> h,
  #   "e" before the last character is dropped,
  #   "ů" before the last character becomes "o".
  #
  # @param word [String]
  # @return [String] the normalized stem, or the word itself if no rule matches
  def self.normalize(word)
    return word[0..-3] + "ck" if word.end_with?("čt")
    return word[0..-3] + "sk" if word.end_with?("št")
    return word[0..-2] + "k" if word.end_with?("c", "č")
    return word[0..-2] + "h" if word.end_with?("z", "ž")

    last_char = word[-1, 1]
    return word[0..-3] + last_char if word.size > 1 && word[-2, 1] == "e"
    return word[0..-3] + "o" + last_char if word.size > 2 && word[-2, 1] == "ů"

    word
  end
end
@@ -0,0 +1,173 @@
1
+ package org.apache.lucene.analysis.cz;
2
+
3
+ /*
4
+ * Licensed to the Apache Software Foundation (ASF) under one or more
5
+ * contributor license agreements. See the NOTICE file distributed with
6
+ * this work for additional information regarding copyright ownership.
7
+ * The ASF licenses this file to You under the Apache License, Version 2.0
8
+ * (the "License"); you may not use this file except in compliance with
9
+ * the License. You may obtain a copy of the License at
10
+ *
11
+ * http://www.apache.org/licenses/LICENSE-2.0
12
+ *
13
+ * Unless required by applicable law or agreed to in writing, software
14
+ * distributed under the License is distributed on an "AS IS" BASIS,
15
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16
+ * See the License for the specific language governing permissions and
17
+ * limitations under the License.
18
+ */
19
+
20
+ import static org.apache.lucene.analysis.util.StemmerUtil.*;
21
+
22
+ /**
23
+ * Light Stemmer for Czech.
24
+ * <p>
25
+ * Implements the algorithm described in:
26
+ * <i>
27
+ * Indexing and stemming approaches for the Czech language
28
+ * </i>
29
+ * http://portal.acm.org/citation.cfm?id=1598600
30
+ * </p>
31
+ */
32
+ public class CzechStemmer {
33
+
34
+ /**
35
+ * Stem an input buffer of Czech text.
36
+ *
37
+ * @param s input buffer
38
+ * @param len length of input buffer
39
+ * @return length of input buffer after normalization
40
+ *
41
+ * <p><b>NOTE</b>: Input is expected to be in lowercase,
42
+ * but with diacritical marks</p>
43
+ */
44
+ public int stem(char s[], int len) {
45
+ len = removeCase(s, len);
46
+ len = removePossessives(s, len);
47
+ if (len > 0) {
48
+ len = normalize(s, len);
49
+ }
50
+ return len;
51
+ }
52
+
53
+ private int removeCase(char s[], int len) {
54
+ if (len > 7 && endsWith(s, len, "atech"))
55
+ return len - 5;
56
+
57
+ if (len > 6 &&
58
+ (endsWith(s, len,"ětem") ||
59
+ endsWith(s, len,"etem") ||
60
+ endsWith(s, len,"atům")))
61
+ return len - 4;
62
+
63
+ if (len > 5 &&
64
+ (endsWith(s, len, "ech") ||
65
+ endsWith(s, len, "ich") ||
66
+ endsWith(s, len, "ích") ||
67
+ endsWith(s, len, "ého") ||
68
+ endsWith(s, len, "ěmi") ||
69
+ endsWith(s, len, "emi") ||
70
+ endsWith(s, len, "ému") ||
71
+ endsWith(s, len, "ěte") ||
72
+ endsWith(s, len, "ete") ||
73
+ endsWith(s, len, "ěti") ||
74
+ endsWith(s, len, "eti") ||
75
+ endsWith(s, len, "ího") ||
76
+ endsWith(s, len, "iho") ||
77
+ endsWith(s, len, "ími") ||
78
+ endsWith(s, len, "ímu") ||
79
+ endsWith(s, len, "imu") ||
80
+ endsWith(s, len, "ách") ||
81
+ endsWith(s, len, "ata") ||
82
+ endsWith(s, len, "aty") ||
83
+ endsWith(s, len, "ých") ||
84
+ endsWith(s, len, "ama") ||
85
+ endsWith(s, len, "ami") ||
86
+ endsWith(s, len, "ové") ||
87
+ endsWith(s, len, "ovi") ||
88
+ endsWith(s, len, "ými")))
89
+ return len - 3;
90
+
91
+ if (len > 4 &&
92
+ (endsWith(s, len, "em") ||
93
+ endsWith(s, len, "es") ||
94
+ endsWith(s, len, "ém") ||
95
+ endsWith(s, len, "ím") ||
96
+ endsWith(s, len, "ům") ||
97
+ endsWith(s, len, "at") ||
98
+ endsWith(s, len, "ám") ||
99
+ endsWith(s, len, "os") ||
100
+ endsWith(s, len, "us") ||
101
+ endsWith(s, len, "ým") ||
102
+ endsWith(s, len, "mi") ||
103
+ endsWith(s, len, "ou")))
104
+ return len - 2;
105
+
106
+ if (len > 3) {
107
+ switch (s[len - 1]) {
108
+ case 'a':
109
+ case 'e':
110
+ case 'i':
111
+ case 'o':
112
+ case 'u':
113
+ case 'ů':
114
+ case 'y':
115
+ case 'á':
116
+ case 'é':
117
+ case 'í':
118
+ case 'ý':
119
+ case 'ě':
120
+ return len - 1;
121
+ }
122
+ }
123
+
124
+ return len;
125
+ }
126
+
127
+ private int removePossessives(char s[], int len) {
128
+ if (len > 5 &&
129
+ (endsWith(s, len, "ov") ||
130
+ endsWith(s, len, "in") ||
131
+ endsWith(s, len, "ův")))
132
+ return len - 2;
133
+
134
+ return len;
135
+ }
136
+
137
+ private int normalize(char s[], int len) {
138
+ if (endsWith(s, len, "čt")) { // čt -> ck
139
+ s[len - 2] = 'c';
140
+ s[len - 1] = 'k';
141
+ return len;
142
+ }
143
+
144
+ if (endsWith(s, len, "št")) { // št -> sk
145
+ s[len - 2] = 's';
146
+ s[len - 1] = 'k';
147
+ return len;
148
+ }
149
+
150
+ switch(s[len - 1]) {
151
+ case 'c': // [cč] -> k
152
+ case 'č':
153
+ s[len - 1] = 'k';
154
+ return len;
155
+ case 'z': // [zž] -> h
156
+ case 'ž':
157
+ s[len - 1] = 'h';
158
+ return len;
159
+ }
160
+
161
+ if (len > 1 && s[len - 2] == 'e') {
162
+ s[len - 2] = s[len - 1]; // e* > *
163
+ return len - 1;
164
+ }
165
+
166
+ if (len > 2 && s[len - 2] == 'ů') {
167
+ s[len - 2] = 'o'; // *ů* -> *o*
168
+ return len;
169
+ }
170
+
171
+ return len;
172
+ }
173
+ }