czech-stemmer 0.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 9ea7bcfa41da627a5df410f7c640b1020b1eaa49
4
+ data.tar.gz: d56383fc4cfb27fafcb517c5ef6f1ccc3a4ab43b
5
+ SHA512:
6
+ metadata.gz: 3b961a0aecdb79bb04e4b9f2fc97d10852dd2fe3eaf39f5b117e69f08602fd0841d2850a42e60d45687613c1ba90602960a5e67fa737bebded7a5cfd10f779ca
7
+ data.tar.gz: 6688dc4eae8a91c1af12cb2c0bad5e7907af9f4140718c22a4038c11b51555172bb85f54b78c04efaba9fae8cba5ba6ad56f9f84f66978adcc2faea0d697fea8
data/.document ADDED
@@ -0,0 +1,5 @@
1
+ lib/**/*.rb
2
+ bin/*
3
+ -
4
+ features/**/*.feature
5
+ LICENSE.txt
data/Gemfile ADDED
@@ -0,0 +1,9 @@
1
# Fetch gems over HTTPS so downloads cannot be tampered with in transit.
source "https://rubygems.org"

# Tooling used only to develop, test, document and release the gem;
# the library itself declares no runtime dependencies here.
group :development do
  gem "shoulda", ">= 0"
  gem "rdoc"
  gem "bundler", "~> 1.0"
  gem "jeweler", "~> 2.0.1"
  gem "simplecov", ">= 0"
end
data/Gemfile.lock ADDED
@@ -0,0 +1,80 @@
1
+ GEM
2
+ remote: http://rubygems.org/
3
+ specs:
4
+ activesupport (4.1.1)
5
+ i18n (~> 0.6, >= 0.6.9)
6
+ json (~> 1.7, >= 1.7.7)
7
+ minitest (~> 5.1)
8
+ thread_safe (~> 0.1)
9
+ tzinfo (~> 1.1)
10
+ addressable (2.3.6)
11
+ builder (3.2.2)
12
+ descendants_tracker (0.0.4)
13
+ thread_safe (~> 0.3, >= 0.3.1)
14
+ docile (1.1.5)
15
+ faraday (0.9.0)
16
+ multipart-post (>= 1.2, < 3)
17
+ git (1.2.7)
18
+ github_api (0.11.3)
19
+ addressable (~> 2.3)
20
+ descendants_tracker (~> 0.0.1)
21
+ faraday (~> 0.8, < 0.10)
22
+ hashie (>= 1.2)
23
+ multi_json (>= 1.7.5, < 2.0)
24
+ nokogiri (~> 1.6.0)
25
+ oauth2
26
+ hashie (3.0.0)
27
+ highline (1.6.21)
28
+ i18n (0.6.9)
29
+ jeweler (2.0.1)
30
+ builder
31
+ bundler (>= 1.0)
32
+ git (>= 1.2.5)
33
+ github_api
34
+ highline (>= 1.6.15)
35
+ nokogiri (>= 1.5.10)
36
+ rake
37
+ rdoc
38
+ json (1.8.1)
39
+ jwt (1.0.0)
40
+ mini_portile (0.6.0)
41
+ minitest (5.3.5)
42
+ multi_json (1.10.1)
43
+ multi_xml (0.5.5)
44
+ multipart-post (2.0.0)
45
+ nokogiri (1.6.2.1)
46
+ mini_portile (= 0.6.0)
47
+ oauth2 (0.9.4)
48
+ faraday (>= 0.8, < 0.10)
49
+ jwt (~> 1.0)
50
+ multi_json (~> 1.3)
51
+ multi_xml (~> 0.5)
52
+ rack (~> 1.2)
53
+ rack (1.5.2)
54
+ rake (10.3.2)
55
+ rdoc (4.1.1)
56
+ json (~> 1.4)
57
+ shoulda (3.5.0)
58
+ shoulda-context (~> 1.0, >= 1.0.1)
59
+ shoulda-matchers (>= 1.4.1, < 3.0)
60
+ shoulda-context (1.2.1)
61
+ shoulda-matchers (2.6.1)
62
+ activesupport (>= 3.0.0)
63
+ simplecov (0.8.2)
64
+ docile (~> 1.1.0)
65
+ multi_json
66
+ simplecov-html (~> 0.8.0)
67
+ simplecov-html (0.8.0)
68
+ thread_safe (0.3.4)
69
+ tzinfo (1.2.1)
70
+ thread_safe (~> 0.1)
71
+
72
+ PLATFORMS
73
+ ruby
74
+
75
+ DEPENDENCIES
76
+ bundler (~> 1.0)
77
+ jeweler (~> 2.0.1)
78
+ rdoc
79
+ shoulda
80
+ simplecov
data/LICENSE.txt ADDED
@@ -0,0 +1,20 @@
1
+ Copyright (c) 2014 Ondrej Odchazel
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining
4
+ a copy of this software and associated documentation files (the
5
+ "Software"), to deal in the Software without restriction, including
6
+ without limitation the rights to use, copy, modify, merge, publish,
7
+ distribute, sublicense, and/or sell copies of the Software, and to
8
+ permit persons to whom the Software is furnished to do so, subject to
9
+ the following conditions:
10
+
11
+ The above copyright notice and this permission notice shall be
12
+ included in all copies or substantial portions of the Software.
13
+
14
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
17
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
18
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
19
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
20
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.markdown ADDED
@@ -0,0 +1,10 @@
1
+ # czech-stemmer
2
+
3
+ Czech stemmer is a Ruby port of the CzechStemmer from Lucene.
4
+
5
+ ## Usage
6
+
7
+ ## Copyright
8
+
9
+ Copyright (c) 2014 Ondrej Odchazel. See LICENSE.txt for further details.
10
+
data/Rakefile ADDED
@@ -0,0 +1,51 @@
1
# encoding: utf-8

require 'rubygems'
require 'bundler'

# Activate the Gemfile's default and development groups. If Bundler cannot
# do so, print its message plus a hint and exit with Bundler's status code.
begin
  Bundler.setup(:default, :development)
rescue Bundler::BundlerError => e
  $stderr.puts e.message
  $stderr.puts "Run `bundle install` to install missing gems"
  exit e.status_code
end
require 'rake'

# Gem metadata. The header of czech-stemmer.gemspec says it is generated
# from this block via `rake gemspec`, so edit the values here.
require 'jeweler'
Jeweler::Tasks.new do |gem|
  # gem is a Gem::Specification... see http://guides.rubygems.org/specification-reference/ for more options
  gem.name = "czech-stemmer"
  gem.homepage = "http://github.com/hypertornado/czech-stemmer"
  gem.license = "MIT"
  gem.summary = %Q{Ruby port of czech stemmer in Lucene}
  gem.description = %Q{Based on Lucene implementation}
  gem.email = "hypertornado@gmail.com"
  gem.authors = ["Ondrej Odchazel"]
  # dependencies defined in Gemfile
end
Jeweler::RubygemsDotOrgTasks.new

# `rake test` loads every test/**/test_*.rb with lib/ and test/ on the load path.
require 'rake/testtask'
Rake::TestTask.new(:test) do |test|
  test.libs << 'lib' << 'test'
  test.pattern = 'test/**/test_*.rb'
  test.verbose = true
end

desc "Code coverage detail"
task :simplecov do
  # Sets COVERAGE before running the test task; presumably the test helper
  # checks this variable to start SimpleCov (verify in test/helper.rb).
  ENV['COVERAGE'] = "true"
  Rake::Task['test'].execute
end

task :default => :test

require 'rdoc/task'
Rake::RDocTask.new do |rdoc|
  # VERSION ends with a newline; strip it so the title stays on one line.
  version = File.exist?('VERSION') ? File.read('VERSION').strip : ""

  rdoc.rdoc_dir = 'rdoc'
  rdoc.title = "czech-stemmer #{version}"
  rdoc.rdoc_files.include('README*')
  rdoc.rdoc_files.include('lib/**/*.rb')
end
data/VERSION ADDED
@@ -0,0 +1 @@
1
+ 0.0.0
@@ -0,0 +1,66 @@
1
# Generated by jeweler
# DO NOT EDIT THIS FILE DIRECTLY
# Instead, edit Jeweler::Tasks in Rakefile, and run 'rake gemspec'
# -*- encoding: utf-8 -*-

Gem::Specification.new do |s|
  s.name = "czech-stemmer"
  s.version = "0.0.0"

  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
  s.authors = ["Ondrej Odchazel"]
  s.date = "2014-06-24"
  # Keep this text identical to gem.description in the Rakefile, which is
  # the source this file is regenerated from.
  s.description = "Based on Lucene implementation"
  s.email = "hypertornado@gmail.com"
  s.extra_rdoc_files = [
    "LICENSE.txt",
    "README.markdown"
  ]
  s.files = [
    ".document",
    "Gemfile",
    "Gemfile.lock",
    "LICENSE.txt",
    "README.markdown",
    "Rakefile",
    "VERSION",
    "czech-stemmer.gemspec",
    "lib/czech-stemmer.rb",
    "test/CzechStemmer.java",
    "test/TestCzechStemmer.java",
    "test/TestCzechStemmer.java.txt",
    "test/helper.rb",
    "test/java_test_converter.bash",
    "test/test_czech-stemmer.rb"
  ]
  s.homepage = "http://github.com/hypertornado/czech-stemmer"
  s.licenses = ["MIT"]
  s.require_paths = ["lib"]
  s.rubygems_version = "2.0.14"
  s.summary = "Ruby port of czech stemmer in Lucene"

  # The gems below are development tooling. They are registered as
  # development dependencies when the running RubyGems supports that, and
  # as plain dependencies otherwise.
  if s.respond_to? :specification_version then
    s.specification_version = 4

    if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
      s.add_development_dependency(%q<shoulda>, [">= 0"])
      s.add_development_dependency(%q<rdoc>, [">= 0"])
      s.add_development_dependency(%q<bundler>, ["~> 1.0"])
      s.add_development_dependency(%q<jeweler>, ["~> 2.0.1"])
      s.add_development_dependency(%q<simplecov>, [">= 0"])
    else
      s.add_dependency(%q<shoulda>, [">= 0"])
      s.add_dependency(%q<rdoc>, [">= 0"])
      s.add_dependency(%q<bundler>, ["~> 1.0"])
      s.add_dependency(%q<jeweler>, ["~> 2.0.1"])
      s.add_dependency(%q<simplecov>, [">= 0"])
    end
  else
    s.add_dependency(%q<shoulda>, [">= 0"])
    s.add_dependency(%q<rdoc>, [">= 0"])
    s.add_dependency(%q<bundler>, ["~> 1.0"])
    s.add_dependency(%q<jeweler>, ["~> 2.0.1"])
    s.add_dependency(%q<simplecov>, [">= 0"])
  end
end
66
+
@@ -0,0 +1,125 @@
1
# encoding: utf-8

# Light stemmer for Czech words, ported from Lucene's CzechStemmer.
#
# Every suffix below is lowercase, so words should be passed in lowercase
# with their diacritics intact. No method mutates its argument; each one
# returns either a new String or the argument itself when nothing changes.
class CzechStemmer
  # Case-ending rules, tried in order; the first matching rule wins.
  # Each rule is [minimum_length, suffixes]: a suffix is stripped only
  # when the word is strictly longer than minimum_length. All suffixes
  # within one rule have the same length.
  CASE_RULES = [
    [7, %w[atech]],
    [6, %w[ětem etem atům]],
    [5, %w[ech ich ích ého ěmi emi ému ěte ete ěti eti ího iho ími ímu imu
           ách ata aty ých ama ami ové ovi ými]],
    [4, %w[em es ém ím ům at ám os us ým mi ou]],
    [3, %w[a e i o u ů y á é í ý ě]]
  ].freeze

  # Two-letter possessive endings removed from words longer than 5 characters.
  POSSESSIVE_SUFFIXES = %w[ov in ův].freeze

  # Stems a word: strips a case ending, then a possessive ending, then
  # normalizes the tail of whatever remains.
  #
  # word - String to stem.
  #
  # Returns the stem as a String.
  def self.stem(word)
    stem = remove_possessives(remove_case(word))
    # Normalization inspects the last characters, so skip it for "".
    stem.empty? ? stem : normalize(stem)
  end

  # Strips the first applicable case ending (see CASE_RULES).
  #
  # word - String to process.
  #
  # Returns the shortened String, or word itself if no rule applies.
  def self.remove_case(word)
    len = word.size

    CASE_RULES.each do |min_len, suffixes|
      next unless len > min_len && word.end_with?(*suffixes)
      return word[0, len - suffixes.first.size]
    end

    word
  end

  # Strips a possessive ending ("ov", "in", "ův") from words longer than
  # 5 characters.
  #
  # word - String to process.
  #
  # Returns the shortened String, or word itself if nothing was stripped.
  def self.remove_possessives(word)
    return word[0...-2] if word.size > 5 && word.end_with?(*POSSESSIVE_SUFFIXES)

    word
  end

  # Normalizes the end of a stem. Only the first matching rule is applied:
  #
  #   čt -> ck, št -> sk, c/č -> k, z/ž -> h,
  #   "e" as second-to-last character is dropped,
  #   "ů" as second-to-last character becomes "o" (words of 3+ characters).
  #
  # word - String to normalize.
  #
  # Returns the normalized String, or word itself if no rule applies.
  def self.normalize(word)
    return word[0...-2] + "ck" if word.end_with?("čt")
    return word[0...-2] + "sk" if word.end_with?("št")
    return word[0...-1] + "k" if word.end_with?("c", "č")
    return word[0...-1] + "h" if word.end_with?("z", "ž")

    last_char = word[-1, 1]
    return word[0...-2] + last_char if word.size > 1 && word[-2, 1] == "e"
    return word[0...-2] + "o" + last_char if word.size > 2 && word[-2, 1] == "ů"

    word
  end
end
@@ -0,0 +1,173 @@
1
+ package org.apache.lucene.analysis.cz;
2
+
3
+ /*
4
+ * Licensed to the Apache Software Foundation (ASF) under one or more
5
+ * contributor license agreements. See the NOTICE file distributed with
6
+ * this work for additional information regarding copyright ownership.
7
+ * The ASF licenses this file to You under the Apache License, Version 2.0
8
+ * (the "License"); you may not use this file except in compliance with
9
+ * the License. You may obtain a copy of the License at
10
+ *
11
+ * http://www.apache.org/licenses/LICENSE-2.0
12
+ *
13
+ * Unless required by applicable law or agreed to in writing, software
14
+ * distributed under the License is distributed on an "AS IS" BASIS,
15
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16
+ * See the License for the specific language governing permissions and
17
+ * limitations under the License.
18
+ */
19
+
20
+ import static org.apache.lucene.analysis.util.StemmerUtil.*;
21
+
22
+ /**
23
+ * Light Stemmer for Czech.
24
+ * <p>
25
+ * Implements the algorithm described in:
26
+ * <i>
27
+ * Indexing and stemming approaches for the Czech language
28
+ * </i>
29
+ * http://portal.acm.org/citation.cfm?id=1598600
30
+ * </p>
31
+ */
32
+ public class CzechStemmer {
33
+
34
+ /**
35
+ * Stem an input buffer of Czech text.
36
+ *
37
+ * @param s input buffer
38
+ * @param len length of input buffer
39
+ * @return length of input buffer after normalization
40
+ *
41
+ * <p><b>NOTE</b>: Input is expected to be in lowercase,
42
+ * but with diacritical marks</p>
43
+ */
44
+ public int stem(char s[], int len) {
45
+ len = removeCase(s, len);
46
+ len = removePossessives(s, len);
47
+ if (len > 0) {
48
+ len = normalize(s, len);
49
+ }
50
+ return len;
51
+ }
52
+
53
+ private int removeCase(char s[], int len) {
54
+ if (len > 7 && endsWith(s, len, "atech"))
55
+ return len - 5;
56
+
57
+ if (len > 6 &&
58
+ (endsWith(s, len,"ětem") ||
59
+ endsWith(s, len,"etem") ||
60
+ endsWith(s, len,"atům")))
61
+ return len - 4;
62
+
63
+ if (len > 5 &&
64
+ (endsWith(s, len, "ech") ||
65
+ endsWith(s, len, "ich") ||
66
+ endsWith(s, len, "ích") ||
67
+ endsWith(s, len, "ého") ||
68
+ endsWith(s, len, "ěmi") ||
69
+ endsWith(s, len, "emi") ||
70
+ endsWith(s, len, "ému") ||
71
+ endsWith(s, len, "ěte") ||
72
+ endsWith(s, len, "ete") ||
73
+ endsWith(s, len, "ěti") ||
74
+ endsWith(s, len, "eti") ||
75
+ endsWith(s, len, "ího") ||
76
+ endsWith(s, len, "iho") ||
77
+ endsWith(s, len, "ími") ||
78
+ endsWith(s, len, "ímu") ||
79
+ endsWith(s, len, "imu") ||
80
+ endsWith(s, len, "ách") ||
81
+ endsWith(s, len, "ata") ||
82
+ endsWith(s, len, "aty") ||
83
+ endsWith(s, len, "ých") ||
84
+ endsWith(s, len, "ama") ||
85
+ endsWith(s, len, "ami") ||
86
+ endsWith(s, len, "ové") ||
87
+ endsWith(s, len, "ovi") ||
88
+ endsWith(s, len, "ými")))
89
+ return len - 3;
90
+
91
+ if (len > 4 &&
92
+ (endsWith(s, len, "em") ||
93
+ endsWith(s, len, "es") ||
94
+ endsWith(s, len, "ém") ||
95
+ endsWith(s, len, "ím") ||
96
+ endsWith(s, len, "ům") ||
97
+ endsWith(s, len, "at") ||
98
+ endsWith(s, len, "ám") ||
99
+ endsWith(s, len, "os") ||
100
+ endsWith(s, len, "us") ||
101
+ endsWith(s, len, "ým") ||
102
+ endsWith(s, len, "mi") ||
103
+ endsWith(s, len, "ou")))
104
+ return len - 2;
105
+
106
+ if (len > 3) {
107
+ switch (s[len - 1]) {
108
+ case 'a':
109
+ case 'e':
110
+ case 'i':
111
+ case 'o':
112
+ case 'u':
113
+ case 'ů':
114
+ case 'y':
115
+ case 'á':
116
+ case 'é':
117
+ case 'í':
118
+ case 'ý':
119
+ case 'ě':
120
+ return len - 1;
121
+ }
122
+ }
123
+
124
+ return len;
125
+ }
126
+
127
+ private int removePossessives(char s[], int len) {
128
+ if (len > 5 &&
129
+ (endsWith(s, len, "ov") ||
130
+ endsWith(s, len, "in") ||
131
+ endsWith(s, len, "ův")))
132
+ return len - 2;
133
+
134
+ return len;
135
+ }
136
+
137
+ private int normalize(char s[], int len) {
138
+ if (endsWith(s, len, "čt")) { // čt -> ck
139
+ s[len - 2] = 'c';
140
+ s[len - 1] = 'k';
141
+ return len;
142
+ }
143
+
144
+ if (endsWith(s, len, "št")) { // št -> sk
145
+ s[len - 2] = 's';
146
+ s[len - 1] = 'k';
147
+ return len;
148
+ }
149
+
150
+ switch(s[len - 1]) {
151
+ case 'c': // [cč] -> k
152
+ case 'č':
153
+ s[len - 1] = 'k';
154
+ return len;
155
+ case 'z': // [zž] -> h
156
+ case 'ž':
157
+ s[len - 1] = 'h';
158
+ return len;
159
+ }
160
+
161
+ if (len > 1 && s[len - 2] == 'e') {
162
+ s[len - 2] = s[len - 1]; // e* > *
163
+ return len - 1;
164
+ }
165
+
166
+ if (len > 2 && s[len - 2] == 'ů') {
167
+ s[len - 2] = 'o'; // *ů* -> *o*
168
+ return len;
169
+ }
170
+
171
+ return len;
172
+ }
173
+ }