RubyGems - czech-stemmer - Versions diffs - 0.0.0 - Mend

czech-stemmer 0.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (17) hide show

checksums.yaml +7 -0
data/.document +5 -0
data/Gemfile +9 -0
data/Gemfile.lock +80 -0
data/LICENSE.txt +20 -0
data/README.markdown +10 -0
data/Rakefile +51 -0
data/VERSION +1 -0
data/czech-stemmer.gemspec +66 -0
data/lib/czech-stemmer.rb +125 -0
data/test/CzechStemmer.java +173 -0
data/test/TestCzechStemmer.java +300 -0
data/test/TestCzechStemmer.java.txt +300 -0
data/test/helper.rb +2 -0
data/test/java_test_converter.bash +7 -0
data/test/test_czech-stemmer.rb +221 -0
metadata +130 -0

checksums.yaml ADDED Viewed

@@ -0,0 +1,7 @@
+---
+SHA1:
+  metadata.gz: 9ea7bcfa41da627a5df410f7c640b1020b1eaa49
+  data.tar.gz: d56383fc4cfb27fafcb517c5ef6f1ccc3a4ab43b
+SHA512:
+  metadata.gz: 3b961a0aecdb79bb04e4b9f2fc97d10852dd2fe3eaf39f5b117e69f08602fd0841d2850a42e60d45687613c1ba90602960a5e67fa737bebded7a5cfd10f779ca
+  data.tar.gz: 6688dc4eae8a91c1af12cb2c0bad5e7907af9f4140718c22a4038c11b51555172bb85f54b78c04efaba9fae8cba5ba6ad56f9f84f66978adcc2faea0d697fea8

data/.document ADDED Viewed

@@ -0,0 +1,5 @@
+lib/**/*.rb
+bin/*
+-
+features/**/*.feature
+LICENSE.txt

data/Gemfile ADDED Viewed

@@ -0,0 +1,9 @@
# Fetch gems over TLS so that downloads cannot be read or altered in transit.
source "https://rubygems.org"

# Tooling needed only to build, test and document the gem; the library
# itself has no runtime dependencies.
group :development do
  gem "shoulda", ">= 0"
  gem "rdoc"
  gem "bundler", "~> 1.0"
  gem "jeweler", "~> 2.0.1"
  gem "simplecov", ">= 0"
end

data/Gemfile.lock ADDED Viewed

@@ -0,0 +1,80 @@
+GEM
+  remote: http://rubygems.org/
+  specs:
+    activesupport (4.1.1)
+      i18n (~> 0.6, >= 0.6.9)
+      json (~> 1.7, >= 1.7.7)
+      minitest (~> 5.1)
+      thread_safe (~> 0.1)
+      tzinfo (~> 1.1)
+    addressable (2.3.6)
+    builder (3.2.2)
+    descendants_tracker (0.0.4)
+      thread_safe (~> 0.3, >= 0.3.1)
+    docile (1.1.5)
+    faraday (0.9.0)
+      multipart-post (>= 1.2, < 3)
+    git (1.2.7)
+    github_api (0.11.3)
+      addressable (~> 2.3)
+      descendants_tracker (~> 0.0.1)
+      faraday (~> 0.8, < 0.10)
+      hashie (>= 1.2)
+      multi_json (>= 1.7.5, < 2.0)
+      nokogiri (~> 1.6.0)
+      oauth2
+    hashie (3.0.0)
+    highline (1.6.21)
+    i18n (0.6.9)
+    jeweler (2.0.1)
+      builder
+      bundler (>= 1.0)
+      git (>= 1.2.5)
+      github_api
+      highline (>= 1.6.15)
+      nokogiri (>= 1.5.10)
+      rake
+      rdoc
+    json (1.8.1)
+    jwt (1.0.0)
+    mini_portile (0.6.0)
+    minitest (5.3.5)
+    multi_json (1.10.1)
+    multi_xml (0.5.5)
+    multipart-post (2.0.0)
+    nokogiri (1.6.2.1)
+      mini_portile (= 0.6.0)
+    oauth2 (0.9.4)
+      faraday (>= 0.8, < 0.10)
+      jwt (~> 1.0)
+      multi_json (~> 1.3)
+      multi_xml (~> 0.5)
+      rack (~> 1.2)
+    rack (1.5.2)
+    rake (10.3.2)
+    rdoc (4.1.1)
+      json (~> 1.4)
+    shoulda (3.5.0)
+      shoulda-context (~> 1.0, >= 1.0.1)
+      shoulda-matchers (>= 1.4.1, < 3.0)
+    shoulda-context (1.2.1)
+    shoulda-matchers (2.6.1)
+      activesupport (>= 3.0.0)
+    simplecov (0.8.2)
+      docile (~> 1.1.0)
+      multi_json
+      simplecov-html (~> 0.8.0)
+    simplecov-html (0.8.0)
+    thread_safe (0.3.4)
+    tzinfo (1.2.1)
+      thread_safe (~> 0.1)
+PLATFORMS
+  ruby
+DEPENDENCIES
+  bundler (~> 1.0)
+  jeweler (~> 2.0.1)
+  rdoc
+  shoulda
+  simplecov

data/LICENSE.txt ADDED Viewed

@@ -0,0 +1,20 @@
+Copyright (c) 2014 Ondrej Odchazel
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+The above copyright notice and this permission notice shall be
+included in all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

data/README.markdown ADDED Viewed

@@ -0,0 +1,10 @@
+# czech-stemmer
+Czech stemmer is a Ruby port of the CzechStemmer from Lucene.
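+## Usage
+After `require 'czech-stemmer'`, call `CzechStemmer.stem` with a lowercase Czech word (diacritics kept); for example, `CzechStemmer.stem("pánové")` returns `"pán"`.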
+## Copyright
+Copyright (c) 2014 Ondrej Odchazel. See LICENSE.txt for further details.

data/Rakefile ADDED Viewed

@@ -0,0 +1,51 @@
# encoding: utf-8

# Build, test, coverage and documentation tasks for the czech-stemmer gem.
require 'rubygems'
require 'bundler'

# Activate the gems listed in the Gemfile; if any are missing, print Bundler's
# message and exit with Bundler's own status code.
begin
  Bundler.setup(:default, :development)
rescue Bundler::BundlerError => e
  $stderr.puts e.message
  $stderr.puts "Run `bundle install` to install missing gems"
  exit e.status_code
end

require 'rake'
require 'jeweler'

Jeweler::Tasks.new do |gem|
  # gem is a Gem::Specification... see http://guides.rubygems.org/specification-reference/ for more options
  gem.name = "czech-stemmer"
  gem.homepage = "http://github.com/hypertornado/czech-stemmer"
  gem.license = "MIT"
  gem.summary = %Q{Ruby port of czech stemmer in Lucene}
  gem.description = %Q{Based on Lucene implementation}
  gem.email = "hypertornado@gmail.com"
  gem.authors = ["Ondrej Odchazel"]
  # dependencies defined in Gemfile
end
Jeweler::RubygemsDotOrgTasks.new

require 'rake/testtask'
Rake::TestTask.new(:test) do |test|
  test.libs << 'lib' << 'test'
  test.pattern = 'test/**/test_*.rb'
  test.verbose = true
end

desc "Code coverage detail"
task :simplecov do
  # Sets COVERAGE=true in the environment before running the test task.
  ENV['COVERAGE'] = "true"
  Rake::Task['test'].execute
end

task :default => :test

require 'rdoc/task'
RDoc::Task.new do |rdoc|
  # Strip the trailing newline of the VERSION file so it does not end up in the title.
  version = File.exist?('VERSION') ? File.read('VERSION').strip : ""

  rdoc.rdoc_dir = 'rdoc'
  rdoc.title = "czech-stemmer #{version}"
  rdoc.rdoc_files.include('README*')
  rdoc.rdoc_files.include('lib/**/*.rb')
end

data/VERSION ADDED Viewed

@@ -0,0 +1 @@
+0.0.0

data/czech-stemmer.gemspec ADDED Viewed

@@ -0,0 +1,66 @@
+# Generated by jeweler
+# DO NOT EDIT THIS FILE DIRECTLY
+# Instead, edit Jeweler::Tasks in Rakefile, and run 'rake gemspec'
+# -*- encoding: utf-8 -*-
+Gem::Specification.new do |s|
+  s.name = "czech-stemmer"
+  s.version = "0.0.0"
+  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version= # set only when this RubyGems exposes the setter
+  s.authors = ["Ondrej Odchazel"]
+  s.date = "2014-06-24"
+  s.description = "Based pn Lucene implementation"
+  s.email = "hypertornado@gmail.com"
+  s.extra_rdoc_files = [
+    "LICENSE.txt",
+    "README.markdown"
+  ]
+  s.files = [
+    ".document",
+    "Gemfile",
+    "Gemfile.lock",
+    "LICENSE.txt",
+    "README.markdown",
+    "Rakefile",
+    "VERSION",
+    "czech-stemmer.gemspec",
+    "lib/czech-stemmer.rb",
+    "test/CzechStemmer.java",
+    "test/TestCzechStemmer.java",
+    "test/TestCzechStemmer.java.txt",
+    "test/helper.rb",
+    "test/java_test_converter.bash",
+    "test/test_czech-stemmer.rb"
+  ]
+  s.homepage = "http://github.com/hypertornado/czech-stemmer"
+  s.licenses = ["MIT"]
+  s.require_paths = ["lib"]
+  s.rubygems_version = "2.0.14"
+  s.summary = "Ruby port of czech stemmer in Lucene"
+  if s.respond_to? :specification_version then # feature-detect specification_version support
+    s.specification_version = 4
+    if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then # development dependencies are declared only on RubyGems >= 1.2.0
+      s.add_development_dependency(%q<shoulda>, [">= 0"])
+      s.add_development_dependency(%q<rdoc>, [">= 0"])
+      s.add_development_dependency(%q<bundler>, ["~> 1.0"])
+      s.add_development_dependency(%q<jeweler>, ["~> 2.0.1"])
+      s.add_development_dependency(%q<simplecov>, [">= 0"])
+    else # older RubyGems: the same gems are declared with add_dependency
+      s.add_dependency(%q<shoulda>, [">= 0"])
+      s.add_dependency(%q<rdoc>, [">= 0"])
+      s.add_dependency(%q<bundler>, ["~> 1.0"])
+      s.add_dependency(%q<jeweler>, ["~> 2.0.1"])
+      s.add_dependency(%q<simplecov>, [">= 0"])
+    end
+  else # no specification_version support: same fallback as above
+    s.add_dependency(%q<shoulda>, [">= 0"])
+    s.add_dependency(%q<rdoc>, [">= 0"])
+    s.add_dependency(%q<bundler>, ["~> 1.0"])
+    s.add_dependency(%q<jeweler>, ["~> 2.0.1"])
+    s.add_dependency(%q<simplecov>, [">= 0"])
+  end
+end

data/lib/czech-stemmer.rb ADDED Viewed

@@ -0,0 +1,125 @@
# encoding: utf-8

# Light stemmer for Czech: strips case endings and possessive suffixes from a
# word, then normalizes the tail of what is left.
#
# Words are matched against lowercase suffixes that keep their diacritics, so
# callers should pass lowercase input. None of the methods mutate their
# argument; when no rule applies the very same string object is returned.
class CzechStemmer
  # Case-ending rules, tried in this order. Each entry is
  # [length the word must exceed, number of characters to strip, suffixes].
  CASE_SUFFIX_RULES = [
    [7, 5, %w[atech].freeze].freeze,
    [6, 4, %w[ětem etem atům].freeze].freeze,
    [5, 3, %w[
      ech ich ích ého ěmi emi ému ěte ete ěti eti ího iho ími ímu imu
      ách ata aty ých ama ami ové ovi ými
    ].freeze].freeze,
    [4, 2, %w[em es ém ím ům at ám os us ým mi ou].freeze].freeze
  ].freeze

  # Single trailing characters removed from words longer than three characters.
  CASE_VOWELS = %w[a e i o u ů y á é í ý ě].freeze

  # Possessive suffixes removed from words longer than five characters.
  POSSESSIVE_SUFFIXES = %w[ov in ův].freeze

  # Stems +word+ by removing its case ending, then a possessive suffix, and
  # finally normalizing the result (skipped when nothing is left).
  #
  # @param word [String] the word to stem
  # @return [String] the stem
  def self.stem(word)
    stem = remove_possessives(remove_case(word))
    stem.empty? ? stem : normalize(stem)
  end

  # Removes the first matching case ending. Longer endings are tried first and
  # each one only applies to words above its length threshold.
  #
  # @param word [String]
  # @return [String] +word+ without the ending, or +word+ itself if none matched
  def self.remove_case(word)
    len = word.size

    rule = CASE_SUFFIX_RULES.find do |min_length, _strip, suffixes|
      len > min_length && word.end_with?(*suffixes)
    end
    return word[0...-rule[1]] if rule

    return word[0..-2] if len > 3 && CASE_VOWELS.include?(word[-1, 1])

    word
  end

  # Removes a trailing "ov", "in" or "ův" from words longer than five characters.
  #
  # @param word [String]
  # @return [String] +word+ without the suffix, or +word+ itself if none matched
  def self.remove_possessives(word)
    return word[0..-3] if word.size > 5 && word.end_with?(*POSSESSIVE_SUFFIXES)

    word
  end

  # Normalizes the end of a stem; only the first applicable rule is used:
  # "čt" -> "ck", "št" -> "sk", final "c"/"č" -> "k", final "z"/"ž" -> "h",
  # an "e" in the second-to-last position is dropped, and a "ů" in the
  # second-to-last position becomes "o".
  #
  # @param word [String]
  # @return [String] the normalized stem, or +word+ itself if no rule applied
  def self.normalize(word)
    return word[0..-3] + "ck" if word.end_with?("čt")
    return word[0..-3] + "sk" if word.end_with?("št")
    return word[0..-2] + "k" if word.end_with?("c", "č")
    return word[0..-2] + "h" if word.end_with?("z", "ž")

    last_char = word[-1, 1]
    penultimate = word[-2, 1] # nil for one-character words, so neither rule below fires

    return word[0..-3] + last_char if word.size > 1 && penultimate == "e"
    return word[0..-3] + "o" + last_char if word.size > 2 && penultimate == "ů"

    word
  end
end

data/test/CzechStemmer.java ADDED Viewed

@@ -0,0 +1,173 @@
+package org.apache.lucene.analysis.cz;
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+import static org.apache.lucene.analysis.util.StemmerUtil.*;
+/**
+ * Light Stemmer for Czech.
+ * <p>
+ * Implements the algorithm described in:
+ * <i>
+ * Indexing and stemming approaches for the Czech language
+ * </i>
+ * http://portal.acm.org/citation.cfm?id=1598600
+ * </p>
+ */
+public class CzechStemmer {
+  /**
+   * Stem an input buffer of Czech text.
+   *
+   * @param s input buffer
+   * @param len length of input buffer
+   * @return length of input buffer after normalization
+   *
+   * <p><b>NOTE</b>: Input is expected to be in lowercase,
+   * but with diacritical marks</p>
+   */
+  public int stem(char s[], int len) {
+    len = removeCase(s, len); // step 1: shorten len past a case ending
+    len = removePossessives(s, len); // step 2: shorten len past a possessive suffix
+    if (len > 0) { // normalize reads s[len - 1], so it is skipped for an empty stem
+      len = normalize(s, len);
+    }
+    return len;
+  }
+  private int removeCase(char s[], int len) { // returns the shortened length; s is not written to here
+    if (len > 7 && endsWith(s, len, "atech")) // endsWith is statically imported from StemmerUtil; presumably a suffix test on the first len chars - confirm
+      return len - 5;
+    if (len > 6 &&
+        (endsWith(s, len,"ětem") ||
+        endsWith(s, len,"etem") ||
+        endsWith(s, len,"atům")))
+      return len - 4;
+    if (len > 5 &&
+        (endsWith(s, len, "ech") ||
+        endsWith(s, len, "ich") ||
+        endsWith(s, len, "ích") ||
+        endsWith(s, len, "ého") ||
+        endsWith(s, len, "ěmi") ||
+        endsWith(s, len, "emi") ||
+        endsWith(s, len, "ému") ||
+        endsWith(s, len, "ěte") ||
+        endsWith(s, len, "ete") ||
+        endsWith(s, len, "ěti") ||
+        endsWith(s, len, "eti") ||
+        endsWith(s, len, "ího") ||
+        endsWith(s, len, "iho") ||
+        endsWith(s, len, "ími") ||
+        endsWith(s, len, "ímu") ||
+        endsWith(s, len, "imu") ||
+        endsWith(s, len, "ách") ||
+        endsWith(s, len, "ata") ||
+        endsWith(s, len, "aty") ||
+        endsWith(s, len, "ých") ||
+        endsWith(s, len, "ama") ||
+        endsWith(s, len, "ami") ||
+        endsWith(s, len, "ové") ||
+        endsWith(s, len, "ovi") ||
+        endsWith(s, len, "ými")))
+      return len - 3;
+    if (len > 4 &&
+        (endsWith(s, len, "em") ||
+        endsWith(s, len, "es") ||
+        endsWith(s, len, "ém") ||
+        endsWith(s, len, "ím") ||
+        endsWith(s, len, "ům") ||
+        endsWith(s, len, "at") ||
+        endsWith(s, len, "ám") ||
+        endsWith(s, len, "os") ||
+        endsWith(s, len, "us") ||
+        endsWith(s, len, "ým") ||
+        endsWith(s, len, "mi") ||
+        endsWith(s, len, "ou")))
+      return len - 2;
+    if (len > 3) { // last rule: drop one trailing vowel
+      switch (s[len - 1]) {
+        case 'a':
+        case 'e':
+        case 'i':
+        case 'o':
+        case 'u':
+        case 'ů':
+        case 'y':
+        case 'á':
+        case 'é':
+        case 'í':
+        case 'ý':
+        case 'ě':
+          return len - 1;
+      }
+    }
+    return len; // no case ending matched
+  }
+  private int removePossessives(char s[], int len) { // returns len - 2 when a possessive suffix matches, else len
+    if (len > 5 &&
+        (endsWith(s, len, "ov") ||
+        endsWith(s, len, "in") ||
+        endsWith(s, len, "ův")))
+      return len - 2;
+    return len;
+  }
+  private int normalize(char s[], int len) { // rewrites the tail of s in place; only the first matching rule applies
+    if (endsWith(s, len, "čt")) { // čt -> ck
+      s[len - 2] = 'c';
+      s[len - 1] = 'k';
+      return len;
+    }
+    if (endsWith(s, len, "št")) { // št -> sk
+      s[len - 2] = 's';
+      s[len - 1] = 'k';
+      return len;
+    }
+    switch(s[len - 1]) { // requires len >= 1; the only caller in this class (stem) checks len > 0
+      case 'c': // [cč] -> k
+      case 'č':
+        s[len - 1] = 'k';
+        return len;
+      case 'z': // [zž] -> h
+      case 'ž':
+        s[len - 1] = 'h';
+        return len;
+    }
+    if (len > 1 && s[len - 2] == 'e') {
+      s[len - 2] = s[len - 1]; // e* > *
+      return len - 1; // the 'e' was overwritten by the last char, so the stem is one char shorter
+    }
+    if (len > 2 && s[len - 2] == 'ů') {
+      s[len - 2] = 'o'; // *ů* -> *o*
+      return len;
+    }
+    return len; // nothing to normalize
+  }
+}