RubyGems - jruby-stemmer - Versions diffs - 0.0.1-java - Mend

jruby-stemmer 0.0.1-java

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (12) hide show

data/.gitignore +23 -0
data/Gemfile +4 -0
data/LICENSE.txt +27 -0
data/README.md +32 -0
data/Rakefile +7 -0
data/ext/java-stemmer/Stemmer.java +427 -0
data/jruby-stemmer.gemspec +26 -0
data/lib/java-stemmer.jar +0 -0
data/lib/jruby-stemmer/version.rb +5 -0
data/lib/jruby-stemmer.rb +26 -0
data/spec/stemmer_spec.rb +17 -0
metadata +95 -0

data/.gitignore ADDED Viewed

@@ -0,0 +1,23 @@
+*.bundle
+*.gem
+*.jar
+*.o
+*.rbc
+*.so
+.bundle
+.config
+.DS_Store
+.yardoc
+coverage
+doc/
+Gemfile.lock
+InstalledFiles
+lib/bundler/man
+Makefile
+pkg
+rdoc
+spec/reports
+test/tmp
+test/version_tmp
+tmp
+_yardoc

data/Gemfile ADDED Viewed

@@ -0,0 +1,4 @@
+source 'https://rubygems.org'
+# Specify your gem's dependencies in jruby-stemmer.gemspec
+gemspec

data/LICENSE.txt ADDED Viewed

@@ -0,0 +1,27 @@
+Excluding ext/java-stemmer/Stemmer.java -
+  see http://www.tartarus.org/~martin/PorterStemmer for that code
+Copyright (c) 2013 Caius Durling
+MIT License
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+The above copyright notice and this permission notice shall be
+included in all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

data/README.md ADDED Viewed

@@ -0,0 +1,32 @@
+# JRuby::Stemmer
+Easily stem words in Ruby, using a native Java implementation of the Porter stemming algorithm for speed. (Java equivalent of the fast-stemmer gem for MRI.)
+## Installation
+Add this line to your application's Gemfile:
+    gem 'jruby-stemmer'
+And then execute:
+    $ bundle
+Or install it yourself as:
+    $ gem install jruby-stemmer
+## Usage
+You can either call the wrapper method yourself, or use the mixed in helper method `String#stem`.
+    JRuby::Stemmer.stem("apple") # => "appl"
+    "apple".stem # => "appl"
+## Contributing
+1. Fork it
+2. Create your feature branch (`git checkout -b my-new-feature`)
+3. Commit your changes (`git commit -am 'Add some feature'`)
+4. Push to the branch (`git push origin my-new-feature`)
+5. Create new Pull Request

data/Rakefile ADDED Viewed

@@ -0,0 +1,7 @@
+spec = Gem::Specification.load("jruby-stemmer.gemspec")
+require "rubygems/package_task"
+Gem::PackageTask.new(spec) {}
+require "rake/javaextensiontask"
+Rake::JavaExtensionTask.new("java-stemmer", spec)

data/ext/java-stemmer/Stemmer.java ADDED Viewed

@@ -0,0 +1,427 @@
+package org.tartarus.martin.porter_stemmer;
+/*
+   Porter stemmer in Java. The original paper is in
+       Porter, 1980, An algorithm for suffix stripping, Program, Vol. 14,
+       no. 3, pp 130-137,
+   See also http://www.tartarus.org/~martin/PorterStemmer
+   History:
+   Release 1
+   Bug 1 (reported by Gonzalo Parra 16/10/99) fixed as marked below.
+   The words 'aed', 'eed', 'oed' leave k at 'a' for step 3, and b[k-1]
+   is then outside the bounds of b.
+   Release 2
+   Similarly,
+   Bug 2 (reported by Steve Dyrdahl 22/2/00) fixed as marked below.
+   'ion' by itself leaves j = -1 in the test for 'ion' in step 5, and
+   b[j] is then outside the bounds of b.
+   Release 3
+   Considerably revised 4/9/00 in the light of many helpful suggestions
+   from Brian Goetz of Quiotix Corporation (brian@quiotix.com).
+   Release 4
+*/
+import java.io.*;
+/**
+  * Stemmer, implementing the Porter Stemming Algorithm
+  *
+  * The Stemmer class transforms a word into its root form.  The input
+  * word can be provided a character at a time (by calling add(char)), or
+  * all at once from a char[] (by calling add(char[], int)); stem() then
+  * reduces whatever has been added.
+  */
+public class Stemmer
+{  private char[] b;
+   private int i,     /* offset into b */
+               i_end, /* offset to end of stemmed word */
+               j, k;
+   private static final int INC = 50;
+                     /* unit of size whereby b is increased */
+   public Stemmer()
+   {  b = new char[INC];
+      i = 0;
+      i_end = 0;
+   }
+   /**
+    * Add a character to the word being stemmed.  When you are finished
+    * adding characters, you can call stem(void) to stem the word.
+    */
+   public void add(char ch)
+   {  if (i == b.length)
+      {  char[] new_b = new char[i+INC];
+         for (int c = 0; c < i; c++) new_b[c] = b[c];
+         b = new_b;
+      }
+      b[i++] = ch;
+   }
+   /** Adds wLen characters to the word being stemmed contained in a portion
+    * of a char[] array. This is like repeated calls of add(char ch), but
+    * faster.
+    */
+   public void add(char[] w, int wLen)
+   {  if (i+wLen >= b.length)
+      {  char[] new_b = new char[i+wLen+INC];
+         for (int c = 0; c < i; c++) new_b[c] = b[c];
+         b = new_b;
+      }
+      for (int c = 0; c < wLen; c++) b[i++] = w[c];
+   }
+   /**
+    * After a word has been stemmed, it can be retrieved by toString(),
+    * or a reference to the internal buffer can be retrieved by getResultBuffer
+    * and getResultLength (which is generally more efficient.)
+    */
+   public String toString() { return new String(b,0,i_end); }
+   /**
+    * Returns the length of the word resulting from the stemming process.
+    */
+   public int getResultLength() { return i_end; }
+   /**
+    * Returns a reference to a character buffer containing the results of
+    * the stemming process.  You also need to consult getResultLength()
+    * to determine the length of the result.
+    */
+   public char[] getResultBuffer() { return b; }
+   /* cons(i) is true <=> b[i] is a consonant. */
+   private final boolean cons(int i)
+   {  switch (b[i])
+      {  case 'a': case 'e': case 'i': case 'o': case 'u': return false;
+         case 'y': return (i==0) ? true : !cons(i-1);
+         default: return true;
+      }
+   }
+   /* m() measures the number of consonant sequences between 0 and j. if c is
+      a consonant sequence and v a vowel sequence, and <..> indicates arbitrary
+      presence,
+         <c><v>       gives 0
+         <c>vc<v>     gives 1
+         <c>vcvc<v>   gives 2
+         <c>vcvcvc<v> gives 3
+         ....
+   */
+   private final int m()
+   {  int n = 0;
+      int i = 0;
+      while(true)
+      {  if (i > j) return n;
+         if (! cons(i)) break; i++;
+      }
+      i++;
+      while(true)
+      {  while(true)
+         {  if (i > j) return n;
+               if (cons(i)) break;
+               i++;
+         }
+         i++;
+         n++;
+         while(true)
+         {  if (i > j) return n;
+            if (! cons(i)) break;
+            i++;
+         }
+         i++;
+       }
+   }
+   /* vowelinstem() is true <=> 0,...j contains a vowel */
+   private final boolean vowelinstem()
+   {  int i; for (i = 0; i <= j; i++) if (! cons(i)) return true;
+      return false;
+   }
+   /* doublec(j) is true <=> j,(j-1) contain a double consonant. */
+   private final boolean doublec(int j)
+   {  if (j < 1) return false;
+      if (b[j] != b[j-1]) return false;
+      return cons(j);
+   }
+   /* cvc(i) is true <=> i-2,i-1,i has the form consonant - vowel - consonant
+      and also if the second c is not w,x or y. this is used when trying to
+      restore an e at the end of a short word. e.g.
+         cav(e), lov(e), hop(e), crim(e), but
+         snow, box, tray.
+   */
+   private final boolean cvc(int i)
+   {  if (i < 2 || !cons(i) || cons(i-1) || !cons(i-2)) return false;
+      {  int ch = b[i];
+         if (ch == 'w' || ch == 'x' || ch == 'y') return false;
+      }
+      return true;
+   }
+   private final boolean ends(String s)
+   {  int l = s.length();
+      int o = k-l+1;
+      if (o < 0) return false;
+      for (int i = 0; i < l; i++) if (b[o+i] != s.charAt(i)) return false;
+      j = k-l;
+      return true;
+   }
+   /* setto(s) sets (j+1),...k to the characters in the string s, readjusting
+      k. */
+   private final void setto(String s)
+   {  int l = s.length();
+      int o = j+1;
+      for (int i = 0; i < l; i++) b[o+i] = s.charAt(i);
+      k = j+l;
+   }
+   /* r(s) is used further down. */
+   private final void r(String s) { if (m() > 0) setto(s); }
+   /* step1() gets rid of plurals and -ed or -ing. e.g.
+          caresses  ->  caress
+          ponies    ->  poni
+          ties      ->  ti
+          caress    ->  caress
+          cats      ->  cat
+          feed      ->  feed
+          agreed    ->  agree
+          disabled  ->  disable
+          matting   ->  mat
+          mating    ->  mate
+          meeting   ->  meet
+          milling   ->  mill
+          messing   ->  mess
+          meetings  ->  meet
+   */
+   private final void step1()
+   {  if (b[k] == 's')
+      {  if (ends("sses")) k -= 2; else
+         if (ends("ies")) setto("i"); else
+         if (b[k-1] != 's') k--;
+      }
+      if (ends("eed")) { if (m() > 0) k--; } else
+      if ((ends("ed") || ends("ing")) && vowelinstem())
+      {  k = j;
+         if (ends("at")) setto("ate"); else
+         if (ends("bl")) setto("ble"); else
+         if (ends("iz")) setto("ize"); else
+         if (doublec(k))
+         {  k--;
+            {  int ch = b[k];
+               if (ch == 'l' || ch == 's' || ch == 'z') k++;
+            }
+         }
+         else if (m() == 1 && cvc(k)) setto("e");
+     }
+   }
+   /* step2() turns terminal y to i when there is another vowel in the stem. */
+   private final void step2() { if (ends("y") && vowelinstem()) b[k] = 'i'; }
+   /* step3() maps double suffices to single ones. so -ization ( = -ize plus
+      -ation) maps to -ize etc. note that the string before the suffix must give
+      m() > 0. */
+   private final void step3() { if (k == 0) return; /* For Bug 1 */ switch (b[k-1])
+   {
+       case 'a': if (ends("ational")) { r("ate"); break; }
+                 if (ends("tional")) { r("tion"); break; }
+                 break;
+       case 'c': if (ends("enci")) { r("ence"); break; }
+                 if (ends("anci")) { r("ance"); break; }
+                 break;
+       case 'e': if (ends("izer")) { r("ize"); break; }
+                 break;
+       case 'l': if (ends("bli")) { r("ble"); break; }
+                 if (ends("alli")) { r("al"); break; }
+                 if (ends("entli")) { r("ent"); break; }
+                 if (ends("eli")) { r("e"); break; }
+                 if (ends("ousli")) { r("ous"); break; }
+                 break;
+       case 'o': if (ends("ization")) { r("ize"); break; }
+                 if (ends("ation")) { r("ate"); break; }
+                 if (ends("ator")) { r("ate"); break; }
+                 break;
+       case 's': if (ends("alism")) { r("al"); break; }
+                 if (ends("iveness")) { r("ive"); break; }
+                 if (ends("fulness")) { r("ful"); break; }
+                 if (ends("ousness")) { r("ous"); break; }
+                 break;
+       case 't': if (ends("aliti")) { r("al"); break; }
+                 if (ends("iviti")) { r("ive"); break; }
+                 if (ends("biliti")) { r("ble"); break; }
+                 break;
+       case 'g': if (ends("logi")) { r("log"); break; }
+   } }
+   /* step4() deals with -ic-, -full, -ness etc. similar strategy to step3. */
+   private final void step4() { switch (b[k])
+   {
+       case 'e': if (ends("icate")) { r("ic"); break; }
+                 if (ends("ative")) { r(""); break; }
+                 if (ends("alize")) { r("al"); break; }
+                 break;
+       case 'i': if (ends("iciti")) { r("ic"); break; }
+                 break;
+       case 'l': if (ends("ical")) { r("ic"); break; }
+                 if (ends("ful")) { r(""); break; }
+                 break;
+       case 's': if (ends("ness")) { r(""); break; }
+                 break;
+   } }
+   /* step5() takes off -ant, -ence etc., in context <c>vcvc<v>. */
+   private final void step5()
+   {   if (k == 0) return; /* for Bug 1 */ switch (b[k-1])
+       {  case 'a': if (ends("al")) break; return;
+          case 'c': if (ends("ance")) break;
+                    if (ends("ence")) break; return;
+          case 'e': if (ends("er")) break; return;
+          case 'i': if (ends("ic")) break; return;
+          case 'l': if (ends("able")) break;
+                    if (ends("ible")) break; return;
+          case 'n': if (ends("ant")) break;
+                    if (ends("ement")) break;
+                    if (ends("ment")) break;
+                    /* element etc. not stripped before the m */
+                    if (ends("ent")) break; return;
+          case 'o': if (ends("ion") && j >= 0 && (b[j] == 's' || b[j] == 't')) break;
+                                    /* j >= 0 fixes Bug 2 */
+                    if (ends("ou")) break; return;
+                    /* takes care of -ous */
+          case 's': if (ends("ism")) break; return;
+          case 't': if (ends("ate")) break;
+                    if (ends("iti")) break; return;
+          case 'u': if (ends("ous")) break; return;
+          case 'v': if (ends("ive")) break; return;
+          case 'z': if (ends("ize")) break; return;
+          default: return;
+       }
+       if (m() > 1) k = j;
+   }
+   /* step6() removes a final -e if m() > 1. */
+   private final void step6()
+   {  j = k;
+      if (b[k] == 'e')
+      {  int a = m();
+         if (a > 1 || a == 1 && !cvc(k-1)) k--;
+      }
+      if (b[k] == 'l' && doublec(k) && m() > 1) k--;
+   }
+   /** Stem the word placed into the Stemmer buffer through calls to add().
+    * This method returns nothing; retrieve the result afterwards with
+    * getResultLength()/getResultBuffer() or toString().
+    * Words of one or two characters are left unchanged.  The add() offset
+    * is reset, so the next add() starts a new word in the same buffer.
+    */
+   public void stem()
+   {  k = i - 1;
+      if (k > 1) { step1(); step2(); step3(); step4(); step5(); step6(); }
+      i_end = k+1; i = 0;
+   }
+   /** Test program for demonstrating the Stemmer.  It reads text from
+    * a list of files, stems each word, and writes the result to standard
+    * output. Note that the word stemmed is expected to be in lower case:
+    * forcing lower case must be done outside the Stemmer class.
+    * Processing stops at the first file that cannot be opened or read.
+    * Usage: Stemmer file-name file-name ...
+    */
+   public static void main(String[] args)
+   {
+      char[] w = new char[501];
+      Stemmer s = new Stemmer();
+      for (int i = 0; i < args.length; i++)
+      {
+         FileInputStream in;
+         try
+         {  in = new FileInputStream(args[i]);
+         }
+         catch (FileNotFoundException e)
+         {  System.out.println("file " + args[i] + " not found");
+            break;
+         }
+         try
+         { while(true)
+           {  int ch = in.read();
+              if (Character.isLetter((char) ch))
+              {
+                 int j = 0;
+                 while(true)
+                 {  ch = Character.toLowerCase((char) ch);
+                    w[j] = (char) ch;
+                    /* words longer than 500 letters keep overwriting w[500] */
+                    if (j < 500) j++;
+                    ch = in.read();
+                    if (!Character.isLetter((char) ch))
+                    {
+                       /* to test add(char ch) */
+                       for (int c = 0; c < j; c++) s.add(w[c]);
+                       /* or, to test add(char[] w, int j) */
+                       /* s.add(w, j); */
+                       s.stem();
+                       {  String u;
+                          /* and now, to test toString() : */
+                          u = s.toString();
+                          /* to test getResultBuffer(), getResultLength() : */
+                          /* u = new String(s.getResultBuffer(), 0, s.getResultLength()); */
+                          System.out.print(u);
+                       }
+                       break;
+                    }
+                 }
+              }
+              if (ch < 0) break;
+              System.out.print((char)ch);
+           }
+         }
+         catch (IOException e)
+         {  System.out.println("error reading " + args[i]);
+            break;
+         }
+         finally
+         {  /* always release the file handle, including on the error path */
+            try { in.close(); }
+            catch (IOException e) { /* stream was only read; nothing to recover */ }
+         }
+      }
+   }
+}

data/jruby-stemmer.gemspec ADDED Viewed

@@ -0,0 +1,26 @@
+# -*- encoding: utf-8 -*-
+lib = File.expand_path('../lib', __FILE__)
+$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
+require 'jruby-stemmer/version'
+Gem::Specification.new do |gem|
+  gem.name          = "jruby-stemmer"
+  gem.version       = JRuby::Stemmer::VERSION
+  gem.authors       = ["Caius Durling"]
+  gem.email         = ["dev@caius.name"]
+  gem.description   = %q{Native java implementation of a string stemming algorithm. JRuby replacement for `fast-stemmer` gem under MRI.}
+  gem.summary       = %q{Fast string stemming in JRuby}
+  gem.homepage      = "https://github.com/caius/jruby-stemmer"
+  gem.files         = `git ls-files`.split($/)
+  gem.executables   = gem.files.grep(%r{^bin/}).map{ |f| File.basename(f) }
+  gem.test_files    = gem.files.grep(%r{^(test|spec|features)/})
+  gem.require_paths = ["lib"]
+  gem.platform = "java"
+  gem.files << "lib/java-stemmer.jar"
+  gem.add_development_dependency "rake-compiler"
+  gem.add_development_dependency "rspec"
+end

data/lib/java-stemmer.jar ADDED Viewed

Binary file

data/lib/jruby-stemmer/version.rb ADDED Viewed

@@ -0,0 +1,5 @@
+module JRuby
+  module Stemmer
+    VERSION = "0.0.1"
+  end
+end

data/lib/jruby-stemmer.rb ADDED Viewed

@@ -0,0 +1,26 @@
+require "jruby-stemmer/version"
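+# Mixes in String#stem, backed by the Java Porter stemmer packaged in java-stemmer.jar.
+# The string is passed to the stemmer unchanged; Stemmer.java documents that words are expected in lower case.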
+module JRuby
+  module Stemmer
+    require "jruby"
+    # include_package 'org.tartarus.martin.porter_stemmer'
+    require_relative "java-stemmer"
+    def self.stem string
+      stemmer = Java::OrgTartarusMartinPorter_Stemmer::Stemmer.new
+      java_string = string.to_java_string
+      stemmer.add java_string.toCharArray, java_string.length
+      stemmer.stem
+      stemmer.to_string
+    end
+    module StringStem
+      def stem
+        JRuby::Stemmer.stem(self)
+      end
+    end
+    String.__send__ :include, StringStem
+  end
+end

data/spec/stemmer_spec.rb ADDED Viewed

@@ -0,0 +1,17 @@
+require "jruby-stemmer"
+describe JRuby::Stemmer do
+  it "stems a string" do
+    result = JRuby::Stemmer.stem("apple")
+    expect(result).to be == "appl"
+  end
+end
+describe "String" do
+  it "has a #stem method" do
+    expect("").to respond_to(:stem)
+  end
+  it "stems a string" do
+    expect("apple".stem).to be == "appl"
+  end
+end

metadata ADDED Viewed

@@ -0,0 +1,95 @@
+--- !ruby/object:Gem::Specification
+name: jruby-stemmer
+version: !ruby/object:Gem::Version
+  prerelease:
+  version: 0.0.1
+platform: java
+authors:
+- Caius Durling
+autorequire:
+bindir: bin
+cert_chain: []
+date: 2013-05-01 00:00:00.000000000 Z
+dependencies:
+- !ruby/object:Gem::Dependency
+  name: rake-compiler
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: !binary |-
+          MA==
+    none: false
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: !binary |-
+          MA==
+    none: false
+  prerelease: false
+  type: :development
+- !ruby/object:Gem::Dependency
+  name: rspec
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: !binary |-
+          MA==
+    none: false
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: !binary |-
+          MA==
+    none: false
+  prerelease: false
+  type: :development
+description: Native java implementation of a string stemming algorithm. JRuby replacement for `fast-stemmer` gem under MRI.
+email:
+- dev@caius.name
+executables: []
+extensions: []
+extra_rdoc_files: []
+files:
+- ".gitignore"
+- Gemfile
+- LICENSE.txt
+- README.md
+- Rakefile
+- ext/java-stemmer/Stemmer.java
+- jruby-stemmer.gemspec
+- lib/java-stemmer.jar
+- lib/jruby-stemmer.rb
+- lib/jruby-stemmer/version.rb
+- spec/stemmer_spec.rb
+homepage: https://github.com/caius/jruby-stemmer
+licenses: []
+post_install_message:
+rdoc_options: []
+require_paths:
+- lib
+required_ruby_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      version: !binary |-
+        MA==
+  none: false
+required_rubygems_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      version: !binary |-
+        MA==
+  none: false
+requirements: []
+rubyforge_project:
+rubygems_version: 1.8.24
+signing_key:
+specification_version: 3
+summary: Fast string stemming in JRuby
+test_files:
+- spec/stemmer_spec.rb