RubyGems - lemmatizer - Versions diffs - 0.1.0 → 0.1.1 - Mend

lemmatizer 0.1.0 → 0.1.1

Files changed (11) hide show

checksums.yaml +15 -0
data/LICENSE.txt +1 -1
data/README.md +38 -24
data/lemmatizer.gemspec +15 -11
data/lib/lemmatizer.rb +8 -113
data/lib/lemmatizer/core_ext.rb +9 -0
data/lib/lemmatizer/lemmatizer.rb +152 -0
data/lib/lemmatizer/version.rb +4 -2
data/spec/lemmatizer_spec.rb +6 -8
data/spec/spec_helper.rb +1 -4
metadata +17 -13

checksums.yaml ADDED

@@ -0,0 +1,15 @@
+---
+!binary "U0hBMQ==":
+  metadata.gz: !binary |-
+    YmRjOGYyZWE4ZWJjNWJjYWU3MzFkY2M5MjU1OTM4MzMxOGM1OGYwYw==
+  data.tar.gz: !binary |-
+    ZjRiZGQ1NjI1MzU2NTEyM2JmMzg0NGZiNDI2ZGRiMzExNmNlNDllNw==
+SHA512:
+  metadata.gz: !binary |-
+    NTM0YThiNDVhYWVlYjZkNDZlZGNmMTg2OTYxMDE0ZDYwNWM0NWE5MGE2YjA5
+    ODZmYTM2YmE1MmM5MGJhODUzOWUxYmQzYTcwMzBhMmIxNmRiOTEwOGRmOWFk
+    YmNmNzc1YWI5ZDMzMDk2NDBhNmExNTUyZDgwYTJhZjFlOTZkN2Y=
+  data.tar.gz: !binary |-
+    ZmFlNWQ2OWYzYTBmMjM1MmVlOThlMWNlMTIwNjAwYjgwMzYxNWM0YmUzMThj
+    YjJiODA3NGNmOTk0MzQ4ZmY2YTc2ODM1YmJhMzgxOTQ1ZmEzNTY4ZDNkMDky
+    YTdmYTk4NDY5MzAzZjk2M2ZhY2RmOTJjZDQwMmY3ODE5N2ViOTY=

data/LICENSE.txt CHANGED

@@ -1,4 +1,4 @@
-Copyright (c) 2012 Yoichiro Hasebe
+Copyright (c) 2012-2013 Yoichiro Hasebe
 MIT License

data/README.md CHANGED

@@ -1,40 +1,54 @@
 lemmatizer
 ==========
 Lemmatizer for text in English.  Inspired by Python's [nltk.corpus.reader.wordnet.morphy](http://nltk.googlecode.com/svn/trunk/doc/api/nltk.corpus.reader.wordnet-pysrc.html#WordNetCorpusReader.morphy) package.
 Based on code posted by mtbr at his blog entry [WordNet-based lemmatizer](http://d.hatena.ne.jp/mtbr/20090303/prfrnlprubyWordNetbasedlemmatizer)
 Installation
 ------------
     sudo gem install lemmatizer
 Usage
 -----
-    require "lemmatizer"
-    lem = Lemmatizer.new
-    p lem.lemma("dogs",    :noun ) # => "dog"
-    p lem.lemma("hired",   :verb ) # => "hire"
-    p lem.lemma("hotter",  :adj  ) # => "hot"
-    p lem.lemma("better",  :adv  ) # => "well"
-	# when part-of-speech symbol is not specified as the second argument, lemmatizer tries :verb, :noun, :adj, or :adv one by one in this order.
-	p lem.lemma("fired")           # => "fire"
-	p lem.lemma("slow")            # => "slow"
+```ruby
+require "lemmatizer"
+lem = Lemmatizer.new
+p lem.lemma("dogs",    :noun ) # => "dog"
+p lem.lemma("hired",   :verb ) # => "hire"
+p lem.lemma("hotter",  :adj  ) # => "hot"
+p lem.lemma("better",  :adv  ) # => "well"
+# when part-of-speech symbol is not specified as the second argument,
+# lemmatizer tries :verb, :noun, :adj, and :adv one by one in this order.
+p lem.lemma("fired")           # => "fire"
+p lem.lemma("slow")            # => "slow"
+```
 Limitations
 -----------
-    # Lemmatizer leaves alone words that its dictionary does not contain.  This keeps proper names such as "James" intact.
-    p lem.lemma("MacBooks", :noun) # => "MacBooks"
-	# If an inflected form of word is included as a lemma in the word list, lemmatizer may not give the expected result.
-    p lem.lemma("higher", :adj) # => "higher" not "high"!
-	# The above has to happen because "higher" is itself an entry word listed in dict/index.adj .
-	# Modify dict/index.{noun|verb|adj|adv} if necessary.
+```ruby
+# Lemmatizer leaves alone words that its dictionary does not contain.
+# This keeps proper names such as "James" intact.
+p lem.lemma("MacBooks", :noun) # => "MacBooks"
+# If an inflected form is included as a lemma in the word index,
+# lemmatizer may not give an expected result.
+p lem.lemma("higher", :adj) # => "higher" not "high"!
+# The above has to happen because "higher" is itself an entry word listed in dict/index.adj .
+# Modify dict/index.{noun|verb|adj|adv} if necessary.
+```
+Author
+------
+* Yoichiro Hasebe <yohasebe@gmail.com>
+Thanks for assistance and contributions:
+* Vladimir Ivic <http://vladimirivic.com>
+License
+-------
+Licensed under the MIT license.

data/lemmatizer.gemspec CHANGED

@@ -1,21 +1,25 @@
 # -*- encoding: utf-8 -*-
 lib = File.expand_path('../lib', __FILE__)
 $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
 require 'lemmatizer/version'
 Gem::Specification.new do |gem|
-  gem.name          = "lemmatizer"
+  gem.name          = 'lemmatizer'
   gem.version       = Lemmatizer::VERSION
-  gem.authors       = ["Yoichiro Hasebe"]
-  gem.email         = ["yohasebe@gmail.com"]
-  gem.description   = %q{Lemmatizer for text in English.  Inspired by Python's nltk.corpus.reader.wordnet.morphy package.}
-  gem.summary       = %q{Englsh lemmatizer in Ruby}
-  gem.homepage      = "http://github.com/yohasebe/lemmatizer"
+  gem.authors       = ['Yoichiro Hasebe']
+  gem.email         = ['yohasebe@gmail.com']
+  gem.description   = %q(
+    Lemmatizer for text in English. Inspired by Python's nltk.corpus.reader.wordnet.morphy package.
+  )
+  gem.summary       = 'Englsh lemmatizer in Ruby'
+  gem.homepage      = 'http://github.com/yohasebe/lemmatizer'
+  gem.licenses      = ['MIT']
   gem.files         = `git ls-files`.split($/)
-  gem.executables   = gem.files.grep(%r{^bin/}).map{ |f| File.basename(f) }
-  gem.test_files    = gem.files.grep(%r{^(test|spec|features)/})
-  gem.require_paths = ["lib"]
+  gem.executables   = gem.files.grep(%r(^bin/)).map{ |f| File.basename(f) }
+  gem.test_files    = gem.files.grep(%r(^(test|spec|features)/))
+  gem.require_paths = ['lib']
-  gem.add_development_dependency "rspec"
+  gem.add_development_dependency 'rspec'
 end

data/lib/lemmatizer.rb CHANGED

@@ -1,117 +1,12 @@
-#! /usr/bin/env ruby
 # -*- coding: utf-8; mode: ruby -*-
-# Inspired by nltk.corpus.reader.wordnet.morphy http://nltk.googlecode.com/svn/trunk/doc/api/nltk.corpus.reader.wordnet-pysrc.html#WordNetCorpusReader.morphy
-# Original code posted by mtbr at http://d.hatena.ne.jp/mtbr/20090303/prfrnlprubyWordNetbasedlemmatizer
+require 'stringio'
+require 'lemmatizer/version'
+require 'lemmatizer/core_ext'
+require 'lemmatizer/lemmatizer'
-require "lemmatizer/version"
-require "stringio"
-class String
-	def endwith(s)
-		self =~ /#{s}$/
-	end
+module Lemmatizer
+  def self.new
+    Lemmatizer.new
+  end
 end
-class Lemmatizer
-	current_dir = File.expand_path(File.dirname(__FILE__))
-	WN_FILES = {:noun => [current_dir + "/dict/index.noun", current_dir + "/dict/noun.exc"],
-							:verb => [current_dir + "/dict/index.verb", current_dir + "/dict/verb.exc"],
-							:adj  => [current_dir + "/dict/index.adj", current_dir + "/dict/adj.exc"],
-							:adv  => [current_dir + "/dict/index.adv", current_dir + "/dict/adv.exc"]}
-	MORPHOLOGICAL_SUBSTITUTIONS = {
-		:noun => [['s', ''], ['ses', 's'], ['ves', 'f'], ['xes', 'x'],
-							['zes', 'z'], ['ches', 'ch'], ['shes', 'sh'],
-					    ['men', 'man'], ['ies', 'y']],
-		:verb => [['s', ''], ['ies', 'y'], ['es', 'e'], ['es', ''],
-						  ['ed', 'e'], ['ed', ''], ['ing', 'e'], ['ing', '']],
-		:adj =>  [['er', ''], ['est', ''], ['er', 'e'], ['est', 'e']],
-		:adv =>  []}
-	def initialize(files = WN_FILES)
-		@wordlists = {}
-		@exceptions = {}
-		MORPHOLOGICAL_SUBSTITUTIONS.keys.each do |x|
-			@wordlists[x] = {}
-			@exceptions[x] = {}
-		end
-		if files then
-			files.each_pair do |pos, pair|
-				load_wordnet_files(pos, pair[0], pair[1])
-			end
-		end
-	end
-	def open_file(*args)
-		if args[0].is_a? IO or args[0].is_a? StringIO then
-			yield args[0]
-		else
-			File.open(*args) do |io|
-				yield io
-			end
-		end
-	end
-	def load_wordnet_files(pos, list, exc)
-		open_file(list) do |io|
-			io.each_line do |line|
-				w = line.split(/\s+/)[0]
-				@wordlists[pos][w] = w
-			end
-		end
-		open_file(exc) do |io|
-			io.each_line do |line|
-				w, s = line.split(/\s+/)
-				@exceptions[pos][w] ||= []
-				@exceptions[pos][w] << s
-			end
-		end
-	end
-	def _each_substitutions(form, pos)
-		if lemma = @wordlists[pos][form] then
-			yield lemma
-		end
-		MORPHOLOGICAL_SUBSTITUTIONS[pos].each do |entry|
-			old, new = *entry
-			if form.endwith(old)
-				_each_substitutions(form[0, form.length - old.length] + new, pos) do|x|
-					yield x
-				end
-			end
-		end
-	end
-	def each_lemma(form, pos)
-		if lemma = @exceptions[pos][form] then
-			lemma.each{|x |yield x}
-		end
-		if pos == :noun and form.endwith('ful')
-			each_lemma(form[0, form.length-3], pos) do |x|
-				yield x+'ful'
-			end
-		else
-			_each_substitutions(form, pos) do|x|
-				yield x
-			end
-		end
-	end
-	def lemma(form, pos = nil)
-    if !pos
-      [:verb, :noun, :adj, :adv].each do |p|
-        result = lemma(form, p)
-        return result unless result == form
-      end
-      return form
-    end
-		each_lemma(form, pos) do |x|
-			return x
-		end
-		return form
-	end
-end

data/lib/lemmatizer/core_ext.rb ADDED

@@ -0,0 +1,9 @@
+# -*- coding: utf-8; mode: ruby -*-
+module Lematizer
+  class ::String
+    def endwith(s)
+      self =~ /#{s}$/
+    end
+  end
+end

data/lib/lemmatizer/lemmatizer.rb ADDED

@@ -0,0 +1,152 @@
+# -*- coding: utf-8; mode: ruby -*-
+module Lemmatizer
+  class Lemmatizer
+    DATA_DIR = File.expand_path('..', File.dirname(__FILE__))
+    WN_FILES = {
+      :noun => [
+        DATA_DIR + '/dict/index.noun',
+        DATA_DIR + '/dict/noun.exc'
+      ],
+      :verb => [
+        DATA_DIR + '/dict/index.verb',
+        DATA_DIR + '/dict/verb.exc'
+      ],
+      :adj  => [
+        DATA_DIR + '/dict/index.adj',
+        DATA_DIR + '/dict/adj.exc'
+      ],
+      :adv  => [
+        DATA_DIR + '/dict/index.adv',
+        DATA_DIR + '/dict/adv.exc'
+      ]
+    }
+    MORPHOLOGICAL_SUBSTITUTIONS = {
+      :noun => [
+        ['s',    ''   ],
+        ['ses',  's'  ],
+        ['ves',  'f'  ],
+        ['xes',  'x'  ],
+        ['zes',  'z'  ],
+        ['ches', 'ch' ],
+        ['shes', 'sh' ],
+        ['men',  'man'],
+        ['ies',  'y'  ]
+      ],
+      :verb => [
+        ['s',   '' ],
+        ['ies', 'y'],
+        ['es',  'e'],
+        ['es',  '' ],
+        ['ed',  'e'],
+        ['ed',  '' ],
+        ['ing', 'e'],
+        ['ing', '' ]
+      ],
+      :adj =>  [
+        ['er',  '' ],
+        ['est', '' ],
+        ['er',  'e'],
+        ['est', 'e']
+      ],
+      :adv =>  [
+      ]
+    }
+    def initialize(files = WN_FILES)
+      @wordlists  = {}
+      @exceptions = {}
+      MORPHOLOGICAL_SUBSTITUTIONS.keys.each do |x|
+        @wordlists[x]  = {}
+        @exceptions[x] = {}
+      end
+      if files
+        files.each_pair do |pos, pair|
+          load_wordnet_files(pos, pair[0], pair[1])
+        end
+      end
+    end
+    def lemma(form, pos = nil)
+      unless pos
+        [:verb, :noun, :adj, :adv].each do |p|
+          result = lemma(form, p)
+          return result unless result == form
+        end
+        return form
+      end
+      each_lemma(form, pos) do |x|
+        return x
+      end
+      form
+    end
+    private
+    def open_file(*args)
+      if args[0].is_a? IO or args[0].is_a? StringIO
+        yield args[0]
+      else
+        File.open(*args) do |io|
+          yield io
+        end
+      end
+    end
+    def load_wordnet_files(pos, list, exc)
+      open_file(list) do |io|
+        io.each_line do |line|
+          w = line.split(/\s+/)[0]
+          @wordlists[pos][w] = w
+        end
+      end
+      open_file(exc) do |io|
+        io.each_line do |line|
+          w, s = line.split(/\s+/)
+          @exceptions[pos][w] ||= []
+          @exceptions[pos][w] << s
+        end
+      end
+    end
+    def each_substitutions(form, pos)
+      if lemma = @wordlists[pos][form]
+        yield lemma
+      end
+      MORPHOLOGICAL_SUBSTITUTIONS[pos].each do |entry|
+        old, new = *entry
+        if form.endwith(old)
+          each_substitutions(form[0, form.length - old.length] + new, pos) do |x|
+            yield x
+          end
+        end
+      end
+    end
+    def each_lemma(form, pos)
+      if lemma = @exceptions[pos][form]
+        lemma.each { |x| yield x }
+      end
+      if pos == :noun && form.endwith('ful')
+        each_lemma(form[0, form.length-3], pos) do |x|
+          yield x + 'ful'
+        end
+      else
+      each_substitutions(form, pos) do|x|
+          yield x
+        end
+      end
+    end
+  end
+end

data/lib/lemmatizer/version.rb CHANGED

@@ -1,3 +1,5 @@
-class Lemmatizer
-  VERSION = "0.1.0"
+# -*- coding: utf-8; mode: ruby -*-
+module Lemmatizer
+  VERSION = '0.1.1'
 end

data/spec/lemmatizer_spec.rb CHANGED

@@ -1,18 +1,14 @@
-#!/usr/bin/env ruby
 # -*- coding: utf-8 -*-
-require File.expand_path(File.dirname(__FILE__) + '/spec_helper')
+require 'spec_helper'
 require 'lemmatizer'
 describe "Lemmatizer" do
-	it "contains lemmatizing functions:" do
-	end
 	before do
 		@lemmatizer = Lemmatizer.new
 	end
-	describe "lemma" do
+	describe "#lemma" do
 		it "takes a word form and its part-of-speech symbol (:noun, :verb, :adj, :adv) and then returns its lemma form" do
 			result_n1 = @lemmatizer.lemma("analyses", :noun)
 			result_n1.should == "analysis"
@@ -55,6 +51,9 @@ describe "Lemmatizer" do
 			result_3 = @lemmatizer.lemma("higher")
 			result_3.should_not == "high" # since 'higher' is itself contained in the adj list.
+			result_2 = @lemmatizer.lemma("asdfassda") # non-existing word
+			result_2.should == "asdfassda"
       # test cases for words used in README
 			result_t1 = @lemmatizer.lemma("fired")
 			result_t1.should == "fire"
@@ -63,5 +62,4 @@ describe "Lemmatizer" do
 			result_t2.should == "slow"
 		end
 	end
-end
+end

data/spec/spec_helper.rb CHANGED

@@ -1,6 +1,3 @@
-$LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib'))
 require 'rspec'
-RSpec.configure do |config|
-	# see https://github.com/rspec/rspec-core/blob/master/lib/rspec/core/configuration.rb for more infomation
-end
+$LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib'))

metadata CHANGED

@@ -1,29 +1,31 @@
 --- !ruby/object:Gem::Specification
 name: lemmatizer
 version: !ruby/object:Gem::Version
-  version: 0.1.0
-  prerelease:
+  version: 0.1.1
 platform: ruby
 authors:
 - Yoichiro Hasebe
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2012-10-29 00:00:00.000000000 Z
+date: 2013-11-03 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: rspec
-  requirement: &70314483330880 !ruby/object:Gem::Requirement
-    none: false
+  requirement: !ruby/object:Gem::Requirement
     requirements:
     - - ! '>='
       - !ruby/object:Gem::Version
         version: '0'
   type: :development
   prerelease: false
-  version_requirements: *70314483330880
-description: Lemmatizer for text in English.  Inspired by Python's nltk.corpus.reader.wordnet.morphy
-  package.
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - ! '>='
+      - !ruby/object:Gem::Version
+        version: '0'
+description: ! "\n    Lemmatizer for text in English. Inspired by Python's nltk.corpus.reader.wordnet.morphy
+  package.\n  "
 email:
 - yohasebe@gmail.com
 executables: []
@@ -45,32 +47,34 @@ files:
 - lib/dict/noun.exc
 - lib/dict/verb.exc
 - lib/lemmatizer.rb
+- lib/lemmatizer/core_ext.rb
+- lib/lemmatizer/lemmatizer.rb
 - lib/lemmatizer/version.rb
 - spec/lemmatizer_spec.rb
 - spec/spec_helper.rb
 homepage: http://github.com/yohasebe/lemmatizer
-licenses: []
+licenses:
+- MIT
+metadata: {}
 post_install_message:
 rdoc_options: []
 require_paths:
 - lib
 required_ruby_version: !ruby/object:Gem::Requirement
-  none: false
   requirements:
   - - ! '>='
     - !ruby/object:Gem::Version
       version: '0'
 required_rubygems_version: !ruby/object:Gem::Requirement
-  none: false
   requirements:
   - - ! '>='
     - !ruby/object:Gem::Version
       version: '0'
 requirements: []
 rubyforge_project:
-rubygems_version: 1.8.17
+rubygems_version: 2.1.9
 signing_key:
-specification_version: 3
+specification_version: 4
 summary: Englsh lemmatizer in Ruby
 test_files:
 - spec/lemmatizer_spec.rb