RubyGems - lemmatizer - Versions diffs - 0.1.0 → 0.1.1 - Mend

lemmatizer 0.1.0 → 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (11)

checksums.yaml +15 -0
data/LICENSE.txt +1 -1
data/README.md +38 -24
data/lemmatizer.gemspec +15 -11
data/lib/lemmatizer.rb +8 -113
data/lib/lemmatizer/core_ext.rb +9 -0
data/lib/lemmatizer/lemmatizer.rb +152 -0
data/lib/lemmatizer/version.rb +4 -2
data/spec/lemmatizer_spec.rb +6 -8
data/spec/spec_helper.rb +1 -4
metadata +17 -13

checksums.yaml ADDED

@@ -0,0 +1,15 @@
+---
+!binary "U0hBMQ==":
+  metadata.gz: !binary |-
+    YmRjOGYyZWE4ZWJjNWJjYWU3MzFkY2M5MjU1OTM4MzMxOGM1OGYwYw==
+  data.tar.gz: !binary |-
+    ZjRiZGQ1NjI1MzU2NTEyM2JmMzg0NGZiNDI2ZGRiMzExNmNlNDllNw==
+SHA512:
+  metadata.gz: !binary |-
+    NTM0YThiNDVhYWVlYjZkNDZlZGNmMTg2OTYxMDE0ZDYwNWM0NWE5MGE2YjA5
+    ODZmYTM2YmE1MmM5MGJhODUzOWUxYmQzYTcwMzBhMmIxNmRiOTEwOGRmOWFk
+    YmNmNzc1YWI5ZDMzMDk2NDBhNmExNTUyZDgwYTJhZjFlOTZkN2Y=
+  data.tar.gz: !binary |-
+    ZmFlNWQ2OWYzYTBmMjM1MmVlOThlMWNlMTIwNjAwYjgwMzYxNWM0YmUzMThj
+    YjJiODA3NGNmOTk0MzQ4ZmY2YTc2ODM1YmJhMzgxOTQ1ZmEzNTY4ZDNkMDky
+    YTdmYTk4NDY5MzAzZjk2M2ZhY2RmOTJjZDQwMmY3ODE5N2ViOTY=

data/LICENSE.txt CHANGED

@@ -1,4 +1,4 @@
-Copyright (c) 2012 Yoichiro Hasebe
+Copyright (c) 2012-2013 Yoichiro Hasebe
 MIT License

data/README.md CHANGED

@@ -1,40 +1,54 @@
 lemmatizer
 ==========
 Lemmatizer for text in English.  Inspired by Python's [nltk.corpus.reader.wordnet.morphy](http://nltk.googlecode.com/svn/trunk/doc/api/nltk.corpus.reader.wordnet-pysrc.html#WordNetCorpusReader.morphy) package.
 Based on code posted by mtbr at his blog entry [WordNet-based lemmatizer](http://d.hatena.ne.jp/mtbr/20090303/prfrnlprubyWordNetbasedlemmatizer)
 Installation
 ------------
     sudo gem install lemmatizer
 Usage
 -----
-    require "lemmatizer"
-    lem = Lemmatizer.new
-    p lem.lemma("dogs",    :noun ) # => "dog"
-    p lem.lemma("hired",   :verb ) # => "hire"
-    p lem.lemma("hotter",  :adj  ) # => "hot"
-    p lem.lemma("better",  :adv  ) # => "well"
-	# when part-of-speech symbol is not specified as the second argument, lemmatizer tries :verb, :noun, :adj, or :adv one by one in this order.
-	p lem.lemma("fired")           # => "fire"
-	p lem.lemma("slow")            # => "slow"
+```ruby
+require "lemmatizer"
+lem = Lemmatizer.new
+p lem.lemma("dogs",    :noun ) # => "dog"
+p lem.lemma("hired",   :verb ) # => "hire"
+p lem.lemma("hotter",  :adj  ) # => "hot"
+p lem.lemma("better",  :adv  ) # => "well"
+# when part-of-speech symbol is not specified as the second argument,
+# lemmatizer tries :verb, :noun, :adj, and :adv one by one in this order.
+p lem.lemma("fired")           # => "fire"
+p lem.lemma("slow")            # => "slow"
+```
 Limitations
 -----------
-    # Lemmatizer leaves alone words that its dictionary does not contain.  This keeps proper names such as "James" intact.
-    p lem.lemma("MacBooks", :noun) # => "MacBooks"
-	# If an inflected form of word is included as a lemma in the word list, lemmatizer may not give the expected result.
-    p lem.lemma("higher", :adj) # => "higher" not "high"!
-	# The above has to happen because "higher" is itself an entry word listed in dict/index.adj .
-	# Modify dict/index.{noun|verb|adj|adv} if necessary.
+```ruby
+# Lemmatizer leaves alone words that its dictionary does not contain.
+# This keeps proper names such as "James" intact.
+p lem.lemma("MacBooks", :noun) # => "MacBooks"
+# If an inflected form is included as a lemma in the word index,
+# lemmatizer may not give an expected result.
+p lem.lemma("higher", :adj) # => "higher" not "high"!
+# The above has to happen because "higher" is itself an entry word listed in dict/index.adj .
+# Modify dict/index.{noun|verb|adj|adv} if necessary.
+```
+Author
+------
+* Yoichiro Hasebe <yohasebe@gmail.com>
+Thanks for assistance and contributions:
+* Vladimir Ivic <http://vladimirivic.com>
+License
+-------
+Licensed under the MIT license.

data/lemmatizer.gemspec CHANGED

@@ -1,21 +1,25 @@
 # -*- encoding: utf-8 -*-
 lib = File.expand_path('../lib', __FILE__)
 $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
 require 'lemmatizer/version'
 Gem::Specification.new do |gem|
-  gem.name          = "lemmatizer"
+  gem.name          = 'lemmatizer'
   gem.version       = Lemmatizer::VERSION
-  gem.authors       = ["Yoichiro Hasebe"]
-  gem.email         = ["yohasebe@gmail.com"]
-  gem.description   = %q{Lemmatizer for text in English.  Inspired by Python's nltk.corpus.reader.wordnet.morphy package.}
-  gem.summary       = %q{Englsh lemmatizer in Ruby}
-  gem.homepage      = "http://github.com/yohasebe/lemmatizer"
+  gem.authors       = ['Yoichiro Hasebe']
+  gem.email         = ['yohasebe@gmail.com']
+  gem.description   = %q(
+    Lemmatizer for text in English. Inspired by Python's nltk.corpus.reader.wordnet.morphy package.
+  )
+  gem.summary       = 'Englsh lemmatizer in Ruby'
+  gem.homepage      = 'http://github.com/yohasebe/lemmatizer'
+  gem.licenses      = ['MIT']
   gem.files         = `git ls-files`.split($/)
-  gem.executables   = gem.files.grep(%r{^bin/}).map{ |f| File.basename(f) }
-  gem.test_files    = gem.files.grep(%r{^(test|spec|features)/})
-  gem.require_paths = ["lib"]
+  gem.executables   = gem.files.grep(%r(^bin/)).map{ |f| File.basename(f) }
+  gem.test_files    = gem.files.grep(%r(^(test|spec|features)/))
+  gem.require_paths = ['lib']
-  gem.add_development_dependency "rspec"
+  gem.add_development_dependency 'rspec'
 end

data/lib/lemmatizer.rb CHANGED

@@ -1,117 +1,12 @@
-#! /usr/bin/env ruby
 # -*- coding: utf-8; mode: ruby -*-
-# Inspired by nltk.corpus.reader.wordnet.morphy http://nltk.googlecode.com/svn/trunk/doc/api/nltk.corpus.reader.wordnet-pysrc.html#WordNetCorpusReader.morphy
-# Original code posted by mtbr at http://d.hatena.ne.jp/mtbr/20090303/prfrnlprubyWordNetbasedlemmatizer
+require 'stringio'
+require 'lemmatizer/version'
+require 'lemmatizer/core_ext'
+require 'lemmatizer/lemmatizer'
-require "lemmatizer/version"
-require "stringio"
-class String
-	def endwith(s)
-		self =~ /#{s}$/
-	end
+module Lemmatizer
+  def self.new
+    Lemmatizer.new
+  end
 end
-class Lemmatizer
-	current_dir = File.expand_path(File.dirname(__FILE__))
-	WN_FILES = {:noun => [current_dir + "/dict/index.noun", current_dir + "/dict/noun.exc"],
-							:verb => [current_dir + "/dict/index.verb", current_dir + "/dict/verb.exc"],
-							:adj  => [current_dir + "/dict/index.adj", current_dir + "/dict/adj.exc"],
-							:adv  => [current_dir + "/dict/index.adv", current_dir + "/dict/adv.exc"]}
-	MORPHOLOGICAL_SUBSTITUTIONS = {
-		:noun => [['s', ''], ['ses', 's'], ['ves', 'f'], ['xes', 'x'],
-							['zes', 'z'], ['ches', 'ch'], ['shes', 'sh'],
-					    ['men', 'man'], ['ies', 'y']],
-		:verb => [['s', ''], ['ies', 'y'], ['es', 'e'], ['es', ''],
-						  ['ed', 'e'], ['ed', ''], ['ing', 'e'], ['ing', '']],
-		:adj =>  [['er', ''], ['est', ''], ['er', 'e'], ['est', 'e']],
-		:adv =>  []}
-	def initialize(files = WN_FILES)
-		@wordlists = {}
-		@exceptions = {}
-		MORPHOLOGICAL_SUBSTITUTIONS.keys.each do |x|
-			@wordlists[x] = {}
-			@exceptions[x] = {}
-		end
-		if files then
-			files.each_pair do |pos, pair|
-				load_wordnet_files(pos, pair[0], pair[1])
-			end
-		end
-	end
-	def open_file(*args)
-		if args[0].is_a? IO or args[0].is_a? StringIO then
-			yield args[0]
-		else
-			File.open(*args) do |io|
-				yield io
-			end
-		end
-	end
-	def load_wordnet_files(pos, list, exc)
-		open_file(list) do |io|
-			io.each_line do |line|
-				w = line.split(/\s+/)[0]
-				@wordlists[pos][w] = w
-			end
-		end
-		open_file(exc) do |io|
-			io.each_line do |line|
-				w, s = line.split(/\s+/)
-				@exceptions[pos][w] ||= []
-				@exceptions[pos][w] << s
-			end
-		end
-	end
-	def _each_substitutions(form, pos)
-		if lemma = @wordlists[pos][form] then
-			yield lemma
-		end
-		MORPHOLOGICAL_SUBSTITUTIONS[pos].each do |entry|
-			old, new = *entry
-			if form.endwith(old)
-				_each_substitutions(form[0, form.length - old.length] + new, pos) do|x|
-					yield x
-				end
-			end
-		end
-	end
-	def each_lemma(form, pos)
-		if lemma = @exceptions[pos][form] then
-			lemma.each{|x |yield x}
-		end
-		if pos == :noun and form.endwith('ful')
-			each_lemma(form[0, form.length-3], pos) do |x|
-				yield x+'ful'
-			end
-		else
-			_each_substitutions(form, pos) do|x|
-				yield x
-			end
-		end
-	end
-	def lemma(form, pos = nil)
-    if !pos
-      [:verb, :noun, :adj, :adv].each do |p|
-        result = lemma(form, p)
-        return result unless result == form
-      end
-      return form
-    end
-		each_lemma(form, pos) do |x|
-			return x
-		end
-		return form
-	end
-end

data/lib/lemmatizer/core_ext.rb ADDED

@@ -0,0 +1,9 @@
+# -*- coding: utf-8; mode: ruby -*-
+module Lematizer
+  class ::String
+    def endwith(s)
+      self =~ /#{s}$/
+    end
+  end
+end

data/lib/lemmatizer/lemmatizer.rb ADDED

@@ -0,0 +1,152 @@
+# -*- coding: utf-8; mode: ruby -*-
+module Lemmatizer
+  class Lemmatizer
+    DATA_DIR = File.expand_path('..', File.dirname(__FILE__))
+    WN_FILES = {
+      :noun => [
+        DATA_DIR + '/dict/index.noun',
+        DATA_DIR + '/dict/noun.exc'
+      ],
+      :verb => [
+        DATA_DIR + '/dict/index.verb',
+        DATA_DIR + '/dict/verb.exc'
+      ],
+      :adj  => [
+        DATA_DIR + '/dict/index.adj',
+        DATA_DIR + '/dict/adj.exc'
+      ],
+      :adv  => [
+        DATA_DIR + '/dict/index.adv',
+        DATA_DIR + '/dict/adv.exc'
+      ]
+    }
+    MORPHOLOGICAL_SUBSTITUTIONS = {
+      :noun => [
+        ['s',    ''   ],
+        ['ses',  's'  ],
+        ['ves',  'f'  ],
+        ['xes',  'x'  ],
+        ['zes',  'z'  ],
+        ['ches', 'ch' ],
+        ['shes', 'sh' ],
+        ['men',  'man'],
+        ['ies',  'y'  ]
+      ],
+      :verb => [
+        ['s',   '' ],
+        ['ies', 'y'],
+        ['es',  'e'],
+        ['es',  '' ],
+        ['ed',  'e'],
+        ['ed',  '' ],
+        ['ing', 'e'],
+        ['ing', '' ]
+      ],
+      :adj =>  [
+        ['er',  '' ],
+        ['est', '' ],
+        ['er',  'e'],
+        ['est', 'e']
+      ],
+      :adv =>  [
+      ]
+    }
+    def initialize(files = WN_FILES)
+      @wordlists  = {}
+      @exceptions = {}
+      MORPHOLOGICAL_SUBSTITUTIONS.keys.each do |x|
+        @wordlists[x]  = {}
+        @exceptions[x] = {}
+      end
+      if files
+        files.each_pair do |pos, pair|
+          load_wordnet_files(pos, pair[0], pair[1])
+        end
+      end
+    end
+    def lemma(form, pos = nil)
+      unless pos
+        [:verb, :noun, :adj, :adv].each do |p|
+          result = lemma(form, p)
+          return result unless result == form
+        end
+        return form
+      end
+      each_lemma(form, pos) do |x|
+        return x
+      end
+      form
+    end
+    private
+    def open_file(*args)
+      if args[0].is_a? IO or args[0].is_a? StringIO
+        yield args[0]
+      else
+        File.open(*args) do |io|
+          yield io
+        end
+      end
+    end
+    def load_wordnet_files(pos, list, exc)
+      open_file(list) do |io|
+        io.each_line do |line|
+          w = line.split(/\s+/)[0]
+          @wordlists[pos][w] = w
+        end
+      end
+      open_file(exc) do |io|
+        io.each_line do |line|
+          w, s = line.split(/\s+/)
+          @exceptions[pos][w] ||= []
+          @exceptions[pos][w] << s
+        end
+      end
+    end
+    def each_substitutions(form, pos)
+      if lemma = @wordlists[pos][form]
+        yield lemma
+      end
+      MORPHOLOGICAL_SUBSTITUTIONS[pos].each do |entry|
+        old, new = *entry
+        if form.endwith(old)
+          each_substitutions(form[0, form.length - old.length] + new, pos) do |x|
+            yield x
+          end
+        end
+      end
+    end
+    def each_lemma(form, pos)
+      if lemma = @exceptions[pos][form]
+        lemma.each { |x| yield x }
+      end
+      if pos == :noun && form.endwith('ful')
+        each_lemma(form[0, form.length-3], pos) do |x|
+          yield x + 'ful'
+        end
+      else
+      each_substitutions(form, pos) do|x|
+          yield x
+        end
+      end
+    end
+  end
+end

data/lib/lemmatizer/version.rb CHANGED

@@ -1,3 +1,5 @@
-class Lemmatizer
-  VERSION = "0.1.0"
+# -*- coding: utf-8; mode: ruby -*-
+module Lemmatizer
+  VERSION = '0.1.1'
 end

data/spec/lemmatizer_spec.rb CHANGED

@@ -1,18 +1,14 @@
-#!/usr/bin/env ruby
 # -*- coding: utf-8 -*-
-require File.expand_path(File.dirname(__FILE__) + '/spec_helper')
+require 'spec_helper'
 require 'lemmatizer'
 describe "Lemmatizer" do
-	it "contains lemmatizing functions:" do
-	end
 	before do
 		@lemmatizer = Lemmatizer.new
 	end
-	describe "lemma" do
+	describe "#lemma" do
 		it "takes a word form and its part-of-speech symbol (:noun, :verb, :adj, :adv) and then returns its lemma form" do
 			result_n1 = @lemmatizer.lemma("analyses", :noun)
 			result_n1.should == "analysis"
@@ -55,6 +51,9 @@ describe "Lemmatizer" do
 			result_3 = @lemmatizer.lemma("higher")
 			result_3.should_not == "high" # since 'higher' is itself contained in the adj list.
+			result_2 = @lemmatizer.lemma("asdfassda") # non-existing word
+			result_2.should == "asdfassda"
       # test cases for words used in README
 			result_t1 = @lemmatizer.lemma("fired")
 			result_t1.should == "fire"
@@ -63,5 +62,4 @@ describe "Lemmatizer" do
 			result_t2.should == "slow"
 		end
 	end
-end
+end

data/spec/spec_helper.rb CHANGED

@@ -1,6 +1,3 @@
-$LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib'))
 require 'rspec'
-RSpec.configure do |config|
-	# see https://github.com/rspec/rspec-core/blob/master/lib/rspec/core/configuration.rb for more infomation
-end
+$LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib'))

metadata CHANGED

@@ -1,29 +1,31 @@
 --- !ruby/object:Gem::Specification
 name: lemmatizer
 version: !ruby/object:Gem::Version
-  version: 0.1.0
-  prerelease:
+  version: 0.1.1
 platform: ruby
 authors:
 - Yoichiro Hasebe
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2012-10-29 00:00:00.000000000 Z
+date: 2013-11-03 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: rspec
-  requirement: &70314483330880 !ruby/object:Gem::Requirement
-    none: false
+  requirement: !ruby/object:Gem::Requirement
     requirements:
     - - ! '>='
       - !ruby/object:Gem::Version
         version: '0'
   type: :development
   prerelease: false
-  version_requirements: *70314483330880
-description: Lemmatizer for text in English.  Inspired by Python's nltk.corpus.reader.wordnet.morphy
-  package.
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - ! '>='
+      - !ruby/object:Gem::Version
+        version: '0'
+description: ! "\n    Lemmatizer for text in English. Inspired by Python's nltk.corpus.reader.wordnet.morphy
+  package.\n  "
 email:
 - yohasebe@gmail.com
 executables: []
@@ -45,32 +47,34 @@ files:
 - lib/dict/noun.exc
 - lib/dict/verb.exc
 - lib/lemmatizer.rb
+- lib/lemmatizer/core_ext.rb
+- lib/lemmatizer/lemmatizer.rb
 - lib/lemmatizer/version.rb
 - spec/lemmatizer_spec.rb
 - spec/spec_helper.rb
 homepage: http://github.com/yohasebe/lemmatizer
-licenses: []
+licenses:
+- MIT
+metadata: {}
 post_install_message:
 rdoc_options: []
 require_paths:
 - lib
 required_ruby_version: !ruby/object:Gem::Requirement
-  none: false
   requirements:
   - - ! '>='
     - !ruby/object:Gem::Version
       version: '0'
 required_rubygems_version: !ruby/object:Gem::Requirement
-  none: false
   requirements:
   - - ! '>='
     - !ruby/object:Gem::Version
       version: '0'
 requirements: []
 rubyforge_project:
-rubygems_version: 1.8.17
+rubygems_version: 2.1.9
 signing_key:
-specification_version: 3
+specification_version: 4
 summary: Englsh lemmatizer in Ruby
 test_files:
 - spec/lemmatizer_spec.rb