RubyGems - lemmatizer - Versions diffs - 0.0.1 → 0.1.0 - Mend

lemmatizer 0.0.1 → 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (8) hide show

data/README.md CHANGED

@@ -1,7 +1,15 @@
 lemmatizer
 ==========
-Lemmatizer for text in English.  Inspired by Python's nltk.corpus.reader.wordnet.morphy package
+Lemmatizer for text in English.  Inspired by Python's [nltk.corpus.reader.wordnet.morphy](http://nltk.googlecode.com/svn/trunk/doc/api/nltk.corpus.reader.wordnet-pysrc.html#WordNetCorpusReader.morphy) package.
+Based on code posted by mtbr at his blog entry [WordNet-based lemmatizer](http://d.hatena.ne.jp/mtbr/20090303/prfrnlprubyWordNetbasedlemmatizer)
+Installation
+------------
+    sudo gem install lemmatizer
 Usage
 -----
@@ -14,7 +22,19 @@ Usage
     p lem.lemma("hired",   :verb ) # => "hire"
     p lem.lemma("hotter",  :adj  ) # => "hot"
     p lem.lemma("better",  :adv  ) # => "well"
+	# When the part-of-speech symbol is not specified as the second argument, lemmatizer tries :verb, :noun, :adj, and :adv one by one, in this order.
+	p lem.lemma("fired")           # => "fire"
+	p lem.lemma("slow")            # => "slow"
+Limitations
+-----------
     # Lemmatizer leaves alone words that its dictionary does not contain.  This keeps proper names such as "James" intact.
     p lem.lemma("MacBooks", :noun) # => "MacBooks"
+	# If an inflected form of a word is itself listed as a lemma in the word list, lemmatizer may not give the expected result.
+    p lem.lemma("higher", :adj) # => "higher" not "high"!
+	# The above has to happen because "higher" is itself an entry word listed in dict/index.adj .
+	# Modify dict/index.{noun|verb|adj|adv} if necessary.

data/lib/dict/index.adj CHANGED

@@ -1,32 +1,3 @@
-  1 This software and database is being provided to you, the LICENSEE, by
-  2 Princeton University under the following license.  By obtaining, using
-  3 and/or copying this software and database, you agree that you have
-  4 read, understood, and will comply with these terms and conditions.:
-  5
-  6 Permission to use, copy, modify and distribute this software and
-  7 database and its documentation for any purpose and without fee or
-  8 royalty is hereby granted, provided that you agree to comply with
-  9 the following copyright notice and statements, including the disclaimer,
-  10 and that the same appear on ALL copies of the software, database and
-  11 documentation, including modifications that you make for internal
-  12 use or for distribution.
-  13
-  14 WordNet 3.0 Copyright 2006 by Princeton University.  All rights reserved.
-  15
-  16 THIS SOFTWARE AND DATABASE IS PROVIDED "AS IS" AND PRINCETON
-  17 UNIVERSITY MAKES NO REPRESENTATIONS OR WARRANTIES, EXPRESS OR
-  18 IMPLIED.  BY WAY OF EXAMPLE, BUT NOT LIMITATION, PRINCETON
-  19 UNIVERSITY MAKES NO REPRESENTATIONS OR WARRANTIES OF MERCHANT-
-  20 ABILITY OR FITNESS FOR ANY PARTICULAR PURPOSE OR THAT THE USE
-  21 OF THE LICENSED SOFTWARE, DATABASE OR DOCUMENTATION WILL NOT
-  22 INFRINGE ANY THIRD PARTY PATENTS, COPYRIGHTS, TRADEMARKS OR
-  23 OTHER RIGHTS.
-  24
-  25 The name of Princeton University or Princeton may not be used in
-  26 advertising or publicity pertaining to distribution of the software
-  27 and/or database.  Title to copyright in this software, database and
-  28 any associated documentation shall at all times remain with
-  29 Princeton University and LICENSEE agrees to preserve same.
 .22-caliber a 1 1 \ 1 0 03146310
 .22-calibre a 1 1 \ 1 0 03146310
 .22_caliber a 1 1 \ 1 0 03146310

data/lib/dict/index.adv CHANGED

@@ -1,32 +1,3 @@
-  1 This software and database is being provided to you, the LICENSEE, by
-  2 Princeton University under the following license.  By obtaining, using
-  3 and/or copying this software and database, you agree that you have
-  4 read, understood, and will comply with these terms and conditions.:
-  5
-  6 Permission to use, copy, modify and distribute this software and
-  7 database and its documentation for any purpose and without fee or
-  8 royalty is hereby granted, provided that you agree to comply with
-  9 the following copyright notice and statements, including the disclaimer,
-  10 and that the same appear on ALL copies of the software, database and
-  11 documentation, including modifications that you make for internal
-  12 use or for distribution.
-  13
-  14 WordNet 3.0 Copyright 2006 by Princeton University.  All rights reserved.
-  15
-  16 THIS SOFTWARE AND DATABASE IS PROVIDED "AS IS" AND PRINCETON
-  17 UNIVERSITY MAKES NO REPRESENTATIONS OR WARRANTIES, EXPRESS OR
-  18 IMPLIED.  BY WAY OF EXAMPLE, BUT NOT LIMITATION, PRINCETON
-  19 UNIVERSITY MAKES NO REPRESENTATIONS OR WARRANTIES OF MERCHANT-
-  20 ABILITY OR FITNESS FOR ANY PARTICULAR PURPOSE OR THAT THE USE
-  21 OF THE LICENSED SOFTWARE, DATABASE OR DOCUMENTATION WILL NOT
-  22 INFRINGE ANY THIRD PARTY PATENTS, COPYRIGHTS, TRADEMARKS OR
-  23 OTHER RIGHTS.
-  24
-  25 The name of Princeton University or Princeton may not be used in
-  26 advertising or publicity pertaining to distribution of the software
-  27 and/or database.  Title to copyright in this software, database and
-  28 any associated documentation shall at all times remain with
-  29 Princeton University and LICENSEE agrees to preserve same.
 'tween r 1 0 1 0 00250898
 'tween_decks r 1 0 1 0 00498293
 a.d. r 1 0 1 0 00001837

data/lib/dict/index.verb CHANGED

@@ -1,4 +1,3 @@
-# lemma  pos  synset_cnt  p_cnt  [ptr_symbol...]  sense_cnt  tagsense_cnt   synset_offset  [synset_offset...]
 aah v 1 1 @ 1 0 00865776
 abacinate v 1 1 @ 1 0 02168378
 abandon v 5 4 @ ~ $ + 5 5 02228031 02227741 02076676 00613393 00614057

data/lib/lemmatizer.rb CHANGED

@@ -1,8 +1,9 @@
 #! /usr/bin/env ruby
 # -*- coding: utf-8; mode: ruby -*-
-# inspired by nltk.corpus.reader.wordnet.morphy
-# http://nltk.googlecode.com/svn/trunk/doc/api/nltk.corpus.reader.wordnet-pysrc.html#WordNetCorpusReader.morphy
+# Inspired by nltk.corpus.reader.wordnet.morphy http://nltk.googlecode.com/svn/trunk/doc/api/nltk.corpus.reader.wordnet-pysrc.html#WordNetCorpusReader.morphy
+# Original code posted by mtbr at http://d.hatena.ne.jp/mtbr/20090303/prfrnlprubyWordNetbasedlemmatizer
 require "lemmatizer/version"
 require "stringio"
@@ -22,11 +23,10 @@ class Lemmatizer
 	MORPHOLOGICAL_SUBSTITUTIONS = {
 		:noun => [['s', ''], ['ses', 's'], ['ves', 'f'], ['xes', 'x'],
-								['zes', 'z'], ['ches', 'ch'], ['shes', 'sh'],
-							 ['men', 'man'], ['ies', 'y']],
+							['zes', 'z'], ['ches', 'ch'], ['shes', 'sh'],
+					    ['men', 'man'], ['ies', 'y']],
 		:verb => [['s', ''], ['ies', 'y'], ['es', 'e'], ['es', ''],
-							 ['ed', 'e'], ['ed', ''], ['ing', 'e'], ['ing', '']],
+						  ['ed', 'e'], ['ed', ''], ['ing', 'e'], ['ing', '']],
 		:adj =>  [['er', ''], ['est', ''], ['er', 'e'], ['est', 'e']],
 		:adv =>  []}
@@ -38,7 +38,7 @@ class Lemmatizer
 			@exceptions[x] = {}
 		end
 		if files then
-			files.each_pair do |pos,pair|
+			files.each_pair do |pos, pair|
 				load_wordnet_files(pos, pair[0], pair[1])
 			end
 		end
@@ -64,19 +64,33 @@ class Lemmatizer
 		open_file(exc) do |io|
 			io.each_line do |line|
-				w,s = line.split(/\s+/)
+				w, s = line.split(/\s+/)
 				@exceptions[pos][w] ||= []
 				@exceptions[pos][w] << s
 			end
 		end
 	end
+	def _each_substitutions(form, pos)
+		if lemma = @wordlists[pos][form] then
+			yield lemma
+		end
+		MORPHOLOGICAL_SUBSTITUTIONS[pos].each do |entry|
+			old, new = *entry
+			if form.endwith(old)
+				_each_substitutions(form[0, form.length - old.length] + new, pos) do|x|
+					yield x
+				end
+			end
+		end
+	end
 	def each_lemma(form, pos)
 		if lemma = @exceptions[pos][form] then
 			lemma.each{|x |yield x}
 		end
 		if pos == :noun and form.endwith('ful')
-			each_lemma(form[0,form.length-3], pos) do |x|
+			each_lemma(form[0, form.length-3], pos) do |x|
 				yield x+'ful'
 			end
 		else
@@ -86,23 +100,18 @@ class Lemmatizer
 		end
 	end
-	def lemma(form,pos)
+	def lemma(form, pos = nil)
+    if !pos
+      [:verb, :noun, :adj, :adv].each do |p|
+        result = lemma(form, p)
+        return result unless result == form
+      end
+      return form
+    end
 		each_lemma(form, pos) do |x|
 			return x
 		end
 		return form
 	end
-	def _each_substitutions(form, pos)
-		if lemma = @wordlists[pos][form] then
-			yield lemma
-		end
-		MORPHOLOGICAL_SUBSTITUTIONS[pos].each do |entry|
-			old, new = *entry
-			if form.endwith(old)
-				_each_substitutions(form[0, form.length - old.length] + new, pos) do|x|
-					yield x
-				end
-			end
-		end
-	end
 end

data/lib/lemmatizer/version.rb CHANGED

@@ -1,3 +1,3 @@
 class Lemmatizer
-  VERSION = "0.0.1"
+  VERSION = "0.1.0"
 end

data/spec/lemmatizer_spec.rb CHANGED

@@ -44,6 +44,23 @@ describe "Lemmatizer" do
 			result_r2 = @lemmatizer.lemma("best", :adv)
 			result_r2.should_not == "good"
+      # Lemmatizer gives a result even when no pos is given, by trying :verb, :noun, :adj, and :adv in that order.
+			result_1 = @lemmatizer.lemma("plays")
+			result_1.should == "play"
+			result_2 = @lemmatizer.lemma("oxen")
+			result_2.should == "ox"
+			result_3 = @lemmatizer.lemma("higher")
+			result_3.should_not == "high" # since 'higher' is itself contained in the adj list.
+      # test cases for words used in README
+			result_t1 = @lemmatizer.lemma("fired")
+			result_t1.should == "fire"
+			result_t2 = @lemmatizer.lemma("slower")
+			result_t2.should == "slow"
 		end
 	end

metadata CHANGED

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: lemmatizer
 version: !ruby/object:Gem::Version
-  version: 0.0.1
+  version: 0.1.0
   prerelease:
 platform: ruby
 authors:
@@ -9,11 +9,11 @@ authors:
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2012-10-28 00:00:00.000000000 Z
+date: 2012-10-29 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: rspec
-  requirement: &70234015642620 !ruby/object:Gem::Requirement
+  requirement: &70314483330880 !ruby/object:Gem::Requirement
     none: false
     requirements:
     - - ! '>='
@@ -21,7 +21,7 @@ dependencies:
         version: '0'
   type: :development
   prerelease: false
-  version_requirements: *70234015642620
+  version_requirements: *70314483330880
 description: Lemmatizer for text in English.  Inspired by Python's nltk.corpus.reader.wordnet.morphy
   package.
 email: