RubyGems - lemmatizer - Versions diffs - 0.0.1 → 0.1.0 - Mend

lemmatizer 0.0.1 → 0.1.0

Files changed (8) hide show

data/README.md CHANGED

@@ -1,7 +1,15 @@
 lemmatizer
 ==========
-Lemmatizer for text in English.  Inspired by Python's nltk.corpus.reader.wordnet.morphy package
+Lemmatizer for text in English.  Inspired by Python's [nltk.corpus.reader.wordnet.morphy](http://nltk.googlecode.com/svn/trunk/doc/api/nltk.corpus.reader.wordnet-pysrc.html#WordNetCorpusReader.morphy) package.
+Based on code posted by mtbr at his blog entry [WordNet-based lemmatizer](http://d.hatena.ne.jp/mtbr/20090303/prfrnlprubyWordNetbasedlemmatizer)
+Installation
+------------
+    sudo gem install lemmatizer
 Usage
 -----
@@ -14,7 +22,19 @@ Usage
     p lem.lemma("hired",   :verb ) # => "hire"
     p lem.lemma("hotter",  :adj  ) # => "hot"
     p lem.lemma("better",  :adv  ) # => "well"
+	# When the part-of-speech symbol is not specified as the second argument, lemmatizer tries :verb, :noun, :adj, and :adv one by one in this order.
+	p lem.lemma("fired")           # => "fire"
+	p lem.lemma("slow")            # => "slow"
+Limitations
+-----------
     # Lemmatizer leaves alone words that its dictionary does not contain.  This keeps proper names such as "James" intact.
     p lem.lemma("MacBooks", :noun) # => "MacBooks"
+	# If an inflected form of a word is included as a lemma in the word list, lemmatizer may not give the expected result.
+    p lem.lemma("higher", :adj) # => "higher" not "high"!
+	# The above has to happen because "higher" is itself an entry word listed in dict/index.adj .
+	# Modify dict/index.{noun|verb|adj|adv} if necessary.

data/lib/dict/index.adj CHANGED

@@ -1,32 +1,3 @@
-  1 This software and database is being provided to you, the LICENSEE, by
-  2 Princeton University under the following license.  By obtaining, using
-  3 and/or copying this software and database, you agree that you have
-  4 read, understood, and will comply with these terms and conditions.:
-  5
-  6 Permission to use, copy, modify and distribute this software and
-  7 database and its documentation for any purpose and without fee or
-  8 royalty is hereby granted, provided that you agree to comply with
-  9 the following copyright notice and statements, including the disclaimer,
-  10 and that the same appear on ALL copies of the software, database and
-  11 documentation, including modifications that you make for internal
-  12 use or for distribution.
-  13
-  14 WordNet 3.0 Copyright 2006 by Princeton University.  All rights reserved.
-  15
-  16 THIS SOFTWARE AND DATABASE IS PROVIDED "AS IS" AND PRINCETON
-  17 UNIVERSITY MAKES NO REPRESENTATIONS OR WARRANTIES, EXPRESS OR
-  18 IMPLIED.  BY WAY OF EXAMPLE, BUT NOT LIMITATION, PRINCETON
-  19 UNIVERSITY MAKES NO REPRESENTATIONS OR WARRANTIES OF MERCHANT-
-  20 ABILITY OR FITNESS FOR ANY PARTICULAR PURPOSE OR THAT THE USE
-  21 OF THE LICENSED SOFTWARE, DATABASE OR DOCUMENTATION WILL NOT
-  22 INFRINGE ANY THIRD PARTY PATENTS, COPYRIGHTS, TRADEMARKS OR
-  23 OTHER RIGHTS.
-  24
-  25 The name of Princeton University or Princeton may not be used in
-  26 advertising or publicity pertaining to distribution of the software
-  27 and/or database.  Title to copyright in this software, database and
-  28 any associated documentation shall at all times remain with
-  29 Princeton University and LICENSEE agrees to preserve same.
 .22-caliber a 1 1 \ 1 0 03146310
 .22-calibre a 1 1 \ 1 0 03146310
 .22_caliber a 1 1 \ 1 0 03146310

data/lib/dict/index.adv CHANGED

@@ -1,32 +1,3 @@
-  1 This software and database is being provided to you, the LICENSEE, by
-  2 Princeton University under the following license.  By obtaining, using
-  3 and/or copying this software and database, you agree that you have
-  4 read, understood, and will comply with these terms and conditions.:
-  5
-  6 Permission to use, copy, modify and distribute this software and
-  7 database and its documentation for any purpose and without fee or
-  8 royalty is hereby granted, provided that you agree to comply with
-  9 the following copyright notice and statements, including the disclaimer,
-  10 and that the same appear on ALL copies of the software, database and
-  11 documentation, including modifications that you make for internal
-  12 use or for distribution.
-  13
-  14 WordNet 3.0 Copyright 2006 by Princeton University.  All rights reserved.
-  15
-  16 THIS SOFTWARE AND DATABASE IS PROVIDED "AS IS" AND PRINCETON
-  17 UNIVERSITY MAKES NO REPRESENTATIONS OR WARRANTIES, EXPRESS OR
-  18 IMPLIED.  BY WAY OF EXAMPLE, BUT NOT LIMITATION, PRINCETON
-  19 UNIVERSITY MAKES NO REPRESENTATIONS OR WARRANTIES OF MERCHANT-
-  20 ABILITY OR FITNESS FOR ANY PARTICULAR PURPOSE OR THAT THE USE
-  21 OF THE LICENSED SOFTWARE, DATABASE OR DOCUMENTATION WILL NOT
-  22 INFRINGE ANY THIRD PARTY PATENTS, COPYRIGHTS, TRADEMARKS OR
-  23 OTHER RIGHTS.
-  24
-  25 The name of Princeton University or Princeton may not be used in
-  26 advertising or publicity pertaining to distribution of the software
-  27 and/or database.  Title to copyright in this software, database and
-  28 any associated documentation shall at all times remain with
-  29 Princeton University and LICENSEE agrees to preserve same.
 'tween r 1 0 1 0 00250898
 'tween_decks r 1 0 1 0 00498293
 a.d. r 1 0 1 0 00001837

data/lib/dict/index.verb CHANGED

@@ -1,4 +1,3 @@
-# lemma  pos  synset_cnt  p_cnt  [ptr_symbol...]  sense_cnt  tagsense_cnt   synset_offset  [synset_offset...]
 aah v 1 1 @ 1 0 00865776
 abacinate v 1 1 @ 1 0 02168378
 abandon v 5 4 @ ~ $ + 5 5 02228031 02227741 02076676 00613393 00614057

data/lib/lemmatizer.rb CHANGED

@@ -1,8 +1,9 @@
 #! /usr/bin/env ruby
 # -*- coding: utf-8; mode: ruby -*-
-# inspired by nltk.corpus.reader.wordnet.morphy
-# http://nltk.googlecode.com/svn/trunk/doc/api/nltk.corpus.reader.wordnet-pysrc.html#WordNetCorpusReader.morphy
+# Inspired by nltk.corpus.reader.wordnet.morphy http://nltk.googlecode.com/svn/trunk/doc/api/nltk.corpus.reader.wordnet-pysrc.html#WordNetCorpusReader.morphy
+# Original code posted by mtbr at http://d.hatena.ne.jp/mtbr/20090303/prfrnlprubyWordNetbasedlemmatizer
 require "lemmatizer/version"
 require "stringio"
@@ -22,11 +23,10 @@ class Lemmatizer
 	MORPHOLOGICAL_SUBSTITUTIONS = {
 		:noun => [['s', ''], ['ses', 's'], ['ves', 'f'], ['xes', 'x'],
-								['zes', 'z'], ['ches', 'ch'], ['shes', 'sh'],
-							 ['men', 'man'], ['ies', 'y']],
+							['zes', 'z'], ['ches', 'ch'], ['shes', 'sh'],
+					    ['men', 'man'], ['ies', 'y']],
 		:verb => [['s', ''], ['ies', 'y'], ['es', 'e'], ['es', ''],
-							 ['ed', 'e'], ['ed', ''], ['ing', 'e'], ['ing', '']],
+						  ['ed', 'e'], ['ed', ''], ['ing', 'e'], ['ing', '']],
 		:adj =>  [['er', ''], ['est', ''], ['er', 'e'], ['est', 'e']],
 		:adv =>  []}
@@ -38,7 +38,7 @@ class Lemmatizer
 			@exceptions[x] = {}
 		end
 		if files then
-			files.each_pair do |pos,pair|
+			files.each_pair do |pos, pair|
 				load_wordnet_files(pos, pair[0], pair[1])
 			end
 		end
@@ -64,19 +64,33 @@ class Lemmatizer
 		open_file(exc) do |io|
 			io.each_line do |line|
-				w,s = line.split(/\s+/)
+				w, s = line.split(/\s+/)
 				@exceptions[pos][w] ||= []
 				@exceptions[pos][w] << s
 			end
 		end
 	end
+	def _each_substitutions(form, pos)
+		if lemma = @wordlists[pos][form] then
+			yield lemma
+		end
+		MORPHOLOGICAL_SUBSTITUTIONS[pos].each do |entry|
+			old, new = *entry
+			if form.endwith(old)
+				_each_substitutions(form[0, form.length - old.length] + new, pos) do|x|
+					yield x
+				end
+			end
+		end
+	end
 	def each_lemma(form, pos)
 		if lemma = @exceptions[pos][form] then
 			lemma.each{|x |yield x}
 		end
 		if pos == :noun and form.endwith('ful')
-			each_lemma(form[0,form.length-3], pos) do |x|
+			each_lemma(form[0, form.length-3], pos) do |x|
 				yield x+'ful'
 			end
 		else
@@ -86,23 +100,18 @@ class Lemmatizer
 		end
 	end
-	def lemma(form,pos)
+	def lemma(form, pos = nil)
+    if !pos
+      [:verb, :noun, :adj, :adv].each do |p|
+        result = lemma(form, p)
+        return result unless result == form
+      end
+      return form
+    end
 		each_lemma(form, pos) do |x|
 			return x
 		end
 		return form
 	end
-	def _each_substitutions(form, pos)
-		if lemma = @wordlists[pos][form] then
-			yield lemma
-		end
-		MORPHOLOGICAL_SUBSTITUTIONS[pos].each do |entry|
-			old, new = *entry
-			if form.endwith(old)
-				_each_substitutions(form[0, form.length - old.length] + new, pos) do|x|
-					yield x
-				end
-			end
-		end
-	end
 end

data/lib/lemmatizer/version.rb CHANGED

@@ -1,3 +1,3 @@
 class Lemmatizer
-  VERSION = "0.0.1"
+  VERSION = "0.1.0"
 end

data/spec/lemmatizer_spec.rb CHANGED

@@ -44,6 +44,23 @@ describe "Lemmatizer" do
 			result_r2 = @lemmatizer.lemma("best", :adv)
 			result_r2.should_not == "good"
+      # Lemmatizer gives a result even when no pos is given, by trying :verb, :noun, :adj, and :adv in that order.
+			result_1 = @lemmatizer.lemma("plays")
+			result_1.should == "play"
+			result_2 = @lemmatizer.lemma("oxen")
+			result_2.should == "ox"
+			result_3 = @lemmatizer.lemma("higher")
+			result_3.should_not == "high" # since 'higher' is itself contained in the adj list.
+      # test cases for words used in README
+			result_t1 = @lemmatizer.lemma("fired")
+			result_t1.should == "fire"
+			result_t2 = @lemmatizer.lemma("slower")
+			result_t2.should == "slow"
 		end
 	end

metadata CHANGED

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: lemmatizer
 version: !ruby/object:Gem::Version
-  version: 0.0.1
+  version: 0.1.0
   prerelease:
 platform: ruby
 authors:
@@ -9,11 +9,11 @@ authors:
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2012-10-28 00:00:00.000000000 Z
+date: 2012-10-29 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: rspec
-  requirement: &70234015642620 !ruby/object:Gem::Requirement
+  requirement: &70314483330880 !ruby/object:Gem::Requirement
     none: false
     requirements:
     - - ! '>='
@@ -21,7 +21,7 @@ dependencies:
         version: '0'
   type: :development
   prerelease: false
-  version_requirements: *70234015642620
+  version_requirements: *70314483330880
 description: Lemmatizer for text in English.  Inspired by Python's nltk.corpus.reader.wordnet.morphy
   package.
 email: