RubyGems - lemmatizer - Versions diffs - 0.0.1 - Mend

lemmatizer 0.0.1

Files changed (19) hide show

data/lib/lemmatizer.rb ADDED Viewed

@@ -0,0 +1,108 @@
+#! /usr/bin/env ruby
+# -*- coding: utf-8; mode: ruby -*-
+# inspired by nltk.corpus.reader.wordnet.morphy
+# http://nltk.googlecode.com/svn/trunk/doc/api/nltk.corpus.reader.wordnet-pysrc.html#WordNetCorpusReader.morphy
+require "lemmatizer/version"
+require "stringio"
+# Core extension kept so existing callers of String#endwith keep working.
+class String
+	# Reports whether the receiver ends with the literal suffix +s+.
+	#
+	# The suffix is compared literally via String#end_with?, so regexp
+	# metacharacters in +s+ (".", "+", ...) are not interpreted, and only the
+	# real end of the string counts -- not the end of an embedded line or the
+	# position before a trailing newline.
+	#
+	# s:: suffix to look for; converted with +to_s+ before comparison.
+	#
+	# Returns true when the receiver ends with +s+, false otherwise.
+	def endwith(s)
+		end_with?(s.to_s)
+	end
+end
+# English lemmatizer driven by WordNet-style index and exception files.
+#
+# A lookup first consults the exception list of the requested part of speech
+# and then rewrites known suffixes until a form listed in the index is found.
+class Lemmatizer
+	current_dir = File.expand_path(File.dirname(__FILE__))
+	# Default [index file, exception file] pair for every part of speech.
+	WN_FILES = {:noun => [current_dir + "/dict/index.noun", current_dir + "/dict/noun.exc"],
+							:verb => [current_dir + "/dict/index.verb", current_dir + "/dict/verb.exc"],
+							:adj  => [current_dir + "/dict/index.adj", current_dir + "/dict/adj.exc"],
+							:adv  => [current_dir + "/dict/index.adv", current_dir + "/dict/adv.exc"]}
+	# Ordered [suffix, replacement] rewrite rules tried for every part of speech.
+	MORPHOLOGICAL_SUBSTITUTIONS = {
+		:noun => [['s', ''], ['ses', 's'], ['ves', 'f'], ['xes', 'x'],
+								['zes', 'z'], ['ches', 'ch'], ['shes', 'sh'],
+							 ['men', 'man'], ['ies', 'y']],
+		:verb => [['s', ''], ['ies', 'y'], ['es', 'e'], ['es', ''],
+							 ['ed', 'e'], ['ed', ''], ['ing', 'e'], ['ing', '']],
+		:adj =>  [['er', ''], ['est', ''], ['er', 'e'], ['est', 'e']],
+		:adv =>  []}
+	# Builds the lookup tables.
+	#
+	# files:: hash mapping a part-of-speech symbol to an [index, exceptions]
+	#         pair; each element is a path or an already opened IO/StringIO.
+	#         Pass nil (or false) to start with empty tables.
+	def initialize(files = WN_FILES)
+		@wordlists = {}
+		@exceptions = {}
+		MORPHOLOGICAL_SUBSTITUTIONS.each_key do |pos|
+			@wordlists[pos] = {}
+			@exceptions[pos] = {}
+		end
+		return unless files
+		files.each_pair do |pos, (list, exc)|
+			load_wordnet_files(pos, list, exc)
+		end
+	end
+	# Yields a readable stream for the given source.
+	#
+	# args:: either a single IO/StringIO, which is yielded untouched (and left
+	#        open), or the arguments for File.open, whose handle is closed
+	#        when the block returns.
+	def open_file(*args)
+		source = args[0]
+		# An explicit class check is needed here: a String path also responds
+		# to each_line, so duck typing cannot tell a path from a stream.
+		if source.is_a?(IO) || source.is_a?(StringIO)
+			yield source
+		else
+			File.open(*args) { |io| yield io }
+		end
+	end
+	# Reads one index file and one exception file into the tables of +pos+.
+	#
+	# pos::  part-of-speech symbol (a key of MORPHOLOGICAL_SUBSTITUTIONS).
+	# list:: index source; the first whitespace-separated token of a line is
+	#        the word.
+	# exc::  exception source; the first token of a line is the inflected
+	#        form, every following token is one of its base forms.
+	def load_wordnet_files(pos, list, exc)
+		open_file(list) do |io|
+			io.each_line do |line|
+				word = line.split(/\s+/).first
+				# Lines starting with whitespace (the license header of the
+				# WordNet index files) split into an empty first token, and
+				# blank lines into none; neither is a dictionary entry.
+				next if word.nil? || word.empty?
+				@wordlists[pos][word] = word
+			end
+		end
+		open_file(exc) do |io|
+			io.each_line do |line|
+				inflected, *bases = line.split(/\s+/)
+				next if inflected.nil? || inflected.empty? || bases.empty?
+				# Keep every listed base form, not only the first one.
+				(@exceptions[pos][inflected] ||= []).concat(bases)
+			end
+		end
+	end
+	# Yields every lemma candidate of +form+: exception-list entries first,
+	# then candidates found by suffix rewriting. A noun ending in "ful" is
+	# resolved through its stem and the suffix is appended again.
+	#
+	# form:: word form to analyse.
+	# pos::  part-of-speech symbol.
+	#
+	# Returns an Enumerator over the candidates when no block is given.
+	def each_lemma(form, pos)
+		return enum_for(:each_lemma, form, pos) unless block_given?
+		irregular = @exceptions[pos][form]
+		irregular.each { |base| yield base } if irregular
+		if pos == :noun && form.end_with?('ful')
+			each_lemma(form[0, form.length - 3], pos) { |stem| yield stem + 'ful' }
+		else
+			_each_substitutions(form, pos) { |candidate| yield candidate }
+		end
+	end
+	# Returns the first lemma candidate of +form+ for +pos+, or +form+ itself
+	# when nothing is found.
+	def lemma(form, pos)
+		each_lemma(form, pos) { |candidate| return candidate }
+		form
+	end
+	# Yields +form+ when the index of +pos+ lists it, then recurses into every
+	# variant obtained by replacing a matching suffix of +form+.
+	def _each_substitutions(form, pos)
+		known = @wordlists[pos][form]
+		yield known if known
+		MORPHOLOGICAL_SUBSTITUTIONS[pos].each do |suffix, replacement|
+			# Literal suffix test; no dependency on the String#endwith extension.
+			next unless form.end_with?(suffix)
+			stem = form[0, form.length - suffix.length]
+			_each_substitutions(stem + replacement, pos) { |candidate| yield candidate }
+		end
+	end
+end

data/lib/lemmatizer/version.rb ADDED Viewed

@@ -0,0 +1,3 @@
+class Lemmatizer
+  # Release number of the lemmatizer gem.
+  VERSION = '0.0.1'
+end

data/spec/lemmatizer_spec.rb ADDED Viewed

@@ -0,0 +1,50 @@
+#!/usr/bin/env ruby
+# -*- coding: utf-8 -*-
+require File.expand_path(File.dirname(__FILE__) + '/spec_helper')
+require 'lemmatizer'
+describe "Lemmatizer" do
+	# Every example gets a lemmatizer built with the default dictionary files.
+	before do
+		@lemmatizer = Lemmatizer.new
+	end
+	it "contains lemmatizing functions:" do
+	end
+	describe "lemma" do
+		it "takes a word form and its part-of-speech symbol (:noun, :verb, :adj, :adv) and then returns its lemma form" do
+			# Each row: word form, part of speech, expectation to apply, value compared against.
+			# Lemmatizer leaves alone words that its dictionary does not contain to keep
+			# proper names such as "James" intact, hence the negative check for "MacBooks".
+			checks = [
+				["analyses", :noun, :should,     "analysis"],
+				["MacBooks", :noun, :should_not, "MacBook"],
+				["desks",    :noun, :should,     "desk"],
+				["hired",    :verb, :should,     "hire"],
+				["worried",  :verb, :should,     "worry"],
+				["partying", :verb, :should,     "party"],
+				["better",   :adj,  :should,     "good"],
+				["hotter",   :adj,  :should,     "hot"],
+				["best",     :adv,  :should,     "well"],
+				["best",     :adv,  :should_not, "good"]
+			]
+			checks.each do |form, pos, expectation, value|
+				@lemmatizer.lemma(form, pos).send(expectation) == value
+			end
+		end
+	end
+end

data/spec/spec_helper.rb ADDED Viewed

@@ -0,0 +1,6 @@
+# Put the gem's lib directory at the front of the load path for the specs.
+lib_dir = File.join(File.dirname(__FILE__), '..', 'lib')
+$LOAD_PATH.unshift(lib_dir)
+require 'rspec'
+RSpec.configure do |config|
+	# No custom settings yet; see https://github.com/rspec/rspec-core/blob/master/lib/rspec/core/configuration.rb for more information
+end

metadata ADDED Viewed

@@ -0,0 +1,77 @@
+--- !ruby/object:Gem::Specification
+name: lemmatizer
+version: !ruby/object:Gem::Version
+  version: 0.0.1
+  prerelease:
+platform: ruby
+authors:
+- Yoichiro Hasebe
+autorequire:
+bindir: bin
+cert_chain: []
+date: 2012-10-28 00:00:00.000000000 Z
+dependencies:
+- !ruby/object:Gem::Dependency
+  name: rspec
+  requirement: &70234015642620 !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ! '>='
+      - !ruby/object:Gem::Version
+        version: '0'
+  type: :development
+  prerelease: false
+  version_requirements: *70234015642620
+description: Lemmatizer for text in English.  Inspired by Python's nltk.corpus.reader.wordnet.morphy
+  package.
+email:
+- yohasebe@gmail.com
+executables: []
+extensions: []
+extra_rdoc_files: []
+files:
+- .gitignore
+- Gemfile
+- LICENSE.txt
+- README.md
+- Rakefile
+- lemmatizer.gemspec
+- lib/dict/adj.exc
+- lib/dict/adv.exc
+- lib/dict/index.adj
+- lib/dict/index.adv
+- lib/dict/index.noun
+- lib/dict/index.verb
+- lib/dict/noun.exc
+- lib/dict/verb.exc
+- lib/lemmatizer.rb
+- lib/lemmatizer/version.rb
+- spec/lemmatizer_spec.rb
+- spec/spec_helper.rb
+homepage: http://github.com/yohasebe/lemmatizer
+licenses: []
+post_install_message:
+rdoc_options: []
+require_paths:
+- lib
+required_ruby_version: !ruby/object:Gem::Requirement
+  none: false
+  requirements:
+  - - ! '>='
+    - !ruby/object:Gem::Version
+      version: '0'
+required_rubygems_version: !ruby/object:Gem::Requirement
+  none: false
+  requirements:
+  - - ! '>='
+    - !ruby/object:Gem::Version
+      version: '0'
+requirements: []
+rubyforge_project:
+rubygems_version: 1.8.17
+signing_key:
+specification_version: 3
+summary: English lemmatizer in Ruby
+test_files:
+- spec/lemmatizer_spec.rb
+- spec/spec_helper.rb