lemmatizer 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
data/lib/lemmatizer.rb ADDED
@@ -0,0 +1,108 @@
1
+ #! /usr/bin/env ruby
2
+ # -*- coding: utf-8; mode: ruby -*-
3
+
4
+ # inspired by nltk.corpus.reader.wordnet.morphy
5
+ # http://nltk.googlecode.com/svn/trunk/doc/api/nltk.corpus.reader.wordnet-pysrc.html#WordNetCorpusReader.morphy
6
+
7
+ require "lemmatizer/version"
8
+ require "stringio"
9
+
10
# Core extension used by Lemmatizer for literal suffix checks.
class String
  # Reports whether the receiver ends with the literal suffix +s+.
  #
  # The suffix is compared literally via String#end_with?. Building a regexp
  # from +s+ would treat characters such as "." or "+" as pattern syntax, and
  # a `$` anchor would also match at the end of any line inside the string
  # (or just before a trailing newline) rather than only at the very end.
  #
  # @param s [#to_s] the suffix to look for
  # @return [Integer, nil] the index at which the suffix starts (truthy, even
  #   when it is 0) if the receiver ends with +s+, otherwise nil
  def endwith(s)
    suffix = s.to_s
    length - suffix.length if end_with?(suffix)
  end
end
15
+
16
# English lemmatizer driven by WordNet-style index and exception files
# (inspired by nltk.corpus.reader.wordnet.morphy).
#
# Candidate lemmas come from two sources, tried in this order:
# 1. the exception table of the requested part of speech (irregular forms);
# 2. suffix-rewriting rules, whose results are accepted only when they occur
#    in the word list loaded for that part of speech.
class Lemmatizer
  current_dir = File.expand_path(File.dirname(__FILE__))

  # Default dictionary files per part of speech: [index file, exception file].
  WN_FILES = {:noun => [current_dir + "/dict/index.noun", current_dir + "/dict/noun.exc"],
              :verb => [current_dir + "/dict/index.verb", current_dir + "/dict/verb.exc"],
              :adj  => [current_dir + "/dict/index.adj",  current_dir + "/dict/adj.exc"],
              :adv  => [current_dir + "/dict/index.adv",  current_dir + "/dict/adv.exc"]}

  # Suffix rewrite rules per part of speech, as [old_suffix, new_suffix] pairs.
  MORPHOLOGICAL_SUBSTITUTIONS = {
    :noun => [['s', ''], ['ses', 's'], ['ves', 'f'], ['xes', 'x'],
              ['zes', 'z'], ['ches', 'ch'], ['shes', 'sh'],
              ['men', 'man'], ['ies', 'y']],
    :verb => [['s', ''], ['ies', 'y'], ['es', 'e'], ['es', ''],
              ['ed', 'e'], ['ed', ''], ['ing', 'e'], ['ing', '']],
    :adj  => [['er', ''], ['est', ''], ['er', 'e'], ['est', 'e']],
    :adv  => []}

  # Builds the per-part-of-speech word lists and exception tables.
  #
  # @param files [Hash, nil] maps a part-of-speech symbol to a two-element
  #   array [index source, exception source]; each source is a path or an
  #   already opened IO/StringIO. Pass nil/false to start with empty tables.
  def initialize(files = WN_FILES)
    @wordlists = {}
    @exceptions = {}
    MORPHOLOGICAL_SUBSTITUTIONS.each_key do |pos|
      @wordlists[pos] = {}
      @exceptions[pos] = {}
    end
    return unless files

    files.each_pair do |pos, pair|
      load_wordnet_files(pos, pair[0], pair[1])
    end
  end

  # Yields an IO for the given source.
  #
  # An IO or StringIO given as the first argument is yielded as-is (and is
  # not closed here); anything else is handed to File.open together with the
  # remaining arguments, and the file is closed when the block returns.
  def open_file(*args)
    source = args[0]
    if source.is_a?(IO) || source.is_a?(StringIO)
      yield source
    else
      File.open(*args) { |io| yield io }
    end
  end

  # Loads one part of speech from an index source and an exception source.
  #
  # @param pos [Symbol] part of speech the entries belong to
  # @param list [String, IO, StringIO] index source; the first
  #   whitespace-separated field of each line is registered as a known word
  # @param exc [String, IO, StringIO] exception source; each line is an
  #   inflected form followed by one or more lemmas
  def load_wordnet_files(pos, list, exc)
    wordlist = @wordlists[pos]
    exceptions = @exceptions[pos]

    open_file(list) do |io|
      io.each_line do |line|
        # Header and blank lines begin with whitespace; registering them
        # would add "" as a known word, and every form whose suffix can be
        # stripped down to nothing would then lemmatize to "".
        next if line =~ /\A\s/
        word = line.split.first
        wordlist[word] = word
      end
    end

    open_file(exc) do |io|
      io.each_line do |line|
        form, *lemmas = line.split
        # Ignore blank lines and entries that list no lemma at all.
        next if form.nil? || lemmas.empty?
        # Keep every lemma on the line, not just the first one.
        (exceptions[form] ||= []).concat(lemmas)
      end
    end
  end

  # Yields each candidate lemma of +form+, exception-table entries first.
  #
  # Nouns ending in "ful" are lemmatized on the part before "ful" and the
  # suffix is re-attached to every result.
  #
  # @param form [String] inflected word form
  # @param pos [Symbol] part of speech (:noun, :verb, :adj or :adv)
  # @return [Enumerator] only when called without a block
  def each_lemma(form, pos)
    return enum_for(:each_lemma, form, pos) unless block_given?

    irregular = @exceptions[pos][form]
    irregular.each { |x| yield x } if irregular

    if pos == :noun && form.end_with?('ful')
      each_lemma(form[0, form.length - 3], pos) { |stem| yield stem + 'ful' }
    else
      _each_substitutions(form, pos) { |x| yield x }
    end
  end

  # Returns the first candidate lemma of +form+, or +form+ itself when the
  # dictionaries offer none (so unknown words are left untouched).
  #
  # @param form [String] inflected word form
  # @param pos [Symbol] part of speech (:noun, :verb, :adj or :adv)
  # @return [String]
  def lemma(form, pos)
    each_lemma(form, pos) do |x|
      return x # the first candidate wins
    end
    form
  end

  # Yields +form+ if it is a known word, then recursively applies every
  # matching suffix rule and yields the known words reached that way.
  def _each_substitutions(form, pos)
    known = @wordlists[pos][form]
    yield known if known

    MORPHOLOGICAL_SUBSTITUTIONS[pos].each do |old, new|
      next unless form.end_with?(old)
      candidate = form[0, form.length - old.length] + new
      _each_substitutions(candidate, pos) { |x| yield x }
    end
  end
end
@@ -0,0 +1,3 @@
1
# Reopens the Lemmatizer namespace solely to record the gem's release number.
class Lemmatizer
  # Release number of the lemmatizer gem.
  VERSION = '0.0.1'
end
@@ -0,0 +1,50 @@
1
+ #!/usr/bin/env ruby
2
+ # -*- coding: utf-8 -*-
3
+
4
+ require File.expand_path(File.dirname(__FILE__) + '/spec_helper')
5
+ require 'lemmatizer'
6
+
7
describe "Lemmatizer" do
  # Loading the dictionaries is the expensive part, so one instance is built
  # for the whole group instead of one per example.
  before(:all) do
    @lemmatizer = Lemmatizer.new
  end

  it "contains lemmatizing functions:" do
    expect(@lemmatizer).to respond_to(:lemma)
  end

  # lemma takes a word form and its part-of-speech symbol
  # (:noun, :verb, :adj, :adv) and then returns its lemma form.
  describe "lemma" do
    it "returns the lemma form of a noun" do
      expect(@lemmatizer.lemma("analyses", :noun)).to eq("analysis")
      expect(@lemmatizer.lemma("desks", :noun)).to eq("desk")
    end

    # Lemmatizer leaves alone words that its dictionary does not contain to
    # keep proper names such as "James" intact.
    it "leaves alone words that its dictionary does not contain" do
      expect(@lemmatizer.lemma("MacBooks", :noun)).not_to eq("MacBook")
    end

    it "returns the lemma form of a verb" do
      expect(@lemmatizer.lemma("hired", :verb)).to eq("hire")
      expect(@lemmatizer.lemma("worried", :verb)).to eq("worry")
      expect(@lemmatizer.lemma("partying", :verb)).to eq("party")
    end

    it "returns the lemma form of an adjective" do
      expect(@lemmatizer.lemma("better", :adj)).to eq("good")
      expect(@lemmatizer.lemma("hotter", :adj)).to eq("hot")
    end

    it "returns the lemma form of an adverb" do
      best_as_adverb = @lemmatizer.lemma("best", :adv)
      expect(best_as_adverb).to eq("well")
      expect(best_as_adverb).not_to eq("good")
    end
  end
end
@@ -0,0 +1,6 @@
1
+ $LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib'))
2
+ require 'rspec'
3
+
4
+ RSpec.configure do |config|
5
+ # see https://github.com/rspec/rspec-core/blob/master/lib/rspec/core/configuration.rb for more information
6
+ end
metadata ADDED
@@ -0,0 +1,77 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: lemmatizer
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ prerelease:
6
+ platform: ruby
7
+ authors:
8
+ - Yoichiro Hasebe
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+ date: 2012-10-28 00:00:00.000000000 Z
13
+ dependencies:
14
+ - !ruby/object:Gem::Dependency
15
+ name: rspec
16
+ requirement: &70234015642620 !ruby/object:Gem::Requirement
17
+ none: false
18
+ requirements:
19
+ - - ! '>='
20
+ - !ruby/object:Gem::Version
21
+ version: '0'
22
+ type: :development
23
+ prerelease: false
24
+ version_requirements: *70234015642620
25
+ description: Lemmatizer for text in English. Inspired by Python's nltk.corpus.reader.wordnet.morphy
26
+ package.
27
+ email:
28
+ - yohasebe@gmail.com
29
+ executables: []
30
+ extensions: []
31
+ extra_rdoc_files: []
32
+ files:
33
+ - .gitignore
34
+ - Gemfile
35
+ - LICENSE.txt
36
+ - README.md
37
+ - Rakefile
38
+ - lemmatizer.gemspec
39
+ - lib/dict/adj.exc
40
+ - lib/dict/adv.exc
41
+ - lib/dict/index.adj
42
+ - lib/dict/index.adv
43
+ - lib/dict/index.noun
44
+ - lib/dict/index.verb
45
+ - lib/dict/noun.exc
46
+ - lib/dict/verb.exc
47
+ - lib/lemmatizer.rb
48
+ - lib/lemmatizer/version.rb
49
+ - spec/lemmatizer_spec.rb
50
+ - spec/spec_helper.rb
51
+ homepage: http://github.com/yohasebe/lemmatizer
52
+ licenses: []
53
+ post_install_message:
54
+ rdoc_options: []
55
+ require_paths:
56
+ - lib
57
+ required_ruby_version: !ruby/object:Gem::Requirement
58
+ none: false
59
+ requirements:
60
+ - - ! '>='
61
+ - !ruby/object:Gem::Version
62
+ version: '0'
63
+ required_rubygems_version: !ruby/object:Gem::Requirement
64
+ none: false
65
+ requirements:
66
+ - - ! '>='
67
+ - !ruby/object:Gem::Version
68
+ version: '0'
69
+ requirements: []
70
+ rubyforge_project:
71
+ rubygems_version: 1.8.17
72
+ signing_key:
73
+ specification_version: 3
74
+ summary: English lemmatizer in Ruby
75
+ test_files:
76
+ - spec/lemmatizer_spec.rb
77
+ - spec/spec_helper.rb