lemmatizer 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/lib/lemmatizer.rb ADDED
@@ -0,0 +1,108 @@
1
+ #! /usr/bin/env ruby
2
+ # -*- coding: utf-8; mode: ruby -*-
3
+
4
+ # inspired by nltk.corpus.reader.wordnet.morphy
5
+ # http://nltk.googlecode.com/svn/trunk/doc/api/nltk.corpus.reader.wordnet-pysrc.html#WordNetCorpusReader.morphy
6
+
7
+ require "lemmatizer/version"
8
+ require "stringio"
9
+
10
class String
  # Tests whether the receiver ends with the literal suffix +s+.
  #
  # The suffix is compared as plain text: regular-expression metacharacters
  # in +s+ have no special meaning, and the comparison is anchored to the end
  # of the whole string rather than to the end of any line inside it.
  #
  # s - the suffix to look for; converted with #to_s before comparison.
  #
  # Returns the Integer index at which the suffix starts (truthy), or nil when
  # the receiver does not end with +s+. This mirrors the index-or-nil contract
  # of the `=~` operator.
  def endwith(s)
    suffix = s.to_s
    end_with?(suffix) ? length - suffix.length : nil
  end
end
15
+
16
# English lemmatizer driven by WordNet-style index and exception files.
#
# For every part of speech it keeps a word list (known lemmas) and an
# exception table (irregular form => base forms). A form is lemmatized by
# consulting the exception table first and then by repeatedly stripping or
# replacing suffixes until a word from the word list is reached.
class Lemmatizer
  current_dir = File.expand_path(File.dirname(__FILE__))

  # Default dictionary files per part of speech: [index file, exception file].
  WN_FILES = {
    :noun => [current_dir + "/dict/index.noun", current_dir + "/dict/noun.exc"],
    :verb => [current_dir + "/dict/index.verb", current_dir + "/dict/verb.exc"],
    :adj  => [current_dir + "/dict/index.adj",  current_dir + "/dict/adj.exc"],
    :adv  => [current_dir + "/dict/index.adv",  current_dir + "/dict/adv.exc"]
  }

  # Suffix rewrite rules per part of speech as [suffix, replacement] pairs.
  # Rules are tried in the listed order.
  MORPHOLOGICAL_SUBSTITUTIONS = {
    :noun => [['s', ''], ['ses', 's'], ['ves', 'f'], ['xes', 'x'],
              ['zes', 'z'], ['ches', 'ch'], ['shes', 'sh'],
              ['men', 'man'], ['ies', 'y']],
    :verb => [['s', ''], ['ies', 'y'], ['es', 'e'], ['es', ''],
              ['ed', 'e'], ['ed', ''], ['ing', 'e'], ['ing', '']],
    :adj  => [['er', ''], ['est', ''], ['er', 'e'], ['est', 'e']],
    :adv  => []
  }

  # Builds a lemmatizer and loads its dictionaries.
  #
  # files - Hash of part-of-speech symbol => [index source, exception source].
  #         Each source is a path, an IO or a StringIO. Pass nil (or false)
  #         to start with empty dictionaries and load them later with
  #         #load_wordnet_files.
  def initialize(files = WN_FILES)
    @wordlists = {}
    @exceptions = {}
    MORPHOLOGICAL_SUBSTITUTIONS.each_key do |pos|
      @wordlists[pos] = {}
      @exceptions[pos] = {}
    end
    return unless files

    files.each_pair do |pos, pair|
      load_wordnet_files(pos, pair[0], pair[1])
    end
  end

  # Yields a readable stream for the given source.
  #
  # args - when the first element is an IO or StringIO it is yielded as-is
  #        and left open; otherwise all arguments are forwarded to File.open
  #        and the file is closed when the block returns.
  def open_file(*args)
    source = args.first
    if source.is_a?(IO) || source.is_a?(StringIO)
      yield source
    else
      File.open(*args) { |io| yield io }
    end
  end

  # Loads the word list and the exception table for one part of speech.
  #
  # pos  - part-of-speech symbol (a key of MORPHOLOGICAL_SUBSTITUTIONS).
  # list - index source; the first whitespace-separated token of every entry
  #        line is registered as a known word.
  # exc  - exception source; every line is an inflected form followed by one
  #        or more base forms.
  def load_wordnet_files(pos, list, exc)
    open_file(list) do |io|
      io.each_line do |line|
        # Lines that start with whitespace (presumably the indented license
        # header of WordNet index files - verify against the bundled dict
        # files) and blank lines are not entries. Splitting them would
        # register "" or nil as a word, which in turn makes bare suffixes
        # such as "s" lemmatize to the empty string.
        next if line =~ /\A\s/

        word = line.split(/\s+/).first
        @wordlists[pos][word] = word
      end
    end

    open_file(exc) do |io|
      io.each_line do |line|
        form, *bases = line.split
        # Skip blank lines and lines without any base form; storing a nil
        # lemma would make #lemma return nil.
        next if form.nil? || bases.empty?

        # Keep every listed base form, not only the first one.
        (@exceptions[pos][form] ||= []).concat(bases)
      end
    end
  end

  # Yields every lemma candidate for +form+, best candidates first:
  # exception-table entries, then word-list matches reached through the
  # suffix rules. For nouns ending in "ful" the stem before "ful" is
  # lemmatized instead and "ful" is appended to each result.
  #
  # form - the word form to lemmatize.
  # pos  - part-of-speech symbol.
  #
  # Returns an Enumerator over the candidates when no block is given.
  def each_lemma(form, pos)
    return enum_for(:each_lemma, form, pos) unless block_given?

    irregular = @exceptions[pos][form]
    irregular.each { |base| yield base } if irregular

    if pos == :noun && form.end_with?('ful')
      each_lemma(form[0, form.length - 3], pos) { |base| yield base + 'ful' }
    else
      _each_substitutions(form, pos) { |base| yield base }
    end
  end

  # Returns the first lemma candidate for +form+, or +form+ itself when the
  # dictionaries yield no candidate (unknown words are left untouched).
  def lemma(form, pos)
    each_lemma(form, pos) { |base| return base }
    form
  end

  # Yields +form+ when it is a known word, then recursively applies every
  # matching suffix rule and yields the known words reached that way.
  #
  # Returns an Enumerator over the matches when no block is given.
  def _each_substitutions(form, pos)
    return enum_for(:_each_substitutions, form, pos) unless block_given?

    known = @wordlists[pos][form]
    yield known if known

    MORPHOLOGICAL_SUBSTITUTIONS[pos].each do |suffix, replacement|
      next unless form.end_with?(suffix)

      candidate = form[0, form.length - suffix.length] + replacement
      _each_substitutions(candidate, pos) { |base| yield base }
    end
  end
end
@@ -0,0 +1,3 @@
1
class Lemmatizer
  # Release version of the gem. Frozen so the shared constant cannot be
  # mutated in place by accident.
  VERSION = "0.0.1".freeze
end
@@ -0,0 +1,50 @@
1
+ #!/usr/bin/env ruby
2
+ # -*- coding: utf-8 -*-
3
+
4
+ require File.expand_path(File.dirname(__FILE__) + '/spec_helper')
5
+ require 'lemmatizer'
6
+
7
describe "Lemmatizer" do
  # The lemmatizer is only read after construction, so a single instance is
  # shared by all examples instead of reloading the dictionaries each time.
  before(:all) do
    @lemmatizer = Lemmatizer.new
  end

  # Each expectation lives in its own example so that one failure does not
  # hide the results of the others.
  describe "lemma" do
    it "returns the base form of an irregular noun" do
      @lemmatizer.lemma("analyses", :noun).should == "analysis"
    end

    # Lemmatizer leaves alone words that its dictionary does not contain to
    # keep proper names such as "James" intact.
    it "leaves words missing from the dictionary untouched" do
      @lemmatizer.lemma("MacBooks", :noun).should_not == "MacBook"
    end

    it "returns the singular of a regular noun" do
      @lemmatizer.lemma("desks", :noun).should == "desk"
    end

    it "returns the base form of a verb ending in -ed" do
      @lemmatizer.lemma("hired", :verb).should == "hire"
    end

    it "returns the base form of a verb ending in -ied" do
      @lemmatizer.lemma("worried", :verb).should == "worry"
    end

    it "returns the base form of a verb ending in -ing" do
      @lemmatizer.lemma("partying", :verb).should == "party"
    end

    it "returns the positive form of an irregular comparative adjective" do
      @lemmatizer.lemma("better", :adj).should == "good"
    end

    it "returns the positive form of a regular comparative adjective" do
      @lemmatizer.lemma("hotter", :adj).should == "hot"
    end

    it "resolves a form according to the given part of speech" do
      result = @lemmatizer.lemma("best", :adv)
      result.should == "well"
      result.should_not == "good"
    end
  end
end
@@ -0,0 +1,6 @@
1
+ $LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib'))
2
+ require 'rspec'
3
+
4
+ RSpec.configure do |config|
5
+ # see https://github.com/rspec/rspec-core/blob/master/lib/rspec/core/configuration.rb for more information
6
+ end
metadata ADDED
@@ -0,0 +1,77 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: lemmatizer
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ prerelease:
6
+ platform: ruby
7
+ authors:
8
+ - Yoichiro Hasebe
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+ date: 2012-10-28 00:00:00.000000000 Z
13
+ dependencies:
14
+ - !ruby/object:Gem::Dependency
15
+ name: rspec
16
+ requirement: &70234015642620 !ruby/object:Gem::Requirement
17
+ none: false
18
+ requirements:
19
+ - - ! '>='
20
+ - !ruby/object:Gem::Version
21
+ version: '0'
22
+ type: :development
23
+ prerelease: false
24
+ version_requirements: *70234015642620
25
+ description: Lemmatizer for text in English. Inspired by Python's nltk.corpus.reader.wordnet.morphy
26
+ package.
27
+ email:
28
+ - yohasebe@gmail.com
29
+ executables: []
30
+ extensions: []
31
+ extra_rdoc_files: []
32
+ files:
33
+ - .gitignore
34
+ - Gemfile
35
+ - LICENSE.txt
36
+ - README.md
37
+ - Rakefile
38
+ - lemmatizer.gemspec
39
+ - lib/dict/adj.exc
40
+ - lib/dict/adv.exc
41
+ - lib/dict/index.adj
42
+ - lib/dict/index.adv
43
+ - lib/dict/index.noun
44
+ - lib/dict/index.verb
45
+ - lib/dict/noun.exc
46
+ - lib/dict/verb.exc
47
+ - lib/lemmatizer.rb
48
+ - lib/lemmatizer/version.rb
49
+ - spec/lemmatizer_spec.rb
50
+ - spec/spec_helper.rb
51
+ homepage: http://github.com/yohasebe/lemmatizer
52
+ licenses: []
53
+ post_install_message:
54
+ rdoc_options: []
55
+ require_paths:
56
+ - lib
57
+ required_ruby_version: !ruby/object:Gem::Requirement
58
+ none: false
59
+ requirements:
60
+ - - ! '>='
61
+ - !ruby/object:Gem::Version
62
+ version: '0'
63
+ required_rubygems_version: !ruby/object:Gem::Requirement
64
+ none: false
65
+ requirements:
66
+ - - ! '>='
67
+ - !ruby/object:Gem::Version
68
+ version: '0'
69
+ requirements: []
70
+ rubyforge_project:
71
+ rubygems_version: 1.8.17
72
+ signing_key:
73
+ specification_version: 3
74
+ summary: English lemmatizer in Ruby
75
+ test_files:
76
+ - spec/lemmatizer_spec.rb
77
+ - spec/spec_helper.rb