lemmatizer 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +20 -0
- data/Gemfile +4 -0
- data/LICENSE.txt +22 -0
- data/README.md +20 -0
- data/Rakefile +9 -0
- data/lemmatizer.gemspec +21 -0
- data/lib/dict/adj.exc +1490 -0
- data/lib/dict/adv.exc +7 -0
- data/lib/dict/index.adj +21508 -0
- data/lib/dict/index.adv +4510 -0
- data/lib/dict/index.noun +117798 -0
- data/lib/dict/index.verb +11530 -0
- data/lib/dict/noun.exc +2054 -0
- data/lib/dict/verb.exc +2401 -0
- data/lib/lemmatizer.rb +108 -0
- data/lib/lemmatizer/version.rb +3 -0
- data/spec/lemmatizer_spec.rb +50 -0
- data/spec/spec_helper.rb +6 -0
- metadata +77 -0
data/lib/lemmatizer.rb
ADDED
@@ -0,0 +1,108 @@
|
|
1
|
+
#! /usr/bin/env ruby
|
2
|
+
# -*- coding: utf-8; mode: ruby -*-
|
3
|
+
|
4
|
+
# inspired by nltk.corpus.reader.wordnet.morphy
|
5
|
+
# http://nltk.googlecode.com/svn/trunk/doc/api/nltk.corpus.reader.wordnet-pysrc.html#WordNetCorpusReader.morphy
|
6
|
+
|
7
|
+
require "lemmatizer/version"
|
8
|
+
require "stringio"
|
9
|
+
|
10
|
+
class String
  # Reports whether the string ends with the literal suffix +s+.
  #
  # The suffix is compared as plain text. The previous implementation built
  # the regexp /#{s}$/, which let regexp metacharacters in +s+ act as
  # patterns (e.g. ".c" matched "abc") and let `$` match before a newline
  # in the middle of the string instead of only at its very end.
  #
  # @param s [#to_s] the suffix to look for
  # @return [Integer, nil] the character offset at which the suffix starts
  #   (truthy, same value the regexp match used to return), or nil when the
  #   string does not end with +s+
  def endwith(s)
    suffix = s.to_s
    end_with?(suffix) ? length - suffix.length : nil
  end
end
|
15
|
+
|
16
|
+
# English lemmatizer driven by WordNet index and exception files, modelled on
# nltk.corpus.reader.wordnet.morphy.
#
# For every part of speech it keeps a list of known words (from the index
# file) and a table of irregular forms (from the exception file). A surface
# form is resolved first through the exception table and then by repeatedly
# stripping/replacing suffixes until a known word is reached.
class Lemmatizer
  current_dir = File.expand_path(File.dirname(__FILE__))

  # Default dictionary files per part of speech: [index file, exception file].
  WN_FILES = {
    :noun => [current_dir + "/dict/index.noun", current_dir + "/dict/noun.exc"],
    :verb => [current_dir + "/dict/index.verb", current_dir + "/dict/verb.exc"],
    :adj  => [current_dir + "/dict/index.adj", current_dir + "/dict/adj.exc"],
    :adv  => [current_dir + "/dict/index.adv", current_dir + "/dict/adv.exc"]
  }

  # Per part of speech, [suffix, replacement] pairs tried in the listed order.
  MORPHOLOGICAL_SUBSTITUTIONS = {
    :noun => [['s', ''], ['ses', 's'], ['ves', 'f'], ['xes', 'x'],
              ['zes', 'z'], ['ches', 'ch'], ['shes', 'sh'],
              ['men', 'man'], ['ies', 'y']],
    :verb => [['s', ''], ['ies', 'y'], ['es', 'e'], ['es', ''],
              ['ed', 'e'], ['ed', ''], ['ing', 'e'], ['ing', '']],

    :adj  => [['er', ''], ['est', ''], ['er', 'e'], ['est', 'e']],
    :adv  => []
  }

  # Builds the word lists and exception tables.
  #
  # @param files [Hash, nil] maps a part-of-speech symbol to a two-element
  #   array [index, exceptions]; each element is a path or an IO/StringIO.
  #   Pass nil (or false) to start with empty dictionaries.
  def initialize(files = WN_FILES)
    @wordlists = {}
    @exceptions = {}
    MORPHOLOGICAL_SUBSTITUTIONS.keys.each do |pos|
      @wordlists[pos] = {}
      @exceptions[pos] = {}
    end
    return unless files

    files.each_pair do |pos, pair|
      load_wordnet_files(pos, pair[0], pair[1])
    end
  end

  # Yields a readable stream for the given source.
  #
  # An IO or StringIO passed as the first argument is yielded as-is (and is
  # not closed here); anything else is handed to File.open, which closes the
  # file when the block returns.
  def open_file(*args)
    if args[0].is_a?(IO) || args[0].is_a?(StringIO)
      yield args[0]
    else
      File.open(*args) do |io|
        yield io
      end
    end
  end

  # Loads one part of speech from an index file and an exception file.
  #
  # @param pos [Symbol] part of speech (a key of MORPHOLOGICAL_SUBSTITUTIONS)
  # @param list [String, IO, StringIO] index file; the first
  #   whitespace-separated token of each line is taken as a known word
  # @param exc [String, IO, StringIO] exception file; each line is an
  #   inflected form followed by one or more lemmas
  def load_wordnet_files(pos, list, exc)
    open_file(list) do |io|
      io.each_line do |line|
        # Blank lines and lines starting with whitespace (the license header
        # of WordNet index files) carry no word; indexing them used to
        # register "" as a known word, so forms that reduce to "" (e.g. "s")
        # were lemmatized to the empty string.
        next if line =~ /\A\s/

        w = line.split(/\s+/)[0]
        @wordlists[pos][w] = w
      end
    end

    open_file(exc) do |io|
      io.each_line do |line|
        # Keep every lemma on the line, not only the first one, and skip
        # lines that have no lemma at all instead of storing nil.
        w, *lemmas = line.split
        next if w.nil? || lemmas.empty?

        (@exceptions[pos][w] ||= []).concat(lemmas)
      end
    end
  end

  # Yields every candidate lemma of +form+, irregular forms first.
  #
  # Nouns ending in "ful" are resolved on the part before "ful" and the
  # suffix is re-attached to each result; all other forms go through the
  # suffix substitution rules.
  #
  # @param form [String] surface form
  # @param pos [Symbol] part of speech
  # @yieldparam x [String] a candidate lemma
  def each_lemma(form, pos)
    irregular = @exceptions[pos][form]
    irregular.each { |x| yield x } if irregular

    if pos == :noun && form.end_with?('ful')
      each_lemma(form[0, form.length - 3], pos) do |x|
        yield x + 'ful'
      end
    else
      _each_substitutions(form, pos) do |x|
        yield x
      end
    end
  end

  # Returns the first candidate lemma of +form+, or +form+ itself when the
  # dictionaries yield none (unknown words are left untouched).
  #
  # @param form [String] surface form
  # @param pos [Symbol] part of speech
  # @return [String]
  def lemma(form, pos)
    each_lemma(form, pos) do |x|
      return x
    end
    form
  end

  # Yields +form+ when it is a known word, then recurses on every variant
  # obtained by replacing a matching suffix according to
  # MORPHOLOGICAL_SUBSTITUTIONS[pos].
  #
  # Suffixes are compared literally with String#end_with?, so this class
  # does not rely on the regexp-based String#endwith core extension.
  def _each_substitutions(form, pos)
    known = @wordlists[pos][form]
    yield known if known

    MORPHOLOGICAL_SUBSTITUTIONS[pos].each do |old, new|
      next unless form.end_with?(old)

      _each_substitutions(form[0, form.length - old.length] + new, pos) do |x|
        yield x
      end
    end
  end
end
|
@@ -0,0 +1,50 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
# -*- coding: utf-8 -*-
|
3
|
+
|
4
|
+
require File.expand_path(File.dirname(__FILE__) + '/spec_helper')
|
5
|
+
require 'lemmatizer'
|
6
|
+
|
7
|
+
# Exercises Lemmatizer#lemma against the dictionaries bundled with the gem.
describe "Lemmatizer" do
  it "contains lemmatizing functions:" do
  end

  # A fresh lemmatizer (default dictionary files) for every example.
  before do
    @lemmatizer = Lemmatizer.new
  end

  describe "lemma" do
    it "takes a word form and its part-of-speech symbol (:noun, :verb, :adj, :adv) and then returns its lemma form" do
      # Nouns
      @lemmatizer.lemma("analyses", :noun).should == "analysis"

      # Lemmatizer leaves alone words that its dictionary does not contain to keep proper names such as "James" intact.
      @lemmatizer.lemma("MacBooks", :noun).should_not == "MacBook"

      @lemmatizer.lemma("desks", :noun).should == "desk"

      # Verbs
      @lemmatizer.lemma("hired", :verb).should == "hire"
      @lemmatizer.lemma("worried", :verb).should == "worry"
      @lemmatizer.lemma("partying", :verb).should == "party"

      # Adjectives
      @lemmatizer.lemma("better", :adj).should == "good"
      @lemmatizer.lemma("hotter", :adj).should == "hot"

      # Adverbs
      @lemmatizer.lemma("best", :adv).should == "well"
      @lemmatizer.lemma("best", :adv).should_not == "good"
    end
  end
end
|
data/spec/spec_helper.rb
ADDED
metadata
ADDED
@@ -0,0 +1,77 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: lemmatizer
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.1
|
5
|
+
prerelease:
|
6
|
+
platform: ruby
|
7
|
+
authors:
|
8
|
+
- Yoichiro Hasebe
|
9
|
+
autorequire:
|
10
|
+
bindir: bin
|
11
|
+
cert_chain: []
|
12
|
+
date: 2012-10-28 00:00:00.000000000 Z
|
13
|
+
dependencies:
|
14
|
+
- !ruby/object:Gem::Dependency
|
15
|
+
name: rspec
|
16
|
+
requirement: &70234015642620 !ruby/object:Gem::Requirement
|
17
|
+
none: false
|
18
|
+
requirements:
|
19
|
+
- - ! '>='
|
20
|
+
- !ruby/object:Gem::Version
|
21
|
+
version: '0'
|
22
|
+
type: :development
|
23
|
+
prerelease: false
|
24
|
+
version_requirements: *70234015642620
|
25
|
+
description: Lemmatizer for text in English. Inspired by Python's nltk.corpus.reader.wordnet.morphy
|
26
|
+
package.
|
27
|
+
email:
|
28
|
+
- yohasebe@gmail.com
|
29
|
+
executables: []
|
30
|
+
extensions: []
|
31
|
+
extra_rdoc_files: []
|
32
|
+
files:
|
33
|
+
- .gitignore
|
34
|
+
- Gemfile
|
35
|
+
- LICENSE.txt
|
36
|
+
- README.md
|
37
|
+
- Rakefile
|
38
|
+
- lemmatizer.gemspec
|
39
|
+
- lib/dict/adj.exc
|
40
|
+
- lib/dict/adv.exc
|
41
|
+
- lib/dict/index.adj
|
42
|
+
- lib/dict/index.adv
|
43
|
+
- lib/dict/index.noun
|
44
|
+
- lib/dict/index.verb
|
45
|
+
- lib/dict/noun.exc
|
46
|
+
- lib/dict/verb.exc
|
47
|
+
- lib/lemmatizer.rb
|
48
|
+
- lib/lemmatizer/version.rb
|
49
|
+
- spec/lemmatizer_spec.rb
|
50
|
+
- spec/spec_helper.rb
|
51
|
+
homepage: http://github.com/yohasebe/lemmatizer
|
52
|
+
licenses: []
|
53
|
+
post_install_message:
|
54
|
+
rdoc_options: []
|
55
|
+
require_paths:
|
56
|
+
- lib
|
57
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
58
|
+
none: false
|
59
|
+
requirements:
|
60
|
+
- - ! '>='
|
61
|
+
- !ruby/object:Gem::Version
|
62
|
+
version: '0'
|
63
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
64
|
+
none: false
|
65
|
+
requirements:
|
66
|
+
- - ! '>='
|
67
|
+
- !ruby/object:Gem::Version
|
68
|
+
version: '0'
|
69
|
+
requirements: []
|
70
|
+
rubyforge_project:
|
71
|
+
rubygems_version: 1.8.17
|
72
|
+
signing_key:
|
73
|
+
specification_version: 3
|
74
|
+
summary: English lemmatizer in Ruby
|
75
|
+
test_files:
|
76
|
+
- spec/lemmatizer_spec.rb
|
77
|
+
- spec/spec_helper.rb
|