lemmatizer 0.0.1 → 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- data/README.md +22 -2
- data/lib/dict/index.adj +0 -29
- data/lib/dict/index.adv +0 -29
- data/lib/dict/index.verb +0 -1
- data/lib/lemmatizer.rb +32 -23
- data/lib/lemmatizer/version.rb +1 -1
- data/spec/lemmatizer_spec.rb +17 -0
- metadata +4 -4
data/README.md
CHANGED
@@ -1,7 +1,15 @@
|
|
1
1
|
lemmatizer
|
2
2
|
==========
|
3
3
|
|
4
|
-
Lemmatizer for text in English. Inspired by Python's nltk.corpus.reader.wordnet.morphy package
|
4
|
+
Lemmatizer for text in English. Inspired by Python's [nltk.corpus.reader.wordnet.morphy](http://nltk.googlecode.com/svn/trunk/doc/api/nltk.corpus.reader.wordnet-pysrc.html#WordNetCorpusReader.morphy) package.
|
5
|
+
|
6
|
+
Based on code posted by mtbr at his blog entry [WordNet-based lemmatizer](http://d.hatena.ne.jp/mtbr/20090303/prfrnlprubyWordNetbasedlemmatizer)
|
7
|
+
|
8
|
+
Installation
|
9
|
+
------------
|
10
|
+
|
11
|
+
sudo gem install lemmatizer
|
12
|
+
|
5
13
|
|
6
14
|
Usage
|
7
15
|
-----
|
@@ -14,7 +22,19 @@ Usage
|
|
14
22
|
p lem.lemma("hired", :verb ) # => "hire"
|
15
23
|
p lem.lemma("hotter", :adj ) # => "hot"
|
16
24
|
p lem.lemma("better", :adv ) # => "well"
|
25
|
+
|
26
|
+
# When the part-of-speech symbol is not specified as the second argument, the lemmatizer tries :verb, :noun, :adj, and :adv one by one, in this order.
|
27
|
+
p lem.lemma("fired") # => "fire"
|
28
|
+
p lem.lemma("slow") # => "slow"
|
29
|
+
|
30
|
+
Limitations
|
31
|
+
-----------
|
17
32
|
|
18
33
|
# Lemmatizer leaves alone words that its dictionary does not contain. This keeps proper names such as "James" intact.
|
19
34
|
p lem.lemma("MacBooks", :noun) # => "MacBooks"
|
20
|
-
|
35
|
+
|
36
|
+
# If an inflected form of a word is itself included as a lemma in the word list, the lemmatizer may not give the expected result.
|
37
|
+
p lem.lemma("higher", :adj) # => "higher" not "high"!
|
38
|
+
|
39
|
+
# This happens because "higher" is itself an entry word listed in dict/index.adj.
|
40
|
+
# Modify dict/index.{noun|verb|adj|adv} if necessary.
|
data/lib/dict/index.adj
CHANGED
@@ -1,32 +1,3 @@
|
|
1
|
-
1 This software and database is being provided to you, the LICENSEE, by
|
2
|
-
2 Princeton University under the following license. By obtaining, using
|
3
|
-
3 and/or copying this software and database, you agree that you have
|
4
|
-
4 read, understood, and will comply with these terms and conditions.:
|
5
|
-
5
|
6
|
-
6 Permission to use, copy, modify and distribute this software and
|
7
|
-
7 database and its documentation for any purpose and without fee or
|
8
|
-
8 royalty is hereby granted, provided that you agree to comply with
|
9
|
-
9 the following copyright notice and statements, including the disclaimer,
|
10
|
-
10 and that the same appear on ALL copies of the software, database and
|
11
|
-
11 documentation, including modifications that you make for internal
|
12
|
-
12 use or for distribution.
|
13
|
-
13
|
14
|
-
14 WordNet 3.0 Copyright 2006 by Princeton University. All rights reserved.
|
15
|
-
15
|
16
|
-
16 THIS SOFTWARE AND DATABASE IS PROVIDED "AS IS" AND PRINCETON
|
17
|
-
17 UNIVERSITY MAKES NO REPRESENTATIONS OR WARRANTIES, EXPRESS OR
|
18
|
-
18 IMPLIED. BY WAY OF EXAMPLE, BUT NOT LIMITATION, PRINCETON
|
19
|
-
19 UNIVERSITY MAKES NO REPRESENTATIONS OR WARRANTIES OF MERCHANT-
|
20
|
-
20 ABILITY OR FITNESS FOR ANY PARTICULAR PURPOSE OR THAT THE USE
|
21
|
-
21 OF THE LICENSED SOFTWARE, DATABASE OR DOCUMENTATION WILL NOT
|
22
|
-
22 INFRINGE ANY THIRD PARTY PATENTS, COPYRIGHTS, TRADEMARKS OR
|
23
|
-
23 OTHER RIGHTS.
|
24
|
-
24
|
25
|
-
25 The name of Princeton University or Princeton may not be used in
|
26
|
-
26 advertising or publicity pertaining to distribution of the software
|
27
|
-
27 and/or database. Title to copyright in this software, database and
|
28
|
-
28 any associated documentation shall at all times remain with
|
29
|
-
29 Princeton University and LICENSEE agrees to preserve same.
|
30
1
|
.22-caliber a 1 1 \ 1 0 03146310
|
31
2
|
.22-calibre a 1 1 \ 1 0 03146310
|
32
3
|
.22_caliber a 1 1 \ 1 0 03146310
|
data/lib/dict/index.adv
CHANGED
@@ -1,32 +1,3 @@
|
|
1
|
-
1 This software and database is being provided to you, the LICENSEE, by
|
2
|
-
2 Princeton University under the following license. By obtaining, using
|
3
|
-
3 and/or copying this software and database, you agree that you have
|
4
|
-
4 read, understood, and will comply with these terms and conditions.:
|
5
|
-
5
|
6
|
-
6 Permission to use, copy, modify and distribute this software and
|
7
|
-
7 database and its documentation for any purpose and without fee or
|
8
|
-
8 royalty is hereby granted, provided that you agree to comply with
|
9
|
-
9 the following copyright notice and statements, including the disclaimer,
|
10
|
-
10 and that the same appear on ALL copies of the software, database and
|
11
|
-
11 documentation, including modifications that you make for internal
|
12
|
-
12 use or for distribution.
|
13
|
-
13
|
14
|
-
14 WordNet 3.0 Copyright 2006 by Princeton University. All rights reserved.
|
15
|
-
15
|
16
|
-
16 THIS SOFTWARE AND DATABASE IS PROVIDED "AS IS" AND PRINCETON
|
17
|
-
17 UNIVERSITY MAKES NO REPRESENTATIONS OR WARRANTIES, EXPRESS OR
|
18
|
-
18 IMPLIED. BY WAY OF EXAMPLE, BUT NOT LIMITATION, PRINCETON
|
19
|
-
19 UNIVERSITY MAKES NO REPRESENTATIONS OR WARRANTIES OF MERCHANT-
|
20
|
-
20 ABILITY OR FITNESS FOR ANY PARTICULAR PURPOSE OR THAT THE USE
|
21
|
-
21 OF THE LICENSED SOFTWARE, DATABASE OR DOCUMENTATION WILL NOT
|
22
|
-
22 INFRINGE ANY THIRD PARTY PATENTS, COPYRIGHTS, TRADEMARKS OR
|
23
|
-
23 OTHER RIGHTS.
|
24
|
-
24
|
25
|
-
25 The name of Princeton University or Princeton may not be used in
|
26
|
-
26 advertising or publicity pertaining to distribution of the software
|
27
|
-
27 and/or database. Title to copyright in this software, database and
|
28
|
-
28 any associated documentation shall at all times remain with
|
29
|
-
29 Princeton University and LICENSEE agrees to preserve same.
|
30
1
|
'tween r 1 0 1 0 00250898
|
31
2
|
'tween_decks r 1 0 1 0 00498293
|
32
3
|
a.d. r 1 0 1 0 00001837
|
data/lib/dict/index.verb
CHANGED
data/lib/lemmatizer.rb
CHANGED
@@ -1,8 +1,9 @@
|
|
1
1
|
#! /usr/bin/env ruby
|
2
2
|
# -*- coding: utf-8; mode: ruby -*-
|
3
3
|
|
4
|
-
#
|
5
|
-
# http://
|
4
|
+
# Inspired by nltk.corpus.reader.wordnet.morphy http://nltk.googlecode.com/svn/trunk/doc/api/nltk.corpus.reader.wordnet-pysrc.html#WordNetCorpusReader.morphy
|
5
|
+
# Original code posted by mtbr at http://d.hatena.ne.jp/mtbr/20090303/prfrnlprubyWordNetbasedlemmatizer
|
6
|
+
|
6
7
|
|
7
8
|
require "lemmatizer/version"
|
8
9
|
require "stringio"
|
@@ -22,11 +23,10 @@ class Lemmatizer
|
|
22
23
|
|
23
24
|
MORPHOLOGICAL_SUBSTITUTIONS = {
|
24
25
|
:noun => [['s', ''], ['ses', 's'], ['ves', 'f'], ['xes', 'x'],
|
25
|
-
|
26
|
-
|
26
|
+
['zes', 'z'], ['ches', 'ch'], ['shes', 'sh'],
|
27
|
+
['men', 'man'], ['ies', 'y']],
|
27
28
|
:verb => [['s', ''], ['ies', 'y'], ['es', 'e'], ['es', ''],
|
28
|
-
|
29
|
-
|
29
|
+
['ed', 'e'], ['ed', ''], ['ing', 'e'], ['ing', '']],
|
30
30
|
:adj => [['er', ''], ['est', ''], ['er', 'e'], ['est', 'e']],
|
31
31
|
:adv => []}
|
32
32
|
|
@@ -38,7 +38,7 @@ class Lemmatizer
|
|
38
38
|
@exceptions[x] = {}
|
39
39
|
end
|
40
40
|
if files then
|
41
|
-
files.each_pair do |pos,pair|
|
41
|
+
files.each_pair do |pos, pair|
|
42
42
|
load_wordnet_files(pos, pair[0], pair[1])
|
43
43
|
end
|
44
44
|
end
|
@@ -64,19 +64,33 @@ class Lemmatizer
|
|
64
64
|
|
65
65
|
open_file(exc) do |io|
|
66
66
|
io.each_line do |line|
|
67
|
-
w,s = line.split(/\s+/)
|
67
|
+
w, s = line.split(/\s+/)
|
68
68
|
@exceptions[pos][w] ||= []
|
69
69
|
@exceptions[pos][w] << s
|
70
70
|
end
|
71
71
|
end
|
72
72
|
end
|
73
73
|
|
74
|
+
def _each_substitutions(form, pos)
|
75
|
+
if lemma = @wordlists[pos][form] then
|
76
|
+
yield lemma
|
77
|
+
end
|
78
|
+
MORPHOLOGICAL_SUBSTITUTIONS[pos].each do |entry|
|
79
|
+
old, new = *entry
|
80
|
+
if form.endwith(old)
|
81
|
+
_each_substitutions(form[0, form.length - old.length] + new, pos) do|x|
|
82
|
+
yield x
|
83
|
+
end
|
84
|
+
end
|
85
|
+
end
|
86
|
+
end
|
87
|
+
|
74
88
|
def each_lemma(form, pos)
|
75
89
|
if lemma = @exceptions[pos][form] then
|
76
90
|
lemma.each{|x |yield x}
|
77
91
|
end
|
78
92
|
if pos == :noun and form.endwith('ful')
|
79
|
-
each_lemma(form[0,form.length-3], pos) do |x|
|
93
|
+
each_lemma(form[0, form.length-3], pos) do |x|
|
80
94
|
yield x+'ful'
|
81
95
|
end
|
82
96
|
else
|
@@ -86,23 +100,18 @@ class Lemmatizer
|
|
86
100
|
end
|
87
101
|
end
|
88
102
|
|
89
|
-
def lemma(form,pos)
|
103
|
+
def lemma(form, pos = nil)
|
104
|
+
if !pos
|
105
|
+
[:verb, :noun, :adj, :adv].each do |p|
|
106
|
+
result = lemma(form, p)
|
107
|
+
return result unless result == form
|
108
|
+
end
|
109
|
+
return form
|
110
|
+
end
|
90
111
|
each_lemma(form, pos) do |x|
|
91
112
|
return x
|
92
113
|
end
|
93
114
|
return form
|
94
115
|
end
|
95
|
-
|
96
|
-
if lemma = @wordlists[pos][form] then
|
97
|
-
yield lemma
|
98
|
-
end
|
99
|
-
MORPHOLOGICAL_SUBSTITUTIONS[pos].each do |entry|
|
100
|
-
old, new = *entry
|
101
|
-
if form.endwith(old)
|
102
|
-
_each_substitutions(form[0, form.length - old.length] + new, pos) do|x|
|
103
|
-
yield x
|
104
|
-
end
|
105
|
-
end
|
106
|
-
end
|
107
|
-
end
|
116
|
+
|
108
117
|
end
|
data/lib/lemmatizer/version.rb
CHANGED
data/spec/lemmatizer_spec.rb
CHANGED
@@ -44,6 +44,23 @@ describe "Lemmatizer" do
|
|
44
44
|
|
45
45
|
result_r2 = @lemmatizer.lemma("best", :adv)
|
46
46
|
result_r2.should_not == "good"
|
47
|
+
|
48
|
+
# Lemmatizer gives a result even when no pos is given, by trying :verb, :noun, :adj, and :adv in that order.
|
49
|
+
result_1 = @lemmatizer.lemma("plays")
|
50
|
+
result_1.should == "play"
|
51
|
+
|
52
|
+
result_2 = @lemmatizer.lemma("oxen")
|
53
|
+
result_2.should == "ox"
|
54
|
+
|
55
|
+
result_3 = @lemmatizer.lemma("higher")
|
56
|
+
result_3.should_not == "high" # since 'higher' is itself contained in the adj list.
|
57
|
+
|
58
|
+
# test cases for words used in README
|
59
|
+
result_t1 = @lemmatizer.lemma("fired")
|
60
|
+
result_t1.should == "fire"
|
61
|
+
|
62
|
+
result_t2 = @lemmatizer.lemma("slower")
|
63
|
+
result_t2.should == "slow"
|
47
64
|
end
|
48
65
|
end
|
49
66
|
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: lemmatizer
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.1
|
4
|
+
version: 0.1.0
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,11 +9,11 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2012-10-
|
12
|
+
date: 2012-10-29 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: rspec
|
16
|
-
requirement: &
|
16
|
+
requirement: &70314483330880 !ruby/object:Gem::Requirement
|
17
17
|
none: false
|
18
18
|
requirements:
|
19
19
|
- - ! '>='
|
@@ -21,7 +21,7 @@ dependencies:
|
|
21
21
|
version: '0'
|
22
22
|
type: :development
|
23
23
|
prerelease: false
|
24
|
-
version_requirements: *
|
24
|
+
version_requirements: *70314483330880
|
25
25
|
description: Lemmatizer for text in English. Inspired by Python's nltk.corpus.reader.wordnet.morphy
|
26
26
|
package.
|
27
27
|
email:
|