lemmatizer 0.0.1 → 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.md +22 -2
- data/lib/dict/index.adj +0 -29
- data/lib/dict/index.adv +0 -29
- data/lib/dict/index.verb +0 -1
- data/lib/lemmatizer.rb +32 -23
- data/lib/lemmatizer/version.rb +1 -1
- data/spec/lemmatizer_spec.rb +17 -0
- metadata +4 -4
data/README.md
CHANGED
@@ -1,7 +1,15 @@
|
|
1
1
|
lemmatizer
|
2
2
|
==========
|
3
3
|
|
4
|
-
Lemmatizer for text in English. Inspired by Python's nltk.corpus.reader.wordnet.morphy package
|
4
|
+
Lemmatizer for text in English. Inspired by Python's [nltk.corpus.reader.wordnet.morphy](http://nltk.googlecode.com/svn/trunk/doc/api/nltk.corpus.reader.wordnet-pysrc.html#WordNetCorpusReader.morphy) package.
|
5
|
+
|
6
|
+
Based on code posted by mtbr in his blog entry [WordNet-based lemmatizer](http://d.hatena.ne.jp/mtbr/20090303/prfrnlprubyWordNetbasedlemmatizer).
|
7
|
+
|
8
|
+
Installation
|
9
|
+
------------
|
10
|
+
|
11
|
+
sudo gem install lemmatizer
|
12
|
+
|
5
13
|
|
6
14
|
Usage
|
7
15
|
-----
|
@@ -14,7 +22,19 @@ Usage
|
|
14
22
|
p lem.lemma("hired", :verb ) # => "hire"
|
15
23
|
p lem.lemma("hotter", :adj ) # => "hot"
|
16
24
|
p lem.lemma("better", :adv ) # => "well"
|
25
|
+
|
26
|
+
# When the part-of-speech symbol is not specified as the second argument, lemmatizer tries :verb, :noun, :adj, and :adv one by one, in this order.
|
27
|
+
p lem.lemma("fired") # => "fire"
|
28
|
+
p lem.lemma("slow") # => "slow"
|
29
|
+
|
30
|
+
Limitations
|
31
|
+
-----------
|
17
32
|
|
18
33
|
# Lemmatizer leaves alone words that its dictionary does not contain. This keeps proper names such as "James" intact.
|
19
34
|
p lem.lemma("MacBooks", :noun) # => "MacBooks"
|
20
|
-
|
35
|
+
|
36
|
+
# If an inflected form of a word is itself included as a lemma in the word list, lemmatizer may not give the expected result.
|
37
|
+
p lem.lemma("higher", :adj) # => "higher" not "high"!
|
38
|
+
|
39
|
+
# This happens because "higher" is itself an entry word listed in dict/index.adj.
|
40
|
+
# Modify dict/index.{noun|verb|adj|adv} if necessary.
|
data/lib/dict/index.adj
CHANGED
@@ -1,32 +1,3 @@
|
|
1
|
-
1 This software and database is being provided to you, the LICENSEE, by
|
2
|
-
2 Princeton University under the following license. By obtaining, using
|
3
|
-
3 and/or copying this software and database, you agree that you have
|
4
|
-
4 read, understood, and will comply with these terms and conditions.:
|
5
|
-
5
|
6
|
-
6 Permission to use, copy, modify and distribute this software and
|
7
|
-
7 database and its documentation for any purpose and without fee or
|
8
|
-
8 royalty is hereby granted, provided that you agree to comply with
|
9
|
-
9 the following copyright notice and statements, including the disclaimer,
|
10
|
-
10 and that the same appear on ALL copies of the software, database and
|
11
|
-
11 documentation, including modifications that you make for internal
|
12
|
-
12 use or for distribution.
|
13
|
-
13
|
14
|
-
14 WordNet 3.0 Copyright 2006 by Princeton University. All rights reserved.
|
15
|
-
15
|
16
|
-
16 THIS SOFTWARE AND DATABASE IS PROVIDED "AS IS" AND PRINCETON
|
17
|
-
17 UNIVERSITY MAKES NO REPRESENTATIONS OR WARRANTIES, EXPRESS OR
|
18
|
-
18 IMPLIED. BY WAY OF EXAMPLE, BUT NOT LIMITATION, PRINCETON
|
19
|
-
19 UNIVERSITY MAKES NO REPRESENTATIONS OR WARRANTIES OF MERCHANT-
|
20
|
-
20 ABILITY OR FITNESS FOR ANY PARTICULAR PURPOSE OR THAT THE USE
|
21
|
-
21 OF THE LICENSED SOFTWARE, DATABASE OR DOCUMENTATION WILL NOT
|
22
|
-
22 INFRINGE ANY THIRD PARTY PATENTS, COPYRIGHTS, TRADEMARKS OR
|
23
|
-
23 OTHER RIGHTS.
|
24
|
-
24
|
25
|
-
25 The name of Princeton University or Princeton may not be used in
|
26
|
-
26 advertising or publicity pertaining to distribution of the software
|
27
|
-
27 and/or database. Title to copyright in this software, database and
|
28
|
-
28 any associated documentation shall at all times remain with
|
29
|
-
29 Princeton University and LICENSEE agrees to preserve same.
|
30
1
|
.22-caliber a 1 1 \ 1 0 03146310
|
31
2
|
.22-calibre a 1 1 \ 1 0 03146310
|
32
3
|
.22_caliber a 1 1 \ 1 0 03146310
|
data/lib/dict/index.adv
CHANGED
@@ -1,32 +1,3 @@
|
|
1
|
-
1 This software and database is being provided to you, the LICENSEE, by
|
2
|
-
2 Princeton University under the following license. By obtaining, using
|
3
|
-
3 and/or copying this software and database, you agree that you have
|
4
|
-
4 read, understood, and will comply with these terms and conditions.:
|
5
|
-
5
|
6
|
-
6 Permission to use, copy, modify and distribute this software and
|
7
|
-
7 database and its documentation for any purpose and without fee or
|
8
|
-
8 royalty is hereby granted, provided that you agree to comply with
|
9
|
-
9 the following copyright notice and statements, including the disclaimer,
|
10
|
-
10 and that the same appear on ALL copies of the software, database and
|
11
|
-
11 documentation, including modifications that you make for internal
|
12
|
-
12 use or for distribution.
|
13
|
-
13
|
14
|
-
14 WordNet 3.0 Copyright 2006 by Princeton University. All rights reserved.
|
15
|
-
15
|
16
|
-
16 THIS SOFTWARE AND DATABASE IS PROVIDED "AS IS" AND PRINCETON
|
17
|
-
17 UNIVERSITY MAKES NO REPRESENTATIONS OR WARRANTIES, EXPRESS OR
|
18
|
-
18 IMPLIED. BY WAY OF EXAMPLE, BUT NOT LIMITATION, PRINCETON
|
19
|
-
19 UNIVERSITY MAKES NO REPRESENTATIONS OR WARRANTIES OF MERCHANT-
|
20
|
-
20 ABILITY OR FITNESS FOR ANY PARTICULAR PURPOSE OR THAT THE USE
|
21
|
-
21 OF THE LICENSED SOFTWARE, DATABASE OR DOCUMENTATION WILL NOT
|
22
|
-
22 INFRINGE ANY THIRD PARTY PATENTS, COPYRIGHTS, TRADEMARKS OR
|
23
|
-
23 OTHER RIGHTS.
|
24
|
-
24
|
25
|
-
25 The name of Princeton University or Princeton may not be used in
|
26
|
-
26 advertising or publicity pertaining to distribution of the software
|
27
|
-
27 and/or database. Title to copyright in this software, database and
|
28
|
-
28 any associated documentation shall at all times remain with
|
29
|
-
29 Princeton University and LICENSEE agrees to preserve same.
|
30
1
|
'tween r 1 0 1 0 00250898
|
31
2
|
'tween_decks r 1 0 1 0 00498293
|
32
3
|
a.d. r 1 0 1 0 00001837
|
data/lib/dict/index.verb
CHANGED
data/lib/lemmatizer.rb
CHANGED
@@ -1,8 +1,9 @@
|
|
1
1
|
#! /usr/bin/env ruby
|
2
2
|
# -*- coding: utf-8; mode: ruby -*-
|
3
3
|
|
4
|
-
#
|
5
|
-
# http://
|
4
|
+
# Inspired by nltk.corpus.reader.wordnet.morphy http://nltk.googlecode.com/svn/trunk/doc/api/nltk.corpus.reader.wordnet-pysrc.html#WordNetCorpusReader.morphy
|
5
|
+
# Original code posted by mtbr at http://d.hatena.ne.jp/mtbr/20090303/prfrnlprubyWordNetbasedlemmatizer
|
6
|
+
|
6
7
|
|
7
8
|
require "lemmatizer/version"
|
8
9
|
require "stringio"
|
@@ -22,11 +23,10 @@ class Lemmatizer
|
|
22
23
|
|
23
24
|
MORPHOLOGICAL_SUBSTITUTIONS = {
|
24
25
|
:noun => [['s', ''], ['ses', 's'], ['ves', 'f'], ['xes', 'x'],
|
25
|
-
|
26
|
-
|
26
|
+
['zes', 'z'], ['ches', 'ch'], ['shes', 'sh'],
|
27
|
+
['men', 'man'], ['ies', 'y']],
|
27
28
|
:verb => [['s', ''], ['ies', 'y'], ['es', 'e'], ['es', ''],
|
28
|
-
|
29
|
-
|
29
|
+
['ed', 'e'], ['ed', ''], ['ing', 'e'], ['ing', '']],
|
30
30
|
:adj => [['er', ''], ['est', ''], ['er', 'e'], ['est', 'e']],
|
31
31
|
:adv => []}
|
32
32
|
|
@@ -38,7 +38,7 @@ class Lemmatizer
|
|
38
38
|
@exceptions[x] = {}
|
39
39
|
end
|
40
40
|
if files then
|
41
|
-
files.each_pair do |pos,pair|
|
41
|
+
files.each_pair do |pos, pair|
|
42
42
|
load_wordnet_files(pos, pair[0], pair[1])
|
43
43
|
end
|
44
44
|
end
|
@@ -64,19 +64,33 @@ class Lemmatizer
|
|
64
64
|
|
65
65
|
open_file(exc) do |io|
|
66
66
|
io.each_line do |line|
|
67
|
-
w,s = line.split(/\s+/)
|
67
|
+
w, s = line.split(/\s+/)
|
68
68
|
@exceptions[pos][w] ||= []
|
69
69
|
@exceptions[pos][w] << s
|
70
70
|
end
|
71
71
|
end
|
72
72
|
end
|
73
73
|
|
74
|
+
def _each_substitutions(form, pos)
|
75
|
+
if lemma = @wordlists[pos][form] then
|
76
|
+
yield lemma
|
77
|
+
end
|
78
|
+
MORPHOLOGICAL_SUBSTITUTIONS[pos].each do |entry|
|
79
|
+
old, new = *entry
|
80
|
+
if form.endwith(old)
|
81
|
+
_each_substitutions(form[0, form.length - old.length] + new, pos) do|x|
|
82
|
+
yield x
|
83
|
+
end
|
84
|
+
end
|
85
|
+
end
|
86
|
+
end
|
87
|
+
|
74
88
|
def each_lemma(form, pos)
|
75
89
|
if lemma = @exceptions[pos][form] then
|
76
90
|
lemma.each{|x |yield x}
|
77
91
|
end
|
78
92
|
if pos == :noun and form.endwith('ful')
|
79
|
-
each_lemma(form[0,form.length-3], pos) do |x|
|
93
|
+
each_lemma(form[0, form.length-3], pos) do |x|
|
80
94
|
yield x+'ful'
|
81
95
|
end
|
82
96
|
else
|
@@ -86,23 +100,18 @@ class Lemmatizer
|
|
86
100
|
end
|
87
101
|
end
|
88
102
|
|
89
|
-
def lemma(form,pos)
|
103
|
+
def lemma(form, pos = nil)
|
104
|
+
if !pos
|
105
|
+
[:verb, :noun, :adj, :adv].each do |p|
|
106
|
+
result = lemma(form, p)
|
107
|
+
return result unless result == form
|
108
|
+
end
|
109
|
+
return form
|
110
|
+
end
|
90
111
|
each_lemma(form, pos) do |x|
|
91
112
|
return x
|
92
113
|
end
|
93
114
|
return form
|
94
115
|
end
|
95
|
-
|
96
|
-
if lemma = @wordlists[pos][form] then
|
97
|
-
yield lemma
|
98
|
-
end
|
99
|
-
MORPHOLOGICAL_SUBSTITUTIONS[pos].each do |entry|
|
100
|
-
old, new = *entry
|
101
|
-
if form.endwith(old)
|
102
|
-
_each_substitutions(form[0, form.length - old.length] + new, pos) do|x|
|
103
|
-
yield x
|
104
|
-
end
|
105
|
-
end
|
106
|
-
end
|
107
|
-
end
|
116
|
+
|
108
117
|
end
|
data/lib/lemmatizer/version.rb
CHANGED
data/spec/lemmatizer_spec.rb
CHANGED
@@ -44,6 +44,23 @@ describe "Lemmatizer" do
|
|
44
44
|
|
45
45
|
result_r2 = @lemmatizer.lemma("best", :adv)
|
46
46
|
result_r2.should_not == "good"
|
47
|
+
|
48
|
+
# Lemmatizer gives a result even when no pos is given, by trying :verb, :noun, :adj, and :adv in that order.
|
49
|
+
result_1 = @lemmatizer.lemma("plays")
|
50
|
+
result_1.should == "play"
|
51
|
+
|
52
|
+
result_2 = @lemmatizer.lemma("oxen")
|
53
|
+
result_2.should == "ox"
|
54
|
+
|
55
|
+
result_3 = @lemmatizer.lemma("higher")
|
56
|
+
result_3.should_not == "high" # since 'higher' is itself contained in the adj list.
|
57
|
+
|
58
|
+
# test cases for words used in README
|
59
|
+
result_t1 = @lemmatizer.lemma("fired")
|
60
|
+
result_t1.should == "fire"
|
61
|
+
|
62
|
+
result_t2 = @lemmatizer.lemma("slower")
|
63
|
+
result_t2.should == "slow"
|
47
64
|
end
|
48
65
|
end
|
49
66
|
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: lemmatizer
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.1
|
4
|
+
version: 0.1.0
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,11 +9,11 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2012-10-
|
12
|
+
date: 2012-10-29 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: rspec
|
16
|
-
requirement: &
|
16
|
+
requirement: &70314483330880 !ruby/object:Gem::Requirement
|
17
17
|
none: false
|
18
18
|
requirements:
|
19
19
|
- - ! '>='
|
@@ -21,7 +21,7 @@ dependencies:
|
|
21
21
|
version: '0'
|
22
22
|
type: :development
|
23
23
|
prerelease: false
|
24
|
-
version_requirements: *
|
24
|
+
version_requirements: *70314483330880
|
25
25
|
description: Lemmatizer for text in English. Inspired by Python's nltk.corpus.reader.wordnet.morphy
|
26
26
|
package.
|
27
27
|
email:
|