rwordnet 1.0.0 → 1.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.markdown +3 -0
- data/examples/full_hypernym.rb +5 -5
- data/examples/morphy.rb +20 -0
- data/examples/synset_find.rb +8 -0
- data/lib/wordnet/db.rb +11 -0
- data/lib/wordnet/lemma.rb +23 -2
- data/lib/wordnet/pointer.rb +19 -1
- data/lib/wordnet/synset.rb +186 -7
- data/lib/wordnet/version.rb +1 -1
- data/morphy/exceptions/adj.exc +1490 -0
- data/morphy/exceptions/adv.exc +7 -0
- data/morphy/exceptions/noun.exc +2054 -0
- data/morphy/exceptions/verb.exc +2401 -0
- data/test/test_helper.rb +19 -0
- data/test/unit/lemma_test.rb +1 -1
- data/test/unit/synset_test.rb +37 -1
- metadata +10 -4
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: af20be262ff83829299dfcaab7bdaf6daca77d9e
|
4
|
+
data.tar.gz: 5fdf6de52538acc9e2e6c6cd59af7f8055aa1361
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 9f232d93029c8f200e6ba54af4461df6a4430e7ecc1189510cee59447303a350bb47f6deab13ad9b3cb7d6730b52d514b73377f673f1d81c03b4997d2a6ababe
|
7
|
+
data.tar.gz: d374907bacd015be0274bb8f8219d1803bc884b9c852f99c83441ca303e7fe2ac4687be7341462e6bb5fb54657f09c91d4cf64edc746ee27c8577ce2286a2942
|
data/README.markdown
CHANGED
@@ -1,6 +1,9 @@
|
|
1
1
|
# A pure Ruby interface to WordNet #
|
2
2
|
|
3
3
|
[![Build Status](https://travis-ci.org/doches/rwordnet.png)](https://travis-ci.org/doches/rwordnet)
|
4
|
+
[![Documentation Status](https://inch-ci.org/github/doches/rwordnet.svg?branch=master)](https://inch-ci.org/github/doches/rwordnet)
|
5
|
+
[![Code Climate](https://codeclimate.com/github/doches/rwordnet/badges/gpa.svg)](https://codeclimate.com/github/doches/rwordnet)
|
6
|
+
[![Test Coverage](https://codeclimate.com/github/doches/rwordnet/badges/coverage.svg)](https://codeclimate.com/github/doches/rwordnet/coverage)
|
4
7
|
|
5
8
|
## Summary ##
|
6
9
|
|
data/examples/full_hypernym.rb
CHANGED
@@ -1,9 +1,9 @@
|
|
1
1
|
require 'wordnet'
|
2
2
|
|
3
|
-
# Find the word '
|
4
|
-
lemma = WordNet::Lemma.find("
|
5
|
-
# Find all the synsets for '
|
3
|
+
# Find the word 'dog'
|
4
|
+
lemma = WordNet::Lemma.find("dog", :noun)
|
5
|
+
# Find all the synsets for 'dog', and pick the first one.
|
6
6
|
synset = lemma.synsets[0]
|
7
7
|
puts synset
|
8
|
-
# Print the full hypernym derivation for the first sense of '
|
9
|
-
synset.
|
8
|
+
# Print the full hypernym derivation for the first sense of 'dog'.
|
9
|
+
synset.expanded_hypernyms.each { |d| puts d }
|
data/examples/morphy.rb
ADDED
@@ -0,0 +1,20 @@
|
|
1
|
+
require 'wordnet'
|
2
|
+
|
3
|
+
puts 'dogs'
|
4
|
+
puts '--------------'
|
5
|
+
puts 'as noun'
|
6
|
+
p WordNet::Synset.morphy('dogs', 'noun')
|
7
|
+
puts 'as verb'
|
8
|
+
p WordNet::Synset.morphy('dogs', 'verb')
|
9
|
+
|
10
|
+
|
11
|
+
puts ''
|
12
|
+
puts 'hiking'
|
13
|
+
puts '--------------'
|
14
|
+
puts 'as noun'
|
15
|
+
p WordNet::Synset.morphy('hiking', 'noun')
|
16
|
+
puts 'as verb'
|
17
|
+
p WordNet::Synset.morphy('hiking', 'verb')
|
18
|
+
puts 'as all'
|
19
|
+
p WordNet::Synset.morphy_all('hiking')
|
20
|
+
|
data/lib/wordnet/db.rb
CHANGED
@@ -1,14 +1,25 @@
|
|
1
|
+
require 'stringio'
|
2
|
+
|
1
3
|
module WordNet
|
2
4
|
# Represents the WordNet database, and provides some basic interaction.
|
3
5
|
class DB
|
4
6
|
# By default, use the bundled WordNet
|
5
7
|
@path = File.expand_path("../../../WordNet-3.0/", __FILE__)
|
6
8
|
|
9
|
+
class << self; attr_accessor :cached end
|
10
|
+
@raw_wordnet = {}
|
11
|
+
|
12
|
+
|
7
13
|
class << self
|
8
14
|
# To use your own WordNet installation (rather than the one bundled with rwordnet):
|
9
15
|
# Returns the path to the WordNet installation currently in use. Defaults to the bundled version of WordNet.
|
10
16
|
attr_accessor :path
|
11
17
|
|
18
|
+
# Open a wordnet database. You shouldn't have to call this directly; it's
|
19
|
+
# handled by the autocaching implemented in lemma.rb.
|
20
|
+
#
|
21
|
+
# `path` should be a string containing the absolute path to the root of a
|
22
|
+
# WordNet installation.
|
12
23
|
def open(path, &block)
|
13
24
|
File.open(File.join(self.path, path), "r", &block)
|
14
25
|
end
|
data/lib/wordnet/lemma.rb
CHANGED
@@ -2,9 +2,27 @@ module WordNet
|
|
2
2
|
# Represents a single word in the WordNet lexicon, which can be used to look up a set of synsets.
|
3
3
|
class Lemma
|
4
4
|
SPACE = ' '
|
5
|
-
attr_accessor :word, :pos, :pointer_symbols, :tagsense_count, :synset_offsets, :id
|
6
5
|
|
7
|
-
#
|
6
|
+
# The word this lemma represents
|
7
|
+
attr_accessor :word
|
8
|
+
|
9
|
+
# The part of speech (noun, verb, adjective) of this lemma. One of 'n', 'v', 'a' (adjective), or 'r' (adverb)
|
10
|
+
attr_accessor :pos
|
11
|
+
|
12
|
+
# The number of times the sense is tagged in various semantic concordance texts. A tagsense_count of 0 indicates that the sense has not been semantically tagged.
|
13
|
+
attr_accessor :tagsense_count
|
14
|
+
|
15
|
+
# The offset, in bytes, at which the synsets contained in this lemma are stored in WordNet's internal database.
|
16
|
+
attr_accessor :synset_offsets
|
17
|
+
|
18
|
+
# A unique integer id that references this lemma. Used internally within WordNet's database.
|
19
|
+
attr_accessor :id
|
20
|
+
|
21
|
+
# An array of valid pointer symbols for this lemma. The list of all valid
|
22
|
+
# pointer symbols is defined in pointers.rb.
|
23
|
+
attr_accessor :pointer_symbols
|
24
|
+
|
25
|
+
# Create a lemma from a line in a lexicon file. You should not be creating Lemmas by hand; instead,
|
8
26
|
# use the WordNet::Lemma.find and WordNet::Lemma.find_all methods to find the Lemma for a word.
|
9
27
|
def initialize(lexicon_line, id)
|
10
28
|
@id = id
|
@@ -24,6 +42,8 @@ module WordNet
|
|
24
42
|
@synset_offsets.map { |offset| Synset.new(@pos, offset) }
|
25
43
|
end
|
26
44
|
|
45
|
+
# Returns a compact string representation of this lemma, e.g. "fall,v" for
|
46
|
+
# the verb form of the word "fall".
|
27
47
|
def to_s
|
28
48
|
[@word, @pos].join(",")
|
29
49
|
end
|
@@ -31,6 +51,7 @@ module WordNet
|
|
31
51
|
class << self
|
32
52
|
@@cache = {}
|
33
53
|
|
54
|
+
# Find all lemmas for this word across all known parts of speech
|
34
55
|
def find_all(word)
|
35
56
|
[:noun, :verb, :adj, :adv].flat_map do |pos|
|
36
57
|
find(word, pos) || []
|
data/lib/wordnet/pointer.rb
CHANGED
@@ -1,7 +1,25 @@
|
|
1
1
|
module WordNet
|
2
|
+
# Pointers represent the relations between the words in one synset and another.
|
2
3
|
class Pointer
|
3
|
-
|
4
|
+
# The symbol that defines the relationship this pointer represents, e.g. "!" for verb antonym. Valid
|
5
|
+
# pointer symbols are defined in pointers.rb
|
6
|
+
attr_reader :symbol
|
4
7
|
|
8
|
+
# The offset, in bytes, of this pointer in WordNet's internal database.
|
9
|
+
attr_reader :offset
|
10
|
+
|
11
|
+
# The part of speech this pointer represents. One of 'n', 'v', 'a' (adjective), or 'r' (adverb).
|
12
|
+
attr_reader :pos
|
13
|
+
|
14
|
+
# The synset from which this pointer...points.
|
15
|
+
attr_reader :source
|
16
|
+
|
17
|
+
# The synset to which this pointer...points.
|
18
|
+
attr_reader :target
|
19
|
+
|
20
|
+
# Create a pointer. Pointers represent the relations between the words in one synset and another,
|
21
|
+
# and are referenced by a shorthand symbol (e.g. '!' for verb antonym). The list
|
22
|
+
# of valid pointer symbols is defined in pointers.rb
|
5
23
|
def initialize(symbol: raise, offset: raise, pos: raise, source: raise)
|
6
24
|
@symbol, @offset, @pos, @source = symbol, offset, pos, source
|
7
25
|
@target = source.slice!(2,2)
|
data/lib/wordnet/synset.rb
CHANGED
@@ -4,7 +4,47 @@ module WordNet
|
|
4
4
|
# Represents a synset (or group of synonymous words) in WordNet. Synsets are related to each other by various (and numerous!)
|
5
5
|
# relationships, including Hypernym (x is a hypernym of y <=> x is a parent of y) and Hyponym (x is a child of y)
|
6
6
|
class Synset
|
7
|
-
|
7
|
+
@morphy_path = File.expand_path("../../../morphy/", __FILE__)
|
8
|
+
@exception_map = {}
|
9
|
+
@morphological_substitutions = {
|
10
|
+
'noun' => [['s', ''], ['ses', 's'], ['ves', 'f'], ['xes', 'x'],
|
11
|
+
['zes', 'z'], ['ches', 'ch'], ['shes', 'sh'],
|
12
|
+
['men', 'man'], ['ies', 'y']],
|
13
|
+
'verb' => [['s', ''], ['ies', 'y'], ['es', 'e'], ['es', ''],
|
14
|
+
['ed', 'e'], ['ed', ''], ['ing', 'e'], ['ing', '']],
|
15
|
+
'adj' => [['er', ''], ['est', ''], ['er', 'e'], ['est', 'e']],
|
16
|
+
'adv' => []}
|
17
|
+
|
18
|
+
# Get the offset, in bytes, at which this synset's information is stored in WordNet's internal DB.
|
19
|
+
# You almost certainly don't care about this.
|
20
|
+
attr_reader :synset_offset
|
21
|
+
|
22
|
+
# A two digit decimal integer representing the name of the lexicographer file containing the synset for the sense.
|
23
|
+
# Probably only of interest if you're using a wordnet database marked up with custom attributes, and you
|
24
|
+
# want to ensure that you're using your own additions.
|
25
|
+
attr_reader :lex_filenum
|
26
|
+
|
27
|
+
# Get the list of words (and their frequencies within the WordNet graph) contained
|
28
|
+
# in this Synset.
|
29
|
+
attr_reader :word_counts
|
30
|
+
|
31
|
+
# Get the part of speech type of this synset. One of 'n' (noun), 'v' (verb), 'a' (adjective), or 'r' (adverb)
|
32
|
+
attr_reader :synset_type
|
33
|
+
|
34
|
+
# Get the offset, in bytes, at which this synset's POS information is stored in WordNet's internal DB.
|
35
|
+
# You almost certainly don't care about this.
|
36
|
+
attr_reader :pos_offset
|
37
|
+
|
38
|
+
# Get a shorthand representation of the part of speech this synset represents, e.g. "v" for verbs.
|
39
|
+
attr_reader :pos
|
40
|
+
|
41
|
+
# Get a string representation of this synset's gloss. "Gloss" is a human-readable
|
42
|
+
# description of this concept, often with example usage, e.g:
|
43
|
+
#
|
44
|
+
# move upward; "The fog lifted"; "The smoke arose from the forest fire"; "The mist uprose from the meadows"
|
45
|
+
#
|
46
|
+
# for the second sense of the verb "fall"
|
47
|
+
attr_reader :gloss
|
8
48
|
|
9
49
|
# Create a new synset by reading from the data file specified by +pos+, at +offset+ bytes into the file. This is how
|
10
50
|
# the WordNet database is organized. You shouldn't be creating Synsets directly; instead, use Lemma#synsets.
|
@@ -40,6 +80,93 @@ module WordNet
|
|
40
80
|
end
|
41
81
|
end
|
42
82
|
|
83
|
+
# Ported from python NLTK
|
84
|
+
# Load all synsets with a given lemma and part of speech tag.
|
85
|
+
# If no pos is specified, all synsets for all parts of speech
|
86
|
+
# will be loaded.
|
87
|
+
# If lang is specified, all the synsets associated with the lemma name
|
88
|
+
# of that language will be returned.
|
89
|
+
def self.find(word, pos)
|
90
|
+
word = word.downcase
|
91
|
+
lemmas = self.morphy(word, pos).map{|form| WordNet::Lemma.find(form, pos)}
|
92
|
+
lemmas.map{|lemma| lemma.synsets}.flatten
|
93
|
+
end
|
94
|
+
|
95
|
+
def self.find_all(word)
|
96
|
+
SYNSET_TYPES.values.map{|pos| self.find(word, pos)}.flatten
|
97
|
+
end
|
98
|
+
|
99
|
+
def self.load_exception_map
|
100
|
+
SYNSET_TYPES.each do |_, pos|
|
101
|
+
@exception_map[pos] = {}
|
102
|
+
File.open(File.join(@morphy_path, 'exceptions', "#{pos}.exc"), 'r').each_line do |line|
|
103
|
+
line = line.split
|
104
|
+
@exception_map[pos][line[0]] = line[1..-1]
|
105
|
+
end
|
106
|
+
end
|
107
|
+
end
|
108
|
+
|
109
|
+
def self._apply_rules(forms, pos)
|
110
|
+
substitutions = @morphological_substitutions[pos]
|
111
|
+
out = []
|
112
|
+
forms.each do |form|
|
113
|
+
substitutions.each do |old, new|
|
114
|
+
if form.end_with? old
|
115
|
+
out.push form[0...-old.length] + new
|
116
|
+
end
|
117
|
+
end
|
118
|
+
end
|
119
|
+
return out
|
120
|
+
end
|
121
|
+
|
122
|
+
def self._filter_forms(forms, pos)
|
123
|
+
forms.reject{|form| Lemma.find(form, pos).nil?}.uniq
|
124
|
+
end
|
125
|
+
|
126
|
+
# ported from nltk python
|
127
|
+
# from jordanbg:
|
128
|
+
# Given an original string x
|
129
|
+
# 1. Apply rules once to the input to get y1, y2, y3, etc.
|
130
|
+
# 2. Return all that are in the database
|
131
|
+
# 3. If there are no matches, keep applying rules until you either
|
132
|
+
# find a match or you can't go any further
|
133
|
+
def self.morphy(form, pos)
|
134
|
+
if @exception_map == {}
|
135
|
+
self.load_exception_map
|
136
|
+
end
|
137
|
+
exceptions = @exception_map[pos]
|
138
|
+
|
139
|
+
# 0. Check the exception lists
|
140
|
+
if exceptions.has_key? form
|
141
|
+
return self._filter_forms([form] + exceptions[form], pos)
|
142
|
+
end
|
143
|
+
|
144
|
+
# 1. Apply rules once to the input to get y1, y2, y3, etc.
|
145
|
+
forms = self._apply_rules([form], pos)
|
146
|
+
|
147
|
+
# 2. Return all that are in the database (and check the original too)
|
148
|
+
results = self._filter_forms([form] + forms, pos)
|
149
|
+
if results != []
|
150
|
+
return results
|
151
|
+
end
|
152
|
+
|
153
|
+
# 3. If there are no matches, keep applying rules until we find a match
|
154
|
+
while forms.length > 0
|
155
|
+
forms = self._apply_rules(forms, pos)
|
156
|
+
results = self._filter_forms(forms, pos)
|
157
|
+
if results != []
|
158
|
+
return results
|
159
|
+
end
|
160
|
+
end
|
161
|
+
|
162
|
+
# Return an empty list if we can't find anything
|
163
|
+
return []
|
164
|
+
end
|
165
|
+
|
166
|
+
def self.morphy_all(form)
|
167
|
+
SYNSET_TYPES.values.map{|pos| self.morphy(form, pos)}.flatten
|
168
|
+
end
|
169
|
+
|
43
170
|
# How many words does this Synset include?
|
44
171
|
def word_count
|
45
172
|
@word_counts.size
|
@@ -50,14 +177,19 @@ module WordNet
|
|
50
177
|
@word_counts.keys
|
51
178
|
end
|
52
179
|
|
53
|
-
#
|
180
|
+
# Get an array of Synsets with the relation `pointer_symbol` relative to this
|
181
|
+
# Synset. Mostly, this is an internal method used by convenience methods (e.g. Synset#antonyms), but
|
182
|
+
# it can take any valid +pointer_symbol+ defined in pointers.rb.
|
183
|
+
#
|
184
|
+
# Example (get the gloss of an antonym for 'fall'):
|
185
|
+
# WordNet::Lemma.find("fall", :verb).synsets[1].relation("!")[0].gloss
|
54
186
|
def relation(pointer_symbol)
|
55
187
|
@pointers.select { |pointer| pointer.symbol == pointer_symbol }.
|
56
188
|
map! { |pointer| Synset.new(@synset_type, pointer.offset) }
|
57
189
|
end
|
58
190
|
|
59
191
|
# Get the Synsets of this sense's antonyms
|
60
|
-
def
|
192
|
+
def antonyms
|
61
193
|
relation(ANTONYM)
|
62
194
|
end
|
63
195
|
|
@@ -66,13 +198,18 @@ module WordNet
|
|
66
198
|
relation(HYPERNYM)[0]
|
67
199
|
end
|
68
200
|
|
201
|
+
# Get the parent synsets (higher-level categories, e.g. fruit -> reproductive_structure).
|
202
|
+
def hypernyms
|
203
|
+
relation(HYPERNYM)
|
204
|
+
end
|
205
|
+
|
69
206
|
# Get the child synset(s) (i.e., lower-level categories, e.g. fruit -> edible_fruit)
|
70
|
-
def
|
207
|
+
def hyponyms
|
71
208
|
relation(HYPONYM)
|
72
209
|
end
|
73
210
|
|
74
211
|
# Get the entire hypernym tree (from this synset all the way up to +entity+) as an array.
|
75
|
-
def
|
212
|
+
def expanded_first_hypernyms
|
76
213
|
parent = hypernym
|
77
214
|
list = []
|
78
215
|
return list unless parent
|
@@ -80,19 +217,61 @@ module WordNet
|
|
80
217
|
while parent
|
81
218
|
break if list.include? parent.pos_offset
|
82
219
|
list.push parent.pos_offset
|
83
|
-
parent = parent.
|
220
|
+
parent = parent.hypernym
|
221
|
+
end
|
222
|
+
|
223
|
+
list.flatten!
|
224
|
+
list.map! { |offset| Synset.new(@pos, offset)}
|
225
|
+
end
|
226
|
+
|
227
|
+
# Get the entire hypernym tree (from this synset all the way up to +entity+) as an array.
|
228
|
+
def expanded_hypernyms
|
229
|
+
parents = hypernyms
|
230
|
+
list = []
|
231
|
+
return list unless parents
|
232
|
+
|
233
|
+
while parents.length > 0
|
234
|
+
parent = parents.pop
|
235
|
+
next if list.include? parent.pos_offset
|
236
|
+
list.push parent.pos_offset
|
237
|
+
parents.push *parent.hypernyms
|
84
238
|
end
|
85
239
|
|
86
240
|
list.flatten!
|
87
241
|
list.map! { |offset| Synset.new(@pos, offset)}
|
88
242
|
end
|
89
243
|
|
244
|
+
def expanded_hypernyms_depth
|
245
|
+
parents = hypernyms.map{|hypernym| [hypernym, 1]}
|
246
|
+
list = []
|
247
|
+
out = []
|
248
|
+
return list unless parents
|
249
|
+
|
250
|
+
max_depth = 1
|
251
|
+
while parents.length > 0
|
252
|
+
parent, depth = parents.pop
|
253
|
+
next if list.include? parent.pos_offset
|
254
|
+
list.push parent.pos_offset
|
255
|
+
out.push [Synset.new(@pos, parent.pos_offset), depth]
|
256
|
+
parents.push *(parent.hypernyms.map{|hypernym| [hypernym, depth + 1]})
|
257
|
+
max_depth = [max_depth, depth].max
|
258
|
+
end
|
259
|
+
return [out, max_depth]
|
260
|
+
end
|
261
|
+
|
262
|
+
# Returns a compact, human-readable form of this synset, e.g.
|
263
|
+
#
|
264
|
+
# (v) fall (descend in free fall under the influence of gravity; "The branch fell from the tree"; "The unfortunate hiker fell into a crevasse")
|
265
|
+
#
|
266
|
+
# for the second meaning of the verb "fall."
|
90
267
|
def to_s
|
91
268
|
"(#{@synset_type}) #{words.map { |x| x.tr('_',' ') }.join(', ')} (#{@gloss})"
|
92
269
|
end
|
93
270
|
|
271
|
+
alias to_str to_s
|
94
272
|
alias size word_count
|
95
273
|
alias parent hypernym
|
96
|
-
alias
|
274
|
+
alias parents hypernyms
|
275
|
+
alias children hyponyms
|
97
276
|
end
|
98
277
|
end
|
data/lib/wordnet/version.rb
CHANGED