rwordnet2 2.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,14 @@
# Times two things: the very first lemma lookup for the noun given on the
# command line, and 1000 repeated lookups of the noun 'fruit'.
require 'benchmark'
require 'rwordnet'

# The usage check lives inside the timed block, so a missing argument raises
# from within Benchmark.realtime before anything is printed.
first_lookup_seconds = Benchmark.realtime do
  requested_word = ARGV[0] || raise("Usage: ruby benchmark.rb noun")
  WordNet::Lemma.find(requested_word, :noun)
end

puts "Time to initial word #{first_lookup_seconds}"

repeated_lookup_seconds = Benchmark.realtime do
  1000.times do
    WordNet::Lemma.find('fruit', :noun)
  end
end

puts "Time for 1k lookups #{repeated_lookup_seconds}"
@@ -0,0 +1,20 @@
# Use WordNet as a command-line dictionary.
require 'rwordnet'

# Exactly one argument (the word to look up) is required.
unless ARGV.size == 1
  puts "Usage: ruby dictionary.rb word"
  exit(1)
end

# Look the word up under every part of speech (noun, verb, etc.), then print
# each lemma followed by a numbered list (starting at 1) of its glosses.
WordNet::Lemma.find_all(ARGV[0]).each do |entry|
  puts entry
  entry.synsets.each.with_index(1) do |sense, number|
    puts "\t#{number}) #{sense.gloss}"
  end
end
@@ -0,0 +1,9 @@
require 'rwordnet'

# Look up the noun 'dog' and keep only its first synset.
first_sense = WordNet::Lemma.find("dog", :noun).synsets[0]
puts first_sense

# Print the full hypernym derivation for that first sense, one synset per line.
first_sense.expanded_hypernyms.each do |ancestor|
  puts ancestor
end
@@ -0,0 +1,20 @@
require 'rwordnet'

# Heading printed before each per-part-of-speech morphy result.
pos_headings = [['as noun', 'noun'], ['as verb', 'verb']]

# For each sample word: print the word, a separator rule, and the base forms
# morphy finds for it as a noun and as a verb. A blank line separates words.
['dogs', 'hiking'].each_with_index do |sample, position|
  puts '' unless position.zero?
  puts sample
  puts '--------------'
  pos_headings.each do |heading, pos|
    puts heading
    p WordNet::Synset.morphy(sample, pos)
  end
end

# Finally, the base forms of 'hiking' across every part of speech at once.
puts 'as all'
p WordNet::Synset.morphy_all('hiking')
@@ -0,0 +1,8 @@
require 'rwordnet'

# Print every synset found for each sample word (across all parts of speech),
# with an empty line between the two listings.
['hiking', 'dogs'].each_with_index do |sample, position|
  puts '' unless position.zero?
  puts sample
  WordNet::Synset.find_all(sample).each do |synset|
    puts synset
  end
end
@@ -0,0 +1,25 @@
module WordNet
  # Represents the WordNet database, and provides some basic interaction.
  class DB
    # By default, use the bundled WordNet
    @path = File.expand_path("../../../WordNet-3.0/", __FILE__)
    @raw_wordnet = {}

    class << self
      # General-purpose class-level slot; this class itself never reads it.
      attr_accessor :cached

      # The root directory of the WordNet installation currently in use.
      # Defaults to the bundled version of WordNet. To use your own WordNet
      # installation instead, assign its root directory:
      #
      #   WordNet::DB.path = "/path/to/WordNet-3.0"
      attr_accessor :path

      # Open a file of the WordNet database for reading. You shouldn't have to
      # call this directly; it's handled by the autocaching implemented in
      # lemma.rb.
      #
      # `path` is joined onto DB.path, so it should be given relative to the
      # root of the WordNet installation (e.g. "dict/index.noun").
      #
      # With a block, the open file is yielded and File.open's block semantics
      # apply. Without a block, the open File is returned and the caller is
      # responsible for closing it.
      def open(path, &block)
        location = File.join(self.path, path)
        File.open(location, "r", &block)
      end
    end
  end
end
@@ -0,0 +1,87 @@
module WordNet
  # Represents a single word in the WordNet lexicon, which can be used to look up a set of synsets.
  class Lemma
    SPACE = ' '
    POS_SHORTHAND = {:v => :verb, :n => :noun, :a => :adj, :r => :adv}

    # The word this lemma represents
    attr_accessor :word

    # The part of speech of this lemma, exactly as it appears in the second
    # field of the index line (the shorthand form, e.g. 'n' or 'v').
    attr_accessor :pos

    # The number of times the sense is tagged in various semantic concordance texts. A tagsense_count of 0 indicates that the sense has not been semantically tagged.
    attr_accessor :tagsense_count

    # The offsets, in bytes, at which the synsets contained in this lemma are stored in WordNet's internal database.
    attr_accessor :synset_offsets

    # An integer id that references this lemma: the 1-based line number of the
    # lemma's entry within its index file.
    attr_accessor :id

    # An array of valid pointer symbols for this lemma. The list of all valid
    # pointer symbols is defined in pointers.rb.
    attr_accessor :pointer_symbols

    # Create a lemma from a line in a lexicon (index) file. You should not be creating Lemmas by hand; instead,
    # use the WordNet::Lemma.find and WordNet::Lemma.find_all methods to find the Lemma for a word.
    #
    # The line is whitespace-separated and is consumed in this order: word,
    # pos, synset count, pointer count, that many pointer symbols, a redundant
    # sense count, the tagsense count, then the synset offsets.
    def initialize(lexicon_line, id)
      @id = id
      fields = lexicon_line.split(" ")

      @word = fields.shift
      @pos = fields.shift
      synset_count = fields.shift.to_i
      pointer_count = fields.shift.to_i
      @pointer_symbols = fields.slice!(0, pointer_count)
      fields.shift # Throw away redundant sense_cnt
      @tagsense_count = fields.shift.to_i
      @synset_offsets = fields.slice!(0, synset_count).map(&:to_i)
    end

    # Return a list of synsets for this Lemma. Each synset represents a different sense, or meaning, of the word.
    # Every call builds fresh Synset objects from the stored offsets.
    def synsets
      @synset_offsets.map { |offset| Synset.new(@pos, offset) }
    end

    # Returns a compact string representation of this lemma, e.g. "fall,v" for
    # the verb form of the word "fall".
    def to_s
      [@word, @pos].join(",")
    end

    class << self
      # Per-part-of-speech index cache: maps a normalized pos symbol to a hash
      # of word => [index line, line number].
      @@cache = {}

      # Find all lemmas for this word across all known parts of speech.
      # Parts of speech in which the word does not occur are left out.
      def find_all(word)
        [:noun, :verb, :adj, :adv].map { |pos| find(word, pos) }.compact
      end

      # Find a lemma for a given word and pos. Valid parts of speech are:
      # 'adj', 'adv', 'noun', 'verb'. Additionally, you can use the shorthand
      # forms of each of these ('a', 'r', 'n', 'v'). Both strings and symbols
      # are accepted.
      #
      # Returns the Lemma, or nil when the word is not in that index.
      def find(word, pos)
        # Normalize to a symbol first so that 'n', :n, 'noun' and :noun all
        # map to the same shorthand entry and share a single cache.
        pos = pos.to_sym if pos.respond_to?(:to_sym)
        pos = POS_SHORTHAND.fetch(pos, pos)

        cache = @@cache[pos] ||= build_cache(pos)
        found = cache[word]
        Lemma.new(*found) if found
      end

      private

      # Read dict/index.<pos> once and map the first space-delimited token of
      # every line to [line, 1-based line number]. The block form of DB.open
      # is used so the file handle is closed as soon as the index is read.
      def build_cache(pos)
        DB.open(File.join("dict", "index.#{pos}")) do |index_file|
          cache = {}
          index_file.each_line.each_with_index do |line, index|
            word = line.slice(0, line.index(SPACE))
            cache[word] = [line, index + 1]
          end
          cache
        end
      end
    end
  end
end
@@ -0,0 +1,32 @@
module WordNet
  # Pointers represent the relations between the words in one synset and another.
  class Pointer
    # The symbol that defines the relationship this pointer represents, e.g. "!" for verb antonym. Valid
    # pointer symbols are defined in pointers.rb
    attr_reader :symbol

    # The offset value given for this pointer. Synset#relation passes it,
    # together with +pos+, to Synset.new to load the synset pointed to.
    attr_reader :offset

    # The part of speech this pointer represents. One of 'n', 'v', 'a' (adjective), or 'r' (adverb).
    attr_reader :pos

    # The source half of the pointer's source/target field: what remains of
    # the +source+ argument once characters 2..3 have been removed.
    attr_reader :source

    # The target half of the pointer's source/target field: characters 2..3
    # of the +source+ argument.
    attr_reader :target

    # Create a pointer. Pointers represent the relations between the words in one synset and another,
    # and are referenced by a shorthand symbol (e.g. '!' for verb antonym). The list
    # of valid pointer symbols is defined in pointers.rb
    #
    # All four keywords are required; omitting one raises a RuntimeError that
    # names the missing keyword. +source+ is the combined source/target field
    # (e.g. "0000"); it is split into #source and #target without modifying
    # the string that was passed in.
    def initialize(symbol: raise("missing keyword: symbol"),
                   offset: raise("missing keyword: offset"),
                   pos: raise("missing keyword: pos"),
                   source: raise("missing keyword: source"))
      @symbol, @offset, @pos = symbol, offset, pos

      # Work on a copy: slice! is destructive, and the caller's string may be
      # frozen or reused.
      remainder = source.dup
      @target = remainder.slice!(2, 2)
      @source = remainder
    end

    # True when both the source and target halves are "00".
    def is_semantic?
      source == "00" && target == "00"
    end

    # Conventionally named predicate; same result as #is_semantic?.
    alias semantic? is_semantic?
  end
end
@@ -0,0 +1,82 @@
# A container for various constants.
# In particular, contains constants representing the WordNet symbols used to look up synsets by relation, i.e. Hypernym/Hyponym.
# Use these symbols in conjunction with the Synset#relation method.

module WordNet
  # Pointer symbol => description, for noun synsets.
  NOUN_POINTERS = {
    "-c" => "Member of this domain - TOPIC",
    "+" => "Derivationally related form",
    "%p" => "Part meronym",
    "~i" => "Instance Hyponym",
    "@" => "Hypernym",
    ";r" => "Domain of synset - REGION",
    "!" => "Antonym",
    "#p" => "Part holonym",
    "%s" => "Substance meronym",
    ";u" => "Domain of synset - USAGE",
    "-r" => "Member of this domain - REGION",
    "#s" => "Substance holonym",
    "=" => "Attribute",
    "-u" => "Member of this domain - USAGE",
    ";c" => "Domain of synset - TOPIC",
    "%m" => "Member meronym",
    "~" => "Hyponym",
    "@i" => "Instance Hypernym",
    "#m" => "Member holonym"
  }

  # Pointer symbol => description, for verb synsets.
  VERB_POINTERS = {
    "+" => "Derivationally related form",
    "@" => "Hypernym",
    ";r" => "Domain of synset - REGION",
    "!" => "Antonym",
    ";u" => "Domain of synset - USAGE",
    "$" => "Verb Group",
    ";c" => "Domain of synset - TOPIC",
    ">" => "Cause",
    "~" => "Hyponym",
    "*" => "Entailment",
    "^" => "Also see"
  }

  # Pointer symbol => description, for adjective synsets.
  ADJECTIVE_POINTERS = {
    ";r" => "Domain of synset - REGION",
    "!" => "Antonym",
    "\\" => "Pertainym (pertains to noun)",
    "<" => "Participle of verb",
    "&" => "Similar to",
    "=" => "Attribute",
    ";c" => "Domain of synset - TOPIC",
    ";u" => "Domain of synset - USAGE",
    "^" => "Also see"
  }

  # Pointer symbol => description, for adverb synsets.
  ADVERB_POINTERS = {
    ";r" => "Domain of synset - REGION",
    "!" => "Antonym",
    ";u" => "Domain of synset - USAGE",
    "\\" => "Derived from adjective",
    ";c" => "Domain of synset - TOPIC"
  }

  MEMBER_OF_THIS_DOMAIN_TOPIC = "-c"
  DERIVATIONALLY_RELATED_FORM = "+"
  PART_MERONYM = "%p"
  INSTANCE_HYPONYM = "~i"
  # Original, unconventionally cased name; kept so existing callers keep working.
  InstanceHyponym = INSTANCE_HYPONYM
  HYPERNYM = "@"
  DOMAIN_OF_SYNSET_REGION = ";r"
  ANTONYM = "!"
  PART_HOLONYM = "#p"
  SUBSTANCE_MERONYM = "%s"
  VERB_GROUP = "$"
  DOMAIN_OF_SYNSET_USAGE = ";u"
  MEMBER_OF_THIS_DOMAIN_REGION = "-r"
  SUBSTANCE_HOLONYM = "#s"
  DERIVED_FROM_ADJECTIVE = "\\"
  PARTICIPLE_OF_VERB = "<"
  SIMILAR_TO = "&"
  ATTRIBUTE = "="
  ALSO_SEE = "^"
  CAUSE = ">"
  MEMBER_OF_THIS_DOMAIN_USAGE = "-u"
  DOMAIN_OF_SYNSET_TOPIC = ";c"
  MEMBER_MERONYM = "%m"
  HYPONYM = "~"
  INSTANCE_HYPERNYM = "@i"
  ENTAILMENT = "*"
  MEMBER_HOLONYM = "#m"
end
@@ -0,0 +1,286 @@
module WordNet
  # Shorthand part of speech => name used in data/index/exception file names.
  SYNSET_TYPES = {"n" => "noun", "v" => "verb", "a" => "adj", "r" => "adv"}

  # Per part of speech: [suffix, replacement] pairs tried by Synset.morphy.
  MORPHOLOGICAL_SUBSTITUTIONS = {
    'noun' => [['s', ''], ['ses', 's'], ['ves', 'f'], ['xes', 'x'],
               ['zes', 'z'], ['ches', 'ch'], ['shes', 'sh'],
               ['men', 'man'], ['ies', 'y']],
    'verb' => [['s', ''], ['ies', 'y'], ['es', 'e'], ['es', ''],
               ['ed', 'e'], ['ed', ''], ['ing', 'e'], ['ing', '']],
    'adj' => [['er', ''], ['est', ''], ['er', 'e'], ['est', 'e']],
    'adv' => []}

  # Represents a synset (or group of synonymous words) in WordNet. Synsets are related to each other by various (and numerous!)
  # relationships, including Hypernym (x is a hypernym of y <=> x is a parent of y) and Hyponym (x is a child of y)
  class Synset
    # Directory holding the morphy data; exception lists are read from its
    # "exceptions" subdirectory.
    @morphy_path = File.expand_path("../../../morphy/", __FILE__)
    # Part of speech name => { inflected form => [base forms] }; filled lazily by morphy.
    @exception_map = {}

    # The first field of this synset's data line, kept as a string.
    # You almost certainly don't care about this.
    attr_reader :synset_offset

    # A two digit decimal integer representing the name of the lexicographer file containing the synset for the sense.
    # Probably only of interest if you're using a wordnet database marked up with custom attributes, and you
    # want to ensure that you're using your own additions.
    attr_reader :lex_filenum

    # A hash mapping each word in this Synset to the integer parsed from the
    # field that follows the word in the data line.
    # NOTE(review): in the WordNet data-file format that field appears to be
    # the word's lex_id rather than a frequency -- confirm against wndb(5WN).
    attr_reader :word_counts

    # Get the part of speech type of this synset, as read from the data line.
    attr_reader :synset_type

    # The byte offset (within the data file for +pos+) this synset was read from.
    # You almost certainly don't care about this.
    attr_reader :pos_offset

    # Get a shorthand representation of the part of speech this synset represents, e.g. "v" for verbs.
    attr_reader :pos

    # Get a string representation of this synset's gloss. "Gloss" is a human-readable
    # description of this concept, often with example usage, e.g:
    #
    #    move upward; "The fog lifted"; "The smoke arose from the forest fire"; "The mist uprose from the meadows"
    #
    # for the second sense of the verb "fall"
    attr_reader :gloss

    # Create a new synset by reading from the data file specified by +pos+, at +offset+ bytes into the file. This is how
    # the WordNet database is organized. You shouldn't be creating Synsets directly; instead, use Lemma#synsets.
    #
    # +pos+ must be one of the SYNSET_TYPES keys ('n', 'v', 'a', 'r');
    # anything else raises KeyError.
    def initialize(pos, offset)
      data_line = DB.open(File.join("dict", "data.#{SYNSET_TYPES.fetch(pos)}")) do |f|
        f.seek(offset)
        f.readline.strip
      end

      # Everything before the first " | " is structured data; the rest is the gloss.
      info_line, @gloss = data_line.split(" | ", 2)
      line = info_line.split(" ")

      @pos = pos
      @pos_offset = offset
      @synset_offset = line.shift
      @lex_filenum = line.shift
      @synset_type = line.shift

      @word_counts = {}
      word_count = line.shift.to_i
      word_count.times do
        @word_counts[line.shift] = line.shift.to_i
      end

      pointer_count = line.shift.to_i
      @pointers = Array.new(pointer_count) do
        # The complete symbol is kept (e.g. "%p", "@i") so that two-character
        # relations can be looked up; see #relation for how one-character
        # queries are matched.
        Pointer.new(
          symbol: line.shift,
          offset: line.shift.to_i,
          pos: line.shift,
          source: line.shift
        )
      end
    end

    # Ported from python NLTK
    # Load all synsets with a given lemma and part of speech tag.
    # The word is downcased, reduced to its base forms with morphy, and the
    # synsets of every matching lemma are returned in one flat array.
    def self.find(word, pos)
      word = word.downcase
      lemmas = morphy(word, pos).map { |form| WordNet::Lemma.find(form, pos) }
      lemmas.flat_map(&:synsets)
    end

    # Like find, but across every part of speech in SYNSET_TYPES.
    def self.find_all(word)
      SYNSET_TYPES.values.flat_map { |pos| find(word, pos) }
    end

    # Read the exception list ("<pos>.exc") for every part of speech. Each
    # line is an inflected form followed by its base form(s).
    #
    # The map is built locally and only installed once every file has been
    # read, so a failure part-way through cannot leave a half-filled map
    # behind. File.foreach closes each file when it is done.
    def self.load_exception_map
      loaded = {}
      SYNSET_TYPES.each_value do |pos|
        loaded[pos] = {}
        File.foreach(File.join(@morphy_path, 'exceptions', "#{pos}.exc")) do |line|
          inflected, *base_forms = line.split
          next unless inflected # skip blank lines
          loaded[pos][inflected] = base_forms
        end
      end
      @exception_map = loaded
    end

    # Apply every matching [suffix, replacement] rule for +pos+ to every form
    # once, returning all candidates produced (possibly with duplicates).
    def self._apply_rules(forms, pos)
      substitutions = MORPHOLOGICAL_SUBSTITUTIONS[pos]
      forms.each_with_object([]) do |form, candidates|
        substitutions.each do |suffix, replacement|
          next unless form.end_with?(suffix)
          candidates << form[0...-suffix.length] + replacement
        end
      end
    end

    # Keep only the forms that exist as lemmas for +pos+, without duplicates.
    def self._filter_forms(forms, pos)
      forms.select { |form| Lemma.find(form, pos) }.uniq
    end

    # ported from nltk python
    # from jordanbg:
    # Given an original string x
    # 1. Apply rules once to the input to get y1, y2, y3, etc.
    # 2. Return all that are in the database
    # 3. If there are no matches, keep applying rules until you either
    #    find a match or you can't go any further
    #
    # +pos+ is one of 'noun', 'verb', 'adj', 'adv' (string or symbol).
    # Returns an array of base forms, empty when nothing matches. Raises
    # ArgumentError for an unknown part of speech.
    def self.morphy(form, pos)
      pos = pos.to_s
      load_exception_map if @exception_map.empty?
      exceptions = @exception_map[pos]
      raise ArgumentError, "unknown part of speech: #{pos.inspect}" unless exceptions

      # 0. Check the exception lists
      if exceptions.key?(form)
        return _filter_forms([form] + exceptions[form], pos)
      end

      # 1. Apply rules once to the input to get y1, y2, y3, etc.
      forms = _apply_rules([form], pos)

      # 2. Return all that are in the database (and check the original too)
      results = _filter_forms([form] + forms, pos)
      return results unless results.empty?

      # 3. If there are no matches, keep applying rules until we find a match
      until forms.empty?
        forms = _apply_rules(forms, pos)
        results = _filter_forms(forms, pos)
        return results unless results.empty?
      end

      # Return an empty list if we can't find anything
      []
    end

    # Base forms of +form+ across every part of speech, in one flat array.
    def self.morphy_all(form)
      SYNSET_TYPES.values.flat_map { |pos| morphy(form, pos) }
    end

    # How many words does this Synset include?
    def word_count
      @word_counts.size
    end

    # Get a list of words included in this Synset
    def words
      @word_counts.keys
    end

    # Get an array of Synsets with the relation `pointer_symbol` relative to this
    # Synset. Mostly, this is an internal method used by convenience methods (e.g. Synset#antonyms), but
    # it can take any valid +pointer_symbol+ defined in pointers.rb.
    #
    # A two-character symbol (e.g. "%p") matches exactly. A one-character
    # symbol matches every pointer whose symbol starts with it, so "@" also
    # returns "@i" (instance hypernym) targets.
    #
    # Example (get the gloss of an antonym for 'fall'):
    #    WordNet::Lemma.find("fall", :verb).synsets[1].relation("!")[0].gloss
    def relation(pointer_symbol)
      matching = @pointers.select { |pointer| pointer_matches?(pointer, pointer_symbol) }
      matching.map { |pointer| Synset.new(pointer.pos, pointer.offset) }
    end

    # Get the Synsets of this sense's antonym
    def antonyms
      relation(ANTONYM)
    end

    # Get the parent synset (higher-level category, i.e. fruit -> reproductive_structure).
    # Returns nil when there is none.
    def hypernym
      relation(HYPERNYM)[0]
    end

    # Get the parent synset (higher-level category, i.e. fruit -> reproductive_structure)
    # as an array.
    def hypernyms
      relation(HYPERNYM)
    end

    # Get the child synset(s) (i.e., lower-level categories, i.e. fruit -> edible_fruit)
    def hyponyms
      relation(HYPONYM)
    end

    # Get the entire hyponym tree as a flat array: the direct children first,
    # followed by each child's own expanded hyponyms.
    def expanded_hyponyms
      children = hyponyms
      children + children.flat_map(&:expanded_hyponyms)
    end

    # Follow only the first hypernym at each level, from this synset upward,
    # and return the synsets visited as an array. Stops when a synset has no
    # hypernym or when an offset repeats.
    def expanded_first_hypernyms
      seen = {}
      chain = []
      parent = hypernym

      while parent && !seen[parent.pos_offset]
        seen[parent.pos_offset] = true
        chain << parent # already loaded; no need to read it from disk again
        parent = parent.hypernym
      end

      chain
    end

    # Get the entire hypernym tree (every hypernym reachable from this synset)
    # as an array. Each synset appears once, in the order it is first visited.
    def expanded_hypernyms
      pending = hypernyms
      seen = {}
      ancestors = []

      until pending.empty?
        parent = pending.pop
        next if seen[parent.pos_offset]
        seen[parent.pos_offset] = true
        ancestors << parent
        pending.concat(parent.hypernyms)
      end

      ancestors
    end

    # Like expanded_hypernyms, but pairs each synset with the depth at which
    # it was first reached (direct hypernyms have depth 1).
    #
    # Returns [pairs, max_depth], where pairs is an array of [Synset, depth]
    # and max_depth is the largest depth recorded (1 when there are no
    # hypernyms at all).
    def expanded_hypernyms_depth
      pending = hypernyms.map { |hypernym| [hypernym, 1] }
      seen = {}
      out = []

      max_depth = 1
      until pending.empty?
        parent, depth = pending.pop
        next if seen[parent.pos_offset]
        seen[parent.pos_offset] = true
        out << [parent, depth]
        pending.concat(parent.hypernyms.map { |hypernym| [hypernym, depth + 1] })
        max_depth = [max_depth, depth].max
      end
      [out, max_depth]
    end

    # Returns a compact, human-readable form of this synset, e.g.
    #
    #    (v) fall (descend in free fall under the influence of gravity; "The branch fell from the tree"; "The unfortunate hiker fell into a crevasse")
    #
    # for the second meaning of the verb "fall."
    def to_s
      "(#{@synset_type}) #{words.map { |x| x.tr('_',' ') }.join(', ')} (#{@gloss})"
    end

    alias to_str to_s
    alias size word_count
    alias parent hypernym
    alias parents hypernyms
    alias children hyponyms

    private

    # True when +pointer+ should be returned for the requested symbol: an
    # exact match, or a one-character request matching the symbol's first
    # character.
    def pointer_matches?(pointer, wanted)
      return true if pointer.symbol == wanted
      wanted.is_a?(String) && wanted.length == 1 && pointer.symbol.start_with?(wanted)
    end
  end
end
@@ -0,0 +1,3 @@
# Version information for the gem.
module WordNet
  # The released version of this gem, as a "major.minor.patch" string.
  VERSION = "2.0.1"
end
data/lib/rwordnet.rb ADDED
@@ -0,0 +1,5 @@
1
+ require 'rwordnet/pointer'
2
+ require 'rwordnet/db'
3
+ require 'rwordnet/lemma'
4
+ require 'rwordnet/pointers'
5
+ require 'rwordnet/synset'