rwordnet 1.0.0 → 1.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 4bea9b677b6a581d27c04ad1912c3034bf8329d8
4
- data.tar.gz: 4fdef6fdfdbe2373b445f7857d3fe5ff5071fba2
3
+ metadata.gz: af20be262ff83829299dfcaab7bdaf6daca77d9e
4
+ data.tar.gz: 5fdf6de52538acc9e2e6c6cd59af7f8055aa1361
5
5
  SHA512:
6
- metadata.gz: c1471523dcb27e496eb72b406f37ddd83184dfcecbcabe95c492f6018b29fa53180d06035b7f9c0a938bfde0e4c28a37f89473c0b3a712be527ed2abf6261af3
7
- data.tar.gz: 25cf40f306f3dfcf8076d9430cfd5d8d805af9198762a530ed4f293e5b0c4e5ccc680e941b2256f63c5141d5499ad77b7821eec5b4e245a72a47e1f9ca6e83b3
6
+ metadata.gz: 9f232d93029c8f200e6ba54af4461df6a4430e7ecc1189510cee59447303a350bb47f6deab13ad9b3cb7d6730b52d514b73377f673f1d81c03b4997d2a6ababe
7
+ data.tar.gz: d374907bacd015be0274bb8f8219d1803bc884b9c852f99c83441ca303e7fe2ac4687be7341462e6bb5fb54657f09c91d4cf64edc746ee27c8577ce2286a2942
data/README.markdown CHANGED
@@ -1,6 +1,9 @@
1
1
  # A pure Ruby interface to WordNet #
2
2
 
3
3
  [![Build Status](https://travis-ci.org/doches/rwordnet.png)](https://travis-ci.org/doches/rwordnet)
4
+ [![Documentation Status](https://inch-ci.org/github/doches/rwordnet.svg?branch=master)](https://inch-ci.org/github/doches/rwordnet)
5
+ [![Code Climate](https://codeclimate.com/github/doches/rwordnet/badges/gpa.svg)](https://codeclimate.com/github/doches/rwordnet)
6
+ [![Test Coverage](https://codeclimate.com/github/doches/rwordnet/badges/coverage.svg)](https://codeclimate.com/github/doches/rwordnet/coverage)
4
7
 
5
8
  ## Summary ##
6
9
 
@@ -1,9 +1,9 @@
1
1
  require 'wordnet'
2
2
 
3
- # Find the word 'fruit'
4
- lemma = WordNet::Lemma.find("fruit", :noun)
5
- # Find all the synsets for 'fruit', and pick the first one.
3
+ # Find the word 'dog'
4
+ lemma = WordNet::Lemma.find("dog", :noun)
5
+ # Find all the synsets for 'dog', and pick the first one.
6
6
  synset = lemma.synsets[0]
7
7
  puts synset
8
- # Print the full hypernym derivation for the first sense of 'fruit'.
9
- synset.expanded_hypernym.each { |d| puts d }
8
+ # Print the full hypernym derivation for the first sense of 'dog'.
9
+ synset.expanded_hypernyms.each { |d| puts d }
@@ -0,0 +1,20 @@
1
+ require 'wordnet'
2
+
3
+ puts 'dogs'
4
+ puts '--------------'
5
+ puts 'as noun'
6
+ p WordNet::Synset.morphy('dogs', 'noun')
7
+ puts 'as verb'
8
+ p WordNet::Synset.morphy('dogs', 'verb')
9
+
10
+
11
+ puts ''
12
+ puts 'hiking'
13
+ puts '--------------'
14
+ puts 'as noun'
15
+ p WordNet::Synset.morphy('hiking', 'noun')
16
+ puts 'as verb'
17
+ p WordNet::Synset.morphy('hiking', 'verb')
18
+ puts 'as all'
19
+ p WordNet::Synset.morphy_all('hiking')
20
+
@@ -0,0 +1,8 @@
1
+ require 'wordnet'
2
+
3
+ puts 'hiking'
4
+ WordNet::Synset.find_all('hiking').each{|d| puts d}
5
+
6
+ puts''
7
+ puts 'dogs'
8
+ WordNet::Synset.find_all('dogs').each{|d| puts d}
data/lib/wordnet/db.rb CHANGED
@@ -1,14 +1,25 @@
1
+ require 'stringio'
2
+
1
3
  module WordNet
2
4
  # Represents the WordNet database, and provides some basic interaction.
3
5
  class DB
4
6
  # By default, use the bundled WordNet
5
7
  @path = File.expand_path("../../../WordNet-3.0/", __FILE__)
6
8
 
9
+ class << self; attr_accessor :cached end
10
+ @raw_wordnet = {}
11
+
12
+
7
13
  class << self
8
14
  # To use your own WordNet installation (rather than the one bundled with rwordnet), assign its root directory to this attribute.
9
15
  # Returns the path to the WordNet installation currently in use. Defaults to the bundled version of WordNet.
10
16
  attr_accessor :path
11
17
 
18
+ # Open a wordnet database. You shouldn't have to call this directly; it's
19
+ # handled by the autocaching implemented in lemma.rb.
20
+ #
21
+ # `path` should be a string containing the path of a file relative to the
22
+ # root of the WordNet installation (it is joined onto DB.path).
12
23
  def open(path, &block)
13
24
  File.open(File.join(self.path, path), "r", &block)
14
25
  end
data/lib/wordnet/lemma.rb CHANGED
@@ -2,9 +2,27 @@ module WordNet
2
2
  # Represents a single word in the WordNet lexicon, which can be used to look up a set of synsets.
3
3
  class Lemma
4
4
  SPACE = ' '
5
- attr_accessor :word, :pos, :pointer_symbols, :tagsense_count, :synset_offsets, :id
6
5
 
7
- # Create a lemma from a line in an lexicon file. You should be creating Lemmas by hand; instead,
6
+ # The word this lemma represents
7
+ attr_accessor :word
8
+
9
+ # The part of speech (noun, verb, adjective) of this lemma. One of 'n', 'v', 'a' (adjective), or 'r' (adverb)
10
+ attr_accessor :pos
11
+
12
+ # The number of times the sense is tagged in various semantic concordance texts. A tagsense_count of 0 indicates that the sense has not been semantically tagged.
13
+ attr_accessor :tagsense_count
14
+
15
+ # The offsets, in bytes, at which the synsets contained in this lemma are stored in WordNet's internal database.
16
+ attr_accessor :synset_offsets
17
+
18
+ # A unique integer id that references this lemma. Used internally within WordNet's database.
19
+ attr_accessor :id
20
+
21
+ # An array of valid pointer symbols for this lemma. The list of all valid
22
+ # pointer symbols is defined in pointers.rb.
23
+ attr_accessor :pointer_symbols
24
+
25
+ # Create a lemma from a line in a lexicon file. You should not be creating Lemmas by hand; instead,
8
26
  # use the WordNet::Lemma.find and WordNet::Lemma.find_all methods to find the Lemma for a word.
9
27
  def initialize(lexicon_line, id)
10
28
  @id = id
@@ -24,6 +42,8 @@ module WordNet
24
42
  @synset_offsets.map { |offset| Synset.new(@pos, offset) }
25
43
  end
26
44
 
45
+ # Returns a compact string representation of this lemma, e.g. "fall,v" for
46
+ # the verb form of the word "fall".
27
47
  def to_s
28
48
  [@word, @pos].join(",")
29
49
  end
@@ -31,6 +51,7 @@ module WordNet
31
51
  class << self
32
52
  @@cache = {}
33
53
 
54
+ # Find all lemmas for this word across all known parts of speech
34
55
  def find_all(word)
35
56
  [:noun, :verb, :adj, :adv].flat_map do |pos|
36
57
  find(word, pos) || []
@@ -1,7 +1,25 @@
1
1
  module WordNet
2
+ # Pointers represent the relations between the words in one synset and another.
2
3
  class Pointer
3
- attr_reader :symbol, :offset, :pos, :source, :target
4
+ # The symbol that defines the relationship this pointer represents, e.g. "!" for verb antonym. Valid
5
+ # pointer symbols are defined in pointers.rb
6
+ attr_reader :symbol
4
7
 
8
+ # The offset, in bytes, of this pointer in WordNet's internal database.
9
+ attr_reader :offset
10
+
11
+ # The part of speech this pointer represents. One of 'n', 'v', 'a' (adjective), or 'r' (adverb).
12
+ attr_reader :pos
13
+
14
+ # The synset from which this pointer...points.
15
+ attr_reader :source
16
+
17
+ # The synset to which this pointer...points.
18
+ attr_reader :target
19
+
20
+ # Create a pointer. Pointers represent the relations between the words in one synset and another,
21
+ # and are referenced by a shorthand symbol (e.g. '!' for verb antonym). The list
22
+ # of valid pointer symbols is defined in pointers.rb
5
23
  def initialize(symbol: raise, offset: raise, pos: raise, source: raise)
6
24
  @symbol, @offset, @pos, @source = symbol, offset, pos, source
7
25
  @target = source.slice!(2,2)
@@ -4,7 +4,47 @@ module WordNet
4
4
  # Represents a synset (or group of synonymous words) in WordNet. Synsets are related to each other by various (and numerous!)
5
5
  # relationships, including Hypernym (x is a hypernym of y <=> x is a parent of y) and Hyponym (x is a child of y)
6
6
  class Synset
7
- attr_reader :gloss, :synset_offset, :lex_filenum, :synset_type, :word_counts, :pos_offset, :pos
7
+ @morphy_path = File.expand_path("../../../morphy/", __FILE__)
8
+ @exception_map = {}
9
+ @morphological_substitutions = {
10
+ 'noun' => [['s', ''], ['ses', 's'], ['ves', 'f'], ['xes', 'x'],
11
+ ['zes', 'z'], ['ches', 'ch'], ['shes', 'sh'],
12
+ ['men', 'man'], ['ies', 'y']],
13
+ 'verb' => [['s', ''], ['ies', 'y'], ['es', 'e'], ['es', ''],
14
+ ['ed', 'e'], ['ed', ''], ['ing', 'e'], ['ing', '']],
15
+ 'adj' => [['er', ''], ['est', ''], ['er', 'e'], ['est', 'e']],
16
+ 'adv' => []}
17
+
18
+ # Get the offset, in bytes, at which this synset's information is stored in WordNet's internal DB.
19
+ # You almost certainly don't care about this.
20
+ attr_reader :synset_offset
21
+
22
+ # A two digit decimal integer representing the name of the lexicographer file containing the synset for the sense.
23
+ # Probably only of interest if you're using a wordnet database marked up with custom attributes, and you
24
+ # want to ensure that you're using your own additions.
25
+ attr_reader :lex_filenum
26
+
27
+ # Get the list of words (and their frequencies within the WordNet graph) contained
28
+ # in this Synset.
29
+ attr_reader :word_counts
30
+
31
+ # Get the part of speech type of this synset. One of 'n' (noun), 'v' (verb), 'a' (adjective), or 'r' (adverb)
32
+ attr_reader :synset_type
33
+
34
+ # Get the offset, in bytes, at which this synset's POS information is stored in WordNet's internal DB.
35
+ # You almost certainly don't care about this.
36
+ attr_reader :pos_offset
37
+
38
+ # Get a shorthand representation of the part of speech this synset represents, e.g. "v" for verbs.
39
+ attr_reader :pos
40
+
41
+ # Get a string representation of this synset's gloss. "Gloss" is a human-readable
42
+ # description of this concept, often with example usage, e.g:
43
+ #
44
+ # move upward; "The fog lifted"; "The smoke arose from the forest fire"; "The mist uprose from the meadows"
45
+ #
46
+ # for the second sense of the verb "fall"
47
+ attr_reader :gloss
8
48
 
9
49
  # Create a new synset by reading from the data file specified by +pos+, at +offset+ bytes into the file. This is how
10
50
  # the WordNet database is organized. You shouldn't be creating Synsets directly; instead, use Lemma#synsets.
@@ -40,6 +80,93 @@ module WordNet
40
80
  end
41
81
  end
42
82
 
83
+ # Ported from python NLTK
84
+ # Load all synsets with a given lemma and part of speech tag.
85
+ # Both arguments are required; to load the synsets for all parts of
86
+ # speech, use Synset.find_all instead.
87
+ # Unlike the NLTK original, this port takes no lang argument: the word is
88
+ # downcased and reduced to its base forms with Synset.morphy before lookup.
89
+ def self.find(word, pos)
90
+ word = word.downcase
91
+ lemmas = self.morphy(word, pos).map{|form| WordNet::Lemma.find(form, pos)}
92
+ lemmas.map{|lemma| lemma.synsets}.flatten
93
+ end
94
+
95
+ def self.find_all(word)
96
+ SYNSET_TYPES.values.map{|pos| self.find(word, pos)}.flatten
97
+ end
98
+
99
+ def self.load_exception_map
100
+ SYNSET_TYPES.each do |_, pos|
101
+ @exception_map[pos] = {}
102
+ File.open(File.join(@morphy_path, 'exceptions', "#{pos}.exc"), 'r').each_line do |line|
103
+ line = line.split
104
+ @exception_map[pos][line[0]] = line[1..-1]
105
+ end
106
+ end
107
+ end
108
+
109
+ def self._apply_rules(forms, pos)
110
+ substitutions = @morphological_substitutions[pos]
111
+ out = []
112
+ forms.each do |form|
113
+ substitutions.each do |old, new|
114
+ if form.end_with? old
115
+ out.push form[0...-old.length] + new
116
+ end
117
+ end
118
+ end
119
+ return out
120
+ end
121
+
122
+ def self._filter_forms(forms, pos)
123
+ forms.reject{|form| Lemma.find(form, pos).nil?}.uniq
124
+ end
125
+
126
+ # ported from nltk python
127
+ # from jordanbg:
128
+ # Given an original string x
129
+ # 1. Apply rules once to the input to get y1, y2, y3, etc.
130
+ # 2. Return all that are in the database
131
+ # 3. If there are no matches, keep applying rules until you either
132
+ # find a match or you can't go any further
133
+ def self.morphy(form, pos)
134
+ if @exception_map == {}
135
+ self.load_exception_map
136
+ end
137
+ exceptions = @exception_map[pos]
138
+
139
+ # 0. Check the exception lists
140
+ if exceptions.has_key? form
141
+ return self._filter_forms([form] + exceptions[form], pos)
142
+ end
143
+
144
+ # 1. Apply rules once to the input to get y1, y2, y3, etc.
145
+ forms = self._apply_rules([form], pos)
146
+
147
+ # 2. Return all that are in the database (and check the original too)
148
+ results = self._filter_forms([form] + forms, pos)
149
+ if results != []
150
+ return results
151
+ end
152
+
153
+ # 3. If there are no matches, keep applying rules until we find a match
154
+ while forms.length > 0
155
+ forms = self._apply_rules(forms, pos)
156
+ results = self._filter_forms(forms, pos)
157
+ if results != []
158
+ return results
159
+ end
160
+ end
161
+
162
+ # Return an empty list if we can't find anything
163
+ return []
164
+ end
165
+
166
+ def self.morphy_all(form)
167
+ SYNSET_TYPES.values.map{|pos| self.morphy(form, pos)}.flatten
168
+ end
169
+
43
170
  # How many words does this Synset include?
44
171
  def word_count
45
172
  @word_counts.size
@@ -50,14 +177,19 @@ module WordNet
50
177
  @word_counts.keys
51
178
  end
52
179
 
53
- # List of valid +pointer_symbol+s is in pointers.rb
180
+ # Get an array of Synsets with the relation `pointer_symbol` relative to this
181
+ # Synset. Mostly, this is an internal method used by convenience methods (e.g. Synset#antonyms), but
182
+ # it can take any valid +pointer_symbol+ defined in pointers.rb.
183
+ #
184
+ # Example (get the gloss of an antonym for 'fall'):
185
+ # WordNet::Lemma.find("fall", :verb).synsets[1].relation("!")[0].gloss
54
186
  def relation(pointer_symbol)
55
187
  @pointers.select { |pointer| pointer.symbol == pointer_symbol }.
56
188
  map! { |pointer| Synset.new(@synset_type, pointer.offset) }
57
189
  end
58
190
 
59
191
  # Get the Synsets of this sense's antonyms (as an array)
60
- def antonym
192
+ def antonyms
61
193
  relation(ANTONYM)
62
194
  end
63
195
 
@@ -66,13 +198,18 @@ module WordNet
66
198
  relation(HYPERNYM)[0]
67
199
  end
68
200
 
201
+ # Get the parent synsets (higher-level categories, i.e. fruit -> reproductive_structure) as an array.
202
+ def hypernyms
203
+ relation(HYPERNYM)
204
+ end
205
+
69
206
  # Get the child synset(s) (i.e., lower-level categories, i.e. fruit -> edible_fruit)
70
- def hyponym
207
+ def hyponyms
71
208
  relation(HYPONYM)
72
209
  end
73
210
 
74
211
  # Get the chain of first hypernyms (following only the first hypernym at each level, from this synset up to +entity+) as an array.
75
- def expanded_hypernym
212
+ def expanded_first_hypernyms
76
213
  parent = hypernym
77
214
  list = []
78
215
  return list unless parent
@@ -80,19 +217,61 @@ module WordNet
80
217
  while parent
81
218
  break if list.include? parent.pos_offset
82
219
  list.push parent.pos_offset
83
- parent = parent.parent
220
+ parent = parent.hypernym
221
+ end
222
+
223
+ list.flatten!
224
+ list.map! { |offset| Synset.new(@pos, offset)}
225
+ end
226
+
227
+ # Get the entire hypernym tree (from this synset all the way up to +entity+) as an array.
228
+ def expanded_hypernyms
229
+ parents = hypernyms
230
+ list = []
231
+ return list unless parents
232
+
233
+ while parents.length > 0
234
+ parent = parents.pop
235
+ next if list.include? parent.pos_offset
236
+ list.push parent.pos_offset
237
+ parents.push *parent.hypernyms
84
238
  end
85
239
 
86
240
  list.flatten!
87
241
  list.map! { |offset| Synset.new(@pos, offset)}
88
242
  end
89
243
 
244
+ def expanded_hypernyms_depth
245
+ parents = hypernyms.map{|hypernym| [hypernym, 1]}
246
+ list = []
247
+ out = []
248
+ return list unless parents
249
+
250
+ max_depth = 1
251
+ while parents.length > 0
252
+ parent, depth = parents.pop
253
+ next if list.include? parent.pos_offset
254
+ list.push parent.pos_offset
255
+ out.push [Synset.new(@pos, parent.pos_offset), depth]
256
+ parents.push *(parent.hypernyms.map{|hypernym| [hypernym, depth + 1]})
257
+ max_depth = [max_depth, depth].max
258
+ end
259
+ return [out, max_depth]
260
+ end
261
+
262
+ # Returns a compact, human-readable form of this synset, e.g.
263
+ #
264
+ # (v) fall (descend in free fall under the influence of gravity; "The branch fell from the tree"; "The unfortunate hiker fell into a crevasse")
265
+ #
266
+ # for the second meaning of the verb "fall."
90
267
  def to_s
91
268
  "(#{@synset_type}) #{words.map { |x| x.tr('_',' ') }.join(', ')} (#{@gloss})"
92
269
  end
93
270
 
271
+ alias to_str to_s
94
272
  alias size word_count
95
273
  alias parent hypernym
96
- alias children hyponym
274
+ alias parents hypernyms
275
+ alias children hyponyms
97
276
  end
98
277
  end
@@ -1,3 +1,3 @@
1
1
  module WordNet
2
- VERSION = "1.0.0"
2
+ VERSION = "1.1.0"
3
3
  end