rwordnet 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/History.txt +2 -0
- data/README.markdown +51 -0
- data/WordNet-3.0/AUTHORS +6 -0
- data/WordNet-3.0/COPYING +31 -0
- data/WordNet-3.0/LICENSE +31 -0
- data/WordNet-3.0/README +101 -0
- data/WordNet-3.0/dict/data.adj +18185 -0
- data/WordNet-3.0/dict/data.adv +3650 -0
- data/WordNet-3.0/dict/data.noun +82144 -0
- data/WordNet-3.0/dict/data.verb +13796 -0
- data/WordNet-3.0/dict/index.adj +21508 -0
- data/WordNet-3.0/dict/index.adv +4510 -0
- data/WordNet-3.0/dict/index.noun +117827 -0
- data/WordNet-3.0/dict/index.verb +11558 -0
- data/examples/dictionary.rb +21 -0
- data/examples/full_hypernym.rb +12 -0
- data/lib/wordnet.rb +7 -0
- data/lib/wordnet/index.rb +65 -0
- data/lib/wordnet/lemma.rb +38 -0
- data/lib/wordnet/pointer.rb +15 -0
- data/lib/wordnet/pointers.rb +37 -0
- data/lib/wordnet/pos.rb +3 -0
- data/lib/wordnet/synset.rb +90 -0
- data/lib/wordnet/wordnetdb.rb +54 -0
- data/test/test_helper.rb +17 -0
- data/test/unit/index_test.rb +21 -0
- data/test/unit/synset_test.rb +44 -0
- data/test/unit/wordnetdb_test.rb +16 -0
- metadata +87 -0
@@ -0,0 +1,21 @@
|
|
1
|
+
# Command-line dictionary backed by WordNet: prints every known sense of one word.
require 'rubygems'
require 'wordnet'

# Exactly one argument (the word to look up) is expected.
unless ARGV.size == 1
  puts "Usage: ruby dictionary.rb word"
  exit(1)
end

query = ARGV.first

# One lemma is returned per part of speech in which the word occurs (noun, verb, ...).
entries = WordNet::WordNetDB.find(query)

# Print each lemma followed by a numbered list of its glosses.
entries.each do |entry|
  puts entry
  entry.synsets.each_with_index do |sense, position|
    puts "\t#{position + 1}) #{sense.gloss}"
  end
end
|
@@ -0,0 +1,12 @@
|
|
1
|
+
require 'rubygems'
require 'wordnet'

# Look 'fruit' up in the noun index and keep only its first sense.
first_sense = WordNet::NounIndex.new.find("fruit").synsets.first
puts first_sense

# Print every ancestor returned by Synset#expanded_hypernym, one per line.
first_sense.expanded_hypernym.each { |ancestor| puts ancestor }
|
data/lib/wordnet.rb
ADDED
@@ -0,0 +1,65 @@
|
|
1
|
+
module WordNet
|
2
|
+
|
3
|
+
# Index is a WordNet lexicon. Note that Index is the base class; you probably want to be using the NounIndex, VerbIndex, etc. classes instead.
|
4
|
+
class Index
  # Create a new index for the given part of speech. +pos+ can be one of +noun+, +verb+, +adj+, or +adv+.
  def initialize(pos)
    @pos = pos
    @db = {}
    # Byte position in the index file up to which THIS instance has read.
    # The underlying file handle is shared through WordNetDB.open, so its
    # position cannot be trusted to match what this instance has cached.
    @offset = 0
    # Set once the whole file has been read, so misses need no further I/O.
    @exhausted = false
  end

  # Find a lemma for a given word. Returns a Lemma which can then be used to
  # access the synsets for the word, or nil when the word is not in this index.
  #
  # The index file is read lazily: lines are parsed (and cached) only until
  # the requested word is found, and a later call resumes where this instance
  # stopped.
  def find(lemma_str)
    # Look for the lemma in the part of the DB already read...
    return @db[lemma_str] if @db.key?(lemma_str)
    return nil if @exhausted

    # ...otherwise read some more, starting at our own resume point.
    index = WordNetDB.open(File.join(WordNetDB.path, "dict", "index.#{@pos}"))
    return nil if index.closed?

    index.seek(@offset)
    wanted_prefix = "#{lemma_str} "
    until index.eof?
      line = index.readline
      @offset = index.pos
      lemma = Lemma.new(line)
      @db[lemma.word] = lemma
      # Literal prefix comparison: the word is never treated as a regex.
      return lemma if line.start_with?(wanted_prefix)
    end

    @exhausted = true
    index.close
    nil
  end
end
|
36
|
+
|
37
|
+
# An Index of nouns.
|
38
|
+
class NounIndex < Index
  # Builds an Index whose part of speech is fixed to "noun".
  def initialize
    super("noun")
  end
end
|
43
|
+
|
44
|
+
# An Index of verbs.
|
45
|
+
class VerbIndex < Index
  # Builds an Index whose part of speech is fixed to "verb".
  def initialize
    super("verb")
  end
end
|
50
|
+
|
51
|
+
# An Index of adjectives.
|
52
|
+
class AdjectiveIndex < Index
  # Builds an Index whose part of speech is fixed to "adj".
  def initialize
    super("adj")
  end
end
|
57
|
+
|
58
|
+
# An Index of adverbs.
|
59
|
+
class AdverbIndex < Index
  # Builds an Index whose part of speech is fixed to "adv".
  def initialize
    super("adv")
  end
end
|
64
|
+
|
65
|
+
end
|
@@ -0,0 +1,38 @@
|
|
1
|
+
module WordNet
|
2
|
+
|
3
|
+
# Represents a single word in the WordNet lexicon, which can be used to look up a set of synsets.
|
4
|
+
class Lemma
  attr_accessor :lemma, :pos, :synset_cnt, :p_cnt, :ptr_symbol, :tagsense_cnt, :synset_offset

  # Create a lemma from a line in an index file. You shouldn't be creating Lemmas by hand; instead,
  # use the WordNetDB.find and Index#find methods to find the Lemma for a word.
  #
  # Fields are consumed left to right, whitespace separated:
  #   lemma pos synset_cnt p_cnt [ptr_symbol...] sense_cnt tagsense_cnt [synset_offset...]
  def initialize(index_line)
    fields = index_line.split(" ")

    @lemma = fields.shift
    @pos = fields.shift
    @synset_cnt = fields.shift.to_i
    @p_cnt = fields.shift.to_i

    # Array#shift(n) returns at most n elements, so a truncated line gives a
    # shorter list instead of being padded with nil entries.
    @ptr_symbol = fields.shift([@p_cnt, 0].max)
    fields.shift # Throw away redundant sense_cnt
    @tagsense_cnt = fields.shift.to_i
    @synset_offset = fields.shift([@synset_cnt, 0].max).map { |offset| offset.to_i }
  end

  # Return a list of synsets for this Lemma. Each synset represents a different sense, or meaning, of the word.
  def get_synsets
    @synset_offset.map { |offset| Synset.new(@pos, offset) }
  end

  # "word,pos", e.g. "fruit,n".
  def to_s
    [@lemma, @pos].join(",")
  end

  alias synsets get_synsets
  alias word lemma
end
|
37
|
+
|
38
|
+
end
|
@@ -0,0 +1,15 @@
|
|
1
|
+
module WordNet
|
2
|
+
|
3
|
+
# Convenience class for treating hashes as objects, i.e. obj[:key] <=> obj.key. I know
|
4
|
+
# this is probably a bad idea, but it's so convenient...
|
5
|
+
class Pointer < Hash
  # Expose stored keys as reader methods: pointer.symbol is pointer[:symbol].
  # Any arguments are ignored. An unknown name raises NoMethodError.
  def method_missing(msg, *args)
    return self[msg] if include?(msg)

    # raise, not throw: throw is for catch/throw control flow and would
    # surface as an uncaught-throw error instead of a NoMethodError.
    raise NoMethodError.new("undefined method `#{msg}' for #{self}:Pointer", msg)
  end

  # Keep respond_to? consistent with the dynamic readers above.
  def respond_to_missing?(msg, include_private = false)
    include?(msg) || super
  end
end
|
14
|
+
|
15
|
+
end
|
@@ -0,0 +1,37 @@
|
|
1
|
+
# A container for various constants. In particular, contains constants representing the WordNet symbols used to look up synsets by relation, i.e. Hypernym/Hyponym.
|
2
|
+
# Use these symbols in conjunction with the Synset#get_relation method.
|
3
|
+
|
4
|
+
module WordNet

  # Pointer symbol => description tables, one per part of speech.
  # The tables are frozen because they are shared lookup constants.
  NounPointers = {
    "-c" => "Member of this domain - TOPIC",
    "+"  => "Derivationally related form",
    "%p" => "Part meronym",
    "~i" => "Instance Hyponym",
    "@"  => "Hypernym",
    ";r" => "Domain of synset - REGION",
    "!"  => "Antonym",
    "#p" => "Part holonym",
    "%s" => "Substance meronym",
    ";u" => "Domain of synset - USAGE",
    "-r" => "Member of this domain - REGION",
    "#s" => "Substance holonym",
    "="  => "Attribute",
    "-u" => "Member of this domain - USAGE",
    ";c" => "Domain of synset - TOPIC",
    "%m" => "Member meronym",
    "~"  => "Hyponym",
    "@i" => "Instance Hypernym",
    "#m" => "Member holonym"
  }.freeze

  VerbPointers = {
    "+"  => "Derivationally related form",
    "@"  => "Hypernym",
    ";r" => "Domain of synset - REGION",
    "!"  => "Antonym",
    ";u" => "Domain of synset - USAGE",
    "$"  => "Verb Group",
    ";c" => "Domain of synset - TOPIC",
    ">"  => "Cause",
    "~"  => "Hyponym",
    "*"  => "Entailment",
    "^"  => "Also see"
  }.freeze

  AdjectivePointers = {
    ";r" => "Domain of synset - REGION",
    "!"  => "Antonym",
    "\\" => "Pertainym (pertains to noun)",
    "<"  => "Participle of verb",
    "&"  => "Similar to",
    "="  => "Attribute",
    ";c" => "Domain of synset - TOPIC",
    "^"  => "Also see",
    ";u" => "Domain of synset - USAGE"
  }.freeze

  AdverbPointers = {
    ";r" => "Domain of synset - REGION",
    "!"  => "Antonym",
    ";u" => "Domain of synset - USAGE",
    "\\" => "Derived from adjective",
    ";c" => "Domain of synset - TOPIC"
  }.freeze

  # Symbolic names for the pointer symbols, for use with Synset#get_relation.
  MemberOfThisDomainTopic = "-c"
  DerivationallyRelatedForm = "+"
  PartMeronym = "%p"
  InstanceHyponym = "~i"
  Hypernym = "@"
  DomainOfSynsetRegion = ";r"
  Antonym = "!"
  PartHolonym = "#p"
  SubstanceMeronym = "%s"
  VerbGroup = "$"
  DomainOfSynsetUsage = ";u"
  MemberOfThisDomainRegion = "-r"
  SubstanceHolonym = "#s"
  # "\\" means "Derived from adjective" for adverbs and "Pertainym" for adjectives.
  DerivedFromAdjective = "\\"
  Pertainym = "\\"
  ParticipleOfVerb = "<"
  SimilarTo = "&"
  Attribute = "="
  AlsoSee = "^"
  Cause = ">"
  MemberOfThisDomainUsage = "-u"
  DomainOfSynsetTopic = ";c"
  MemberMeronym = "%m"
  Hyponym = "~"
  InstanceHypernym = "@i"
  Entailment = "*"
  MemberHolonym = "#m"
end
|
data/lib/wordnet/pos.rb
ADDED
@@ -0,0 +1,90 @@
|
|
1
|
+
module WordNet
|
2
|
+
|
3
|
+
# Represents a synset (or group of synonymous words) in WordNet. Synsets are related to each other by various (and numerous!)
|
4
|
+
# relationships, including Hypernym (x is a hypernym of y <=> x is a parent of y) and Hyponym (x is a child of y)
|
5
|
+
class Synset
  attr_reader :gloss, :synset_offset, :lex_filenum, :ss_type, :w_cnt, :wordcounts

  # Create a new synset by reading from the data file specified by +pos+, at +offset+ bytes into the file. This is how
  # the WordNet database is organized. You shouldn't be creating Synsets directly; instead, use Lemma#synsets.
  #
  # +pos+ is looked up in SynsetType to pick the data file name
  # (NOTE(review): SynsetType is defined elsewhere; presumably "n" => "noun" etc. -- confirm).
  def initialize(pos, offset)
    path = File.join(WordNetDB.path, "dict", "data.#{SynsetType[pos]}")
    # Block form closes the file even if seek/readline raises.
    data_line = File.open(path, "r") do |data|
      data.seek(offset)
      data.readline.strip
    end

    # Limit of 2: everything after the first " | " is the gloss.
    info_line, @gloss = data_line.split(" | ", 2)
    line = info_line.split(" ")

    @synset_offset = line.shift
    @lex_filenum = line.shift
    @ss_type = line.shift

    # w_cnt and each word's lex_id are hexadecimal in WordNet data files.
    @w_cnt = line.shift.to_i(16)
    @wordcounts = {}
    @w_cnt.times do
      word = line.shift
      @wordcounts[word] = line.shift.to_i(16)
    end

    # p_cnt is decimal; each pointer is "symbol offset pos source/target".
    @p_cnt = line.shift.to_i
    @pointers = Array.new(@p_cnt) do
      pointer = Pointer.new
      pointer[:symbol] = line.shift
      pointer[:offset] = line.shift.to_i
      pointer[:pos] = line.shift
      source_target = line.shift
      pointer[:is_semantic?] = (source_target == "0000")
      pointer[:source] = source_target[0..1]
      pointer[:target] = source_target[2..3]
      pointer
    end
  end

  # How many words does this Synset include?
  def size
    @wordcounts.size
  end

  # Get a list of words included in this Synset
  def words
    @wordcounts.keys
  end

  # Return the Synsets this one points to through +pointer_symbol+.
  # List of valid +pointer_symbol+s is in pointers.rb
  def get_relation(pointer_symbol)
    matching = @pointers.select { |pointer| pointer[:symbol] == pointer_symbol }
    # Use the pointer's own pos: a relation may cross parts of speech, so
    # the target is not necessarily in this synset's data file.
    matching.map { |pointer| Synset.new(pointer[:pos], pointer[:offset]) }
  end

  # Get the Synsets of this sense's antonyms (an array, possibly empty).
  def antonym
    get_relation(Antonym)
  end

  # Get the parent synset (higher-level category, i.e. fruit -> reproductive_structure).
  # Only the first hypernym is returned; nil when there is none.
  def hypernym
    get_relation(Hypernym)[0]
  end

  # Get the child synset(s) (i.e., lower-level categories, i.e. fruit -> edible_fruit)
  def hyponym
    get_relation(Hyponym)
  end

  # Get the entire hypernym chain as an array, nearest ancestor first. It
  # follows #hypernym repeatedly and stops at a synset with no hypernym.
  def expanded_hypernym
    chain = []
    ancestor = hypernym
    while ancestor
      chain << ancestor
      ancestor = ancestor.hypernym
    end
    chain
  end

  # "(ss_type) word, word (gloss)", with underscores in words shown as spaces.
  def to_s
    "(#{@ss_type}) #{words.map {|x| x.gsub('_',' ')}.join(', ')} (#{@gloss})"
  end

  alias parent hypernym
  alias children hyponym
end
|
89
|
+
|
90
|
+
end
|
@@ -0,0 +1,54 @@
|
|
1
|
+
module WordNet
|
2
|
+
|
3
|
+
# Represents the WordNet database, and provides some basic interaction.
|
4
|
+
class WordNetDB
  # By default, use the bundled WordNet
  @@path = File.join(File.dirname(__FILE__),"/../../WordNet-3.0/")
  # Open file handles, keyed by path (see WordNetDB.open).
  @@files = {}
  # One index instance per index class, reused across WordNetDB.find calls.
  @@indices = {}

  # To use your own WordNet installation (rather than the one bundled with rwordnet):
  def WordNetDB.path=(path_to_wordnet)
    # Drop the reused indices: their cached lemmas came from the previous path.
    @@indices = {} unless path_to_wordnet == @@path
    @@path = path_to_wordnet
  end

  # Returns the path to the WordNet installation currently in use. Defaults to the bundled version of WordNet.
  def WordNetDB.path
    @@path
  end

  # Look up a word in WordNet. Returns a list of lemmas occurring in any of the index files (noun, verb, adjective, adverb).
  #
  # The index objects are reused between calls. File handles are shared via
  # WordNetDB.open, so a fresh index per call would resume reading wherever
  # the previous one stopped and could miss words earlier in the file.
  def WordNetDB.find(word)
    lemmas = [NounIndex, VerbIndex, AdjectiveIndex, AdverbIndex].map do |index_class|
      (@@indices[index_class] ||= index_class.new).find(word)
    end
    lemmas.flatten.compact
  end

  # Register a new DB file handle. You shouldn't need to call this method; it's called automatically every time you open an index file.
  def WordNetDB.open(path)
    # If the file is already open, just return the handle.
    handle = @@files[path]
    return handle if handle && !handle.closed?

    # Open and store
    @@files[path] = File.open(path,"r")
  end

  # You should call this method after you're done using WordNet.
  def WordNetDB.close
    WordNetDB.finalize(0)
  end

  # Close every registered file handle. +id+ is unused by this method.
  def WordNetDB.finalize(id)
    @@files.each_value do |handle|
      begin
        handle.close
      rescue IOError
        # Keep going, close the next file.
      end
    end
  end
end
|
53
|
+
|
54
|
+
end
|
data/test/test_helper.rb
ADDED
@@ -0,0 +1,17 @@
|
|
1
|
+
require "test/unit"
|
2
|
+
require File.dirname(__FILE__) + "/../lib/wordnet"
|
3
|
+
|
4
|
+
|
5
|
+
class << Test::Unit::TestCase
  # Define a test method from a description:
  #   test 'find a lemma' do ... end   # defines test_find_a_lemma
  # Raises ArgumentError when a test with the same generated name exists.
  def test(name, &block)
    test_name = :"test_#{name.gsub(' ','_')}"
    # method_defined? accepts a symbol or a string. The previous check,
    # instance_methods.include?(string), cannot match where instance_methods
    # returns symbols, so duplicates went unnoticed.
    raise ArgumentError, "#{test_name} is already defined" if method_defined?(test_name)
    define_method test_name, &block
  end

  # Define a test asserting that the block evaluates to +expected_value+.
  # The method name is built from the last path segment of the first caller frame.
  def expect(expected_value, &block)
    define_method :"test_#{caller.first.split("/").last}" do
      assert_equal expected_value, instance_eval(&block)
    end
  end
end
|
@@ -0,0 +1,21 @@
|
|
1
|
+
require File.dirname(__FILE__) + "/../test_helper.rb"
|
2
|
+
|
3
|
+
class TestIndex < Test::Unit::TestCase
  # Shared by all test methods so the noun index is built only once.
  @@index = nil

  def setup
    @@index ||= WordNet::NounIndex.new
  end

  test 'find a lemma by string' do
    assert_equal "fruit,n", @@index.find("fruit").to_s
  end

  test 'get synsets for a lemma' do
    senses = @@index.find("fruit").get_synsets
    assert_equal 3, senses.size
    assert_equal "(n) yield, fruit (an amount of a product)", senses[1].to_s
  end
end
|
@@ -0,0 +1,44 @@
|
|
1
|
+
require File.dirname(__FILE__) + "/../test_helper.rb"
|
2
|
+
|
3
|
+
class TestSynset < Test::Unit::TestCase
  # Synsets for the noun 'fruit', loaded once and shared by every test.
  @@synsets = nil

  def setup
    if @@synsets.nil?
      index = WordNet::NounIndex.new
      lemma = index.find("fruit")
      @@synsets = lemma.get_synsets
    end
  end

  test 'get synsets for a lemma' do
    assert_equal 3, @@synsets.size
    assert_equal "(n) fruit (the ripened reproductive body of a seed plant)",@@synsets[0].to_s
    assert_equal "an amount of a product",@@synsets[1].gloss
  end

  test 'get hypernym for a synset' do
    # Synset#hypernym returns a single Synset, so #size below is the number
    # of words in that synset, not the number of hypernyms.
    hypernym = @@synsets[0].hypernym
    assert_equal 1,hypernym.size
    assert_equal "(n) reproductive structure (the parts of a plant involved in its reproduction)",hypernym.to_s
  end

  test 'test shorthand for get_relation' do
    hypernym = @@synsets[0].get_relation(WordNet::Hypernym)
    hypernym2 = @@synsets[0].hypernym
    assert_equal hypernym[0].gloss, hypernym2.gloss
  end

  test 'get hyponyms for a synset' do
    hyponym = @@synsets[0].get_relation(WordNet::Hyponym)
    assert_equal 29,hyponym.size
    assert_equal "fruit of various buckthorns yielding dyes or pigments",hyponym[26].gloss
  end

  test 'test expanded hypernym tree' do
    expanded = @@synsets[0].expanded_hypernym
    assert_equal 8, expanded.size
    assert_equal "entity", expanded[expanded.size-1].words[0]
  end
end
|