rwordnet 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- data/History.txt +2 -0
- data/README.markdown +51 -0
- data/WordNet-3.0/AUTHORS +6 -0
- data/WordNet-3.0/COPYING +31 -0
- data/WordNet-3.0/LICENSE +31 -0
- data/WordNet-3.0/README +101 -0
- data/WordNet-3.0/dict/data.adj +18185 -0
- data/WordNet-3.0/dict/data.adv +3650 -0
- data/WordNet-3.0/dict/data.noun +82144 -0
- data/WordNet-3.0/dict/data.verb +13796 -0
- data/WordNet-3.0/dict/index.adj +21508 -0
- data/WordNet-3.0/dict/index.adv +4510 -0
- data/WordNet-3.0/dict/index.noun +117827 -0
- data/WordNet-3.0/dict/index.verb +11558 -0
- data/examples/dictionary.rb +21 -0
- data/examples/full_hypernym.rb +12 -0
- data/lib/wordnet.rb +7 -0
- data/lib/wordnet/index.rb +65 -0
- data/lib/wordnet/lemma.rb +38 -0
- data/lib/wordnet/pointer.rb +15 -0
- data/lib/wordnet/pointers.rb +37 -0
- data/lib/wordnet/pos.rb +3 -0
- data/lib/wordnet/synset.rb +90 -0
- data/lib/wordnet/wordnetdb.rb +54 -0
- data/test/test_helper.rb +17 -0
- data/test/unit/index_test.rb +21 -0
- data/test/unit/synset_test.rb +44 -0
- data/test/unit/wordnetdb_test.rb +16 -0
- metadata +87 -0
@@ -0,0 +1,21 @@
|
|
1
|
+
# Command-line dictionary backed by WordNet: prints every sense of one word.
require 'rubygems'
require 'wordnet'

# Exactly one argument (the word to look up) is required.
unless ARGV.size == 1
  puts "Usage: ruby dictionary.rb word"
  exit(1)
end

query = ARGV[0]

# A word may be listed under several parts of speech (noun, verb, ...);
# WordNetDB.find returns the lemmas it found for the word.
entries = WordNet::WordNetDB.find(query)

# Show each lemma followed by a numbered list of its glosses.
entries.each do |entry|
  puts entry
  entry.synsets.each_with_index do |sense, position|
    puts "\t#{position+1}) #{sense.gloss}"
  end
end
|
@@ -0,0 +1,12 @@
|
|
1
|
+
require 'rubygems'
require 'wordnet'

# Look up 'fruit' in the noun index and keep its first synset (sense).
nouns = WordNet::NounIndex.new
fruit = nouns.find("fruit")
first_sense = fruit.synsets[0]
puts first_sense

# Print the expanded hypernym list of that sense, one synset per line.
first_sense.expanded_hypernym.each do |ancestor|
  puts ancestor
end
|
data/lib/wordnet.rb
ADDED
@@ -0,0 +1,65 @@
|
|
1
|
+
module WordNet

  # Index is a WordNet lexicon for one part of speech. Index is the base class;
  # you probably want NounIndex, VerbIndex, AdjectiveIndex or AdverbIndex.
  #
  # The index file is read lazily: each call to #find reads only as far as it
  # needs to, and every line read along the way is cached for later lookups.
  class Index
    # Create a new index for the given part of speech. +pos+ can be one of
    # +noun+, +verb+, +adj+, or +adv+ (it is used as the index file suffix).
    def initialize(pos)
      @pos = pos
      @db = {}            # word => Lemma, for every line read so far
      @offset = 0         # byte position up to which THIS instance has read
      @exhausted = false  # true once this instance has read the whole file
    end

    # Find a lemma for a given word.
    #
    # Returns the Lemma, which can then be used to access the synsets for the
    # word, or nil when the word is not in this index.
    def find(lemma_str)
      # Already read? A plain hash lookup is enough.
      cached = @db[lemma_str]
      return cached if cached

      # Nothing left to read, so the word is simply not in this index.
      return nil if @exhausted

      index = WordNetDB.open(File.join(WordNetDB.path,"dict","index.#{@pos}"))
      # The handle returned by WordNetDB.open may be shared with other Index
      # objects, so its current position cannot be trusted; resume from where
      # this instance stopped.
      index.seek(@offset)

      until index.eof?
        line = index.readline
        @offset = index.pos

        # License header lines start with spaces; they (and blank lines) are
        # not index entries.
        next if line =~ /\A\s/

        lemma = Lemma.new(line)
        @db[lemma.word] = lemma
        # Compare the parsed word rather than building a regex from the query,
        # so characters such as "." or "(" in the query are taken literally.
        return lemma if lemma.word == lemma_str
      end

      @exhausted = true
      index.close
      nil
    end
  end

  # An Index of nouns.
  class NounIndex < Index
    def initialize
      super("noun")
    end
  end

  # An Index of verbs.
  class VerbIndex < Index
    def initialize
      super("verb")
    end
  end

  # An Index of adjectives.
  class AdjectiveIndex < Index
    def initialize
      super("adj")
    end
  end

  # An Index of adverbs.
  class AdverbIndex < Index
    def initialize
      super("adv")
    end
  end

end
|
@@ -0,0 +1,38 @@
|
|
1
|
+
module WordNet

  # One word of the WordNet lexicon, as listed in an index file. A Lemma is the
  # starting point for looking up the synsets (senses) of that word.
  class Lemma
    attr_accessor :lemma, :pos, :synset_cnt, :p_cnt, :ptr_symbol, :tagsense_cnt, :synset_offset

    # Build a lemma from one line of an index file. You shouldn't be creating
    # Lemmas by hand; use WordNetDB.find or Index#find to get the Lemma for a
    # word instead.
    #
    # Fields are whitespace-separated and consumed in order: word, part of
    # speech, synset count, pointer count, that many pointer symbols, a
    # (discarded) sense count, tag-sense count, then one offset per synset.
    def initialize(index_line)
      fields = index_line.split(" ")

      @lemma      = fields.shift
      @pos        = fields.shift
      @synset_cnt = fields.shift.to_i
      @p_cnt      = fields.shift.to_i

      @ptr_symbol = (1..@p_cnt).map { fields.shift }

      fields.shift # sense_cnt repeats synset_cnt, so it is dropped
      @tagsense_cnt = fields.shift.to_i

      @synset_offset = (1..@synset_cnt).map { fields.shift.to_i }
    end

    # Return a list of synsets for this Lemma, one per stored offset. Each
    # synset is a different sense, or meaning, of the word.
    def get_synsets
      @synset_offset.map do |byte_offset|
        Synset.new(@pos, byte_offset)
      end
    end
    alias synsets get_synsets

    # "word,pos", e.g. "fruit,n".
    def to_s
      [@lemma, @pos].join(",")
    end

    alias word lemma
  end

end
|
@@ -0,0 +1,15 @@
|
|
1
|
+
module WordNet

  # Convenience class for treating hashes as objects, i.e. obj[:key] <=> obj.key.
  # Reading a key as a method only works for keys that are actually present
  # (and that do not collide with an existing Hash method).
  class Pointer < Hash
    # Return the value stored under +msg+ when it is a key of this hash.
    # Anything else goes to the default handler, which raises NoMethodError
    # (the original used +throw+, which is catch/throw control flow and does
    # not raise NoMethodError).
    def method_missing(msg, *args)
      return self[msg] if include?(msg)

      super
    end

    # Keep respond_to? consistent with method_missing: stored keys count as
    # methods this object responds to.
    def respond_to_missing?(msg, include_private = false)
      include?(msg) || super
    end
  end

end
|
@@ -0,0 +1,37 @@
|
|
1
|
+
# Constants for the WordNet pointer symbols, i.e. the short codes that name a
# relation between synsets (Hypernym, Hyponym, ...).
# Pass these symbols to Synset#get_relation.

module WordNet

  # Pointer symbol => description, for noun synsets.
  NounPointers = {
    "-c" => "Member of this domain - TOPIC",
    "+"  => "Derivationally related form",
    "%p" => "Part meronym",
    "~i" => "Instance Hyponym",
    "@"  => "Hypernym",
    ";r" => "Domain of synset - REGION",
    "!"  => "Antonym",
    "#p" => "Part holonym",
    "%s" => "Substance meronym",
    ";u" => "Domain of synset - USAGE",
    "-r" => "Member of this domain - REGION",
    "#s" => "Substance holonym",
    "="  => "Attribute",
    "-u" => "Member of this domain - USAGE",
    ";c" => "Domain of synset - TOPIC",
    "%m" => "Member meronym",
    "~"  => "Hyponym",
    "@i" => "Instance Hypernym",
    "#m" => "Member holonym"
  }

  # Pointer symbol => description, for verb synsets.
  VerbPointers = {
    "+"  => "Derivationally related form",
    "@"  => "Hypernym",
    ";r" => "Domain of synset - REGION",
    "!"  => "Antonym",
    ";u" => "Domain of synset - USAGE",
    "$"  => "Verb Group",
    ";c" => "Domain of synset - TOPIC",
    ">"  => "Cause",
    "~"  => "Hyponym",
    "*"  => "Entailment"
  }

  # Pointer symbol => description, for adjective synsets.
  AdjectivePointers = {
    ";r" => "Domain of synset - REGION",
    "!"  => "Antonym",
    "\\" => "Pertainym (pertains to noun)",
    "<"  => "Participle of verb",
    "&"  => "Similar to",
    "="  => "Attribute",
    ";c" => "Domain of synset - TOPIC"
  }

  # Pointer symbol => description, for adverb synsets.
  AdverbPointers = {
    ";r" => "Domain of synset - REGION",
    "!"  => "Antonym",
    ";u" => "Domain of synset - USAGE",
    "\\" => "Derived from adjective",
    ";c" => "Domain of synset - TOPIC"
  }

  # One named constant per pointer symbol.
  MemberOfThisDomainTopic   = "-c"
  DerivationallyRelatedForm = "+"
  PartMeronym               = "%p"
  InstanceHyponym           = "~i"
  Hypernym                  = "@"
  DomainOfSynsetRegion      = ";r"
  Antonym                   = "!"
  PartHolonym               = "#p"
  SubstanceMeronym          = "%s"
  VerbGroup                 = "$"
  DomainOfSynsetUsage       = ";u"
  MemberOfThisDomainRegion  = "-r"
  SubstanceHolonym          = "#s"
  DerivedFromAdjective      = "\\"
  ParticipleOfVerb          = "<"
  SimilarTo                 = "&"
  Attribute                 = "="
  AlsoSee                   = "^"
  Cause                     = ">"
  MemberOfThisDomainUsage   = "-u"
  DomainOfSynsetTopic       = ";c"
  MemberMeronym             = "%m"
  Hyponym                   = "~"
  InstanceHypernym          = "@i"
  Entailment                = "*"
  MemberHolonym             = "#m"
end
|
data/lib/wordnet/pos.rb
ADDED
@@ -0,0 +1,90 @@
|
|
1
|
+
module WordNet

  # Represents a synset (or group of synonymous words) in WordNet. Synsets are
  # related to each other by various (and numerous!) relationships, including
  # Hypernym (x is a hypernym of y <=> x is a parent of y) and Hyponym (x is a
  # child of y).
  class Synset
    attr_reader :gloss, :synset_offset, :lex_filenum, :ss_type, :w_cnt, :wordcounts

    # Create a new synset by reading from the data file specified by +pos+, at
    # +offset+ bytes into the file. You shouldn't be creating Synsets directly;
    # instead, use Lemma#synsets.
    #
    # +pos+ is looked up in SynsetType to get the data file suffix.
    def initialize(pos, offset)
      # Block form so the file is closed even when seek/readline raises.
      data_line = File.open(File.join(WordNetDB.path,"dict","data.#{SynsetType[pos]}"),"r") do |data|
        data.seek(offset)
        data.readline.strip
      end

      # Everything after the first " | " is the gloss; the limit keeps a gloss
      # that itself contains " | " intact.
      info_line, @gloss = data_line.split(" | ", 2)
      line = info_line.split(" ")

      @synset_offset = line.shift
      @lex_filenum = line.shift
      @ss_type = line.shift

      # w_cnt is a two-digit HEXADECIMAL field in the WordNet data files, so a
      # decimal parse loses the words of any synset with ten or more of them.
      @w_cnt = line.shift.to_s.hex
      @wordcounts = {}
      @w_cnt.times do
        word = line.shift
        # The value following each word is its lex_id, a one-digit hex field.
        @wordcounts[word] = line.shift.to_s.hex
      end

      # p_cnt is decimal, followed by p_cnt groups of four fields.
      @p_cnt = line.shift.to_i
      @pointers = []
      @p_cnt.times do
        pointer = Pointer.new
        pointer[:symbol] = line.shift
        pointer[:offset] = line.shift.to_i
        pointer[:pos] = line.shift
        # Four characters: the first two are the source word number, the last
        # two the target word number; "0000" marks a semantic pointer.
        source_target = line.shift
        pointer[:source] = source_target[0..1]
        pointer[:is_semantic?] = (source_target == "0000")
        pointer[:target] = source_target[2..3]
        @pointers.push pointer
      end
    end

    # How many words does this Synset include?
    def size
      @wordcounts.size
    end

    # Get a list of words included in this Synset
    def words
      @wordcounts.keys
    end

    # Return the synsets this one points to through +pointer_symbol+.
    # List of valid +pointer_symbol+s is in pointers.rb.
    #
    # Each target is opened with the pointer's own part of speech, because a
    # pointer may lead into a different data file than the one this synset was
    # read from.
    def get_relation(pointer_symbol)
      matching = @pointers.select { |pointer| pointer[:symbol] == pointer_symbol }
      matching.map { |pointer| Synset.new(pointer[:pos], pointer[:offset]) }
    end

    # Get the Synsets of this sense's antonyms (an array, possibly empty).
    def antonym
      get_relation(Antonym)
    end

    # Get the parent synset (higher-level category, i.e. fruit -> reproductive_structure).
    # Returns the first hypernym only, or nil when there is none.
    def hypernym
      get_relation(Hypernym)[0]
    end

    # Get the child synset(s) (i.e., lower-level categories, i.e. fruit -> edible_fruit)
    def hyponym
      get_relation(Hyponym)
    end

    # Get the chain of first hypernyms, from this synset's parent upwards until
    # a synset without a hypernym is reached, as an array.
    def expanded_hypernym
      parent = hypernym
      return [] if parent.nil?

      [parent] + parent.expanded_hypernym
    end

    # "(type) word, word (gloss)", with underscores in words shown as spaces.
    def to_s
      "(#{@ss_type}) #{words.map {|x| x.gsub('_',' ')}.join(', ')} (#{@gloss})"
    end

    alias parent hypernym
    alias children hyponym
  end

end
|
@@ -0,0 +1,54 @@
|
|
1
|
+
module WordNet

  # Represents the WordNet database: where it lives on disk, which of its files
  # are currently open, and a lookup across all four index files.
  class WordNetDB
    # By default, use the bundled WordNet
    @@path = File.join(File.dirname(__FILE__),"/../../WordNet-3.0/")
    # Registry of file handles opened through WordNetDB.open, keyed by path.
    @@files = {}

    # Point rwordnet at your own WordNet installation (rather than the one
    # bundled with rwordnet).
    def self.path=(path_to_wordnet)
      @@path = path_to_wordnet
    end

    # Returns the path to the WordNet installation currently in use. Defaults
    # to the bundled version of WordNet.
    def self.path
      @@path
    end

    # Look up a word in WordNet. Returns a list of lemmas occurring in any of
    # the index files (noun, verb, adjective, adverb); indexes that do not
    # contain the word contribute nothing.
    def self.find(word)
      hits = [NounIndex, VerbIndex, AdjectiveIndex, AdverbIndex].map do |index_class|
        index_class.new.find(word)
      end
      hits.flatten.compact
    end

    # Register a new DB file handle. You shouldn't need to call this method;
    # it's called automatically every time you open an index or data file.
    #
    # A handle that is already registered for +path+ and still open is reused;
    # otherwise the file is (re)opened read-only and stored.
    def self.open(path)
      existing = @@files[path]
      return existing unless existing.nil? || existing.closed?

      @@files[path] = File.open(path,"r")
    end

    # You should call this method after you're done using WordNet.
    def self.close
      finalize(0)
    end

    # Close every registered handle. +id+ is not used by the method body.
    def self.finalize(id)
      @@files.each_value do |handle|
        begin
          handle.close
        rescue IOError
          next # Keep going, close the next file.
        end
      end
    end
  end

end
|
data/test/test_helper.rb
ADDED
@@ -0,0 +1,17 @@
|
|
1
|
+
require "test/unit"
|
2
|
+
require File.dirname(__FILE__) + "/../lib/wordnet"
|
3
|
+
|
4
|
+
|
5
|
+
# Class-level helpers added to Test::Unit::TestCase so tests can be declared
# with a description string instead of a hand-written method name.
class << Test::Unit::TestCase
  # Define a test method named "test_<name>", with spaces in +name+ replaced
  # by underscores, whose body is +block+.
  #
  # Raises ArgumentError when a test with the same name already exists.
  # method_defined? is used for that check because instance_methods returns
  # Symbols on Ruby 1.9+, so comparing it against a String never matches.
  def test(name, &block)
    test_name = :"test_#{name.gsub(' ','_')}"
    raise ArgumentError, "#{test_name} is already defined" if method_defined?(test_name)
    define_method test_name, &block
  end

  # Define a test asserting that +block+ (evaluated in the test instance)
  # equals +expected_value+. The method name is derived from the last path
  # segment of the caller's location.
  def expect(expected_value, &block)
    define_method :"test_#{caller.first.split("/").last}" do
      assert_equal expected_value, instance_eval(&block)
    end
  end
end
|
@@ -0,0 +1,21 @@
|
|
1
|
+
require File.dirname(__FILE__) + "/../test_helper.rb"
|
2
|
+
|
3
|
+
# Tests for looking words up through WordNet::NounIndex.
class TestIndex < Test::Unit::TestCase
  # Noun index shared by the test methods; created by the first setup call.
  @@index = nil

  def setup
    @@index ||= WordNet::NounIndex.new
  end

  test 'find a lemma by string' do
    assert_equal "fruit,n", @@index.find("fruit").to_s
  end

  test 'get synsets for a lemma' do
    senses = @@index.find("fruit").get_synsets
    assert_equal 3, senses.size
    assert_equal "(n) yield, fruit (an amount of a product)", senses[1].to_s
  end
end
|
@@ -0,0 +1,44 @@
|
|
1
|
+
require File.dirname(__FILE__) + "/../test_helper.rb"
|
2
|
+
|
3
|
+
# Tests for WordNet::Synset, run against the synsets of the noun 'fruit'.
class TestSynset < Test::Unit::TestCase
  # Synsets of 'fruit', shared by the test methods; loaded by the first setup.
  @@synsets = nil

  def setup
    return unless @@synsets.nil?

    index = WordNet::NounIndex.new
    lemma = index.find("fruit")
    @@synsets = lemma.get_synsets
  end

  test 'get synsets for a lemma' do
    assert_equal 3, @@synsets.size
    assert_equal "(n) fruit (the ripened reproductive body of a seed plant)",@@synsets[0].to_s
    assert_equal "an amount of a product",@@synsets[1].gloss
  end

  test 'get hypernym for a synset' do
    # Synset#hypernym returns a single Synset, so +size+ here is the number of
    # words in that synset. (The original also assigned the result of
    # get_relation to this local and overwrote it on the next line.)
    hypernym = @@synsets[0].hypernym
    assert_equal 1,hypernym.size
    assert_equal "(n) reproductive structure (the parts of a plant involved in its reproduction)",hypernym.to_s
  end

  test 'test shorthand for get_relation' do
    hypernym = @@synsets[0].get_relation(WordNet::Hypernym)
    hypernym2 = @@synsets[0].hypernym
    assert_equal hypernym[0].gloss, hypernym2.gloss
  end

  test 'get hyponyms for a synset' do
    hyponym = @@synsets[0].get_relation(WordNet::Hyponym)
    assert_equal 29,hyponym.size
    assert_equal "fruit of various buckthorns yielding dyes or pigments",hyponym[26].gloss
  end

  test 'test expanded hypernym tree' do
    expanded = @@synsets[0].expanded_hypernym
    assert_equal 8, expanded.size
    assert_equal "entity", expanded[expanded.size-1].words[0]
  end
end
|