rwordnet 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,21 @@
1
# Use WordNet as a command-line dictionary.
require 'rubygems'
require 'wordnet'

# Exactly one argument (the word to look up) is required.
unless ARGV.size == 1
  puts "Usage: ruby dictionary.rb word"
  exit(1)
end

# Look the word up in every index; one lemma comes back per part of speech it occurs as.
WordNet::WordNetDB.find(ARGV[0]).each do |entry|
  # Print the lemma itself, followed by a numbered list of the glosses of its senses.
  puts entry
  entry.synsets.each_with_index do |sense, position|
    puts "\t#{position + 1}) #{sense.gloss}"
  end
end
@@ -0,0 +1,12 @@
1
require 'rubygems'
require 'wordnet'

# Look 'fruit' up in the noun index and keep only its first sense.
first_sense = WordNet::NounIndex.new.find("fruit").synsets[0]
puts first_sense

# Print the full hypernym derivation for that sense, nearest ancestor first.
first_sense.expanded_hypernym.each do |ancestor|
  puts ancestor
end
data/lib/wordnet.rb ADDED
@@ -0,0 +1,7 @@
1
+ require 'wordnet/pointer'
2
+ require 'wordnet/wordnetdb'
3
+ require 'wordnet/index'
4
+ require 'wordnet/lemma'
5
+ require 'wordnet/pointers'
6
+ require 'wordnet/pos'
7
+ require 'wordnet/synset'
@@ -0,0 +1,65 @@
1
module WordNet

  # Index is a WordNet lexicon for one part of speech. Note that Index is the base class; you probably
  # want to be using the NounIndex, VerbIndex, etc. classes instead.
  #
  # The index file is read lazily: each call to #find reads only as far as it needs to, and every
  # line read along the way is cached so later lookups of earlier words are served from memory.
  class Index
    # Create a new index for the given part of speech. +pos+ can be one of +noun+, +verb+, +adj+, or +adv+.
    def initialize(pos)
      @pos = pos
      @db = {}      # word => Lemma for every index line this instance has read so far
      @offset = 0   # byte position in the index file up to which this instance has read
    end

    # Find a lemma for a given word. Returns a Lemma which can then be used to access the synsets
    # for the word, or +nil+ if the word does not occur in this index.
    def find(lemma_str)
      # Look for the lemma in the part of the DB already read...
      cached = @db[lemma_str]
      return cached if cached

      # If we didn't find it, read in some more from the DB. Some optimisation is possible here. TODO.
      index = WordNetDB.open(File.join(WordNetDB.path, "dict", "index.#{@pos}"))
      return nil if index.closed?

      # The handle returned by WordNetDB.open is shared between all Index instances for the same
      # file, so its current position may belong to somebody else. Always resume from our own offset.
      index.seek(@offset)
      until index.eof?
        line = index.readline
        @offset = index.pos

        # Lines starting with whitespace (the licence header at the top of the file, blank lines)
        # are not index entries.
        next if line =~ /\A\s/

        lemma = Lemma.new(line)
        @db[lemma.word] = lemma
        # Plain string comparison: the word is never interpreted as a regular expression.
        return lemma if lemma.word == lemma_str
      end

      # Whole file consumed without a match.
      index.close
      nil
    end
  end

  # An Index of nouns.
  class NounIndex < Index
    def initialize
      super("noun")
    end
  end

  # An Index of verbs.
  class VerbIndex < Index
    def initialize
      super("verb")
    end
  end

  # An Index of adjectives.
  class AdjectiveIndex < Index
    def initialize
      super("adj")
    end
  end

  # An Index of adverbs.
  class AdverbIndex < Index
    def initialize
      super("adv")
    end
  end

end
@@ -0,0 +1,38 @@
1
module WordNet

  # A single word of the WordNet lexicon for one part of speech. A Lemma knows the offsets of its
  # synsets and can load them on demand.
  class Lemma
    attr_accessor :lemma, :pos, :synset_cnt, :p_cnt, :ptr_symbol, :tagsense_cnt, :synset_offset

    # Build a lemma from one line of an index file. You shouldn't be creating Lemmas by hand; instead,
    # use the WordNetDB.find and Index#find methods to find the Lemma for a word.
    #
    # The space-separated fields are consumed in this order: lemma, pos, synset_cnt, p_cnt,
    # p_cnt pointer symbols, sense_cnt (discarded), tagsense_cnt, synset_cnt synset offsets.
    def initialize(index_line)
      fields = index_line.split(" ")

      @lemma, @pos = fields.shift(2)
      @synset_cnt = fields.shift.to_i
      @p_cnt = fields.shift.to_i

      @ptr_symbol = (1..@p_cnt).map { fields.shift }
      fields.shift # sense_cnt repeats synset_cnt, so it is dropped
      @tagsense_cnt = fields.shift.to_i
      @synset_offset = (1..@synset_cnt).map { fields.shift.to_i }
    end

    # Load every synset of this Lemma. Each synset represents a different sense, or meaning, of the word.
    def get_synsets
      @synset_offset.map do |byte_offset|
        Synset.new(@pos, byte_offset)
      end
    end

    # "word,pos", e.g. "fruit,n".
    def to_s
      "#{@lemma},#{@pos}"
    end

    alias_method :synsets, :get_synsets
    alias_method :word, :lemma
  end

end
@@ -0,0 +1,15 @@
1
module WordNet

  # Convenience class for treating hashes as objects, i.e. obj[:key] <=> obj.key. I know
  # this is probably a bad idea, but it's so convenient...
  class Pointer < Hash
    # Any message whose name is a key of the hash returns the value stored under that key
    # (arguments are ignored). Every other message is handed to the default implementation,
    # which raises NoMethodError.
    def method_missing(msg, *args)
      return self[msg] if key?(msg)

      super
    end

    # Keep respond_to? consistent with method_missing: keys of the hash count as methods.
    def respond_to_missing?(msg, include_private = false)
      key?(msg) || super
    end
  end

end
@@ -0,0 +1,37 @@
1
# A container for various constants. In particular, contains constants representing the WordNet symbols used to look up synsets by relation, i.e. Hypernym/Hyponym.
# Use these symbols in conjunction with the Synset#get_relation method.
#
# The four *Pointers hashes map each pointer symbol that is valid for a part of speech to its
# human-readable description; the constants below them name the individual symbols.

module WordNet

  NounPointers = {
    "-c" => "Member of this domain - TOPIC",
    "+" => "Derivationally related form",
    "%p" => "Part meronym",
    "~i" => "Instance Hyponym",
    "@" => "Hypernym",
    ";r" => "Domain of synset - REGION",
    "!" => "Antonym",
    "#p" => "Part holonym",
    "%s" => "Substance meronym",
    ";u" => "Domain of synset - USAGE",
    "-r" => "Member of this domain - REGION",
    "#s" => "Substance holonym",
    "=" => "Attribute",
    "-u" => "Member of this domain - USAGE",
    ";c" => "Domain of synset - TOPIC",
    "%m" => "Member meronym",
    "~" => "Hyponym",
    "@i" => "Instance Hypernym",
    "#m" => "Member holonym"
  }

  VerbPointers = {
    "+" => "Derivationally related form",
    "@" => "Hypernym",
    ";r" => "Domain of synset - REGION",
    "!" => "Antonym",
    ";u" => "Domain of synset - USAGE",
    "$" => "Verb Group",
    ";c" => "Domain of synset - TOPIC",
    ">" => "Cause",
    "~" => "Hyponym",
    "*" => "Entailment",
    "^" => "Also see"
  }

  AdjectivePointers = {
    ";r" => "Domain of synset - REGION",
    "!" => "Antonym",
    "\\" => "Pertainym (pertains to noun)",
    "<" => "Participle of verb",
    "&" => "Similar to",
    "=" => "Attribute",
    ";c" => "Domain of synset - TOPIC",
    "^" => "Also see",
    ";u" => "Domain of synset - USAGE"
  }

  AdverbPointers = {
    ";r" => "Domain of synset - REGION",
    "!" => "Antonym",
    ";u" => "Domain of synset - USAGE",
    "\\" => "Derived from adjective",
    ";c" => "Domain of synset - TOPIC"
  }

  MemberOfThisDomainTopic = "-c"
  DerivationallyRelatedForm = "+"
  PartMeronym = "%p"
  InstanceHyponym = "~i"
  Hypernym = "@"
  DomainOfSynsetRegion = ";r"
  Antonym = "!"
  PartHolonym = "#p"
  SubstanceMeronym = "%s"
  VerbGroup = "$"
  DomainOfSynsetUsage = ";u"
  MemberOfThisDomainRegion = "-r"
  SubstanceHolonym = "#s"
  # The backslash symbol is shared: "Derived from adjective" for adverbs, "Pertainym" for adjectives.
  DerivedFromAdjective = "\\"
  Pertainym = "\\"
  ParticipleOfVerb = "<"
  SimilarTo = "&"
  Attribute = "="
  AlsoSee = "^"
  Cause = ">"
  MemberOfThisDomainUsage = "-u"
  DomainOfSynsetTopic = ";c"
  MemberMeronym = "%m"
  Hyponym = "~"
  InstanceHypernym = "@i"
  Entailment = "*"
  MemberHolonym = "#m"
end
@@ -0,0 +1,3 @@
1
module WordNet
  # Maps a part-of-speech code to the suffix of the matching WordNet file (data.noun, data.verb, ...).
  # Besides the original "adj"/"adv" keys, the one-letter codes that appear inside the index and
  # data files are accepted: "a" (adjective), "s" (adjective satellite, stored with the adjectives)
  # and "r" (adverb).
  SynsetType = {
    "n" => "noun",
    "v" => "verb",
    "adj" => "adj",
    "adv" => "adv",
    "a" => "adj",
    "s" => "adj",
    "r" => "adv"
  }
end
@@ -0,0 +1,90 @@
1
module WordNet

  # Represents a synset (or group of synonymous words) in WordNet. Synsets are related to each other by various (and numerous!)
  # relationships, including Hypernym (x is a hypernym of y <=> x is a parent of y) and Hyponym (x is a child of y)
  class Synset
    # One-letter part-of-speech codes used inside the WordNet files that may be missing from
    # SynsetType: "a" adjective, "s" adjective satellite, "r" adverb.
    DataFileAliases = { "a" => "adj", "s" => "adj", "r" => "adv" }

    attr_reader :gloss, :synset_offset, :lex_filenum, :ss_type, :w_cnt, :wordcounts

    # Create a new synset by reading from the data file specified by +pos+, at +offset+ bytes into the file. This is how
    # the WordNet database is organized. You shouldn't be creating Synsets directly; instead, use Lemma#synsets.
    #
    # Raises ArgumentError when +pos+ does not name a known part of speech.
    def initialize(pos, offset)
      # Block form so the file is closed even if seeking or reading raises.
      data_line = File.open(data_file_for(pos), "r") do |data|
        data.seek(offset)
        data.readline.strip
      end

      # Everything after the first " | " is the gloss, even if the gloss itself contains " | ".
      info_line, @gloss = data_line.split(" | ", 2)
      line = info_line.split(" ")

      @synset_offset = line.shift
      @lex_filenum = line.shift
      @ss_type = line.shift

      # w_cnt and each word's lex_id are written in hexadecimal in the data files.
      @w_cnt = line.shift.to_s.to_i(16)
      @wordcounts = {}
      @w_cnt.times do
        word = line.shift
        @wordcounts[word] = line.shift.to_s.to_i(16)
      end

      @p_cnt = line.shift.to_i
      @pointers = []
      @p_cnt.times { @pointers.push(parse_pointer(line)) }
    end

    # How many words does this Synset include?
    def size
      @wordcounts.size
    end

    # Get a list of words included in this Synset
    def words
      @wordcounts.keys
    end

    # Get the list of Synsets this one points to through +pointer_symbol+ (empty if there are none).
    # List of valid +pointer_symbol+s is in pointers.rb
    def get_relation(pointer_symbol)
      matching = @pointers.select { |pointer| pointer[:symbol] == pointer_symbol }
      # Each pointer carries the part of speech of its target, which may differ from this synset's.
      matching.map { |pointer| Synset.new(pointer[:pos], pointer[:offset]) }
    end

    # Get the list of Synsets that are antonyms of this sense
    def antonym
      get_relation(Antonym)
    end

    # Get the parent synset (higher-level category, i.e. fruit -> reproductive_structure).
    # Only the first hypernym is returned; +nil+ if there is none.
    def hypernym
      get_relation(Hypernym)[0]
    end

    # Get the child synset(s) (i.e., lower-level categories, i.e. fruit -> edible_fruit)
    def hyponym
      get_relation(Hyponym)
    end

    # Get the entire hypernym tree (from this synset all the way up to +entity+) as an array,
    # nearest ancestor first. The synset itself is not included.
    def expanded_hypernym
      parent = hypernym
      return [] if parent.nil?

      [parent] + parent.expanded_hypernym
    end

    # "(ss_type) word, word (gloss)", with underscores in words shown as spaces.
    def to_s
      "(#{@ss_type}) #{words.map { |x| x.tr('_', ' ') }.join(', ')} (#{@gloss})"
    end

    alias parent hypernym
    alias children hyponym

    private

    # Path of the data file that stores synsets of part of speech +pos+.
    def data_file_for(pos)
      suffix = SynsetType[pos] || DataFileAliases[pos]
      raise ArgumentError, "unknown part of speech: #{pos.inspect}" if suffix.nil?

      File.join(WordNetDB.path, "dict", "data.#{suffix}")
    end

    # Consume the four fields of one pointer (symbol, offset, pos, source/target) from +fields+.
    def parse_pointer(fields)
      pointer = Pointer.new
      pointer[:symbol] = fields.shift
      pointer[:offset] = fields.shift.to_i
      pointer[:pos] = fields.shift
      source_target = fields.shift
      # "0000" marks a semantic pointer; otherwise the first two digits are the source word
      # number and the last two the target word number.
      pointer[:is_semantic?] = (source_target == "0000")
      pointer[:target] = source_target[2..3]
      pointer[:source] = source_target[0..1]
      pointer
    end
  end

end
@@ -0,0 +1,54 @@
1
module WordNet

  # Entry point to the WordNet database: holds the installation path, looks words up across
  # all four indexes, and keeps track of the file handles that have been opened.
  class WordNetDB
    # Path of the WordNet installation; the bundled copy unless overridden.
    @@path = File.join(File.dirname(__FILE__),"/../../WordNet-3.0/")
    # Every handle opened through WordNetDB.open, keyed by the path it was opened with.
    @@files = {}

    # Point rwordnet at your own WordNet installation instead of the bundled one.
    def self.path=(path_to_wordnet)
      @@path = path_to_wordnet
    end

    # Path of the WordNet installation currently in use.
    def self.path
      @@path
    end

    # Look +word+ up in the noun, verb, adjective and adverb indexes (a fresh index object is
    # created for each). Returns the lemmas that were found, in that order.
    def self.find(word)
      index_classes = [NounIndex, VerbIndex, AdjectiveIndex, AdverbIndex]
      hits = index_classes.map { |index_class| index_class.new.find(word) }
      hits.flatten.compact
    end

    # Return a read-only handle for +path+, reusing the registered one while it is still open.
    # You shouldn't need to call this method yourself.
    def self.open(path)
      still_open = @@files.key?(path) && !@@files[path].closed?
      return @@files[path] if still_open

      @@files[path] = File.open(path,"r")
    end

    # Call this once you're done using WordNet; closes every registered handle.
    def self.close
      finalize(0)
    end

    # Close all registered handles. +id+ is not used.
    def self.finalize(id)
      @@files.each_value do |handle|
        begin
          handle.close
        rescue IOError
          # Best effort: carry on with the remaining handles.
          next
        end
      end
    end
  end

end
@@ -0,0 +1,17 @@
1
+ require "test/unit"
2
+ require File.dirname(__FILE__) + "/../lib/wordnet"
3
+
4
+
5
# Class-level helpers so that test cases can be declared with a description instead of a method name.
class << Test::Unit::TestCase
  # Define a test method named "test_<name>" (spaces in +name+ become underscores) whose body is +block+.
  # Raises ArgumentError if a method of that name already exists on the test case.
  def test(name, &block)
    test_name = :"test_#{name.gsub(' ','_')}"
    # method_defined? accepts the symbol directly; comparing strings against instance_methods
    # never matches on Rubies where instance_methods returns symbols.
    raise ArgumentError, "#{test_name} is already defined" if method_defined?(test_name)
    define_method test_name, &block
  end

  # Define a test, named after the last path segment of the caller's location, asserting that
  # +block+ (evaluated in the test instance) returns +expected_value+.
  def expect(expected_value, &block)
    define_method :"test_#{caller.first.split("/").last}" do
      assert_equal expected_value, instance_eval(&block)
    end
  end
end
@@ -0,0 +1,21 @@
1
+ require File.dirname(__FILE__) + "/../test_helper.rb"
2
+
3
# Exercises NounIndex lookups and the synsets reachable from the resulting lemma.
class TestIndex < Test::Unit::TestCase
  # Noun index created by the first setup call and reused by every later test.
  @@index = nil

  def setup
    @@index ||= WordNet::NounIndex.new
  end

  test 'find a lemma by string' do
    assert_equal "fruit,n", @@index.find("fruit").to_s
  end

  test 'get synsets for a lemma' do
    senses = @@index.find("fruit").get_synsets
    assert_equal 3, senses.size
    assert_equal "(n) yield, fruit (an amount of a product)", senses[1].to_s
  end
end
@@ -0,0 +1,44 @@
1
+ require File.dirname(__FILE__) + "/../test_helper.rb"
2
+
3
# Exercises Synset parsing and relation lookups using the noun senses of 'fruit'.
class TestSynset < Test::Unit::TestCase
  # Synsets of the noun 'fruit', loaded by the first setup call and reused by every later test.
  @@synsets = nil

  def setup
    if @@synsets.nil?
      index = WordNet::NounIndex.new
      lemma = index.find("fruit")
      @@synsets = lemma.get_synsets
    end
  end

  test 'get synsets for a lemma' do
    assert_equal 3, @@synsets.size
    assert_equal "(n) fruit (the ripened reproductive body of a seed plant)",@@synsets[0].to_s
    assert_equal "an amount of a product",@@synsets[1].gloss
  end

  test 'get hypernym for a synset' do
    # The size assertion is made on the list returned by get_relation, not on a single synset
    # (whose size is its word count).
    hypernyms = @@synsets[0].get_relation(WordNet::Hypernym)
    assert_equal 1, hypernyms.size

    hypernym = @@synsets[0].hypernym
    assert_equal "(n) reproductive structure (the parts of a plant involved in its reproduction)",hypernym.to_s
  end

  test 'test shorthand for get_relation' do
    hypernym = @@synsets[0].get_relation(WordNet::Hypernym)
    hypernym2 = @@synsets[0].hypernym
    assert_equal hypernym[0].gloss, hypernym2.gloss
  end

  test 'get hyponyms for a synset' do
    hyponym = @@synsets[0].get_relation(WordNet::Hyponym)
    assert_equal 29,hyponym.size
    assert_equal "fruit of various buckthorns yielding dyes or pigments",hyponym[26].gloss
  end

  test 'test expanded hypernym tree' do
    expanded = @@synsets[0].expanded_hypernym
    assert_equal 8, expanded.size
    assert_equal "entity", expanded.last.words[0]
  end
end