rwordnet 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,21 @@
1
# Command-line dictionary backed by WordNet: prints every part-of-speech entry
# (lemma) found for the given word, each followed by a numbered list of senses.
require 'rubygems'
require 'wordnet'

# Exactly one argument, the word to look up, is expected.
unless ARGV.size == 1
  puts "Usage: ruby dictionary.rb word"
  exit(1)
end

# WordNetDB.find returns one lemma per part of speech the word occurs as
# (noun, verb, ...). Each synset of a lemma is one possible meaning, and its
# gloss is the human-readable definition.
WordNet::WordNetDB.find(ARGV[0]).each do |lemma|
  puts lemma
  lemma.synsets.each_with_index do |synset, position|
    puts "\t#{position + 1}) #{synset.gloss}"
  end
end
@@ -0,0 +1,12 @@
1
# Example: print the first noun sense of 'fruit' followed by its full chain of
# hypernyms (parent categories).
require 'rubygems'
require 'wordnet'

# Look 'fruit' up in the noun index and keep only its first synset (sense).
nouns = WordNet::NounIndex.new
first_sense = nouns.find("fruit").synsets[0]
puts first_sense

# expanded_hypernym returns the ancestors in order, nearest parent first.
first_sense.expanded_hypernym.each do |ancestor|
  puts ancestor
end
data/lib/wordnet.rb ADDED
@@ -0,0 +1,7 @@
1
+ require 'wordnet/pointer'
2
+ require 'wordnet/wordnetdb'
3
+ require 'wordnet/index'
4
+ require 'wordnet/lemma'
5
+ require 'wordnet/pointers'
6
+ require 'wordnet/pos'
7
+ require 'wordnet/synset'
@@ -0,0 +1,65 @@
1
module WordNet

  # Index is a WordNet lexicon for one part of speech. Note that Index is the
  # base class; you probably want to be using the NounIndex, VerbIndex, etc.
  # classes instead.
  #
  # The index file is read lazily: each call to #find reads only as far as it
  # needs to, and every lemma seen on the way is cached for later lookups.
  class Index
    # Create a new index for the given part of speech. +pos+ can be one of
    # +noun+, +verb+, +adj+, or +adv+ (it is used as the suffix of the
    # "index.<pos>" file name).
    def initialize(pos)
      @pos = pos
      @db = {}
      # Byte offset up to which *this* index has consumed the index file. The
      # file handle returned by WordNetDB.open is shared between Index
      # instances, so the position must be tracked per instance rather than
      # relying on wherever the shared handle happens to be.
      @offset = 0
    end

    # Find a lemma for a given word.
    #
    # lemma_str:: the word to look up, exactly as written in the index file.
    #
    # Returns a Lemma, which can then be used to access the synsets for the
    # word, or +nil+ when the word is not in this index.
    def find(lemma_str)
      # Look for the lemma in the part of the DB already read...
      return @db[lemma_str] if @db.key?(lemma_str)

      # ...otherwise resume reading the index file where this instance stopped.
      index = WordNetDB.open(File.join(WordNetDB.path, "dict", "index.#{@pos}"))
      index.seek(@offset)
      # Plain prefix comparison: the word is data, not a regular expression.
      wanted_prefix = "#{lemma_str} "

      until index.eof?
        line = index.readline
        @offset = index.pos
        # The WordNet database format starts each file with licence lines that
        # begin with two spaces; they are not lemma entries.
        next if line.start_with?("  ")

        lemma = Lemma.new(line)
        @db[lemma.word] = lemma
        return lemma if line.start_with?(wanted_prefix)
      end

      # The whole file has been consumed; release the handle.
      index.close
      nil
    end
  end

  # An Index of nouns.
  class NounIndex < Index
    def initialize
      super("noun")
    end
  end

  # An Index of verbs.
  class VerbIndex < Index
    def initialize
      super("verb")
    end
  end

  # An Index of adjectives.
  class AdjectiveIndex < Index
    def initialize
      super("adj")
    end
  end

  # An Index of adverbs.
  class AdverbIndex < Index
    def initialize
      super("adv")
    end
  end

end
@@ -0,0 +1,38 @@
1
module WordNet

  # Represents a single word in the WordNet lexicon, which can be used to look
  # up a set of synsets.
  class Lemma
    attr_accessor :lemma, :pos, :synset_cnt, :p_cnt, :ptr_symbol, :tagsense_cnt, :synset_offset

    # Create a lemma from a line in an index file. You shouldn't be creating
    # Lemmas by hand; instead, use the WordNetDB.find and Index#find methods to
    # find the Lemma for a word.
    #
    # The whitespace-separated fields are consumed in this order: lemma, pos,
    # synset_cnt, p_cnt, p_cnt pointer symbols, sense_cnt, tagsense_cnt and
    # finally synset_cnt synset offsets.
    def initialize(index_line)
      fields = index_line.split(" ")

      @lemma      = fields.shift
      @pos        = fields.shift
      @synset_cnt = fields.shift.to_i
      @p_cnt      = fields.shift.to_i

      @ptr_symbol = @p_cnt.times.map { fields.shift }
      fields.shift # sense_cnt is redundant, so it is read and dropped
      @tagsense_cnt  = fields.shift.to_i
      @synset_offset = @synset_cnt.times.map { fields.shift.to_i }
    end

    # Return a list of synsets for this Lemma. Each synset represents a
    # different sense, or meaning, of the word. One Synset is built per stored
    # offset, using this lemma's part of speech.
    def get_synsets
      @synset_offset.map do |byte_offset|
        Synset.new(@pos, byte_offset)
      end
    end

    # "<lemma>,<pos>", e.g. "fruit,n".
    def to_s
      [@lemma, @pos].join(",")
    end

    alias synsets get_synsets
    alias word lemma
  end

end
@@ -0,0 +1,15 @@
1
module WordNet

  # Convenience class for treating hashes as objects, i.e. obj[:key] <=> obj.key.
  # Only keys that are present in the hash are exposed as reader methods.
  class Pointer < Hash
    # Return the value stored under +msg+ when it is a key of this hash.
    # Anything else is passed on to the default handler, which raises
    # NoMethodError (the original used +throw+, which produced an unrelated
    # "uncaught throw" error instead).
    def method_missing(msg, *args)
      return self[msg] if key?(msg)

      super
    end

    # Keep respond_to? consistent with method_missing: every stored key is
    # reported as an available method.
    def respond_to_missing?(msg, include_private = false)
      key?(msg) || super
    end
  end

end
@@ -0,0 +1,37 @@
1
+ # A container for various constants. In particular, contains constants representing the WordNet symbols used to look up synsets by relation, i.e. Hypernym/Hyponym.
2
+ # Use these symbols in conjunction with the Synset#get_relation method.
3
+
4
module WordNet

  # Pointer symbol => description tables, one per part of speech. The entries
  # follow the pointer symbol tables of the WordNet lexicographer file format;
  # "Also see" (verbs, adjectives) and adjective "Domain of synset - USAGE"
  # were missing and are appended so existing entry order is unchanged.
  NounPointers = {
    "-c" => "Member of this domain - TOPIC",
    "+" => "Derivationally related form",
    "%p" => "Part meronym",
    "~i" => "Instance Hyponym",
    "@" => "Hypernym",
    ";r" => "Domain of synset - REGION",
    "!" => "Antonym",
    "#p" => "Part holonym",
    "%s" => "Substance meronym",
    ";u" => "Domain of synset - USAGE",
    "-r" => "Member of this domain - REGION",
    "#s" => "Substance holonym",
    "=" => "Attribute",
    "-u" => "Member of this domain - USAGE",
    ";c" => "Domain of synset - TOPIC",
    "%m" => "Member meronym",
    "~" => "Hyponym",
    "@i" => "Instance Hypernym",
    "#m" => "Member holonym"
  }
  VerbPointers = {
    "+" => "Derivationally related form",
    "@" => "Hypernym",
    ";r" => "Domain of synset - REGION",
    "!" => "Antonym",
    ";u" => "Domain of synset - USAGE",
    "$" => "Verb Group",
    ";c" => "Domain of synset - TOPIC",
    ">" => "Cause",
    "~" => "Hyponym",
    "*" => "Entailment",
    "^" => "Also see"
  }
  AdjectivePointers = {
    ";r" => "Domain of synset - REGION",
    "!" => "Antonym",
    "\\" => "Pertainym (pertains to noun)",
    "<" => "Participle of verb",
    "&" => "Similar to",
    "=" => "Attribute",
    ";c" => "Domain of synset - TOPIC",
    "^" => "Also see",
    ";u" => "Domain of synset - USAGE"
  }
  AdverbPointers = {
    ";r" => "Domain of synset - REGION",
    "!" => "Antonym",
    ";u" => "Domain of synset - USAGE",
    "\\" => "Derived from adjective",
    ";c" => "Domain of synset - TOPIC"
  }

  # Named pointer symbols, for use with Synset#get_relation.
  MemberOfThisDomainTopic = "-c"
  DerivationallyRelatedForm = "+"
  PartMeronym = "%p"
  InstanceHyponym = "~i"
  Hypernym = "@"
  DomainOfSynsetRegion = ";r"
  Antonym = "!"
  PartHolonym = "#p"
  SubstanceMeronym = "%s"
  VerbGroup = "$"
  DomainOfSynsetUsage = ";u"
  MemberOfThisDomainRegion = "-r"
  SubstanceHolonym = "#s"
  # "\\" means "Derived from adjective" on adverbs and "Pertainym" on
  # adjectives, so both names map to the same symbol.
  DerivedFromAdjective = "\\"
  Pertainym = "\\"
  ParticipleOfVerb = "<"
  SimilarTo = "&"
  Attribute = "="
  AlsoSee = "^"
  Cause = ">"
  MemberOfThisDomainUsage = "-u"
  DomainOfSynsetTopic = ";c"
  MemberMeronym = "%m"
  Hyponym = "~"
  InstanceHypernym = "@i"
  Entailment = "*"
  MemberHolonym = "#m"
end
@@ -0,0 +1,3 @@
1
module WordNet
  # Maps a part-of-speech code to the suffix of the matching "data.<suffix>"
  # file. WordNet's index and data files use one-letter codes: n (noun),
  # v (verb), a (adjective), s (adjective satellite) and r (adverb). The long
  # "adj"/"adv" keys are kept so existing callers continue to work.
  SynsetType = {
    "n" => "noun",
    "v" => "verb",
    "a" => "adj",
    "s" => "adj",
    "r" => "adv",
    "adj" => "adj",
    "adv" => "adv"
  }
end
@@ -0,0 +1,90 @@
1
module WordNet

  # Represents a synset (or group of synonymous words) in WordNet. Synsets are
  # related to each other by various (and numerous!) relationships, including
  # Hypernym (x is a hypernym of y <=> x is a parent of y) and Hyponym (x is a
  # child of y).
  class Synset
    attr_reader :gloss, :synset_offset, :lex_filenum, :ss_type, :w_cnt, :wordcounts

    # One-letter part-of-speech codes used inside WordNet's index and data
    # files, mapped to the suffix of the data file holding that part of speech.
    # Consulted when SynsetType has no entry for a code.
    DATA_FILE_SUFFIX = { "n" => "noun", "v" => "verb", "a" => "adj", "s" => "adj", "r" => "adv" }

    # Create a new synset by reading from the data file specified by +pos+, at
    # +offset+ bytes into the file. This is how the WordNet database is
    # organized. You shouldn't be creating Synsets directly; instead, use
    # Lemma#synsets.
    #
    # Raises ArgumentError when +pos+ names no known data file.
    def initialize(pos, offset)
      # Block form guarantees the handle is closed even if the read raises.
      data_line = File.open(data_file(pos), "r") do |data|
        data.seek(offset)
        data.readline.strip
      end

      # Split on the first separator only, so a gloss that itself contains
      # " | " is kept whole.
      info_line, @gloss = data_line.split(" | ", 2)
      line = info_line.split(" ")

      @synset_offset = line.shift
      @lex_filenum = line.shift
      @ss_type = line.shift
      # w_cnt and lex_id are hexadecimal in the WordNet data file format.
      @w_cnt = line.shift.to_s.hex
      # Maps each word of the synset to the lex_id that follows it in the file.
      @wordcounts = {}
      @w_cnt.times do
        word = line.shift
        @wordcounts[word] = line.shift.to_s.hex
      end

      @p_cnt = line.shift.to_i
      @pointers = @p_cnt.times.map { read_pointer(line) }
    end

    # How many words does this Synset include?
    def size
      @wordcounts.size
    end

    # Get a list of words included in this Synset
    def words
      @wordcounts.keys
    end

    # Get the synsets related to this one by +pointer_symbol+. The list of
    # valid +pointer_symbol+s is in pointers.rb. Each target is loaded from the
    # data file of the pointer's own part of speech, which may differ from this
    # synset's (e.g. a noun's Attribute pointer targets an adjective).
    def get_relation(pointer_symbol)
      matching = @pointers.select { |pointer| pointer[:symbol] == pointer_symbol }
      matching.map { |pointer| Synset.new(pointer[:pos], pointer[:offset]) }
    end

    # Get the Synsets of this sense's antonyms (an array, possibly empty).
    def antonym
      get_relation(Antonym)
    end

    # Get the parent synset (higher-level category, i.e. fruit ->
    # reproductive_structure). Returns nil when there is no hypernym pointer.
    def hypernym
      get_relation(Hypernym)[0]
    end

    # Get the child synset(s) (i.e., lower-level categories, i.e. fruit ->
    # edible_fruit)
    def hyponym
      get_relation(Hyponym)
    end

    # Get the entire hypernym chain (from this synset's parent up to the
    # topmost ancestor) as an array, nearest parent first.
    def expanded_hypernym
      parent = self.hypernym
      return [] if parent.nil?

      [parent, parent.expanded_hypernym].flatten
    end

    # "(<ss_type>) word, other word (<gloss>)", with underscores in words shown
    # as spaces.
    def to_s
      "(#{@ss_type}) #{words.map {|x| x.gsub('_',' ')}.join(', ')} (#{@gloss})"
    end

    alias parent hypernym
    alias children hyponym

    private

    # Path of the data file that stores synsets of the given part of speech.
    def data_file(pos)
      suffix = SynsetType[pos] || DATA_FILE_SUFFIX[pos]
      raise ArgumentError, "unknown part of speech: #{pos.inspect}" if suffix.nil?

      File.join(WordNetDB.path, "dict", "data.#{suffix}")
    end

    # Consume one pointer (symbol, offset, pos, source/target) from +fields+.
    def read_pointer(fields)
      pointer = Pointer.new
      pointer[:symbol] = fields.shift
      pointer[:offset] = fields.shift.to_i
      pointer[:pos] = fields.shift
      # Four characters: two for the source word number, two for the target.
      # "0000" marks a pointer between whole synsets rather than single words.
      source_target = fields.shift
      pointer[:source] = source_target[0..1]
      pointer[:is_semantic?] = (source_target == "0000")
      pointer[:target] = source_target[2..3]
      pointer
    end
  end

end
@@ -0,0 +1,54 @@
1
module WordNet

  # Represents the WordNet database, and provides some basic interaction.
  class WordNetDB
    # By default, use the bundled WordNet
    @@path = File.join(File.dirname(__FILE__),"/../../WordNet-3.0/")
    # Every file handle opened through WordNetDB.open, keyed by path.
    @@files = {}

    # To use your own WordNet installation (rather than the one bundled with
    # rwordnet):
    def self.path=(path_to_wordnet)
      @@path = path_to_wordnet
    end

    # Returns the path to the WordNet installation currently in use. Defaults
    # to the bundled version of WordNet.
    def self.path
      @@path
    end

    # Look up a word in WordNet. Returns a list of lemmas occurring in any of
    # the index files (noun, verb, adjective, adverb); parts of speech in which
    # the word does not occur are left out.
    def self.find(word)
      index_classes = [NounIndex, VerbIndex, AdjectiveIndex, AdverbIndex]
      hits = index_classes.map { |index_class| index_class.new.find(word) }
      hits.flatten.compact
    end

    # Register a new DB file handle. You shouldn't need to call this method;
    # it's called automatically every time you open an index or data file.
    # A handle that is already registered and still open is reused.
    def self.open(path)
      cached = @@files[path]
      return cached unless cached.nil? || cached.closed?

      # Open and store
      @@files[path] = File.open(path, "r")
    end

    # You should call this method after you're done using WordNet.
    def self.close
      finalize(0)
    end

    # Close every registered handle. +id+ is accepted but not used.
    def self.finalize(id)
      @@files.each_value do |handle|
        begin
          handle.close
        rescue IOError
          next # Keep going, close the next file.
        end
      end
    end
  end

end
@@ -0,0 +1,17 @@
1
+ require "test/unit"
2
+ require File.dirname(__FILE__) + "/../lib/wordnet"
3
+
4
+
5
# Class-level helpers that let test cases be declared with a description
# string instead of a hand-written test_* method name.
class << Test::Unit::TestCase
  # Define a test method named "test_<name>", with spaces in +name+ replaced by
  # underscores, whose body is +block+.
  #
  # Raises ArgumentError when a method of that name already exists.
  def test(name, &block)
    test_name = :"test_#{name.gsub(' ','_')}"
    # method_defined? accepts a Symbol on every Ruby version; comparing a
    # String against instance_methods never matches where that list holds
    # Symbols, which silently disabled this duplicate check.
    raise ArgumentError, "#{test_name} is already defined" if method_defined?(test_name)
    define_method test_name, &block
  end

  # Define a test asserting that +block+, evaluated in the test instance,
  # equals +expected_value+. The method name is derived from the last path
  # segment of the caller's location.
  def expect(expected_value, &block)
    define_method :"test_#{caller.first.split("/").last}" do
      assert_equal expected_value, instance_eval(&block)
    end
  end
end
@@ -0,0 +1,21 @@
1
+ require File.dirname(__FILE__) + "/../test_helper.rb"
2
+
3
# Tests for looking words up through WordNet::NounIndex.
class TestIndex < Test::Unit::TestCase
  # One index is shared by all tests so lemmas already read stay cached.
  @@index = nil

  def setup
    @@index ||= WordNet::NounIndex.new
  end

  test 'find a lemma by string' do
    assert_equal "fruit,n", @@index.find("fruit").to_s
  end

  test 'get synsets for a lemma' do
    synsets = @@index.find("fruit").get_synsets
    assert_equal 3, synsets.size
    assert_equal "(n) yield, fruit (an amount of a product)", synsets[1].to_s
  end
end
@@ -0,0 +1,44 @@
1
+ require File.dirname(__FILE__) + "/../test_helper.rb"
2
+
3
# Tests for WordNet::Synset, run against the noun senses of 'fruit'.
class TestSynset < Test::Unit::TestCase
  # The synsets are loaded once and shared by every test.
  @@synsets = nil

  def setup
    if @@synsets.nil?
      index = WordNet::NounIndex.new
      lemma = index.find("fruit")
      @@synsets = lemma.get_synsets
    end
  end

  test 'get synsets for a lemma' do
    assert_equal 3, @@synsets.size
    assert_equal "(n) fruit (the ripened reproductive body of a seed plant)",@@synsets[0].to_s
    assert_equal "an amount of a product",@@synsets[1].gloss
  end

  test 'get hypernym for a synset' do
    # Synset#hypernym returns a single Synset, so #size below is its word
    # count. (A get_relation result that was assigned here and immediately
    # overwritten has been removed.)
    hypernym = @@synsets[0].hypernym
    assert_equal 1,hypernym.size
    assert_equal "(n) reproductive structure (the parts of a plant involved in its reproduction)",hypernym.to_s
  end

  test 'test shorthand for get_relation' do
    hypernym = @@synsets[0].get_relation(WordNet::Hypernym)
    hypernym2 = @@synsets[0].hypernym
    assert_equal hypernym[0].gloss, hypernym2.gloss
  end

  test 'get hyponyms for a synset' do
    hyponym = @@synsets[0].get_relation(WordNet::Hyponym)
    assert_equal 29,hyponym.size
    assert_equal "fruit of various buckthorns yielding dyes or pigments",hyponym[26].gloss
  end

  test 'test expanded hypernym tree' do
    expanded = @@synsets[0].expanded_hypernym
    assert_equal 8, expanded.size
    assert_equal "entity", expanded[expanded.size-1].words[0]
  end
end