words 0.3.1 → 0.4.0
- data/.gitignore +3 -4
- data/README.markdown +45 -14
- data/VERSION +1 -1
- data/bin/build_wordnet +75 -86
- data/examples.rb +44 -31
- data/lib/evocations.rb +85 -0
- data/lib/homographs.rb +106 -0
- data/lib/relation.rb +91 -0
- data/lib/synset.rb +199 -0
- data/lib/wordnet_connection.rb +187 -0
- data/lib/wordnet_connectors/pure_wordnet_connection.rb +142 -0
- data/lib/wordnet_connectors/tokyo_wordnet_connection.rb +85 -0
- data/lib/words.rb +79 -498
- data/spec/words_spec.rb +113 -0
- data/words.gemspec +11 -6
- metadata +11 -6
- data/test/helper.rb +0 -9
- data/test/test_words.rb +0 -7
data/.gitignore
CHANGED
data/README.markdown
CHANGED
@@ -6,6 +6,7 @@ Words implements a fast interface to [Wordnet®](http://wordnet.princeton.edu) w

 * Version 0.2 Introduced Pure Ruby Backend
 * Version 0.3 Introduced Evocation Support (see examples & below) as developed by the [Wordnet® Evocation Project](http://wordnet.cs.princeton.edu/downloads/evocation/release-0.4/README.TXT)
+* Version 0.4 Substantial performance increase in pure mode (now faster at some things than the tokyo backend) and simplification of use! Full refactoring. Move to RSpec for testing.

 ## Pre-Installation ##

@@ -31,12 +32,20 @@ or (Windows)
     Download http://wordnetcode.princeton.edu/3.0/WNdb-3.0.tar.gz
     Unzip

+    # due to the way windows tends not to have a folder for things you will
+    # have to specify the location of the wordnet files when using the gem
+
 ## For Tokyo Backend Only ##

-Unless you want to use the tokyo backend you are now ready to install Words
+Unless you want to use the tokyo backend you are now ready to install Words (and build the data if you want extras), otherwise if you want to use the tokyo backend (FAST!) you will also need [Tokyo Cabinet](http://1978th.net/tokyocabinet/) installed. It should be nice and easy... something like:
+
+    # osx users should, if ports is installed, simply do
+    sudo ports install tokyocabinet

+    # otherwise the best route is from source
     wget http://1978th.net/tokyocabinet/tokyocabinet-1.4.41.tar.gz
-
+    tar -xzf tokyocabinet-1.4.41.tar.gz
+    cd tokyocabinet-1.4.41/
     ./configure
     make
     sudo make install
@@ -56,18 +65,24 @@ Then your ready to rock and roll. :)

 ## Build Data ##

-
+If all you want to do is use wordnet in it's standard form you don't have to do any databuilding and can skip this section. If however you either
+want to take advantage of evocations ([Wordnet® Evocation Project](http://wordnet.cs.princeton.edu/downloads/evocation/release-0.4/README.TXT)) or want to use the tokyo backend, read on!
+
+To build the wordnet dataset file yourself, from the original wordnet files, you can use the bundled "build_wordnet" command

     build_wordnet -h # this will give you the usage information & additional options/features

-    # this would attempt to build the tokyo backend data locating the original wordnet
-
+    # this would attempt to build the tokyo backend data locating the original wordnet
+    # files through a search...
+    sudo build_wordnet --build-tokyo

-    # this would attempt to build the
-
+    # this would attempt to build the tokyo backend locating the original wordnet files
+    # through a search with the addition of evocation support...
+    sudo build_wordnet --build-tokyo-with-evocations

-    # this would attempt to build
-
+    # this would attempt to build evocation support for the pure backend
+    # (remember no dataset needs to be built to use wordnet with the pure backend)
+    sudo build_wordnet --build-pure-evocations

 ## Usage ##

@@ -76,8 +91,19 @@ Heres a few little examples of using words within your programs.
     require 'rubygems'
     require 'words'

-    data = Words::
-
+    data = Words::Wordnet.new # or: data = Words::Words.new(:tokyo) for the tokyo backend
+
+    # to specify a wordnet path Words::Words.new(:pure, '/path/to/wordnet')
+    # to specify the tokyo dataset Words::Words.new(:pure, :search, '/path/to/data.tct')
+
+    # play with connections
+    data.connected? # => true
+    data.close!
+    data.connected? # => false
+    data.open!
+    data.connected? # => true
+    data.connection_type # => :pure or :tokyo depending...
+
     # locate a word
     lemma = data.find("bat")

@@ -112,9 +138,14 @@ Heres a few little examples of using words within your programs.
     sense.derivationally_related_forms.first.source_word # => "bat"
     sense.derivationally_related_forms.first.destination_word # => "bat"
     sense.derivationally_related_forms.first.destination # => the synset of v01413191
-
-    data.
-
+
+    if data.evocations? # check for evocation support
+      data.find("broadcast").senses.first.evocations # => sense relevant evocations
+      data.find("broadcast").senses.first.evocations[1] # => the evocation at index 1
+      data.find("broadcast").senses.first.evocations[1][:destination].words # => synset
+    end
+
+    data.close!

 These and more examples are available from within the examples.rb file!

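Taken together, the README snippets above cover the whole 0.4.0 session lifecycle. A minimal end-to-end sketch in the same style, assuming a standard pure-mode install (the tokyo backend and the evocations dataset are optional extras):

    require 'rubygems'
    require 'words'

    data = Words::Wordnet.new # pure mode is the default
    data.connected?           # => true

    # look a word up
    lemma = data.find("bat")
    puts lemma                       # e.g. "bat, noun/verb"
    puts lemma.available_pos.inspect

    # evocations are only available once the extra dataset has been built
    puts data.find("broadcast").senses.first.evocations.size if data.evocations?

    data.close!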
data/VERSION
CHANGED
@@ -1 +1 @@
-0.3.1
+0.4.0
data/bin/build_wordnet
CHANGED
@@ -5,34 +5,31 @@ require 'pathname'

 # gem includes
 require 'rubygems'
+
+# standard library includes
 require 'trollop'
 require 'zlib'
 require 'net/http'

+# local includes
+require File.join(File.dirname(__FILE__), '..', 'lib', 'words.rb')
+
 POS_FILE_TYPES = %w{ adj adv noun verb }
 POS_FILE_TYPE_TO_SHORT = { 'adj' => 'a', 'adv' => 'r', 'noun' => 'n', 'verb' => 'v' }

-def locate_wordnet(base_dir)
-  puts "Checking #{base_dir} & spcific children for wordnet files..." if VERBOSE
-  path = Pathname.new base_dir
-  return nil unless path.exist?
-  return path if (path + "data.noun").exist?
-  return path + "dict" if (path + "dict/data.noun").exist?
-end
-
 puts "Words Dataset Constructor 2010 (c) Roja Buck"

 opts = Trollop::options do
-
-
-
-
-
+  opt :quiet, "Don't output verbose program detail.", :default => false
+  opt :wordnet, "Location of the wordnet dictionary directory", :default => "Search..."
+  opt :build_tokyo, "Build the tokyo wordnet dataset?", :default => false
+  opt :build_tokyo_with_evocations, "Build the tokyo dataset with the similarity dataset based on the wordnet evocation project (requires internet connection)?", :default => false
+  opt :build_pure_evocations, "Build the similarity dataset based on the wordnet evocation project for use with the pure words mode (requires internet connection)", :default => false
 end
-Trollop::die :build_tokyo, "You need to specify
-puts "Verbose mode enabled" if (VERBOSE = opts[:
+Trollop::die :build_tokyo, "You need to specify which dataset you want to build." if !opts[:build_tokyo] && !opts[:build_tokyo_with_evocations] && !opts[:build_pure_evocations]
+puts "Verbose mode enabled" if (VERBOSE = !opts[:quiet])

-require 'rufus-tokyo' if opts[:build_tokyo]
+require 'rufus-tokyo' if opts[:build_tokyo] || opts[:build_tokyo_with_evocations]

 gem_path = Pathname.new "#{File.dirname(__FILE__)}/.."
 abort "Ensure you run the command using sudo or as a Superuser / Administrator" unless gem_path.writable?
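For orientation: Trollop derives each long flag from the `opt` name (underscores become dashes) and returns a hash keyed by those names, which is what the `Trollop::die` guard above checks. A hypothetical invocation:

    # sudo build_wordnet --build-tokyo-with-evocations
    # => opts[:build_tokyo_with_evocations] is true,
    #    opts[:build_tokyo] and opts[:build_pure_evocations] stay false,
    #    and opts[:wordnet] keeps its "Search..." default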
@@ -41,13 +38,11 @@ data_path.mkpath

 wordnet_dir = nil
 if opts[:wordnet] == "Search..."
-
-
-  end
-  abort( "Unable to locate wordnet dictionary. To specify check --help." ) if wordnet_dir.nil?
+  wordnet_dir = Words::Wordnet.locate_wordnet :search
+  abort( "Unable to locate wordnet dictionary. To specify check --help." ) if wordnet_dir.nil?
 else
-
-
+  wordnet_dir = Words::Wordnet.locate_wordnet opts[:wordnet]
+  abort( "Unable to locate wordnet dictionary in directory #{opts[:wordnet]}. Please check and try again." ) if wordnet_dir.nil?
 end

 # At this point we know we should have a wordnet directory within wordnet_dir
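The `locate_wordnet` helper the old script defined inline (removed in the first hunk above) now lives on `Words::Wordnet`. The gem's implementation isn't shown in this diff; judging from the removed body, the explicit-path call presumably resolves through a directory check of roughly this shape, with `:search` additionally trying standard install locations (a sketch, not the gem's actual source):

    # inferred from the removed script-local helper
    def self.locate_wordnet(base_dir)
      path = Pathname.new base_dir
      return nil unless path.exist?
      return path if (path + "data.noun").exist?
      return path + "dict" if (path + "dict/data.noun").exist?
    end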
@@ -56,9 +51,9 @@ puts "Found wordnet files in #{wordnet_dir}..." if VERBOSE
 index_files = POS_FILE_TYPES.map { |pos| wordnet_dir + "index.#{pos}" }
 data_files = POS_FILE_TYPES.map { |pos| wordnet_dir + "data.#{pos}" }

-
-
-
+(index_files + data_files).each do |required_file|
+  abort( "Unable to locate #{required_file} within the wordnet dictionary. Please check your wordnet copy is valid and try again." ) unless required_file.exist?
+  abort( "Cannot get readable permissions to #{required_file} within the wordnet dictionary. Please check the file permissions and try again." ) unless required_file.readable?
 end

 # At this point we know we have the correct files, though we don't know there validity
@@ -70,88 +65,82 @@ index_hash = Hash.new
 data_hash = Hash.new
 POS_FILE_TYPES.each do |file_pos|

-
+  puts "Building #{file_pos} indexes..." if VERBOSE

-
-
-
-
+  # add indexes
+  (wordnet_dir + "index.#{file_pos}").each_line do |index_line|
+    next if index_line[0, 2] == "  "
+    index_parts = index_line.split(" ")

-
-
-
-
-
+    lemma, pos, synset_count, pointer_count = index_parts.shift, index_parts.shift, index_parts.shift.to_i, index_parts.shift.to_i
+    pointer_symbols = Array.new(pointer_count).map { POS_FILE_TYPE_TO_SHORT[file_pos] + index_parts.shift }
+    sense_count = index_parts.shift
+    tagsense_count = pos + index_parts.shift
+    synset_ids = Array.new(synset_count).map { POS_FILE_TYPE_TO_SHORT[file_pos] + index_parts.shift }

-
-
+    index_hash[lemma] = { "synset_ids" => [], "tagsense_counts" => [] } if index_hash[lemma].nil?
+    index_hash[lemma] = { "lemma" => lemma, "synset_ids" => index_hash[lemma]["synset_ids"] + synset_ids, "tagsense_counts" => index_hash[lemma]["tagsense_counts"] + [tagsense_count] }

-
+  end

-
-
+  if opts[:build_tokyo] || opts[:build_tokyo_with_evocations]
+    puts "Building #{file_pos} data..." if VERBOSE

-
-
-
-
-
+    # add data
+    (wordnet_dir + "data.#{file_pos}").each_line do |data_line|
+      next if data_line[0, 2] == "  "
+      data_line, gloss = data_line.split(" | ")
+      data_parts = data_line.split(" ")

-
-
-
+      synset_id, lexical_filenum, synset_type, word_count = POS_FILE_TYPE_TO_SHORT[file_pos] + data_parts.shift, data_parts.shift, data_parts.shift, data_parts.shift.to_i(16)
+      words = Array.new(word_count).map { "#{data_parts.shift}.#{data_parts.shift}" }
+      relations = Array.new(data_parts.shift.to_i).map { "#{data_parts.shift}.#{data_parts.shift}.#{data_parts.shift}.#{data_parts.shift}" }

-
-
+      data_hash[synset_id] = { "synset_id" => synset_id, "lexical_filenum" => lexical_filenum, "synset_type" => synset_type,
+        "words" => words.join('|'), "relations" => relations.join('|'), "gloss" => gloss.strip }
+    end
   end
-  end

 end

 score_hash = Hash.new
-if opts[:
-
-
-
-
-
-
-
+if opts[:build_tokyo_with_evocations] || opts[:build_pure_evocations]
+  puts "Downloading score data..." if VERBOSE
+  scores_file = data_path + "scores.txt.gz"
+  scores_file.delete if scores_file.exist?
+  File.open(scores_file,'w') do |file|
+    file.write Net::HTTP.get(URI.parse('http://cloud.github.com/downloads/roja/words/scores.txt.gz'))
+  end
+  abort( "Unable to gather similarities information from http://cloud.github.com/downloads/roja/words/scores.txt.gz... Try again later." ) unless scores_file.exist?

-
-
-
-
-
-
-
-
-
-
-
+  puts "Compiling score data..." if VERBOSE
+  Zlib::GzipReader.open(scores_file) do |gz|
+    gz.each_line do |line|
+      mean, median, sense1, sense2 = line.split(',')
+      senses = [sense1, sense2].map! { |sense| sense.strip.split('.') }.map! { |sense| index_hash[sense[0]]["synset_ids"].select { |synset_id| synset_id[0,1] == sense[1].gsub("s", "a") }[sense[2].to_i-1] }
+      senses.each do |sense|
+        relation = (senses - [sense]).first.nil? ? sense : (senses - [sense]).first
+        score_name = sense + "s"
+        score_hash[score_name] = { "relations" => [], "means" => [], "medians" => [] } if score_hash[score_name].nil?
+        score_hash[score_name] = { "relations" => score_hash[score_name]["relations"] << relation, "means" => score_hash[score_name]["means"] << mean, "medians" => score_hash[score_name]["medians"] << median }
+      end unless senses.include? nil
+    end
   end
-  end
 end

-if opts[:build_tokyo]
-
-
-
-
-
+if opts[:build_tokyo] || opts[:build_tokyo_with_evocations]
+  tokyo_hash = Rufus::Tokyo::Table.new((data_path + "wordnet.tct").to_s)
+  index_hash.each { |k,v| tokyo_hash[k] = { "lemma" => v["lemma"], "synset_ids" => v["synset_ids"].join('|'), "tagsense_counts" => v["tagsense_counts"].join('|') } }
+  data_hash.each { |k,v| tokyo_hash[k] = v }
+  score_hash.each { |k,v| tokyo_hash[k] = { "relations" => v["relations"].join('|'), "means" => v["means"].join('|'), "medians" => v["medians"].join('|') } } if opts[:build_tokyo_with_evocations]
+  tokyo_hash.close
 end

-if opts[:
-index = Hash.new
-index_hash.each { |k,v| index[k] = [v["lemma"], v["tagsense_counts"].join('|'), v["synset_ids"].join('|')] }
-File.open(data_path + "index.dmp",'w') do |file|
-file.write Marshal.dump(index)
-end
-if opts[:build_evocations]
+if opts[:build_pure_evocations]
   score = Hash.new
   score_hash.each { |k,v| score[k] = [v["relations"].join('|'), v["means"].join('|'), v["medians"].join('|')] }
   File.open(data_path + "evocations.dmp",'w') do |file|
-
+    file.write Marshal.dump(score)
   end
-end
 end
+
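The shift-based parsing above mirrors WordNet's flat index file layout: lemma, part of speech, synset count, pointer count, the pointer symbols, sense and tagsense counts, then one synset offset per synset. A standalone sketch of the index path with a hypothetical, shortened index.noun line (offsets invented for illustration):

    # hypothetical index.noun line
    index_line = "bat n 2 3 @ ~ #m 2 1 02806379 05218119"
    index_parts = index_line.split(" ")

    lemma, pos = index_parts.shift, index_parts.shift
    synset_count, pointer_count = index_parts.shift.to_i, index_parts.shift.to_i
    pointer_symbols = Array.new(pointer_count).map { "n" + index_parts.shift }
    sense_count = index_parts.shift
    tagsense_count = pos + index_parts.shift
    synset_ids = Array.new(synset_count).map { "n" + index_parts.shift }

    # lemma      => "bat"
    # synset_ids => ["n02806379", "n05218119"]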
data/examples.rb
CHANGED
@@ -2,40 +2,53 @@

 require 'rubygems'
 require 'words'
+#require 'lib/words.rb'

 if __FILE__ == $0

-
-
-
-
-
-
-
-  puts wordnet.find('bat').nouns?
-  puts wordnet.find('bat').synsets('noun')
-  puts wordnet.find('bat').noun_ids
-  puts wordnet.find('bat').synsets(:noun).last.words.inspect
-  puts wordnet.find('bat').nouns.last.relations
-  wordnet.find('bat').synsets('noun').last.relations.each { |relation| puts relation.inspect }
-  puts wordnet.find('bat').synsets('noun').last.hyponyms?
-  puts wordnet.find('bat').synsets('noun').last.participle_of_verbs?
-
-  puts wordnet.find('bat').synsets('noun').last.relations(:hyponym)
-  puts wordnet.find('bat').synsets('noun').last.hyponyms?
-  puts wordnet.find('bat').synsets('noun').last.relations("~")
-  puts wordnet.find('bat').synsets('verb').last.inspect
-  puts wordnet.find('bat').synsets('verb').last.words
-  puts wordnet.find('bat').synsets('verb').last.words_with_lexical_ids.inspect
-
-  puts wordnet.find('bat').synsets('verb').first.lexical.inspect
-  puts wordnet.find('bat').synsets('verb').first.lexical_description
-
-  puts wordnet.find('jkashdfajkshfksjdhf')
-
-  puts wordnet.find("broadcast").senses.first.evocations
-  puts wordnet.find("broadcast").senses.first.evocations[1].inspect
+  wordnet = Words::Wordnet.new :tokyo
+
+  puts wordnet.connected?
+  wordnet.close!
+  puts wordnet.connected?
+  wordnet.open!
+  puts wordnet.connected?

-
+  puts wordnet
+
+  puts wordnet.find('squash racquet')
+
+  puts wordnet.find('bat')
+  puts wordnet.find('bat').available_pos.inspect
+  puts wordnet.find('bat').lemma
+  puts wordnet.find('bat').nouns?
+  puts wordnet.find('bat').synsets('noun')
+  puts wordnet.find('bat').noun_ids
+  puts wordnet.find('bat').synsets(:noun)[2].words.inspect
+  puts wordnet.find('bat').nouns.last.relations
+  wordnet.find('bat').synsets('noun').last.relations.each { |relation| puts relation.inspect }
+  puts wordnet.find('bat').synsets('noun').last.hyponyms?
+  puts wordnet.find('bat').synsets('noun').last.participle_of_verbs?
+
+  puts wordnet.find('bat').synsets('noun').last.relations(:hyponym)
+  puts wordnet.find('bat').synsets('noun').last.hyponyms?
+  puts wordnet.find('bat').synsets('noun').last.relations("~")
+  puts wordnet.find('bat').synsets('verb').last.inspect
+  puts wordnet.find('bat').synsets('verb').last.words.inspect
+  puts wordnet.find('bat').synsets('verb').last.words_with_lexical_ids.inspect
+
+  puts wordnet.find('bat').synsets('verb').first.lexical.inspect
+  puts wordnet.find('bat').synsets('verb').first.lexical_description
+
+  puts wordnet.find('jkashdfajkshfksjdhf')
+
+  if wordnet.evocations?
+    puts wordnet.find("broadcast").senses.first.evocations
+    puts wordnet.find("broadcast").senses.first.evocations.means
+    puts wordnet.find("broadcast").senses.first.evocations[1].inspect
+    puts wordnet.find("broadcast").senses.first.evocations[20][:destination].words
+  end
+
+  wordnet.close!

 end
data/lib/evocations.rb
ADDED
@@ -0,0 +1,85 @@
+# local includes
+require File.join(File.dirname(__FILE__), 'synset.rb')
+
+module Words
+
+  class Evocations
+
+    def initialize(evocation_construct, source_synset, wordnet_connection)
+
+      @wordnet_connection = wordnet_connection
+      @source = source_synset
+      @evocation_construct = evocation_construct
+
+    end
+
+    def means
+
+      @means ||= @evocation_construct["means"].split('|')
+
+      @means
+
+    end
+
+    def medians
+
+      @medians ||= @evocation_construct["medians"].split('|')
+
+      @medians
+
+    end
+
+    def size
+
+      means.size
+
+    end
+
+    def first
+
+      self[0]
+
+    end
+
+    def last
+
+      self[size-1]
+
+    end
+
+    def [] (index)
+
+      { :destination => Synset.new(destination_ids[index], @wordnet_connection, @source.homographs), :mean => means[index], :median => medians[index] }
+
+    end
+
+    def destinations(pos = :all)
+
+      destination_ids(pos).map { |synset_id| Synset.new synset_id, @wordnet_connection, @source.homographs }
+
+    end
+
+    def destination_ids(pos = :all)
+
+      @destination_ids ||= @evocation_construct["relations"].split('|')
+
+      case
+      when Homographs::SYMBOL_TO_POS.include?(pos.to_sym)
+        @destination_ids.select { |synset_id| synset_id[0,1] == Homographs::SYMBOL_TO_POS[pos.to_sym] }
+      when Homographs::POS_TO_SYMBOL.include?(pos.to_s)
+        @destination_ids.select { |synset_id| synset_id[0,1] == pos.to_s }
+      else
+        @destination_ids
+      end
+
+    end
+
+    def to_s
+
+      "#{size} evocations from the #{@source}"
+
+    end
+
+  end
+
+end
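Evocations behaves like a small read-only collection: `size`, `first`, `last` and `[]` return `{ :destination, :mean, :median }` hashes, while `destinations` materialises the evoked synsets directly. A usage sketch, assuming a connection whose evocation dataset has been built:

    evocations = wordnet.find("broadcast").senses.first.evocations

    evocations.size                    # one entry per stored evocation
    evocations.first[:mean]            # mean similarity score (a string)
    evocations[1][:destination].words  # the synset evoked at index 1
    evocations.destinations(:noun)     # evoked synsets filtered to nouns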
data/lib/homographs.rb
ADDED
@@ -0,0 +1,106 @@
+# local includes
+require File.join(File.dirname(__FILE__), 'synset.rb')
+
+module Words
+
+  class Homographs
+
+    POS_TO_SYMBOL = {"n" => :noun, "v" => :verb, "a" => :adjective, "r" => :adverb}
+    SYMBOL_TO_POS = POS_TO_SYMBOL.invert
+
+    def initialize(raw_homographs, wordnet_connection)
+
+      @wordnet_connection = wordnet_connection
+      @raw_homographs = raw_homographs
+
+      # construct some conveniance menthods for relation type access
+      SYMBOL_TO_POS.keys.each do |pos|
+        self.class.send(:define_method, "#{pos}s?") do
+          size(pos) > 0
+        end
+        self.class.send(:define_method, "#{pos}s") do
+          synsets(pos)
+        end
+        self.class.send(:define_method, "#{pos}_count") do
+          size(pos)
+        end
+        self.class.send(:define_method, "#{pos}_ids") do
+          synset_ids(pos)
+        end
+      end
+
+    end
+
+    def tagsense_counts
+
+      @tagsense_counts ||= @raw_homographs["tagsense_counts"].split('|').map { |count| { POS_TO_SYMBOL[count[0,1]] => count[1..-1].to_i } }
+
+      @tagsense_counts
+
+    end
+
+    def lemma
+
+      @lemma ||= @raw_homographs["lemma"].gsub('_', ' ')
+
+      @lemma
+
+    end
+
+    def available_pos
+
+      @available_pos ||= synset_ids.map { |synset_id| POS_TO_SYMBOL[synset_id[0,1]] }.uniq
+
+      @available_pos
+
+    end
+
+    def to_s
+
+      @to_s ||= [lemma, " " + available_pos.join("/")].join(",")
+
+      @to_s
+
+    end
+
+    def size(pos = :all)
+
+      synset_ids(pos).size
+
+    end
+
+    def synsets(pos = :all)
+
+      synset_ids(pos).map { |synset_id| Synset.new synset_id, @wordnet_connection, self }
+
+    end
+
+    def synset_ids(pos = :all)
+
+      @synset_ids ||= @raw_homographs["synset_ids"].split('|')
+
+      case
+      when SYMBOL_TO_POS.include?(pos.to_sym)
+        @synset_ids.select { |synset_id| synset_id[0,1] == SYMBOL_TO_POS[pos.to_sym] }
+      when POS_TO_SYMBOL.include?(pos.to_s)
+        @synset_ids.select { |synset_id| synset_id[0,1] == pos.to_s }
+      else
+        @synset_ids
+      end
+
+    end
+
+    def inspect
+
+      @raw_homographs.inspect
+
+    end
+
+    alias word lemma
+    alias pos available_pos
+    alias senses synsets
+    alias sense_ids synset_ids
+
+  end
+
+end
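Because the initializer metaprograms a method quartet per part of speech, every Homographs instance responds to `nouns?`/`nouns`/`noun_count`/`noun_ids` and the equivalents for verbs, adjectives and adverbs, on top of the `word`/`pos`/`senses`/`sense_ids` aliases. A brief sketch of what the `define_method` loop yields for a found word:

    homographs = wordnet.find("bat")

    homographs.lemma        # => "bat"
    homographs.to_s         # => e.g. "bat, noun/verb"
    homographs.nouns?       # defined as size(:noun) > 0
    homographs.noun_count   # defined as size(:noun)
    homographs.noun_ids     # defined as synset_ids(:noun)
    homographs.verbs        # defined as synsets(:verb)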