words 0.2.2 → 0.3.0
Sign up to get free protection for your applications and to get access to all the features.
- data/VERSION +1 -1
- data/bin/build_wordnet +36 -0
- data/examples.rb +6 -1
- data/lib/words.rb +87 -8
- data/words.gemspec +1 -1
- metadata +1 -1
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.2.2
|
1
|
+
0.3.0
|
data/bin/build_wordnet
CHANGED
@@ -7,6 +7,8 @@ require 'pathname'
|
|
7
7
|
require 'rubygems'
|
8
8
|
require 'trollop'
|
9
9
|
require 'rufus-tokyo'
|
10
|
+
require 'zlib'
|
11
|
+
require 'net/http'
|
10
12
|
|
11
13
|
POS_FILE_TYPES = %w{ adj adv noun verb }
|
12
14
|
POS_FILE_TYPE_TO_SHORT = { 'adj' => 'a', 'adv' => 'r', 'noun' => 'n', 'verb' => 'v' }
|
@@ -26,6 +28,7 @@ opts = Trollop::options do
|
|
26
28
|
opt :wordnet, "Location of the wordnet dictionary directory", :default => "Search..."
|
27
29
|
opt :build_tokyo, "Build the tokyo dataset?", :default => false
|
28
30
|
opt :build_pure, "Build the pure ruby dataset?", :default => false
|
31
|
+
opt :build_evocations, "Build the similarity dataset based on the wordnet evocation project (requires internet connection)", :default => false
|
29
32
|
end
|
30
33
|
Trollop::die :build_tokyo, "You need to specify whether tokyo dataset or pure ruby index building is required" if !opts[:build_tokyo] && !opts[:build_pure]
|
31
34
|
puts "Verbose mode enabled" if (VERBOSE = opts[:verbose])
|
@@ -104,10 +107,36 @@ POS_FILE_TYPES.each do |file_pos|
|
|
104
107
|
|
105
108
|
end
|
106
109
|
|
110
|
+
score_hash = Hash.new
|
111
|
+
if opts[:build_evocations]
|
112
|
+
puts "Downlaoding score data..." if VERBOSE
|
113
|
+
scores_file = data_path + "scores.txt.gz"
|
114
|
+
scores_file.delete if scores_file.exist?
|
115
|
+
File.open(scores_file,'w') do |file|
|
116
|
+
file.write Net::HTTP.get(URI.parse('http://cloud.github.com/downloads/roja/words/scores.txt.gz'))
|
117
|
+
end
|
118
|
+
abort( "Unable to gather similarities information from http://cloud.github.com/downloads/roja/words/scores.txt.gz... Try again later." ) unless scores_file.exist?
|
119
|
+
|
120
|
+
puts "Compiling score data..." if VERBOSE
|
121
|
+
Zlib::GzipReader.open(scores_file) do |gz|
|
122
|
+
gz.each_line do |line|
|
123
|
+
mean, median, sense1, sense2 = line.split(',')
|
124
|
+
senses = [sense1, sense2].map! { |sense| sense.strip.split('.') }.map! { |sense| index_hash[sense[0]]["synset_ids"].select { |synset_id| synset_id[0,1] == sense[1].gsub("s", "a") }[sense[2].to_i-1] }
|
125
|
+
senses.each do |sense|
|
126
|
+
relation = (senses - [sense]).first.nil? ? sense : (senses - [sense]).first
|
127
|
+
score_name = sense + "s"
|
128
|
+
score_hash[score_name] = { "relations" => [], "means" => [], "medians" => [] } if score_hash[score_name].nil?
|
129
|
+
score_hash[score_name] = { "relations" => score_hash[score_name]["relations"] << relation, "means" => score_hash[score_name]["means"] << mean, "medians" => score_hash[score_name]["medians"] << median }
|
130
|
+
end unless senses.include? nil
|
131
|
+
end
|
132
|
+
end
|
133
|
+
end
|
134
|
+
|
107
135
|
if opts[:build_tokyo]
|
108
136
|
tokyo_hash = Rufus::Tokyo::Table.new((data_path + "wordnet.tct").to_s)
|
109
137
|
index_hash.each { |k,v| tokyo_hash[k] = { "lemma" => v["lemma"], "synset_ids" => v["synset_ids"].join('|'), "tagsense_counts" => v["tagsense_counts"].join('|') } }
|
110
138
|
data_hash.each { |k,v| tokyo_hash[k] = v }
|
139
|
+
score_hash.each { |k,v| tokyo_hash[k] = { "relations" => v["relations"].join('|'), "means" => v["means"].join('|'), "medians" => v["medians"].join('|') } } if opts[:build_evocations]
|
111
140
|
tokyo_hash.close
|
112
141
|
end
|
113
142
|
|
@@ -117,4 +146,11 @@ if opts[:build_pure]
|
|
117
146
|
File.open(data_path + "index.dmp",'w') do |file|
|
118
147
|
file.write Marshal.dump(index)
|
119
148
|
end
|
149
|
+
if opts[:build_evocations]
|
150
|
+
score = Hash.new
|
151
|
+
score_hash.each { |k,v| score[k] = [v["relations"].join('|'), v["means"].join('|'), v["medians"].join('|')] }
|
152
|
+
File.open(data_path + "evocations.dmp",'w') do |file|
|
153
|
+
file.write Marshal.dump(score)
|
154
|
+
end
|
155
|
+
end
|
120
156
|
end
|
data/examples.rb
CHANGED
@@ -26,11 +26,16 @@ if __FILE__ == $0
|
|
26
26
|
puts wordnet.find('bat').synsets('noun').last.relations("~")
|
27
27
|
puts wordnet.find('bat').synsets('verb').last.inspect
|
28
28
|
puts wordnet.find('bat').synsets('verb').last.words
|
29
|
-
puts wordnet.find('bat').synsets('verb').last.
|
29
|
+
puts wordnet.find('bat').synsets('verb').last.words_with_lexical_ids.inspect
|
30
30
|
|
31
31
|
puts wordnet.find('bat').synsets('verb').first.lexical.inspect
|
32
32
|
puts wordnet.find('bat').synsets('verb').first.lexical_description
|
33
33
|
|
34
|
+
puts wordnet.find('jkashdfajkshfksjdhf')
|
35
|
+
|
36
|
+
puts wordnet.find("broadcast").senses.first.evocations
|
37
|
+
puts wordnet.find("broadcast").senses.first.evocations[1].inspect
|
38
|
+
|
34
39
|
wordnet.close
|
35
40
|
|
36
41
|
end
|
data/lib/words.rb
CHANGED
@@ -25,9 +25,13 @@ module Words
|
|
25
25
|
@connected = true
|
26
26
|
elsif @connection_type == :pure
|
27
27
|
# open the index if it is there
|
28
|
-
File.open(@data_path,'r') do |file|
|
28
|
+
File.open(@data_path, 'r') do |file|
|
29
29
|
@connection = Marshal.load file.read
|
30
30
|
end
|
31
|
+
evocation_path = Pathname.new("#{File.dirname(__FILE__)}/../data/evocations.dmp")
|
32
|
+
File.open(evocation_path, 'r') do |file|
|
33
|
+
@evocations = Marshal.load file.read
|
34
|
+
end if evocation_path.exist?
|
31
35
|
# search for the wordnet files
|
32
36
|
if locate_wordnet?(wordnet_path)
|
33
37
|
@connected = true
|
@@ -56,12 +60,25 @@ module Words
|
|
56
60
|
def homographs(term)
|
57
61
|
if connection_type == :pure
|
58
62
|
raw_homographs = @connection[term]
|
59
|
-
{ 'lemma' => raw_homographs[0], 'tagsense_counts' => raw_homographs[1], 'synset_ids' => raw_homographs[2]}
|
63
|
+
{ 'lemma' => raw_homographs[0], 'tagsense_counts' => raw_homographs[1], 'synset_ids' => raw_homographs[2]} unless raw_homographs.nil?
|
60
64
|
else
|
61
65
|
@connection[term]
|
62
66
|
end
|
63
67
|
end
|
64
68
|
|
69
|
+
def evocations(senset_id)
|
70
|
+
if connection_type == :pure
|
71
|
+
if defined? @evocations
|
72
|
+
raw_evocations = @evocations[senset_id + "s"]
|
73
|
+
{ 'relations' => raw_evocations[0], 'means' => raw_evocations[1], 'medians' => raw_evocations[2]} unless raw_evocations.nil?
|
74
|
+
else
|
75
|
+
nil
|
76
|
+
end
|
77
|
+
else
|
78
|
+
@connection[senset_id + "s"]
|
79
|
+
end
|
80
|
+
end
|
81
|
+
|
65
82
|
def synset(synset_id)
|
66
83
|
if connection_type == :pure
|
67
84
|
pos = synset_id[0,1]
|
@@ -102,6 +119,62 @@ module Words
|
|
102
119
|
|
103
120
|
end
|
104
121
|
|
122
|
+
class Evocations
|
123
|
+
|
124
|
+
def initialize(evocation_construct, source_synset, wordnet_connection)
|
125
|
+
@wordnet_connection = wordnet_connection
|
126
|
+
@source = source_synset
|
127
|
+
@evocation_construct = evocation_construct
|
128
|
+
end
|
129
|
+
|
130
|
+
def means
|
131
|
+
@means = @evocation_construct["means"].split('|') unless defined? @means
|
132
|
+
@means
|
133
|
+
end
|
134
|
+
|
135
|
+
def medians
|
136
|
+
@medians = @evocation_construct["medians"].split('|') unless defined? @medians
|
137
|
+
@medians
|
138
|
+
end
|
139
|
+
|
140
|
+
def size
|
141
|
+
means.size
|
142
|
+
end
|
143
|
+
|
144
|
+
def first
|
145
|
+
self[0]
|
146
|
+
end
|
147
|
+
|
148
|
+
def last
|
149
|
+
self[size-1]
|
150
|
+
end
|
151
|
+
|
152
|
+
def [] (index)
|
153
|
+
{ :destination => destinations[index], :mean => means[index], :median => medians[index] }
|
154
|
+
end
|
155
|
+
|
156
|
+
def destinations(pos = :all)
|
157
|
+
destination_ids(pos).map { |synset_id| Synset.new synset_id, @wordnet_connection, @source.homographs }
|
158
|
+
end
|
159
|
+
|
160
|
+
def destination_ids(pos = :all)
|
161
|
+
@destination_ids = @evocation_construct["relations"].split('|') unless defined? @destination_ids
|
162
|
+
case
|
163
|
+
when Homographs::SYMBOL_TO_POS.include?(pos.to_sym)
|
164
|
+
@destination_ids.select { |synset_id| synset_id[0,1] == Homographs::SYMBOL_TO_POS[pos.to_sym] }
|
165
|
+
when Homographs::POS_TO_SYMBOL.include?(pos.to_s)
|
166
|
+
@destination_ids.select { |synset_id| synset_id[0,1] == pos.to_s }
|
167
|
+
else
|
168
|
+
@destination_ids
|
169
|
+
end
|
170
|
+
end
|
171
|
+
|
172
|
+
def to_s
|
173
|
+
"#{size} evocations from #{@source}"
|
174
|
+
end
|
175
|
+
|
176
|
+
end
|
177
|
+
|
105
178
|
class Relation
|
106
179
|
|
107
180
|
RELATION_TO_SYMBOL = { "-c" => :member_of_this_domain_topic, "+" => :derivationally_related_form, "%p" => :part_meronym, "~i" => :instance_hyponym, "@" => :hypernym,
|
@@ -149,7 +222,7 @@ module Words
|
|
149
222
|
end
|
150
223
|
|
151
224
|
def destination
|
152
|
-
@destination = Synset.new
|
225
|
+
@destination = Synset.new(@dest_synset_id, @wordnet_connection, nil) unless defined? @destination
|
153
226
|
@destination
|
154
227
|
end
|
155
228
|
|
@@ -235,12 +308,12 @@ module Words
|
|
235
308
|
end
|
236
309
|
|
237
310
|
def words
|
238
|
-
@words =
|
311
|
+
@words = words_with_lexical_ids.map { |word_with_num| word_with_num[:word] } unless defined? @words
|
239
312
|
@words
|
240
313
|
end
|
241
314
|
|
242
315
|
def lexical_ids
|
243
|
-
@words =
|
316
|
+
@words = words_with_lexical_ids.map { |word_with_num| word_with_num[:lexical_id] } unless defined? @words
|
244
317
|
@words
|
245
318
|
end
|
246
319
|
|
@@ -301,6 +374,11 @@ module Words
|
|
301
374
|
end
|
302
375
|
end
|
303
376
|
|
377
|
+
def evocations
|
378
|
+
evocations_arr = @wordnet_connection.evocations(synset_id)
|
379
|
+
Evocations.new evocations_arr, self, @wordnet_connection unless evocations_arr.nil?
|
380
|
+
end
|
381
|
+
|
304
382
|
def to_s
|
305
383
|
@to_s = "#{synset_type.to_s.capitalize} including word(s): #{words.map { |word| '"' + word + '"' }.join(', ')} meaning: #{gloss}" unless defined? @to_s
|
306
384
|
@to_s
|
@@ -315,7 +393,7 @@ module Words
|
|
315
393
|
|
316
394
|
def initialize(raw_homographs, wordnet_connection)
|
317
395
|
@wordnet_connection = wordnet_connection
|
318
|
-
@
|
396
|
+
@raw_homographs = raw_homographs
|
319
397
|
# construct some convenience methods for relation type access
|
320
398
|
SYMBOL_TO_POS.keys.each do |pos|
|
321
399
|
self.class.send(:define_method, "#{pos}s?") do
|
@@ -358,7 +436,7 @@ module Words
|
|
358
436
|
end
|
359
437
|
|
360
438
|
def synsets(pos = :all)
|
361
|
-
synset_ids(pos).map { |synset_id| Synset.new synset_id,
|
439
|
+
synset_ids(pos).map { |synset_id| Synset.new synset_id, @wordnet_connection, self }
|
362
440
|
end
|
363
441
|
|
364
442
|
def synset_ids(pos = :all)
|
@@ -393,7 +471,8 @@ module Words
|
|
393
471
|
end
|
394
472
|
|
395
473
|
def find(word)
|
396
|
-
|
474
|
+
homographs = @wordnet_connection.homographs(word)
|
475
|
+
Homographs.new homographs, @wordnet_connection unless homographs.nil?
|
397
476
|
end
|
398
477
|
|
399
478
|
def connection_type
|
data/words.gemspec
CHANGED