words 0.2.2 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/VERSION +1 -1
- data/bin/build_wordnet +36 -0
- data/examples.rb +6 -1
- data/lib/words.rb +87 -8
- data/words.gemspec +1 -1
- metadata +1 -1
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.
|
1
|
+
0.3.0
|
data/bin/build_wordnet
CHANGED
@@ -7,6 +7,8 @@ require 'pathname'
|
|
7
7
|
require 'rubygems'
|
8
8
|
require 'trollop'
|
9
9
|
require 'rufus-tokyo'
|
10
|
+
require 'zlib'
|
11
|
+
require 'net/http'
|
10
12
|
|
11
13
|
POS_FILE_TYPES = %w{ adj adv noun verb }
|
12
14
|
POS_FILE_TYPE_TO_SHORT = { 'adj' => 'a', 'adv' => 'r', 'noun' => 'n', 'verb' => 'v' }
|
@@ -26,6 +28,7 @@ opts = Trollop::options do
|
|
26
28
|
opt :wordnet, "Location of the wordnet dictionary directory", :default => "Search..."
|
27
29
|
opt :build_tokyo, "Build the tokyo dataset?", :default => false
|
28
30
|
opt :build_pure, "Build the pure ruby dataset?", :default => false
|
31
|
+
opt :build_evocations, "Build the similarity dataset based on the wordnet evocation project (requires internet connection)", :default => false
|
29
32
|
end
|
30
33
|
Trollop::die :build_tokyo, "You need to specify whether tokyo dataset or pure ruby index building is required" if !opts[:build_tokyo] && !opts[:build_pure]
|
31
34
|
puts "Verbose mode enabled" if (VERBOSE = opts[:verbose])
|
@@ -104,10 +107,36 @@ POS_FILE_TYPES.each do |file_pos|
|
|
104
107
|
|
105
108
|
end
|
106
109
|
|
110
|
+
score_hash = Hash.new
|
111
|
+
if opts[:build_evocations]
|
112
|
+
puts "Downlaoding score data..." if VERBOSE
|
113
|
+
scores_file = data_path + "scores.txt.gz"
|
114
|
+
scores_file.delete if scores_file.exist?
|
115
|
+
File.open(scores_file,'w') do |file|
|
116
|
+
file.write Net::HTTP.get(URI.parse('http://cloud.github.com/downloads/roja/words/scores.txt.gz'))
|
117
|
+
end
|
118
|
+
abort( "Unable to gather similarities information from http://cloud.github.com/downloads/roja/words/scores.txt.gz... Try again later." ) unless scores_file.exist?
|
119
|
+
|
120
|
+
puts "Compiling score data..." if VERBOSE
|
121
|
+
Zlib::GzipReader.open(scores_file) do |gz|
|
122
|
+
gz.each_line do |line|
|
123
|
+
mean, median, sense1, sense2 = line.split(',')
|
124
|
+
senses = [sense1, sense2].map! { |sense| sense.strip.split('.') }.map! { |sense| index_hash[sense[0]]["synset_ids"].select { |synset_id| synset_id[0,1] == sense[1].gsub("s", "a") }[sense[2].to_i-1] }
|
125
|
+
senses.each do |sense|
|
126
|
+
relation = (senses - [sense]).first.nil? ? sense : (senses - [sense]).first
|
127
|
+
score_name = sense + "s"
|
128
|
+
score_hash[score_name] = { "relations" => [], "means" => [], "medians" => [] } if score_hash[score_name].nil?
|
129
|
+
score_hash[score_name] = { "relations" => score_hash[score_name]["relations"] << relation, "means" => score_hash[score_name]["means"] << mean, "medians" => score_hash[score_name]["medians"] << median }
|
130
|
+
end unless senses.include? nil
|
131
|
+
end
|
132
|
+
end
|
133
|
+
end
|
134
|
+
|
107
135
|
if opts[:build_tokyo]
|
108
136
|
tokyo_hash = Rufus::Tokyo::Table.new((data_path + "wordnet.tct").to_s)
|
109
137
|
index_hash.each { |k,v| tokyo_hash[k] = { "lemma" => v["lemma"], "synset_ids" => v["synset_ids"].join('|'), "tagsense_counts" => v["tagsense_counts"].join('|') } }
|
110
138
|
data_hash.each { |k,v| tokyo_hash[k] = v }
|
139
|
+
score_hash.each { |k,v| tokyo_hash[k] = { "relations" => v["relations"].join('|'), "means" => v["means"].join('|'), "medians" => v["medians"].join('|') } } if opts[:build_evocations]
|
111
140
|
tokyo_hash.close
|
112
141
|
end
|
113
142
|
|
@@ -117,4 +146,11 @@ if opts[:build_pure]
|
|
117
146
|
File.open(data_path + "index.dmp",'w') do |file|
|
118
147
|
file.write Marshal.dump(index)
|
119
148
|
end
|
149
|
+
if opts[:build_evocations]
|
150
|
+
score = Hash.new
|
151
|
+
score_hash.each { |k,v| score[k] = [v["relations"].join('|'), v["means"].join('|'), v["medians"].join('|')] }
|
152
|
+
File.open(data_path + "evocations.dmp",'w') do |file|
|
153
|
+
file.write Marshal.dump(score)
|
154
|
+
end
|
155
|
+
end
|
120
156
|
end
|
data/examples.rb
CHANGED
@@ -26,11 +26,16 @@ if __FILE__ == $0
|
|
26
26
|
puts wordnet.find('bat').synsets('noun').last.relations("~")
|
27
27
|
puts wordnet.find('bat').synsets('verb').last.inspect
|
28
28
|
puts wordnet.find('bat').synsets('verb').last.words
|
29
|
-
puts wordnet.find('bat').synsets('verb').last.
|
29
|
+
puts wordnet.find('bat').synsets('verb').last.words_with_lexical_ids.inspect
|
30
30
|
|
31
31
|
puts wordnet.find('bat').synsets('verb').first.lexical.inspect
|
32
32
|
puts wordnet.find('bat').synsets('verb').first.lexical_description
|
33
33
|
|
34
|
+
puts wordnet.find('jkashdfajkshfksjdhf')
|
35
|
+
|
36
|
+
puts wordnet.find("broadcast").senses.first.evocations
|
37
|
+
puts wordnet.find("broadcast").senses.first.evocations[1].inspect
|
38
|
+
|
34
39
|
wordnet.close
|
35
40
|
|
36
41
|
end
|
data/lib/words.rb
CHANGED
@@ -25,9 +25,13 @@ module Words
|
|
25
25
|
@connected = true
|
26
26
|
elsif @connection_type == :pure
|
27
27
|
# open the index if it is there
|
28
|
-
File.open(@data_path,'r') do |file|
|
28
|
+
File.open(@data_path, 'r') do |file|
|
29
29
|
@connection = Marshal.load file.read
|
30
30
|
end
|
31
|
+
evocation_path = Pathname.new("#{File.dirname(__FILE__)}/../data/evocations.dmp")
|
32
|
+
File.open(evocation_path, 'r') do |file|
|
33
|
+
@evocations = Marshal.load file.read
|
34
|
+
end if evocation_path.exist?
|
31
35
|
# search for the wordnet files
|
32
36
|
if locate_wordnet?(wordnet_path)
|
33
37
|
@connected = true
|
@@ -56,12 +60,25 @@ module Words
|
|
56
60
|
def homographs(term)
|
57
61
|
if connection_type == :pure
|
58
62
|
raw_homographs = @connection[term]
|
59
|
-
{ 'lemma' => raw_homographs[0], 'tagsense_counts' => raw_homographs[1], 'synset_ids' => raw_homographs[2]}
|
63
|
+
{ 'lemma' => raw_homographs[0], 'tagsense_counts' => raw_homographs[1], 'synset_ids' => raw_homographs[2]} unless raw_homographs.nil?
|
60
64
|
else
|
61
65
|
@connection[term]
|
62
66
|
end
|
63
67
|
end
|
64
68
|
|
69
|
+
def evocations(senset_id)
|
70
|
+
if connection_type == :pure
|
71
|
+
if defined? @evocations
|
72
|
+
raw_evocations = @evocations[senset_id + "s"]
|
73
|
+
{ 'relations' => raw_evocations[0], 'means' => raw_evocations[1], 'medians' => raw_evocations[2]} unless raw_evocations.nil?
|
74
|
+
else
|
75
|
+
nil
|
76
|
+
end
|
77
|
+
else
|
78
|
+
@connection[senset_id + "s"]
|
79
|
+
end
|
80
|
+
end
|
81
|
+
|
65
82
|
def synset(synset_id)
|
66
83
|
if connection_type == :pure
|
67
84
|
pos = synset_id[0,1]
|
@@ -102,6 +119,62 @@ module Words
|
|
102
119
|
|
103
120
|
end
|
104
121
|
|
122
|
+
class Evocations
|
123
|
+
|
124
|
+
def initialize(evocation_construct, source_synset, wordnet_connection)
|
125
|
+
@wordnet_connection = wordnet_connection
|
126
|
+
@source = source_synset
|
127
|
+
@evocation_construct = evocation_construct
|
128
|
+
end
|
129
|
+
|
130
|
+
def means
|
131
|
+
@means = @evocation_construct["means"].split('|') unless defined? @means
|
132
|
+
@means
|
133
|
+
end
|
134
|
+
|
135
|
+
def medians
|
136
|
+
@medians = @evocation_construct["medians"].split('|') unless defined? @medians
|
137
|
+
@medians
|
138
|
+
end
|
139
|
+
|
140
|
+
def size
|
141
|
+
means.size
|
142
|
+
end
|
143
|
+
|
144
|
+
def first
|
145
|
+
self[0]
|
146
|
+
end
|
147
|
+
|
148
|
+
def last
|
149
|
+
self[size-1]
|
150
|
+
end
|
151
|
+
|
152
|
+
def [] (index)
|
153
|
+
{ :destination => destinations[index], :mean => means[index], :median => medians[index] }
|
154
|
+
end
|
155
|
+
|
156
|
+
def destinations(pos = :all)
|
157
|
+
destination_ids(pos).map { |synset_id| Synset.new synset_id, @wordnet_connection, @source.homographs }
|
158
|
+
end
|
159
|
+
|
160
|
+
def destination_ids(pos = :all)
|
161
|
+
@destination_ids = @evocation_construct["relations"].split('|') unless defined? @destination_ids
|
162
|
+
case
|
163
|
+
when Homographs::SYMBOL_TO_POS.include?(pos.to_sym)
|
164
|
+
@destination_ids.select { |synset_id| synset_id[0,1] == Homographs::SYMBOL_TO_POS[pos.to_sym] }
|
165
|
+
when Homographs::POS_TO_SYMBOL.include?(pos.to_s)
|
166
|
+
@destination_ids.select { |synset_id| synset_id[0,1] == pos.to_s }
|
167
|
+
else
|
168
|
+
@destination_ids
|
169
|
+
end
|
170
|
+
end
|
171
|
+
|
172
|
+
def to_s
|
173
|
+
"#{size} evocations from #{@source}"
|
174
|
+
end
|
175
|
+
|
176
|
+
end
|
177
|
+
|
105
178
|
class Relation
|
106
179
|
|
107
180
|
RELATION_TO_SYMBOL = { "-c" => :member_of_this_domain_topic, "+" => :derivationally_related_form, "%p" => :part_meronym, "~i" => :instance_hyponym, "@" => :hypernym,
|
@@ -149,7 +222,7 @@ module Words
|
|
149
222
|
end
|
150
223
|
|
151
224
|
def destination
|
152
|
-
@destination = Synset.new
|
225
|
+
@destination = Synset.new(@dest_synset_id, @wordnet_connection, nil) unless defined? @destination
|
153
226
|
@destination
|
154
227
|
end
|
155
228
|
|
@@ -235,12 +308,12 @@ module Words
|
|
235
308
|
end
|
236
309
|
|
237
310
|
def words
|
238
|
-
@words =
|
311
|
+
@words = words_with_lexical_ids.map { |word_with_num| word_with_num[:word] } unless defined? @words
|
239
312
|
@words
|
240
313
|
end
|
241
314
|
|
242
315
|
def lexical_ids
|
243
|
-
@words =
|
316
|
+
@words = words_with_lexical_ids.map { |word_with_num| word_with_num[:lexical_id] } unless defined? @words
|
244
317
|
@words
|
245
318
|
end
|
246
319
|
|
@@ -301,6 +374,11 @@ module Words
|
|
301
374
|
end
|
302
375
|
end
|
303
376
|
|
377
|
+
def evocations
|
378
|
+
evocations_arr = @wordnet_connection.evocations(synset_id)
|
379
|
+
Evocations.new evocations_arr, self, @wordnet_connection unless evocations_arr.nil?
|
380
|
+
end
|
381
|
+
|
304
382
|
def to_s
|
305
383
|
@to_s = "#{synset_type.to_s.capitalize} including word(s): #{words.map { |word| '"' + word + '"' }.join(', ')} meaning: #{gloss}" unless defined? @to_s
|
306
384
|
@to_s
|
@@ -315,7 +393,7 @@ module Words
|
|
315
393
|
|
316
394
|
def initialize(raw_homographs, wordnet_connection)
|
317
395
|
@wordnet_connection = wordnet_connection
|
318
|
-
@
|
396
|
+
@raw_homographs = raw_homographs
|
319
397
|
# construct some convenience methods for relation type access
|
320
398
|
SYMBOL_TO_POS.keys.each do |pos|
|
321
399
|
self.class.send(:define_method, "#{pos}s?") do
|
@@ -358,7 +436,7 @@ module Words
|
|
358
436
|
end
|
359
437
|
|
360
438
|
def synsets(pos = :all)
|
361
|
-
synset_ids(pos).map { |synset_id| Synset.new synset_id,
|
439
|
+
synset_ids(pos).map { |synset_id| Synset.new synset_id, @wordnet_connection, self }
|
362
440
|
end
|
363
441
|
|
364
442
|
def synset_ids(pos = :all)
|
@@ -393,7 +471,8 @@ module Words
|
|
393
471
|
end
|
394
472
|
|
395
473
|
def find(word)
|
396
|
-
|
474
|
+
homographs = @wordnet_connection.homographs(word)
|
475
|
+
Homographs.new homographs, @wordnet_connection unless homographs.nil?
|
397
476
|
end
|
398
477
|
|
399
478
|
def connection_type
|
data/words.gemspec
CHANGED