words 0.2.2 → 0.3.0

Sign up to get free protection for your applications and to get access to all the features.
data/VERSION CHANGED
@@ -1 +1 @@
1
- 0.2.2
1
+ 0.3.0
data/bin/build_wordnet CHANGED
@@ -7,6 +7,8 @@ require 'pathname'
7
7
  require 'rubygems'
8
8
  require 'trollop'
9
9
  require 'rufus-tokyo'
10
+ require 'zlib'
11
+ require 'net/http'
10
12
 
11
13
  POS_FILE_TYPES = %w{ adj adv noun verb }
12
14
  POS_FILE_TYPE_TO_SHORT = { 'adj' => 'a', 'adv' => 'r', 'noun' => 'n', 'verb' => 'v' }
@@ -26,6 +28,7 @@ opts = Trollop::options do
26
28
  opt :wordnet, "Location of the wordnet dictionary directory", :default => "Search..."
27
29
  opt :build_tokyo, "Build the tokyo dataset?", :default => false
28
30
  opt :build_pure, "Build the pure ruby dataset?", :default => false
31
+ opt :build_evocations, "Build the similarity dataset based on the wordnet evocation project (requires internet connection)", :default => false
29
32
  end
30
33
  Trollop::die :build_tokyo, "You need to specify whether tokyo dataset or pure ruby index building is required" if !opts[:build_tokyo] && !opts[:build_pure]
31
34
  puts "Verbose mode enabled" if (VERBOSE = opts[:verbose])
@@ -104,10 +107,36 @@ POS_FILE_TYPES.each do |file_pos|
104
107
 
105
108
  end
106
109
 
110
+ score_hash = Hash.new
111
+ if opts[:build_evocations]
112
+ puts "Downlaoding score data..." if VERBOSE
113
+ scores_file = data_path + "scores.txt.gz"
114
+ scores_file.delete if scores_file.exist?
115
+ File.open(scores_file,'w') do |file|
116
+ file.write Net::HTTP.get(URI.parse('http://cloud.github.com/downloads/roja/words/scores.txt.gz'))
117
+ end
118
+ abort( "Unable to gather similarities information from http://cloud.github.com/downloads/roja/words/scores.txt.gz... Try again later." ) unless scores_file.exist?
119
+
120
+ puts "Compiling score data..." if VERBOSE
121
+ Zlib::GzipReader.open(scores_file) do |gz|
122
+ gz.each_line do |line|
123
+ mean, median, sense1, sense2 = line.split(',')
124
+ senses = [sense1, sense2].map! { |sense| sense.strip.split('.') }.map! { |sense| index_hash[sense[0]]["synset_ids"].select { |synset_id| synset_id[0,1] == sense[1].gsub("s", "a") }[sense[2].to_i-1] }
125
+ senses.each do |sense|
126
+ relation = (senses - [sense]).first.nil? ? sense : (senses - [sense]).first
127
+ score_name = sense + "s"
128
+ score_hash[score_name] = { "relations" => [], "means" => [], "medians" => [] } if score_hash[score_name].nil?
129
+ score_hash[score_name] = { "relations" => score_hash[score_name]["relations"] << relation, "means" => score_hash[score_name]["means"] << mean, "medians" => score_hash[score_name]["medians"] << median }
130
+ end unless senses.include? nil
131
+ end
132
+ end
133
+ end
134
+
107
135
  if opts[:build_tokyo]
108
136
  tokyo_hash = Rufus::Tokyo::Table.new((data_path + "wordnet.tct").to_s)
109
137
  index_hash.each { |k,v| tokyo_hash[k] = { "lemma" => v["lemma"], "synset_ids" => v["synset_ids"].join('|'), "tagsense_counts" => v["tagsense_counts"].join('|') } }
110
138
  data_hash.each { |k,v| tokyo_hash[k] = v }
139
+ score_hash.each { |k,v| tokyo_hash[k] = { "relations" => v["relations"].join('|'), "means" => v["means"].join('|'), "medians" => v["medians"].join('|') } } if opts[:build_evocations]
111
140
  tokyo_hash.close
112
141
  end
113
142
 
@@ -117,4 +146,11 @@ if opts[:build_pure]
117
146
  File.open(data_path + "index.dmp",'w') do |file|
118
147
  file.write Marshal.dump(index)
119
148
  end
149
+ if opts[:build_evocations]
150
+ score = Hash.new
151
+ score_hash.each { |k,v| score[k] = [v["relations"].join('|'), v["means"].join('|'), v["medians"].join('|')] }
152
+ File.open(data_path + "evocations.dmp",'w') do |file|
153
+ file.write Marshal.dump(score)
154
+ end
155
+ end
120
156
  end
data/examples.rb CHANGED
@@ -26,11 +26,16 @@ if __FILE__ == $0
26
26
  puts wordnet.find('bat').synsets('noun').last.relations("~")
27
27
  puts wordnet.find('bat').synsets('verb').last.inspect
28
28
  puts wordnet.find('bat').synsets('verb').last.words
29
- puts wordnet.find('bat').synsets('verb').last.words_with_num.inspect
29
+ puts wordnet.find('bat').synsets('verb').last.words_with_lexical_ids.inspect
30
30
 
31
31
  puts wordnet.find('bat').synsets('verb').first.lexical.inspect
32
32
  puts wordnet.find('bat').synsets('verb').first.lexical_description
33
33
 
34
+ puts wordnet.find('jkashdfajkshfksjdhf')
35
+
36
+ puts wordnet.find("broadcast").senses.first.evocations
37
+ puts wordnet.find("broadcast").senses.first.evocations[1].inspect
38
+
34
39
  wordnet.close
35
40
 
36
41
  end
data/lib/words.rb CHANGED
@@ -25,9 +25,13 @@ module Words
25
25
  @connected = true
26
26
  elsif @connection_type == :pure
27
27
  # open the index if it is there
28
- File.open(@data_path,'r') do |file|
28
+ File.open(@data_path, 'r') do |file|
29
29
  @connection = Marshal.load file.read
30
30
  end
31
+ evocation_path = Pathname.new("#{File.dirname(__FILE__)}/../data/evocations.dmp")
32
+ File.open(evocation_path, 'r') do |file|
33
+ @evocations = Marshal.load file.read
34
+ end if evocation_path.exist?
31
35
  # search for the wordnet files
32
36
  if locate_wordnet?(wordnet_path)
33
37
  @connected = true
@@ -56,12 +60,25 @@ module Words
56
60
  def homographs(term)
57
61
  if connection_type == :pure
58
62
  raw_homographs = @connection[term]
59
- { 'lemma' => raw_homographs[0], 'tagsense_counts' => raw_homographs[1], 'synset_ids' => raw_homographs[2]}
63
+ { 'lemma' => raw_homographs[0], 'tagsense_counts' => raw_homographs[1], 'synset_ids' => raw_homographs[2]} unless raw_homographs.nil?
60
64
  else
61
65
  @connection[term]
62
66
  end
63
67
  end
64
68
 
69
+ def evocations(senset_id)
70
+ if connection_type == :pure
71
+ if defined? @evocations
72
+ raw_evocations = @evocations[senset_id + "s"]
73
+ { 'relations' => raw_evocations[0], 'means' => raw_evocations[1], 'medians' => raw_evocations[2]} unless raw_evocations.nil?
74
+ else
75
+ nil
76
+ end
77
+ else
78
+ @connection[senset_id + "s"]
79
+ end
80
+ end
81
+
65
82
  def synset(synset_id)
66
83
  if connection_type == :pure
67
84
  pos = synset_id[0,1]
@@ -102,6 +119,62 @@ module Words
102
119
 
103
120
  end
104
121
 
122
+ class Evocations
123
+
124
+ def initialize(evocation_construct, source_synset, wordnet_connection)
125
+ @wordnet_connection = wordnet_connection
126
+ @source = source_synset
127
+ @evocation_construct = evocation_construct
128
+ end
129
+
130
+ def means
131
+ @means = @evocation_construct["means"].split('|') unless defined? @means
132
+ @means
133
+ end
134
+
135
+ def medians
136
+ @medians = @evocation_construct["medians"].split('|') unless defined? @medians
137
+ @medians
138
+ end
139
+
140
+ def size
141
+ means.size
142
+ end
143
+
144
+ def first
145
+ self[0]
146
+ end
147
+
148
+ def last
149
+ self[size-1]
150
+ end
151
+
152
+ def [] (index)
153
+ { :destination => destinations[index], :mean => means[index], :median => medians[index] }
154
+ end
155
+
156
+ def destinations(pos = :all)
157
+ destination_ids(pos).map { |synset_id| Synset.new synset_id, @wordnet_connection, @source.homographs }
158
+ end
159
+
160
+ def destination_ids(pos = :all)
161
+ @destination_ids = @evocation_construct["relations"].split('|') unless defined? @destination_ids
162
+ case
163
+ when Homographs::SYMBOL_TO_POS.include?(pos.to_sym)
164
+ @destination_ids.select { |synset_id| synset_id[0,1] == Homographs::SYMBOL_TO_POS[pos.to_sym] }
165
+ when Homographs::POS_TO_SYMBOL.include?(pos.to_s)
166
+ @destination_ids.select { |synset_id| synset_id[0,1] == pos.to_s }
167
+ else
168
+ @destination_ids
169
+ end
170
+ end
171
+
172
+ def to_s
173
+ "#{size} evocations from #{@source}"
174
+ end
175
+
176
+ end
177
+
105
178
  class Relation
106
179
 
107
180
  RELATION_TO_SYMBOL = { "-c" => :member_of_this_domain_topic, "+" => :derivationally_related_form, "%p" => :part_meronym, "~i" => :instance_hyponym, "@" => :hypernym,
@@ -149,7 +222,7 @@ module Words
149
222
  end
150
223
 
151
224
  def destination
152
- @destination = Synset.new @dest_synset_id, @wordnet_connection unless defined? @destination
225
+ @destination = Synset.new(@dest_synset_id, @wordnet_connection, nil) unless defined? @destination
153
226
  @destination
154
227
  end
155
228
 
@@ -235,12 +308,12 @@ module Words
235
308
  end
236
309
 
237
310
  def words
238
- @words = words_with_num.map { |word_with_num| word_with_num[:word] } unless defined? @words
311
+ @words = words_with_lexical_ids.map { |word_with_num| word_with_num[:word] } unless defined? @words
239
312
  @words
240
313
  end
241
314
 
242
315
  def lexical_ids
243
- @words = words_with_num.map { |word_with_num| word_with_num[:lexical_id] } unless defined? @words
316
+ @words = words_with_lexical_ids.map { |word_with_num| word_with_num[:lexical_id] } unless defined? @words
244
317
  @words
245
318
  end
246
319
 
@@ -301,6 +374,11 @@ module Words
301
374
  end
302
375
  end
303
376
 
377
+ def evocations
378
+ evocations_arr = @wordnet_connection.evocations(synset_id)
379
+ Evocations.new evocations_arr, self, @wordnet_connection unless evocations_arr.nil?
380
+ end
381
+
304
382
  def to_s
305
383
  @to_s = "#{synset_type.to_s.capitalize} including word(s): #{words.map { |word| '"' + word + '"' }.join(', ')} meaning: #{gloss}" unless defined? @to_s
306
384
  @to_s
@@ -315,7 +393,7 @@ module Words
315
393
 
316
394
  def initialize(raw_homographs, wordnet_connection)
317
395
  @wordnet_connection = wordnet_connection
318
- @lemma_hash = raw_homographs
396
+ @raw_homographs = raw_homographs
319
397
  # construct some convenience methods for relation type access
320
398
  SYMBOL_TO_POS.keys.each do |pos|
321
399
  self.class.send(:define_method, "#{pos}s?") do
@@ -358,7 +436,7 @@ module Words
358
436
  end
359
437
 
360
438
  def synsets(pos = :all)
361
- synset_ids(pos).map { |synset_id| Synset.new synset_id, self, @wordnet_connection }
439
+ synset_ids(pos).map { |synset_id| Synset.new synset_id, @wordnet_connection, self }
362
440
  end
363
441
 
364
442
  def synset_ids(pos = :all)
@@ -393,7 +471,8 @@ module Words
393
471
  end
394
472
 
395
473
  def find(word)
396
- Homographs.new @wordnet_connection.homographs(word), @wordnet_connection
474
+ homographs = @wordnet_connection.homographs(word)
475
+ Homographs.new homographs, @wordnet_connection unless homographs.nil?
397
476
  end
398
477
 
399
478
  def connection_type
data/words.gemspec CHANGED
@@ -5,7 +5,7 @@
5
5
 
6
6
  Gem::Specification.new do |s|
7
7
  s.name = %q{words}
8
- s.version = "0.2.2"
8
+ s.version = "0.3.0"
9
9
 
10
10
  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
11
11
  s.authors = ["Roja Buck"]
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: words
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.2
4
+ version: 0.3.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Roja Buck