words 0.2.2 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/VERSION CHANGED
@@ -1 +1 @@
1
- 0.2.2
1
+ 0.3.0
data/bin/build_wordnet CHANGED
@@ -7,6 +7,8 @@ require 'pathname'
7
7
  require 'rubygems'
8
8
  require 'trollop'
9
9
  require 'rufus-tokyo'
10
+ require 'zlib'
11
+ require 'net/http'
10
12
 
11
13
  POS_FILE_TYPES = %w{ adj adv noun verb }
12
14
  POS_FILE_TYPE_TO_SHORT = { 'adj' => 'a', 'adv' => 'r', 'noun' => 'n', 'verb' => 'v' }
@@ -26,6 +28,7 @@ opts = Trollop::options do
26
28
  opt :wordnet, "Location of the wordnet dictionary directory", :default => "Search..."
27
29
  opt :build_tokyo, "Build the tokyo dataset?", :default => false
28
30
  opt :build_pure, "Build the pure ruby dataset?", :default => false
31
+ opt :build_evocations, "Build the similarity dataset based on the wordnet evocation project (requires internet connection)", :default => false
29
32
  end
30
33
  Trollop::die :build_tokyo, "You need to specify whether tokyo dataset or pure ruby index building is required" if !opts[:build_tokyo] && !opts[:build_pure]
31
34
  puts "Verbose mode enabled" if (VERBOSE = opts[:verbose])
@@ -104,10 +107,36 @@ POS_FILE_TYPES.each do |file_pos|
104
107
 
105
108
  end
106
109
 
110
+ score_hash = Hash.new
111
+ if opts[:build_evocations]
112
+ puts "Downlaoding score data..." if VERBOSE
113
+ scores_file = data_path + "scores.txt.gz"
114
+ scores_file.delete if scores_file.exist?
115
+ File.open(scores_file,'w') do |file|
116
+ file.write Net::HTTP.get(URI.parse('http://cloud.github.com/downloads/roja/words/scores.txt.gz'))
117
+ end
118
+ abort( "Unable to gather similarities information from http://cloud.github.com/downloads/roja/words/scores.txt.gz... Try again later." ) unless scores_file.exist?
119
+
120
+ puts "Compiling score data..." if VERBOSE
121
+ Zlib::GzipReader.open(scores_file) do |gz|
122
+ gz.each_line do |line|
123
+ mean, median, sense1, sense2 = line.split(',')
124
+ senses = [sense1, sense2].map! { |sense| sense.strip.split('.') }.map! { |sense| index_hash[sense[0]]["synset_ids"].select { |synset_id| synset_id[0,1] == sense[1].gsub("s", "a") }[sense[2].to_i-1] }
125
+ senses.each do |sense|
126
+ relation = (senses - [sense]).first.nil? ? sense : (senses - [sense]).first
127
+ score_name = sense + "s"
128
+ score_hash[score_name] = { "relations" => [], "means" => [], "medians" => [] } if score_hash[score_name].nil?
129
+ score_hash[score_name] = { "relations" => score_hash[score_name]["relations"] << relation, "means" => score_hash[score_name]["means"] << mean, "medians" => score_hash[score_name]["medians"] << median }
130
+ end unless senses.include? nil
131
+ end
132
+ end
133
+ end
134
+
107
135
  if opts[:build_tokyo]
108
136
  tokyo_hash = Rufus::Tokyo::Table.new((data_path + "wordnet.tct").to_s)
109
137
  index_hash.each { |k,v| tokyo_hash[k] = { "lemma" => v["lemma"], "synset_ids" => v["synset_ids"].join('|'), "tagsense_counts" => v["tagsense_counts"].join('|') } }
110
138
  data_hash.each { |k,v| tokyo_hash[k] = v }
139
+ score_hash.each { |k,v| tokyo_hash[k] = { "relations" => v["relations"].join('|'), "means" => v["means"].join('|'), "medians" => v["medians"].join('|') } } if opts[:build_evocations]
111
140
  tokyo_hash.close
112
141
  end
113
142
 
@@ -117,4 +146,11 @@ if opts[:build_pure]
117
146
  File.open(data_path + "index.dmp",'w') do |file|
118
147
  file.write Marshal.dump(index)
119
148
  end
149
+ if opts[:build_evocations]
150
+ score = Hash.new
151
+ score_hash.each { |k,v| score[k] = [v["relations"].join('|'), v["means"].join('|'), v["medians"].join('|')] }
152
+ File.open(data_path + "evocations.dmp",'w') do |file|
153
+ file.write Marshal.dump(score)
154
+ end
155
+ end
120
156
  end
data/examples.rb CHANGED
@@ -26,11 +26,16 @@ if __FILE__ == $0
26
26
  puts wordnet.find('bat').synsets('noun').last.relations("~")
27
27
  puts wordnet.find('bat').synsets('verb').last.inspect
28
28
  puts wordnet.find('bat').synsets('verb').last.words
29
- puts wordnet.find('bat').synsets('verb').last.words_with_num.inspect
29
+ puts wordnet.find('bat').synsets('verb').last.words_with_lexical_ids.inspect
30
30
 
31
31
  puts wordnet.find('bat').synsets('verb').first.lexical.inspect
32
32
  puts wordnet.find('bat').synsets('verb').first.lexical_description
33
33
 
34
+ puts wordnet.find('jkashdfajkshfksjdhf')
35
+
36
+ puts wordnet.find("broadcast").senses.first.evocations
37
+ puts wordnet.find("broadcast").senses.first.evocations[1].inspect
38
+
34
39
  wordnet.close
35
40
 
36
41
  end
data/lib/words.rb CHANGED
@@ -25,9 +25,13 @@ module Words
25
25
  @connected = true
26
26
  elsif @connection_type == :pure
27
27
  # open the index if it is there
28
- File.open(@data_path,'r') do |file|
28
+ File.open(@data_path, 'r') do |file|
29
29
  @connection = Marshal.load file.read
30
30
  end
31
+ evocation_path = Pathname.new("#{File.dirname(__FILE__)}/../data/evocations.dmp")
32
+ File.open(evocation_path, 'r') do |file|
33
+ @evocations = Marshal.load file.read
34
+ end if evocation_path.exist?
31
35
  # search for the wordnet files
32
36
  if locate_wordnet?(wordnet_path)
33
37
  @connected = true
@@ -56,12 +60,25 @@ module Words
56
60
  def homographs(term)
57
61
  if connection_type == :pure
58
62
  raw_homographs = @connection[term]
59
- { 'lemma' => raw_homographs[0], 'tagsense_counts' => raw_homographs[1], 'synset_ids' => raw_homographs[2]}
63
+ { 'lemma' => raw_homographs[0], 'tagsense_counts' => raw_homographs[1], 'synset_ids' => raw_homographs[2]} unless raw_homographs.nil?
60
64
  else
61
65
  @connection[term]
62
66
  end
63
67
  end
64
68
 
69
+ def evocations(senset_id)
70
+ if connection_type == :pure
71
+ if defined? @evocations
72
+ raw_evocations = @evocations[senset_id + "s"]
73
+ { 'relations' => raw_evocations[0], 'means' => raw_evocations[1], 'medians' => raw_evocations[2]} unless raw_evocations.nil?
74
+ else
75
+ nil
76
+ end
77
+ else
78
+ @connection[senset_id + "s"]
79
+ end
80
+ end
81
+
65
82
  def synset(synset_id)
66
83
  if connection_type == :pure
67
84
  pos = synset_id[0,1]
@@ -102,6 +119,62 @@ module Words
102
119
 
103
120
  end
104
121
 
122
+ class Evocations
123
+
124
+ def initialize(evocation_construct, source_synset, wordnet_connection)
125
+ @wordnet_connection = wordnet_connection
126
+ @source = source_synset
127
+ @evocation_construct = evocation_construct
128
+ end
129
+
130
+ def means
131
+ @means = @evocation_construct["means"].split('|') unless defined? @means
132
+ @means
133
+ end
134
+
135
+ def medians
136
+ @medians = @evocation_construct["medians"].split('|') unless defined? @medians
137
+ @medians
138
+ end
139
+
140
+ def size
141
+ means.size
142
+ end
143
+
144
+ def first
145
+ self[0]
146
+ end
147
+
148
+ def last
149
+ self[size-1]
150
+ end
151
+
152
+ def [] (index)
153
+ { :destination => destinations[index], :mean => means[index], :median => medians[index] }
154
+ end
155
+
156
+ def destinations(pos = :all)
157
+ destination_ids(pos).map { |synset_id| Synset.new synset_id, @wordnet_connection, @source.homographs }
158
+ end
159
+
160
+ def destination_ids(pos = :all)
161
+ @destination_ids = @evocation_construct["relations"].split('|') unless defined? @destination_ids
162
+ case
163
+ when Homographs::SYMBOL_TO_POS.include?(pos.to_sym)
164
+ @destination_ids.select { |synset_id| synset_id[0,1] == Homographs::SYMBOL_TO_POS[pos.to_sym] }
165
+ when Homographs::POS_TO_SYMBOL.include?(pos.to_s)
166
+ @destination_ids.select { |synset_id| synset_id[0,1] == pos.to_s }
167
+ else
168
+ @destination_ids
169
+ end
170
+ end
171
+
172
+ def to_s
173
+ "#{size} evocations from #{@source}"
174
+ end
175
+
176
+ end
177
+
105
178
  class Relation
106
179
 
107
180
  RELATION_TO_SYMBOL = { "-c" => :member_of_this_domain_topic, "+" => :derivationally_related_form, "%p" => :part_meronym, "~i" => :instance_hyponym, "@" => :hypernym,
@@ -149,7 +222,7 @@ module Words
149
222
  end
150
223
 
151
224
  def destination
152
- @destination = Synset.new @dest_synset_id, @wordnet_connection unless defined? @destination
225
+ @destination = Synset.new(@dest_synset_id, @wordnet_connection, nil) unless defined? @destination
153
226
  @destination
154
227
  end
155
228
 
@@ -235,12 +308,12 @@ module Words
235
308
  end
236
309
 
237
310
  def words
238
- @words = words_with_num.map { |word_with_num| word_with_num[:word] } unless defined? @words
311
+ @words = words_with_lexical_ids.map { |word_with_num| word_with_num[:word] } unless defined? @words
239
312
  @words
240
313
  end
241
314
 
242
315
  def lexical_ids
243
- @words = words_with_num.map { |word_with_num| word_with_num[:lexical_id] } unless defined? @words
316
+ @words = words_with_lexical_ids.map { |word_with_num| word_with_num[:lexical_id] } unless defined? @words
244
317
  @words
245
318
  end
246
319
 
@@ -301,6 +374,11 @@ module Words
301
374
  end
302
375
  end
303
376
 
377
+ def evocations
378
+ evocations_arr = @wordnet_connection.evocations(synset_id)
379
+ Evocations.new evocations_arr, self, @wordnet_connection unless evocations_arr.nil?
380
+ end
381
+
304
382
  def to_s
305
383
  @to_s = "#{synset_type.to_s.capitalize} including word(s): #{words.map { |word| '"' + word + '"' }.join(', ')} meaning: #{gloss}" unless defined? @to_s
306
384
  @to_s
@@ -315,7 +393,7 @@ module Words
315
393
 
316
394
  def initialize(raw_homographs, wordnet_connection)
317
395
  @wordnet_connection = wordnet_connection
318
- @lemma_hash = raw_homographs
396
+ @raw_homographs = raw_homographs
319
397
  # construct some convenience methods for relation type access
320
398
  SYMBOL_TO_POS.keys.each do |pos|
321
399
  self.class.send(:define_method, "#{pos}s?") do
@@ -358,7 +436,7 @@ module Words
358
436
  end
359
437
 
360
438
  def synsets(pos = :all)
361
- synset_ids(pos).map { |synset_id| Synset.new synset_id, self, @wordnet_connection }
439
+ synset_ids(pos).map { |synset_id| Synset.new synset_id, @wordnet_connection, self }
362
440
  end
363
441
 
364
442
  def synset_ids(pos = :all)
@@ -393,7 +471,8 @@ module Words
393
471
  end
394
472
 
395
473
  def find(word)
396
- Homographs.new @wordnet_connection.homographs(word), @wordnet_connection
474
+ homographs = @wordnet_connection.homographs(word)
475
+ Homographs.new homographs, @wordnet_connection unless homographs.nil?
397
476
  end
398
477
 
399
478
  def connection_type
data/words.gemspec CHANGED
@@ -5,7 +5,7 @@
5
5
 
6
6
  Gem::Specification.new do |s|
7
7
  s.name = %q{words}
8
- s.version = "0.2.2"
8
+ s.version = "0.3.0"
9
9
 
10
10
  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
11
11
  s.authors = ["Roja Buck"]
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: words
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.2
4
+ version: 0.3.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Roja Buck