words 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- data/.gitignore +26 -0
- data/LICENSE +20 -0
- data/README.markdown +85 -0
- data/Rakefile +54 -0
- data/VERSION +1 -0
- data/build_dataset.rb +102 -0
- data/data/wordnet.tct +0 -0
- data/examples.rb +28 -0
- data/lib/words.rb +229 -0
- data/test/helper.rb +9 -0
- data/test/test_words.rb +7 -0
- metadata +86 -0
data/.gitignore
ADDED
@@ -0,0 +1,26 @@
|
|
1
|
+
## MAC OS
|
2
|
+
.DS_Store
|
3
|
+
|
4
|
+
## TEXTMATE
|
5
|
+
*.tmproj
|
6
|
+
tmtags
|
7
|
+
|
8
|
+
## EMACS
|
9
|
+
*~
|
10
|
+
\#*
|
11
|
+
.\#*
|
12
|
+
|
13
|
+
## VIM
|
14
|
+
*.swp
|
15
|
+
|
16
|
+
## PROJECT::GENERAL
|
17
|
+
coverage
|
18
|
+
rdoc
|
19
|
+
pkg
|
20
|
+
|
21
|
+
## ECLIPSE
|
22
|
+
.loadpath
|
23
|
+
.project
|
24
|
+
.document
|
25
|
+
|
26
|
+
## PROJECT::SPECIFIC
|
data/LICENSE
ADDED
@@ -0,0 +1,20 @@
|
|
1
|
+
Copyright (c) 2009 Roja Buck
|
2
|
+
|
3
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
4
|
+
a copy of this software and associated documentation files (the
|
5
|
+
"Software"), to deal in the Software without restriction, including
|
6
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
7
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
8
|
+
permit persons to whom the Software is furnished to do so, subject to
|
9
|
+
the following conditions:
|
10
|
+
|
11
|
+
The above copyright notice and this permission notice shall be
|
12
|
+
included in all copies or substantial portions of the Software.
|
13
|
+
|
14
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
15
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
16
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
17
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
18
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
19
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
20
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README.markdown
ADDED
@@ -0,0 +1,85 @@
|
|
1
|
+
# Words - A fast, easy to use interface to WordNet® with cross ruby distribution compatibility. #
|
2
|
+
|
3
|
+
## About ##
|
4
|
+
|
5
|
+
Words implements a fast interface to [Wordnet®](http://wordnet.princeton.edu) which makes use of [Tokyo Cabinet](http://1978th.net/tokyocabinet/) and an FFI interface, [rufus-tokyo](http://github.com/jmettraux/rufus-tokyo), to provide cross ruby distribution compatibility and blistering speed. I have attempted to provide ease of use in the form of a simple yet powerful API, and installation is a cinch; we even include the data in its Tokyo Cabinet data format (subject to the original WordNet licensing).
|
6
|
+
|
7
|
+
## Installation ##
|
8
|
+
|
9
|
+
First ensure you have [Tokyo Cabinet](http://1978th.net/tokyocabinet/) installed. It should be nice and easy...
|
10
|
+
|
11
|
+
After this it should be just a gem to install. For those of you with old rubygems versions first:
|
12
|
+
|
13
|
+
gem install gemcutter # These two steps are only necessary if you haven't
|
14
|
+
gem tumble # yet installed the gemcutter tools
|
15
|
+
|
16
|
+
Otherwise, and afterwards, it's simply:
|
17
|
+
|
18
|
+
gem install words
|
19
|
+
|
20
|
+
Then you're ready to rock and roll. :)
|
21
|
+
|
22
|
+
## Build Data (Optional) ##
|
23
|
+
|
24
|
+
If you want to build the wordnet dataset file yourself, from the original wordnet files, you can use the bundled "build_dataset.rb" script:
|
25
|
+
|
26
|
+
./build_dataset.rb -h #this will give you the usage
|
27
|
+
sudo ./build_dataset.rb #this will attempt to build the data locating the original wordnet files through a search...
|
28
|
+
|
29
|
+
## Usage ##
|
30
|
+
|
31
|
+
Here are a few little examples of using words within your programs.
|
32
|
+
|
33
|
+
|
34
|
+
require 'rubygems'
|
35
|
+
require 'words'
|
36
|
+
|
37
|
+
data = Words::Words.new
|
38
|
+
|
39
|
+
# locate a word
|
40
|
+
lemma = data.find("bat")
|
41
|
+
|
42
|
+
lemma.to_s # => bat, noun/verb
|
43
|
+
lemma.available_pos.inspect # => [:noun, :verb]
|
44
|
+
|
45
|
+
lemma.synsets(:noun) # => array of synsets which represent nouns of the lemma bat
|
46
|
+
# or
|
47
|
+
lemma.nouns # => array of synsets which represent nouns of the lemma bat
|
48
|
+
lemma.verbs? #=> true
|
49
|
+
|
50
|
+
# specify a sense
|
51
|
+
sense = lemma.nouns.last
|
52
|
+
sense2 = lemma.nouns[2]
|
53
|
+
|
54
|
+
sense.gloss # => a club used for hitting a ball in various games
|
55
|
+
sense2.words # => ["cricket bat", "bat"]
|
56
|
+
sense.relations.first # => "Semantic hypernym relation between n02806379 and n03053474"
|
57
|
+
|
58
|
+
sense.relations(:hyponym) # => Array of hyponyms associated with the sense
|
59
|
+
# or
|
60
|
+
sense.hyponyms # => Array of hyponyms associated with the sense
|
61
|
+
sense.hyponyms? # => true
|
62
|
+
|
63
|
+
sense.relations.first.is_semantic? # => true
|
64
|
+
sense.relations.first.source_word # => nil
|
65
|
+
sense.relations.first.destination # => the synset of n03053474
|
66
|
+
|
67
|
+
sense.derivationally_related_forms.first.is_semantic? # => false
|
68
|
+
sense.derivationally_related_forms.first.source_word # => "bat"
|
69
|
+
sense.derivationally_related_forms.first.destination_word # => "bat"
|
70
|
+
sense.derivationally_related_forms.first.destination # => the synset of v01413191
|
71
|
+
|
72
|
+
|
73
|
+
## Note on Patches/Pull Requests ##
|
74
|
+
|
75
|
+
* Fork the project.
|
76
|
+
* Make your feature addition or bug fix.
|
77
|
+
* Add tests for it. This is important so I don't break it in a
|
78
|
+
future version unintentionally.
|
79
|
+
* Commit, do not mess with rakefile, version, or history.
|
80
|
+
(if you want to have your own version, that is fine but bump version in a commit by itself I can ignore when I pull)
|
81
|
+
* Send me a pull request. Bonus points for topic branches.
|
82
|
+
|
83
|
+
## Copyright ##
|
84
|
+
|
85
|
+
Copyright (c) 2010 Roja Buck. See LICENSE for details.
|
data/Rakefile
ADDED
@@ -0,0 +1,54 @@
|
|
1
|
+
require 'rubygems'
|
2
|
+
require 'rake'
|
3
|
+
|
4
|
+
# Gem packaging tasks (provided by Jeweler). When Jeweler is not installed the
# remaining tasks are still defined and usable.
begin
  require 'jeweler'
  Jeweler::Tasks.new do |gem|
    gem.name = "words"
    gem.summary = %Q{A fast, easy to use interface to WordNet® with cross ruby distribution compatability.}
    gem.description = %Q{A fast, easy to use interface to WordNet® with cross ruby distribution compatability. We use TokyoCabinet to store the dataset and the excellent rufus-tokyo to interface with it. This allows us to have full compatability across ruby distributions while still remaining both fast and simple to use.}
    gem.email = "roja@arbia.co.uk"
    gem.homepage = "http://github.com/roja/words"
    gem.authors = ["Roja Buck"]
    gem.add_development_dependency "trollop", ">= 1.15"
    gem.add_dependency 'rufus-tokyo', '>= 1.0.5'
    # gem is a Gem::Specification... see http://www.rubygems.org/read/chapter/20 for additional settings
  end
  Jeweler::GemcutterTasks.new
rescue LoadError
  puts "Jeweler (or a dependency) not available. Install it with: gem install jeweler"
end

# Unit tests: every test/**/test_*.rb file, with lib/ and test/ on the load path.
require 'rake/testtask'
Rake::TestTask.new(:test) do |test|
  test.libs << 'lib' << 'test'
  test.pattern = 'test/**/test_*.rb'
  test.verbose = true
end

# Coverage task; replaced by an explanatory abort when rcov is not installed.
begin
  require 'rcov/rcovtask'
  Rcov::RcovTask.new do |test|
    test.libs << 'test'
    test.pattern = 'test/**/test_*.rb'
    test.verbose = true
  end
rescue LoadError
  task :rcov do
    abort "RCov is not available. In order to run rcov, you must: sudo gem install spicycode-rcov"
  end
end

# :check_dependencies is only defined when the Jeweler tasks above were loaded;
# depending on it unconditionally makes `rake test` fail without Jeweler.
task :test => :check_dependencies if Rake::Task.task_defined?(:check_dependencies)

task :default => :test

# Newer RDoc ships the task as RDoc::Task; fall back to the legacy Rake task
# class on installations that only provide rake/rdoctask.
begin
  require 'rdoc/task'
  rdoc_task_class = RDoc::Task
rescue LoadError
  require 'rake/rdoctask'
  rdoc_task_class = Rake::RDocTask
end

rdoc_task_class.new do |rdoc|
  # strip the trailing newline of the VERSION file so the title stays on one line
  version = File.exist?('VERSION') ? File.read('VERSION').strip : ""

  rdoc.rdoc_dir = 'rdoc'
  rdoc.title = "words #{version}"
  rdoc.rdoc_files.include('README*')
  rdoc.rdoc_files.include('lib/**/*.rb')
end
|
data/VERSION
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
0.1.0
|
data/build_dataset.rb
ADDED
@@ -0,0 +1,102 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
# std includes
|
4
|
+
require 'pathname'
|
5
|
+
|
6
|
+
# gem includes
|
7
|
+
require 'rubygems'
|
8
|
+
require 'trollop'
|
9
|
+
require 'pstore'
|
10
|
+
require 'rufus-tokyo'
|
11
|
+
|
12
|
+
# Suffixes of the WordNet "index.*" / "data.*" files, one per part of speech.
POS_FILE_TYPES = %w{ adj adv noun verb }.freeze
# Maps a WordNet file suffix to the one-letter part-of-speech code.
POS_FILE_TYPE_TO_SHORT = { 'adj' => 'a', 'adv' => 'r', 'noun' => 'n', 'verb' => 'v' }.freeze
|
14
|
+
|
15
|
+
# Looks for a WordNet dictionary in base_dir itself or in its "dict" child.
#
# base_dir - path of the directory to inspect.
#
# Returns a Pathname of the directory that contains "data.noun", or nil when
# base_dir does not exist or neither location holds that file.
def locate_wordnet(base_dir)
  # VERBOSE is only assigned when this file is executed as a script; guard the
  # lookup so the method does not raise NameError when it is called otherwise.
  puts "Checking #{base_dir} & specific children for wordnet files..." if defined?(VERBOSE) && VERBOSE
  path = Pathname.new(base_dir)
  return nil unless path.exist?
  # the directory itself takes precedence over its "dict" child
  [path, path + "dict"].find { |candidate| (candidate + "data.noun").exist? }
end
|
22
|
+
|
23
|
+
# Script entry point: locates the WordNet dictionary files, validates them and
# writes every lemma and synset into the Tokyo Cabinet table "data/wordnet.tct".
if __FILE__ == $0

  puts "Words Dataset Constructor 2010 (c) Roja Buck"

  opts = Trollop::options do
    opt :verbose, "Output verbose program detail.", :default => false
    opt :wordnet, "Location of the wordnet dictionary directory", :default => "Search..."
  end
  puts "Verbose mode enabled" if (VERBOSE = opts[:verbose])

  wordnet_dir = nil
  if opts[:wordnet] == "Search..."
    # no directory given: probe a few well-known install locations, first hit wins
    ['/usr/share/wordnet', '/usr/local/share/wordnet', '/usr/local/WordNet-3.0'].each do |potential_dir|
      break unless (wordnet_dir = locate_wordnet potential_dir).nil?
    end
    abort( "Unable to locate wordnet dictionary. To specify check --help." ) if wordnet_dir.nil?
  else
    wordnet_dir = locate_wordnet opts[:wordnet]
    abort( "Unable to locate wordnet dictionary in directory #{opts[:wordnet]}. Please check and try again." ) if wordnet_dir.nil?
  end

  # At this point we know we should have a wordnet directory within wordnet_dir
  puts "Found wordnet files in #{wordnet_dir}..." if VERBOSE

  index_files = POS_FILE_TYPES.map { |pos| wordnet_dir + "index.#{pos}" }
  data_files = POS_FILE_TYPES.map { |pos| wordnet_dir + "data.#{pos}" }

  (index_files + data_files).each do |required_file|
    abort( "Unable to locate #{required_file} within the wordnet dictionary. Please check your wordnet copy is valid and try again." ) unless required_file.exist?
    abort( "Cannot get readable permissions to #{required_file} within the wordnet dictionary. Please check the file permissions and try again." ) unless required_file.readable?
  end

  # At this point we know we have the correct files, though we don't know their validity
  puts "Validated existance of wordnet files in #{wordnet_dir}..." if VERBOSE

  # Build data

  hash = Rufus::Tokyo::Table.new("data/wordnet.tct")
  begin
    POS_FILE_TYPES.each do |file_pos|

      short_pos = POS_FILE_TYPE_TO_SHORT[file_pos]

      puts "Building #{file_pos} indexes..." if VERBOSE

      # add indexes
      (wordnet_dir + "index.#{file_pos}").each_line do |index_line|
        # skip the header lines, which start with two spaces
        # (NOTE(review): the previous check compared a 2-character slice with a
        # 1-character string and therefore never skipped anything)
        next if index_line =~ /\A {2}/
        index_parts = index_line.split(" ")

        lemma, pos = index_parts.shift, index_parts.shift
        synset_count, pointer_count = index_parts.shift.to_i, index_parts.shift.to_i
        index_parts.shift(pointer_count) # pointer symbols are consumed but not stored
        index_parts.shift                # sense count field, not stored
        tagsense_count = pos + index_parts.shift
        synset_ids = index_parts.shift(synset_count).map { |offset| short_pos + offset }

        # the same lemma shows up in several index files, so merge with whatever
        # is already stored; uniq keeps a re-run from duplicating entries
        existing = hash[lemma] || { "synset_ids" => '', "tagsense_counts" => '' }

        hash[lemma] = { "lemma" => lemma,
                        "synset_ids" => (existing["synset_ids"].split('|') + synset_ids).uniq.join('|'),
                        "tagsense_counts" => (existing["tagsense_counts"].split('|') << tagsense_count).uniq.join('|') }
      end

      puts "Adding #{file_pos} data..." if VERBOSE

      # add data
      (wordnet_dir + "data.#{file_pos}").each_line do |data_line|
        next if data_line =~ /\A {2}/ # header line
        # limit the split so a " | " inside the gloss does not truncate it
        data_line, gloss = data_line.split(" | ", 2)
        data_parts = data_line.split(" ")

        synset_id = short_pos + data_parts.shift
        lexical_filenum, synset_type = data_parts.shift, data_parts.shift
        word_count = data_parts.shift.to_i(16) # the word count is hexadecimal
        # each word is followed by its number; stored as "word.num"
        words = Array.new(word_count) { "#{data_parts.shift}.#{data_parts.shift}" }
        # each relation is four fields; stored as "symbol.offset.pos.source_dest"
        relations = Array.new(data_parts.shift.to_i) { "#{data_parts.shift}.#{data_parts.shift}.#{data_parts.shift}.#{data_parts.shift}" }

        hash[synset_id] = { "synset_id" => synset_id, "lexical_filenum" => lexical_filenum, "synset_type" => synset_type,
                            "words" => words.join('|'), "relations" => relations.join('|'),
                            "gloss" => gloss.to_s.strip } # drop the trailing newline of the line
      end

    end
  ensure
    # always release the table, even when parsing fails part-way through
    hash.close
  end

end
|
data/data/wordnet.tct
ADDED
Binary file
|
data/examples.rb
ADDED
@@ -0,0 +1,28 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
# Resolve the library relative to this file so the examples run from any
# working directory (and on rubies that do not have "." on the load path).
require File.expand_path('lib/words', File.dirname(__FILE__))

# Walks through the main API calls using the word "bat" and prints each result.
if __FILE__ == $0

  wordnet = Words::Words.new

  begin
    bat = wordnet.find('bat')
    last_noun = bat.synsets('noun').last
    last_verb = bat.synsets('verb').last

    puts bat
    puts bat.available_pos.inspect
    puts bat.lemma
    puts bat.synsets('noun')
    puts last_noun.words.inspect
    puts last_noun.relations
    last_noun.relations.each { |relation| puts relation.inspect }
    puts last_noun.methods
    puts last_noun.hyponyms?
    puts last_noun.participle_of_verbs?

    puts last_noun.relations(:hyponym)
    puts last_noun.relations("~")
    puts last_verb.inspect
    puts last_verb.words
    puts last_verb.words_with_num.inspect
  ensure
    # close the connection even if one of the examples raises
    wordnet.close
  end

end
|
data/lib/words.rb
ADDED
@@ -0,0 +1,229 @@
|
|
1
|
+
# std includes
|
2
|
+
require 'pathname'
|
3
|
+
require 'set'
|
4
|
+
|
5
|
+
# gem includes
|
6
|
+
require 'rubygems'
|
7
|
+
require 'rufus-tokyo'
|
8
|
+
|
9
|
+
module Words
|
10
|
+
|
11
|
+
# Holds the single, process-wide handle to the opened wordnet table.
#
# The handle lives in a class-level instance variable that starts out as nil,
# so reading it before a connection was assigned returns nil instead of
# raising NameError (which an unassigned @@ class variable does).
class WordnetConnection

  @wordnet_connection = nil

  class << self
    # Returns the current connection object, or nil when none was assigned.
    # Assigning replaces the current connection.
    attr_accessor :wordnet_connection
  end

end
|
22
|
+
|
23
|
+
# A single pointer from one synset (optionally one of its words) to another.
#
# Built from a stored relation string of the form
# "symbol.offset.pos.source_dest", e.g. "@.03053474.n.0000".
class Relation

  # Pointer symbol (as stored in the dataset) => relation name.
  RELATION_TO_SYMBOL = { "-c" => :member_of_this_domain_topic, "+" => :derivationally_related_form, "%p" => :part_meronym, "~i" => :instance_hyponym, "@" => :hypernym,
                         ";r" => :domain_of_synset_region, "!" => :antonym, "#p" => :part_holonym, "%s" => :substance_meronym, ";u" => :domain_of_synset_usage,
                         "-r" => :member_of_this_domain_region, "#s" => :substance_holonym, "=" => :attribute, "-u" => :member_of_this_domain_usage, ";c" => :domain_of_synset_topic,
                         "%m"=> :member_meronym, "~" => :hyponym, "@i" => :instance_hypernym, "#m" => :member_holonym, "$" => :verb_group, ">" => :cause, "*" => :entailment,
                         "\\" => :pertainym, "<" => :participle_of_verb, "&" => :similar_to, "^" => :see_also }.freeze
  # Relation name => pointer symbol.
  SYMBOL_TO_RELATION = RELATION_TO_SYMBOL.invert.freeze

  # relation_construct - String "symbol.offset.pos.source_dest".
  # source_synset      - the synset this relation starts from; must respond to
  #                      #words and #synset_id.
  def initialize(relation_construct, source_synset)
    pointer_symbol, offset, @pos, @source_dest = relation_construct.split('.')
    @dest_synset_id = @pos + offset
    @symbol = RELATION_TO_SYMBOL[pointer_symbol]
    @source_synset = source_synset
  end

  # Returns true when the relation links whole synsets (source/dest field is
  # "0000") rather than two specific words.
  def is_semantic?
    @source_dest == "0000"
  end

  # Returns the word of the source synset this relation starts from, or nil
  # for a semantic relation. The first two hex digits of the source/dest field
  # are the 1-based word index.
  def source_word
    # defined? (not ||=) because nil is a legitimate memoized value
    unless defined? @source_word
      @source_word = is_semantic? ? nil : @source_synset.words[@source_dest[0..1].to_i(16) - 1]
    end
    @source_word
  end

  # Returns the word of the destination synset this relation points at, or nil
  # for a semantic relation. The last two hex digits of the source/dest field
  # are the 1-based word index.
  def destination_word
    unless defined? @destination_word
      @destination_word = is_semantic? ? nil : destination.words[@source_dest[2..3].to_i(16) - 1]
    end
    @destination_word
  end

  # Tests whether this relation is of the given type.
  #
  # type - a relation name (Symbol or String, e.g. :hypernym) or a pointer
  #        symbol String (e.g. "@").
  #
  # Returns true or false; unknown types yield false.
  def relation_type?(type)
    if SYMBOL_TO_RELATION.include?(type.to_sym)
      type.to_sym == @symbol
    elsif RELATION_TO_SYMBOL.include?(type.to_s)
      # pointer-symbol form: translate it to the relation name before comparing
      RELATION_TO_SYMBOL[type.to_s] == @symbol
    else
      false
    end
  end

  # Returns the relation name as a Symbol (nil for an unknown pointer symbol).
  def relation_type
    @symbol
  end

  # Returns the destination Synset, loaded on first access.
  def destination
    @destination = Synset.new(@dest_synset_id) unless defined? @destination
    @destination
  end

  # Returns a human readable description of the relation.
  def to_s
    unless defined? @to_s
      @to_s = if is_semantic?
        "Semantic #{relation_type.to_s.gsub('_', ' ')} relation between #{@source_synset.synset_id} and #{@dest_synset_id}"
      else
        "#{relation_type.to_s.gsub('_', ' ').capitalize} relation between #{@source_synset.synset_id}'s word \"#{source_word}\" and #{@dest_synset_id}'s word \"#{destination_word}\""
      end
    end
    @to_s
  end

  # Returns the parsed fields of the relation as an inspected Hash.
  def inspect
    { :symbol => @symbol, :dest_synset_id => @dest_synset_id, :pos => @pos, :source_dest => @source_dest }.inspect
  end

end
|
84
|
+
|
85
|
+
# A set of synonymous words sharing one meaning, loaded from the wordnet table.
#
# For every relation name in Relation::SYMBOL_TO_RELATION two helpers are
# available, e.g. #hyponyms (the matching relations) and #hyponyms? (whether
# there are any).
class Synset

  # Stored synset type code => part-of-speech symbol.
  SYNSET_TYPE_TO_SYMBOL = {"n" => :noun, "v" => :verb, "a" => :adjective, "r" => :adverb, "s" => :adjective_satallite }.freeze

  # Defines the "<relation>s" / "<relation>s?" convenience methods. They are
  # class-wide, so this runs only once instead of on every instantiation. It is
  # triggered from #initialize so the class body does not reference Relation
  # while it is being loaded.
  def self.define_relation_helpers
    return if @relation_helpers_defined
    Relation::SYMBOL_TO_RELATION.keys.each do |relation_type|
      define_method("#{relation_type}s?") { relations(relation_type).size > 0 }
      define_method("#{relation_type}s") { relations(relation_type) }
    end
    @relation_helpers_defined = true
  end
  private_class_method :define_relation_helpers

  # synset_id - String key of the synset in the wordnet table, e.g. "n02806379".
  def initialize(synset_id)
    @synset_hash = WordnetConnection::wordnet_connection[synset_id]
    self.class.send(:define_relation_helpers)
  end

  # Returns the part of speech as a Symbol (see SYNSET_TYPE_TO_SYMBOL).
  def synset_type
    SYNSET_TYPE_TO_SYMBOL[@synset_hash["synset_type"]]
  end

  # Returns an Array of the words in this synset, underscores shown as spaces.
  def words
    @words = words_with_num.map { |word_with_num| word_with_num[:word] } unless defined? @words
    @words
  end

  # Returns the number of words in this synset.
  def size
    words.size
  end

  # Returns an Array of { :word => String, :num => String } hashes.
  #
  # Entries are stored as "word.num". The word itself may contain dots
  # (e.g. "a.m."), so the entry is split on its LAST dot.
  def words_with_num
    unless defined? @words_with_num
      @words_with_num = @synset_hash["words"].split('|').map do |entry|
        word, separator, num = entry.rpartition('.')
        # an entry without any dot has no number part
        word, num = entry, nil if separator.empty?
        { :word => word.gsub('_', ' '), :num => num }
      end
    end
    @words_with_num
  end

  # Returns the String id of this synset.
  def synset_id
    @synset_hash["synset_id"]
  end

  # Returns the stored gloss (definition text) of this synset.
  def gloss
    @synset_hash["gloss"]
  end

  # Returns the inspected raw record of this synset.
  def inspect
    @synset_hash.inspect
  end

  # Returns the relations of this synset.
  #
  # type - a relation name (e.g. :hyponym) or pointer symbol (e.g. "~") to
  #        filter by; any other value (default :all) returns every relation.
  def relations(type = :all)
    @relations = @synset_hash["relations"].split('|').map { |relation| Relation.new(relation, self) } unless defined? @relations
    if Relation::SYMBOL_TO_RELATION.include?(type.to_sym)
      wanted = type.to_sym
    elsif Relation::RELATION_TO_SYMBOL.include?(type.to_s)
      wanted = Relation::RELATION_TO_SYMBOL[type.to_s]
    else
      return @relations
    end
    @relations.select { |relation| relation.relation_type == wanted }
  end

  # Returns a human readable description of the synset.
  def to_s
    @to_s = "#{synset_type.to_s.capitalize} including word(s): #{words.map { |word| '"' + word + '"' }.join(', ')} meaning: #{gloss}" unless defined? @to_s
    @to_s
  end

end
|
150
|
+
|
151
|
+
# A word form together with the ids of all synsets it belongs to.
#
# For every part of speech in POS_TO_SYMBOL two helpers are available, e.g.
# #nouns (the noun synsets) and #nouns? (whether there are any).
class Lemma

  # Synset id prefix => part-of-speech symbol.
  POS_TO_SYMBOL = {"n" => :noun, "v" => :verb, "a" => :adjective, "r" => :adverb}.freeze
  # Part-of-speech symbol => synset id prefix.
  SYMBOL_TO_POS = POS_TO_SYMBOL.invert.freeze

  # Convenience methods for part-of-speech access, defined once for the class
  # instead of being redefined every time an instance is created.
  SYMBOL_TO_POS.keys.each do |pos|
    define_method("#{pos}s?") { synsets(pos).size > 0 }
    define_method("#{pos}s") { synsets(pos) }
  end

  # lemma_hash - the stored record of the lemma; must provide the "lemma" and
  #              "synset_ids" keys.
  def initialize(lemma_hash)
    @lemma_hash = lemma_hash
  end

  # Returns the word form, with underscores shown as spaces.
  def lemma
    @lemma = @lemma_hash["lemma"].gsub('_', ' ') unless defined? @lemma
    @lemma
  end

  # Returns an Array of the part-of-speech symbols this lemma occurs as,
  # derived from the first character of each synset id.
  def available_pos
    @available_pos = synset_ids.map { |synset_id| POS_TO_SYMBOL[synset_id[0,1]] }.uniq unless defined? @available_pos
    @available_pos
  end

  # Returns e.g. "bat, noun/verb".
  def to_s
    @to_s = "#{lemma}, #{available_pos.join('/')}" unless defined? @to_s
    @to_s
  end

  # Returns an Array of Synset objects for this lemma.
  #
  # pos - a part-of-speech name (e.g. :noun or "noun") or id prefix (e.g. "n")
  #       to filter by; any other value (default :all) returns every synset.
  def synsets(pos = :all)
    prefix = if SYMBOL_TO_POS.include?(pos.to_sym)
      SYMBOL_TO_POS[pos.to_sym]
    elsif POS_TO_SYMBOL.include?(pos.to_s)
      pos.to_s
    end
    matching_ids = prefix ? synset_ids.select { |synset_id| synset_id[0,1] == prefix } : synset_ids
    matching_ids.map { |synset_id| Synset.new synset_id }
  end

  # Returns an Array of the synset id Strings of this lemma.
  def synset_ids
    @synset_ids = @lemma_hash["synset_ids"].split('|') unless defined? @synset_ids
    @synset_ids
  end

  # Returns the inspected raw record of this lemma.
  def inspect
    @lemma_hash.inspect
  end

  alias word lemma

end
|
208
|
+
|
209
|
+
# Entry point of the library: opens the wordnet table and looks up lemmas.
class Words

  # Opens the wordnet table and registers it as the shared connection.
  #
  # path - location of the Tokyo Cabinet table file. A relative path is tried
  #        against the current working directory first and then against the
  #        parent directory of this file's directory, so the default also
  #        works when the program is not started from the gem's root.
  #
  # Aborts the program with a message when the file cannot be found.
  def initialize(path = 'data/wordnet.tct')
    database = Pathname.new(path)
    unless database.exist? || database.absolute?
      bundled = Pathname.new(File.expand_path(File.join('..', path.to_s), File.dirname(__FILE__)))
      database = bundled if bundled.exist?
    end

    if database.exist?
      WordnetConnection::wordnet_connection = Rufus::Tokyo::Table.new(database.to_s)
    else
      # expand_path, not realpath: realpath raises Errno::ENOENT for a missing
      # file, which would mask this message
      abort("Failed to locate the words database at #{database.expand_path}")
    end
  end

  # Looks up a word.
  #
  # word - the lemma key as stored in the table.
  #
  # Returns a Lemma, or nil when the table has no entry for the word.
  def find(word)
    lemma_hash = WordnetConnection::wordnet_connection[word]
    lemma_hash.nil? ? nil : Lemma.new(lemma_hash)
  end

  # Closes the shared wordnet connection.
  def close
    WordnetConnection::wordnet_connection.close
  end

end
|
228
|
+
|
229
|
+
end
|
data/test/helper.rb
ADDED
data/test/test_words.rb
ADDED
metadata
ADDED
@@ -0,0 +1,86 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: words
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.1.0
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Roja Buck
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
|
12
|
+
date: 2010-01-14 00:00:00 +00:00
|
13
|
+
default_executable:
|
14
|
+
dependencies:
|
15
|
+
- !ruby/object:Gem::Dependency
|
16
|
+
name: trollop
|
17
|
+
type: :development
|
18
|
+
version_requirement:
|
19
|
+
version_requirements: !ruby/object:Gem::Requirement
|
20
|
+
requirements:
|
21
|
+
- - ">="
|
22
|
+
- !ruby/object:Gem::Version
|
23
|
+
version: "1.15"
|
24
|
+
version:
|
25
|
+
- !ruby/object:Gem::Dependency
|
26
|
+
name: rufus-tokyo
|
27
|
+
type: :runtime
|
28
|
+
version_requirement:
|
29
|
+
version_requirements: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - ">="
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: 1.0.5
|
34
|
+
version:
|
35
|
+
description: "A fast, easy to use interface to WordNet\xC2\xAE with cross ruby distribution compatability. We use TokyoCabinet to store the dataset and the excellent rufus-tokyo to interface with it. This allows us to have full compatability across ruby distributions while still remaining both fast and simple to use."
|
36
|
+
email: roja@arbia.co.uk
|
37
|
+
executables: []
|
38
|
+
|
39
|
+
extensions: []
|
40
|
+
|
41
|
+
extra_rdoc_files:
|
42
|
+
- LICENSE
|
43
|
+
- README.markdown
|
44
|
+
files:
|
45
|
+
- .gitignore
|
46
|
+
- LICENSE
|
47
|
+
- README.markdown
|
48
|
+
- Rakefile
|
49
|
+
- VERSION
|
50
|
+
- build_dataset.rb
|
51
|
+
- data/wordnet.tct
|
52
|
+
- examples.rb
|
53
|
+
- lib/words.rb
|
54
|
+
- test/helper.rb
|
55
|
+
- test/test_words.rb
|
56
|
+
has_rdoc: true
|
57
|
+
homepage: http://github.com/roja/words
|
58
|
+
licenses: []
|
59
|
+
|
60
|
+
post_install_message:
|
61
|
+
rdoc_options:
|
62
|
+
- --charset=UTF-8
|
63
|
+
require_paths:
|
64
|
+
- lib
|
65
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
66
|
+
requirements:
|
67
|
+
- - ">="
|
68
|
+
- !ruby/object:Gem::Version
|
69
|
+
version: "0"
|
70
|
+
version:
|
71
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
72
|
+
requirements:
|
73
|
+
- - ">="
|
74
|
+
- !ruby/object:Gem::Version
|
75
|
+
version: "0"
|
76
|
+
version:
|
77
|
+
requirements: []
|
78
|
+
|
79
|
+
rubyforge_project:
|
80
|
+
rubygems_version: 1.3.5
|
81
|
+
signing_key:
|
82
|
+
specification_version: 3
|
83
|
+
summary: "A fast, easy to use interface to WordNet\xC2\xAE with cross ruby distribution compatability."
|
84
|
+
test_files:
|
85
|
+
- test/test_words.rb
|
86
|
+
- test/helper.rb
|