words 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +26 -0
- data/LICENSE +20 -0
- data/README.markdown +85 -0
- data/Rakefile +54 -0
- data/VERSION +1 -0
- data/build_dataset.rb +102 -0
- data/data/wordnet.tct +0 -0
- data/examples.rb +28 -0
- data/lib/words.rb +229 -0
- data/test/helper.rb +9 -0
- data/test/test_words.rb +7 -0
- metadata +86 -0
data/.gitignore
ADDED
@@ -0,0 +1,26 @@
|
|
1
|
+
## MAC OS
|
2
|
+
.DS_Store
|
3
|
+
|
4
|
+
## TEXTMATE
|
5
|
+
*.tmproj
|
6
|
+
tmtags
|
7
|
+
|
8
|
+
## EMACS
|
9
|
+
*~
|
10
|
+
\#*
|
11
|
+
.\#*
|
12
|
+
|
13
|
+
## VIM
|
14
|
+
*.swp
|
15
|
+
|
16
|
+
## PROJECT::GENERAL
|
17
|
+
coverage
|
18
|
+
rdoc
|
19
|
+
pkg
|
20
|
+
|
21
|
+
## ECLIPSE
|
22
|
+
.loadpath
|
23
|
+
.project
|
24
|
+
.document
|
25
|
+
|
26
|
+
## PROJECT::SPECIFIC
|
data/LICENSE
ADDED
@@ -0,0 +1,20 @@
|
|
1
|
+
Copyright (c) 2009 Roja Buck
|
2
|
+
|
3
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
4
|
+
a copy of this software and associated documentation files (the
|
5
|
+
"Software"), to deal in the Software without restriction, including
|
6
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
7
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
8
|
+
permit persons to whom the Software is furnished to do so, subject to
|
9
|
+
the following conditions:
|
10
|
+
|
11
|
+
The above copyright notice and this permission notice shall be
|
12
|
+
included in all copies or substantial portions of the Software.
|
13
|
+
|
14
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
15
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
16
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
17
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
18
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
19
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
20
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README.markdown
ADDED
@@ -0,0 +1,85 @@
|
|
1
|
+
# Words - A fast, easy to use interface to WordNet® with cross ruby distribution compatibility. #
|
2
|
+
|
3
|
+
## About ##
|
4
|
+
|
5
|
+
Words implements a fast interface to [Wordnet®](http://wordnet.princeton.edu) which makes use of [Tokyo Cabinet](http://1978th.net/tokyocabinet/) and an FFI interface, [rufus-tokyo](http://github.com/jmettraux/rufus-tokyo), to provide cross ruby distribution compatibility and blistering speed. I have attempted to provide ease of use in the form of a simple yet powerful API, and installation is a cinch; we even include the data in its Tokyo Cabinet data format (subject to the original WordNet licencing).
|
6
|
+
|
7
|
+
## Installation ##
|
8
|
+
|
9
|
+
First ensure you have [Tokyo Cabinet](http://1978th.net/tokyocabinet/) installed. It should be nice and easy...
|
10
|
+
|
11
|
+
After this, it should just be a matter of installing the gem. For those of you with old rubygems versions, first run:
|
12
|
+
|
13
|
+
gem install gemcutter # These two steps are only necessary if you haven't
|
14
|
+
gem tumble # yet installed the gemcutter tools
|
15
|
+
|
16
|
+
Otherwise, and after that, it's simply:
|
17
|
+
|
18
|
+
gem install words
|
19
|
+
|
20
|
+
Then you're ready to rock and roll. :)
|
21
|
+
|
22
|
+
## Build Data (Optional) ##
|
23
|
+
|
24
|
+
If you want to build the wordnet dataset file yourself, from the original wordnet files, you can use the bundled "build_dataset.rb"
|
25
|
+
|
26
|
+
./build_dataset.rb -h #this will give you the usage
|
27
|
+
sudo ./build_dataset.rb #this will attempt to build the data locating the original wordnet files through a search...
|
28
|
+
|
29
|
+
## Usage ##
|
30
|
+
|
31
|
+
Here are a few small examples of using words within your programs.
|
32
|
+
|
33
|
+
|
34
|
+
require 'rubygems'
|
35
|
+
require 'words'
|
36
|
+
|
37
|
+
data = Words::Words.new
|
38
|
+
|
39
|
+
# locate a word
|
40
|
+
lemma = data.find("bat")
|
41
|
+
|
42
|
+
lemma.to_s # => bat, noun/verb
|
43
|
+
lemma.available_pos.inspect # => [:noun, :verb]
|
44
|
+
|
45
|
+
lemma.synsets(:noun) # => array of synsets which represent nouns of the lemma bat
|
46
|
+
# or
|
47
|
+
lemma.nouns # => array of synsets which represent nouns of the lemma bat
|
48
|
+
lemma.verbs? #=> true
|
49
|
+
|
50
|
+
# specify a sense
|
51
|
+
sense = lemma.nouns.last
|
52
|
+
sense2 = lemma.nouns[2]
|
53
|
+
|
54
|
+
sense.gloss # => a club used for hitting a ball in various games
|
55
|
+
sense2.words # => ["cricket bat", "bat"]
|
56
|
+
sense.relations.first # => "Semantic hypernym relation between n02806379 and n03053474"
|
57
|
+
|
58
|
+
sense.relations(:hyponym) # => Array of hyponyms associated with the sense
|
59
|
+
# or
|
60
|
+
sense.hyponyms # => Array of hyponyms associated with the sense
|
61
|
+
sense.hyponyms? # => true
|
62
|
+
|
63
|
+
sense.relations.first.is_semantic? # => true
|
64
|
+
sense.relations.first.source_word # => nil
|
65
|
+
sense.relations.first.destination # => the synset of n03053474
|
66
|
+
|
67
|
+
sense.derivationally_related_forms.first.is_semantic? # => false
|
68
|
+
sense.derivationally_related_forms.first.source_word # => "bat"
|
69
|
+
sense.derivationally_related_forms.first.destination_word # => "bat"
|
70
|
+
sense.derivationally_related_forms.first.destination # => the synset of v01413191
|
71
|
+
|
72
|
+
|
73
|
+
## Note on Patches/Pull Requests ##
|
74
|
+
|
75
|
+
* Fork the project.
|
76
|
+
* Make your feature addition or bug fix.
|
77
|
+
* Add tests for it. This is important so I don't break it in a
|
78
|
+
future version unintentionally.
|
79
|
+
* Commit, do not mess with rakefile, version, or history.
|
80
|
+
(if you want to have your own version, that is fine but bump version in a commit by itself I can ignore when I pull)
|
81
|
+
* Send me a pull request. Bonus points for topic branches.
|
82
|
+
|
83
|
+
## Copyright ##
|
84
|
+
|
85
|
+
Copyright (c) 2010 Roja Buck. See LICENSE for details.
|
data/Rakefile
ADDED
@@ -0,0 +1,54 @@
|
|
1
|
+
require 'rubygems'
require 'rake'

# Gem packaging and release tasks. Jeweler is optional: when it cannot be
# loaded we only print a hint so the remaining tasks stay usable.
begin
  require 'jeweler'
  Jeweler::Tasks.new do |gem|
    gem.name = "words"
    gem.summary = %Q{A fast, easy to use interface to WordNet® with cross ruby distribution compatability.}
    gem.description = %Q{A fast, easy to use interface to WordNet® with cross ruby distribution compatability. We use TokyoCabinet to store the dataset and the excellent rufus-tokyo to interface with it. This allows us to have full compatability across ruby distributions while still remaining both fast and simple to use.}
    gem.email = "roja@arbia.co.uk"
    gem.homepage = "http://github.com/roja/words"
    gem.authors = ["Roja Buck"]
    gem.add_development_dependency "trollop", ">= 1.15"
    gem.add_dependency 'rufus-tokyo', '>= 1.0.5'
    # gem is a Gem::Specification... see http://www.rubygems.org/read/chapter/20 for additional settings
  end
  Jeweler::GemcutterTasks.new
rescue LoadError
  puts "Jeweler (or a dependency) not available. Install it with: gem install jeweler"
end

# Unit tests: every test/**/test_*.rb file, with lib and test on the load path.
require 'rake/testtask'
Rake::TestTask.new(:test) do |test|
  test.libs << 'lib' << 'test'
  test.pattern = 'test/**/test_*.rb'
  test.verbose = true
end

# Coverage task. When rcov is missing, a stub :rcov task explains how to get it.
begin
  require 'rcov/rcovtask'
  Rcov::RcovTask.new do |test|
    test.libs << 'test'
    test.pattern = 'test/**/test_*.rb'
    test.verbose = true
  end
rescue LoadError
  task :rcov do
    abort "RCov is not available. In order to run rcov, you must: sudo gem install spicycode-rcov"
  end
end

# :check_dependencies is not defined in this file (presumably Jeweler provides
# it), so only depend on it when it actually exists. Otherwise `rake test`
# would fail outright whenever Jeweler could not be loaded.
task :test => :check_dependencies if Rake::Task.task_defined?(:check_dependencies)

task :default => :test

# API documentation. Prefer the rdoc-provided task and fall back to the legacy
# rake/rdoctask location for old Rake installations.
begin
  require 'rdoc/task'
  rdoc_task_class = RDoc::Task
rescue LoadError
  require 'rake/rdoctask'
  rdoc_task_class = Rake::RDocTask
end

rdoc_task_class.new do |rdoc|
  # strip so a trailing newline in VERSION does not end up in the title
  version = File.exist?('VERSION') ? File.read('VERSION').strip : ""

  rdoc.rdoc_dir = 'rdoc'
  rdoc.title = "words #{version}"
  rdoc.rdoc_files.include('README*')
  rdoc.rdoc_files.include('lib/**/*.rb')
end
|
data/VERSION
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
0.1.0
|
data/build_dataset.rb
ADDED
@@ -0,0 +1,102 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
# std includes
|
4
|
+
require 'pathname'
|
5
|
+
|
6
|
+
# gem includes
|
7
|
+
require 'rubygems'
|
8
|
+
require 'trollop'
|
9
|
+
require 'pstore'
|
10
|
+
require 'rufus-tokyo'
|
11
|
+
|
12
|
+
# Suffixes of the index.<type> / data.<type> files read by this script.
# Frozen so the lookup tables cannot be modified by accident.
POS_FILE_TYPES = %w{ adj adv noun verb }.freeze
# One-letter code per file type; it is prepended to every synset id stored.
POS_FILE_TYPE_TO_SHORT = { 'adj' => 'a', 'adv' => 'r', 'noun' => 'n', 'verb' => 'v' }.freeze
|
14
|
+
|
15
|
+
# Looks for WordNet dictionary files in base_dir itself and in its "dict"
# child directory.
#
# base_dir - directory to inspect (String or Pathname).
#
# Returns a Pathname for the first of those two directories that contains a
# "data.noun" file, or nil when base_dir does not exist or neither matches.
def locate_wordnet(base_dir)
  # VERBOSE is only assigned when this file runs as a script; guard the lookup
  # so the helper also works when the file has merely been loaded.
  puts "Checking #{base_dir} & spcific children for wordnet files..." if defined?(VERBOSE) && VERBOSE
  path = Pathname.new base_dir
  return nil unless path.exist?
  [path, path + "dict"].find { |candidate| (candidate + "data.noun").exist? }
end
|
22
|
+
|
23
|
+
# Script entry point: locate the WordNet dictionary files, validate them and
# write every lemma and synset into the Tokyo Cabinet table data/wordnet.tct.
if __FILE__ == $0

  puts "Words Dataset Constructor 2010 (c) Roja Buck"

  opts = Trollop::options do
    opt :verbose, "Output verbose program detail.", :default => false
    opt :wordnet, "Location of the wordnet dictionary directory", :default => "Search..."
  end
  # Kept as a constant because locate_wordnet reads it too.
  VERBOSE = opts[:verbose]
  puts "Verbose mode enabled" if VERBOSE

  # Either probe a few well-known install locations or use the one supplied.
  wordnet_dir = nil
  if opts[:wordnet] == "Search..."
    ['/usr/share/wordnet', '/usr/local/share/wordnet', '/usr/local/WordNet-3.0'].each do |potential_dir|
      wordnet_dir = locate_wordnet(potential_dir)
      break unless wordnet_dir.nil?
    end
    abort( "Unable to locate wordnet dictionary. To specify check --help." ) if wordnet_dir.nil?
  else
    wordnet_dir = locate_wordnet opts[:wordnet]
    abort( "Unable to locate wordnet dictionary in directory #{opts[:wordnet]}. Please check and try again." ) if wordnet_dir.nil?
  end

  # At this point we know we should have a wordnet directory within wordnet_dir
  puts "Found wordnet files in #{wordnet_dir}..." if VERBOSE

  index_files = POS_FILE_TYPES.map { |pos| wordnet_dir + "index.#{pos}" }
  data_files = POS_FILE_TYPES.map { |pos| wordnet_dir + "data.#{pos}" }

  (index_files + data_files).each do |required_file|
    abort( "Unable to locate #{required_file} within the wordnet dictionary. Please check your wordnet copy is valid and try again." ) unless required_file.exist?
    abort( "Cannot get readable permissions to #{required_file} within the wordnet dictionary. Please check the file permissions and try again." ) unless required_file.readable?
  end

  # At this point we know we have the correct files, though we don't know their validity
  puts "Validated existance of wordnet files in #{wordnet_dir}..." if VERBOSE

  # Build data
  table = Rufus::Tokyo::Table.new("data/wordnet.tct")
  begin
    POS_FILE_TYPES.each do |file_pos|
      pos_prefix = POS_FILE_TYPE_TO_SHORT[file_pos]

      puts "Building #{file_pos} indexes..." if VERBOSE

      # add indexes
      (wordnet_dir + "index.#{file_pos}").each_line do |index_line|
        # Skip lines that begin with two spaces.
        # NOTE(review): these are presumably the licence header lines of the
        # WordNet database files - confirm against the wndb format notes.
        next if index_line[0, 2] == "  "
        index_parts = index_line.split(" ")

        lemma = index_parts.shift
        pos = index_parts.shift
        synset_count = index_parts.shift.to_i
        pointer_count = index_parts.shift.to_i
        pointer_count.times { index_parts.shift } # pointer symbols are not stored
        index_parts.shift                         # sense count is not stored
        tagsense_count = pos + index_parts.shift
        synset_ids = Array.new(synset_count) { pos_prefix + index_parts.shift }

        # A lemma occurs once per part of speech, so merge with what an
        # earlier index file already stored for it (one read, one write).
        existing = table[lemma] || { "synset_ids" => '', "tagsense_counts" => '' }
        table[lemma] = { "lemma" => lemma,
                         "synset_ids" => (existing["synset_ids"].split('|') + synset_ids).join('|'),
                         "tagsense_counts" => (existing["tagsense_counts"].split('|') << tagsense_count).join('|') }
      end

      puts "Adding #{file_pos} data..." if VERBOSE

      # add data
      (wordnet_dir + "data.#{file_pos}").each_line do |data_line|
        next if data_line[0, 2] == "  " # same header convention as the index files
        # Split on the first separator only so a gloss containing " | " survives.
        fields, gloss = data_line.split(" | ", 2)
        data_parts = fields.split(" ")

        synset_id = pos_prefix + data_parts.shift
        lexical_filenum = data_parts.shift
        synset_type = data_parts.shift
        word_count = data_parts.shift.to_i(16) # parsed as hexadecimal
        words = Array.new(word_count) { "#{data_parts.shift}.#{data_parts.shift}" }
        # each relation is four consecutive fields, stored dot-joined
        relations = Array.new(data_parts.shift.to_i) { Array.new(4) { data_parts.shift }.join('.') }

        table[synset_id] = { "synset_id" => synset_id, "lexical_filenum" => lexical_filenum, "synset_type" => synset_type,
                             "words" => words.join('|'), "relations" => relations.join('|'),
                             # never store nil, and drop the line terminator
                             "gloss" => gloss.to_s.strip }
      end

    end
  ensure
    # always release the table handle, even when a malformed line aborts the build
    table.close
  end

end
|
data/data/wordnet.tct
ADDED
Binary file
|
data/examples.rb
ADDED
@@ -0,0 +1,28 @@
|
|
1
|
+
#!/usr/bin/env ruby

# Resolve the library relative to this file: the current directory is not on
# the load path of newer rubies, so a bare require 'lib/words' fails there.
require File.expand_path('lib/words', File.dirname(__FILE__))

# Small tour of the API using the lemma "bat".
if __FILE__ == $0

  wordnet = Words::Words.new

  begin
    # look the lemma up once and reuse it for every example below
    bat = wordnet.find('bat')
    last_noun = bat.synsets('noun').last
    last_verb = bat.synsets('verb').last

    puts bat
    puts bat.available_pos.inspect
    puts bat.lemma
    puts bat.synsets('noun')
    puts last_noun.words.inspect
    puts last_noun.relations
    last_noun.relations.each { |relation| puts relation.inspect }
    puts last_noun.methods
    puts last_noun.hyponyms?
    puts last_noun.participle_of_verbs?

    puts last_noun.relations(:hyponym)
    puts last_noun.relations("~")
    puts last_verb.inspect
    puts last_verb.words
    puts last_verb.words_with_num.inspect
  ensure
    # release the dataset even if one of the examples raises
    wordnet.close
  end

end
|
data/lib/words.rb
ADDED
@@ -0,0 +1,229 @@
|
|
1
|
+
# std includes
|
2
|
+
require 'pathname'
|
3
|
+
require 'set'
|
4
|
+
|
5
|
+
# gem includes
|
6
|
+
require 'rubygems'
|
7
|
+
require 'rufus-tokyo'
|
8
|
+
|
9
|
+
module Words
|
10
|
+
|
11
|
+
# Process-wide holder for the open dataset handle. Words assigns it and
# Synset / Words read it.
class WordnetConnection

  class << self
    # The registered dataset handle, or nil when none has been assigned yet.
    # Stored as a class-level instance variable rather than an @@ class
    # variable, so reading it before assignment returns nil instead of raising
    # NameError and the value is not shared with subclasses.
    attr_accessor :wordnet_connection
  end

end
|
22
|
+
|
23
|
+
# One pointer from a synset (optionally from one particular word in it) to
# another synset, parsed from an entry of a synset record's "relations" field.
class Relation

  # Pointer symbols as stored in the dataset, mapped to readable names.
  RELATION_TO_SYMBOL = { "-c" => :member_of_this_domain_topic, "+" => :derivationally_related_form, "%p" => :part_meronym, "~i" => :instance_hyponym, "@" => :hypernym,
                         ";r" => :domain_of_synset_region, "!" => :antonym, "#p" => :part_holonym, "%s" => :substance_meronym, ";u" => :domain_of_synset_usage,
                         "-r" => :member_of_this_domain_region, "#s" => :substance_holonym, "=" => :attribute, "-u" => :member_of_this_domain_usage, ";c" => :domain_of_synset_topic,
                         "%m"=> :member_meronym, "~" => :hyponym, "@i" => :instance_hypernym, "#m" => :member_holonym, "$" => :verb_group, ">" => :cause, "*" => :entailment,
                         "\\" => :pertainym, "<" => :participle_of_verb, "&" => :similar_to, "^" => :see_also }
  SYMBOL_TO_RELATION = RELATION_TO_SYMBOL.invert

  # relation_construct - "symbol.offset.pos.source_dest" string, e.g.
  #                      "~.03053474.n.0000".
  # source_synset      - the object owning this relation; it must respond to
  #                      #words and #synset_id.
  def initialize(relation_construct, source_synset)
    pointer, offset, @pos, @source_dest = relation_construct.split('.')
    @dest_synset_id = @pos + offset
    @symbol = RELATION_TO_SYMBOL[pointer]
    @source_synset = source_synset
  end

  # True when the relation links whole synsets rather than two specific words
  # (the source/destination field is "0000").
  def is_semantic?
    @source_dest == "0000"
  end

  # The word of the source synset this relation starts from, or nil for a
  # semantic relation. The first two hex digits of the source/destination
  # field are the 1-based word number.
  def source_word
    unless defined? @source_word
      @source_word = is_semantic? ? nil : @source_synset.words[@source_dest[0..1].to_i(16) - 1]
    end
    @source_word
  end

  # The word of the destination synset this relation points at, or nil for a
  # semantic relation. The last two hex digits of the source/destination
  # field are the 1-based word number.
  def destination_word
    unless defined? @destination_word
      @destination_word = is_semantic? ? nil : destination.words[@source_dest[2..3].to_i(16) - 1]
    end
    @destination_word
  end

  # Does this relation have the given type?
  #
  # type - a relation name (:hyponym or "hyponym") or a raw pointer symbol
  #        such as "~".
  #
  # Returns true or false; anything unrecognised yields false.
  def relation_type?(type)
    key = type.to_s
    return false if key.empty?
    if SYMBOL_TO_RELATION.include?(key.to_sym)
      key.to_sym == @symbol
    else
      # Raw pointer symbol. The include? guard keeps an unknown key from
      # matching a relation whose own symbol is nil.
      RELATION_TO_SYMBOL.include?(key) && RELATION_TO_SYMBOL[key] == @symbol
    end
  end

  # The relation name as a symbol (nil when the stored pointer symbol is not
  # in RELATION_TO_SYMBOL).
  def relation_type
    @symbol
  end

  # The Synset this relation points to; built on first use and then reused.
  def destination
    @destination = Synset.new(@dest_synset_id) unless defined? @destination
    @destination
  end

  # Human readable description; wording differs for semantic and word-level
  # relations. Computed once.
  def to_s
    unless defined? @to_s
      @to_s = if is_semantic?
        "Semantic #{relation_type.to_s.gsub('_', ' ')} relation between #{@source_synset.synset_id} and #{@dest_synset_id}"
      else
        "#{relation_type.to_s.gsub('_', ' ').capitalize} relation between #{@source_synset.synset_id}'s word \"#{source_word}\" and #{@dest_synset_id}'s word \"#{destination_word}\""
      end
    end
    @to_s
  end

  def inspect
    { :symbol => @symbol, :dest_synset_id => @dest_synset_id, :pos => @pos, :source_dest => @source_dest }.inspect
  end

end
|
84
|
+
|
85
|
+
# A set of synonymous words sharing one meaning, backed by the record stored
# under its synset id in the dataset.
class Synset

  SYNSET_TYPE_TO_SYMBOL = {"n" => :noun, "v" => :verb, "a" => :adjective, "r" => :adverb, "s" => :adjective_satallite }

  # Defines the convenience readers `<relation>s` and `<relation>s?` (for
  # example #hyponyms and #hyponyms?) for every relation type. The work is
  # done once per class, on first instantiation, instead of on every new
  # instance as before.
  def self.define_relation_helpers
    return if @relation_helpers_defined
    Relation::SYMBOL_TO_RELATION.keys.each do |relation_type|
      define_method("#{relation_type}s?") { relations(relation_type).size > 0 }
      define_method("#{relation_type}s") { relations(relation_type) }
    end
    @relation_helpers_defined = true
  end

  # synset_id - key of the synset record in the dataset, e.g. "n02806379".
  def initialize(synset_id)
    @synset_hash = WordnetConnection::wordnet_connection[synset_id]
    self.class.define_relation_helpers
  end

  # Part of speech of the synset as a symbol (see SYNSET_TYPE_TO_SYMBOL).
  def synset_type
    SYNSET_TYPE_TO_SYMBOL[@synset_hash["synset_type"]]
  end

  # The words of the synset with underscores turned into spaces.
  def words
    @words = words_with_num.map { |word_with_num| word_with_num[:word] } unless defined? @words
    @words
  end

  # Number of words in the synset.
  def size
    words.size
  end

  # Array of { :word => String, :num => String } built from the
  # "word.num|word.num" field. The number follows the LAST period, so words
  # that themselves contain periods are kept whole.
  def words_with_num
    unless defined? @words_with_num
      @words_with_num = @synset_hash["words"].split('|').map do |entry|
        word, dot, num = entry.rpartition('.')
        word, num = entry, nil if dot.empty? # no number part present
        { :word => word.gsub('_', ' '), :num => num }
      end
    end
    @words_with_num
  end

  def synset_id
    @synset_hash["synset_id"]
  end

  def gloss
    @synset_hash["gloss"]
  end

  def inspect
    @synset_hash.inspect
  end

  # Relations of this synset, parsed once and then reused.
  #
  # type - a relation name (:hyponym / "hyponym") or pointer symbol ("~") to
  #        filter by; anything else, including the default :all, returns
  #        every relation.
  def relations(type = :all)
    @relations = @synset_hash["relations"].split('|').map { |relation| Relation.new(relation, self) } unless defined? @relations
    if Relation::SYMBOL_TO_RELATION.include?(type.to_sym)
      @relations.select { |relation| relation.relation_type == type.to_sym }
    elsif Relation::RELATION_TO_SYMBOL.include?(type.to_s)
      wanted = Relation::RELATION_TO_SYMBOL[type.to_s]
      @relations.select { |relation| relation.relation_type == wanted }
    else
      @relations
    end
  end

  def to_s
    @to_s = "#{synset_type.to_s.capitalize} including word(s): #{words.map { |word| '"' + word + '"' }.join(', ')} meaning: #{gloss}" unless defined? @to_s
    @to_s
  end

end
|
150
|
+
|
151
|
+
# A dictionary word form together with the ids of every synset it belongs to,
# backed by the lemma record read from the dataset.
class Lemma

  POS_TO_SYMBOL = {"n" => :noun, "v" => :verb, "a" => :adjective, "r" => :adverb}
  SYMBOL_TO_POS = POS_TO_SYMBOL.invert

  # Convenience readers per part of speech: #nouns / #nouns?, #verbs /
  # #verbs?, #adjectives / #adjectives?, #adverbs / #adverbs?. They are
  # defined once when the class is loaded rather than on every instantiation.
  SYMBOL_TO_POS.keys.each do |pos|
    define_method("#{pos}s?") { synsets(pos).size > 0 }
    define_method("#{pos}s") { synsets(pos) }
  end

  # lemma_hash - record with the string keys "lemma" and "synset_ids"
  #              (synset ids joined by '|').
  def initialize(lemma_hash)
    @lemma_hash = lemma_hash
  end

  # The word form with underscores turned into spaces.
  def lemma
    @lemma = @lemma_hash["lemma"].gsub('_', ' ') unless defined? @lemma
    @lemma
  end

  # Parts of speech (as symbols) for which this lemma has at least one
  # synset, in order of first appearance among the synset ids.
  def available_pos
    unless defined? @available_pos
      # the first character of a synset id is its part-of-speech code
      @available_pos = synset_ids.map { |synset_id| POS_TO_SYMBOL[synset_id[0,1]] }.uniq
    end
    @available_pos
  end

  def to_s
    @to_s = [lemma, " " + available_pos.join("/")].join(",") unless defined? @to_s
    @to_s
  end

  # Synsets of this lemma.
  #
  # pos - a part-of-speech name (:noun / "noun") or one-letter code ("n") to
  #       filter by; anything else, including the default :all, selects every
  #       synset.
  #
  # Returns an Array of freshly built Synset objects.
  def synsets(pos = :all)
    wanted_code = if SYMBOL_TO_POS.include?(pos.to_sym)
      SYMBOL_TO_POS[pos.to_sym]
    elsif POS_TO_SYMBOL.include?(pos.to_s)
      pos.to_s
    end
    relevant_ids = wanted_code ? synset_ids.select { |synset_id| synset_id[0,1] == wanted_code } : synset_ids
    relevant_ids.map { |synset_id| Synset.new synset_id }
  end

  # The raw synset ids of this lemma.
  def synset_ids
    @synset_ids = @lemma_hash["synset_ids"].split('|') unless defined? @synset_ids
    @synset_ids
  end

  def inspect
    @lemma_hash.inspect
  end

  alias word lemma

end
|
208
|
+
|
209
|
+
# Entry point of the library: opens the dataset and looks lemmas up in it.
class Words

  # Opens the dataset and registers it as the shared connection.
  #
  # path - location of the Tokyo Cabinet table file. A relative path is tried
  #        against the current directory first and then against the parent of
  #        this file's directory, so the default also resolves when the
  #        library is used from another working directory.
  #
  # Aborts the process with a message when no dataset can be found.
  def initialize(path = 'data/wordnet.tct')
    dataset = Pathname.new(path)
    unless dataset.exist? || dataset.absolute?
      bundled = Pathname.new(File.expand_path(File.join('..', path.to_s), File.dirname(__FILE__)))
      dataset = bundled if bundled.exist?
    end
    # expand_path rather than realpath: realpath raises for a missing file,
    # which used to replace this message with an Errno::ENOENT backtrace.
    abort("Failed to locate the words database at #{dataset.expand_path}") unless dataset.exist?
    WordnetConnection.wordnet_connection = Rufus::Tokyo::Table.new(dataset.to_s)
  end

  # Looks a word up in the dataset.
  #
  # Returns a Lemma, or nil when the dataset has no record for the word
  # (previously a Lemma wrapping nil was returned, which failed on first use).
  def find(word)
    lemma_hash = WordnetConnection.wordnet_connection[word]
    lemma_hash.nil? ? nil : Lemma.new(lemma_hash)
  end

  # Closes the underlying dataset handle.
  def close
    WordnetConnection.wordnet_connection.close
  end

end
|
228
|
+
|
229
|
+
end
|
data/test/helper.rb
ADDED
data/test/test_words.rb
ADDED
metadata
ADDED
@@ -0,0 +1,86 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: words
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.1.0
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Roja Buck
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
|
12
|
+
date: 2010-01-14 00:00:00 +00:00
|
13
|
+
default_executable:
|
14
|
+
dependencies:
|
15
|
+
- !ruby/object:Gem::Dependency
|
16
|
+
name: trollop
|
17
|
+
type: :development
|
18
|
+
version_requirement:
|
19
|
+
version_requirements: !ruby/object:Gem::Requirement
|
20
|
+
requirements:
|
21
|
+
- - ">="
|
22
|
+
- !ruby/object:Gem::Version
|
23
|
+
version: "1.15"
|
24
|
+
version:
|
25
|
+
- !ruby/object:Gem::Dependency
|
26
|
+
name: rufus-tokyo
|
27
|
+
type: :runtime
|
28
|
+
version_requirement:
|
29
|
+
version_requirements: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - ">="
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: 1.0.5
|
34
|
+
version:
|
35
|
+
description: "A fast, easy to use interface to WordNet\xC2\xAE with cross ruby distribution compatability. We use TokyoCabinet to store the dataset and the excellent rufus-tokyo to interface with it. This allows us to have full compatability across ruby distributions while still remaining both fast and simple to use."
|
36
|
+
email: roja@arbia.co.uk
|
37
|
+
executables: []
|
38
|
+
|
39
|
+
extensions: []
|
40
|
+
|
41
|
+
extra_rdoc_files:
|
42
|
+
- LICENSE
|
43
|
+
- README.markdown
|
44
|
+
files:
|
45
|
+
- .gitignore
|
46
|
+
- LICENSE
|
47
|
+
- README.markdown
|
48
|
+
- Rakefile
|
49
|
+
- VERSION
|
50
|
+
- build_dataset.rb
|
51
|
+
- data/wordnet.tct
|
52
|
+
- examples.rb
|
53
|
+
- lib/words.rb
|
54
|
+
- test/helper.rb
|
55
|
+
- test/test_words.rb
|
56
|
+
has_rdoc: true
|
57
|
+
homepage: http://github.com/roja/words
|
58
|
+
licenses: []
|
59
|
+
|
60
|
+
post_install_message:
|
61
|
+
rdoc_options:
|
62
|
+
- --charset=UTF-8
|
63
|
+
require_paths:
|
64
|
+
- lib
|
65
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
66
|
+
requirements:
|
67
|
+
- - ">="
|
68
|
+
- !ruby/object:Gem::Version
|
69
|
+
version: "0"
|
70
|
+
version:
|
71
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
72
|
+
requirements:
|
73
|
+
- - ">="
|
74
|
+
- !ruby/object:Gem::Version
|
75
|
+
version: "0"
|
76
|
+
version:
|
77
|
+
requirements: []
|
78
|
+
|
79
|
+
rubyforge_project:
|
80
|
+
rubygems_version: 1.3.5
|
81
|
+
signing_key:
|
82
|
+
specification_version: 3
|
83
|
+
summary: "A fast, easy to use interface to WordNet\xC2\xAE with cross ruby distribution compatability."
|
84
|
+
test_files:
|
85
|
+
- test/test_words.rb
|
86
|
+
- test/helper.rb
|