words 0.2.0 → 0.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.markdown +6 -2
- data/Rakefile +6 -2
- data/VERSION +1 -1
- data/bin/build_wordnet +86 -86
- data/examples.rb +3 -2
- data/lib/words.rb +1 -1
- data/words.gemspec +4 -3
- metadata +4 -4
data/README.markdown
CHANGED
@@ -56,8 +56,12 @@ Then your ready to rock and roll. :)
|
|
56
56
|
To build the wordnet dataset (or index for pure) file yourself, from the original wordnet files, you can use the bundled "build_wordnet" command
|
57
57
|
|
58
58
|
build_wordnet -h # this will give you the usage information
|
59
|
-
|
60
|
-
|
59
|
+
|
60
|
+
# this would attempt to build the tokyo backend data locating the original wordnet files through a search...
|
61
|
+
sudo build_wordnet -v --build-tokyo
|
62
|
+
|
63
|
+
# this would attempt to build the pure backend index locating the original wordnet files through a search...
|
64
|
+
sudo build_wordnet -v --build-pure
|
61
65
|
|
62
66
|
## Usage ##
|
63
67
|
|
data/Rakefile
CHANGED
@@ -5,8 +5,8 @@ begin
|
|
5
5
|
require 'jeweler'
|
6
6
|
Jeweler::Tasks.new do |gem|
|
7
7
|
gem.name = "words"
|
8
|
-
gem.summary = %Q{A
|
9
|
-
gem.description = %Q{
|
8
|
+
gem.summary = %Q{A Fast & Easy to use interface to WordNet® with cross ruby distribution compatability.}
|
9
|
+
gem.description = %Q{Words, with both pure ruby & tokyo-cabinate backends, implements a fast interface to Wordnet® over the same easy-to-use API. The FFI backend makes use of Tokyo Cabinet and the FFI interface, rufus-tokyo, to provide cross ruby distribution compatability and blistering speed. The pure ruby interface operates on a special ruby optimised index along with the basic dictionary files provided by WordNet®. I have attempted to provide ease of use in the form of a simple yet powerful api and installation is a sintch!}
|
10
10
|
gem.email = "roja@arbia.co.uk"
|
11
11
|
gem.homepage = "http://github.com/roja/words"
|
12
12
|
gem.authors = ["Roja Buck"]
|
@@ -14,8 +14,12 @@ begin
|
|
14
14
|
gem.add_dependency 'rufus-tokyo', '>= 1.0.5'
|
15
15
|
gem.executables = [ "build_wordnet" ]
|
16
16
|
gem.default_executable = "build_wordnet"
|
17
|
+
gem.rubyforge_project = 'words'
|
17
18
|
end
|
18
19
|
Jeweler::GemcutterTasks.new
|
20
|
+
Jeweler::RubyforgeTasks.new do |rubyforge|
|
21
|
+
rubyforge.doc_task = "rdoc"
|
22
|
+
end
|
19
23
|
rescue LoadError
|
20
24
|
puts "Jeweler (or a dependency) not available. Install it with: gem install jeweler"
|
21
25
|
end
|
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.2.0
|
1
|
+
0.2.1
|
data/bin/build_wordnet
CHANGED
@@ -19,102 +19,102 @@ def locate_wordnet(base_dir)
|
|
19
19
|
return path + "dict" if (path + "dict/data.noun").exist?
|
20
20
|
end
|
21
21
|
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
wordnet_dir = locate_wordnet opts[:wordnet]
|
43
|
-
abort( "Unable to locate wordnet dictionary in directory #{opts[:wordnet]}. Please check and try again." ) if wordnet_dir.nil?
|
22
|
+
puts "Words Dataset Constructor 2010 (c) Roja Buck"
|
23
|
+
|
24
|
+
opts = Trollop::options do
|
25
|
+
opt :verbose, "Output verbose program detail.", :default => false
|
26
|
+
opt :wordnet, "Location of the wordnet dictionary directory", :default => "Search..."
|
27
|
+
opt :build_tokyo, "Build the tokyo dataset?", :default => false
|
28
|
+
opt :build_pure, "Build the pure ruby dataset?", :default => false
|
29
|
+
end
|
30
|
+
Trollop::die :build_tokyo, "You need to specify whether tokyo dataset or pure ruby index building is required" if !opts[:build_tokyo] && !opts[:build_pure]
|
31
|
+
puts "Verbose mode enabled" if (VERBOSE = opts[:verbose])
|
32
|
+
|
33
|
+
gem_path = Pathname.new "#{File.dirname(__FILE__)}/.."
|
34
|
+
abort "Ensure you run the command using sudo or as a Superuser / Administrator" unless gem_path.writable?
|
35
|
+
data_path = gem_path + "data/"
|
36
|
+
data_path.mkpath
|
37
|
+
|
38
|
+
wordnet_dir = nil
|
39
|
+
if opts[:wordnet] == "Search..."
|
40
|
+
['/usr/share/wordnet', '/usr/local/share/wordnet', '/usr/local/WordNet-3.0'].each do |potential_dir|
|
41
|
+
break unless (wordnet_dir = locate_wordnet potential_dir).nil?
|
44
42
|
end
|
43
|
+
abort( "Unable to locate wordnet dictionary. To specify check --help." ) if wordnet_dir.nil?
|
44
|
+
else
|
45
|
+
wordnet_dir = locate_wordnet opts[:wordnet]
|
46
|
+
abort( "Unable to locate wordnet dictionary in directory #{opts[:wordnet]}. Please check and try again." ) if wordnet_dir.nil?
|
47
|
+
end
|
48
|
+
|
49
|
+
# At this point we know we should have a wordnet directory within wordnet_dir
|
50
|
+
puts "Found wordnet files in #{wordnet_dir}..." if VERBOSE
|
51
|
+
|
52
|
+
index_files = POS_FILE_TYPES.map { |pos| wordnet_dir + "index.#{pos}" }
|
53
|
+
data_files = POS_FILE_TYPES.map { |pos| wordnet_dir + "data.#{pos}" }
|
54
|
+
|
55
|
+
(index_files + data_files).each do |required_file|
|
56
|
+
abort( "Unable to locate #{required_file} within the wordnet dictionary. Please check your wordnet copy is valid and try again." ) unless required_file.exist?
|
57
|
+
abort( "Cannot get readable permissions to #{required_file} within the wordnet dictionary. Please check the file permissions and try again." ) unless required_file.readable?
|
58
|
+
end
|
59
|
+
|
60
|
+
# At this point we know we have the correct files, though we don't know their validity
|
61
|
+
puts "Validated existance of wordnet files in #{wordnet_dir}..." if VERBOSE
|
62
|
+
|
63
|
+
# Build data
|
64
|
+
|
65
|
+
index_hash = Hash.new
|
66
|
+
data_hash = Hash.new
|
67
|
+
POS_FILE_TYPES.each do |file_pos|
|
45
68
|
|
46
|
-
|
47
|
-
puts "Found wordnet files in #{wordnet_dir}..." if VERBOSE
|
48
|
-
|
49
|
-
index_files = POS_FILE_TYPES.map { |pos| wordnet_dir + "index.#{pos}" }
|
50
|
-
data_files = POS_FILE_TYPES.map { |pos| wordnet_dir + "data.#{pos}" }
|
69
|
+
puts "Building #{file_pos} indexes..." if VERBOSE
|
51
70
|
|
52
|
-
|
53
|
-
|
54
|
-
|
71
|
+
# add indexes
|
72
|
+
(wordnet_dir + "index.#{file_pos}").each_line do |index_line|
|
73
|
+
next if index_line[0, 2] == " "
|
74
|
+
index_parts = index_line.split(" ")
|
75
|
+
|
76
|
+
lemma, pos, synset_count, pointer_count = index_parts.shift, index_parts.shift, index_parts.shift.to_i, index_parts.shift.to_i
|
77
|
+
pointer_symbols = Array.new(pointer_count).map { POS_FILE_TYPE_TO_SHORT[file_pos] + index_parts.shift }
|
78
|
+
sense_count = index_parts.shift
|
79
|
+
tagsense_count = pos + index_parts.shift
|
80
|
+
synset_ids = Array.new(synset_count).map { POS_FILE_TYPE_TO_SHORT[file_pos] + index_parts.shift }
|
81
|
+
|
82
|
+
index_hash[lemma] = { "synset_ids" => [], "tagsense_counts" => [] } if index_hash[lemma].nil?
|
83
|
+
index_hash[lemma] = { "lemma" => lemma, "synset_ids" => index_hash[lemma]["synset_ids"] + synset_ids, "tagsense_counts" => index_hash[lemma]["tagsense_counts"] + [tagsense_count] }
|
84
|
+
|
55
85
|
end
|
56
86
|
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
# Build data
|
61
|
-
|
62
|
-
index_hash = Hash.new
|
63
|
-
data_hash = Hash.new
|
64
|
-
POS_FILE_TYPES.each do |file_pos|
|
65
|
-
|
66
|
-
puts "Building #{file_pos} indexes..." if VERBOSE
|
87
|
+
if opts[:build_tokyo]
|
88
|
+
puts "Building #{file_pos} data..." if VERBOSE
|
67
89
|
|
68
|
-
# add
|
69
|
-
(wordnet_dir + "
|
70
|
-
next if
|
71
|
-
|
72
|
-
|
73
|
-
lemma, pos, synset_count, pointer_count = index_parts.shift, index_parts.shift, index_parts.shift.to_i, index_parts.shift.to_i
|
74
|
-
pointer_symbols = Array.new(pointer_count).map { POS_FILE_TYPE_TO_SHORT[file_pos] + index_parts.shift }
|
75
|
-
sense_count = index_parts.shift
|
76
|
-
tagsense_count = pos + index_parts.shift
|
77
|
-
synset_ids = Array.new(synset_count).map { POS_FILE_TYPE_TO_SHORT[file_pos] + index_parts.shift }
|
90
|
+
# add data
|
91
|
+
(wordnet_dir + "data.#{file_pos}").each_line do |data_line|
|
92
|
+
next if data_line[0, 2] == " "
|
93
|
+
data_line, gloss = data_line.split(" | ")
|
94
|
+
data_parts = data_line.split(" ")
|
78
95
|
|
79
|
-
|
80
|
-
|
96
|
+
synset_id, lexical_filenum, synset_type, word_count = POS_FILE_TYPE_TO_SHORT[file_pos] + data_parts.shift, data_parts.shift, data_parts.shift, data_parts.shift.to_i(16)
|
97
|
+
words = Array.new(word_count).map { "#{data_parts.shift}.#{data_parts.shift}" }
|
98
|
+
relations = Array.new(data_parts.shift.to_i).map { "#{data_parts.shift}.#{data_parts.shift}.#{data_parts.shift}.#{data_parts.shift}" }
|
81
99
|
|
82
|
-
|
83
|
-
|
84
|
-
if opts[:build_tokyo]
|
85
|
-
puts "Building #{file_pos} data..." if VERBOSE
|
86
|
-
|
87
|
-
# add data
|
88
|
-
(wordnet_dir + "data.#{file_pos}").each_line do |data_line|
|
89
|
-
next if data_line[0, 2] == " "
|
90
|
-
data_line, gloss = data_line.split(" | ")
|
91
|
-
data_parts = data_line.split(" ")
|
92
|
-
|
93
|
-
synset_id, lexical_filenum, synset_type, word_count = POS_FILE_TYPE_TO_SHORT[file_pos] + data_parts.shift, data_parts.shift, data_parts.shift, data_parts.shift.to_i(16)
|
94
|
-
words = Array.new(word_count).map { "#{data_parts.shift}.#{data_parts.shift}" }
|
95
|
-
relations = Array.new(data_parts.shift.to_i).map { "#{data_parts.shift}.#{data_parts.shift}.#{data_parts.shift}.#{data_parts.shift}" }
|
96
|
-
|
97
|
-
data_hash[synset_id] = { "synset_id" => synset_id, "lexical_filenum" => lexical_filenum, "synset_type" => synset_type,
|
100
|
+
data_hash[synset_id] = { "synset_id" => synset_id, "lexical_filenum" => lexical_filenum, "synset_type" => synset_type,
|
98
101
|
"words" => words.join('|'), "relations" => relations.join('|'), "gloss" => gloss.strip }
|
99
|
-
end
|
100
102
|
end
|
101
|
-
|
102
|
-
end
|
103
|
-
|
104
|
-
if opts[:build_tokyo]
|
105
|
-
tokyo_hash = Rufus::Tokyo::Table.new("#{File.dirname(__FILE__)}/../data/wordnet.tct")
|
106
|
-
index_hash.each { |k,v| tokyo_hash[k] = { "lemma" => v["lemma"], "synset_ids" => v["synset_ids"].join('|'), "tagsense_counts" => v["tagsense_counts"].join('|') } }
|
107
|
-
data_hash.each { |k,v| tokyo_hash[k] = v }
|
108
|
-
tokyo_hash.close
|
109
103
|
end
|
110
104
|
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
105
|
+
end
|
106
|
+
|
107
|
+
if opts[:build_tokyo]
|
108
|
+
tokyo_hash = Rufus::Tokyo::Table.new((data_path + "wordnet.tct").to_s)
|
109
|
+
index_hash.each { |k,v| tokyo_hash[k] = { "lemma" => v["lemma"], "synset_ids" => v["synset_ids"].join('|'), "tagsense_counts" => v["tagsense_counts"].join('|') } }
|
110
|
+
data_hash.each { |k,v| tokyo_hash[k] = v }
|
111
|
+
tokyo_hash.close
|
112
|
+
end
|
113
|
+
|
114
|
+
if opts[:build_pure]
|
115
|
+
index = Hash.new
|
116
|
+
index_hash.each { |k,v| index[k] = [v["lemma"], v["tagsense_counts"].join('|'), v["synset_ids"].join('|')] }
|
117
|
+
File.open(data_path + "index.dmp",'w') do |file|
|
118
|
+
file.write Marshal.dump(index)
|
117
119
|
end
|
118
|
-
|
119
|
-
|
120
|
-
end
|
120
|
+
end
|
data/examples.rb
CHANGED
data/lib/words.rb
CHANGED
@@ -21,7 +21,7 @@ module Words
|
|
21
21
|
|
22
22
|
if @data_path.exist?
|
23
23
|
if @connection_type == :tokyo
|
24
|
-
@connection = Rufus::Tokyo::Table.new(@data_path.to_s)
|
24
|
+
@connection = Rufus::Tokyo::Table.new(@data_path.to_s, :mode => 'r')
|
25
25
|
@connected = true
|
26
26
|
elsif @connection_type == :pure
|
27
27
|
# open the index is there
|
data/words.gemspec
CHANGED
@@ -5,13 +5,13 @@
|
|
5
5
|
|
6
6
|
Gem::Specification.new do |s|
|
7
7
|
s.name = %q{words}
|
8
|
-
s.version = "0.2.0"
|
8
|
+
s.version = "0.2.1"
|
9
9
|
|
10
10
|
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
11
11
|
s.authors = ["Roja Buck"]
|
12
12
|
s.date = %q{2010-01-16}
|
13
13
|
s.default_executable = %q{build_wordnet}
|
14
|
-
s.description = %q{
|
14
|
+
s.description = %q{Words, with both pure ruby & tokyo-cabinate backends, implements a fast interface to Wordnet® over the same easy-to-use API. The FFI backend makes use of Tokyo Cabinet and the FFI interface, rufus-tokyo, to provide cross ruby distribution compatability and blistering speed. The pure ruby interface operates on a special ruby optimised index along with the basic dictionary files provided by WordNet®. I have attempted to provide ease of use in the form of a simple yet powerful api and installation is a sintch!}
|
15
15
|
s.email = %q{roja@arbia.co.uk}
|
16
16
|
s.executables = ["build_wordnet"]
|
17
17
|
s.extra_rdoc_files = [
|
@@ -34,8 +34,9 @@ Gem::Specification.new do |s|
|
|
34
34
|
s.homepage = %q{http://github.com/roja/words}
|
35
35
|
s.rdoc_options = ["--charset=UTF-8"]
|
36
36
|
s.require_paths = ["lib"]
|
37
|
+
s.rubyforge_project = %q{words}
|
37
38
|
s.rubygems_version = %q{1.3.5}
|
38
|
-
s.summary = %q{A
|
39
|
+
s.summary = %q{A Fast & Easy to use interface to WordNet® with cross ruby distribution compatability.}
|
39
40
|
s.test_files = [
|
40
41
|
"test/test_words.rb",
|
41
42
|
"test/helper.rb"
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: words
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.0
|
4
|
+
version: 0.2.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Roja Buck
|
@@ -32,7 +32,7 @@ dependencies:
|
|
32
32
|
- !ruby/object:Gem::Version
|
33
33
|
version: 1.0.5
|
34
34
|
version:
|
35
|
-
description: "
|
35
|
+
description: "Words, with both pure ruby & tokyo-cabinate backends, implements a fast interface to Wordnet\xC2\xAE over the same easy-to-use API. The FFI backend makes use of Tokyo Cabinet and the FFI interface, rufus-tokyo, to provide cross ruby distribution compatability and blistering speed. The pure ruby interface operates on a special ruby optimised index along with the basic dictionary files provided by WordNet\xC2\xAE. I have attempted to provide ease of use in the form of a simple yet powerful api and installation is a sintch!"
|
36
36
|
email: roja@arbia.co.uk
|
37
37
|
executables:
|
38
38
|
- build_wordnet
|
@@ -76,11 +76,11 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
76
76
|
version:
|
77
77
|
requirements: []
|
78
78
|
|
79
|
-
rubyforge_project:
|
79
|
+
rubyforge_project: words
|
80
80
|
rubygems_version: 1.3.5
|
81
81
|
signing_key:
|
82
82
|
specification_version: 3
|
83
|
-
summary: "A
|
83
|
+
summary: "A Fast & Easy to use interface to WordNet\xC2\xAE with cross ruby distribution compatability."
|
84
84
|
test_files:
|
85
85
|
- test/test_words.rb
|
86
86
|
- test/helper.rb
|