words 0.2.0 → 0.2.1
Sign up to get free protection for your applications and to get access to all the features.
- data/README.markdown +6 -2
- data/Rakefile +6 -2
- data/VERSION +1 -1
- data/bin/build_wordnet +86 -86
- data/examples.rb +3 -2
- data/lib/words.rb +1 -1
- data/words.gemspec +4 -3
- metadata +4 -4
data/README.markdown
CHANGED
@@ -56,8 +56,12 @@ Then you're ready to rock and roll. :)
|
|
56
56
|
To build the wordnet dataset (or index for pure) file yourself, from the original wordnet files, you can use the bundled "build_wordnet" command
|
57
57
|
|
58
58
|
build_wordnet -h # this will give you the usage information
|
59
|
-
|
60
|
-
|
59
|
+
|
60
|
+
# this would attempt to build the tokyo backend data locating the original wordnet files through a search...
|
61
|
+
sudo build_wordnet -v --build-tokyo
|
62
|
+
|
63
|
+
# this would attempt to build the pure backend index locating the original wordnet files through a search...
|
64
|
+
sudo build_wordnet -v --build-pure
|
61
65
|
|
62
66
|
## Usage ##
|
63
67
|
|
data/Rakefile
CHANGED
@@ -5,8 +5,8 @@ begin
|
|
5
5
|
require 'jeweler'
|
6
6
|
Jeweler::Tasks.new do |gem|
|
7
7
|
gem.name = "words"
|
8
|
-
gem.summary = %Q{A
|
9
|
-
gem.description = %Q{
|
8
|
+
gem.summary = %Q{A Fast & Easy to use interface to WordNet® with cross ruby distribution compatability.}
|
9
|
+
gem.description = %Q{Words, with both pure ruby & tokyo-cabinate backends, implements a fast interface to Wordnet® over the same easy-to-use API. The FFI backend makes use of Tokyo Cabinet and the FFI interface, rufus-tokyo, to provide cross ruby distribution compatability and blistering speed. The pure ruby interface operates on a special ruby optimised index along with the basic dictionary files provided by WordNet®. I have attempted to provide ease of use in the form of a simple yet powerful api and installation is a sintch!}
|
10
10
|
gem.email = "roja@arbia.co.uk"
|
11
11
|
gem.homepage = "http://github.com/roja/words"
|
12
12
|
gem.authors = ["Roja Buck"]
|
@@ -14,8 +14,12 @@ begin
|
|
14
14
|
gem.add_dependency 'rufus-tokyo', '>= 1.0.5'
|
15
15
|
gem.executables = [ "build_wordnet" ]
|
16
16
|
gem.default_executable = "build_wordnet"
|
17
|
+
gem.rubyforge_project = 'words'
|
17
18
|
end
|
18
19
|
Jeweler::GemcutterTasks.new
|
20
|
+
Jeweler::RubyforgeTasks.new do |rubyforge|
|
21
|
+
rubyforge.doc_task = "rdoc"
|
22
|
+
end
|
19
23
|
rescue LoadError
|
20
24
|
puts "Jeweler (or a dependency) not available. Install it with: gem install jeweler"
|
21
25
|
end
|
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.2.0
|
1
|
+
0.2.1
|
data/bin/build_wordnet
CHANGED
@@ -19,102 +19,102 @@ def locate_wordnet(base_dir)
|
|
19
19
|
return path + "dict" if (path + "dict/data.noun").exist?
|
20
20
|
end
|
21
21
|
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
wordnet_dir = locate_wordnet opts[:wordnet]
|
43
|
-
abort( "Unable to locate wordnet dictionary in directory #{opts[:wordnet]}. Please check and try again." ) if wordnet_dir.nil?
|
22
|
+
puts "Words Dataset Constructor 2010 (c) Roja Buck"
|
23
|
+
|
24
|
+
opts = Trollop::options do
|
25
|
+
opt :verbose, "Output verbose program detail.", :default => false
|
26
|
+
opt :wordnet, "Location of the wordnet dictionary directory", :default => "Search..."
|
27
|
+
opt :build_tokyo, "Build the tokyo dataset?", :default => false
|
28
|
+
opt :build_pure, "Build the pure ruby dataset?", :default => false
|
29
|
+
end
|
30
|
+
Trollop::die :build_tokyo, "You need to specify whether tokyo dataset or pure ruby index building is required" if !opts[:build_tokyo] && !opts[:build_pure]
|
31
|
+
puts "Verbose mode enabled" if (VERBOSE = opts[:verbose])
|
32
|
+
|
33
|
+
gem_path = Pathname.new "#{File.dirname(__FILE__)}/.."
|
34
|
+
abort "Ensure you run the command using sudo or as a Superuser / Administrator" unless gem_path.writable?
|
35
|
+
data_path = gem_path + "data/"
|
36
|
+
data_path.mkpath
|
37
|
+
|
38
|
+
wordnet_dir = nil
|
39
|
+
if opts[:wordnet] == "Search..."
|
40
|
+
['/usr/share/wordnet', '/usr/local/share/wordnet', '/usr/local/WordNet-3.0'].each do |potential_dir|
|
41
|
+
break unless (wordnet_dir = locate_wordnet potential_dir).nil?
|
44
42
|
end
|
43
|
+
abort( "Unable to locate wordnet dictionary. To specify check --help." ) if wordnet_dir.nil?
|
44
|
+
else
|
45
|
+
wordnet_dir = locate_wordnet opts[:wordnet]
|
46
|
+
abort( "Unable to locate wordnet dictionary in directory #{opts[:wordnet]}. Please check and try again." ) if wordnet_dir.nil?
|
47
|
+
end
|
48
|
+
|
49
|
+
# At this point we know we should have a wordnet directory within wordnet_dir
|
50
|
+
puts "Found wordnet files in #{wordnet_dir}..." if VERBOSE
|
51
|
+
|
52
|
+
index_files = POS_FILE_TYPES.map { |pos| wordnet_dir + "index.#{pos}" }
|
53
|
+
data_files = POS_FILE_TYPES.map { |pos| wordnet_dir + "data.#{pos}" }
|
54
|
+
|
55
|
+
(index_files + data_files).each do |required_file|
|
56
|
+
abort( "Unable to locate #{required_file} within the wordnet dictionary. Please check your wordnet copy is valid and try again." ) unless required_file.exist?
|
57
|
+
abort( "Cannot get readable permissions to #{required_file} within the wordnet dictionary. Please check the file permissions and try again." ) unless required_file.readable?
|
58
|
+
end
|
59
|
+
|
60
|
+
# At this point we know we have the correct files, though we don't know their validity
|
61
|
+
puts "Validated existance of wordnet files in #{wordnet_dir}..." if VERBOSE
|
62
|
+
|
63
|
+
# Build data
|
64
|
+
|
65
|
+
index_hash = Hash.new
|
66
|
+
data_hash = Hash.new
|
67
|
+
POS_FILE_TYPES.each do |file_pos|
|
45
68
|
|
46
|
-
|
47
|
-
puts "Found wordnet files in #{wordnet_dir}..." if VERBOSE
|
48
|
-
|
49
|
-
index_files = POS_FILE_TYPES.map { |pos| wordnet_dir + "index.#{pos}" }
|
50
|
-
data_files = POS_FILE_TYPES.map { |pos| wordnet_dir + "data.#{pos}" }
|
69
|
+
puts "Building #{file_pos} indexes..." if VERBOSE
|
51
70
|
|
52
|
-
|
53
|
-
|
54
|
-
|
71
|
+
# add indexes
|
72
|
+
(wordnet_dir + "index.#{file_pos}").each_line do |index_line|
|
73
|
+
next if index_line[0, 2] == " "
|
74
|
+
index_parts = index_line.split(" ")
|
75
|
+
|
76
|
+
lemma, pos, synset_count, pointer_count = index_parts.shift, index_parts.shift, index_parts.shift.to_i, index_parts.shift.to_i
|
77
|
+
pointer_symbols = Array.new(pointer_count).map { POS_FILE_TYPE_TO_SHORT[file_pos] + index_parts.shift }
|
78
|
+
sense_count = index_parts.shift
|
79
|
+
tagsense_count = pos + index_parts.shift
|
80
|
+
synset_ids = Array.new(synset_count).map { POS_FILE_TYPE_TO_SHORT[file_pos] + index_parts.shift }
|
81
|
+
|
82
|
+
index_hash[lemma] = { "synset_ids" => [], "tagsense_counts" => [] } if index_hash[lemma].nil?
|
83
|
+
index_hash[lemma] = { "lemma" => lemma, "synset_ids" => index_hash[lemma]["synset_ids"] + synset_ids, "tagsense_counts" => index_hash[lemma]["tagsense_counts"] + [tagsense_count] }
|
84
|
+
|
55
85
|
end
|
56
86
|
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
# Build data
|
61
|
-
|
62
|
-
index_hash = Hash.new
|
63
|
-
data_hash = Hash.new
|
64
|
-
POS_FILE_TYPES.each do |file_pos|
|
65
|
-
|
66
|
-
puts "Building #{file_pos} indexes..." if VERBOSE
|
87
|
+
if opts[:build_tokyo]
|
88
|
+
puts "Building #{file_pos} data..." if VERBOSE
|
67
89
|
|
68
|
-
# add
|
69
|
-
(wordnet_dir + "
|
70
|
-
next if
|
71
|
-
|
72
|
-
|
73
|
-
lemma, pos, synset_count, pointer_count = index_parts.shift, index_parts.shift, index_parts.shift.to_i, index_parts.shift.to_i
|
74
|
-
pointer_symbols = Array.new(pointer_count).map { POS_FILE_TYPE_TO_SHORT[file_pos] + index_parts.shift }
|
75
|
-
sense_count = index_parts.shift
|
76
|
-
tagsense_count = pos + index_parts.shift
|
77
|
-
synset_ids = Array.new(synset_count).map { POS_FILE_TYPE_TO_SHORT[file_pos] + index_parts.shift }
|
90
|
+
# add data
|
91
|
+
(wordnet_dir + "data.#{file_pos}").each_line do |data_line|
|
92
|
+
next if data_line[0, 2] == " "
|
93
|
+
data_line, gloss = data_line.split(" | ")
|
94
|
+
data_parts = data_line.split(" ")
|
78
95
|
|
79
|
-
|
80
|
-
|
96
|
+
synset_id, lexical_filenum, synset_type, word_count = POS_FILE_TYPE_TO_SHORT[file_pos] + data_parts.shift, data_parts.shift, data_parts.shift, data_parts.shift.to_i(16)
|
97
|
+
words = Array.new(word_count).map { "#{data_parts.shift}.#{data_parts.shift}" }
|
98
|
+
relations = Array.new(data_parts.shift.to_i).map { "#{data_parts.shift}.#{data_parts.shift}.#{data_parts.shift}.#{data_parts.shift}" }
|
81
99
|
|
82
|
-
|
83
|
-
|
84
|
-
if opts[:build_tokyo]
|
85
|
-
puts "Building #{file_pos} data..." if VERBOSE
|
86
|
-
|
87
|
-
# add data
|
88
|
-
(wordnet_dir + "data.#{file_pos}").each_line do |data_line|
|
89
|
-
next if data_line[0, 2] == " "
|
90
|
-
data_line, gloss = data_line.split(" | ")
|
91
|
-
data_parts = data_line.split(" ")
|
92
|
-
|
93
|
-
synset_id, lexical_filenum, synset_type, word_count = POS_FILE_TYPE_TO_SHORT[file_pos] + data_parts.shift, data_parts.shift, data_parts.shift, data_parts.shift.to_i(16)
|
94
|
-
words = Array.new(word_count).map { "#{data_parts.shift}.#{data_parts.shift}" }
|
95
|
-
relations = Array.new(data_parts.shift.to_i).map { "#{data_parts.shift}.#{data_parts.shift}.#{data_parts.shift}.#{data_parts.shift}" }
|
96
|
-
|
97
|
-
data_hash[synset_id] = { "synset_id" => synset_id, "lexical_filenum" => lexical_filenum, "synset_type" => synset_type,
|
100
|
+
data_hash[synset_id] = { "synset_id" => synset_id, "lexical_filenum" => lexical_filenum, "synset_type" => synset_type,
|
98
101
|
"words" => words.join('|'), "relations" => relations.join('|'), "gloss" => gloss.strip }
|
99
|
-
end
|
100
102
|
end
|
101
|
-
|
102
|
-
end
|
103
|
-
|
104
|
-
if opts[:build_tokyo]
|
105
|
-
tokyo_hash = Rufus::Tokyo::Table.new("#{File.dirname(__FILE__)}/../data/wordnet.tct")
|
106
|
-
index_hash.each { |k,v| tokyo_hash[k] = { "lemma" => v["lemma"], "synset_ids" => v["synset_ids"].join('|'), "tagsense_counts" => v["tagsense_counts"].join('|') } }
|
107
|
-
data_hash.each { |k,v| tokyo_hash[k] = v }
|
108
|
-
tokyo_hash.close
|
109
103
|
end
|
110
104
|
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
105
|
+
end
|
106
|
+
|
107
|
+
if opts[:build_tokyo]
|
108
|
+
tokyo_hash = Rufus::Tokyo::Table.new((data_path + "wordnet.tct").to_s)
|
109
|
+
index_hash.each { |k,v| tokyo_hash[k] = { "lemma" => v["lemma"], "synset_ids" => v["synset_ids"].join('|'), "tagsense_counts" => v["tagsense_counts"].join('|') } }
|
110
|
+
data_hash.each { |k,v| tokyo_hash[k] = v }
|
111
|
+
tokyo_hash.close
|
112
|
+
end
|
113
|
+
|
114
|
+
if opts[:build_pure]
|
115
|
+
index = Hash.new
|
116
|
+
index_hash.each { |k,v| index[k] = [v["lemma"], v["tagsense_counts"].join('|'), v["synset_ids"].join('|')] }
|
117
|
+
File.open(data_path + "index.dmp",'w') do |file|
|
118
|
+
file.write Marshal.dump(index)
|
117
119
|
end
|
118
|
-
|
119
|
-
|
120
|
-
end
|
120
|
+
end
|
data/examples.rb
CHANGED
data/lib/words.rb
CHANGED
@@ -21,7 +21,7 @@ module Words
|
|
21
21
|
|
22
22
|
if @data_path.exist?
|
23
23
|
if @connection_type == :tokyo
|
24
|
-
@connection = Rufus::Tokyo::Table.new(@data_path.to_s)
|
24
|
+
@connection = Rufus::Tokyo::Table.new(@data_path.to_s, :mode => 'r')
|
25
25
|
@connected = true
|
26
26
|
elsif @connection_type == :pure
|
27
27
|
# open the index if it is there
|
data/words.gemspec
CHANGED
@@ -5,13 +5,13 @@
|
|
5
5
|
|
6
6
|
Gem::Specification.new do |s|
|
7
7
|
s.name = %q{words}
|
8
|
-
s.version = "0.2.0"
|
8
|
+
s.version = "0.2.1"
|
9
9
|
|
10
10
|
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
11
11
|
s.authors = ["Roja Buck"]
|
12
12
|
s.date = %q{2010-01-16}
|
13
13
|
s.default_executable = %q{build_wordnet}
|
14
|
-
s.description = %q{
|
14
|
+
s.description = %q{Words, with both pure ruby & tokyo-cabinate backends, implements a fast interface to Wordnet® over the same easy-to-use API. The FFI backend makes use of Tokyo Cabinet and the FFI interface, rufus-tokyo, to provide cross ruby distribution compatability and blistering speed. The pure ruby interface operates on a special ruby optimised index along with the basic dictionary files provided by WordNet®. I have attempted to provide ease of use in the form of a simple yet powerful api and installation is a sintch!}
|
15
15
|
s.email = %q{roja@arbia.co.uk}
|
16
16
|
s.executables = ["build_wordnet"]
|
17
17
|
s.extra_rdoc_files = [
|
@@ -34,8 +34,9 @@ Gem::Specification.new do |s|
|
|
34
34
|
s.homepage = %q{http://github.com/roja/words}
|
35
35
|
s.rdoc_options = ["--charset=UTF-8"]
|
36
36
|
s.require_paths = ["lib"]
|
37
|
+
s.rubyforge_project = %q{words}
|
37
38
|
s.rubygems_version = %q{1.3.5}
|
38
|
-
s.summary = %q{A
|
39
|
+
s.summary = %q{A Fast & Easy to use interface to WordNet® with cross ruby distribution compatability.}
|
39
40
|
s.test_files = [
|
40
41
|
"test/test_words.rb",
|
41
42
|
"test/helper.rb"
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: words
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.0
|
4
|
+
version: 0.2.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Roja Buck
|
@@ -32,7 +32,7 @@ dependencies:
|
|
32
32
|
- !ruby/object:Gem::Version
|
33
33
|
version: 1.0.5
|
34
34
|
version:
|
35
|
-
description: "
|
35
|
+
description: "Words, with both pure ruby & tokyo-cabinate backends, implements a fast interface to Wordnet\xC2\xAE over the same easy-to-use API. The FFI backend makes use of Tokyo Cabinet and the FFI interface, rufus-tokyo, to provide cross ruby distribution compatability and blistering speed. The pure ruby interface operates on a special ruby optimised index along with the basic dictionary files provided by WordNet\xC2\xAE. I have attempted to provide ease of use in the form of a simple yet powerful api and installation is a sintch!"
|
36
36
|
email: roja@arbia.co.uk
|
37
37
|
executables:
|
38
38
|
- build_wordnet
|
@@ -76,11 +76,11 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
76
76
|
version:
|
77
77
|
requirements: []
|
78
78
|
|
79
|
-
rubyforge_project:
|
79
|
+
rubyforge_project: words
|
80
80
|
rubygems_version: 1.3.5
|
81
81
|
signing_key:
|
82
82
|
specification_version: 3
|
83
|
-
summary: "A
|
83
|
+
summary: "A Fast & Easy to use interface to WordNet\xC2\xAE with cross ruby distribution compatability."
|
84
84
|
test_files:
|
85
85
|
- test/test_words.rb
|
86
86
|
- test/helper.rb
|