words 0.2.0 → 0.2.1

data/README.markdown CHANGED
@@ -56,8 +56,12 @@ Then your ready to rock and roll. :)
  To build the wordnet dataset (or index for pure) file yourself, from the original wordnet files, you can use the bundled "build_wordnet" command
 
  build_wordnet -h # this will give you the usage information
- sudo build_wordnet -v --build-tokyo # this would attempt to build the tokyo backend data locating the original wordnet files through a search...
- sudo build_wordnet -v --build-pure # this would attempt to build the pure backend index locating the original wordnet files through a search...
+
+ # this would attempt to build the tokyo backend data locating the original wordnet files through a search...
+ sudo build_wordnet -v --build-tokyo
+
+ # this would attempt to build the pure backend index locating the original wordnet files through a search...
+ sudo build_wordnet -v --build-pure
 
  ## Usage ##
 
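A note for context: build_wordnet writes its output into the installed gem's own data/ directory (which is why the commands above need sudo). A minimal sketch for confirming a build landed, using a hypothetical install path:

```ruby
require 'pathname'

# Hypothetical install location of the words gem; adjust for your system.
data_dir = Pathname.new("/usr/lib/ruby/gems/1.8/gems/words-0.2.1/data")

puts "tokyo dataset built" if (data_dir + "wordnet.tct").exist? # from --build-tokyo
puts "pure index built"    if (data_dir + "index.dmp").exist?   # from --build-pure
```
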
data/Rakefile CHANGED
@@ -5,8 +5,8 @@ begin
  require 'jeweler'
  Jeweler::Tasks.new do |gem|
  gem.name = "words"
- gem.summary = %Q{A fast, easy to use interface to WordNet® with cross ruby distribution compatability.}
- gem.description = %Q{A fast, easy to use interface to WordNet® with cross ruby distribution compatability. We use TokyoCabinet to store the dataset and the excellent rufus-tokyo to interface with it. This allows us to have full compatability across ruby distributions while still remaining both fast and simple to use.}
+ gem.summary = %Q{A fast & easy-to-use interface to WordNet® with cross ruby distribution compatibility.}
+ gem.description = %Q{Words, with both pure Ruby & Tokyo Cabinet backends, implements a fast interface to WordNet® over the same easy-to-use API. The FFI backend makes use of Tokyo Cabinet and the FFI interface, rufus-tokyo, to provide cross ruby distribution compatibility and blistering speed. The pure Ruby interface operates on a special Ruby-optimised index along with the basic dictionary files provided by WordNet®. I have attempted to provide ease of use in the form of a simple yet powerful API, and installation is a cinch!}
  gem.email = "roja@arbia.co.uk"
  gem.homepage = "http://github.com/roja/words"
  gem.authors = ["Roja Buck"]
@@ -14,8 +14,12 @@ begin
  gem.add_dependency 'rufus-tokyo', '>= 1.0.5'
  gem.executables = [ "build_wordnet" ]
  gem.default_executable = "build_wordnet"
+ gem.rubyforge_project = 'words'
  end
  Jeweler::GemcutterTasks.new
+ Jeweler::RubyforgeTasks.new do |rubyforge|
+ rubyforge.doc_task = "rdoc"
+ end
  rescue LoadError
  puts "Jeweler (or a dependency) not available. Install it with: gem install jeweler"
  end
data/VERSION CHANGED
@@ -1 +1 @@
- 0.2.0
+ 0.2.1
data/bin/build_wordnet CHANGED
@@ -19,102 +19,102 @@ def locate_wordnet(base_dir)
  return path + "dict" if (path + "dict/data.noun").exist?
  end
 
- if __FILE__ == $0
-
- puts "Words Dataset Constructor 2010 (c) Roja Buck"
-
- opts = Trollop::options do
- opt :verbose, "Output verbose program detail.", :default => false
- opt :wordnet, "Location of the wordnet dictionary directory", :default => "Search..."
- opt :build_tokyo, "Build the tokyo dataset?", :default => false
- opt :build_pure, "Build the pure ruby dataset?", :default => false
- end
- Trollop::die :build_tokyo, "Either tokyo dataset or pure ruby dataset are required" if !opts[:build_tokyo] && !opts[:build_pure]
- puts "Verbose mode enabled" if (VERBOSE = opts[:verbose])
-
- wordnet_dir = nil
- if opts[:wordnet] == "Search..."
- ['/usr/share/wordnet', '/usr/local/share/wordnet', '/usr/local/WordNet-3.0'].each do |potential_dir|
- break unless (wordnet_dir = locate_wordnet potential_dir).nil?
- end
- abort( "Unable to locate wordnet dictionary. To specify check --help." ) if wordnet_dir.nil?
- else
- wordnet_dir = locate_wordnet opts[:wordnet]
- abort( "Unable to locate wordnet dictionary in directory #{opts[:wordnet]}. Please check and try again." ) if wordnet_dir.nil?
+ puts "Words Dataset Constructor 2010 (c) Roja Buck"
+
+ opts = Trollop::options do
+ opt :verbose, "Output verbose program detail.", :default => false
+ opt :wordnet, "Location of the wordnet dictionary directory", :default => "Search..."
+ opt :build_tokyo, "Build the tokyo dataset?", :default => false
+ opt :build_pure, "Build the pure ruby dataset?", :default => false
+ end
+ Trollop::die :build_tokyo, "You need to specify whether tokyo dataset or pure ruby index building is required" if !opts[:build_tokyo] && !opts[:build_pure]
+ puts "Verbose mode enabled" if (VERBOSE = opts[:verbose])
+
+ gem_path = Pathname.new "#{File.dirname(__FILE__)}/.."
+ abort "Ensure you run the command using sudo or as a Superuser / Administrator" unless gem_path.writable?
+ data_path = gem_path + "data/"
+ data_path.mkpath
+
+ wordnet_dir = nil
+ if opts[:wordnet] == "Search..."
+ ['/usr/share/wordnet', '/usr/local/share/wordnet', '/usr/local/WordNet-3.0'].each do |potential_dir|
+ break unless (wordnet_dir = locate_wordnet potential_dir).nil?
  end
+ abort( "Unable to locate wordnet dictionary. To specify check --help." ) if wordnet_dir.nil?
+ else
+ wordnet_dir = locate_wordnet opts[:wordnet]
+ abort( "Unable to locate wordnet dictionary in directory #{opts[:wordnet]}. Please check and try again." ) if wordnet_dir.nil?
+ end
+
+ # At this point we know we should have a wordnet directory within wordnet_dir
+ puts "Found wordnet files in #{wordnet_dir}..." if VERBOSE
+
+ index_files = POS_FILE_TYPES.map { |pos| wordnet_dir + "index.#{pos}" }
+ data_files = POS_FILE_TYPES.map { |pos| wordnet_dir + "data.#{pos}" }
+
+ (index_files + data_files).each do |required_file|
+ abort( "Unable to locate #{required_file} within the wordnet dictionary. Please check your wordnet copy is valid and try again." ) unless required_file.exist?
+ abort( "Cannot get readable permissions to #{required_file} within the wordnet dictionary. Please check the file permissions and try again." ) unless required_file.readable?
+ end
+
+ # At this point we know we have the correct files, though we don't know there validity
+ puts "Validated existance of wordnet files in #{wordnet_dir}..." if VERBOSE
+
+ # Build data
+
+ index_hash = Hash.new
+ data_hash = Hash.new
+ POS_FILE_TYPES.each do |file_pos|
 
- # At this point we know we should have a wordnet directory within wordnet_dir
- puts "Found wordnet files in #{wordnet_dir}..." if VERBOSE
-
- index_files = POS_FILE_TYPES.map { |pos| wordnet_dir + "index.#{pos}" }
- data_files = POS_FILE_TYPES.map { |pos| wordnet_dir + "data.#{pos}" }
+ puts "Building #{file_pos} indexes..." if VERBOSE
 
- (index_files + data_files).each do |required_file|
- abort( "Unable to locate #{required_file} within the wordnet dictionary. Please check your wordnet copy is valid and try again." ) unless required_file.exist?
- abort( "Cannot get readable permissions to #{required_file} within the wordnet dictionary. Please check the file permissions and try again." ) unless required_file.readable?
+ # add indexes
+ (wordnet_dir + "index.#{file_pos}").each_line do |index_line|
+ next if index_line[0, 2] == "  "
+ index_parts = index_line.split(" ")
+
+ lemma, pos, synset_count, pointer_count = index_parts.shift, index_parts.shift, index_parts.shift.to_i, index_parts.shift.to_i
+ pointer_symbols = Array.new(pointer_count).map { POS_FILE_TYPE_TO_SHORT[file_pos] + index_parts.shift }
+ sense_count = index_parts.shift
+ tagsense_count = pos + index_parts.shift
+ synset_ids = Array.new(synset_count).map { POS_FILE_TYPE_TO_SHORT[file_pos] + index_parts.shift }
+
+ index_hash[lemma] = { "synset_ids" => [], "tagsense_counts" => [] } if index_hash[lemma].nil?
+ index_hash[lemma] = { "lemma" => lemma, "synset_ids" => index_hash[lemma]["synset_ids"] + synset_ids, "tagsense_counts" => index_hash[lemma]["tagsense_counts"] + [tagsense_count] }
+
  end
 
- # At this point we know we have the correct files, though we don't know there validity
- puts "Validated existance of wordnet files in #{wordnet_dir}..." if VERBOSE
-
- # Build data
-
- index_hash = Hash.new
- data_hash = Hash.new
- POS_FILE_TYPES.each do |file_pos|
-
- puts "Building #{file_pos} indexes..." if VERBOSE
+ if opts[:build_tokyo]
+ puts "Building #{file_pos} data..." if VERBOSE
 
- # add indexes
- (wordnet_dir + "index.#{file_pos}").each_line do |index_line|
- next if index_line[0, 2] == "  "
- index_parts = index_line.split(" ")
-
- lemma, pos, synset_count, pointer_count = index_parts.shift, index_parts.shift, index_parts.shift.to_i, index_parts.shift.to_i
- pointer_symbols = Array.new(pointer_count).map { POS_FILE_TYPE_TO_SHORT[file_pos] + index_parts.shift }
- sense_count = index_parts.shift
- tagsense_count = pos + index_parts.shift
- synset_ids = Array.new(synset_count).map { POS_FILE_TYPE_TO_SHORT[file_pos] + index_parts.shift }
+ # add data
+ (wordnet_dir + "data.#{file_pos}").each_line do |data_line|
+ next if data_line[0, 2] == "  "
+ data_line, gloss = data_line.split(" | ")
+ data_parts = data_line.split(" ")
 
- index_hash[lemma] = { "synset_ids" => [], "tagsense_counts" => [] } if index_hash[lemma].nil?
- index_hash[lemma] = { "lemma" => lemma, "synset_ids" => index_hash[lemma]["synset_ids"] + synset_ids, "tagsense_counts" => index_hash[lemma]["tagsense_counts"] + [tagsense_count] }
+ synset_id, lexical_filenum, synset_type, word_count = POS_FILE_TYPE_TO_SHORT[file_pos] + data_parts.shift, data_parts.shift, data_parts.shift, data_parts.shift.to_i(16)
+ words = Array.new(word_count).map { "#{data_parts.shift}.#{data_parts.shift}" }
+ relations = Array.new(data_parts.shift.to_i).map { "#{data_parts.shift}.#{data_parts.shift}.#{data_parts.shift}.#{data_parts.shift}" }
 
- end
-
- if opts[:build_tokyo]
- puts "Building #{file_pos} data..." if VERBOSE
-
- # add data
- (wordnet_dir + "data.#{file_pos}").each_line do |data_line|
- next if data_line[0, 2] == "  "
- data_line, gloss = data_line.split(" | ")
- data_parts = data_line.split(" ")
-
- synset_id, lexical_filenum, synset_type, word_count = POS_FILE_TYPE_TO_SHORT[file_pos] + data_parts.shift, data_parts.shift, data_parts.shift, data_parts.shift.to_i(16)
- words = Array.new(word_count).map { "#{data_parts.shift}.#{data_parts.shift}" }
- relations = Array.new(data_parts.shift.to_i).map { "#{data_parts.shift}.#{data_parts.shift}.#{data_parts.shift}.#{data_parts.shift}" }
-
- data_hash[synset_id] = { "synset_id" => synset_id, "lexical_filenum" => lexical_filenum, "synset_type" => synset_type,
+ data_hash[synset_id] = { "synset_id" => synset_id, "lexical_filenum" => lexical_filenum, "synset_type" => synset_type,
  "words" => words.join('|'), "relations" => relations.join('|'), "gloss" => gloss.strip }
- end
  end
-
- end
-
- if opts[:build_tokyo]
- tokyo_hash = Rufus::Tokyo::Table.new("#{File.dirname(__FILE__)}/../data/wordnet.tct")
- index_hash.each { |k,v| tokyo_hash[k] = { "lemma" => v["lemma"], "synset_ids" => v["synset_ids"].join('|'), "tagsense_counts" => v["tagsense_counts"].join('|') } }
- data_hash.each { |k,v| tokyo_hash[k] = v }
- tokyo_hash.close
  end
 
- if opts[:build_pure]
- index = Hash.new
- index_hash.each { |k,v| index[k] = [v["lemma"], v["tagsense_counts"].join('|'), v["synset_ids"].join('|')] }
- File.open("#{File.dirname(__FILE__)}/../data/index.dmp",'w') do |file|
- file.write Marshal.dump(index)
- end
+ end
+
+ if opts[:build_tokyo]
+ tokyo_hash = Rufus::Tokyo::Table.new((data_path + "wordnet.tct").to_s)
+ index_hash.each { |k,v| tokyo_hash[k] = { "lemma" => v["lemma"], "synset_ids" => v["synset_ids"].join('|'), "tagsense_counts" => v["tagsense_counts"].join('|') } }
+ data_hash.each { |k,v| tokyo_hash[k] = v }
+ tokyo_hash.close
+ end
+
+ if opts[:build_pure]
+ index = Hash.new
+ index_hash.each { |k,v| index[k] = [v["lemma"], v["tagsense_counts"].join('|'), v["synset_ids"].join('|')] }
+ File.open(data_path + "index.dmp",'w') do |file|
+ file.write Marshal.dump(index)
  end
-
-
- end
+ end
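Worth noting from the pure-backend branch above: index.dmp is simply a Marshal dump of a Hash keyed by lemma, each value being [lemma, tagsense_counts, synset_ids] with '|'-joined fields. A minimal sketch of reading an entry back (the path is hypothetical):

```ruby
# Load the index written by `build_wordnet --build-pure`; the path is
# hypothetical - the builder writes into the gem's data/ directory.
index = File.open("/path/to/words/data/index.dmp", "rb") { |f| Marshal.load(f) }

lemma, tagsense_counts, synset_ids = index["dog"]
# synset_ids holds '|'-joined, pos-prefixed offsets into the WordNet data files.
puts synset_ids.split("|").inspect
```
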
data/examples.rb CHANGED
@@ -1,10 +1,11 @@
  #!/usr/bin/env ruby
 
- require 'lib/words'
+ require 'rubygems'
+ require 'words'
 
  if __FILE__ == $0
 
- wordnet = Words::Words.new # :pure
+ wordnet = Words::Words.new :pure
 
  puts wordnet
 
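The example now requires the installed gem (via rubygems) instead of the in-tree lib/ copy, and selects the pure backend explicitly. Going by the @connection_type symbols in lib/words.rb below, selecting the Tokyo Cabinet backend would be the symmetric call; this is an inference from the source, not documented API:

```ruby
require 'rubygems'
require 'words'

# :tokyo assumed from lib/words.rb's @connection_type check; :pure is shown above.
wordnet = Words::Words.new :tokyo
puts wordnet
```
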
data/lib/words.rb CHANGED
@@ -21,7 +21,7 @@ module Words
 
  if @data_path.exist?
  if @connection_type == :tokyo
- @connection = Rufus::Tokyo::Table.new(@data_path.to_s)
+ @connection = Rufus::Tokyo::Table.new(@data_path.to_s, :mode => 'r')
  @connected = true
  elsif @connection_type == :pure
  # open the index is there
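The added :mode => 'r' opens the Tokyo Cabinet table read-only, so lookups no longer require write access to the gem's data directory and cannot mutate the dataset. A standalone sketch of the same rufus-tokyo call, against a hypothetical dataset path:

```ruby
require 'rubygems'
require 'rufus-tokyo'

# Open the table read-only ('r'); build_wordnet created it at data/wordnet.tct.
table = Rufus::Tokyo::Table.new("/path/to/words/data/wordnet.tct", :mode => 'r')
row = table["dog"] # index rows carry "lemma", "synset_ids", "tagsense_counts"
table.close
```
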
data/words.gemspec CHANGED
@@ -5,13 +5,13 @@
 
  Gem::Specification.new do |s|
  s.name = %q{words}
- s.version = "0.2.0"
+ s.version = "0.2.1"
 
  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
  s.authors = ["Roja Buck"]
  s.date = %q{2010-01-16}
  s.default_executable = %q{build_wordnet}
- s.description = %q{A fast, easy to use interface to WordNet® with cross ruby distribution compatability. We use TokyoCabinet to store the dataset and the excellent rufus-tokyo to interface with it. This allows us to have full compatability across ruby distributions while still remaining both fast and simple to use.}
+ s.description = %q{Words, with both pure Ruby & Tokyo Cabinet backends, implements a fast interface to WordNet® over the same easy-to-use API. The FFI backend makes use of Tokyo Cabinet and the FFI interface, rufus-tokyo, to provide cross ruby distribution compatibility and blistering speed. The pure Ruby interface operates on a special Ruby-optimised index along with the basic dictionary files provided by WordNet®. I have attempted to provide ease of use in the form of a simple yet powerful API, and installation is a cinch!}
  s.email = %q{roja@arbia.co.uk}
  s.executables = ["build_wordnet"]
  s.extra_rdoc_files = [
@@ -34,8 +34,9 @@ Gem::Specification.new do |s|
  s.homepage = %q{http://github.com/roja/words}
  s.rdoc_options = ["--charset=UTF-8"]
  s.require_paths = ["lib"]
+ s.rubyforge_project = %q{words}
  s.rubygems_version = %q{1.3.5}
- s.summary = %q{A fast, easy to use interface to WordNet® with cross ruby distribution compatability.}
+ s.summary = %q{A fast & easy-to-use interface to WordNet® with cross ruby distribution compatibility.}
  s.test_files = [
  "test/test_words.rb",
  "test/helper.rb"
metadata CHANGED
@@ -1,7 +1,7 @@
  --- !ruby/object:Gem::Specification
  name: words
  version: !ruby/object:Gem::Version
- version: 0.2.0
+ version: 0.2.1
  platform: ruby
  authors:
  - Roja Buck
@@ -32,7 +32,7 @@ dependencies:
  - !ruby/object:Gem::Version
  version: 1.0.5
  version:
- description: "A fast, easy to use interface to WordNet\xC2\xAE with cross ruby distribution compatability. We use TokyoCabinet to store the dataset and the excellent rufus-tokyo to interface with it. This allows us to have full compatability across ruby distributions while still remaining both fast and simple to use."
+ description: "Words, with both pure Ruby & Tokyo Cabinet backends, implements a fast interface to WordNet\xC2\xAE over the same easy-to-use API. The FFI backend makes use of Tokyo Cabinet and the FFI interface, rufus-tokyo, to provide cross ruby distribution compatibility and blistering speed. The pure Ruby interface operates on a special Ruby-optimised index along with the basic dictionary files provided by WordNet\xC2\xAE. I have attempted to provide ease of use in the form of a simple yet powerful API, and installation is a cinch!"
  email: roja@arbia.co.uk
  executables:
  - build_wordnet
@@ -76,11 +76,11 @@ required_rubygems_version: !ruby/object:Gem::Requirement
  version:
  requirements: []
 
- rubyforge_project:
+ rubyforge_project: words
  rubygems_version: 1.3.5
  signing_key:
  specification_version: 3
- summary: "A fast, easy to use interface to WordNet\xC2\xAE with cross ruby distribution compatability."
+ summary: "A fast & easy-to-use interface to WordNet\xC2\xAE with cross ruby distribution compatibility."
  test_files:
  - test/test_words.rb
  - test/helper.rb