wordnet 0.0.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/convertdb.rb ADDED
@@ -0,0 +1,417 @@
1
+ #!/usr/bin/ruby
2
+ #
3
+ # Conversion script for Ruby-WordNet
4
+ #
5
+ # == Synopsis
6
+ #
7
+ # ./convertdb.rb [DATADIR]
8
+ #
9
+ # == Authors
10
+ #
11
+ # This is a port of Dan Brian's convertdb.pl in the Lingua::Wordnet
12
+ # distribution. It requires the 'strscan' library, which is in the standard
13
+ # library of Ruby 1.8.
14
+ #
15
+ # * Michael Granger <ged@FaerieMUD.org>
16
+ #
17
+ # == Copyright
18
+ #
19
+ # Copyright (c) 2003-2008 The FaerieMUD Consortium. All rights reserved.
20
+ #
21
+ # This module is free software. You may use, modify, and/or redistribute this
22
+ # software under the terms of the Perl Artistic License. (See
23
+ # http://language.perl.com/misc/Artistic.html)
24
+ #
25
+ # == Version
26
+ #
27
+ # $Id: convertdb.rb 94 2008-07-25 02:47:42Z deveiant $
28
+ #
29
+
30
+ begin
31
+ base = File::dirname( File::expand_path(__FILE__) )
32
+ $LOAD_PATH.unshift "#{base}/lib" unless $LOAD_PATH.include?( "#{base}/lib" )
33
+ $LOAD_PATH.unshift base
34
+
35
+ unless defined?( UtilityFunctions )
36
+ require "#{base}/utils.rb"
37
+ include UtilityFunctions
38
+ end
39
+ end
40
+
41
+ require 'pathname'
42
+ require 'strscan'
43
+ require 'wordnet'
44
+ require 'optparse'
45
+ require 'fileutils'
46
+
47
+
48
+ # Globals: Index of words => senses, StringScanner for parsing.
49
+ $senseIndex = {}
50
+ $scanner = StringScanner::new( "" )
51
+
52
+ class WordNetConverter
53
+
54
+ # Source WordNet files
55
+ IndexFiles = %w[ index.noun index.verb index.adj index.adv ]
56
+ MorphFiles = {
57
+ 'adj.exc' => WordNet::Adjective,
58
+ 'adv.exc' => WordNet::Adverb,
59
+ 'noun.exc' => WordNet::Noun,
60
+ 'verb.exc' => WordNet::Verb,
61
+ 'cousin.exc' => '',
62
+ }
63
+ DataFiles = {
64
+ 'data.adj' => WordNet::Adjective,
65
+ 'data.adv' => WordNet::Adverb,
66
+ 'data.noun' => WordNet::Noun,
67
+ 'data.verb' => WordNet::Verb,
68
+ }
69
+
70
+ # Struct which represents a list of files, a database, and a processor function
71
+ # for moving records from each of the files into the database.
72
+ Fileset = Struct::new( "WordNetFileset", :files, :name, :db, :processor )
73
+
74
+ # How many records to insert between commits
75
+ CommitThreshold = 2000
76
+
77
+ # Temporary location for the lexicon data files
78
+ BuildDir = Pathname.new( __FILE__ ).expand_path.dirname +
79
+ Pathname.new( WordNet::Lexicon::DEFAULT_DB_ENV ).basename
80
+
81
+
82
+ ### Create a new converter that will dump WordNet dictionary files into a BerkeleyDB
83
+ ### in the given +builddir+
84
+ def initialize( builddir=BuildDir )
85
+ @builddir = Pathname.new( builddir )
86
+ end
87
+
88
+
89
+ ### Convert the various dict files from the WordNet project into a BerkeleyDB database
90
+ def convertdb( errorLimit=0 )
91
+ $stderr.sync = $stdout.sync = true
92
+ header "WordNet Lexicon Converter"
93
+
94
+ # Make sure the user knows what they're in for
95
+ message "This program will convert WordNet data files into databases\n"\
96
+ "used by Ruby-WordNet. This will not affect existing WordNet files,\n"\
97
+ "but will require up to 40Mb of disk space.\n"
98
+ exit unless /^y/i =~ prompt_with_default("Continue?", "y")
99
+
100
+ # Open the database and check to be sure it's empty. Confirm overwrite if
101
+ # not. Checkpoint and set up logging proc if debugging.
102
+ if @builddir.exist? && ( @builddir + 'data' ).exist?
103
+ message ">>> Warning: Existing data in the Ruby-WordNet databases\n"\
104
+ "will be overwritten.\n"
105
+ abort( "user cancelled." ) unless
106
+ /^y/i =~ prompt_with_default( "Continue?", "n" )
107
+ @builddir.rmtree
108
+ end
109
+
110
+ # Find the source data files
111
+ default = nil
112
+ wndirs = Pathname.glob( Pathname.getwd + 'WordNet-*' )
113
+ localdict = Pathname.getwd + 'dict'
114
+ if !wndirs.empty?
115
+ default = wndirs.first + 'dict'
116
+ elsif localdict.exist?
117
+ default = localdict
118
+ else
119
+ default = '/usr/local/WordNet-3.0/dict'
120
+ end
121
+
122
+ message "Where can I find the WordNet data files?\n"
123
+ datadir = prompt_with_default( "Data directory", default )
124
+ datadir = Pathname.new( datadir )
125
+
126
+ abort( "Directory '#{datadir}' does not exist" ) unless datadir.exist?
127
+ abort( "'#{datadir}' is not a directory" ) unless datadir.directory?
128
+ testfile = datadir + "data.noun"
129
+ abort( "'#{datadir}' doesn't seem to contain the necessary files.") unless testfile.exist?
130
+
131
+ # Open the lexicon readwrite into the temporary datadir
132
+ @builddir.mkpath
133
+ lexicon = WordNet::Lexicon::new( @builddir.to_s, 0666 )
134
+
135
+ # Process each fileset
136
+ [ # Fileset, name, database handle, processor
137
+ Fileset::new( IndexFiles, "index", lexicon.index_db, method(:parse_index_line) ),
138
+ Fileset::new( MorphFiles, "morph", lexicon.morph_db, method(:parse_morph_line) ),
139
+ Fileset::new( DataFiles, "data", lexicon.data_db, method(:parse_synset_line) ),
140
+ ].each do |set|
141
+ message "Converting %s files...\n" % set.name
142
+ set.db.truncate
143
+
144
+ # Process each file in the set with the appropriate processor method and
145
+ # insert results into the corresponding table.
146
+ set.files.each do |file,pos|
147
+ message " #{file}..."
148
+
149
+ filepath = File::join( datadir, file )
150
+ if !File::exists?( filepath )
151
+ message "missing: skipped\n"
152
+ next
153
+ end
154
+
155
+ txn, dbh = lexicon.env.txn_begin( 0, set.db )
156
+ entries = lineNumber = errors = 0
157
+ File::readlines( filepath ).each do |line|
158
+ lineNumber += 1
159
+ next if /^\s/ =~ line
160
+
161
+ key, value = set.processor.call( line.chomp, lineNumber, pos )
162
+ unless key
163
+ errors += 1
164
+ if errorLimit.nonzero? && errors >= errorLimit
165
+ abort( "Too many errors" )
166
+ end
167
+ end
168
+
169
+ dbh[ key ] = value
170
+ entries += 1
171
+ print "%d%s" % [ entries, "\x08" * entries.to_s.length ]
172
+
173
+ # Commit and start a new transaction every CommitThreshold (2000) records
174
+ if (entries % CommitThreshold).zero?
175
+ print "."
176
+ txn.commit( BDB::TXN_NOSYNC )
177
+ txn, dbh = lexicon.env.txn_begin( 0, set.db )
178
+ end
179
+ end
180
+
181
+ message "committing..."
182
+ txn.commit( BDB::TXN_SYNC )
183
+ message "done (%d entries, %d errors).\n" %
184
+ [ entries, errors ]
185
+ end
186
+
187
+ lock_stats = lexicon.env.lock_stat
188
+ message "Lock statistics:\n"
189
+ puts " Lock objects: #{lock_stats['st_nobjects']}/#{lock_stats['st_maxnobjects']}",
190
+ " Locks: #{lock_stats['st_nlocks']}/#{lock_stats['st_maxnlocks']}",
191
+ " Lockers: #{lock_stats['st_nlockers']}/#{lock_stats['st_maxnlockers']}"
192
+
193
+
194
+ message "Checkpointing DB and cleaning logs..."
195
+ lexicon.checkpoint
196
+ lexicon.clean_logs
197
+ puts "done."
198
+ end
199
+
200
+ message "done.\n\n"
201
+ end
202
+
203
+
204
+ #######
205
+ private
206
+ #######
207
+
208
+ # Index entry patterns
209
+ IndexEntry = /^(\S+)\s(\w)\s(\d+)\s(\d+)\s/
210
+ PointerSymbol = /(\S{1,2})\s/
211
+ SenseCounts = /(\d+)\s(\d+)\s/
212
+ SynsetId = /(\d{8})\s*/
213
+
214
+ ### Parse an entry from one of the index files and return the key and
215
+ ### data. Returns +nil+ if any part of the entry isn't able to be parsed. The
216
+ ### +pos+ argument is not used -- it's just to make the interface between all
217
+ ### three processor methods the same.
218
+ def parse_index_line( string, lineNumber, pos=nil )
219
+ $scanner.string = string
220
+ synsets = []
221
+ lemma, pos, polycnt = nil, nil, nil
222
+
223
+ raise "whole error" unless $scanner.scan( IndexEntry )
224
+ lemma, pos, polycnt, pcnt = $scanner[1], $scanner[2], $scanner[3], $scanner[4]
225
+
226
+ # Discard pointer symbols
227
+ pcnt.to_i.times do |i|
228
+ $scanner.skip( PointerSymbol ) or raise "couldn't skip pointer #{i}"
229
+ end
230
+
231
+ # Parse sense and tagsense counts
232
+ $scanner.scan( SenseCounts ) or raise "couldn't parse sense counts"
233
+ senseCount, tagSenseCount = $scanner[1], $scanner[2]
234
+
235
+ # Find synsets
236
+ senseCount.to_i.times do |i|
237
+ $scanner.scan( SynsetId ) or raise "couldn't parse synset #{i}"
238
+ synset = $scanner[1]
239
+ synsets.push( synset )
240
+ $senseIndex[ synset + "%" + pos + "%" + lemma ] = i.to_s
241
+ end
242
+
243
+ # Make the index entry and return it
244
+ key = lemma + "%" + pos
245
+ data = synsets.join(WordNet::SUB_DELIM)
246
+
247
+ return key, data
248
+ rescue => err
249
+ message "Index entry did not parse: %s at '%s...' (line %d)\n\t%s\n" % [
250
+ err.message,
251
+ $scanner.rest[0,20],
252
+ lineNumber,
253
+ err.backtrace[0]
254
+ ]
255
+ return nil
256
+ end
257
+
258
+
259
+ ### "Parse" a morph line and return it as a key and value.
260
+ def parse_morph_line( string, lineNumber, pos )
261
+ key, value = string.split
262
+ return "#{key}%#{pos}", value
263
+ rescue => err
264
+ message "Morph entry did not parse: %s for %s (pos = %s, line %d)\n\t%s\n" % [
265
+ err.message,
266
+ string.inspect,
267
+ pos.inspect,
268
+ lineNumber,
269
+ err.backtrace[0]
270
+ ]
271
+ return nil
272
+ end
273
+
274
+
275
+ # Synset data patterns
276
+ Synset = /(\d+)\s(\d{2})\s(\w)\s(\w{2})\s/
277
+ SynWord = /(\S+)\s(\w)*\s*/
278
+ SynPtrCnt = /(\d{3})\s/
279
+ SynPtr = /(\S{1,2})\s(\d+)\s(\w)\s(\w{4})\s/
280
+ SynFrameCnt = /\s*(\d{2})\s/
281
+ SynFrame = /\+\s(\d{2})\s(\w{2})\s/
282
+ SynGloss = /\s*\|\s*(.+)?/
283
+
284
+ ### Parse an entry from a data file and return the key and data. Returns +nil+
285
+ ### if any part of the entry isn't able to be parsed.
286
+ def parse_synset_line( string, lineNumber, pos )
287
+ $scanner.string = string
288
+
289
+ filenum, synsetType, gloss = nil, nil, nil
290
+ words = []
291
+ ptrs = []
292
+ frames = []
293
+
294
+ # Parse the first part of the synset
295
+ $scanner.scan( Synset ) or raise "unable to parse synset"
296
+ offset, filenum, synsetType, wordCount =
297
+ $scanner[1], $scanner[2], $scanner[3], $scanner[4]
298
+
299
+ # Parse the words
300
+ wordCount.to_i(16).times do |i|
301
+ $scanner.scan( SynWord ) or raise "unable to parse word #{i}"
302
+ word, lexid = $scanner[1], $scanner[2]
303
+ senseKey = (offset + "%" + pos + "%" + word).downcase
304
+ if !$senseIndex.key?( senseKey )
305
+ newKey = senseKey.sub( /\(\w+\)$/, '' )
306
+ if !$senseIndex.key?( newKey )
307
+ raise "Sense index does not contain sense '#{senseKey}' "\
308
+ "(tried #{newKey}, too)."
309
+ end
310
+ senseKey = newKey
311
+ end
312
+
313
+ words.push( word + "%" + $senseIndex[senseKey].to_s )
314
+ end
315
+
316
+ # Parse pointers
317
+ if $scanner.scan( SynPtrCnt )
318
+ $scanner[1].to_i.times do |i|
319
+ $scanner.scan( SynPtr ) or raise "unable to parse synptr #{i}"
320
+ ptrs.push "%s %s%%%s %s" % [
321
+ $scanner[1],
322
+ $scanner[2],
323
+ $scanner[3],
324
+ $scanner[4],
325
+ ]
326
+ end
327
+ else
328
+ raise "Couldn't parse pointer count"
329
+ end
330
+
331
+ # Parse frames if this synset is a verb
332
+ if synsetType == WordNet::Verb
333
+ if $scanner.scan( SynFrameCnt )
334
+ $scanner[1].to_i.times do |i|
335
+ $scanner.scan( SynFrame ) or raise "unable to parse frame #{i}"
336
+ frames.push "#{$scanner[1]} #{$scanner[2]}"
337
+ end
338
+ else
339
+ raise "Couldn't parse frame count"
340
+ end
341
+ end
342
+
343
+ # Find the gloss
344
+ if $scanner.scan( SynGloss )
345
+ gloss = $scanner[1].strip
346
+ end
347
+
348
+ # This should never happen, as the gloss matches pretty much anything to
349
+ # the end of line.
350
+ if !$scanner.empty?
351
+ raise "Trailing miscellaneous found at end of entry"
352
+ end
353
+
354
+ # Build the synset entry and return it
355
+ synsetType = WordNet::Adjective if synsetType == WordNet::Other
356
+ key = [ offset, synsetType ].join("%")
357
+ data = [
358
+ filenum,
359
+ words.join( WordNet::SUB_DELIM ),
360
+ ptrs.join( WordNet::SUB_DELIM ),
361
+ frames.join( WordNet::SUB_DELIM ),
362
+ gloss,
363
+ ].join( WordNet::DELIM )
364
+
365
+ return key, data
366
+ rescue => err
367
+ message "Synset did not parse: %s at '%s...' (pos = %s, line %d)\n\t%s\n" % [
368
+ err.message,
369
+ $scanner.rest[0,20],
370
+ pos.inspect,
371
+ lineNumber,
372
+ err.backtrace[0]
373
+ ]
374
+ return nil
375
+ end
376
+
377
+ end # class WordNetConverter
378
+
379
+
380
+ # Start the program if it's run directly
381
+ if $0 == __FILE__
382
+ errorLimit = 0
383
+
384
+ ARGV.options {|oparser|
385
+ oparser.banner = "Usage: #{File::basename($0)} -dv\n"
386
+
387
+ # Debugging on/off
388
+ oparser.on( "--debug", "-d", TrueClass, "Turn debugging on" ) {
389
+ $DEBUG = true
390
+ debugMsg "Turned debugging on."
391
+ }
392
+
393
+ # Verbose
394
+ oparser.on( "--verbose", "-v", TrueClass, "Verbose progress messages" ) {
395
+ $VERBOSE = true
396
+ debugMsg "Turned verbose on."
397
+ }
398
+
399
+ # Error-limit
400
+ oparser.on( "--error-limit=COUNT", "-eCOUNT", Integer,
401
+ "Error limit -- quit after COUNT errors" ) {|arg|
402
+ errorLimit = arg.to_i
403
+ debugMsg "Set error limit to #{errorLimit}"
404
+ }
405
+
406
+ # Handle the 'help' option
407
+ oparser.on( "--help", "-h", "Display this text." ) {
408
+ $stderr.puts oparser
409
+ exit!(0)
410
+ }
411
+
412
+ oparser.parse!
413
+ }
414
+
415
+ WordNetConverter.new.convertdb( errorLimit )
416
+ end
417
+
@@ -0,0 +1,27 @@
1
+ #!/usr/bin/ruby -w
2
+ #
3
+ # Add a synset for laced boots
4
+ #
5
+
6
+ $: << "lib"
7
+ require "WordNet"
8
+
9
+ lex = WordNet::Lexicon.new( "ruby-wordnet" )
10
+
11
+ boot = lex.lookup_synsets( "boot", "n", 1 )
12
+ laced_boot = lex.create_synset( "laced boot", "n" )
13
+ tongue = lex.lookup_synsets( "tongue", "n", 6 )
14
+
15
+ laced_boot.add_hypernyms( boot )
16
+ laced_boot.add_component_meronyms( tongue )
17
+
18
+ lex.unlock {
19
+ laced_boot.write
20
+ boot.write
21
+ tongue.write
22
+ }
23
+
24
+
25
+
26
+
27
+
@@ -0,0 +1,36 @@
1
+ #!/usr/bin/ruby -w
2
+ #
3
+ # Find all articles of clothing that have collars (Adapted from the synopsis
4
+ # of Lingua::Wordnet::Analysis)
5
+ #
6
+
7
+ $LOAD_PATH.unshift "lib"
8
+ require "wordnet"
9
+
10
+ # Create the lexicon
11
+ lex = WordNet::Lexicon.new
12
+
13
+ # Look up the clothing synset as the origin
14
+ clothing = lex.lookup_synsets( "clothing", WordNet::Noun, 1 )
15
+
16
+ part_word = ARGV.shift || "collar"
17
+ part = lex.lookup_synsets( part_word, WordNet::Noun, 1 ) or
18
+ abort( "Couldn't find synset for #{part_word}" )
19
+
20
+
21
+ puts "Looking for instances of:",
22
+ " #{part}",
23
+ "in the hyponyms of",
24
+ " #{clothing}",
25
+ ""
26
+
27
+ # Now traverse all hyponyms of the clothing synset, and check for "part" among
28
+ # each one's meronyms, printing any we find
29
+ clothing.traverse( :hyponyms ) do |syn,depth|
30
+ if syn.search( :meronyms, part )
31
+ puts "Has a #{part_word}: #{syn}"
32
+ else
33
+ puts "Doesn't have a #{part_word}: #{syn}" if $DEBUG
34
+ end
35
+ end
36
+