wordnet 0.0.5

Sign up to get free protection for your applications and to get access to all the features.
data/convertdb.rb ADDED
@@ -0,0 +1,417 @@
1
+ #!/usr/bin/ruby
2
+ #
3
+ # Conversion script for Ruby-WordNet
4
+ #
5
+ # == Synopsis
6
+ #
7
+ # ./convertdb.rb [DATADIR]
8
+ #
9
+ # == Authors
10
+ #
11
+ # This is a port of Dan Brian's convertdb.pl in the Lingua::Wordnet
12
+ # distribution. It requires the 'strscan' library, which is in the standard
13
+ # library of Ruby 1.8.
14
+ #
15
+ # * Michael Granger <ged@FaerieMUD.org>
16
+ #
17
+ # == Copyright
18
+ #
19
+ # Copyright (c) 2003-2008 The FaerieMUD Consortium. All rights reserved.
20
+ #
21
+ # This module is free software. You may use, modify, and/or redistribute this
22
+ # software under the terms of the Perl Artistic License. (See
23
+ # http://language.perl.com/misc/Artistic.html)
24
+ #
25
+ # == Version
26
+ #
27
+ # $Id: convertdb.rb 94 2008-07-25 02:47:42Z deveiant $
28
+ #
29
+
30
begin
	# Put this script's directory and its lib/ subdirectory on the load path
	# so the project's own modules win over installed copies.
	base   = File.dirname( File.expand_path(__FILE__) )
	libdir = "#{base}/lib"
	$LOAD_PATH.unshift( libdir ) unless $LOAD_PATH.include?( libdir )
	$LOAD_PATH.unshift( base )

	# Pull in the project's helper functions (header, message, prompt_with_default, ...)
	unless defined?( UtilityFunctions )
		require "#{base}/utils.rb"
		include UtilityFunctions
	end
end

require 'pathname'
require 'strscan'
require 'wordnet'
require 'optparse'
require 'fileutils'


# Globals shared by the parser methods: the word => sense index built by the
# index pass and a single reusable StringScanner.
$senseIndex = {}
$scanner = StringScanner::new( "" )
51
+
52
+ class WordNetConverter
53
+
54
+ # Source WordNet files
55
+ IndexFiles = %w[ index.noun index.verb index.adj index.adv ]
56
+ MorphFiles = {
57
+ 'adj.exc' => WordNet::Adjective,
58
+ 'adv.exc' => WordNet::Adverb,
59
+ 'noun.exc' => WordNet::Noun,
60
+ 'verb.exc' => WordNet::Verb,
61
+ 'cousin.exc' => '',
62
+ }
63
+ DataFiles = {
64
+ 'data.adj' => WordNet::Adjective,
65
+ 'data.adv' => WordNet::Adverb,
66
+ 'data.noun' => WordNet::Noun,
67
+ 'data.verb' => WordNet::Verb,
68
+ }
69
+
70
+ # Struct which represents a list of files, a database, and a processor function
71
+ # for moving records from each of the files into the database.
72
+ Fileset = Struct::new( "WordNetFileset", :files, :name, :db, :processor )
73
+
74
+ # How many records to insert between commits
75
+ CommitThreshold = 2000
76
+
77
+ # Temporary location for the lexicon data files
78
+ BuildDir = Pathname.new( __FILE__ ).expand_path.dirname +
79
+ Pathname.new( WordNet::Lexicon::DEFAULT_DB_ENV ).basename
80
+
81
+
82
### Create a new converter that will dump WordNet dictionary files into a
### BerkeleyDB database under the given +builddir+.
def initialize( builddir=BuildDir )
	# Kernel#Pathname coerces strings and passes Pathnames through unchanged.
	@builddir = Pathname( builddir )
end
87
+
88
+
89
### Convert the various dict files from the WordNet project into a BerkeleyDB
### database. A non-zero +errorLimit+ aborts the run after that many parse
### errors. Prompts the user interactively for confirmation and the data dir.
def convertdb( errorLimit=0 )
	$stderr.sync = $stdout.sync = true
	header "WordNet Lexicon Converter"

	# Make sure the user knows what they're in for
	message "This program will convert WordNet data files into databases\n"\
		"used by Ruby-WordNet. This will not affect existing WordNet files,\n"\
		"but will require up to 40Mb of disk space.\n"
	exit unless /^y/i =~ prompt_with_default("Continue?", "y")

	# Open the database and check to be sure it's empty. Confirm overwrite if
	# not. Checkpoint and set up logging proc if debugging.
	if @builddir.exist? && ( @builddir + 'data' ).exist?
		message ">>> Warning: Existing data in the Ruby-WordNet databases\n"\
			"will be overwritten.\n"
		abort( "user cancelled." ) unless
			/^y/i =~ prompt_with_default( "Continue?", "n" )
		@builddir.rmtree
	end

	# Find the source data files: prefer an unpacked WordNet-* tree, then a
	# local dict/, then the conventional install location.
	wndirs = Pathname.glob( Pathname.getwd + 'WordNet-*' )
	localdict = Pathname.getwd + 'dict'
	default =
		if !wndirs.empty?
			wndirs.first + 'dict'
		elsif localdict.exist?
			localdict
		else
			'/usr/local/WordNet-3.0/dict'
		end

	message "Where can I find the WordNet data files?\n"
	datadir = prompt_with_default( "Data directory", default )
	datadir = Pathname.new( datadir )

	abort( "Directory '#{datadir}' does not exist" ) unless datadir.exist?
	abort( "'#{datadir}' is not a directory" ) unless datadir.directory?
	testfile = datadir + "data.noun"
	abort( "'#{datadir}' doesn't seem to contain the necessary files.") unless testfile.exist?

	# Open the lexicon readwrite into the temporary datadir
	@builddir.mkpath
	lexicon = WordNet::Lexicon::new( @builddir.to_s, 0666 )

	# Process each fileset
	[ # Fileset, name, database handle, processor
		Fileset::new( IndexFiles, "index", lexicon.index_db, method(:parse_index_line) ),
		Fileset::new( MorphFiles, "morph", lexicon.morph_db, method(:parse_morph_line) ),
		Fileset::new( DataFiles, "data", lexicon.data_db, method(:parse_synset_line) ),
	].each do |set|
		message "Converting %s files...\n" % set.name
		set.db.truncate

		# Process each file in the set with the appropriate processor method and
		# insert results into the corresponding table.
		set.files.each do |file,pos|
			message " #{file}..."

			filepath = File::join( datadir, file )
			# File.exist? -- File::exists? is deprecated.
			if !File.exist?( filepath )
				message "missing: skipped\n"
				next
			end

			txn, dbh = lexicon.env.txn_begin( 0, set.db )
			entries = lineNumber = errors = 0
			File::readlines( filepath ).each do |line|
				lineNumber += 1
				next if /^\s/ =~ line

				key, value = set.processor.call( line.chomp, lineNumber, pos )

				# BUGFIX: skip the insert when the processor failed -- the
				# original fell through and stored a record under a nil key.
				unless key
					errors += 1
					if errorLimit.nonzero? && errors >= errorLimit
						abort( "Too many errors" )
					end
					next
				end

				dbh[ key ] = value
				entries += 1
				print "%d%s" % [ entries, "\x08" * entries.to_s.length ]

				# Commit and start a new transaction every CommitThreshold records
				if (entries % CommitThreshold).zero?
					print "."
					txn.commit( BDB::TXN_NOSYNC )
					txn, dbh = lexicon.env.txn_begin( 0, set.db )
				end
			end

			message "committing..."
			txn.commit( BDB::TXN_SYNC )
			message "done (%d entries, %d errors).\n" %
				[ entries, errors ]
		end

		# Per-set lock statistics from the BDB environment
		lock_stats = lexicon.env.lock_stat
		message "Lock statistics:\n"
		puts " Lock objects: #{lock_stats['st_nobjects']}/#{lock_stats['st_maxnobjects']}",
			" Locks: #{lock_stats['st_nlocks']}/#{lock_stats['st_maxnlocks']}",
			" Lockers: #{lock_stats['st_nlockers']}/#{lock_stats['st_maxnlockers']}"

		message "Checkpointing DB and cleaning logs..."
		lexicon.checkpoint
		lexicon.clean_logs
		puts "done."
	end

	message "done.\n\n"
end
202
+
203
+
204
#######
private
#######

# Index entry patterns: lemma, pos, synset count, and pointer count lead the
# line; pointer symbols, sense counts, and 8-digit synset offsets follow.
IndexEntry = /^(\S+)\s(\w)\s(\d+)\s(\d+)\s/
PointerSymbol = /(\S{1,2})\s/
SenseCounts = /(\d+)\s(\d+)\s/
SynsetId = /(\d{8})\s*/

### Parse an entry from one of the index files and return the key and
### data. Returns +nil+ if any part of the entry isn't able to be parsed. The
### +pos+ argument is not used -- it's just to make the interface between all
### three processor methods the same.
def parse_index_line( string, lineNumber, pos=nil )
	$scanner.string = string
	synsets = []
	lemma, pos, polycnt = nil, nil, nil

	# Leading fields: lemma, part of speech, polysemy count, pointer count.
	# (Error message was the uninformative "whole error".)
	raise "unparseable index entry" unless $scanner.scan( IndexEntry )
	lemma, pos, polycnt, pcnt = $scanner[1], $scanner[2], $scanner[3], $scanner[4]

	# Discard pointer symbols
	pcnt.to_i.times do |i|
		$scanner.skip( PointerSymbol ) or raise "couldn't skip pointer #{i}"
	end

	# Parse sense and tagsense counts
	$scanner.scan( SenseCounts ) or raise "couldn't parse sense counts"
	senseCount, tagSenseCount = $scanner[1], $scanner[2]

	# Find synsets, recording each word sense's ordinal in $senseIndex so the
	# data-file pass can look sense numbers back up.
	senseCount.to_i.times do |i|
		$scanner.scan( SynsetId ) or raise "couldn't parse synset #{i}"
		synset = $scanner[1]
		synsets.push( synset )
		$senseIndex[ synset + "%" + pos + "%" + lemma ] = i.to_s
	end

	# Make the index entry and return it
	key = lemma + "%" + pos
	data = synsets.join(WordNet::SUB_DELIM)

	return key, data
rescue => err
	message "Index entry did not parse: %s at '%s...' (line %d)\n\t%s\n" % [
		err.message,
		$scanner.rest[0,20],
		lineNumber,
		err.backtrace[0]
	]
	return nil
end
257
+
258
+
259
### "Parse" a morph line and return it as a key and value. Returns +nil+ for
### lines with no content, so the caller records a parse error instead of
### inserting a garbage "%pos" record (robustness fix).
def parse_morph_line( string, lineNumber, pos )
	key, value = string.split
	return nil if key.nil?
	return "#{key}%#{pos}", value
rescue => err
	message "Morph entry did not parse: %s for %s (pos = %s, line %d)\n\t%s\n" % [
		err.message,
		string.inspect,
		pos.inspect,
		lineNumber,
		err.backtrace[0]
	]
	return nil
end
273
+
274
+
275
# Synset data patterns: offset/filenum/type/wordcount header, then words with
# optional lex-ids, pointer records, verb frames, and a '|'-introduced gloss.
Synset = /(\d+)\s(\d{2})\s(\w)\s(\w{2})\s/
SynWord = /(\S+)\s(\w)*\s*/
SynPtrCnt = /(\d{3})\s/
SynPtr = /(\S{1,2})\s(\d+)\s(\w)\s(\w{4})\s/
SynFrameCnt = /\s*(\d{2})\s/
SynFrame = /\+\s(\d{2})\s(\w{2})\s/
SynGloss = /\s*\|\s*(.+)?/

### Parse an entry from a data file and return the key and data. Returns +nil+
### if any part of the entry isn't able to be parsed.
def parse_synset_line( string, lineNumber, pos )
	$scanner.string = string

	filenum, synsetType, gloss = nil, nil, nil
	words = []
	ptrs = []
	frames = []

	# Parse the first part of the synset: offset, lexicographer file number,
	# synset type, and word count.
	$scanner.scan( Synset ) or raise "unable to parse synset"
	offset, filenum, synsetType, wordCount =
		$scanner[1], $scanner[2], $scanner[3], $scanner[4]

	# Parse the words. The word count is hexadecimal.
	wordCount.to_i(16).times do |i|
		$scanner.scan( SynWord ) or raise "unable to parse word #{i}"
		word, lexid = $scanner[1], $scanner[2]

		# Look up the sense number recorded by the index pass; data-file words
		# may carry a "(marker)" suffix the index key lacks, so retry without it.
		senseKey = (offset + "%" + pos + "%" + word).downcase
		if !$senseIndex.key?( senseKey )
			newKey = senseKey.sub( /\(\w+\)$/, '' )
			if !$senseIndex.key?( newKey )
				raise "Sense index does not contain sense '#{senseKey}' "\
					"(tried #{newKey}, too)."
			end
			senseKey = newKey
		end

		words.push( word + "%" + $senseIndex[senseKey].to_s )
	end

	# Parse pointers
	if $scanner.scan( SynPtrCnt )
		$scanner[1].to_i.times do |i|
			$scanner.scan( SynPtr ) or raise "unable to parse synptr #{i}"
			ptrs.push "%s %s%%%s %s" % [
				$scanner[1],
				$scanner[2],
				$scanner[3],
				$scanner[4],
			]
		end
	else
		raise "Couldn't parse pointer count"
	end

	# Parse frames if this synset is a verb
	if synsetType == WordNet::Verb
		if $scanner.scan( SynFrameCnt )
			$scanner[1].to_i.times do |i|
				$scanner.scan( SynFrame ) or raise "unable to parse frame #{i}"
				frames.push "#{$scanner[1]} #{$scanner[2]}"
			end
		else
			raise "Couldn't parse frame count"
		end
	end

	# Find the gloss
	if $scanner.scan( SynGloss )
		gloss = $scanner[1].strip
	end

	# This should never happen, as the gloss matches pretty much anything to
	# the end of line. (#eos? replaces the deprecated StringScanner#empty? alias.)
	if !$scanner.eos?
		raise "Trailing miscellaneous found at end of entry"
	end

	# Build the synset entry and return it
	synsetType = WordNet::Adjective if synsetType == WordNet::Other
	key = [ offset, synsetType ].join("%")
	data = [
		filenum,
		words.join( WordNet::SUB_DELIM ),
		ptrs.join( WordNet::SUB_DELIM ),
		frames.join( WordNet::SUB_DELIM ),
		gloss,
	].join( WordNet::DELIM )

	return key, data
rescue => err
	message "Synset did not parse: %s at '%s...' (pos = %s, line %d)\n\t%s\n" % [
		err.message,
		$scanner.rest[0,20],
		pos.inspect,
		lineNumber,
		err.backtrace[0]
	]
	return nil
end
376
+
377
+ end # class WordNetConverter
378
+
379
+
380
# Start the program if it's run directly: parse command-line flags, then run
# the converter with the (optional) error limit.
if $0 == __FILE__
	errorLimit = 0

	ARGV.options {|oparser|
		oparser.banner = "Usage: #{File::basename($0)} -dv\n"

		# Debugging on/off
		oparser.on( "--debug", "-d", TrueClass, "Turn debugging on" ) {
			$DEBUG = true
			debugMsg "Turned debugging on."
		}

		# Verbose
		oparser.on( "--verbose", "-v", TrueClass, "Verbose progress messages" ) {
			$VERBOSE = true
			debugMsg "Turned verbose on."
		}

		# Error-limit
		oparser.on( "--error-limit=COUNT", "-eCOUNT", Integer,
			"Error limit -- quit after COUNT errors" ) {|arg|
			errorLimit = arg.to_i
			debugMsg "Set error limit to #{errorLimit}"
		}

		# Handle the 'help' option.
		# BUGFIX: use #exit instead of #exit! so ensure blocks and at_exit
		# handlers still run on the help path.
		oparser.on( "--help", "-h", "Display this text." ) {
			$stderr.puts oparser
			exit( 0 )
		}

		oparser.parse!
	}

	WordNetConverter.new.convertdb( errorLimit )
end
417
+
@@ -0,0 +1,27 @@
1
#!/usr/bin/ruby -w
#
# Add a synset for laced boots
#

$: << "lib"
# BUGFIX: require the lowercase 'wordnet' (as the other scripts do) -- the
# mixed-case "WordNet" fails on case-sensitive filesystems.
require "wordnet"

lex = WordNet::Lexicon.new( "ruby-wordnet" )

# Anchor synsets: "boot" (sense 1) as hypernym, "tongue" (sense 6) as part.
boot = lex.lookup_synsets( "boot", "n", 1 )
laced_boot = lex.create_synset( "laced boot", "n" )
tongue = lex.lookup_synsets( "tongue", "n", 6 )

laced_boot.add_hypernyms( boot )
laced_boot.add_component_meronyms( tongue )

# Write all three synsets while holding the lexicon's write lock.
lex.unlock {
	laced_boot.write
	boot.write
	tongue.write
}
+
@@ -0,0 +1,36 @@
1
#!/usr/bin/ruby -w
#
# Find all articles of clothing that have collars (Adapted from the synopsis
# of Lingua::Wordnet::Analysis)
#

$LOAD_PATH.unshift "lib"
require "wordnet"

# Create the lexicon
lex = WordNet::Lexicon.new

# Look up the clothing synset as the origin
clothing = lex.lookup_synsets( "clothing", WordNet::Noun, 1 )

# The part to search for defaults to "collar" but can be given as an argument.
part_word = ARGV.shift || "collar"
part = lex.lookup_synsets( part_word, WordNet::Noun, 1 ) or
	abort( "Couldn't find synset for #{part_word}" )

puts "Looking for instances of:",
	" #{part}",
	"in the hyponyms of",
	" #{clothing}",
	""

# Now traverse all hyponyms of the clothing synset, and check for "part" among
# each one's meronyms, printing any we find
clothing.traverse( :hyponyms ) do |syn, depth|
	found = syn.search( :meronyms, part )
	if found
		puts "Has a #{part_word}: #{syn}"
	elsif $DEBUG
		puts "Doesn't have a #{part_word}: #{syn}"
	end
end