delphin 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
data/README ADDED
@@ -0,0 +1,17 @@
1
+ = Ruby Utilities for DELPH-IN
2
+
3
+ This is a set of Ruby utilities for the {Delphin}[http://www.delph-in.net/] HPSG processing project.
4
+
5
+ = History
6
+
7
+ 0.0.1:: Profile data structures
8
+
9
+ = Copyright
10
+
11
+ Copyright 2009, William Patrick McNeill
12
+
13
+ This program is distributed under the GNU General Public License.
14
+
15
+ = Author
16
+
17
+ W.P. McNeill mailto:billmcn@gmail.com
@@ -0,0 +1,67 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require "delphin"
4
+ require "fileutils"
5
+ require "optparse"
6
+
7
+
8
# Report a problem to the user and terminate the script.
#
# Prints +error+, then the option parser's generic help text, and exits with
# +exit_code+ (-1 unless the caller supplies a different status).
def error_exit(parser, error, exit_code = -1)
  [error, parser.help].each { |text| puts text }
  exit(exit_code)
end
14
+
15
+
16
# Set by the --delete switch: remove invalid profiles instead of only listing
# them.
delete = false

parser = OptionParser.new do |opts|
  opts.banner =<<-EOTEXT
#{File.basename(__FILE__)} [OPTION] glob [table table...]

List all invalid TSDB profiles matching the specified file glob and optionally delete them.

A directory is a valid profile if it contains a non-empty relations file. Other required non-empty table files may be specified as command line arguments.

By default this script merely prints the names of invalid profiles. It will also delete them if the delete switch is specified.

Bracket characters in globs must be escaped with two backslashes, e.g. \\\\[
  EOTEXT
  opts.on("-l", "--logging LEVEL", "Logging level") do |level|
    # Look the level up by name among Logger's severities. The argument comes
    # straight from the command line, so it must never be passed to eval.
    severity = level.upcase
    unless %w[DEBUG INFO WARN ERROR FATAL UNKNOWN].include?(severity)
      raise OptionParser::InvalidArgument, level
    end
    Delphin.set_log_level(Logger.const_get(severity))
  end

  opts.on("-d", "--delete", "Delete invalid profiles") do
    delete = true
  end
end

begin
  parser.parse!
rescue OptionParser::ParseError => e
  error_exit(parser, e)
end

# The glob is mandatory; any further arguments name required tables.
if ARGV.empty?
  error_exit(parser, "Incorrect number of arguments.")
end

glob = ARGV.shift
table_names = ARGV

# Check every directory matched by the glob. A profile is reported (and
# deleted when --delete was given) if constructing it or opening one of the
# required tables raises an InvalidProfileException, or if a required table
# file is empty.
Pathname.glob(glob).select { |d| d.directory? }.each do |d|
  Delphin::LOGGER.debug("Profile directory #{d}")
  begin
    profile = Delphin::Profile.new(d)
    table_names.each do |table_name|
      table = profile[table_name]
      if File.zero?(table.filename)
        raise Delphin::EmptyDataFile.new(table_name, profile)
      end
    end
  rescue Delphin::InvalidProfileException => e
    puts e.message
    FileUtils.rm_rf(d) if delete
  end
end
@@ -0,0 +1,61 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require "delphin"
4
+ require "optparse"
5
+ require "yaml"
6
+
7
+
8
# Report a problem to the user and terminate the script.
#
# Prints +error+, then the option parser's generic help text, and exits with
# +exit_code+ (-1 unless the caller supplies a different status).
def error_exit(parser, error, exit_code = -1)
  [error, parser.help].each { |text| puts text }
  exit(exit_code)
end
14
+
15
+
16
# Set by --full-name: print whole profile paths instead of base names.
full_name = false
# Set by --yaml: dump the statistics as YAML instead of plain text rows.
to_yaml = false

parser = OptionParser.new do |opts|
  opts.banner =<<-EOTEXT
#{File.basename(__FILE__)} [OPTION] glob

Print a summary of the scores in all the profiles matching the specified file
glob.

Bracket characters in paths must be escaped with two backslashes, e.g. \\\\[
  EOTEXT
  opts.on("-l", "--logging LEVEL", "Logging level") do |level|
    # Look the level up by name among Logger's severities. The argument comes
    # straight from the command line, so it must never be passed to eval.
    severity = level.upcase
    unless %w[DEBUG INFO WARN ERROR FATAL UNKNOWN].include?(severity)
      raise OptionParser::InvalidArgument, level
    end
    Delphin.set_log_level(Logger.const_get(severity))
  end

  opts.on("-f", "--full-name", "Print full directory name") do
    full_name = true
  end

  opts.on("-y", "--yaml", "Output results as YAML") do
    to_yaml = true
  end
end

begin
  parser.parse!
rescue OptionParser::ParseError => e
  error_exit(parser, e)
end

# Exactly one argument, the glob, is expected.
unless ARGV.length == 1
  error_exit(parser, "Incorrect number of arguments.")
end

# Each row is [mean, sdev, range, directory], as built by
# Delphin.summarize_folds.
stats = Delphin.summarize_folds(File.expand_path(ARGV.first))
output = if to_yaml
  YAML::dump(stats)
else
  stats.collect do |stat|
    # Element 3 is the profile directory; shorten it unless --full-name.
    stat[3] = File.basename(stat[3]) unless full_name
    stat.join(" ")
  end.join("\n")
end
puts output
@@ -0,0 +1,377 @@
1
+ # Copyright 2009 William Patrick McNeill
2
+ #
3
+ # This file is part of the DELPH-IN Ruby Utility Package.
4
+ #
5
+ # The DELPH-IN Ruby Utility Package is free software; you can redistribute it
6
+ # and/or modify it under the terms of the GNU General Public License as
7
+ # published by the Free Software Foundation; either version 2 of the License,
8
+ # or (at your option) any later version.
9
+ #
10
+ # The DELPH-IN Ruby Utility Package is distributed in the hope that it will
11
+ # be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of
12
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General
13
+ # Public License for more details.
14
+ #
15
+ # You should have received a copy of the GNU General Public License along with
16
+ # this package; if not, write to the Free Software Foundation, Inc., 51 Franklin
17
+ # St, Fifth Floor, Boston, MA 02110-1301 USA
18
+
19
require "logger"
require "pathname"
require "set"
require "zlib"
22
+
23
+ # Utilities for use with the DELPH-IN project.
24
+ module Delphin
25
+ VERSION = "0.0.1"
26
+
27
# Build the logger used throughout this module.
#
# The logger writes to STDERR, has its level set to ERROR, and formats
# timestamps as "%Y-%m-%d %H:%M:%S". This function is called when the module
# is loaded.
def self.initialize_logger
  Logger.new(STDERR).tap do |log|
    log.level = Logger::ERROR
    log.datetime_format = "%Y-%m-%d %H:%M:%S"
  end
end
35
+
36
# The logger factory is an implementation detail of the module.
private_class_method :initialize_logger

# The one logger shared by all objects in this module. It is created when the
# module is loaded, with a default log level of ERROR.
LOGGER = initialize_logger

# Change the level of the module-wide logger. For example:
#
#   > Delphin.set_log_level(Logger::DEBUG)
def self.set_log_level(level)
  LOGGER.level = level
end
48
+
49
# Abstract base class for all exceptions raised by this module.
#
# It derives from StandardError, not Exception, so that a plain +rescue+
# clause catches profile errors like any other application-level error.
class InvalidProfileException < StandardError
end

# A table has neither a plain nor a gzipped data file in the profile.
class MissingDataFile < InvalidProfileException
  # [_name_] The table name
  # [_profile_] The profile that lacks the file
  def initialize(name, profile)
    @name = name
    @profile = profile
    super("Missing data file for table #{@name} in #{@profile}.")
  end
end

# A table's data file exists but holds no records.
class EmptyDataFile < InvalidProfileException
  # [_name_] The table name
  # [_profile_] The profile that contains the empty file
  def initialize(name, profile)
    @name = name
    @profile = profile
    super("Empty data file for table #{@name} in #{@profile}.")
  end
end

# A profile directory has no relations file.
class MissingRelationsFile < InvalidProfileException
  # [_directory_] The profile directory
  def initialize(directory)
    @directory = directory
    super("Missing relations file in #{@directory}.")
  end
end

# A line in a relations file could not be parsed.
class InvalidRelationsFile < InvalidProfileException
  # [_filename_] The relations file
  # [_linenum_] The 1-based number of the offending line
  # [_line_] The text of the offending line
  def initialize(filename, linenum, line)
    @filename = filename
    @linenum = linenum
    @line = line
    super("Invalid line #{@linenum} #{@filename}\n#{@line}")
  end
end
96
+
97
# Summarize the "f-accuracy" values in the "fold" table of every profile
# directory matched by a file glob.
#
# Returns one row per valid profile, [mean, sdev, range, directory], ordered
# by descending mean. A profile that raises InvalidProfileException is logged
# as an error and left out of the result.
#
# [_glob_] A file glob
def self.summarize_folds(glob)
  directories = Pathname.glob(glob).select { |path| path.directory? }
  rows = []
  directories.each do |dir|
    LOGGER.debug("Profile directory #{dir}")
    begin
      stats = Profile.new(dir).statistics("fold", "f-accuracy")
    rescue InvalidProfileException => e
      LOGGER.error(e.message)
      next
    end
    rows << [stats.mean, stats.sdev, stats.range, dir.to_s]
  end
  rows.sort_by { |row| -row.first }
end
114
+
115
+
116
# The name of a Logon TSDB output profile.
#
# These are long strings that describe, in bracketed sections, the features
# used to generate the profile. The struct members hold the parsed values.
class OutputProfileName < Struct.new(:prefix, :grandparenting,
                                     :constituent_weight, :active_edges,
                                     :ngram_size, :ngram_back_off,
                                     :relative_tolerance, :variance)
  # The basename of the profile
  attr_reader :name

  # Parse an output profile name out of a directory name.
  #
  # Any leading path is removed from +name+ first. Raises ArgumentError when
  # the basename does not match the expected layout.
  #
  # [_name_] A directory name
  def initialize(name)
    @name = File.basename(name)
    layout = /\[(\S+)\]\s                        # 1 Prefix
              GP\[(\d+)\]\s                      # 2 Grandparenting
              [+-]PT\s
              [+-]LEX\s
              CW\[(\d*)\]\s                      # 3 Constituent weight
              ([+-])AE\s                         # 4 Active edges
              NS\[(\d+)\]\s                      # 5 N-gram size
              NT\[\w*\]\s
              ([+-])NB\s                         # 6 N-gram backoff
              LM\[\d+\]\s
              FT\[:::\d+\]\s
              RS\[\]\s
              MM\[\S+\]\s
              MI\[\d+\]\s
              RT\[(\d+(?:\.\d+)?e[+-]\d+)\]\s    # 7 Relative tolerance
              AT\[\d+(?:\.\d+)?e[+-]\d+\]\s
              VA\[(\d+(?:\.\d+)?e[+-]\d+)?\]\s   # 8 Variance
              PC\[\d+\]
             /x
    parts = layout.match(@name)
    raise ArgumentError.new("Invalid profile name #{name}") if parts.nil?
    # The variance section may be empty, in which case the member stays nil.
    va = parts[8]
    super(parts[1], parts[2].to_i, parts[3].to_i, parts[4] == "+",
          parts[5].to_i, parts[6] == "+", parts[7].to_f,
          va.nil? ? nil : va.to_f)
  end

  def to_s
    [prefix, feature_string].join(":")
  end

  # Does +other+ have exactly the same machine learning feature values as
  # this profile name?
  def equal_learner_features?(other)
    learner_features.all? { |feature| send(feature) == other.send(feature) }
  end

  # A compact and readable "feature=value" list of all the feature values.
  def feature_string
    learner_features.collect do |feature|
      "#{feature.to_s}=#{send(feature)}"
    end.join(",")
  end

  private

  # The members that describe the learner, i.e. every member except prefix.
  def learner_features
    [:grandparenting, :constituent_weight, :active_edges, :ngram_size,
     :ngram_back_off, :relative_tolerance, :variance]
  end

end # OutputProfileName
180
+
181
+
182
# A TSDB profile: a directory that holds a relations file and the data files
# for the tables that the relations file describes.
class Profile
  # The summary values returned by #statistics. It is a named constant so
  # that every call returns instances of the same class.
  Statistics = Struct.new(:mean, :sdev, :range)

  attr_reader :directory, :relations

  # Read the relations file of the profile in +directory+.
  #
  # Raises MissingRelationsFile if the directory has no "relations" file.
  #
  # [_directory_] The profile directory
  def initialize(directory)
    @directory = directory
    begin
      # File.open rather than Kernel#open: the path is built from user
      # input, and Kernel#open gives a leading "|" special meaning.
      @relations = File.open(File.join(directory, "relations")) do |file|
        RelationsFile.new(file)
      end
    rescue Errno::ENOENT
      raise MissingRelationsFile.new(directory)
    end
  end

  def inspect
    "#{self.class}(#{directory})"
  end

  def to_s
    inspect
  end

  # A list of all the tables in the profile
  def tables
    @relations.keys
  end

  # Open the specified table file.
  def [](name)
    ProfileTable.new(self, name, @relations[name])
  end

  # Return the mean, standard deviation and range of the numeric values in
  # the specified field.
  #
  # The standard deviation divides by n-1 when there is more than one value
  # and by 1 otherwise. Raises EmptyDataFile if the table has no records.
  #
  # [_table_] The table name
  # [_field_] The label of the field to summarize
  def statistics(table, field)
    values = self[table].collect { |r| r[field].to_f }
    count = values.length
    raise EmptyDataFile.new(table, self) if count.zero?
    mean = values.inject(0) { |sum, x| sum + x } / count
    divisor = count > 1 ? count - 1 : count
    sdev = Math.sqrt(values.inject(0) { |sum, x| sum + (x - mean)**2 } / divisor)
    Statistics.new(mean, sdev, values.max - values.min)
  end
end
228
+
229
+
230
# A data table in a TSDB profile.
#
# The table is backed by a file named after the table in the profile
# directory, or by the same name with a ".gz" suffix when only a gzipped
# copy exists.
class ProfileTable
  include Enumerable

  attr_reader :profile, :name, :schema, :filename

  # Locate the data file for a table.
  #
  # Raises MissingDataFile if neither the plain nor the gzipped file exists.
  #
  # [_profile_] The profile that owns the table; must respond to +directory+
  # [_name_] The table name
  # [_schema_] The object whose +record+ method parses a line of the table
  def initialize(profile, name, schema)
    @profile = profile
    @name = name
    @schema = schema
    # Find the file containing this table. It may be gzipped; the plain file
    # wins when both exist.
    plain = File.join(profile.directory, name)
    @filename = [plain, plain + ".gz"].find { |path| File.exist?(path) }
    raise MissingDataFile.new(@name, @profile) if @filename.nil?
    @gzipped = (@filename != plain)
  end

  def inspect
    "#{self.class}(#{name}) in #{profile}"
  end

  def to_s
    inspect
  end

  # Enumerate the records in this table.
  #
  # The file is opened for each enumeration and closed afterwards, so the
  # table can be enumerated any number of times and holds no open handle in
  # between. Returns an enumerator when no block is given.
  def each
    return to_enum(:each) unless block_given?
    read_lines do |line|
      # Non-destructive strip: strip! returns nil when there is nothing to
      # remove, e.g. on a final line without a trailing newline.
      yield @schema.record(line.strip)
    end
    self
  end

  private

  # Yield every line of the data file, decompressing it if necessary.
  def read_lines(&block)
    if @gzipped
      Zlib::GzipReader.open(@filename) { |gz| gz.each_line(&block) }
    else
      File.open(@filename) { |file| file.each_line(&block) }
    end
  end
end # ProfileTable
269
+
270
+
271
# A database schema table in a profile.
#
# This is a list of field labels and their types.
class ProfileTableSchema < Array
  # One field of the schema. It is a named constant so that all fields share
  # a single class rather than each getting a new anonymous Struct class.
  Field = Struct.new(:label, :type)

  attr_reader :name, :keys, :partials

  # Create an empty schema.
  #
  # [_init_name_] The table name
  def initialize(init_name)
    super()
    @name = init_name
    @keys = Set.new
    @partials = Set.new
  end

  # The string representation lists the table name followed by one line per
  # field in the layout used by the relations file.
  def to_s
    field_lines = collect do |field|
      s = " #{field.label} :#{field.type}"
      s += " :key" if is_key?(field.label)
      s += " :partial" if is_partial?(field.label)
      s
    end
    "#{name}:\n" + field_lines.join("\n")
  end

  # Generate a data record from a line of text.
  #
  # A data record is a hash of field labels to values. The text is split on
  # "@" and the pieces are paired with the fields in order. Values of integer
  # fields are converted with to_i; all other values are left as strings.
  # Fields with no corresponding piece get nil (0 for integer fields).
  def record(text)
    data_fields = text.split(/@/)
    pairs = zip(data_fields).collect do |field, data|
      [field.label, integer_type?(field.type) ? data.to_i : data]
    end
    Hash[pairs]
  end

  # Add a new field and type.
  #
  # [_label_] The field label
  # [_type_] The field type, e.g. "integer" or :integer
  # [_key_] Is the field a key?
  # [_partial_] Is the field a partial?
  def add_field(label, type, key = false, partial = false)
    push(Field.new(label, type))
    @keys.add(label) if key
    @partials.add(label) if partial
  end

  # Is the specified label a key?
  def is_key?(label)
    @keys.member?(label)
  end

  # Is the specified label a partial?
  def is_partial?(label)
    @partials.member?(label)
  end

  private

  # Types parsed from a relations file arrive as Strings ("integer"), while
  # other callers may pass Symbols (:integer). Accept both.
  def integer_type?(type)
    type.to_s == "integer"
  end
end # ProfileTableSchema
331
+
332
+
333
# A file that contains a set of database schema tables.
#
# This object is a hash of ProfileTableSchema objects indexed by table name.
class RelationsFile < Hash
  # Parse the schema tables out of an open relations file.
  #
  # A table starts with a "name:" line, continues with one field line per
  # row, and ends at the first empty line. Text after a "#" is ignored.
  # Raises InvalidRelationsFile, with the 1-based line number, on any line
  # that fits neither form.
  #
  # [_file_] An IO-like object that yields the lines of the relations file
  def initialize(file)
    super()
    # Name used in error reports. Not every IO-like object has a path, so
    # fall back to its inspect string.
    source = file.respond_to?(:path) ? file.path : file.inspect
    state = :outside_table
    table_name = nil
    file.each_with_index do |raw, i|
      # Remove comments and surrounding whitespace.
      line = raw.sub(/#.*/, "").strip
      case state
      when :inside_table
        if line.empty?
          state = :outside_table
        elsif line =~ /^(\S+)\s+:(\w+)(\s+:key)?(\s+:partial)?$/
          # E.g. parse-id :integer :key
          self[table_name].add_field($1, $2, !$3.nil?, !$4.nil?)
        else
          raise InvalidRelationsFile.new(source, i + 1, line)
        end
      when :outside_table
        next if line.empty?
        if line =~ /(\S+):/
          # E.g. item:
          table_name = $1
          self[table_name] = ProfileTableSchema.new(table_name)
          state = :inside_table
        else
          raise InvalidRelationsFile.new(source, i + 1, line)
        end
      end
    end # each_with_index
  end # initialize

  # Print out a relations file
  def to_s
    values.map { |t| t.to_s }.join("\n\n")
  end # to_s
end # RelationsFile
375
+
376
+
377
+ end # Delphin
@@ -0,0 +1,77 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ #--
4
+
5
+ # Copyright 2009 William Patrick McNeill
6
+ #
7
+ # This file is part of the DELPH-IN Ruby Utility Package.
8
+ #
9
+ # The DELPH-IN Ruby Utility Package is free software; you can redistribute it
10
+ # and/or modify it under the terms of the GNU General Public License as
11
+ # published by the Free Software Foundation; either version 2 of the License,
12
+ # or (at your option) any later version.
13
+ #
14
+ # The DELPH-IN Ruby Utility Package is distributed in the hope that it will be
15
+ # useful, but WITHOUT ANY WARRANTY; without even the implied warranty of
16
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General
17
+ # Public License for more details.
18
+ #
19
+ # You should have received a copy of the GNU General Public License along with
20
+ # this package; if not, write to the Free Software Foundation, Inc., 51 Franklin
21
+ # St, Fifth Floor, Boston, MA 02110-1301 USA
22
+ #
23
+ #++
24
+
25
+ # Test cases for the Delphin module
26
+
27
+ require "test/unit"
28
+ require "delphin"
29
+
30
+
31
# Tests for Delphin::ProfileTableSchema.
class Schema < Test::Unit::TestCase
  # A new schema keeps its name and starts out with no fields, keys or
  # partials.
  def test_stub
    schema = Delphin::ProfileTableSchema.new("schema")
    assert_equal("schema", schema.name)
    assert(schema.empty?)
    assert(schema.keys.empty?)
    assert(schema.partials.empty?)
  end
end
36
+
37
# Tests for parsing and comparing Delphin::OutputProfileName objects.
class TestOutputProfileName < Test::Unit::TestCase
  # Two profile names whose learner features differ.
  def setup
    @train_name = "[tanaka-train] GP[0] +PT -LEX CW[1] +AE NS[4] NT[type] +NB LM[0] FT[:::1] RS[] MM[tao_lmvm] MI[5000] RT[1.0e-8] AT[1.0e-20] VA[1.0e+0] PC[100]"
    @train = Delphin::OutputProfileName.new(@train_name)
    @other_name = "[jhpstg] GP[0] +PT -LEX CW[] -AE NS[3] NT[type] +NB LM[0] FT[:::1] RS[] MM[tao_lmvm] MI[5000] RT[1.0e-6] AT[1.0e-20] VA[1.0e-4] PC[100]"
    @other = Delphin::OutputProfileName.new(@other_name)
  end

  # Every section of a well-formed name ends up in the matching member.
  def test_valid_name
    assert_equal(@train_name, @train.name)
    assert_equal("tanaka-train", @train.prefix)
    assert_equal(0, @train.grandparenting)
    assert_equal(1, @train.constituent_weight)
    assert_equal(true, @train.active_edges)
    assert_equal(4, @train.ngram_size)
    assert_equal(true, @train.ngram_back_off)
    assert_equal(1e-8, @train.relative_tolerance)
    assert_equal(1, @train.variance)
  end

  # An empty VA[] section yields a nil variance.
  def test_empty_variance
    no_variance = "[jhpstg] GP[0] +PT -LEX CW[2] +AE NS[2] NT[type] +NB LM[0] FT[:::1] RS[] MM[tao_lmvm] MI[5000] RT[1.0e-6] AT[1.0e-20] VA[] PC[100]"
    parsed = Delphin::OutputProfileName.new(no_variance)
    assert_equal(nil, parsed.variance)
  end

  def test_feature_string
    wanted = "grandparenting=0,constituent_weight=1,active_edges=true,ngram_size=4,ngram_back_off=true,relative_tolerance=1.0e-08,variance=1.0"
    assert_equal(wanted, @train.feature_string)
  end

  # A name has the same learner features as itself, but not as a name with
  # different values.
  def test_feature_equivalence
    assert(@train.equal_learner_features?(@train))
    assert(!@train.equal_learner_features?(@other))
  end

  def test_invalid_name
    assert_raise(ArgumentError) { Delphin::OutputProfileName.new("bogus") }
  end

end
metadata ADDED
@@ -0,0 +1,66 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: delphin
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ platform: ruby
6
+ authors:
7
+ - W.P. McNeill
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+
12
+ date: 2009-09-23 00:00:00 -07:00
13
+ default_executable:
14
+ dependencies: []
15
+
16
+ description: |
17
+ This module is a Ruby wrapper for the DELPH-IN project.
18
+
19
+ email: billmcn@gmail.com
20
+ executables: []
21
+
22
+ extensions: []
23
+
24
+ extra_rdoc_files:
25
+ - README
26
+ files:
27
+ - test/test_delphin.rb
28
+ - lib/delphin.rb
29
+ - bin/cleanup-profile
30
+ - bin/score-profile
31
+ - README
32
+ has_rdoc: true
33
+ homepage: http://delphin.rubyforge.org/
34
+ licenses: []
35
+
36
+ post_install_message:
37
+ rdoc_options:
38
+ - - --title
39
+ - Delphin -- DELPH-IN utilities
40
+ - --main
41
+ - README
42
+ - --line-numbers
43
+ - --inline-source
44
+ require_paths:
45
+ - lib
46
+ required_ruby_version: !ruby/object:Gem::Requirement
47
+ requirements:
48
+ - - ">="
49
+ - !ruby/object:Gem::Version
50
+ version: "0"
51
+ version:
52
+ required_rubygems_version: !ruby/object:Gem::Requirement
53
+ requirements:
54
+ - - ">="
55
+ - !ruby/object:Gem::Version
56
+ version: "0"
57
+ version:
58
+ requirements: []
59
+
60
+ rubyforge_project: delphin
61
+ rubygems_version: 1.3.5
62
+ signing_key:
63
+ specification_version: 3
64
+ summary: Ruby utilities for the DELPH-IN project
65
+ test_files:
66
+ - test/test_delphin.rb