delphin 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/README ADDED
@@ -0,0 +1,17 @@
1
+ = Ruby Utilities for DELPH-IN
2
+
3
+ This is a set of Ruby utilities for the {Delphin}[http://www.delph-in.net/] HPSG processing project.
4
+
5
+ = History
6
+
7
+ 0.0.1:: Profile data structures
8
+
9
+ = Copyright
10
+
11
+ Copyright 2009, William Patrick McNeill
12
+
13
+ This program is distributed under the GNU General Public License.
14
+
15
+ = Author
16
+
17
+ W.P. McNeill mailto:billmcn@gmail.com
@@ -0,0 +1,67 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require "delphin"
4
+ require "fileutils"
5
+ require "optparse"
6
+
7
+
8
# Report a fatal usage problem and terminate the script.
#
# Writes +error+ and then the parser's generic help text to standard output,
# and exits the process with +exit_code+ (default -1).
#
# [_parser_] Object whose +help+ method returns the usage text
# [_error_] The message (or exception) to print first
# [_exit_code_] Process exit status
def error_exit(parser, error, exit_code = -1)
  $stdout.puts(error)
  $stdout.puts(parser.help)
  Kernel.exit(exit_code)
end
14
+
15
+
16
# Main body of the cleanup script: list every directory matching the glob that
# is not a valid TSDB profile and, when --delete is given, remove it.

# Set to true by the --delete switch.
delete = false

# Names of the Logger severity constants accepted by --logging.
log_levels = %w[DEBUG INFO WARN ERROR FATAL UNKNOWN]

parser = OptionParser.new do |opts|
  opts.banner =<<-EOTEXT
#{File.basename(__FILE__)} [OPTION] glob [table table...]

List all invalid TSDB profiles matching the specified file glob and optionally delete them.

A directory is a valid profile if it contains a non-empty relations file. Other required non-empty table files may be specified as command line arguments.

By default this script merely prints the names of invalid profiles. It will also delete them if the delete switch is specified.

Bracket characters in globs must be escaped with two backslashes, e.g. \\\\[
  EOTEXT
  opts.on("-l", "--logging LEVEL", "Logging level") do |level|
    # Look the level up by name rather than eval-ing command line text, and
    # turn an unknown name into a normal option-parsing error.
    level_name = level.upcase
    unless log_levels.include?(level_name)
      raise OptionParser::InvalidArgument, level
    end
    Delphin.set_log_level(Logger.const_get(level_name))
  end

  opts.on("-d", "--delete", "Delete invalid profiles") do
    delete = true
  end
end

begin
  parser.parse!
rescue OptionParser::ParseError => e
  error_exit(parser, e)
end

error_exit(parser, "Incorrect number of arguments.") if ARGV.length < 1

# The first positional argument is the glob; any remaining arguments name
# tables whose data files must exist and be non-empty.
glob = ARGV.shift
table_names = ARGV

Pathname.glob(glob).select { |d| d.directory? }.each do |d|
  Delphin::LOGGER.debug("Profile directory #{d}")
  begin
    profile = Delphin::Profile.new(d)
    table_names.each do |table_name|
      table = profile[table_name]
      if File.zero?(table.filename)
        raise Delphin::EmptyDataFile.new(table_name, profile)
      end
    end
  rescue Delphin::InvalidProfileException => e
    # Report the invalid profile and remove it only when asked to.
    puts e.message
    FileUtils.rm_rf(d) if delete
  end
end
@@ -0,0 +1,61 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require "delphin"
4
+ require "optparse"
5
+ require "yaml"
6
+
7
+
8
# Report a fatal usage problem and terminate the script.
#
# Writes +error+ and then the parser's generic help text to standard output,
# and exits the process with +exit_code+ (default -1).
#
# [_parser_] Object whose +help+ method returns the usage text
# [_error_] The message (or exception) to print first
# [_exit_code_] Process exit status
def error_exit(parser, error, exit_code = -1)
  $stdout.puts(error)
  $stdout.puts(parser.help)
  Kernel.exit(exit_code)
end
14
+
15
+
16
# Main body of the scoring script: print summary statistics for every profile
# matching the glob, either as space-separated lines or as YAML.

# Set by --full-name: print the whole directory path instead of its basename.
full_name = false
# Set by --yaml: dump the statistics table as YAML.
to_yaml = false

# Names of the Logger severity constants accepted by --logging.
log_levels = %w[DEBUG INFO WARN ERROR FATAL UNKNOWN]

parser = OptionParser.new do |opts|
  opts.banner =<<-EOTEXT
#{File.basename(__FILE__)} [OPTION] glob

Print a summary of the scores in all the profiles matching the specified file
glob.

Bracket characters in paths must be escaped with two backslashes, e.g. \\\\[
  EOTEXT
  opts.on("-l", "--logging LEVEL", "Logging level") do |level|
    # Look the level up by name rather than eval-ing command line text, and
    # turn an unknown name into a normal option-parsing error.
    level_name = level.upcase
    unless log_levels.include?(level_name)
      raise OptionParser::InvalidArgument, level
    end
    Delphin.set_log_level(Logger.const_get(level_name))
  end

  opts.on("-f", "--full-name", "Print full directory name") do
    full_name = true
  end

  opts.on("-y", "--yaml", "Output results as YAML") do
    to_yaml = true
  end
end

begin
  parser.parse!
rescue OptionParser::ParseError => e
  error_exit(parser, e)
end

error_exit(parser, "Incorrect number of arguments.") unless ARGV.length == 1

stats = Delphin.summarize_folds(File.expand_path(ARGV.first))
s = if to_yaml
  # The YAML dump always carries the full directory names; --full-name only
  # affects the plain-text output below.
  YAML::dump(stats)
else
  stats.collect do |stat|
    # The directory name is the fourth column of each row.
    stat[3] = File.basename(stat[3]) unless full_name
    stat.join(" ")
  end.join("\n")
end
puts s
@@ -0,0 +1,377 @@
1
+ # Copyright 2009 William Patrick McNeill
2
+ #
3
+ # This file is part of the DELPH-IN Ruby Utility Package.
4
+ #
5
+ # The DELPH-IN Ruby Utility Package is free software; you can redistribute it
6
+ # and/or modify it under the terms of the GNU General Public License as
7
+ # published by the Free Software Foundation; either version 2 of the License,
8
+ # or (at your option) any later version.
9
+ #
10
+ # The DELPH-IN Ruby Utility Package is distributed in the hope that it will
11
+ # be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of
12
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General
13
+ # Public License for more details.
14
+ #
15
+ # You should have received a copy of the GNU General Public License along with
16
+ # this package; if not, write to the Free Software Foundation, Inc., 51 Franklin
17
+ # St, Fifth Floor, Boston, MA 02110-1301 USA
18
+
19
require "logger"
require "pathname"
require "set"
require "zlib"
22
+
23
+ # Utilities for use with the DELPH-IN project.
24
+ module Delphin
25
+ VERSION = "0.0.1"
26
+
27
# Build the logger shared by this module.
#
# The logger writes to STDERR, starts at the ERROR level and stamps entries
# with a "%Y-%m-%d %H:%M:%S" date format. It is called once, while the module
# body is evaluated, to populate LOGGER.
def Delphin.initialize_logger
  Logger.new(STDERR).tap do |log|
    log.level = Logger::ERROR
    log.datetime_format = "%Y-%m-%d %H:%M:%S"
  end
end

private_class_method :initialize_logger

# Logger used by all objects in this module. It is created when the module is
# loaded, with a default log level of ERROR.
LOGGER = initialize_logger
41
+
42
# Change the severity threshold of the module logger. For example:
#
# > Delphin.set_log_level(Logger::DEBUG)
#
# [_level_] A Logger severity constant
def Delphin.set_log_level(level)
  LOGGER.level = level
end
48
+
49
# Abstract base class for all exceptions raised by this module.
#
# Derives from StandardError (not Exception) so that a bare +rescue+ catches
# it, as is conventional for application-level errors. Code that rescues
# InvalidProfileException by name is unaffected.
class InvalidProfileException < StandardError
end
52
+
53
# Raised when a table's data file cannot be found in a profile.
class MissingDataFile < InvalidProfileException
  # [_name_] The table name
  # [_profile_] The profile that was searched
  def initialize(name, profile)
    @name, @profile = name, profile
  end

  # Human-readable description naming the table and the profile.
  def to_s
    "Missing data file for table " + @name.to_s + " in " + @profile.to_s + "."
  end
end
63
+
64
# Raised when a table's data file exists but holds no data.
class EmptyDataFile < InvalidProfileException
  # [_name_] The table name
  # [_profile_] The profile containing the table
  def initialize(name, profile)
    @name, @profile = name, profile
  end

  # Human-readable description naming the table and the profile.
  def to_s
    "Empty data file for table " + @name.to_s + " in " + @profile.to_s + "."
  end
end
74
+
75
# Raised when a profile directory has no relations file.
class MissingRelationsFile < InvalidProfileException
  # [_directory_] The profile directory that was searched
  def initialize(directory)
    @directory = directory
  end

  # Human-readable description naming the directory.
  def to_s
    "Missing relations file in " + @directory.to_s + "."
  end
end
84
+
85
# Raised when a line of a relations file cannot be parsed.
class InvalidRelationsFile < InvalidProfileException
  # [_filename_] The relations file being read
  # [_linenum_] Number of the offending line
  # [_line_] Text of the offending line
  def initialize(filename, linenum, line)
    @filename, @linenum, @line = filename, linenum, line
  end

  # Human-readable description: the location on the first line and the
  # offending text on the second.
  def to_s
    "Invalid line " + @linenum.to_s + " " + @filename.to_s + "\n" + @line.to_s
  end
end
96
+
97
# Given a file glob that picks out profiles, extract numerical information
# from one field of one table in each profile, calculate statistics, and
# return a table sorted by mean, highest first.
#
# Each row of the result is [mean, standard deviation, range, directory name].
# Directories that are not valid profiles are logged at the ERROR level and
# left out of the result.
#
# [_glob_] A file glob
# [_table_] Name of the table to read (defaults to "fold")
# [_field_] Name of the numeric field to summarize (defaults to "f-accuracy")
def Delphin.summarize_folds(glob, table = "fold", field = "f-accuracy")
  rows = []
  Pathname.glob(glob).select { |d| d.directory? }.each do |d|
    LOGGER.debug("Profile directory #{d}")
    begin
      s = Profile.new(d).statistics(table, field)
    rescue InvalidProfileException => e
      LOGGER.error(e.message)
      next
    end
    rows << [s.mean, s.sdev, s.range, d.to_s]
  end
  # Negate the mean so that the largest mean sorts first.
  rows.sort_by { |r| -r.first }
end
114
+
115
+
116
# The name of a Logon TSDB output profile.
#
# These are long strings with information about the features used to
# generate them in brackets.
class OutputProfileName < Struct.new(:prefix, :grandparenting,
                                     :constituent_weight, :active_edges,
                                     :ngram_size, :ngram_back_off,
                                     :relative_tolerance, :variance)
  # The machine learning features encoded in a profile name, i.e. every
  # struct member except the prefix.
  LEARNER_FEATURES = [:grandparenting, :constituent_weight, :active_edges,
                      :ngram_size, :ngram_back_off, :relative_tolerance,
                      :variance].freeze

  # Extended-mode pattern for a profile name. The numbered groups are the
  # values stored in the struct members.
  NAME_PATTERN = /\[(\S+)\]\s                     # 1 Prefix
                  GP\[(\d+)\]\s                   # 2 Grandparenting
                  [+-]PT\s
                  [+-]LEX\s
                  CW\[(\d*)\]\s                   # 3 Constituent weight
                  ([+-])AE\s                      # 4 Active edges
                  NS\[(\d+)\]\s                   # 5 N-gram size
                  NT\[\w*\]\s
                  ([+-])NB\s                      # 6 N-gram backoff
                  LM\[\d+\]\s
                  FT\[:::\d+\]\s
                  RS\[\]\s
                  MM\[\S+\]\s
                  MI\[\d+\]\s
                  RT\[(\d+(?:\.\d+)?e[+-]\d+)\]\s # 7 Relative tolerance
                  AT\[\d+(?:\.\d+)?e[+-]\d+\]\s
                  VA\[(\d+(?:\.\d+)?e[+-]\d+)?\]\s # 8 Variance
                  PC\[\d+\]
                 /x

  # The basename of the profile
  attr_reader :name

  # Create an output profile name from a directory name.
  #
  # Path information will be stripped from the name parameter. Numeric
  # features are converted to numbers, the +/- switches to booleans, and an
  # empty variance to nil.
  #
  # [_name_] A directory name
  #
  # Raises ArgumentError if the basename does not look like a profile name.
  def initialize(name)
    @name = File.basename(name)
    fields = NAME_PATTERN.match(@name)
    raise ArgumentError.new("Invalid profile name #{name}") if fields.nil?
    super(fields[1], fields[2].to_i, fields[3].to_i, fields[4] == "+",
          fields[5].to_i, fields[6] == "+", fields[7].to_f,
          fields[8] && fields[8].to_f)
  end

  # The prefix followed by the feature string.
  def to_s
    "#{prefix}:#{feature_string}"
  end

  # Do this profile name and other have all the same machine learning
  # features?
  def equal_learner_features?(other)
    LEARNER_FEATURES.all? do |feature|
      send(feature) == other.send(feature)
    end
  end

  # A compact and readable representation of all the feature values.
  def feature_string
    LEARNER_FEATURES.collect do |feature|
      "#{feature.to_s}=#{send(feature)}"
    end.join(",")
  end

end # OutputProfileName
180
+
181
+
182
# A TSDB profile
#
# A profile is a directory that contains a relations file describing its
# tables, plus one data file per table.
class Profile
  # Summary statistics returned by #statistics.
  Statistics = Struct.new(:mean, :sdev, :range)

  attr_reader :directory, :relations

  # Open the profile in the specified directory and parse its relations file.
  #
  # [_directory_] The profile directory
  #
  # Raises MissingRelationsFile if the directory has no relations file.
  def initialize(directory)
    @directory = directory
    begin
      # File.open rather than Kernel#open: the latter would run a command if
      # the path happened to begin with a pipe character.
      @relations = File.open(File.join(directory, "relations")) do |file|
        RelationsFile.new(file)
      end
    rescue Errno::ENOENT
      raise MissingRelationsFile.new(directory)
    end
  end

  def inspect
    "#{self.class}(#{directory})"
  end

  def to_s
    inspect
  end

  # A list of all the tables in the profile
  def tables
    @relations.keys
  end

  # Open the specified table file.
  def [](name)
    ProfileTable.new(self, name, @relations[name])
  end

  # Return the mean, standard deviation and range of the numeric values in
  # the specified field.
  #
  # The standard deviation is the sample (n-1) form, except for a single
  # record where the divisor stays 1 and the result is zero.
  #
  # [_table_] The table name
  # [_field_] The field label; values are converted with +to_f+
  #
  # Raises EmptyDataFile if the table has no records.
  def statistics(table, field)
    values = self[table].collect { |r| r[field].to_f }
    count = values.length
    raise EmptyDataFile.new(table, self) if count.zero?
    mean = (values.inject(0.0) { |sum, x| sum + x }) / count
    divisor = count > 1 ? count - 1 : count
    squares = values.inject(0.0) { |sum, x| sum + (x - mean)**2 }
    Statistics.new(mean, Math.sqrt(squares / divisor), values.max - values.min)
  end
end
228
+
229
+
230
# A data table in a TSDB profile.
#
# The table is backed by a file in the profile directory named after the
# table, optionally gzipped with a ".gz" suffix. Enumerating the table yields
# one record per line of that file.
class ProfileTable
  include Enumerable

  attr_reader :profile, :name, :schema, :filename

  # Locate the data file for a table.
  #
  # [_profile_] The profile that owns the table; must respond to +directory+
  # [_name_] The table name
  # [_schema_] Object whose +record+ method turns a line of text into a record
  #
  # Raises MissingDataFile if neither the plain nor the gzipped file exists.
  def initialize(profile, name, schema)
    @profile = profile
    @name = name
    @schema = schema
    # Find the file containing this table. It may be gzipped.
    plain = File.join(profile.directory, name)
    gzipped = plain + ".gz"
    if File.exist?(plain)
      @filename = plain
      @reader = File
    elsif File.exist?(gzipped)
      @filename = gzipped
      @reader = Zlib::GzipReader
    else
      raise MissingDataFile.new(@name, @profile)
    end
  end

  def inspect
    "#{self.class}(#{name}) in #{profile}"
  end

  def to_s
    inspect
  end

  # Enumerate the records in this table.
  #
  # The data file is opened for each enumeration and closed afterwards, so
  # the table can be enumerated more than once and no handle is left open.
  # Returns an enumerator when called without a block.
  def each
    return to_enum(:each) unless block_given?
    @reader.open(@filename) do |file|
      file.each_line do |line|
        # Non-destructive strip: strip! returns nil when there is nothing to
        # remove, e.g. on a final line without a newline.
        yield @schema.record(line.strip)
      end
    end
    self
  end
end # ProfileTable
269
+
270
+
271
# A database schema table in a profile.
#
# This is a list of field labels and their types.
class ProfileTableSchema < Array
  # One field of the schema: its label and its type.
  Field = Struct.new(:label, :type)

  attr_reader :name, :keys, :partials

  # Create an empty schema.
  #
  # [_init_name_] The table name
  def initialize(init_name)
    super()
    @name = init_name
    @keys = Set.new
    @partials = Set.new
  end

  # The string representation is identical to what appears in the relations
  # file.
  def to_s
    lines = collect do |field|
      s = " #{field.label} :#{field.type}"
      s += " :key" if is_key?(field.label)
      s += " :partial" if is_partial?(field.label)
      s
    end
    "#{name}:\n" + lines.join("\n")
  end

  # Generate a data record from a line of text.
  #
  # A data record is a hash of field labels to values. The text is split on
  # "@" and the pieces are paired with the fields in order. Values of integer
  # fields are converted with +to_i+; all other values stay strings. A field
  # with no corresponding piece of text maps to nil.
  def record(text)
    values = text.split(/@/)
    pairs = zip(values).collect do |field, value|
      # The type may be the string "integer" (as stored when a relations file
      # is parsed) or the symbol :integer, so compare its string form.
      if field.type.to_s == "integer" && !value.nil?
        value = value.to_i
      end
      [field.label, value]
    end
    Hash[pairs]
  end

  # Add a new field and type.
  #
  # [_label_] The field label
  # [_type_] The field type
  # [_key_] Is this field a key?
  # [_partial_] Is this field a partial?
  def add_field(label, type, key = false, partial = false)
    push(Field.new(label, type))
    @keys.add(label) if key
    @partials.add(label) if partial
  end

  # Is the specified label a key?
  def is_key?(label)
    @keys.member?(label)
  end

  # Is the specified label a partial?
  def is_partial?(label)
    @partials.member?(label)
  end
end # ProfileTableSchema
331
+
332
+
333
# A file that contains a set of database schema tables.
#
# This object is a hash of ProfileTableSchema objects indexed by table name.
class RelationsFile < Hash
  # Parse a relations file.
  #
  # A table starts with a "name:" line and is followed by one field line per
  # field; a blank line ends the table. Text from "#" to the end of a line is
  # ignored.
  #
  # [_file_] An open file, or any object that enumerates lines of text
  #
  # Raises InvalidRelationsFile on a line that cannot be parsed.
  def initialize(file)
    super()
    # Name reported in error messages. Objects without a path (e.g. a
    # StringIO) are identified by their inspect string.
    source = file.respond_to?(:path) ? file.path : file.inspect
    state = :outside_table
    table_name = nil
    file.each_with_index do |raw_line, i|
      # Remove comments and surrounding whitespace.
      line = raw_line.sub(/#.*/, "").strip
      case state
      when :inside_table
        if line.empty?
          state = :outside_table
        elsif line =~ /^(\S+)\s+:(\w+)(\s+:key)?(\s+:partial)?$/
          # E.g. parse-id :integer :key
          self[table_name].add_field($1, $2, !$3.nil?, !$4.nil?)
        else
          raise InvalidRelationsFile.new(source, i+1, line)
        end
      when :outside_table
        next if line.empty?
        if line =~ /(\S+):/
          # E.g. item:
          table_name = $1
          self[table_name] = ProfileTableSchema.new(table_name)
          state = :inside_table
        else
          raise InvalidRelationsFile.new(source, i+1, line)
        end
      end
    end # each_with_index
  end # initialize

  # Print out a relations file
  def to_s
    values.map {|t| t.to_s}.join("\n\n")
  end # to_s
end # RelationsFile
375
+
376
+
377
+ end # Delphin
@@ -0,0 +1,77 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ #--
4
+
5
+ # Copyright 2009 William Patrick McNeill
6
+ #
7
+ # This file is part of the DELPH-IN Ruby Utility Package.
8
+ #
9
+ # The DELPH-IN Ruby Utility Package is free software; you can redistribute it
10
+ # and/or modify it under the terms of the GNU General Public License as
11
+ # published by the Free Software Foundation; either version 2 of the License,
12
+ # or (at your option) any later version.
13
+ #
14
+ # The DELPH-IN Ruby Utility Package is distributed in the hope that it will be
15
+ # useful, but WITHOUT ANY WARRANTY; without even the implied warranty of
16
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General
17
+ # Public License for more details.
18
+ #
19
+ # You should have received a copy of the GNU General Public License along with
20
+ # this package; if not, write to the Free Software Foundation, Inc., 51 Franklin
21
+ # St, Fifth Floor, Boston, MA 02110-1301 USA
22
+ #
23
+ #++
24
+
25
+ # Test cases for the Delphin module
26
+
27
+ require "test/unit"
28
+ require "delphin"
29
+
30
+
31
# Smoke test for schema construction.
class Schema < Test::Unit::TestCase
  # A freshly created schema keeps its name and has no fields.
  def test_stub
    schema = Delphin::ProfileTableSchema.new("schema")
    assert_equal("schema", schema.name)
    assert(schema.empty?)
  end
end
36
+
37
# Tests for parsing profile directory names into OutputProfileName objects.
class TestOutputProfileName < Test::Unit::TestCase
  # Two valid names that differ in several learner features.
  def setup
    @name1 = "[tanaka-train] GP[0] +PT -LEX CW[1] +AE NS[4] NT[type] +NB LM[0] FT[:::1] RS[] MM[tao_lmvm] MI[5000] RT[1.0e-8] AT[1.0e-20] VA[1.0e+0] PC[100]"
    @name2 = "[jhpstg] GP[0] +PT -LEX CW[] -AE NS[3] NT[type] +NB LM[0] FT[:::1] RS[] MM[tao_lmvm] MI[5000] RT[1.0e-6] AT[1.0e-20] VA[1.0e-4] PC[100]"
    @profile1 = Delphin::OutputProfileName.new(@name1)
    @profile2 = Delphin::OutputProfileName.new(@name2)
  end

  # Every field of a valid name is extracted and converted.
  def test_valid_name
    parsed = @profile1
    assert_equal(@name1, parsed.name)
    assert_equal("tanaka-train", parsed.prefix)
    assert_equal(0, parsed.grandparenting)
    assert_equal(1, parsed.constituent_weight)
    assert_equal(true, parsed.active_edges)
    assert_equal(4, parsed.ngram_size)
    assert_equal(true, parsed.ngram_back_off)
    assert_equal(1e-8, parsed.relative_tolerance)
    assert_equal(1, parsed.variance)
  end

  # An empty VA[] field yields a nil variance.
  def test_empty_variance
    name = "[jhpstg] GP[0] +PT -LEX CW[2] +AE NS[2] NT[type] +NB LM[0] FT[:::1] RS[] MM[tao_lmvm] MI[5000] RT[1.0e-6] AT[1.0e-20] VA[] PC[100]"
    assert_nil(Delphin::OutputProfileName.new(name).variance)
  end

  # The feature string lists every learner feature as name=value.
  def test_feature_string
    expected = "grandparenting=0,constituent_weight=1,active_edges=true,ngram_size=4,ngram_back_off=true,relative_tolerance=1.0e-08,variance=1.0"
    assert_equal(expected, @profile1.feature_string)
  end

  # A name has the same features as itself but not as a different name.
  def test_feature_equivalence
    assert(@profile1.equal_learner_features?(@profile1))
    assert(!@profile1.equal_learner_features?(@profile2))
  end

  # A string that is not a profile name is rejected.
  def test_invalid_name
    assert_raise(ArgumentError) do
      Delphin::OutputProfileName.new("bogus")
    end
  end

end
metadata ADDED
@@ -0,0 +1,66 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: delphin
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ platform: ruby
6
+ authors:
7
+ - W.P. McNeill
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+
12
+ date: 2009-09-23 00:00:00 -07:00
13
+ default_executable:
14
+ dependencies: []
15
+
16
+ description: |
17
+ This module is a Ruby wrapper for the DELPH-IN project.
18
+
19
+ email: billmcn@gmail.com
20
+ executables: []
21
+
22
+ extensions: []
23
+
24
+ extra_rdoc_files:
25
+ - README
26
+ files:
27
+ - test/test_delphin.rb
28
+ - lib/delphin.rb
29
+ - bin/cleanup-profile
30
+ - bin/score-profile
31
+ - README
32
+ has_rdoc: true
33
+ homepage: http://delphin.rubyforge.org/
34
+ licenses: []
35
+
36
+ post_install_message:
37
+ rdoc_options:
38
+ - - --title
39
+ - Delphin -- DELPH-IN utilities
40
+ - --main
41
+ - README
42
+ - --line-numbers
43
+ - --inline-source
44
+ require_paths:
45
+ - lib
46
+ required_ruby_version: !ruby/object:Gem::Requirement
47
+ requirements:
48
+ - - ">="
49
+ - !ruby/object:Gem::Version
50
+ version: "0"
51
+ version:
52
+ required_rubygems_version: !ruby/object:Gem::Requirement
53
+ requirements:
54
+ - - ">="
55
+ - !ruby/object:Gem::Version
56
+ version: "0"
57
+ version:
58
+ requirements: []
59
+
60
+ rubyforge_project: delphin
61
+ rubygems_version: 1.3.5
62
+ signing_key:
63
+ specification_version: 3
64
+ summary: Ruby utilities for the DELPH-IN project
65
+ test_files:
66
+ - test/test_delphin.rb