shalmaneser-lib 1.2.rc5

Sign up to get free protection for your applications and to get access to all the features.
Files changed (70) hide show
  1. checksums.yaml +7 -0
  2. data/.yardopts +10 -0
  3. data/CHANGELOG.md +4 -0
  4. data/LICENSE.md +4 -0
  5. data/README.md +122 -0
  6. data/lib/configuration/config_data.rb +457 -0
  7. data/lib/configuration/config_format_element.rb +210 -0
  8. data/lib/configuration/configuration_error.rb +15 -0
  9. data/lib/configuration/external_config_data.rb +56 -0
  10. data/lib/configuration/frappe_config_data.rb +134 -0
  11. data/lib/configuration/fred_config_data.rb +199 -0
  12. data/lib/configuration/rosy_config_data.rb +126 -0
  13. data/lib/db/db_interface.rb +50 -0
  14. data/lib/db/db_mysql.rb +141 -0
  15. data/lib/db/db_sqlite.rb +280 -0
  16. data/lib/db/db_table.rb +237 -0
  17. data/lib/db/db_view.rb +416 -0
  18. data/lib/db/db_wrapper.rb +175 -0
  19. data/lib/db/select_table_and_columns.rb +10 -0
  20. data/lib/db/sql_query.rb +243 -0
  21. data/lib/definitions.rb +19 -0
  22. data/lib/eval.rb +482 -0
  23. data/lib/ext/maxent/Classify.class +0 -0
  24. data/lib/ext/maxent/Train.class +0 -0
  25. data/lib/external_systems.rb +251 -0
  26. data/lib/framenet_format/fn_corpus_aset.rb +209 -0
  27. data/lib/framenet_format/fn_corpus_xml_file.rb +120 -0
  28. data/lib/framenet_format/fn_corpus_xml_sentence.rb +299 -0
  29. data/lib/framenet_format/fn_database.rb +143 -0
  30. data/lib/framenet_format/frame_xml_file.rb +104 -0
  31. data/lib/framenet_format/frame_xml_sentence.rb +411 -0
  32. data/lib/logging.rb +25 -0
  33. data/lib/ml/classifier.rb +189 -0
  34. data/lib/ml/mallet.rb +236 -0
  35. data/lib/ml/maxent.rb +229 -0
  36. data/lib/ml/optimize.rb +195 -0
  37. data/lib/ml/timbl.rb +140 -0
  38. data/lib/monkey_patching/array.rb +82 -0
  39. data/lib/monkey_patching/enumerable_bool.rb +24 -0
  40. data/lib/monkey_patching/enumerable_distribute.rb +18 -0
  41. data/lib/monkey_patching/file.rb +131 -0
  42. data/lib/monkey_patching/subsumed.rb +24 -0
  43. data/lib/ruby_class_extensions.rb +4 -0
  44. data/lib/salsa_tiger_xml/corpus.rb +24 -0
  45. data/lib/salsa_tiger_xml/fe_node.rb +98 -0
  46. data/lib/salsa_tiger_xml/file_parts_parser.rb +214 -0
  47. data/lib/salsa_tiger_xml/frame_node.rb +145 -0
  48. data/lib/salsa_tiger_xml/graph_node.rb +347 -0
  49. data/lib/salsa_tiger_xml/reg_xml.rb +285 -0
  50. data/lib/salsa_tiger_xml/salsa_tiger_sentence.rb +596 -0
  51. data/lib/salsa_tiger_xml/salsa_tiger_sentence_graph.rb +333 -0
  52. data/lib/salsa_tiger_xml/salsa_tiger_sentence_sem.rb +438 -0
  53. data/lib/salsa_tiger_xml/salsa_tiger_xml_helper.rb +84 -0
  54. data/lib/salsa_tiger_xml/salsa_tiger_xml_node.rb +161 -0
  55. data/lib/salsa_tiger_xml/sem_node.rb +58 -0
  56. data/lib/salsa_tiger_xml/string_terminals_in_right_order.rb +192 -0
  57. data/lib/salsa_tiger_xml/syn_node.rb +169 -0
  58. data/lib/salsa_tiger_xml/tree_node.rb +59 -0
  59. data/lib/salsa_tiger_xml/ts_syn_node.rb +47 -0
  60. data/lib/salsa_tiger_xml/usp_node.rb +72 -0
  61. data/lib/salsa_tiger_xml/xml_node.rb +163 -0
  62. data/lib/shalmaneser/lib.rb +1 -0
  63. data/lib/tabular_format/fn_tab_format_file.rb +38 -0
  64. data/lib/tabular_format/fn_tab_frame.rb +67 -0
  65. data/lib/tabular_format/fn_tab_sentence.rb +169 -0
  66. data/lib/tabular_format/tab_format_file.rb +91 -0
  67. data/lib/tabular_format/tab_format_named_args.rb +184 -0
  68. data/lib/tabular_format/tab_format_sentence.rb +119 -0
  69. data/lib/value_restriction.rb +49 -0
  70. metadata +131 -0
@@ -0,0 +1,126 @@
1
+ require_relative 'config_data'
2
+ require_relative 'configuration_error'
3
+
4
+ ##############################
5
+ # Class RosyConfigData
6
+ #
7
+ # inherits from ConfigData,
8
+ # sets features for ROSY
9
+ module Shalmaneser
10
+ module Configuration
11
+ class RosyConfigData < ConfigData
12
+ CONFIG_DEFS = {
13
+ "feature" => "list",
14
+ "classifier" => "list",
15
+ "verbose" => "bool",
16
+ "experiment_ID" => "string",
17
+ "directory_input_train" => "string",
18
+ "directory_input_test" => "string",
19
+ "directory_output" => "string",
20
+ "preproc_descr_file_train" => "string",
21
+ "preproc_descr_file_test" => "string",
22
+ "external_descr_file" => "string",
23
+ "dbtype" => "string", # "mysql" or "sqlite"
24
+
25
+ "host" => "string", # DB access: MySQL only
26
+ "user" => "string",
27
+ "passwd" => "string",
28
+ "dbname" => "string",
29
+
30
+ "data_dir" => "string", # for external use
31
+ "rosy_dir" => "pattern", # for internal use only, set by rosy.rb
32
+
33
+ "classifier_dir" => "string", # if present, special directory for classifiers
34
+
35
+ "classif_column_name" => "string",
36
+ "main_table_name" => "pattern",
37
+ "test_table_name" => "pattern",
38
+
39
+ "eval_file" => "pattern",
40
+ "log_file" => "pattern",
41
+ "failed_file" => "pattern",
42
+ "classifier_file" => "pattern",
43
+ "classifier_output_file" => "pattern",
44
+ "noval" => "string",
45
+ "split_nones" => "bool",
46
+ "print_eval_log" => "bool",
47
+ "assume_argrec_perfect" => "bool",
48
+ "xwise_argrec" => "string",
49
+ "xwise_arglab" => "string",
50
+ "xwise_onestep" => "string",
51
+ "fe_syn_repair" => "bool", # map words to constituents for FEs: idealize?
52
+ "fe_rel_repair" => "bool", # FEs: include non-included relative clauses into FEs
53
+ "prune" => "string", # pruning prior to argrec?
54
+
55
+ # Imported from PrepConfigData
56
+ 'do_postag' => 'bool',
57
+ 'do_lemmatize' => 'bool',
58
+ 'do_parse' => 'bool',
59
+ 'pos_tagger' => 'string',
60
+ 'lemmatizer' => 'string',
61
+ 'parser' => 'string'
62
+ }
63
+
64
+ def initialize(filename)
65
+ super(filename, CONFIG_DEFS, ["exp_ID", "test_ID", "split_ID",
66
+ "feature_name", "classif", "step",
67
+ "group", "dataset", "mode"])
68
+
69
+ # set access functions for list features
70
+ set_list_feature_access("feature",
71
+ method("access_feature"))
72
+
73
+ # set access functions for list features
74
+ set_list_feature_access("classifier",
75
+ method("access_feature"))
76
+ validate
77
+ end
78
+
79
+ ###
80
+ # protected
81
+
82
+ #####
83
+ # access_feature
84
+ #
85
+ # access function for feature 'feature'
86
+ #
87
+ # assumed format in the config file:
88
+ #
89
+ # feature = path [option]*
90
+ #
91
+ # i.e. first the name of the feature type to use, then
92
+ # optionally options associated with that feature,
93
+ # e.g. 'argrec': use that feature only when computing argrec
94
+ #
95
+ # the access function is called with parameter val_list, an array of
96
+ # string tuples, one string tuple for each feature defined.
97
+ # the first string in the tuple is the feature name, the rest are the options
98
+ #
99
+ # returns: a list of pairs [feature_name(string), options(array:string)]
100
+ # of defined features
101
+ def access_feature(val_list) # array:array:string: list of tuples defined in config file
102
+ # for feature 'feature'
103
+ if val_list.nil?
104
+ []
105
+ else
106
+ val_list.map do |feature_descr_tuple|
107
+ [feature_descr_tuple.first, feature_descr_tuple[1..-1]]
108
+ end
109
+ end
110
+ end
111
+
112
+ private
113
+
114
# Checks the loaded configuration for values that cannot be used.
#
# At present only "experiment_ID" is checked: it has to consist solely of
# ASCII letters, digits and underscores.
#
# The pattern is anchored with \A and \z rather than ^ and $. The latter
# match at every line boundary, so a multi-line value containing one
# well-formed line (or a value with a trailing newline) would be accepted.
#
# @return [nil] when the configuration is acceptable
# @raise [ConfigurationError] with all collected problems, one per line
def validate
  msg = []

  unless get("experiment_ID") =~ /\A[A-Za-z0-9_]+\z/
    msg << 'Please choose an alphanumeric experiment ID! '\
           "You provided: #{get('experiment_ID')}"
  end

  raise(ConfigurationError, msg.join("\n")) if msg.any?
end
124
+ end
125
+ end
126
+ end
@@ -0,0 +1,50 @@
1
+ # DBInterface
2
+ #
3
+ # Okay, things are getting somewhat complicated here with all
4
+ # the DB classes, but this is how it all fits together:
5
+ #
6
+ # - DBWrapper: abstract class describing the DB interface
7
+ # - DBMySQL, DBSQLite: subclasses of DBWrapper, for MySQL
8
+ # and SQLite, respectively
9
+ # - DBInterface: class to be used from outside,
10
+ # decides ( based on the experiment file) whether to use
11
+ # MySQL or SQLite and makes an object of the right kind,
12
+ # 'require'-ing either DBMySQL or DBSQLite, but not both,
13
+ # because the right ruby packages might not be installed
14
+ # for both SQL systems
15
+ # @note This class will become obsolete if MySQL support is removed.
16
# Entry point for obtaining a database wrapper object.
class DBInterface
  # Creates the database wrapper selected by the experiment's 'dbtype' entry.
  #
  # The backend file is required only once its type has been chosen, so
  # only the client library of the selected SQL system must be installed.
  #
  # Terminates the process with exit status 1 if the backend cannot be
  # loaded or if 'dbtype' is neither "mysql" nor "sqlite".
  #
  # @param exp experiment file object with a 'dbtype' entry
  # @param dir [String, nil] Shalmaneser directory (used by SQLite only)
  # @param identifier [String, nil] identifier of the data (used by SQLite only)
  # @return [DBMySQL, DBSQLite] the freshly created wrapper object
  def self.get_db_interface(exp, dir = nil, identifier = nil)
    case exp.get('dbtype')
    when 'mysql'
      load_backend('db/db_mysql', 'MySQL')
      DBMySQL.new(exp)
    when 'sqlite'
      load_backend('db/db_sqlite', 'SQLite')
      DBSQLite.new(exp, dir, identifier)
    else
      $stderr.puts 'Error: database type needs to be either "mysql" or "sqlite".'
      $stderr.puts 'Please set parameter "dbtype" in the experiment file accordingly.'
      exit 1
    end
  end

  # Requires a backend file; on failure reports the problem and exits.
  #
  # LoadError has to be named explicitly: it descends from ScriptError, not
  # StandardError, so a bare `rescue` would let a missing library escape
  # without the installation hint ever being printed.
  #
  # @param path [String] feature to require, e.g. 'db/db_sqlite'
  # @param package [String] package name shown in the installation hint
  # @return [Boolean] the result of `require`
  def self.load_backend(path, package)
    require path
  rescue LoadError, StandardError => e
    $stderr.puts "#{e.class}: #{e.message}"
    $stderr.puts 'Error loading DB interface.'
    $stderr.puts "Make sure you have the Ruby #{package} package installed."
    exit 1
  end
  private_class_method :load_backend
end
@@ -0,0 +1,141 @@
1
+ # DBMysql: a subclass of DBWrapper.
2
+ #
3
+ # Use a MySQL server to access a database.
4
+ # Use the Ruby mysql interface package for that.
5
+
6
+ require 'mysql'
7
+ require 'db/db_wrapper'
8
+
9
+ #################
10
+ class DBMySQLResult < DBResult
11
+ # initialize with the result of Mysql::query
12
+ # which is a MysqlResult object
13
+ #
14
+ # also remember the offset of the first row
15
+ # for reset()
16
+ def initialize(value)
17
+ super(value)
18
+ @row_first = @result.row_tell
19
+ end
20
+
21
+ ###
22
+ # reset object such that each() can be run again
23
+ def reset
24
+ @result.row_seek(@row_first)
25
+ end
26
+
27
+ ###
28
+ # column names: list of strings
29
+ def list_column_names
30
+ current = @result.row_tell
31
+ fields = @result.fetch_fields.map(&:name)
32
+ @result.row_seek(current)
33
+
34
+ fields
35
+ end
36
+ end
37
+
38
+ #################
39
+ class DBMySQL < DBWrapper
40
+ ###
41
+ # initialization:
42
+ #
43
+ # open connection to MySQL server
44
+ def initialize(exp) # RosyConfigData experiment file object
45
+ super(exp)
46
+
47
+ @database = Mysql.real_connect(@exp.get('host'), @exp.get('user'),
48
+ @exp.get('passwd'), @exp.get('dbname'))
49
+
50
+ end
51
+
52
+
53
+ ###
54
+ # make a table
55
+ #
56
+ # returns: nothing
57
+ def create_table(table_name, # string
58
+ column_formats, # array: array: string*string [column_name,column_format]
59
+ index_column_names, # array: string: column_name
60
+ indexname) # string: name of automatically created index column
61
+
62
+ string = "CREATE TABLE #{table_name} (" +
63
+ "#{indexname} INT NOT NULL AUTO_INCREMENT"
64
+
65
+ # column declarations
66
+ unless column_formats.empty?
67
+ string << ", "
68
+ string << column_formats.map { |name, format| name.to_s + " " + format.to_s }.join(",")
69
+ end
70
+
71
+ # primary key
72
+ string << ", " + "PRIMARY KEY(#{indexname})"
73
+
74
+ # other keys
75
+ unless index_column_names.empty?
76
+ string << ", "
77
+ string << index_column_names.map { |name| "KEY(#{name})" }.join(",")
78
+ end
79
+ string << ");"
80
+
81
+ query_noretv(string)
82
+ end
83
+
84
+ ####
85
+ # querying the database:
86
+ # returns a DBResult object
87
+ def query(query)
88
+ result = @database.query(query)
89
+ if result
90
+ return DBMySQLResult.new(result)
91
+ else
92
+ return nil
93
+ end
94
+ end
95
+
96
+ ####
97
+ # querying the database:
98
+ # no result value
99
+ def query_noretv(query)
100
+ @database.query(query)
101
+ return nil
102
+ end
103
+
104
+ ###
105
+ # list all tables in the database
106
+ #
107
+ # array of strings
108
+ def list_tables
109
+ return @database.list_tables
110
+ end
111
+
112
+
113
+ #####
114
+ # list_column_formats
115
+ #
116
+ # list column names and column types of this table
117
+ #
118
+ # returns: array:string*string, list of pairs [column name, column format]
119
+ def list_column_formats(table_name)
120
+ retv = []
121
+ @database.query("DESCRIBE #{table_name}").each_hash { |field|
122
+ retv << [field["Field"], field["Type"]]
123
+ }
124
+ return retv
125
+ end
126
+
127
+ ####
128
+ # num_rows
129
+ #
130
+ # determine the number of rows in a table
131
+ # returns: integer or nil
132
####
# num_rows
#
# Determine the number of rows in a table by looking the table up in the
# output of "SHOW TABLE STATUS".
#
# The "Rows" field is converted to an Integer so that the result matches
# the documented return type and the behaviour of DBSQLite#num_rows.
# NOTE(review): MySQL documents "Rows" as an approximation for some
# storage engines (e.g. InnoDB) -- confirm that callers tolerate this.
#
# table_name: string, name of the table
#
# returns: integer, or nil if the table is not listed or has no row count
def num_rows(table_name)
  @database.query("SHOW TABLE STATUS").each_hash do |status|
    next unless status["Name"] == table_name

    rows = status["Rows"]
    return rows.nil? ? nil : rows.to_i
  end

  nil
end
140
+
141
+ end
@@ -0,0 +1,280 @@
1
+ # DBSQLite: a subclass of DBWrapper.
2
+ #
3
+ # Use SQLite to access a database.
4
+ # Use the Ruby sqlite3 interface package for that.
5
+
6
+ require 'sqlite3'
7
+ require 'tempfile'
8
+
9
+ require 'db/db_wrapper'
10
+
11
+ #################
12
+ class DBSQLiteResult < DBResult
13
+ # initialize with the result of SQLite::execute()
14
+ # which returns an array of rows
15
+ # Each row is an array
16
+ # but additionally has attributes
17
+ # - fields: returns an array of strings, the column names
18
+ # - types: returns an array of strings, the column types
19
+ def initialize(value)
20
+ super(value)
21
+ @counter = 0
22
+ end
23
+
24
+ ###
25
+ # column names: list of strings
26
+ def list_column_names
27
+ return @result.columns
28
+ end
29
+
30
+ # number of rows: returns an integer
31
+ def num_rows
32
+ # remember where we were in iterating over items
33
+ tmp_counter = @counter
34
+
35
+ # reset, and iterate over all rows to count
36
+ reset
37
+ retv = 0
38
+ each { |x| retv += 1}
39
+
40
+ # return to where we were in iterating over items
41
+ reset
42
+ while @counter < tmp_counter
43
+ @result.next
44
+ @counter += 1
45
+ end
46
+
47
+ # and return the number of rows
48
+ return retv
49
+ end
50
+
51
+
52
+ # yields each row as an array of values
53
+ def each
54
+ @result.each { |row|
55
+ @counter += 1
56
+ yield row.map { |x| x.to_s }
57
+ }
58
+ end
59
+
60
+ # yields each row as a hash: column name=> column value
61
# Yields each remaining row as a hash: column name => column value.
# All values are converted to strings, as in each().
#
# The column names are taken from the result set itself (the same source
# list_column_names uses) instead of from a `fields` accessor on every
# row, so plain Array rows work as well.
def each_hash
  column_names = @result.columns

  @result.each do |row|
    @counter += 1

    row_hash = {}
    column_names.each_with_index do |key, index|
      row_hash[key] = row[index].to_s
    end
    yield row_hash
  end
end
72
+
73
+
74
+ ###
75
+ # reset such that each() can be run again on the result object
76
+ def reset
77
+ @result.reset
78
+ @counter = 0
79
+ end
80
+
81
+ # free object
82
+ def free
83
+ @result.close
84
+ end
85
+
86
+ # returns row as an array of column contents
87
+ def fetch_row
88
+ @counter += 1
89
+ return @result.next
90
+ end
91
+ end
92
+
93
+ #################
94
+ class DBSQLite < DBWrapper
95
+
96
+ ###
97
+ # initialization:
98
+ #
99
+ # open database file according to the given identifier
100
+ def initialize(exp, # RosyConfigData experiment file object
101
+ dir = nil, # string: directory for Shalmaneser internal data, ends in "/"
102
+ identifier = nil) # string: identifier to use for the database
103
+ super(exp)
104
+
105
+ # dir and identifier may be nil, if we're only opening this object
106
+ # in order to make temp databases
107
+ if dir and identifier
108
+ @database = SQLite3::Database.new(dir + identifier.to_s + ".db")
109
+ else
110
+ @database = nil
111
+ end
112
+
113
+ # temp file for temp database
114
+ @tf = nil
115
+ end
116
+
117
+ ###
118
+ # make a table
119
+ #
120
+ # returns: nothing
121
+ def create_table(table_name, # string
122
+ column_formats, # array: array: string*string [column_name,column_format]
123
+ index_column_names, # array: string: column_name
124
+ indexname) # string: name of automatically created index column
125
+
126
+ # primary key and auto-increment column
127
+ string = "CREATE TABLE #{table_name} (" +
128
+ "#{indexname} INTEGER PRIMARY KEY"
129
+
130
+ # column declarations
131
+ unless column_formats.empty?
132
+ string << ", "
133
+ string << column_formats.map { |name, format|
134
+ # include other keys
135
+ if index_column_names.include? name
136
+ name.to_s + " KEY " + format.to_s
137
+ else
138
+ name.to_s + " " + format.to_s
139
+ end
140
+ }.join(",")
141
+ end
142
+ string << ");"
143
+
144
+ query_noretv(string)
145
+ end
146
+
147
+ ###
148
+ # remove a table
149
+ def drop_table(table_name)
150
+ query_noretv("DROP TABLE " + table_name)
151
+ end
152
+
153
+ ###
154
+ def query(query)
155
+ if @database
156
+ return DBSQLiteResult.new(@database.query(query))
157
+ else
158
+ return nil
159
+ end
160
+ end
161
+
162
+ ####
163
+ # querying the database:
164
+ # no result value
165
+ def query_noretv(query)
166
+ if @database
167
+ @database.execute(query)
168
+ end
169
+ return nil
170
+ end
171
+
172
+ ###
173
+ # list all tables in the database
174
+ #
175
+ # array of strings
176
# Lists the names stored in sqlite_master.
#
# Every result row is a one-element array holding the name. The name is
# extracted from the row before conversion, because calling to_s on the
# row array itself yields its inspect form (e.g. '["mytable"]') on
# Ruby 1.9 and later.
#
# returns: array of strings, or nil if no database is open
def list_tables
  return nil unless @database

  @database.execute("select name from sqlite_master;").map do |row|
    row.first.to_s
  end
end
185
+
186
+ #####
187
+ # list_column_formats
188
+ #
189
+ # list column names and column types of this table
190
+ #
191
+ # returns: array:string*string, list of pairs [column name, column format]
192
# Lists column names and column types of a table by parsing the table's
# 'create' statement as stored in sqlite_master.
#
# The statement is fetched directly from the 'sql' column, with the table
# name passed as a bound parameter. This avoids searching the row for the
# position of the 'sql' column and avoids building SQL from the name.
#
# Terminates the process with exit status 1 if the table description
# cannot be read or the 'create' statement cannot be parsed.
#
# table_name: string, name of the table
#
# returns: array:string*string, list of pairs [column name, column format],
#          or nil if no database is open
def list_column_formats(table_name)
  return nil unless @database

  create_statement = @database.get_first_value(
    "select sql from sqlite_master where name = ?", table_name
  )
  unless create_statement
    $stderr.puts "SQLite error: could not read description of table #{table_name}"
    exit 1
  end

  # try to parse column names out of the 'create' statement
  parsed = /^\s*create table \S+\s*\((.*)\)\s*$/i.match(create_statement.to_s)
  unless parsed
    $stderr.puts "SQLite error: cannot read column names"
    exit 1
  end

  # we now have something of shape ' a key varchar2(30), b varchar2(30)':
  # split at the comma, remove whitespace at beginning and end, drop the
  # key markers, then split again to get pairs [column name, column format]
  parsed[1].split(",").map do |col_descrip|
    pieces = col_descrip.strip.split.reject do |entry|
      entry =~ /^key$/i || entry =~ /^primary$/i
    end
    if pieces.length > 2
      $stderr.puts "Warning: problematic column format in #{col_descrip}, may be parsed wrong."
    end
    pieces
  end
end
228
+
229
+ ####
230
+ # num_rows
231
+ #
232
+ # determine the number of rows in a table
233
+ # returns: integer or nil
234
+ def num_rows(table_name)
235
+ unless @database
236
+ return nil
237
+ end
238
+
239
+ rows_s = @database.get_first_value( "select count(*) from #{table_name}" )
240
+ if rows_s
241
+ return rows_s.to_i
242
+ else
243
+ return nil
244
+ end
245
+ end
246
+
247
+ ####
248
+ # make a temporary table: make a table in a new, temporary file
249
+ #
250
+ # returns: DBWrapper object (or object of current subclass)
251
+ # that has the @table_name attribute set to the name of a temporary DB
252
+ #
253
+ # same as in superclass
254
+ #
255
+ # def make_temp_table(column_formats, # array: string*string [column_name,column_format]
256
+ # index_column_names, # array: string: column_name
257
+ # indexname) # string: name of autoincrement primary index
258
+
259
+ # temp_obj = self.clone()
260
+ # temp.initialize_temp_table(column_formats, index_column_names, indexname)
261
+ # return temp_obj
262
+ # end
263
+
264
# Discards the temporary database: closes and unlinks the temp file that
# backs it, and forgets the database handle.
#
# Does nothing to the file when no temp file is set (@tf is nil), so the
# method is safe to call on an object that never made a temp table, and
# safe to call more than once.
#
# returns: nothing
def drop_temp_table
  if @tf
    @tf.close(true)
    @tf = nil
  end
  @database = nil
end
268
+
269
+ ##############################
270
+ protected
271
+
272
+ def initialize_temp_table(column_formats, index_column_names, indexname)
273
+ @table_name = "temptable"
274
+ @tf = Tempfile.new("temp_table")
275
+ @tf.close
276
+ @database = SQLite3::Database.new(@tf.path)
277
+ create_table(@table_name, column_formats, index_column_names, indexname)
278
+ end
279
+
280
+ end