shalmaneser-lib 1.2.rc5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (70) hide show
  1. checksums.yaml +7 -0
  2. data/.yardopts +10 -0
  3. data/CHANGELOG.md +4 -0
  4. data/LICENSE.md +4 -0
  5. data/README.md +122 -0
  6. data/lib/configuration/config_data.rb +457 -0
  7. data/lib/configuration/config_format_element.rb +210 -0
  8. data/lib/configuration/configuration_error.rb +15 -0
  9. data/lib/configuration/external_config_data.rb +56 -0
  10. data/lib/configuration/frappe_config_data.rb +134 -0
  11. data/lib/configuration/fred_config_data.rb +199 -0
  12. data/lib/configuration/rosy_config_data.rb +126 -0
  13. data/lib/db/db_interface.rb +50 -0
  14. data/lib/db/db_mysql.rb +141 -0
  15. data/lib/db/db_sqlite.rb +280 -0
  16. data/lib/db/db_table.rb +237 -0
  17. data/lib/db/db_view.rb +416 -0
  18. data/lib/db/db_wrapper.rb +175 -0
  19. data/lib/db/select_table_and_columns.rb +10 -0
  20. data/lib/db/sql_query.rb +243 -0
  21. data/lib/definitions.rb +19 -0
  22. data/lib/eval.rb +482 -0
  23. data/lib/ext/maxent/Classify.class +0 -0
  24. data/lib/ext/maxent/Train.class +0 -0
  25. data/lib/external_systems.rb +251 -0
  26. data/lib/framenet_format/fn_corpus_aset.rb +209 -0
  27. data/lib/framenet_format/fn_corpus_xml_file.rb +120 -0
  28. data/lib/framenet_format/fn_corpus_xml_sentence.rb +299 -0
  29. data/lib/framenet_format/fn_database.rb +143 -0
  30. data/lib/framenet_format/frame_xml_file.rb +104 -0
  31. data/lib/framenet_format/frame_xml_sentence.rb +411 -0
  32. data/lib/logging.rb +25 -0
  33. data/lib/ml/classifier.rb +189 -0
  34. data/lib/ml/mallet.rb +236 -0
  35. data/lib/ml/maxent.rb +229 -0
  36. data/lib/ml/optimize.rb +195 -0
  37. data/lib/ml/timbl.rb +140 -0
  38. data/lib/monkey_patching/array.rb +82 -0
  39. data/lib/monkey_patching/enumerable_bool.rb +24 -0
  40. data/lib/monkey_patching/enumerable_distribute.rb +18 -0
  41. data/lib/monkey_patching/file.rb +131 -0
  42. data/lib/monkey_patching/subsumed.rb +24 -0
  43. data/lib/ruby_class_extensions.rb +4 -0
  44. data/lib/salsa_tiger_xml/corpus.rb +24 -0
  45. data/lib/salsa_tiger_xml/fe_node.rb +98 -0
  46. data/lib/salsa_tiger_xml/file_parts_parser.rb +214 -0
  47. data/lib/salsa_tiger_xml/frame_node.rb +145 -0
  48. data/lib/salsa_tiger_xml/graph_node.rb +347 -0
  49. data/lib/salsa_tiger_xml/reg_xml.rb +285 -0
  50. data/lib/salsa_tiger_xml/salsa_tiger_sentence.rb +596 -0
  51. data/lib/salsa_tiger_xml/salsa_tiger_sentence_graph.rb +333 -0
  52. data/lib/salsa_tiger_xml/salsa_tiger_sentence_sem.rb +438 -0
  53. data/lib/salsa_tiger_xml/salsa_tiger_xml_helper.rb +84 -0
  54. data/lib/salsa_tiger_xml/salsa_tiger_xml_node.rb +161 -0
  55. data/lib/salsa_tiger_xml/sem_node.rb +58 -0
  56. data/lib/salsa_tiger_xml/string_terminals_in_right_order.rb +192 -0
  57. data/lib/salsa_tiger_xml/syn_node.rb +169 -0
  58. data/lib/salsa_tiger_xml/tree_node.rb +59 -0
  59. data/lib/salsa_tiger_xml/ts_syn_node.rb +47 -0
  60. data/lib/salsa_tiger_xml/usp_node.rb +72 -0
  61. data/lib/salsa_tiger_xml/xml_node.rb +163 -0
  62. data/lib/shalmaneser/lib.rb +1 -0
  63. data/lib/tabular_format/fn_tab_format_file.rb +38 -0
  64. data/lib/tabular_format/fn_tab_frame.rb +67 -0
  65. data/lib/tabular_format/fn_tab_sentence.rb +169 -0
  66. data/lib/tabular_format/tab_format_file.rb +91 -0
  67. data/lib/tabular_format/tab_format_named_args.rb +184 -0
  68. data/lib/tabular_format/tab_format_sentence.rb +119 -0
  69. data/lib/value_restriction.rb +49 -0
  70. metadata +131 -0
@@ -0,0 +1,126 @@
1
+ require_relative 'config_data'
2
+ require_relative 'configuration_error'
3
+
4
+ ##############################
5
+ # Class RosyConfigData
6
+ #
7
+ # inherits from ConfigData,
8
+ # sets features for ROSY
9
module Shalmaneser
  module Configuration
    # Configuration data for ROSY experiment files.
    #
    # Inherits from ConfigData and declares which parameters a ROSY
    # experiment file may contain, together with their value types.
    class RosyConfigData < ConfigData
      # Parameter name => declared value type; handed to ConfigData#initialize.
      CONFIG_DEFS = {
        "feature" => "list",
        "classifier" => "list",
        "verbose" => "bool",
        "experiment_ID" => "string",
        "directory_input_train" => "string",
        "directory_input_test" => "string",
        "directory_output" => "string",
        "preproc_descr_file_train" => "string",
        "preproc_descr_file_test" => "string",
        "external_descr_file" => "string",
        "dbtype" => "string", # "mysql" or "sqlite"

        # DB access: host, user, passwd and dbname are the values
        # DBMySQL reads when it opens its server connection.
        "host" => "string",
        "user" => "string",
        "passwd" => "string",
        "dbname" => "string",

        "data_dir" => "string", # for external use
        "rosy_dir" => "pattern", # for internal use only, set by rosy.rb

        "classifier_dir" => "string", # if present, special directory for classifiers

        "classif_column_name" => "string",
        "main_table_name" => "pattern",
        "test_table_name" => "pattern",

        "eval_file" => "pattern",
        "log_file" => "pattern",
        "failed_file" => "pattern",
        "classifier_file" => "pattern",
        "classifier_output_file" => "pattern",
        "noval" => "string",
        "split_nones" => "bool",
        "print_eval_log" => "bool",
        "assume_argrec_perfect" => "bool",
        "xwise_argrec" => "string",
        "xwise_arglab" => "string",
        "xwise_onestep" => "string",
        "fe_syn_repair" => "bool", # map words to constituents for FEs: idealize?
        "fe_rel_repair" => "bool", # FEs: include non-included relative clauses into FEs
        "prune" => "string", # pruning prior to argrec?

        # Imported from PrepConfigData
        'do_postag' => 'bool',
        'do_lemmatize' => 'bool',
        'do_parse' => 'bool',
        'pos_tagger' => 'string',
        'lemmatizer' => 'string',
        'parser' => 'string'
      }

      # Reads the experiment file, registers the list-feature accessors and
      # validates the result.
      #
      # @param filename [String] experiment file name, passed on to ConfigData
      # @raise [ConfigurationError] if validation fails
      def initialize(filename)
        # NOTE(review): the third argument is presumably the list of variable
        # names allowed inside "pattern" values - confirm in ConfigData.
        super(filename, CONFIG_DEFS, ["exp_ID", "test_ID", "split_ID",
                                      "feature_name", "classif", "step",
                                      "group", "dataset", "mode"])

        # both list-typed parameters share the same access function
        set_list_feature_access("feature", method("access_feature"))
        set_list_feature_access("classifier", method("access_feature"))

        validate
      end

      #####
      # access_feature
      #
      # Access function for the list features 'feature' and 'classifier'.
      #
      # Assumed format in the config file:
      #
      #   feature = path [option]*
      #
      # i.e. first the name of the feature type to use, then optionally
      # options associated with that feature,
      # e.g. 'argrec': use that feature only when computing argrec.
      #
      # @param val_list [Array<Array<String>>, nil] one string tuple per
      #   defined feature; the first string is the feature name, the rest
      #   are its options
      # @return [Array<Array>] pairs [feature_name, options], e.g.
      #   [["a", "b", "c"]] becomes [["a", ["b", "c"]]]; [] when val_list is nil
      def access_feature(val_list)
        if val_list.nil?
          []
        else
          val_list.map do |feature_descr_tuple|
            [feature_descr_tuple.first, feature_descr_tuple[1..-1]]
          end
        end
      end

      private

      # Checks the loaded values and collects every problem found.
      #
      # @raise [ConfigurationError] with all messages joined by newlines
      def validate
        msg = []

        # \A and \z anchor the whole string. ^ and $ only anchor a line, so
        # a multi-line value with one alphanumeric line would slip through.
        unless get("experiment_ID") =~ /\A[A-Za-z0-9_]+\z/
          msg << 'Please choose an alphanumeric experiment ID! '\
                 "You provided: #{get('experiment_ID')}"
        end

        raise(ConfigurationError, msg.join("\n")) if msg.any?
      end
    end
  end
end
@@ -0,0 +1,50 @@
1
+ # DBInterface
2
+ #
3
+ # Okay, things are getting somewhat complicated here with all
4
+ # the DB classes, but this is how it all fits together:
5
+ #
6
+ # - DBWrapper: abstract class describing the DB interface
7
+ # - DBMySQL, DBSQLite: subclasses of DBWrapper, for MySQL
8
+ # and SQLite, respectively
9
+ # - DBInterface: class to be used from outside,
10
+ # decides (based on the experiment file) whether to use
11
+ # MySQL or SQLite and makes an object of the right kind,
12
+ # 'require'-ing either DBMySQL or DBSQLite, but not both,
13
+ # because the right ruby packages might not be installed
14
+ # for both SQL systems
15
+ # @note This class will be obsolete if we delete MySQL.
16
class DBInterface
  # Builds the database wrapper selected by the experiment file.
  #
  # The backend file is required lazily, so only the Ruby package for the
  # chosen SQL system has to be installed.
  #
  # @param exp [#get] experiment file object with a 'dbtype' entry
  # @param dir [String, nil] Shalmaneser directory (used by SQLite only)
  # @param identifier [String, nil] identifier of the data (SQLite only)
  # @return [DBMySQL, DBSQLite] wrapper object for the chosen backend
  # @note Terminates the process with status 1 when the backend cannot be
  #   loaded or 'dbtype' is neither "mysql" nor "sqlite".
  def self.get_db_interface(exp, dir = nil, identifier = nil)
    case exp.get('dbtype')
    when 'mysql'
      load_backend('db/db_mysql', 'MySQL')
      DBMySQL.new(exp)
    when 'sqlite'
      load_backend('db/db_sqlite', 'SQLite')
      DBSQLite.new(exp, dir, identifier)
    else
      STDERR.puts 'Error: database type needs to be either "mysql" or "sqlite".'
      STDERR.puts 'Please set parameter "dbtype" in the experiment file accordingly.'
      exit 1
    end
  end

  # Requires one backend file; on failure prints an installation hint and
  # terminates the process with status 1.
  #
  # A missing file or gem raises LoadError, which is a ScriptError and is
  # therefore NOT caught by a bare `rescue`; it has to be named explicitly.
  # StandardError is still rescued for errors raised while the file loads.
  def self.load_backend(feature, package_name)
    require feature
  rescue LoadError, StandardError => e
    STDERR.puts 'Error loading DB interface.'
    STDERR.puts "Make sure you have the Ruby #{package_name} package installed."
    STDERR.puts "(#{e.class}: #{e.message})"
    exit 1
  end
  private_class_method :load_backend
end
@@ -0,0 +1,141 @@
1
+ # DBMySQL: a subclass of DBWrapper.
2
+ #
3
+ # Use a MySQL server to access a database.
4
+ # Use the Ruby mysql interface package for that.
5
+
6
+ require 'mysql'
7
+ require 'db/db_wrapper'
8
+
9
+ #################
10
# Result wrapper around the object returned by Mysql#query.
#
# NOTE(review): @result is presumably assigned by DBResult#initialize from
# the value passed to super - confirm in db_wrapper.
class DBMySQLResult < DBResult
  # @param value the result of Mysql::query (a MysqlResult object)
  def initialize(value)
    super
    # remember the offset of the first row so that reset() can seek back
    @row_first = @result.row_tell
  end

  # Rewinds the result such that each() can be run again.
  def reset
    @result.row_seek(@row_first)
  end

  # @return [Array<String>] the column names of the result
  def list_column_names
    # save the row position before fetching the field list and
    # seek back to it afterwards
    saved_offset = @result.row_tell
    names = @result.fetch_fields.map { |field| field.name }
    @result.row_seek(saved_offset)

    names
  end
end
37
+
38
+ #################
39
# DBWrapper implementation that talks to a MySQL server.
class DBMySQL < DBWrapper
  # Opens the connection to the MySQL server.
  #
  # @param exp RosyConfigData experiment file object; its 'host', 'user',
  #   'passwd' and 'dbname' entries are used for the connection
  def initialize(exp)
    super(exp)

    host = @exp.get('host')
    user = @exp.get('user')
    password = @exp.get('passwd')
    db_name = @exp.get('dbname')
    @database = Mysql.real_connect(host, user, password, db_name)
  end

  # Makes a table.
  #
  # @param table_name [String]
  # @param column_formats [Array<Array>] pairs [column_name, column_format]
  # @param index_column_names [Array<String>] columns that get a KEY
  # @param indexname [String] name of the automatically created index column
  # @return [nil]
  def create_table(table_name, column_formats, index_column_names, indexname)
    sql = "CREATE TABLE #{table_name} (#{indexname} INT NOT NULL AUTO_INCREMENT"

    # column declarations
    unless column_formats.empty?
      declarations = column_formats.map { |name, format| "#{name} #{format}" }
      sql << ", " << declarations.join(",")
    end

    # primary key
    sql << ", PRIMARY KEY(#{indexname})"

    # other keys
    unless index_column_names.empty?
      keys = index_column_names.map { |name| "KEY(#{name})" }
      sql << ", " << keys.join(",")
    end

    sql << ");"

    query_noretv(sql)
  end

  # Queries the database.
  #
  # @param query [String] SQL statement
  # @return [DBMySQLResult, nil] wrapped result, or nil if the driver
  #   returned no result object
  def query(query)
    raw = @database.query(query)
    raw ? DBMySQLResult.new(raw) : nil
  end

  # Queries the database, discarding any result.
  #
  # @param query [String] SQL statement
  # @return [nil]
  def query_noretv(query)
    @database.query(query)
    nil
  end

  # Lists all tables in the database.
  #
  # @return [Array<String>] per the original author's note
  def list_tables
    @database.list_tables
  end

  # Lists column names and column types of a table.
  #
  # @param table_name [String]
  # @return [Array<Array>] pairs [column name, column format]
  def list_column_formats(table_name)
    formats = []
    @database.query("DESCRIBE #{table_name}").each_hash do |field|
      formats << [field["Field"], field["Type"]]
    end
    formats
  end

  # Determines the number of rows in a table.
  #
  # NOTE(review): the value is handed back exactly as the driver delivers
  # it in the "Rows" column of SHOW TABLE STATUS; that is presumably a
  # String rather than an Integer - confirm against callers.
  #
  # @param table_name [String]
  # @return the "Rows" entry for the table, or nil if no such table is listed
  def num_rows(table_name)
    @database.query("SHOW TABLE STATUS").each_hash do |status|
      return status["Rows"] if status["Name"] == table_name
    end
    nil
  end
end
@@ -0,0 +1,280 @@
1
+ # DBSQLite: a subclass of DBWrapper.
2
+ #
3
+ # Use SQLite to access a database.
4
+ # Use the Ruby sqlite3 interface package for that.
5
+
6
+ require 'sqlite3'
7
+ require 'tempfile'
8
+
9
+ require 'db/db_wrapper'
10
+
11
+ #################
12
# Result wrapper around the result set returned by SQLite3::Database#query.
#
# Keeps a counter of the rows consumed so far, so that num_rows can count
# all rows and then return to the previous iteration position.
#
# NOTE(review): @result is presumably assigned by DBResult#initialize from
# the value passed to super - confirm in db_wrapper.
class DBSQLiteResult < DBResult
  # @param value the result set to wrap
  def initialize(value)
    super(value)
    @counter = 0
  end

  # @return [Array<String>] the column names of the result
  def list_column_names
    @result.columns
  end

  # Counts the rows by iterating over the whole result set.
  #
  # @return [Integer] number of rows
  def num_rows
    # remember where we were in iterating over items
    tmp_counter = @counter

    # reset, and iterate over all rows to count
    reset
    retv = 0
    each { |_row| retv += 1 }

    # return to where we were in iterating over items
    reset
    while @counter < tmp_counter
      @result.next
      @counter += 1
    end

    retv
  end

  # Yields each row as an array of values, each converted with to_s.
  def each
    @result.each do |row|
      @counter += 1
      yield row.map(&:to_s)
    end
  end

  # Yields each row as a hash: column name => column value (as string).
  #
  # Column names are taken from the result set, as in list_column_names,
  # rather than from the deprecated per-row `fields` accessor.
  def each_hash
    column_names = @result.columns

    @result.each do |row|
      @counter += 1

      row_hash = {}
      column_names.each_with_index do |key, index|
        row_hash[key] = row[index].to_s
      end
      yield row_hash
    end
  end

  # Resets such that each() can be run again on the result object.
  def reset
    @result.reset
    @counter = 0
  end

  # Frees the object by closing the underlying result set.
  def free
    @result.close
  end

  # Fetches the next row, as delivered by the underlying result set
  # (values are not converted to strings here, unlike in each).
  #
  # @return the next row
  def fetch_row
    @counter += 1
    @result.next
  end
end
92
+
93
+ #################
94
# DBWrapper implementation backed by SQLite database files.
class DBSQLite < DBWrapper
  # Opens the database file that belongs to the given identifier.
  #
  # dir and identifier may be nil if this object is only created in order
  # to make temp databases; in that case no database is opened.
  #
  # @param exp RosyConfigData experiment file object
  # @param dir [String, nil] directory for Shalmaneser internal data, ends in "/"
  # @param identifier [#to_s, nil] identifier to use for the database
  def initialize(exp, dir = nil, identifier = nil)
    super(exp)

    @database = if dir && identifier
                  SQLite3::Database.new(dir + identifier.to_s + ".db")
                end

    # temp file for temp database
    @tf = nil
  end

  # Makes a table.
  #
  # NOTE(review): for columns listed in index_column_names the word KEY is
  # put between column name and format. SQLite presumably treats it as part
  # of the declared type and creates no index - confirm.
  #
  # @param table_name [String]
  # @param column_formats [Array<Array>] pairs [column_name, column_format]
  # @param index_column_names [Array<String>] columns to be marked with KEY
  # @param indexname [String] name of the automatically created index column
  # @return [nil]
  def create_table(table_name, column_formats, index_column_names, indexname)
    # primary key and auto-increment column
    string = "CREATE TABLE #{table_name} (" +
             "#{indexname} INTEGER PRIMARY KEY"

    # column declarations
    unless column_formats.empty?
      string << ", "
      string << column_formats.map { |name, format|
        # include other keys
        if index_column_names.include? name
          name.to_s + " KEY " + format.to_s
        else
          name.to_s + " " + format.to_s
        end
      }.join(",")
    end
    string << ");"

    query_noretv(string)
  end

  # Removes a table.
  #
  # @param table_name [String]
  def drop_table(table_name)
    query_noretv("DROP TABLE " + table_name)
  end

  # Queries the database.
  #
  # @param query [String] SQL statement
  # @return [DBSQLiteResult, nil] wrapped result set; nil if no database is open
  def query(query)
    return nil unless @database

    DBSQLiteResult.new(@database.query(query))
  end

  # Queries the database, discarding any result.
  #
  # @param query [String] SQL statement
  # @return [nil]
  def query_noretv(query)
    @database.execute(query) if @database
    nil
  end

  # Lists the names of all entries in sqlite_master.
  #
  # @return [Array<String>, nil] names; nil if no database is open
  def list_tables
    return nil unless @database

    # Each row is a one-element array. Array#to_s returns the inspect form
    # ('["name"]') on Ruby 1.9+, so take the single column explicitly.
    @database.execute("select name from sqlite_master;").map { |row| row.first.to_s }
  end

  # Lists column names and column types of a table by parsing the table's
  # CREATE statement as stored in sqlite_master.
  #
  # Terminates the process with status 1 if the statement cannot be read
  # or parsed.
  #
  # @param table_name [#to_s]
  # @return [Array<Array<String>>, nil] pairs [column name, column format];
  #   nil if no database is open
  def list_column_formats(table_name)
    return nil unless @database

    # The 'sql' column of sqlite_master holds the CREATE statement. Select
    # it directly, with the table name as a bind parameter, instead of
    # locating the column through the deprecated per-row `fields` accessor.
    create_statement = @database.get_first_value(
      "select sql from sqlite_master where name = ?", table_name.to_s
    )
    unless create_statement
      $stderr.puts "SQLite error: could not read description of table #{table_name}"
      exit 1
    end

    # try to parse column names out of the 'create' statement
    unless create_statement =~ /^\s*create table \S+\s*\((.*)\)\s*$/i
      $stderr.puts "SQLite error: cannot read column names"
      exit 1
    end

    # we now have something of shape ' a key varchar2(30), b varchar2(30)':
    # split at the comma, remove whitespace at beginning and end,
    # then split again to get pairs [column name, column format]
    Regexp.last_match(1).split(",").map do |col_descrip|
      pieces = col_descrip.strip.split.reject do |entry|
        entry =~ /^key$/i || entry =~ /^primary$/i
      end
      if pieces.length > 2
        $stderr.puts "Warning: problematic column format in #{col_descrip}, may be parsed wrong."
      end
      pieces
    end
  end

  # Determines the number of rows in a table.
  #
  # @param table_name [String]
  # @return [Integer, nil] row count; nil if no database is open or the
  #   count query yields no value
  def num_rows(table_name)
    return nil unless @database

    rows_s = @database.get_first_value("select count(*) from #{table_name}")
    rows_s ? rows_s.to_i : nil
  end

  # make_temp_table (make a table in a new, temporary file) is, per the
  # original author's note, the same as in the superclass and therefore
  # not redefined here.

  # Removes the temporary database file and forgets the database handle.
  def drop_temp_table
    # close(true) closes the Tempfile and unlinks it
    @tf.close(true)
    @database = nil
  end

  ##############################
  protected

  # Creates a fresh temporary database file containing one table named
  # "temptable" and makes it this object's database.
  #
  # @param column_formats [Array<Array>] pairs [column_name, column_format]
  # @param index_column_names [Array<String>] column names
  # @param indexname [String] name of the primary index column
  def initialize_temp_table(column_formats, index_column_names, indexname)
    @table_name = "temptable"
    @tf = Tempfile.new("temp_table")
    # only the path is needed: close the handle and reopen the file via SQLite
    @tf.close
    @database = SQLite3::Database.new(@tf.path)
    create_table(@table_name, column_formats, index_column_names, indexname)
  end
end