shalmaneser-lib 1.2.rc5

Sign up to get free protection for your applications and to get access to all the features.
Files changed (70) hide show
  1. checksums.yaml +7 -0
  2. data/.yardopts +10 -0
  3. data/CHANGELOG.md +4 -0
  4. data/LICENSE.md +4 -0
  5. data/README.md +122 -0
  6. data/lib/configuration/config_data.rb +457 -0
  7. data/lib/configuration/config_format_element.rb +210 -0
  8. data/lib/configuration/configuration_error.rb +15 -0
  9. data/lib/configuration/external_config_data.rb +56 -0
  10. data/lib/configuration/frappe_config_data.rb +134 -0
  11. data/lib/configuration/fred_config_data.rb +199 -0
  12. data/lib/configuration/rosy_config_data.rb +126 -0
  13. data/lib/db/db_interface.rb +50 -0
  14. data/lib/db/db_mysql.rb +141 -0
  15. data/lib/db/db_sqlite.rb +280 -0
  16. data/lib/db/db_table.rb +237 -0
  17. data/lib/db/db_view.rb +416 -0
  18. data/lib/db/db_wrapper.rb +175 -0
  19. data/lib/db/select_table_and_columns.rb +10 -0
  20. data/lib/db/sql_query.rb +243 -0
  21. data/lib/definitions.rb +19 -0
  22. data/lib/eval.rb +482 -0
  23. data/lib/ext/maxent/Classify.class +0 -0
  24. data/lib/ext/maxent/Train.class +0 -0
  25. data/lib/external_systems.rb +251 -0
  26. data/lib/framenet_format/fn_corpus_aset.rb +209 -0
  27. data/lib/framenet_format/fn_corpus_xml_file.rb +120 -0
  28. data/lib/framenet_format/fn_corpus_xml_sentence.rb +299 -0
  29. data/lib/framenet_format/fn_database.rb +143 -0
  30. data/lib/framenet_format/frame_xml_file.rb +104 -0
  31. data/lib/framenet_format/frame_xml_sentence.rb +411 -0
  32. data/lib/logging.rb +25 -0
  33. data/lib/ml/classifier.rb +189 -0
  34. data/lib/ml/mallet.rb +236 -0
  35. data/lib/ml/maxent.rb +229 -0
  36. data/lib/ml/optimize.rb +195 -0
  37. data/lib/ml/timbl.rb +140 -0
  38. data/lib/monkey_patching/array.rb +82 -0
  39. data/lib/monkey_patching/enumerable_bool.rb +24 -0
  40. data/lib/monkey_patching/enumerable_distribute.rb +18 -0
  41. data/lib/monkey_patching/file.rb +131 -0
  42. data/lib/monkey_patching/subsumed.rb +24 -0
  43. data/lib/ruby_class_extensions.rb +4 -0
  44. data/lib/salsa_tiger_xml/corpus.rb +24 -0
  45. data/lib/salsa_tiger_xml/fe_node.rb +98 -0
  46. data/lib/salsa_tiger_xml/file_parts_parser.rb +214 -0
  47. data/lib/salsa_tiger_xml/frame_node.rb +145 -0
  48. data/lib/salsa_tiger_xml/graph_node.rb +347 -0
  49. data/lib/salsa_tiger_xml/reg_xml.rb +285 -0
  50. data/lib/salsa_tiger_xml/salsa_tiger_sentence.rb +596 -0
  51. data/lib/salsa_tiger_xml/salsa_tiger_sentence_graph.rb +333 -0
  52. data/lib/salsa_tiger_xml/salsa_tiger_sentence_sem.rb +438 -0
  53. data/lib/salsa_tiger_xml/salsa_tiger_xml_helper.rb +84 -0
  54. data/lib/salsa_tiger_xml/salsa_tiger_xml_node.rb +161 -0
  55. data/lib/salsa_tiger_xml/sem_node.rb +58 -0
  56. data/lib/salsa_tiger_xml/string_terminals_in_right_order.rb +192 -0
  57. data/lib/salsa_tiger_xml/syn_node.rb +169 -0
  58. data/lib/salsa_tiger_xml/tree_node.rb +59 -0
  59. data/lib/salsa_tiger_xml/ts_syn_node.rb +47 -0
  60. data/lib/salsa_tiger_xml/usp_node.rb +72 -0
  61. data/lib/salsa_tiger_xml/xml_node.rb +163 -0
  62. data/lib/shalmaneser/lib.rb +1 -0
  63. data/lib/tabular_format/fn_tab_format_file.rb +38 -0
  64. data/lib/tabular_format/fn_tab_frame.rb +67 -0
  65. data/lib/tabular_format/fn_tab_sentence.rb +169 -0
  66. data/lib/tabular_format/tab_format_file.rb +91 -0
  67. data/lib/tabular_format/tab_format_named_args.rb +184 -0
  68. data/lib/tabular_format/tab_format_sentence.rb +119 -0
  69. data/lib/value_restriction.rb +49 -0
  70. metadata +131 -0
@@ -0,0 +1 @@
1
+ # A dummy file to require for now.
@@ -0,0 +1,38 @@
1
+ require_relative 'tab_format_file'
2
+ require_relative 'fn_tab_sentence'
3
+ require_relative 'tab_format_named_args'
4
+
5
+ ########################################################
6
+ # TabFormat files containing everything that's in the FN lexunit files
7
class FNTabFormatFile < TabFormatFile
  # Opens a FrameNet tab file together with its optional companion files.
  #
  # The companion file names are built from the main file's directory and
  # its base name without the ".tab" extension, followed by the suffix.
  # The lemma file (if any) is listed before the POS-tag file (if any).
  # Opening the files is delegated to the parent constructor.
  #
  # @param filename [String] path of the main tab file
  # @param tag_suffix [String, nil] suffix of the POS-tag file, nil for none
  # @param lemma_suffix [String, nil] suffix of the lemma file, nil for none
  def initialize(filename, tag_suffix = nil, lemma_suffix = nil)
    stem = "#{File.dirname(filename)}/#{File.basename(filename, '.tab')}"

    # flat list of alternating file names and column-label lists
    sources = [filename, FNTabFormatFile.fntab_format]
    sources.push(stem + lemma_suffix, ["lemma"]) if lemma_suffix
    sources.push(stem + tag_suffix, ["pos"]) if tag_suffix

    super(sources)

    @my_sentence_class = FNTabSentence
  end

  # Column layout of a FrameNet tab file: the word, the per-frame column
  # group (as a nested list), the named entity and the sentence ID.
  #
  # @return [Array<String, Array<String>>]
  def self.fntab_format
    ["word", FNTabFormatFile.frametab_format, "ne", "sent_id"]
  end

  # Names of the columns that make up one frame group.
  #
  # @return [Array<String>]
  def self.frametab_format
    ["pt", "gf", "role", "target", "frame", "stuff"]
  end

  # Formats a feature -> value hash as one tab-separated line laid out
  # according to fntab_format.
  #
  # @param hash [Hash] mapping of feature names to values
  # @return [String]
  def self.format_str(hash)
    TabFormatNamedArgs.format_str(hash, FNTabFormatFile.fntab_format)
  end
end
@@ -0,0 +1,67 @@
1
+ require_relative 'fn_tab_sentence'
2
+ require "ruby_class_extensions"
3
+
4
# A FNTabSentence that gives access to exactly one frame group of the
# sentence.
class FNTabFrame < FNTabSentence
  # Same as the parent constructor, except that a frame number is stored
  # in @group_no. According to the original author's note, the line-parsing
  # methods of the parent hand this number to each TabFormatNamedArgs
  # object so that lookups hit the right group (the parent class is not
  # visible here -- confirm against TabFormatSentence).
  #
  # @param pattern the column pattern, passed on to the parent unchanged
  # @param frameno [Integer] index of the frame group this object exposes
  def initialize(pattern, frameno)
    super(pattern)
    @group_no = frameno
  end

  # Returns the "frame" entry of the first parsed line.
  #
  # @return [String, nil] nil if the sentence yields no lines
  def get_frame
    sanity_check
    each_line_parsed { |line| return line.get("frame") }
    # no line was yielded: do not leak each_line_parsed's own return value
    nil
  end

  # Collects the "lineno" entries of all lines whose "target" entry is
  # not "-". A target may span more than one word.
  #
  # @return [Array] the line numbers of the target word(s)
  def get_target_indices
    sanity_check
    indices = []
    each_line_parsed do |line|
      indices << line.get("lineno") unless line.get("target") == "-"
    end
    indices
  end

  # Returns the first "target" entry that is not "-". For multiword
  # targets the original note states that the complete target is present
  # at every target index, so the first hit is enough.
  #
  # @return [String, nil] nil if no line carries a target
  def get_target
    each_line_parsed do |line|
      target = line.get("target")
      return target unless target == "-"
    end
    # nothing found: do not leak each_line_parsed's own return value
    nil
  end

  # Extracts the part after the dot from a target of the form "lemma.pos".
  #
  # @return [String, nil] nil if there is no target or it has another form
  def get_target_fn_pos
    target = get_target
    # Guard: =~ on a non-String raises NoMethodError on Ruby 3.2+.
    return nil unless target.is_a?(String)

    target[/^[^\.]+\.(\w+)$/, 1]
  end
end
@@ -0,0 +1,169 @@
1
+ require_relative 'tab_format_sentence'
2
+ # require_relative 'fn_tab_frame'
3
+ require "ruby_class_extensions"
4
+ ############################################
5
# A tab-format sentence in FrameNet layout. The line-level accessors used
# below (each_line, each_line_parsed, read_one_line_parsed, add_line,
# @pattern) come from the parent class, which is not part of this file.
class FNTabSentence < TabFormatSentence
  # Reads one feature of a parsed line.
  # Override this to get a feature from a group rather than from the
  # main feature list.
  #
  # @param l a parsed line responding to #get
  # @param feature_name [String]
  def get_this(l, feature_name)
    l.get(feature_name)
  end

  # Checks that the first parsed line has a "sent_id" entry. Only the
  # first line is inspected; an empty sentence passes.
  #
  # @raise [RuntimeError] if the first line has no "sent_id"
  def sanity_check
    each_line_parsed do |l|
      raise "Error: corpus file does not conform to FN format." if l.get("sent_id").nil?

      return
    end
  end

  # Returns the sentence ID, i.e. the "sent_id" entry of the first line.
  def get_sent_id
    sanity_check
    each_line_parsed { |l| return l.get("sent_id") }
  end

  # Iterator: yields each frame of the sentence as a FNTabFrame object.
  # Every yielded object receives all lines of the sentence but is bound
  # to one frame number. The number of frames is taken from the first
  # line, assuming that each line has the same number of frames.
  def each_frame
    num_frames = read_one_line_parsed(0).num_groups

    0.upto(num_frames - 1) do |frame_no|
      frame_obj = FNTabFrame.new(@pattern, frame_no)
      each_line { |l| frame_obj.add_line(l) }
      yield frame_obj
    end
  end

  # Computes a mapping from word indices to labels on these words.
  #
  # An entry il -> label means that all the lines whose line numbers are
  # listed in il are labeled with label. Line numbers are the "lineno"
  # entries of the lines.
  #
  # A column entry is either "-" / "--" (no boundary), "B-<label>" (a
  # markable starts), "E-<label>" (it ends) or "B-<label>:E-<label>"
  # (a one-word markable). Malformed or overlapping markup is reported on
  # $stderr and skipped; the markable that is already open stays open.
  #
  # @param use_this_column [String] column to read, "role" by default
  # @return [Hash{Array => String}] index list -> label
  def markables(use_this_column = "role")
    sanity_check

    idlist_to_annotation_list = {}

    ids = []     # line numbers of the markable that is currently open
    label = nil  # label of the markable that is currently open, if any

    each_line_parsed do |l|
      this_id = get_this(l, "lineno")

      this_col = get_this(l, use_this_column)
      unless this_col
        $stderr.puts "nil entry #{use_this_column} in line #{this_id} of sent #{get_sent_id}. Skipping."
        next
      end
      this_fe_ann = this_col.split(":")

      case this_fe_ann.length
      when 1 # nothing at all, or a single begin or end
        markup = this_fe_ann.first
        if markup == "-" || markup == "--"
          # no boundary here: the word belongs to the open markable, if any
          ids << this_id if label
        elsif markup =~ /^B-(\S+)$/
          opened = Regexp.last_match(1)
          if label
            # overlap: ignore the second label, keep the first one open
            $stderr.puts "[TabFormat] Warning: Markable #{opened} starts while within markable #{label}"
            $stderr.puts "Debug data: Sentence id #{get_sent_id}, current ID list #{ids.join(" ")}"
          else
            label = opened
            ids << this_id
          end
        elsif markup =~ /^E-(\S+)$/
          closed = Regexp.last_match(1)
          if label == closed
            # we close the markable we have opened before
            ids << this_id
            idlist_to_annotation_list[ids] = label
            # start over with a fresh list so the stored key is not mutated
            label = nil
            ids = []
          else
            $stderr.puts "[TabFormat] Warning: Markable #{closed} closes while within markable #{label}"
            $stderr.puts "Debug data: Sentence id #{get_sent_id}, current ID list #{ids.join(" ")}"
          end
        else
          $stderr.puts "[TabFormat] Warning: cannot analyse markup " + markup
          $stderr.puts "Debug data: Sentence id #{get_sent_id}"
        end
      when 2 # this should be a one-word markable
        b_markup, e_markup = this_fe_ann
        if label
          $stderr.puts "[TabFormat] Warning: Finding new markable at word #{this_id} while within markable #{label}"
          $stderr.puts "Debug data: Sentence id #{get_sent_id}, current ID list #{ids.join(" ")}"
        elsif b_markup =~ /^B-(\S+)$/
          b_label = Regexp.last_match(1)
          if e_markup =~ /^E-(\S+)$/
            e_label = Regexp.last_match(1)
            if b_label == e_label
              idlist_to_annotation_list[[this_id]] = b_label
            else
              $stderr.puts "[TabFormat] Warning: Starting markable " + b_label + ", closing markable " + e_label
              $stderr.puts "Debug data: Sentence id #{get_sent_id}, current ID list #{ids.join(" ")}"
            end
          else
            $stderr.puts "[TabFormat] Warning: Unknown end markup " + e_markup
            $stderr.puts "Debug data: Sentence id #{get_sent_id}, current ID list #{ids.join(" ")}"
          end
        else
          $stderr.puts "[TabFormat] Warning: Unknown start markup " + b_markup
          $stderr.puts "Debug data: Sentence id #{get_sent_id}, current ID list #{ids.join(" ")}"
        end
      else
        # also reached for an empty entry, which splits into zero parts
        $stderr.puts "Warning: cannot analyse markup with more than two colon-separated parts like " + this_fe_ann.join(":")
        $stderr.puts "Debug data: Sentence id #{get_sent_id}"
      end
    end

    unless label.nil?
      $stderr.puts "[TabFormat] Warning: Markable #{label} did not end in sentence."
      $stderr.puts "Debug data: Sentence id #{get_sent_id}, current ID list #{ids.join(" ")}"
    end

    idlist_to_annotation_list
  end

  # @return [String] A tokenized sentence: the "word" entries of all
  #   lines, joined by single spaces.
  def to_s
    sanity_check
    words = []
    each_line_parsed { |l| words << l.get("word") }

    words.join(' ')
  end
end
@@ -0,0 +1,91 @@
1
+ require_relative 'tab_format_sentence'
2
+
3
+ require "ruby_class_extensions"
4
+
5
+ #######################
6
# Reads one or more line-aligned, tab-separated files in parallel and
# hands out their content sentence by sentence.
class TabFormatFile
  # Opens the files for reading.
  #
  # fp is a flat list of alternating file names and formats:
  # [filename, format, filename, format, ...]. A format is a list of
  # strings that will be used to address the columns of the file, the
  # 1st string for the 1st column. The formats of all files are
  # concatenated into one overall pattern.
  #
  # A format may contain _one_ entry that is an array, e.g.:
  # ["word", "pos", "lemma", ["frame", "target", "gf", "pt"]]
  #
  # @param fp [Array] alternating file names and column formats
  # @raise [RuntimeError] if one of the files cannot be opened
  def initialize(fp)
    @files = []
    @patterns = []
    @no_of_read_lines = 0
    @read_completely = false

    fp.each_with_index do |entry, ix|
      if ix.even?
        @files << open_input(entry)
      else
        @patterns += entry
      end
    end

    @my_sentence_class = TabFormatSentence
  end

  # Yields each sentence of the files in turn.
  #
  # Sentences are expected to be separated by a line containing nothing
  # but whitespace. The last sentence may or may not be followed by an
  # empty line. Reading ends as soon as any of the files is exhausted;
  # all files are expected to have the same number of lines. Corresponding
  # lines of the files are joined by a tab before they are added to the
  # sentence object (an instance of @my_sentence_class).
  #
  # The files are read only once: after the end has been reached, further
  # calls yield nothing.
  #
  # NOTE: if a line is blank in some files but not in others, an error is
  # printed and the whole process is terminated with exit(1). This is
  # kept for backward compatibility.
  #
  # @yield [sentence] each non-empty sentence
  # @return [Enumerator] if no block is given
  def each_sentence
    return enum_for(:each_sentence) unless block_given?
    # without input files there is no end-of-file to stop at
    return if @read_completely || @files.empty?

    sentence = @my_sentence_class.new(@patterns)

    # IO#gets returns nil at end of file, so no exception handling is
    # needed and exceptions raised by the caller's block pass through.
    until (raw_lines = @files.map(&:gets)).include?(nil)
      @no_of_read_lines += 1
      lines = raw_lines.map(&:chomp)
      blank, filled = lines.partition { |text| text.strip.empty? }

      if blank.empty?
        # sentence not yet finished: add this line to it
        sentence.add_line(lines.join("\t"))
      elsif filled.empty?
        # sentence finished: yield it and start a new one
        yield sentence unless sentence.empty?
        sentence = @my_sentence_class.new(@patterns)
      else
        STDERR.puts "Error: Mismatching empty lines! <from lib/common>"
        exit(1)
      end
    end

    # maybe we haven't yielded the last sentence yet
    yield sentence unless sentence.empty?

    @read_completely = true
  end

  private

  # Opens one input file; on failure closes the files opened so far and
  # raises the error message callers of the constructor already know.
  def open_input(path)
    File.new(path)
  rescue StandardError
    @files.each(&:close)
    raise "Sorry, could not read input file #{path}\n"
  end
end
@@ -0,0 +1,184 @@
1
+ require "ruby_class_extensions"
2
+ #################################################
3
+ # class for keeping one line,
4
+ # parsed.
5
+ # The line is kept as follows:
6
+ # - normal features: in a hash @f mapping feature names to values
7
+ # - features of the repeated group: in an array @r of
8
+ # TabFormatNamedArgs objects, one per group
9
+ #
10
+ # each feature of the line is available by name
11
+ # via the method "get".
12
+ # Additional features (from other input files) can be
13
+ # added to the TabFormatNamedArgs object via the method
14
+ # add_feature
15
+ #
16
+ # methods:
17
+ #
18
+ # new: initialize.
19
+ # values: array of strings
20
+ # features: how to access the strings by name
21
+ # 'features' is an array of strings
22
+ # later the i-th feature will be used to access
23
+ # the i-th value,
24
+ # except for repeated groups
25
+ #
26
+ # get: returns one feature by its name
27
+ # name: a string
28
+ #
29
+ # add_feature: add another feature to this object,
30
+ # which can be accessed via "get"
31
+ # name: name for the new feature, should be distinct
32
+ # from the ones already used in new()
33
+ # feature: a string, the value of the feature
34
+ ##
35
+
36
class TabFormatNamedArgs
  # Placeholder kept in @feature_names at the position of the repeated
  # group. Feature names are strings, so a Symbol can never collide with
  # a real column name (a column called "GROUP" is a normal feature).
  GROUP_MARKER = :group

  # Parses one line.
  #
  # Features before the group take values from the front of the line,
  # features after the group take values from its end, and everything in
  # between is cut into consecutive groups of group-size values.
  #
  # @param values [Array<String>] the entries of the line
  # @param features [Array<String, Array<String>>] names for the entries;
  #   at most one entry may be an array naming the features of a group
  # @param group [Integer, nil] number of the group that #get falls back
  #   to for features not found among the normal features
  # @raise [RuntimeError] if features contains more than one group
  def initialize(values, features, group = nil)
    @f = {}
    @r = []
    @group = group

    # record the feature names, give special attention to a group
    # if we have one
    @group_feature_names = nil
    @feature_names = features.map do |feature|
      if feature.instance_of?(Array)
        @group_feature_names = feature
        GROUP_MARKER
      else
        feature
      end
    end

    if @feature_names.count(GROUP_MARKER) > 1
      $stderr.puts "More than one group in feature set:" + features.join(" ")
      raise "Cannot handle this."
    end

    # group_index: position of the group in the overall feature list,
    # or one past the last feature if there is no group
    group_index = @feature_names.index(GROUP_MARKER) || @feature_names.length
    num_features_after_group = [0, (@feature_names.length - 1) - group_index].max
    index_after_groups = values.length - num_features_after_group

    # features before group: put feature/value pairs in @f hash
    0.upto(group_index - 1) { |i| @f[features[i]] = values[i] }

    # group: one TabFormatNamedArgs object per repetition, stored in @r.
    # An empty group has no columns; skipping it avoids a step width of 0.
    if @group_feature_names && !@group_feature_names.empty?
      group_size = @group_feature_names.length
      (group_index...index_after_groups).step(group_size).each_with_index do |group_start, group_no|
        @r << TabFormatNamedArgs.new(values.slice(group_start, group_size),
                                     @group_feature_names,
                                     group_no)
      end
    end

    # features after group: put feature/value pairs in @f hash
    feature_index = group_index + 1
    index_after_groups.upto(values.length - 1) do |i|
      @f[features[feature_index]] = values[i]
      feature_index += 1
    end
  end

  # Returns feature/value pairs as a tab format line, in the order of the
  # 'features' list. Features not set in the hash get the entry "-".
  #
  # If the feature list includes a group, zero entries are assumed for
  # that group: it contributes no columns. Hash keys that are not
  # top-level entries of the feature list are reported on $stderr and
  # ignored.
  #
  # @param hash [Hash] feature -> value
  # @param features [Array, nil] feature list, as for new()
  # @return [String] "" if features is nil
  def self.format_str(hash, features)
    return "" if features.nil?

    # sanity check: does the hash contain keys that are not in the feature list?
    hash.keys.reject { |f| features.include? f }.each do |bad_feature|
      $stderr.puts "Error: unknown feature #{bad_feature} in format_str: ignoring."
    end

    features.reject { |f| f.instance_of?(Array) }
            .map { |feature| hash.fetch(feature, '-') }
            .join("\t")
  end

  # Adds another feature that can then be accessed via #get.
  #
  # @param name [String] name for the new feature
  # @param feature [String] the value of the feature
  # @raise [RuntimeError] if a feature of that name already exists
  def add_feature(name, feature)
    raise "Trying to add a feature twice: #{name}." if @f.key? name

    @f[name] = feature
  end

  # Gets a feature value, identified by feature name. Normal features are
  # tried first, then the group selected at construction time.
  #
  # @param name [String]
  # @return [String, nil] nil if the feature is not found
  def get(name)
    get_nongroup(name) || get_from_group(name, @group)
  end

  # Sets (or overwrites) a normal feature.
  def set(name, feature)
    @f[name] = feature
  end

  # @return [Integer] number of group repetitions found in the line
  def num_groups
    @r.length
  end

  # Returns the line as a string, entries connected by tab, in the order
  # that the entries were in originally. Features added later via
  # #add_feature are not part of the original layout and are left out.
  def to_s
    @feature_names.map do |feature|
      if feature == GROUP_MARKER
        @r.map(&:to_s).join("\t")
      else
        @f[feature]
      end
    end.join("\t")
  end

  protected

  # get feature, non-group
  # return: feature value (string)
  def get_nongroup(feature)
    @f[feature]
  end

  # get feature from one of the groups
  # return: feature value (string), nil if there is no group with that number
  def get_from_group(name, group_no)
    return nil if !group_no || group_no >= @r.length

    @r[group_no].get_nongroup(name)
  end
end