shalmaneser-lib 1.2.rc5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (70) hide show
  1. checksums.yaml +7 -0
  2. data/.yardopts +10 -0
  3. data/CHANGELOG.md +4 -0
  4. data/LICENSE.md +4 -0
  5. data/README.md +122 -0
  6. data/lib/configuration/config_data.rb +457 -0
  7. data/lib/configuration/config_format_element.rb +210 -0
  8. data/lib/configuration/configuration_error.rb +15 -0
  9. data/lib/configuration/external_config_data.rb +56 -0
  10. data/lib/configuration/frappe_config_data.rb +134 -0
  11. data/lib/configuration/fred_config_data.rb +199 -0
  12. data/lib/configuration/rosy_config_data.rb +126 -0
  13. data/lib/db/db_interface.rb +50 -0
  14. data/lib/db/db_mysql.rb +141 -0
  15. data/lib/db/db_sqlite.rb +280 -0
  16. data/lib/db/db_table.rb +237 -0
  17. data/lib/db/db_view.rb +416 -0
  18. data/lib/db/db_wrapper.rb +175 -0
  19. data/lib/db/select_table_and_columns.rb +10 -0
  20. data/lib/db/sql_query.rb +243 -0
  21. data/lib/definitions.rb +19 -0
  22. data/lib/eval.rb +482 -0
  23. data/lib/ext/maxent/Classify.class +0 -0
  24. data/lib/ext/maxent/Train.class +0 -0
  25. data/lib/external_systems.rb +251 -0
  26. data/lib/framenet_format/fn_corpus_aset.rb +209 -0
  27. data/lib/framenet_format/fn_corpus_xml_file.rb +120 -0
  28. data/lib/framenet_format/fn_corpus_xml_sentence.rb +299 -0
  29. data/lib/framenet_format/fn_database.rb +143 -0
  30. data/lib/framenet_format/frame_xml_file.rb +104 -0
  31. data/lib/framenet_format/frame_xml_sentence.rb +411 -0
  32. data/lib/logging.rb +25 -0
  33. data/lib/ml/classifier.rb +189 -0
  34. data/lib/ml/mallet.rb +236 -0
  35. data/lib/ml/maxent.rb +229 -0
  36. data/lib/ml/optimize.rb +195 -0
  37. data/lib/ml/timbl.rb +140 -0
  38. data/lib/monkey_patching/array.rb +82 -0
  39. data/lib/monkey_patching/enumerable_bool.rb +24 -0
  40. data/lib/monkey_patching/enumerable_distribute.rb +18 -0
  41. data/lib/monkey_patching/file.rb +131 -0
  42. data/lib/monkey_patching/subsumed.rb +24 -0
  43. data/lib/ruby_class_extensions.rb +4 -0
  44. data/lib/salsa_tiger_xml/corpus.rb +24 -0
  45. data/lib/salsa_tiger_xml/fe_node.rb +98 -0
  46. data/lib/salsa_tiger_xml/file_parts_parser.rb +214 -0
  47. data/lib/salsa_tiger_xml/frame_node.rb +145 -0
  48. data/lib/salsa_tiger_xml/graph_node.rb +347 -0
  49. data/lib/salsa_tiger_xml/reg_xml.rb +285 -0
  50. data/lib/salsa_tiger_xml/salsa_tiger_sentence.rb +596 -0
  51. data/lib/salsa_tiger_xml/salsa_tiger_sentence_graph.rb +333 -0
  52. data/lib/salsa_tiger_xml/salsa_tiger_sentence_sem.rb +438 -0
  53. data/lib/salsa_tiger_xml/salsa_tiger_xml_helper.rb +84 -0
  54. data/lib/salsa_tiger_xml/salsa_tiger_xml_node.rb +161 -0
  55. data/lib/salsa_tiger_xml/sem_node.rb +58 -0
  56. data/lib/salsa_tiger_xml/string_terminals_in_right_order.rb +192 -0
  57. data/lib/salsa_tiger_xml/syn_node.rb +169 -0
  58. data/lib/salsa_tiger_xml/tree_node.rb +59 -0
  59. data/lib/salsa_tiger_xml/ts_syn_node.rb +47 -0
  60. data/lib/salsa_tiger_xml/usp_node.rb +72 -0
  61. data/lib/salsa_tiger_xml/xml_node.rb +163 -0
  62. data/lib/shalmaneser/lib.rb +1 -0
  63. data/lib/tabular_format/fn_tab_format_file.rb +38 -0
  64. data/lib/tabular_format/fn_tab_frame.rb +67 -0
  65. data/lib/tabular_format/fn_tab_sentence.rb +169 -0
  66. data/lib/tabular_format/tab_format_file.rb +91 -0
  67. data/lib/tabular_format/tab_format_named_args.rb +184 -0
  68. data/lib/tabular_format/tab_format_sentence.rb +119 -0
  69. data/lib/value_restriction.rb +49 -0
  70. metadata +131 -0
@@ -0,0 +1 @@
1
+ # A dummy file to require for now.
@@ -0,0 +1,38 @@
1
+ require_relative 'tab_format_file'
2
+ require_relative 'fn_tab_sentence'
3
+ require_relative 'tab_format_named_args'
4
+
5
# Tab-format reader for FrameNet data: covers every column that is in the
# FN lexunit files and can additionally pull a lemma column and a
# part-of-speech column from companion files.
class FNTabFormatFile < TabFormatFile
  class << self
    # Column layout of a FrameNet tab file. The nested array is the column
    # group returned by frametab_format.
    #
    # @return [Array] column names, with one nested Array for the group
    def fntab_format
      ["word", FNTabFormatFile.frametab_format, "ne", "sent_id"]
    end

    # @return [Array<String>] names of the columns that make up one frame group
    def frametab_format
      ["pt", "gf", "role", "target", "frame", "stuff"]
    end

    # Given a hash mapping features to values, formats it according to
    # fntab_format by delegating to TabFormatNamedArgs.format_str.
    #
    # @param hash [Hash] feature name => value
    def format_str(hash)
      TabFormatNamedArgs.format_str(hash, FNTabFormatFile.fntab_format)
    end
  end

  # @param filename [String] path of the FrameNet tab file
  # @param tag_suffix [String, nil] suffix of the companion file that holds
  #   the "pos" column; no such file is read when nil
  # @param lemma_suffix [String, nil] suffix of the companion file that
  #   holds the "lemma" column; no such file is read when nil
  def initialize(filename, tag_suffix = nil, lemma_suffix = nil)
    # Companion files share the path of the main file minus its ".tab" ending.
    stem = "#{File.dirname(filename)}/#{File.basename(filename, '.tab')}"

    # Flat list alternating file name and column list, as the parent
    # constructor expects. The parent raises if a file cannot be opened,
    # which covers missing lemmatisation or tagging files.
    pairs = [filename, FNTabFormatFile.fntab_format]
    pairs.push(stem + lemma_suffix, ["lemma"]) if lemma_suffix
    pairs.push(stem + tag_suffix, ["pos"]) if tag_suffix

    super(pairs)

    @my_sentence_class = FNTabSentence
  end
end
@@ -0,0 +1,67 @@
1
+ require_relative 'fn_tab_sentence'
2
+ require "ruby_class_extensions"
3
+
4
# A FrameNet tab-format sentence viewed through exactly one of its frames.
# It behaves like its parent, except that it remembers a frame number so
# that group features are read from "our" frame.
class FNTabFrame < FNTabSentence
  # @param pattern [Array] column pattern, passed on to the parent
  # @param frameno [Integer] number of the frame group this object exposes
  def initialize(pattern, frameno)
    super(pattern)
    # With @group_no set to the frame number, each TabFormatNamedArgs object
    # built in each_line_parsed() or read_one_line_parsed() is meant to be
    # initialised with the right group number, so that every call to
    # TabFormatNamedArgs#get reads the right group.
    # NOTE(review): those two methods are defined outside this class - confirm.
    @group_no = frameno
  end

  # Returns the frame introduced by the target word(s) of this frame group,
  # taken from the first parsed line.
  def get_frame
    sanity_check
    each_line_parsed do |line|
      return line.get("frame")
    end
  end

  # Returns the indices of the target of the frame: the "lineno" value of
  # every line whose "target" column is not "-". A target may span more
  # than one word.
  #
  # @return [Array] one entry per target line
  def get_target_indices
    sanity_check
    indices = []
    each_line_parsed do |line|
      indices << line.get("lineno") if line.get("target") != "-"
    end

    indices
  end

  # Returns the target. For multiword targets the complete target is found
  # at every target index, so the first "target" entry that is not "-" is
  # returned.
  def get_target
    each_line_parsed do |line|
      target = line.get("target")
      return target if target != "-"
    end
  end

  # Returns the target POS according to FrameNet: the word characters after
  # the only dot in the target, or nil when the target has no such form.
  def get_target_fn_pos
    get_target =~ /^[^\.]+\.(\w+)$/
    Regexp.last_match(1)
  end
end
@@ -0,0 +1,169 @@
1
+ require_relative 'tab_format_sentence'
2
+ # require_relative 'fn_tab_frame'
3
+ require "ruby_class_extensions"
4
# A sentence in FrameNet tab format: one parsed line per word, carrying a
# "sent_id" column and column groups for the frames of the sentence.
class FNTabSentence < TabFormatSentence
  # Reads a feature from a parsed line. Overwrite this to get a feature
  # from a group rather than from the main feature list.
  #
  # @param l [#get] parsed line
  # @param feature_name [String] name of the column to read
  def get_this(l, feature_name)
    l.get(feature_name)
  end

  # Checks the first parsed line only.
  #
  # @raise [RuntimeError] if that line has no "sent_id" entry
  def sanity_check
    each_line_parsed do |l|
      raise "Error: corpus file does not conform to FN format." if l.get("sent_id").nil?

      return
    end
  end

  # Returns the sentence ID as set by FrameNet, read from the first line.
  def get_sent_id
    sanity_check
    each_line_parsed do |l|
      return l.get("sent_id")
    end
  end

  # Iterator: yields each frame of the sentence as a FNTabFrame object.
  # Each object contains the complete sentence but provides access to
  # exactly one frame of it.
  #
  # @yieldparam frame_obj [FNTabFrame]
  def each_frame
    # How many frames? The first line is taken as representative, i.e. every
    # line is assumed to have the same number of frame groups.
    num_frames = read_one_line_parsed(0).num_groups

    0.upto(num_frames - 1) do |frame_no|
      frame_obj = FNTabFrame.new(@pattern, frame_no)
      each_line { |l| frame_obj.add_line(l) }
      yield frame_obj
    end
  end

  # Computes a mapping from word indices to the labels on these words.
  #
  # Entries of the chosen column are read as "-" or "--" (no change),
  # "B-label" (a markable starts), "E-label" (a markable ends) or
  # "B-label:E-label" (one-word markable). Anything else is reported on
  # STDERR and skipped. When a markable starts while another one is still
  # open, a warning is written and the new one is ignored.
  #
  # @param use_this_column [String] column to read the labels from;
  #   role labels (column "role") by default
  # @return [Hash] index list (Array of "lineno" values) => label (String);
  #   an entry il => label means that all lines listed in il carry label
  # @raise [RuntimeError] if the sentence fails sanity_check
  def markables(use_this_column = "role")
    sanity_check

    idlist_to_annotation_list = {}

    ids = []     # line numbers collected for the markable that is open
    label = nil  # label of the markable that is open, nil if none is

    each_line_parsed do |l|
      this_id = get_this(l, "lineno")

      this_col = get_this(l, use_this_column)
      unless this_col
        $stderr.puts "nil entry #{use_this_column} in line #{this_id} of sent #{get_sent_id}. Skipping."
        next
      end
      this_fe_ann = this_col.split(":")

      case this_fe_ann.length
      when 1 # nothing at all, or a single begin or end
        markup = this_fe_ann.first
        if markup == "-" || markup == "--"
          # no change: the word belongs to the open markable, if any
          ids << this_id if label
        elsif (opened = markup[/^B-(\S+)$/, 1])
          if label
            report_markup_problem(
              "[TabFormat] Warning: Markable #{opened} starts while within markable #{label}", ids
            )
          else
            label = opened
            ids << this_id
          end
        elsif (closed = markup[/^E-(\S+)$/, 1])
          if label == closed
            # we close the markable we've opened before: store it, then
            # start over with a fresh list so the stored key is not mutated
            ids << this_id
            idlist_to_annotation_list[ids] = label
            label = nil
            ids = []
          else
            report_markup_problem(
              "[TabFormat] Warning: Markable #{closed} closes while within markable #{label}", ids
            )
          end
        else
          report_markup_problem("[TabFormat] Warning: cannot analyse markup " + markup)
        end
      when 2 # this should be a one-word markable
        b_markup, e_markup = this_fe_ann
        if label
          report_markup_problem(
            "[TabFormat] Warning: Finding new markable at word #{this_id} while within markable #{label}", ids
          )
        elsif (b_label = b_markup[/^B-(\S+)$/, 1]).nil?
          report_markup_problem("[TabFormat] Warning: Unknown start markup " + b_markup, ids)
        elsif (e_label = e_markup[/^E-(\S+)$/, 1]).nil?
          report_markup_problem("[TabFormat] Warning: Unknown end markup " + e_markup, ids)
        elsif b_label == e_label
          idlist_to_annotation_list[[this_id]] = b_label
        else
          report_markup_problem(
            "[TabFormat] Warning: Starting markable " + b_label + ", closing markable " + e_label, ids
          )
        end
      else
        # Also reached for an empty entry, which splits into zero parts.
        report_markup_problem(
          "Warning: cannot analyse markup with more than two colon-separated parts like " +
          this_fe_ann.join(":")
        )
      end
    end

    unless label.nil?
      report_markup_problem("[TabFormat] Warning: Markable #{label} did not end in sentence.", ids)
    end

    idlist_to_annotation_list
  end

  # @return [String] A tokenized sentence: the "word" entries joined by blanks.
  def to_s
    sanity_check
    array = []
    each_line_parsed { |l| array << l.get("word") }

    array.join(' ')
  end

  private

  # Writes one warning line to STDERR, followed by one debug line that names
  # the sentence and, when ids is given, the ID list collected so far.
  def report_markup_problem(message, ids = nil)
    $stderr.puts message
    debug = "Debug data: Sentence id #{get_sent_id}"
    debug += ", current ID list #{ids.join(" ")}" if ids
    $stderr.puts debug
  end
end
@@ -0,0 +1,91 @@
1
+ require_relative 'tab_format_sentence'
2
+
3
+ require "ruby_class_extensions"
4
+
5
# Reads one or more tab-separated files in parallel, line by line, and
# hands out their content sentence by sentence.
class TabFormatFile
  # Opens the files for reading.
  #
  # @param fp [Array] flat list alternating file name and format:
  #   [filename, format, filename, format, ...]. A format is a list of
  #   strings that will be used to address the columns of the file, the 1st
  #   string for the 1st column. A format may contain _one_ entry that is
  #   an array (or a call to repeat()), e.g.
  #   ["word", "pos", "lemma", repeat("frame", "target", "gf", "pt")]
  # @raise [RuntimeError] if one of the files cannot be opened
  def initialize(fp)
    @files = []
    @patterns = []
    @no_of_read_lines = 0
    @read_completely = false

    fp.each_with_index do |entry, ix|
      if ix.even?
        # filename
        @files << open_input(entry)
      else
        # pattern: the columns of all files are addressed by one joint list
        @patterns += entry
      end
    end

    @my_sentence_class = TabFormatSentence
  end

  # Yields each sentence of the files in turn.
  #
  # Sentences are expected to be separated by a line containing nothing but
  # whitespace; the last sentence may or may not be followed by an empty
  # line. Reading ends when EOF is encountered on any of the files; all
  # files are expected to have the same number of lines. Once the end has
  # been reached the file handles are closed and further calls yield
  # nothing.
  #
  # If a line is empty in some files but not in others, an error is written
  # to STDERR and the process exits with status 1.
  #
  # @yieldparam sentence [Object] an instance of @my_sentence_class holding
  #   the lines of one sentence, the columns of all files joined by tabs
  # @todo Change `#readline` to `#gets` to avoid Exceptions.
  def each_sentence
    return if @read_completely
    # Without any file there is nothing to read (and nothing that could
    # ever signal EOF).
    return if @files.empty?

    sentence = @my_sentence_class.new(@patterns)
    begin
      loop do
        linearray = @files.map { |f| f.readline.chomp }
        @no_of_read_lines += 1

        blank, filled = linearray.partition { |x| x.strip == '' }
        if blank.empty?
          # sentence not yet finished: add this line to it
          sentence.add_line(linearray.join("\t"))
        elsif filled.empty?
          # sentence finished: yield it and start a new one
          yield sentence unless sentence.empty?
          sentence = @my_sentence_class.new(@patterns)
        else
          STDERR.puts "Error: Mismatching empty lines! <from lib/common>"
          exit(1)
        end
      end
    rescue EOFError
      # maybe we haven't yielded the last sentence yet.
      yield sentence unless sentence.empty?

      close_files
      @read_completely = true
    end
  end

  private

  # Opens one input file; on failure closes the files opened so far and
  # raises the "could not read" error.
  def open_input(filename)
    File.new(filename)
  rescue StandardError
    close_files
    raise "Sorry, could not read input file #{filename}\n"
  end

  # Closes every file handle that is still open.
  def close_files
    @files.each { |f| f.close unless f.closed? }
  end
end
@@ -0,0 +1,184 @@
1
+ require "ruby_class_extensions"
2
# Keeps one line of a tab-format file in parsed form.
#
# The line is kept as follows:
# - normal features: in a hash @f mapping feature names to values
# - features of the repeated group: in an array @r of TabFormatNamedArgs
#   objects, one per group
#
# Each feature of the line is available by name via #get. Additional
# features (e.g. from other input files) can be attached with #add_feature.
class TabFormatNamedArgs
  # @param values [Array<String>] the column values of the line
  # @param features [Array] how to access the values by name: the i-th
  #   feature names the i-th value, except that at most one entry may be an
  #   Array naming the columns of a group, which may occur any number of
  #   times in values. Features listed after the group are matched against
  #   the last values of the line.
  # @param group [Integer, nil] number of the group that #get falls back to
  #   for names that are not normal features
  # @raise [RuntimeError] if features contains more than one group
  def initialize(values, features, group = nil)
    @f = {}
    @r = []
    @group = group

    # record the feature names, give special attention to a group
    # if we have one
    @group_feature_names = nil
    @feature_names = features.map do |feature|
      if feature.instance_of? Array
        @group_feature_names = feature
        "GROUP"
      else
        feature
      end
    end

    if @feature_names.count("GROUP") > 1
      $stderr.puts "More than one group in feature set:" + features.join(" ")
      raise "Cannot handle this."
    end

    # group_index: position of group in overall feature list
    group_index = @feature_names.index("GROUP") || @feature_names.length
    num_features_after_group = [0, (@feature_names.length - 1) - group_index].max
    index_after_groups = values.length - num_features_after_group

    # features before group: put feature/value pairs in @f hash
    0.upto(group_index - 1) { |i| @f[features[i]] = values[i] }

    # group: store each occurrence of the group in the @r array.
    # An empty group has no columns to read (and would give a zero step).
    if @group_feature_names && !@group_feature_names.empty?
      group_no = 0
      group_index.step(index_after_groups - 1, @group_feature_names.length) do |group_start|
        @r << TabFormatNamedArgs.new(values.slice(group_start, @group_feature_names.length),
                                     @group_feature_names,
                                     group_no)
        group_no += 1
      end
    end

    # features after group: put feature/value pairs in @f hash.
    # On a line with too few columns index_after_groups lies before
    # group_index (possibly below zero); those positions already belong to
    # the leading features, so the affected trailing features stay unset
    # instead of re-reading them or wrapping around via negative indices.
    feature_index = group_index + 1
    index_after_groups.upto(values.length - 1) do |i|
      @f[features[feature_index]] = values[i] if i >= group_index
      feature_index += 1
    end
  end

  # Returns feature/value pairs as a tab format line, with the features in
  # the order given in the 'features' list. Features not set in the hash
  # get the entry "-". If the feature list includes a group, zero entries
  # are assumed for that group. Keys of the hash that are not in the
  # feature list are reported on STDERR and ignored.
  #
  # @param hash [Hash] feature => value
  # @param features [Array, nil] feature list, as for new()
  # @return [String] the formatted line, "" if features is nil
  def self.format_str(hash, features)
    return "" if features.nil?

    # sanity check: does the hash contain keys that are not in the feature list?
    hash.each_key do |name|
      next if features.include?(name)

      $stderr.puts "Error: unknown feature #{name} in format_str: ignoring."
    end

    features.reject { |f| f.instance_of?(Array) } # remove the group, if it's there
            .map { |feature| hash.fetch(feature, '-') }
            .join("\t")
  end

  # Adds another feature to this object, which can then be read via #get.
  #
  # @param name [String] name for the new feature, distinct from the
  #   normal features already present
  # @param feature [Object] the value of the feature
  # @raise [RuntimeError] if a normal feature of that name already exists
  def add_feature(name, feature)
    raise "Trying to add a feature twice: #{name}." if @f.key? name

    @f[name] = feature
  end

  # Returns the value of the feature with the given name: the normal
  # feature if it has a value, otherwise the feature of that name in the
  # group selected at construction time, otherwise nil.
  #
  # @param name [String]
  def get(name)
    get_nongroup(name) || get_from_group(name, @group)
  end

  # Sets a normal feature, overwriting any previous value.
  def set(name, feature)
    @f[name] = feature
  end

  # @return [Integer] number of groups found on the line
  def num_groups
    @r.length
  end

  # Returns the line as a string, entries connected by tab, in the order
  # that the entries were in originally.
  def to_s
    @feature_names.map do |feature|
      if feature == "GROUP"
        @r.map(&:to_s).join("\t")
      else
        @f[feature]
      end
    end.join("\t")
  end

  protected

  # Returns the value of a normal (non-group) feature.
  def get_nongroup(feature)
    @f[feature]
  end

  # Returns the value of a feature from the group with the given number,
  # nil if there is no group with that number.
  def get_from_group(name, group_no)
    return nil if !group_no || group_no >= @r.length

    @r[group_no].get_nongroup(name)
  end
end