shalmaneser 0.0.1.alpha

Sign up to get free protection for your applications and to get access to all the features.
Files changed (138) hide show
  1. data/.yardopts +8 -0
  2. data/CHANGELOG.rdoc +0 -0
  3. data/LICENSE.rdoc +0 -0
  4. data/README.rdoc +0 -0
  5. data/lib/common/AbstractSynInterface.rb +1227 -0
  6. data/lib/common/BerkeleyInterface.rb +375 -0
  7. data/lib/common/CollinsInterface.rb +1165 -0
  8. data/lib/common/ConfigData.rb +694 -0
  9. data/lib/common/Counter.rb +18 -0
  10. data/lib/common/DBInterface.rb +48 -0
  11. data/lib/common/EnduserMode.rb +27 -0
  12. data/lib/common/Eval.rb +480 -0
  13. data/lib/common/FixSynSemMapping.rb +196 -0
  14. data/lib/common/FrPrepConfigData.rb +66 -0
  15. data/lib/common/FrprepHelper.rb +1324 -0
  16. data/lib/common/Graph.rb +345 -0
  17. data/lib/common/ISO-8859-1.rb +24 -0
  18. data/lib/common/ML.rb +186 -0
  19. data/lib/common/Maxent.rb +215 -0
  20. data/lib/common/MiniparInterface.rb +1388 -0
  21. data/lib/common/Optimise.rb +195 -0
  22. data/lib/common/Parser.rb +213 -0
  23. data/lib/common/RegXML.rb +269 -0
  24. data/lib/common/RosyConventions.rb +171 -0
  25. data/lib/common/SQLQuery.rb +243 -0
  26. data/lib/common/STXmlTerminalOrder.rb +194 -0
  27. data/lib/common/SalsaTigerRegXML.rb +2347 -0
  28. data/lib/common/SalsaTigerXMLHelper.rb +99 -0
  29. data/lib/common/SleepyInterface.rb +384 -0
  30. data/lib/common/SynInterfaces.rb +275 -0
  31. data/lib/common/TabFormat.rb +720 -0
  32. data/lib/common/Tiger.rb +1448 -0
  33. data/lib/common/TntInterface.rb +44 -0
  34. data/lib/common/Tree.rb +61 -0
  35. data/lib/common/TreetaggerInterface.rb +303 -0
  36. data/lib/common/headz.rb +338 -0
  37. data/lib/common/option_parser.rb +13 -0
  38. data/lib/common/ruby_class_extensions.rb +310 -0
  39. data/lib/fred/Baseline.rb +150 -0
  40. data/lib/fred/FileZipped.rb +31 -0
  41. data/lib/fred/FredBOWContext.rb +863 -0
  42. data/lib/fred/FredConfigData.rb +182 -0
  43. data/lib/fred/FredConventions.rb +232 -0
  44. data/lib/fred/FredDetermineTargets.rb +324 -0
  45. data/lib/fred/FredEval.rb +312 -0
  46. data/lib/fred/FredFeatureExtractors.rb +321 -0
  47. data/lib/fred/FredFeatures.rb +1061 -0
  48. data/lib/fred/FredFeaturize.rb +596 -0
  49. data/lib/fred/FredNumTrainingSenses.rb +27 -0
  50. data/lib/fred/FredParameters.rb +402 -0
  51. data/lib/fred/FredSplit.rb +84 -0
  52. data/lib/fred/FredSplitPkg.rb +180 -0
  53. data/lib/fred/FredTest.rb +607 -0
  54. data/lib/fred/FredTrain.rb +144 -0
  55. data/lib/fred/PlotAndREval.rb +480 -0
  56. data/lib/fred/fred.rb +45 -0
  57. data/lib/fred/md5.rb +23 -0
  58. data/lib/fred/opt_parser.rb +250 -0
  59. data/lib/frprep/AbstractSynInterface.rb +1227 -0
  60. data/lib/frprep/Ampersand.rb +37 -0
  61. data/lib/frprep/BerkeleyInterface.rb +375 -0
  62. data/lib/frprep/CollinsInterface.rb +1165 -0
  63. data/lib/frprep/ConfigData.rb +694 -0
  64. data/lib/frprep/Counter.rb +18 -0
  65. data/lib/frprep/FNCorpusXML.rb +643 -0
  66. data/lib/frprep/FNDatabase.rb +144 -0
  67. data/lib/frprep/FixSynSemMapping.rb +196 -0
  68. data/lib/frprep/FrPrepConfigData.rb +66 -0
  69. data/lib/frprep/FrameXML.rb +513 -0
  70. data/lib/frprep/FrprepHelper.rb +1324 -0
  71. data/lib/frprep/Graph.rb +345 -0
  72. data/lib/frprep/ISO-8859-1.rb +24 -0
  73. data/lib/frprep/MiniparInterface.rb +1388 -0
  74. data/lib/frprep/Parser.rb +213 -0
  75. data/lib/frprep/RegXML.rb +269 -0
  76. data/lib/frprep/STXmlTerminalOrder.rb +194 -0
  77. data/lib/frprep/SalsaTigerRegXML.rb +2347 -0
  78. data/lib/frprep/SalsaTigerXMLHelper.rb +99 -0
  79. data/lib/frprep/SleepyInterface.rb +384 -0
  80. data/lib/frprep/SynInterfaces.rb +275 -0
  81. data/lib/frprep/TabFormat.rb +720 -0
  82. data/lib/frprep/Tiger.rb +1448 -0
  83. data/lib/frprep/TntInterface.rb +44 -0
  84. data/lib/frprep/Tree.rb +61 -0
  85. data/lib/frprep/TreetaggerInterface.rb +303 -0
  86. data/lib/frprep/do_parses.rb +142 -0
  87. data/lib/frprep/frprep.rb +686 -0
  88. data/lib/frprep/headz.rb +338 -0
  89. data/lib/frprep/one_parsed_file.rb +28 -0
  90. data/lib/frprep/opt_parser.rb +94 -0
  91. data/lib/frprep/ruby_class_extensions.rb +310 -0
  92. data/lib/rosy/AbstractFeatureAndExternal.rb +240 -0
  93. data/lib/rosy/DBMySQL.rb +146 -0
  94. data/lib/rosy/DBSQLite.rb +280 -0
  95. data/lib/rosy/DBTable.rb +239 -0
  96. data/lib/rosy/DBWrapper.rb +176 -0
  97. data/lib/rosy/ExternalConfigData.rb +58 -0
  98. data/lib/rosy/FailedParses.rb +130 -0
  99. data/lib/rosy/FeatureInfo.rb +242 -0
  100. data/lib/rosy/GfInduce.rb +1115 -0
  101. data/lib/rosy/GfInduceFeature.rb +148 -0
  102. data/lib/rosy/InputData.rb +294 -0
  103. data/lib/rosy/RosyConfigData.rb +115 -0
  104. data/lib/rosy/RosyConfusability.rb +338 -0
  105. data/lib/rosy/RosyEval.rb +465 -0
  106. data/lib/rosy/RosyFeatureExtractors.rb +1609 -0
  107. data/lib/rosy/RosyFeaturize.rb +280 -0
  108. data/lib/rosy/RosyInspect.rb +336 -0
  109. data/lib/rosy/RosyIterator.rb +477 -0
  110. data/lib/rosy/RosyPhase2FeatureExtractors.rb +230 -0
  111. data/lib/rosy/RosyPruning.rb +165 -0
  112. data/lib/rosy/RosyServices.rb +744 -0
  113. data/lib/rosy/RosySplit.rb +232 -0
  114. data/lib/rosy/RosyTask.rb +19 -0
  115. data/lib/rosy/RosyTest.rb +826 -0
  116. data/lib/rosy/RosyTrain.rb +232 -0
  117. data/lib/rosy/RosyTrainingTestTable.rb +786 -0
  118. data/lib/rosy/TargetsMostFrequentFrame.rb +60 -0
  119. data/lib/rosy/View.rb +418 -0
  120. data/lib/rosy/opt_parser.rb +379 -0
  121. data/lib/rosy/rosy.rb +77 -0
  122. data/lib/shalmaneser/version.rb +3 -0
  123. data/test/frprep/test_opt_parser.rb +94 -0
  124. data/test/functional/functional_test_helper.rb +40 -0
  125. data/test/functional/sample_experiment_files/fred_test.salsa.erb +122 -0
  126. data/test/functional/sample_experiment_files/fred_train.salsa.erb +135 -0
  127. data/test/functional/sample_experiment_files/prp_test.salsa.erb +138 -0
  128. data/test/functional/sample_experiment_files/prp_test.salsa.fred.standalone.erb +120 -0
  129. data/test/functional/sample_experiment_files/prp_test.salsa.rosy.standalone.erb +120 -0
  130. data/test/functional/sample_experiment_files/prp_train.salsa.erb +138 -0
  131. data/test/functional/sample_experiment_files/prp_train.salsa.fred.standalone.erb +138 -0
  132. data/test/functional/sample_experiment_files/prp_train.salsa.rosy.standalone.erb +138 -0
  133. data/test/functional/sample_experiment_files/rosy_test.salsa.erb +257 -0
  134. data/test/functional/sample_experiment_files/rosy_train.salsa.erb +259 -0
  135. data/test/functional/test_fred.rb +47 -0
  136. data/test/functional/test_frprep.rb +52 -0
  137. data/test/functional/test_rosy.rb +20 -0
  138. metadata +284 -0
@@ -0,0 +1,196 @@
1
+ ###
2
+ # FixSynSemMapping:
3
+ # Given a SalsaTigerRegXML sentence with semantic role annotation,
4
+ # simplify the mapping of semantic roles to syntactic constituents
5
+ #
6
+ # The following is lifted from the LREC06 paper on Shalmaneser:
7
+ # During preprocessing, the span of semantic roles in the training corpora is
8
+ # projected onto the output of the syntactic parser by assigning each
9
+ # role to the set of maximal constituents covering its word span.
10
+ # If the word span of a role does not coincide
11
+ # with parse tree constituents, e.g. due to misparses,
12
+ # the role is ``spread out'' across several constituents. This leads to
13
+ # idiosyncratic paths between predicate and semantic role in the parse
14
+ # tree.
15
+ #
16
+ # [The following span standardization algorithm is used to make the
17
+ # syntax-semantics mapping more uniform:]
18
+ # Given a role r that has been assigned, let N be the set of
19
+ # terminal nodes of the syntactic structure that are covered by r.
20
+ #
21
+ # Iteratively compute the maximal projection of N in the syntactic
22
+ # structure:
23
+ # 1) If n is a node such that all of n's children are in N,
24
+ # then remove n's children from N and add n instead.
25
+ # 2) If n is a node with 3 or more children, and all of n's
26
+ # children except one are in N, then remove n's children from N
27
+ # and add n instead.
28
+ # 3) If n is an NP with 2 children, and one of them, another NP,
29
+ # is in N, and the other, a relative clause, is not, then remove
30
+ # n's children from N and add n instead.
31
+ #
32
+ # If none of the rules is applicable to N anymore, assign r to the
33
+ # nodes in N.
34
+ #
35
+ # Rule 1 implements normal maximal projection. Rule 2 ``repairs'' parser
36
+ # errors where all children of a node but one have been assigned the
37
+ # same role. Rule 3 addresses a problem of the FrameNet data, where
38
+ # relative clauses have been omitted from roles assigned to NPs.
39
+
40
+ # KE Feb 08: rule 3 currently out of commission!
41
+
42
+ require "common/SalsaTigerRegXML"
43
+
44
module FixSynSemMapping
  ##
  # Simplify the mapping of frame elements (FEs) to syntactic nodes
  # for every frame of one sentence. Modifies the sentence in place.
  #
  # Relevant settings in the experiment file:
  #
  # fe_syn_repair:
  #   Detach all syntactic nodes of an FE and re-attach the maximal
  #   constituents computed by interpreter_class.max_constituents for
  #   the nodes yielded by the old ones. The setting's value is passed
  #   on to max_constituents as its third argument.
  #
  # fe_rel_repair:
  #   If the last syntactic node of an FE is a wh-word (simplified
  #   part of speech starting with WDT, WP, WP$ or WRB), make the FE
  #   point to that node only.
  #
  # FEs pointing to fewer than two syntactic nodes are left untouched.
  # Nothing happens if neither setting is active or if sent is nil.
  #
  # sent:              SalsaTigerSentence object
  # exp:               experiment file object (must respond to get)
  # interpreter_class: SynInterpreter class
  def FixSynSemMapping.fixit(sent, # SalsaTigerSentence object
                             exp,  # experiment file object
                             interpreter_class) # SynInterpreter class

    return unless (exp.get("fe_syn_repair") || exp.get("fe_rel_repair"))
    return if sent.nil?

    # Detach all syntactic nodes from an FE and return them.
    # We iterate over a copy of the child list: if children() hands out
    # the FE's live array, removing nodes while iterating over it would
    # skip every other node and leave the returned list incomplete.
    detach_children = lambda { |fe|
      detached = fe.children().dup
      detached.each { |child| fe.remove_child(child) }
      detached
    }

    # "repair" FEs:
    sent.each_frame { |frame|
      frame.each_child { |fe_or_target|

        # repair only if the FE currently points to more than one syn node
        next if fe_or_target.children.length() < 2

        if exp.get("fe_rel_repair")
          lastfe = fe_or_target.children.last()
          # The alternatives are grouped so that the anchor applies to all
          # of them; previously only WDT was anchored, WP/WRB matched
          # anywhere in the tag.
          if lastfe and interpreter_class.simplified_pt(lastfe) =~ /^(?:WDT|WP\$?|WRB)/
            # point only to the last previous node, the relative pronoun
            detach_children.call(fe_or_target)
            fe_or_target.add_child(lastfe)
          end
        end

        if exp.get("fe_syn_repair")
          # remove syn nodes that the FE points to ...
          old_fe_syn = detach_children.call(fe_or_target)

          # ... recompute the maximal constituents for their yield ...
          covered_nodes = old_fe_syn.map { |t| t.yield_nodes }.flatten.uniq
          new_fe_syn = interpreter_class.max_constituents(covered_nodes,
                                                          sent,
                                                          exp.get("fe_syn_repair"))

          # ... and make the FE point to the new nodes
          new_fe_syn.each { |syn_node| fe_or_target.add_child(syn_node) }
        end
      } # each FE
    } # each frame
  end # def fixit
end # module
124
+
125
+
126
+ #########3
127
+ # old code
128
+
129
+ # if exp.get("fe_rel_repair")
130
+ # # repair relative clauses:
131
+ # # then make a procedure to pass on to max constituents
132
+ # # that will recognize the relevant cases
133
+
134
+ # accept_anyway_proc = Proc.new { |node, children_in, children_out|
135
+
136
+ # # node: SynNode
137
+ # # children_in, children_out: array:SynNode. children_in are the children
138
+ # # that are already covered by the FE, children_out the ones that aren't
139
+
140
+ # # if node is an NP,
141
+ # # and only one of its children is out,
142
+ # # and one node in children_in is an NP, and the missing child is an SBAR
143
+ # # with a child that is a relative pronoun, then consider the child in children_out as covered
144
+ # if interpreter_class.category(node) == "noun" and
145
+ # children_out.length() == 1 and
146
+ # children_in.select { |n| interpreter_class.category(n) == "noun" } and
147
+ # interpreter_class.category(children_out.first) == "sent" and
148
+ # (ch = children_out.first.children) and
149
+ # ch.select { |n| interpreter_class.relative_pronoun?(n) }
150
+ # true
151
+ # else
152
+ # false
153
+ # end
154
+ # }
155
+
156
+ # else
157
+ # accept_anyway_proc = nil
158
+ # end
159
+
160
+
161
+ # # "repair" FEs:
162
+ # sent.each_frame { |frame|
163
+
164
+ # frame.each_child { |fe_or_target|
165
+
166
+ # # repair only if the FE currently
167
+ # # points to more than one syn node, or
168
+ # # if it is a noun with a non-covered sentence sister
169
+ # if fe_or_target.children.length() > 1 or
170
+ # (exp.get("fe_rel_repair") and (curr_marked = fe_or_target.children.first()) and
171
+ # interpreter_class.category(curr_marked) == "noun" and
172
+ # (p = curr_marked.parent) and
173
+ # p.children.select { |n| n != curr_marked and interpreter_class.category(n) == "sent" } )
174
+
175
+ # # remember nodes covered by the FE
176
+ # old_fe_syn = fe_or_target.children()
177
+
178
+ # # remove syn nodes that the FE points to
179
+ # old_fe_syn.each { |child|
180
+ # fe_or_target.remove_child(child)
181
+ # }
182
+
183
+ # # and recompute
184
+ # new_fe_syn = interpreter_class.max_constituents(old_fe_syn.map { |t| t.yield_nodes}.flatten.uniq,
185
+ # sent,
186
+ # exp.get("fe_syn_repair"),
187
+ # accept_anyway_proc)
188
+
189
+ # # make the FE point to the new nodes
190
+ # new_fe_syn.each { |syn_node|
191
+ # fe_or_target.add_child(syn_node)
192
+ # }
193
+
194
+ # end # if FE points to more than one syn node
195
+ # } # each FE
196
+ # } # each frame
@@ -0,0 +1,66 @@
1
+ # FrPrepConfigData
2
+ # Katrin Erk July 05
3
+ #
4
+ # Preprocessing for Fred and Rosy:
5
+ # access to a configuration and experiment description file
6
+
7
+ require "common/ConfigData"
8
+
9
+ ##############################
10
+ # Class FrPrepConfigData
11
+ #
12
+ # inherits from ConfigData,
13
+ # sets variable names appropriate to preprocessing task
14
+
15
class FrPrepConfigData < ConfigData
  # Read the frprep experiment file +filename+.
  # Declares the parameter names known to the preprocessing step,
  # each with its value type, and hands them to ConfigData together
  # with an empty list of variables.
  def initialize(filename)
    # [parameter name, type] pairs, in declaration order
    typed_parameters = [
      ["prep_experiment_ID", "string"],     # experiment identifier
      ["frprep_directory", "string"],       # dir for frprep internal data

      # information about the dataset
      ["language", "string"],               # en, de
      ["origin", "string"],                 # FrameNet, Salsa, or nothing
      ["format", "string"],                 # Plain, SalsaTab, FNXml, FNCorpusXml, SalsaTigerXML
      ["encoding", "string"],               # utf8, iso, hex, or nothing

      # directories
      ["directory_input", "string"],        # dir with input data
      ["directory_preprocessed", "string"], # dir with output Salsa/Tiger XML data
      ["directory_parserout", "string"],    # dir with parser output for the parser named below

      # syntactic processing
      ["pos_tagger", "string"],             # name of POS tagger
      ["lemmatizer", "string"],             # name of lemmatizer
      ["parser", "string"],                 # name of parser
      ["pos_tagger_path", "string"],        # path to POS tagger
      ["lemmatizer_path", "string"],        # path to lemmatizer
      ["parser_path", "string"],            # path to parser
      ["parser_max_sent_num", "integer"],   # max number of sentences per parser input file
      ["parser_max_sent_len", "integer"],   # max sentence length the parser handles

      ["do_parse", "bool"],                 # use parser?
      ["do_lemmatize", "bool"],             # use lemmatizer?
      ["do_postag", "bool"],                # use POS tagger?

      # output format: if tabformat_output == true,
      # output in Tab format rather than Salsa/Tiger XML
      # (this will not work if do_parse == true)
      ["tabformat_output", "bool"],

      # syntactic repairs, dependent on existing semantic role annotation
      ["fe_syn_repair", "bool"],            # map words to constituents for FEs: idealize?
      ["fe_rel_repair", "bool"]             # FEs: include non-included relative clauses into FEs
    ]

    parameter_types = Hash.new
    typed_parameters.each { |name, type| parameter_types[name] = type }

    # initialize config data object: config file, parameter types, variables
    super(filename, parameter_types, [ ])
  end
end
64
+
65
+
66
+
@@ -0,0 +1,1324 @@
1
+ # Salsa packages
2
+ require "common/ISO-8859-1"
3
+ require "common/Parser"
4
+ require "common/RegXML"
5
+ require "common/SalsaTigerRegXML"
6
+ require "common/SalsaTigerXMLHelper"
7
+ require "common/TabFormat"
8
+ require "common/ruby_class_extensions"
9
+ require "common/AbstractSynInterface"
10
+
11
+ ############################################3
12
+ # Module FrprepHelper:
13
+ #
14
+ # diverse transformation methods for frprep.rb
15
+ # moved over here to make the main file less crowded
16
+ module FrprepHelper
17
+
18
+ ####
19
+ # transform a file to UTF-8 from a given encoding
20
# Reads input_filename line by line and writes each line, converted
# to UTF-8, to output_filename.
#
# encoding "iso": lines are passed through UtfIso.from_iso_8859_1
# encoding "hex": lines are passed through Ampersand.hex_to_iso first,
#                 then through UtfIso.from_iso_8859_1
#
# Raises a RuntimeError if either file cannot be opened, or if a line
# is read while encoding is neither "iso" nor "hex".
# Both files are closed even if the conversion fails half-way.
def FrprepHelper.to_utf8_file(input_filename, # string: name of input file
                              output_filename, # string: name of output file
                              encoding) # string: "iso", "hex"
  infile = nil
  begin
    infile = File.new(input_filename)
    outfile = File.new(output_filename, "w")
  rescue
    # the input may already be open if only the output could not be created
    infile.close() if infile
    raise "Could not read #{input_filename}, or could not write to #{output_filename}."
  end

  begin
    while (line = infile.gets())
      case encoding
      when "iso"
        outfile.puts UtfIso.from_iso_8859_1(line)
      when "hex"
        outfile.puts UtfIso.from_iso_8859_1(Ampersand.hex_to_iso(line))
      else
        raise "Shouldn't be here."
      end
    end
  ensure
    infile.close()
    outfile.close()
  end
end
43
+
44
+ ####
45
+ # transform BNC format file to plaintext file
46
# Extracts the sentences of a BNC format file as plain text, one
# sentence per output line.
#
# Only lines starting with an <s n=...> tag are used. All tags are
# replaced by spaces, surrounding whitespace is stripped and runs of
# spaces are squeezed; afterwards the character entities listed below
# are replaced by plain ASCII approximations.
#
# Raises a RuntimeError if either file cannot be opened.
# Both files are closed even if processing fails half-way.
def FrprepHelper.bnc_to_plain_file(input_filename, # string: name of input file
                                   output_filename) # string: name of output file
  # [entity, replacement] pairs, applied one after the other in this order
  entity_replacements = [
    ["&bquo;", '"'], ["&equo;", '"'],
    ["&mdash;", "-"], ["&ndash;", "-"],
    ["&percnt;", "%"],
    ["&pound;", " pounds "],
    ["&amp;", " and "],
    ["&hellip;", "..."],
    ["&copy;", "(copyright)"],
    ["&eacute;", "e"],
    ["&bull;", "*"],
    ["&dollar;", "$"],
    ["&deg;", " degree "],

    ["&frac12;", "1/2"], ["&frac34;", "3/4"],

    ["&lsqb;", "["], ["&rsqb;", "]"],

    ["&ins;", "i"], ["&ft;", "ft"],

    ["&rarr;", ">"], ["&larr;", "<"],

    ["&aacute;", "a"], ["&auml;", "a"], ["&agrave;", "a"], ["&atilde;", "a"], ["&acirc;", "a"],
    ["&Aacute;", "A"], ["&Auml;", "A"], ["&Agrave;", "A"], ["&Atilde;", "A"], ["&Acirc;", "A"],

    ["&egrave;", "e"], ["&ecirc;", "e"], ["&euml;", "e"],
    ["&Eacute;", "E"], ["&Egrave;", "E"], ["&Ecirc;", "E"], ["&Euml;", "E"],

    ["&iacute;", "i"], ["&igrave;", "i"], ["&icirc;", "i"], ["&iuml;", "i"],
    # &Iuml; was missing from the original list although all its siblings were handled
    ["&Iacute;", "I"], ["&Igrave;", "I"], ["&Icirc;", "I"], ["&Iuml;", "I"],

    ["&oacute;", "o"], ["&ograve;", "o"], ["&ocirc;", "o"], ["&ouml;", "o"],
    ["&Oacute;", "O"], ["&Ograve;", "O"], ["&Ocirc;", "O"], ["&Ouml;", "O"],

    ["&uacute;", "u"], ["&ugrave;", "u"], ["&ucirc;", "u"], ["&uuml;", "u"],
    ["&Uacute;", "U"], ["&Ugrave;", "U"], ["&Ucirc;", "U"], ["&Uuml;", "U"],

    ["&yuml;", "y"], ["&Yuml;", "Y"],

    ["&ntilde;", "n"], ["&Ntilde;", "N"],

    ["&ccedil;", "c"], ["&Ccedil;", "C"]
  ]

  infile = nil
  begin
    infile = File.new(input_filename)
    outfile = File.new(output_filename, "w")
  rescue
    # the input may already be open if only the output could not be created
    infile.close() if infile
    raise "Could not read #{input_filename}, or could not write to #{output_filename}."
  end

  begin
    infile.each { |line|
      # does this line contain a sentence?
      next unless line =~ /^\s*<s\s+n=/

      # remove all tags, replace by spaces,
      # then remove superfluous spaces
      textline = line.gsub(/<.+?>/, " ").strip().squeeze(" ")

      entity_replacements.each { |entity, replacement|
        textline.gsub!(entity, replacement)
      }

      outfile.puts textline
    }
  ensure
    infile.close()
    outfile.close()
  end
end
152
+
153
+
154
+ ####
155
+ # transform plaintext file to Tab format file
156
# Converts a plain text file with one sentence per line into a
# Tab format file: one word per line, sentences separated by an
# empty line.
#
# Each input line is split at whitespace. Punctuation attached to the
# beginning or the end of a word is split off, one token per
# punctuation character.
#
# Sentence IDs are "<filename_core>_<n>", where n counts the lines of
# this input file starting at 0.
# NOTE(review): File.basename(name, "txt") strips only "txt", so for
# "foo.txt" the core is "foo." (with the dot) -- kept as is, because
# changing it would change all sentence IDs.
#
# Raises a RuntimeError if either file cannot be opened.
# Both files are closed when the method is left.
def FrprepHelper.plain_to_tab_file(input_filename,# string: name of input file
                                   output_filename) # string: name of output file
  infile = nil
  begin
    infile = File.new(input_filename)
    outfile = File.new(output_filename, "w")
  rescue
    # the input may already be open if only the output could not be created
    infile.close() if infile
    raise "Could not read #{input_filename}, or could not write to #{output_filename}."
  end

  begin
    filename_core = File.basename(input_filename, "txt")

    # array(string): keep the words of each sentence
    sentence = Array.new
    # sentence number for making the sentence ID:
    # running count within this input file
    sentno = 0

    while (line = infile.gets())

      # make a sentence ID for the next sentence: running number
      sentid = filename_core + "_" + sentno.to_s
      sentno += 1

      # read words into the sentence array,
      # separating out punctuation attached to the beginning or end of words
      sentence.clear()
      line.split.each { |word|
        # punctuation at the beginning of the word
        if word =~ /^([\(\[`\"-]+)(.*)$/
          punct = $1
          word = $2
          punct.scan(/./) { |single_punct|
            sentence << single_punct
          }
        end

        # punctuation at the end of the word.
        # The hyphen is escaped: unescaped, ";-`" formed a character range
        # that covered all uppercase letters (and <=>@[\]^_), so words ending
        # in capitals were chopped into single letters.
        trailing = /[,:;\-`?!\"\.\)\]]+$/.match(word)
        if trailing
          sentence << trailing.pre_match # part before the match: the word
          trailing[0].scan(/./) { |single_punct|
            sentence << single_punct
          }
        else
          # no punctuation recognized
          sentence << word
        end
      }

      # remove empty words
      sentence.reject! { |word| word.nil? or word.strip.empty? }

      # write words to tab file
      # KE Dec 06: TabFormat changed
      sentence.each { |word|
        # for each word, one line, entries in the line tab-separated
        # the 'word' entry is the word, the 'sent_id' entry is the sentence ID sentid,
        # all other entries (gf, pt, frame etc.) are not set
        outfile.puts FNTabFormatFile.format_str({
                                                  "word" => word,
                                                  "sent_id" => sentid
                                                })
      }
      outfile.puts
    end
  ensure
    infile.close()
    outfile.close()
  end
end
227
+
228
+ ###########
229
+ #
230
+ # class method split_dir:
231
+ # read all files in one directory and produce chunk files *#{suffix} in outdir
232
+ # with a certain number of sentences in them (sent_num).
233
+ # Optionally, remove all sentences longer than sent_leng
234
+ #
235
+ # produces output files 1.<suffix>, 2.<suffix>, etc.
236
+ #
237
+ # assumes TabFormat sentences
238
+ #
239
+ # example: split_dir("/tmp/in","/tmp/out",".tab",2000,80)
240
+
241
# Reads all files indir/*<suffix> (TabFormat: one word per line,
# sentences separated by empty lines) and redistributes their sentences
# over chunk files outdir/0<suffix>, outdir/1<suffix>, ... with
# sent_num sentences each (the last chunk may hold fewer).
#
# indir, outdir: string: directories; a trailing "/" is added if missing
# suffix:        string: file name suffix of input and output files
# sent_num:      integer: number of sentences per output file
# sent_leng:     integer or nil: if given, sentences longer than this
#                are cut off after sent_leng words
#
# A sentence at the end of an input file is recorded even if it is not
# followed by an empty line.
def FrprepHelper.split_dir(indir,
                           outdir,
                           suffix,
                           sent_num,
                           sent_leng=nil)

  indir += "/" unless indir[-1,1] == "/"
  outdir += "/" unless outdir[-1,1] == "/"

  outfile_counter = 0
  line_stack = Array.new # lines (words) of the sentence currently being read
  sent_stack = Array.new # complete sentences waiting to be written

  # write all collected sentences to the next chunk file
  write_chunk = lambda {
    File.open(outdir + outfile_counter.to_s + "#{suffix}", "w") { |outfile|
      sent_stack.each { |l_stack|
        outfile.puts l_stack.join("\n")
        outfile.puts
      }
    }
    outfile_counter += 1
    sent_stack = Array.new
  }

  # end of sentence: shorten it if necessary, record it,
  # and write a chunk file if enough sentences have been collected
  finish_sentence = lambda {
    # change (sp 15 01 07): just cut off sentence at sent_leng.
    # (only report sentences that actually lose words)
    if sent_leng and line_stack.length > sent_leng
      STDERR.puts "Cutting off long sentence #{line_stack.last.split("\t").last}"
      line_stack = line_stack.first(sent_leng)
    end

    # suppress multiple empty lines to avoid problems with lemmatiser:
    # only record the sentence if it is not empty
    unless line_stack.empty?
      sent_stack << line_stack
      line_stack = Array.new
    end

    # enough sentences for new outfile?
    write_chunk.call if sent_stack.length == sent_num
  }

  Dir[indir+"*#{suffix}"].each { |infilename|
    STDERR.puts "Now splitting #{infilename}"
    File.open(infilename) { |infile|
      while line = infile.gets
        line.chomp!
        if line.empty? # end of sentence
          finish_sentence.call
        else # for any other line
          line_stack << line
        end
      end
    }

    # A file that does not end in an empty line leaves its last sentence
    # pending; record it here so that it is neither glued to the first
    # sentence of the next file nor lost after the last file.
    finish_sentence.call unless line_stack.empty?
  }

  # the last remaining sentences
  write_chunk.call unless sent_stack.empty?
end
311
+
312
+ ####
313
+ # note salsa targetlemma
314
+ #
315
+ # old_dir contains xml files whose name starts with the
316
+ # target lemma for all frames in the file
317
+ # record that target lemma in the <target> element of each frame
318
# Copies every *.xml file of old_dir to new_dir under the same name.
# The target lemma is taken from the file name (everything before the
# first "_" or ".") and written into the "lemma" attribute of the
# target of each frame of each sentence.
#
# Files whose name yields no lemma are copied unchanged.
#
# old_dir, new_dir: strings ending in "/"
def FrprepHelper.note_salsa_targetlemma(old_dir, # string ending in /
                                        new_dir) # string ending in /

  # each input file: extract target lemma from filename,
  # note this lemma in the <target> element of each frame
  Dir[old_dir + "*.xml"].each { |filename|
    changedfilename = new_dir + File.basename(filename)

    if File.basename(filename) =~ /^(.*?)[_\.]/
      lemma = $1

      infile = FilePartsParser.new(filename)
      begin
        # block form: the output file is closed even if a sentence cannot be processed
        File.open(changedfilename, "w") { |outfile|
          # write header
          outfile.puts infile.head()

          # iterate through sentences, yield as SalsaTigerSentence objects
          infile.scan_s() { |sent_string|
            sent = SalsaTigerSentence.new(sent_string)
            sent.each_frame { |frame|
              frame.target.set_attribute("lemma", lemma)
            }

            # write changed sentence
            outfile.puts sent.get()
          } # each sentence

          # write footer
          outfile.puts infile.tail()
        }
      ensure
        infile.close()
      end

    else
      # couldn't determine lemma: just copy the file.
      # Copy within Ruby rather than via a shell command, which breaks
      # on file names containing spaces or shell metacharacters.
      IO.copy_stream(filename, changedfilename)
    end
  }
end
359
+
360
+ ####
361
+ # stxml_split_dir
362
+ #
363
+ # split SalsaTigerXML files into new files of given length,
364
+ # skipping sentences that are too long
365
+ #
366
+ # At the same time, sentences that occur several times (i.e. sentences which are
367
+ # annotated by SALSA for more than one predicate) are compacted into one occurrence
368
+ # with combined semantics.
369
+ #
370
+ # assumes that all files in input_dir with
371
+ # extension .xml are SalsaTigerXMl files
372
# Reads all files input_dir*.xml and writes their sentences to new
# files split_dir0.xml, split_dir1.xml, ... with at most max_sentnum
# sentences each. Sentences that occur several times (same sentence id)
# are written once, with the frames and underspecification blocks of
# all occurrences combined; frames get new ids "<sentid>_f<n>".
#
# input_dir:   string: input directory with STXML files (used as a
#              prefix, so it should end in "/")
# split_dir:   string: output directory (also used as a prefix)
# max_sentnum: integer: max num of sentences per file
# max_sentlen: integer: max num of terminals per sentence.
#              Currently unused: skipping of overly long sentences had
#              already been disabled, they are written like all others.
#
# Sentences without an id are reported on STDERR and skipped.
def FrprepHelper.stxml_split_dir(input_dir, # string: input directory with STXML files
                                 split_dir, # string: output directory
                                 max_sentnum, # integer: max num of sentences per file
                                 max_sentlen) # integer: max num of terminals per sentence

  filenames = Dir[input_dir+"*.xml"].to_a

  graph_hash = Hash.new # for each sentence id, keep <s...</graph>
  frame_hash = Hash.new # for each sentence id, keep the <frame... </frame> strings
  uspfes_hash = Hash.new # for each sentence id, keep the uspfes blocks
  uspframes_hash = Hash.new # for each sentence id, keep the uspframes blocks

  # Replace an id wherever it occurs in text, including as the prefix of
  # longer ids such as "<frameid>_e1" -- but not where it is directly
  # followed by a digit: "s1_f1" must not rewrite part of "s1_f10".
  rename_id = lambda { |text, old_id, new_id|
    text.gsub(/#{Regexp.escape(old_id)}(?!\d)/) { new_id }
  }

  ########################
  # Traverse the file(s): compute an index of all frames for each sentence,
  # with unique identifiers

  filenames.each { |filename|

    infile = FilePartsParser.new(filename)
    begin
      infile.scan_s { |sent_str|

        sent = RegXML.new(sent_str)

        # check for a missing id BEFORE converting to a string:
        # nil.to_s is "", which is never nil
        raw_sentid = sent.attributes["id"]
        if raw_sentid.nil? or raw_sentid.to_s.empty?
          STDERR.puts "[frprep] Warning: cannot find sentence id, skipping sentence:"
          STDERR.puts sent_str
          # strange sentence, no ID? skip
          next
        end
        sentid = raw_sentid.to_s

        unless frame_hash.key? sentid
          frame_hash[sentid] = Array.new
          uspfes_hash[sentid] = Array.new
          uspframes_hash[sentid] = Array.new
        end

        # find everything up to and including the graph
        sent_children = sent.children_and_text()
        graph = sent_children.detect { |child| child.name == "graph" }
        graph_hash[sentid] = "<s " +
          sent.attributes.to_a.map { |at, val| "#{at}='#{val}'" }.join(" ") +
          ">" +
          graph.to_s

        # find the usp block
        sem = sent_children.detect { |child| child.name == "sem" }
        usp = ""
        if sem
          usp = sem.children_and_text.detect { |child| child.name == "usp" }
          usp = usp.to_s
        end

        # Substitute old frame identifiers with new, unique ones.
        #
        # Problem: we may have several frames per sentence, and need to keep
        # track of them. If we rename e.g. sxx_f1 to sxx_f2 and there is
        # already a sxx_f2, then we cannot distinguish between these frames.
        # Therefore, we first substitute temporary identifiers for all ids,
        # and re-substitute final ones at the end.
        this_frames = Array.new
        temp_subs = Array.new  # pairs [original id, temporary id]
        final_subs = Array.new # pairs [temporary id, final id]

        # find all frames
        if sem
          frames = sem.children_and_text.detect { |child| child.name == "frames" }
          if frames
            frames.children_and_text.each { |frame|
              next unless frame.name == "frame"

              frameid = frame.attributes["id"]

              # running frame number over all occurrences of this sentence
              frame_no = frame_hash[sentid].length + this_frames.length + 1
              temp_frameid = "#{sentid}_temp_f#{frame_no}"
              final_frameid = "#{sentid}_f#{frame_no}"

              temp_subs << [frameid, temp_frameid]
              final_subs << [temp_frameid, final_frameid]

              this_frames << frame.to_s
            }
          end
        end

        # first rename all the frames to temporary names,
        # then re-rename the temporary names to the final ones.
        # usp is reassigned rather than changed in place: on Ruby >= 2.7
        # nil.to_s returns a frozen string, which gsub! cannot modify.
        (temp_subs + final_subs).each { |old_id, new_id|
          this_frames.map! { |frame_str| rename_id.call(frame_str, old_id, new_id) }
          usp = rename_id.call(usp, old_id, new_id)
        }

        # store frames in data structure
        this_frames.each { |frame_str|
          frame_hash[sentid] << frame_str
        }

        unless usp.empty?
          usp_elt = RegXML.new(usp)

          # store uspfes in data structure (a usp element need not contain them)
          uspfes = usp_elt.children_and_text.detect { |child| child.name == "uspfes" }
          if uspfes
            uspfes.children_and_text.each { |child|
              next unless child.name == "uspblock"
              uspfes_hash[sentid] << child.to_s
            }
          end

          # store uspframes in data structure
          uspframes = usp_elt.children_and_text.detect { |child| child.name == "uspframes" }
          if uspframes
            uspframes.children_and_text.each { |child|
              next unless child.name == "uspblock"
              uspframes_hash[sentid] << child.to_s
            }
          end
        end
      }
    ensure
      infile.close()
    end
  }

  ########################
  # now write everything in the data structure back to files

  filecounter = 0
  sentcounter = 0
  outfile = nil

  # NOTE(review): ids are compared by their to_i value; ids that do not
  # start with a digit all compare as 0, so their relative order is
  # whatever sort happens to produce -- confirm this is intended.
  graph_hash.sort { |a,b| a[0].to_i <=> b[0].to_i }.each { |sentid, graph_str|

    # current file is full: finish it
    if outfile and sentcounter == max_sentnum
      outfile.puts SalsaTigerXMLHelper.get_footer
      outfile.close
      outfile = nil
    end

    # no file open: start the next one
    unless outfile
      outfile = File.new(split_dir+filecounter.to_s+".xml","w")
      outfile.puts SalsaTigerXMLHelper.get_header
      filecounter += 1
      sentcounter = 0
    end

    xml = Array.new
    xml << graph_str
    xml << "<sem>"
    xml << "<globals>"
    xml << "</globals>"
    xml << "<frames>"
    frame_hash[sentid].each { |frame_str|
      xml << frame_str
    }
    xml << "</frames>"
    xml << "<usp>"
    xml << "<uspframes>"
    uspframes_hash[sentid].each { |uspblock_str|
      xml << uspblock_str
    }
    xml << "</uspframes>"
    xml << "<uspfes>"
    uspfes_hash[sentid].each { |uspblock_str|
      xml << uspblock_str
    }
    xml << "</uspfes>"
    xml << "</usp>"
    xml << "</sem>"
    xml << "</s>"

    outfile.puts xml.join("\n")
    sentcounter += 1
  }

  if outfile
    outfile.puts SalsaTigerXMLHelper.get_footer
    outfile.close
    outfile = nil
  end

end
574
+
575
+
576
+ ####
577
+ # transform SalsaTigerXML file to Tab format file
578
def FrprepHelper.stxml_to_tab_file(input_filename,  # string: name of input file
                                   output_filename, # string: name of output file
                                   exp)             # FrprepConfigData
  # Transform a SalsaTigerXML file to a tab format file.
  #
  # For every sentence of the input, one tab-format line (columns "word"
  # and "sent_id") is written per <t> element found inside <terminals>,
  # followed by one empty line. Sentences without a <terminals> element
  # are reported on stderr and skipped.
  #
  # Raises RuntimeError if the output file cannot be opened for writing.
  infile = FilePartsParser.new(input_filename)
  begin
    outfile = File.new(output_filename, "w")
  rescue
    raise "Stxml to tab: could not write to tab file #{output_filename}"
  end

  begin
    infile.scan_s { |sent_string|
      # determine sentence ID; invent a (time-based) one if it is missing
      sentid = RegXML.new(sent_string).attributes["id"]
      unless sentid
        $stderr.puts "No sentence ID in sentence:\n "+ sent_string
        $stderr.puts "Making a new one up."
        sentid = Time.new().to_f.to_s
      end

      # find the <terminals> element; newlines are removed first so that
      # the regular expression can span what were several lines
      terminals_match = sent_string.delete("\n").match(/<terminals[ >].+<\/terminals>/)
      unless terminals_match
        $stderr.puts "Warning: could not find terminals in sentence:"
        $stderr.puts sent_string
        $stderr.puts "Skipping"
        next
      end

      text = terminals_match[0]
      if exp.get("parser") == "berkeley"
        # Berkeley parser: replace parentheses by *LRB* / *RRB*
        # and normalize the various quote encodings to a plain double quote
        text.gsub!(/word='\('/, "word='*LRB*'")
        text.gsub!(/word='\)'/, "word='*RRB*'")
        text.gsub!(/word=['"]``['"]/, "word='\"'")
        text.gsub!(/word=['"]''['"]/, "word='\"'")
        text.gsub!(/word=['"]\&apos;\&apos;['"]/, "word='\"'")
      end

      RegXML.new(text).children_and_text.each { |terminal|
        # children that are not <t> elements are not terminals after all
        next unless terminal.name == "t"

        outfile.puts FNTabFormatFile.format_str({
          "word" => SalsaTigerXMLHelper.unescape(terminal.attributes["word"]),
          "sent_id" => sentid
        })
      } # each terminal
      outfile.puts
    } # each sentence
  ensure
    # close the output file even if parsing a sentence raised
    outfile.close
  end
  nil
end
640
+
641
+ ###
642
+ # add semantics from tab:
643
+ #
644
+ # add information about semantics from a FN tab sentence
645
+ # to a SalsaTigerSentence object:
646
+ # - frames (one frame per sentence)
647
+ # - roles
648
+ # - FrameNet grammatical functions
649
+ # - FrameNet POS of target
650
def FrprepHelper.add_semantics_from_tab(st_sent,           # SalsaTigerSentence object
                                        tab_sent,          # FNTabFormatSentence object
                                        mapping,           # hash: tab lineno -> array:SynNode
                                        interpreter_class, # SynInterpreter class
                                        exp)               # FrprepConfigData
  # Copy the semantic annotation of a tab sentence onto st_sent:
  # one frame per tab frame, its target (with lemma / POS attributes),
  # and one FE per role label, carrying "gf" and "pt" attributes.
  # Does nothing if no tab sentence is given.
  return if tab_sent.nil?

  frame_index = 0
  tab_sent.each_frame do |tab_frame_obj|
    frame_name = tab_frame_obj.get_frame() # string

    if frame_name.nil? or frame_name =~ /^-*$/
      # a frame entry that does not name a frame
      $stderr.puts "Warning: frame entry without a frame in tab sentence #{st_sent.id}."
      $stderr.puts "Skipping"
      next
    end

    frame_node = st_sent.add_frame(frame_name, tab_sent.get_sent_id() + "_f#{frame_index}")
    frame_index += 1

    # target: collect all syntax nodes that the target terminals map to
    target_nodes = []
    tab_frame_obj.get_target_indices.each do |terminal_id|
      mapped = mapping[terminal_id]
      target_nodes.concat(mapped) if mapped
    end

    # the interpreter class decides what the maximal constituents are
    target_maxnodes = interpreter_class.max_constituents(target_nodes, st_sent)
    if target_maxnodes.empty?
      STDERR.puts "Warning: no target in frame entry, sentence #{st_sent.id}."
      $stderr.puts "frame is #{frame_name}, frame no #{frame_index}"
      $stderr.puts "Skipping."
      $stderr.puts "target indices: " + tab_frame_obj.get_target_indices.join(", ")
      next
    end
    frame_node.add_fe("target", target_maxnodes)

    # features on the target: lemma and (for FrameNet data) POS
    target_lemma = tab_frame_obj.get_target()
    target_pos = nil
    if target_lemma
      # FrameNet data: the lemma in the tab file has the form <lemma>.<POS>
      if exp.get("origin") == "FrameNet" and target_lemma =~ /^(.*)\.(.*)$/
        target_lemma, target_pos = $1, $2
      end
      frame_node.target.set_attribute("lemma", target_lemma)
      frame_node.target.set_attribute("pos", target_pos) if target_pos
    end

    # roles, GF, PT:
    # hash "role" | "gf" | "pt" -> SynNode -> array: label(string)
    layer_synnode_label = {}
    ["gf", "pt", "role"].each do |layer|
      termids2labels = tab_frame_obj.markables(layer)
      labels_by_node = (layer_synnode_label[layer] ||= {})

      termids2labels.each do |terminal_indices, label|
        terminal_indices.each do |t_i|
          nodes = mapping[t_i]
          next unless nodes # t_i maps to nothing

          nodes.each { |node| (labels_by_node[node] ||= []) << label }
        end
      end
    end

    # 'stuff' (Support and other things), read line by line
    stuff_by_node = {}
    layer_synnode_label["stuff"] = stuff_by_node
    tab_frame_obj.each_line_parsed do |line_obj|
      label = line_obj.get("stuff")
      next unless label != "-"

      nodes = mapping[line_obj.get("lineno")]
      next unless nodes

      nodes.each { |node| (stuff_by_node[node] ||= []) << label }
    end

    # reencode:
    # hash role_label(string) -> array of tuples [synnode, gflabels, ptlabels]
    # gflabels, ptlabels: array:String or nil
    #
    # gf or pt labels on a SynNode that has no role label are lost here
    gf_by_node = layer_synnode_label["gf"]
    pt_by_node = layer_synnode_label["pt"]
    role2nodes_labels = {}
    layer_synnode_label["role"].each_pair do |synnode, labels|
      labels.each do |rolelabel|
        (role2nodes_labels[rolelabel] ||= []) << [synnode, gf_by_node[synnode], pt_by_node[synnode]]
      end
    end

    # reencode "stuff", but only the support cases
    support_entries = []
    role2nodes_labels["Support"] = support_entries
    stuff_by_node.each_pair do |synnode, labels|
      labels.each do |stufflabel|
        support_entries << [synnode, nil, nil] if stufflabel =~ /Supp/
      end
    end

    # one FE per role label on the current frame.
    # GF and PT labels: take every label assigned to _some_ node of the role
    role2nodes_labels.each_pair do |rolelabel, node_gf_pt|
      synnodes = node_gf_pt.map { |node, _gf, _pt| node }
      gflabels = node_gf_pt.map { |_node, gf, _pt| gf }.compact.flatten.uniq
      ptlabels = node_gf_pt.map { |_node, _gf, pt| pt }.compact.flatten.uniq

      # the interpreter class decides what the maximal constituents are
      maxnodes = interpreter_class.max_constituents(synnodes, st_sent)

      fe_node = st_sent.add_fe(frame_node, rolelabel, maxnodes)
      fe_node.set_attribute("gf", gflabels.join(",")) unless gflabels.empty?
      fe_node.set_attribute("pt", ptlabels.join(",")) unless ptlabels.empty?
    end # each role label
  end # each frame
end
824
+
825
+
826
+ ######
827
+ # handle multiword targets:
828
+ # if you find a verb with a separate prefix,
829
+ # change the verb's lemma information accordingly
830
+ # and add an attribute "other_words" to the verb node
831
+ # pointing to the other node
832
+ #
833
+ # In general, it will be assumed that "other_words" contains
834
+ # a list of node IDs for other nodes belonging to the same
835
+ # group, node IDs separated by spaces, and that
836
+ # each node of a group has the "other_words" attribute.
837
+ #
838
def FrprepHelper.handle_multiword_targets(sent,        # SalsaTigerSentence object
                                          interpreter, # SynInterpreter object
                                          language)    # string: en, de
  # Link verbs with their separate particles via the "other_words"
  # attribute (set on both nodes) and fold the particle into the verb lemma.
  return if sent.nil?

  # only content words and prepositions are candidates for grouping
  relevant_categories = ["adj", "adv", "card", "noun", "part", "prep", "verb"]
  nodes = sent.terminals.select do |node|
    relevant_categories.include? interpreter.category(node)
  end

  # group verbs with their separate particles
  # (other kinds of grouping could be added in group_words later)
  groups = FrprepHelper.group_words(nodes, interpreter)

  # record the grouping as attributes on the terminals
  groups.each do |descr, group_of_nodes|
    case descr
    when "none"
      # ungrouped word: nothing to record
    when "part"
      # group_of_nodes is a pair [verb, particle]
      verb, particle = group_of_nodes

      verb.set_attribute("other_words", particle.id())
      particle.set_attribute("other_words", verb.id())

      verb_lemma = verb.get_attribute("lemma")
      particle_lemma = verb_lemma && particle.get_attribute("lemma")
      if verb_lemma and particle_lemma
        combined_lemma =
          case language
          when "de"
            # German: prepend the particle to get the real lemma of the verb
            particle_lemma + verb_lemma
          else
            # English and default: particle follows the verb lemma as a separate word
            verb_lemma + " " + particle_lemma
          end
        verb.set_attribute("lemma", combined_lemma)
      end
    else
      raise "Shouldn't be here: unexpected description #{descr}"
    end
  end
end
900
+
901
+ ########################
902
+ # group_words
903
+ #
904
+ # auxiliary of handle_multiword_targets
905
+ #
906
+ # Group terminals:
907
+ # At the moment, just find separate prefixes and particles
908
+ # for verbs
909
+ #
910
+ # returns: list of pairs [descr, nodes]
911
+ # descr: string, "none" (no group), "part" (separate verb particle)
912
+ # nodes: array:SynNode
913
def FrprepHelper.group_words(nodes,       # array: SynNode
                             interpreter) # SynInterpreter object
  # Walk over the nodes in order and pair each verb with its separate
  # particle, as reported by interpreter.particle_of_verb.
  #
  # returns: list of pairs [descr, nodes]
  #   descr: "part" for a [verb, particle] pair, "none" for a single node
  groups = []  # result: array of [descr, array:SynNode]
  covered = [] # nodes that already belong to some group

  nodes.each do |terminal_node|
    # skip nodes that were already consumed as somebody's particle
    next if covered.include? terminal_node

    svp = interpreter.particle_of_verb(terminal_node, nodes)
    if svp
      groups << ["part", [terminal_node, svp]]
      covered.push(terminal_node, svp)
    else
      groups << ["none", [terminal_node]]
      covered.push(terminal_node)
    end
  end

  groups
end
938
+
939
+
940
+ ######
941
+ # handle unknown framenames
942
+ #
943
+ # For all frames with names matching Unknown\d+,
944
+ # rename them to <lemma>_Unknown\d+
945
def FrprepHelper.handle_unknown_framenames(sent,        # SalsaTigerSentence
                                           interpreter) # SynInterpreter class
  # Rename every frame whose name starts with "Unknown" to
  # <lemma>_<old name>, where <lemma> comes from interpreter.lemma_backoff
  # applied to the main node of the frame target.
  #
  # A frame whose target or lemma cannot be determined is reported on
  # stderr and keeps its name; the remaining frames are still processed.
  return if sent.nil?

  sent.each_frame { |frame|
    next unless frame.name() =~ /^Unknown/

    maintarget =
      if frame.target
        interpreter.main_node_of_expr(frame.target.children(), "no_mwe")
      end

    unless maintarget
      $stderr.puts "Warning: Unknown frame, and I could not determine the target lemma: Frame #{frame.id()}"
      $stderr.puts "Cannot repair frame name, leaving it as is."
      # only this frame is given up on: continue with the next one
      # (a 'return' here would silently skip all remaining frames)
      next
    end

    # lemma_backoff is expected to fall back to the word if there is no
    # lemma, and to pick the first disjunct of a lemma disjunction
    # NOTE(review): per the original comments -- confirm against SynInterpreter
    lemma = interpreter.lemma_backoff(maintarget)
    if lemma
      frame.set_name(lemma + "_" + frame.name())
    else
      # neither a lemma nor the target word could be determined
      $stderr.puts "Warning: Salsa 'Unknown' frame."
      $stderr.puts "Trying to make its lemma-specificity explicit, but"
      $stderr.puts "I could not determine the target lemma nor the target word: frame #{frame.id()}"
      $stderr.puts "Leaving 'Unknown' as it is."
    end
  }
end
982
+
983
+
984
+ #####################
985
+ #
986
+ # Integrate the semantic annotation of an old sentence
987
+ # into the corresponding new sentence
988
+ # At the same time, integrate the lemma information from the
989
+ # old sentence into the new sentence
990
def FrprepHelper.integrate_stxml_semantics_and_lemmas(oldsent,
                                                      newsent,
                                                      interpreter_class,
                                                      exp)
  # Copy lemmas, frames, FEs and underspecification blocks from oldsent
  # to newsent, matching the two sentences terminal by terminal.
  # Returns false (after a warning) if the terminals do not line up;
  # returns nil without doing anything if either sentence is nil.
  return if oldsent.nil? or newsent.nil?

  # match old and new sentence via their terminals
  newterminals = newsent.terminals_sorted()
  oldterminals = oldsent.terminals_sorted()

  # sanity check: do the words match pairwise?
  # both the raw and the XML-escaped form of each word are accepted
  newterminals.interleave(oldterminals).each do |newnode, oldnode|
    newwords = newnode ? [newnode.word, SalsaTigerXMLHelper.escape(newnode.word)] : [nil, nil]
    oldwords = oldnode ? [oldnode.word, SalsaTigerXMLHelper.escape(oldnode.word)] : [nil, nil]

    next unless (newwords & oldwords).empty?

    # no form of the old word equals any form of the new word
    $stderr.puts "Warning: could not match terminals of sentence #{newsent.id()}"
    $stderr.puts "This means that I cannot match the semantic annotation"
    $stderr.puts "to the newly parsed sentence. Skipping."
    return false
  end

  # copy lemma information, position by position
  oldterminals.each_with_index do |oldnode, ix|
    newnode = newterminals[ix]
    old_lemma = oldnode.get_attribute("lemma")
    newnode.set_attribute("lemma", old_lemma) if old_lemma
  end

  # copy frames, keeping their IDs
  oldsent.each_frame do |oldframe|
    newframe = newsent.add_frame(oldframe.name, oldframe.id())

    # copy FEs
    oldframe.each_child do |oldfe|
      # translate the old FE's terminals into the new sentence ...
      old_fe_terminals = oldfe.descendants.select { |n| n.is_terminal? }
      positions = old_fe_terminals.map { |n| oldterminals.index(n) }
      new_fe_terminals = positions.map { |ix| newterminals[ix] }

      # ... and let the interpreter class find the maximal constituents covering them
      newnodes = interpreter_class.max_constituents(new_fe_terminals, newsent)

      # new FE with the same ID and all attributes of the old one
      new_fe = newsent.add_fe(newframe, oldfe.name(), newnodes, oldfe.id())
      old_attributes = oldfe.get_f("attributes")
      if old_attributes
        old_attributes.each_pair { |attr, value| new_fe.set_attribute(attr, value) }
      end
    end
  end

  # copy underspecification as is: all frame and FE IDs were kept,
  # so members can be looked up by ID in the new sentence
  oldsent.each_usp_frameblock do |olduspframe|
    newuspframe = newsent.add_usp("frame")
    olduspframe.each_child do |oldnode|
      newnode = newsent.sem_node_with_id(oldnode.id())
      if newnode
        newuspframe.add_child(newnode)
      else
        $stderr.puts "Error: unknown frame with ID #{oldnode.id()}"
      end
    end
  end

  oldsent.each_usp_feblock do |olduspfe|
    newuspfe = newsent.add_usp("fe")
    olduspfe.each_child do |oldnode|
      newnode = newsent.sem_node_with_id(oldnode.id())
      if newnode
        newuspfe.add_child(newnode)
      else
        $stderr.puts "Error: unknown FE with ID #{oldnode.id()}"
      end
    end
  end
end
1099
+
1100
+ ####################
1101
+ # add a "head" attribute to each nonterminal
1102
+ # of a SalsaTigerSentence object
1103
+
1104
def FrprepHelper.add_head_attributes(st_sent,     # SalsaTigerSentence object
                                     interpreter) # SynInterpreter class
  # Set the "head" attribute of every nonterminal to the word of its head
  # terminal (as determined by the interpreter), or to "--" if there is none.
  st_sent.each_nonterminal do |nt_node|
    head_term = interpreter.head_terminal(nt_node)
    head_word = head_term && head_term.word()
    nt_node.set_attribute("head", head_word || "--")
  end
end
1115
+
1116
+ # add lemma information to each terminal in a given SalsaTigerSentence object
1117
def FrprepHelper.add_lemmas_from_tab(st_sent,  # SalsaTigerSentence object
                                     tab_sent, # FNTabFormatSentence object
                                     mapping)  # hash: tab lineno -> array:SynNode
  # Add lemma information from a tab sentence to each terminal of st_sent.
  #
  # For each terminal, the first tab line whose mapping contains the
  # terminal supplies word and lemma. If the (escaped) tab word differs
  # from the terminal's word, the terminal's "word" attribute is overwritten
  # with the tab version and the mismatch is reported on stderr.
  # Does nothing if no tab sentence is given.
  return if tab_sent.nil?

  # list of [word, lemma] pairs, one per tab line
  lemmat = []
  tab_sent.each_line_parsed { |line|
    lemmat << [line.get("word"), line.get("lemma")]
  }

  # KE Jan 07: on word mismatch, use the Lemmatizer file version,
  # but remember the mismatch
  word_mismatches = []

  st_sent.each_terminal_sorted { |t|
    matching_lineno = (0...lemmat.length).detect { |tab_lineno|
      # a tab line may have no entry in the mapping at all
      nodes = mapping[tab_lineno]
      nodes and nodes.include?(t)
    }
    next unless matching_lineno

    word, lemma = lemmat[matching_lineno]

    # transform characters to XML-friendly form
    # for comparison with st_word, which is also escaped
    word = SalsaTigerXMLHelper.escape(word)
    st_word = t.word()
    if word != st_word and
       word != SalsaTigerXMLHelper.escape(st_word)
      # true mismatch: use the Lemmatizer version of the word
      word_mismatches << [st_word, word]
      t.set_attribute("word", word)
    end

    if lemma
      # we actually do have lemma information
      t.set_attribute("lemma", SalsaTigerXMLHelper.escape(lemma))
    end
  } # each terminal

  # report mismatches, if any
  unless word_mismatches.empty?
    $stderr.puts "Warning: Word mismatches found between Lemmatizer file and SalsaTigerXML file generalted from parser output."
    $stderr.puts "(May be due to failed reencoding of special character in the parser output.)"
    $stderr.puts "I am using the Lemmatizer version by default."
    $stderr.puts "Version used:"
    $stderr.print "\t"
    st_sent.each_terminal_sorted { |t| $stderr.print ">>#{t}<<" }
    $stderr.puts
    $stderr.print "SalsaTigerXML file had: "
    $stderr.print word_mismatches.map { |st_word, tab_word|
      "#{st_word} instead of #{tab_word}"
    }.join(", ")
    $stderr.puts
  end
end
1183
+
1184
+ ###################3
1185
+ # given a SalsaTigerSentence,
1186
+ # look for FrameNet frames that are
1187
+ # test frames, and remove them
1188
def FrprepHelper.remove_deprecated_frames(sent, # SalsaTigerSentence
                                          exp)  # FrprepConfigData
  # For FrameNet data (exp "origin" == "FrameNet"), remove all frames
  # named "Boulder" or whose name starts with "Test" from the sentence.
  # Does nothing for other origins or if no sentence is given.
  return if sent.nil?
  return unless exp.get("origin") == "FrameNet"

  # collect first, remove afterwards: removing while iterating could skip
  # frames if sent.frames hands out the sentence's own frame list
  deprecated = []
  sent.frames.each { |frame_obj|
    if frame_obj.name() == "Boulder" or
       frame_obj.name() =~ /^Test/
      deprecated << frame_obj
    end
  }

  deprecated.each { |frame_obj| sent.remove_frame(frame_obj) }
end
1202
+
1203
+ end
1204
+
1205
+ ############################################3
1206
+ # Class FrprepFlatSyntax:
1207
+ #
1208
+ # given a FNTabFormat file,
1209
+ # yield each of its sentences in SalsaTigerXML,
1210
+ # constructing a flat syntax
1211
class FrprepFlatSyntax
  # tabfilename:   string: name of tab file
  # postag_suffix: postag file suffix (or nil)
  # lemma_suffix:  lemmatisation file suffix (or nil)
  def initialize(tabfilename, postag_suffix, lemma_suffix)
    @tabfilename = tabfilename
    @pos_suffix = postag_suffix
    @lemma_suffix = lemma_suffix
  end

  # For each sentence of the tab file, build a flat SalsaTigerSentence
  # (marked failed="true", a single "S" nonterminal dominating one terminal
  # per tab line) and yield the array
  #   [ SalsaTigerSentence, tab format sentence, mapping ]
  # where mapping is what SynInterfaceSTXML.standard_mapping returns for
  # the pair. The argument is not used.
  def each_sentence(dummy)
    # tab file with lemma and POS info
    tabfile = FNTabFormatFile.new(@tabfilename, @pos_suffix, @lemma_suffix)

    tabfile.each_sentence() do |tabsent|
      # sentence ID: taken from the tab sentence, made up if missing
      sentid = tabsent.get_sent_id()
      if sentid.nil? or sentid =~ /^-*$/
        $stderr.puts "No sentence ID for sentence:"
        tabsent.each_line_parsed { |l| $stderr.print l.get("word"), " "}
        $stderr.puts
        sentid = Time.new().to_f.to_s
      end

      # new, empty sentence with "failed" attribute (i.e. no parse)
      escaped_sentid = SalsaTigerXMLHelper.escape(sentid)
      sent = SalsaTigerSentence.new("<s id=\"#{escaped_sentid}\" failed=\"true\"></s>")

      # the single nonterminal node, category "S", no word, no POS
      vroot = sent.add_syn("nt", "S", nil, nil,
                           SalsaTigerXMLHelper.escape(sentid.to_s + "_NT"))

      # one terminal per tab line, attached directly below the nonterminal
      tabsent.each_line_parsed() do |line_obj|
        node_id = sentid.to_s + "_" + line_obj.get("lineno").to_s
        word = SalsaTigerXMLHelper.escape(line_obj.get("word") || "")
        pos = SalsaTigerXMLHelper.escape(line_obj.get("pos") || "")
        terminal = sent.add_syn("t", nil, word, pos, node_id)

        lemma = line_obj.get("lemma")
        terminal.set_attribute("lemma", SalsaTigerXMLHelper.escape(lemma)) if lemma

        vroot.add_child(terminal, nil)
        terminal.add_parent(vroot, nil)
      end

      # hand out the new sentence together with its tab sentence
      yield [sent, tabsent, SynInterfaceSTXML.standard_mapping(sent, tabsent)]
    end
  end
end
1283
+
1284
+ ############################################3
1285
+ # Class FrprepReadStxml
1286
+ #
1287
+ # given a STXML file,
1288
+ # yield each of its sentences
1289
class FrprepReadStxml
  # stxmlfilename: string: name of SalsaTigerXML file
  # tabfilename:   string: name of corresponding tab file (or nil)
  # postag_suffix: POS tag file suffix (or nil)
  # lemma_suffix:  lemmatization file suffix (or nil)
  def initialize(stxmlfilename, tabfilename, postag_suffix, lemma_suffix)
    @stxmlfilename = stxmlfilename
    @tabfilename = tabfilename
    @pos_suffix = postag_suffix
    @lemma_suffix = lemma_suffix
  end

  # Yield each sentence of the SalsaTigerXML file as the array
  #   [ SalsaTigerSentence, tab format sentence, mapping ]
  # The tab sentence is the one at the same position in the tab file,
  # or nil if there is no tab file or it has fewer sentences; mapping is
  # what SynInterfaceSTXML.standard_mapping returns for the pair.
  # The argument is not used.
  def each_sentence(dummy)
    # read the corresponding tab file, if one was given and exists.
    # (File.exist? rather than File.exists?, which no longer exists in
    # current Ruby; the nil check covers the documented "or nil" case)
    tab_sents = []
    if @tabfilename and File.exist?(@tabfilename)
      tabfile = FNTabFormatFile.new(@tabfilename, @pos_suffix, @lemma_suffix)
      tabfile.each_sentence { |tabsent| tab_sents << tabsent }
    end

    # read STXML file, pairing sentences with tab sentences by position
    infile = FilePartsParser.new(@stxmlfilename)
    index = 0
    infile.scan_s { |sent_string|
      sent = SalsaTigerSentence.new(sent_string)
      tabsent = tab_sents.at(index)
      yield [sent, tabsent, SynInterfaceSTXML.standard_mapping(sent, tabsent)]
      index += 1
    }
  end
end