frprep 0.0.1.prealpha

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (138) hide show
  1. data/.yardopts +8 -0
  2. data/CHANGELOG.rdoc +0 -0
  3. data/LICENSE.rdoc +0 -0
  4. data/README.rdoc +0 -0
  5. data/lib/common/AbstractSynInterface.rb +1227 -0
  6. data/lib/common/BerkeleyInterface.rb +375 -0
  7. data/lib/common/CollinsInterface.rb +1165 -0
  8. data/lib/common/ConfigData.rb +694 -0
  9. data/lib/common/Counter.rb +18 -0
  10. data/lib/common/DBInterface.rb +48 -0
  11. data/lib/common/EnduserMode.rb +27 -0
  12. data/lib/common/Eval.rb +480 -0
  13. data/lib/common/FixSynSemMapping.rb +196 -0
  14. data/lib/common/FrPrepConfigData.rb +66 -0
  15. data/lib/common/FrprepHelper.rb +1324 -0
  16. data/lib/common/Graph.rb +345 -0
  17. data/lib/common/ISO-8859-1.rb +24 -0
  18. data/lib/common/ML.rb +186 -0
  19. data/lib/common/Maxent.rb +215 -0
  20. data/lib/common/MiniparInterface.rb +1388 -0
  21. data/lib/common/Optimise.rb +195 -0
  22. data/lib/common/Parser.rb +213 -0
  23. data/lib/common/RegXML.rb +269 -0
  24. data/lib/common/RosyConventions.rb +171 -0
  25. data/lib/common/SQLQuery.rb +243 -0
  26. data/lib/common/STXmlTerminalOrder.rb +194 -0
  27. data/lib/common/SalsaTigerRegXML.rb +2347 -0
  28. data/lib/common/SalsaTigerXMLHelper.rb +99 -0
  29. data/lib/common/SleepyInterface.rb +384 -0
  30. data/lib/common/SynInterfaces.rb +275 -0
  31. data/lib/common/TabFormat.rb +720 -0
  32. data/lib/common/Tiger.rb +1448 -0
  33. data/lib/common/TntInterface.rb +44 -0
  34. data/lib/common/Tree.rb +61 -0
  35. data/lib/common/TreetaggerInterface.rb +303 -0
  36. data/lib/common/headz.rb +338 -0
  37. data/lib/common/option_parser.rb +13 -0
  38. data/lib/common/ruby_class_extensions.rb +310 -0
  39. data/lib/fred/Baseline.rb +150 -0
  40. data/lib/fred/FileZipped.rb +31 -0
  41. data/lib/fred/FredBOWContext.rb +863 -0
  42. data/lib/fred/FredConfigData.rb +182 -0
  43. data/lib/fred/FredConventions.rb +232 -0
  44. data/lib/fred/FredDetermineTargets.rb +324 -0
  45. data/lib/fred/FredEval.rb +312 -0
  46. data/lib/fred/FredFeatureExtractors.rb +321 -0
  47. data/lib/fred/FredFeatures.rb +1061 -0
  48. data/lib/fred/FredFeaturize.rb +596 -0
  49. data/lib/fred/FredNumTrainingSenses.rb +27 -0
  50. data/lib/fred/FredParameters.rb +402 -0
  51. data/lib/fred/FredSplit.rb +84 -0
  52. data/lib/fred/FredSplitPkg.rb +180 -0
  53. data/lib/fred/FredTest.rb +607 -0
  54. data/lib/fred/FredTrain.rb +144 -0
  55. data/lib/fred/PlotAndREval.rb +480 -0
  56. data/lib/fred/fred.rb +45 -0
  57. data/lib/fred/md5.rb +23 -0
  58. data/lib/fred/opt_parser.rb +250 -0
  59. data/lib/frprep/AbstractSynInterface.rb +1227 -0
  60. data/lib/frprep/Ampersand.rb +37 -0
  61. data/lib/frprep/BerkeleyInterface.rb +375 -0
  62. data/lib/frprep/CollinsInterface.rb +1165 -0
  63. data/lib/frprep/ConfigData.rb +694 -0
  64. data/lib/frprep/Counter.rb +18 -0
  65. data/lib/frprep/FNCorpusXML.rb +643 -0
  66. data/lib/frprep/FNDatabase.rb +144 -0
  67. data/lib/frprep/FixSynSemMapping.rb +196 -0
  68. data/lib/frprep/FrPrepConfigData.rb +66 -0
  69. data/lib/frprep/FrameXML.rb +513 -0
  70. data/lib/frprep/FrprepHelper.rb +1324 -0
  71. data/lib/frprep/Graph.rb +345 -0
  72. data/lib/frprep/ISO-8859-1.rb +24 -0
  73. data/lib/frprep/MiniparInterface.rb +1388 -0
  74. data/lib/frprep/Parser.rb +213 -0
  75. data/lib/frprep/RegXML.rb +269 -0
  76. data/lib/frprep/STXmlTerminalOrder.rb +194 -0
  77. data/lib/frprep/SalsaTigerRegXML.rb +2347 -0
  78. data/lib/frprep/SalsaTigerXMLHelper.rb +99 -0
  79. data/lib/frprep/SleepyInterface.rb +384 -0
  80. data/lib/frprep/SynInterfaces.rb +275 -0
  81. data/lib/frprep/TabFormat.rb +720 -0
  82. data/lib/frprep/Tiger.rb +1448 -0
  83. data/lib/frprep/TntInterface.rb +44 -0
  84. data/lib/frprep/Tree.rb +61 -0
  85. data/lib/frprep/TreetaggerInterface.rb +303 -0
  86. data/lib/frprep/do_parses.rb +142 -0
  87. data/lib/frprep/frprep.rb +686 -0
  88. data/lib/frprep/headz.rb +338 -0
  89. data/lib/frprep/one_parsed_file.rb +28 -0
  90. data/lib/frprep/opt_parser.rb +94 -0
  91. data/lib/frprep/ruby_class_extensions.rb +310 -0
  92. data/lib/rosy/AbstractFeatureAndExternal.rb +240 -0
  93. data/lib/rosy/DBMySQL.rb +146 -0
  94. data/lib/rosy/DBSQLite.rb +280 -0
  95. data/lib/rosy/DBTable.rb +239 -0
  96. data/lib/rosy/DBWrapper.rb +176 -0
  97. data/lib/rosy/ExternalConfigData.rb +58 -0
  98. data/lib/rosy/FailedParses.rb +130 -0
  99. data/lib/rosy/FeatureInfo.rb +242 -0
  100. data/lib/rosy/GfInduce.rb +1115 -0
  101. data/lib/rosy/GfInduceFeature.rb +148 -0
  102. data/lib/rosy/InputData.rb +294 -0
  103. data/lib/rosy/RosyConfigData.rb +115 -0
  104. data/lib/rosy/RosyConfusability.rb +338 -0
  105. data/lib/rosy/RosyEval.rb +465 -0
  106. data/lib/rosy/RosyFeatureExtractors.rb +1609 -0
  107. data/lib/rosy/RosyFeaturize.rb +280 -0
  108. data/lib/rosy/RosyInspect.rb +336 -0
  109. data/lib/rosy/RosyIterator.rb +477 -0
  110. data/lib/rosy/RosyPhase2FeatureExtractors.rb +230 -0
  111. data/lib/rosy/RosyPruning.rb +165 -0
  112. data/lib/rosy/RosyServices.rb +744 -0
  113. data/lib/rosy/RosySplit.rb +232 -0
  114. data/lib/rosy/RosyTask.rb +19 -0
  115. data/lib/rosy/RosyTest.rb +826 -0
  116. data/lib/rosy/RosyTrain.rb +232 -0
  117. data/lib/rosy/RosyTrainingTestTable.rb +786 -0
  118. data/lib/rosy/TargetsMostFrequentFrame.rb +60 -0
  119. data/lib/rosy/View.rb +418 -0
  120. data/lib/rosy/opt_parser.rb +379 -0
  121. data/lib/rosy/rosy.rb +77 -0
  122. data/lib/shalmaneser/version.rb +3 -0
  123. data/test/frprep/test_opt_parser.rb +94 -0
  124. data/test/functional/functional_test_helper.rb +40 -0
  125. data/test/functional/sample_experiment_files/fred_test.salsa.erb +122 -0
  126. data/test/functional/sample_experiment_files/fred_train.salsa.erb +135 -0
  127. data/test/functional/sample_experiment_files/prp_test.salsa.erb +138 -0
  128. data/test/functional/sample_experiment_files/prp_test.salsa.fred.standalone.erb +120 -0
  129. data/test/functional/sample_experiment_files/prp_test.salsa.rosy.standalone.erb +120 -0
  130. data/test/functional/sample_experiment_files/prp_train.salsa.erb +138 -0
  131. data/test/functional/sample_experiment_files/prp_train.salsa.fred.standalone.erb +138 -0
  132. data/test/functional/sample_experiment_files/prp_train.salsa.rosy.standalone.erb +138 -0
  133. data/test/functional/sample_experiment_files/rosy_test.salsa.erb +257 -0
  134. data/test/functional/sample_experiment_files/rosy_train.salsa.erb +259 -0
  135. data/test/functional/test_fred.rb +47 -0
  136. data/test/functional/test_frprep.rb +52 -0
  137. data/test/functional/test_rosy.rb +20 -0
  138. metadata +270 -0
@@ -0,0 +1,196 @@
1
+ ###
2
+ # FixSynSemMapping:
3
+ # Given a SalsaTigerRegXML sentence with semantic role annotation,
4
+ # simplify the mapping of semantic roles to syntactic constituents
5
+ #
6
+ # The following is lifted from the LREC06 paper on Shalmaneser:
7
+ # During preprocessing, the span of semantic roles in the training corpora is
8
+ # projected onto the output of the syntactic parser by assigning each
9
+ # role to the set of maximal constituents covering its word span.
10
+ # If the word span of a role does not coincide
11
+ # with parse tree constituents, e.g. due to misparses,
12
+ # the role is ``spread out'' across several constituents. This leads to
13
+ # idiosyncratic paths between predicate and semantic role in the parse
14
+ # tree.
15
+ #
16
+ # [The following span standardization algorithm is used to make the
17
+ # syntax-semantics mapping more uniform:]
18
+ # Given a role r that has been assigned, let N be the set of
19
+ # terminal nodes of the syntactic structure that are covered by r.
20
+ #
21
+ # Iteratively compute the maximal projection of N in the syntactic
22
+ # structure:
23
+ # 1) If n is a node such that all of n's children are in N,
24
+ # then remove n's children from N and add n instead.
25
+ # 2) If n is a node with 3 or more children, and all of n's
26
+ # children except one are in N, then remove n's children from N
27
+ # and add n instead.
28
+ # 3) If n is an NP with 2 children, and one of them, another NP,
29
+ # is in N, and the other, a relative clause, is not, then remove
30
+ # n's children from N and add n instead.
31
+ #
32
+ # If none of the rules is applicable to N anymore, assign r to the
33
+ # nodes in N.
34
+ #
35
+ # Rule 1 implements normal maximal projection. Rule 2 ``repairs'' parser
36
+ # errors where all children of a node but one have been assigned the
37
+ # same role. Rule 3 addresses a problem of the FrameNet data, where
38
+ # relative clauses have been omitted from roles assigned to NPs.
39
+
40
+ # KE Feb 08: rule 3 currently out of commission!
41
+
42
+ require "common/SalsaTigerRegXML"
43
+
module FixSynSemMapping
  ##
  # Simplify the mapping of frame elements (FEs) to syntactic nodes for
  # every frame of one sentence. The FE objects are modified in place.
  #
  # Relevant settings in the experiment file:
  #
  # fe_syn_repair:
  #   If there is a node that would be a max. constituent for the
  #   words covered by the given FE, except that it has one child
  #   whose words are not in the FE, use the node as max constituent anyway.
  #   This is to repair cases where the parser has made an attachment choice
  #   that differs from the one in the gold annotation.
  #
  # fe_rel_repair:
  #   If the last syntactic node of an FE is a relative pronoun
  #   (simplified part of speech WDT, WP, WP$ or WRB), make the FE point
  #   to that pronoun only.
  #
  # Parameters:
  #   sent              - SalsaTigerSentence object (nil is accepted: no-op)
  #   exp               - experiment file object, queried via exp.get(name)
  #   interpreter_class - SynInterpreter class; must offer simplified_pt(node)
  #                       and max_constituents(nodes, sent, flag)
  #
  # Returns nil if nothing is to be done, otherwise whatever
  # sent.each_frame returns.
  def FixSynSemMapping.fixit(sent,              # SalsaTigerSentence object
                             exp,               # experiment file object
                             interpreter_class) # SynInterpreter class

    syn_repair = exp.get("fe_syn_repair")
    rel_repair = exp.get("fe_rel_repair")

    return unless syn_repair || rel_repair
    return if sent.nil?

    # The alternation is grouped so that the anchor applies to every
    # alternative; without the group only "WDT" was anchored.
    rel_pronoun_pt = /^(?:WDT|WP\$?|WRB)/

    # "repair" FEs:
    sent.each_frame { |frame|
      frame.each_child { |fe_or_target|

        # repair only if the FE currently points to more than one syn node
        next if fe_or_target.children.length < 2

        if rel_repair
          lastfe = fe_or_target.children.last
          if lastfe && interpreter_class.simplified_pt(lastfe) =~ rel_pronoun_pt
            # Detach all syn nodes. Iterate over a snapshot: if children()
            # hands out the live list, removing while iterating over it
            # would skip every other node.
            fe_or_target.children.dup.each { |child|
              fe_or_target.remove_child(child)
            }

            # point only to the former last node, the relative pronoun
            fe_or_target.add_child(lastfe)
          end
        end

        if syn_repair
          # remember the current syn nodes (snapshot, see above), detach them
          old_fe_syn = fe_or_target.children.dup
          old_fe_syn.each { |child|
            fe_or_target.remove_child(child)
          }

          # recompute the maximal constituents over the covered nodes
          covered = old_fe_syn.map { |t| t.yield_nodes }.flatten.uniq
          new_fe_syn = interpreter_class.max_constituents(covered, sent, syn_repair)

          # make the FE point to the new nodes
          new_fe_syn.each { |syn_node|
            fe_or_target.add_child(syn_node)
          }
        end
      } # each FE
    } # each frame
  end # def fixit
end # module
124
+
125
+
126
+ #########3
127
+ # old code
128
+
129
+ # if exp.get("fe_rel_repair")
130
+ # # repair relative clauses:
131
+ # # then make a procedure to pass on to max constituents
132
+ # # that will recognize the relevant cases
133
+
134
+ # accept_anyway_proc = Proc.new { |node, children_in, children_out|
135
+
136
+ # # node: SynNode
137
+ # # children_in, children_out: array:SynNode. children_in are the children
138
+ # # that are already covered by the FE, children_out the ones that aren't
139
+
140
+ # # if node is an NP,
141
+ # # and only one of its children is out,
142
+ # # and one node in children_in is an NP, and the missing child is an SBAR
143
+ # # with a child that is a relative pronoun, then consider the child in children_out as covered
144
+ # if interpreter_class.category(node) == "noun" and
145
+ # children_out.length() == 1 and
146
+ # children_in.select { |n| interpreter_class.category(n) == "noun" } and
147
+ # interpreter_class.category(children_out.first) == "sent" and
148
+ # (ch = children_out.first.children) and
149
+ # ch.select { |n| interpreter_class.relative_pronoun?(n) }
150
+ # true
151
+ # else
152
+ # false
153
+ # end
154
+ # }
155
+
156
+ # else
157
+ # accept_anyway_proc = nil
158
+ # end
159
+
160
+
161
+ # # "repair" FEs:
162
+ # sent.each_frame { |frame|
163
+
164
+ # frame.each_child { |fe_or_target|
165
+
166
+ # # repair only if the FE currently
167
+ # # points to more than one syn node, or
168
+ # # if it is a noun with a non-covered sentence sister
169
+ # if fe_or_target.children.length() > 1 or
170
+ # (exp.get("fe_rel_repair") and (curr_marked = fe_or_target.children.first()) and
171
+ # interpreter_class.category(curr_marked) == "noun" and
172
+ # (p = curr_marked.parent) and
173
+ # p.children.select { |n| n != curr_marked and interpreter_class.category(n) == "sent" } )
174
+
175
+ # # remember nodes covered by the FE
176
+ # old_fe_syn = fe_or_target.children()
177
+
178
+ # # remove syn nodes that the FE points to
179
+ # old_fe_syn.each { |child|
180
+ # fe_or_target.remove_child(child)
181
+ # }
182
+
183
+ # # and recompute
184
+ # new_fe_syn = interpreter_class.max_constituents(old_fe_syn.map { |t| t.yield_nodes}.flatten.uniq,
185
+ # sent,
186
+ # exp.get("fe_syn_repair"),
187
+ # accept_anyway_proc)
188
+
189
+ # # make the FE point to the new nodes
190
+ # new_fe_syn.each { |syn_node|
191
+ # fe_or_target.add_child(syn_node)
192
+ # }
193
+
194
+ # end # if FE points to more than one syn node
195
+ # } # each FE
196
+ # } # each frame
@@ -0,0 +1,66 @@
1
+ # FrPrepConfigData
2
+ # Katrin Erk July 05
3
+ #
4
+ # Preprocessing for Fred and Rosy:
5
+ # access to a configuration and experiment description file
6
+
7
+ require "common/ConfigData"
8
+
9
+ ##############################
10
+ # Class FrPrepConfigData
11
+ #
12
+ # inherits from ConfigData,
13
+ # sets variable names appropriate to preprocessing task
14
+
15
+ class FrPrepConfigData < ConfigData
16
+ def initialize(filename)
17
+
18
+ # initialize config data object
19
+ super(filename, # config file
20
+ { "prep_experiment_ID" => "string", # experiment identifier
21
+
22
+ "frprep_directory" => "string", # dir for frprep internal data
23
+
24
+ # information about the dataset
25
+ "language" => "string", # en, de
26
+ "origin"=> "string", # FrameNet, Salsa, or nothing
27
+ "format" => "string", # Plain, SalsaTab, FNXml, FNCorpusXml, SalsaTigerXML
28
+ "encoding" => "string", # utf8, iso, hex, or nothing
29
+
30
+
31
+ # directories
32
+ "directory_input" => "string", # dir with input data
33
+ "directory_preprocessed" => "string", # dir with output Salsa/Tiger XML data
34
+ "directory_parserout" => "string", # dir with parser output for the parser named below
35
+
36
+ # syntactic processing
37
+ "pos_tagger" => "string", # name of POS tagger
38
+ "lemmatizer" => "string", # name of lemmatizer
39
+ "parser" => "string", # name of parser
40
+ "pos_tagger_path" => "string", # path to POS tagger
41
+ "lemmatizer_path" => "string", # path to lemmatizer
42
+ "parser_path" => "string", # path to parser
43
+ "parser_max_sent_num" => "integer", # max number of sentences per parser input file
44
+ "parser_max_sent_len" => "integer", # max sentence length the parser handles
45
+
46
+ "do_parse" => "bool", # use parser?
47
+ "do_lemmatize" => "bool",# use lemmatizer?
48
+ "do_postag" => "bool", # use POS tagger?
49
+
50
+ # output format: if tabformat_output == true,
51
+ # output in Tab format rather than Salsa/Tiger XML
52
+ # (this will not work if do_parse == true)
53
+ "tabformat_output" => "bool",
54
+
55
+ # syntactic repairs, dependent on existing semantic role annotation
56
+ "fe_syn_repair" => "bool", # map words to constituents for FEs: idealize?
57
+ "fe_rel_repair" => "bool", # FEs: include non-included relative clauses into FEs
58
+ },
59
+ [ ] # variables
60
+ )
61
+
62
+ end
63
+ end
64
+
65
+
66
+
@@ -0,0 +1,1324 @@
1
+ # Salsa packages
2
+ require "common/ISO-8859-1"
3
+ require "common/Parser"
4
+ require "common/RegXML"
5
+ require "common/SalsaTigerRegXML"
6
+ require "common/SalsaTigerXMLHelper"
7
+ require "common/TabFormat"
8
+ require "common/ruby_class_extensions"
9
+ require "common/AbstractSynInterface"
10
+
11
+ ############################################3
12
+ # Module FrprepHelper:
13
+ #
14
+ # diverse transformation methods for frprep.rb
15
+ # moved over here to make the main file less crowded
16
+ module FrprepHelper
17
+
18
+ ####
19
+ # transform a file to UTF-8 from a given encoding
# Copies input_filename to output_filename line by line, passing each
# line through UtfIso.from_iso_8859_1 (for "hex", after
# Ampersand.hex_to_iso).
#
# input_filename  - string: name of input file
# output_filename - string: name of output file (opened with mode "w")
# encoding        - string: "iso" or "hex"
#
# Raises RuntimeError if a file cannot be opened, or, on the first input
# line, if encoding is neither "iso" nor "hex".
# Both files are closed even when an exception is raised.
#
# NOTE(review): Ampersand is not among the requires visible at the top of
# this file -- confirm it is loaded elsewhere before "hex" is used.
def FrprepHelper.to_utf8_file(input_filename,  # string: name of input file
                              output_filename, # string: name of output file
                              encoding)        # string: "iso", "hex"
  infile = nil
  outfile = nil
  begin
    infile = File.new(input_filename)
    outfile = File.new(output_filename, "w")
  rescue
    raise "Could not read #{input_filename}, or could not write to #{output_filename}."
  end

  while (line = infile.gets)
    case encoding
    when "iso"
      outfile.puts UtfIso.from_iso_8859_1(line)
    when "hex"
      outfile.puts UtfIso.from_iso_8859_1(Ampersand.hex_to_iso(line))
    else
      raise "Shouldn't be here."
    end
  end
  nil
ensure
  # release the handles on every exit path, including the raises above
  infile.close if infile
  outfile.close if outfile
end
43
+
44
+ ####
45
+ # transform BNC format file to plaintext file
# Extracts the sentence lines of a BNC format file as plain text.
#
# Only lines matching /^\s*<s\s+n=/ are used. In those, every tag is
# replaced by a space, surrounding and repeated spaces are removed, and
# the character entities listed below are replaced by plain ASCII text.
# One output line is written per sentence line.
#
# input_filename  - string: name of input file
# output_filename - string: name of output file (opened with mode "w")
#
# Raises RuntimeError if a file cannot be opened.
# Both files are closed even when an exception is raised.
def FrprepHelper.bnc_to_plain_file(input_filename,  # string: name of input file
                                   output_filename) # string: name of output file
  # entity => replacement, applied one after the other in this order
  entity_replacements = [
    ["&bquo;", '"'], ["&equo;", '"'],
    ["&mdash;", "-"], ["&ndash;", "-"],
    ["&percnt;", "%"],
    ["&pound;", " pounds "],
    ["&amp;", " and "],
    ["&hellip;", "..."],
    ["&copy;", "(copyright)"],
    ["&eacute;", "e"],
    ["&bull;", "*"],
    ["&dollar;", "$"],
    ["&deg;", " degree "],

    ["&frac12;", "1/2"], ["&frac34;", "3/4"],

    ["&lsqb;", "["], ["&rsqb;", "]"],

    ["&ins;", "i"], ["&ft;", "ft"],

    ["&rarr;", ">"], ["&larr;", "<"],

    ["&aacute;", "a"], ["&auml;", "a"], ["&agrave;", "a"],
    ["&atilde;", "a"], ["&acirc;", "a"],
    ["&Aacute;", "A"], ["&Auml;", "A"], ["&Agrave;", "A"],
    ["&Atilde;", "A"], ["&Acirc;", "A"],

    ["&egrave;", "e"], ["&ecirc;", "e"], ["&euml;", "e"],
    ["&Eacute;", "E"], ["&Egrave;", "E"], ["&Ecirc;", "E"], ["&Euml;", "E"],

    ["&iacute;", "i"], ["&igrave;", "i"], ["&icirc;", "i"], ["&iuml;", "i"],
    ["&Iacute;", "I"], ["&Igrave;", "I"], ["&Icirc;", "I"],

    ["&oacute;", "o"], ["&ograve;", "o"], ["&ocirc;", "o"], ["&ouml;", "o"],
    ["&Oacute;", "O"], ["&Ograve;", "O"], ["&Ocirc;", "O"], ["&Ouml;", "O"],

    ["&uacute;", "u"], ["&ugrave;", "u"], ["&ucirc;", "u"], ["&uuml;", "u"],
    ["&Uacute;", "U"], ["&Ugrave;", "U"], ["&Ucirc;", "U"], ["&Uuml;", "U"],

    ["&yuml;", "y"], ["&Yuml;", "Y"],

    ["&ntilde;", "n"], ["&Ntilde;", "N"],

    ["&ccedil;", "c"], ["&Ccedil;", "C"]
  ]

  infile = nil
  outfile = nil
  begin
    infile = File.new(input_filename)
    outfile = File.new(output_filename, "w")
  rescue
    raise "Could not read #{input_filename}, or could not write to #{output_filename}."
  end

  infile.each { |line|
    # does this line contain a sentence?
    next unless line =~ /^\s*<s\s+n=/

    # remove all tags, replace by spaces,
    # then remove superfluous spaces
    textline = line.gsub(/<.+?>/, " ").strip.squeeze(" ")

    entity_replacements.each { |entity, replacement|
      textline.gsub!(entity, replacement)
    }

    outfile.puts textline
  }
  nil
ensure
  # release the handles on every exit path
  infile.close if infile
  outfile.close if outfile
end
152
+
153
+
154
+ ####
155
+ # transform plaintext file to Tab format file
# Converts a plain text file with one sentence per line into a Tab format
# file: one output line per token (built by FNTabFormatFile.format_str
# from the entries "word" and "sent_id"), and an empty line after each
# sentence.
#
# Tokens are the whitespace-separated words of a line, with punctuation
# at the beginning or end of a word split off into one token per
# punctuation character.
#
# Sentence IDs have the form <filename_core>_<running number>, where the
# running number starts at 0 for each call.
#
# input_filename  - string: name of input file
# output_filename - string: name of output file (opened with mode "w")
#
# Raises RuntimeError if a file cannot be opened.
# Both files are closed even when an exception is raised.
def FrprepHelper.plain_to_tab_file(input_filename,  # string: name of input file
                                   output_filename) # string: name of output file
  infile = nil
  outfile = nil
  begin
    infile = File.new(input_filename)
    outfile = File.new(output_filename, "w")
  rescue
    raise "Could not read #{input_filename}, or could not write to #{output_filename}."
  end

  # NOTE(review): the suffix is given without the dot, so "foo.txt" yields
  # "foo." and sentence IDs look like "foo._0". Left unchanged because the
  # IDs may be relied on downstream -- confirm before changing to ".txt".
  filename_core = File.basename(input_filename, "txt")

  # sentence number for making the sentence ID
  sentno = 0

  while (line = infile.gets)
    # make a sentence ID for the next sentence: running number
    sentid = filename_core + "_" + sentno.to_s
    sentno += 1

    # array(string): the tokens of this sentence
    sentence = []
    line.split.each { |word|
      # punctuation at the beginning of the word
      if word =~ /^([\(\[\`\"-]+)(.*)$/
        leading = $1
        word = $2
        leading.scan(/./) { |mark| sentence << mark }
      end

      # punctuation at the end of the word.
      # The hyphen is escaped: unescaped, ";-`" is a character range that
      # also covers A-Z, so a word ending in capitals ("USA") was taken
      # for punctuation and split into single letters.
      if word =~ /[,:;\-\`?!\"\.\)\]]+$/
        sentence << $` # part before the match: the word
        trailing = $&
        trailing.scan(/./) { |mark| sentence << mark }
      else
        # no punctuation recognized
        sentence << word
      end
    }

    # remove empty words
    sentence.reject! { |word| word.nil? or word.strip.empty? }

    # write words to tab file: one line per word,
    # only the 'word' and 'sent_id' entries are set
    sentence.each { |word|
      outfile.puts FNTabFormatFile.format_str({
                                                "word" => word,
                                                "sent_id" => sentid
                                              })
    }
    outfile.puts
  end
  nil
ensure
  # release the handles on every exit path (the input file used to stay open)
  infile.close if infile
  outfile.close if outfile
end
227
+
228
+ ###########
229
+ #
230
+ # class method split_dir:
231
+ # read all files in one directory and produce chunk files *#{suffix} in outdir
232
+ # with a certain number of sentences in them (sent_num).
233
+ # Optionally, remove all sentences longer than sent_leng
234
+ #
235
+ # produces output files 1.<suffix>, 2.<suffix>, etc.
236
+ #
237
+ # assumes TabFormat sentences
238
+ #
239
+ # example: split_dir("/tmp/in","/tmp/out",".tab",2000,80)
240
+
# Reads all files indir/*<suffix> (sentences = runs of non-empty lines,
# separated by empty lines) and writes them to chunk files
# outdir/0<suffix>, outdir/1<suffix>, ... with sent_num sentences each;
# the last chunk may hold fewer.
#
# indir, outdir - string: directories; a trailing "/" is added if missing
# suffix        - string: file name suffix of input and output files
# sent_num      - integer: number of sentences per output file
# sent_leng     - integer or nil: if given, sentences with more lines
#                 than this are cut off after sent_leng lines
#
# A sentence at the end of an input file that is not followed by an empty
# line is recorded as a sentence of its own (it used to be glued to the
# first sentence of the next file, or lost after the last file).
#
# Returns nil.
def FrprepHelper.split_dir(indir,
                           outdir,
                           suffix,
                           sent_num,
                           sent_leng=nil)

  indir += "/" unless indir[-1,1] == "/"
  outdir += "/" unless outdir[-1,1] == "/"

  outfile_counter = 0
  line_stack = []   # lines of the sentence currently being read
  sent_stack = []   # completed sentences not yet written

  # write all collected sentences to the next chunk file
  write_chunk = lambda {
    File.open(outdir + outfile_counter.to_s + "#{suffix}", "w") { |outfile|
      sent_stack.each { |l_stack|
        outfile.puts l_stack.join("\n")
        outfile.puts
      }
    }
    outfile_counter += 1
    sent_stack = []
  }

  # close the current sentence; start a new chunk file if enough are collected
  end_sentence = lambda {
    # change (sp 15 01 07): just cut off sentence at sent_leng.
    if sent_leng and line_stack.length > sent_leng
      STDERR.puts "Cutting off long sentence #{line_stack.last.split("\t").last}"
      line_stack = line_stack.first(sent_leng)
    end

    # suppress multiple empty lines to avoid problems with lemmatiser:
    # only record the sentence if it is not empty
    unless line_stack.empty?
      sent_stack << line_stack
      line_stack = []
    end

    write_chunk.call if sent_stack.length == sent_num
  }

  Dir[indir + "*#{suffix}"].each { |infilename|
    STDERR.puts "Now splitting #{infilename}"

    # block form: the file is closed even if an exception is raised
    File.open(infilename) { |infile|
      while (line = infile.gets)
        line.chomp!
        if line == ""
          end_sentence.call
        else
          line_stack << line
        end
      end
    }

    # sentence at end of file without a terminating empty line
    end_sentence.call unless line_stack.empty?
  }

  # the last remaining sentences
  write_chunk.call unless sent_stack.empty?
  nil
end
311
+
312
+ ####
313
+ # note salsa targetlemma
314
+ #
315
+ # old_dir contains xml files whose name starts with the
316
+ # target lemma for all frames in the file
317
+ # record that target lemma in the <target> element of each frame
# For each file old_dir/*.xml, writes a file of the same name to new_dir
# in which the "lemma" attribute of every frame's target is set to the
# target lemma encoded in the file name (the part of the base name before
# the first "_" or ".").
#
# old_dir - string ending in /: directory with the input XML files
# new_dir - string ending in /: directory for the changed files
#
# Returns the array of processed file names (result of Dir[...].each).
def FrprepHelper.note_salsa_targetlemma(old_dir, # string ending in /
                                        new_dir) # string ending in /

  # each input file: extract target lemma from filename,
  # note this lemma in the <target> element of each frame
  Dir[old_dir + "*.xml"].each { |filename|
    changedfilename = new_dir + File.basename(filename)

    if File.basename(filename) =~ /^(.*?)[_\.]/
      lemma = $1

      infile = FilePartsParser.new(filename)
      begin
        # block form: the output file is closed even if an exception is raised
        File.open(changedfilename, "w") { |outfile|
          # write header
          outfile.puts infile.head()

          # iterate through sentences as SalsaTigerSentence objects
          infile.scan_s() { |sent_string|
            sent = SalsaTigerSentence.new(sent_string)
            sent.each_frame { |frame|
              frame.target.set_attribute("lemma", lemma)
            }

            # write changed sentence
            outfile.puts sent.get()
          } # each sentence

          # write footer
          outfile.puts infile.tail()
        }
      ensure
        infile.close()
      end

    else
      # couldn't determine lemma: just copy the file.
      # Copied without a shell, so that blanks or shell metacharacters
      # in file names can neither break nor alter the command.
      IO.copy_stream(filename, changedfilename)
    end
  }
end
359
+
360
+ ####
361
+ # stxml_split_dir
362
+ #
363
+ # split SalsaTigerXML files into new files of given length,
364
+ # skipping sentences that are too long
365
+ #
366
+ # At the same time, sentences that occur several times (i.e. sentences which are
367
+ # annotated by SALSA for more than one predicate) are compacted into one occurrence
368
+ # with combined semantics.
369
+ #
370
+ # assumes that all files in input_dir with
371
+ # extension .xml are SalsaTigerXMl files
# Reads all files input_dir/*.xml as SalsaTigerXML and writes their
# sentences to new files split_dir/0.xml, split_dir/1.xml, ... with
# max_sentnum sentences each (the last file may hold fewer).
#
# Sentences with the same sentence ID are merged into one occurrence:
# the <s ...> start tag and graph of the last occurrence read is kept, and
# the frames and underspecification blocks of all occurrences are
# collected. Frame IDs are renumbered to <sentid>_f1, <sentid>_f2, ...
# so that they are unique within the merged sentence.
#
# input_dir   - string: input directory (used as prefix: input_dir + "*.xml")
# split_dir   - string: output directory (used as prefix of output file names)
# max_sentnum - integer: max number of sentences per output file
# max_sentlen - integer: max number of terminals per sentence.
#               Currently unused: skipping of overly long sentences was
#               already disabled in this method.
#
# Sentences without an ID are skipped with a warning on STDERR.
# Returns nil.
def FrprepHelper.stxml_split_dir(input_dir,   # string: input directory with STXML files
                                 split_dir,   # string: output directory
                                 max_sentnum, # integer: max num of sentences per file
                                 max_sentlen) # integer: max num of terminals per sentence

  filenames = Dir[input_dir+"*.xml"].to_a

  graph_hash = {}     # for each sentence id, keep <s...</graph>
  frame_hash = {}     # for each sentence id, keep the <frame... </frame> strings
  uspfes_hash = {}    # for each sentence id, keep the uspfes blocks
  uspframes_hash = {} # for each sentence id, keep the uspframes blocks

  ########################
  # Traverse the file(s): compute an index of all frames for each sentence,
  # with unique identifiers

  filenames.each { |filename|
    infile = FilePartsParser.new(filename)
    begin
      infile.scan_s { |sent_str|

        # problem: we may have several frames per sentence, and need to keep
        # track of them. If we rename e.g. sxx_f1 to sxx_f2 and there is
        # already a sxx_f2, we cannot distinguish between these frames.
        # Therefore all IDs are first replaced by temporary ones, and the
        # temporary ones by the final ones afterwards.
        this_frames = []
        temp_subs = []   # pairs [original id, temporary id]
        final_subs = []  # pairs [temporary id, final id]

        sent = RegXML.new(sent_str)
        sentid = sent.attributes["id"].to_s
        # to_s never yields nil, so test for the empty string:
        # that is what a missing ID turns into
        if sentid.empty?
          STDERR.puts "[frprep] Warning: cannot find sentence id, skipping sentence:"
          STDERR.puts sent_str
          # strange sentence, no ID? skip
          next
        end

        unless frame_hash.key? sentid
          frame_hash[sentid] = []
          uspfes_hash[sentid] = []
          uspframes_hash[sentid] = []
        end

        # find everything up to and including the graph
        sent_children = sent.children_and_text()
        graph = sent_children.detect { |child| child.name == "graph" }
        graph_hash[sentid] = "<s " +
          sent.attributes.to_a.map { |at, val| "#{at}=\'#{val}\'" }.join(" ") +
          ">" +
          graph.to_s

        # find the usp block and the frames
        sem = sent_children.detect { |child| child.name == "sem" }
        usp = ""
        frames = nil
        if sem
          sem_children = sem.children_and_text
          usp = sem_children.detect { |child| child.name == "usp" }.to_s
          frames = sem_children.detect { |child| child.name == "frames" }
        end

        if frames
          frames.children_and_text.each { |frame|
            next unless frame.name == "frame"

            frameid = frame.attributes["id"]

            # running number over frames already stored for this sentence
            # plus frames of this occurrence
            frame_no = frame_hash[sentid].length + this_frames.length + 1
            temp_frameid = "#{sentid}_temp_f#{frame_no}"
            final_frameid = "#{sentid}_f#{frame_no}"

            temp_subs << [frameid, temp_frameid]
            final_subs << [temp_frameid, final_frameid]

            this_frames << frame.to_s
          }
        end

        # now first rename all the frames to temporary names
        temp_subs.each { |orig_frameid, temp_frameid|
          this_frames.map! { |frame_str|
            frame_str.gsub(orig_frameid, temp_frameid)
          }
          usp.gsub!(orig_frameid, temp_frameid)
        }

        # and re-rename the temporary names
        final_subs.each { |temp_frameid, final_frameid|
          this_frames.map! { |frame_str|
            frame_str.gsub(temp_frameid, final_frameid)
          }
          usp.gsub!(temp_frameid, final_frameid)
        }

        # store frames in data structure
        frame_hash[sentid].concat(this_frames)

        # store uspfes and uspframes blocks in data structure
        unless usp.empty?
          usp_children = RegXML.new(usp).children_and_text

          uspfes = usp_children.detect { |child| child.name == "uspfes" }
          if uspfes # may be missing
            uspfes.children_and_text.each { |child|
              next unless child.name == "uspblock"
              uspfes_hash[sentid] << child.to_s
            }
          end

          uspframes = usp_children.detect { |child| child.name == "uspframes" }
          if uspframes # may be missing
            uspframes.children_and_text.each { |child|
              next unless child.name == "uspblock"
              uspframes_hash[sentid] << child.to_s
            }
          end
        end
      }
    ensure
      # the parser used to be left open
      infile.close()
    end
  }

  # now write everything in the data structure back to files

  filecounter = 0
  sentcounter = 0
  outfile = nil

  begin
    # NOTE(review): IDs are compared by their to_i value, so IDs that do not
    # start with a number all compare as equal -- confirm intended order.
    graph_hash.sort { |a, b| a[0].to_i <=> b[0].to_i }.each { |sentid, graph_str|

      # current file is full: finish it
      if outfile and sentcounter == max_sentnum
        outfile.puts SalsaTigerXMLHelper.get_footer
        outfile.close
        outfile = nil
      end

      unless outfile
        outfile = File.new(split_dir+filecounter.to_s+".xml","w")
        outfile.puts SalsaTigerXMLHelper.get_header
        filecounter += 1
        sentcounter = 0
      end

      xml = [graph_str, "<sem>", "<globals>", "</globals>", "<frames>"]
      xml.concat(frame_hash[sentid])
      xml << "</frames>" << "<usp>" << "<uspframes>"
      xml.concat(uspframes_hash[sentid])
      xml << "</uspframes>" << "<uspfes>"
      xml.concat(uspfes_hash[sentid])
      xml << "</uspfes>" << "</usp>" << "</sem>" << "</s>"

      outfile.puts xml.join("\n")
      sentcounter += 1
    }

    # finish the last file
    outfile.puts SalsaTigerXMLHelper.get_footer if outfile
  ensure
    outfile.close if outfile and not outfile.closed?
  end

  nil
end
574
+
575
+
576
+ ####
577
+ # transform SalsaTigerXML file to Tab format file
578
def FrprepHelper.stxml_to_tab_file(input_filename, # string: name of input file
                                   output_filename, # string: name of output file
                                   exp) # FrprepConfigData
  # For each sentence of the input file, writes one tab-format line per
  # terminal (word and sentence ID), followed by an empty line.
  # Sentences in which no <terminals> section is found are reported on
  # stderr and skipped.
  # Raises a RuntimeError if the output file cannot be opened for writing.
  infile = FilePartsParser.new(input_filename)
  begin
    outfile = File.new(output_filename, "w")
  rescue
    raise "Stxml to tab: could not write to tab file #{output_filename}"
  end

  begin
    infile.scan_s { |sent_string|
      # determine sentence ID; invent a time-based one if it is missing
      sentid = RegXML.new(sent_string).attributes["id"]
      unless sentid
        $stderr.puts "No sentence ID in sentence:\n "+ sent_string
        $stderr.puts "Making a new one up."
        sentid = Time.new().to_f.to_s
      end

      # find the terminals section (matched on the sentence with newlines removed)
      terminals_match = sent_string.delete("\n").match(/<terminals[ >].+<\/terminals>/)
      unless terminals_match
        $stderr.puts "Warning: could not find terminals in sentence:"
        $stderr.puts sent_string
        $stderr.puts "Skipping"
        next
      end
      text = terminals_match[0]

      # modified by ines, 27/08/08
      # for Berkeley => rewrite ( ) as *LRB* *RRB* and the various
      # double-quote spellings as a plain "
      if exp.get("parser") == "berkeley"
        text.gsub!(/word='\('/, "word='*LRB*'")
        text.gsub!(/word='\)'/, "word='*RRB*'")
        text.gsub!(/word=['"]``['"]/, "word='\"'")
        text.gsub!(/word=['"]''['"]/, "word='\"'")
        text.gsub!(/word=['"]\&apos;\&apos;['"]/, "word='\"'")
      end

      RegXML.new(text).children_and_text.each { |terminal|
        # anything that is not a <t> element is not a terminal after all
        next unless terminal.name == "t"

        outfile.puts FNTabFormatFile.format_str({
          "word" => SalsaTigerXMLHelper.unescape(terminal.attributes["word"]),
          "sent_id" => sentid
        })
      } # each terminal
      outfile.puts
    } # each sentence
  ensure
    # release the file handle even when a sentence fails to parse
    outfile.close
  end
  nil
end
640
+
641
+ ###
642
+ # add semantics from tab:
643
+ #
644
+ # add information about semantics from a FN tab sentence
645
+ # to a SalsaTigerSentence object:
646
+ # - frames (one frame per sentence)
647
+ # - roles
648
+ # - FrameNet grammatical functions
649
+ # - FrameNet POS of target
650
def FrprepHelper.add_semantics_from_tab(st_sent, # SalsaTigerSentence object
                                        tab_sent, # FNTabFormatSentence object
                                        mapping, # hash: tab lineno -> array:SynNode
                                        interpreter_class, # SynInterpreter class
                                        exp) # FrprepConfigData
  # For every frame of the tab sentence, a frame is added to st_sent, then
  # its target, and then one FE per role label (plus a "Support" FE).
  # Tab line numbers are translated to SynNodes through mapping.

  # tab sentence not found: nothing to do
  return if tab_sent.nil?

  frame_index = 0
  tab_sent.each_frame do |tab_frame_obj|
    frame_name = tab_frame_obj.get_frame() # string

    if frame_name.nil? or frame_name =~ /^-*$/
      # weird: a frame without a frame
      $stderr.puts "Warning: frame entry without a frame in tab sentence #{st_sent.id}."
      $stderr.puts "Skipping"
      next
    end

    frame_node = st_sent.add_frame(frame_name, tab_sent.get_sent_id() + "_f#{frame_index}")
    frame_index += 1

    # target: collect all SynNodes the target lines map to
    target_nodes = Array.new
    tab_frame_obj.get_target_indices.each do |terminal_id|
      mapped = mapping[terminal_id]
      target_nodes.concat mapped if mapped
    end

    # let the interpreter class decide on how to determine the maximum constituents
    target_maxnodes = interpreter_class.max_constituents(target_nodes, st_sent)
    if target_maxnodes.empty?
      STDERR.puts "Warning: no target in frame entry, sentence #{st_sent.id}."
      $stderr.puts "frame is #{frame_name}, frame no #{frame_index}"
      $stderr.puts "Skipping."
      $stderr.puts "target indices: " + tab_frame_obj.get_target_indices.join(", ")
      next
    end
    frame_node.add_fe("target",target_maxnodes)

    # set features on target: target lemma, target POS
    target_lemma = tab_frame_obj.get_target()
    target_pos = nil
    if target_lemma
      # FrameNet data: the lemma in the tab file has the form <lemma>.<POS>;
      # separate the two
      if exp.get("origin") == "FrameNet" and target_lemma =~ /^(.*)\.(.*)$/
        target_lemma, target_pos = $1, $2
      end
      frame_node.target.set_attribute("lemma", target_lemma)
      frame_node.target.set_attribute("pos", target_pos) if target_pos
    end

    # roles, GF, PT
    # layer_synnode_label:
    # hash "role" | "gf" | "pt" | "stuff" -> SynNode -> array: label(string)
    layer_synnode_label = Hash.new
    ["gf", "pt", "role"].each do |layer|
      termids2labels = tab_frame_obj.markables(layer)
      labels_by_node = (layer_synnode_label[layer] ||= Hash.new)

      termids2labels.each do |terminal_indices, label|
        terminal_indices.each do |t_i|
          nodes = mapping[t_i]
          next unless nodes # t_i maps to nothing

          nodes.each { |node| (labels_by_node[node] ||= Array.new) << label }
        end # each terminal index
      end # each mapping terminal indices -> label
    end # each layer

    # 'stuff' (Support and other things)
    stuff_by_node = layer_synnode_label["stuff"] = Hash.new
    tab_frame_obj.each_line_parsed do |line_obj|
      label = line_obj.get("stuff")
      next if label == "-"

      nodes = mapping[line_obj.get("lineno")]
      next unless nodes

      nodes.each { |node| (stuff_by_node[node] ||= Array.new) << label }
    end

    # reencode:
    # hash role_label(string) -> array of tuples [synnode, gflabels, ptlabels]
    # gflabels, ptlabels: array:String (or nil)
    #
    # note that in this step, any gf or pt labels that have been
    # assigned to a SynNode that has not also been assigned a role
    # will be lost
    role2nodes_labels = Hash.new
    layer_synnode_label["role"].each_pair do |synnode, labels|
      labels.each do |rolelabel|
        (role2nodes_labels[rolelabel] ||= Array.new) << [
          synnode,
          layer_synnode_label["gf"][synnode],
          layer_synnode_label["pt"][synnode]
        ]
      end # each role label
    end # each pair SynNode/role labels

    # reencode "stuff", but only the support cases
    # (this entry replaces anything collected under a role called "Support")
    role2nodes_labels["Support"] = Array.new()
    layer_synnode_label["stuff"].each_pair do |synnode, labels|
      labels.each do |stufflabel|
        # some sort of support
        role2nodes_labels["Support"] << [synnode, nil, nil] if stufflabel =~ /Supp/
      end
    end

    ##
    # each role label:
    # make FeNode for the current frame
    role2nodes_labels.each_pair do |rolelabel, node_gf_pt|
      # shortcut for GF and PT labels: take any labels that have
      # been assigned for _some_ SynNode of this role
      synnodes = node_gf_pt.map { |entry| entry[0] }
      gflabels = node_gf_pt.map { |entry| entry[1] }.compact.flatten.uniq
      ptlabels = node_gf_pt.map { |entry| entry[2] }.compact.flatten.uniq

      # let the interpreter class decide on how to
      # determine the maximum constituents
      maxnodes = interpreter_class.max_constituents(synnodes, st_sent)

      fe_node = st_sent.add_fe(frame_node, rolelabel, maxnodes)
      fe_node.set_attribute("gf", gflabels.join(",")) unless gflabels.empty?
      fe_node.set_attribute("pt", ptlabels.join(",")) unless ptlabels.empty?
    end # each role label
  end # each frame
end
824
+
825
+
826
+ ######
827
+ # handle multiword targets:
828
+ # if you find a verb with a separate prefix,
829
+ # change the verb's lemma information accordingly
830
+ # and add an attribute "other_words" to the verb node
831
+ # pointing to the other node
832
+ #
833
+ # In general, it will be assumed that "other_words" contains
834
+ # a list of node IDs for other nodes belonging to the same
835
+ # group, node IDs separated by spaces, and that
836
+ # each node of a group has the "other_words" attribute.
837
+ #
838
def FrprepHelper.handle_multiword_targets(sent, # SalsaTigerSentence object
                                          interpreter, # SynInterpreter object
                                          language) # string: en, de
  # Links each verb with its separate particle through the "other_words"
  # attribute (set on both nodes) and rewrites the verb's lemma to include
  # the particle's lemma when both lemmas are present.
  return if sent.nil?

  ##
  # only retain the interesting words of the sentence:
  # content words and prepositions
  kept_categories = [
    "adj", "adv", "card", "noun", "part", "prep", "verb"
  ]
  nodes = sent.terminals.select { |node|
    kept_categories.include? interpreter.category(node)
  }

  ##
  # group verbs with their separate particles
  # (at a later point, other types of grouping can be inserted here)
  groups = FrprepHelper.group_words(nodes, interpreter)

  ##
  # record grouping information as attributes on the terminals.
  groups.each do |descr, group_of_nodes|
    # "none": no grouping, nothing to record
    next if descr == "none"

    unless descr == "part"
      raise "Shouldn't be here: unexpected description #{descr}"
    end

    # separate particle belonging to a verb:
    # group_of_nodes is a pair [verb, particle]
    verb, particle = group_of_nodes

    verb.set_attribute("other_words", particle.id())
    particle.set_attribute("other_words", verb.id())

    verb_lemma = verb.get_attribute("lemma")
    particle_lemma = particle.get_attribute("lemma")
    next unless verb_lemma and particle_lemma

    combined =
      if language == "de"
        # German: prepend SVP to get the real lemma of the verb
        particle_lemma + verb_lemma
      else
        # English and default: append particle as separate word
        # after the lemma of the verb
        verb_lemma + " " + particle_lemma
      end
    verb.set_attribute("lemma", combined)
  end
end
900
+
901
+ ########################
902
+ # group_words
903
+ #
904
+ # auxiliary of handle_multiword_targets
905
+ #
906
+ # Group terminals:
907
+ # At the moment, just find separate prefixes and particles
908
+ # for verbs
909
+ #
910
+ # returns: list of pairs [descr, nodes]
911
+ # descr: string, "none" (no group), "part" (separate verb particle)
912
+ # nodes: array:SynNode
913
def FrprepHelper.group_words(nodes, # array: SynNode
                             interpreter) # SynInterpreter object
  # Walks the nodes in order. A node for which the interpreter finds a
  # particle yields a ["part", [node, particle]] entry; any other node
  # yields ["none", [node]]. Nodes already placed in a group are skipped.
  groups = Array.new  # result: array of [descr, array:SynNode]
  covered = Array.new # nodes already placed in some group

  nodes.each do |terminal_node|
    # we have already included this node in one of the groups
    next if covered.include? terminal_node

    svp = interpreter.particle_of_verb(terminal_node, nodes)
    if svp
      groups << ["part", [terminal_node, svp]]
      covered << terminal_node
      covered << svp
    else
      groups << ["none", [terminal_node]]
      covered << terminal_node
    end
  end

  groups
end
938
+
939
+
940
+ ######
941
+ # handle unknown framenames
942
+ #
943
+ # For all frames with names matching Unknown\d+,
944
+ # rename them to <lemma>_Unknown\d+
945
def FrprepHelper.handle_unknown_framenames(sent, # SalsaTigerSentence
                                           interpreter) # SynInterpreter class
  # Every frame whose name starts with "Unknown" is renamed to
  # <lemma>_<old name>, using the lemma (or word) of the main target node.
  # A frame whose target or lemma cannot be determined keeps its name; a
  # warning is printed and processing continues with the next frame.
  return if sent.nil?

  sent.each_frame { |frame|
    next unless frame.name() =~ /^Unknown/

    maintarget =
      if frame.target
        interpreter.main_node_of_expr(frame.target.children(), "no_mwe")
      else
        nil
      end

    unless maintarget
      $stderr.puts "Warning: Unknown frame, and I could not determine the target lemma: Frame #{frame.id()}"
      $stderr.puts "Cannot repair frame name, leaving it as is."
      # only this frame is left alone; the remaining frames of the
      # sentence must still be handled
      next
    end

    # get lemma, if it exists, otherwise get word
    # also, if the lemmatizer has returned a disjunction of lemmas,
    # get the first disjunct
    lemma = interpreter.lemma_backoff(maintarget)
    if lemma
      # we have a lemma
      frame.set_name(lemma + "_" + frame.name())
    else
      # the main target word has no lemma attribute,
      # and somehow I couldn't even get the target word
      $stderr.puts "Warning: Salsa 'Unknown' frame."
      $stderr.puts "Trying to make its lemma-specificity explicit, but"
      $stderr.puts "I could not determine the target lemma nor the target word: frame #{frame.id()}"
      $stderr.puts "Leaving 'Unknown' as it is."
    end
  }
end
982
+
983
+
984
+ #####################
985
+ #
986
+ # Integrate the semantic annotation of an old sentence
987
+ # into the corresponding new sentence
988
+ # At the same time, integrate the lemma information from the
989
+ # old sentence into the new sentence
990
def FrprepHelper.integrate_stxml_semantics_and_lemmas(oldsent,
                                                      newsent,
                                                      interpreter_class,
                                                      exp)
  # Copies lemmas, frames, FEs and underspecification blocks from oldsent
  # to newsent, pairing the two sentences terminal by terminal.
  # Returns nil without doing anything if either sentence is nil, and
  # false if the terminals of the two sentences cannot be matched.
  return if oldsent.nil? or newsent.nil?

  ##
  # match old and new sentence via terminals
  newterminals = newsent.terminals_sorted()
  oldterminals = oldsent.terminals_sorted()

  # sanity check: exact match on terminals?
  newterminals.interleave(oldterminals).each do |newnode, oldnode|
    # new and old word: use both unescaped and escaped variant
    newwords = newnode ? [ newnode.word, SalsaTigerXMLHelper.escape(newnode.word) ] : [nil, nil]
    oldwords = oldnode ? [ oldnode.word, SalsaTigerXMLHelper.escape(oldnode.word) ] : [ nil, nil]

    next unless (newwords & oldwords).empty?

    # old and new word don't match, either escaped or non-escaped
    $stderr.puts "Warning: could not match terminals of sentence #{newsent.id()}"
    $stderr.puts "This means that I cannot match the semantic annotation"
    $stderr.puts "to the newly parsed sentence. Skipping."
    return false
  end

  ##
  # copy lemma information
  oldterminals.each_with_index do |oldnode, ix|
    lemma = oldnode.get_attribute("lemma")
    newterminals[ix].set_attribute("lemma", lemma) if lemma
  end

  ##
  # copy frames
  oldsent.each_frame do |oldframe|
    # make new frame with same ID
    newframe = newsent.add_frame(oldframe.name, oldframe.id())

    # copy FEs
    oldframe.each_child do |oldfe|
      # new nodes: map old terminals to new terminals (by position),
      # then find max constituents covering them
      old_fe_terminals = oldfe.descendants.select { |n| n.is_terminal? }
      positions = old_fe_terminals.map { |n| oldterminals.index(n) }
      newnodes = positions.map { |ix| newterminals[ix] }

      # let the interpreter class decide on how to determine the maximum constituents
      newnodes = interpreter_class.max_constituents(newnodes, newsent)

      # make new FE with same ID, keeping all attributes of the old FE
      new_fe = newsent.add_fe(newframe, oldfe.name(), newnodes, oldfe.id())
      fe_attributes = oldfe.get_f("attributes")
      if fe_attributes
        fe_attributes.each_pair { |attr, value| new_fe.set_attribute(attr, value) }
      end
    end
  end

  ##
  # copy underspecification
  # keep as is, since we've kept all frame and FE IDs
  oldsent.each_usp_frameblock do |olduspframe|
    newuspframe = newsent.add_usp("frame")
    olduspframe.each_child do |oldnode|
      newnode = newsent.sem_node_with_id(oldnode.id())
      if newnode
        newuspframe.add_child(newnode)
      else
        $stderr.puts "Error: unknown frame with ID #{oldnode.id()}"
      end
    end
  end

  oldsent.each_usp_feblock do |olduspfe|
    newuspfe = newsent.add_usp("fe")
    olduspfe.each_child do |oldnode|
      newnode = newsent.sem_node_with_id(oldnode.id())
      if newnode
        newuspfe.add_child(newnode)
      else
        $stderr.puts "Error: unknown FE with ID #{oldnode.id()}"
      end
    end
  end
end
1099
+
1100
+ ####################
1101
+ # add a head attribute to each nonterminal of a
1102
+ # SalsaTigerSentence object
1103
+
1104
def FrprepHelper.add_head_attributes(st_sent, # SalsaTigerSentence object
                                     interpreter) # SynInterpreter class
  # Sets the "head" attribute of every nonterminal to the word of its head
  # terminal, or to "--" when no head terminal or head word is available.
  st_sent.each_nonterminal do |nt_node|
    head_term = interpreter.head_terminal(nt_node)
    head_word = head_term && head_term.word()
    nt_node.set_attribute("head", head_word || "--")
  end # each nonterminal
end
1115
+
1116
+ # add lemma information to each terminal in a given SalsaTigerSentence object
1117
def FrprepHelper.add_lemmas_from_tab(st_sent, # SalsaTigerSentence object
                                     tab_sent,# FNTabFormatSentence object
                                     mapping) # hash: tab lineno -> array:SynNode
  # For each terminal of st_sent, looks up the tab line that maps to it and
  # sets the terminal's "lemma" attribute from that line. If the tab word
  # differs from the terminal's word, the terminal's "word" attribute is
  # overwritten with the tab version and the mismatch is reported on stderr.

  # tab sentence not found
  return if tab_sent.nil?

  # produce list with word, lemma pairs
  lemmat = Array.new
  tab_sent.each_line_parsed { |line|
    lemmat << [line.get("word"), line.get("lemma")]
  }

  # match with st_sent terminal list and add lemma attributes
  # KE Jan 07: if word mismatch,
  # set to Lemmatizer file version,
  # but count mismatches
  word_mismatches = Array.new()

  st_sent.each_terminal_sorted { |t|
    matching_lineno = (0..lemmat.length()-1).to_a.detect { |tab_lineno|
      # a tab line may have no entry in the mapping; treat that as "no match"
      nodes = mapping[tab_lineno]
      nodes and nodes.include?(t)
    }
    next unless matching_lineno

    word, lemma = lemmat[matching_lineno]

    # transform characters to XML-friendly form
    # for comparison with st_word, which is also escaped
    word = SalsaTigerXMLHelper.escape(word)
    st_word = t.word()
    if word != st_word and
        word != SalsaTigerXMLHelper.escape(st_word)
      # true mismatch.
      # use the Lemmatizer version of the word, remember the mismatch
      word_mismatches << [st_word, word]
      t.set_attribute("word", word)
    end

    if lemma
      # we actually do have lemma information
      lemmatised_head = SalsaTigerXMLHelper.escape(lemma)
      t.set_attribute("lemma",lemmatised_head)
    end
  } # each terminal

  # did we have mismatches? then report them
  unless word_mismatches.empty?
    $stderr.puts "Warning: Word mismatches found between Lemmatizer file and SalsaTigerXML file generalted from parser output."
    $stderr.puts "(May be due to failed reencoding of special character in the parser output.)"
    $stderr.puts "I am using the Lemmatizer version by default."
    $stderr.puts "Version used:"
    $stderr.print "\t"
    st_sent.each_terminal_sorted { |t| $stderr.print ">>#{t}<<" }
    $stderr.puts
    $stderr.print "SalsaTigerXML file had: "
    $stderr.print word_mismatches.map { |st_word, tab_word|
      "#{st_word} instead of #{tab_word}"
    }.join(", ")
    $stderr.puts
  end
end
1183
+
1184
+ ###################3
1185
+ # given a SalsaTigerSentence,
1186
+ # look for FrameNet frames that are
1187
+ # test frames, and remove them
1188
def FrprepHelper.remove_deprecated_frames(sent, # SalsaTigerSentence
                                          exp) # FrprepConfigData
  # Removes frames named "Boulder" or starting with "Test" from the
  # sentence. Does nothing unless the configured origin is "FrameNet".
  return unless exp.get("origin") == "FrameNet"

  # Collect first, remove afterwards: removing while iterating could skip
  # frames if sent.frames hands out the sentence's own frame list
  # (NOTE(review): not visible from here whether it returns a copy).
  deprecated = Array.new
  sent.frames.each { |frame_obj|
    if frame_obj.name() == "Boulder" or
        frame_obj.name() =~ /^Test/
      deprecated << frame_obj
    end
  }

  deprecated.each { |frame_obj| sent.remove_frame(frame_obj) }
end
1202
+
1203
+ end
1204
+
1205
+ ############################################3
1206
+ # Class FrprepFlatSyntax:
1207
+ #
1208
+ # given a FNTabFormat file,
1209
+ # yield each of its sentences in SalsaTigerXML,
1210
+ # constructing a flat syntax
1211
class FrprepFlatSyntax
  # Remembers the tab file name and the suffixes of the POS tag and
  # lemmatisation files; nothing is read until each_sentence is called.
  def initialize(tabfilename, # string: name of tab file
                 postag_suffix, # postag file suffix (or nil)
                 lemma_suffix) # lemmatisation file suffix (or nil)
    @tabfilename, @pos_suffix, @lemma_suffix = tabfilename, postag_suffix, lemma_suffix
  end

  # yield each non-parse sentence as a tuple
  # [ salsa/tiger xml sentence, tab format sentence, mapping]
  # of a SalsaTigerSentence object, a FNTabSentence object,
  # and the result of SynInterfaceSTXML.standard_mapping for the two.
  #
  # The yielded sentence is flat: a single nonterminal of category "S"
  # with one terminal child per tab line. The argument is not used.
  def each_sentence(dummy)
    # read tab file with lemma and POS info
    tabfile = FNTabFormatFile.new(@tabfilename, @pos_suffix, @lemma_suffix)

    tabfile.each_sentence() do |tabsent|
      # ID of the corresponding TabFormat sentence,
      # or a time-based replacement if there is none
      sentid = tabsent.get_sent_id()
      if sentid.nil? or sentid =~ /^-*$/
        $stderr.puts "No sentence ID for sentence:"
        tabsent.each_line_parsed { |l| $stderr.print l.get("word"), " "}
        $stderr.puts
        sentid = Time.new().to_f.to_s
      end

      # start new, empty sentence with "failed" attribute (i.e. no parse)
      sent = SalsaTigerSentence.new("<s id=\"#{SalsaTigerXMLHelper.escape(sentid)}\" failed=\"true\"></s>")

      # add single nonterminal node, category "S", without word or pos
      vroot = sent.add_syn("nt", "S", nil, nil,
                           SalsaTigerXMLHelper.escape(sentid.to_s + "_NT"))

      # add terminals
      tabsent.each_line_parsed() do |line_obj|
        # make terminal node with tab sent info;
        # missing word or pos becomes the empty string
        node_id = "#{sentid}_#{line_obj.get("lineno")}"
        word = SalsaTigerXMLHelper.escape(line_obj.get("word") || "")
        pos = SalsaTigerXMLHelper.escape(line_obj.get("pos") || "")
        terminal = sent.add_syn("t", nil, word, pos, node_id)

        lemma = line_obj.get("lemma")
        terminal.set_attribute("lemma", SalsaTigerXMLHelper.escape(lemma)) if lemma

        # add new terminal as child of vroot
        vroot.add_child(terminal, nil)
        terminal.add_parent(vroot, nil)
      end # each line of tab file

      # yield newly constructed SalsaTigerXML sentence plus tab sentence
      yield [sent, tabsent, SynInterfaceSTXML.standard_mapping(sent, tabsent)]
    end
  end
end
1283
+
1284
+ ############################################3
1285
+ # Class FrprepReadStxml
1286
+ #
1287
+ # given a STXML file,
1288
+ # yield each of its sentences
1289
class FrprepReadStxml
  # Remembers the file names and suffixes; nothing is read until
  # each_sentence is called.
  def initialize(stxmlfilename, # string: name of SalsaTigerXML file
                 tabfilename, # string: name of corresponding tab file (or nil)
                 postag_suffix, # POS tag file suffix (or nil)
                 lemma_suffix) # lemmatization file suffix (or nil)

    @stxmlfilename = stxmlfilename
    @tabfilename = tabfilename
    @pos_suffix = postag_suffix
    @lemma_suffix = lemma_suffix
  end

  # yield each sentence of the SalsaTigerXML file as a tuple
  # [ salsa/tiger xml sentence, tab format sentence, mapping]
  # of a SalsaTigerSentence object, a FNTabSentence object (nil if there is
  # no tab file or it has fewer sentences), and the result of
  # SynInterfaceSTXML.standard_mapping for the two.
  # Sentences of the two files are paired by position. The argument is not used.
  def each_sentence(dummy)
    # read corresponding tab file, if one was given and it exists
    # (File.exists? is gone from current Ruby, and a nil name must not raise)
    tab_sents = Array.new()
    if @tabfilename and File.exist?(@tabfilename)
      tabfile = FNTabFormatFile.new(@tabfilename,@pos_suffix,@lemma_suffix)
      tabfile.each_sentence { |tabsent|
        tab_sents << tabsent
      }
    end

    # read STXML file
    infile = FilePartsParser.new(@stxmlfilename)
    index = 0
    infile.scan_s { |sent_string|
      sent = SalsaTigerSentence.new(sent_string)
      tab_sent = tab_sents.at(index)
      yield [sent, tab_sent, SynInterfaceSTXML.standard_mapping(sent, tab_sent)]
      index += 1
    }
  end
end