frprep 0.0.1.prealpha

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (138) hide show
  1. data/.yardopts +8 -0
  2. data/CHANGELOG.rdoc +0 -0
  3. data/LICENSE.rdoc +0 -0
  4. data/README.rdoc +0 -0
  5. data/lib/common/AbstractSynInterface.rb +1227 -0
  6. data/lib/common/BerkeleyInterface.rb +375 -0
  7. data/lib/common/CollinsInterface.rb +1165 -0
  8. data/lib/common/ConfigData.rb +694 -0
  9. data/lib/common/Counter.rb +18 -0
  10. data/lib/common/DBInterface.rb +48 -0
  11. data/lib/common/EnduserMode.rb +27 -0
  12. data/lib/common/Eval.rb +480 -0
  13. data/lib/common/FixSynSemMapping.rb +196 -0
  14. data/lib/common/FrPrepConfigData.rb +66 -0
  15. data/lib/common/FrprepHelper.rb +1324 -0
  16. data/lib/common/Graph.rb +345 -0
  17. data/lib/common/ISO-8859-1.rb +24 -0
  18. data/lib/common/ML.rb +186 -0
  19. data/lib/common/Maxent.rb +215 -0
  20. data/lib/common/MiniparInterface.rb +1388 -0
  21. data/lib/common/Optimise.rb +195 -0
  22. data/lib/common/Parser.rb +213 -0
  23. data/lib/common/RegXML.rb +269 -0
  24. data/lib/common/RosyConventions.rb +171 -0
  25. data/lib/common/SQLQuery.rb +243 -0
  26. data/lib/common/STXmlTerminalOrder.rb +194 -0
  27. data/lib/common/SalsaTigerRegXML.rb +2347 -0
  28. data/lib/common/SalsaTigerXMLHelper.rb +99 -0
  29. data/lib/common/SleepyInterface.rb +384 -0
  30. data/lib/common/SynInterfaces.rb +275 -0
  31. data/lib/common/TabFormat.rb +720 -0
  32. data/lib/common/Tiger.rb +1448 -0
  33. data/lib/common/TntInterface.rb +44 -0
  34. data/lib/common/Tree.rb +61 -0
  35. data/lib/common/TreetaggerInterface.rb +303 -0
  36. data/lib/common/headz.rb +338 -0
  37. data/lib/common/option_parser.rb +13 -0
  38. data/lib/common/ruby_class_extensions.rb +310 -0
  39. data/lib/fred/Baseline.rb +150 -0
  40. data/lib/fred/FileZipped.rb +31 -0
  41. data/lib/fred/FredBOWContext.rb +863 -0
  42. data/lib/fred/FredConfigData.rb +182 -0
  43. data/lib/fred/FredConventions.rb +232 -0
  44. data/lib/fred/FredDetermineTargets.rb +324 -0
  45. data/lib/fred/FredEval.rb +312 -0
  46. data/lib/fred/FredFeatureExtractors.rb +321 -0
  47. data/lib/fred/FredFeatures.rb +1061 -0
  48. data/lib/fred/FredFeaturize.rb +596 -0
  49. data/lib/fred/FredNumTrainingSenses.rb +27 -0
  50. data/lib/fred/FredParameters.rb +402 -0
  51. data/lib/fred/FredSplit.rb +84 -0
  52. data/lib/fred/FredSplitPkg.rb +180 -0
  53. data/lib/fred/FredTest.rb +607 -0
  54. data/lib/fred/FredTrain.rb +144 -0
  55. data/lib/fred/PlotAndREval.rb +480 -0
  56. data/lib/fred/fred.rb +45 -0
  57. data/lib/fred/md5.rb +23 -0
  58. data/lib/fred/opt_parser.rb +250 -0
  59. data/lib/frprep/AbstractSynInterface.rb +1227 -0
  60. data/lib/frprep/Ampersand.rb +37 -0
  61. data/lib/frprep/BerkeleyInterface.rb +375 -0
  62. data/lib/frprep/CollinsInterface.rb +1165 -0
  63. data/lib/frprep/ConfigData.rb +694 -0
  64. data/lib/frprep/Counter.rb +18 -0
  65. data/lib/frprep/FNCorpusXML.rb +643 -0
  66. data/lib/frprep/FNDatabase.rb +144 -0
  67. data/lib/frprep/FixSynSemMapping.rb +196 -0
  68. data/lib/frprep/FrPrepConfigData.rb +66 -0
  69. data/lib/frprep/FrameXML.rb +513 -0
  70. data/lib/frprep/FrprepHelper.rb +1324 -0
  71. data/lib/frprep/Graph.rb +345 -0
  72. data/lib/frprep/ISO-8859-1.rb +24 -0
  73. data/lib/frprep/MiniparInterface.rb +1388 -0
  74. data/lib/frprep/Parser.rb +213 -0
  75. data/lib/frprep/RegXML.rb +269 -0
  76. data/lib/frprep/STXmlTerminalOrder.rb +194 -0
  77. data/lib/frprep/SalsaTigerRegXML.rb +2347 -0
  78. data/lib/frprep/SalsaTigerXMLHelper.rb +99 -0
  79. data/lib/frprep/SleepyInterface.rb +384 -0
  80. data/lib/frprep/SynInterfaces.rb +275 -0
  81. data/lib/frprep/TabFormat.rb +720 -0
  82. data/lib/frprep/Tiger.rb +1448 -0
  83. data/lib/frprep/TntInterface.rb +44 -0
  84. data/lib/frprep/Tree.rb +61 -0
  85. data/lib/frprep/TreetaggerInterface.rb +303 -0
  86. data/lib/frprep/do_parses.rb +142 -0
  87. data/lib/frprep/frprep.rb +686 -0
  88. data/lib/frprep/headz.rb +338 -0
  89. data/lib/frprep/one_parsed_file.rb +28 -0
  90. data/lib/frprep/opt_parser.rb +94 -0
  91. data/lib/frprep/ruby_class_extensions.rb +310 -0
  92. data/lib/rosy/AbstractFeatureAndExternal.rb +240 -0
  93. data/lib/rosy/DBMySQL.rb +146 -0
  94. data/lib/rosy/DBSQLite.rb +280 -0
  95. data/lib/rosy/DBTable.rb +239 -0
  96. data/lib/rosy/DBWrapper.rb +176 -0
  97. data/lib/rosy/ExternalConfigData.rb +58 -0
  98. data/lib/rosy/FailedParses.rb +130 -0
  99. data/lib/rosy/FeatureInfo.rb +242 -0
  100. data/lib/rosy/GfInduce.rb +1115 -0
  101. data/lib/rosy/GfInduceFeature.rb +148 -0
  102. data/lib/rosy/InputData.rb +294 -0
  103. data/lib/rosy/RosyConfigData.rb +115 -0
  104. data/lib/rosy/RosyConfusability.rb +338 -0
  105. data/lib/rosy/RosyEval.rb +465 -0
  106. data/lib/rosy/RosyFeatureExtractors.rb +1609 -0
  107. data/lib/rosy/RosyFeaturize.rb +280 -0
  108. data/lib/rosy/RosyInspect.rb +336 -0
  109. data/lib/rosy/RosyIterator.rb +477 -0
  110. data/lib/rosy/RosyPhase2FeatureExtractors.rb +230 -0
  111. data/lib/rosy/RosyPruning.rb +165 -0
  112. data/lib/rosy/RosyServices.rb +744 -0
  113. data/lib/rosy/RosySplit.rb +232 -0
  114. data/lib/rosy/RosyTask.rb +19 -0
  115. data/lib/rosy/RosyTest.rb +826 -0
  116. data/lib/rosy/RosyTrain.rb +232 -0
  117. data/lib/rosy/RosyTrainingTestTable.rb +786 -0
  118. data/lib/rosy/TargetsMostFrequentFrame.rb +60 -0
  119. data/lib/rosy/View.rb +418 -0
  120. data/lib/rosy/opt_parser.rb +379 -0
  121. data/lib/rosy/rosy.rb +77 -0
  122. data/lib/shalmaneser/version.rb +3 -0
  123. data/test/frprep/test_opt_parser.rb +94 -0
  124. data/test/functional/functional_test_helper.rb +40 -0
  125. data/test/functional/sample_experiment_files/fred_test.salsa.erb +122 -0
  126. data/test/functional/sample_experiment_files/fred_train.salsa.erb +135 -0
  127. data/test/functional/sample_experiment_files/prp_test.salsa.erb +138 -0
  128. data/test/functional/sample_experiment_files/prp_test.salsa.fred.standalone.erb +120 -0
  129. data/test/functional/sample_experiment_files/prp_test.salsa.rosy.standalone.erb +120 -0
  130. data/test/functional/sample_experiment_files/prp_train.salsa.erb +138 -0
  131. data/test/functional/sample_experiment_files/prp_train.salsa.fred.standalone.erb +138 -0
  132. data/test/functional/sample_experiment_files/prp_train.salsa.rosy.standalone.erb +138 -0
  133. data/test/functional/sample_experiment_files/rosy_test.salsa.erb +257 -0
  134. data/test/functional/sample_experiment_files/rosy_train.salsa.erb +259 -0
  135. data/test/functional/test_fred.rb +47 -0
  136. data/test/functional/test_frprep.rb +52 -0
  137. data/test/functional/test_rosy.rb +20 -0
  138. metadata +270 -0
@@ -0,0 +1,144 @@
1
+ # sp 28 06 04
2
+ #
3
+ # this module offers methods to extract lemma corpora from the FrameNet database
4
+
5
+ require 'FrameXML'
6
+
7
class FNDatabase

  # Fundamental access function to FrameXML files:
  # yields every FrameXMLSentence that matches sent_pred and whose
  # enclosing FrameXMLFile (reachable via get_file_obj) matches
  # file_pred. Annotation is re-verified before the sentence is yielded.
  def each_matching_sentence(file_pred, sent_pred)
    each_matching_file(file_pred) { |frame_net_file|
      frame_net_file.each_sentence { |frame_net_sent|
        if sent_pred.call(frame_net_sent)
          frame_net_sent.verify_annotation
          yield frame_net_sent
        end
      }
    }
  end

  # Fundamental access function to FrameXML files:
  # yields every FrameXMLFile object matching file_pred.
  # Each file object is closed after the block returns.
  def each_matching_file(file_pred)
    each_framexml_file { |frame_net_file|
      if file_pred.call(frame_net_file)
        yield frame_net_file
      end
      frame_net_file.close
    }
  end

  # Write all annotated sentences of the given frame to outfile
  # (an open IO object) in CoNLL/FNTab style.
  def extract_frame(frame, outfile)
    extract_matching(Proc.new { |fnfile| fnfile.get_frame == frame }, outfile)
  end

  # Write all annotated sentences of the given lemma (lexical unit)
  # to outfile (an open IO object) in CoNLL/FNTab style.
  def extract_lemma(lemma, outfile)
    extract_matching(Proc.new { |fnfile| fnfile.get_lu == lemma }, outfile)
  end

  # Write one <frame>.tab file per frame into outdirectory, then
  # remove any output files that ended up empty.
  def extract_everything(outdirectory)
    unless outdirectory[-1, 1] == "/"
      outdirectory += "/"
    end

    outfiles = Hash.new
    each_matching_sentence(Proc.new { |fnfile| true },
                           Proc.new { |fnsent| true }) { |fnsent|
      frame = fnsent.get_file_obj.get_frame
      unless outfiles.key?(frame)
        outfiles[frame] = File.new(outdirectory + frame + ".tab", "w")
      end
      if fnsent.contains_FE_annotation_and_target
        fnsent.print_conll_style_to(outfiles[frame])
      end
    }
    # close output files
    outfiles.each_value { |file|
      file.close
    }
    # remove zero-size files
    Dir[outdirectory + "*"].each { |filename|
      if FileTest.zero?(filename)
        File.unlink(filename)
      end
    }
  end

  # fn_path: directory containing FrameNet lexical-unit XML files
  # (lu*.xml, possibly gzipped). A trailing slash is appended if missing.
  def initialize(fn_path)
    unless fn_path[-1, 1] == "/"
      fn_path += "/"
    end
    @fn = fn_path
  end

  private

  # Shared worker for extract_frame/extract_lemma: print every
  # annotated sentence of each matching file to outfile.
  def extract_matching(file_pred, outfile)
    each_matching_sentence(file_pred,
                           Proc.new { |fnsent| true }) { |fnsent|
      if fnsent.contains_FE_annotation_and_target
        fnsent.print_conll_style_to(outfile)
      end
    }
  end

  # Yield a FrameXMLFile for every lu*.xml(.gz) file below @fn.
  # Gzipped files are copied to /tmp and unpacked there first.
  # NOTE(review): /tmp is hard-coded, as in the original.
  def each_framexml_file
    # files might be zipped
    Dir[@fn + "lu*.xml.gz"].each { |gzfile|
      # fix: pass arguments separately so filenames containing spaces
      # or shell metacharacters cannot break (or inject into) the command
      Kernel.system("cp", gzfile, "/tmp/")
      Kernel.system("gunzip", "-f", "/tmp/" + File.basename(gzfile))
      # basename with the ".gz" suffix stripped is the unpacked filename
      yield FrameXMLFile.new("/tmp/" + File.basename(gzfile, ".gz"))
    }
    # or might not
    Dir[@fn + "/lu*.xml"].each { |filename|
      yield FrameXMLFile.new(filename)
    }
  end
end
144
+
@@ -0,0 +1,196 @@
1
+ ###
2
+ # FixSynSemMapping:
3
+ # Given a SalsaTigerRegXML sentence with semantic role annotation,
4
+ # simplify the mapping of semantic roles to syntactic constituents
5
+ #
6
+ # The following is lifted from the LREC06 paper on Shalmaneser:
7
+ # During preprocessing, the span of semantic roles in the training corpora is
8
+ # projected onto the output of the syntactic parser by assigning each
9
+ # role to the set of maximal constituents covering its word span.
10
+ # f the word span of a role does not coincide
11
+ # with parse tree constituents, e.g. due to misparses,
12
+ # the role is ``spread out'' across several constituents. This leads to
13
+ # idiosyncratic paths between predicate and semantic role in the parse
14
+ # tree.
15
+ #
16
+ # [The following span standardization algorithm is used to make the
17
+ # syntax-semantics mapping more uniform:]
18
+ # Given a role r that has been assigned, let N be the set of
19
+ # terminal nodes of the syntactic structure that are covered by r.
20
+ #
21
+ # Iteratively compute the maximal projection of N in the syntactic
22
+ # structure:
23
+ # 1) If n is a node such that all of n's children are in N,
24
+ # then remove n's children from N and add n instead.
25
+ # 2) If n is a node with 3 or more children, and all of n's
26
+ # children except one are in N, then remove n's children from N
27
+ # and add n instead.
28
+ # 3) If n is an NP with 2 children, and one of them, another NP,
29
+ # is in N, and the other, a relative clause, is not, then remove
30
+ # n's children from N and add n instead.
31
+ #
32
+ # If none of the rules is applicable to N anymore, assign r to the
33
+ # nodes in N.
34
+ #
35
+ # Rule 1 implements normal maximal projection. Rule 2 ``repairs'' parser
36
+ # errors where all children of a node but one have been assigned the
37
+ # same role. Rule 3 addresses a problem of the FrameNet data, where
38
+ # relative clauses have been omitted from roles assigned to NPs.
39
+
40
+ # KE Feb 08: rule 3 currently out of commission!
41
+
42
+ require "frprep/SalsaTigerRegXML"
43
+
44
module FixSynSemMapping
  ##
  # Simplify the mapping of semantic roles (FEs) to syntactic
  # constituents in a SalsaTigerSentence. Which repairs run is
  # controlled by two experiment-file settings:
  #
  # fe_syn_repair:
  #   If there is a node that would be a max. constituent for the
  #   words covered by the given FE, except that it has one child
  #   whose words are not in the FE, use the node as max constituent
  #   anyway (repairs parser attachment choices that differ from the
  #   gold annotation).
  #
  # fe_rel_repair:
  #   If the last syn node of an FE looks like a relative pronoun
  #   (WDT / WP / WP$ / WRB), reduce the FE to just that node.
  def self.fixit(sent,              # SalsaTigerSentence object
                 exp,               # experiment file object
                 interpreter_class) # SynInterpreter class

    # nothing to do unless at least one repair is switched on
    return unless exp.get("fe_syn_repair") || exp.get("fe_rel_repair")
    return if sent.nil?

    # "repair" FEs:
    sent.each_frame do |frame|
      frame.each_child do |fe_or_target|
        # repair only if the FE currently points to more than one syn node
        next if fe_or_target.children.length < 2

        if exp.get("fe_rel_repair")
          last_syn = fe_or_target.children.last
          if last_syn && interpreter_class.simplified_pt(last_syn) =~ /^(WDT)|(WP\$?)|(WRB)/
            # detach every syn node the FE currently points to ...
            assigned = fe_or_target.children
            assigned.each do |child|
              fe_or_target.remove_child(child)
            end
            # ... and keep only the relative pronoun
            fe_or_target.add_child(last_syn)
          end
        end

        if exp.get("fe_syn_repair")
          # detach the current syn nodes, then recompute maximal
          # constituents over the terminals they covered
          detached = fe_or_target.children
          detached.each do |child|
            fe_or_target.remove_child(child)
          end

          covered_terminals = detached.map { |node| node.yield_nodes }.flatten.uniq
          new_syn_nodes = interpreter_class.max_constituents(covered_terminals,
                                                             sent,
                                                             exp.get("fe_syn_repair"))

          # make the FE point to the recomputed nodes
          new_syn_nodes.each do |syn_node|
            fe_or_target.add_child(syn_node)
          end
        end
      end # each FE
    end # each frame
  end # def fixit
end # module
124
+
125
+
126
+ #########3
127
+ # old code
128
+
129
+ # if exp.get("fe_rel_repair")
130
+ # # repair relative clauses:
131
+ # # then make a procedure to pass on to max constituents
132
+ # # that will recognize the relevant cases
133
+
134
+ # accept_anyway_proc = Proc.new { |node, children_in, children_out|
135
+
136
+ # # node: SynNode
137
+ # # children_in, children_out: array:SynNode. children_in are the children
138
+ # # that are already covered by the FE, children_out the ones that aren't
139
+
140
+ # # if node is an NP,
141
+ # # and only one of its children is out,
142
+ # # and one node in children_in is an NP, and the missing child is an SBAR
143
+ # # with a child that is a relative pronoun, then consider the child in children_out as covered
144
+ # if interpreter_class.category(node) == "noun" and
145
+ # children_out.length() == 1 and
146
+ # children_in.select { |n| interpreter_class.category(n) == "noun" } and
147
+ # interpreter_class.category(children_out.first) == "sent" and
148
+ # (ch = children_out.first.children) and
149
+ # ch.select { |n| interpreter_class.relative_pronoun?(n) }
150
+ # true
151
+ # else
152
+ # false
153
+ # end
154
+ # }
155
+
156
+ # else
157
+ # accept_anyway_proc = nil
158
+ # end
159
+
160
+
161
+ # # "repair" FEs:
162
+ # sent.each_frame { |frame|
163
+
164
+ # frame.each_child { |fe_or_target|
165
+
166
+ # # repair only if the FE currently
167
+ # # points to more than one syn node, or
168
+ # # if it is a noun with a non-covered sentence sister
169
+ # if fe_or_target.children.length() > 1 or
170
+ # (exp.get("fe_rel_repair") and (curr_marked = fe_or_target.children.first()) and
171
+ # interpreter_class.category(curr_marked) == "noun" and
172
+ # (p = curr_marked.parent) and
173
+ # p.children.select { |n| n != curr_marked and interpreter_class.category(n) == "sent" } )
174
+
175
+ # # remember nodes covered by the FE
176
+ # old_fe_syn = fe_or_target.children()
177
+
178
+ # # remove syn nodes that the FE points to
179
+ # old_fe_syn.each { |child|
180
+ # fe_or_target.remove_child(child)
181
+ # }
182
+
183
+ # # and recompute
184
+ # new_fe_syn = interpreter_class.max_constituents(old_fe_syn.map { |t| t.yield_nodes}.flatten.uniq,
185
+ # sent,
186
+ # exp.get("fe_syn_repair"),
187
+ # accept_anyway_proc)
188
+
189
+ # # make the FE point to the new nodes
190
+ # new_fe_syn.each { |syn_node|
191
+ # fe_or_target.add_child(syn_node)
192
+ # }
193
+
194
+ # end # if FE points to more than one syn node
195
+ # } # each FE
196
+ # } # each frame
@@ -0,0 +1,66 @@
1
+ # FPrepConfigData
2
+ # Katrin Erk July 05
3
+ #
4
+ # Preprocessing for Fred and Rosy:
5
+ # access to a configuration and experiment description file
6
+
7
+ require "frprep/ConfigData"
8
+
9
+ ##############################
10
+ # Class FrPrepConfigData
11
+ #
12
+ # inherits from ConfigData,
13
+ # sets variable names appropriate to preprocessing task
14
+
15
##############################
# Class FrPrepConfigData
#
# ConfigData subclass for the preprocessing (frprep) task:
# declares the set of recognized experiment-file parameters and
# their types; no variables are interpolated.
class FrPrepConfigData < ConfigData
  def initialize(filename)
    # known parameters: name => type
    params = {
      "prep_experiment_ID" => "string", # experiment identifier

      "frprep_directory" => "string", # dir for frprep internal data

      # information about the dataset
      "language" => "string", # en, de
      "origin"=> "string",    # FrameNet, Salsa, or nothing
      "format" => "string",   # Plain, SalsaTab, FNXml, FNCorpusXml, SalsaTigerXML
      "encoding" => "string", # utf8, iso, hex, or nothing

      # directories
      "directory_input" => "string",        # dir with input data
      "directory_preprocessed" => "string", # dir with output Salsa/Tiger XML data
      "directory_parserout" => "string",    # dir with parser output for the parser named below

      # syntactic processing
      "pos_tagger" => "string",           # name of POS tagger
      "lemmatizer" => "string",           # name of lemmatizer
      "parser" => "string",               # name of parser
      "pos_tagger_path" => "string",      # path to POS tagger
      "lemmatizer_path" => "string",      # path to lemmatizer
      "parser_path" => "string",          # path to parser
      "parser_max_sent_num" => "integer", # max number of sentences per parser input file
      "parser_max_sent_len" => "integer", # max sentence length the parser handles

      "do_parse" => "bool",     # use parser?
      "do_lemmatize" => "bool", # use lemmatizer?
      "do_postag" => "bool",    # use POS tagger?

      # output format: if tabformat_output == true,
      # output in Tab format rather than Salsa/Tiger XML
      # (this will not work if do_parse == true)
      "tabformat_output" => "bool",

      # syntactic repairs, dependent on existing semantic role annotation
      "fe_syn_repair" => "bool", # map words to constituents for FEs: idealize?
      "fe_rel_repair" => "bool"  # FEs: include non-included relative clauses into FEs
    }

    # initialize config data object; no variables
    super(filename, params, [])
  end
end
64
+
65
+
66
+
@@ -0,0 +1,513 @@
1
+ # sp 18 06 2004
2
+ #
3
+ # access to FrameNet XML files, sentences, and annotation.
4
+ #
5
+ # sp 10 11 04: only data from the first layer with name XY is
6
+ # used for output. Other data is saved in layer XY.2nd, but is
7
+ # currently not processed.
8
+ #
9
+ # sp 22 05 04: also, if two labels exist which cover the same span
10
+ # (ie there is a double annotation within the same layer), ignore
11
+ # all but the first label.
12
+ #
13
+ # ke 13 07 05:
14
+ # - changed to RegXMl.rb
15
+ # - fixed two problems in analyse_layer:
16
+ # - Deleting problematic labels:
17
+ # For some reason, thisLayer[i+1..-1].each_index {|other_i|
18
+ # included the index 0 in any case, resulting in the 1st
19
+ # label being deleted in any case.
20
+ # - Deleting problematic labels, checking for label overlap:
21
+ # The old formulation worked only if labels occurred in the array
22
+ # in the order they occurred in the sentence, but that was not the case.
23
+ # - Change in deleting problematic labels:
24
+ # No longer delete duplicate labels, since e.g. in the PT level there
25
+ # may be more than one NP label, and we want to keep those
26
+ #
27
+ # KE January 2007:
28
+ # write new adapted FNTab format
29
+ # ( "word", ("pt", "gf", "role", "target", "frame", "stuff")* "ne", "sent_id" )
30
+
31
+ require 'Ampersand'
32
+ require 'ISO-8859-1'
33
+ require 'RegXML'
34
+
35
class FrameXMLFile # only verified to work for FrameNet v1.1

  # Parses the header of a FrameNet lexical-unit XML file.
  # The <lexunit> element must appear within the first three lines
  # (FrameNet v1.1 layout); otherwise an error is printed and the
  # process exits (behavior kept from the original).
  #
  # Header data extracted: @id (lexunit ID), @lu (lemma),
  # @pos (part of speech, upcased), @frame (frame name).
  def initialize(filename)
    @filename = filename
    file = File.new(filename)
    string = nil
    begin
      counter = 0
      line = nil
      while true
        counter += 1
        line = file.gets
        if line =~ /<lexunit/
          break
        end
        if counter > 3
          STDERR.puts "Error: File "+filename+" does not conform to FrameNet v1.1 standard (lexunit in 3rd line)"
          Kernel.exit
        end
      end
      # found lexunit: slurp the rest of the file
      string = line
      while (line = file.gets())
        string << line
      end
    ensure
      # fix: the original leaked this handle (#close below is a no-op)
      file.close
    end

    @lexunit = RegXML.new(string)
    attributes = @lexunit.attributes()
    @id = attributes["ID"]
    # fix: dot escaped -- the name attribute has the form "lemma.pos";
    # guarded so a malformed name triggers the raise below instead of
    # an accidental NoMethodError on nil
    if attributes["name"] =~ /^([^.]+)\.([^.]+)$/
      @lu = $1
      @pos = $2.upcase
    end
    if @lu.nil?
      raise "[framexml] no lemma in header of file #{@filename}"
    elsif @pos.nil?
      raise "[framexml] no pos in header of file #{@filename}"
    end
    @frame = attributes["frame"]
  end

  # lemma with spaces replaced by underscores
  def get_lu
    return @lu.gsub(" ","_")
  end

  # lexical-unit ID from the file header
  def get_lu_id
    return @id
  end

  def get_filename
    return @filename
  end

  # part of speech, upcased
  def get_pos
    return @pos
  end

  # frame name from the file header
  def get_frame
    return @frame
  end

  # kept for interface compatibility: the underlying file handle
  # is already closed at the end of initialize
  def close
  end

  # Yield a FrameXMLSentence for every <annotationSet> element in
  # every subcorpus of this lexical unit.
  def each_sentence
    @lexunit.children_and_text().each { |subcorpus|
      subcorpus.children_and_text().each { |annotationSet|
        if annotationSet.name == "annotationSet"
          # sentence found
          yield FrameXMLSentence.new(annotationSet,self)
        end
      }
    }
  end
end
105
+
106
class FrameXMLSentence

  # annotationSet: RegXML element for one <annotationSet>
  # file_obj:      the enclosing FrameXMLFile
  #
  # Parses the sentence text and all annotation layers, then builds
  # the character-offset <-> word-index bookkeeping used by the
  # other methods.
  def initialize(annotationSet, file_obj)
    @file_obj = file_obj

    # layers: hash layer_name -> array of [name, start, stop]
    #   name:  name of the element, string
    #   start: start character, integer
    #   stop:  end character, integer
    @layers = {}

    annotationSet.children_and_text.each do |elt|
      case elt.name
      when "sentence"
        # sentence: has ID, its child is <text>[text]</text>
        @sent_id = elt.attributes["ID"]
        text_elt = elt.children_and_text.detect { |child| child.name == "text" }
        if text_elt
          # found the text element. its only child should be the text
          @orig_text = text_elt.children_and_text.detect { |child| child.text? }
          # take text out of its RegXML wrapper
          @orig_text = @orig_text.to_s if @orig_text
        end

      when "layers"
        # contains annotation layers
        elt.children_and_text.each do |layer|
          # skip additional non-layer material
          next unless layer.name == "layer"

          layer_name = layer.attributes["name"]
          raise "layer without a name" unless layer_name
          # only the first layer of a given name is kept
          unless @layers.key?(layer_name)
            @layers[layer_name] = analyse_layer(layer, layer_name)
          end
        end
      end
    end

    # text with special characters replaced by iso8859 characters
    @pos_text = UtfIso.to_iso_8859_1(@orig_text).split(" ")
    # text with special characters replaced by &...; sequences
    @text = Ampersand.utf8_to_hex(@orig_text).split(" ")

    # all text and pos_text have the same number of elements!
    @start_is = {}  # map char indices (start of words) onto word indices
    @stop_is = {}   # map char indices (end of words) onto word indices
    @charidx = []   # maps word indices on [start,stop]

    # positions where a run of two or more whitespace chars begins
    @double_space = []
    pos = 0
    while (match = @orig_text.index(/(\s\s+)/, pos))
      @double_space << match
      pos = match + 1
    end

    # fill start, stop and charidx arrays
    char_i = 0
    @pos_text.each_index do |word_i|
      @start_is[char_i] = word_i
      startchar = char_i
      char_i += our_length(@pos_text[word_i])
      @stop_is[char_i - 1] = word_i
      stopchar = char_i - 1

      @charidx << [startchar, stopchar]

      # separators: a double space counts as two characters
      char_i += @double_space.include?(char_i) ? 2 : 1
    end
  end

  # the enclosing FrameXMLFile
  def get_file_obj
    @file_obj
  end

  # the sentence ID attribute from the XML
  def get_sent_id
    @sent_id
  end

  # NOTE(review): looks broken -- @id is never assigned and @text is
  # an Array, so this would raise; kept unchanged since nothing in
  # this file calls it.
  def print_text
    puts "("+@id+ ")\t"+@text
  end

  # true iff the sentence has a real Target plus an FE layer.
  # NOTE(review): the error branch references an undefined local
  # `filename`; kept unchanged.
  def contains_FE_annotation_and_target
    target_info = @layers["Target"][0]
    unless target_info[0] == "Target"
      STDERR.puts "Error in sentence from "+filename+": No target" # strictly speaking, no target at pos 0 in @layers["Target"]
      STDERR.puts "Sentence: "+@text
      return false
    else
      return (@layers.key?("FE") and target_info[2] != 0)
    end
  end

  # Snap layer spans to word boundaries -- only for the interesting
  # layers (FE, GF, PT, Target); weird stuff on e.g. the Noun or Adj
  # layer is left alone. Returns true if some change has taken place.
  def verify_annotation
    change = false
    @layers.each_pair do |layername, l|
      # only verify the "important" layers
      next unless ["FE", "GF", "PT", "Target"].include?(layername)

      l.each_index do |i|
        element, start, stop = l[i]

        newstart = start
        newstop = stop

        @charidx.each_index do |j|
          pstartidx, pstopidx = @charidx[j - 1] unless j == 0
          startidx, stopidx = @charidx[j]

          # start inside a word, or between two words: snap to word start
          if (start > startidx && start <= stopidx) ||
             (j != 0 && start > pstopidx && start < startidx)
            newstart = startidx
          end

          # stop inside a word: snap to word end;
          # stop between words: snap to previous word end
          if stop >= startidx && stop < stopidx
            newstop = stopidx
          elsif j != 0 && stop > pstopidx && stop < startidx
            newstop = pstopidx
          end
        end

        next unless start != newstart || stop != newstop

        change = true
        @layers[layername][i] = [element, newstart, newstop]
        STDERR.puts "Heuristics has changed element "+element+" from ["+[start,stop].join(",")+"] to ["+[newstart,newstop].join(",")+"] in file "+@file_obj.get_filename+"."
        markable_as_string(layername, element).each do |string|
          STDERR.puts "New markable: "+string
        end
        STDERR.puts "Sentence: "+@pos_text.join(" ")
        puts
      end
    end
    change
  end

  # print the sentence in FNTab format to STDOUT
  def print_conll_style
    print_conll_style_to(STDOUT)
  end

  # Write the sentence in the adapted FNTab format to the IO object
  # out, one line per word:
  # ( "word", ("pt", "gf", "role", "target", "frame", "stuff")* "ne", "sent_id" )
  #
  # Even though in principle there might be multiple labels for one
  # span (i.e. in one value of the gf/fe/pt hashes), only one is ever
  # recorded (see add_to_hash).
  def print_conll_style_to(out)
    gf = {}
    add_all_to_hash(gf, "GF")
    fe = {}
    add_all_to_hash(fe, "FE")
    pt = {}
    add_all_to_hash(pt, "PT")
    target = {}
    add_all_to_hash(target, "Target")

    in_target = false

    @pos_text.each_index do |i|
      line = []
      # word
      word = @pos_text[i]
      line << word

      start, stop = @charidx[i]

      # "pt", "gf", "role": B-/E- markers for spans starting or
      # stopping at this word; entries are consumed (deleted) here
      [pt, gf, fe].each do |hash|
        token = []
        if hash.key?([start, "start"])
          hash.delete([start, "start"]).each do |element|
            token << "B-"+element
          end
        end
        if hash.key?([stop, "stop"])
          hash.delete([stop, "stop"]).each do |element|
            token << "E-"+element
          end
        end
        line << (token.empty? ? "-" : token.sort.join(":"))
      end

      # "target"
      if target.key?([start, "start"])
        target.delete([start, "start"])
        in_target = true
      end
      line << (in_target ? @file_obj.get_lu+"."+@file_obj.get_pos : "-")
      if target.key?([stop, "stop"])
        target.delete([stop, "stop"])
        in_target = false
      end

      # "frame"
      line << @file_obj.get_frame

      # "stuff", "ne"
      line << "-"
      line << "-"

      # "sent_id"
      line << @file_obj.get_lu_id+"-"+@sent_id

      out.puts line.join("\t")
    end

    out.puts

    # every recorded span must have been consumed above
    [gf, fe, pt, target].each do |hash|
      unless hash.empty?
        STDERR.puts @file_obj.get_filename
        raise "**** Error: Hash not empty after creation of Sentence in CoNLL-Format (could not find matching words for some markup element)!"
      end
    end
  end

  # debugging dump of all layers
  def print_layers
    @layers.each do |ln, l|
      puts "Layer "+ln+":"
      l.each do |element, start, stop|
        puts "\t"+element+": "+start.to_s+" -- "+stop.to_s
      end
      puts "***"
    end
  end

  private

  # length of a word where each &...; entity counts as one character
  def our_length(string)
    string.gsub(/&(.+?);/, "X").length
  end

  # true iff the FE layer contains a label with this name
  def is_fe(fename)
    @layers["FE"].any? { |name, _start, _stop| name == fename }
  end

  # Returns an array of string renderings (covered words plus span
  # info) of all markables with this name in the given layer.
  def markable_as_string(layername, markup_name)
    result = []

    @layers[layername].each do |name, start, stop|
      next unless markup_name == name

      fe = []
      infe = false
      @charidx.each_index do |i|
        startidx, stopidx = @charidx[i]
        infe = true if startidx == start
        fe << @pos_text[i] if infe
        if stopidx == stop
          result << (fe.join(" ")+"["+start.to_s+","+stop.to_s+", VERIFIED]")
          break
        elsif stopidx > stop
          result << (fe.join(" ")+"["+start.to_s+","+stop.to_s+",ERROR]")
          break
        end
      end
    end
    result
  end

  # Record name under key unless the key is already taken; returns
  # whether the key already existed. An existing key keeps its old
  # value, so each key ever holds exactly one element.
  def add_to_hash(hash, key, name)
    exists = hash.key?(key)
    unless exists
      hash[key] = []
      hash[key] << name
    end
    exists
  end

  # Enter all [start, stop] boundaries of the given layer into hash;
  # warns and keeps only the first element when two start at the same
  # position.
  def add_all_to_hash(hash, layername)
    # use "uniq" to remove wrong double annotations
    @layers[layername].uniq.each do |element, start, stop|
      exists = add_to_hash(hash, [start, "start"], element)
      if exists
        STDERR.puts "Warning ["+@file_obj.get_filename+"]: In layer "+layername+", two elements start at position "+start.to_s+". Only using first. Layer as read from FrameXML: "+@layers[layername].map {|element,start,stop| element+" ("+start.to_s+","+stop.to_s+")"}.join(" ")
      else
        add_to_hash(hash, [stop, "stop"], element)
      end
    end
  end

  # Read one <layer> element and return its labels as an array of
  # [name, start, stop] triples, dropping null-instantiated labels
  # and resolving overlaps (the later label loses).
  # NOTE(review): the error branch references an undefined local
  # `line`; kept unchanged.
  def analyse_layer(layer_elt, name)
    if name.nil?
      STDERR.puts "Error: layer line "+line+" with empty name."
    end

    # collected labels: [name(string), start(integer), end(integer)]
    labels = []

    labels_elt = layer_elt.children_and_text.detect { |child| child.name == "labels" }
    # no labels found: return the empty array
    return labels unless labels_elt

    labels_elt.children_and_text.each do |label|
      # skip other markup
      next unless label.name == "label"

      attributes = label.attributes
      # null instantiation, don't retain
      next if attributes["itype"]
      # no start and end labels
      next if !attributes["start"] && !attributes["end"]

      labels << [attributes["name"], attributes["start"].to_i, attributes["end"].to_i]
    end

    # sanity check: remove overlapping labels
    delete_flags = {}  # index -> true if the label is to be dropped

    retv = []
    labels.each_index do |i|
      # efficiency: skip labels already marked for deletion
      next if delete_flags[i]

      this_label, this_from, this_to = labels[i]

      # compare with all remaining labels
      ((i + 1)..(labels.length - 1)).each do |other_i|
        other_label, other_from, other_to = labels[other_i]

        # overlap? Throw out the later FE
        if this_from <= other_from && other_from <= this_to
          $stderr.puts "Warning: Label overlap, deleting #{other_label}"
          delete_flags[other_i] = true
        elsif this_from <= other_to && other_to <= this_to
          $stderr.puts "Warning: Label overlap, deleting #{this_label}"
          delete_flags[i] = true
        end
      end

      # matched with all other labels; keep unless marked above
      retv << labels[i] unless delete_flags[i]
    end

    retv
  end
end