frprep 0.0.1.prealpha

Sign up to get free protection for your applications and to get access to all the features.
Files changed (138) hide show
  1. data/.yardopts +8 -0
  2. data/CHANGELOG.rdoc +0 -0
  3. data/LICENSE.rdoc +0 -0
  4. data/README.rdoc +0 -0
  5. data/lib/common/AbstractSynInterface.rb +1227 -0
  6. data/lib/common/BerkeleyInterface.rb +375 -0
  7. data/lib/common/CollinsInterface.rb +1165 -0
  8. data/lib/common/ConfigData.rb +694 -0
  9. data/lib/common/Counter.rb +18 -0
  10. data/lib/common/DBInterface.rb +48 -0
  11. data/lib/common/EnduserMode.rb +27 -0
  12. data/lib/common/Eval.rb +480 -0
  13. data/lib/common/FixSynSemMapping.rb +196 -0
  14. data/lib/common/FrPrepConfigData.rb +66 -0
  15. data/lib/common/FrprepHelper.rb +1324 -0
  16. data/lib/common/Graph.rb +345 -0
  17. data/lib/common/ISO-8859-1.rb +24 -0
  18. data/lib/common/ML.rb +186 -0
  19. data/lib/common/Maxent.rb +215 -0
  20. data/lib/common/MiniparInterface.rb +1388 -0
  21. data/lib/common/Optimise.rb +195 -0
  22. data/lib/common/Parser.rb +213 -0
  23. data/lib/common/RegXML.rb +269 -0
  24. data/lib/common/RosyConventions.rb +171 -0
  25. data/lib/common/SQLQuery.rb +243 -0
  26. data/lib/common/STXmlTerminalOrder.rb +194 -0
  27. data/lib/common/SalsaTigerRegXML.rb +2347 -0
  28. data/lib/common/SalsaTigerXMLHelper.rb +99 -0
  29. data/lib/common/SleepyInterface.rb +384 -0
  30. data/lib/common/SynInterfaces.rb +275 -0
  31. data/lib/common/TabFormat.rb +720 -0
  32. data/lib/common/Tiger.rb +1448 -0
  33. data/lib/common/TntInterface.rb +44 -0
  34. data/lib/common/Tree.rb +61 -0
  35. data/lib/common/TreetaggerInterface.rb +303 -0
  36. data/lib/common/headz.rb +338 -0
  37. data/lib/common/option_parser.rb +13 -0
  38. data/lib/common/ruby_class_extensions.rb +310 -0
  39. data/lib/fred/Baseline.rb +150 -0
  40. data/lib/fred/FileZipped.rb +31 -0
  41. data/lib/fred/FredBOWContext.rb +863 -0
  42. data/lib/fred/FredConfigData.rb +182 -0
  43. data/lib/fred/FredConventions.rb +232 -0
  44. data/lib/fred/FredDetermineTargets.rb +324 -0
  45. data/lib/fred/FredEval.rb +312 -0
  46. data/lib/fred/FredFeatureExtractors.rb +321 -0
  47. data/lib/fred/FredFeatures.rb +1061 -0
  48. data/lib/fred/FredFeaturize.rb +596 -0
  49. data/lib/fred/FredNumTrainingSenses.rb +27 -0
  50. data/lib/fred/FredParameters.rb +402 -0
  51. data/lib/fred/FredSplit.rb +84 -0
  52. data/lib/fred/FredSplitPkg.rb +180 -0
  53. data/lib/fred/FredTest.rb +607 -0
  54. data/lib/fred/FredTrain.rb +144 -0
  55. data/lib/fred/PlotAndREval.rb +480 -0
  56. data/lib/fred/fred.rb +45 -0
  57. data/lib/fred/md5.rb +23 -0
  58. data/lib/fred/opt_parser.rb +250 -0
  59. data/lib/frprep/AbstractSynInterface.rb +1227 -0
  60. data/lib/frprep/Ampersand.rb +37 -0
  61. data/lib/frprep/BerkeleyInterface.rb +375 -0
  62. data/lib/frprep/CollinsInterface.rb +1165 -0
  63. data/lib/frprep/ConfigData.rb +694 -0
  64. data/lib/frprep/Counter.rb +18 -0
  65. data/lib/frprep/FNCorpusXML.rb +643 -0
  66. data/lib/frprep/FNDatabase.rb +144 -0
  67. data/lib/frprep/FixSynSemMapping.rb +196 -0
  68. data/lib/frprep/FrPrepConfigData.rb +66 -0
  69. data/lib/frprep/FrameXML.rb +513 -0
  70. data/lib/frprep/FrprepHelper.rb +1324 -0
  71. data/lib/frprep/Graph.rb +345 -0
  72. data/lib/frprep/ISO-8859-1.rb +24 -0
  73. data/lib/frprep/MiniparInterface.rb +1388 -0
  74. data/lib/frprep/Parser.rb +213 -0
  75. data/lib/frprep/RegXML.rb +269 -0
  76. data/lib/frprep/STXmlTerminalOrder.rb +194 -0
  77. data/lib/frprep/SalsaTigerRegXML.rb +2347 -0
  78. data/lib/frprep/SalsaTigerXMLHelper.rb +99 -0
  79. data/lib/frprep/SleepyInterface.rb +384 -0
  80. data/lib/frprep/SynInterfaces.rb +275 -0
  81. data/lib/frprep/TabFormat.rb +720 -0
  82. data/lib/frprep/Tiger.rb +1448 -0
  83. data/lib/frprep/TntInterface.rb +44 -0
  84. data/lib/frprep/Tree.rb +61 -0
  85. data/lib/frprep/TreetaggerInterface.rb +303 -0
  86. data/lib/frprep/do_parses.rb +142 -0
  87. data/lib/frprep/frprep.rb +686 -0
  88. data/lib/frprep/headz.rb +338 -0
  89. data/lib/frprep/one_parsed_file.rb +28 -0
  90. data/lib/frprep/opt_parser.rb +94 -0
  91. data/lib/frprep/ruby_class_extensions.rb +310 -0
  92. data/lib/rosy/AbstractFeatureAndExternal.rb +240 -0
  93. data/lib/rosy/DBMySQL.rb +146 -0
  94. data/lib/rosy/DBSQLite.rb +280 -0
  95. data/lib/rosy/DBTable.rb +239 -0
  96. data/lib/rosy/DBWrapper.rb +176 -0
  97. data/lib/rosy/ExternalConfigData.rb +58 -0
  98. data/lib/rosy/FailedParses.rb +130 -0
  99. data/lib/rosy/FeatureInfo.rb +242 -0
  100. data/lib/rosy/GfInduce.rb +1115 -0
  101. data/lib/rosy/GfInduceFeature.rb +148 -0
  102. data/lib/rosy/InputData.rb +294 -0
  103. data/lib/rosy/RosyConfigData.rb +115 -0
  104. data/lib/rosy/RosyConfusability.rb +338 -0
  105. data/lib/rosy/RosyEval.rb +465 -0
  106. data/lib/rosy/RosyFeatureExtractors.rb +1609 -0
  107. data/lib/rosy/RosyFeaturize.rb +280 -0
  108. data/lib/rosy/RosyInspect.rb +336 -0
  109. data/lib/rosy/RosyIterator.rb +477 -0
  110. data/lib/rosy/RosyPhase2FeatureExtractors.rb +230 -0
  111. data/lib/rosy/RosyPruning.rb +165 -0
  112. data/lib/rosy/RosyServices.rb +744 -0
  113. data/lib/rosy/RosySplit.rb +232 -0
  114. data/lib/rosy/RosyTask.rb +19 -0
  115. data/lib/rosy/RosyTest.rb +826 -0
  116. data/lib/rosy/RosyTrain.rb +232 -0
  117. data/lib/rosy/RosyTrainingTestTable.rb +786 -0
  118. data/lib/rosy/TargetsMostFrequentFrame.rb +60 -0
  119. data/lib/rosy/View.rb +418 -0
  120. data/lib/rosy/opt_parser.rb +379 -0
  121. data/lib/rosy/rosy.rb +77 -0
  122. data/lib/shalmaneser/version.rb +3 -0
  123. data/test/frprep/test_opt_parser.rb +94 -0
  124. data/test/functional/functional_test_helper.rb +40 -0
  125. data/test/functional/sample_experiment_files/fred_test.salsa.erb +122 -0
  126. data/test/functional/sample_experiment_files/fred_train.salsa.erb +135 -0
  127. data/test/functional/sample_experiment_files/prp_test.salsa.erb +138 -0
  128. data/test/functional/sample_experiment_files/prp_test.salsa.fred.standalone.erb +120 -0
  129. data/test/functional/sample_experiment_files/prp_test.salsa.rosy.standalone.erb +120 -0
  130. data/test/functional/sample_experiment_files/prp_train.salsa.erb +138 -0
  131. data/test/functional/sample_experiment_files/prp_train.salsa.fred.standalone.erb +138 -0
  132. data/test/functional/sample_experiment_files/prp_train.salsa.rosy.standalone.erb +138 -0
  133. data/test/functional/sample_experiment_files/rosy_test.salsa.erb +257 -0
  134. data/test/functional/sample_experiment_files/rosy_train.salsa.erb +259 -0
  135. data/test/functional/test_fred.rb +47 -0
  136. data/test/functional/test_frprep.rb +52 -0
  137. data/test/functional/test_rosy.rb +20 -0
  138. metadata +270 -0
@@ -0,0 +1,144 @@
1
+ # sp 28 06 04
2
+ #
3
+ # this module offers methods to extract gemma corpora from the FrameNet database
4
+
5
+ require 'FrameXML'
6
+
7
+ class FNDatabase
8
+
9
+ def each_matching_sentence(file_pred,sent_pred)
10
+ # fundamental access function to FrameXML files
11
+
12
+ # returns file objects where
13
+ # FrameXMLSentence matches sent_pred
14
+ # (FrameXMLFile is accessed through FrameXMLSentence.get_file_object and matches file_pred)
15
+ each_matching_file(file_pred) {|frameNetFile|
16
+ frameNetFile.each_sentence {|frameNetSent|
17
+ if sent_pred.call(frameNetSent)
18
+ frameNetSent.verify_annotation
19
+ yield frameNetSent
20
+ end
21
+ }
22
+ }
23
+ end
24
+
25
+ def each_matching_file(file_pred)
26
+ # fundamental access function to FrameXML files
27
+
28
+ # returns file (FrameXMLFile) objects which match file_pred
29
+ each_framexml_file{|frameNetFile|
30
+ if file_pred.call(frameNetFile)
31
+ yield frameNetFile
32
+ end
33
+ frameNetFile.close
34
+ }
35
+ end
36
+
37
+ def extract_frame(frame,outfile)
38
+ each_matching_sentence(Proc.new{|fnfile| fnfile.get_frame == frame},
39
+ Proc.new{|fnsent| true}) {|fnsent|
40
+ if fnsent.contains_FE_annotation_and_target
41
+ fnsent.print_conll_style_to(outfile)
42
+ end
43
+ }
44
+ end
45
+
46
+ def extract_lemma(lemma,outfile)
47
+ each_matching_sentence(Proc.new{|fnfile| fnfile.get_lu == lemma},
48
+ Proc.new{|fnsent| true}) {|fnsent|
49
+ if fnsent.contains_FE_annotation_and_target
50
+ fnsent.print_conll_style_to(outfile)
51
+ end
52
+ }
53
+ end
54
+
55
+ def extract_everything(outdirectory)
56
+ unless outdirectory[-1,1] == "/"
57
+ outdirectory += "/"
58
+ end
59
+
60
+ outfiles = Hash.new
61
+ each_matching_sentence(Proc.new{|fnfile| true},
62
+ Proc.new{|fnsent| true}) {|fnsent|
63
+ frame = fnsent.get_file_obj.get_frame
64
+ unless outfiles.key?(frame)
65
+ outfiles[frame] = File.new(outdirectory+frame+".tab","w")
66
+ end
67
+ if fnsent.contains_FE_annotation_and_target
68
+ fnsent.print_conll_style_to(outfiles[frame])
69
+ end
70
+ }
71
+ # close output files
72
+ outfiles.each_value {|file|
73
+ file.close
74
+ }
75
+ # remove zero-size files
76
+ Dir[outdirectory+"*"].each {|filename|
77
+ if FileTest.zero?(filename)
78
+ File.unlink(filename)
79
+ end
80
+ }
81
+ end
82
+
83
+
84
+ def initialize(fn_path)
85
+ unless fn_path[-1,1] == "/"
86
+ fn_path += "/"
87
+ end
88
+ @fn = fn_path
89
+ end
90
+
91
+ private
92
+
93
+ def each_framexml_file
94
+ # files might be zipped
95
+ Dir[@fn+"lu*.xml.gz"].each {|gzfile|
96
+ Kernel.system("cp "+gzfile+" /tmp/")
97
+ Kernel.system("gunzip -f /tmp/"+File.basename(gzfile))
98
+ gzfile =~ /(.+)\.gz/
99
+ yield FrameXMLFile.new("/tmp/"+File.basename($1))
100
+ }
101
+ # or might not
102
+ Dir[@fn+"/lu*.xml"].each {|filename|
103
+ yield FrameXMLFile.new(filename)
104
+ }
105
+ end
106
+
107
+ # I don't really remember what this was good for ;-)
108
+
109
+ # def browse_everything(allFiles)
110
+ # if allFiles
111
+ # Dir[fn+"*.xml.gz"].each {|gzfile|
112
+ # Kernel.system("cp "+gzfile+" /tmp/")
113
+ # Kernel.system("gunzip -f /tmp/"+File.basename(gzfile))
114
+ # gzfile =~ /(.+)\.gz/
115
+ # # STDERR.puts File.basename($1)
116
+ # # STDERR.print "."
117
+ # ff = FrameXMLFile.new("/tmp/"+File.basename($1))
118
+ # ff.each_sentence {|s|
119
+ # if s.contains_FE_annotation_and_target
120
+ # s.verify_annotation
121
+ # if s.verify_annotation
122
+ # puts "****************** Error: Still problems after 2nd verification!"
123
+ # end
124
+ # s.print_conll_style
125
+ # end
126
+ # }
127
+ # }
128
+ # else
129
+ # ff = FrameXMLFile.new("/tmp/lu1870.xml")
130
+ # ff.each_sentence {|s|
131
+ # if s.contains_FE_annotation_and_target
132
+ # s.verify_annotation
133
+ # if s.verify_annotation
134
+ # puts "****************** Error: Still problems after 2nd verification!"
135
+ # end
136
+ # # s.print_layers
137
+ # s.print_conll_style
138
+ # end
139
+ # }
140
+ # end
141
+ # end
142
+
143
+ end
144
+
@@ -0,0 +1,196 @@
1
+ ###
2
+ # FixSynSemMapping:
3
+ # Given a SalsaTigerRegXML sentence with semantic role annotation,
4
+ # simplify the mapping of semantic roles to syntactic constituents
5
+ #
6
+ # The following is lifted from the LREC06 paper on Shalmaneser:
7
+ # During preprocessing, the span of semantic roles in the training corpora is
8
+ # projected onto the output of the syntactic parser by assigning each
9
+ # role to the set of maximal constituents covering its word span.
10
+ # If the word span of a role does not coincide
11
+ # with parse tree constituents, e.g. due to misparses,
12
+ # the role is ``spread out'' across several constituents. This leads to
13
+ # idiosyncratic paths between predicate and semantic role in the parse
14
+ # tree.
15
+ #
16
+ # [The following span standardization algorithm is used to make the
17
+ # syntax-semantics mapping more uniform:]
18
+ # Given a role r that has been assigned, let N be the set of
19
+ # terminal nodes of the syntactic structure that are covered by r.
20
+ #
21
+ # Iteratively compute the maximal projection of N in the syntactic
22
+ # structure:
23
+ # 1) If n is a node such that all of n's children are in N,
24
+ # then remove n's children from N and add n instead.
25
+ # 2) If n is a node with 3 or more children, and all of n's
26
+ # children except one are in N, then remove n's children from N
27
+ # and add n instead.
28
+ # 3) If n is an NP with 2 children, and one of them, another NP,
29
+ # is in N, and the other, a relative clause, is not, then remove
30
+ # n's children from N and add n instead.
31
+ #
32
+ # If none of the rules is applicable to N anymore, assign r to the
33
+ # nodes in N.
34
+ #
35
+ # Rule 1 implements normal maximal projection. Rule 2 ``repairs'' parser
36
+ # errors where all children of a node but one have been assigned the
37
+ # same role. Rule 3 addresses a problem of the FrameNet data, where
38
+ # relative clauses have been omitted from roles assigned to NPs.
39
+
40
+ # KE Feb 08: rule 3 currently out of commission!
41
+
42
+ require "frprep/SalsaTigerRegXML"
43
+
44
+ module FixSynSemMapping
45
+ ##
46
+ # fix it
47
+ #
48
+ # relevant settings in the experiment file:
49
+ #
50
+ # fe_syn_repair:
51
+ # If there is a node that would be a max. constituent for the
52
+ # words covered by the given FE, except that it has one child
53
+ # whose words are not in the FE, use the node as max constituent anyway.
54
+ # This is to repair cases where the parser has made an attachment choice
55
+ # that differs from the one in the gold annotation
56
+ #
57
+ # fe_rel_repair:
58
+ # If there is an NP such that all of its children except one have been
59
+ # assigned the same FE, and that missing child is a relative clause
60
+ # depending on one of the other children, then take the complete NP as
61
+ # that FE
62
+ def FixSynSemMapping.fixit(sent, # SalsaTigerSentence object
63
+ exp, # experiment file object
64
+ interpreter_class) # SynInterpreter class
65
+
66
+
67
+ unless exp.get("fe_syn_repair") or exp.get("fe_rel_repair")
68
+ return
69
+ end
70
+
71
+ if sent.nil?
72
+ return
73
+ end
74
+
75
+ # "repair" FEs:
76
+ sent.each_frame { |frame|
77
+
78
+ frame.each_child { |fe_or_target|
79
+
80
+ # repair only if the FE currently
81
+ # points to more than one syn node
82
+ if fe_or_target.children.length() < 2
83
+ next
84
+ end
85
+
86
+ if exp.get("fe_rel_repair")
87
+ lastfe = fe_or_target.children.last()
88
+ if lastfe and interpreter_class.simplified_pt(lastfe) =~ /^(WDT)|(WP\$?)|(WRB)/
89
+
90
+ # remove syn nodes that the FE points to
91
+ old_fe_syn = fe_or_target.children()
92
+ old_fe_syn.each { |child|
93
+ fe_or_target.remove_child(child)
94
+ }
95
+
96
+ # set it to point only to the last previous node, the relative pronoun
97
+ fe_or_target.add_child(lastfe)
98
+ end
99
+ end
100
+
101
+ if exp.get("fe_syn_repair")
102
+ # remove syn nodes that the FE points to
103
+ old_fe_syn = fe_or_target.children()
104
+ old_fe_syn.each { |child|
105
+ fe_or_target.remove_child(child)
106
+ }
107
+
108
+ # and recompute
109
+ new_fe_syn = interpreter_class.max_constituents(old_fe_syn.map { |t|
110
+ t.yield_nodes
111
+ }.flatten.uniq,
112
+ sent,
113
+ exp.get("fe_syn_repair"))
114
+
115
+ # make the FE point to the new nodes
116
+ new_fe_syn.each { |syn_node|
117
+ fe_or_target.add_child(syn_node)
118
+ }
119
+ end
120
+ } # each FE
121
+ } # each frame
122
+ end # def fixit
123
+ end # module
124
+
125
+
126
+ #########
127
+ # old code
128
+
129
+ # if exp.get("fe_rel_repair")
130
+ # # repair relative clauses:
131
+ # # then make a procedure to pass on to max constituents
132
+ # # that will recognize the relevant cases
133
+
134
+ # accept_anyway_proc = Proc.new { |node, children_in, children_out|
135
+
136
+ # # node: SynNode
137
+ # # children_in, children_out: array:SynNode. children_in are the children
138
+ # # that are already covered by the FE, children_out the ones that aren't
139
+
140
+ # # if node is an NP,
141
+ # # and only one of its children is out,
142
+ # # and one node in children_in is an NP, and the missing child is an SBAR
143
+ # # with a child that is a relative pronoun, then consider the child in children_out as covered
144
+ # if interpreter_class.category(node) == "noun" and
145
+ # children_out.length() == 1 and
146
+ # children_in.select { |n| interpreter_class.category(n) == "noun" } and
147
+ # interpreter_class.category(children_out.first) == "sent" and
148
+ # (ch = children_out.first.children) and
149
+ # ch.select { |n| interpreter_class.relative_pronoun?(n) }
150
+ # true
151
+ # else
152
+ # false
153
+ # end
154
+ # }
155
+
156
+ # else
157
+ # accept_anyway_proc = nil
158
+ # end
159
+
160
+
161
+ # # "repair" FEs:
162
+ # sent.each_frame { |frame|
163
+
164
+ # frame.each_child { |fe_or_target|
165
+
166
+ # # repair only if the FE currently
167
+ # # points to more than one syn node, or
168
+ # # if it is a noun with a non-covered sentence sister
169
+ # if fe_or_target.children.length() > 1 or
170
+ # (exp.get("fe_rel_repair") and (curr_marked = fe_or_target.children.first()) and
171
+ # interpreter_class.category(curr_marked) == "noun" and
172
+ # (p = curr_marked.parent) and
173
+ # p.children.select { |n| n != curr_marked and interpreter_class.category(n) == "sent" } )
174
+
175
+ # # remember nodes covered by the FE
176
+ # old_fe_syn = fe_or_target.children()
177
+
178
+ # # remove syn nodes that the FE points to
179
+ # old_fe_syn.each { |child|
180
+ # fe_or_target.remove_child(child)
181
+ # }
182
+
183
+ # # and recompute
184
+ # new_fe_syn = interpreter_class.max_constituents(old_fe_syn.map { |t| t.yield_nodes}.flatten.uniq,
185
+ # sent,
186
+ # exp.get("fe_syn_repair"),
187
+ # accept_anyway_proc)
188
+
189
+ # # make the FE point to the new nodes
190
+ # new_fe_syn.each { |syn_node|
191
+ # fe_or_target.add_child(syn_node)
192
+ # }
193
+
194
+ # end # if FE points to more than one syn node
195
+ # } # each FE
196
+ # } # each frame
@@ -0,0 +1,66 @@
1
+ # FPrepConfigData
2
+ # Katrin Erk July 05
3
+ #
4
+ # Preprocessing for Fred and Rosy:
5
+ # access to a configuration and experiment description file
6
+
7
+ require "frprep/ConfigData"
8
+
9
+ ##############################
10
+ # Class FrPrepConfigData
11
+ #
12
+ # inherits from ConfigData,
13
+ # sets variable names appropriate to preprocessing task
14
+
15
+ class FrPrepConfigData < ConfigData
16
+ def initialize(filename)
17
+
18
+ # initialize config data object
19
+ super(filename, # config file
20
+ { "prep_experiment_ID" => "string", # experiment identifier
21
+
22
+ "frprep_directory" => "string", # dir for frprep internal data
23
+
24
+ # information about the dataset
25
+ "language" => "string", # en, de
26
+ "origin"=> "string", # FrameNet, Salsa, or nothing
27
+ "format" => "string", # Plain, SalsaTab, FNXml, FNCorpusXml, SalsaTigerXML
28
+ "encoding" => "string", # utf8, iso, hex, or nothing
29
+
30
+
31
+ # directories
32
+ "directory_input" => "string", # dir with input data
33
+ "directory_preprocessed" => "string", # dir with output Salsa/Tiger XML data
34
+ "directory_parserout" => "string", # dir with parser output for the parser named below
35
+
36
+ # syntactic processing
37
+ "pos_tagger" => "string", # name of POS tagger
38
+ "lemmatizer" => "string", # name of lemmatizer
39
+ "parser" => "string", # name of parser
40
+ "pos_tagger_path" => "string", # path to POS tagger
41
+ "lemmatizer_path" => "string", # path to lemmatizer
42
+ "parser_path" => "string", # path to parser
43
+ "parser_max_sent_num" => "integer", # max number of sentences per parser input file
44
+ "parser_max_sent_len" => "integer", # max sentence length the parser handles
45
+
46
+ "do_parse" => "bool", # use parser?
47
+ "do_lemmatize" => "bool",# use lemmatizer?
48
+ "do_postag" => "bool", # use POS tagger?
49
+
50
+ # output format: if tabformat_output == true,
51
+ # output in Tab format rather than Salsa/Tiger XML
52
+ # (this will not work if do_parse == true)
53
+ "tabformat_output" => "bool",
54
+
55
+ # syntactic repairs, dependent on existing semantic role annotation
56
+ "fe_syn_repair" => "bool", # map words to constituents for FEs: idealize?
57
+ "fe_rel_repair" => "bool", # FEs: include non-included relative clauses into FEs
58
+ },
59
+ [ ] # variables
60
+ )
61
+
62
+ end
63
+ end
64
+
65
+
66
+
@@ -0,0 +1,513 @@
1
+ # sp 18 06 2004
2
+ #
3
+ # access to FrameNet XML files, sentences, and annotation.
4
+ #
5
+ # sp 10 11 04: only data from the first layer with name XY is
6
+ # used for output. Other data is saved in layer XY.2nd, but is
7
+ # currently not processed.
8
+ #
9
+ # sp 22 05 04: also, if two labels exist which cover the same span
10
+ # (ie there is a double annotation within the same layer), ignore
11
+ # all but the first label.
12
+ #
13
+ # ke 13 07 05:
14
+ # - changed to RegXML.rb
15
+ # - fixed two problems in analyse_layer:
16
+ # - Deleting problematic labels:
17
+ # For some reason, thisLayer[i+1..-1].each_index {|other_i|
18
+ # included the index 0 in any case, resulting in the 1st
19
+ # label being deleted in any case.
20
+ # - Deleting problematic labels, checking for label overlap:
21
+ # The old formulation worked only if labels occurred in the array
22
+ # in the order they occurred in the sentence, but that was not the case.
23
+ # - Change in deleting problematic labels:
24
+ # No longer delete duplicate labels, since e.g. in the PT level there
25
+ # may be more than one NP label, and we want to keep those
26
+ #
27
+ # KE January 2007:
28
+ # write new adapted FNTab format
29
+ # ( "word", ("pt", "gf", "role", "target", "frame", "stuff")* "ne", "sent_id" )
30
+
31
+ require 'Ampersand'
32
+ require 'ISO-8859-1'
33
+ require 'RegXML'
34
+
35
+ class FrameXMLFile # only verified to work for FrameNet v1.1
36
+
37
+ def initialize(filename)
38
+ @filename = filename
39
+ file = File.new(filename)
40
+ counter = 0
41
+ while true
42
+ counter +=1
43
+ line = file.gets
44
+ if line =~ /<lexunit/
45
+ break
46
+ end
47
+ if counter > 3
48
+ STDERR.puts "Error: File "+filename+" does not conform to FrameNet v1.1 standard (lexunit in 3rd line)"
49
+ Kernel.exit
50
+ end
51
+ end
52
+ # found lexunit
53
+ string = line
54
+ while (line = file.gets())
55
+ string << line
56
+ end
57
+ @lexunit = RegXML.new(string)
58
+ attributes = @lexunit.attributes()
59
+ @id = attributes["ID"]
60
+ attributes["name"] =~ /^([^.]+).([^.]+)$/
61
+ @lu = $1
62
+ @pos = $2.upcase
63
+ if @lu.nil?
64
+ raise "[framexml] no lemma in header of file #{@filename}"
65
+ elsif @pos.nil?
66
+ raise "[framexml] no pos in header of file #{@filename}"
67
+ end
68
+ @frame = attributes["frame"]
69
+ end
70
+
71
+ def get_lu
72
+ return @lu.gsub(" ","_")
73
+ end
74
+
75
+ def get_lu_id
76
+ return @id
77
+ end
78
+
79
+ def get_filename
80
+ return @filename
81
+ end
82
+
83
+ def get_pos
84
+ return @pos
85
+ end
86
+
87
+ def get_frame
88
+ return @frame
89
+ end
90
+
91
+ def close
92
+ end
93
+
94
+ def each_sentence
95
+ @lexunit.children_and_text().each { |subcorpus|
96
+ subcorpus.children_and_text().each { |annotationSet|
97
+ if annotationSet.name == "annotationSet"
98
+ # sentence found
99
+ yield FrameXMLSentence.new(annotationSet,self)
100
+ end
101
+ }
102
+ }
103
+ end
104
+ end
105
+
106
# One sentence of a FrameNet lexical-unit file, with its annotation
# layers (FE, GF, PT, Target, ...), plus character-offset <-> word-index
# bookkeeping and FNTab-format output.
class FrameXMLSentence
  # annotationSet: RegXML element for one <annotationSet>
  #                (a sentence plus its annotation layers)
  # file_obj:      the enclosing FrameXMLFile object
  def initialize(annotationSet,file_obj)
    @file_obj = file_obj

    # layers: hash layer_name -> array:[name, start, stop]
    # name: name of the element, string
    # start: start character, integer
    # stop: end character, integer
    @layers = Hash.new

    annotationSet.children_and_text().each { |sentence_or_layer_elt|

      case sentence_or_layer_elt.name
      when "sentence"
        # sentence: has ID, its child is <text>[text]</text>
        @sent_id = sentence_or_layer_elt.attributes["ID"]
        text_elt = sentence_or_layer_elt.children_and_text().detect { |child|
          child.name == "text"
        }
        if text_elt
          # found the text element. its only child should be the text
          @orig_text = text_elt.children_and_text().detect { |child|
            child.text?
          }
          if @orig_text
            # take text out of RegXMl object
            @orig_text = @orig_text.to_s()
          end
        end

      when "layers"
        # contains annotation layers
        sentence_or_layer_elt.children_and_text().each { |layer|
          unless layer.name == "layer"
            # additional material, ignore
            next
          end

          name = layer.attributes["name"]
          unless name
            raise "layer without a name"
          end
          # only data from the first layer with a given name is used
          # (see header comment of this file)
          unless @layers.key?(name)
            @layers[name] = analyse_layer(layer, name)
          end
        }
      end
    }

    @pos_text = UtfIso.to_iso_8859_1(@orig_text).split(" ") # text with special characters replaced by iso8859 characters
    @text = Ampersand.utf8_to_hex(@orig_text).split(" ") # text with special characters replaced by &...; sequences

    # all text and pos_text have the same number of elements!
    @start_is = Hash.new # map char indices (start of words) onto word indices
    @stop_is = Hash.new # map char indices (end of words) onto word indices
    @charidx = Array.new # maps word indices on [start,stop]

    # record character positions of runs of 2+ whitespace characters:
    # they count as a two-character separator below instead of one
    @double_space = Array.new
    pos = 0
    while (match = @orig_text.index(/(\s\s+)/,pos))
      @double_space << match
      pos = match+1
    end


    # fill start, stop and charidx arrays
    char_i = 0
    @pos_text.each_index {|word_i|
      @start_is[char_i] = word_i
      startchar = char_i
      # puts "Remembering "+char_i.to_s+" as start index of word "+word_i.to_s
      char_i += our_length(@pos_text[word_i])
      @stop_is[char_i-1] = word_i

      stopchar = char_i-1

      # puts "Remembering "+(char_i-1).to_s+" as stop index of word "+word_i.to_s

      @charidx << [startchar,stopchar]

      # separators
      if @double_space.include?(char_i) then
        char_i += 2
      else
        char_i += 1
      end
    }
  end

  # the FrameXMLFile this sentence belongs to
  def get_file_obj
    return @file_obj
  end

  # sentence ID from the XML (string)
  def get_sent_id
    return @sent_id
  end

  # FIXME(review): @id is never assigned in this class, and @text is an
  # Array (result of split), so this method raises as written.
  # Probably intended: puts "("+@sent_id+")\t"+@text.join(" ")
  def print_text
    puts "("+@id+ ")\t"+@text
  end

  # true iff the sentence has an FE layer and a Target whose
  # stop offset is non-zero
  def contains_FE_annotation_and_target
    target_info = @layers["Target"][0]
    unless target_info[0] == "Target"
      # FIXME(review): `filename` is undefined here, so this error branch
      # raises NameError (likely meant @file_obj.get_filename); also
      # @text is an Array, so the second line would raise TypeError.
      STDERR.puts "Error in sentence from "+filename+": No target" # strictly speaking, no target at pos 0 in @layers["Target"]
      STDERR.puts "Sentence: "+@text
      return false
    else
      return (@layers.key?("FE") and target_info[2] != 0)
    end
  end

  # we only verify the interesting layers (FE,GF,Target)
  # if there is weird stuff going on on e.g. the Noun or Adj layer, we don't care.

  # Snap layer spans to word boundaries: if a span starts or ends in the
  # middle of a word (or between words), move the boundary to the
  # enclosing/preceding word boundary. Mutates @layers in place and logs
  # every change to STDERR.
  def verify_annotation # returns true if some change has taken place
    change = false
    @layers.each_pair {|layername,l|

      if layername=="FE" or layername=="GF" or layername=="PT" or layername=="Target" # only verify the "important" layers

        l.each_index {|i|

          element,start,stop = l[i]

          newstart = start
          newstop = stop

          @charidx.each_index{|j|
            unless j== 0
              pstartidx, pstopidx = @charidx[j-1]
            end
            startidx, stopidx = @charidx[j]

            # start inside word j, or in the gap before word j:
            # move it to the start of word j
            if (start > startidx and start <= stopidx) or
              (j != 0 and start > pstopidx and start < startidx)
              newstart = startidx
            end

            # stop inside word j: extend to end of word j;
            # stop in the gap before word j: pull back to end of word j-1
            if (stop >= startidx and stop < stopidx)
              newstop = stopidx
            elsif (j != 0 and stop > pstopidx and stop < startidx)
              newstop = pstopidx
            end

          }
          if start != newstart or stop != newstop
            change = true
            @layers[layername][i] = [element,newstart,newstop]
            STDERR.puts "Heuristics has changed element "+element+" from ["+[start,stop].join(",")+"] to ["+[newstart,newstop].join(",")+"] in file "+@file_obj.get_filename+"."
            markable_as_string(layername,element).each {|string|
              STDERR.puts "New markable: "+string
            }
            STDERR.puts "Sentence: "+@pos_text.join(" ")
            puts
          end
        }
      end
    }
    return change
  end

  # convenience wrapper: write FNTab format to STDOUT
  def print_conll_style
    print_conll_style_to(STDOUT)
  end

  # CHANGED KE January 2007:
  # write new adapted FNTab format
  # ( "word", ("pt", "gf", "role", "target", "frame", "stuff")* "ne", "sent_id" )
  #
  # out: an open IO object. Raises if any annotation element could not
  # be matched to word boundaries (leftover hash entries at the end).
  def print_conll_style_to(out)

    # even though in principle there might be multiple
    # labels for one span [i.e. in one value of the
    # {gf,fe,pt} hashes], we only ever record one

    gf = Hash.new
    add_all_to_hash(gf,"GF")
    fe = Hash.new
    add_all_to_hash(fe,"FE")
    pt = Hash.new
    add_all_to_hash(pt,"PT")
    target = Hash.new
    add_all_to_hash(target,"Target")

    in_target = false

    @pos_text.each_index {|i|
      # write format:
      # "word" "pt", "gf", "role", "target", "frame", "stuff" "ne", "sent_id"
      line = Array.new
      # word
      word = @pos_text[i]
      line << word

      start, stop = @charidx[i]
      # "pt", "gf", "role",
      # B-xxx marks the word opening a span, E-xxx the word closing it;
      # entries are deleted from the hash as they are consumed
      [pt,gf,fe].each {|hash|
        token = Array.new
        if hash.key?([start,"start"])
          markables = hash.delete([start,"start"])
          markables.each {|element|
            token << "B-"+element
          }
        end
        if hash.key?([stop,"stop"])
          markables = hash.delete([stop,"stop"])
          markables.each {|element|
            token << "E-"+element
          }
        end
        if token.empty?
          line << "-"
        else
          line << token.sort.join(":")
        end
      }
      # "target": every word between the target's start and stop marks
      # is labeled with <lemma>.<pos>
      if target.key?([start,"start"])
        target.delete([start,"start"])
        in_target = true
      end
      if in_target
        line << @file_obj.get_lu+"."+@file_obj.get_pos
      else
        line << "-"
      end
      if target.key?([stop,"stop"])
        target.delete([stop,"stop"])
        in_target = false
      end
      # "frame"
      line << @file_obj.get_frame

      # "stuff" "ne",
      line << "-"
      line << "-"

      # "sent_id"
      line << @file_obj.get_lu_id+"-"+@sent_id

      out.puts line.join("\t")
    }

    out.puts

    # sanity check: all span boundaries must have been consumed above
    [gf,fe,pt,target].each {|hash|
      unless hash.empty?
        STDERR.puts @file_obj.get_filename
        raise "**** Error: Hash not empty after creation of Sentence in CoNLL-Format (could not find matching words for some markup element)!"
      end
    }
  end


  # debugging aid: dump all layers with their character spans to STDOUT
  def print_layers
    @layers.each {|ln,l|
      puts "Layer "+ln+":"
      l.each {|element,start,stop|
        puts "\t"+element+": "+start.to_s+" -- "+stop.to_s
      }
      puts "***"
    }
  end


  private


  # length of a word in layer-offset characters: each &...; entity
  # counts as a single character
  def our_length(string) # (1) replace &...; with 1 char and " with two chars
    return string.gsub(/&(.+?);/,"X").length
  end

  # true iff an FE with this name exists on the FE layer
  # NOTE(review): appears to be unused within this class
  def is_fe(fename)
    @layers["FE"].each {|name,start,stop|
      if fename == name
        return true
      end
    }
    return false
  end


  # Render every markable with the given name on the given layer as a
  # human-readable string (its words plus [start,stop] and a
  # VERIFIED/ERROR tag), for logging in verify_annotation.
  def markable_as_string(layername,markup_name) # returns an array of all markables with this name

    result = Array.new

    festart = nil
    festop = nil
    @layers[layername].each {|name,start,stop|
      if markup_name == name
        fe = Array.new
        infe = false
        @charidx.each_index {|i|
          startidx,stopidx = @charidx[i]
          if startidx == start
            infe = true
          end
          if infe
            fe << @pos_text[i]
          end
          if stopidx == stop
            # span ends exactly on a word boundary: good
            result << (fe.join(" ")+"["+start.to_s+","+stop.to_s+", VERIFIED]")
            break
          elsif stopidx > stop
            # span ends mid-word: flag it
            result << (fe.join(" ")+"["+start.to_s+","+stop.to_s+",ERROR]")
            break
          end
        }
      end
    }
    return result
  end

  # Record name under key unless the key is already taken.
  # Returns true if the key already existed (nothing added).
  def add_to_hash(hash,key,name)
    exists = false
    if hash.key?(key)
      exists = true
    else
      hash[key] = Array.new
      hash[key] << name
    end
    return exists
  end

  # Fill hash with [start,"start"] and [stop,"stop"] keys for every
  # element of the given layer; on a start collision, keep only the
  # first element (and warn).
  def add_all_to_hash(hash,layername)
    # use "uniq" to remove wrong double annotations
    @layers[layername].uniq.each {|element,start,stop|
      exists = add_to_hash(hash,[start, "start"],element)
      if exists
        STDERR.puts "Warning ["+@file_obj.get_filename+"]: In layer "+layername+", two elements start at position "+start.to_s+". Only using first. Layer as read from FrameXML: "+@layers[layername].map {|element,start,stop| element+" ("+start.to_s+","+stop.to_s+")"}.join(" ")
      else
        add_to_hash(hash,[stop, "stop"],element)
      end
    }
  end


  # Read one <layer> element: collect [name, start, end] triples from
  # its labels (skipping null instantiations and label elements without
  # offsets), then drop overlapping labels, keeping the earlier one.
  def analyse_layer(layer_elt,name) # read layer information from file and store in @layers
    if name.nil?
      # FIXME(review): `line` is undefined in this method -- this branch
      # raises NameError instead of printing the message
      STDERR.puts "Error: layer line "+line+" with empty name."
    end

    # thisLayer, retv: array:[name(string), start(integer), end(integer)]
    thisLayer = Array.new
    retv = Array.new

    labels_elt = layer_elt.children_and_text.detect { |child| child.name == "labels"}
    unless labels_elt
      # no labels found, return empty array
      return thisLayer
    end

    labels_elt.children_and_text.each { |label|
      unless label.name == "label"
        # some other markup, ignore
        next
      end

      attributes = label.attributes()
      if attributes["itype"]
        # null instantiation, don't retain
        next
      end
      if not(attributes["start"]) and not(attributes["end"])
        # no start and end labels
        next
      end
      thisLayer << [attributes["name"], attributes["start"].to_i, attributes["end"].to_i]
    }

    # sanity check: verify that
    # 1. we don't have overlapping labels

    deleteHash = Hash.new # keep track of the labels which are to be deleted
    # i -> Boolean

    thisLayer.each_index {|i|
      # efficiency: skip already delete labels
      if deleteHash[i]
        next
      end
      this_label, this_from , this_to = thisLayer[i]

      # compare with all remaining labels
      (i+1..thisLayer.length()-1).to_a.each { |other_i|
        other_label,other_from,other_to = thisLayer[other_i]

        # overlap? Throw out the later FE
        if this_from <= other_from and other_from <= this_to
          $stderr.puts "Warning: Label overlap, deleting #{other_label}"
          deleteHash[other_i] = true
        elsif this_from <= other_to and other_to <= this_to
          $stderr.puts "Warning: Label overlap, deleting #{this_label}"
          deleteHash[i] = true
        end
      }
      # matched with all other labels. If "keep", return

      if deleteHash[i]
        # $stderr.puts " deleting entry #{i}"
      else
        retv << thisLayer[i]
      end
    }

    return retv
  end
end