frprep 0.0.1.prealpha
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.yardopts +8 -0
- data/CHANGELOG.rdoc +0 -0
- data/LICENSE.rdoc +0 -0
- data/README.rdoc +0 -0
- data/lib/common/AbstractSynInterface.rb +1227 -0
- data/lib/common/BerkeleyInterface.rb +375 -0
- data/lib/common/CollinsInterface.rb +1165 -0
- data/lib/common/ConfigData.rb +694 -0
- data/lib/common/Counter.rb +18 -0
- data/lib/common/DBInterface.rb +48 -0
- data/lib/common/EnduserMode.rb +27 -0
- data/lib/common/Eval.rb +480 -0
- data/lib/common/FixSynSemMapping.rb +196 -0
- data/lib/common/FrPrepConfigData.rb +66 -0
- data/lib/common/FrprepHelper.rb +1324 -0
- data/lib/common/Graph.rb +345 -0
- data/lib/common/ISO-8859-1.rb +24 -0
- data/lib/common/ML.rb +186 -0
- data/lib/common/Maxent.rb +215 -0
- data/lib/common/MiniparInterface.rb +1388 -0
- data/lib/common/Optimise.rb +195 -0
- data/lib/common/Parser.rb +213 -0
- data/lib/common/RegXML.rb +269 -0
- data/lib/common/RosyConventions.rb +171 -0
- data/lib/common/SQLQuery.rb +243 -0
- data/lib/common/STXmlTerminalOrder.rb +194 -0
- data/lib/common/SalsaTigerRegXML.rb +2347 -0
- data/lib/common/SalsaTigerXMLHelper.rb +99 -0
- data/lib/common/SleepyInterface.rb +384 -0
- data/lib/common/SynInterfaces.rb +275 -0
- data/lib/common/TabFormat.rb +720 -0
- data/lib/common/Tiger.rb +1448 -0
- data/lib/common/TntInterface.rb +44 -0
- data/lib/common/Tree.rb +61 -0
- data/lib/common/TreetaggerInterface.rb +303 -0
- data/lib/common/headz.rb +338 -0
- data/lib/common/option_parser.rb +13 -0
- data/lib/common/ruby_class_extensions.rb +310 -0
- data/lib/fred/Baseline.rb +150 -0
- data/lib/fred/FileZipped.rb +31 -0
- data/lib/fred/FredBOWContext.rb +863 -0
- data/lib/fred/FredConfigData.rb +182 -0
- data/lib/fred/FredConventions.rb +232 -0
- data/lib/fred/FredDetermineTargets.rb +324 -0
- data/lib/fred/FredEval.rb +312 -0
- data/lib/fred/FredFeatureExtractors.rb +321 -0
- data/lib/fred/FredFeatures.rb +1061 -0
- data/lib/fred/FredFeaturize.rb +596 -0
- data/lib/fred/FredNumTrainingSenses.rb +27 -0
- data/lib/fred/FredParameters.rb +402 -0
- data/lib/fred/FredSplit.rb +84 -0
- data/lib/fred/FredSplitPkg.rb +180 -0
- data/lib/fred/FredTest.rb +607 -0
- data/lib/fred/FredTrain.rb +144 -0
- data/lib/fred/PlotAndREval.rb +480 -0
- data/lib/fred/fred.rb +45 -0
- data/lib/fred/md5.rb +23 -0
- data/lib/fred/opt_parser.rb +250 -0
- data/lib/frprep/AbstractSynInterface.rb +1227 -0
- data/lib/frprep/Ampersand.rb +37 -0
- data/lib/frprep/BerkeleyInterface.rb +375 -0
- data/lib/frprep/CollinsInterface.rb +1165 -0
- data/lib/frprep/ConfigData.rb +694 -0
- data/lib/frprep/Counter.rb +18 -0
- data/lib/frprep/FNCorpusXML.rb +643 -0
- data/lib/frprep/FNDatabase.rb +144 -0
- data/lib/frprep/FixSynSemMapping.rb +196 -0
- data/lib/frprep/FrPrepConfigData.rb +66 -0
- data/lib/frprep/FrameXML.rb +513 -0
- data/lib/frprep/FrprepHelper.rb +1324 -0
- data/lib/frprep/Graph.rb +345 -0
- data/lib/frprep/ISO-8859-1.rb +24 -0
- data/lib/frprep/MiniparInterface.rb +1388 -0
- data/lib/frprep/Parser.rb +213 -0
- data/lib/frprep/RegXML.rb +269 -0
- data/lib/frprep/STXmlTerminalOrder.rb +194 -0
- data/lib/frprep/SalsaTigerRegXML.rb +2347 -0
- data/lib/frprep/SalsaTigerXMLHelper.rb +99 -0
- data/lib/frprep/SleepyInterface.rb +384 -0
- data/lib/frprep/SynInterfaces.rb +275 -0
- data/lib/frprep/TabFormat.rb +720 -0
- data/lib/frprep/Tiger.rb +1448 -0
- data/lib/frprep/TntInterface.rb +44 -0
- data/lib/frprep/Tree.rb +61 -0
- data/lib/frprep/TreetaggerInterface.rb +303 -0
- data/lib/frprep/do_parses.rb +142 -0
- data/lib/frprep/frprep.rb +686 -0
- data/lib/frprep/headz.rb +338 -0
- data/lib/frprep/one_parsed_file.rb +28 -0
- data/lib/frprep/opt_parser.rb +94 -0
- data/lib/frprep/ruby_class_extensions.rb +310 -0
- data/lib/rosy/AbstractFeatureAndExternal.rb +240 -0
- data/lib/rosy/DBMySQL.rb +146 -0
- data/lib/rosy/DBSQLite.rb +280 -0
- data/lib/rosy/DBTable.rb +239 -0
- data/lib/rosy/DBWrapper.rb +176 -0
- data/lib/rosy/ExternalConfigData.rb +58 -0
- data/lib/rosy/FailedParses.rb +130 -0
- data/lib/rosy/FeatureInfo.rb +242 -0
- data/lib/rosy/GfInduce.rb +1115 -0
- data/lib/rosy/GfInduceFeature.rb +148 -0
- data/lib/rosy/InputData.rb +294 -0
- data/lib/rosy/RosyConfigData.rb +115 -0
- data/lib/rosy/RosyConfusability.rb +338 -0
- data/lib/rosy/RosyEval.rb +465 -0
- data/lib/rosy/RosyFeatureExtractors.rb +1609 -0
- data/lib/rosy/RosyFeaturize.rb +280 -0
- data/lib/rosy/RosyInspect.rb +336 -0
- data/lib/rosy/RosyIterator.rb +477 -0
- data/lib/rosy/RosyPhase2FeatureExtractors.rb +230 -0
- data/lib/rosy/RosyPruning.rb +165 -0
- data/lib/rosy/RosyServices.rb +744 -0
- data/lib/rosy/RosySplit.rb +232 -0
- data/lib/rosy/RosyTask.rb +19 -0
- data/lib/rosy/RosyTest.rb +826 -0
- data/lib/rosy/RosyTrain.rb +232 -0
- data/lib/rosy/RosyTrainingTestTable.rb +786 -0
- data/lib/rosy/TargetsMostFrequentFrame.rb +60 -0
- data/lib/rosy/View.rb +418 -0
- data/lib/rosy/opt_parser.rb +379 -0
- data/lib/rosy/rosy.rb +77 -0
- data/lib/shalmaneser/version.rb +3 -0
- data/test/frprep/test_opt_parser.rb +94 -0
- data/test/functional/functional_test_helper.rb +40 -0
- data/test/functional/sample_experiment_files/fred_test.salsa.erb +122 -0
- data/test/functional/sample_experiment_files/fred_train.salsa.erb +135 -0
- data/test/functional/sample_experiment_files/prp_test.salsa.erb +138 -0
- data/test/functional/sample_experiment_files/prp_test.salsa.fred.standalone.erb +120 -0
- data/test/functional/sample_experiment_files/prp_test.salsa.rosy.standalone.erb +120 -0
- data/test/functional/sample_experiment_files/prp_train.salsa.erb +138 -0
- data/test/functional/sample_experiment_files/prp_train.salsa.fred.standalone.erb +138 -0
- data/test/functional/sample_experiment_files/prp_train.salsa.rosy.standalone.erb +138 -0
- data/test/functional/sample_experiment_files/rosy_test.salsa.erb +257 -0
- data/test/functional/sample_experiment_files/rosy_train.salsa.erb +259 -0
- data/test/functional/test_fred.rb +47 -0
- data/test/functional/test_frprep.rb +52 -0
- data/test/functional/test_rosy.rb +20 -0
- metadata +270 -0
@@ -0,0 +1,99 @@
|
|
1
|
+
# sp jul 05 05
|
2
|
+
#
|
3
|
+
# Static helper methods for SalsaTigerRegXML:
|
4
|
+
|
5
|
+
# - provide header and footer for Salsa/Tiger XML files
|
6
|
+
# - escape and unescape HTML entities
|
7
|
+
#
|
8
|
+
# changed KE nov 05:
|
9
|
+
# many methods moved to FrprepHelper
|
10
|
+
|
11
|
+
require "common/SalsaTigerRegXML"
|
12
|
+
require "common/headz"
|
13
|
+
require "common/Parser"
|
14
|
+
require "tempfile"
|
15
|
+
|
16
|
+
class SalsaTigerXMLHelper

  ###
  # get header of SalsaTigerXML files (as string)
  #
  # The header consists of the XML declaration, the opening <corpus> tag,
  # a <head> section whose declaration elements are all empty, and the
  # opening <body> tag. The returned string ends with a newline.
  def SalsaTigerXMLHelper.get_header

    header = <<ENDOFHEADER
<?xml version="1.0" encoding="UTF-8"?>
<corpus corpusname="corpus" target="">
<head>
<meta>
<format>
NeGra format, version 3</format>
</meta>
<frames xmlns="http://www.clt-st.de/framenet/frame-database">
</frames>
<wordtags xmlns="http://www.clt-st.de/salsa/wordtags">
</wordtags>
<flags>
</flags>
<annotation>
<edgelabel>
</edgelabel>
<secedgelabel>
</secedgelabel>
</annotation>
</head>
<body>
ENDOFHEADER

    return header
  end

  ###
  # get footer of SalsaTigerXML files (as string)
  #
  # Closes the <body> and <corpus> elements opened by get_header.
  # The returned string ends with a newline.
  def SalsaTigerXMLHelper.get_footer

    footer = <<ENDOFFOOTER
</body>
</corpus>
ENDOFFOOTER

    return footer
  end

  ###
  # escape and unescape strings for representation in XML
  #
  # Each entry is a pair [unescaped, escaped]. The entries are applied
  # in this order by escape() and in reverse order by unescape().
  #
  # The escaped forms are XML character entities. Without them the
  # table would map "&", "<", ">" and "'" onto themselves and escape()
  # would leave markup characters unprotected.
  #
  # NOTE(review): double quotes and double backticks are both written as
  # two apostrophe entities, so unescape() cannot restore a double quote:
  # "&apos;&apos;" always comes back as two backticks. Confirm this
  # normalisation is intended.
  @@replacements = [
    # ["''", "&quot;&quot;"], # added by ines (09/03/09), might cause problems for unescape???
    ["&", "&amp;"], # must be first for escaping, last for unescaping
    ["<", "&lt;"],
    [">", "&gt;"],
    ["\"", "&apos;&apos;"],
    # ["\"", "&quot;"],
    # ["''", "&quot;"],
    # ["``", "&quot;"],
    ["'", "&apos;"],
    ["``", "&apos;&apos;"],
    # ["''", "&apos;&apos;"]
  ]

  ###
  # Replace XML-sensitive characters in string by character entities.
  #
  # string: the String to escape; it is modified in place (gsub!)
  # returns: the same String object, after replacement
  def SalsaTigerXMLHelper.escape(string)
    @@replacements.each {|unescaped, escaped|
      string.gsub!(unescaped, escaped)
    }
    return string
  end

  ###
  # Replace the character entities produced by escape() by plain characters.
  #
  # string: the String to unescape; it is modified in place (gsub!)
  # returns: the same String object, after replacement
  def SalsaTigerXMLHelper.unescape(string)
    # reverse replacements to replace & last: otherwise "&amp;lt;"
    # would first become "&lt;" and then, wrongly, "<"
    @@replacements.reverse.each {|unescaped, escaped|
      string.gsub!(escaped, unescaped)
    }
    return string
  end

end
|
@@ -0,0 +1,384 @@
|
|
1
|
+
####
|
2
|
+
# sp 21 07 05
|
3
|
+
#
|
4
|
+
# modified ke 30 10 05: adapted to fit into SynInterface
|
5
|
+
#
|
6
|
+
# represents a file containing Sleepy parses
|
7
|
+
#
|
8
|
+
# underlying data structure for individual sentences: SalsaTigerSentence
|
9
|
+
require "tempfile"
|
10
|
+
|
11
|
+
require "common/SalsaTigerRegXML"
|
12
|
+
require "common/SalsaTigerXMLHelper"
|
13
|
+
require "common/TabFormat"
|
14
|
+
require "common/Counter"
|
15
|
+
|
16
|
+
require "common/AbstractSynInterface"
|
17
|
+
require "common/Tiger.rb"
|
18
|
+
|
19
|
+
################################################
|
20
|
+
# Interface class
|
21
|
+
class SleepyInterface < SynInterfaceSTXML
  SleepyInterface.announce_me()

  ###
  # name of the system this interface wraps
  def SleepyInterface.system()
    return "sleepy"
  end

  ###
  # kind of service this interface provides
  def SleepyInterface.service()
    return "parser"
  end

  ###
  # initialize to set values for all subsequent processing
  #
  # Recognised keys of var_hash: "pos_suffix", "lemma_suffix", "tab_dir".
  # "tab_dir" is required by each_sentence (and hence by to_stxml_file).
  def initialize(program_path, # string: path to system
                 insuffix, # string: suffix of tab files
                 outsuffix, # string: suffix for parsed files
                 stsuffix, # string: suffix for Salsa/TIGER XML files
                 var_hash = {}) # optional arguments in a hash

    super(program_path, insuffix, outsuffix, stsuffix, var_hash)

    # the program path is used as a prefix below, so make it end in a slash
    unless @program_path =~ /\/$/
      @program_path = @program_path + "/"
    end

    # new: evaluate var hash
    @pos_suffix = var_hash["pos_suffix"]
    @lemma_suffix = var_hash["lemma_suffix"]
    @tab_dir = var_hash["tab_dir"]
  end

  ####
  # parse a directory with TabFormat files and write the parse trees to outputdir
  # I assume that the files in inputdir are smaller than
  # the maximum number of sentences that
  # Sleepy can parse in one go (i.e. that they are split)
  #
  # Both directory names are used as plain string prefixes, so they are
  # expected to end in a path separator.
  #
  # NOTE(review): file names are interpolated unquoted into a shell
  # command line; paths containing blanks or shell metacharacters will
  # break the call. Confirm that callers only pass safe paths.
  def process_dir(in_dir, # string: input directory name
                  out_dir) # string: output directory name

    sleepy_prog = "#{@program_path}sleepy --beam 1000 --model-file #{@program_path}negra.model --parse "

    Dir[in_dir + "*" + @insuffix].each {|inputfilename|
      STDERR.puts "*** Parsing #{inputfilename} with Sleepy"
      corpusfilename = File.basename(inputfilename, @insuffix)
      parsefilename = out_dir + corpusfilename + @outsuffix
      tempfile = Tempfile.new(corpusfilename)

      begin
        # we need neither lemmata nor POS tags; sleepy can do with the words
        corpusfile = FNTabFormatFile.new(inputfilename, nil, nil)
        corpusfile.each_sentence {|sentence|
          tempfile.puts sentence.to_s
        }
        # flush everything to disk before the parser reads the file
        tempfile.close
        # parse and remove comments in the parser output
        Kernel.system(sleepy_prog+" "+tempfile.path+" 2>&1 | grep -v \"Span:\" > "+parsefilename)
      ensure
        # remove the temporary input file right away instead of keeping
        # one file per input around until garbage collection or exit
        tempfile.close(true)
      end
    }
  end

  ###
  # for a given parsed file:
  # yield each sentence as an array
  # [SalsaTigerSentence object, FNTabFormatSentence object, mapping]
  # of the sentence in SalsaTigerXML, the matching tab format sentence,
  # and the result of SleepyInterface.standard_mapping for the two.
  #
  # If a parse has failed, yields
  # [failed_sentence (flat SalsaTigerSentence), FNTabFormatSentence, mapping]
  # to allow more detailed accounting for failed parses
  # (basically just a flat structure with a failed=true attribute
  # at the sentence node)
  #
  # Terminates the program (exit 1) if no tab directory was configured
  # or if parser output and tab file do not line up.
  def each_sentence(parsefilename)
    # sanity checks
    unless @tab_dir
      $stderr.puts "SleepyInterface error: Need to set tab directory on initialization"
      exit 1
    end

    parsefile = File.new(parsefilename)
    begin
      # get matching tab file for this parser output file
      tabfilename = @tab_dir + File.basename(parsefilename, @outsuffix) + @insuffix
      # initialize stores the POS suffix in @pos_suffix; @postag_suffix,
      # which was read here before, is not assigned anywhere in this class
      tabfile = FNTabFormatFile.new(tabfilename, @pos_suffix, @lemma_suffix)

      sentid = 0

      tabfile.each_sentence {|tab_sent| # iterate over corpus sentences

        sentence_str, parse_ok = read_sleepy_parse(parsefile)

        # we have reached some kind of end
        sentid += 1

        if sentence_str == ""
          # we don't have a sentence: hopefully, this is because parsing has failed
          # if this is not the case, we are in trouble
          if parse_ok
            # this may not happen: we need some sentence for the current
            # TabFile sentence
            $stderr.puts "SleepyInterface error: premature end of parser file!"
            exit 1
          end

          # return a SalsaTigerSentence object for the failed sentence
          # with a virtual top node and one terminal per word.
          my_sent_id = sleepy_sentence_id(tab_sent, parsefilename, sentid)
          sent = SleepyInterface.failed_sentence(tab_sent, my_sent_id)
          yield [sent, tab_sent, SleepyInterface.standard_mapping(sent, tab_sent)]

        else
          # if we are here, we have a sentence_str to work on
          # hopefully, our status is OK
          unless parse_ok
            $stderr.puts "SleepyInterface error: failed parse, but parse tree exists??"
            exit 1
          end

          my_sent_id = sleepy_sentence_id(tab_sent, parsefilename, sentid)
          # pad with blanks: the patterns in build_salsatiger expect a
          # blank after a closing bracket
          st_sent = build_salsatiger(" " + sentence_str + " ", 0,
                                     Array.new, Counter.new(0),
                                     Counter.new(500),
                                     SalsaTigerSentence.empty_sentence(my_sent_id.to_s))
          yield [st_sent, tab_sent, SleepyInterface.standard_mapping(st_sent, tab_sent)]
        end
      }

      # all TabFile sentences are consumed:
      # now we may just encounter comments, garbage, empty lines etc.
      while not parsefile.eof?
        case parsefile.gets
        when nil, /^%/, /^\s*$/ # empty lines, comments, end of input indicate end of current parse
        else
          $stderr.puts "SleepyInterface error: premature end of tab file"
          exit 1
        end
      end
    ensure
      # release the file handle on every way out, including exit and
      # a caller breaking out of the iteration
      parsefile.close
    end
  end

  ###
  # write Salsa/TIGER XML output to file
  #
  # Writes the Salsa/Tiger XML header, one entry per sentence of the
  # parse file (as delivered by each_sentence), and the footer.
  def to_stxml_file(infilename, # string: name of parse file
                    outfilename) # string: name of output stxml file

    # block form: the output file is closed even if conversion fails
    File.open(outfilename, "w") {|outfile|
      outfile.puts SalsaTigerXMLHelper.get_header()
      each_sentence(infilename) { |st_sent, tabsent|
        outfile.puts st_sent.get()
      }
      outfile.puts SalsaTigerXMLHelper.get_footer()
    }
  end

  ########################
  private

  ###
  # Read the lines of the next parse from an open parser output file.
  #
  # returns: [sentence_str, parse_ok]
  #   sentence_str: string, the lines of the parse joined without line
  #                 breaks; "" if no parse was found
  #   parse_ok: false if a "% Parse failed" line was encountered
  def read_sleepy_parse(parsefile) # IO object: parser output, opened for reading
    sentence_str = ""
    parse_ok = true

    while true
      line = parsefile.gets
      case line
      when /% Parse failed/
        parse_ok = false
        break
      when nil # end of file: nothing more to break
        break
      when /^%/, /^\s*$/ # empty lines, other comments: end of current sentence
        unless sentence_str == "" # only break if you have read something
          break
        end
      else
        sentence_str += line.chomp # collect line of current parse and continue reading
      end
    end

    return [sentence_str, parse_ok]
  end

  ###
  # Determine the sentence ID for a sentence: the ID of the tab format
  # sentence if it has one other than "--", otherwise the base name of
  # the parse file plus "_" plus the running sentence number.
  def sleepy_sentence_id(tab_sent, # FNTabFormatSentence
                         parsefilename, # string: name of parse file
                         sentid) # integer: running number of the sentence
    tab_id = tab_sent.get_sent_id()
    if tab_id and tab_id != "--"
      return tab_id
    end
    return File.basename(parsefilename, @outsuffix) + "_" + sentid.to_s
  end

  ###
  # Recursive function for parsing a Sleepy parse tree and
  # building a SalsaTigerSentence recursively
  #
  # Algorithm: manage stack which contains, for the current constituent,
  # child constituents (if a nonterminal), and the category label.
  # When the end of a constituent is reached, a new SynNode (TigerSalsa node) is created.
  # All children and the category label are popped from the stack and integrated into the
  # TigerSalsa data structure. The new node is re-pushed onto the stack.
  #
  # returns: sent_obj, once the whole string has been consumed.
  # Terminates the program (exit 1) on input it cannot analyse.
  def build_salsatiger(sentence, # string
                       pos, # position in string (index): integer
                       stack, # stack with incomplete nodes: Array
                       termc, # terminal counter
                       nontc, # nonterminal counter
                       sent_obj) # SalsaTigerSentence

    # main case distinction: match the beginning of our string
    # (i.e. what follows our current position in the string)
    case sentence[pos..-1]

    when /^ *$/ # nothing -> whole sentence parsed
      if stack.length == 1
        # sleepy always delivers one "top" node; if we don't get just one
        # node, something has gone wrong
        node = stack.pop
        node.del_attribute("gf")
        return sent_obj
      else
        $stderr.puts "SleepyINterface Error: more than one root node (stack length #{stack.length}). Full sentence: \n#{sentence}"
        exit 1
      end

    when /^\s*\(([^ )]+) /
      # match the beginning of a new constituent
      # (opening bracket + category + space, may not contain closing bracket)
      cat = $1
      # remember how much was matched before any other code runs
      consumed = $&.length
      if cat.nil? or cat == ""
        $stderr.puts "SleepyInterface Error: found category nil in sentence #{sentence[pos,10]}, full sentence\n#{sentence}"
        exit 1
      end
      stack.push cat # throw the category label on the stack
      return build_salsatiger(sentence, pos + consumed, stack, termc, nontc, sent_obj)

    when /^\s*(\S+)\) /
      # match the end of a terminal constituent (something before a closing bracket + space)
      word = $1
      consumed = $&.length
      comb_cat = stack.pop
      if comb_cat.to_s == ""
        $stderr.puts "SleepyInterface error: Empty cat at position #{sentence[pos,10]}, full sentence\n#{sentence}"
        exit 1
      end
      cat, gf = split_cat(comb_cat)
      node = sent_obj.add_syn("t",
                              nil, # cat (doesn't matter here)
                              SalsaTigerXMLHelper.escape(word), # word
                              cat, # pos
                              termc.next.to_s)
      # keep the grammatical function on the node until its parent is built
      node.set_attribute("gf", gf)
      stack.push node
      return build_salsatiger(sentence, pos + consumed, stack, termc, nontc, sent_obj)

    when /^\s*\)/ # match the end of a nonterminal (nothing before a closing bracket)
      consumed = $&.length
      # now collect children:
      # pop items from the stack until you find the category
      children = Array.new
      while true
        if stack.empty?
          $stderr.puts "SleepyInterface Error: stack empty; cannot find more children"
          exit 1
        end
        item = stack.pop
        case item.class.to_s
        when "SynNode" # this is a child
          children.push item
        when "String" # this is the category label
          if item.to_s == ""
            $stderr.puts "SleepyInterface error: Empty cat at position #{sentence[pos,10]}, full sentence\n#{sentence}"
            exit 1
          end
          cat, gf = split_cat(item)
          break
        else
          $stderr.puts "SleepyInterface Error: unknown item class #{item.class.to_s}"
          exit 1
        end
      end
      # now add a nonterminal node to the sentence object and
      # register the children nodes
      node = sent_obj.add_syn("nt",
                              cat, # cat
                              nil, # word (doesn't matter)
                              nil, # pos (doesn't matter)
                              nontc.next.to_s)
      children.each {|child|
        # the child's grammatical function becomes the label of the edge
        child_gf = child.get_attribute("gf")
        child.del_attribute("gf")
        node.add_child(child, child_gf)
        child.add_parent(node, child_gf)
      }
      node.set_attribute("gf", gf)
      stack.push node
      return build_salsatiger(sentence, pos + consumed, stack, termc, nontc, sent_obj)

    else

      if sentence =~ /Fatal error: exception Out_of_memory/
        $stderr.puts "SleepyInterface error: Sleepy parser ran out of memory."
        $stderr.puts "Try reducing the max. sentence length"
        $stderr.puts "in the experiment file."
        exit 1
      end

      $stderr.puts "SleepyInterface Error: cannot analyse sentence at pos #{pos}:\n #{sentence[pos..-1]}\n Complete sentence: \n#{sentence}"
      exit 1
    end
  end

  ###
  # Sleepy delivers node labels as "phrase type"-"grammatical function"
  # but the GF may not be present.
  #
  # returns: [category, grammatical function], both strings;
  #          the grammatical function is "" if the label has none.
  # Terminates the program (exit 1) if the label cannot be split,
  # e.g. because it contains more than one hyphen.
  def split_cat(cat) # string: node label

    cat =~ /^([^-]*)(-([^-]*))?$/
    unless $1
      $stderr.puts "SleepyInterface Error: could not identify category in #{cat}"
      exit 1
    end

    proper_cat = $1

    if $3
      gf = $3
    else
      gf = ""
    end

    return [proper_cat, gf]
  end
end
|
355
|
+
|
356
|
+
|
357
|
+
|
358
|
+
################################################
|
359
|
+
# Interpreter class
|
360
|
+
class SleepyInterpreter < Tiger
  SleepyInterpreter.announce_me()

  ###
  # names of the systems interpreted by this class:
  # returns a hash service(string) -> system name (string),
  # here with the single entry "parser" => "sleepy"
  def self.systems
    { "parser" => "sleepy" }
  end

  ###
  # names of additional systems that may be interpreted by this class
  # returns a hash service(string) -> system name(string),
  # in the same format as systems()
  def self.optional_systems
    { "lemmatizer" => "treetagger" }
  end

end
|