frprep 0.0.1.prealpha
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.yardopts +8 -0
- data/CHANGELOG.rdoc +0 -0
- data/LICENSE.rdoc +0 -0
- data/README.rdoc +0 -0
- data/lib/common/AbstractSynInterface.rb +1227 -0
- data/lib/common/BerkeleyInterface.rb +375 -0
- data/lib/common/CollinsInterface.rb +1165 -0
- data/lib/common/ConfigData.rb +694 -0
- data/lib/common/Counter.rb +18 -0
- data/lib/common/DBInterface.rb +48 -0
- data/lib/common/EnduserMode.rb +27 -0
- data/lib/common/Eval.rb +480 -0
- data/lib/common/FixSynSemMapping.rb +196 -0
- data/lib/common/FrPrepConfigData.rb +66 -0
- data/lib/common/FrprepHelper.rb +1324 -0
- data/lib/common/Graph.rb +345 -0
- data/lib/common/ISO-8859-1.rb +24 -0
- data/lib/common/ML.rb +186 -0
- data/lib/common/Maxent.rb +215 -0
- data/lib/common/MiniparInterface.rb +1388 -0
- data/lib/common/Optimise.rb +195 -0
- data/lib/common/Parser.rb +213 -0
- data/lib/common/RegXML.rb +269 -0
- data/lib/common/RosyConventions.rb +171 -0
- data/lib/common/SQLQuery.rb +243 -0
- data/lib/common/STXmlTerminalOrder.rb +194 -0
- data/lib/common/SalsaTigerRegXML.rb +2347 -0
- data/lib/common/SalsaTigerXMLHelper.rb +99 -0
- data/lib/common/SleepyInterface.rb +384 -0
- data/lib/common/SynInterfaces.rb +275 -0
- data/lib/common/TabFormat.rb +720 -0
- data/lib/common/Tiger.rb +1448 -0
- data/lib/common/TntInterface.rb +44 -0
- data/lib/common/Tree.rb +61 -0
- data/lib/common/TreetaggerInterface.rb +303 -0
- data/lib/common/headz.rb +338 -0
- data/lib/common/option_parser.rb +13 -0
- data/lib/common/ruby_class_extensions.rb +310 -0
- data/lib/fred/Baseline.rb +150 -0
- data/lib/fred/FileZipped.rb +31 -0
- data/lib/fred/FredBOWContext.rb +863 -0
- data/lib/fred/FredConfigData.rb +182 -0
- data/lib/fred/FredConventions.rb +232 -0
- data/lib/fred/FredDetermineTargets.rb +324 -0
- data/lib/fred/FredEval.rb +312 -0
- data/lib/fred/FredFeatureExtractors.rb +321 -0
- data/lib/fred/FredFeatures.rb +1061 -0
- data/lib/fred/FredFeaturize.rb +596 -0
- data/lib/fred/FredNumTrainingSenses.rb +27 -0
- data/lib/fred/FredParameters.rb +402 -0
- data/lib/fred/FredSplit.rb +84 -0
- data/lib/fred/FredSplitPkg.rb +180 -0
- data/lib/fred/FredTest.rb +607 -0
- data/lib/fred/FredTrain.rb +144 -0
- data/lib/fred/PlotAndREval.rb +480 -0
- data/lib/fred/fred.rb +45 -0
- data/lib/fred/md5.rb +23 -0
- data/lib/fred/opt_parser.rb +250 -0
- data/lib/frprep/AbstractSynInterface.rb +1227 -0
- data/lib/frprep/Ampersand.rb +37 -0
- data/lib/frprep/BerkeleyInterface.rb +375 -0
- data/lib/frprep/CollinsInterface.rb +1165 -0
- data/lib/frprep/ConfigData.rb +694 -0
- data/lib/frprep/Counter.rb +18 -0
- data/lib/frprep/FNCorpusXML.rb +643 -0
- data/lib/frprep/FNDatabase.rb +144 -0
- data/lib/frprep/FixSynSemMapping.rb +196 -0
- data/lib/frprep/FrPrepConfigData.rb +66 -0
- data/lib/frprep/FrameXML.rb +513 -0
- data/lib/frprep/FrprepHelper.rb +1324 -0
- data/lib/frprep/Graph.rb +345 -0
- data/lib/frprep/ISO-8859-1.rb +24 -0
- data/lib/frprep/MiniparInterface.rb +1388 -0
- data/lib/frprep/Parser.rb +213 -0
- data/lib/frprep/RegXML.rb +269 -0
- data/lib/frprep/STXmlTerminalOrder.rb +194 -0
- data/lib/frprep/SalsaTigerRegXML.rb +2347 -0
- data/lib/frprep/SalsaTigerXMLHelper.rb +99 -0
- data/lib/frprep/SleepyInterface.rb +384 -0
- data/lib/frprep/SynInterfaces.rb +275 -0
- data/lib/frprep/TabFormat.rb +720 -0
- data/lib/frprep/Tiger.rb +1448 -0
- data/lib/frprep/TntInterface.rb +44 -0
- data/lib/frprep/Tree.rb +61 -0
- data/lib/frprep/TreetaggerInterface.rb +303 -0
- data/lib/frprep/do_parses.rb +142 -0
- data/lib/frprep/frprep.rb +686 -0
- data/lib/frprep/headz.rb +338 -0
- data/lib/frprep/one_parsed_file.rb +28 -0
- data/lib/frprep/opt_parser.rb +94 -0
- data/lib/frprep/ruby_class_extensions.rb +310 -0
- data/lib/rosy/AbstractFeatureAndExternal.rb +240 -0
- data/lib/rosy/DBMySQL.rb +146 -0
- data/lib/rosy/DBSQLite.rb +280 -0
- data/lib/rosy/DBTable.rb +239 -0
- data/lib/rosy/DBWrapper.rb +176 -0
- data/lib/rosy/ExternalConfigData.rb +58 -0
- data/lib/rosy/FailedParses.rb +130 -0
- data/lib/rosy/FeatureInfo.rb +242 -0
- data/lib/rosy/GfInduce.rb +1115 -0
- data/lib/rosy/GfInduceFeature.rb +148 -0
- data/lib/rosy/InputData.rb +294 -0
- data/lib/rosy/RosyConfigData.rb +115 -0
- data/lib/rosy/RosyConfusability.rb +338 -0
- data/lib/rosy/RosyEval.rb +465 -0
- data/lib/rosy/RosyFeatureExtractors.rb +1609 -0
- data/lib/rosy/RosyFeaturize.rb +280 -0
- data/lib/rosy/RosyInspect.rb +336 -0
- data/lib/rosy/RosyIterator.rb +477 -0
- data/lib/rosy/RosyPhase2FeatureExtractors.rb +230 -0
- data/lib/rosy/RosyPruning.rb +165 -0
- data/lib/rosy/RosyServices.rb +744 -0
- data/lib/rosy/RosySplit.rb +232 -0
- data/lib/rosy/RosyTask.rb +19 -0
- data/lib/rosy/RosyTest.rb +826 -0
- data/lib/rosy/RosyTrain.rb +232 -0
- data/lib/rosy/RosyTrainingTestTable.rb +786 -0
- data/lib/rosy/TargetsMostFrequentFrame.rb +60 -0
- data/lib/rosy/View.rb +418 -0
- data/lib/rosy/opt_parser.rb +379 -0
- data/lib/rosy/rosy.rb +77 -0
- data/lib/shalmaneser/version.rb +3 -0
- data/test/frprep/test_opt_parser.rb +94 -0
- data/test/functional/functional_test_helper.rb +40 -0
- data/test/functional/sample_experiment_files/fred_test.salsa.erb +122 -0
- data/test/functional/sample_experiment_files/fred_train.salsa.erb +135 -0
- data/test/functional/sample_experiment_files/prp_test.salsa.erb +138 -0
- data/test/functional/sample_experiment_files/prp_test.salsa.fred.standalone.erb +120 -0
- data/test/functional/sample_experiment_files/prp_test.salsa.rosy.standalone.erb +120 -0
- data/test/functional/sample_experiment_files/prp_train.salsa.erb +138 -0
- data/test/functional/sample_experiment_files/prp_train.salsa.fred.standalone.erb +138 -0
- data/test/functional/sample_experiment_files/prp_train.salsa.rosy.standalone.erb +138 -0
- data/test/functional/sample_experiment_files/rosy_test.salsa.erb +257 -0
- data/test/functional/sample_experiment_files/rosy_train.salsa.erb +259 -0
- data/test/functional/test_fred.rb +47 -0
- data/test/functional/test_frprep.rb +52 -0
- data/test/functional/test_rosy.rb +20 -0
- metadata +270 -0
@@ -0,0 +1,44 @@
|
|
1
|
+
require "tempfile"
|
2
|
+
require "frprep/AbstractSynInterface"
|
3
|
+
|
4
|
+
################################################
|
5
|
+
# Interface class
|
6
|
+
# Interface to the TnT part-of-speech tagger.
#
# The class announces itself via announce_me() (inherited; presumably this
# registers it with the SynInterfaces registry -- confirm in
# AbstractSynInterface) as system "tnt" providing the service "pos_tagger".
class TntInterface < SynInterfaceTab
  TntInterface.announce_me()

  ###
  # returns: string, the name of the wrapped system
  def TntInterface.system()
    return "tnt"
  end

  ###
  # returns: string, the service this interface offers
  def TntInterface.service()
    return "pos_tagger"
  end

  ###
  # Run the tagger on one file and write the tags to 'outfilename'.
  #
  # The words of 'infilename' are first written to a temporary file by
  # fntab_words_to_file (defined outside this file). The tagger is then
  # called on that temporary file and its output is post-processed:
  # 1. grep drops all lines starting with "%%" (tagger commentary)
  # 2. sed replaces the leading "<non-spaces><whitespace><non-spaces>"
  #    of each line by the second group, i.e. it keeps the tag and
  #    drops the word
  #
  # This assumes that the experiment file entry for pos_tagger_path
  # has the form
  # pos_tagger_path = <program_name> <model>
  #
  # raises: RuntimeError if input and output differ in their number of lines
  def process_file(infilename,  # string: name of input file
                   outfilename) # string: name of output file

    tempfile = Tempfile.new("Tnt")
    begin
      TntInterface.fntab_words_to_file(infilename, tempfile)
      tempfile.close

      # The pipeline still needs a shell, but the output file is attached
      # to the shell's stdout by Ruby rather than spliced into the command
      # string, so file names containing spaces or shell metacharacters
      # are safe.
      Kernel.system(@program_path + " " + tempfile.path +
                    ' | grep -v -E "^%%" | sed -e\'s/^[^ ]\{1,\}[[:space:]]\{1,\}\([^ ]\{1,\}\)/\1/\'',
                    :out => outfilename)
    ensure
      # delete tempfile, also if the tagger call failed
      tempfile.close(true)
    end

    unless tnt_newline_count(infilename) == tnt_newline_count(outfilename)
      raise "Error: tagged file has different line number from corpus file!"
    end
  end

  private

  ###
  # Number of newline characters in a file (what "wc -l" reports),
  # or 0 if the file does not exist.
  def tnt_newline_count(filename) # string: name of the file to inspect
    return 0 unless File.file?(filename)

    count = 0
    File.open(filename, "rb") { |file|
      file.each_line { |line| count += 1 if line.end_with?("\n") }
    }
    return count
  end
end
|
44
|
+
|
data/lib/frprep/Tree.rb
ADDED
@@ -0,0 +1,61 @@
|
|
1
|
+
require 'frprep/Graph'
|
2
|
+
|
3
|
+
# A graph node that has at most one parent.
#
# All singular accessors below delegate to the plural accessors that are
# not defined in this file (parents, parent_labels, parents_with_edgelabel;
# presumably inherited from GraphNode) and return their first element.
class TreeNode < GraphNode

  def initialize(id)
    super(id)
  end

  # redo the ancestor-related methods,
  # since here we only have one parent per node

  ###
  # returns: the first entry of parents(), or nil if parents() is nil
  def parent()
    retv = parents()
    return nil if retv.nil?
    return retv.first
  end

  ###
  # returns: the first entry of parent_labels(), or nil if parent_labels() is nil
  def parent_label()
    retv = parent_labels()
    return nil if retv.nil?
    return retv.first
  end

  ###
  # returns: the first entry of parents_with_edgelabel(),
  # or nil if parents_with_edgelabel() is nil
  def parent_with_edgelabel()
    retv = parents_with_edgelabel()
    return nil if retv.nil?
    return retv.first
  end

  ###
  # Adding a parent to a tree node means replacing the current one.
  def add_parent(parent, edgelabel, varhash={})
    set_parent(parent, edgelabel, varhash)
  end

  ###
  # Make 'parent' the only parent of this node, linked by 'edgelabel'.
  #
  # Unless varhash["pointer_insteadof_edge"] is set, this node is also
  # added as a child of the new parent (if it is not one already).
  def set_parent(parent, edgelabel, varhash={})
    # remove old parent
    # The block parameters must not be called 'parent': on Ruby 1.8 a block
    # parameter with the name of an outer local assigns to that local, which
    # would replace the new parent by the old one before it is stored below.
    # NOTE(review): remove_parent is called while each_parent_with_edgelabel
    # is iterating; whether that is safe depends on GraphNode -- confirm.
    each_parent_with_edgelabel { |old_label, old_parent|
      remove_parent(old_parent, old_label, varhash)
    }

    # set new parent
    @parents << [edgelabel, parent]

    # and vice versa: add self as child to parent
    unless varhash["pointer_insteadof_edge"]
      unless parent.children_with_edgelabel().include? [edgelabel, self]
        parent.add_child(self, edgelabel)
      end
    end
  end
end
|
@@ -0,0 +1,303 @@
|
|
1
|
+
# sp 30 11 06
|
2
|
+
# extended by TreeTaggerPOSInterface
|
3
|
+
|
4
|
+
require "tempfile"
|
5
|
+
|
6
|
+
require "frprep/AbstractSynInterface"
|
7
|
+
|
8
|
+
###########
|
9
|
+
# KE dec 7, 06
|
10
|
+
# common mixin for both Treetagger modules, doing the actual processing
|
11
|
+
module TreetaggerModule
  ###
  # Treetagger does both lemmatization and POS-tagging.
  # However, the way the SynInterface system is set up in Shalmaneser,
  # each SynInterface can offer only _one_ service.
  # This means that we cannot do a SynInterface that writes
  # both a POS file and a lemma file.
  # Instead, both will include this module, which does the
  # actual TreeTagger call and then stores the result in a file
  # of its own, named like the 'outfilename' given to process_file
  # but with the extension ".TreeTagger".
  # really_process_file checks for existence of this file because,
  # if the TreeTagger lemmatization and POS-tagging classes are called separately,
  # one of them will go first, and the 2nd one will not need to do the
  # TreeTagger call anymore
  #
  # The including class has to provide @program_path, the command that
  # starts TreeTagger.
  #
  # returns: string, the name of the file containing the raw TreeTagger output
  #
  # raises: RuntimeError if the TreeTagger output differs from the input
  # by anything but one missing final line
  def really_process_file(infilename, # string: name of input file
                          outfilename,# string: name of file that the caller is to produce
                          make_new_outfile_anyway = false) # Boolean: run TreeTagger in any case?

    # fabricate the filename in which the
    # actual TreeTagger output will be placed:
    # <directory> + <outfilename minus last suffix> + ".TreeTagger"
    # File.extname only looks at the last path component, so this also
    # works for names without any suffix and for directories with dots.
    current_suffix = File.extname(outfilename)
    my_outfilename = File.join(File.dirname(outfilename),
                               File.basename(outfilename, current_suffix) + ".TreeTagger")

    ##
    # does it exist? then just return it
    if !make_new_outfile_anyway && File.exist?(my_outfilename)
      return my_outfilename
    end

    ##
    # else construct it, then return it
    tempfile = Tempfile.new("Treetagger")
    begin
      TreetaggerInterface.fntab_words_to_file(infilename, tempfile, "<EOS>", "iso")
      tempfile.close

      # call TreeTagger; its stdout is redirected by Ruby, so the output
      # file name never becomes part of a shell command line
      Kernel.system(@program_path + " " + tempfile.path, :out => my_outfilename)
    ensure
      # delete tempfile, also if something went wrong above
      tempfile.close(true)
    end

    # external problem: sometimes, the treetagger keeps the last <EOS> for itself,
    # resulting on a .tagged file missing the last (blank) line

    original_length = treetagger_newline_count(infilename)
    # NOTE(review): this looks like leftover debug/progress output on stdout;
    # kept because callers may rely on it -- confirm and remove.
    puts infilename
    lemmatised_length = treetagger_newline_count(my_outfilename)

    case original_length - lemmatised_length
    when 0
      # everything ok, don't do anything
    when 1
      # add one more newline to the TreeTagger output file
      File.open(my_outfilename, "a") { |file| file.print "\n" }
    else
      # this is "real" error
      STDERR.puts "Original length: #{original_length}\tLemmatised length: #{lemmatised_length}"
      STDERR.puts "Error: lemmatiser/tagger output for for #{File.basename(infilename)}"
      $stderr.puts "has different line number from corpus file!"
      raise "Error: lemmatiser/tagger output for #{File.basename(infilename)} " +
            "has different line number from corpus file!"
    end

    return my_outfilename
  end

  private

  ###
  # Number of newline characters in a file (what "wc -l" reports),
  # or 0 if the file does not exist.
  def treetagger_newline_count(filename) # string: name of the file to inspect
    return 0 unless File.file?(filename)

    count = 0
    File.open(filename, "rb") { |file|
      file.each_line { |line| count += 1 if line.end_with?("\n") }
    }
    return count
  end
end
|
88
|
+
|
89
|
+
#######################################
|
90
|
+
# TreeTagger as a lemmatizer: system "treetagger", service "lemmatizer".
# The TreeTagger call itself is done by TreetaggerModule#really_process_file.
class TreetaggerInterface < SynInterfaceTab
  TreetaggerInterface.announce_me()

  include TreetaggerModule

  ###
  # returns: string, the name of the wrapped system
  def TreetaggerInterface.system()
    return "treetagger"
  end

  ###
  # returns: string, the service this interface offers
  def TreetaggerInterface.service()
    return "lemmatizer"
  end

  ###
  # Rewrite brackets and quotes in one lemma line:
  # "(" -> "-LRB-", ")" -> "-RRB-", "''" and "``" -> '"'.
  # The trailing line break is removed; the argument itself is not modified.
  #
  # returns: string, the converted line
  def convert_to_berkeley(line) # string: one line of lemma output
    return line.chomp.gsub(/\(/,"-LRB-").gsub(/\)/,"-RRB-").gsub(/''/,"\"").gsub(/\`\`/,"\"")
  end

  ###
  # Write the lemmas for 'infilename' to 'outfilename', one per line.
  #
  # raises: RuntimeError if 'outfilename' cannot be opened for writing
  def process_file(infilename,  # string: name of input file
                   outfilename) # string: name of output file

    # KE change here
    ttfilename = really_process_file(infilename, outfilename)

    # write all output to tempfile2 first, then
    # change ISO to UTF-8 into outputfile
    tempfile2 = Tempfile.new("treetagger")
    begin
      tempfile2.close()

      # remove the <EOS> markers, then use cut to get the actual
      # lemmatisation (field 3). Input and output files are attached by
      # Ruby, so their names never become part of the shell command line.
      Kernel.system("sed -e's/<EOS>//' | cut -f3",
                    :in => ttfilename, :out => tempfile2.path())

      # transform ISO-8859-1 back to UTF-8,
      # write to 'outfilename'
      begin
        outfile = File.new(outfilename, "w")
      rescue
        raise "Could not write to #{outfilename}"
      end

      begin
        tempfile2.open
        # AB: Internally all the flow is an utf-8 encoded stream.
        # TreeTagger consumes one byte encodings (but we should provide a
        # utf-8 model for German). So we convert utf-8 to latin1, then
        # process the text and convert it back to utf-8.
        while line = tempfile2.gets
          utf8line = UtfIso.from_iso_8859_1(line)
          outfile.puts convert_to_berkeley(utf8line)
        end
      ensure
        # finalize output file, also if the conversion failed half-way
        outfile.close()
      end
    ensure
      # remove second tempfile
      tempfile2.close(true)
    end
  end
end
|
156
|
+
|
157
|
+
|
158
|
+
# sp 30 11 06
|
159
|
+
#
|
160
|
+
# using TreeTagger for POS tagging of English text
|
161
|
+
#
|
162
|
+
# copy-and-paste from lemmatisation
|
163
|
+
#
|
164
|
+
# differences:
|
165
|
+
# 1. use field 2 and not 3 from the output
|
166
|
+
# 2. convert tags from what Treetagger thinks is the Penn Tagset to what TnT and Collins think is the Penn Tagset
|
167
|
+
#
|
168
|
+
# KE 7 12 06
|
169
|
+
# change interface such that TreeTagger is called only once
|
170
|
+
# and both POS tags and lemma are read from the same files,
|
171
|
+
# rather than calling the tagger twice
|
172
|
+
# TreeTagger as a POS tagger: system "treetagger", service "pos_tagger".
# The TreeTagger call itself is done by TreetaggerModule#really_process_file.
class TreetaggerPOSInterface < SynInterfaceTab
  TreetaggerPOSInterface.announce_me()
  include TreetaggerModule

  ###
  # returns: string, the name of the wrapped system
  def TreetaggerPOSInterface.system()
    return "treetagger"
  end

  ###
  # returns: string, the service this interface offers
  def TreetaggerPOSInterface.service()
    return "pos_tagger"
  end

  ###
  # convert TreeTagger's penn tagset into Collins' penn tagset *argh*
  #
  # Only the beginning of the line is rewritten:
  # PP -> PRP, NP -> NNP, VV -> VB, VH -> VB, SENT -> "."
  # The trailing line break is removed; the argument itself is not modified.
  #
  # returns: string, the converted tag line
  def convert_to_collins(line) # string: one line of POS tag output
    return line.chomp.gsub(/^PP/,"PRP").gsub(/^NP/,"NNP").gsub(/^VV/,"VB").gsub(/^VH/,"VB").gsub(/^SENT/,".")
  end

  ###
  # Write the POS tags for 'infilename' to 'outfilename', one per line.
  # TreeTagger is always run here (really_process_file is called with
  # make_new_outfile_anyway = true).
  #
  # raises: RuntimeError if 'outfilename' cannot be opened for writing
  def process_file(infilename,  # string: name of input file
                   outfilename) # string: name of output file

    # KE change here
    tt_filename = really_process_file(infilename, outfilename, true)

    # write all output to tempfile2 first, then
    # change ISO to UTF-8 into outputfile
    tempfile2 = Tempfile.new("treetagger")
    begin
      tempfile2.close()

      # remove the <EOS> markers, then use cut to get the POS tags
      # (field 2). Input and output files are attached by Ruby, so their
      # names never become part of the shell command line.
      Kernel.system("sed -e's/<EOS>//' | cut -f2",
                    :in => tt_filename, :out => tempfile2.path())

      # transform ISO-8859-1 back to UTF-8,
      # write to 'outfilename'
      begin
        outfile = File.new(outfilename, "w")
      rescue
        raise "Could not write to #{outfilename}"
      end

      begin
        tempfile2.open()
        while (line = tempfile2.gets())
          outfile.puts UtfIso.from_iso_8859_1(convert_to_collins(line))
        end
      ensure
        # finalize output file, also if the conversion failed half-way
        outfile.close()
      end
    ensure
      # remove second tempfile
      tempfile2.close(true)
    end
  end
end
|
228
|
+
|
229
|
+
###############
|
230
|
+
# an interpreter that only has Treetagger, no parser
|
231
|
+
class TreetaggerInterpreter < SynInterpreter
  TreetaggerInterpreter.announce_me()

  ###
  # Systems this class interprets, as a hash
  # service(string) -> system name(string).
  # A parser-based interpreter would return something like
  # { "parser" => "collins", "lemmatizer" => "treetagger" };
  # here only the POS tagger is required.
  def TreetaggerInterpreter.systems()
    required = {}
    required["pos_tagger"] = "treetagger"
    return required
  end

  ###
  # Additional systems this class can interpret, in the same
  # format as systems(): service(string) -> system name(string)
  def TreetaggerInterpreter.optional_systems()
    optional = {}
    optional["lemmatizer"] = "treetagger"
    return optional
  end

  ###
  # Map the phrase type / POS tag of a node onto a coarse category.
  #
  # Only the part of the label before the first "-" is looked at.
  # The rules are tried top to bottom; the first rule with a matching
  # pattern decides. Possible results:
  #
  # adj:   adjective (phrase)
  # adv:   adverb (phrase)
  # card:  numbers, quantity phrases
  # con:   conjunction
  # det:   determiner, including possessive/demonstrative pronouns etc.
  # for:   foreign material
  # noun:  noun (phrase), including personal pronouns, proper names, expletives
  # prep:  preposition (phrase)
  # pun:   punctuation, brackets, etc.
  # sent:  sentence
  # top:   top node of a sentence
  # trace: trace
  # verb:  verb (phrase)
  # nil:   phrase type unavailable or not covered by any rule
  #
  # ("part" for particles is never returned by the rules below.)
  #
  # returns: string, or nil
  def TreetaggerInterpreter.category(node) # SynNode
    phrase_type = TreetaggerInterpreter.pt(node)
    # phrase type could not be determined
    return nil if phrase_type.nil?

    # everything up to, but excluding, the first "-"
    label = phrase_type.to_s.strip()[/^([^-]*)/, 1]

    # [patterns, category] pairs; a pattern is compared with ===,
    # i.e. regexp match for regexps and equality for strings
    rules = [
      [[/^JJ/, /(WH)?ADJP/, /^PDT/], "adj"],
      [[/^RB/, /(WH)?ADVP/, /^UH/], "adv"],
      [[/^CD/, /^QP/], "card"],
      [[/^CC/, /^WRB/, /^CONJP/], "con"],
      [[/^DT/, /^POS/], "det"],
      [[/^FW/, /^SYM/], "for"],
      [[/^N/, "WHAD", "WDT", /^PRP/, /^WHNP/, /^EX/, /^WP/], "noun"],
      [[/^IN/, /^TO/, /(WH)?PP/, "RP", /^PR(T|N)/], "prep"],
      [[/^PUNC/, /LRB/, /RRB/, /[,'".:;!?\(\)]/], "pun"],
      [[/^S(s|bar|BAR|G|Q|BARQ|INV)?$/, /^UCP/, /^FRAG/, /^X/, /^INTJ/], "sent"],
      [[/^TOP/], "top"],
      [[/^TRACE/], "trace"],
      [[/^V/, /^MD/], "verb"]
    ]

    hit = rules.find { |patterns, _category|
      patterns.any? { |pattern| pattern === label }
    }

    # no rule applies: unknown category/POS
    return nil if hit.nil?
    return hit.last
  end
end
|
@@ -0,0 +1,142 @@
|
|
1
|
+
# -*- encoding: utf-8 -*-
|
2
|
+
|
3
|
+
# AB, 2010-11-25
|
4
|
+
|
5
|
+
##############################
|
6
|
+
# class for managing parses:
|
7
|
+
#
|
8
|
+
# Given either a directory with tab format files or
|
9
|
+
# a directory with SalsaTigerXML files (or both) and
|
10
|
+
# a directory for putting parse files:
|
11
|
+
# - parse, unless no parsing set in the experiment file
|
12
|
+
# - for each parsed file: yield one OneParsedFile object
|
13
|
+
require 'frprep/one_parsed_file'
|
14
|
+
|
15
|
+
class DoParses
  ###
  # Only stores its arguments and looks up "directory_parserout"
  # in the experiment file; no parsing happens here.
  def initialize(exp, # FrPrepConfigData object
                 file_suffixes, # hash: file type(string) -> suffix(string)
                 parse_dir, # string: name of directory to put parses
                 var_hash = {}) # further directories: "tab_dir", "stxml_dir"
    @exp = exp
    @file_suffixes = file_suffixes
    @parse_dir = parse_dir
    @tab_dir = var_hash["tab_dir"]
    @stxml_dir = var_hash["stxml_dir"]

    # pre-parsed data available?
    @parsed_files = @exp.get("directory_parserout")
  end

  ###
  # Yield one OneParsedFile object per input file.
  #
  # Depending on the experiment file:
  # - do_parse set, directory_parserout set: reuse the parses found there
  # - do_parse set, no directory_parserout: parse the tab files into parse_dir
  # - do_parse not set: build pseudo-parses from the SalsaTigerXML files
  #   in stxml_dir or, if there is no stxml_dir, from the tab files
  #
  # Directory names may be given with or without a trailing "/".
  # Files are yielded in sorted order of their names.
  #
  # returns: an Enumerator if no block is given
  #
  # raises: RuntimeError if a required setting or directory is missing
  def each_parsed_file()
    return enum_for(:each_parsed_file) unless block_given?

    postag_suffix = @exp.get("do_postag") ? @file_suffixes["pos"] : nil
    lemma_suffix = @exp.get("do_lemmatize") ? @file_suffixes["lemma"] : nil

    if @exp.get("do_parse")
      each_real_parse(postag_suffix, lemma_suffix) { |parsed| yield parsed }
    elsif @stxml_dir
      # no parse: get pseudo-parse tree from existing SalsaTigerXML files
      each_stxml_pseudo_parse(postag_suffix, lemma_suffix) { |parsed| yield parsed }
    else
      # no parse: construct SalsaTigerXML from tab files
      each_tab_pseudo_parse(postag_suffix, lemma_suffix) { |parsed| yield parsed }
    end
  end

  private

  ###
  # Parser output: either reuse pre-computed parses or run the parser.
  def each_real_parse(postag_suffix, lemma_suffix)
    # get parser interface
    sys_class = SynInterfaces.get_interface("parser",
                                            @exp.get("parser"))
    unless sys_class
      raise "Shouldn't be here: no parser interface for '#{@exp.get("parser")}'"
    end
    parse_suffix = "." + sys_class.name()
    sys = sys_class.new(@exp.get("parser_path"),
                        @file_suffixes["tab"],
                        parse_suffix,
                        @file_suffixes["stxml"],
                        "pos_suffix" => postag_suffix,
                        "lemma_suffix" => lemma_suffix,
                        "tab_dir" => @tab_dir)

    if @parsed_files
      # reuse old parses
      $stderr.puts "Frprep: using pre-computed parses in " + @parsed_files.to_s()
      $stderr.puts "Frprep: Postprocessing SalsaTigerXML data"

      Dir[File.join(@parsed_files, "*")].sort.each { |parsefilename|
        # skip anything other than a file (also dangling links)
        next unless File.file?(parsefilename)

        # core filename: remove directory and anything after the last "."
        filename_core = File.basename(parsefilename, ".*")
        yield OneParsedFile.new(filename_core, parsefilename, sys)
      }

    else
      # do new parses
      $stderr.puts "Frprep: Parsing"

      # sanity check
      unless @exp.get("parser_path")
        raise "Parsing: I need 'parser_path' in the experiment file"
      end
      unless @tab_dir
        raise "Cannot parse without tab files"
      end

      # parse
      sys.process_dir(@tab_dir, @parse_dir)

      $stderr.puts "Frprep: Postprocessing SalsaTigerXML data"

      Dir[File.join(@parse_dir, "*" + parse_suffix)].sort.each { |parsefilename|
        filename_core = File.basename(parsefilename, parse_suffix)
        yield OneParsedFile.new(filename_core, parsefilename, sys)
      }
    end
  end

  ###
  # Pseudo-parses from existing SalsaTigerXML files; the matching tab
  # file name is passed along if the tab directory is known, else nil.
  def each_stxml_pseudo_parse(postag_suffix, lemma_suffix)
    Dir[File.join(@stxml_dir, "*.xml")].sort.each { |stxmlfilename|
      filename_core = File.basename(stxmlfilename, ".xml")

      tabfilename = nil
      if @tab_dir
        # we know the tab directory too
        tabfilename = File.join(@tab_dir, filename_core + @file_suffixes["tab"])
      end

      each_sentence_obj = FrprepReadStxml.new(stxmlfilename, tabfilename,
                                              postag_suffix, lemma_suffix)
      yield OneParsedFile.new(filename_core, stxmlfilename, each_sentence_obj)
    }
  end

  ###
  # Pseudo-parses constructed from the tab files alone.
  def each_tab_pseudo_parse(postag_suffix, lemma_suffix)
    unless @tab_dir
      raise "Cannot construct pseudo-parses: I need either SalsaTigerXML files or tab files"
    end

    tab_suffix = @file_suffixes["tab"]
    Dir[File.join(@tab_dir, "*" + tab_suffix)].sort.each { |tabfilename|
      each_sentence_obj = FrprepFlatSyntax.new(tabfilename,
                                               postag_suffix,
                                               lemma_suffix)
      filename_core = File.basename(tabfilename, tab_suffix)
      yield OneParsedFile.new(filename_core, tabfilename, each_sentence_obj)
    }
  end
end
|