shalmaneser 1.2.0.rc4 → 1.2.rc5
- checksums.yaml +4 -4
- data/README.md +47 -18
- data/bin/shalmaneser +8 -2
- data/doc/index.md +1 -0
- data/lib/shalmaneser/opt_parser.rb +68 -67
- metadata +49 -119
- data/bin/fred +0 -16
- data/bin/frprep +0 -34
- data/bin/rosy +0 -17
- data/lib/common/AbstractSynInterface.rb +0 -1229
- data/lib/common/Counter.rb +0 -18
- data/lib/common/EnduserMode.rb +0 -27
- data/lib/common/Eval.rb +0 -480
- data/lib/common/FixSynSemMapping.rb +0 -196
- data/lib/common/Graph.rb +0 -345
- data/lib/common/ISO-8859-1.rb +0 -24
- data/lib/common/ML.rb +0 -186
- data/lib/common/Mallet.rb +0 -236
- data/lib/common/Maxent.rb +0 -229
- data/lib/common/Optimise.rb +0 -195
- data/lib/common/Parser.rb +0 -213
- data/lib/common/RegXML.rb +0 -269
- data/lib/common/RosyConventions.rb +0 -171
- data/lib/common/STXmlTerminalOrder.rb +0 -194
- data/lib/common/SalsaTigerRegXML.rb +0 -2347
- data/lib/common/SalsaTigerXMLHelper.rb +0 -99
- data/lib/common/SynInterfaces.rb +0 -282
- data/lib/common/TabFormat.rb +0 -721
- data/lib/common/Tiger.rb +0 -1448
- data/lib/common/Timbl.rb +0 -144
- data/lib/common/Tree.rb +0 -61
- data/lib/common/config_data.rb +0 -470
- data/lib/common/config_format_element.rb +0 -220
- data/lib/common/headz.rb +0 -338
- data/lib/common/option_parser.rb +0 -13
- data/lib/common/prep_config_data.rb +0 -62
- data/lib/common/prep_helper.rb +0 -1330
- data/lib/common/ruby_class_extensions.rb +0 -310
- data/lib/db/db_interface.rb +0 -48
- data/lib/db/db_mysql.rb +0 -145
- data/lib/db/db_sqlite.rb +0 -280
- data/lib/db/db_table.rb +0 -239
- data/lib/db/db_wrapper.rb +0 -176
- data/lib/db/sql_query.rb +0 -243
- data/lib/ext/maxent/Classify.class +0 -0
- data/lib/ext/maxent/Train.class +0 -0
- data/lib/fred/Baseline.rb +0 -150
- data/lib/fred/FileZipped.rb +0 -31
- data/lib/fred/FredBOWContext.rb +0 -877
- data/lib/fred/FredConventions.rb +0 -232
- data/lib/fred/FredDetermineTargets.rb +0 -319
- data/lib/fred/FredEval.rb +0 -312
- data/lib/fred/FredFeatureExtractors.rb +0 -322
- data/lib/fred/FredFeatures.rb +0 -1061
- data/lib/fred/FredFeaturize.rb +0 -602
- data/lib/fred/FredNumTrainingSenses.rb +0 -27
- data/lib/fred/FredParameters.rb +0 -402
- data/lib/fred/FredSplit.rb +0 -84
- data/lib/fred/FredSplitPkg.rb +0 -180
- data/lib/fred/FredTest.rb +0 -606
- data/lib/fred/FredTrain.rb +0 -144
- data/lib/fred/PlotAndREval.rb +0 -480
- data/lib/fred/fred.rb +0 -47
- data/lib/fred/fred_config_data.rb +0 -185
- data/lib/fred/md5.rb +0 -23
- data/lib/fred/opt_parser.rb +0 -250
- data/lib/frprep/Ampersand.rb +0 -39
- data/lib/frprep/CollinsInterface.rb +0 -1165
- data/lib/frprep/Counter.rb +0 -18
- data/lib/frprep/FNCorpusXML.rb +0 -643
- data/lib/frprep/FNDatabase.rb +0 -144
- data/lib/frprep/FrameXML.rb +0 -513
- data/lib/frprep/Graph.rb +0 -345
- data/lib/frprep/MiniparInterface.rb +0 -1388
- data/lib/frprep/RegXML.rb +0 -269
- data/lib/frprep/STXmlTerminalOrder.rb +0 -194
- data/lib/frprep/SleepyInterface.rb +0 -384
- data/lib/frprep/TntInterface.rb +0 -44
- data/lib/frprep/TreetaggerInterface.rb +0 -327
- data/lib/frprep/do_parses.rb +0 -143
- data/lib/frprep/frprep.rb +0 -693
- data/lib/frprep/interfaces/berkeley_interface.rb +0 -372
- data/lib/frprep/interfaces/stanford_interface.rb +0 -353
- data/lib/frprep/interpreters/berkeley_interpreter.rb +0 -22
- data/lib/frprep/interpreters/stanford_interpreter.rb +0 -22
- data/lib/frprep/one_parsed_file.rb +0 -28
- data/lib/frprep/opt_parser.rb +0 -94
- data/lib/frprep/ruby_class_extensions.rb +0 -310
- data/lib/rosy/AbstractFeatureAndExternal.rb +0 -242
- data/lib/rosy/ExternalConfigData.rb +0 -58
- data/lib/rosy/FailedParses.rb +0 -130
- data/lib/rosy/FeatureInfo.rb +0 -242
- data/lib/rosy/GfInduce.rb +0 -1115
- data/lib/rosy/GfInduceFeature.rb +0 -148
- data/lib/rosy/InputData.rb +0 -294
- data/lib/rosy/RosyConfusability.rb +0 -338
- data/lib/rosy/RosyEval.rb +0 -465
- data/lib/rosy/RosyFeatureExtractors.rb +0 -1609
- data/lib/rosy/RosyFeaturize.rb +0 -281
- data/lib/rosy/RosyInspect.rb +0 -336
- data/lib/rosy/RosyIterator.rb +0 -478
- data/lib/rosy/RosyPhase2FeatureExtractors.rb +0 -230
- data/lib/rosy/RosyPruning.rb +0 -165
- data/lib/rosy/RosyServices.rb +0 -744
- data/lib/rosy/RosySplit.rb +0 -232
- data/lib/rosy/RosyTask.rb +0 -19
- data/lib/rosy/RosyTest.rb +0 -829
- data/lib/rosy/RosyTrain.rb +0 -234
- data/lib/rosy/RosyTrainingTestTable.rb +0 -787
- data/lib/rosy/TargetsMostFrequentFrame.rb +0 -60
- data/lib/rosy/View.rb +0 -418
- data/lib/rosy/opt_parser.rb +0 -379
- data/lib/rosy/rosy.rb +0 -78
- data/lib/rosy/rosy_config_data.rb +0 -121
- data/lib/shalmaneser/version.rb +0 -3
data/lib/fred/FredConventions.rb
DELETED
@@ -1,232 +0,0 @@
-# FredConventions
-# Katrin Erk June 05
-#
-# several small things that should be uniform
-# throughout the system
-
-require "common/ruby_class_extensions"
-
-require "common/EnduserMode"
-class Object
-
-  ###
-  # joining and breaking up senses
-  def fred_join_senses(senses)
-    return senses.sort().join("++")
-  end
-
-  def fred_split_sense(joined_senses)
-    return joined_senses.split("++")
-  end
-
-  ###
-  # fred_dirname
-  #
-  # constructs a directory name:
-  # fred data directory / experiment ID / maindir / subdir
-  #
-  # if is_existing == existing, the directory is checked for existence,
-  # if is_existing == new, it is created if necessary
-  #
-  # returns: a string
-  def fred_dirname(exp,          # FredConfigData object
-                   maindir,      # string: main part of directory name
-                   subdir,       # string: subpart of directory name
-                   is_existing = "existing") # string: "existing" or "new", default: existing
-
-    case is_existing
-    when "existing"
-      return File.existing_dir(exp.get("fred_directory"),
-                               exp.get("experiment_ID"),
-                               maindir,
-                               subdir)
-    when "new"
-      return File.new_dir(exp.get("fred_directory"),
-                          exp.get("experiment_ID"),
-                          maindir,
-                          subdir)
-    else
-      raise "Shouldn't be here: #{is_existing}"
-    end
-  end
-
-  ####
-  # filenames for feature files
-  def fred_feature_filename(lemma, sense = nil,
-                            do_binary = false)
-    if do_binary
-      return "fred.features.#{lemma}.SENSE.#{sense}"
-    else
-      return "fred.features.#{lemma}"
-    end
-  end
-
-  ####
-  # filenames for split files
-  def fred_split_filename(lemma)
-    return "fred.split.#{lemma}"
-  end
-
-  ###
-  # deconstruct split filename
-  # returns: lemma
-  def deconstruct_fred_split_filename(filename)
-    basename = File.basename(filename)
-    if basename =~ /^fred\.split\.(.*)/
-      return $1
-    else
-      return nil
-    end
-  end
-
-  ###
-  # deconstruct feature file name
-  # returns: hash with keys
-  # "lemma"
-  # "sense
-  def deconstruct_fred_feature_filename(filename)
-
-    basename = File.basename(filename)
-    retv = Hash.new()
-    # binary:
-    # fred.features.#{lemma}.SENSE.#{sense}
-    if basename =~ /^fred\.features\.(.*)\.SENSE\.(.*)$/
-      retv["lemma"] = $1
-      retv["sense"] = $2
-    elsif basename =~ /^fred\.features\.(.*)/
-      # fred.features.#{lemma}
-      retv["lemma"] = $1
-
-    else
-      # complete mismatch
-      return nil
-    end
-
-    return retv
-  end
-
-  ####
-  # filename for answer key files
-  def fred_answerkey_filename(lemma)
-    return "fred.answerkey.#{lemma}"
-  end
-
-  ###
-  # classifier directory
-  def fred_classifier_directory(exp,           # FredConfigData object
-                                splitID = nil) # string or nil
-
-    if exp.get("classifier_dir")
-      # user-specified classifier directory
-
-      if splitID
-        return File.new_dir(exp.get("classifier_dir"), splitID)
-      else
-        return File.new_dir(exp.get("classifier_dir"))
-      end
-
-    else
-      # my classifier directory
-      if splitID
-        return fred_dirname(exp, "classifiers", splitID, "new")
-      else
-        return fred_dirname(exp, "classifiers", "all", "new")
-      end
-    end
-  end
-
-  ###
-  # classifier file
-  def fred_classifier_filename(classifier, lemma, sense=nil)
-    if sense
-      return "fred.classif.#{classifier}.LEMMA.#{lemma}.SENSE.#{sense}"
-    else
-      return "fred.classif.#{classifier}.LEMMA.#{lemma}"
-    end
-  end
-
-  def deconstruct_fred_classifier_filename(filename)
-    retv = Hash.new()
-    if filename =~ /^fred\.classif\.(.*)\.LEMMA\.(.*)\.SENSE\.(.*)$/
-      retv["lemma"] = $2
-      retv["sense"] = $3
-    elsif filename =~ /^fred\.classif\.(.*)\.LEMMA\.(.*)$/
-      retv["lemma"] = $2
-    end
-    return retv
-  end
-
-  ###
-  # result file
-  def fred_result_filename(lemma)
-    return "fred.result.#{lemma.gsub(/\./, "_")}"
-  end
-
-  ##########
-  # lemma and POS: combine into string separated by
-  # a separator character
-  #
-  # fred_lemmapos_combine: take two strings, return combined string
-  # if POS is nil, returns lemma<separator character>
-  # fred_lemmapos_separate: take one string, return two strings
-  # if no POS could be retrieved, returns nil as POS and the whole string as lemma
-  def fred_lemmapos_combine(lemma, # string
-                            pos)   # string
-    return lemma.to_s + "." + pos.to_s.gsub(/\./, "DOT")
-  end
-
-  ###
-  def fred_lemmapos_separate(lemmapos) # string
-    pieces = lemmapos.split(".")
-    if pieces.length() > 1
-      return [ pieces[0..-2].join("."), pieces[-1] ]
-    else
-      # no POS found, treat all of lemmapos as lemma
-      return [ lemmapos, nil ]
-    end
-  end
-end
-
-########################################
-# given a SynNode object representing a terminal,
-# return:
-# - the word
-# - the lemma
-# - the part of speech
-# - the named entity (if any)
-#
-# as a tuple
-#
-# WARNING: word and lemma are turned to lowercase
-module WordLemmaPosNe
-  def word_lemma_pos_ne(syn_obj, # SynNode object
-                        i)       # SynInterpreter class
-    unless syn_obj.is_terminal?
-      $stderr.puts "Featurization warning: unexpectedly received non-terminal"
-      return [ nil, nil, nil, nil ]
-    end
-
-    word = syn_obj.word()
-    if word
-      word.downcase!
-    end
-
-    lemma = i.lemma_backoff(syn_obj)
-    if lemma and SalsaTigerXMLHelper.unescape(lemma) == "<unknown>"
-      lemma = nil
-    end
-    if lemma
-      lemma.downcase!
-    end
-
-    pos = syn_obj.part_of_speech()
-
-    ne = syn_obj.get_attribute("ne")
-    unless ne
-      ne = syn_obj.get_attribute("headof_ne")
-    end
-
-    return [word, lemma, pos, ne]
-  end
-end
-
data/lib/fred/FredDetermineTargets.rb
DELETED
@@ -1,319 +0,0 @@
-require "fred/FileZipped"
-
-require "fred/fred_config_data"
-require "common/SynInterfaces"
-require "fred/FredConventions"
-
-
-########################################
-# target determination classes:
-# either determine targets from existing annotation
-# with frames,
-# or use all known targets.
-class Targets
-  attr_reader :targets_okay
-
-  ###
-  def initialize(exp,               # experiment file object
-                 interpreter_class, # SynInterpreter class, or nil
-                 mode)              # string: "r", "w", "a", as in files
-    @exp = exp
-    @interpreter_class = interpreter_class
-
-    # keep recorded targets here.
-    # try to read old list now.
-    @targets = Hash.new()
-
-    # write target info in the classifier directory.
-    # This is _not_ dependent on a potential split ID
-    @dir = File.new_dir(fred_classifier_directory(@exp), "targets")
-
-    @targets_okay = true
-    case mode
-    when "w"
-      # start from scratch, no list of targets
-    when "a", "r"
-      # read existing file containing targets
-      begin
-        file = FileZipped.new(@dir + "targets.txt.gz")
-      rescue
-        # no pickle present: signal this
-        @targets_okay = false
-        return
-      end
-      file.each { |line|
-        line.chomp!
-        if line =~ /^LEMMA (.+) SENSES (.+)$/
-          lemmapos = $1
-          senses = $2.split()
-          lemmapos.gsub!(/ /, '_')
-          #lemmapos.gsub!(/\.[A-Z]\./, '.')
-          @targets[lemmapos] = senses
-        end
-      }
-
-    else
-      $stderr.puts "Error: shouldn't be here."
-      exit 1
-    end
-
-    if ["w", "a"].include? mode
-      @record_targets = true
-    else
-      @record_targets = false
-    end
-  end
-
-  ###
-  # determine_targets:
-  # for a given SalsaTigerSentence,
-  # determine all targets,
-  # each as a _single_ main terminal node
-  #
-  # We need a single terminal node in order
-  # to compute the context window
-  #
-  # returns:
-  # hash: target_IDs -> list of senses
-  # where target_IDs is a pair [list of terminal IDs, main terminal ID]
-  #
-  # where a sense is represented as a hash:
-  # "sense": sense, a string
-  # "obj": FrameNode object
-  # "all_targets": list of node IDs, may comprise more than a single node
-  # "lex": lemma, or multiword expression in canonical form
-  # "sid": sentence ID
-  def determine_targets(sent)
-    raise "overwrite me"
-  end
-
-  ##
-  # returns a list of lemma-pos combined strings
-  def get_lemmas()
-    return @targets.keys()
-  end
-
-  ##
-  # access to lemmas and POS, returns a list of pairs [lemma, pos] (string*string)
-  def get_lemma_pos()
-
-    return @targets.keys().map { |lemmapos| fred_lemmapos_separate(lemmapos) }
-  end
-
-  ##
-  # access to senses
-  def get_senses(lemmapos) # string, result of fred_lemmapos_combine
-    @targets[lemmapos] ? @targets[lemmapos] : []
-  end
-
-  ##
-  # write file
-  def done_reading_targets()
-    begin
-      file = FileZipped.new(@dir + "targets.txt.gz", "w")
-    rescue
-      $stderr.puts "Error: Could not write file #{@dir}targets.txt.gz"
-      exit 1
-    end
-
-    @targets.each_pair { |lemma, senses|
-      file.puts "LEMMA #{lemma} SENSES "+ senses.join(" ")
-    }
-
-    file.close
-  end
-
-  ###############################
-  protected
-
-  ##
-  # record: record occurrence of a lemma/sense pair
-  # <@targets> data structure
-  def record(target_info)
-    lemmapos = fred_lemmapos_combine(target_info["lex"], target_info["pos"])
-    unless @targets[lemmapos]
-      @targets[lemmapos] = []
-    end
-
-    unless @targets[lemmapos].include? target_info["sense"]
-      @targets[lemmapos] << target_info["sense"]
-    end
-  end
-end
-
-########################################
-class FindTargetsFromFrames < Targets
-  ###
-  # determine_targets:
-  # use existing frames to find targets
-  #
-  # returns:
-  # hash: target_IDs -> list of senses
-  # where target_IDs is a pair [list of terminal IDs, main terminal ID]
-  #
-  # where a sense is represented as a hash:
-  # "sense": sense, a string
-  # "obj": FrameNode object
-  # "all_targets": list of node IDs, may comprise more than a single node
-  # "lex": lemma, or multiword expression in canonical form
-  # "sid": sentence ID
-  def determine_targets(st_sent) #SalsaTigerSentence object
-    retv = Hash.new()
-    st_sent.each_frame { |frame_obj|
-      # instance-specific computation:
-      # target and target positions
-      # WARNING: at this moment, we are
-      # not considering true multiword targets for German.
-      # Remove the "no_mwe" parameter in main_node_of_expr
-      # to change this
-      term = nil
-      all_targets = nil
-      if frame_obj.target.nil? or frame_obj.target.children.empty?
-        # no target, nothing to record
-
-      elsif @exp.get("language") == "de"
-        # don't consider true multiword targets for German
-        all_targets = frame_obj.target.children()
-        term = @interpreter_class.main_node_of_expr(all_targets, "no_mwe")
-
-      else
-        # for all other languages: try to figure out the head target word
-        # anyway
-        all_targets = frame_obj.target.children()
-        term = @interpreter_class.main_node_of_expr(all_targets)
-      end
-
-      if term and term.is_splitword?
-        # don't use parts of a word as main node
-        term = term.parent()
-      end
-      if term and term.is_terminal?
-        key = [all_targets.map { |t| t.id() }, term.id()]
-
-        unless retv[key]
-          retv[key] = Array.new()
-        end
-
-        pos = frame_obj.target().get_attribute("pos")
-        # gold POS available, may be in wrong form,
-        # i.e. not the same strings that @interpreter_class.category()
-        # would return
-        case pos
-        when /^[Vv]$/
-          pos = "verb"
-        when /^[Nn]$/
-          pos = "noun"
-        when /^[Aa]$/
-          pos = "adj"
-        when nil
-          pos = @interpreter_class.category(term)
-        end
-
-        target_info = {
-          "sense" => frame_obj.name(),
-          "obj" => frame_obj,
-          "all_targets" => frame_obj.target.children().map { |ch| ch.id() },
-          "lex" => frame_obj.target().get_attribute("lemma"),
-          "pos" => pos,
-          "sid" => st_sent.id()
-        }
-        #print "lex ", frame_obj.target(), " und ",frame_obj.target().get_attribute("lemma"), "\n"
-        retv[key] << target_info
-        if @record_targets
-          record(target_info)
-        end
-      end
-    }
-    return retv
-  end
-end
-
-########################################
-class FindAllTargets < Targets
-  ###
-  # determine_targets:
-  # use all known lemmas, minus stopwords
-  def initialize(exp,
-                 interpreter_class)
-    # read target info from file
-    super(exp, interpreter_class, "r")
-    @training_lemmapos_pairs = get_lemma_pos()
-
-    get_senses(@training_lemmapos_pairs)
-    # list of words to exclude from assignment, for now
-    @stoplemmas = [
-      "have",
-      "do",
-      "be"
-      # "make"
-    ]
-
-  end
-
-  ####
-  #
-  # returns:
-  # hash: target_IDs -> list of senses
-  # where target_IDs is a pair [list of terminal IDs, main terminal ID]
-  #
-  # where a sense is represented as a hash:
-  # "sense": sense, a string
-  # "obj": FrameNode object
-  # "all_targets": list of node IDs, may comprise more than a single node
-  # "lex": lemma, or multiword expression in canonical form
-  # "sid": sentence ID
-  def determine_targets(sent) #SalsaTigerSentence object
-    # map target IDs to list of senses, in our case always [ nil ]
-    # because we assume that the senses of the targets we point out
-    # are unknown
-    retv = Hash.new()
-    # iterate through terminals of the sentence, check for inclusion
-    # of their lemma in @training_lemmas
-    sent.each_terminal { |node|
-      # we know this lemma from the training data,
-      # and it is not an auxiliary,
-      # and it is not in the stopword list
-      # and the node does not represent a preposition
-
-      ### modified by ines, 17.10.2008
-      lemma = @interpreter_class.lemma_backoff(node)
-      pos = @interpreter_class.category(node)
-
-      # print "lemma ", lemma, " pos ", pos, "\n"
-      # reg = /\.[ANV]/
-      # if !reg.match(lemma)
-      # if /verb/.match(pos)
-      # lemma = lemma + ".V"
-      # elsif /noun/.match(pos)
-      # lemma = lemma + ".N"
-      # elsif /adj/.match(pos)
-      # lemma = lemma + ".A"
-      # end
-      # print "LEMMA ", lemma, " POS ", pos, "\n"
-      # end
-
-      if (@training_lemmapos_pairs.include? [lemma, pos] and
-          not(@interpreter_class.auxiliary?(node)) and
-          not(@stoplemmas.include? lemma) and
-          not(pos == "prep"))
-        key = [ [ node.id() ], node.id() ]
-
-        # take this as a target.
-        retv[ key ] = [
-          {
-            "sense" => nil,
-            "obj" => nil,
-            "all_targets" => [ node.id() ],
-            "lex" => lemma,
-            "pos" => pos,
-            "sid" => sent.id()
-          } ]
-        # no recording of target info,
-        # since we haven't determined anything new
-      end
-    }
-
-    return retv
-  end
-end
-