frprep 0.0.1.prealpha

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (138) hide show
  1. data/.yardopts +8 -0
  2. data/CHANGELOG.rdoc +0 -0
  3. data/LICENSE.rdoc +0 -0
  4. data/README.rdoc +0 -0
  5. data/lib/common/AbstractSynInterface.rb +1227 -0
  6. data/lib/common/BerkeleyInterface.rb +375 -0
  7. data/lib/common/CollinsInterface.rb +1165 -0
  8. data/lib/common/ConfigData.rb +694 -0
  9. data/lib/common/Counter.rb +18 -0
  10. data/lib/common/DBInterface.rb +48 -0
  11. data/lib/common/EnduserMode.rb +27 -0
  12. data/lib/common/Eval.rb +480 -0
  13. data/lib/common/FixSynSemMapping.rb +196 -0
  14. data/lib/common/FrPrepConfigData.rb +66 -0
  15. data/lib/common/FrprepHelper.rb +1324 -0
  16. data/lib/common/Graph.rb +345 -0
  17. data/lib/common/ISO-8859-1.rb +24 -0
  18. data/lib/common/ML.rb +186 -0
  19. data/lib/common/Maxent.rb +215 -0
  20. data/lib/common/MiniparInterface.rb +1388 -0
  21. data/lib/common/Optimise.rb +195 -0
  22. data/lib/common/Parser.rb +213 -0
  23. data/lib/common/RegXML.rb +269 -0
  24. data/lib/common/RosyConventions.rb +171 -0
  25. data/lib/common/SQLQuery.rb +243 -0
  26. data/lib/common/STXmlTerminalOrder.rb +194 -0
  27. data/lib/common/SalsaTigerRegXML.rb +2347 -0
  28. data/lib/common/SalsaTigerXMLHelper.rb +99 -0
  29. data/lib/common/SleepyInterface.rb +384 -0
  30. data/lib/common/SynInterfaces.rb +275 -0
  31. data/lib/common/TabFormat.rb +720 -0
  32. data/lib/common/Tiger.rb +1448 -0
  33. data/lib/common/TntInterface.rb +44 -0
  34. data/lib/common/Tree.rb +61 -0
  35. data/lib/common/TreetaggerInterface.rb +303 -0
  36. data/lib/common/headz.rb +338 -0
  37. data/lib/common/option_parser.rb +13 -0
  38. data/lib/common/ruby_class_extensions.rb +310 -0
  39. data/lib/fred/Baseline.rb +150 -0
  40. data/lib/fred/FileZipped.rb +31 -0
  41. data/lib/fred/FredBOWContext.rb +863 -0
  42. data/lib/fred/FredConfigData.rb +182 -0
  43. data/lib/fred/FredConventions.rb +232 -0
  44. data/lib/fred/FredDetermineTargets.rb +324 -0
  45. data/lib/fred/FredEval.rb +312 -0
  46. data/lib/fred/FredFeatureExtractors.rb +321 -0
  47. data/lib/fred/FredFeatures.rb +1061 -0
  48. data/lib/fred/FredFeaturize.rb +596 -0
  49. data/lib/fred/FredNumTrainingSenses.rb +27 -0
  50. data/lib/fred/FredParameters.rb +402 -0
  51. data/lib/fred/FredSplit.rb +84 -0
  52. data/lib/fred/FredSplitPkg.rb +180 -0
  53. data/lib/fred/FredTest.rb +607 -0
  54. data/lib/fred/FredTrain.rb +144 -0
  55. data/lib/fred/PlotAndREval.rb +480 -0
  56. data/lib/fred/fred.rb +45 -0
  57. data/lib/fred/md5.rb +23 -0
  58. data/lib/fred/opt_parser.rb +250 -0
  59. data/lib/frprep/AbstractSynInterface.rb +1227 -0
  60. data/lib/frprep/Ampersand.rb +37 -0
  61. data/lib/frprep/BerkeleyInterface.rb +375 -0
  62. data/lib/frprep/CollinsInterface.rb +1165 -0
  63. data/lib/frprep/ConfigData.rb +694 -0
  64. data/lib/frprep/Counter.rb +18 -0
  65. data/lib/frprep/FNCorpusXML.rb +643 -0
  66. data/lib/frprep/FNDatabase.rb +144 -0
  67. data/lib/frprep/FixSynSemMapping.rb +196 -0
  68. data/lib/frprep/FrPrepConfigData.rb +66 -0
  69. data/lib/frprep/FrameXML.rb +513 -0
  70. data/lib/frprep/FrprepHelper.rb +1324 -0
  71. data/lib/frprep/Graph.rb +345 -0
  72. data/lib/frprep/ISO-8859-1.rb +24 -0
  73. data/lib/frprep/MiniparInterface.rb +1388 -0
  74. data/lib/frprep/Parser.rb +213 -0
  75. data/lib/frprep/RegXML.rb +269 -0
  76. data/lib/frprep/STXmlTerminalOrder.rb +194 -0
  77. data/lib/frprep/SalsaTigerRegXML.rb +2347 -0
  78. data/lib/frprep/SalsaTigerXMLHelper.rb +99 -0
  79. data/lib/frprep/SleepyInterface.rb +384 -0
  80. data/lib/frprep/SynInterfaces.rb +275 -0
  81. data/lib/frprep/TabFormat.rb +720 -0
  82. data/lib/frprep/Tiger.rb +1448 -0
  83. data/lib/frprep/TntInterface.rb +44 -0
  84. data/lib/frprep/Tree.rb +61 -0
  85. data/lib/frprep/TreetaggerInterface.rb +303 -0
  86. data/lib/frprep/do_parses.rb +142 -0
  87. data/lib/frprep/frprep.rb +686 -0
  88. data/lib/frprep/headz.rb +338 -0
  89. data/lib/frprep/one_parsed_file.rb +28 -0
  90. data/lib/frprep/opt_parser.rb +94 -0
  91. data/lib/frprep/ruby_class_extensions.rb +310 -0
  92. data/lib/rosy/AbstractFeatureAndExternal.rb +240 -0
  93. data/lib/rosy/DBMySQL.rb +146 -0
  94. data/lib/rosy/DBSQLite.rb +280 -0
  95. data/lib/rosy/DBTable.rb +239 -0
  96. data/lib/rosy/DBWrapper.rb +176 -0
  97. data/lib/rosy/ExternalConfigData.rb +58 -0
  98. data/lib/rosy/FailedParses.rb +130 -0
  99. data/lib/rosy/FeatureInfo.rb +242 -0
  100. data/lib/rosy/GfInduce.rb +1115 -0
  101. data/lib/rosy/GfInduceFeature.rb +148 -0
  102. data/lib/rosy/InputData.rb +294 -0
  103. data/lib/rosy/RosyConfigData.rb +115 -0
  104. data/lib/rosy/RosyConfusability.rb +338 -0
  105. data/lib/rosy/RosyEval.rb +465 -0
  106. data/lib/rosy/RosyFeatureExtractors.rb +1609 -0
  107. data/lib/rosy/RosyFeaturize.rb +280 -0
  108. data/lib/rosy/RosyInspect.rb +336 -0
  109. data/lib/rosy/RosyIterator.rb +477 -0
  110. data/lib/rosy/RosyPhase2FeatureExtractors.rb +230 -0
  111. data/lib/rosy/RosyPruning.rb +165 -0
  112. data/lib/rosy/RosyServices.rb +744 -0
  113. data/lib/rosy/RosySplit.rb +232 -0
  114. data/lib/rosy/RosyTask.rb +19 -0
  115. data/lib/rosy/RosyTest.rb +826 -0
  116. data/lib/rosy/RosyTrain.rb +232 -0
  117. data/lib/rosy/RosyTrainingTestTable.rb +786 -0
  118. data/lib/rosy/TargetsMostFrequentFrame.rb +60 -0
  119. data/lib/rosy/View.rb +418 -0
  120. data/lib/rosy/opt_parser.rb +379 -0
  121. data/lib/rosy/rosy.rb +77 -0
  122. data/lib/shalmaneser/version.rb +3 -0
  123. data/test/frprep/test_opt_parser.rb +94 -0
  124. data/test/functional/functional_test_helper.rb +40 -0
  125. data/test/functional/sample_experiment_files/fred_test.salsa.erb +122 -0
  126. data/test/functional/sample_experiment_files/fred_train.salsa.erb +135 -0
  127. data/test/functional/sample_experiment_files/prp_test.salsa.erb +138 -0
  128. data/test/functional/sample_experiment_files/prp_test.salsa.fred.standalone.erb +120 -0
  129. data/test/functional/sample_experiment_files/prp_test.salsa.rosy.standalone.erb +120 -0
  130. data/test/functional/sample_experiment_files/prp_train.salsa.erb +138 -0
  131. data/test/functional/sample_experiment_files/prp_train.salsa.fred.standalone.erb +138 -0
  132. data/test/functional/sample_experiment_files/prp_train.salsa.rosy.standalone.erb +138 -0
  133. data/test/functional/sample_experiment_files/rosy_test.salsa.erb +257 -0
  134. data/test/functional/sample_experiment_files/rosy_train.salsa.erb +259 -0
  135. data/test/functional/test_fred.rb +47 -0
  136. data/test/functional/test_frprep.rb +52 -0
  137. data/test/functional/test_rosy.rb +20 -0
  138. metadata +270 -0
@@ -0,0 +1,182 @@
1
+ # FredConfigData
2
+ # Katrin Erk April 05
3
+ #
4
+ # Frame disambiguation system:
5
+ # access to a configuration and experiment description file
6
+
7
+ require "common/ConfigData"
8
+
9
+ ##############################
10
+ # Class FredConfigData
11
+ #
12
+ # inherits from ConfigData,
13
+ # sets variable names appropriate to WSD task
14
+
15
# Access to the configuration / experiment description file of the
# Fred frame disambiguation system.
#
# Inherits from ConfigData. The constructor hands ConfigData the table of
# features that may occur in a Fred experiment file (feature name => type
# name) plus a list of variable names, and registers custom access
# functions for the two list-valued features 'classifier' and 'feature'.
class FredConfigData < ConfigData
  # filename: path of the experiment file; passed on unchanged to
  #           ConfigData#initialize
  def initialize(filename)

    # initialize config data object
    super(filename,          # config file
          {
            "experiment_ID" => "string", # experiment ID
            "enduser_mode" => "bool", # work in enduser mode? (disallowing many things)

            "preproc_descr_file_train" => "string", # path to preprocessing files
            "preproc_descr_file_test" => "string",
            "directory_output" => "string", # path to Salsa/Tiger XML output directory

            "verbose" => "bool", # print diagnostic messages?
            "apply_to_all_known_targets" => "bool", # apply to all known targets rather than the ones with a frame?

            "fred_directory" => "string", # directory for internal info
            "classifier_dir" => "string", # write classifiers here

            "classifier" => "list", # classifiers

            "dbtype" => "string", # "mysql" or "sqlite"

            # DB access parameters.
            # NOTE(review): the original comment said "sqlite only", but
            # host/user/passwd look like server (MySQL) settings -- confirm.
            "host" => "string",
            "user" => "string",
            "passwd" => "string",
            "dbname" => "string",

            # featurization info
            "feature" => "list", # which features to use for the classifier?
            "binary_classifiers" => "bool", # make binary rather than n-ary classifiers?
            "negsense" => "string", # binary classifier: negative sense is..?
            "numerical_features" => "string", # do what with numerical features?

            # what to do with items that have multiple senses?
            # 'binarize': binary classifiers, and consider positive
            #             if the sense is among the gold senses
            # 'join'    : make one joint sense
            # 'repeat'  : make multiple occurrences of the item, one sense per occ
            # 'keep'    : keep as separate labels
            #
            # multilabel: consider as assigned all labels
            # above a certain confidence threshold?
            "handle_multilabel" => "string",
            "assignment_confidence_threshold" => "float",

            # single-sentence context?
            "single_sent_context" => "bool",

            # noncontiguous input? then we need access to a larger corpus
            "noncontiguous_input" => "bool",
            "larger_corpus_dir" => "string",
            "larger_corpus_format" => "string",
            "larger_corpus_encoding" => "string"
          },
          [ # variables
            "train",
            "exp_ID"
          ]
          )

    # set access functions for list features
    set_list_feature_access("classifier",
                            method("access_classifier"))
    set_list_feature_access("feature",
                            method("access_feature"))
  end

  ###
  # protected

  #####
  # access_feature
  #
  # access function for feature 'feature'
  #
  # assumed format:
  #
  #   feature = context 50
  #   feature = context 2
  #   feature = syn
  #
  # i.e. first the name of the feature type to use, then
  # optionally a parameter,
  # and the same feature can occur more than once (which makes sense
  # only in case of parameters)
  #
  # val_list: array:array:string: list of tuples defined in the config file
  #           for feature 'feature'. nil is treated like an empty list,
  #           matching what access_classifier does.
  # feature:  string: feature type name, or nil
  #
  # returns:
  # - If a feature is given as a parameter,
  #   - If the feature is not set in the experiment file, nil
  #   - If the feature is set and has a parameter, the list of
  #     parameter values set for it, converted with to_i
  #     (a tuple without parameter contributes 0, since nil.to_i is 0)
  #   - If the feature is set and has no parameter, true
  # - If no feature is given as parameter:
  #   a list of all features that have been set in the experiment file.
  #   Each feature is given as a tuple: the first element is the feature (a string),
  #   all further elements are options (integers)
  def access_feature(val_list, # array:array:string: list of tuples defined in config file
                     # for feature 'feature'
                     feature = nil) # string: feature type name

    # nothing configured at all: behave as for an empty list instead of
    # failing with NoMethodError on nil
    val_list = [] if val_list.nil?

    if feature
      # access options for this feature

      # get the right tuples, keep only their first parameter
      positives = val_list.select { |entries|
        entries.first() == feature
      }.map { |entries|
        entries[1]
      }

      if positives.empty?
        # feature not defined
        return nil

      elsif positives.compact().empty?
        # feature defined, but no parameters
        return true

      else
        # feature defined, and has values
        return positives.map { |par| par.to_i() }
      end

    else
      # return all features that have been set
      return val_list.map { |feature_name, *options|
        [feature_name] + options.map { |o| o.to_i() }
      }
    end
  end

  #####
  # access_classifier
  #
  # access function for feature 'classifier'
  #
  # The access function is called with parameter val_list, an array of
  # string tuples, one tuple per 'classifier' entry in the config file.
  # The first string of a tuple is taken as the name, the rest as options.
  # (The original comment described the entry format as
  # "feature = path [option]*"; presumably copied from a similar access
  # function -- the exact config syntax is not visible here.)
  #
  # returns: a list of pairs [name(string), options(array:string)],
  #          or an empty list if val_list is nil
  def access_classifier(val_list) # array:array:string: list of tuples defined in config file
    if val_list.nil?
      return []
    else
      return val_list.map { |cl_descr_tuple|
        [cl_descr_tuple.first, cl_descr_tuple[1..-1]]
      }
    end
  end

end
180
+
181
+
182
+
@@ -0,0 +1,232 @@
1
+ # FredConventions
2
+ # Katrin Erk June 05
3
+ #
4
+ # several small things that should be uniform
5
+ # throughout the system
6
+
7
+ require "common/ruby_class_extensions"
8
+
9
+ require "common/EnduserMode"
10
# Fred naming conventions, added to Object so that they can be called
# from anywhere: sense joining, directory names, and construction /
# deconstruction of the file names used by the system.
class Object

  ###
  # joining and breaking up senses
  #
  # fred_join_senses: sort the given senses and join them with "++"
  def fred_join_senses(senses)
    return senses.sort().join("++")
  end

  # fred_split_sense: inverse of fred_join_senses, returns a list of strings
  def fred_split_sense(joined_senses)
    return joined_senses.split("++")
  end

  ###
  # fred_dirname
  #
  # constructs a directory name:
  # fred data directory / experiment ID / maindir / subdir
  #
  # if is_existing == "existing", the name is built via File.existing_dir,
  # if is_existing == "new", via File.new_dir
  # (both are defined outside this file; according to the original notes
  # the former checks for existence and the latter creates the directory
  # if necessary)
  #
  # raises a RuntimeError for any other value of is_existing
  #
  # returns: a string
  def fred_dirname(exp,       # FredConfigData object
                   maindir,   # string: main part of directory name
                   subdir,    # string: subpart of directory name
                   is_existing = "existing") # string: "existing" or "new", default: existing

    case is_existing
    when "existing"
      return File.existing_dir(exp.get("fred_directory"),
                               exp.get("experiment_ID"),
                               maindir,
                               subdir)
    when "new"
      return File.new_dir(exp.get("fred_directory"),
                          exp.get("experiment_ID"),
                          maindir,
                          subdir)
    else
      raise "Shouldn't be here: #{is_existing}"
    end
  end

  ####
  # filenames for feature files
  #
  # with do_binary, the sense becomes part of the name
  def fred_feature_filename(lemma, sense = nil,
                            do_binary = false)
    if do_binary
      return "fred.features.#{lemma}.SENSE.#{sense}"
    else
      return "fred.features.#{lemma}"
    end
  end

  ####
  # filenames for split files
  def fred_split_filename(lemma)
    return "fred.split.#{lemma}"
  end

  ###
  # deconstruct split filename (any directory part is ignored)
  # returns: lemma, or nil if the name is not a split filename
  def deconstruct_fred_split_filename(filename)
    basename = File.basename(filename)
    if basename =~ /^fred\.split\.(.*)/
      return $1
    else
      return nil
    end
  end

  ###
  # deconstruct feature file name (any directory part is ignored)
  # returns: hash with keys
  #   "lemma"
  #   "sense" (only for binary feature files)
  # or nil if the name is not a feature filename
  def deconstruct_fred_feature_filename(filename)

    basename = File.basename(filename)
    retv = Hash.new()
    # binary:
    # fred.features.#{lemma}.SENSE.#{sense}
    if basename =~ /^fred\.features\.(.*)\.SENSE\.(.*)$/
      retv["lemma"] = $1
      retv["sense"] = $2
    elsif basename =~ /^fred\.features\.(.*)/
      # fred.features.#{lemma}
      retv["lemma"] = $1

    else
      # complete mismatch
      return nil
    end

    return retv
  end

  ####
  # filename for answer key files
  def fred_answerkey_filename(lemma)
    return "fred.answerkey.#{lemma}"
  end

  ###
  # classifier directory
  #
  # uses the user-specified "classifier_dir" if the experiment file sets
  # one, else a "classifiers" directory below the Fred data directory.
  # With a split ID, a subdirectory named after the split is used.
  def fred_classifier_directory(exp,     # FredConfigData object
                                splitID = nil) # string or nil

    if exp.get("classifier_dir")
      # user-specified classifier directory

      if splitID
        return File.new_dir(exp.get("classifier_dir"), splitID)
      else
        return File.new_dir(exp.get("classifier_dir"))
      end

    else
      # my classifier directory
      if splitID
        return fred_dirname(exp, "classifiers", splitID, "new")
      else
        return fred_dirname(exp, "classifiers", "all", "new")
      end
    end
  end

  ###
  # classifier file
  def fred_classifier_filename(classifier, lemma, sense=nil)
    if sense
      return "fred.classif.#{classifier}.LEMMA.#{lemma}.SENSE.#{sense}"
    else
      return "fred.classif.#{classifier}.LEMMA.#{lemma}"
    end
  end

  ###
  # deconstruct classifier file name
  #
  # Like the other deconstruct functions, this looks only at the base
  # name, so that a full path to a classifier file can be passed in.
  #
  # returns: hash with keys
  #   "lemma"
  #   "sense" (only if the name has a SENSE part)
  # The hash is empty if the name is not a classifier filename.
  def deconstruct_fred_classifier_filename(filename)
    # to_s: a nil filename still yields an empty hash
    basename = File.basename(filename.to_s)
    retv = Hash.new()
    if basename =~ /^fred\.classif\.(.*)\.LEMMA\.(.*)\.SENSE\.(.*)$/
      retv["lemma"] = $2
      retv["sense"] = $3
    elsif basename =~ /^fred\.classif\.(.*)\.LEMMA\.(.*)$/
      retv["lemma"] = $2
    end
    return retv
  end

  ###
  # result file
  # (dots in the lemma are replaced by underscores)
  def fred_result_filename(lemma)
    return "fred.result.#{lemma.gsub(/\./, "_")}"
  end

  ##########
  # lemma and POS: combine into string separated by
  # a separator character (".")
  #
  # fred_lemmapos_combine: take two strings, return combined string
  #   if POS is nil, returns lemma<separator character>
  #   dots inside the POS are written as "DOT"
  # fred_lemmapos_separate: take one string, return two strings
  #   the string is cut at its last separator character;
  #   if there is no separator, returns nil as POS and the whole string as lemma;
  #   if nothing follows the last separator (as produced by
  #   fred_lemmapos_combine for a nil POS), returns nil as POS and the
  #   part before the separator as lemma
  def fred_lemmapos_combine(lemma, # string
                            pos)   # string
    return lemma.to_s + "." + pos.to_s.gsub(/\./, "DOT")
  end

  ###
  def fred_lemmapos_separate(lemmapos) # string
    # rpartition keeps an empty trailing field, which split(".") would
    # silently drop: "walk." must come back as lemma "walk", not "walk."
    lemma, separator, pos = lemmapos.rpartition(".")
    if separator.empty?
      # no POS found, treat all of lemmapos as lemma
      return [ lemmapos, nil ]
    elsif pos.empty?
      # separator present, but the POS was nil when combining
      return [ lemma, nil ]
    else
      return [ lemma, pos ]
    end
  end
end
189
+
190
+ ########################################
191
+ # given a SynNode object representing a terminal,
192
+ # return:
193
+ # - the word
194
+ # - the lemma
195
+ # - the part of speech
196
+ # - the named entity (if any)
197
+ #
198
+ # as a tuple
199
+ #
200
+ # WARNING: word and lemma are turned to lowercase
201
# Mixin with one helper that reads word, lemma, part of speech and named
# entity off a terminal node.
module WordLemmaPosNe
  # syn_obj: SynNode object, expected to be a terminal
  # i:       SynInterpreter class; only lemma_backoff() is called on it
  #
  # returns: a tuple [word, lemma, pos, ne]
  # - word and lemma are lowercased copies; the strings handed out by
  #   syn_obj and i are left untouched
  # - lemma is nil if its unescaped form is "<unknown>"
  # - ne is the node attribute "ne", or else "headof_ne"
  # For a non-terminal, a warning is written to $stderr and
  # [nil, nil, nil, nil] is returned.
  def word_lemma_pos_ne(syn_obj, # SynNode object
                        i)       # SynInterpreter class
    unless syn_obj.is_terminal?
      $stderr.puts "Featurization warning: unexpectedly received non-terminal"
      return [ nil, nil, nil, nil ]
    end

    word = syn_obj.word()
    # downcase, not downcase!: do not modify the string owned by the node
    # (and do not fail on frozen strings)
    word = word.downcase if word

    lemma = i.lemma_backoff(syn_obj)
    if lemma and SalsaTigerXMLHelper.unescape(lemma) == "<unknown>"
      lemma = nil
    end
    lemma = lemma.downcase if lemma

    pos = syn_obj.part_of_speech()

    ne = syn_obj.get_attribute("ne")
    unless ne
      ne = syn_obj.get_attribute("headof_ne")
    end

    return [word, lemma, pos, ne]
  end
end
232
+
@@ -0,0 +1,324 @@
1
+ require "fred/FileZipped"
2
+
3
+ require "fred/FredConfigData"
4
+ require "common/SynInterfaces"
5
+ require "fred/FredConventions"
6
+
7
+
8
+ ########################################
9
+ # target determination classes:
10
+ # either determine targets from existing annotation
11
+ # with frames,
12
+ # or use all known targets.
13
+ class Targets
14
+ attr_reader :targets_okay
15
+
16
+ ###
17
+ def initialize(exp, # experiment file object
18
+ interpreter_class, # SynInterpreter class, or nil
19
+ mode) # string: "r", "w", "a", as in files
20
+ @exp = exp
21
+ @interpreter_class = interpreter_class
22
+
23
+ # keep recorded targets here.
24
+ # try to read old list now.
25
+ @targets = Hash.new()
26
+
27
+ # write target info in the classifier directory.
28
+ # This is _not_ dependent on a potential split ID
29
+ @dir = File.new_dir(fred_classifier_directory(@exp), "targets")
30
+
31
+ @targets_okay = true
32
+ case mode
33
+ when "w"
34
+ # start from scratch, no list of targets
35
+ when "a", "r"
36
+ # read existing file containing targets
37
+ begin
38
+ file = FileZipped.new(@dir + "targets.txt.gz")
39
+ rescue
40
+ # no pickle present: signal this
41
+ @targets_okay = false
42
+ return
43
+ end
44
+ file.each { |line|
45
+ line.chomp!
46
+ if line =~ /^LEMMA (.+) SENSES (.+)$/
47
+ lemmapos = $1
48
+ senses = $2.split()
49
+ lemmapos.gsub!(/ /, '_')
50
+ #lemmapos.gsub!(/\.[A-Z]\./, '.')
51
+ @targets[lemmapos] = senses
52
+ end
53
+ }
54
+
55
+ else
56
+ $stderr.puts "Error: shouldn't be here."
57
+ exit 1
58
+ end
59
+
60
+ if ["w", "a"].include? mode
61
+ @record_targets = true
62
+ else
63
+ @record_targets = false
64
+ end
65
+ end
66
+
67
+ ###
68
+ # determine_targets:
69
+ # for a given SalsaTigerSentence,
70
+ # determine all targets,
71
+ # each as a _single_ main terminal node
72
+ #
73
+ # We need a single terminal node in order
74
+ # to compute the context window
75
+ #
76
+ # returns:
77
+ # hash: target_IDs -> list of senses
78
+ # where target_IDs is a pair [list of terminal IDs, main terminal ID]
79
+ #
80
+ # where a sense is represented as a hash:
81
+ # "sense": sense, a string
82
+ # "obj": FrameNode object
83
+ # "all_targets": list of node IDs, may comprise more than a single node
84
+ # "lex": lemma, or multiword expression in canonical form
85
+ # "sid": sentence ID
86
+ def determine_targets(sent)
87
+ raise "overwrite me"
88
+ end
89
+
90
+ ##
91
+ # returns a list of lemma-pos combined strings
92
+ def get_lemmas()
93
+ return @targets.keys()
94
+ end
95
+
96
+ ##
97
+ # access to lemmas and POS, returns a list of pairs [lemma, pos] (string*string)
98
+ def get_lemma_pos()
99
+
100
+ return @targets.keys().map { |lemmapos| fred_lemmapos_separate(lemmapos) }
101
+ end
102
+
103
+ ##
104
+ # access to senses
105
+ def get_senses(lemmapos) # string, result of fred_lemmapos_combine
106
+
107
+ if @targets[lemmapos]
108
+ return @targets[lemmapos]
109
+ else
110
+ return []
111
+ end
112
+ end
113
+
114
+ ##
115
+ # write file
116
+ def done_reading_targets()
117
+ begin
118
+ file = FileZipped.new(@dir + "targets.txt.gz", "w")
119
+ rescue
120
+ $stderr.puts "Error: Could not write file #{@dir}targets.txt.gz"
121
+ exit 1
122
+ end
123
+
124
+ @targets.each_pair { |lemma, senses|
125
+ file.puts "LEMMA #{lemma} SENSES "+ senses.join(" ")
126
+ }
127
+
128
+ file.close()
129
+ end
130
+
131
+ ###############################
132
+ protected
133
+
134
+ ##
135
+ # record: record occurrence of a lemma/sense pair
136
+ # @targets data structure
137
+ def record(target_info)
138
+ lemmapos = fred_lemmapos_combine(target_info["lex"], target_info["pos"])
139
+ unless @targets[lemmapos]
140
+ @targets[lemmapos] = Array.new
141
+ end
142
+
143
+ unless @targets[lemmapos].include? target_info["sense"]
144
+ @targets[lemmapos] << target_info["sense"]
145
+ end
146
+ end
147
+ end
148
+
149
+ ########################################
150
# Target determination from existing annotation: every frame of a
# sentence that has a target contributes one entry.
class FindTargetsFromFrames < Targets
  ###
  # determine_targets:
  # use existing frames to find targets
  #
  # st_sent: SalsaTigerSentence object
  #
  # returns:
  # hash: target_IDs -> list of senses
  #   where target_IDs is a pair [list of terminal IDs, main terminal ID]
  #
  #   where a sense is represented as a hash:
  #   "sense": sense, a string (the frame name)
  #   "obj":   FrameNode object
  #   "all_targets": list of node IDs, may comprise more than a single node
  #   "lex":   lemma, or multiword expression in canonical form
  #   "pos":   part of speech
  #   "sid": sentence ID
  #
  # If recording is switched on, each sense is also passed to record().
  def determine_targets(st_sent) #SalsaTigerSentence object
    found = Hash.new()

    st_sent.each_frame do |frame|
      # no target, nothing to record
      next if frame.target.nil? || frame.target.children.empty?

      # instance-specific computation: target and target positions.
      # WARNING: at this moment, we are not considering true multiword
      # targets for German. Remove the "no_mwe" parameter in
      # main_node_of_expr to change this.
      # For all other languages: try to figure out the head target word
      # anyway.
      german = (@exp.get("language") == "de")
      target_nodes = frame.target.children()
      main_node = if german
                    @interpreter_class.main_node_of_expr(target_nodes, "no_mwe")
                  else
                    @interpreter_class.main_node_of_expr(target_nodes)
                  end

      # don't use parts of a word as main node
      main_node = main_node.parent() if main_node && main_node.is_splitword?

      # only a terminal can serve as main node
      next unless main_node && main_node.is_terminal?

      key = [target_nodes.map { |node| node.id() }, main_node.id()]
      found[key] ||= Array.new()

      # gold POS available, may be in wrong form,
      # i.e. not the same strings that @interpreter_class.category()
      # would return
      gold_pos = frame.target().get_attribute("pos")
      pos = case gold_pos
            when /^[Vv]$/ then "verb"
            when /^[Nn]$/ then "noun"
            when /^[Aa]$/ then "adj"
            when nil then @interpreter_class.category(main_node)
            else gold_pos
            end

      sense_info = {
        "sense" => frame.name(),
        "obj" => frame,
        "all_targets" => frame.target.children().map { |child| child.id() },
        "lex" => frame.target().get_attribute("lemma"),
        "pos" => pos,
        "sid" => st_sent.id()
      }

      found[key] << sense_info
      record(sense_info) if @record_targets
    end

    return found
  end
end
235
+
236
+ ########################################
237
# Target determination without annotation: every terminal whose
# [lemma, pos] pair is in the stored target list counts as a target,
# minus auxiliaries, stop lemmas and prepositions.
class FindAllTargets < Targets
  ###
  # Reads the stored target list (mode "r" of Targets) and prepares the
  # lookup structures used by determine_targets.
  def initialize(exp,
                 interpreter_class)
    # read target info from file
    super(exp, interpreter_class, "r")
    @training_lemmapos_pairs = get_lemma_pos()

    # same pairs as hash keys: determine_targets tests membership once
    # per terminal, which a hash answers without scanning the list
    @known_lemmapos = Hash.new()
    @training_lemmapos_pairs.each { |pair| @known_lemmapos[pair] = true }

    # list of words to exclude from assignment, for now
    @stoplemmas = [
      "have",
      "do",
      "be"
      # "make"
    ]
  end

  ####
  # determine_targets:
  # use all known lemmas, minus stopwords
  #
  # returns:
  # hash: target_IDs -> list of senses
  #   where target_IDs is a pair [list of terminal IDs, main terminal ID]
  #
  #   where a sense is represented as a hash:
  #   "sense": always nil here
  #   "obj":   always nil here
  #   "all_targets": list of node IDs, here just the terminal's ID
  #   "lex":   lemma
  #   "pos":   part of speech
  #   "sid": sentence ID
  def determine_targets(sent) #SalsaTigerSentence object
    # map target IDs to list of senses, in our case always a single
    # entry with sense nil, because we assume that the senses of the
    # targets we point out are unknown
    retv = Hash.new()

    # iterate through terminals of the sentence, check for inclusion
    # of their [lemma, pos] pair in the stored target list
    sent.each_terminal { |node|
      ### modified by ines, 17.10.2008
      lemma = @interpreter_class.lemma_backoff(node)
      pos = @interpreter_class.category(node)

      # we know this lemma/pos from the training data,
      # and it is not an auxiliary,
      # and it is not in the stopword list
      # and the node does not represent a preposition
      if @known_lemmapos.key?([lemma, pos]) &&
         !@interpreter_class.auxiliary?(node) &&
         !@stoplemmas.include?(lemma) &&
         pos != "prep"
        key = [ [ node.id() ], node.id() ]

        # take this as a target.
        retv[ key ] = [
          {
            "sense" => nil,
            "obj" => nil,
            "all_targets" => [ node.id() ],
            "lex" => lemma,
            "pos" => pos,
            "sid" => sent.id()
          } ]
        # no recording of target info,
        # since we haven't determined anything new
      end
    }

    return retv
  end
end
324
+