frprep 0.0.1.prealpha

Sign up to get free protection for your applications and to get access to all the features.
Files changed (138) hide show
  1. data/.yardopts +8 -0
  2. data/CHANGELOG.rdoc +0 -0
  3. data/LICENSE.rdoc +0 -0
  4. data/README.rdoc +0 -0
  5. data/lib/common/AbstractSynInterface.rb +1227 -0
  6. data/lib/common/BerkeleyInterface.rb +375 -0
  7. data/lib/common/CollinsInterface.rb +1165 -0
  8. data/lib/common/ConfigData.rb +694 -0
  9. data/lib/common/Counter.rb +18 -0
  10. data/lib/common/DBInterface.rb +48 -0
  11. data/lib/common/EnduserMode.rb +27 -0
  12. data/lib/common/Eval.rb +480 -0
  13. data/lib/common/FixSynSemMapping.rb +196 -0
  14. data/lib/common/FrPrepConfigData.rb +66 -0
  15. data/lib/common/FrprepHelper.rb +1324 -0
  16. data/lib/common/Graph.rb +345 -0
  17. data/lib/common/ISO-8859-1.rb +24 -0
  18. data/lib/common/ML.rb +186 -0
  19. data/lib/common/Maxent.rb +215 -0
  20. data/lib/common/MiniparInterface.rb +1388 -0
  21. data/lib/common/Optimise.rb +195 -0
  22. data/lib/common/Parser.rb +213 -0
  23. data/lib/common/RegXML.rb +269 -0
  24. data/lib/common/RosyConventions.rb +171 -0
  25. data/lib/common/SQLQuery.rb +243 -0
  26. data/lib/common/STXmlTerminalOrder.rb +194 -0
  27. data/lib/common/SalsaTigerRegXML.rb +2347 -0
  28. data/lib/common/SalsaTigerXMLHelper.rb +99 -0
  29. data/lib/common/SleepyInterface.rb +384 -0
  30. data/lib/common/SynInterfaces.rb +275 -0
  31. data/lib/common/TabFormat.rb +720 -0
  32. data/lib/common/Tiger.rb +1448 -0
  33. data/lib/common/TntInterface.rb +44 -0
  34. data/lib/common/Tree.rb +61 -0
  35. data/lib/common/TreetaggerInterface.rb +303 -0
  36. data/lib/common/headz.rb +338 -0
  37. data/lib/common/option_parser.rb +13 -0
  38. data/lib/common/ruby_class_extensions.rb +310 -0
  39. data/lib/fred/Baseline.rb +150 -0
  40. data/lib/fred/FileZipped.rb +31 -0
  41. data/lib/fred/FredBOWContext.rb +863 -0
  42. data/lib/fred/FredConfigData.rb +182 -0
  43. data/lib/fred/FredConventions.rb +232 -0
  44. data/lib/fred/FredDetermineTargets.rb +324 -0
  45. data/lib/fred/FredEval.rb +312 -0
  46. data/lib/fred/FredFeatureExtractors.rb +321 -0
  47. data/lib/fred/FredFeatures.rb +1061 -0
  48. data/lib/fred/FredFeaturize.rb +596 -0
  49. data/lib/fred/FredNumTrainingSenses.rb +27 -0
  50. data/lib/fred/FredParameters.rb +402 -0
  51. data/lib/fred/FredSplit.rb +84 -0
  52. data/lib/fred/FredSplitPkg.rb +180 -0
  53. data/lib/fred/FredTest.rb +607 -0
  54. data/lib/fred/FredTrain.rb +144 -0
  55. data/lib/fred/PlotAndREval.rb +480 -0
  56. data/lib/fred/fred.rb +45 -0
  57. data/lib/fred/md5.rb +23 -0
  58. data/lib/fred/opt_parser.rb +250 -0
  59. data/lib/frprep/AbstractSynInterface.rb +1227 -0
  60. data/lib/frprep/Ampersand.rb +37 -0
  61. data/lib/frprep/BerkeleyInterface.rb +375 -0
  62. data/lib/frprep/CollinsInterface.rb +1165 -0
  63. data/lib/frprep/ConfigData.rb +694 -0
  64. data/lib/frprep/Counter.rb +18 -0
  65. data/lib/frprep/FNCorpusXML.rb +643 -0
  66. data/lib/frprep/FNDatabase.rb +144 -0
  67. data/lib/frprep/FixSynSemMapping.rb +196 -0
  68. data/lib/frprep/FrPrepConfigData.rb +66 -0
  69. data/lib/frprep/FrameXML.rb +513 -0
  70. data/lib/frprep/FrprepHelper.rb +1324 -0
  71. data/lib/frprep/Graph.rb +345 -0
  72. data/lib/frprep/ISO-8859-1.rb +24 -0
  73. data/lib/frprep/MiniparInterface.rb +1388 -0
  74. data/lib/frprep/Parser.rb +213 -0
  75. data/lib/frprep/RegXML.rb +269 -0
  76. data/lib/frprep/STXmlTerminalOrder.rb +194 -0
  77. data/lib/frprep/SalsaTigerRegXML.rb +2347 -0
  78. data/lib/frprep/SalsaTigerXMLHelper.rb +99 -0
  79. data/lib/frprep/SleepyInterface.rb +384 -0
  80. data/lib/frprep/SynInterfaces.rb +275 -0
  81. data/lib/frprep/TabFormat.rb +720 -0
  82. data/lib/frprep/Tiger.rb +1448 -0
  83. data/lib/frprep/TntInterface.rb +44 -0
  84. data/lib/frprep/Tree.rb +61 -0
  85. data/lib/frprep/TreetaggerInterface.rb +303 -0
  86. data/lib/frprep/do_parses.rb +142 -0
  87. data/lib/frprep/frprep.rb +686 -0
  88. data/lib/frprep/headz.rb +338 -0
  89. data/lib/frprep/one_parsed_file.rb +28 -0
  90. data/lib/frprep/opt_parser.rb +94 -0
  91. data/lib/frprep/ruby_class_extensions.rb +310 -0
  92. data/lib/rosy/AbstractFeatureAndExternal.rb +240 -0
  93. data/lib/rosy/DBMySQL.rb +146 -0
  94. data/lib/rosy/DBSQLite.rb +280 -0
  95. data/lib/rosy/DBTable.rb +239 -0
  96. data/lib/rosy/DBWrapper.rb +176 -0
  97. data/lib/rosy/ExternalConfigData.rb +58 -0
  98. data/lib/rosy/FailedParses.rb +130 -0
  99. data/lib/rosy/FeatureInfo.rb +242 -0
  100. data/lib/rosy/GfInduce.rb +1115 -0
  101. data/lib/rosy/GfInduceFeature.rb +148 -0
  102. data/lib/rosy/InputData.rb +294 -0
  103. data/lib/rosy/RosyConfigData.rb +115 -0
  104. data/lib/rosy/RosyConfusability.rb +338 -0
  105. data/lib/rosy/RosyEval.rb +465 -0
  106. data/lib/rosy/RosyFeatureExtractors.rb +1609 -0
  107. data/lib/rosy/RosyFeaturize.rb +280 -0
  108. data/lib/rosy/RosyInspect.rb +336 -0
  109. data/lib/rosy/RosyIterator.rb +477 -0
  110. data/lib/rosy/RosyPhase2FeatureExtractors.rb +230 -0
  111. data/lib/rosy/RosyPruning.rb +165 -0
  112. data/lib/rosy/RosyServices.rb +744 -0
  113. data/lib/rosy/RosySplit.rb +232 -0
  114. data/lib/rosy/RosyTask.rb +19 -0
  115. data/lib/rosy/RosyTest.rb +826 -0
  116. data/lib/rosy/RosyTrain.rb +232 -0
  117. data/lib/rosy/RosyTrainingTestTable.rb +786 -0
  118. data/lib/rosy/TargetsMostFrequentFrame.rb +60 -0
  119. data/lib/rosy/View.rb +418 -0
  120. data/lib/rosy/opt_parser.rb +379 -0
  121. data/lib/rosy/rosy.rb +77 -0
  122. data/lib/shalmaneser/version.rb +3 -0
  123. data/test/frprep/test_opt_parser.rb +94 -0
  124. data/test/functional/functional_test_helper.rb +40 -0
  125. data/test/functional/sample_experiment_files/fred_test.salsa.erb +122 -0
  126. data/test/functional/sample_experiment_files/fred_train.salsa.erb +135 -0
  127. data/test/functional/sample_experiment_files/prp_test.salsa.erb +138 -0
  128. data/test/functional/sample_experiment_files/prp_test.salsa.fred.standalone.erb +120 -0
  129. data/test/functional/sample_experiment_files/prp_test.salsa.rosy.standalone.erb +120 -0
  130. data/test/functional/sample_experiment_files/prp_train.salsa.erb +138 -0
  131. data/test/functional/sample_experiment_files/prp_train.salsa.fred.standalone.erb +138 -0
  132. data/test/functional/sample_experiment_files/prp_train.salsa.rosy.standalone.erb +138 -0
  133. data/test/functional/sample_experiment_files/rosy_test.salsa.erb +257 -0
  134. data/test/functional/sample_experiment_files/rosy_train.salsa.erb +259 -0
  135. data/test/functional/test_fred.rb +47 -0
  136. data/test/functional/test_frprep.rb +52 -0
  137. data/test/functional/test_rosy.rb +20 -0
  138. metadata +270 -0
@@ -0,0 +1,182 @@
1
+ # FredConfigData
2
+ # Katrin Erk April 05
3
+ #
4
+ # Frame disambiguation system:
5
+ # access to a configuration and experiment description file
6
+
7
+ require "common/ConfigData"
8
+
9
+ ##############################
10
+ # Class FredConfigData
11
+ #
12
+ # inherits from ConfigData,
13
+ # sets variable names appropriate to WSD task
14
+
15
class FredConfigData < ConfigData
  ###
  # Sets up the configuration object for the experiment file +filename+.
  #
  # Hands three things to ConfigData#initialize:
  #  - the experiment file name,
  #  - a table mapping each parameter name Fred knows to its type
  #    ("string", "bool", "float" or "list"),
  #  - a list of variable names ("train", "exp_ID").
  # Afterwards it registers the access functions for the two
  # list-valued parameters, "classifier" and "feature".
  def initialize(filename)
    # initialize config data object
    super(filename, # config file
          {
            "experiment_ID" => "string", # experiment ID
            "enduser_mode" => "bool", # work in enduser mode? (disallowing many things)

            "preproc_descr_file_train" => "string", # path to preprocessing files
            "preproc_descr_file_test" => "string",
            "directory_output" => "string", # path to Salsa/Tiger XML output directory

            "verbose" => "bool", # print diagnostic messages?
            "apply_to_all_known_targets" => "bool", # apply to all known targets rather than the ones with a frame?

            "fred_directory" => "string", # directory for internal info
            "classifier_dir" => "string", # write classifiers here

            "classifier" => "list", # classifiers

            "dbtype" => "string", # "mysql" or "sqlite"

            # DB access.
            # NOTE(review): this was annotated "sqlite only"; host/user/passwd
            # look like server credentials -- confirm which backend reads them.
            "host" => "string",
            "user" => "string",
            "passwd" => "string",
            "dbname" => "string",

            # featurization info
            "feature" => "list", # which features to use for the classifier?
            "binary_classifiers" => "bool", # make binary rather than n-ary classifiers?
            "negsense" => "string", # binary classifier: negative sense is..?
            "numerical_features" => "string", # do what with numerical features?

            # what to do with items that have multiple senses?
            # 'binarize': binary classifiers, and consider positive
            #             if the sense is among the gold senses
            # 'join'    : make one joint sense
            # 'repeat'  : make multiple occurrences of the item, one sense per occ
            # 'keep'    : keep as separate labels
            #
            # multilabel: consider as assigned all labels
            # above a certain confidence threshold?
            "handle_multilabel" => "string",
            "assignment_confidence_threshold" => "float",

            # single-sentence context?
            "single_sent_context" => "bool",

            # noncontiguous input? then we need access to a larger corpus
            "noncontiguous_input" => "bool",
            "larger_corpus_dir" => "string",
            "larger_corpus_format" => "string",
            "larger_corpus_encoding" => "string"
          },
          [ # variables
            "train",
            "exp_ID"
          ])

    # set access functions for list features
    set_list_feature_access("classifier", method("access_classifier"))
    set_list_feature_access("feature", method("access_feature"))
  end

  ###
  # Access functions for list features.
  # They are public: the file only carries a commented-out "protected" marker.

  #####
  # access_feature
  #
  # access function for the list parameter 'feature'
  #
  # assumed format in the experiment file:
  #
  #   feature = context 50
  #   feature = context 2
  #   feature = syn
  #
  # i.e. first the name of the feature type to use, then optionally a
  # parameter. The same feature can occur more than once (which makes
  # sense only in case of parameters).
  #
  # val_list: array of string arrays, one tuple per 'feature' line;
  #           nil is treated like an empty list (same guard as
  #           access_classifier)
  # feature:  string, feature type name, or nil
  #
  # returns:
  # - if a feature name is given:
  #   - nil if no tuple for that feature exists
  #   - true if the feature exists but none of its tuples has a parameter
  #   - otherwise the list of second tuple elements, converted with to_i
  #     (one entry per tuple; a tuple without parameter yields 0)
  # - if no feature name is given:
  #   a list with one tuple per entry of val_list: the first element is
  #   the feature name (a string), all further elements are the options
  #   converted with to_i
  def access_feature(val_list, # array:array:string: list of tuples defined in config file
                     feature = nil) # string: feature type name
    entries = val_list || []

    unless feature
      # no particular feature requested: describe everything that was set
      return entries.map { |feature_name, *options|
        [feature_name] + options.map { |option| option.to_i }
      }
    end

    # second element (the parameter, possibly nil) of every tuple
    # that belongs to the requested feature
    parameters = entries.select { |tuple| tuple.first == feature }.map { |tuple| tuple[1] }

    if parameters.empty?
      # feature not defined
      nil
    elsif parameters.compact.empty?
      # feature defined, but no parameters
      true
    else
      # feature defined, and has values
      parameters.map { |parameter| parameter.to_i }
    end
  end

  #####
  # access_classifier
  #
  # access function for the list parameter 'classifier'
  #
  # assumed format in the experiment file:
  #
  #   classifier = name [option]*
  #
  # NOTE(review): the earlier wording of this comment was copied from a
  # feature accessor ("feature = path [option]*"); confirm what the first
  # entry of a classifier line denotes.
  #
  # val_list: array of string arrays, one tuple per 'classifier' line,
  #           or nil
  #
  # returns: a list of pairs [first_entry(string), options(array:string)],
  #          one per tuple; an empty list if val_list is nil
  def access_classifier(val_list) # array:array:string: list of tuples defined in config file
    return [] if val_list.nil?

    val_list.map { |cl_descr_tuple| [cl_descr_tuple.first, cl_descr_tuple[1..-1]] }
  end
end
180
+
181
+
182
+
@@ -0,0 +1,232 @@
1
+ # FredConventions
2
+ # Katrin Erk June 05
3
+ #
4
+ # several small things that should be uniform
5
+ # throughout the system
6
+
7
+ require "common/ruby_class_extensions"
8
+
9
+ require "common/EnduserMode"
10
class Object
  ###
  # joining and breaking up senses
  #
  # fred_join_senses: sorts the given senses and joins them with "++".
  def fred_join_senses(senses)
    senses.sort.join("++")
  end

  # fred_split_sense: splits a joined sense string at "++".
  def fred_split_sense(joined_senses)
    joined_senses.split("++")
  end

  ###
  # fred_dirname
  #
  # constructs a directory name from the pieces
  #   fred data directory / experiment ID / maindir / subdir
  #
  # is_existing == "existing": delegates to File.existing_dir
  # is_existing == "new":      delegates to File.new_dir
  # (both helpers are defined outside this file; by the notes that came
  # with this method the first checks for existence and the second
  # creates the directory if necessary)
  #
  # returns: whatever the File helper returns (documented as a string)
  # raises:  RuntimeError for any other value of is_existing
  def fred_dirname(exp,      # FredConfigData object
                   maindir,  # string: main part of directory name
                   subdir,   # string: subpart of directory name
                   is_existing = "existing") # string: "existing" or "new", default: existing
    case is_existing
    when "existing"
      File.existing_dir(exp.get("fred_directory"),
                        exp.get("experiment_ID"),
                        maindir,
                        subdir)
    when "new"
      File.new_dir(exp.get("fred_directory"),
                   exp.get("experiment_ID"),
                   maindir,
                   subdir)
    else
      raise "Shouldn't be here: #{is_existing}"
    end
  end

  ####
  # filenames for feature files
  #
  # With do_binary the sense is appended after a ".SENSE." marker.
  def fred_feature_filename(lemma, sense = nil,
                            do_binary = false)
    if do_binary
      "fred.features.#{lemma}.SENSE.#{sense}"
    else
      "fred.features.#{lemma}"
    end
  end

  ####
  # filenames for split files
  def fred_split_filename(lemma)
    "fred.split.#{lemma}"
  end

  ###
  # deconstruct split filename (directory part is ignored)
  # returns: lemma, or nil if the name is not a split filename
  def deconstruct_fred_split_filename(filename)
    match = /^fred\.split\.(.*)/.match(File.basename(filename))
    match ? match[1] : nil
  end

  ###
  # deconstruct feature file name (directory part is ignored)
  # returns: hash with keys
  #  "lemma"
  #  "sense" (only for binary feature files)
  # or nil if the name is not a feature filename
  def deconstruct_fred_feature_filename(filename)
    basename = File.basename(filename)

    # binary:
    # fred.features.#{lemma}.SENSE.#{sense}
    if (match = /^fred\.features\.(.*)\.SENSE\.(.*)$/.match(basename))
      { "lemma" => match[1], "sense" => match[2] }
    elsif (match = /^fred\.features\.(.*)/.match(basename))
      # fred.features.#{lemma}
      { "lemma" => match[1] }
    else
      # complete mismatch
      nil
    end
  end

  ####
  # filename for answer key files
  def fred_answerkey_filename(lemma)
    "fred.answerkey.#{lemma}"
  end

  ###
  # classifier directory
  #
  # If the experiment file sets "classifier_dir", that directory (plus
  # splitID, if given) is passed to File.new_dir. Otherwise the directory
  # is built with fred_dirname below "classifiers", using splitID or
  # "all" as the subdirectory.
  def fred_classifier_directory(exp,            # FredConfigData object
                                splitID = nil)  # string or nil
    if exp.get("classifier_dir")
      # user-specified classifier directory
      if splitID
        File.new_dir(exp.get("classifier_dir"), splitID)
      else
        File.new_dir(exp.get("classifier_dir"))
      end
    elsif splitID
      # my classifier directory, split-specific
      fred_dirname(exp, "classifiers", splitID, "new")
    else
      # my classifier directory, all data
      fred_dirname(exp, "classifiers", "all", "new")
    end
  end

  ###
  # classifier file
  def fred_classifier_filename(classifier, lemma, sense = nil)
    if sense
      "fred.classif.#{classifier}.LEMMA.#{lemma}.SENSE.#{sense}"
    else
      "fred.classif.#{classifier}.LEMMA.#{lemma}"
    end
  end

  ###
  # deconstruct classifier file name
  #
  # The directory part is ignored, as in the other deconstruct_* methods.
  #
  # returns: hash with keys "lemma" and, for sense-specific classifiers,
  # "sense"; an empty hash if the name is not a classifier filename
  def deconstruct_fred_classifier_filename(filename)
    basename = File.basename(filename)
    retv = Hash.new()
    if (match = /^fred\.classif\.(.*)\.LEMMA\.(.*)\.SENSE\.(.*)$/.match(basename))
      retv["lemma"] = match[2]
      retv["sense"] = match[3]
    elsif (match = /^fred\.classif\.(.*)\.LEMMA\.(.*)$/.match(basename))
      retv["lemma"] = match[2]
    end
    retv
  end

  ###
  # result file; dots in the lemma are replaced by underscores
  def fred_result_filename(lemma)
    "fred.result.#{lemma.gsub(/\./, "_")}"
  end

  ##########
  # lemma and POS: combine into string separated by
  # a separator character (".")
  #
  # fred_lemmapos_combine: take two strings, return combined string.
  #   Dots inside the POS are written as "DOT".
  #   If POS is nil, returns lemma<separator character>
  # fred_lemmapos_separate: take one string, return two strings
  #   [lemma, pos], split at the last separator.
  #   If no POS could be retrieved, returns nil as POS.
  def fred_lemmapos_combine(lemma, # string
                            pos)   # string
    lemma.to_s + "." + pos.to_s.gsub(/\./, "DOT")
  end

  ###
  # Inverse of fred_lemmapos_combine.
  #
  # A string that ends in the separator (which is what
  # fred_lemmapos_combine produces for a nil POS) yields the lemma
  # without the trailing separator and nil as POS, so that combine
  # followed by separate returns the lemma that went in.
  def fred_lemmapos_separate(lemmapos) # string
    lemma, separator, pos = lemmapos.rpartition(".")

    if separator.empty?
      # no separator at all: treat all of lemmapos as lemma
      [lemmapos, nil]
    elsif pos.empty?
      # separator present but nothing after it: POS was nil
      [lemma, nil]
    else
      [lemma, pos]
    end
  end
end
189
+
190
+ ########################################
191
+ # given a SynNode object representing a terminal,
192
+ # return:
193
+ # - the word
194
+ # - the lemma
195
+ # - the part of speech
196
+ # - the named entity (if any)
197
+ #
198
+ # as a tuple
199
+ #
200
+ # WARNING: word and lemma are turned to lowercase
201
module WordLemmaPosNe
  ###
  # word_lemma_pos_ne
  #
  # syn_obj: SynNode object, expected to be a terminal
  # i:       SynInterpreter class; only lemma_backoff(syn_obj) is called on it
  #
  # returns: the tuple [word, lemma, part of speech, named entity]
  #  - word and lemma are lowercased copies; the strings returned by
  #    syn_obj and i are left untouched (so frozen strings are fine)
  #  - lemma is nil if the unescaped lemma is "<unknown>"
  #  - named entity is the node's "ne" attribute, falling back to
  #    "headof_ne"
  #  - for a non-terminal a warning is written to $stderr and
  #    [nil, nil, nil, nil] is returned
  def word_lemma_pos_ne(syn_obj, # SynNode object
                        i)       # SynInterpreter class
    unless syn_obj.is_terminal?
      $stderr.puts "Featurization warning: unexpectedly received non-terminal"
      return [nil, nil, nil, nil]
    end

    word = syn_obj.word
    word = word.downcase if word

    lemma = i.lemma_backoff(syn_obj)
    # the tagger's placeholder for "no lemma found" counts as no lemma
    lemma = nil if lemma && SalsaTigerXMLHelper.unescape(lemma) == "<unknown>"
    lemma = lemma.downcase if lemma

    pos = syn_obj.part_of_speech

    ne = syn_obj.get_attribute("ne") || syn_obj.get_attribute("headof_ne")

    [word, lemma, pos, ne]
  end
end
232
+
@@ -0,0 +1,324 @@
1
+ require "fred/FileZipped"
2
+
3
+ require "fred/FredConfigData"
4
+ require "common/SynInterfaces"
5
+ require "fred/FredConventions"
6
+
7
+
8
+ ########################################
9
+ # target determination classes:
10
+ # either determine targets from existing annotation
11
+ # with frames,
12
+ # or use all known targets.
13
class Targets
  # false if, in mode "r" or "a", the stored target list could not be opened
  attr_reader :targets_okay

  ###
  # exp:               experiment file object
  # interpreter_class: SynInterpreter class, or nil
  # mode:              string: "r", "w", "a", as in files
  #
  # "w": start with an empty target list and record new targets
  # "a": read the stored target list and record new targets
  # "r": read the stored target list, do not record
  #
  # If the stored list cannot be opened in mode "r" or "a",
  # targets_okay is set to false and initialization stops there.
  # NOTE(review): in that case @record_targets is never assigned, so
  # mode "a" without an existing list records nothing -- confirm that
  # callers check targets_okay.
  #
  # Any other mode prints an error and exits the process.
  def initialize(exp,               # experiment file object
                 interpreter_class, # SynInterpreter class, or nil
                 mode)              # string: "r", "w", "a", as in files
    @exp = exp
    @interpreter_class = interpreter_class

    # keep recorded targets here: lemmapos string -> list of senses
    @targets = Hash.new()

    # write target info in the classifier directory.
    # This is _not_ dependent on a potential split ID
    @dir = File.new_dir(fred_classifier_directory(@exp), "targets")

    @targets_okay = true
    case mode
    when "w"
      # start from scratch, no list of targets
    when "a", "r"
      # read existing file containing targets
      begin
        file = FileZipped.new(@dir + "targets.txt.gz")
      rescue
        # no target file present: signal this
        @targets_okay = false
        return
      end

      begin
        file.each { |line|
          match = /^LEMMA (.+) SENSES (.+)$/.match(line.chomp)
          next unless match

          # blanks inside the lemma/POS key are stored as underscores
          @targets[match[1].gsub(/ /, '_')] = match[2].split
        }
      ensure
        # release the handle once the list has been read
        file.close if file.respond_to?(:close)
      end
    else
      $stderr.puts "Error: shouldn't be here."
      exit 1
    end

    @record_targets = ["w", "a"].include?(mode)
  end

  ###
  # determine_targets:
  # for a given SalsaTigerSentence,
  # determine all targets,
  # each as a _single_ main terminal node
  #
  # We need a single terminal node in order
  # to compute the context window
  #
  # returns:
  # hash: target_IDs -> list of senses
  #   where target_IDs is a pair [list of terminal IDs, main terminal ID]
  #
  #   where a sense is represented as a hash:
  #   "sense": sense, a string
  #   "obj":   FrameNode object
  #   "all_targets": list of node IDs, may comprise more than a single node
  #   "lex":   lemma, or multiword expression in canonical form
  #   "sid":   sentence ID
  #
  # This base implementation only raises; subclasses provide the method.
  def determine_targets(sent)
    raise "overwrite me"
  end

  ##
  # returns a list of lemma-pos combined strings
  def get_lemmas()
    @targets.keys
  end

  ##
  # access to lemmas and POS, returns a list of pairs [lemma, pos] (string*string)
  def get_lemma_pos()
    @targets.keys.map { |lemmapos| fred_lemmapos_separate(lemmapos) }
  end

  ##
  # access to senses; an unknown key yields an empty list
  def get_senses(lemmapos) # string, result of fred_lemmapos_combine
    @targets[lemmapos] || []
  end

  ##
  # write the recorded targets to targets.txt.gz in the targets
  # directory, one "LEMMA <key> SENSES <senses...>" line per key.
  # Prints an error and exits if the file cannot be opened for writing.
  def done_reading_targets()
    begin
      file = FileZipped.new(@dir + "targets.txt.gz", "w")
    rescue
      $stderr.puts "Error: Could not write file #{@dir}targets.txt.gz"
      exit 1
    end

    begin
      @targets.each_pair { |lemma, senses|
        file.puts "LEMMA #{lemma} SENSES " + senses.join(" ")
      }
    ensure
      # close even if writing fails half-way
      file.close()
    end
  end

  ###############################
  protected

  ##
  # record: record occurrence of a lemma/sense pair
  # in the @targets data structure.
  # target_info is a hash with (at least) the keys "lex", "pos", "sense";
  # each sense is stored once per lemma/POS key.
  def record(target_info)
    lemmapos = fred_lemmapos_combine(target_info["lex"], target_info["pos"])
    senses = (@targets[lemmapos] ||= Array.new)
    senses << target_info["sense"] unless senses.include? target_info["sense"]
  end
end
148
+
149
+ ########################################
150
class FindTargetsFromFrames < Targets
  ###
  # determine_targets:
  # use the frames already annotated on the sentence to find targets
  #
  # st_sent: SalsaTigerSentence object
  #
  # returns:
  # hash: target_IDs -> list of senses
  #   where target_IDs is a pair [list of terminal IDs, main terminal ID]
  #
  #   where a sense is represented as a hash:
  #   "sense": sense, a string (the frame name)
  #   "obj":   FrameNode object
  #   "all_targets": list of node IDs, may comprise more than a single node
  #   "lex":   lemma, or multiword expression in canonical form
  #   "pos":   part of speech
  #   "sid":   sentence ID
  #
  # Frames without a target, and frames whose main node is not a
  # terminal, contribute nothing. If the object was set up to record
  # targets, every sense found is also passed to record().
  def determine_targets(st_sent) #SalsaTigerSentence object
    retv = Hash.new()

    st_sent.each_frame { |frame_obj|
      # instance-specific computation:
      # target and target positions
      term = nil
      all_targets = nil

      unless frame_obj.target.nil? or frame_obj.target.children.empty?
        # WARNING: at this moment, we are
        # not considering true multiword targets for German.
        # Remove the "no_mwe" parameter in main_node_of_expr
        # to change this
        german = (@exp.get("language") == "de")
        all_targets = frame_obj.target.children()
        term = if german
                 @interpreter_class.main_node_of_expr(all_targets, "no_mwe")
               else
                 # for all other languages: try to figure out
                 # the head target word anyway
                 @interpreter_class.main_node_of_expr(all_targets)
               end
      end

      # don't use parts of a word as main node
      term = term.parent() if term and term.is_splitword?

      if term and term.is_terminal?
        key = [all_targets.map { |t| t.id() }, term.id()]
        retv[key] ||= Array.new()

        # gold POS available, may be in wrong form,
        # i.e. not the same strings that @interpreter_class.category()
        # would return
        gold_pos = frame_obj.target().get_attribute("pos")
        pos = case gold_pos
              when /^[Vv]$/ then "verb"
              when /^[Nn]$/ then "noun"
              when /^[Aa]$/ then "adj"
              when nil then @interpreter_class.category(term)
              else gold_pos
              end

        target_info = {
          "sense" => frame_obj.name(),
          "obj" => frame_obj,
          "all_targets" => frame_obj.target.children().map { |ch| ch.id() },
          "lex" => frame_obj.target().get_attribute("lemma"),
          "pos" => pos,
          "sid" => st_sent.id()
        }
        retv[key] << target_info

        record(target_info) if @record_targets
      end
    }

    return retv
  end
end
235
+
236
+ ########################################
237
class FindAllTargets < Targets
  ###
  # Target determination that proposes every terminal whose lemma/POS
  # pair is in the stored target list, minus stopwords.
  #
  # exp:               experiment file object
  # interpreter_class: SynInterpreter class
  #
  # The stored target list is read via the superclass in mode "r",
  # so nothing is recorded by this class.
  def initialize(exp,
                 interpreter_class)
    # read target info from file
    super(exp, interpreter_class, "r")

    # [lemma, pos] pairs seen in training
    @training_lemmapos_pairs = get_lemma_pos()

    # list of words to exclude from assignment, for now
    @stoplemmas = [
      "have",
      "do",
      "be"
      # "make"
    ]
  end

  ####
  # determine_targets:
  # use all known lemmas, minus stopwords
  #
  # sent: SalsaTigerSentence object
  #
  # returns:
  # hash: target_IDs -> list of senses
  #   where target_IDs is a pair [list of terminal IDs, main terminal ID]
  #
  #   where a sense is represented as a hash:
  #   "sense": always nil here, the sense is unknown
  #   "obj":   always nil here, there is no FrameNode object
  #   "all_targets": list of node IDs, here just the terminal itself
  #   "lex":   lemma
  #   "pos":   part of speech
  #   "sid":   sentence ID
  def determine_targets(sent) #SalsaTigerSentence object
    # map target IDs to list of senses, in our case always a single
    # entry with sense nil, because we assume that the senses of the
    # targets we point out are unknown
    retv = Hash.new()

    # iterate through terminals of the sentence, check for inclusion
    # of their [lemma, pos] pair in @training_lemmapos_pairs
    sent.each_terminal { |node|
      lemma = @interpreter_class.lemma_backoff(node)
      pos = @interpreter_class.category(node)

      # take this node as a target if
      # we know this lemma/POS pair from the training data,
      # and it is not an auxiliary,
      # and it is not in the stopword list
      # and the node does not represent a preposition
      if @training_lemmapos_pairs.include?([lemma, pos]) &&
         !@interpreter_class.auxiliary?(node) &&
         !@stoplemmas.include?(lemma) &&
         pos != "prep"

        key = [[node.id()], node.id()]

        retv[key] = [
          {
            "sense" => nil,
            "obj" => nil,
            "all_targets" => [node.id()],
            "lex" => lemma,
            "pos" => pos,
            "sid" => sent.id()
          }
        ]
        # no recording of target info,
        # since we haven't determined anything new
      end
    }

    return retv
  end
end
324
+