shalmaneser 1.2.0.rc4 → 1.2.rc5

Sign up to get free protection for your applications and to get access to all the features.
Files changed (115) hide show
  1. checksums.yaml +4 -4
  2. data/README.md +47 -18
  3. data/bin/shalmaneser +8 -2
  4. data/doc/index.md +1 -0
  5. data/lib/shalmaneser/opt_parser.rb +68 -67
  6. metadata +49 -119
  7. data/bin/fred +0 -16
  8. data/bin/frprep +0 -34
  9. data/bin/rosy +0 -17
  10. data/lib/common/AbstractSynInterface.rb +0 -1229
  11. data/lib/common/Counter.rb +0 -18
  12. data/lib/common/EnduserMode.rb +0 -27
  13. data/lib/common/Eval.rb +0 -480
  14. data/lib/common/FixSynSemMapping.rb +0 -196
  15. data/lib/common/Graph.rb +0 -345
  16. data/lib/common/ISO-8859-1.rb +0 -24
  17. data/lib/common/ML.rb +0 -186
  18. data/lib/common/Mallet.rb +0 -236
  19. data/lib/common/Maxent.rb +0 -229
  20. data/lib/common/Optimise.rb +0 -195
  21. data/lib/common/Parser.rb +0 -213
  22. data/lib/common/RegXML.rb +0 -269
  23. data/lib/common/RosyConventions.rb +0 -171
  24. data/lib/common/STXmlTerminalOrder.rb +0 -194
  25. data/lib/common/SalsaTigerRegXML.rb +0 -2347
  26. data/lib/common/SalsaTigerXMLHelper.rb +0 -99
  27. data/lib/common/SynInterfaces.rb +0 -282
  28. data/lib/common/TabFormat.rb +0 -721
  29. data/lib/common/Tiger.rb +0 -1448
  30. data/lib/common/Timbl.rb +0 -144
  31. data/lib/common/Tree.rb +0 -61
  32. data/lib/common/config_data.rb +0 -470
  33. data/lib/common/config_format_element.rb +0 -220
  34. data/lib/common/headz.rb +0 -338
  35. data/lib/common/option_parser.rb +0 -13
  36. data/lib/common/prep_config_data.rb +0 -62
  37. data/lib/common/prep_helper.rb +0 -1330
  38. data/lib/common/ruby_class_extensions.rb +0 -310
  39. data/lib/db/db_interface.rb +0 -48
  40. data/lib/db/db_mysql.rb +0 -145
  41. data/lib/db/db_sqlite.rb +0 -280
  42. data/lib/db/db_table.rb +0 -239
  43. data/lib/db/db_wrapper.rb +0 -176
  44. data/lib/db/sql_query.rb +0 -243
  45. data/lib/ext/maxent/Classify.class +0 -0
  46. data/lib/ext/maxent/Train.class +0 -0
  47. data/lib/fred/Baseline.rb +0 -150
  48. data/lib/fred/FileZipped.rb +0 -31
  49. data/lib/fred/FredBOWContext.rb +0 -877
  50. data/lib/fred/FredConventions.rb +0 -232
  51. data/lib/fred/FredDetermineTargets.rb +0 -319
  52. data/lib/fred/FredEval.rb +0 -312
  53. data/lib/fred/FredFeatureExtractors.rb +0 -322
  54. data/lib/fred/FredFeatures.rb +0 -1061
  55. data/lib/fred/FredFeaturize.rb +0 -602
  56. data/lib/fred/FredNumTrainingSenses.rb +0 -27
  57. data/lib/fred/FredParameters.rb +0 -402
  58. data/lib/fred/FredSplit.rb +0 -84
  59. data/lib/fred/FredSplitPkg.rb +0 -180
  60. data/lib/fred/FredTest.rb +0 -606
  61. data/lib/fred/FredTrain.rb +0 -144
  62. data/lib/fred/PlotAndREval.rb +0 -480
  63. data/lib/fred/fred.rb +0 -47
  64. data/lib/fred/fred_config_data.rb +0 -185
  65. data/lib/fred/md5.rb +0 -23
  66. data/lib/fred/opt_parser.rb +0 -250
  67. data/lib/frprep/Ampersand.rb +0 -39
  68. data/lib/frprep/CollinsInterface.rb +0 -1165
  69. data/lib/frprep/Counter.rb +0 -18
  70. data/lib/frprep/FNCorpusXML.rb +0 -643
  71. data/lib/frprep/FNDatabase.rb +0 -144
  72. data/lib/frprep/FrameXML.rb +0 -513
  73. data/lib/frprep/Graph.rb +0 -345
  74. data/lib/frprep/MiniparInterface.rb +0 -1388
  75. data/lib/frprep/RegXML.rb +0 -269
  76. data/lib/frprep/STXmlTerminalOrder.rb +0 -194
  77. data/lib/frprep/SleepyInterface.rb +0 -384
  78. data/lib/frprep/TntInterface.rb +0 -44
  79. data/lib/frprep/TreetaggerInterface.rb +0 -327
  80. data/lib/frprep/do_parses.rb +0 -143
  81. data/lib/frprep/frprep.rb +0 -693
  82. data/lib/frprep/interfaces/berkeley_interface.rb +0 -372
  83. data/lib/frprep/interfaces/stanford_interface.rb +0 -353
  84. data/lib/frprep/interpreters/berkeley_interpreter.rb +0 -22
  85. data/lib/frprep/interpreters/stanford_interpreter.rb +0 -22
  86. data/lib/frprep/one_parsed_file.rb +0 -28
  87. data/lib/frprep/opt_parser.rb +0 -94
  88. data/lib/frprep/ruby_class_extensions.rb +0 -310
  89. data/lib/rosy/AbstractFeatureAndExternal.rb +0 -242
  90. data/lib/rosy/ExternalConfigData.rb +0 -58
  91. data/lib/rosy/FailedParses.rb +0 -130
  92. data/lib/rosy/FeatureInfo.rb +0 -242
  93. data/lib/rosy/GfInduce.rb +0 -1115
  94. data/lib/rosy/GfInduceFeature.rb +0 -148
  95. data/lib/rosy/InputData.rb +0 -294
  96. data/lib/rosy/RosyConfusability.rb +0 -338
  97. data/lib/rosy/RosyEval.rb +0 -465
  98. data/lib/rosy/RosyFeatureExtractors.rb +0 -1609
  99. data/lib/rosy/RosyFeaturize.rb +0 -281
  100. data/lib/rosy/RosyInspect.rb +0 -336
  101. data/lib/rosy/RosyIterator.rb +0 -478
  102. data/lib/rosy/RosyPhase2FeatureExtractors.rb +0 -230
  103. data/lib/rosy/RosyPruning.rb +0 -165
  104. data/lib/rosy/RosyServices.rb +0 -744
  105. data/lib/rosy/RosySplit.rb +0 -232
  106. data/lib/rosy/RosyTask.rb +0 -19
  107. data/lib/rosy/RosyTest.rb +0 -829
  108. data/lib/rosy/RosyTrain.rb +0 -234
  109. data/lib/rosy/RosyTrainingTestTable.rb +0 -787
  110. data/lib/rosy/TargetsMostFrequentFrame.rb +0 -60
  111. data/lib/rosy/View.rb +0 -418
  112. data/lib/rosy/opt_parser.rb +0 -379
  113. data/lib/rosy/rosy.rb +0 -78
  114. data/lib/rosy/rosy_config_data.rb +0 -121
  115. data/lib/shalmaneser/version.rb +0 -3
@@ -1,232 +0,0 @@
1
- # FredConventions
2
- # Katrin Erk June 05
3
- #
4
- # several small things that should be uniform
5
- # throughout the system
6
-
7
- require "common/ruby_class_extensions"
8
-
9
- require "common/EnduserMode"
10
- class Object
11
-
12
- ###
13
- # joining and breaking up senses
14
- def fred_join_senses(senses)
15
- return senses.sort().join("++")
16
- end
17
-
18
- def fred_split_sense(joined_senses)
19
- return joined_senses.split("++")
20
- end
21
-
22
- ###
23
- # fred_dirname
24
- #
25
- # constructs a directory name:
26
- # fred data directory / experiment ID / maindir / subdir
27
- #
28
- # if is_existing == "existing", the directory is checked for existence,
29
- # if is_existing == "new", it is created if necessary
30
- #
31
- # returns: a string
32
- def fred_dirname(exp, # FredConfigData object
33
- maindir, # string: main part of directory name
34
- subdir, # string: subpart of directory name
35
- is_existing = "existing") # string: "existing" or "new", default: existing
36
-
37
- case is_existing
38
- when "existing"
39
- return File.existing_dir(exp.get("fred_directory"),
40
- exp.get("experiment_ID"),
41
- maindir,
42
- subdir)
43
- when "new"
44
- return File.new_dir(exp.get("fred_directory"),
45
- exp.get("experiment_ID"),
46
- maindir,
47
- subdir)
48
- else
49
- raise "Shouldn't be here: #{is_existing}"
50
- end
51
- end
52
-
53
- ####
54
- # filenames for feature files
55
- def fred_feature_filename(lemma, sense = nil,
56
- do_binary = false)
57
- if do_binary
58
- return "fred.features.#{lemma}.SENSE.#{sense}"
59
- else
60
- return "fred.features.#{lemma}"
61
- end
62
- end
63
-
64
- ####
65
- # filenames for split files
66
- def fred_split_filename(lemma)
67
- return "fred.split.#{lemma}"
68
- end
69
-
70
- ###
71
- # deconstruct split filename
72
- # returns: lemma
73
- def deconstruct_fred_split_filename(filename)
74
- basename = File.basename(filename)
75
- if basename =~ /^fred\.split\.(.*)/
76
- return $1
77
- else
78
- return nil
79
- end
80
- end
81
-
82
- ###
83
- # deconstruct feature file name
84
- # returns: hash with keys
85
- # "lemma"
86
- # "sense"
87
- def deconstruct_fred_feature_filename(filename)
88
-
89
- basename = File.basename(filename)
90
- retv = Hash.new()
91
- # binary:
92
- # fred.features.#{lemma}.SENSE.#{sense}
93
- if basename =~ /^fred\.features\.(.*)\.SENSE\.(.*)$/
94
- retv["lemma"] = $1
95
- retv["sense"] = $2
96
- elsif basename =~ /^fred\.features\.(.*)/
97
- # fred.features.#{lemma}
98
- retv["lemma"] = $1
99
-
100
- else
101
- # complete mismatch
102
- return nil
103
- end
104
-
105
- return retv
106
- end
107
-
108
- ####
109
- # filename for answer key files
110
- def fred_answerkey_filename(lemma)
111
- return "fred.answerkey.#{lemma}"
112
- end
113
-
114
- ###
115
- # classifier directory
116
- def fred_classifier_directory(exp, # FredConfigData object
117
- splitID = nil) # string or nil
118
-
119
- if exp.get("classifier_dir")
120
- # user-specified classifier directory
121
-
122
- if splitID
123
- return File.new_dir(exp.get("classifier_dir"), splitID)
124
- else
125
- return File.new_dir(exp.get("classifier_dir"))
126
- end
127
-
128
- else
129
- # my classifier directory
130
- if splitID
131
- return fred_dirname(exp, "classifiers", splitID, "new")
132
- else
133
- return fred_dirname(exp, "classifiers", "all", "new")
134
- end
135
- end
136
- end
137
-
138
- ###
139
- # classifier file
140
- def fred_classifier_filename(classifier, lemma, sense=nil)
141
- if sense
142
- return "fred.classif.#{classifier}.LEMMA.#{lemma}.SENSE.#{sense}"
143
- else
144
- return "fred.classif.#{classifier}.LEMMA.#{lemma}"
145
- end
146
- end
147
-
148
- def deconstruct_fred_classifier_filename(filename)
149
- retv = Hash.new()
150
- if filename =~ /^fred\.classif\.(.*)\.LEMMA\.(.*)\.SENSE\.(.*)$/
151
- retv["lemma"] = $2
152
- retv["sense"] = $3
153
- elsif filename =~ /^fred\.classif\.(.*)\.LEMMA\.(.*)$/
154
- retv["lemma"] = $2
155
- end
156
- return retv
157
- end
158
-
159
- ###
160
- # result file
161
- def fred_result_filename(lemma)
162
- return "fred.result.#{lemma.gsub(/\./, "_")}"
163
- end
164
-
165
- ##########
166
- # lemma and POS: combine into string separated by
167
- # a separator character
168
- #
169
- # fred_lemmapos_combine: take two strings, return combined string
170
- # if POS is nil, returns lemma<separator character>
171
- # fred_lemmapos_separate: take one string, return two strings
172
- # if no POS could be retrieved, returns nil as POS and the whole string as lemma
173
- def fred_lemmapos_combine(lemma, # string
174
- pos) # string
175
- return lemma.to_s + "." + pos.to_s.gsub(/\./, "DOT")
176
- end
177
-
178
- ###
179
- def fred_lemmapos_separate(lemmapos) # string
180
- pieces = lemmapos.split(".")
181
- if pieces.length() > 1
182
- return [ pieces[0..-2].join("."), pieces[-1] ]
183
- else
184
- # no POS found, treat all of lemmapos as lemma
185
- return [ lemmapos, nil ]
186
- end
187
- end
188
- end
189
-
190
- ########################################
191
- # given a SynNode object representing a terminal,
192
- # return:
193
- # - the word
194
- # - the lemma
195
- # - the part of speech
196
- # - the named entity (if any)
197
- #
198
- # as a tuple
199
- #
200
- # WARNING: word and lemma are turned to lowercase
201
- module WordLemmaPosNe
202
- def word_lemma_pos_ne(syn_obj, # SynNode object
203
- i) # SynInterpreter class
204
- unless syn_obj.is_terminal?
205
- $stderr.puts "Featurization warning: unexpectedly received non-terminal"
206
- return [ nil, nil, nil, nil ]
207
- end
208
-
209
- word = syn_obj.word()
210
- if word
211
- word.downcase!
212
- end
213
-
214
- lemma = i.lemma_backoff(syn_obj)
215
- if lemma and SalsaTigerXMLHelper.unescape(lemma) == "<unknown>"
216
- lemma = nil
217
- end
218
- if lemma
219
- lemma.downcase!
220
- end
221
-
222
- pos = syn_obj.part_of_speech()
223
-
224
- ne = syn_obj.get_attribute("ne")
225
- unless ne
226
- ne = syn_obj.get_attribute("headof_ne")
227
- end
228
-
229
- return [word, lemma, pos, ne]
230
- end
231
- end
232
-
@@ -1,319 +0,0 @@
1
- require "fred/FileZipped"
2
-
3
- require "fred/fred_config_data"
4
- require "common/SynInterfaces"
5
- require "fred/FredConventions"
6
-
7
-
8
- ########################################
9
- # target determination classes:
10
- # either determine targets from existing annotation
11
- # with frames,
12
- # or use all known targets.
13
- class Targets
14
- attr_reader :targets_okay
15
-
16
- ###
17
- def initialize(exp, # experiment file object
18
- interpreter_class, # SynInterpreter class, or nil
19
- mode) # string: "r", "w", "a", as in files
20
- @exp = exp
21
- @interpreter_class = interpreter_class
22
-
23
- # keep recorded targets here.
24
- # try to read old list now.
25
- @targets = Hash.new()
26
-
27
- # write target info in the classifier directory.
28
- # This is _not_ dependent on a potential split ID
29
- @dir = File.new_dir(fred_classifier_directory(@exp), "targets")
30
-
31
- @targets_okay = true
32
- case mode
33
- when "w"
34
- # start from scratch, no list of targets
35
- when "a", "r"
36
- # read existing file containing targets
37
- begin
38
- file = FileZipped.new(@dir + "targets.txt.gz")
39
- rescue
40
- # no pickle present: signal this
41
- @targets_okay = false
42
- return
43
- end
44
- file.each { |line|
45
- line.chomp!
46
- if line =~ /^LEMMA (.+) SENSES (.+)$/
47
- lemmapos = $1
48
- senses = $2.split()
49
- lemmapos.gsub!(/ /, '_')
50
- #lemmapos.gsub!(/\.[A-Z]\./, '.')
51
- @targets[lemmapos] = senses
52
- end
53
- }
54
-
55
- else
56
- $stderr.puts "Error: shouldn't be here."
57
- exit 1
58
- end
59
-
60
- if ["w", "a"].include? mode
61
- @record_targets = true
62
- else
63
- @record_targets = false
64
- end
65
- end
66
-
67
- ###
68
- # determine_targets:
69
- # for a given SalsaTigerSentence,
70
- # determine all targets,
71
- # each as a _single_ main terminal node
72
- #
73
- # We need a single terminal node in order
74
- # to compute the context window
75
- #
76
- # returns:
77
- # hash: target_IDs -> list of senses
78
- # where target_IDs is a pair [list of terminal IDs, main terminal ID]
79
- #
80
- # where a sense is represented as a hash:
81
- # "sense": sense, a string
82
- # "obj": FrameNode object
83
- # "all_targets": list of node IDs, may comprise more than a single node
84
- # "lex": lemma, or multiword expression in canonical form
85
- # "sid": sentence ID
86
- def determine_targets(sent)
87
- raise "overwrite me"
88
- end
89
-
90
- ##
91
- # returns a list of lemma-pos combined strings
92
- def get_lemmas()
93
- return @targets.keys()
94
- end
95
-
96
- ##
97
- # access to lemmas and POS, returns a list of pairs [lemma, pos] (string*string)
98
- def get_lemma_pos()
99
-
100
- return @targets.keys().map { |lemmapos| fred_lemmapos_separate(lemmapos) }
101
- end
102
-
103
- ##
104
- # access to senses
105
- def get_senses(lemmapos) # string, result of fred_lemmapos_combine
106
- @targets[lemmapos] ? @targets[lemmapos] : []
107
- end
108
-
109
- ##
110
- # write file
111
- def done_reading_targets()
112
- begin
113
- file = FileZipped.new(@dir + "targets.txt.gz", "w")
114
- rescue
115
- $stderr.puts "Error: Could not write file #{@dir}targets.txt.gz"
116
- exit 1
117
- end
118
-
119
- @targets.each_pair { |lemma, senses|
120
- file.puts "LEMMA #{lemma} SENSES "+ senses.join(" ")
121
- }
122
-
123
- file.close
124
- end
125
-
126
- ###############################
127
- protected
128
-
129
- ##
130
- # record: record occurrence of a lemma/sense pair
131
- # <@targets> data structure
132
- def record(target_info)
133
- lemmapos = fred_lemmapos_combine(target_info["lex"], target_info["pos"])
134
- unless @targets[lemmapos]
135
- @targets[lemmapos] = []
136
- end
137
-
138
- unless @targets[lemmapos].include? target_info["sense"]
139
- @targets[lemmapos] << target_info["sense"]
140
- end
141
- end
142
- end
143
-
144
- ########################################
145
- class FindTargetsFromFrames < Targets
146
- ###
147
- # determine_targets:
148
- # use existing frames to find targets
149
- #
150
- # returns:
151
- # hash: target_IDs -> list of senses
152
- # where target_IDs is a pair [list of terminal IDs, main terminal ID]
153
- #
154
- # where a sense is represented as a hash:
155
- # "sense": sense, a string
156
- # "obj": FrameNode object
157
- # "all_targets": list of node IDs, may comprise more than a single node
158
- # "lex": lemma, or multiword expression in canonical form
159
- # "sid": sentence ID
160
- def determine_targets(st_sent) #SalsaTigerSentence object
161
- retv = Hash.new()
162
- st_sent.each_frame { |frame_obj|
163
- # instance-specific computation:
164
- # target and target positions
165
- # WARNING: at this moment, we are
166
- # not considering true multiword targets for German.
167
- # Remove the "no_mwe" parameter in main_node_of_expr
168
- # to change this
169
- term = nil
170
- all_targets = nil
171
- if frame_obj.target.nil? or frame_obj.target.children.empty?
172
- # no target, nothing to record
173
-
174
- elsif @exp.get("language") == "de"
175
- # don't consider true multiword targets for German
176
- all_targets = frame_obj.target.children()
177
- term = @interpreter_class.main_node_of_expr(all_targets, "no_mwe")
178
-
179
- else
180
- # for all other languages: try to figure out the head target word
181
- # anyway
182
- all_targets = frame_obj.target.children()
183
- term = @interpreter_class.main_node_of_expr(all_targets)
184
- end
185
-
186
- if term and term.is_splitword?
187
- # don't use parts of a word as main node
188
- term = term.parent()
189
- end
190
- if term and term.is_terminal?
191
- key = [all_targets.map { |t| t.id() }, term.id()]
192
-
193
- unless retv[key]
194
- retv[key] = Array.new()
195
- end
196
-
197
- pos = frame_obj.target().get_attribute("pos")
198
- # gold POS available, may be in wrong form,
199
- # i.e. not the same strings that @interpreter_class.category()
200
- # would return
201
- case pos
202
- when /^[Vv]$/
203
- pos = "verb"
204
- when /^[Nn]$/
205
- pos = "noun"
206
- when /^[Aa]$/
207
- pos = "adj"
208
- when nil
209
- pos = @interpreter_class.category(term)
210
- end
211
-
212
- target_info = {
213
- "sense" => frame_obj.name(),
214
- "obj" => frame_obj,
215
- "all_targets" => frame_obj.target.children().map { |ch| ch.id() },
216
- "lex" => frame_obj.target().get_attribute("lemma"),
217
- "pos" => pos,
218
- "sid" => st_sent.id()
219
- }
220
- #print "lex ", frame_obj.target(), " und ",frame_obj.target().get_attribute("lemma"), "\n"
221
- retv[key] << target_info
222
- if @record_targets
223
- record(target_info)
224
- end
225
- end
226
- }
227
- return retv
228
- end
229
- end
230
-
231
- ########################################
232
- class FindAllTargets < Targets
233
- ###
234
- # determine_targets:
235
- # use all known lemmas, minus stopwords
236
- def initialize(exp,
237
- interpreter_class)
238
- # read target info from file
239
- super(exp, interpreter_class, "r")
240
- @training_lemmapos_pairs = get_lemma_pos()
241
-
242
- get_senses(@training_lemmapos_pairs)
243
- # list of words to exclude from assignment, for now
244
- @stoplemmas = [
245
- "have",
246
- "do",
247
- "be"
248
- # "make"
249
- ]
250
-
251
- end
252
-
253
- ####
254
- #
255
- # returns:
256
- # hash: target_IDs -> list of senses
257
- # where target_IDs is a pair [list of terminal IDs, main terminal ID]
258
- #
259
- # where a sense is represented as a hash:
260
- # "sense": sense, a string
261
- # "obj": FrameNode object
262
- # "all_targets": list of node IDs, may comprise more than a single node
263
- # "lex": lemma, or multiword expression in canonical form
264
- # "sid": sentence ID
265
- def determine_targets(sent) #SalsaTigerSentence object
266
- # map target IDs to list of senses, in our case always [ nil ]
267
- # because we assume that the senses of the targets we point out
268
- # are unknown
269
- retv = Hash.new()
270
- # iterate through terminals of the sentence, check for inclusion
271
- # of their lemma in @training_lemmas
272
- sent.each_terminal { |node|
273
- # we know this lemma from the training data,
274
- # and it is not an auxiliary,
275
- # and it is not in the stopword list
276
- # and the node does not represent a preposition
277
-
278
- ### modified by ines, 17.10.2008
279
- lemma = @interpreter_class.lemma_backoff(node)
280
- pos = @interpreter_class.category(node)
281
-
282
- # print "lemma ", lemma, " pos ", pos, "\n"
283
- # reg = /\.[ANV]/
284
- # if !reg.match(lemma)
285
- # if /verb/.match(pos)
286
- # lemma = lemma + ".V"
287
- # elsif /noun/.match(pos)
288
- # lemma = lemma + ".N"
289
- # elsif /adj/.match(pos)
290
- # lemma = lemma + ".A"
291
- # end
292
- # print "LEMMA ", lemma, " POS ", pos, "\n"
293
- # end
294
-
295
- if (@training_lemmapos_pairs.include? [lemma, pos] and
296
- not(@interpreter_class.auxiliary?(node)) and
297
- not(@stoplemmas.include? lemma) and
298
- not(pos == "prep"))
299
- key = [ [ node.id() ], node.id() ]
300
-
301
- # take this as a target.
302
- retv[ key ] = [
303
- {
304
- "sense" => nil,
305
- "obj" => nil,
306
- "all_targets" => [ node.id() ],
307
- "lex" => lemma,
308
- "pos" => pos,
309
- "sid" => sent.id()
310
- } ]
311
- # no recording of target info,
312
- # since we haven't determined anything new
313
- end
314
- }
315
-
316
- return retv
317
- end
318
- end
319
-