shalmaneser 1.2.0.rc4 → 1.2.rc5

Sign up to get free protection for your applications and to get access to all the features.
Files changed (115) hide show
  1. checksums.yaml +4 -4
  2. data/README.md +47 -18
  3. data/bin/shalmaneser +8 -2
  4. data/doc/index.md +1 -0
  5. data/lib/shalmaneser/opt_parser.rb +68 -67
  6. metadata +49 -119
  7. data/bin/fred +0 -16
  8. data/bin/frprep +0 -34
  9. data/bin/rosy +0 -17
  10. data/lib/common/AbstractSynInterface.rb +0 -1229
  11. data/lib/common/Counter.rb +0 -18
  12. data/lib/common/EnduserMode.rb +0 -27
  13. data/lib/common/Eval.rb +0 -480
  14. data/lib/common/FixSynSemMapping.rb +0 -196
  15. data/lib/common/Graph.rb +0 -345
  16. data/lib/common/ISO-8859-1.rb +0 -24
  17. data/lib/common/ML.rb +0 -186
  18. data/lib/common/Mallet.rb +0 -236
  19. data/lib/common/Maxent.rb +0 -229
  20. data/lib/common/Optimise.rb +0 -195
  21. data/lib/common/Parser.rb +0 -213
  22. data/lib/common/RegXML.rb +0 -269
  23. data/lib/common/RosyConventions.rb +0 -171
  24. data/lib/common/STXmlTerminalOrder.rb +0 -194
  25. data/lib/common/SalsaTigerRegXML.rb +0 -2347
  26. data/lib/common/SalsaTigerXMLHelper.rb +0 -99
  27. data/lib/common/SynInterfaces.rb +0 -282
  28. data/lib/common/TabFormat.rb +0 -721
  29. data/lib/common/Tiger.rb +0 -1448
  30. data/lib/common/Timbl.rb +0 -144
  31. data/lib/common/Tree.rb +0 -61
  32. data/lib/common/config_data.rb +0 -470
  33. data/lib/common/config_format_element.rb +0 -220
  34. data/lib/common/headz.rb +0 -338
  35. data/lib/common/option_parser.rb +0 -13
  36. data/lib/common/prep_config_data.rb +0 -62
  37. data/lib/common/prep_helper.rb +0 -1330
  38. data/lib/common/ruby_class_extensions.rb +0 -310
  39. data/lib/db/db_interface.rb +0 -48
  40. data/lib/db/db_mysql.rb +0 -145
  41. data/lib/db/db_sqlite.rb +0 -280
  42. data/lib/db/db_table.rb +0 -239
  43. data/lib/db/db_wrapper.rb +0 -176
  44. data/lib/db/sql_query.rb +0 -243
  45. data/lib/ext/maxent/Classify.class +0 -0
  46. data/lib/ext/maxent/Train.class +0 -0
  47. data/lib/fred/Baseline.rb +0 -150
  48. data/lib/fred/FileZipped.rb +0 -31
  49. data/lib/fred/FredBOWContext.rb +0 -877
  50. data/lib/fred/FredConventions.rb +0 -232
  51. data/lib/fred/FredDetermineTargets.rb +0 -319
  52. data/lib/fred/FredEval.rb +0 -312
  53. data/lib/fred/FredFeatureExtractors.rb +0 -322
  54. data/lib/fred/FredFeatures.rb +0 -1061
  55. data/lib/fred/FredFeaturize.rb +0 -602
  56. data/lib/fred/FredNumTrainingSenses.rb +0 -27
  57. data/lib/fred/FredParameters.rb +0 -402
  58. data/lib/fred/FredSplit.rb +0 -84
  59. data/lib/fred/FredSplitPkg.rb +0 -180
  60. data/lib/fred/FredTest.rb +0 -606
  61. data/lib/fred/FredTrain.rb +0 -144
  62. data/lib/fred/PlotAndREval.rb +0 -480
  63. data/lib/fred/fred.rb +0 -47
  64. data/lib/fred/fred_config_data.rb +0 -185
  65. data/lib/fred/md5.rb +0 -23
  66. data/lib/fred/opt_parser.rb +0 -250
  67. data/lib/frprep/Ampersand.rb +0 -39
  68. data/lib/frprep/CollinsInterface.rb +0 -1165
  69. data/lib/frprep/Counter.rb +0 -18
  70. data/lib/frprep/FNCorpusXML.rb +0 -643
  71. data/lib/frprep/FNDatabase.rb +0 -144
  72. data/lib/frprep/FrameXML.rb +0 -513
  73. data/lib/frprep/Graph.rb +0 -345
  74. data/lib/frprep/MiniparInterface.rb +0 -1388
  75. data/lib/frprep/RegXML.rb +0 -269
  76. data/lib/frprep/STXmlTerminalOrder.rb +0 -194
  77. data/lib/frprep/SleepyInterface.rb +0 -384
  78. data/lib/frprep/TntInterface.rb +0 -44
  79. data/lib/frprep/TreetaggerInterface.rb +0 -327
  80. data/lib/frprep/do_parses.rb +0 -143
  81. data/lib/frprep/frprep.rb +0 -693
  82. data/lib/frprep/interfaces/berkeley_interface.rb +0 -372
  83. data/lib/frprep/interfaces/stanford_interface.rb +0 -353
  84. data/lib/frprep/interpreters/berkeley_interpreter.rb +0 -22
  85. data/lib/frprep/interpreters/stanford_interpreter.rb +0 -22
  86. data/lib/frprep/one_parsed_file.rb +0 -28
  87. data/lib/frprep/opt_parser.rb +0 -94
  88. data/lib/frprep/ruby_class_extensions.rb +0 -310
  89. data/lib/rosy/AbstractFeatureAndExternal.rb +0 -242
  90. data/lib/rosy/ExternalConfigData.rb +0 -58
  91. data/lib/rosy/FailedParses.rb +0 -130
  92. data/lib/rosy/FeatureInfo.rb +0 -242
  93. data/lib/rosy/GfInduce.rb +0 -1115
  94. data/lib/rosy/GfInduceFeature.rb +0 -148
  95. data/lib/rosy/InputData.rb +0 -294
  96. data/lib/rosy/RosyConfusability.rb +0 -338
  97. data/lib/rosy/RosyEval.rb +0 -465
  98. data/lib/rosy/RosyFeatureExtractors.rb +0 -1609
  99. data/lib/rosy/RosyFeaturize.rb +0 -281
  100. data/lib/rosy/RosyInspect.rb +0 -336
  101. data/lib/rosy/RosyIterator.rb +0 -478
  102. data/lib/rosy/RosyPhase2FeatureExtractors.rb +0 -230
  103. data/lib/rosy/RosyPruning.rb +0 -165
  104. data/lib/rosy/RosyServices.rb +0 -744
  105. data/lib/rosy/RosySplit.rb +0 -232
  106. data/lib/rosy/RosyTask.rb +0 -19
  107. data/lib/rosy/RosyTest.rb +0 -829
  108. data/lib/rosy/RosyTrain.rb +0 -234
  109. data/lib/rosy/RosyTrainingTestTable.rb +0 -787
  110. data/lib/rosy/TargetsMostFrequentFrame.rb +0 -60
  111. data/lib/rosy/View.rb +0 -418
  112. data/lib/rosy/opt_parser.rb +0 -379
  113. data/lib/rosy/rosy.rb +0 -78
  114. data/lib/rosy/rosy_config_data.rb +0 -121
  115. data/lib/shalmaneser/version.rb +0 -3
@@ -1,269 +0,0 @@
1
- # RegXML
2
- #
3
- # Katrin Erk June 2005
4
-
5
- # SalsaTigerRegXML: take control of the data structure, no underlying xml
6
- # representation anymore, re-generation of xml on demand
7
-
8
# RegXML: a minimal regular-expression based view of one XML element
# (or of one piece of text between elements).
#
# The object keeps the element as a string and re-parses it on demand:
# name, attributes and children are all computed from the string by
# regular expressions whenever they are requested.
#
# Limitations visible in the code: '<' and '>' are counted wherever they
# occur (see dyck_test), so bracket characters inside attribute values or
# text are effectively unsupported.
class RegXML

  # string:    string representing a single XML element
  #            (or arbitrary text if i_am_text is true)
  # i_am_text: boolean: xml element (false) or text (true)
  #
  # Raises RuntimeError if 'string' is not a String, or (element mode only)
  # if it does not look like exactly one well-bracketed XML element.
  def initialize(string, i_am_text = false)
    # is_a? rather than a class equality test, so String subclasses work too
    unless string.is_a?(String)
      raise "First argument to RegXML.new must be string. I got #{string.class.to_s}"
    end

    if i_am_text
      @s = string
      @i_am_text = true
    else
      # newlines are turned into blanks so that '.', '^' and '$' in the
      # regular expressions below see the element as one single line
      @s = string.gsub(/\n/, " ").freeze
      @i_am_text = false

      element_test
      dyck_test
    end
  end

  # Returns the stored string with a newline inserted after every '>'.
  def to_s
    xml_readable(@s)
  end

  # Returns true if this object represents text rather than an element.
  def text?
    @i_am_text
  end

  # Returns the element name, or nil for text.
  # Raises RuntimeError if no element name can be found.
  def name
    return nil if @i_am_text

    m = /^\s*<\s*([\w-]+)[\s\/>]/.match(@s)
    raise "Cannot parse:\n#{xml_readable(@s)}" unless m
    m[1]
  end

  # Returns a hash attribute name => attribute value (both strings)
  # for the start tag of this element; an empty hash for text.
  # Raises RuntimeError if the start tag cannot be parsed.
  def attributes
    return {} if @i_am_text

    # remove <element_name from the beginning of @s,
    # place the rest into elt_contents:
    # it starts with a string of the form
    # - either (name=value)*>
    # - or (name=value)*/>
    m = /^\s*<\s*#{Regexp.escape(name)}(.*)$/.match(@s)
    raise "Cannot parse:\n #{xml_readable(@s)}" unless m

    retv = {}
    elt_contents = m[1]

    # repeat until only > or /> is left at the front
    until elt_contents =~ /^\s*\/?>/
      # shave off the next name=value pair, keep the rest.
      # The closing quote must be the same character as the opening one,
      # so a value quoted with ' may contain " and vice versa.
      pair = /^\s*([\w-]+)=(['"])(.*?)\2(.*)$/.match(elt_contents)
      raise "Cannot parse:\n #{xml_readable(elt_contents)}" unless pair

      retv[pair[1]] = pair[3]
      elt_contents = pair[4]
    end

    retv
  end

  # Returns an array of RegXML objects, one per child element and one per
  # stretch of non-whitespace text, in document order.
  # Text objects and unary elements (<bla/>) have no children.
  # Raises RuntimeError if the content cannot be parsed.
  def children_and_text
    return [] if @i_am_text
    # <bla/>, no children
    return [] if unary_element

    # @s has the form <bla...> ... </bla>.
    # remove <bla ...> from the beginning of @s,
    # place the rest up to </bla> into children_s:
    mainname = Regexp.escape(name)
    whole = /^\s*<\s*#{mainname}(\s+[\w-]+=(["']).*?\2)*\s*>(.*?)<\/\s*#{mainname}\s*>\s*$/.match(@s)
    raise "Cannot parse:\n #{xml_readable(@s)}" unless whole

    retv = []
    children_s = whole[3]

    # repeat until only whitespace is left
    until children_s =~ /^\s*$/

      # shave off the next bit of text, keep the rest
      text_m = /^\s*(.*?)(<.*$|$)/.match(children_s)
      unless text_m
        $stderr.puts "Whole was:\n #{xml_readable(@s)}"
        $stderr.puts
        raise "Cannot parse:\n #{xml_readable(children_s)}"
      end
      unless text_m[1].strip.empty?
        children_s = text_m[2]
        retv << RegXML.new(text_m[1], true)
      end

      # anything left after we've parsed text?
      break if children_s =~ /^\s*$/

      # shave off the next child, keep the rest:
      # determine the next child's name, and the string index at which
      # the element start tag ends with either /> or >
      start_m = /^\s*(<\s*([\w-]+)(\s+[\w-]+=(["']).*?\4)*\s*)/.match(children_s)
      unless start_m
        $stderr.puts "Whole was:\n #{xml_readable(@s)}"
        $stderr.puts
        raise "Cannot parse:\n #{xml_readable(children_s)}"
      end
      child = start_m[1]
      childname = start_m[2]
      rest = children_s[start_m[0].length..-1]

      if rest.start_with?("/>")
        # next child is a unary element
        children_s = rest[2..-1]
        retv << RegXML.new(child + "/>")
      else
        # next child is <bla ...> ... </bla>: look for the closing tag that
        # balances the start tag, so that a child containing elements of
        # the same name is not cut off at the first inner closing tag
        close_ix = rest.start_with?(">") ? closing_tag_end(rest, childname) : nil
        if close_ix.nil?
          $stderr.puts "Whole was:\n #{xml_readable(@s)}"
          $stderr.puts
          raise "Cannot parse:\n#{xml_readable(children_s)}"
        end
        children_s = rest[close_ix..-1]
        retv << RegXML.new(child + rest[0...close_ix])
      end
    end

    retv
  end

  # Small manual demonstration: parses two sample elements and prints
  # their name, string form, attributes and children to stdout.
  def self.test
    dump = lambda { |bla|
      puts "name " + bla.name
      puts
      puts bla.to_s
      puts
      bla.attributes.each { |attr, val|
        puts "attr " + attr + "=" + val
      }
      puts
      bla.children_and_text.each { |child_obj|
        if child_obj.text?
          puts "da text " + child_obj.to_s
        else
          puts "da child " + child_obj.to_s
        end
      }
      puts
    }

    dump.call(RegXML.new(" <bla blupp='a\"b'
  lalala=\"c\">
  <lalala> </lalala>
  texttext
  <lala blupp='b'/>
  nochtext
  <la> <l/> </la>
  </ bla >
  "))

    puts "NEU"
    dump.call(RegXML.new(" < bla blupp='a\"'/> "))
  end

  ##############
  protected

  # Returns true if @s consists of exactly one tag of the form <bla .../>.
  # A second '<' anywhere in the string rules this out, so that input such
  # as "<a/><b/>" or "<a><b/>" is not mistaken for a single unary element.
  def unary_element
    if @s =~ /^\s*<[^<]*\/>\s*$/
      true
    else
      false
    end
  end

  # Make sure we have a single XML element, either <bla/> or
  # <bla>...</bla>; raises RuntimeError otherwise.
  def element_test
    # <bla/>
    return if unary_element
    # <bla > ... </bla>
    return if @s =~ /^\s*<\s*([\w-]+)\W.*?<\/\s*\1\s*>\s*$/

    raise "Cannot parse:\n #{xml_readable(@s)}"
  end

  # Bracket balance check; raises RuntimeError if
  # - some prefix of @s has more > than <, or
  # - @s as a whole does not have equally many < and >.
  def dyck_test
    opening = 0
    closing = 0
    @s.scan(/[<>]/) { |bracket|
      if bracket == "<"
        opening += 1
      else
        closing += 1
        if closing > opening
          raise "More closing than opening brackets in prefix of:\n #{xml_readable(@s)}"
        end
      end
    }

    unless @s.count("<") == @s.count(">")
      raise "Inequal number of brackets in:\n #{xml_readable(@s)}"
    end
  end

  # Returns 'string' with a newline inserted after every '>'.
  def xml_readable(string)
    string.gsub(/>/, ">\n")
  end

  # 'rest' is the string that follows the start tag of an element called
  # 'tagname' (beginning with the '>' that ends that start tag).
  # Returns the index in 'rest' just behind the closing tag that balances
  # the start tag, or nil if there is none.
  def closing_tag_end(rest, tagname)
    # any tag of this name: group 1 is "/" for a closing tag,
    # group 2 is whatever stands between the name and the final '>'
    tag = /<\s*(\/?)\s*#{Regexp.escape(tagname)}(?=[\s\/>])([^<>]*)>/
    depth = 1
    pos = 0
    while (m = tag.match(rest, pos))
      pos = m.end(0)
      if m[1] == "/"
        # a closing tag carries nothing but whitespace after the name
        next unless m[2].strip.empty?
        depth -= 1
        return pos if depth == 0
      elsif m[2] !~ /\/\z/
        # nested start tag of the same name (unary <bla/> does not nest)
        depth += 1
      end
    end
    nil
  end
end
267
-
268
- # RegXML.test()
269
-
@@ -1,194 +0,0 @@
1
- #########
2
- # module StringTerminalsInRightOrder
3
- #
4
- # returns the yield of a node, or a list of nodes, as a string
5
- # of " "-separated words
6
- #
7
- # Words are put into the right order, left to right,
8
- # under the assumption that their node IDs reflect that order
9
- #
10
- # Terminal nodes are assumed to have IDs ending in a number,
11
- # numbered from left to right
12
- #
13
- # Splitword nodes are assumed to have IDs ending in N_sM
14
- # for numbers N and M, where N orders terminals left to right
15
- # and M orders the splitword parts left to right
16
- #
17
- # If the yield of the node/the list of nodes contains all splitwords of a terminal,
18
- # the whole terminal is taken instead
19
- #
20
- # methods:
21
- #
22
- # string_for_node returns the string for the yield of a node
23
- # node: a node object
24
- #
25
- # string_for_nodes returns the string for the yield of a list of nodes
26
- # nodes: a list of node objects
27
-
28
# Mixin that renders the yield of a node, or of a list of nodes, as a
# string of words in left-to-right order.
#
# Node objects are expected to respond to: yield_nodes, is_splitword?,
# is_terminal?, parent, each_child, id and word.
module StringTerminalsInRightOrder
  # Returns the string for the yield of a single node.
  # node: a node object
  def string_for_node(node)
    string_for_nodes([node])
  end

  # Returns the string for the yield of a list of nodes:
  # the words in left-to-right order, each one followed by a blank.
  # nodes: a list of node objects
  def string_for_nodes(nodes)
    terminals = right_level_terminals_for_nodes(nodes)
    ordered = sort_terminals_and_splitwords_left_to_right(terminals)
    node_array_to_string(ordered)
  end

  #####
  private

  # right_level_terminals_for_nodes:
  # - compute the yield for each element of 'nodes'
  # - then consider all splitwords in the yield:
  #   if all splitwords of a terminal are in the yield,
  #   then use the terminal rather than its splitwords
  # Nodes that are neither splitwords nor terminals are dropped.
  # Returns an array without duplicates.
  def right_level_terminals_for_nodes(nodes)
    in_yield = nodes.map { |n| n.yield_nodes }.flatten
    chosen = []

    in_yield.each { |n|
      if n.is_splitword?
        parent = n.parent
        if parent.nil?
          # splitword without a parent
          chosen << n
        elsif chosen.include?(parent) || in_yield.include?(parent)
          # the splitword's parent is already chosen, or is itself part of
          # the yield: nothing to add for this splitword
        elsif all_splitword_parts_present?(parent, in_yield)
          # all children of the parent are in the yield:
          # take the parent, the terminal, instead of the parts
          chosen << parent
        else
          # some sibling of n is not in the yield
          chosen << n
        end
      elsif n.is_terminal?
        chosen << n
      end
    }

    chosen.uniq
  end

  # Returns true if every child of 'parent' is contained in 'pool'.
  def all_splitword_parts_present?(parent, pool)
    parent.each_child { |sibling|
      return false unless pool.include?(sibling)
    }
    true
  end

  # sort_terminals_and_splitwords_left_to_right:
  # take an array of nodes that consists of terminals and splitwords
  # and sort them using the following comparison:
  # - when comparing two terminals, use the
  #   last numbers in their respective IDs
  # - when comparing two splitwords, their IDs end in _N_sM
  #   for numbers N and M.
  #   If they coincide in N, compare them by M,
  #   else compare them by N
  # - when comparing a terminal and a splitword,
  #   compare the terminal's last number to the splitword's N
  def sort_terminals_and_splitwords_left_to_right(nodes)
    nodes.sort { |a, b|
      if a.is_splitword? && b.is_splitword?
        compare_splitwords(a, b)
      elsif a.is_terminal? && b.is_terminal?
        compare_terminals(a, b)
      else
        compare_mixed(a, b)
      end
    }
  end

  # node_array_to_string:
  # 'nodes' is an array of node objects, each of which offers a "word" method.
  # String their words together, each word followed by " "
  # (so a non-empty result ends in a blank; an empty array yields "").
  def node_array_to_string(nodes)
    nodes.map { |n| n.word + " " }.join
  end

  # - when comparing two terminals, use the
  #   last numbers in their respective IDs
  def compare_terminals(a, b)
    last_i(a) <=> last_i(b)
  end

  # - when comparing two splitwords, their IDs end in _N_sM
  #   for numbers N and M.
  #   If they coincide in N, compare them by M,
  #   else compare them by N
  def compare_splitwords(a, b)
    terminal_a = splitword_terminal_i(a)
    terminal_b = splitword_terminal_i(b)

    if terminal_a == terminal_b
      # parts of the same terminal: compare the parts
      last_i(a) <=> last_i(b)
    else
      # parts of different terminals: compare the terminals
      terminal_a <=> terminal_b
    end
  end

  # - when comparing a terminal and a splitword,
  #   compare the terminal's last number to the splitword's N
  # Any other combination is reported on stderr and treated as "equal".
  def compare_mixed(a, b)
    if a.is_splitword? && b.is_terminal?
      splitword_terminal_i(a) <=> last_i(b)

    elsif a.is_terminal? && b.is_splitword?
      last_i(a) <=> splitword_terminal_i(b)

    else
      # not one terminal, one splitword?
      $stderr.print "SalsaTigerSentence, compare_mixed: confused by "
      $stderr.print a.id, ", ", b.id, "\n"
      # A sort block must return an integer; returning the nil of 'print'
      # here would make Array#sort raise ArgumentError after the warning.
      0
    end
  end

  # return last number of the ID of a node
  def last_i(n)
    # match final string of digits.
    # That this fails shouldn't happen _in principle_,
    # but we might get weird node IDs for splitwords;
    # so we act gracefully and catch the case where there
    # is one final letter behind the digits
    digits = n.id[/(\d+)$/, 1] || n.id[/(\d+)\w$/, 1]

    if digits.nil? # this shouldn't ever happen
      $stderr.print "SalsaTigerSentence, last_i: Couldn't extract digits from: "
      $stderr.print n.id, "\n"
      exit 1
    end
    digits.to_i # and return it as number
  end

  # assume the ID of the node includes N_sM
  # return N
  def splitword_terminal_i(n)
    # match string of digits before splitword ID
    digits = n.id[/(\d+)_s\d*/, 1]

    if digits.nil? # this shouldn't ever happen
      $stderr.print "SalsaTigerSentence, splitword_terminal_i: Couldn't extract digits from: "
      $stderr.print n.id, "\n"
      exit 1
    end
    digits.to_i # and return it as number
  end

end
193
-
194
-