frprep 0.0.1.prealpha

Sign up to get free protection for your applications and to get access to all the features.
Files changed (138) hide show
  1. data/.yardopts +8 -0
  2. data/CHANGELOG.rdoc +0 -0
  3. data/LICENSE.rdoc +0 -0
  4. data/README.rdoc +0 -0
  5. data/lib/common/AbstractSynInterface.rb +1227 -0
  6. data/lib/common/BerkeleyInterface.rb +375 -0
  7. data/lib/common/CollinsInterface.rb +1165 -0
  8. data/lib/common/ConfigData.rb +694 -0
  9. data/lib/common/Counter.rb +18 -0
  10. data/lib/common/DBInterface.rb +48 -0
  11. data/lib/common/EnduserMode.rb +27 -0
  12. data/lib/common/Eval.rb +480 -0
  13. data/lib/common/FixSynSemMapping.rb +196 -0
  14. data/lib/common/FrPrepConfigData.rb +66 -0
  15. data/lib/common/FrprepHelper.rb +1324 -0
  16. data/lib/common/Graph.rb +345 -0
  17. data/lib/common/ISO-8859-1.rb +24 -0
  18. data/lib/common/ML.rb +186 -0
  19. data/lib/common/Maxent.rb +215 -0
  20. data/lib/common/MiniparInterface.rb +1388 -0
  21. data/lib/common/Optimise.rb +195 -0
  22. data/lib/common/Parser.rb +213 -0
  23. data/lib/common/RegXML.rb +269 -0
  24. data/lib/common/RosyConventions.rb +171 -0
  25. data/lib/common/SQLQuery.rb +243 -0
  26. data/lib/common/STXmlTerminalOrder.rb +194 -0
  27. data/lib/common/SalsaTigerRegXML.rb +2347 -0
  28. data/lib/common/SalsaTigerXMLHelper.rb +99 -0
  29. data/lib/common/SleepyInterface.rb +384 -0
  30. data/lib/common/SynInterfaces.rb +275 -0
  31. data/lib/common/TabFormat.rb +720 -0
  32. data/lib/common/Tiger.rb +1448 -0
  33. data/lib/common/TntInterface.rb +44 -0
  34. data/lib/common/Tree.rb +61 -0
  35. data/lib/common/TreetaggerInterface.rb +303 -0
  36. data/lib/common/headz.rb +338 -0
  37. data/lib/common/option_parser.rb +13 -0
  38. data/lib/common/ruby_class_extensions.rb +310 -0
  39. data/lib/fred/Baseline.rb +150 -0
  40. data/lib/fred/FileZipped.rb +31 -0
  41. data/lib/fred/FredBOWContext.rb +863 -0
  42. data/lib/fred/FredConfigData.rb +182 -0
  43. data/lib/fred/FredConventions.rb +232 -0
  44. data/lib/fred/FredDetermineTargets.rb +324 -0
  45. data/lib/fred/FredEval.rb +312 -0
  46. data/lib/fred/FredFeatureExtractors.rb +321 -0
  47. data/lib/fred/FredFeatures.rb +1061 -0
  48. data/lib/fred/FredFeaturize.rb +596 -0
  49. data/lib/fred/FredNumTrainingSenses.rb +27 -0
  50. data/lib/fred/FredParameters.rb +402 -0
  51. data/lib/fred/FredSplit.rb +84 -0
  52. data/lib/fred/FredSplitPkg.rb +180 -0
  53. data/lib/fred/FredTest.rb +607 -0
  54. data/lib/fred/FredTrain.rb +144 -0
  55. data/lib/fred/PlotAndREval.rb +480 -0
  56. data/lib/fred/fred.rb +45 -0
  57. data/lib/fred/md5.rb +23 -0
  58. data/lib/fred/opt_parser.rb +250 -0
  59. data/lib/frprep/AbstractSynInterface.rb +1227 -0
  60. data/lib/frprep/Ampersand.rb +37 -0
  61. data/lib/frprep/BerkeleyInterface.rb +375 -0
  62. data/lib/frprep/CollinsInterface.rb +1165 -0
  63. data/lib/frprep/ConfigData.rb +694 -0
  64. data/lib/frprep/Counter.rb +18 -0
  65. data/lib/frprep/FNCorpusXML.rb +643 -0
  66. data/lib/frprep/FNDatabase.rb +144 -0
  67. data/lib/frprep/FixSynSemMapping.rb +196 -0
  68. data/lib/frprep/FrPrepConfigData.rb +66 -0
  69. data/lib/frprep/FrameXML.rb +513 -0
  70. data/lib/frprep/FrprepHelper.rb +1324 -0
  71. data/lib/frprep/Graph.rb +345 -0
  72. data/lib/frprep/ISO-8859-1.rb +24 -0
  73. data/lib/frprep/MiniparInterface.rb +1388 -0
  74. data/lib/frprep/Parser.rb +213 -0
  75. data/lib/frprep/RegXML.rb +269 -0
  76. data/lib/frprep/STXmlTerminalOrder.rb +194 -0
  77. data/lib/frprep/SalsaTigerRegXML.rb +2347 -0
  78. data/lib/frprep/SalsaTigerXMLHelper.rb +99 -0
  79. data/lib/frprep/SleepyInterface.rb +384 -0
  80. data/lib/frprep/SynInterfaces.rb +275 -0
  81. data/lib/frprep/TabFormat.rb +720 -0
  82. data/lib/frprep/Tiger.rb +1448 -0
  83. data/lib/frprep/TntInterface.rb +44 -0
  84. data/lib/frprep/Tree.rb +61 -0
  85. data/lib/frprep/TreetaggerInterface.rb +303 -0
  86. data/lib/frprep/do_parses.rb +142 -0
  87. data/lib/frprep/frprep.rb +686 -0
  88. data/lib/frprep/headz.rb +338 -0
  89. data/lib/frprep/one_parsed_file.rb +28 -0
  90. data/lib/frprep/opt_parser.rb +94 -0
  91. data/lib/frprep/ruby_class_extensions.rb +310 -0
  92. data/lib/rosy/AbstractFeatureAndExternal.rb +240 -0
  93. data/lib/rosy/DBMySQL.rb +146 -0
  94. data/lib/rosy/DBSQLite.rb +280 -0
  95. data/lib/rosy/DBTable.rb +239 -0
  96. data/lib/rosy/DBWrapper.rb +176 -0
  97. data/lib/rosy/ExternalConfigData.rb +58 -0
  98. data/lib/rosy/FailedParses.rb +130 -0
  99. data/lib/rosy/FeatureInfo.rb +242 -0
  100. data/lib/rosy/GfInduce.rb +1115 -0
  101. data/lib/rosy/GfInduceFeature.rb +148 -0
  102. data/lib/rosy/InputData.rb +294 -0
  103. data/lib/rosy/RosyConfigData.rb +115 -0
  104. data/lib/rosy/RosyConfusability.rb +338 -0
  105. data/lib/rosy/RosyEval.rb +465 -0
  106. data/lib/rosy/RosyFeatureExtractors.rb +1609 -0
  107. data/lib/rosy/RosyFeaturize.rb +280 -0
  108. data/lib/rosy/RosyInspect.rb +336 -0
  109. data/lib/rosy/RosyIterator.rb +477 -0
  110. data/lib/rosy/RosyPhase2FeatureExtractors.rb +230 -0
  111. data/lib/rosy/RosyPruning.rb +165 -0
  112. data/lib/rosy/RosyServices.rb +744 -0
  113. data/lib/rosy/RosySplit.rb +232 -0
  114. data/lib/rosy/RosyTask.rb +19 -0
  115. data/lib/rosy/RosyTest.rb +826 -0
  116. data/lib/rosy/RosyTrain.rb +232 -0
  117. data/lib/rosy/RosyTrainingTestTable.rb +786 -0
  118. data/lib/rosy/TargetsMostFrequentFrame.rb +60 -0
  119. data/lib/rosy/View.rb +418 -0
  120. data/lib/rosy/opt_parser.rb +379 -0
  121. data/lib/rosy/rosy.rb +77 -0
  122. data/lib/shalmaneser/version.rb +3 -0
  123. data/test/frprep/test_opt_parser.rb +94 -0
  124. data/test/functional/functional_test_helper.rb +40 -0
  125. data/test/functional/sample_experiment_files/fred_test.salsa.erb +122 -0
  126. data/test/functional/sample_experiment_files/fred_train.salsa.erb +135 -0
  127. data/test/functional/sample_experiment_files/prp_test.salsa.erb +138 -0
  128. data/test/functional/sample_experiment_files/prp_test.salsa.fred.standalone.erb +120 -0
  129. data/test/functional/sample_experiment_files/prp_test.salsa.rosy.standalone.erb +120 -0
  130. data/test/functional/sample_experiment_files/prp_train.salsa.erb +138 -0
  131. data/test/functional/sample_experiment_files/prp_train.salsa.fred.standalone.erb +138 -0
  132. data/test/functional/sample_experiment_files/prp_train.salsa.rosy.standalone.erb +138 -0
  133. data/test/functional/sample_experiment_files/rosy_test.salsa.erb +257 -0
  134. data/test/functional/sample_experiment_files/rosy_train.salsa.erb +259 -0
  135. data/test/functional/test_fred.rb +47 -0
  136. data/test/functional/test_frprep.rb +52 -0
  137. data/test/functional/test_rosy.rb +20 -0
  138. metadata +270 -0
@@ -0,0 +1,171 @@
1
+ # RosyConventions
2
+ # KE May 05
3
+ #
4
+ # Conventions to be used throughout the Rosy system
5
+ # for greater consistency
6
+
7
+ require "common/ruby_class_extensions"
8
+
9
+ require "common/EnduserMode"
10
+
11
+ #################################################################
12
+ #################################################################
13
+ ###
14
+ # value restriction (to pass on to a view):
15
+ # some column is restricted to be equal/unequal to some value
16
class ValueRestriction

  attr_reader :val_is_variable, :table_name_included

  ###
  # new(): remember the restriction
  #
  # column:   string: column name
  # value:    value this column is to be restricted to
  # var_hash: hash: additional settings. possible entries:
  #   "posneg": string: "=" or "!=": equality or inequality restriction
  #             (default: =)
  #   "table_name_included": boolean: is the table name already included
  #             in the column name? (default: false)
  #
  # raises a RuntimeError if "posneg" is given but is neither "=" nor "!="
  def initialize(column, value, var_hash = {})
    @column = column
    @value = value

    # comparison operator: fall back to equality when none was given
    @posneg = var_hash["posneg"]
    if @posneg.nil?
      @posneg = "="
    elsif not(["=", "!="].include?(@posneg))
      raise "posneg should be either '=' or '!='. I got: " + @posneg.to_s
    end

    # unless told otherwise, the column name carries no table name yet
    @table_name_included = var_hash["table_name_included"] || false

    # here the value is a plain value, not the name of another column
    @val_is_variable = false
  end

  ###
  # get(): returns the triple [column name(string), eq(string), value(object)]
  def get()
    [@column, @posneg, @value]
  end
end
60
+
61
+ ###
62
+ # value restrictions saying that variable1 = variable2:
63
+ # here, value is a variable name, and the table names
64
+ # must be already included
65
class VarVarRestriction < ValueRestriction
  # Same arguments as ValueRestriction#initialize; afterwards the value is
  # flagged as a variable (column) name and the table name as already included.
  def initialize(column, value, var_hash = {})
    # bare super forwards column, value and var_hash unchanged
    super
    @table_name_included = true
    @val_is_variable = true
  end
end
72
+
73
+ #################################################################
74
+ #################################################################
75
+ # Table and column names to pass on to a view / SQLQuery:
76
+ # which DB table to access, which columns to view?
77
+ #
78
+ # table_obj: DBTable object or DBWrapper object, table to access.
79
+ # The important thing is that the object must have a table_name attribute.
80
+ # columns: string|array:string, list of column names, or "*" for all columns
81
+
82
SelectTableAndColumns = Struct.new(
  "SelectTableAndColumns",
  :table_obj, # object offering a table_name attribute
  :columns    # string|array:string: column names, or "*" for all columns
)
83
+
84
+ #################################################################
85
+ #################################################################
86
+
87
+ ###
88
+ # transforming feature output to a format that classifiers can handle
89
def prepare_output_for_classifiers(string)
  # step 1: each of the characters . " : ' ; ` becomes _PUNCT_
  without_punctuation = string.gsub(/[.":';`]/, "_PUNCT_")
  # step 2: each whitespace character becomes _
  # (some classifiers cannot cope with the raw characters)
  without_punctuation.gsub(/\s/, "_")
end
95
+
96
+ #################################################################
97
+ #################################################################
98
+
99
+ ###
100
+ # classifier directory:
101
+ # either user-given classifier_dir or our own default classifier directory,
102
+ # then argrec/arglab/onestep, plus the splitID, if there is one
103
def classifier_directory_name(exp,     # RosyConfigData object
                              step,    # argrec, arglab, onestep
                              splitID) # string or nil

  # base directory: the user-given classifier_dir wins,
  # otherwise the rosy_dir pattern instantiated for this experiment
  base_dir =
    if exp.get("classifier_dir")
      File.new_dir(exp.get("classifier_dir"))
    else
      File.new_dir(exp.instantiate("rosy_dir",
                                   "exp_ID" => exp.get("experiment_ID")))
    end

  classif_root = File.new_dir(base_dir, "classif_dir")

  # last path component: the step, plus ".<splitID>" if there is a split
  leaf = splitID ? step + "." + splitID.to_s : step

  File.new_dir(classif_root, leaf)
end
121
+
122
+ #################################################################
123
+ #################################################################
124
+
125
+ ###
126
+ # instance ID: sentence ID plus frame ID
127
def construct_instance_id(sentence_id, frame_id)
  # both parts are converted to strings and glued together with "---"
  separator = "---"
  sentence_id.to_s + separator + frame_id.to_s
end
130
+
131
def deconstruct_instance_id(instance_id)
  # split the instance ID at every "---" separator; returns an array of strings
  separator = "---"
  instance_id.split(separator)
end
134
+
135
+ #################################################################
136
+ #################################################################
137
+
138
+ # default test ID given when the user didn't specify one
139
def default_test_ID
  # fixed ID, used whenever no test ID was specified by the user
  "apply"
end
142
+
143
+
144
+ #################################################################
145
+ #################################################################
146
+
147
+ ###
148
+ # extend Array class by subsumption
149
module Subsumed
  ###
  # subsumed_by?(array2): multiset inclusion.
  #
  # Returns true if every element of self can be matched (via ==) to a
  # distinct element of array2, i.e. an element occurring k times in self
  # must occur at least k times in array2. Returns false otherwise.
  # The empty array is subsumed by everything.
  #
  # array2 itself is never modified.
  def subsumed_by?(array2)
    # Work on a copy, because matched elements get removed.
    # dup rather than clone: clone would also copy a frozen state,
    # and delete_at below would then raise on a frozen array2.
    remaining = array2.dup

    self.each { |el|
      # first not-yet-used element of array2 that el is equal to
      ix = remaining.index { |candidate| el == candidate }
      if ix.nil?
        return false
      end
      # use up the match so it cannot be counted twice
      remaining.delete_at(ix)
    }
    return true
  end
end
168
+
169
# mix Subsumed into Array
Array.send(:include, Subsumed)
@@ -0,0 +1,243 @@
1
+ # class SQLQuery
2
+ # KE, SP 27.1.05
3
+ #
4
+ # provides static methods that generate SQL queries as strings
5
+ # that can then be passed on to the database
6
+
7
+ require "common/ruby_class_extensions"
8
+
9
+ require "common/RosyConventions"
10
+
11
class SQLQuery

  #####
  # SQLQuery.insert
  #
  # query created: insert a new row into a given database table.
  # The new row is given as a list of pairs [column_name, value].
  #
  # example:
  #   INSERT INTO table01(field01,field02) VALUES (2,'second');
  #
  # returns: string
  # raises: RuntimeError if a cell value is nil
  def SQLQuery.insert(table_name,        # string: table name
                      field_value_pairs) # array: string*object [column_name, cell_value]

    column_names = field_value_pairs.map { |column_name, _cell_value|
      column_name
    }

    cell_values = field_value_pairs.map { |column_name, cell_value|
      if cell_value.nil?
        # to_s: the column name may not be a String
        raise "SQL query construction error: Nil value for column " + column_name.to_s
      end
      SQLQuery.stringify_value(cell_value)
    }

    return "INSERT INTO " + table_name + "(" + column_names.join(",") +
           ") VALUES (" + cell_values.join(",") + ");"
  end

  #####
  # SQLQuery.select
  #
  # query created: select from given database tables
  # all column entries that conform to the given description:
  # - names of the columns to be selected (or the string "*")
  # - only those column entries where the row matches the given
  #   row restrictions: [column_name, column_value] => WHERE column_name IS column_value
  # - optionally, at most N lines => LIMIT N
  # - If more than one DB table is named, make a join
  # - Value restrictions: If it doesn't say which DB table to use,
  #   use the first one listed in table_col_pairs
  #
  # Use with only one database table creates queries like e.g.
  #   SELECT column1, column2 FROM table WHERE column3=val3 AND column4!=val4
  #
  # or:
  #   SELECT DISTINCT column1, column2 FROM table WHERE column3=val3 AND column4!=val4 LIMIT 10
  #
  # Use with 2 SelectTableAndColumns entries creates queries like
  #   SELECT table1.column1, table1.column2 FROM table1, table2 WHERE table1.column1=val3 AND table1.id=table2.id
  #
  # returns: string.
  # raises: RuntimeError if no tables are given, if no columns at all are
  #         selected, if a column name is nil/empty, or if a restriction
  #         value is nil
  def SQLQuery.select(table_col_pairs,  # Array: SelectTableAndColumns
                      row_restrictions, # array: ValueRestriction objects (or nil)
                      var_hash = {})    # further parameters:
    # line_limit: integer: select at most N lines. if nil, all lines are chosen
    # distinct: boolean: return each tuple only once. if nil or false, duplicates are kept

    if table_col_pairs.empty?
      raise "Zero tables to select from"
    end

    ## SELECT
    string = "SELECT "

    if var_hash["distinct"]
      # unique return values?
      string += "DISTINCT "
    end

    ## column names to select: iterate through table/col pairs.
    ## Tables that contribute no columns yield nil and are dropped.
    selected_columns = table_col_pairs.map { |tc|
      if tc.columns == "*"
        # all columns from this table
        SQLQuery.prepend_tablename(tc.table_obj.table_name, "*")

      elsif tc.columns.is_a?(Array) and not(tc.columns.empty?)
        # at least one column from this table
        tc.columns.map { |c|
          if c.nil? or c.empty?
            raise "Got nil/empty value within the column name list"
          end

          SQLQuery.prepend_tablename(tc.table_obj.table_name, c)
        }.join(", ")

      else
        # no columns from this table
        nil
      end
    }.compact

    if selected_columns.empty?
      raise "Empty select: zero columns selected"
    end
    string += selected_columns.join(", ")

    ## FROM table name(s)
    string += " FROM " + table_col_pairs.map { |tc| tc.table_obj.table_name }.join(", ")

    ## WHERE row_restrictions
    unless row_restrictions.nil? or row_restrictions.empty?
      # column names without a table name get the first table's name prepended
      first_table_name = table_col_pairs.first.table_obj.table_name()
      string += " WHERE " + row_restrictions.map { |restr_obj|
        restriction_to_sql(restr_obj, first_table_name)
      }.join(" AND ")
    end

    ## LIMIT at_most_that_many_lines
    if var_hash["line_limit"]
      string += " LIMIT " + var_hash["line_limit"].to_s
    end
    string += ";"

    return string
  end

  #####
  # SQLQuery.update
  #
  # query created: overwrite several cells in possibly multiple rows of a
  # database table with new values.
  # Rows are selected via row restrictions. If row_restrictions is nil or
  # empty, no WHERE clause is generated (as in SQLQuery.select), so the
  # query addresses every row of the table.
  #
  # example:
  #   UPDATE table01 SET field04=19991022, field05=062218 WHERE field01=1;
  #
  # returns: string
  # raises: RuntimeError if a new cell value or a restriction value is nil
  def SQLQuery.update(table_name,        # string: table name
                      field_value_pairs, # array: string*Object: column name and value
                      row_restrictions   # array: ValueRestriction objects: column name and value restriction
                     )
    assignments = field_value_pairs.map { |field, value|
      if value.nil?
        raise "SQL query construction error: Nil value for column " + field.to_s
      end
      field.to_s + "=" + SQLQuery.stringify_value(value)
    }

    string = "UPDATE " + table_name + " SET " + assignments.join(", ")

    unless row_restrictions.nil? or row_restrictions.empty?
      # column names are used as given: there is only one table here
      string += " WHERE " + row_restrictions.map { |restr_obj|
        restriction_to_sql(restr_obj)
      }.join(" AND ")
    end

    string += ";"
    return string
  end

  #####
  # SQLQuery.add_columns
  #
  # query created: extend given table by
  # one or more columns given by their names and formats
  #
  # returns: string
  def SQLQuery.add_columns(table_name,     # string: table name
                           column_formats) # array: array: string*string [column_name,column_format]

    string = "ALTER TABLE " + table_name
    string += column_formats.map { |column_name, column_format|
      " ADD COLUMN " + column_name + " " + column_format
    }.join(", ")

    string += ";"

    return string
  end

  #####
  # SQLQuery.stringify_value makes a value usable inside a query:
  # strings (including instances of String subclasses) are put in single
  # quotes, with the characters " ' ` replaced by the placeholders
  # QQUOT0, QQUOT1, QQUOT2; everything else is converted with to_s.
  #
  # returns: string
  def SQLQuery.stringify_value(value) # object
    if value.is_a?(String)
      return "'" + value.gsub(/"/, "QQUOT0").gsub(/'/, "QQUOT1").gsub(/`/, "QQUOT2") + "'"
    else
      return value.to_s
    end
  end

  #####
  # SQLQuery.unstringify_value turns the quote placeholders written by
  # stringify_value back into the characters " ' `
  # (it does not remove the surrounding single quotes).
  # please apply only to strings
  def SQLQuery.unstringify_value(value) # string
    value.gsub(/QQUOT0/, '"').gsub(/QQUOT1/, "'").gsub(/QQUOT2/, "`")
  end

  ####
  # SQLQuery.prepend_tablename
  #
  # auxiliary method for select:
  # prepend the table name to the column name,
  # unless the column name already includes a table name
  # (i.e. already contains a ".")
  def SQLQuery.prepend_tablename(table_name,
                                 column_name)
    if not(column_name.include?("."))
      return table_name + "." + column_name
    else
      return column_name
    end
  end

  ####
  # SQLQuery.restriction_to_sql
  #
  # auxiliary method for select and update:
  # render one restriction object as "name<eqsymb>value".
  # - restr_obj must offer get() (returning [name, eqsymb, value]),
  #   val_is_variable and table_name_included
  # - the value is passed through stringify_value unless the restriction
  #   says it is a variable (column) name
  # - if default_table_name is given and the restriction does not say that
  #   its column name already includes a table name, default_table_name is
  #   prepended to the column name
  #
  # returns: string
  # raises: RuntimeError if the restriction value is nil
  def SQLQuery.restriction_to_sql(restr_obj,
                                  default_table_name = nil)
    # form: name(string) eqsymb(string: =, !=) value(object)
    name, eqsymb, value = restr_obj.get()
    if value.nil?
      raise "SQL query construction error: Nil value for column " + name.to_s
    end

    if restr_obj.val_is_variable
      # value is a variable name: use it verbatim
      value = value.to_s
    else
      # value is a value, not a variable name
      value = SQLQuery.stringify_value(value)
    end

    unless default_table_name.nil? or restr_obj.table_name_included
      name = SQLQuery.prepend_tablename(default_table_name, name)
    end

    return name + eqsymb + value
  end
  private_class_method :restriction_to_sql
end
@@ -0,0 +1,194 @@
1
+ #########
2
+ # module StringTerminalsInRightOrder
3
+ #
4
+ # returns the yield of a node, or a list of nodes, as a string
5
+ # of " "-separated words
6
+ #
7
+ # Words are put into the right order, left to right,
8
+ # under the assumption that their node IDs reflect that order
9
+ #
10
+ # Terminal nodes are assumed to have IDs ending in a number,
11
+ # numbered from left to right
12
+ #
13
+ # Splitword nodes are assumed to have IDs ending in N_sM
14
+ # for numbers N and M, where N orders terminals left to right
15
+ # and M orders the splitword parts left to right
16
+ #
17
+ # If the yield of the node/the list of nodes contains all splitwords of a terminal,
18
+ # the whole terminal is taken instead
19
+ #
20
+ # methods:
21
+ #
22
+ # string_for_node returns the string for the yield of a node
23
+ # node: a node object
24
+ #
25
+ # string_for_nodes returns the string for the yield of a list of nodes
26
+ # nodes: a list of node objects
27
+
28
module StringTerminalsInRightOrder
  # string_for_node:
  # returns the string for the yield of a single node
  def string_for_node(node)
    string_for_nodes([node])
  end

  # string_for_nodes:
  # returns the string for the yield of a list of nodes:
  # collect the terminals/splitwords, sort them left to right,
  # and string their words together (each word followed by " ")
  def string_for_nodes(nodes)
    a = right_level_terminals_for_nodes(nodes)
    a = sort_terminals_and_splitwords_left_to_right(a)
    return node_array_to_string(a)
  end

  #####
  private

  # right_level_terminals_for_nodes:
  # - compute the yield for each element of 'nodes'
  # - then consider all splitwords in the yield:
  #   if all splitwords of a terminal are in the yield,
  #   then use the terminal rather than its splitwords
  # returns: array of nodes without duplicates
  def right_level_terminals_for_nodes(nodes)
    a = nodes.map { |n| n.yield_nodes() }.flatten
    b = Array.new
    a.each { |n|
      if n.is_splitword?
        # see if a contains all parts of this splitword
        # if so, take into b the splitword's parent, the terminal,
        # rather than the individual splitwords

        if n.parent.nil?
          # splitword without a parent
          b << n
        elsif b.include? n.parent or a.include? n.parent
          # the splitword's parent is already in b,
          # or is itself part of the yield: nothing to add
        else

          # check if all children of n.parent are in 'a'
          all_in = true
          n.parent.each_child { |nsibling|
            unless a.include? nsibling
              all_in = false
              break
            end
          }

          if all_in
            # yes, all children of n.parent are in 'a'
            b << n.parent
          else
            # no, some sibling of n is not in 'a'
            b << n
          end
        end
      elsif n.is_terminal?
        # n is a terminal
        b << n
        # if n is anything but a splitword or a terminal,
        # ignore it
      end
    }
    return b.uniq
  end

  # sort_terminals_and_splitwords_left_to_right:
  # take an array of nodes that consists of terminals and splitwords
  # and sort them using the following comparison:
  # - when comparing two terminals, use the
  #   last numbers in their respective IDs
  # - when comparing two splitwords, their IDs end in _N_sM
  #   for numbers N and M.
  #   If they coincide in N, compare them by M,
  #   else compare them by N
  # - when comparing a terminal and a splitword,
  #   compare the terminal's last number to the splitword's N
  def sort_terminals_and_splitwords_left_to_right(nodes)
    nodes.sort { |a, b|
      if a.is_splitword? and b.is_splitword?
        compare_splitwords(a, b)
      elsif a.is_terminal? and b.is_terminal?
        compare_terminals(a, b)
      else
        compare_mixed(a, b)
      end
    }
  end

  # node_array_to_string:
  # 'nodes' is an array of node objects, each of which offers a "word" method.
  # String their words together, each word followed by " "
  # (so a non-empty result ends in a space)
  def node_array_to_string(nodes)
    nodes.map { |n| n.word + " " }.join
  end

  # - when comparing two terminals, use the
  #   last numbers in their respective IDs
  def compare_terminals(a, b)
    last_i(a) <=> last_i(b)
  end

  # - when comparing two splitwords, their IDs end in _N_sM
  #   for numbers N and M.
  #   If they coincide in N, compare them by M,
  #   else compare them by N
  def compare_splitwords(a, b)
    if splitword_terminal_i(a) == splitword_terminal_i(b)
      # parts of same terminal?
      # compare parts
      last_i(a) <=> last_i(b)
    else
      # not parts of same terminal?
      # compare terminals
      splitword_terminal_i(a) <=> splitword_terminal_i(b)
    end
  end

  # - when comparing a terminal and a splitword,
  #   compare the terminal's last number to the splitword's N
  # - for any other combination, warn on stderr and
  #   treat the two nodes as equal
  def compare_mixed(a, b)
    if a.is_splitword? and b.is_terminal?
      splitword_terminal_i(a) <=> last_i(b)

    elsif a.is_terminal? and b.is_splitword?
      last_i(a) <=> splitword_terminal_i(b)
    else
      # not one terminal, one splitword?
      # then what?
      $stderr.print "SalsaTigerSentence, compare_mixed: confused by "
      $stderr.print a.id, ", ", b.id, "\n"
      # the sort block must return a number: returning print's nil
      # would make Array#sort raise. Leave the relative order undecided.
      0
    end
  end

  # return last number of the ID of a node
  # (prints a message and exits with status 1 if the ID contains no such number)
  def last_i(n)
    n.id =~ /(\d+)$/ # match final string of digits
    if $1.nil? # this shouldn't happen _in principle_
      # but we might get weird node IDs for splitwords;
      # so we act gracefully and catch the case where there
      # is one final letter behind the digits
      n.id =~ /(\d+)\w$/
    end
    if $1.nil? # this shouldn't ever happen
      $stderr.print "SalsaTigerSentence, last_i: Couldn't extract digits from: "
      $stderr.print n.id, "\n"
      exit 1
    end
    return $1.to_i # and return it as number
  end

  # assume the ID of the node includes N_sM
  # return N
  # (prints a message and exits with status 1 if the ID does not have that form)
  def splitword_terminal_i(n)
    n.id =~ /(\d+)_s\d*/ # match string of digits before splitword ID
    if $1.nil? # this shouldn't ever happen
      $stderr.print "SalsaTigerSentence, splitword_terminal_i: Couldn't extract digits from: "
      $stderr.print n.id, "\n"
      exit 1
    end
    return $1.to_i # and return it as number
  end

end
193
+
194
+