frprep 0.0.1.prealpha

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (138) hide show
  1. data/.yardopts +8 -0
  2. data/CHANGELOG.rdoc +0 -0
  3. data/LICENSE.rdoc +0 -0
  4. data/README.rdoc +0 -0
  5. data/lib/common/AbstractSynInterface.rb +1227 -0
  6. data/lib/common/BerkeleyInterface.rb +375 -0
  7. data/lib/common/CollinsInterface.rb +1165 -0
  8. data/lib/common/ConfigData.rb +694 -0
  9. data/lib/common/Counter.rb +18 -0
  10. data/lib/common/DBInterface.rb +48 -0
  11. data/lib/common/EnduserMode.rb +27 -0
  12. data/lib/common/Eval.rb +480 -0
  13. data/lib/common/FixSynSemMapping.rb +196 -0
  14. data/lib/common/FrPrepConfigData.rb +66 -0
  15. data/lib/common/FrprepHelper.rb +1324 -0
  16. data/lib/common/Graph.rb +345 -0
  17. data/lib/common/ISO-8859-1.rb +24 -0
  18. data/lib/common/ML.rb +186 -0
  19. data/lib/common/Maxent.rb +215 -0
  20. data/lib/common/MiniparInterface.rb +1388 -0
  21. data/lib/common/Optimise.rb +195 -0
  22. data/lib/common/Parser.rb +213 -0
  23. data/lib/common/RegXML.rb +269 -0
  24. data/lib/common/RosyConventions.rb +171 -0
  25. data/lib/common/SQLQuery.rb +243 -0
  26. data/lib/common/STXmlTerminalOrder.rb +194 -0
  27. data/lib/common/SalsaTigerRegXML.rb +2347 -0
  28. data/lib/common/SalsaTigerXMLHelper.rb +99 -0
  29. data/lib/common/SleepyInterface.rb +384 -0
  30. data/lib/common/SynInterfaces.rb +275 -0
  31. data/lib/common/TabFormat.rb +720 -0
  32. data/lib/common/Tiger.rb +1448 -0
  33. data/lib/common/TntInterface.rb +44 -0
  34. data/lib/common/Tree.rb +61 -0
  35. data/lib/common/TreetaggerInterface.rb +303 -0
  36. data/lib/common/headz.rb +338 -0
  37. data/lib/common/option_parser.rb +13 -0
  38. data/lib/common/ruby_class_extensions.rb +310 -0
  39. data/lib/fred/Baseline.rb +150 -0
  40. data/lib/fred/FileZipped.rb +31 -0
  41. data/lib/fred/FredBOWContext.rb +863 -0
  42. data/lib/fred/FredConfigData.rb +182 -0
  43. data/lib/fred/FredConventions.rb +232 -0
  44. data/lib/fred/FredDetermineTargets.rb +324 -0
  45. data/lib/fred/FredEval.rb +312 -0
  46. data/lib/fred/FredFeatureExtractors.rb +321 -0
  47. data/lib/fred/FredFeatures.rb +1061 -0
  48. data/lib/fred/FredFeaturize.rb +596 -0
  49. data/lib/fred/FredNumTrainingSenses.rb +27 -0
  50. data/lib/fred/FredParameters.rb +402 -0
  51. data/lib/fred/FredSplit.rb +84 -0
  52. data/lib/fred/FredSplitPkg.rb +180 -0
  53. data/lib/fred/FredTest.rb +607 -0
  54. data/lib/fred/FredTrain.rb +144 -0
  55. data/lib/fred/PlotAndREval.rb +480 -0
  56. data/lib/fred/fred.rb +45 -0
  57. data/lib/fred/md5.rb +23 -0
  58. data/lib/fred/opt_parser.rb +250 -0
  59. data/lib/frprep/AbstractSynInterface.rb +1227 -0
  60. data/lib/frprep/Ampersand.rb +37 -0
  61. data/lib/frprep/BerkeleyInterface.rb +375 -0
  62. data/lib/frprep/CollinsInterface.rb +1165 -0
  63. data/lib/frprep/ConfigData.rb +694 -0
  64. data/lib/frprep/Counter.rb +18 -0
  65. data/lib/frprep/FNCorpusXML.rb +643 -0
  66. data/lib/frprep/FNDatabase.rb +144 -0
  67. data/lib/frprep/FixSynSemMapping.rb +196 -0
  68. data/lib/frprep/FrPrepConfigData.rb +66 -0
  69. data/lib/frprep/FrameXML.rb +513 -0
  70. data/lib/frprep/FrprepHelper.rb +1324 -0
  71. data/lib/frprep/Graph.rb +345 -0
  72. data/lib/frprep/ISO-8859-1.rb +24 -0
  73. data/lib/frprep/MiniparInterface.rb +1388 -0
  74. data/lib/frprep/Parser.rb +213 -0
  75. data/lib/frprep/RegXML.rb +269 -0
  76. data/lib/frprep/STXmlTerminalOrder.rb +194 -0
  77. data/lib/frprep/SalsaTigerRegXML.rb +2347 -0
  78. data/lib/frprep/SalsaTigerXMLHelper.rb +99 -0
  79. data/lib/frprep/SleepyInterface.rb +384 -0
  80. data/lib/frprep/SynInterfaces.rb +275 -0
  81. data/lib/frprep/TabFormat.rb +720 -0
  82. data/lib/frprep/Tiger.rb +1448 -0
  83. data/lib/frprep/TntInterface.rb +44 -0
  84. data/lib/frprep/Tree.rb +61 -0
  85. data/lib/frprep/TreetaggerInterface.rb +303 -0
  86. data/lib/frprep/do_parses.rb +142 -0
  87. data/lib/frprep/frprep.rb +686 -0
  88. data/lib/frprep/headz.rb +338 -0
  89. data/lib/frprep/one_parsed_file.rb +28 -0
  90. data/lib/frprep/opt_parser.rb +94 -0
  91. data/lib/frprep/ruby_class_extensions.rb +310 -0
  92. data/lib/rosy/AbstractFeatureAndExternal.rb +240 -0
  93. data/lib/rosy/DBMySQL.rb +146 -0
  94. data/lib/rosy/DBSQLite.rb +280 -0
  95. data/lib/rosy/DBTable.rb +239 -0
  96. data/lib/rosy/DBWrapper.rb +176 -0
  97. data/lib/rosy/ExternalConfigData.rb +58 -0
  98. data/lib/rosy/FailedParses.rb +130 -0
  99. data/lib/rosy/FeatureInfo.rb +242 -0
  100. data/lib/rosy/GfInduce.rb +1115 -0
  101. data/lib/rosy/GfInduceFeature.rb +148 -0
  102. data/lib/rosy/InputData.rb +294 -0
  103. data/lib/rosy/RosyConfigData.rb +115 -0
  104. data/lib/rosy/RosyConfusability.rb +338 -0
  105. data/lib/rosy/RosyEval.rb +465 -0
  106. data/lib/rosy/RosyFeatureExtractors.rb +1609 -0
  107. data/lib/rosy/RosyFeaturize.rb +280 -0
  108. data/lib/rosy/RosyInspect.rb +336 -0
  109. data/lib/rosy/RosyIterator.rb +477 -0
  110. data/lib/rosy/RosyPhase2FeatureExtractors.rb +230 -0
  111. data/lib/rosy/RosyPruning.rb +165 -0
  112. data/lib/rosy/RosyServices.rb +744 -0
  113. data/lib/rosy/RosySplit.rb +232 -0
  114. data/lib/rosy/RosyTask.rb +19 -0
  115. data/lib/rosy/RosyTest.rb +826 -0
  116. data/lib/rosy/RosyTrain.rb +232 -0
  117. data/lib/rosy/RosyTrainingTestTable.rb +786 -0
  118. data/lib/rosy/TargetsMostFrequentFrame.rb +60 -0
  119. data/lib/rosy/View.rb +418 -0
  120. data/lib/rosy/opt_parser.rb +379 -0
  121. data/lib/rosy/rosy.rb +77 -0
  122. data/lib/shalmaneser/version.rb +3 -0
  123. data/test/frprep/test_opt_parser.rb +94 -0
  124. data/test/functional/functional_test_helper.rb +40 -0
  125. data/test/functional/sample_experiment_files/fred_test.salsa.erb +122 -0
  126. data/test/functional/sample_experiment_files/fred_train.salsa.erb +135 -0
  127. data/test/functional/sample_experiment_files/prp_test.salsa.erb +138 -0
  128. data/test/functional/sample_experiment_files/prp_test.salsa.fred.standalone.erb +120 -0
  129. data/test/functional/sample_experiment_files/prp_test.salsa.rosy.standalone.erb +120 -0
  130. data/test/functional/sample_experiment_files/prp_train.salsa.erb +138 -0
  131. data/test/functional/sample_experiment_files/prp_train.salsa.fred.standalone.erb +138 -0
  132. data/test/functional/sample_experiment_files/prp_train.salsa.rosy.standalone.erb +138 -0
  133. data/test/functional/sample_experiment_files/rosy_test.salsa.erb +257 -0
  134. data/test/functional/sample_experiment_files/rosy_train.salsa.erb +259 -0
  135. data/test/functional/test_fred.rb +47 -0
  136. data/test/functional/test_frprep.rb +52 -0
  137. data/test/functional/test_rosy.rb +20 -0
  138. metadata +270 -0
@@ -0,0 +1,171 @@
1
+ # RosyConventions
2
+ # KE May 05
3
+ #
4
+ # Conventions to be used throughout the Rosy system
5
+ # for greater consistency
6
+
7
+ require "common/ruby_class_extensions"
8
+
9
+ require "common/EnduserMode"
10
+
11
+ #################################################################
12
+ #################################################################
13
+ ###
14
+ # value restriction (to pass on to a view):
15
+ # some column is restricted to be equal/inequal to some value
16
class ValueRestriction
  # Restriction of a single column to be equal / not equal to a value;
  # handed to views and to SQLQuery when building WHERE clauses.

  attr_reader :val_is_variable, :table_name_included

  ###
  # new(): remember column, value and optional settings
  #
  # column:   string, column name
  # value:    value this column is to be restricted to
  # var_hash: hash of additional settings. Possible entries:
  #   "posneg": string, "=" or "!=": equality or inequality restriction
  #             (default: "=")
  #   "table_name_included": boolean, is the table name already part
  #             of the column name? (default: false)
  #
  # raises a RuntimeError if "posneg" is given but is neither "=" nor "!="
  def initialize(column, value, var_hash = {})
    @column = column
    @value = value

    # comparison operator: equality unless the caller says otherwise
    operator = var_hash["posneg"]
    if operator.nil?
      operator = "="
    elsif not(["=", "!="].include?(operator))
      raise "posneg should be either '=' or '!='. I got: " + operator.to_s
    end
    @posneg = operator

    # unless told otherwise, the column name carries no table name yet
    name_has_table = var_hash["table_name_included"]
    @table_name_included = name_has_table.nil? ? false : name_has_table

    # the value is a literal value, not the name of another column
    @val_is_variable = false
  end

  ###
  # get(): the restriction as a triple
  # [column name (string), comparison operator (string), value (object)]
  def get()
    [@column, @posneg, @value]
  end
end
60
+
61
+ ###
62
+ # value restrictions saying that variable1 = variable2:
63
+ # here, value is a variable name, and the table names
64
+ # must be already included
65
class VarVarRestriction < ValueRestriction
  # Restriction of the form variable1 = variable2: 'value' names another
  # column, and both names are taken to include their table names already.
  def initialize(column, value, var_hash = {})
    # bare super forwards column, value and var_hash unchanged
    super
    @table_name_included = true
    @val_is_variable = true
  end
end
72
+
73
+ #################################################################
74
+ #################################################################
75
+ # Table and column names to pass on to a view / SQLQuery:
76
+ # which DB table to access, which columns to view?
77
+ #
78
+ # table_obj: DBTable object or DBWrapper object, table to access.
79
+ # The important thing is that the object must have a table_name attribute.
80
+ # columns: string|array:string, list of column names, or "*" for all columns
81
+
82
+ SelectTableAndColumns = Struct.new("SelectTableAndColumns", :table_obj, :columns)
83
+
84
+ #################################################################
85
+ #################################################################
86
+
87
+ ###
88
+ # transforming feature output to a format that classifiers can handle
89
def prepare_output_for_classifiers(string)
  # Make a feature value safe for classifier input:
  # first every punctuation character from the set . " : ' ; `
  # becomes _PUNCT_, then every whitespace character becomes _
  without_punctuation = string.gsub(/[.":';`]/, "_PUNCT_")
  without_punctuation.gsub(/\s/, "_")
end
95
+
96
+ #################################################################
97
+ #################################################################
98
+
99
+ ###
100
+ # classifier directory:
101
+ # either user-given classifier_dir or our own default classifier directory,
102
+ # then argrec/arglab/onestep, plus the splitID, if there is one
103
def classifier_directory_name(exp, step, splitID)
  # exp:     RosyConfigData object
  # step:    string: argrec, arglab, onestep
  # splitID: string or nil
  #
  # Directory layout: <base>/classif_dir/<step>[.<splitID>]
  # where <base> is the user-given "classifier_dir" if there is one,
  # and otherwise the experiment's instantiated "rosy_dir" pattern.
  # All path construction is delegated to File.new_dir.
  user_dir = exp.get("classifier_dir")
  base_dir =
    if user_dir
      File.new_dir(user_dir)
    else
      File.new_dir(exp.instantiate("rosy_dir",
                                   "exp_ID" => exp.get("experiment_ID")))
    end

  classif_base_dir = File.new_dir(base_dir, "classif_dir")

  # the split ID, if any, is appended to the step name
  leaf = splitID ? step + "." + splitID.to_s : step
  File.new_dir(classif_base_dir, leaf)
end
121
+
122
+ #################################################################
123
+ #################################################################
124
+
125
+ ###
126
+ # instance ID: sentence ID plus frame ID
127
def construct_instance_id(sentence_id, frame_id)
  # sentence ID and frame ID, both as strings, glued together by "---"
  "#{sentence_id}---#{frame_id}"
end
130
+
131
def deconstruct_instance_id(instance_id)
  # split an instance ID at every "---" and return the parts as an array
  separator = "---"
  instance_id.split(separator)
end
134
+
135
+ #################################################################
136
+ #################################################################
137
+
138
+ # default test ID given when the user didn't specify one
139
def default_test_ID
  # test ID used whenever the user has not specified one
  "apply"
end
142
+
143
+
144
+ #################################################################
145
+ #################################################################
146
+
147
+ ###
148
+ # extend Array class by subsumption
149
module Subsumed
  # Multiset subsumption: true iff every element of self can be paired
  # with its own, distinct, equal element of array2. An element occurring
  # twice in self must therefore occur at least twice in array2.
  # array2 itself is left untouched.
  def subsumed_by?(array2)
    # working copy from which matched elements are removed
    unmatched = array2.clone

    each do |el|
      position = unmatched.index { |candidate| el == candidate }
      # no partner left for this element
      return false if position.nil?

      # each element of array2 may serve as a partner only once
      unmatched.delete_at(position)
    end

    true
  end
end
168
+
169
# give every Array the subsumed_by? test from module Subsumed
Array.send(:include, Subsumed)
@@ -0,0 +1,243 @@
1
+ # class SQLQuery
2
+ # KE, SP 27.1.05
3
+ #
4
+ # provides static methods that generate SQL queries as strings
5
+ # that can then be passed on to the database
6
+
7
+ require "common/ruby_class_extensions"
8
+
9
+ require "common/RosyConventions"
10
+
11
class SQLQuery
  # Static methods that build SQL statements as strings.
  # Nothing here talks to a database; the strings are meant to be handed
  # on to one by the caller.

  #####
  # SQLQuery.insert
  #
  # query created: insert a new row into a given database table.
  # The new row is given as a list of pairs [column_name, value].
  #
  # table_name:        string, table name
  # field_value_pairs: array of [column_name, cell_value]
  #
  # example:
  #   INSERT INTO table01(field01,field02) VALUES (2,'second');
  #
  # returns: string
  # raises a RuntimeError if some cell value is nil
  def SQLQuery.insert(table_name, field_value_pairs)
    column_list = field_value_pairs.map { |column_name, _cell_value|
      column_name
    }.join(",")

    value_list = field_value_pairs.map { |column_name, cell_value|
      if cell_value.nil?
        raise "SQL query construction error: Nil value for column " + column_name.to_s
      end
      SQLQuery.stringify_value(cell_value)
    }.join(",")

    return "INSERT INTO " + table_name + "(" + column_list + ") VALUES (" + value_list + ");"
  end

  #####
  # SQLQuery.select
  #
  # query created: select from given database tables
  # all column entries that conform to the given description:
  # - names of the columns to be selected (or the string "*")
  # - only those rows that match the given row restrictions
  #   => WHERE column=value AND column!=value ...
  # - optionally, at most N lines => LIMIT N
  # - if more than one DB table is named, make a join
  # - value restrictions whose column name does not yet include a table
  #   name get the name of the first table in table_col_pairs prepended
  #
  # Use with only one database table creates queries like e.g.
  #   SELECT table.column1, table.column2 FROM table WHERE table.column3=val3 AND table.column4!=val4;
  # or:
  #   SELECT DISTINCT table.column1 FROM table WHERE table.column3=val3 LIMIT 10;
  #
  # Use with 2 SelectTableAndColumns entries creates queries like
  #   SELECT table1.column1 FROM table1, table2 WHERE table1.column1=val3 AND table1.id=table2.id;
  #
  # table_col_pairs:  array of SelectTableAndColumns
  # row_restrictions: array of ValueRestriction objects (nil or empty: no WHERE clause)
  # var_hash:         further parameters:
  #   "line_limit": integer: select at most N lines. if nil, all lines are chosen
  #   "distinct":   boolean: return each tuple only once. if nil or false, duplicates are kept
  #
  # returns: string
  # raises a RuntimeError if there are no tables, if no columns at all are
  # selected, if a column name is nil/empty, or if a restriction value is nil
  def SQLQuery.select(table_col_pairs, row_restrictions, var_hash = {})
    if table_col_pairs.empty?
      raise "Zero tables to select from"
    end

    ## column names to select: one entry per table that contributes columns
    selected = table_col_pairs.map { |tc|
      if tc.columns == "*"
        # all columns from this table
        SQLQuery.prepend_tablename(tc.table_obj.table_name, "*")

      elsif tc.columns.is_a?(Array) and not(tc.columns.empty?)
        # at least one column from this table
        tc.columns.map { |c|
          if c.nil? or c.empty?
            raise "Got nil/empty value within the column name list"
          end
          SQLQuery.prepend_tablename(tc.table_obj.table_name, c)
        }.join(", ")

      else
        # no columns from this table
        nil
      end
    }.compact

    if selected.empty?
      raise "Empty select: zero columns selected"
    end

    ## SELECT [DISTINCT] columns
    string = "SELECT "
    if var_hash["distinct"]
      string += "DISTINCT "
    end
    string += selected.join(", ")

    ## FROM table name(s)
    string += " FROM " + table_col_pairs.map { |tc| tc.table_obj.table_name }.join(", ")

    ## WHERE row_restrictions; unqualified names refer to the first table
    string += where_clause(row_restrictions, table_col_pairs.first.table_obj.table_name)

    ## LIMIT at_most_that_many_lines
    if var_hash["line_limit"]
      string += " LIMIT " + var_hash["line_limit"].to_s
    end

    return string + ";"
  end

  #####
  # SQLQuery.update
  #
  # query created: overwrite several cells in possibly multiple rows of a
  # database table with new values.
  # Rows are selected via row restrictions. As in SQLQuery.select,
  # nil or empty row restrictions mean "no WHERE clause", i.e. all rows,
  # and restrictions whose value is a variable (column) name are not quoted.
  # Column names in the restrictions are used as given.
  #
  # table_name:        string, table name
  # field_value_pairs: array of [column_name, new_value]
  # row_restrictions:  array of ValueRestriction objects
  #
  # example:
  #   UPDATE table01 SET field04=19991022, field05=062218 WHERE field01=1;
  #
  # returns: string
  # raises a RuntimeError if a new value or a restriction value is nil
  def SQLQuery.update(table_name, field_value_pairs, row_restrictions)
    assignments = field_value_pairs.map { |field, value|
      if value.nil?
        raise "SQL query construction error: Nil value for column " + field.to_s
      end
      field.to_s + "=" + SQLQuery.stringify_value(value)
    }.join(", ")

    return "UPDATE " + table_name + " SET " + assignments +
           where_clause(row_restrictions) + ";"
  end

  #####
  # SQLQuery.add_columns
  #
  # query created: extend given table by
  # one or more columns given by their names and formats
  #
  # table_name:     string, table name
  # column_formats: array of [column_name, column_format], both strings
  #
  # returns: string
  def SQLQuery.add_columns(table_name, column_formats)
    additions = column_formats.map { |column_name, column_format|
      " ADD COLUMN " + column_name + " " + column_format
    }.join(", ")

    return "ALTER TABLE " + table_name + additions + ";"
  end

  #####
  # SQLQuery.stringify_value turns a value into its SQL literal:
  # strings are wrapped in single quotes, with the quote characters
  # " ' ` replaced by the placeholders QQUOT0 QQUOT1 QQUOT2;
  # everything else is rendered via to_s.
  #
  # NOTE(review): backslashes are passed through unchanged. On a database
  # that treats backslash as an escape character inside string literals,
  # a value ending in a backslash would swallow the closing quote.
  # Not changed here because unstringify_value and already stored data
  # depend on the current scheme -- confirm whether inputs can contain "\".
  #
  # returns: string
  def SQLQuery.stringify_value(value)
    if value.is_a?(String)
      return "'" + value.gsub(/"/,"QQUOT0").gsub(/'/, "QQUOT1").gsub(/`/, "QQUOT2") + "'"
    else
      return value.to_s
    end
  end

  #####
  # SQLQuery.unstringify_value undoes the quote-character replacement done
  # by stringify_value (it does not strip the surrounding single quotes).
  # please apply only to strings
  def SQLQuery.unstringify_value(value)
    value.gsub(/QQUOT0/, '"').gsub(/QQUOT1/, "'").gsub(/QQUOT2/, "`")
  end

  ####
  # SQLQuery.prepend_tablename
  #
  # auxiliary method for select:
  # prepend the table name to the column name
  # if and only if the column name does not already include a table name
  # (i.e. does not contain a ".")
  def SQLQuery.prepend_tablename(table_name, column_name)
    if not(column_name.include?("."))
      return table_name + "." + column_name
    else
      return column_name
    end
  end

  ####
  # SQLQuery.where_clause
  #
  # auxiliary method for select and update: render row restrictions as
  # " WHERE r1 AND r2 ...", or as "" if there are none (nil or empty list).
  #
  # row_restrictions:   array of ValueRestriction objects, or nil
  # default_table_name: string or nil. If given, it is prepended to every
  #                     column name whose restriction does not state that
  #                     the table name is already included
  #
  # raises a RuntimeError if a restriction value is nil
  def SQLQuery.where_clause(row_restrictions, default_table_name = nil)
    if row_restrictions.nil? or row_restrictions.empty?
      return ""
    end

    conditions = row_restrictions.map { |restr_obj|
      # form: name(string) eqsymb(string: =, !=) value(object)
      name, eqsymb, value = restr_obj.get()
      if value.nil?
        raise "SQL query construction error: Nil value for column " + name.to_s
      end
      unless restr_obj.val_is_variable
        # value is a value, not a variable name: make it an SQL literal
        value = SQLQuery.stringify_value(value)
      end
      unless default_table_name.nil? or restr_obj.table_name_included
        name = SQLQuery.prepend_tablename(default_table_name, name)
      end
      name + eqsymb + value
    }

    return " WHERE " + conditions.join(" AND ")
  end
  private_class_method :where_clause
end
@@ -0,0 +1,194 @@
1
+ #########
2
+ # module StringTerminalsInRightOrder
3
+ #
4
+ # returns the yield of a node, or a list of nodes, as a string
5
+ # of " "-separated words
6
+ #
7
+ # Words are put into the right order, left to right,
8
+ # under the assumption that their node IDs reflect that order
9
+ #
10
+ # Terminal nodes are assumed to have IDs ending in a number,
11
+ # numbered from left to right
12
+ #
13
+ # Splitword nodes are assumed to have IDs ending in N_sM
14
+ # for numbers N and M, where N orders terminals left to right
15
+ # and M orders the splitword parts left to right
16
+ #
17
+ # If the yield of the node/the list of nodes contains all splitwords of a terminal,
18
+ # the whole terminal is taken instead
19
+ #
20
+ # methods:
21
+ #
22
+ # string_for_node returns the string for the yield of a node
23
+ # node: a node object
24
+ #
25
+ # string_for_nodes returns the string for the yield of a list of nodes
26
+ # nodes: a list of node objects
27
+
28
module StringTerminalsInRightOrder
  # Mixin that renders the yield of one or more syntax nodes as a string.
  # Node objects must offer: id, word, parent, yield_nodes, each_child,
  # is_terminal? and is_splitword?.

  # string_for_node: the words in the yield of a single node,
  # left to right, each followed by one " "
  def string_for_node(node)
    string_for_nodes([node])
  end

  # string_for_nodes: the words in the joint yield of a list of nodes,
  # left to right, each followed by one " "
  def string_for_nodes(nodes)
    a = right_level_terminals_for_nodes(nodes)
    a = sort_terminals_and_splitwords_left_to_right(a)
    return node_array_to_string(a)
  end

  #####
  private

  # right_level_terminals_for_nodes:
  # - compute the yield for each element of 'nodes'
  # - then consider all splitwords in the yield:
  #   if all splitwords of a terminal are in the yield,
  #   then use the terminal rather than its splitwords
  # returns an array of nodes without duplicates
  def right_level_terminals_for_nodes(nodes)
    a = nodes.map { |n| n.yield_nodes() }.flatten
    b = Array.new
    a.each { |n|
      if n.is_splitword?
        # see if a contains all parts of this splitword
        # if so, take into b the splitword's parent, the terminal,
        # rather than the individual splitwords

        if n.parent.nil?
          # splitword without a parent
          b << n
        elsif b.include? n.parent or a.include? n.parent
          # the splitword's parent is already in b, or is itself part of
          # the yield and gets added when its turn comes: nothing to do
        else
          # check if all children of n.parent are in 'a'
          all_in = true
          n.parent.each_child { |nsibling|
            unless a.include? nsibling
              all_in = false
              break
            end
          }

          if all_in
            # yes, all children of n.parent are in 'a'
            b << n.parent
          else
            # no, some sibling of n is not in 'a'
            b << n
          end
        end
      elsif n.is_terminal?
        # n is a terminal
        b << n
        # if n is anything but a splitword or a terminal,
        # ignore it
      end
    }
    return b.uniq
  end

  # sort_terminals_and_splitwords_left_to_right:
  # take an array of nodes that consists of terminals and splitwords
  # and sort them using the following comparison:
  # - when comparing two terminals, use the
  #   last numbers in their respective IDs
  # - when comparing two splitwords, their IDs end in _N_sM
  #   for numbers N and M.
  #   If they coincide in N, compare them by M,
  #   else compare them by N
  # - when comparing a terminal and a splitword,
  #   compare the terminal's last number to the splitword's N
  def sort_terminals_and_splitwords_left_to_right(nodes)
    nodes.sort { |a, b|
      if a.is_splitword? and b.is_splitword?
        compare_splitwords(a, b)
      elsif a.is_terminal? and b.is_terminal?
        compare_terminals(a, b)
      else
        compare_mixed(a, b)
      end
    }
  end

  # node_array_to_string:
  # 'nodes' is an array of node objects, each of which offers a "word" method.
  # String their words together, each word followed by " "
  # (so a non-empty result ends in a blank)
  def node_array_to_string(nodes)
    s = ""
    nodes.each { |n|
      s = s + n.word + " "
    }
    return s
  end

  # - when comparing two terminals, use the
  #   last numbers in their respective IDs
  def compare_terminals(a, b)
    last_i(a) <=> last_i(b)
  end

  # - when comparing two splitwords, their IDs end in _N_sM
  #   for numbers N and M.
  #   If they coincide in N, compare them by M,
  #   else compare them by N
  def compare_splitwords(a, b)
    if splitword_terminal_i(a) == splitword_terminal_i(b)
      # parts of same terminal?
      # compare parts
      last_i(a) <=> last_i(b)
    else
      # not parts of same terminal?
      # compare terminals
      splitword_terminal_i(a) <=> splitword_terminal_i(b)
    end
  end

  # - when comparing a terminal and a splitword,
  #   compare the terminal's last number to the splitword's N
  # - for any other combination, warn on stderr and treat the two nodes
  #   as equal
  # returns: -1, 0 or 1
  def compare_mixed(a, b)
    if a.is_splitword? and b.is_terminal?
      splitword_terminal_i(a) <=> last_i(b)

    elsif a.is_terminal? and b.is_splitword?
      last_i(a) <=> splitword_terminal_i(b)
    else
      # not one terminal, one splitword?
      # then what?
      $stderr.print "SalsaTigerSentence, compare_mixed: confused by "
      $stderr.print a.id, ", ", b.id, "\n"
      # a sort block has to yield a number: without this the warning
      # would be followed by an ArgumentError from Array#sort
      0
    end
  end

  # return last number of the ID of a node
  # (prints a message to stderr and exits with status 1 if the ID
  # contains no suitable digits)
  def last_i(n)
    n.id =~ /(\d+)$/ # match final string of digits
    if $1.nil? # this shouldn't happen _in principle_
      # but we might get weird node IDs for splitwords;
      # so we act gracefully and catch the case where there
      # is one final letter behind the digits
      n.id =~ /(\d+)\w$/
    end
    if $1.nil? # this shouldn't ever happen
      $stderr.print "SalsaTigerSentence, last_i: Couldn't extract digits from: "
      $stderr.print n.id, "\n"
      exit 1
    end
    return $1.to_i # and return it as number
  end

  # assume the ID of the node includes N_sM
  # return N
  # (prints a message to stderr and exits with status 1 if the ID
  # does not have that form)
  def splitword_terminal_i(n)
    n.id =~ /(\d+)_s\d*/ # match string of digits before splitword ID
    if $1.nil? # this shouldn't ever happen
      $stderr.print "SalsaTigerSentence, splitword_terminal_i: Couldn't extract digits from: "
      $stderr.print n.id, "\n"
      exit 1
    end
    return $1.to_i # and return it as number
  end

end
193
+
194
+