shalmaneser-rosy 1.2.0.rc4 → 1.2.rc5

Sign up to get free protection for your applications and to get access to all the features.
Files changed (41) hide show
  1. checksums.yaml +4 -4
  2. data/README.md +47 -18
  3. data/bin/rosy +14 -7
  4. data/lib/rosy/FailedParses.rb +22 -20
  5. data/lib/rosy/FeatureInfo.rb +35 -31
  6. data/lib/rosy/GfInduce.rb +132 -130
  7. data/lib/rosy/GfInduceFeature.rb +86 -68
  8. data/lib/rosy/InputData.rb +59 -55
  9. data/lib/rosy/RosyConfusability.rb +47 -40
  10. data/lib/rosy/RosyEval.rb +55 -55
  11. data/lib/rosy/RosyFeatureExtractors.rb +295 -290
  12. data/lib/rosy/RosyFeaturize.rb +54 -67
  13. data/lib/rosy/RosyInspect.rb +52 -50
  14. data/lib/rosy/RosyIterator.rb +73 -67
  15. data/lib/rosy/RosyPhase2FeatureExtractors.rb +48 -48
  16. data/lib/rosy/RosyPruning.rb +39 -31
  17. data/lib/rosy/RosyServices.rb +116 -115
  18. data/lib/rosy/RosySplit.rb +55 -53
  19. data/lib/rosy/RosyTask.rb +7 -3
  20. data/lib/rosy/RosyTest.rb +174 -191
  21. data/lib/rosy/RosyTrain.rb +46 -50
  22. data/lib/rosy/RosyTrainingTestTable.rb +101 -99
  23. data/lib/rosy/TargetsMostFrequentFrame.rb +13 -9
  24. data/lib/rosy/{AbstractFeatureAndExternal.rb → abstract_feature_extractor.rb} +22 -97
  25. data/lib/rosy/abstract_single_feature_extractor.rb +52 -0
  26. data/lib/rosy/external_feature_extractor.rb +35 -0
  27. data/lib/rosy/opt_parser.rb +231 -201
  28. data/lib/rosy/rosy.rb +63 -64
  29. data/lib/rosy/rosy_conventions.rb +66 -0
  30. data/lib/rosy/rosy_error.rb +15 -0
  31. data/lib/rosy/var_var_restriction.rb +16 -0
  32. data/lib/shalmaneser/rosy.rb +1 -0
  33. metadata +26 -19
  34. data/lib/rosy/ExternalConfigData.rb +0 -58
  35. data/lib/rosy/View.rb +0 -418
  36. data/lib/rosy/rosy_config_data.rb +0 -121
  37. data/test/frprep/test_opt_parser.rb +0 -94
  38. data/test/functional/functional_test_helper.rb +0 -58
  39. data/test/functional/test_fred.rb +0 -47
  40. data/test/functional/test_frprep.rb +0 -99
  41. data/test/functional/test_rosy.rb +0 -40
data/lib/rosy/GfInduce.rb CHANGED
@@ -7,13 +7,13 @@
7
7
  # induce a mapping from parse tree paths to grammatical functions from this information
8
8
  # and apply it to new sentences
9
9
 
10
- require "common/AbstractSynInterface"
11
- require "common/ruby_class_extensions"
10
+ require "ruby_class_extensions"
12
11
 
13
12
  #####################################################################
14
13
  # Management of mapping from GFs to paths
15
14
  #####################################################################
16
-
15
+ module Shalmaneser
16
+ module Rosy
17
17
  class GfiGfPathMapping
18
18
 
19
19
  #########################################
@@ -26,18 +26,18 @@ class GfiGfPathMapping
26
26
  @interpreter = interpreter_class
27
27
 
28
28
  # hash: POS(string) -> hash gf(string) -> hash: path_string -> frequency(int)
29
- @gf_to_paths = Hash.new
29
+ @gf_to_paths = {}
30
30
 
31
- # hash: POS(string)-> hash: gf(string) -> hash: one edge of a path ->
31
+ # hash: POS(string)-> hash: gf(string) -> hash: one edge of a path ->
32
32
  # frequency(int) | hash: one edge of a path -> ...
33
- @gf_to_edgelabel = Hash.new
33
+ @gf_to_edgelabel = {}
34
34
 
35
35
  # hash: word(string) -> array: [gf, prep, head_category]
36
- @word_to_gflist = Hash.new
36
+ @word_to_gflist = {}
37
37
 
38
38
  # hash: path as string(string) -> array of steps
39
39
  # where a step is a tuple of strings [{U, D}, edgelabel, nodelabel]
40
- @pathstring_to_path = Hash.new
40
+ @pathstring_to_path = {}
41
41
  end
42
42
 
43
43
  #########################################
@@ -67,7 +67,7 @@ class GfiGfPathMapping
67
67
  # remember the path as an array of triples [direction, edgelabel, nodelabel]
68
68
  # as hash value of the path-as-string
69
69
  unless @pathstring_to_path[path_s]
70
- @pathstring_to_path[path_s] = Array.new
70
+ @pathstring_to_path[path_s] = []
71
71
  path.each_step { |direction, edgelabel, nodelabel, node|
72
72
  @pathstring_to_path[path_s] << [direction, edgelabel, nodelabel]
73
73
  }
@@ -76,7 +76,7 @@ class GfiGfPathMapping
76
76
  # store the mapping in the
77
77
  # gf -> path hash
78
78
  unless @gf_to_paths[pos]
79
- @gf_to_paths[pos] = Hash.new
79
+ @gf_to_paths[pos] = {}
80
80
  end
81
81
  unless @gf_to_paths[pos][gf]
82
82
  @gf_to_paths[pos][gf] = Hash.new(0)
@@ -86,7 +86,7 @@ class GfiGfPathMapping
86
86
 
87
87
  # remember this gf/pt tuple as possible GF of the current lemma
88
88
  unless @word_to_gflist[lemmapos]
89
- @word_to_gflist[lemmapos] = Array.new
89
+ @word_to_gflist[lemmapos] = []
90
90
  end
91
91
  unless @word_to_gflist[lemmapos].include? [gf, prep, headcat]
92
92
  @word_to_gflist[lemmapos] << [gf, prep, headcat]
@@ -97,13 +97,13 @@ class GfiGfPathMapping
97
97
  # finish up inducing:
98
98
  # reencode information in a fashion
99
99
  # that makes apply() faster
100
- def finish_inducing()
100
+ def finish_inducing
101
101
  # make sure gf_to_edgelabel is empty at the start
102
- @gf_to_edgelabel.clear()
103
-
102
+ @gf_to_edgelabel.clear
103
+
104
104
  @gf_to_paths.each_pair { |pos, gf_to_paths_to_freq|
105
105
  unless @gf_to_edgelabel[pos]
106
- @gf_to_edgelabel[pos] = Hash.new()
106
+ @gf_to_edgelabel[pos] = {}
107
107
  end
108
108
 
109
109
  gf_to_paths_to_freq.each_pair { |gf, paths_to_freq|
@@ -115,16 +115,16 @@ class GfiGfPathMapping
115
115
  $stderr.puts "found empty path for #{gf}, frequency #{freq}. Skipping."
116
116
  next
117
117
  end
118
-
118
+
119
119
  if freq >= 5 or
120
120
  gf =~ /Head|Appositive|Quant|Protagonist/
121
121
  # path frequent enough: list it
122
122
 
123
123
  unless @gf_to_edgelabel[pos][gf]
124
- @gf_to_edgelabel[pos][gf] = Hash.new()
124
+ @gf_to_edgelabel[pos][gf] = {}
125
125
  end
126
-
127
- enter_path(@gf_to_edgelabel[pos][gf], steps.clone(), freq)
126
+
127
+ enter_path(@gf_to_edgelabel[pos][gf], steps.clone, freq)
128
128
  end
129
129
  }
130
130
  }
@@ -137,7 +137,7 @@ class GfiGfPathMapping
137
137
 
138
138
  ###
139
139
  # test output
140
- def test_output()
140
+ def test_output
141
141
  # gf_to_paths:
142
142
  # sum frequencies, compare frequency against average path length
143
143
  puts "============================="
@@ -148,26 +148,26 @@ class GfiGfPathMapping
148
148
  # puts "================"
149
149
  # puts "POS #{pos} GF #{gf}:"
150
150
  # @gf_to_paths[pos][gf].each_pair { |path_s, freq|
151
- # puts "#{path_s} freq:#{freq} len:#{@pathstring_to_path[path_s].length()}"
151
+ # puts "#{path_s} freq:#{freq} len:#{@pathstring_to_path[path_s].length}"
152
152
  # }
153
153
  # }
154
- # }
154
+ # }
155
155
  @gf_to_paths.each_key { |pos|
156
156
  @gf_to_paths[pos].each_key { |gf|
157
157
  puts "================"
158
158
  puts "POS #{pos} GF #{gf}:"
159
-
159
+
160
160
  @gf_to_paths[pos][gf].values.uniq.sort { |a, b| b <=> a}.each { |frequency|
161
161
  sum = 0
162
162
  count = 0
163
163
  @gf_to_paths[pos][gf].each_pair { |path_s, otherfreq|
164
164
  if otherfreq == frequency
165
165
  count += 1
166
- sum += @pathstring_to_path[path_s].length()
166
+ sum += @pathstring_to_path[path_s].length
167
167
  end
168
168
  }
169
169
  avg_pathlen = sum.to_f / count.to_f
170
-
170
+
171
171
  puts " Frequency #{frequency}: #{count} path(s)"
172
172
  puts " #{avg_pathlen} avg. path len"
173
173
  }
@@ -193,7 +193,7 @@ class GfiGfPathMapping
193
193
  end
194
194
 
195
195
  #########################################
196
- # Restricting induced mappings
196
+ # Restricting induced mappings
197
197
  # to achieve better mappings
198
198
  #########################################
199
199
 
@@ -202,7 +202,7 @@ class GfiGfPathMapping
202
202
  # exclude all paths that include an Up edge
203
203
  #
204
204
  # changes @gf_to_edgelabel, not reversible
205
- def restrict_to_downpaths()
205
+ def restrict_to_downpaths
206
206
  @gf_to_edgelabel.each_value { |pos_specific|
207
207
  pos_specific.each_value { |hash_or_val|
208
208
  restrict_pathhash_to_downpaths(hash_or_val)
@@ -249,7 +249,7 @@ class GfiGfPathMapping
249
249
  # by comparing paths in the parse tree
250
250
  # against the GF/path mappings stored in @gf_to_edgelabel
251
251
  #
252
- # returns:
252
+ # returns:
253
253
  # hash: SynNode -> tuple [GF(string), preposition(string), frequency(integer)]
254
254
  def potential_gfs_of_node(start_node, # SynNode
255
255
  lemma, # string: lemma for start_node
@@ -257,10 +257,10 @@ class GfiGfPathMapping
257
257
 
258
258
 
259
259
  # determine possible GFs of a SynNode:
260
- #
260
+ #
261
261
  # hash: SynNode(some node in this sentence) -> list of tuples [gf label, prep, headcat, hash of steps]
262
262
  # initialize with just the entry for the start node
263
- potential_gfs = Hash.new
263
+ potential_gfs = {}
264
264
  potential_gfs[start_node] = potential_gfs_of_lemma(lemma, pos)
265
265
  # $stderr.puts "HIER #{lemma} " + potential_gfs_of_lemma(lemma, pos).map { |gf, prep, hc, hash|
266
266
  # "#{gf}:#{prep}:#{hc} "
@@ -274,7 +274,7 @@ class GfiGfPathMapping
274
274
  agenda = [start_node]
275
275
  # been_there: list of SynNode objects
276
276
  # that have been considered already and needn't be visited again
277
- been_there = Hash.new
277
+ been_there = {}
278
278
  been_there[start_node] = true
279
279
 
280
280
  # hash: SynNode -> tuple [GF(string), preposition(string), frequency(integer)]
@@ -282,17 +282,17 @@ class GfiGfPathMapping
282
282
  # frequency: frequency with which the path from verb to GF has
283
283
  # been seen in the FN data (such that we can keep
284
284
  # the best path and discard others)
285
- node_to_label_and_freq = Hash.new()
285
+ node_to_label_and_freq = {}
286
286
 
287
287
  while not(agenda.empty?)
288
- prev_node = agenda.shift()
288
+ prev_node = agenda.shift
289
289
 
290
290
  unless potential_gfs[prev_node]
291
291
  # no further GFs to be reached from prev_node:
292
292
  # shouldn't be here, but never mind, just ignore
293
293
  next
294
294
  end
295
-
295
+
296
296
  # surrounding_nodes returns a list of pairs [SynNode, Path object]
297
297
  @interpreter.surrounding_nodes(prev_node, true).each { |node, path|
298
298
  myprep = @interpreter.preposition(node)
@@ -313,10 +313,10 @@ class GfiGfPathMapping
313
313
  been_there[node] = true
314
314
 
315
315
  unless potential_gfs[node]
316
- potential_gfs[node] = Array.new
316
+ potential_gfs[node] = []
317
317
  end
318
-
319
- path.each_step() { |step|
318
+
319
+ path.each_step { |step|
320
320
  # each edge from prev_node to node:
321
321
  # see whether we can walk this edge to reach some of the GFs
322
322
  # still to be reached
@@ -326,16 +326,16 @@ class GfiGfPathMapping
326
326
  potential_gfs[prev_node].each { |gf, prep, headcat, hash|
327
327
 
328
328
  if hash[step_s]
329
- # yes, there is still a possibility of reaching gf
329
+ # yes, there is still a possibility of reaching gf
330
330
  # from our current node
331
331
 
332
- if hash[step_s].kind_of? Integer
332
+ if hash[step_s].is_a? Integer
333
333
  # actually, we have reached gf,
334
334
  # and hash[last_edge] is the frequency with which
335
335
  # this path has led to this GF in the FN data
336
336
 
337
337
  freq = hash[step_s]
338
-
338
+
339
339
  # check whether node has the right preposition
340
340
  # and the right head category
341
341
  if myprep != prep or
@@ -351,13 +351,13 @@ class GfiGfPathMapping
351
351
  # or the old frequency was lower than the current one:
352
352
  # keep the new entry
353
353
  node_to_label_and_freq[node] = [gf, prep, freq]
354
-
354
+
355
355
  else
356
- # this node has been assigned a GF before, and the
356
+ # this node has been assigned a GF before, and the
357
357
  # other frequency was higher:
358
358
  # discard the current entry
359
359
  end
360
-
360
+
361
361
  else
362
362
  # we have not yet reached gf, but we still might
363
363
  # at the next node we meet from here
@@ -401,7 +401,7 @@ class GfiGfPathMapping
401
401
  # direction, edgelabel, nodelabel
402
402
  #
403
403
  # returns: string, the key
404
- def string_step(step)
404
+ def string_step(step)
405
405
  direction = step[0]
406
406
  edgelabel = step[1]
407
407
  nodelabel = step[2]
@@ -420,28 +420,28 @@ class GfiGfPathMapping
420
420
  chainlinks, # array: string*string*string
421
421
  frequency) # integer: frequency of this mapping
422
422
  # take off first chain link
423
- key = string_step(chainlinks.shift())
424
-
423
+ key = string_step(chainlinks.shift)
424
+
425
425
  if chainlinks.empty?
426
426
  # that was the last link, actually
427
427
  hash[key] = frequency
428
428
  else
429
429
  # more links available
430
430
  unless hash[key]
431
- hash[key] = Hash.new()
431
+ hash[key] = {}
432
432
  end
433
-
434
- if hash[key].kind_of? Integer
435
- # there is a shorter path for the same GF,
433
+
434
+ if hash[key].is_a? Integer
435
+ # there is a shorter path for the same GF,
436
436
  # ending at the point where we are now.
437
437
  # which frequency is higher?
438
438
  if frequency > hash[key]
439
- hash[key] = Hash.new()
439
+ hash[key] = {}
440
440
  else
441
441
  return
442
442
  end
443
443
  end
444
-
444
+
445
445
  enter_path(hash[key], chainlinks, frequency)
446
446
  end
447
447
  end
@@ -457,8 +457,8 @@ class GfiGfPathMapping
457
457
  def print_entries(hash, num_spaces)
458
458
  hash.each_pair { |first_link, rest|
459
459
  print " "*num_spaces, first_link
460
-
461
- if rest.kind_of? Integer
460
+
461
+ if rest.is_a? Integer
462
462
  puts " #{rest}"
463
463
  else
464
464
  puts
@@ -468,7 +468,7 @@ class GfiGfPathMapping
468
468
  end
469
469
 
470
470
  #########################################
471
- # Restricting induced mappings
471
+ # Restricting induced mappings
472
472
  # to achieve better mappings
473
473
  #########################################
474
474
 
@@ -478,7 +478,7 @@ class GfiGfPathMapping
478
478
  # kill all entries whose keys describe an Up step in the path,
479
479
  # go into recursion for remaining entries
480
480
  def restrict_pathhash_to_downpaths(hash_or_val) # path hash or integer freq
481
- if hash_or_val.kind_of? Integer
481
+ if hash_or_val.is_a? Integer
482
482
  return
483
483
  end
484
484
 
@@ -498,12 +498,12 @@ class GfiGfPathMapping
498
498
 
499
499
  ###
500
500
  # recursive function:
501
- # if the argument is a hash and
501
+ # if the argument is a hash and
502
502
  # the remaining path length is 0, kill all entries
503
503
  # else go into recursion for all entries with reduced path length
504
504
  def restrict_pathhash_len(hash_or_val, # path hash or integer freq
505
- n) # restrict paths from what length?
506
- if hash_or_val.kind_of? Integer
505
+ n) # restrict paths from what length?
506
+ if hash_or_val.is_a? Integer
507
507
  return
508
508
  end
509
509
 
@@ -513,7 +513,7 @@ class GfiGfPathMapping
513
513
  hash_or_val.keys.each { |k| hash_or_val.delete(k) }
514
514
  else
515
515
  hash_or_val.each_value { |next_hash|
516
- restrict_pathhash_len(next_hash, n-1)
516
+ restrict_pathhash_len(next_hash, n-1)
517
517
  }
518
518
  end
519
519
  end
@@ -525,9 +525,9 @@ class GfiGfPathMapping
525
525
  ###
526
526
  # given a lemma,
527
527
  # look in its list of all GFs that we have ever found for that lemma
528
- #
528
+ #
529
529
  # returns: array of pairs [gf label, point in gf_to_edgelabel hash]
530
- # all the labels of GFs of this word,
530
+ # all the labels of GFs of this word,
531
531
  # and for each GF, the matching GF-to-path hash
532
532
  def potential_gfs_of_lemma(lemma, pos)
533
533
 
@@ -566,7 +566,7 @@ class GfiSubcatFrames
566
566
  def initialize(include_sem) # boolean
567
567
  # hash: word(string) -> array:[frame(string), subcatframe]
568
568
  # with subcatframe an array of tuples [gf, prep, fe, multiplicity]
569
- @word_to_subcatframes = Hash.new
569
+ @word_to_subcatframes = {}
570
570
 
571
571
  # hash: <subcatframe encoded as string> -> frequency
572
572
  @subcat_to_freq = Hash.new(0)
@@ -591,9 +591,9 @@ class GfiSubcatFrames
591
591
  unless @include_sem
592
592
  frame = nil
593
593
  end
594
-
594
+
595
595
  unless @word_to_subcatframes[lemmapos]
596
- @word_to_subcatframes[lemmapos] = Array.new
596
+ @word_to_subcatframes[lemmapos] = []
597
597
  end
598
598
 
599
599
  # reencode subcat frame:
@@ -601,27 +601,27 @@ class GfiSubcatFrames
601
601
  #
602
602
  # multiplicity is either "one" or "many", depending on
603
603
  # the number of times the same gf/prep pair occurred.
604
- # If the same gf/prep pair occurred with different FEs, they
604
+ # If the same gf/prep pair occurred with different FEs, they
605
605
  # will be concatenated into a space-separated string
606
606
  # with a single subcat entry
607
607
  count_gfprep = Hash.new(0)
608
- gfprep_to_fe = Hash.new
608
+ gfprep_to_fe = {}
609
609
 
610
610
  scf.each { |gf, prep, fe|
611
611
  count_gfprep[[gf, prep]] += 1
612
- unless gfprep_to_fe[[gf, prep]]
613
- gfprep_to_fe[[gf, prep]] = Array.new
614
- end
615
- unless gfprep_to_fe[[gf, prep]].include?(fe)
616
- gfprep_to_fe[[gf, prep]] << fe
617
- end
612
+ unless gfprep_to_fe[[gf, prep]]
613
+ gfprep_to_fe[[gf, prep]] = []
614
+ end
615
+ unless gfprep_to_fe[[gf, prep]].include?(fe)
616
+ gfprep_to_fe[[gf, prep]] << fe
617
+ end
618
618
  }
619
619
  subcatframe = count_gfprep.to_a.map { |gfprep, count|
620
620
  gf, prep = gfprep
621
621
  if @include_sem
622
- fe = gfprep_to_fe[[gf, prep]].join(" ")
622
+ fe = gfprep_to_fe[[gf, prep]].join(" ")
623
623
  else
624
- fe = nil
624
+ fe = nil
625
625
  end
626
626
  if count == 1
627
627
  [gf, prep, fe, "one"]
@@ -632,7 +632,7 @@ class GfiSubcatFrames
632
632
  if a[0] != b[0]
633
633
  # compare GF
634
634
  a[0] <=> b[0]
635
- else
635
+ else
636
636
  # compare prep
637
637
  a[1].to_s <=> b[1].to_s
638
638
  end
@@ -652,13 +652,13 @@ class GfiSubcatFrames
652
652
  #########################################
653
653
 
654
654
  ###
655
- def test_output()
655
+ def test_output
656
656
  puts "WORD_TO_SUBCATFRAMES"
657
657
  @word_to_subcatframes.each_pair { |word, frames_and_mappings|
658
658
  puts word
659
659
  frames_and_mappings.each { |frame, subcatframe|
660
660
  puts "\t#{frame} "+ subcatframe.to_a.map { |gf, prep, fe, freq| "[#{gf}]:#{prep}:#{fe}:#{freq}" }.join(" ")
661
- puts "\t\tfreq #{@subcat_to_freq[string_subcatframe(subcatframe)]}"
661
+ puts "\t\tfreq #{@subcat_to_freq[string_subcatframe(subcatframe)]}"
662
662
  }
663
663
  }
664
664
  puts
@@ -686,11 +686,11 @@ class GfiSubcatFrames
686
686
  # hash: SynNode -> tuple [GF(string), preposition(string), frequency(integer)]
687
687
  #
688
688
  # strict: boolean. If true, return only those subcat frames that exactly match
689
- # all GFs listed in node_to_gf. If false, also return subcat frames that
689
+ # all GFs listed in node_to_gf. If false, also return subcat frames that
690
690
  # match a subset of the GFs mentioned in node_to_gf.
691
- #
692
- # returns: list of tuples [frame(string), subcat frame, frequency(integer)],
693
- # where a subcat frame is an array of tuples
691
+ #
692
+ # returns: list of tuples [frame(string), subcat frame, frequency(integer)],
693
+ # where a subcat frame is an array of tuples
694
694
  # [gf (string), prep(string or nil), fe(string), synnodes(array:SynNode)]
695
695
  # and the syn_nodes are sorted by confidence, best first
696
696
  def match(start_node, # SynNode
@@ -703,10 +703,10 @@ class GfiSubcatFrames
703
703
  return []
704
704
  end
705
705
 
706
- # $stderr.puts "HIER4 GFs found: " + node_to_gf.values.map { |gf, prep, freq|
707
- # "#{gf}:#{prep}"
706
+ # $stderr.puts "HIER4 GFs found: " + node_to_gf.values.map { |gf, prep, freq|
707
+ # "#{gf}:#{prep}"
708
708
  # }.join(" ")
709
- # $stderr.puts "HIER5 GF possible: (#{@word_to_subcatframes[string_lemmapos(lemma, pos)].length()})"
709
+ # $stderr.puts "HIER5 GF possible: (#{@word_to_subcatframes[string_lemmapos(lemma, pos)].length})"
710
710
  # @word_to_subcatframes[string_lemmapos(lemma, pos)].each { |frame, scf|
711
711
  # scf.each { |gf, prep, fe, mult|
712
712
  # $stderr.print "#{gf}:#{prep} "
@@ -714,12 +714,12 @@ class GfiSubcatFrames
714
714
  # $stderr.puts
715
715
  # }
716
716
 
717
- # word_to_subcatframes:
717
+ # word_to_subcatframes:
718
718
  # hash: lemma(string) -> array:[frame(string), subcatframe]
719
719
  # with subcatframe: array of tuples [gf, prep, fe, multiplicity]
720
720
  scf_list = @word_to_subcatframes[string_lemmapos(lemma, pos)].map { |frame, subcatframe|
721
721
  [
722
- frame,
722
+ frame,
723
723
  # returns: array of tuples [gf, prep, fe, syn_nodes]
724
724
  match_subcat(subcatframe, node_to_gf, strict),
725
725
  @subcat_to_freq[string_subcatframe(subcatframe)]
@@ -730,7 +730,7 @@ class GfiSubcatFrames
730
730
  # multiplicity "one", and the "many" has only been filled by one
731
731
  #
732
732
  # so sort by frequency, then discard duplicates using a "seen" hash
733
- seen = Hash.new
733
+ seen = {}
734
734
  return scf_list.sort { |a, b| b.last <=> a.last }.select { |frame, subcatframe, frequency|
735
735
  sc_string = string_subcatframe_withnodes(subcatframe)
736
736
  if seen[sc_string]
@@ -745,7 +745,7 @@ class GfiSubcatFrames
745
745
  ###
746
746
  # given a subcat frame and a hash mapping each node to a gf/prep pair,
747
747
  # check whether the node/gf mapping matches the subcat frame.
748
- # Match:
748
+ # Match:
749
749
  # * for each node/gf mapping, the GF/prep occurs in the subcat frame
750
750
  # (But if there are many nodes for the same GF/prep and
751
751
  # multiplicity is "one", nodes may be discarded.)
@@ -756,7 +756,7 @@ class GfiSubcatFrames
756
756
  # node_to_gf:
757
757
  # hash: SynNode -> tuple [GF(string), preposition(string), frequency(integer)]
758
758
  #
759
- # returns:
759
+ # returns:
760
760
  # nil on mismatch.
761
761
  # match: copy of the subcat frame, each entry minus multiplicity but plus matching syn nodes
762
762
  def match_subcat(subcatframe, # array of tuples as described above
@@ -764,11 +764,11 @@ class GfiSubcatFrames
764
764
  strict) # boolean: strict match, or subseteq match?
765
765
 
766
766
  # each node of the node -> gf hash:
767
- # check whether the GF of the node->gf mapping
767
+ # check whether the GF of the node->gf mapping
768
768
  # occurs in the subcat frame
769
769
  # if it does, remember it in entry_to_nodes
770
770
  # if it does not, regard the match as failed
771
- entry_to_nodes = Hash.new
771
+ entry_to_nodes = {}
772
772
 
773
773
  node_to_gf.each_key {|node|
774
774
  gf, prep, frequency = node_to_gf[node]
@@ -779,7 +779,7 @@ class GfiSubcatFrames
779
779
  if other_gf == gf and other_prep == prep
780
780
  # match
781
781
  unless entry_to_nodes[[gf, prep]]
782
- entry_to_nodes[[gf, prep]] = Array.new
782
+ entry_to_nodes[[gf, prep]] = []
783
783
  end
784
784
  entry_to_nodes[[gf, prep]] << node
785
785
  match_found = true
@@ -795,23 +795,23 @@ class GfiSubcatFrames
795
795
 
796
796
 
797
797
  subcatframe.each { |gf, prep, fe, multiplicity|
798
-
798
+
799
799
  # opposite direction:
800
800
  # see if all slots of the subcat frame have been matched against at least one SynNode,
801
801
  # otherwise discard
802
802
  unless entry_to_nodes[[gf, prep]]
803
803
  return nil
804
804
  end
805
-
805
+
806
806
  # only one node to be returned for this slot:
807
807
  # use the one with the highest frequency for its gf->path mapping
808
- if multiplicity == "one" and entry_to_nodes[[gf, prep]].length() > 1
809
- # sort nodes by the frequency
810
- # entries in node_to_gf,
811
- # then keep only the <multiplicity> first ones
812
- entry_to_nodes[[gf, prep]] = entry_to_nodes[[gf, prep]].sort { |node1, node2|
813
- node_to_gf[node2].last <=> node_to_gf[node1].last
814
- }.slice(0, 1)
808
+ if multiplicity == "one" and entry_to_nodes[[gf, prep]].length > 1
809
+ # sort nodes by the frequency
810
+ # entries in node_to_gf,
811
+ # then keep only the <multiplicity> first ones
812
+ entry_to_nodes[[gf, prep]] = entry_to_nodes[[gf, prep]].sort { |node1, node2|
813
+ node_to_gf[node2].last <=> node_to_gf[node1].last
814
+ }.slice(0, 1)
815
815
  end
816
816
  }
817
817
 
@@ -819,7 +819,7 @@ class GfiSubcatFrames
819
819
  return subcatframe.map { |gf, prep, fe, multiplicity|
820
820
  # sort "many" nodes by the frequency of their gf->path mapping
821
821
  [
822
- gf, prep, fe,
822
+ gf, prep, fe,
823
823
  entry_to_nodes[[gf, prep]].sort { |node1, node2|
824
824
  node_to_gf[node2].last <=> node_to_gf[node1].last
825
825
  }
@@ -850,7 +850,7 @@ class GfiSubcatFrames
850
850
 
851
851
  return subcatframe.map { |gf, prep, fes, count| "#{gf} #{prep} #{count}" }.sort.join(", ")
852
852
  end
853
-
853
+
854
854
  # subcatframe to string
855
855
  #
856
856
  # here: we have a list of SynNodes instead of the multiplicity
@@ -879,7 +879,7 @@ class GfInduce
879
879
  # include_sem: if true, keep frame name and FE name
880
880
  # as part of the subcat frame. if false, don't keep them
881
881
  def initialize(interpreter_class, # SynInterpreter class
882
- include_sem = false)# boolean
882
+ include_sem = false)# boolean
883
883
 
884
884
  @interpreter = interpreter_class
885
885
  @gf_path_map = GfiGfPathMapping.new(interpreter_class)
@@ -901,7 +901,7 @@ class GfInduce
901
901
  end
902
902
 
903
903
  file.puts Marshal.dump(self)
904
- file.close()
904
+ file.close
905
905
  end
906
906
 
907
907
  ###
@@ -917,7 +917,7 @@ class GfInduce
917
917
  end
918
918
 
919
919
  gfi_obj = Marshal.load(file)
920
- file.close()
920
+ file.close
921
921
  return gfi_obj
922
922
  end
923
923
 
@@ -927,7 +927,7 @@ class GfInduce
927
927
 
928
928
  ###
929
929
  # induce path -> gf mapping from the given SalsaTigerSentence object
930
- #
930
+ #
931
931
  # Assumption: sent contains semantic annotation: FrameNet frames
932
932
  # and the FEs of the frames have information on grammatical function (gf)
933
933
  # and phrase type (pt) of the phrase that the FE points to
@@ -938,20 +938,20 @@ class GfInduce
938
938
  # induce GFs from each frame of the sentence
939
939
  sent.each_frame { |frame|
940
940
  unless frame.target
941
- # frame without a target:
941
+ # frame without a target:
942
942
  # nothing I can do
943
943
  next
944
944
  end
945
945
 
946
946
  # main target node, lemma
947
- maintarget, targetlemma, targetpos = mainnode_and_lemma(frame.target.children())
947
+ maintarget, targetlemma, targetpos = mainnode_and_lemma(frame.target.children)
948
948
  if not(maintarget) or not(targetlemma)
949
949
  # cannot count this one
950
950
  next
951
951
  end
952
952
 
953
953
  # array of tuples [gfpt, prep, fe]
954
- subcatframe = Array.new
954
+ subcatframe = []
955
955
 
956
956
  # each FE (but not the target itself):
957
957
  frame.each_child { |fe|
@@ -975,21 +975,21 @@ class GfInduce
975
975
 
976
976
  # store the mapping
977
977
  @gf_path_map.store_mapping(gfpt, path, syn_node, targetlemma, targetpos)
978
-
978
+
979
979
  # preposition?
980
980
  prep = @interpreter.preposition(syn_node)
981
981
  if prep
982
982
  prep.downcase!
983
983
  end
984
984
 
985
- # remember combination gfpt/prep/fe
985
+ # remember combination gfpt/prep/fe
986
986
  # as part of the subcat frame
987
- subcatframe << [gfpt, prep, fe.name()]
987
+ subcatframe << [gfpt, prep, fe.name]
988
988
  } # each syn node that the FE points to
989
989
  } # each FE of the frame
990
990
 
991
991
  # store the subcat frame
992
- @subcat_frames.store_subcatframe(subcatframe, frame.name(), targetlemma, targetpos)
992
+ @subcat_frames.store_subcatframe(subcatframe, frame.name, targetlemma, targetpos)
993
993
  } # each frame
994
994
  end
995
995
 
@@ -997,8 +997,8 @@ class GfInduce
997
997
  # finish up inducing:
998
998
  # reencode information in a fashion
999
999
  # that makes apply() faster
1000
- def compute_mapping()
1001
- @gf_path_map.finish_inducing()
1000
+ def compute_mapping
1001
+ @gf_path_map.finish_inducing
1002
1002
  end
1003
1003
 
1004
1004
  #########################################
@@ -1006,21 +1006,21 @@ class GfInduce
1006
1006
  #########################################
1007
1007
 
1008
1008
  ###
1009
- def test_output()
1010
- @gf_path_map.test_output()
1011
- @subcat_frames.test_output()
1009
+ def test_output
1010
+ @gf_path_map.test_output
1011
+ @subcat_frames.test_output
1012
1012
  end
1013
1013
 
1014
1014
  #########################################
1015
- # Restricting induced mappings
1015
+ # Restricting induced mappings
1016
1016
  # to achieve better mappings
1017
1017
  #########################################
1018
1018
 
1019
1019
  ####
1020
1020
  # restrict gf -> path mappings:
1021
1021
  # exclude all paths that include an Up edge
1022
- def restrict_to_downpaths()
1023
- @gf_path_map.restrict_to_downpaths()
1022
+ def restrict_to_downpaths
1023
+ @gf_path_map.restrict_to_downpaths
1024
1024
  end
1025
1025
 
1026
1026
  ####
@@ -1046,18 +1046,18 @@ class GfInduce
1046
1046
  ###
1047
1047
  # given a list of nodes (idea: they form a MWE together;
1048
1048
  # may of course be a single node),
1049
- # determine all subcat frames, i.e. all consistent sets of grammatical functions,
1049
+ # determine all subcat frames, i.e. all consistent sets of grammatical functions,
1050
1050
  # for the main node among the nodelist.
1051
1051
  # For each subcat frame, potential FN frames and FE labels
1052
1052
  # are returned as well
1053
1053
  #
1054
1054
  # strict: boolean. If true, return only those subcat frames that exactly match
1055
- # all GFs listed in node_to_gf. If false, also return subcat frames that
1055
+ # all GFs listed in node_to_gf. If false, also return subcat frames that
1056
1056
  # match a subset of the GFs mentioned in node_to_gf.
1057
- #
1058
1057
  #
1059
- # returns: list of tuples [frame(string), subcat frame, frequency(integer)],
1060
- # where a subcat frame is an array of tuples
1058
+ #
1059
+ # returns: list of tuples [frame(string), subcat frame, frequency(integer)],
1060
+ # where a subcat frame is an array of tuples
1061
1061
  # [gf (string), prep(string or nil), fe(string), synnodes(array:SynNode)]
1062
1062
  def apply(nodelist, # array:SynNode
1063
1063
  strict = false) # match: strict or subseteq?
@@ -1082,7 +1082,7 @@ class GfInduce
1082
1082
 
1083
1083
  return @subcat_frames.match(mainnode, lemma, pos, node_to_gf, strict)
1084
1084
  end
1085
-
1085
+
1086
1086
 
1087
1087
  #########################################
1088
1088
  #########################################
@@ -1108,8 +1108,10 @@ class GfInduce
1108
1108
  # verb? then add the voice to the POS
1109
1109
  if (voice = @interpreter.voice(mainnode))
1110
1110
  pos = pos + "-" + voice
1111
- end
1111
+ end
1112
1112
  return [mainnode, lemma, pos]
1113
1113
  end
1114
1114
 
1115
1115
  end
1116
+ end
1117
+ end