frprep 0.0.1.prealpha
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.yardopts +8 -0
 - data/CHANGELOG.rdoc +0 -0
 - data/LICENSE.rdoc +0 -0
 - data/README.rdoc +0 -0
 - data/lib/common/AbstractSynInterface.rb +1227 -0
 - data/lib/common/BerkeleyInterface.rb +375 -0
 - data/lib/common/CollinsInterface.rb +1165 -0
 - data/lib/common/ConfigData.rb +694 -0
 - data/lib/common/Counter.rb +18 -0
 - data/lib/common/DBInterface.rb +48 -0
 - data/lib/common/EnduserMode.rb +27 -0
 - data/lib/common/Eval.rb +480 -0
 - data/lib/common/FixSynSemMapping.rb +196 -0
 - data/lib/common/FrPrepConfigData.rb +66 -0
 - data/lib/common/FrprepHelper.rb +1324 -0
 - data/lib/common/Graph.rb +345 -0
 - data/lib/common/ISO-8859-1.rb +24 -0
 - data/lib/common/ML.rb +186 -0
 - data/lib/common/Maxent.rb +215 -0
 - data/lib/common/MiniparInterface.rb +1388 -0
 - data/lib/common/Optimise.rb +195 -0
 - data/lib/common/Parser.rb +213 -0
 - data/lib/common/RegXML.rb +269 -0
 - data/lib/common/RosyConventions.rb +171 -0
 - data/lib/common/SQLQuery.rb +243 -0
 - data/lib/common/STXmlTerminalOrder.rb +194 -0
 - data/lib/common/SalsaTigerRegXML.rb +2347 -0
 - data/lib/common/SalsaTigerXMLHelper.rb +99 -0
 - data/lib/common/SleepyInterface.rb +384 -0
 - data/lib/common/SynInterfaces.rb +275 -0
 - data/lib/common/TabFormat.rb +720 -0
 - data/lib/common/Tiger.rb +1448 -0
 - data/lib/common/TntInterface.rb +44 -0
 - data/lib/common/Tree.rb +61 -0
 - data/lib/common/TreetaggerInterface.rb +303 -0
 - data/lib/common/headz.rb +338 -0
 - data/lib/common/option_parser.rb +13 -0
 - data/lib/common/ruby_class_extensions.rb +310 -0
 - data/lib/fred/Baseline.rb +150 -0
 - data/lib/fred/FileZipped.rb +31 -0
 - data/lib/fred/FredBOWContext.rb +863 -0
 - data/lib/fred/FredConfigData.rb +182 -0
 - data/lib/fred/FredConventions.rb +232 -0
 - data/lib/fred/FredDetermineTargets.rb +324 -0
 - data/lib/fred/FredEval.rb +312 -0
 - data/lib/fred/FredFeatureExtractors.rb +321 -0
 - data/lib/fred/FredFeatures.rb +1061 -0
 - data/lib/fred/FredFeaturize.rb +596 -0
 - data/lib/fred/FredNumTrainingSenses.rb +27 -0
 - data/lib/fred/FredParameters.rb +402 -0
 - data/lib/fred/FredSplit.rb +84 -0
 - data/lib/fred/FredSplitPkg.rb +180 -0
 - data/lib/fred/FredTest.rb +607 -0
 - data/lib/fred/FredTrain.rb +144 -0
 - data/lib/fred/PlotAndREval.rb +480 -0
 - data/lib/fred/fred.rb +45 -0
 - data/lib/fred/md5.rb +23 -0
 - data/lib/fred/opt_parser.rb +250 -0
 - data/lib/frprep/AbstractSynInterface.rb +1227 -0
 - data/lib/frprep/Ampersand.rb +37 -0
 - data/lib/frprep/BerkeleyInterface.rb +375 -0
 - data/lib/frprep/CollinsInterface.rb +1165 -0
 - data/lib/frprep/ConfigData.rb +694 -0
 - data/lib/frprep/Counter.rb +18 -0
 - data/lib/frprep/FNCorpusXML.rb +643 -0
 - data/lib/frprep/FNDatabase.rb +144 -0
 - data/lib/frprep/FixSynSemMapping.rb +196 -0
 - data/lib/frprep/FrPrepConfigData.rb +66 -0
 - data/lib/frprep/FrameXML.rb +513 -0
 - data/lib/frprep/FrprepHelper.rb +1324 -0
 - data/lib/frprep/Graph.rb +345 -0
 - data/lib/frprep/ISO-8859-1.rb +24 -0
 - data/lib/frprep/MiniparInterface.rb +1388 -0
 - data/lib/frprep/Parser.rb +213 -0
 - data/lib/frprep/RegXML.rb +269 -0
 - data/lib/frprep/STXmlTerminalOrder.rb +194 -0
 - data/lib/frprep/SalsaTigerRegXML.rb +2347 -0
 - data/lib/frprep/SalsaTigerXMLHelper.rb +99 -0
 - data/lib/frprep/SleepyInterface.rb +384 -0
 - data/lib/frprep/SynInterfaces.rb +275 -0
 - data/lib/frprep/TabFormat.rb +720 -0
 - data/lib/frprep/Tiger.rb +1448 -0
 - data/lib/frprep/TntInterface.rb +44 -0
 - data/lib/frprep/Tree.rb +61 -0
 - data/lib/frprep/TreetaggerInterface.rb +303 -0
 - data/lib/frprep/do_parses.rb +142 -0
 - data/lib/frprep/frprep.rb +686 -0
 - data/lib/frprep/headz.rb +338 -0
 - data/lib/frprep/one_parsed_file.rb +28 -0
 - data/lib/frprep/opt_parser.rb +94 -0
 - data/lib/frprep/ruby_class_extensions.rb +310 -0
 - data/lib/rosy/AbstractFeatureAndExternal.rb +240 -0
 - data/lib/rosy/DBMySQL.rb +146 -0
 - data/lib/rosy/DBSQLite.rb +280 -0
 - data/lib/rosy/DBTable.rb +239 -0
 - data/lib/rosy/DBWrapper.rb +176 -0
 - data/lib/rosy/ExternalConfigData.rb +58 -0
 - data/lib/rosy/FailedParses.rb +130 -0
 - data/lib/rosy/FeatureInfo.rb +242 -0
 - data/lib/rosy/GfInduce.rb +1115 -0
 - data/lib/rosy/GfInduceFeature.rb +148 -0
 - data/lib/rosy/InputData.rb +294 -0
 - data/lib/rosy/RosyConfigData.rb +115 -0
 - data/lib/rosy/RosyConfusability.rb +338 -0
 - data/lib/rosy/RosyEval.rb +465 -0
 - data/lib/rosy/RosyFeatureExtractors.rb +1609 -0
 - data/lib/rosy/RosyFeaturize.rb +280 -0
 - data/lib/rosy/RosyInspect.rb +336 -0
 - data/lib/rosy/RosyIterator.rb +477 -0
 - data/lib/rosy/RosyPhase2FeatureExtractors.rb +230 -0
 - data/lib/rosy/RosyPruning.rb +165 -0
 - data/lib/rosy/RosyServices.rb +744 -0
 - data/lib/rosy/RosySplit.rb +232 -0
 - data/lib/rosy/RosyTask.rb +19 -0
 - data/lib/rosy/RosyTest.rb +826 -0
 - data/lib/rosy/RosyTrain.rb +232 -0
 - data/lib/rosy/RosyTrainingTestTable.rb +786 -0
 - data/lib/rosy/TargetsMostFrequentFrame.rb +60 -0
 - data/lib/rosy/View.rb +418 -0
 - data/lib/rosy/opt_parser.rb +379 -0
 - data/lib/rosy/rosy.rb +77 -0
 - data/lib/shalmaneser/version.rb +3 -0
 - data/test/frprep/test_opt_parser.rb +94 -0
 - data/test/functional/functional_test_helper.rb +40 -0
 - data/test/functional/sample_experiment_files/fred_test.salsa.erb +122 -0
 - data/test/functional/sample_experiment_files/fred_train.salsa.erb +135 -0
 - data/test/functional/sample_experiment_files/prp_test.salsa.erb +138 -0
 - data/test/functional/sample_experiment_files/prp_test.salsa.fred.standalone.erb +120 -0
 - data/test/functional/sample_experiment_files/prp_test.salsa.rosy.standalone.erb +120 -0
 - data/test/functional/sample_experiment_files/prp_train.salsa.erb +138 -0
 - data/test/functional/sample_experiment_files/prp_train.salsa.fred.standalone.erb +138 -0
 - data/test/functional/sample_experiment_files/prp_train.salsa.rosy.standalone.erb +138 -0
 - data/test/functional/sample_experiment_files/rosy_test.salsa.erb +257 -0
 - data/test/functional/sample_experiment_files/rosy_train.salsa.erb +259 -0
 - data/test/functional/test_fred.rb +47 -0
 - data/test/functional/test_frprep.rb +52 -0
 - data/test/functional/test_rosy.rb +20 -0
 - metadata +270 -0
 
| 
         @@ -0,0 +1,686 @@ 
     | 
|
| 
      
 1 
     | 
    
         
            +
            require 'frprep/do_parses'
         
     | 
| 
      
 2 
     | 
    
         
            +
            require 'frprep/FrprepHelper'
         
     | 
| 
      
 3 
     | 
    
         
            +
            require 'frprep/FixSynSemMapping'
         
     | 
| 
      
 4 
     | 
    
         
            +
             
     | 
| 
      
 5 
     | 
    
         
            +
            ##############################
         
     | 
| 
      
 6 
     | 
    
         
            +
            # The class that does all the work
         
     | 
| 
      
 7 
     | 
    
         
            +
            module FrPrep
         
     | 
| 
      
 8 
     | 
    
         
            +
            class FrPrep
         
     | 
| 
      
 9 
     | 
    
         
            +
             
     | 
| 
      
 10 
     | 
    
         
            +
              # Set up a preprocessing run for the given experiment.
              #
              # exp:: FrprepConfigData object; read via exp.get(<key>).
              #
              # Raises a RuntimeError when 'frprep_directory' is not set in the
              # experiment file.
              def initialize(exp)   # FrprepConfigData object
                @exp = exp

                # AB: move to FRprepOptionParser
                # The frprep internal data directory is mandatory.
                raise "Please set 'frprep_directory', the frprep internal data directory,\n" +
                      "in the experiment file." unless exp.get("frprep_directory")

                # Experiment directory: frprep internal data directory,
                # subdir according to experiment ID.
                # Previous contents are NOT removed; the cleanup stays disabled:
                # %x{rm -rf #{exp_dir}}
                File.new_dir(@exp.get("frprep_directory"),
                             @exp.get("prep_experiment_ID"))

                # Suffixes for the different types of output files.
                @file_suffixes = {
                  "lemma" => ".lemma",
                  "pos"   => ".pos",
                  "tab"   => ".tab",
                  "stxml" => ".xml"
                }
              end
         
     | 
| 
      
 32 
     | 
    
         
            +
              
         
     | 
| 
      
 33 
     | 
    
         
            +
              # Run the preprocessing pipeline configured in the experiment file.
              #
              # Starting from the experiment's 'format', the data is converted step
              # by step (BNC -> Plain -> SalsaTab -> SalsaTabWithPos -> Done, with
              # FNXml/FNCorpusXml entering at SalsaTab and SalsaTigerXML going
              # straight to Done) until the target format is reached. The target is
              # "SalsaTabWithPos" when 'tabformat_output' is set, "Done" otherwise.
              #
              # Reads the experiment entries 'format', 'directory_input',
              # 'directory_preprocessed', 'tabformat_output', 'do_parse', 'encoding'.
              #
              # Exits with status 1 if 'directory_input' or 'directory_preprocessed'
              # is missing, or if both 'tabformat_output' and 'do_parse' are set.
              # Raises a RuntimeError ("Experiment file problem") on an unknown format.
              def transform()

                # AB: Debugging.
                # Only enter the debugger when one is actually loaded; otherwise
                # running with $DEBUG set would fail with a NameError.
                debugger if $DEBUG && respond_to?(:debugger, true)

                current_format = @exp.get("format")

                # AB: move to FRprepOptionParser
                unless @exp.get("directory_input")
                  $stderr.puts "Please specify 'directory_input' in the experiment file."
                  exit 1
                end
                # AB: move to FRprepOptionParser
                unless @exp.get("directory_preprocessed")
                  $stderr.puts "Please specify 'directory_preprocessed' in the experiment file."
                  exit 1
                end

                ##
                # input and output directories.
                #
                # sanity check: output in tab format will not work
                # if we also do a parse
                if @exp.get("tabformat_output") and @exp.get("do_parse")
                  $stderr.puts "Error: Cannot do Tab format output"
                  $stderr.puts "when the input text is being parsed."
                  $stderr.puts "Please set either 'tabformat_output' or 'do_parse' to false."
                  exit 1
                end
                input_dir = File.existing_dir(@exp.get("directory_input"))
                output_dir = File.new_dir(@exp.get("directory_preprocessed"))
                # With tab output the POS/lemma step writes directly to the
                # final output directory; otherwise to an internal "split" dir.
                if @exp.get("tabformat_output")
                  split_dir = output_dir
                else
                  split_dir = frprep_dirname("split", "new")
                end

                ####
                # transform data to UTF-8

                if ["iso", "hex"].include? @exp.get("encoding")
                  # transform ISO -> UTF-8 or Hex -> UTF-8
                  # write result to encoding_dir,
                  # then set encoding_dir to be the new input_dir

                  encoding_dir = frprep_dirname("encoding", "new")
                  $stderr.puts "Frprep: Transforming  to UTF-8."
                  # NOTE(review): plain string concatenation presumably relies on
                  # the directory helpers returning paths with a trailing
                  # separator -- confirm against File.new_dir/File.existing_dir.
                  Dir[input_dir + "*"].each { |filename|
                    # not a file? then skip
                    next unless File.file? filename

                    outfilename = encoding_dir + File.basename(filename)
                    FrprepHelper.to_utf8_file(filename, outfilename, @exp.get("encoding"))
                  }

                  input_dir = encoding_dir
                end


                ####
                # transform data all the way to the output format,
                # which is SalsaTigerXML by default,
                # except when tabformat_output has been set, in which case it's
                # Tab format.
                current_dir = input_dir

                if @exp.get("tabformat_output")
                  done_format = "SalsaTabWithPos"
                else
                  done_format = "Done"
                end

                # NOTE(review): with 'tabformat_output' set and format
                # "SalsaTigerXML", current_format becomes "Done", which never
                # equals done_format, so the next iteration ends up in the
                # unknown-format branch below. Behaviour left unchanged -- confirm
                # whether that combination should be rejected up front.
                until current_format == done_format
                  case current_format

                  when "BNC"
                    # basically plain, plus some tags to be removed
                    plain_dir = frprep_dirname("plain", "new")

                    $stderr.puts "Frprep: Transforming BNC format text in #{current_dir} to plain format."
                    $stderr.puts "Storing the result in #{plain_dir}."
                    $stderr.puts "Expecting one sentence per line."

                    transform_bncformat_dir(current_dir, plain_dir)

                    current_dir = plain_dir
                    current_format = "Plain"

                  when "Plain"
                    # transform to tab format

                    tab_dir = frprep_dirname("tab", "new")

                    $stderr.puts "Frprep: Transforming plain text in #{current_dir} to SalsaTab format."
                    $stderr.puts "Storing the result in #{tab_dir}."
                    $stderr.puts "Expecting one sentence per line."

                    transform_plain_dir(current_dir, tab_dir)

                    current_dir = tab_dir
                    current_format = "SalsaTab"

                  when "FNXml"
                    # transform to tab format

                    tab_dir = frprep_dirname("tab", "new")

                    $stderr.puts "Frprep: Transforming FN data in #{current_dir} to tabular format."
                    $stderr.puts "Storing the result in " + tab_dir

                    fndata = FNDatabase.new(current_dir)
                    fndata.extract_everything(tab_dir)
                    # Multi-argument form: the path is passed to chmod verbatim
                    # instead of being interpolated into a shell command line.
                    Kernel.system("chmod", "-R", "g+rx", tab_dir)

                    current_dir = tab_dir
                    current_format = "SalsaTab"

                  when "FNCorpusXml"
                    # transform to tab format
                    tab_dir = frprep_dirname("tab", "new")

                    $stderr.puts "Frprep: Transforming FN data in #{current_dir} to tabular format."
                    $stderr.puts "Storing the result in " + tab_dir
                    # assuming that all XML files in the current directory are FN Corpus XML files
                    Dir[current_dir + "*.xml"].each { |fncorpusfilename|
                      corpus = FNCorpusXMLFile.new(fncorpusfilename)
                      tabfilename = tab_dir + File.basename(fncorpusfilename, ".xml") + ".tab"
                      # Block form closes the file even if printing raises.
                      File.open(tabfilename, "w") { |outfile|
                        corpus.print_conll_style(outfile)
                      }
                    }

                    # Multi-argument form: no shell interpolation of the path.
                    Kernel.system("chmod", "-R", "g+rx", tab_dir)
                    current_dir = tab_dir
                    current_format = "SalsaTab"

                  when "SalsaTab"
                    # lemmatize and POStag

                    $stderr.puts "Frprep: Lemmatizing and parsing text in #{current_dir}."
                    $stderr.puts "Storing the result in #{split_dir}."
                    transform_pos_and_lemmatize(current_dir, split_dir)

                    current_dir = split_dir
                    current_format = "SalsaTabWithPos"

                  when "SalsaTabWithPos"
                    # parse

                    parse_dir = frprep_dirname("parse", "new")

                    $stderr.puts "Frprep: Transforming tabular format text in #{current_dir} to SalsaTigerXML format."
                    $stderr.puts "Storing the result in #{parse_dir}."

                    transform_salsatab_dir(current_dir, parse_dir, output_dir)

                    current_dir = output_dir
                    current_format = "Done"

                  when "SalsaTigerXML"

                    parse_dir = frprep_dirname("parse", "new")
                    print "Transform parser output into stxml\n"
                    transform_stxml_dir(parse_dir, split_dir, input_dir, output_dir, @exp)
                    current_dir = output_dir
                    current_format = "Done"

                  else
                    $stderr.puts "Unknown data format #{current_format}"
                    $stderr.puts "Please check the 'format' entry in your experiment file."
                    raise "Experiment file problem"
                  end
                end

                $stderr.puts "Frprep: Done preprocessing."
              end
         
     | 
| 
      
 210 
     | 
    
         
            +
              
         
     | 
| 
      
 211 
     | 
    
         
            +
              ############################################################################3
         
     | 
| 
      
 212 
     | 
    
         
            +
              private
         
     | 
| 
      
 213 
     | 
    
         
            +
              ############################################################################3
         
     | 
| 
      
 214 
     | 
    
         
            +
             
     | 
| 
      
 215 
     | 
    
         
            +
              ###############
         
     | 
| 
      
 216 
     | 
    
         
            +
              # frprep_dirname:
         
     | 
| 
      
 217 
     | 
    
         
            +
              # make directory name for frprep-internal data
         
     | 
| 
      
 218 
     | 
    
         
            +
              # of a certain kind described in <subdir>
         
     | 
| 
      
 219 
     | 
    
         
            +
              #
         
     | 
| 
      
 220 
     | 
    
         
            +
              # frprep_directory has one subdirectory for each experiment ID,
         
     | 
| 
      
 221 
     | 
    
         
            +
              # and below that there is one subdir per subtask
         
     | 
| 
      
 222 
     | 
    
         
            +
              #
         
     | 
| 
      
 223 
     | 
    
         
            +
              # If this is a new directory, it is constructed,
         
     | 
| 
      
 224 
     | 
    
         
            +
              # if it should be an existing directory, its existence is checked.
         
     | 
| 
      
 225 
     | 
    
         
            +
              def frprep_dirname(subdir,     # string: designator of subdirectory
                                 new = nil)  # non-nil: this may be a new directory
                # Path layout: <frprep_directory>/<prep_experiment_ID>/<subdir>
                path = File.new_dir(@exp.get("frprep_directory"),
                                    @exp.get("prep_experiment_ID"),
                                    subdir)

                # NOTE(review): File.new_dir has already been called on this path
                # above regardless of +new+; if it creates the directory, the
                # File.existing_dir check presumably can never fail -- confirm
                # against the File.new_dir implementation.
                new ? File.new_dir(path) : File.existing_dir(path)
              end
         
     | 
| 
      
 239 
     | 
    
         
            +
              
         
     | 
| 
      
 240 
     | 
    
         
            +
             
     | 
| 
      
 241 
     | 
    
         
            +
             
     | 
| 
      
 242 
     | 
    
         
            +
              ###############
         
     | 
| 
      
 243 
     | 
    
         
            +
              # transform_bncformat_dir:
         
     | 
| 
      
 244 
     | 
    
         
            +
              #
         
     | 
| 
      
 245 
     | 
    
         
            +
              # transformation for BNC format:
         
     | 
| 
      
 246 
     | 
    
         
            +
              #
         
     | 
| 
      
 247 
     | 
    
         
            +
              # transform to plain format, removing <> elements
         
     | 
| 
      
 248 
     | 
    
         
            +
              def transform_bncformat_dir(input_dir, output_dir)
                # input_dir:  string, input directory
                # output_dir: string, output directory
                #
                # Every entry matched by "<input_dir>*" is handed to
                # FrprepHelper.bnc_to_plain_file together with an output path
                # of the same base name below output_dir. No suffix is added.
                # Paths are built by plain string concatenation, so no path
                # separator is inserted between directory and file name here.
                bnc_paths = Dir[input_dir + "*"]
                bnc_paths.each do |source_path|
                  target_path = output_dir + File.basename(source_path)
                  FrprepHelper.bnc_to_plain_file(source_path, target_path)
                end
              end
         
     | 
| 
      
 259 
     | 
    
         
            +
             
     | 
| 
      
 260 
     | 
    
         
            +
             
     | 
| 
      
 261 
     | 
    
         
            +
              ###############
         
     | 
| 
      
 262 
     | 
    
         
            +
              # transform_plain_dir:
         
     | 
| 
      
 263 
     | 
    
         
            +
              #
         
     | 
| 
      
 264 
     | 
    
         
            +
              # transformation for plaintext:
         
     | 
| 
      
 265 
     | 
    
         
            +
              #
         
     | 
| 
      
 266 
     | 
    
         
            +
              # transform to Tab format, separating punctuation from adjacent words
         
     | 
| 
      
 267 
     | 
    
         
            +
              def transform_plain_dir(input_dir, output_dir)
                # input_dir:  string, input directory
                # output_dir: string, output directory
                #
                # Every entry matched by "<input_dir>*" is converted with
                # FrprepHelper.plain_to_tab_file. The output file keeps the
                # input's base name and gets the "tab" suffix appended, which
                # the original author notes is currently required.
                # Paths are built by plain string concatenation.
                Dir[input_dir + "*"].each do |source_path|
                  target_path = output_dir + File.basename(source_path) + @file_suffixes["tab"]
                  FrprepHelper.plain_to_tab_file(source_path, target_path)
                end
              end
         
     | 
| 
      
 278 
     | 
    
         
            +
             
     | 
| 
      
 279 
     | 
    
         
            +
              ###############
         
     | 
| 
      
 280 
     | 
    
         
            +
              # transform_pos_and_lemmatize
         
     | 
| 
      
 281 
     | 
    
         
            +
              #
         
     | 
| 
      
 282 
     | 
    
         
            +
              # transformation for Tab format files:
         
     | 
| 
      
 283 
     | 
    
         
            +
              #
         
     | 
| 
      
 284 
     | 
    
         
            +
              # - Split into parser-size chunks
         
     | 
| 
      
 285 
     | 
    
         
            +
              # - POS-tag, lemmatize
         
     | 
| 
      
 286 
     | 
    
         
            +
              def transform_pos_and_lemmatize(input_dir,  # string: input directory
                                              output_dir) # string: output directory
                # Steps, in order:
                #  1. split the tab files of input_dir into chunks in output_dir
                #  2. POS-tag the chunks if the experiment sets "do_postag"
                #  3. lemmatize the chunks if the experiment sets "do_lemmatize"
                # Steps 2 and 3 read from and write to output_dir.
                #
                # Raises RuntimeError if a requested step lacks its settings in
                # the experiment file, or if no interface class is found for
                # the configured tool.

                ##
                # split the TabFormatFile into chunks of max_sent_num size
                FrprepHelper.split_dir(input_dir, output_dir, @file_suffixes["tab"],
                                       @exp.get("parser_max_sent_num"),
                                       @exp.get("parser_max_sent_len"))

                ##
                # POS-Tagging
                if @exp.get("do_postag")
                  $stderr.puts "Frprep: Tagging."
                  unless @exp.get("pos_tagger_path") and @exp.get("pos_tagger")
                    raise "POS-tagging: I need 'pos_tagger' and 'pos_tagger_path' in the experiment file."
                  end

                  tagger_name = @exp.get("pos_tagger")
                  sys_class = SynInterfaces.get_interface("pos_tagger", tagger_name)
                  # diagnostic output; written to stdout as in the original code
                  print "pos tagger interface: ", sys_class, "\n"
                  unless sys_class
                    # explicit message, mirroring the lemmatizer branch below,
                    # so that a misconfigured tagger name can be diagnosed
                    raise "POS-tagging: I got an empty interface class for the POS tagger '#{tagger_name}'!"
                  end
                  sys = sys_class.new(@exp.get("pos_tagger_path"),
                                      @file_suffixes["tab"],
                                      @file_suffixes["pos"])
                  sys.process_dir(output_dir, output_dir)
                end

                ##
                # Lemmatization
                if @exp.get("do_lemmatize")
                  $stderr.puts "Frprep: Lemmatizing."
                  unless @exp.get("lemmatizer_path") and @exp.get("lemmatizer")
                    raise "Lemmatization: I need 'lemmatizer' and 'lemmatizer_path' in the experiment file."
                  end

                  sys_class = SynInterfaces.get_interface("lemmatizer",
                                                          @exp.get("lemmatizer"))
                  # AB: make this exception explicit.
                  unless sys_class
                    raise 'I got a empty interface class for the lemmatizer!'
                  end
                  sys = sys_class.new(@exp.get("lemmatizer_path"),
                                      @file_suffixes["tab"],
                                      @file_suffixes["lemma"])
                  sys.process_dir(output_dir, output_dir)
                end
              end
         
     | 
| 
      
 335 
     | 
    
         
            +
             
     | 
| 
      
 336 
     | 
    
         
            +
              ###############
         
     | 
| 
      
 337 
     | 
    
         
            +
              # transform_salsatab_dir
         
     | 
| 
      
 338 
     | 
    
         
            +
              #
         
     | 
| 
      
 339 
     | 
    
         
            +
              # transformation for Tab format files:
         
     | 
| 
      
 340 
     | 
    
         
            +
              #
         
     | 
| 
      
 341 
     | 
    
         
            +
              # - parse
         
     | 
| 
      
 342 
     | 
    
         
            +
              # - Transform parser output to SalsaTigerXML
         
     | 
| 
      
 343 
     | 
    
         
            +
              #   If no parsing, make flat syntactic structure.
         
     | 
| 
      
 344 
     | 
    
         
            +
              def transform_salsatab_dir(input_dir,        # string: input directory
                                         parse_dir,        # string: output directory for parses
                                         output_dir)       # string: global output directory
                # For every parsed file delivered by DoParses, write one
                # SalsaTigerXML file "<output_dir><filename>.xml" consisting of
                # header, one entry per sentence, and footer.
                #
                # Raises RuntimeError if no interpreter class is available or
                # if an output file cannot be opened for writing.

                ##
                # (Parse and) transform to SalsaTigerXML

                # get interpretation class for this
                # parser/lemmatizer/POS tagger combination
                interpreter_class = SynInterfaces.get_interpreter_according_to_exp(@exp)
                unless interpreter_class
                  raise "Shouldn't be here"
                end

                parse_obj = DoParses.new(@exp, @file_suffixes,
                                         parse_dir,
                                         "tab_dir" => input_dir)
                parse_obj.each_parsed_file { |parsed_file_obj|

                  outfilename = output_dir + parsed_file_obj.filename + ".xml"
                  $stderr.puts "Writing #{outfilename}"
                  begin
                    outfile = File.new(outfilename, "w")
                  rescue
                    raise "Cannot write to SalsaTigerXML output file #{outfilename}"
                  end

                  # Close the handle even if processing a sentence raises.
                  # Previously the file was never closed, leaking one
                  # descriptor per parsed file.
                  begin
                    outfile.puts SalsaTigerXMLHelper.get_header()
                    # work with triples
                    # SalsaTigerSentence, FNTabSentence,
                    # hash: tab sentence index(integer) -> array:SynNode
                    parsed_file_obj.each_sentence { |st_sent, tabformat_sent, mapping|

                      # parsed: add headwords using parse tree
                      if @exp.get("do_parse")
                        FrprepHelper.add_head_attributes(st_sent, interpreter_class)
                      end

                      # add lemmas, if they are there. If they are not, don't print out a warning.
                      if @exp.get("do_lemmatize")
                        FrprepHelper.add_lemmas_from_tab(st_sent, tabformat_sent, mapping)
                      end

                      # add semantics from the tab sentence; per the original
                      # author's note, all targets of a sentence are combined
                      # into one frame
                      FrprepHelper.add_semantics_from_tab(st_sent, tabformat_sent, mapping,
                                                          interpreter_class, @exp)

                      # remove pseudo-frames from FrameNet data
                      FrprepHelper.remove_deprecated_frames(st_sent, @exp)

                      # handle multiword targets
                      FrprepHelper.handle_multiword_targets(st_sent,
                                                            interpreter_class, @exp.get("language"))

                      # handle Unknown frame names
                      FrprepHelper.handle_unknown_framenames(st_sent, interpreter_class)

                      outfile.puts st_sent.get()
                    }
                    outfile.puts SalsaTigerXMLHelper.get_footer()
                  ensure
                    outfile.close
                  end
                }
              end
         
     | 
| 
      
 409 
     | 
    
         
            +
             
     | 
| 
      
 410 
     | 
    
         
            +
              #############################################
         
     | 
| 
      
 411 
     | 
    
         
            +
              # transform_stxml
         
     | 
| 
      
 412 
     | 
    
         
            +
              # 
         
     | 
| 
      
 413 
     | 
    
         
            +
              # transformation for SalsaTigerXML data
         
     | 
| 
      
 414 
     | 
    
         
            +
              #
         
     | 
| 
      
 415 
     | 
    
         
            +
              # - If the input format was SalsaTigerXML:
         
     | 
| 
      
 416 
     | 
    
         
            +
              #   - Tag, lemmatize and parse, if the experiment file tells you so
         
     | 
| 
      
 417 
     | 
    
         
            +
              #
         
     | 
| 
      
 418 
     | 
    
         
            +
              # - If the origin is the Salsa corpus: 
         
     | 
| 
      
 419 
     | 
    
         
            +
              #   Change frame names from Unknown\d+ to lemma_Unknown\d+
         
     | 
| 
      
 420 
     | 
    
         
            +
              #
         
     | 
| 
      
 421 
     | 
    
         
            +
              # - fix multiword lemmas, or at least try
         
     | 
| 
      
 422 
     | 
    
         
            +
              # - transform to UTF 8
         
     | 
| 
      
 423 
     | 
    
         
            +
              def transform_stxml_dir(parse_dir,  # string: name of directory for parse data
         
     | 
| 
      
 424 
     | 
    
         
            +
                                      tab_dir,    # string: name of directory for split/tab data
         
     | 
| 
      
 425 
     | 
    
         
            +
                                      input_dir,  # string: name of input directory
         
     | 
| 
      
 426 
     | 
    
         
            +
                                      output_dir, # string: name of final output directory
         
     | 
| 
      
 427 
     | 
    
         
            +
                                      exp)        # FrprepConfigData
         
     | 
| 
      
 428 
     | 
    
         
            +
             
     | 
| 
      
 429 
     | 
    
         
            +
                ####
         
     | 
| 
      
 430 
     | 
    
         
            +
                # Data preparation
         
     | 
| 
      
 431 
     | 
    
         
            +
                
         
     | 
| 
      
 432 
     | 
    
         
            +
                # Data with Salsa as origin:
         
     | 
| 
      
 433 
     | 
    
         
            +
                # remember the target lemma as an attribute on the 
         
     | 
| 
      
 434 
     | 
    
         
            +
                # <target> elements
         
     | 
| 
      
 435 
     | 
    
         
            +
                #
         
     | 
| 
      
 436 
     | 
    
         
            +
                # currently deactivated: encoding problems
         
     | 
| 
      
 437 
     | 
    
         
            +
                #     if @exp.get("origin") == "SalsaTiger"
         
     | 
| 
      
 438 
     | 
    
         
            +
                #       $stderr.puts "Frprep: noting target lemmas"
         
     | 
| 
      
 439 
     | 
    
         
            +
                #       changed_input_dir = frprep_dirname("salsalemma", "new") 
         
     | 
| 
      
 440 
     | 
    
         
            +
                #       FrprepHelper.note_salsa_targetlemmas(input_dir, changed_input_dir)     
         
     | 
| 
      
 441 
     | 
    
         
            +
                
         
     | 
| 
      
 442 
     | 
    
         
            +
                #       # remember changed input dir as input dir
         
     | 
| 
      
 443 
     | 
    
         
            +
                #       input_dir = changed_input_dir
         
     | 
| 
      
 444 
     | 
    
         
            +
                #     end
         
     | 
| 
      
 445 
     | 
    
         
            +
                
         
     | 
| 
      
 446 
     | 
    
         
            +
                #  If data is to be parsed, split and tabify input files
         
     | 
| 
      
 447 
     | 
    
         
            +
                #    else copy data to stxml_indir.
         
     | 
| 
      
 448 
     | 
    
         
            +
                
         
     | 
| 
      
 449 
     | 
    
         
            +
                # stxml_dir: directory where SalsaTiger data is situated
         
     | 
| 
      
 450 
     | 
    
         
            +
                if @exp.get("do_parse")
         
     | 
| 
      
 451 
     | 
    
         
            +
                  # split data
         
     | 
| 
      
 452 
     | 
    
         
            +
                  stxml_splitdir = frprep_dirname("stxml_split", "new")
         
     | 
| 
      
 453 
     | 
    
         
            +
                  stxml_dir = stxml_splitdir
         
     | 
| 
      
 454 
     | 
    
         
            +
             
     | 
| 
      
 455 
     | 
    
         
            +
                  $stderr.puts "Frprep: splitting data"
         
     | 
| 
      
 456 
     | 
    
         
            +
                  FrprepHelper.stxml_split_dir(input_dir, stxml_splitdir, 
         
     | 
| 
      
 457 
     | 
    
         
            +
            				   @exp.get("parser_max_sent_num"), 
         
     | 
| 
      
 458 
     | 
    
         
            +
            				   @exp.get("parser_max_sent_len"))
         
     | 
| 
      
 459 
     | 
    
         
            +
                else
         
     | 
| 
      
 460 
     | 
    
         
            +
                  # no parsing: copy data to split dir
         
     | 
| 
      
 461 
     | 
    
         
            +
                  stxml_dir = parse_dir
         
     | 
| 
      
 462 
     | 
    
         
            +
                  $stderr.puts "Frprep: Copying data to #{stxml_dir}"
         
     | 
| 
      
 463 
     | 
    
         
            +
                  Dir[input_dir + "*.xml"].each { |filename|
         
     | 
| 
      
 464 
     | 
    
         
            +
                    `cp #{filename} #{stxml_dir}#{File.basename(filename)}`
         
     | 
| 
      
 465 
     | 
    
         
            +
                  }
         
     | 
| 
      
 466 
     | 
    
         
            +
                end
         
     | 
| 
      
 467 
     | 
    
         
            +
             
     | 
| 
      
 468 
     | 
    
         
            +
                # Some syntactic processing will take place:
         
     | 
| 
      
 469 
     | 
    
         
            +
                # tabify data
         
     | 
| 
      
 470 
     | 
    
         
            +
                if @exp.get("do_parse") or @exp.get("do_lemmatize") or @exp.get("do_postag")      
         
     | 
| 
      
 471 
     | 
    
         
            +
                  $stderr.puts "Frprep: making input for syn. processing" 
         
     | 
| 
      
 472 
     | 
    
         
            +
                  
         
     | 
| 
      
 473 
     | 
    
         
            +
                  Dir[stxml_dir+"*"+@file_suffixes["stxml"]].each { |stxmlfilename|
         
     | 
| 
      
 474 
     | 
    
         
            +
                    
         
     | 
| 
      
 475 
     | 
    
         
            +
                    tabfilename = tab_dir + File.basename(stxmlfilename,@file_suffixes["stxml"]) + @file_suffixes["tab"]
         
     | 
| 
      
 476 
     | 
    
         
            +
                    FrprepHelper.stxml_to_tab_file(stxmlfilename, tabfilename, exp)
         
     | 
| 
      
 477 
     | 
    
         
            +
                  }
         
     | 
| 
      
 478 
     | 
    
         
            +
                end
         
     | 
| 
      
 479 
     | 
    
         
            +
                
         
     | 
| 
      
 480 
     | 
    
         
            +
                ###
         
     | 
| 
      
 481 
     | 
    
         
            +
                # POS-tagging
         
     | 
| 
      
 482 
     | 
    
         
            +
                if @exp.get("do_postag")
         
     | 
| 
      
 483 
     | 
    
         
            +
                  $stderr.puts "Frprep: Tagging."
         
     | 
| 
      
 484 
     | 
    
         
            +
                  unless @exp.get("pos_tagger_path") and @exp.get("pos_tagger")
         
     | 
| 
      
 485 
     | 
    
         
            +
            	raise "POS-tagging: I need 'pos_tagger' and 'pos_tagger_path' in the experiment file."
         
     | 
| 
      
 486 
     | 
    
         
            +
                  end
         
     | 
| 
      
 487 
     | 
    
         
            +
             
     | 
| 
      
 488 
     | 
    
         
            +
                  sys_class = SynInterfaces.get_interface("pos_tagger", 
         
     | 
| 
      
 489 
     | 
    
         
            +
            					      @exp.get("pos_tagger"))
         
     | 
| 
      
 490 
     | 
    
         
            +
                  unless sys_class
         
     | 
| 
      
 491 
     | 
    
         
            +
                    raise "Shouldn't be here"
         
     | 
| 
      
 492 
     | 
    
         
            +
                  end
         
     | 
| 
      
 493 
     | 
    
         
            +
                  sys = sys_class.new(@exp.get("pos_tagger_path"),
         
     | 
| 
      
 494 
     | 
    
         
            +
            			  @file_suffixes["tab"],
         
     | 
| 
      
 495 
     | 
    
         
            +
            			  @file_suffixes["pos"])
         
     | 
| 
      
 496 
     | 
    
         
            +
                  sys.process_dir(tab_dir, tab_dir)
         
     | 
| 
      
 497 
     | 
    
         
            +
                end
         
     | 
| 
      
 498 
     | 
    
         
            +
             
     | 
| 
      
 499 
     | 
    
         
            +
                ###
         
     | 
| 
      
 500 
     | 
    
         
            +
                # Lemmatization
         
     | 
| 
      
 501 
     | 
    
         
            +
                if @exp.get("do_lemmatize")
         
     | 
| 
      
 502 
     | 
    
         
            +
                  $stderr.puts "Frprep: Lemmatizing."
         
     | 
| 
      
 503 
     | 
    
         
            +
                  unless @exp.get("lemmatizer_path") and @exp.get("lemmatizer")
         
     | 
| 
      
 504 
     | 
    
         
            +
            	raise "Lemmatization: I need 'lemmatizer' and 'lemmatizer_path' in the experiment file."
         
     | 
| 
      
 505 
     | 
    
         
            +
                  end
         
     | 
| 
      
 506 
     | 
    
         
            +
             
     | 
| 
      
 507 
     | 
    
         
            +
                  sys_class = SynInterfaces.get_interface("lemmatizer", 
         
     | 
| 
      
 508 
     | 
    
         
            +
            					      @exp.get("lemmatizer"))
         
     | 
| 
      
 509 
     | 
    
         
            +
                  unless sys_class
         
     | 
| 
      
 510 
     | 
    
         
            +
                    raise "Shouldn't be here"
         
     | 
| 
      
 511 
     | 
    
         
            +
                  end
         
     | 
| 
      
 512 
     | 
    
         
            +
                  sys = sys_class.new(@exp.get("lemmatizer_path"),
         
     | 
| 
      
 513 
     | 
    
         
            +
            			  @file_suffixes["tab"],
         
     | 
| 
      
 514 
     | 
    
         
            +
            			  @file_suffixes["lemma"])
         
     | 
| 
      
 515 
     | 
    
         
            +
                  sys.process_dir(tab_dir, tab_dir)
         
     | 
| 
      
 516 
     | 
    
         
            +
                end
         
     | 
| 
      
 517 
     | 
    
         
            +
             
     | 
| 
      
 518 
     | 
    
         
            +
                ###
         
     | 
| 
      
 519 
     | 
    
         
            +
                # Parsing, production of SalsaTigerXML output
         
     | 
| 
      
 520 
     | 
    
         
            +
             
     | 
| 
      
 521 
     | 
    
         
            +
                # get interpretation class for this 
         
     | 
| 
      
 522 
     | 
    
         
            +
                # parser/lemmatizer/POS tagger combination
         
     | 
| 
      
 523 
     | 
    
         
            +
                sys_class_names = Hash.new
         
     | 
| 
      
 524 
     | 
    
         
            +
                [["do_postag", "pos_tagger"],
         
     | 
| 
      
 525 
     | 
    
         
            +
                  ["do_lemmatize", "lemmatizer"],
         
     | 
| 
      
 526 
     | 
    
         
            +
                  ["do_parse", "parser"]].each { |service, system_name|
         
     | 
| 
      
 527 
     | 
    
         
            +
                  if @exp.get(service)  # yes, perform this service
         
     | 
| 
      
 528 
     | 
    
         
            +
            	sys_class_names[system_name] = @exp.get(system_name)
         
     | 
| 
      
 529 
     | 
    
         
            +
                  end
         
     | 
| 
      
 530 
     | 
    
         
            +
                }
         
     | 
| 
      
 531 
     | 
    
         
            +
                interpreter_class = SynInterfaces.get_interpreter(sys_class_names)
         
     | 
| 
      
 532 
     | 
    
         
            +
                unless interpreter_class
         
     | 
| 
      
 533 
     | 
    
         
            +
                  raise "Shouldn't be here"
         
     | 
| 
      
 534 
     | 
    
         
            +
                end
         
     | 
| 
      
 535 
     | 
    
         
            +
             
     | 
| 
      
 536 
     | 
    
         
            +
                parse_obj = DoParses.new(@exp, @file_suffixes,
         
     | 
| 
      
 537 
     | 
    
         
            +
            			     parse_dir, 
         
     | 
| 
      
 538 
     | 
    
         
            +
            			     "tab_dir" => tab_dir,
         
     | 
| 
      
 539 
     | 
    
         
            +
                                         "stxml_dir" => stxml_dir)
         
     | 
| 
      
 540 
     | 
    
         
            +
                parse_obj.each_parsed_file { |parsed_file_obj|
         
     | 
| 
      
 541 
     | 
    
         
            +
                  outfilename = output_dir + parsed_file_obj.filename + ".xml"
         
     | 
| 
      
 542 
     | 
    
         
            +
                  $stderr.puts "Writing #{outfilename}"
         
     | 
| 
      
 543 
     | 
    
         
            +
                  begin
         
     | 
| 
      
 544 
     | 
    
         
            +
                    outfile = File.new(outfilename, "w")
         
     | 
| 
      
 545 
     | 
    
         
            +
                  rescue
         
     | 
| 
      
 546 
     | 
    
         
            +
                    raise "Cannot write to SalsaTigerXML output file #{outfilename}"
         
     | 
| 
      
 547 
     | 
    
         
            +
                  end
         
     | 
| 
      
 548 
     | 
    
         
            +
             
     | 
| 
      
 549 
     | 
    
         
            +
             
     | 
| 
      
 550 
     | 
    
         
            +
                  if @exp.get("do_parse")
         
     | 
| 
      
 551 
     | 
    
         
            +
                    # read old SalsaTigerXML file
         
     | 
| 
      
 552 
     | 
    
         
            +
                    # so we can integrate the old file's semantics later
         
     | 
| 
      
 553 
     | 
    
         
            +
                    oldxml = Array.new # array of sentence strings
         
     | 
| 
      
 554 
     | 
    
         
            +
                    # we assume that the old and the new file have the same name,
         
     | 
| 
      
 555 
     | 
    
         
            +
                    # ending in .xml.
         
     | 
| 
      
 556 
     | 
    
         
            +
                    oldxmlfile = FilePartsParser.new(stxml_dir + parsed_file_obj.filename + ".xml")
         
     | 
| 
      
 557 
     | 
    
         
            +
                    oldxmlfile.scan_s { |sent_string|
         
     | 
| 
      
 558 
     | 
    
         
            +
                      # remember this sentence by its ID
         
     | 
| 
      
 559 
     | 
    
         
            +
                      oldxml << sent_string
         
     | 
| 
      
 560 
     | 
    
         
            +
                    }
         
     | 
| 
      
 561 
     | 
    
         
            +
                  end       
         
     | 
| 
      
 562 
     | 
    
         
            +
             
     | 
| 
      
 563 
     | 
    
         
            +
                  outfile.puts SalsaTigerXMLHelper.get_header()
         
     | 
| 
      
 564 
     | 
    
         
            +
                  index = 0
         
     | 
| 
      
 565 
     | 
    
         
            +
                  # work with triples
         
     | 
| 
      
 566 
     | 
    
         
            +
                  # SalsaTigerSentence, FNTabSentence,
         
     | 
| 
      
 567 
     | 
    
         
            +
                  # hash: tab sentence index(integer) -> array:SynNode
         
     | 
| 
      
 568 
     | 
    
         
            +
                  parsed_file_obj.each_sentence { |st_sent, tabformat_sent, mapping|
         
     | 
| 
      
 569 
     | 
    
         
            +
             
     | 
| 
      
 570 
     | 
    
         
            +
                    # parsed? then integrate semantics and lemmas from old file
         
     | 
| 
      
 571 
     | 
    
         
            +
                    if @exp.get("do_parse")
         
     | 
| 
      
 572 
     | 
    
         
            +
                      oldsent_string = oldxml[index]
         
     | 
| 
      
 573 
     | 
    
         
            +
                      index += 1
         
     | 
| 
      
 574 
     | 
    
         
            +
                      if oldsent_string
         
     | 
| 
      
 575 
     | 
    
         
            +
             
     | 
| 
      
 576 
     | 
    
         
            +
                        # modified by ines, 27/08/08
         
     | 
| 
      
 577 
     | 
    
         
            +
                        # for Berkeley => substitute ( ) for *LRB* *RRB*
         
     | 
| 
      
 578 
     | 
    
         
            +
                        if exp.get("parser") == "berkeley"
         
     | 
| 
      
 579 
     | 
    
         
            +
                          oldsent_string.gsub!(/word='\('/, "word='*LRB*'")
         
     | 
| 
      
 580 
     | 
    
         
            +
                          oldsent_string.gsub!(/word='\)'/, "word='*RRB*'")
         
     | 
| 
      
 581 
     | 
    
         
            +
                          oldsent_string.gsub!(/word=\"\(\"/, "word='*LRB*'")
         
     | 
| 
      
 582 
     | 
    
         
            +
                          oldsent_string.gsub!(/word=\"\)\"/, "word='*RRB*'")
         
     | 
| 
      
 583 
     | 
    
         
            +
                        end
         
     | 
| 
      
 584 
     | 
    
         
            +
             
     | 
| 
      
 585 
     | 
    
         
            +
                        # we have both an old and a new sentence, so integrate semantics
         
     | 
| 
      
 586 
     | 
    
         
            +
                        oldsent = SalsaTigerSentence.new(oldsent_string)
         
     | 
| 
      
 587 
     | 
    
         
            +
            	if st_sent.nil?
         
     | 
| 
      
 588 
     | 
    
         
            +
            		next
         
     | 
| 
      
 589 
     | 
    
         
            +
            	end
         
     | 
| 
      
 590 
     | 
    
         
            +
                      if ( FrprepHelper.integrate_stxml_semantics_and_lemmas(oldsent, st_sent, interpreter_class, @exp) == false)
         
     | 
| 
      
 591 
     | 
    
         
            +
            		#print "FALSE \n";
         
     | 
| 
      
 592 
     | 
    
         
            +
            		#print oldsent, "\n", st_sent, "\n\n";
         
     | 
| 
      
 593 
     | 
    
         
            +
             
     | 
| 
      
 594 
     | 
    
         
            +
                  	    	oldsent_string = oldxml[index]
         
     | 
| 
      
 595 
     | 
    
         
            +
                    	index += 1
         
     | 
| 
      
 596 
     | 
    
         
            +
                      	if oldsent_string
         
     | 
| 
      
 597 
     | 
    
         
            +
             
     | 
| 
      
 598 
     | 
    
         
            +
                        	# modified by ines, 27/08/08
         
     | 
| 
      
 599 
     | 
    
         
            +
                        	# for Berkeley => substitute ( ) for *LRB* *RRB*
         
     | 
| 
      
 600 
     | 
    
         
            +
                        	if exp.get("parser") == "berkeley"
         
     | 
| 
      
 601 
     | 
    
         
            +
                        	  oldsent_string.gsub!(/word='\('/, "word='*LRB*'")
         
     | 
| 
      
 602 
     | 
    
         
            +
                        	  oldsent_string.gsub!(/word='\)'/, "word='*RRB*'")
         
     | 
| 
      
 603 
     | 
    
         
            +
                        	  oldsent_string.gsub!(/word=\"\(\"/, "word='*LRB*'")
         
     | 
| 
      
 604 
     | 
    
         
            +
                        	  oldsent_string.gsub!(/word=\"\)\"/, "word='*RRB*'")
         
     | 
| 
      
 605 
     | 
    
         
            +
                        	end
         
     | 
| 
      
 606 
     | 
    
         
            +
             
     | 
| 
      
 607 
     | 
    
         
            +
                        	# we have both an old and a new sentence, so integrate semantics
         
     | 
| 
      
 608 
     | 
    
         
            +
                        	oldsent = SalsaTigerSentence.new(oldsent_string)
         
     | 
| 
      
 609 
     | 
    
         
            +
                            #print oldsent, "\n", st_sent, "\n\n";
         
     | 
| 
      
 610 
     | 
    
         
            +
            		FrprepHelper.integrate_stxml_semantics_and_lemmas(oldsent, st_sent, interpreter_class, @exp) 
         
     | 
| 
      
 611 
     | 
    
         
            +
            	
         
     | 
| 
      
 612 
     | 
    
         
            +
            		end
         
     | 
| 
      
 613 
     | 
    
         
            +
            	  #else
         
     | 
| 
      
 614 
     | 
    
         
            +
            			#print "TRUE\n";
         
     | 
| 
      
 615 
     | 
    
         
            +
            			#print oldsent, "\n", st_sent, "\n\n";
         
     | 
| 
      
 616 
     | 
    
         
            +
             	  end
         
     | 
| 
      
 617 
     | 
    
         
            +
                    else
         
     | 
| 
      
 618 
     | 
    
         
            +
                        # no corresponding old sentence for this new sentence
         
     | 
| 
      
 619 
     | 
    
         
            +
                        $stderr.puts "Warning: transporting semantics -- missing source sentence, skipping"
         
     | 
| 
      
 620 
     | 
    
         
            +
                      end
         
     | 
| 
      
 621 
     | 
    
         
            +
                    end
         
     | 
| 
      
 622 
     | 
    
         
            +
             
     | 
| 
      
 623 
     | 
    
         
            +
                    # remove pseudo-frames from FrameNet data
         
     | 
| 
      
 624 
     | 
    
         
            +
                    FrprepHelper.remove_deprecated_frames(st_sent, @exp)
         
     | 
| 
      
 625 
     | 
    
         
            +
             
     | 
| 
      
 626 
     | 
    
         
            +
                    # repair syn/sem mapping problems?
         
     | 
| 
      
 627 
     | 
    
         
            +
                    if @exp.get("fe_syn_repair") or @exp.get("fe_rel_repair")
         
     | 
| 
      
 628 
     | 
    
         
            +
                      FixSynSemMapping.fixit(st_sent, @exp, interpreter_class)
         
     | 
| 
      
 629 
     | 
    
         
            +
                    end
         
     | 
| 
      
 630 
     | 
    
         
            +
             
     | 
| 
      
 631 
     | 
    
         
            +
                    outfile.puts st_sent.get()
         
     | 
| 
      
 632 
     | 
    
         
            +
                  } # each ST sentence
         
     | 
| 
      
 633 
     | 
    
         
            +
                  outfile.puts SalsaTigerXMLHelper.get_footer()
         
     | 
| 
      
 634 
     | 
    
         
            +
                } # each file parsed
         
     | 
| 
      
 635 
     | 
    
         
            +
              end
         
     | 
| 
      
 636 
     | 
    
         
            +
             
     | 
| 
      
 637 
     | 
    
         
            +
             
     | 
| 
      
 638 
     | 
    
         
            +
              ###################################
         
     | 
| 
      
 639 
     | 
    
         
            +
              # general file iterators
         
     | 
| 
      
 640 
     | 
    
         
            +
             
     | 
| 
      
 641 
     | 
    
         
            +
              # yields pairs of [infile name, outfile stream]
         
     | 
| 
      
 642 
     | 
    
         
            +
              def change_each_file_in_dir(dir,       # string: directory name (used as a glob prefix, so it should end in "/")
                                          suffix)    # string: filename pattern, e.g. "*.xml"
                # Rewrite every file in dir whose name matches "*<suffix>".
                #
                # For each matching file, yields the pair [filename, tempfile]:
                # the caller reads the file called filename and writes the new
                # content to tempfile (an open Tempfile). Afterwards the original
                # is copied to "<filename>.bak" and the temp file is moved into
                # its place.
                #
                # Backup and replacement use FileUtils rather than shell commands,
                # so filenames containing spaces or shell metacharacters are
                # handled correctly and a failed copy/move raises instead of
                # passing silently.
                require 'fileutils'

                Dir[dir + "*#{suffix}"].each { |filename|
                  tempfile = Tempfile.new("FrprepHelper")
                  begin
                    yield [filename, tempfile]

                    # flush the new content before moving it into place
                    tempfile.close()

                    # keep a backup of the original, then replace it
                    FileUtils.cp(filename, filename + ".bak")
                    FileUtils.mv(tempfile.path(), filename)
                  ensure
                    # always release the temp file, even if the caller's block
                    # or the copy/move raised; unlinking a file that was
                    # already moved away is harmless
                    tempfile.close(true)
                  end
                } # each file
              end
         
     | 
| 
      
 655 
     | 
    
         
            +
             
     | 
| 
      
 656 
     | 
    
         
            +
              #######
         
     | 
| 
      
 657 
     | 
    
         
            +
              # change_each_stxml_file_in_dir
         
     | 
| 
      
 658 
     | 
    
         
            +
              #
         
     | 
| 
      
 659 
     | 
    
         
            +
              # use change_each_file_in_dir, but assume that the files
         
     | 
| 
      
 660 
     | 
    
         
            +
              # are SalsaTigerXML files: Keep file headers and footers,
         
     | 
| 
      
 661 
     | 
    
         
            +
              # and just offer individual sentences for changing
         
     | 
| 
      
 662 
     | 
    
         
            +
              #
         
     | 
| 
      
 663 
     | 
    
         
            +
              # Yields SalsaTigerSentence objects, each sentence to be changed
         
     | 
| 
      
 664 
     | 
    
         
            +
              def change_each_stxml_file_in_dir(dir)            # string: directory name
                # Rewrite every "*.xml" file in dir via change_each_file_in_dir,
                # treating each as a SalsaTigerXML file: the file's head and tail
                # are copied through unchanged, and each sentence is yielded as a
                # SalsaTigerSentence object for the caller to modify in place.
                # After the caller's block returns, the sentence is written out
                # via its get() method.

                change_each_file_in_dir(dir, "*.xml") { |stfilename, tf|
                  infile = FilePartsParser.new(stfilename)
                  begin
                    # write header
                    tf.puts infile.head()

                    # iterate through sentences, yield as SalsaTigerSentence objects
                    infile.scan_s() { |sent_string|
                      sent = SalsaTigerSentence.new(sent_string)
                      yield sent
                      # write changed sentence
                      tf.puts sent.get()
                    } # each sentence

                    # write footer
                    tf.puts infile.tail()
                  ensure
                    # close the input file even if the caller's block or the
                    # sentence parsing raised
                    infile.close()
                  end
                }
              end
         
     | 
| 
      
 685 
     | 
    
         
            +
            end
         
     | 
| 
      
 686 
     | 
    
         
            +
            end
         
     |