RubyGems - shalmaneser-fred - Versions diffs - 1.2.0.rc4 → 1.2.rc5 - Mend

shalmaneser-fred 1.2.0.rc4 → 1.2.rc5

Files changed (68) hide show

checksums.yaml +4 -4
data/README.md +47 -18
data/bin/fred +8 -3
data/lib/fred/FredConventions.rb +190 -189
data/lib/fred/abstract_context_provider.rb +246 -0
data/lib/fred/abstract_fred_feature_access.rb +43 -0
data/lib/fred/answer_key_access.rb +130 -0
data/lib/fred/aux_keep_writers.rb +94 -0
data/lib/fred/baseline.rb +153 -0
data/lib/fred/context_provider.rb +55 -0
data/lib/fred/feature_extractors/fred_context_feature_extractor.rb +48 -0
data/lib/fred/feature_extractors/fred_context_pos_feature_extractor.rb +48 -0
data/lib/fred/feature_extractors/fred_feature_extractor.rb +50 -0
data/lib/fred/feature_extractors/fred_ngram_feature_extractor.rb +65 -0
data/lib/fred/feature_extractors/fred_syn_feature_extractor.rb +33 -0
data/lib/fred/feature_extractors/fred_synsem_feature_extractor.rb +32 -0
data/lib/fred/feature_extractors.rb +5 -0
data/lib/fred/file_zipped.rb +43 -0
data/lib/fred/find_all_targets.rb +94 -0
data/lib/fred/find_targets_from_frames.rb +92 -0
data/lib/fred/fred.rb +43 -40
data/lib/fred/fred_error.rb +15 -0
data/lib/fred/fred_eval.rb +311 -0
data/lib/fred/fred_feature_access.rb +420 -0
data/lib/fred/fred_feature_info.rb +56 -0
data/lib/fred/fred_featurize.rb +525 -0
data/lib/fred/fred_parameters.rb +190 -0
data/lib/fred/fred_split.rb +86 -0
data/lib/fred/fred_split_pkg.rb +189 -0
data/lib/fred/fred_test.rb +571 -0
data/lib/fred/fred_train.rb +125 -0
data/lib/fred/grammatical_function_access.rb +63 -0
data/lib/fred/md5.rb +6 -0
data/lib/fred/meta_feature_access.rb +185 -0
data/lib/fred/non_contiguous_context_provider.rb +532 -0
data/lib/fred/opt_parser.rb +182 -161
data/lib/fred/plot_and_r_eval.rb +486 -0
data/lib/fred/single_sent_context_provider.rb +76 -0
data/lib/fred/slide_var.rb +148 -0
data/lib/fred/targets.rb +136 -0
data/lib/fred/toggle_var.rb +61 -0
data/lib/fred/word_lemma_pos_ne.rb +51 -0
data/lib/fred/write_features_binary.rb +95 -0
data/lib/fred/write_features_nary.rb +51 -0
data/lib/fred/write_features_nary_or_binary.rb +51 -0
data/lib/shalmaneser/fred.rb +1 -0
metadata +57 -30
data/lib/fred/Baseline.rb +0 -150
data/lib/fred/FileZipped.rb +0 -31
data/lib/fred/FredBOWContext.rb +0 -877
data/lib/fred/FredDetermineTargets.rb +0 -319
data/lib/fred/FredEval.rb +0 -312
data/lib/fred/FredFeatureExtractors.rb +0 -322
data/lib/fred/FredFeatures.rb +0 -1061
data/lib/fred/FredFeaturize.rb +0 -602
data/lib/fred/FredNumTrainingSenses.rb +0 -27
data/lib/fred/FredParameters.rb +0 -402
data/lib/fred/FredSplit.rb +0 -84
data/lib/fred/FredSplitPkg.rb +0 -180
data/lib/fred/FredTest.rb +0 -606
data/lib/fred/FredTrain.rb +0 -144
data/lib/fred/PlotAndREval.rb +0 -480
data/lib/fred/fred_config_data.rb +0 -185
data/test/frprep/test_opt_parser.rb +0 -94
data/test/functional/functional_test_helper.rb +0 -58
data/test/functional/test_fred.rb +0 -47
data/test/functional/test_frprep.rb +0 -99
data/test/functional/test_rosy.rb +0 -40

data/lib/fred/plot_and_r_eval.rb ADDED Viewed

@@ -0,0 +1,486 @@
+require "tempfile"
+require "StandardPkgExtensions"
+module Shalmaneser
+  module Fred
+    # @todo Investigate where this module is used.
+    module PlotAndREval
+      ############
+      # given a set of mappings x_axis_value -> y_axis_value,
+      # plot them all within the same gnuplot graph
+      #
+      # scores:
+      # either hash: score_label(string) -> hash x_axis(float) -> y_axis(float)
+      # or hash: score_label(string) -> array [x_axis(float), y_axis(float)]
+      def self.gnuplot_direct(scores,
+                              title,      # string: title for output files
+                              x_name,     # string: label for x axis
+                              y_name,     # string: label for y axis
+                              plotoutfile, # string: name of gnuplot output file
+                              data_style = "linespoints") # data style
+        # for each score label: write x_axis/y_axis pairs to a separate tempfile
+        score_file = {}
+        scores.each_pair { |score_label, score_values|
+          score_file[score_label] = Tempfile.new("PlotAndREval")
+          score_values.to_a.sort { |a, b| a.first <=> b.first }.each do |x_val, y_val|
+            score_file[score_label].puts "#{x_val} #{y_val}"
+          end
+          score_file[score_label].close
+        }
+        # write command file for gnuplot
+        gf = Tempfile.new("PlotAndREval")
+        gf.puts "set title \"" + title + "\""
+        gf.puts "set ylabel \"" + y_name + "\""
+        gf.puts "set xlabel \"" + x_name + "\""
+        gf.puts "set time"
+        gf.puts "set data style " + data_style
+        gf.puts "set grid"
+        gf.puts "set output \"" + plotoutfile + "\""
+        gf.puts "set terminal postscript color"
+        gf.print "plot "
+        gf.puts score_file.to_a.map { |score_label, tempfile|
+          # plot "<filename>" using "<title>", "<filename>" using "<title>",...
+          "\"" + tempfile.path + "\"" + " title \"" + score_label + "\""
+        }.join(", ")
+        # finalize tempfile
+        gf.close
+        %x{gnuplot #{gf.path}}
+      end
+      #################
+      # Given a list of pairs [x, y],
+      # group them into N bins (by splitting the range from min score to max score)
+      # compute the average y for each x bin, and plot
+      def self.gnuplot_average(scores, # array of pairs [x(float), y(float)
+                               title,  # string: title for output file
+                               x_label, # label for x axis
+                               y_label, # label for y axis
+                               plotoutfile, # string: name of gnuplot output file
+                               min_value, # float: minimum value
+                               bin_size) # float: size of one bin
+        # sort scores into bins
+        bin = {}
+        scores.each { |xval, yval|
+          bin_no = (xval - min_value / bin_size).floor
+          unless bin[bin_no]
+            bin[bin_no] = []
+          end
+          bin[bin_no] << yval
+        }
+        # print average for each bin to temp infile for gnuplot
+        tf = Tempfile.new("plot_and_r")
+        bin.keys.sort.each do |bin_no|
+          if bin[bin_no].length > 0
+            avg = (bin[bin_no].big_sum(0.0) { |yval| yval }) / bin[bin_no].length.to_f
+          else
+            avg = 0.0
+          end
+          val = min_value + (bin_no.to_f * bin_size)
+          tf.print val, "\t", avg, "\n"
+        end
+        tf.close
+        # make gnuplot main infile
+        gf = Tempfile.new("plot_and_r")
+        gf.puts "set title \"#{title}\""
+        gf.puts "set ylabel \"#{y_label}\""
+        gf.puts "set xlabel \"#{x_label}\""
+        gf.puts "set time"
+        gf.puts "set data style linespoints"
+        gf.puts "set grid"
+        gf.puts "set output \"" + plotoutfile + "\""
+        gf.puts "set terminal postscript color"
+        gf.print "plot \"#{tf.path}\" title \"#{y_label}\""
+        gf.puts
+        gf.puts
+        gf.close
+        # now gnuplot it
+        %x{gnuplot #{gf.path}}
+        # and remove temp files
+        tf.close(true)
+        gf.close(true)
+      end
+      #################
+      # given a mapping from labels to scores,
+      # split the range from min. score to max. score into
+      # 20 bins, sort the label/score pairs into the bins,
+      # and gnuplot them as a bar graph of 20 bars.
+      #
+      # A title for the graph must be given, and a
+      # name for the gnuplot output file.
+      # If the name of a text output file is given,
+      # the result is also printed as text.
+      #
+      # If minvalue and maxvalue are given, they are used
+      # as start and end of the scale instead of the
+      # min. and max. values from the scores hash.
+      def self.gnuplot_quantity_chart(scores, # hash:label(string) -> value(float), label->score-mapping
+                                      title,  # string: title for output files
+                                      score_name, # string: what are the scores? (label for y axis)
+                                      plotoutfile, # string: name of gnuplot output file
+                                      textoutfile = nil, # string: name of text output file
+                                      minvalue=nil, # float: minimum value for y axis
+                                      maxvalue=nil) # float: maximum value for y axis
+        # group scores in 20 subgroups
+        # first determine minimum, maximum score, single interval
+        if minvalue.nil?
+          # @todo AB: Change this to the constant.
+          minvalue = 1.0/0.0 # infinity
+          scores.values.each do |score|
+            minvalue = [score, minvalue].min
+          end
+        end
+        if maxvalue.nil?
+          # @todo AB: Change this to the constant.
+          maxvalue = -1.0/0.0 # -infinity
+          scores.values.each do |score|
+            maxvalue = [score, maxvalue].max
+          end
+        end
+        interval = (maxvalue - minvalue) / 20.0
+        # now compute the number of scores in each interval
+        num_in_range = Hash.new(0)
+        scores.each_pair { |label, score|
+          num = (score / interval).floor
+          num_in_range[num] += 1
+        }
+        # open output files:
+        # text output, temp files for gnuplot
+        if textoutfile
+          textout = File.new(textoutfile, "w")
+          # document number of scores in each range
+          # to text outfile
+          textout.puts "-------------------------"
+          textout.puts title
+          textout.puts "-------------------------"
+          num_in_range.keys.sort.each { |rangeno|
+            range_lower = interval * rangeno.to_f
+            textout.print "number of values btw. ", sprintf("%.2f", range_lower),
+                          " and ", sprintf("%.2f", range_lower + interval), ": ",
+                          num_in_range[rangeno], "\n"
+          }
+          textout.close
+        end
+        # document number of scores in each range
+        # to temp. infile for gnuplot
+        tf = Tempfile.new("plot_and_r")
+        0.upto(19) { |rangeno|
+          range_lower = interval * rangeno.to_f
+          tf.print range_lower, "\t", num_in_range[rangeno], "\n"
+        }
+        tf.close
+        # make gnuplot main infile
+        gf = Tempfile.new("plot_and_r")
+        gf.puts "set title \"" + title+ "\""
+        gf.puts "set ylabel \"num items\""
+        gf.puts "set xlabel \"" + score_name + "\""
+        gf.puts "set time"
+        gf.puts "set data style boxes"
+        gf.puts "set boxwidth " + (interval/2.0).to_s
+        gf.puts "set grid"
+        gf.puts "set output \"" + plotoutfile + "\""
+        gf.puts "set terminal postscript color"
+        gf.print "plot \"" + tf.path + "\" title \"" + score_name + "\" with boxes"
+        gf.puts
+        gf.puts
+        gf.close
+        # now gnuplot it
+        %x{gnuplot #{gf.path}}
+        # and remove temp files
+        tf.close(true)
+        gf.close(true)
+      end
+      #####
+      # draws a scatter plot comparing two
+      # mappings from labels to scores
+      # the first (base) scores are drawn on the x axis,
+      # the second (comparison) scores are drawn on the y axis.
+      # The method only looks at labels present in the base score,
+      # so if a label is present only in the comparison score but not the base score
+      # it is ignored.
+      def self.gnuplot_correlation_chart(base_scores, # hash: label(string) -> value(float)
+                                         comparison_scores, # hash: label(string) -> value(float)
+                                         title,  # string: title for output files
+                                         base_name, # string: what are the base scores?
+                                         comparison_name, # string: what are the comparison scores?
+                                         plotoutfile, # string: name of gnuplot output file
+                                         textoutfile = nil) # string: name of text output file
+        # text output: base score/comparison score pairs
+        if textoutfile
+          begin
+            textout = File.new(textoutfile, "w")
+          rescue
+            raise "Couldn't write to " + textoutfile
+          end
+          textout.puts "------------------------"
+          textout.puts title
+          textout.puts "------------------------"
+          # text output: base score / comparison score pairs
+          base_scores.to_a.sort { |a, b| b.last <=> a.last }.each { |label, score|
+            textout.print label, ": ", base_name, ": ", score, ", ", comparison_name, ": "
+            if comparison_scores[label]
+              textout.print comparison_scores[label], "\n"
+            else
+              textout.print "--", "\n"
+            end
+          }
+        end
+        # make scatter plot: base vs. comparison
+        tf = Tempfile.new("plot_and_r")
+        base_scores.each_pair { |label, score|
+          if comparison_scores[label]
+            tf.print score, "\t", comparison_scores[label], "\n"
+          else
+            $stderr.puts "no comparison scores for " + label
+          end
+        }
+        tf.close
+        # make gnuplot main infile
+        gf = Tempfile.new("plot_and_r")
+        gf.puts "set title \"" + title + "\""
+        gf.puts "set ylabel \"" + comparison_name + "\""
+        gf.puts "set xlabel \"" + base_name + "\""
+        gf.puts "set time"
+        gf.puts "set data style points"
+        gf.puts "set grid"
+        gf.puts "set output \"" + plotoutfile + "\""
+        gf.puts "set terminal postscript color"
+        gf.puts "plot \"" + tf.path + "\""
+        gf.puts
+        gf.close
+        # now gnuplot it
+        %x{gnuplot #{gf.path}}
+        tf.close(true)
+        gf.close(true)
+      end
+      # given two mappings from labels to scores,
+      # draw a gnuplot drawing comparing them
+      # as box scores:
+      # sort the first mapping by scores (in descending order),
+      # then for each label draw first the score from the first mapping
+      # as a box, then the score from the second mapping
+      # as a differently colored box.
+      #
+      # Scores1 is the basis for the comparison: only those labels
+      # that occur in mapping 1 are included in the comparison
+      #
+      # A title for the graph must be given, and a
+      # name for the gnuplot output file.
+      # If the name of a text output file is given,
+      # the result is also printed as text.
+      def self.gnuplot_comparison_chart(scores1, # hash:label(string) -> value(float), label->score-mapping
+                                        scores2, # hash:label(string) -> value(float), label->score-mapping
+                                        title,  # string: title for output files
+                                        score_name, # string: what are the scores? (label for y axis)
+                                        plotoutfile, # string: name of gnuplot output file
+                                        textoutfile = nil) # string: name of text output file
+        # text output
+        if textoutfile
+          textout = File.new(textoutfile, "w")
+          # document scores in each range
+          # to text outfile
+          textout.puts "-------------------------"
+          textout.puts title
+          textout.puts "-------------------------"
+          textout.puts "Label\tScore 1\tScore 2"
+          scores1.to_a.sort { |a, b| b.last <=> a.last}.each { |label, score1|
+            textout.print label, "\t", score1, "\t"
+            score2 = scores2[label]
+            if score2
+              textout.print score2, "\n"
+            else
+              textout.print "-", "\n"
+            end
+          }
+          textout.close
+        end
+        # document number of scores in each mapping
+        # to temp. infile for gnuplot
+        tf1 = Tempfile.new("plot_and_r")
+        tf2 = Tempfile.new("plot_and_r")
+        index = 0.0
+        scores1.to_a.sort { |a, b| b.last <=> a.last}.each { |label, score1|
+          score2 = scores2[label]
+          tf1.print index, "\t", score1, "\n"
+          if score2
+            i2 = index + 0.2
+            tf2.print i2, "\t", score2, "\n"
+          end
+          index += 1.0
+        }
+        tf1.close
+        tf2.close
+        # make gnuplot main infile
+        gf = Tempfile.new("plot_and_r")
+        gf.puts "set title \"" + title+ "\""
+        gf.puts "set ylabel \"" + score_name + "\""
+        gf.puts "set time"
+        gf.puts "set boxwidth 0.2"
+        gf.puts "set noxtics"
+        gf.puts "set grid"
+        gf.puts "set output \"" + plotoutfile + "\""
+        gf.puts "set terminal postscript color"
+        gf.print "plot \"" + tf1.path + "\" title \"score 1\" with boxes fs solid 0.9,"
+        gf.puts "\"" + tf2.path + "\" title \"score 2\" with boxes fs solid 0.6"
+        gf.puts
+        gf.puts
+        gf.close
+        # now gnuplot it
+        %x{gnuplot #{gf.path}}
+        # and remove temp files
+        tf1.close(true)
+        tf2.close(true)
+        gf.close(true)
+      end
+      #####
+      #
+      # computes a nonparametric rank correlation
+      #
+      # can compute partial correlations, i.e. correlations which factor out the influence
+      # of a confound variable (last variable, can be omitted).
+      def self.tau_correlation(base_scores, # hash: label(string) -> value(float)
+                               comparison_scores, # hash: label(string) -> value(float)
+                               base_name, # string: what are the base scores?
+                               comparison_name, # string: what are the comparison scores?
+                               textoutfile, # string: name of text output file
+                               confound_scores = nil) # hash: label(string) -> value(float)
+        # compute Kendall's tau:
+        # correlation between fscore and confusion?
+        tf_f = Tempfile.new("plot_and_r")
+        tf_e = Tempfile.new("plot_and_r")
+        if confound_scores
+          tf_c = Tempfile.new("plot_and_r")
+        end
+        base_scores.each_pair { |label, score|
+          if comparison_scores[label]
+            tf_f.puts score.to_s
+            tf_e.puts comparison_scores[label].to_s
+            if confound_scores
+              if confound_scores[label]
+                # logarithmise frequencies
+                tf_c.puts((Math.log(confound_scores[label])).to_s)
+              else
+                $stderr.puts "no confound scores for " + label
+              end
+            end
+          else
+            $stderr.puts "no comparison scores for " + label
+          end
+        }
+        tf_e.close
+        tf_f.close
+        if confound_scores
+          tf_c.close
+        end
+        # write the R script to rf
+        rf = Tempfile.new("plot_and_r")
+        # write the output to rfout
+        rfout = Tempfile.new("plot_and_r")
+        rfout.close
+        if confound_scores # perform partial correlation analysis
+          rf.puts "base <- read.table(\"#{tf_f.path}\")"
+          rf.puts "comparison <- read.table(\"#{tf_e.path}\")"
+          rf.puts "confuse <- read.table(\"#{tf_c.path}\")"
+          # adapted from https://stat.ethz.ch/pipermail/r-help/2001-August/012820.html
+          # compute partial correlation coefficient for comparison, with confuse excluded
+          rf.puts "cor(lm(base[[1]]~confuse[[1]])$resid,lm(comparison[[1]]~confuse[[1]])$resid,method=\"kendall\")"
+          # compute partial correlation coefficient for confuse, with comparison excluded
+          rf.puts "cor(lm(base[[1]]~comparison[[1]])$resid,lm(confuse[[1]]~comparison[[1]])$resid,method=\"kendall\")"
+          # compute significance of partial correlation
+          rf.puts "summary(lm(base[[1]] ~ comparison[[1]] + confuse[[1]]))"
+        else # perform normal correlation analysis
+          rf.puts "base <- read.table(\"#{tf_f.path}\")"
+          rf.puts "comparison <- read.table(\"#{tf_e.path}\")"
+          rf.puts "cor.test(base[[1]], comparison[[1]], method=\"kendall\", exact=FALSE)"
+        end
+        rf.close
+        # @todo AB: Correct this path!
+        %x{/proj/contrib/R/R-1.8.0/bin/R --vanilla < #{rf.path} > #{rfout.path}}
+        rfout.open
+        # output of R results: to stderr and to textout file
+        begin
+          textout = File.new(textoutfile, "w")
+        rescue
+          raise "Couldn't write to file " + textoutfile
+        end
+        textout.puts "-----------------------"
+        textout.puts "Correlation of " + base_name + " and " + comparison_name + " by Kendall's tau:"
+        textout.puts "-----------------------"
+        while (line = rfout.gets)
+          $stderr.puts "R output: " + line
+          textout.puts "R output: " + line
+        end
+        tf_e.close(true)
+        tf_f.close(true)
+        rf.close(true)
+        rfout.close(true)
+        textout.close
+      end
+    end
+  end
+end

data/lib/fred/single_sent_context_provider.rb ADDED Viewed

@@ -0,0 +1,76 @@
+require 'fred/abstract_context_provider'
+require 'tabular_format/tab_format_sentence'
+require 'salsa_tiger_xml/salsa_tiger_sentence'
+require 'salsa_tiger_xml/file_parts_parser'
+module Shalmaneser
+  module Fred
+    ####################################
+    # SingleSentContextProvider:
+    # subclass of AbstractContextProvider
+    # that assumes that each sentence of the input text
+    # stands on its own
+    class SingleSentContextProvider < AbstractContextProvider
+      ###
+      # each_window: iterator
+      #
+      # given a directory with Salsa/Tiger XML data,
+      # iterate through the data,
+      # yielding each target word as soon as its context window is filled
+      # (or the last file is at an end)
+      def each_window(dir) # string: directory containing Salsa/Tiger XML data
+        # iterate through files in the directory.
+        # Try sorting filenames numerically, since this is
+        # what frprep mostly does with filenames.
+        # File.join makes the glob work whether or not dir ends in a slash;
+        # the path itself is the tie-breaker so that files without a
+        # numeric name still come out in a reproducible order.
+        xml_files = Dir[File.join(dir, "*.xml")].sort_by { |path|
+          [File.basename(path, ".xml").to_i, path]
+        }
+        xml_files.each { |filename|
+          # progress bar
+          if @exp.get("verbose")
+            $stderr.puts "Featurizing #{File.basename(filename)}"
+          end
+          f = STXML::FilePartsParser.new(filename)
+          each_window_for_file(f) { |result|
+            yield result
+          }
+        }
+      end
+      ##################################
+      protected
+      ######################
+      # each_window_for_file: iterator
+      # same as each_window, but only for a single file
+      # (to be called from each_window())
+      def each_window_for_file(fpp) # FilePartsParser object: Salsa/Tiger XML data
+        fpp.scan_s { |sent_string|
+          sent = STXML::SalsaTigerSentence.new(sent_string)
+          each_window_for_sent(sent) { |result|
+            yield result
+          }
+        }
+        # no need to clear the context: we're doing this after each sentence
+      end
+      ###
+      # each_window_for_sent: empty context after each sentence
+      #
+      # dispatches on the sentence class, then yields every target that is
+      # still pending so that no context is carried over to the next sentence.
+      # Any other sentence class is reported on stderr and ends the process.
+      def each_window_for_sent(sent)
+        if sent.is_a? STXML::SalsaTigerSentence
+          each_window_for_stsent(sent) { |result| yield result }
+        elsif sent.is_a? TabFormatSentence
+          each_window_for_tabsent(sent) { |result | yield result }
+        else
+          $stderr.puts "Error: got #{sent.class}, expected SalsaTigerSentence or TabFormatSentence."
+          exit 1
+        end
+        # clear the context
+        each_remaining_target { |result| yield result }
+      end
+    end
+  end
+end