RubyGems - batch_experiment - Versions diffs - 1.0.2 → 2.0.0 - Mend

batch_experiment 1.0.2 → 2.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (9) hide show

checksums.yaml +4 -4
data/README.md +4 -2
data/examples/debug_batch.rb +1 -1
data/examples/{example_batch.rb → experiment_example.rb} +11 -11
data/examples/{ukp_batch.rb → ukp_experiment.rb} +10 -6
data/lib/batch_experiment.rb +260 -107
data/lib/batch_experiment/extractor.rb +6 -4
data/lib/batch_experiment/sample_extractors.rb +0 -31
metadata +4 -4

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: 1eaf01a01169f63766cca960fc0acb6edfdff9cb
-  data.tar.gz: 69e6f2a63a0d3cab8f3c0c1974327a52a1adcf60
+  metadata.gz: 4bfaf4a6fa10230872eb47786f5ed25caef9866c
+  data.tar.gz: 5c2d57dbc5487a37d409bd6c9454177f16d44cae
 SHA512:
-  metadata.gz: 5582c9660677fab1bdb3a8ce3f0af9d4f4254b0b3f91510aca64d05a74bc4f6b3614f5eec67cebe5d2a5b9e51b33cd8c0d007fc32a61d84bce390b8957307004
-  data.tar.gz: 41fec77dbd71293fd7bd013167301d39c3001dc9504e69b714414f228c403e99f5c2d9f9780114ae54a035a3a38436dbbcd622523591782b31f005cd68cf6494
+  metadata.gz: e53e5c263fca88c5389268e2d8e2aafd8491bf100d3ad3888d84024ba7f515e4797ef7856fd0f03274274f563e3263a5bb5d55a1e336210e3e770409ca52e377
+  data.tar.gz: f4d4741012d48a9383ac7cd73013ab40ab86d0dcbd98d7b55cee2b0ae6c3ffe790c1662d60ab2093249a4d41a720c2705a3615198ce4aae4239a5ecc1da52c2e

data/README.md CHANGED Viewed

@@ -15,7 +15,7 @@ What conditions you need to use this tool:
 What is not needed:
-* To know how to program in ruby. Only taking less than 5 minutes to learn some basic syntax will suffice to run commands on multiple cores and save the results to files. However, there's an exception, if you want not only to execute the commands but want to extract and group some information from their output to a CVS, you will need to tell ruby how to do the extracting part.
+* To know how to program in Ruby. Less than 5 minutes spent learning some basic syntax will suffice to run commands on multiple cores and save the results to files (using BatchExperiment::batch). However, if you want not only to execute the commands but also to extract and group some information from their output into a CSV (using BatchExperiment::experiment), you will need to tell Ruby how to do the extracting part.
 ## How to use it
@@ -23,7 +23,7 @@ You will need to create a ruby script (copy, past and adapt one of the provided
 ## Examples
-After installing the gem, you will have a examples folder (/home/YOUR_USER/.gem/ruby/RUBY_VERSION/gems/batch_experiment-GEM_VERSION/examples). The sample_batch.rb gives you a good ideia of how to use #batch (no csv creation). The example_batch.rb together with the lib/batch_experiment/sample_extractors.rb gives a good ideia of how to use #experiment with multiple commands and extractors (csv creation).
+After installing the gem, you will have an examples folder (/home/YOUR_USER/.gem/ruby/RUBY_VERSION/gems/batch_experiment-GEM_VERSION/examples). The sample_batch.rb gives you a good idea of how to use ::batch (no csv creation).
 ```ruby
 #!/bin/ruby
@@ -50,5 +50,7 @@ conf = {
 BatchExperiment::batch(commands, conf)
 ```
+The experiment_example.rb (and the lib/batch_experiment/sample_extractors.rb) give a good idea of how to use ::experiment with multiple commands and how to create an extractor (used to create a csv).
 This code was born in [this repository](https://github.com/henriquebecker91/masters/tree/master/codes/rb/batch_experiment).

data/examples/debug_batch.rb CHANGED Viewed

@@ -1,6 +1,6 @@
 #!/bin/ruby
-require_relative '../lib/batch_experiment.rb'
+require 'batch_experiment'
 commands = []
 10000.times { | n | commands << "sleep 1 && echo #{n}" }

data/examples/{example_batch.rb → experiment_example.rb} RENAMED Viewed

@@ -13,30 +13,30 @@ comms_info = [{
   pattern: 'y',
   extractor: BatchExperiment::FirstLineExtractor,
   prefix: 'echo',
-}, {
-  command: 'wc FILE',
-  pattern: 'FILE',
-  extractor: BatchExperiment::WcExtractor,
-  prefix: 'wc',
 }]
-execution_info = {
+batch_conf = {
   # IDs of the CPU cores that can be used for executing tests.
   cpus_available: [1, 2, 3],
   # Maximum number of seconds that a command can run. After this a kill command
   # (TERM signal) will be issued.
   timeout: 5,
+  # Object that gives the filename for storing the output of each run.
+  converter: BatchExperiment::Comm2FnameConverter.new,
 }
-conf = {
+experiment_conf = {
   # The name of the file where will be written the CSV data.
   csvfname: 'example.csv',
-  # The columns will be ordered by command. All the columns of the first
-  # command before the one from the second and so on.
-  ic_columns: false,
+  # Number of times the same command will be executed over the same file.
+  qt_runs: 5,
+  # Order of the commands execution
+  comms_order: :random, #:by_comm, #:by_file,
+  # Random seed (only used if comms_order is :random)
+  rng: Random.new(0),
 }
 files = ['bible.txt', 'taoteching.txt']
-BatchExperiment::experiment(comms_info, execution_info, conf, files)
+BatchExperiment::experiment(comms_info, batch_conf, experiment_conf, files)

data/examples/{ukp_batch.rb → ukp_experiment.rb} RENAMED Viewed

@@ -1,7 +1,7 @@
 #!/usr/bin/ruby
-require_relative 'batch_experiment'
-require_relative 'batch_experiment/sample_extractors'
+require 'batch_experiment'
+require 'batch_experiment/sample_extractors'
 # I run the three lines below in the console to disable hyperthreading cores on
 # my computer before examining the cores with the top command.
@@ -24,16 +24,20 @@ comms_info = [{
 execution_info = {
   cpus_available: [1, 2, 3],
   timeout: 10,
-  post_timeout: 5,
+  post_timeout: 2,
 }
-conf = { csvfname: 'pya_site8.csv' }
+conf = {
+  csvfname: 'pya_site8.csv',
+  comms_order: :random,
+  qt_runs: 10,
+}
 files = ['corepb.ukp', 'exnsd18.ukp', 'exnsd26.ukp', 'exnsdbis18.ukp', 'exnsd16.ukp', 'exnsd20.ukp', 'exnsdbis10.ukp', 'exnsds12.ukp']
 # If you don't execute the script from the ukp files folder you need to put the
 # folder relative or absolute path here (with trailing slash).
-path = ''
+path = '~/Aulas/mestrado/masters/data/ukp/'
 files.map! { | f | path + f }
-experiment(comms_info, execution_info, conf, files)
+BatchExperiment::experiment(comms_info, execution_info, conf, files)

data/lib/batch_experiment.rb CHANGED Viewed

@@ -4,9 +4,15 @@ require 'pathname'
 # The main module, the two main utility methods offered are ::batch and
 # ::experiment.
 module BatchExperiment
-  # The default callable object used by #batch to convert a command into a
-  # filename.
-  module FilenameSanitizer
+  # The default callable object used by Comm2FnameConverter to convert
+  # a command into a filename. Comm2FnameConverter doesn't itself create the
+  # sanitized filename from the command string (it uses its first argument to
+  # do this, whose default is FnameSanitizer).
+  # Note that this is a pure function, so if the same command appears more than
+  # once, it will get the same name; it's Comm2FnameConverter that gives
+  # multiple instances of the same command different names (by suffixing with
+  # numbers).
+  module FnameSanitizer
     def self.call(command)
       fname = command.strip
       fname.gsub!(/[^[:alnum:]]/, '_')
@@ -17,7 +23,57 @@ module BatchExperiment
     end
   end
-  # Internal use only. DO NOT DEPEND.
+  # Converts a command to a filename using a given sanitizer, and gives
+  # different names to different calls with the same arguments. Example: if a
+  # call with "sleep 1" yields "sleep_1", the second call with the same
+  # argument yields "sleep_1.2", and so on. Note that this is done by
+  # remembering previous calls; the object doesn't inspect the filesystem to
+  # check whether that name was or wasn't used.
+  class Comm2FnameConverter
+    # Creates a new Comm2FnameConverter, with no memory of any previous calls.
+    #
+    # @param sanitizer [#call] Callable object used to create a filename from
+    #   the arguments passed to Comm2FnameConverter.call. This class expects
+    #   that sanitizer has no internal state, so when an instance of this class
+    #   is cloned, there's no problem with sharing the sanitizer between the
+    #   clones. Default: BatchExperiment::FnameSanitizer.
+    def initialize(sanitizer = FnameSanitizer)
+      @num_times_seen = {}
+      @sanitizer = sanitizer
+    end
+    # Takes a command and creates a fname for it. If this fname was already
+    # seen before, returns the fname + ".N", where N is the number of times
+    # fname has been seen so far, counting the current call (so the second
+    # call gets ".2").
+    #
+    # @param comm [String] A system command.
+    # @return [String] The sanitized filename created from that command.
+    # @note Note that different arguments can be reduced to the same
+    #   sanitized filename and, if this happens, they will NOT overwrite
+    #   each other. Example: 'echo "abc"' -> 'echo_abc'; 'echo abc' ->
+    #   'echo_abc.2'.
+    def call(comm)
+      fname = @sanitizer.call(comm)
+      if @num_times_seen.include? fname
+        @num_times_seen[fname] += 1
+        fname << ".#{@num_times_seen[fname]}"
+      else
+        @num_times_seen[fname] = 1
+      end
+      fname.clone
+    end
+    def initialize_clone(old)
+      @num_times_seen = old.num_times_seen.clone
+    end
+    # To allow the initialize_clone implementation.
+    protected
+    attr_reader :num_times_seen
+  end
+  # INTERNAL USE ONLY.
   # Remove any finished commands from comms_running, insert the cpus
   # freed by the commands termination to the free_cpus, insert the
   # terminated commands on comms_executed.
@@ -42,7 +98,7 @@ module BatchExperiment
   # filenames are derived from the commands. Appending '.out' to one of the
   # partial filenames will give the filename were the command stdout was
   # redirected. The analogue is valid for '.err' and stderr. Right before a
-  # command begans to run, a "partial_filename.unfinished file is created.
+  # command begins to run, a 'partial_filename.unfinished' file is created.
   # After the command ends its execution this file is removed. If the command
   # ends its execution by means of a timeout the file is also removed. The file
   # only remains if the batch procedure is interrupted (script was killed,
@@ -51,7 +107,7 @@ module BatchExperiment
   #
   # @param commands [Array<String>] The shell commands.
   # @param conf [Hash] The configurations, as follows:
-  #   -- cpus_available [Array<Fixnum>] Cpu cores that can be used to run the
+  #   -- cpus_available [Array<Fixnum>] CPU cores that can be used to run the
   #   commands. Required parameter. The cpu numbers begin at 0, despite what
   #   htop tells you.
   #   -- timeout [Number] Number of seconds before killing a command. Required
@@ -59,35 +115,36 @@ module BatchExperiment
   #   -- time_fmt [String] A string in the time (external command) format. See
   #   http://linux.die.net/man/1/time. Default: 'ext_time: %e\next_mem: %M\n'.
   #   -- busy_loop_sleep [Number] How many seconds to wait before checking if
-  #   a command ended execution. This is max time a cpu will be vacant between
-  #   two commands. Default: 0.1.
+  #   a command ended execution. This time will be very close to the max time a
+  #   cpu will remain vacant between two commands. Default: 0.1 (1/10 second).
   #   -- post_timeout [Number] A command isn't guaranteed to end after
   #   receiving a TERM signal. If the command hasn't stopped, waits
   #   post_timeout seconds before sending a KILL signal (give it a chance to
   #   end gracefully). Default: 5.
-  #   -- fname_sanitizer [#call] The call method of this object
-  #   should take a String and convert it (possibly losing information), to a
-  #   valid filename. Used over the commands to define the output files of
-  #   commands. Default: BatchExperiment::FilenameSanitizer
+  #   -- converter [#call] The call method of this object should take a String
+  #   and convert it (possibly losing information), to a valid filename. Used
+  #   over the commands to define the output files of commands.
+  #   Default: BatchExperiment::Comm2FnameConverter.new.
   #   -- skip_done_comms [FalseClass,TrueClass] Skip any command for what a
   #   corresponding '.out' file exists, except if both a '.out' and a
-  #   '.unfinished' file exist, in the last case the command is executed.
+  #   '.unfinished' file exist, in the last case the command is always
+  #   executed. If false, execute all commands and overwrite all ".out".
   #   Default: true.
   #   -- unfinished_ext [String] Extension to be used in place of
-  #   '.unfinished'.  Default: '.unfinished'.
+  #   '.unfinished'. Default: '.unfinished'.
   #   -- out_ext [String] Extension to be used in place of '.out'.
   #   Default: '.out'.
   #   -- err_ext [String] Extension to be used in place of '.err'.
   #   Default: '.err'.
+  #
   # @return [String] Which commands were executed. Can be different from
   #   the 'commands' argument if commands are skipped (see :skip_done_comms).
   #
   # @note If the same command is executed over the same file more than one
-  #   time, then only the last execution will be saved (because the '.out',
-  #   '.err' and '.unfinished' files will be overwritten). But the parameter
-  #   conf\[:fname_sanitizer\] can be used to circumvent the restriction over
-  #   equal commands (if the object has state it can return a different
-  #   filename for every time it's called with the same argument).
+  #   time, then any run besides the first will have a numeric suffix.
+  #   Example: "sleep 1" -> "sleep_1", "sleep 1" -> "sleep_1.2".
+  #   For more info see the parameter conf\[:converter\], and its
+  #   default value BatchExperiment::Comm2FnameConverter.new.
   # @note This procedure makes use of the following linux commands: time (not
   #   the bash internal one, but the package one, i.e.
   #   https://www.archlinux.org/packages/extra/x86_64/time/); timeout (from
@@ -101,8 +158,10 @@ module BatchExperiment
   #   conf\[:time_fmt\] to a empty string only a newline will be appended.
   def self.batch(commands, conf)
     # Throw exceptions if required configurations aren't provided.
-    fail 'conf[:cpus_available] not set' unless conf[:cpus_available]
-    fail 'conf[:timeout] not set' unless conf[:timeout]
+    if !conf[:cpus_available] then
+      fail ArgumentError, 'conf[:cpus_available] not set'
+    end
+    fail ArgumentError, 'conf[:timeout] not set' unless conf[:timeout]
     # Initialize optional configurations with default values if they weren't
     # provided. Don't change the conf argument, only our version of conf.
@@ -113,7 +172,7 @@ module BatchExperiment
     conf[:err_ext]          ||= '.err'
     conf[:busy_loop_sleep]  ||= 0.1
     conf[:post_timeout]     ||= 5
-    conf[:fname_sanitizer]  ||= BatchExperiment::FilenameSanitizer
+    conf[:converter]        ||= BatchExperiment::Comm2FnameConverter.new
     conf[:skip_done_comms]    = true if conf[:skip_done_comms].nil?
     # Initialize main variables
@@ -123,23 +182,23 @@ module BatchExperiment
     comms_executed = []
     commands.each do | command |
-      commfname = conf[:fname_sanitizer].call(command)
+      commfname = conf[:converter].call(command)
       out_fname = commfname + conf[:out_ext]
       err_fname = commfname + conf[:err_ext]
       lockfname = commfname + conf[:unfinished_ext]
       if conf[:skip_done_comms] && File.exists?(out_fname)
         if File.exists?(lockfname)
-          puts "found file #{out_fname}, but a #{lockfname} also exists"
-          puts "will execute command '#{command}' anyway"
+          puts "Found file #{out_fname}, but a #{lockfname} also exists:"
+          puts "Will execute command '#{command}' anyway."
         else
-          puts "found file #{commfname}, skipping command: #{command}"
+          puts "Found file #{commfname}, skipping command: #{command}"
           STDOUT.flush
           next
         end
       end
-      puts "waiting to execute command: #{command}"
+      puts "Waiting to execute command: #{command}"
       STDOUT.flush
       while free_cpus.empty? do
@@ -188,21 +247,29 @@ module BatchExperiment
     comms_executed
   end
-  # gencommff: GENerate COMMands For Files
+  # INTERNAL USE ONLY. gencommff: GENerate COMMands For Files.
+  # Creates a hash with the generated commands as keys, and stores (as the
+  # respective value) the comm_info hash and the file (using a { comm_info: X,
+  # filename: Y } structure).
   #
-  # @param comm [String] A string with 'patt' as a substring.
-  # @param patt [String] A string contained in 'comm'.
-  # @param files [Enumerable<String>] A list of strings to substitute patt at
-  #   comm.
-  # @return [Array<String>] Example: gencommff('echo STR', 'STR', ['a', 'b',
-  #   'c']) returns ['echo a', 'echo b', 'echo c'].
-  def self.gencommff(comm, patt, files)
-    ret = []
-    files.each { | f | ret << comm.gsub(patt, f) }
+  # @param comm_info [Hash] A hash structure following the same format
+  #   that the elements of the comms_info array parameter of #experiment.
+  # @param files [Enumerable<String>] A list of strings that will replace
+  #   comm_info[:pattern] at a copy of comm_info[:command].
+  # @return [Hash<String, Hash>] A hash on the following format
+  #   { expanded_command => { comm_info: comm_info, filename: f }, ...}
+  def self.gencommff(comm_info, files)
+    ret = {}
+    comm = comm_info[:command]
+    patt = comm_info[:pattern]
+    files.each do | f |
+      ret[comm.gsub(patt, f)] = { comm_info: comm_info, filename: f }
+    end
     ret
   end
-  # Intercalate a variable number of variable sized arrays in one array.
+  # INTERNAL USE ONLY. Intercalate a variable number of variable sized arrays
+  # in one array.
   #
   # @param [Array<Array<Object>>] xss An array of arrays.
   # @return [Array<Object>] An array of the same size as the sum of the size
@@ -223,53 +290,94 @@ module BatchExperiment
     ret
   end
+  class ColumnSpecError < ArgumentError; end
+  # INTERNAL USE ONLY. Check if the headers can be combined, if they can
+  # return a shallow copy of the biggest header, otherwise throw an exception.
+  #
+  # @param headers [Array<Array<Comparable>>] An array of arrays of strings
+  #   (or any object that implements '!=').
+  # @return A shallow copy of the biggest inner array in headers. Only returns
+  #   if every position of the biggest inner array holds the same value as
+  #   that position in all the other arrays that have at least that size.
+  def self.merge_headers(headers)
+    mer_size = headers.map { | h | h.size }.max
+    merged_h = Array.new(mer_size)
+    mer_size.times do | i |
+      headers.each do | h |
+        next if h.size < i
+        if merged_h[i].nil?
+          merged_h[i] = h[i]
+        elsif merged_h[i] != h[i]
+          raise ColumnSpecError, "Error: When using BatchExperiment::experiment"
+            + " all the extractors have to agree on the columns they share."
+            + " In the specific case: the column nº #{i} was labeled as"
+            + " '#{merged_h[i]}' on one extractor, and '#{h[i]}' on another,"
+            + " this can be only a difference on notation ('time' vs 'Time'),"
+            + " or can mean that in the same column two different kinds of data"
+            + " are being presented. The program will be aborted. Check that."
+        end
+      end
+    end
+    merged_h
+  end
   # Takes N shell commands and M files/parameters, execute each command of the
   # N commands over the M files, save the output of each command/file
   # combination, use objects provided with the command to extract relevant
-  # information from the output file, and group those information in a CVS
+  # information from the output file, and group those information in a CSV
   # file. Easier to understand seeing the sample_batch.rb example in action.
   #
   # @param comms_info [Array<Hash>] An array of hashs, each with the config
   #   needed to know how to deal with the command. Four required fields
   #   (all keys are symbols):
   #   command [String] A string with a sh shell command.
-  #   pattern [String] A substring of command, will be replace by the strings
+  #   pattern [String] A substring of command, will be replaced by the strings
   #   in the paramenter 'files'.
   #   extractor [#extract,#names] Object implementing the Extractor interface.
-  #   prefix [String] A string that will be used to prefix the extractor.names
-  #   when they are used as column names. Improves Extractor reusability.
+  #   prefix [String] A string that will be used on the 'algorithm' column
+  #   to identify the used command.
   # @param batch_conf [Hash] Configuration used to call batch. See the
   #   explanation for parameter 'conf' on the documentation of the batch
-  #   method. There are required fields for this hash parameter.
+  #   method. There are required fields for this hash parameter. Also, note
+  #   that the batch_conf\[:converter\] should allow cloning without sharing
+  #   mutable state. A converter clone is used by #experiment internally, it
+  #   has to obtain the same results as the original copy (that is passed to
+  #   BatchExperiment::batch).
   # @param conf [Hash] Lots of parameters. Here's a list:
-  #   csvfname [String] The filename/filepath for the file that will contain
+  #   -- csvfname [String] The filename/filepath for the file that will contain
   #   the CSV data. Required field.
   #   separator [String] The separator used at the CSV file. Default: ';'.
-  #   ic_columns [TrueClass, FalseClass] Intercalate the data returned by the
-  #   extractors. In other words, the csv line for some file will not present
-  #   all fields of the first command, then all fields of the second command,
-  #   etc, but instead will present the first field of all commands, the second
-  #   field of all commands, and so on. Default: true.
-  #   ic_comms [TrueClass, FalseClass] Intercalate the commands execution.
-  #   Instead of executing the first command over all files first, execute all
-  #   the commands over the first file first. This was made to avoid
-  #   confounding (statistical concept). If something disrupts the processing
-  #   power for some period of time, the effect will probably be distributed
-  #   between commands. The risk some algorithm seems better or worse than it
-  #   really is will be reduced. For example: you are making tests at an
-  #   notebook, the notebook becomes unplugged for a short time. The cores will
-  #   probably enter in energy saving mode and affect the observed performance.
-  #   If this happens when all tested commands are the same, then will seem
-  #   that that an command had a worse performance. If this happens when the
-  #   commands are intercalated, then maybe some instances will seem harder
-  #   than others (what is less problematic). Default: true.
+  #   -- qt_runs [NilClass,Integer] If nil or one then each command is
+  #   executed once. If is a number bigger than one, the command is executed
+  #   that number of times. The batch_conf[:converter] will define the name
+  #   that will be given to each run. Every file will appear qt_runs times on
+  #   the filename column and, for the same file, the values on the run_number
+  #   column will be the integer numbers between 1 and qt_runs (both
+  #   inclusive). Default: nil.
+  #   -- comms_order [:by_comm,:by_file,:random] The order the
+  #   commands will be executed. Case by_comm: will execute the first command
+  #   over all the files (using the files order), then will execute the
+  #   second command over all files, and so on. Case by_file: will execute
+  #   all the commands (using the comms_info order) over the first file,
+  #   then will execute all the commands over the second file, and so on.
+  #   Case random: will expand all the command/file combinations (replicating
+  #   the same command qt_runs times) and then will apply shuffle to this array,
+  #   using the object passed to the rng parameter. This last option is the
+  #   most adequate for statistical testing.
+  #   -- rng [Nil,#rand] An object that implements the #rand method (behaves
+  #   like an instance of the core Random class). If comms_order is random and
+  #   rng is nil, will issue a warning remembering the default that was used.
+  #   Default: Random.new(42).
   #   skip_commands [TrueClass, FalseClass] If true, will not execute the
-  #   commands and assume that the outputs are already saved. Will only execute
-  #   the extractors over the already saved outputs, and create the CSV file
-  #   from them. Default: false.
+  #   commands and assume that the outputs are already saved (on ".out" files).
+  #   Will only execute the extractors over the already saved outputs, and
+  #   create the CSV file from them. Default: false.
   #
   # @param files [Array<Strings>] The strings that will replace the :pattern
-  #   on :command, for every element in comms_info.
+  #   on :command, for every element in comms_info. Can be a filename, or
+  #   can be anything else (a numeric parameter, sh code, etc..), but we
+  #   refer to them as files for simplicity and uniformity.
   #
   # @return [NilClass,Array<String>] The return of the internal #batch
   #   call. Returns nil if conf[:skip_commands] was set to true.
@@ -283,67 +391,112 @@ module BatchExperiment
     # Initialize optional configurations with default values if they weren't
     # provided. Don't change the conf argument, only our version of conf.
     conf = conf.clone
-    conf[:separator]  ||= ';'
-    conf[:ic_columns]   = true if conf[:ic_columns].nil?
-    conf[:ic_comms]     = true if conf[:ic_comms].nil?
+    conf[:separator]    ||= ';'
+    conf[:qt_runs]      ||= 1
+    conf[:comms_order]  ||= :by_comm
+    conf[:rng]          ||= Random.new(42)
     #conf[:skip_commands] defaults to false/nil
     # Get some of the batch config that we use inside here too.
-    out_ext = batch_conf[:out_ext] || '.out'
-    unfinished_ext = batch_conf[:unfinished_ext] || '.unfinished'
-    fname_sanitizer   = batch_conf[:fname_sanitizer]
-    fname_sanitizer ||= BatchExperiment::FilenameSanitizer
+    out_ext         = batch_conf[:out_ext] || '.out'
+    unfinished_ext  = batch_conf[:unfinished_ext] || '.unfinished'
+    converter = batch_conf[:converter].clone unless batch_conf[:converter].nil?
+    converter ||= BatchExperiment::Comm2FnameConverter.new
-    # Create commands the templates and the file list.
+    # Expand all commands, combining command templates and files.
     comms_sets = []
     comms_info.each do | comm_info |
-      comms_sets << gencommff(comm_info[:command], comm_info[:pattern], files)
+      comms_sets << gencommff(comm_info, files)
     end
-    comm_list = conf[:ic_comms] ? intercalate(comms_sets) : comms_sets.flatten
+    expanded_comms = comms_sets.map { | h | h.keys }
+    # If each command should be run more than once...
+    if conf[:qt_runs] > 1
+      # ... we replace each single command by an array of qt_runs copies,
+      # and then flatten the parent array.
+      expanded_comms.map! do | a |
+        a.map! { | c | Array.new(conf[:qt_runs], c) }.flatten!
+      end
+    end
+    # At this moment the expanded_comms is an array of arrays, each internal
+    # array has all the expanded commands of the one single command template
+    # over all the files.
+    # After the code block below, the expanded_comms will be an one-level array
+    # of the expanded commands, in the order they will be executed.
+    expanded_comms = case conf[:comms_order]
+    when :by_comm # all runs of the first command template first
+      expanded_comms.flatten!
+    when :by_file # all runs over the first file first
+      intercalate(expanded_comms)
+    when :random  # a random order
+      expanded_comms.flatten!.shuffle!(random: conf[:rng])
+    end
     # Execute the commands (or not).
-    ret = batch(comm_list, batch_conf) unless conf[:skip_commands]
+    ret = batch(expanded_comms, batch_conf) unless conf[:skip_commands]
     # Build header (first csv line, column names).
-    header = []
-    comms_info.each do | comm_info |
-      prefixed_names = comm_info[:extractor].names.map do | name |
-        (comm_info[:prefix] + ' ') << name
+    header = ['algorithm', 'filename', 'run_number']
+    header << merge_headers(comms_info.map { | c | c[:extractor].names })
+    header = header.join(conf[:separator])
+    # We need to merge the union of all comms_sets to query it.
+    comm2origin = {}
+    comms_sets.each do | h |
+      comm2origin.merge!(h) do | k, v, v2 |
+        puts "WARNING: The command expansion '#{k}' was generated more than once. The first time was by the template '#{v[:comm]}' and the file '#{v[:file]}', and this time by template '#{v2[:comm]}' and the file '#{v2[:file]}'. Will report on CSV as this command was generated by the template '#{v[:comm]}' and the file '#{v[:file]}'."
+        v
       end
-      header << prefixed_names
     end
-    header = intercalate(header) if conf[:ic_columns]
-    header = ['Filename'].concat(header).join(conf[:separator])
-    # Build body (inspect all output files an make csv lines).
+    # Build body (inspect all output files and make csv lines).
+    #
+    # Body format: algorithm;filename;run_number;first extracted column; ...
+    #
+    # This means that the extractors have to agree on what is each column, two
+    # different extractors have to extract the same kind of data at each column
+    # (the first field returned by all extractors has to be, for example, cpu
+    # time, the same applies for the remaining fields).
+    # If one extractor extracts more fields than the others this is not a
+    # problem: if the second biggest extractor (in number of fields extracted)
+    # extracts, for example, 4 fields, and the biggest extracts 6 fields,
+    # the first 4 fields extracted by the biggest extractor have to be the same
+    # as the ones on the second-biggest extractor. This way, all the lines will
+    # have the same kind of data on the first four columns (not counting the
+    # algorithm, filename and run_number ones), and only lines coming from
+    # the biggest extractor will have data on the fifth and sixth columns.
     body = [header]
-    files.each_with_index do | inst_fname, j |
-      line = []
-      comms_info.each_with_index do | comm_info, i |
-        command =
-          if conf[:ic_comms]
-            comm_list[(j * comms_info.size) + i]
-          else
-            comm_list[(i * files.size) + j]
-          end
-        partial_fname = fname_sanitizer.call(command)
-        out_fname = partial_fname + out_ext
-        lockfname = partial_fname + unfinished_ext
-        if File.exists?(out_fname)
-          f_content = File.open(out_fname, 'r') { | f | f.read }
-          line << comm_info[:extractor].extract(f_content)
+    times_found = {}
+    expanded_comms.each do | exp_comm |
+      run_info   = comm2origin[exp_comm]
+      algorithm  = run_info[:comm_info][:prefix]
+      filename   = run_info[:filename]
+      times_found[exp_comm] ||= 0
+      times_found[exp_comm]  += 1
+      run_number = times_found[exp_comm]
+      curr_line = [algorithm, filename, run_number]
+      partial_fname = converter.call(exp_comm)
+      out_fname = partial_fname + out_ext
+      lockfname = partial_fname + unfinished_ext
+      extractor = run_info[:comm_info][:extractor]
+      if File.exists?(out_fname)
+        if File.exists?(lockfname)
+          puts "Ignored file '#{out_fname}' because there was a"
+             + "  '#{lockfname}' file in the same folder."
         else
-          # if the file wasn't created insert a empty column set
-          # of the same size the true column set would be
-          line << comm_info[:extractor].names.map { | _ | '' }
+          f_content = File.open(out_fname, 'r') { | f | f.read }
+          curr_line << extractor.extract(f_content)
         end
       end
-      line = intercalate(line) if conf[:ic_columns]
-      body << [inst_fname].concat(line).join(conf[:separator])
+      body << curr_line.join(conf[:separator])
     end
-    body = body.map! { | line | line << conf[:separator] }.join("\n")
+    body = body.join(conf[:separator] + "\n")
     # Write CSV data into a CSV file.
     File.open(conf[:csvfname], 'w') { | f | f.write(body) }

data/lib/batch_experiment/extractor.rb CHANGED Viewed

@@ -2,7 +2,7 @@ module BatchExperiment
   # Module that defines the interface used for extracting info from other
   # programs' output. You don't need to include it in your object; it suffices
   # that the object (that you will use to extract info from the output) has the
-  # ::names and ::extract methods defined.
+  # #names and #extract methods defined.
   module Extractor
     # Find a line in the following format: "field: value", return value.
     #
@@ -13,7 +13,9 @@ module BatchExperiment
     # @return [String] The 'value' as a string or, if 'field' isn't found, an
     #   empty string.
     def self.get_field(lines, field)
-      lines.grep(/^#{field}: .*/).each { | l | return l.match(/:[\t ]+(.*)/)[1] }
+      lines.grep(/^#{field}: .*/).each do | l |
+        return l.match(/:[\t ]+(.*)/)[1]
+      end
       ''
     end
@@ -42,7 +44,7 @@ module BatchExperiment
     # @return [Array<String>] The strings that will be used to make the column
     #   names at the BatchExperiment.experiment method.
     def names
-      fail 'This method should have been overwritten by a subclass.'
+      raise 'This method should have been overwritten by a subclass.'
     end
     # Extract N values of some program output, where N is equal to #names.size.
@@ -62,7 +64,7 @@ module BatchExperiment
     #   and the line string elements don't end in linebreak.
     # @return [Array<String>] The N extracted values, as strings.
     def extract_from_lines(lines)
-      fail 'This method should have been overwritten by a subclass.'
+      raise 'This method should have been overwritten by a subclass.'
     end
   end
 end

data/lib/batch_experiment/sample_extractors.rb CHANGED Viewed

@@ -15,37 +15,6 @@ module BatchExperiment
     end
   end
-  module WcExtractor
-    extend Extractor
-    def self.names
-      ['lines', 'words', 'bytes', 'ext_time', 'ext_mem']
-    end
-    def self.extract(content)
-      arr = content.split(' ')
-      qt_lines, words, bytes = arr[0], arr[1], arr[2]
-      lines = content.lines.map! { | l | l.chomp! }
-      [ qt_lines, words, bytes,
-        Extractor.get_field(lines, 'ext_time'),
-        Extractor.get_field(lines, 'ext_mem')
-      ]
-    end
-  end
-  module TwoWordsExtractor
-    extend Extractor
-    def self.names
-      ['first word', 'second word', 'ext_time', 'ext_mem']
-    end
-    def self.extract_from_lines(lines)
-      words = lines.empty? || lines[0].nil? ? ['',''] : lines[0].split().take(2)
-      words << Extractor.get_field(lines, 'ext_time')
-      words << Extractor.get_field(lines, 'ext_mem')
-      words
-    end
-  end
   # Sample extractors used at https://github.com/henriquebecker91/masters,
   # where this code had its beginning. This file contains the code used to
   # extract info from the different outputs generated by UKP solving programs.

metadata CHANGED Viewed

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: batch_experiment
 version: !ruby/object:Gem::Version
-  version: 1.0.2
+  version: 2.0.0
 platform: ruby
 authors:
 - Henrique Becker
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2016-03-28 00:00:00.000000000 Z
+date: 2016-06-21 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: childprocess
@@ -33,10 +33,10 @@ files:
 - README.md
 - examples/bible.txt
 - examples/debug_batch.rb
-- examples/example_batch.rb
+- examples/experiment_example.rb
 - examples/sample_batch.rb
 - examples/taoteching.txt
-- examples/ukp_batch.rb
+- examples/ukp_experiment.rb
 - lib/batch_experiment.rb
 - lib/batch_experiment/extractor.rb
 - lib/batch_experiment/sample_extractors.rb