RubyGems - batch_experiment - Versions diffs - 0.1.0 - Mend

batch_experiment 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (7) hide show

checksums.yaml +7 -0
data/examples/sample_batch.rb +48 -0
data/examples/ukp_batch.rb +39 -0
data/lib/batch_experiment/extractor.rb +29 -0
data/lib/batch_experiment/sample_extractors.rb +49 -0
data/lib/batch_experiment.rb +345 -0
metadata +64 -0

checksums.yaml ADDED Viewed

@@ -0,0 +1,7 @@
+---
+SHA1:
+  metadata.gz: 4d594a77b4909c00eb2c07a05dbc6c4f0cc68c9a
+  data.tar.gz: 979f0d1800486061aff5d9a1728e55790e675e34
+SHA512:
+  metadata.gz: cc0d896402fc7820e7c1f2741ae9e07ffdea6ab14edc3156e9bdcd5d3f4c24b71e753087e1980fdb97c8b85006cc2c7ef1bbb3bdad3a181b0853a7c94f76ea82
+  data.tar.gz: bfde027e77f78360a9728eacca5075a00a2b59e47cd0224e692c2e44bc6118e942c93c4913a0bd65a5861612191f92deb4402052440037e416391c9433d2629c

data/examples/sample_batch.rb ADDED Viewed

@@ -0,0 +1,48 @@
+#!/bin/ruby
+require 'batch_experiment'
+require 'batch_experiment/sample_extractors'
+comms_info = [{
+  # String with command to be executed. Must have 'pattern' as substring.
+  command: 'sleep 1 && echo X X',
+  # Substring present in 'command'. Often replaced by the instance filename.
+  pattern: 'X',
+  # Extractor object. Receives the output of the command and return
+  # the most important fields.
+  extractor: SampleExtractor.new,
+  # String used to identify the command. Will be used to prefix the return of
+  # extractor.names.
+  prefix: 'doubled',
+}, {
+  command: 'sleep 3 && echo "banana X"',
+  pattern: 'X',
+  extractor: SampleExtractor.new,
+  prefix: 'banana',
+}, {
+  command: 'sleep 100 && echo "never gonna happen X"',
+  pattern: 'X',
+  extractor: SampleExtractor.new,
+  prefix: 'timeout',
+}]
+execution_info = {
+  # IDs of the CPU cores that can be used for executing tests.
+  cpus_available: [1, 2, 3],
+  # Maximum number of seconds that a command can run. After this a kill command
+  # (TERM signal) will be issued.
+  timeout: 5,
+  # Maximum number of seconds that a command can run after a kill command was
+  # issued. After this a kill -9 command (KILL signal) will be issued.
+  post_timeout: 1,
+}
+conf = {
+  # The name of the file where will be written the CSV data.
+  csvfname: 'sample.csv',
+}
+files = ['apple', 'orange'] # Applejack would be proud
+BatchExperiment::experiment(comms_info, execution_info, conf, files)

data/examples/ukp_batch.rb ADDED Viewed

@@ -0,0 +1,39 @@
+#!/usr/bin/ruby
+require_relative 'batch_experiment'
+require_relative 'batch_experiment/sample_extractors'
+# I run the three lines below in the console to disable hyperthreading cores on
+# my computer before examining the cores with the top command.
+# for i in 4 5 6 7; do
+#   sudo sh -c "echo 0 > /sys/devices/system/cpu/cpu$i/online";
+# done
+comms_info = [{
+  command: 'pyasukpt -src INST_FILE',
+  pattern: 'INST_FILE',
+  extractor: PyaExtractor.new,
+  prefix: 'PYAsUKP',
+}, {
+  command: 'run_ukp5.out INST_FILE',
+  pattern: 'INST_FILE',
+  extractor: UKP5Extractor.new,
+  prefix: 'UKP5',
+}]
+execution_info = {
+  cpus_available: [1, 2, 3],
+  timeout: 10,
+  post_timeout: 5,
+}
+conf = { csvfname: 'pya_site8.csv' }
+files = ['corepb.ukp', 'exnsd18.ukp', 'exnsd26.ukp', 'exnsdbis18.ukp', 'exnsd16.ukp', 'exnsd20.ukp', 'exnsdbis10.ukp', 'exnsds12.ukp']
+# If you don't execute the script from the ukp files folder you need to put the
+# folder relative or absolute path here (with trailing slash).
+path = ''
+files.map! { | f | path + f }
+experiment(comms_info, execution_info, conf, files)

data/lib/batch_experiment/extractor.rb ADDED Viewed

@@ -0,0 +1,29 @@
+module Extractor
+  # For when there's a field whose value is after '<field>: '.
+  def self.get_field(lines, field)
+    lines.grep(/^#{field}: .*/).each { | l | return l.match(/:[\t ]+(.*)/)[1] }
+    ''
+  end
+  # For when there's a field whose value is in the next line.
+  def self.get_hfield(lines, field)
+    if ix = lines.find_index(field) then lines[ix + 1] else '' end
+  end
+  # Return the field names for each of the elements returned by
+  # extract. Ex.: ['Time', 'Max Mem Use', 'opt', ... ]
+  def names
+    fail 'This method should have been overwritten by a subclass.'
+  end
+  def extract(content)
+    extract_from_lines(content.lines.map! { | l | l.chomp! })
+  end
+  # Extract an array of values from the command output. This array has the same
+  # size as the one returned by field_names.
+  def extract_from_lines(lines)
+    fail 'This method should have been overwritten by a subclass.'
+  end
+end

data/lib/batch_experiment/sample_extractors.rb ADDED Viewed

@@ -0,0 +1,49 @@
+require 'require_relative'
+require_relative './extractor.rb'
+# Sample extractors used at https://github.com/henriquebecker91/masters, where
+# this code had its beginning. This file contains the code used to extract info
+# from the different outputs generated by UKP solving programs.
+class SampleExtractor
+  include Extractor
+  def names
+    ['first word', 'second word', 'ext_time', 'ext_mem']
+  end
+  def extract_from_lines(lines)
+    words = lines.empty? || lines[0].nil? ? ['',''] : lines[0].split().take(2)
+    words << Extractor.get_field(lines, 'ext_time')
+    words << Extractor.get_field(lines, 'ext_mem')
+    words
+  end
+end
+class UKP5Extractor
+  include Extractor
+  def names
+    ['internal time', 'external time', 'external memory', 'opt']
+  end
+  def extract_from_lines(lines)
+    ['Seconds', 'ext_time', 'ext_mem', 'opt'].map do | label |
+      Extractor.get_field(lines, label)
+    end
+  end
+end
+class PyaExtractor
+  include Extractor
+  def names
+    ['internal time', 'external time', 'external memory', 'opt']
+  end
+  def extract_from_lines(lines)
+    values = ['Total Time ', 'ext_time', 'ext_mem'].map do | label |
+      Extractor.get_field(lines, label)
+    end
+    opt_key = '#The optimal value for the given capacity'
+    values << Extractor.get_hfield(lines, opt_key)
+  end
+end

data/lib/batch_experiment.rb ADDED Viewed

@@ -0,0 +1,345 @@
+require 'childprocess'
+require 'pathname'
+module BatchExperiment
+  # The default callable class used by batch to convert a command into a
+  # filename.
+  class FilenameSanitizer
+    def call(command)
+      fname = command.strip
+      fname.gsub!(/[^[:alnum:]]/, '_')
+      fname.gsub!(/_+/, '_')
+      fname.gsub!(/^_/, '')
+      fname.gsub!(/_$/, '')
+      fname
+    end
+  end
+  # Internal use only. DO NOT DEPEND.
+  # Remove any finished commands from comms_running, insert the cpus
+  # freed by the commands termination to the free_cpus, insert the
+  # terminated commands on comms_executed.
+  def self.update_finished(free_cpus, comms_running, comms_executed)
+    comms_running.delete_if do | job |
+      if job[:proc].exited?
+        free_cpus.push(job[:cpu])
+        File.delete(job[:lockfname])
+        comms_executed << job[:command]
+      end
+      job[:proc].exited? # bool returned to delete_if
+    end
+  end
+  # Takes a list of commands, execute them only on the designed core/cpus, and
+  # kill them if the timeout expires, never lets a core/cpu rest for more than
+  # conf[:busy_loop_sleep] seconds between a command and another. The
+  # conf[:fname_sanitizer] is called over the commands to generate partial
+  # filenames. Appending '.out' to one of the partial filenames will give the
+  # filename where the command stdout was redirected. The analogue is valid for
+  # '.err' and stderr. The first partial filename corresponds to the first
+  # command in commands, and so on. Right before a command begins to run, a
+  # "partial_filename.#{conf[:unfinished_ext]}" file is created. After the
+  # command ends its execution this file is removed. If the command ends its
+  # execution by means of a timeout the file is also removed. The file only
+  # remains if the batch procedure is interrupted (not a specific command).
+  #
+  # @param commands [Array<String>] The shell commands.
+  # @param conf [Hash] The configurations, as follows:
+  #   :cpus_available [Array<Fixnum>] Cpu cores that can be used to run the
+  #   commands. Required parameter. The cpu numbers begin at 0, despite what
+  #   htop tells you;
+  #   :timeout [Number] Number of seconds before killing a command. Required
+  #   parameter. Is the same for all the commands;
+  #   :time_fmt [String] A string in the time (external command) format. See
+  #   http://linux.die.net/man/1/time. Default: 'ext_time: %e\next_mem: %M\n'.
+  #   :busy_loop_sleep [Number] How many seconds to wait before checking if a
+  #   command ended execution. This is max time a cpu will be vacant between
+  #   two commands. Default: 0.1;
+  #   :post_timeout [Number] A command isn't guaranteed to end after receiving
+  #   a TERM signal. If the command hasn't stopped, waits post_timeout seconds
+  #   before sending a KILL signal (give it a chance to end gracefully).
+  #   Default: 5;
+  #   :fname_sanitizer [Callable Object] The call method of this object
+  #   should take a String and convert it (possibly losing information), to a
+  #   valid filename. Used over the commands to define the output files of
+  #   commands.
+  #   Default: BatchExperiment::FilenameSanitizer.new;
+  #   :skip_done_comms [FalseClass,TrueClass] Skip any command for what a
+  #   corresponding '.out' file exists, except if both a '.out' and a
+  #   '.unfinished' file exist, in the last case the command is executed.
+  #   Default: true;
+  #   :unfinished_ext [String] Extension to be used in place of '.unfinished'.
+  #   Default: '.unfinished';
+  #   :out_ext [String] Extension to be used in place of '.out'.
+  #   Default: '.out';
+  #   :err_ext [String] Extension to be used in place of '.err'.
+  #   Default: '.err';
+  # @return [Array<String>] Which commands were executed. Can be different from
+  #   the 'commands' argument if commands are skipped (see :skip_done_comms).
+  #
+  # @note This procedure was not designed to support equal commands (the last
+  #   equal command executed will overwrite the '.out', '.err' and '.unfinished'
+  #   files used by any previous equal command). But the parameter
+  #   conf[:fname_sanitizer] can be used to circumvent the restriction over
+  #   equal commands (if the object has state it can return a different
+  #   filename for every time it's called with the same argument).
+  # @note This procedure makes use of the following linux commands: time (not
+  #   the bash internal one, but the package one, i.e.
+  #   https://www.archlinux.org/packages/extra/x86_64/time/); timeout (from
+  #   coreutils); taskset (from util-linux,
+  #   https://www.archlinux.org/packages/core/x86_64/util-linux/); sh (the
+  #   shell).
+  # @note The command is executed inside a call to "sh -c command", so it has
+  #   to be a valid sh command.
+  # @note The output of the command "time -f #{conf[:time_fmt]}" will be
+  #   appended to the '.out' file of every command. If you set conf[:time_fmt]
+  #   to a empty string only a newline will be appended.
+  def self.batch(commands, conf)
+    # Throw exceptions if required configurations aren't provided.
+    fail "conf[:cpus_available] not set" unless conf[:cpus_available]
+    fail "conf[:timeout] not set" unless conf[:timeout]
+    # Initialize optional configurations with default values if they weren't
+    # provided. Don't change the conf argument, only our version of conf.
+    conf = conf.clone
+    conf[:time_fmt]         ||= 'ext_time: %e\\next_mem: %M\\n'
+    conf[:unfinished_ext]   ||= '.unfinished'
+    conf[:out_ext]          ||= '.out'
+    conf[:err_ext]          ||= '.err'
+    conf[:busy_loop_sleep]  ||= 0.1
+    conf[:post_timeout]     ||= 5
+    conf[:fname_sanitizer]  ||= BatchExperiment::FilenameSanitizer.new
+    conf[:skip_done_comms]    = true if conf[:skip_done_comms].nil?
+    # Initialize main variables
+    free_cpus = conf[:cpus_available].clone
+    comms_running = []
+    cpu = nil
+    comms_executed = []
+    commands.each do | command |
+      commfname = conf[:fname_sanitizer].call(command)
+      out_fname = commfname + conf[:out_ext]
+      err_fname = commfname + conf[:err_ext]
+      lockfname = commfname + conf[:unfinished_ext]
+      if conf[:skip_done_comms] && File.exists?(out_fname)
+        if File.exists?(lockfname)
+          puts "found file #{out_fname}, but a #{lockfname} also exists"
+          puts "will execute command '#{command}' anyway"
+        else
+          puts "found file #{commfname}, skipping command: #{command}"
+          STDOUT.flush
+          next
+        end
+      end
+      puts "waiting to execute command: #{command}"
+      STDOUT.flush
+      while free_cpus.empty? do
+        sleep conf[:busy_loop_sleep]
+        update_finished(free_cpus, comms_running, comms_executed)
+      end
+      cpu = free_cpus.pop
+      cproc = ChildProcess.build(
+        'taskset', '-c', cpu.to_s,
+        'time', '-f', conf[:time_fmt], '--append', '-o', out_fname,
+        'timeout', '--preserve-status', '-k', "#{conf[:post_timeout]}s",
+          "#{conf[:timeout]}s",
+        'sh', '-c', command
+      )
+      File.open(lockfname, 'w') {} # empty on purpose
+      out = File.open(out_fname, 'w')
+      err = File.open(err_fname, 'w')
+      cproc.io.stdout = out
+      cproc.io.stderr = err
+      cproc.start
+      comms_running << {
+        proc: cproc,
+        cpu: cpu,
+        lockfname: lockfname,
+        command: command
+      }
+      puts "command assigned to cpu#{cpu}"
+      STDOUT.flush
+    end
+    until comms_running.empty? do
+      sleep conf[:busy_loop_sleep]
+      update_finished(free_cpus, comms_running, comms_executed)
+    end
+    comms_executed
+  end
+  # gencommff: GENerate COMMands For Files
+  #
+  # @param comm [String] A string with 'patt' as a substring.
+  # @param patt [String] A string contained in 'comm'.
+  # @param files [Enumerable<String>] A list of strings to substitute patt at
+  #   comm.
+  # @return [Array<String>] Example: gencommff('echo STR', 'STR', ['a', 'b',
+  #   'c']) returns ['echo a', 'echo b', 'echo c'].
+  def self.gencommff(comm, patt, files)
+    ret = []
+    files.each { | f | ret << comm.gsub(patt, f) }
+    ret
+  end
+  # Intercalate a variable number of variable sized arrays in one array.
+  #
+  # @param [Array<Array<Object>>] xss An array of arrays.
+  # @return [Array<Object>] An array of the same size as the sum of the size
+  #   of all inner arrays. The values are the same (not copies) as the values
+  #   of the array. Example: intercalate([[1, 4, 6, 7], [], [2, 5], [3]])
+  #   returns [1, 2, 3, 4, 5, 6, 7].
+  def self.intercalate(xss)
+    ret = []
+    xss = xss.map { | xs | xs.reverse }
+    until xss.empty? do
+      xss.delete_if do | xs |
+        unless xs.empty?
+          ret << xs.pop
+        end
+        xs.empty?
+      end
+    end
+    ret
+  end
+  # Takes N shell commands and M files/parameters, execute each command of the
+  # N commands over the M files, save the output of each command/file
+  # combination, use objects provided with the command to extract relevant
+  # information from the output file, and group that information in a CSV
+  # file. Easier to understand seeing the sample_batch.rb example in action.
+  #
+  # @param comms_info [Array<Hash>] An array of hashes, each with the config
+  #   needed to know how to deal with the command. Four required fields
+  #   (all keys are symbols):
+  #   command [String] A string with a sh shell command.
+  #   pattern [String] A substring of command, will be replaced by the strings
+  #   in the parameter 'files'.
+  #   extractor [Extractor] An object that implements the Extractor interface.
+  #   prefix [String] A string that will be used to prefix the extractor.names
+  #   when they are used as column names. Improves Extractor reusability.
+  # @param batch_conf [Hash] Configuration used to call batch. See the
+  #   explanation for parameter 'conf' on the documentation of the batch
+  #   method. There are required fields for this hash parameter.
+  # @param conf [Hash] Lots of parameters. Here's a list:
+  #   csvfname [String] The filename/filepath for the file that will contain
+  #   the CSV data. Required field.
+  #   separator [String] The separator used at the CSV file. Default: ';'.
+  #   ic_columns [TrueClass, FalseClass] Intercalate the data returned by the
+  #   extractors. In other words, the csv line for some file will not present
+  #   all fields of the first command, then all fields of the second command,
+  #   etc, but instead will present the first field of all commands, the second
+  #   field of all commands, and so on. Default: true.
+  #   ic_comms [TrueClass, FalseClass] Intercalate the commands execution.
+  #   Instead of executing the first command over all files first, execute all
+  #   the commands over the first file first. This was made to avoid
+  #   confounding (statistical concept). If something disrupts the processing
+  #   power for some period of time, the effect will probably be distributed
+  #   between commands. The risk that some algorithm seems better or worse than
+  #   it really is will be reduced. For example: you are running tests on a
+  #   notebook and the notebook becomes unplugged for a short time. The cores
+  #   will probably enter energy saving mode and affect the observed
+  #   performance. If this happens while all commands being run are the same,
+  #   then it will seem that this command had a worse performance. If this
+  #   happens when the commands are intercalated, then maybe some instances
+  #   will seem harder than others (which is less problematic). Default: true.
+  #   skip_commands [TrueClass, FalseClass] If true, will not execute the
+  #   commands and assume that the outputs are already saved. Will only execute
+  #   the extractors over the already saved outputs, and create the CSV file
+  #   from them. Default: false.
+  #
+  # @param files [Array<Strings>] The strings that will replace the :pattern
+  #   on :command, for every element in comms_info.
+  #
+  # @return [NilClass,Array<String>] The return of the internal #batch
+  #   call. Returns nil if conf[:skip_commands] was set to true.
+  #
+  # @see BatchExperiment.batch
+  def self.experiment(comms_info, batch_conf, conf, files)
+    # Throw exceptions if required configurations aren't provided.
+    fail 'conf[:csvfname] is not defined' unless conf[:csvfname]
+    # Initialize optional configurations with default values if they weren't
+    # provided. Don't change the conf argument, only our version of conf.
+    conf = conf.clone
+    conf[:separator]  ||= ';'
+    conf[:ic_columns]   = true if conf[:ic_columns].nil?
+    conf[:ic_comms]     = true if conf[:ic_comms].nil?
+    #conf[:skip_commands] defaults to false/nil
+    # Get some of the batch config that we use inside here too.
+    out_ext = batch_conf[:out_ext] || '.out'
+    unfinished_ext = batch_conf[:unfinished_ext] || '.unfinished'
+    fname_sanitizer   = batch_conf[:fname_sanitizer]
+    fname_sanitizer ||= BatchExperiment::FilenameSanitizer.new
+    # Create commands the templates and the file list.
+    comms_sets = []
+    comms_info.each do | comm_info |
+      comms_sets << gencommff(comm_info[:command], comm_info[:pattern], files)
+    end
+    comm_list = conf[:ic_comm] ? intercalate(comms_sets) : comms_sets.flatten
+    # Execute the commands (or not).
+    ret = batch(comm_list, batch_conf) unless conf[:skip_commands]
+    # Build header (first csv line, column names).
+    header = []
+    comms_info.each do | comm_info |
+      prefixed_names = comm_info[:extractor].names.map do | name |
+        (comm_info[:prefix] + ' ') << name
+      end
+      header << prefixed_names
+    end
+    header = intercalate(header) if conf[:ic_columns]
+    header = ['Filename'].concat(header).join(conf[:separator])
+    # Build body (inspect all output files an make csv lines).
+    body = [header]
+    files.each_with_index do | inst_fname, j |
+      line = []
+      comms_info.each_with_index do | comm_info, i |
+        command =
+          if conf[:ic_comm]
+            comm_list[(j * comms_info.size) + i]
+          else
+            comm_list[(i * files.size) + j]
+          end
+        partial_fname = fname_sanitizer.call(command)
+        out_fname = partial_fname + out_ext
+        lockfname = partial_fname + unfinished_ext
+        if File.exists?(out_fname)
+          f_content = File.open(out_fname, 'r') { | f | f.read }
+          line << comm_info[:extractor].extract(f_content)
+        else
+          # if the file wasn't created insert a empty column set
+          # of the same size the true column set would be
+          line << comm_info[:extractor].names.map { | _ | '' }
+        end
+      end
+      line = intercalate(line) if conf[:ic_columns]
+      body << [inst_fname].concat(line).join(conf[:separator])
+    end
+    body = body.map! { | line | line << conf[:separator] }.join("\n")
+    # Write CSV data into a CSV file.
+    File.open(conf[:csvfname], 'w') { | f | f.write(body) }
+    return ret
+  end
+end

metadata ADDED Viewed

@@ -0,0 +1,64 @@
+--- !ruby/object:Gem::Specification
+name: batch_experiment
+version: !ruby/object:Gem::Version
+  version: 0.1.0
+platform: ruby
+authors:
+- Henrique Becker
+autorequire:
+bindir: bin
+cert_chain: []
+date: 2016-03-18 00:00:00.000000000 Z
+dependencies:
+- !ruby/object:Gem::Dependency
+  name: childprocess
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '0.5'
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '0.5'
+description: ''
+email: henriquebecker91@gmail.com
+executables: []
+extensions: []
+extra_rdoc_files: []
+files:
+- examples/sample_batch.rb
+- examples/ukp_batch.rb
+- lib/batch_experiment.rb
+- lib/batch_experiment/extractor.rb
+- lib/batch_experiment/sample_extractors.rb
+homepage: https://rubygems.org/gems/batch_experiment
+licenses:
+- Public Domain
+- Unlicense
+metadata: {}
+post_install_message:
+rdoc_options: []
+require_paths:
+- lib
+required_ruby_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      version: '0'
+required_rubygems_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      version: '0'
+requirements: []
+rubyforge_project:
+rubygems_version: 2.5.1
+signing_key:
+specification_version: 4
+summary: A ruby script that distributes system commands between cpu cores, and save
+  their output.
+test_files: []