batch_experiment 1.0.2 → 2.0.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 1eaf01a01169f63766cca960fc0acb6edfdff9cb
4
- data.tar.gz: 69e6f2a63a0d3cab8f3c0c1974327a52a1adcf60
3
+ metadata.gz: 4bfaf4a6fa10230872eb47786f5ed25caef9866c
4
+ data.tar.gz: 5c2d57dbc5487a37d409bd6c9454177f16d44cae
5
5
  SHA512:
6
- metadata.gz: 5582c9660677fab1bdb3a8ce3f0af9d4f4254b0b3f91510aca64d05a74bc4f6b3614f5eec67cebe5d2a5b9e51b33cd8c0d007fc32a61d84bce390b8957307004
7
- data.tar.gz: 41fec77dbd71293fd7bd013167301d39c3001dc9504e69b714414f228c403e99f5c2d9f9780114ae54a035a3a38436dbbcd622523591782b31f005cd68cf6494
6
+ metadata.gz: e53e5c263fca88c5389268e2d8e2aafd8491bf100d3ad3888d84024ba7f515e4797ef7856fd0f03274274f563e3263a5bb5d55a1e336210e3e770409ca52e377
7
+ data.tar.gz: f4d4741012d48a9383ac7cd73013ab40ab86d0dcbd98d7b55cee2b0ae6c3ffe790c1662d60ab2093249a4d41a720c2705a3615198ce4aae4239a5ecc1da52c2e
data/README.md CHANGED
@@ -15,7 +15,7 @@ What conditions you need to use this tool:
15
15
 
16
16
  What is not needed:
17
17
 
18
- * To know how to program in ruby. Only taking less than 5 minutes to learn some basic syntax will suffice to run commands on multiple cores and save the results to files. However, there's an exception, if you want not only to execute the commands but want to extract and group some information from their output to a CVS, you will need to tell ruby how to do the extracting part.
18
+ * To know how to program in ruby. Only taking less than 5 minutes to learn some basic syntax will suffice to run commands on multiple cores and save the results to files (using BatchExperiment::batch). However, if you want not only to execute the commands but also to extract and group some information from their output into a CSV (using BatchExperiment::experiment), you will need to tell ruby how to do the extracting part.
19
19
 
20
20
  ## How to use it
21
21
 
@@ -23,7 +23,7 @@ You will need to create a ruby script (copy, past and adapt one of the provided
23
23
 
24
24
  ## Examples
25
25
 
26
- After installing the gem, you will have a examples folder (/home/YOUR_USER/.gem/ruby/RUBY_VERSION/gems/batch_experiment-GEM_VERSION/examples). The sample_batch.rb gives you a good ideia of how to use #batch (no csv creation). The example_batch.rb together with the lib/batch_experiment/sample_extractors.rb gives a good ideia of how to use #experiment with multiple commands and extractors (csv creation).
26
+ After installing the gem, you will have an examples folder (/home/YOUR_USER/.gem/ruby/RUBY_VERSION/gems/batch_experiment-GEM_VERSION/examples). The sample_batch.rb gives you a good idea of how to use ::batch (no csv creation).
27
27
 
28
28
  ```ruby
29
29
  #!/bin/ruby
@@ -50,5 +50,7 @@ conf = {
50
50
  BatchExperiment::batch(commands, conf)
51
51
  ```
52
52
 
53
+ The experiment_example.rb (and the lib/batch_experiment/sample_extractors.rb) give a good idea of how to use ::experiment with multiple commands and how to create an extractor (used to create a csv).
54
+
53
55
  This code was born in [this repository](https://github.com/henriquebecker91/masters/tree/master/codes/rb/batch_experiment).
54
56
 
@@ -1,6 +1,6 @@
1
1
  #!/bin/ruby
2
2
 
3
- require_relative '../lib/batch_experiment.rb'
3
+ require 'batch_experiment'
4
4
 
5
5
  commands = []
6
6
  10000.times { | n | commands << "sleep 1 && echo #{n}" }
@@ -13,30 +13,30 @@ comms_info = [{
13
13
  pattern: 'y',
14
14
  extractor: BatchExperiment::FirstLineExtractor,
15
15
  prefix: 'echo',
16
- }, {
17
- command: 'wc FILE',
18
- pattern: 'FILE',
19
- extractor: BatchExperiment::WcExtractor,
20
- prefix: 'wc',
21
16
  }]
22
17
 
23
- execution_info = {
18
+ batch_conf = {
24
19
  # IDs of the CPU cores that can be used for executing tests.
25
20
  cpus_available: [1, 2, 3],
26
21
  # Maximum number of seconds that a command can run. After this a kill command
27
22
  # (TERM signal) will be issued.
28
23
  timeout: 5,
24
+ # Object that gives the filename for storing the output of each run.
25
+ converter: BatchExperiment::Comm2FnameConverter.new,
29
26
  }
30
27
 
31
- conf = {
28
+ experiment_conf = {
32
29
  # The name of the file where will be written the CSV data.
33
30
  csvfname: 'example.csv',
34
- # The columns will be ordered by command. All the columns of the first
35
- # command before the one from the second and so on.
36
- ic_columns: false,
31
+ # Number of times the same command will be executed over the same file.
32
+ qt_runs: 5,
33
+ # Order of the commands execution
34
+ comms_order: :random, #:by_comm, #:by_file,
35
+ # Random seed (only used if comms_order is :random)
36
+ rng: Random.new(0),
37
37
  }
38
38
 
39
39
  files = ['bible.txt', 'taoteching.txt']
40
40
 
41
- BatchExperiment::experiment(comms_info, execution_info, conf, files)
41
+ BatchExperiment::experiment(comms_info, batch_conf, experiment_conf, files)
42
42
 
@@ -1,7 +1,7 @@
1
1
  #!/usr/bin/ruby
2
2
 
3
- require_relative 'batch_experiment'
4
- require_relative 'batch_experiment/sample_extractors'
3
+ require 'batch_experiment'
4
+ require 'batch_experiment/sample_extractors'
5
5
 
6
6
  # I run the three lines below in the console to disable hyperthreading cores on
7
7
  # my computer before examining the cores with the top command.
@@ -24,16 +24,20 @@ comms_info = [{
24
24
  execution_info = {
25
25
  cpus_available: [1, 2, 3],
26
26
  timeout: 10,
27
- post_timeout: 5,
27
+ post_timeout: 2,
28
28
  }
29
29
 
30
- conf = { csvfname: 'pya_site8.csv' }
30
+ conf = {
31
+ csvfname: 'pya_site8.csv',
32
+ comms_order: :random,
33
+ qt_runs: 10,
34
+ }
31
35
 
32
36
  files = ['corepb.ukp', 'exnsd18.ukp', 'exnsd26.ukp', 'exnsdbis18.ukp', 'exnsd16.ukp', 'exnsd20.ukp', 'exnsdbis10.ukp', 'exnsds12.ukp']
33
37
  # If you don't execute the script from the ukp files folder you need to put the
34
38
  # folder relative or absolute path here (with trailing slash).
35
- path = ''
39
+ path = '~/Aulas/mestrado/masters/data/ukp/'
36
40
  files.map! { | f | path + f }
37
41
 
38
- experiment(comms_info, execution_info, conf, files)
42
+ BatchExperiment::experiment(comms_info, execution_info, conf, files)
39
43
 
@@ -4,9 +4,15 @@ require 'pathname'
4
4
  # The main module, the two main utility methods offered are ::batch and
5
5
  # ::experiment.
6
6
  module BatchExperiment
7
- # The default callable object used by #batch to convert a command into a
8
- # filename.
9
- module FilenameSanitizer
7
+ # The default callable object used by Comm2FnameConverter to convert
8
+ # a command into a filename. Comm2FnameConverter doesn't create a sanitized
9
+ # filename from the command string (it uses its first argument to do this,
10
+ # whose default is FnameSanitizer).
11
+ # Note that this is a pure function, so if the same command appears more than
12
+ # one time, it will get the same name, it's Comm2FnameConverter that gives
13
+ # multiple instances of the same command different names (by suffixing with
14
+ # numbers).
15
+ module FnameSanitizer
10
16
  def self.call(command)
11
17
  fname = command.strip
12
18
  fname.gsub!(/[^[:alnum:]]/, '_')
@@ -17,7 +23,57 @@ module BatchExperiment
17
23
  end
18
24
  end
19
25
 
20
- # Internal use only. DO NOT DEPEND.
26
+ # Converts a command to a filename using a given sanitizer, gives different
27
+ # names to different calls with the same arguments. Example: if a call with
28
+ # "sleep 1" yields "sleep_1", the second call with the same argument yields
29
+ # "sleep_1.2", and so on. Note that this is done by remembering previous
30
+ # calls, the object doesn't inspect the filesystem to check if that name
31
+ # was or wasn't used.
32
+ class Comm2FnameConverter
33
+ # Creates a new Comm2FnameConverter, with no memory of any previous calls.
34
+ #
35
+ # @param sanitizer [#call] Callable object used to create a filename from
36
+ # the arguments passed to Comm2FnameConverter.call. This class expects
37
+ # that sanitizer has no internal state, so when an instance of this class
38
+ # is cloned, there's no problem with sharing the sanitizer between the
39
+ # clones. Default: BatchExperiment::FnameSanitizer.
40
+ def initialize(sanitizer = FnameSanitizer)
41
+ @num_times_seen = {}
42
+ @sanitizer = sanitizer
43
+ end
44
+
45
+ # Takes a command, creates a fname for it, if this fname was already seen
46
+ # before, returns the fname + ".N", where N is the number of times fname
47
+ # has been seen so far, counting the current call (so N starts at 2).
48
+ #
49
+ # @param comm [String] A system command.
50
+ # @return [String] The sanitized filename created from that command.
51
+ # @note Note that different arguments can be reduced to the same
52
+ # sanitized filename and, if this happens, they will NOT overwrite
53
+ # each other. Example: 'echo "abc"' -> 'echo_abc'; 'echo abc' ->
54
+ # 'echo_abc.2'.
55
+ def call(comm)
56
+ fname = @sanitizer.call(comm)
57
+ if @num_times_seen.include? fname
58
+ @num_times_seen[fname] += 1
59
+ fname << ".#{@num_times_seen[fname]}"
60
+ else
61
+ @num_times_seen[fname] = 1
62
+ end
63
+
64
+ fname.clone
65
+ end
66
+
67
+ def initialize_clone(old)
68
+ @num_times_seen = old.num_times_seen.clone
69
+ end
70
+
71
+ # To allow the initialize_clone implementation.
72
+ protected
73
+ attr_reader :num_times_seen
74
+ end
75
+
76
+ # INTERNAL USE ONLY.
21
77
  # Remove any finished commands from comms_running, insert the cpus
22
78
  # freed by the commands termination to the free_cpus, insert the
23
79
  # terminated commands on comms_executed.
@@ -42,7 +98,7 @@ module BatchExperiment
42
98
  # filenames are derived from the commands. Appending '.out' to one of the
43
99
  # partial filenames will give the filename were the command stdout was
44
100
  # redirected. The analogue is valid for '.err' and stderr. Right before a
45
- # command begans to run, a "partial_filename.unfinished file is created.
101
+ # command begins to run, a 'partial_filename.unfinished' file is created.
46
102
  # After the command ends its execution this file is removed. If the command
47
103
  # ends its execution by means of a timeout the file is also removed. The file
48
104
  # only remains if the batch procedure is interrupted (script was killed,
@@ -51,7 +107,7 @@ module BatchExperiment
51
107
  #
52
108
  # @param commands [Array<String>] The shell commands.
53
109
  # @param conf [Hash] The configurations, as follows:
54
- # -- cpus_available [Array<Fixnum>] Cpu cores that can be used to run the
110
+ # -- cpus_available [Array<Fixnum>] CPU cores that can be used to run the
55
111
  # commands. Required parameter. The cpu numbers begin at 0, despite what
56
112
  # htop tells you.
57
113
  # -- timeout [Number] Number of seconds before killing a command. Required
@@ -59,35 +115,36 @@ module BatchExperiment
59
115
  # -- time_fmt [String] A string in the time (external command) format. See
60
116
  # http://linux.die.net/man/1/time. Default: 'ext_time: %e\next_mem: %M\n'.
61
117
  # -- busy_loop_sleep [Number] How many seconds to wait before checking if
62
- # a command ended execution. This is max time a cpu will be vacant between
63
- # two commands. Default: 0.1.
118
+ # a command ended execution. This time will be very close to the max time a
119
+ # cpu will remain vacant between two commands. Default: 0.1 (1/10 second).
64
120
  # -- post_timeout [Number] A command isn't guaranteed to end after
65
121
  # receiving a TERM signal. If the command hasn't stopped, waits
66
122
  # post_timeout seconds before sending a KILL signal (give it a chance to
67
123
  # end gracefully). Default: 5.
68
- # -- fname_sanitizer [#call] The call method of this object
69
- # should take a String and convert it (possibly losing information), to a
70
- # valid filename. Used over the commands to define the output files of
71
- # commands. Default: BatchExperiment::FilenameSanitizer
124
+ # -- converter [#call] The call method of this object should take a String
125
+ # and convert it (possibly losing information), to a valid filename. Used
126
+ # over the commands to define the output files of commands.
127
+ # Default: BatchExperiment::Comm2FnameConverter.new.
72
128
  # -- skip_done_comms [FalseClass,TrueClass] Skip any command for what a
73
129
  # corresponding '.out' file exists, except if both a '.out' and a
74
- # '.unfinished' file exist, in the last case the command is executed.
130
+ # '.unfinished' file exist, in the last case the command is always
131
+ # executed. If false, execute all commands and overwrite all ".out".
75
132
  # Default: true.
76
133
  # -- unfinished_ext [String] Extension to be used in place of
77
- # '.unfinished'. Default: '.unfinished'.
134
+ # '.unfinished'. Default: '.unfinished'.
78
135
  # -- out_ext [String] Extension to be used in place of '.out'.
79
136
  # Default: '.out'.
80
137
  # -- err_ext [String] Extension to be used in place of '.err'.
81
138
  # Default: '.err'.
139
+ #
82
140
  # @return [Array<String>] Which commands were executed. Can be different from
83
141
  # the 'commands' argument if commands are skipped (see :skip_done_comms).
84
142
  #
85
143
  # @note If the same command is executed over the same file more than one
86
- # time, then only the last execution will be saved (because the '.out',
87
- # '.err' and '.unfinished' files will be overwritten). But the parameter
88
- # conf\[:fname_sanitizer\] can be used to circumvent the restriction over
89
- # equal commands (if the object has state it can return a different
90
- # filename for every time it's called with the same argument).
144
+ # time, then any run besides the first will have a numeric suffix.
145
+ # Example: "sleep 1" -> "sleep_1", "sleep 1" -> "sleep_1.2".
146
+ # For more info see the parameter conf\[:converter\], and its
147
+ # default value BatchExperiment::Comm2FnameConverter.new.
91
148
  # @note This procedure makes use of the following linux commands: time (not
92
149
  # the bash internal one, but the package one, i.e.
93
150
  # https://www.archlinux.org/packages/extra/x86_64/time/); timeout (from
@@ -101,8 +158,10 @@ module BatchExperiment
101
158
  # conf\[:time_fmt\] to a empty string only a newline will be appended.
102
159
  def self.batch(commands, conf)
103
160
  # Throw exceptions if required configurations aren't provided.
104
- fail 'conf[:cpus_available] not set' unless conf[:cpus_available]
105
- fail 'conf[:timeout] not set' unless conf[:timeout]
161
+ if !conf[:cpus_available] then
162
+ fail ArgumentError, 'conf[:cpus_available] not set'
163
+ end
164
+ fail ArgumentError, 'conf[:timeout] not set' unless conf[:timeout]
106
165
 
107
166
  # Initialize optional configurations with default values if they weren't
108
167
  # provided. Don't change the conf argument, only our version of conf.
@@ -113,7 +172,7 @@ module BatchExperiment
113
172
  conf[:err_ext] ||= '.err'
114
173
  conf[:busy_loop_sleep] ||= 0.1
115
174
  conf[:post_timeout] ||= 5
116
- conf[:fname_sanitizer] ||= BatchExperiment::FilenameSanitizer
175
+ conf[:converter] ||= BatchExperiment::Comm2FnameConverter.new
117
176
  conf[:skip_done_comms] = true if conf[:skip_done_comms].nil?
118
177
 
119
178
  # Initialize main variables
@@ -123,23 +182,23 @@ module BatchExperiment
123
182
  comms_executed = []
124
183
 
125
184
  commands.each do | command |
126
- commfname = conf[:fname_sanitizer].call(command)
185
+ commfname = conf[:converter].call(command)
127
186
  out_fname = commfname + conf[:out_ext]
128
187
  err_fname = commfname + conf[:err_ext]
129
188
  lockfname = commfname + conf[:unfinished_ext]
130
189
 
131
190
  if conf[:skip_done_comms] && File.exists?(out_fname)
132
191
  if File.exists?(lockfname)
133
- puts "found file #{out_fname}, but a #{lockfname} also exists"
134
- puts "will execute command '#{command}' anyway"
192
+ puts "Found file #{out_fname}, but a #{lockfname} also exists:"
193
+ puts "Will execute command '#{command}' anyway."
135
194
  else
136
- puts "found file #{commfname}, skipping command: #{command}"
195
+ puts "Found file #{commfname}, skipping command: #{command}"
137
196
  STDOUT.flush
138
197
  next
139
198
  end
140
199
  end
141
200
 
142
- puts "waiting to execute command: #{command}"
201
+ puts "Waiting to execute command: #{command}"
143
202
  STDOUT.flush
144
203
 
145
204
  while free_cpus.empty? do
@@ -188,21 +247,29 @@ module BatchExperiment
188
247
  comms_executed
189
248
  end
190
249
 
191
- # gencommff: GENerate COMMands For Files
250
+ # INTERNAL USE ONLY. gencommff: GENerate COMMands For Files.
251
+ # Creates a hash with the generated commands as keys, and store (as the
252
+ # respective value) the comm_info hash and the file (using a { comm_info: X,
253
+ # filename: Y } structure).
192
254
  #
193
- # @param comm [String] A string with 'patt' as a substring.
194
- # @param patt [String] A string contained in 'comm'.
195
- # @param files [Enumerable<String>] A list of strings to substitute patt at
196
- # comm.
197
- # @return [Array<String>] Example: gencommff('echo STR', 'STR', ['a', 'b',
198
- # 'c']) returns ['echo a', 'echo b', 'echo c'].
199
- def self.gencommff(comm, patt, files)
200
- ret = []
201
- files.each { | f | ret << comm.gsub(patt, f) }
255
+ # @param comm_info [Hash] A hash structure following the same format
256
+ # that the elements of the comms_info array parameter of #experiment.
257
+ # @param files [Enumerable<String>] A list of strings that will replace
258
+ # comm_info[:pattern] at a copy of comm_info[:command].
259
+ # @return [Hash<String, Hash>] A hash on the following format
260
+ # { expanded_command => { comm_info: comm_info, filename: f }, ...}
261
+ def self.gencommff(comm_info, files)
262
+ ret = {}
263
+ comm = comm_info[:command]
264
+ patt = comm_info[:pattern]
265
+ files.each do | f |
266
+ ret[comm.gsub(patt, f)] = { comm_info: comm_info, filename: f }
267
+ end
202
268
  ret
203
269
  end
204
270
 
205
- # Intercalate a variable number of variable sized arrays in one array.
271
+ # INTERNAL USE ONLY. Intercalate a variable number of variable sized arrays
272
+ # in one array.
206
273
  #
207
274
  # @param [Array<Array<Object>>] xss An array of arrays.
208
275
  # @return [Array<Object>] An array of the same size as the sum of the size
@@ -223,53 +290,94 @@ module BatchExperiment
223
290
  ret
224
291
  end
225
292
 
293
+ class ColumnSpecError < ArgumentError; end
294
+
295
+ # INTERNAL USE ONLY. Check if the headers can be combined, if they can
296
+ # return a shallow copy of the biggest header, otherwise throw an exception.
297
+ #
298
+ # @param headers [Array<Array<Comparable>>] An array of arrays of strings
299
+ # (or any object that implements '!=').
300
+ # @return A shallow copy of the biggest inner array in headers. Only returns
301
+ # if each position on the biggest inner array has the same value as
302
+ # that position on all the other arrays with at least that size.
303
+ def self.merge_headers(headers)
304
+ mer_size = headers.map { | h | h.size }.max
305
+ merged_h = Array.new(mer_size)
306
+ mer_size.times do | i |
307
+ headers.each do | h |
308
+ next if h.size < i
309
+ if merged_h[i].nil?
310
+ merged_h[i] = h[i]
311
+ elsif merged_h[i] != h[i]
312
+ raise ColumnSpecError, "Error: When using BatchExperiment::experiment"
313
+ + " all the extractors have to agree on the columns they share."
314
+ + " In the specific case: the column nº #{i} was labeled as"
315
+ + " '#{merged_h[i]}' on one extractor, and '#{h[i]}' on another,"
316
+ + " this can be only a difference on notation ('time' vs 'Time'),"
317
+ + " or can mean that in the same column two different kinds of data"
318
+ + " are being presented. The program will be aborted. Check that."
319
+ end
320
+ end
321
+ end
322
+ merged_h
323
+ end
324
+
226
325
  # Takes N shell commands and M files/parameters, execute each command of the
227
326
  # N commands over the M files, save the output of each command/file
228
327
  # combination, use objects provided with the command to extract relevant
229
- # information from the output file, and group those information in a CVS
328
+ # information from the output file, and group those information in a CSV
230
329
  # file. Easier to understand seeing the sample_batch.rb example in action.
231
330
  #
232
331
  # @param comms_info [Array<Hash>] An array of hashs, each with the config
233
332
  # needed to know how to deal with the command. Four required fields
234
333
  # (all keys are symbols):
235
334
  # command [String] A string with a sh shell command.
236
- # pattern [String] A substring of command, will be replace by the strings
335
+ # pattern [String] A substring of command, will be replaced by the strings
237
336
  # in the parameter 'files'.
238
337
  # extractor [#extract,#names] Object implementing the Extractor interface.
239
- # prefix [String] A string that will be used to prefix the extractor.names
240
- # when they are used as column names. Improves Extractor reusability.
338
+ # prefix [String] A string that will be used on the 'algorithm' column
339
+ # to identify the used command.
241
340
  # @param batch_conf [Hash] Configuration used to call batch. See the
242
341
  # explanation for parameter 'conf' on the documentation of the batch
243
- # method. There are required fields for this hash parameter.
342
+ # method. There are required fields for this hash parameter. Also, note
343
+ # that the batch_conf\[:converter\] should allow cloning without sharing
344
+ # mutable state. A converter clone is used by #experiment internally, it
345
+ # has to obtain the same results as the original copy (that is passed to
346
+ # BatchExperiment::batch).
244
347
  # @param conf [Hash] Lots of parameters. Here's a list:
245
- # csvfname [String] The filename/filepath for the file that will contain
348
+ # -- csvfname [String] The filename/filepath for the file that will contain
246
349
  # the CSV data. Required field.
247
350
  # separator [String] The separator used at the CSV file. Default: ';'.
248
- # ic_columns [TrueClass, FalseClass] Intercalate the data returned by the
249
- # extractors. In other words, the csv line for some file will not present
250
- # all fields of the first command, then all fields of the second command,
251
- # etc, but instead will present the first field of all commands, the second
252
- # field of all commands, and so on. Default: true.
253
- # ic_comms [TrueClass, FalseClass] Intercalate the commands execution.
254
- # Instead of executing the first command over all files first, execute all
255
- # the commands over the first file first. This was made to avoid
256
- # confounding (statistical concept). If something disrupts the processing
257
- # power for some period of time, the effect will probably be distributed
258
- # between commands. The risk some algorithm seems better or worse than it
259
- # really is will be reduced. For example: you are making tests at an
260
- # notebook, the notebook becomes unplugged for a short time. The cores will
261
- # probably enter in energy saving mode and affect the observed performance.
262
- # If this happens when all tested commands are the same, then will seem
263
- # that that an command had a worse performance. If this happens when the
264
- # commands are intercalated, then maybe some instances will seem harder
265
- # than others (what is less problematic). Default: true.
351
+ # -- qt_runs [NilClass,Integer] If nil or one then each command is
352
+ # executed once. If is a number bigger than one, the command is executed
353
+ # that number of times. The batch_conf[:converter] will define the name
354
+ # that will be given to each run. Every file will appear qt_runs times on
355
+ # the filename column and, for the same file, the values on the run_number
356
+ # column will be the integer numbers between 1 and qt_runs (both
357
+ # inclusive). Default: nil.
358
+ # -- comms_order [:by_comm,:by_file,:random] The order the
359
+ # commands will be executed. Case by_comm: will execute the first command
360
+ # over all the files (using the files order), then will execute the
361
+ # second command over all files, and so on. Case by_file: will execute
362
+ # all the commands (using the comms_info order) over the first file,
363
+ # then will execute all the commands over the second file, and so on.
364
+ # Case random: will expand all the command/file combinations (replicating
365
+ # the same command qt_runs times) and then will apply shuffle to this array,
366
+ # using the object passed to the rng parameter. This last option is the
367
+ # most adequate for statistical testing.
368
+ # -- rng [Nil,#rand] An object that implements the #rand method (behaves
369
+ # like an instance of the core Random class). If comms_order is random and
370
+ # rng is nil, will issue a warning remembering the default that was used.
371
+ # Default: Random.new(42).
266
372
  # skip_commands [TrueClass, FalseClass] If true, will not execute the
267
- # commands and assume that the outputs are already saved. Will only execute
268
- # the extractors over the already saved outputs, and create the CSV file
269
- # from them. Default: false.
373
+ # commands and assume that the outputs are already saved (on ".out" files).
374
+ # Will only execute the extractors over the already saved outputs, and
375
+ # create the CSV file from them. Default: false.
270
376
  #
271
377
  # @param files [Array<Strings>] The strings that will replace the :pattern
272
- # on :command, for every element in comms_info.
378
+ # on :command, for every element in comms_info. Can be a filename, or
379
+ # can be anything else (a numeric parameter, sh code, etc..), but we
380
+ # refer to them as files for simplicity and uniformity.
273
381
  #
274
382
  # @return [NilClass,Array<String>] The return of the internal #batch
275
383
  # call. Returns nil if conf[:skip_commands] was set to true.
@@ -283,67 +391,112 @@ module BatchExperiment
283
391
  # Initialize optional configurations with default values if they weren't
284
392
  # provided. Don't change the conf argument, only our version of conf.
285
393
  conf = conf.clone
286
- conf[:separator] ||= ';'
287
- conf[:ic_columns] = true if conf[:ic_columns].nil?
288
- conf[:ic_comms] = true if conf[:ic_comms].nil?
394
+ conf[:separator] ||= ';'
395
+ conf[:qt_runs] ||= 1
396
+ conf[:comms_order] ||= :by_comm
397
+ conf[:rng] ||= Random.new(42)
289
398
  #conf[:skip_commands] defaults to false/nil
290
399
 
291
400
  # Get some of the batch config that we use inside here too.
292
- out_ext = batch_conf[:out_ext] || '.out'
293
- unfinished_ext = batch_conf[:unfinished_ext] || '.unfinished'
294
- fname_sanitizer = batch_conf[:fname_sanitizer]
295
- fname_sanitizer ||= BatchExperiment::FilenameSanitizer
401
+ out_ext = batch_conf[:out_ext] || '.out'
402
+ unfinished_ext = batch_conf[:unfinished_ext] || '.unfinished'
403
+ converter = batch_conf[:converter].clone unless batch_conf[:converter].nil?
404
+ converter ||= BatchExperiment::Comm2FnameConverter.new
296
405
 
297
- # Create commands the templates and the file list.
406
+ # Expand all commands, combining command templates and files.
298
407
  comms_sets = []
299
408
  comms_info.each do | comm_info |
300
- comms_sets << gencommff(comm_info[:command], comm_info[:pattern], files)
409
+ comms_sets << gencommff(comm_info, files)
301
410
  end
302
411
 
303
- comm_list = conf[:ic_comms] ? intercalate(comms_sets) : comms_sets.flatten
412
+ expanded_comms = comms_sets.map { | h | h.keys }
413
+ # If each command should be run more than once...
414
+ if conf[:qt_runs] > 1
415
+ # ... we replace each single command by an array of qt_runs copies,
416
+ # and then flatten the parent array.
417
+ expanded_comms.map! do | a |
418
+ a.map! { | c | Array.new(conf[:qt_runs], c) }.flatten!
419
+ end
420
+ end
421
+
422
+ # At this moment the expanded_comms is an array of arrays, each internal
423
+ # array has all the expanded commands of the one single command template
424
+ # over all the files.
425
+ # After the code block below, the expanded_comms will be an one-level array
426
+ # of the expanded commands, in the order they will be executed.
427
+ expanded_comms = case conf[:comms_order]
428
+ when :by_comm # all runs of the first command template first
429
+ expanded_comms.flatten!
430
+ when :by_file # all runs over the first file first
431
+ intercalate(expanded_comms)
432
+ when :random # a random order
433
+ expanded_comms.flatten!.shuffle!(random: conf[:rng])
434
+ end
304
435
 
305
436
  # Execute the commands (or not).
306
- ret = batch(comm_list, batch_conf) unless conf[:skip_commands]
437
+ ret = batch(expanded_comms, batch_conf) unless conf[:skip_commands]
307
438
 
308
439
  # Build header (first csv line, column names).
309
- header = []
310
- comms_info.each do | comm_info |
311
- prefixed_names = comm_info[:extractor].names.map do | name |
312
- (comm_info[:prefix] + ' ') << name
440
+ header = ['algorithm', 'filename', 'run_number']
441
+ header << merge_headers(comms_info.map { | c | c[:extractor].names })
442
+ header = header.join(conf[:separator])
443
+
444
+ # We need to merge the union of all comms_sets to query it.
445
+ comm2origin = {}
446
+ comms_sets.each do | h |
447
+ comm2origin.merge!(h) do | k, v, v2 |
448
+ puts "WARNING: The command expansion '#{k}' was generated more than once. The first time was by the template '#{v[:comm]}' and the file '#{v[:file]}', and this time by template '#{v2[:comm]}' and the file '#{v2[:file]}'. Will report on CSV as this command was generated by the template '#{v[:comm]}' and the file '#{v[:file]}'."
449
+ v
313
450
  end
314
- header << prefixed_names
315
451
  end
316
- header = intercalate(header) if conf[:ic_columns]
317
- header = ['Filename'].concat(header).join(conf[:separator])
318
452
 
319
- # Build body (inspect all output files an make csv lines).
453
+ # Build body (inspect all output files and make csv lines).
454
+ #
455
+ # Body format: algorithm;filename;run_number;first extracted column; ...
456
+ #
457
+ # This means that the extractors have to agree on what is each column, two
458
+ # different extractors have to extract the same kind of data at each column
459
+ # (the first field returned by all extractors has to be, for example, cpu
460
+ # time, the same applies for the remaining fields).
461
+ # If one extractor extracts more fields than the others this is not a
462
+ # problem: if the second biggest extractor (in number of fields extracted)
463
+ # extracts, for example, 4 fields, and the biggest extracts 6 fields,
464
+ # the first 4 fields extracted by the biggest extractor have to be the same
465
+ # as the ones on the second-biggest extractor. This way, all the lines will
466
+ # have the same kind of data on the first four columns (not counting the
467
+ # algorithm, filename and run_number ones), and only lines originating from
468
+ # the biggest extractor will have data on the fifth and sixth columns.
320
469
  body = [header]
321
- files.each_with_index do | inst_fname, j |
322
- line = []
323
- comms_info.each_with_index do | comm_info, i |
324
- command =
325
- if conf[:ic_comms]
326
- comm_list[(j * comms_info.size) + i]
327
- else
328
- comm_list[(i * files.size) + j]
329
- end
330
-
331
- partial_fname = fname_sanitizer.call(command)
332
- out_fname = partial_fname + out_ext
333
- lockfname = partial_fname + unfinished_ext
334
- if File.exists?(out_fname)
335
- f_content = File.open(out_fname, 'r') { | f | f.read }
336
- line << comm_info[:extractor].extract(f_content)
470
+ times_found = {}
471
+ expanded_comms.each do | exp_comm |
472
+ run_info = comm2origin[exp_comm]
473
+ algorithm = run_info[:comm_info][:prefix]
474
+ filename = run_info[:filename]
475
+
476
+ times_found[exp_comm] ||= 0
477
+ times_found[exp_comm] += 1
478
+ run_number = times_found[exp_comm]
479
+
480
+ curr_line = [algorithm, filename, run_number]
481
+
482
+ partial_fname = converter.call(exp_comm)
483
+ out_fname = partial_fname + out_ext
484
+ lockfname = partial_fname + unfinished_ext
485
+ extractor = run_info[:comm_info][:extractor]
486
+
487
+ if File.exists?(out_fname)
488
+ if File.exists?(lockfname)
489
+ puts "Ignored file '#{out_fname}' because there was a"
490
+ + " '#{lockfname}' file in the same folder."
337
491
  else
338
- # if the file wasn't created insert a empty column set
339
- # of the same size the true column set would be
340
- line << comm_info[:extractor].names.map { | _ | '' }
492
+ f_content = File.open(out_fname, 'r') { | f | f.read }
493
+ curr_line << extractor.extract(f_content)
341
494
  end
342
495
  end
343
- line = intercalate(line) if conf[:ic_columns]
344
- body << [inst_fname].concat(line).join(conf[:separator])
496
+
497
+ body << curr_line.join(conf[:separator])
345
498
  end
346
- body = body.map! { | line | line << conf[:separator] }.join("\n")
499
+ body = body.join(conf[:separator] + "\n")
347
500
 
348
501
  # Write CSV data into a CSV file.
349
502
  File.open(conf[:csvfname], 'w') { | f | f.write(body) }
@@ -2,7 +2,7 @@ module BatchExperiment
2
2
  # Module that defines the interface used for extracting info from other
3
3
  # programs' output. You don't need to include it in your object; it will suffice
4
4
  # that the object (that you will use to extract info from the output) has the
5
- # ::names and ::extract methods defined.
5
+ # #names and #extract methods defined.
6
6
  module Extractor
7
7
  # Find a line in the following format: "field: value", return value.
8
8
  #
@@ -13,7 +13,9 @@ module BatchExperiment
13
13
  # @return [String] The 'value' as a string or, if 'field' isn't found, an
14
14
  # empty string.
15
15
  def self.get_field(lines, field)
16
- lines.grep(/^#{field}: .*/).each { | l | return l.match(/:[\t ]+(.*)/)[1] }
16
+ lines.grep(/^#{field}: .*/).each do | l |
17
+ return l.match(/:[\t ]+(.*)/)[1]
18
+ end
17
19
  ''
18
20
  end
19
21
 
@@ -42,7 +44,7 @@ module BatchExperiment
42
44
  # @return [Array<String>] The strings that will be used to make the column
43
45
  # names at the BatchExperiment.experiment method.
44
46
  def names
45
- fail 'This method should have been overwritten by a subclass.'
47
+ raise 'This method should have been overwritten by a subclass.'
46
48
  end
47
49
 
48
50
  # Extract N values of some program output, where N is equal to #names.size.
@@ -62,7 +64,7 @@ module BatchExperiment
62
64
  # and the line string elements don't end in linebreak.
63
65
  # @return [Array<String>] The N extracted values, as strings.
64
66
  def extract_from_lines(lines)
65
- fail 'This method should have been overwritten by a subclass.'
67
+ raise 'This method should have been overwritten by a subclass.'
66
68
  end
67
69
  end
68
70
  end
@@ -15,37 +15,6 @@ module BatchExperiment
15
15
  end
16
16
  end
17
17
 
18
- module WcExtractor
19
- extend Extractor
20
- def self.names
21
- ['lines', 'words', 'bytes', 'ext_time', 'ext_mem']
22
- end
23
-
24
- def self.extract(content)
25
- arr = content.split(' ')
26
- qt_lines, words, bytes = arr[0], arr[1], arr[2]
27
- lines = content.lines.map! { | l | l.chomp! }
28
- [ qt_lines, words, bytes,
29
- Extractor.get_field(lines, 'ext_time'),
30
- Extractor.get_field(lines, 'ext_mem')
31
- ]
32
- end
33
- end
34
-
35
- module TwoWordsExtractor
36
- extend Extractor
37
- def self.names
38
- ['first word', 'second word', 'ext_time', 'ext_mem']
39
- end
40
-
41
- def self.extract_from_lines(lines)
42
- words = lines.empty? || lines[0].nil? ? ['',''] : lines[0].split().take(2)
43
- words << Extractor.get_field(lines, 'ext_time')
44
- words << Extractor.get_field(lines, 'ext_mem')
45
- words
46
- end
47
- end
48
-
49
18
  # Sample extractors used at https://github.com/henriquebecker91/masters,
50
19
  # where this code had its beginning. This file contains the code used to
51
20
  # extract info from the different outputs generated by UKP solving programs.
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: batch_experiment
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.0.2
4
+ version: 2.0.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Henrique Becker
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2016-03-28 00:00:00.000000000 Z
11
+ date: 2016-06-21 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: childprocess
@@ -33,10 +33,10 @@ files:
33
33
  - README.md
34
34
  - examples/bible.txt
35
35
  - examples/debug_batch.rb
36
- - examples/example_batch.rb
36
+ - examples/experiment_example.rb
37
37
  - examples/sample_batch.rb
38
38
  - examples/taoteching.txt
39
- - examples/ukp_batch.rb
39
+ - examples/ukp_experiment.rb
40
40
  - lib/batch_experiment.rb
41
41
  - lib/batch_experiment/extractor.rb
42
42
  - lib/batch_experiment/sample_extractors.rb