batch_experiment 1.0.2 → 2.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +4 -2
- data/examples/debug_batch.rb +1 -1
- data/examples/{example_batch.rb → experiment_example.rb} +11 -11
- data/examples/{ukp_batch.rb → ukp_experiment.rb} +10 -6
- data/lib/batch_experiment.rb +260 -107
- data/lib/batch_experiment/extractor.rb +6 -4
- data/lib/batch_experiment/sample_extractors.rb +0 -31
- metadata +4 -4
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 4bfaf4a6fa10230872eb47786f5ed25caef9866c
|
4
|
+
data.tar.gz: 5c2d57dbc5487a37d409bd6c9454177f16d44cae
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: e53e5c263fca88c5389268e2d8e2aafd8491bf100d3ad3888d84024ba7f515e4797ef7856fd0f03274274f563e3263a5bb5d55a1e336210e3e770409ca52e377
|
7
|
+
data.tar.gz: f4d4741012d48a9383ac7cd73013ab40ab86d0dcbd98d7b55cee2b0ae6c3ffe790c1662d60ab2093249a4d41a720c2705a3615198ce4aae4239a5ecc1da52c2e
|
data/README.md
CHANGED
@@ -15,7 +15,7 @@ What conditions you need to use this tool:
|
|
15
15
|
|
16
16
|
What is not needed:
|
17
17
|
|
18
|
-
* To know how to program in ruby. Only taking less than 5 minutes to learn some basic syntax will suffice to run commands on multiple cores and save the results to files. However,
|
18
|
+
* To know how to program in ruby. Taking less than 5 minutes to learn some basic syntax will suffice to run commands on multiple cores and save the results to files (using BatchExperiment::batch). However, if you want not only to execute the commands but also to extract and group some information from their output into a CSV (using BatchExperiment::experiment), you will need to tell ruby how to do the extracting part.
|
19
19
|
|
20
20
|
## How to use it
|
21
21
|
|
@@ -23,7 +23,7 @@ You will need to create a ruby script (copy, past and adapt one of the provided
|
|
23
23
|
|
24
24
|
## Examples
|
25
25
|
|
26
|
-
After installing the gem, you will have a examples folder (/home/YOUR_USER/.gem/ruby/RUBY_VERSION/gems/batch_experiment-GEM_VERSION/examples). The sample_batch.rb gives you a good ideia of how to use
|
26
|
+
After installing the gem, you will have an examples folder (/home/YOUR_USER/.gem/ruby/RUBY_VERSION/gems/batch_experiment-GEM_VERSION/examples). The sample_batch.rb gives you a good idea of how to use ::batch (no csv creation).
|
27
27
|
|
28
28
|
```ruby
|
29
29
|
#!/bin/ruby
|
@@ -50,5 +50,7 @@ conf = {
|
|
50
50
|
BatchExperiment::batch(commands, conf)
|
51
51
|
```
|
52
52
|
|
53
|
+
The experiment_example.rb (and the lib/batch_experiment/sample_extractors.rb) give a good idea of how to use #experiment with multiple commands and how to create an extractor (used to create a csv).
|
54
|
+
|
53
55
|
This code was born in [this repository](https://github.com/henriquebecker91/masters/tree/master/codes/rb/batch_experiment).
|
54
56
|
|
data/examples/debug_batch.rb
CHANGED
@@ -13,30 +13,30 @@ comms_info = [{
|
|
13
13
|
pattern: 'y',
|
14
14
|
extractor: BatchExperiment::FirstLineExtractor,
|
15
15
|
prefix: 'echo',
|
16
|
-
}, {
|
17
|
-
command: 'wc FILE',
|
18
|
-
pattern: 'FILE',
|
19
|
-
extractor: BatchExperiment::WcExtractor,
|
20
|
-
prefix: 'wc',
|
21
16
|
}]
|
22
17
|
|
23
|
-
|
18
|
+
batch_conf = {
|
24
19
|
# IDs of the CPU cores that can be used for executing tests.
|
25
20
|
cpus_available: [1, 2, 3],
|
26
21
|
# Maximum number of seconds that a command can run. After this a kill command
|
27
22
|
# (TERM signal) will be issued.
|
28
23
|
timeout: 5,
|
24
|
+
# Object that gives the filename for storing the output of each run.
|
25
|
+
converter: BatchExperiment::Comm2FnameConverter.new,
|
29
26
|
}
|
30
27
|
|
31
|
-
|
28
|
+
experiment_conf = {
|
32
29
|
# The name of the file where will be written the CSV data.
|
33
30
|
csvfname: 'example.csv',
|
34
|
-
#
|
35
|
-
|
36
|
-
|
31
|
+
# Number of times the same command will be executed over the same file.
|
32
|
+
qt_runs: 5,
|
33
|
+
# Order of the commands execution
|
34
|
+
comms_order: :random, #:by_comm, #:by_file,
|
35
|
+
# Random seed (only used if comms_order is :random)
|
36
|
+
rng: Random.new(0),
|
37
37
|
}
|
38
38
|
|
39
39
|
files = ['bible.txt', 'taoteching.txt']
|
40
40
|
|
41
|
-
BatchExperiment::experiment(comms_info,
|
41
|
+
BatchExperiment::experiment(comms_info, batch_conf, experiment_conf, files)
|
42
42
|
|
@@ -1,7 +1,7 @@
|
|
1
1
|
#!/usr/bin/ruby
|
2
2
|
|
3
|
-
|
4
|
-
|
3
|
+
require 'batch_experiment'
|
4
|
+
require 'batch_experiment/sample_extractors'
|
5
5
|
|
6
6
|
# I run the three lines below in the console to disable hyperthreading cores on
|
7
7
|
# my computer before examining the cores with the top command.
|
@@ -24,16 +24,20 @@ comms_info = [{
|
|
24
24
|
execution_info = {
|
25
25
|
cpus_available: [1, 2, 3],
|
26
26
|
timeout: 10,
|
27
|
-
post_timeout:
|
27
|
+
post_timeout: 2,
|
28
28
|
}
|
29
29
|
|
30
|
-
conf = {
|
30
|
+
conf = {
|
31
|
+
csvfname: 'pya_site8.csv',
|
32
|
+
comms_order: :random,
|
33
|
+
qt_runs: 10,
|
34
|
+
}
|
31
35
|
|
32
36
|
files = ['corepb.ukp', 'exnsd18.ukp', 'exnsd26.ukp', 'exnsdbis18.ukp', 'exnsd16.ukp', 'exnsd20.ukp', 'exnsdbis10.ukp', 'exnsds12.ukp']
|
33
37
|
# If you don't execute the script from the ukp files folder you need to put the
|
34
38
|
# folder relative or absolute path here (with trailing slash).
|
35
|
-
path = ''
|
39
|
+
path = '~/Aulas/mestrado/masters/data/ukp/'
|
36
40
|
files.map! { | f | path + f }
|
37
41
|
|
38
|
-
experiment(comms_info, execution_info, conf, files)
|
42
|
+
BatchExperiment::experiment(comms_info, execution_info, conf, files)
|
39
43
|
|
data/lib/batch_experiment.rb
CHANGED
@@ -4,9 +4,15 @@ require 'pathname'
|
|
4
4
|
# The main module, the two main utility methods offered are ::batch and
|
5
5
|
# ::experiment.
|
6
6
|
module BatchExperiment
|
7
|
-
# The default callable object used by
|
8
|
-
# filename.
|
9
|
-
|
7
|
+
# The default callable object used by Comm2FnameConverter to convert
|
8
|
+
# a command into a filename. Comm2FnameConverter doesn't create a sanitized
|
9
|
+
# filename from the command string (it uses its first argument to do this,
|
10
|
+
# whose default is FnameSanitizer).
|
11
|
+
# Note that this is a pure function, so if the same command appears more than
|
12
|
+
# one time, it will get the same name, it's Comm2FnameConverter that gives
|
13
|
+
# multiple instances of the same command different names (by suffixing with
|
14
|
+
# numbers).
|
15
|
+
module FnameSanitizer
|
10
16
|
def self.call(command)
|
11
17
|
fname = command.strip
|
12
18
|
fname.gsub!(/[^[:alnum:]]/, '_')
|
@@ -17,7 +23,57 @@ module BatchExperiment
|
|
17
23
|
end
|
18
24
|
end
|
19
25
|
|
20
|
-
#
|
26
|
+
# Converts a command to a filename using a given sanitizer, gives different
|
27
|
+
# names to different calls with the same arguments. Example: if a call with
|
28
|
+
# "sleep 1" yields "sleep_1", the second call with the same argument yields
|
29
|
+
# "sleep_1.2", and so on. Note that this is done by remembering previous
|
30
|
+
# calls, the object doesn't inspect the filesystem to check if that name
|
31
|
+
# was or wasn't used.
|
32
|
+
class Comm2FnameConverter
|
33
|
+
# Creates a new Comm2FnameConverter, with no memory of any previous calls.
|
34
|
+
#
|
35
|
+
# @param sanitizer [#call] Callable object used to create a filename from
|
36
|
+
# the arguments passed to Comm2FnameConverter.call. This class expects
|
37
|
+
# that sanitizer has no internal state, so when an instance of this class
|
38
|
+
# is cloned, there's no problem with sharing the sanitizer between the
|
39
|
+
# clones. Default: BatchExperiment::FnameSanitizer.
|
40
|
+
def initialize(sanitizer = FnameSanitizer)
|
41
|
+
@num_times_seen = {}
|
42
|
+
@sanitizer = sanitizer
|
43
|
+
end
|
44
|
+
|
45
|
+
# Takes a command, creates a fname for it, if this fname was already seen
|
46
|
+
# before, returns the fname + ".N", where N is the number of times fname
|
47
|
+
# was already seen.
|
48
|
+
#
|
49
|
+
# @param comm [String] A system command.
|
50
|
+
# @return [String] The sanitized filename created from that command.
|
51
|
+
# @note Note that different arguments can be reduced to the same
|
52
|
+
# sanitized filename and, if this happens, they will NOT overwrite
|
53
|
+
# each other. Example: 'echo "abc"' -> 'echo_abc'; 'echo abc' ->
|
54
|
+
# 'echo_abc.2'.
|
55
|
+
def call(comm)
|
56
|
+
fname = @sanitizer.call(comm)
|
57
|
+
if @num_times_seen.include? fname
|
58
|
+
@num_times_seen[fname] += 1
|
59
|
+
fname << ".#{@num_times_seen[fname]}"
|
60
|
+
else
|
61
|
+
@num_times_seen[fname] = 1
|
62
|
+
end
|
63
|
+
|
64
|
+
fname.clone
|
65
|
+
end
|
66
|
+
|
67
|
+
def initialize_clone(old)
|
68
|
+
@num_times_seen = old.num_times_seen.clone
|
69
|
+
end
|
70
|
+
|
71
|
+
# To allow the initialize_clone implementation.
|
72
|
+
protected
|
73
|
+
attr_reader :num_times_seen
|
74
|
+
end
|
75
|
+
|
76
|
+
# INTERNAL USE ONLY.
|
21
77
|
# Remove any finished commands from comms_running, insert the cpus
|
22
78
|
# freed by the commands termination to the free_cpus, insert the
|
23
79
|
# terminated commands on comms_executed.
|
@@ -42,7 +98,7 @@ module BatchExperiment
|
|
42
98
|
# filenames are derived from the commands. Appending '.out' to one of the
|
43
99
|
# partial filenames will give the filename were the command stdout was
|
44
100
|
# redirected. The analogue is valid for '.err' and stderr. Right before a
|
45
|
-
# command begans to run, a
|
101
|
+
# command begins to run, a 'partial_filename.unfinished' file is created.
|
46
102
|
# After the command ends its execution this file is removed. If the command
|
47
103
|
# ends its execution by means of a timeout the file is also removed. The file
|
48
104
|
# only remains if the batch procedure is interrupted (script was killed,
|
@@ -51,7 +107,7 @@ module BatchExperiment
|
|
51
107
|
#
|
52
108
|
# @param commands [Array<String>] The shell commands.
|
53
109
|
# @param conf [Hash] The configurations, as follows:
|
54
|
-
# -- cpus_available [Array<Fixnum>]
|
110
|
+
# -- cpus_available [Array<Fixnum>] CPU cores that can be used to run the
|
55
111
|
# commands. Required parameter. The cpu numbers begin at 0, despite what
|
56
112
|
# htop tells you.
|
57
113
|
# -- timeout [Number] Number of seconds before killing a command. Required
|
@@ -59,35 +115,36 @@ module BatchExperiment
|
|
59
115
|
# -- time_fmt [String] A string in the time (external command) format. See
|
60
116
|
# http://linux.die.net/man/1/time. Default: 'ext_time: %e\next_mem: %M\n'.
|
61
117
|
# -- busy_loop_sleep [Number] How many seconds to wait before checking if
|
62
|
-
# a command ended execution. This
|
63
|
-
# two commands. Default: 0.1.
|
118
|
+
# a command ended execution. This time will be very close to the max time a
|
119
|
+
# cpu will remain vacant between two commands. Default: 0.1 (1/10 second).
|
64
120
|
# -- post_timeout [Number] A command isn't guaranteed to end after
|
65
121
|
# receiving a TERM signal. If the command hasn't stopped, waits
|
66
122
|
# post_timeout seconds before sending a KILL signal (give it a chance to
|
67
123
|
# end gracefully). Default: 5.
|
68
|
-
# --
|
69
|
-
#
|
70
|
-
#
|
71
|
-
#
|
124
|
+
# -- converter [#call] The call method of this object should take a String
|
125
|
+
# and convert it (possibly losing information), to a valid filename. Used
|
126
|
+
# over the commands to define the output files of commands.
|
127
|
+
# Default: BatchExperiment::Comm2FnameConverter.new.
|
72
128
|
# -- skip_done_comms [FalseClass,TrueClass] Skip any command for what a
|
73
129
|
# corresponding '.out' file exists, except if both a '.out' and a
|
74
|
-
# '.unfinished' file exist, in the last case the command is
|
130
|
+
# '.unfinished' file exist, in the last case the command is always
|
131
|
+
# executed. If false, execute all commands and overwrite all ".out".
|
75
132
|
# Default: true.
|
76
133
|
# -- unfinished_ext [String] Extension to be used in place of
|
77
|
-
# '.unfinished'.
|
134
|
+
# '.unfinished'. Default: '.unfinished'.
|
78
135
|
# -- out_ext [String] Extension to be used in place of '.out'.
|
79
136
|
# Default: '.out'.
|
80
137
|
# -- err_ext [String] Extension to be used in place of '.err'.
|
81
138
|
# Default: '.err'.
|
139
|
+
#
|
82
140
|
# @return [String] Which commands were executed. Can be different from
|
83
141
|
# the 'commands' argument if commands are skipped (see :skip_done_comms).
|
84
142
|
#
|
85
143
|
# @note If the same command is executed over the same file more than one
|
86
|
-
# time, then
|
87
|
-
#
|
88
|
-
# conf\[:fname_sanitizer\]
|
89
|
-
#
|
90
|
-
# filename for every time it's called with the same argument).
|
144
|
+
# time, then any run besides the first will have a numeric suffix.
|
145
|
+
# Example: "sleep 1" -> "sleep_1", "sleep 1" -> "sleep_1.2".
|
146
|
+
# For more info see the parameter conf\[:converter\], and its
|
147
|
+
# default value BatchExperiment::Comm2FnameConverter.new.
|
91
148
|
# @note This procedure makes use of the following linux commands: time (not
|
92
149
|
# the bash internal one, but the package one, i.e.
|
93
150
|
# https://www.archlinux.org/packages/extra/x86_64/time/); timeout (from
|
@@ -101,8 +158,10 @@ module BatchExperiment
|
|
101
158
|
# conf\[:time_fmt\] to a empty string only a newline will be appended.
|
102
159
|
def self.batch(commands, conf)
|
103
160
|
# Throw exceptions if required configurations aren't provided.
|
104
|
-
|
105
|
-
|
161
|
+
if !conf[:cpus_available] then
|
162
|
+
fail ArgumentError, 'conf[:cpus_available] not set'
|
163
|
+
end
|
164
|
+
fail ArgumentError, 'conf[:timeout] not set' unless conf[:timeout]
|
106
165
|
|
107
166
|
# Initialize optional configurations with default values if they weren't
|
108
167
|
# provided. Don't change the conf argument, only our version of conf.
|
@@ -113,7 +172,7 @@ module BatchExperiment
|
|
113
172
|
conf[:err_ext] ||= '.err'
|
114
173
|
conf[:busy_loop_sleep] ||= 0.1
|
115
174
|
conf[:post_timeout] ||= 5
|
116
|
-
conf[:
|
175
|
+
conf[:converter] ||= BatchExperiment::Comm2FnameConverter.new
|
117
176
|
conf[:skip_done_comms] = true if conf[:skip_done_comms].nil?
|
118
177
|
|
119
178
|
# Initialize main variables
|
@@ -123,23 +182,23 @@ module BatchExperiment
|
|
123
182
|
comms_executed = []
|
124
183
|
|
125
184
|
commands.each do | command |
|
126
|
-
commfname = conf[:
|
185
|
+
commfname = conf[:converter].call(command)
|
127
186
|
out_fname = commfname + conf[:out_ext]
|
128
187
|
err_fname = commfname + conf[:err_ext]
|
129
188
|
lockfname = commfname + conf[:unfinished_ext]
|
130
189
|
|
131
190
|
if conf[:skip_done_comms] && File.exists?(out_fname)
|
132
191
|
if File.exists?(lockfname)
|
133
|
-
puts "
|
134
|
-
puts "
|
192
|
+
puts "Found file #{out_fname}, but a #{lockfname} also exists:"
|
193
|
+
puts "Will execute command '#{command}' anyway."
|
135
194
|
else
|
136
|
-
puts "
|
195
|
+
puts "Found file #{commfname}, skipping command: #{command}"
|
137
196
|
STDOUT.flush
|
138
197
|
next
|
139
198
|
end
|
140
199
|
end
|
141
200
|
|
142
|
-
puts "
|
201
|
+
puts "Waiting to execute command: #{command}"
|
143
202
|
STDOUT.flush
|
144
203
|
|
145
204
|
while free_cpus.empty? do
|
@@ -188,21 +247,29 @@ module BatchExperiment
|
|
188
247
|
comms_executed
|
189
248
|
end
|
190
249
|
|
191
|
-
# gencommff: GENerate COMMands For Files
|
250
|
+
# INTERNAL USE ONLY. gencommff: GENerate COMMands For Files.
|
251
|
+
# Creates a hash with the generated commands as keys, and store (as the
|
252
|
+
# respective value) the comm_info hash and the file (using a { comm_info: X,
|
253
|
+
# filename: Y } structure).
|
192
254
|
#
|
193
|
-
# @param
|
194
|
-
#
|
195
|
-
# @param files [Enumerable<String>] A list of strings
|
196
|
-
#
|
197
|
-
# @return [
|
198
|
-
#
|
199
|
-
def self.gencommff(
|
200
|
-
ret =
|
201
|
-
|
255
|
+
# @param comm_info [Hash] A hash structure following the same format
|
256
|
+
# that the elements of the comms_info array parameter of #experiment.
|
257
|
+
# @param files [Enumerable<String>] A list of strings that will replace
|
258
|
+
# comm_info[:pattern] at a copy of comm_info[:command].
|
259
|
+
# @return [Hash<String, Hash>] A hash on the following format
|
260
|
+
# { expanded_command => { comm_info: comm_info, filename: f }, ...}
|
261
|
+
def self.gencommff(comm_info, files)
|
262
|
+
ret = {}
|
263
|
+
comm = comm_info[:command]
|
264
|
+
patt = comm_info[:pattern]
|
265
|
+
files.each do | f |
|
266
|
+
ret[comm.gsub(patt, f)] = { comm_info: comm_info, filename: f }
|
267
|
+
end
|
202
268
|
ret
|
203
269
|
end
|
204
270
|
|
205
|
-
# Intercalate a variable number of variable sized arrays
|
271
|
+
# INTERNAL USE ONLY. Intercalate a variable number of variable sized arrays
|
272
|
+
# in one array.
|
206
273
|
#
|
207
274
|
# @param [Array<Array<Object>>] xss An array of arrays.
|
208
275
|
# @return [Array<Object>] An array of the same size as the sum of the size
|
@@ -223,53 +290,94 @@ module BatchExperiment
|
|
223
290
|
ret
|
224
291
|
end
|
225
292
|
|
293
|
+
class ColumnSpecError < ArgumentError; end
|
294
|
+
|
295
|
+
# INTERNAL USE ONLY. Check if the headers can be combined, if they can
|
296
|
+
# return a shallow copy of the biggest header, otherwise throw an exception.
|
297
|
+
#
|
298
|
+
# @param headers [Array<Array<Comparable>>] An array of arrays of strings
|
299
|
+
# (or any object that implements '!=').
|
300
|
+
# @return A shallow copy of the biggest inner array in headers. Only returns
|
301
|
+
# if for each position on the biggest inner array has the same value as
|
302
|
+
# that position on all the other arrays with at least that size.
|
303
|
+
def self.merge_headers(headers)
|
304
|
+
mer_size = headers.map { | h | h.size }.max
|
305
|
+
merged_h = Array.new(mer_size)
|
306
|
+
mer_size.times do | i |
|
307
|
+
headers.each do | h |
|
308
|
+
next if h.size < i
|
309
|
+
if merged_h[i].nil?
|
310
|
+
merged_h[i] = h[i]
|
311
|
+
elsif merged_h[i] != h[i]
|
312
|
+
raise ColumnSpecError, "Error: When using BatchExperiment::experiment"
|
313
|
+
+ " all the extractors have to agree on the columns they share."
|
314
|
+
+ " In the specific case: the column nº #{i} was labeled as"
|
315
|
+
+ " '#{merged_h[i]}' on one extractor, and '#{h[i]}' on another,"
|
316
|
+
+ " this can be only a difference on notation ('time' vs 'Time'),"
|
317
|
+
+ " or can mean that in the same column two different kinds of data"
|
318
|
+
+ " are being presented. The program will be aborted. Check that."
|
319
|
+
end
|
320
|
+
end
|
321
|
+
end
|
322
|
+
merged_h
|
323
|
+
end
|
324
|
+
|
226
325
|
# Takes N shell commands and M files/parameters, execute each command of the
|
227
326
|
# N commands over the M files, save the output of each command/file
|
228
327
|
# combination, use objects provided with the command to extract relevant
|
229
|
-
# information from the output file, and group those information in a
|
328
|
+
# information from the output file, and group those information in a CSV
|
230
329
|
# file. Easier to understand seeing the sample_batch.rb example in action.
|
231
330
|
#
|
232
331
|
# @param comms_info [Array<Hash>] An array of hashes, each with the config
|
233
332
|
# needed to know how to deal with the command. Four required fields
|
234
333
|
# (all keys are symbols):
|
235
334
|
# command [String] A string with a sh shell command.
|
236
|
-
# pattern [String] A substring of command, will be
|
335
|
+
# pattern [String] A substring of command, will be replaced by the strings
|
237
336
|
# in the parameter 'files'.
|
238
337
|
# extractor [#extract,#names] Object implementing the Extractor interface.
|
239
|
-
# prefix [String] A string that will be used
|
240
|
-
#
|
338
|
+
# prefix [String] A string that will be used on the 'algorithm' column
|
339
|
+
# to identify the used command.
|
241
340
|
# @param batch_conf [Hash] Configuration used to call batch. See the
|
242
341
|
# explanation for parameter 'conf' on the documentation of the batch
|
243
|
-
# method. There are required fields for this hash parameter.
|
342
|
+
# method. There are required fields for this hash parameter. Also, note
|
343
|
+
# that the batch_conf\[:converter\] should allow cloning without sharing
|
344
|
+
# mutable state. A converter clone is used by #experiment internally, it
|
345
|
+
# has to obtain the same results as the original copy (that is passed to
|
346
|
+
# BatchExperiment::batch).
|
244
347
|
# @param conf [Hash] Lots of parameters. Here's a list:
|
245
|
-
# csvfname [String] The filename/filepath for the file that will contain
|
348
|
+
# -- csvfname [String] The filename/filepath for the file that will contain
|
246
349
|
# the CSV data. Required field.
|
247
350
|
# separator [String] The separator used at the CSV file. Default: ';'.
|
248
|
-
#
|
249
|
-
#
|
250
|
-
#
|
251
|
-
#
|
252
|
-
#
|
253
|
-
#
|
254
|
-
#
|
255
|
-
#
|
256
|
-
#
|
257
|
-
#
|
258
|
-
#
|
259
|
-
#
|
260
|
-
#
|
261
|
-
#
|
262
|
-
#
|
263
|
-
#
|
264
|
-
#
|
265
|
-
#
|
351
|
+
# -- qt_runs [NilClass,Integer] If nil or one then each command is
|
352
|
+
# executed once. If is a number bigger than one, the command is executed
|
353
|
+
# that number of times. The batch_conf[:converter] will define the name
|
354
|
+
# that will be given to each run. Every file will appear qt_runs times on
|
355
|
+
# the filename column and, for the same file, the values on the run_number
|
356
|
+
# column will be the integer numbers between 1 and qt_runs (both
|
357
|
+
# inclusive). Default: nil.
|
358
|
+
# -- comms_order [:by_comm,:by_file,:random] The order the
|
359
|
+
# commands will be executed. Case by_comm: will execute the first command
|
360
|
+
# over all the files (using the files order), then will execute the
|
361
|
+
# second command over all files, and so on. Case by_file: will execute
|
362
|
+
# all the commands (using the comms_info order) over the first file,
|
363
|
+
# then will execute all the commands over the second file, and so on.
|
364
|
+
# Case random: will expand all the command/file combinations (replicating
|
365
|
+
# the same command qt_runs times) and then will apply shuffle to this array,
|
366
|
+
# using the object passed to the rng parameter. This last option is the
|
367
|
+
# most adequate for statistical testing.
|
368
|
+
# -- rng [Nil,#rand] An object that implements the #rand method (behaves
|
369
|
+
# like an instance of the core Random class). If comms_order is random and
|
370
|
+
# rng is nil, will issue a warning remembering the default that was used.
|
371
|
+
# Default: Random.new(42).
|
266
372
|
# skip_commands [TrueClass, FalseClass] If true, will not execute the
|
267
|
-
# commands and assume that the outputs are already saved
|
268
|
-
# the extractors over the already saved outputs, and
|
269
|
-
# from them. Default: false.
|
373
|
+
# commands and assume that the outputs are already saved (on ".out" files).
|
374
|
+
# Will only execute the extractors over the already saved outputs, and
|
375
|
+
# create the CSV file from them. Default: false.
|
270
376
|
#
|
271
377
|
# @param files [Array<Strings>] The strings that will replace the :pattern
|
272
|
-
# on :command, for every element in comms_info.
|
378
|
+
# on :command, for every element in comms_info. Can be a filename, or
|
379
|
+
# can be anything else (a numeric parameter, sh code, etc..), but we
|
380
|
+
# refer to them as files for simplicity and uniformity.
|
273
381
|
#
|
274
382
|
# @return [NilClass,Array<String>] The return of the internal #batch
|
275
383
|
# call. Returns nil if conf[:skip_commands] was set to true.
|
@@ -283,67 +391,112 @@ module BatchExperiment
|
|
283
391
|
# Initialize optional configurations with default values if they weren't
|
284
392
|
# provided. Don't change the conf argument, only our version of conf.
|
285
393
|
conf = conf.clone
|
286
|
-
conf[:separator]
|
287
|
-
conf[:
|
288
|
-
conf[:
|
394
|
+
conf[:separator] ||= ';'
|
395
|
+
conf[:qt_runs] ||= 1
|
396
|
+
conf[:comms_order] ||= :by_comm
|
397
|
+
conf[:rng] ||= Random.new(42)
|
289
398
|
#conf[:skip_commands] defaults to false/nil
|
290
399
|
|
291
400
|
# Get some of the batch config that we use inside here too.
|
292
|
-
out_ext
|
293
|
-
unfinished_ext
|
294
|
-
|
295
|
-
|
401
|
+
out_ext = batch_conf[:out_ext] || '.out'
|
402
|
+
unfinished_ext = batch_conf[:unfinished_ext] || '.unfinished'
|
403
|
+
converter = batch_conf[:converter].clone unless batch_conf[:converter].nil?
|
404
|
+
converter ||= BatchExperiment::Comm2FnameConverter.new
|
296
405
|
|
297
|
-
#
|
406
|
+
# Expand all commands, combining command templates and files.
|
298
407
|
comms_sets = []
|
299
408
|
comms_info.each do | comm_info |
|
300
|
-
comms_sets << gencommff(comm_info
|
409
|
+
comms_sets << gencommff(comm_info, files)
|
301
410
|
end
|
302
411
|
|
303
|
-
|
412
|
+
expanded_comms = comms_sets.map { | h | h.keys }
|
413
|
+
# If each command should be run more than once...
|
414
|
+
if conf[:qt_runs] > 1
|
415
|
+
# ... we replace each single command by an array of qt_runs copies,
|
416
|
+
# and then flatten the parent array.
|
417
|
+
expanded_comms.map! do | a |
|
418
|
+
a.map! { | c | Array.new(conf[:qt_runs], c) }.flatten!
|
419
|
+
end
|
420
|
+
end
|
421
|
+
|
422
|
+
# At this moment the expanded_comms is an array of arrays, each internal
|
423
|
+
# array has all the expanded commands of the one single command template
|
424
|
+
# over all the files.
|
425
|
+
# After the code block below, the expanded_comms will be an one-level array
|
426
|
+
# of the expanded commands, in the order they will be executed.
|
427
|
+
expanded_comms = case conf[:comms_order]
|
428
|
+
when :by_comm # all runs of the first command template first
|
429
|
+
expanded_comms.flatten!
|
430
|
+
when :by_file # all runs over the first file first
|
431
|
+
intercalate(expanded_comms)
|
432
|
+
when :random # a random order
|
433
|
+
expanded_comms.flatten!.shuffle!(random: conf[:rng])
|
434
|
+
end
|
304
435
|
|
305
436
|
# Execute the commands (or not).
|
306
|
-
ret = batch(
|
437
|
+
ret = batch(expanded_comms, batch_conf) unless conf[:skip_commands]
|
307
438
|
|
308
439
|
# Build header (first csv line, column names).
|
309
|
-
header = []
|
310
|
-
comms_info.
|
311
|
-
|
312
|
-
|
440
|
+
header = ['algorithm', 'filename', 'run_number']
|
441
|
+
header << merge_headers(comms_info.map { | c | c[:extractor].names })
|
442
|
+
header = header.join(conf[:separator])
|
443
|
+
|
444
|
+
# We need to merge the union of all comms_sets to query it.
|
445
|
+
comm2origin = {}
|
446
|
+
comms_sets.each do | h |
|
447
|
+
comm2origin.merge!(h) do | k, v, v2 |
|
448
|
+
puts "WARNING: The command expansion '#{k}' was generated more than once. The first time was by the template '#{v[:comm]}' and the file '#{v[:file]}', and this time by template '#{v2[:comm]}' and the file '#{v2[:file]}'. Will report on CSV as this command was generated by the template '#{v[:comm]}' and the file '#{v[:file]}'."
|
449
|
+
v
|
313
450
|
end
|
314
|
-
header << prefixed_names
|
315
451
|
end
|
316
|
-
header = intercalate(header) if conf[:ic_columns]
|
317
|
-
header = ['Filename'].concat(header).join(conf[:separator])
|
318
452
|
|
319
|
-
# Build body (inspect all output files
|
453
|
+
# Build body (inspect all output files and make csv lines).
|
454
|
+
#
|
455
|
+
# Body format: algorithm;filename;run_number;first extracted column; ...
|
456
|
+
#
|
457
|
+
# This means that the extractors have to agree on what is each column, two
|
458
|
+
# different extractors have to extract the same kind of data at each column
|
459
|
+
# (the first field returned by all extractors has to be, for example, cpu
|
460
|
+
# time, the same applies for the remaining fields).
|
461
|
+
# If one extractor extract more fields than the others this is not a
|
462
|
+
# problem, if the second biggest extractor (in number of fields extract)
|
463
|
+
# will extract, for example, 4 fields, and the biggest extract 6 fields,
|
464
|
+
# the first 4 fields extracted by the biggest extractor have to be the same
|
465
|
+
# as the ones on the second-biggest extractor. This way, all the lines will
|
466
|
+
# have the same kind of data on the first four columns (not counting the
|
467
|
+
# algorithm, filename and run_number ones), and only lines provenient from
|
468
|
+
# the biggest extractor will have data on the fifth and sixth columns.
|
320
469
|
body = [header]
|
321
|
-
|
322
|
-
|
323
|
-
|
324
|
-
|
325
|
-
|
326
|
-
|
327
|
-
|
328
|
-
|
329
|
-
|
330
|
-
|
331
|
-
|
332
|
-
|
333
|
-
|
334
|
-
|
335
|
-
|
336
|
-
|
470
|
+
times_found = {}
|
471
|
+
expanded_comms.each do | exp_comm |
|
472
|
+
run_info = comm2origin[exp_comm]
|
473
|
+
algorithm = run_info[:comm_info][:prefix]
|
474
|
+
filename = run_info[:filename]
|
475
|
+
|
476
|
+
times_found[exp_comm] ||= 0
|
477
|
+
times_found[exp_comm] += 1
|
478
|
+
run_number = times_found[exp_comm]
|
479
|
+
|
480
|
+
curr_line = [algorithm, filename, run_number]
|
481
|
+
|
482
|
+
partial_fname = converter.call(exp_comm)
|
483
|
+
out_fname = partial_fname + out_ext
|
484
|
+
lockfname = partial_fname + unfinished_ext
|
485
|
+
extractor = run_info[:comm_info][:extractor]
|
486
|
+
|
487
|
+
if File.exists?(out_fname)
|
488
|
+
if File.exists?(lockfname)
|
489
|
+
puts "Ignored file '#{out_fname}' because there was a"
|
490
|
+
+ " '#{lockfname}' file in the same folder."
|
337
491
|
else
|
338
|
-
|
339
|
-
|
340
|
-
line << comm_info[:extractor].names.map { | _ | '' }
|
492
|
+
f_content = File.open(out_fname, 'r') { | f | f.read }
|
493
|
+
curr_line << extractor.extract(f_content)
|
341
494
|
end
|
342
495
|
end
|
343
|
-
|
344
|
-
body <<
|
496
|
+
|
497
|
+
body << curr_line.join(conf[:separator])
|
345
498
|
end
|
346
|
-
body = body.
|
499
|
+
body = body.join(conf[:separator] + "\n")
|
347
500
|
|
348
501
|
# Write CSV data into a CSV file.
|
349
502
|
File.open(conf[:csvfname], 'w') { | f | f.write(body) }
|
@@ -2,7 +2,7 @@ module BatchExperiment
|
|
2
2
|
# Module that defines the interface used for extracting info from other
|
3
3
|
# programs output. You don't need to include it in your object, will suffice
|
4
4
|
# that the object (that you will use to extract info from the output) has the
|
5
|
-
#
|
5
|
+
# #names and #extract methods defined.
|
6
6
|
module Extractor
|
7
7
|
# Find a line in the following format: "field: value", return value.
|
8
8
|
#
|
@@ -13,7 +13,9 @@ module BatchExperiment
|
|
13
13
|
# @return [String] The 'value' as a string or, if 'field' isn't found, an
|
14
14
|
# empty string.
|
15
15
|
def self.get_field(lines, field)
|
16
|
-
lines.grep(/^#{field}: .*/).each
|
16
|
+
lines.grep(/^#{field}: .*/).each do | l |
|
17
|
+
return l.match(/:[\t ]+(.*)/)[1]
|
18
|
+
end
|
17
19
|
''
|
18
20
|
end
|
19
21
|
|
@@ -42,7 +44,7 @@ module BatchExperiment
|
|
42
44
|
# @return [Array<String>] The strings that will be used to make the column
|
43
45
|
# names at the BatchExperiment.experiment method.
|
44
46
|
def names
|
45
|
-
|
47
|
+
raise 'This method should have been overwritten by a subclass.'
|
46
48
|
end
|
47
49
|
|
48
50
|
# Extract N values of some program output, where N is equal to #names.size.
|
@@ -62,7 +64,7 @@ module BatchExperiment
|
|
62
64
|
# and the line string elements don't end in linebreak.
|
63
65
|
# @return [Array<String>] The N extracted values, as strings.
|
64
66
|
def extract_from_lines(lines)
|
65
|
-
|
67
|
+
raise 'This method should have been overwritten by a subclass.'
|
66
68
|
end
|
67
69
|
end
|
68
70
|
end
|
@@ -15,37 +15,6 @@ module BatchExperiment
|
|
15
15
|
end
|
16
16
|
end
|
17
17
|
|
18
|
-
module WcExtractor
|
19
|
-
extend Extractor
|
20
|
-
def self.names
|
21
|
-
['lines', 'words', 'bytes', 'ext_time', 'ext_mem']
|
22
|
-
end
|
23
|
-
|
24
|
-
def self.extract(content)
|
25
|
-
arr = content.split(' ')
|
26
|
-
qt_lines, words, bytes = arr[0], arr[1], arr[2]
|
27
|
-
lines = content.lines.map! { | l | l.chomp! }
|
28
|
-
[ qt_lines, words, bytes,
|
29
|
-
Extractor.get_field(lines, 'ext_time'),
|
30
|
-
Extractor.get_field(lines, 'ext_mem')
|
31
|
-
]
|
32
|
-
end
|
33
|
-
end
|
34
|
-
|
35
|
-
module TwoWordsExtractor
|
36
|
-
extend Extractor
|
37
|
-
def self.names
|
38
|
-
['first word', 'second word', 'ext_time', 'ext_mem']
|
39
|
-
end
|
40
|
-
|
41
|
-
def self.extract_from_lines(lines)
|
42
|
-
words = lines.empty? || lines[0].nil? ? ['',''] : lines[0].split().take(2)
|
43
|
-
words << Extractor.get_field(lines, 'ext_time')
|
44
|
-
words << Extractor.get_field(lines, 'ext_mem')
|
45
|
-
words
|
46
|
-
end
|
47
|
-
end
|
48
|
-
|
49
18
|
# Sample extractors used at https://github.com/henriquebecker91/masters,
|
50
19
|
# where this code had its beginning. This file contains the code used to
|
51
20
|
# extract info from the different outputs generated by UKP solving programs.
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: batch_experiment
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version:
|
4
|
+
version: 2.0.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Henrique Becker
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2016-
|
11
|
+
date: 2016-06-21 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: childprocess
|
@@ -33,10 +33,10 @@ files:
|
|
33
33
|
- README.md
|
34
34
|
- examples/bible.txt
|
35
35
|
- examples/debug_batch.rb
|
36
|
-
- examples/
|
36
|
+
- examples/experiment_example.rb
|
37
37
|
- examples/sample_batch.rb
|
38
38
|
- examples/taoteching.txt
|
39
|
-
- examples/
|
39
|
+
- examples/ukp_experiment.rb
|
40
40
|
- lib/batch_experiment.rb
|
41
41
|
- lib/batch_experiment/extractor.rb
|
42
42
|
- lib/batch_experiment/sample_extractors.rb
|