GECS 1
Sign up to get free protection for your applications and to get access to all the features.
- data/Rakefile +10 -0
- data/lib/GECS.rb +754 -0
- data/test/test_GECS.rb +46 -0
- metadata +84 -0
data/Rakefile
ADDED
data/lib/GECS.rb
ADDED
@@ -0,0 +1,754 @@
|
|
1
|
+
#!/usr/bin/ruby
|
2
|
+
# Gem for Experimental Computer Science
|
3
|
+
|
4
|
+
# Gem for Experimental Computer Science
|
5
|
+
#
|
6
|
+
# Author:: David Flater <dflater@nist.gov>
|
7
|
+
# Copyright:: Public domain
|
8
|
+
# License:: Unlicense
|
9
|
+
#
|
10
|
+
# This software is experimental. NIST assumes no responsibility whatsoever
|
11
|
+
# for its use by other parties and makes no guarantees, expressed or
|
12
|
+
# implied, about its quality, reliability, or any other characteristic.
|
13
|
+
#
|
14
|
+
# == Conventions used within the GECS module
|
15
|
+
#
|
16
|
+
# The following abbreviations are used:
|
17
|
+
# Parm:: parameter
|
18
|
+
# Est:: estimation or estimate
|
19
|
+
# Inv:: interval
|
20
|
+
# Ind:: independent
|
21
|
+
# Dep:: dependent
|
22
|
+
# Var:: variable
|
23
|
+
# Treat:: treatment (a vector of factor levels)
|
24
|
+
#
|
25
|
+
# Arrays of character strings (or in one case, ParmDef structs) are used as a
|
26
|
+
# kind of "enum." Values of the enum pseudo-type are unsigned integers that
|
27
|
+
# simply index the array to provide a concise identifier for whatever the
|
28
|
+
# referenced character string (or ParmDef) describes.
|
29
|
+
#
|
30
|
+
# A good many helper methods that should not be exposed through the GECS API
|
31
|
+
# unfortunately are exposed and get listed by rdoc. These have been tagged
|
32
|
+
# (Private) in their comments.
|
33
|
+
#
|
34
|
+
# Rdoc lists struct definitions in the Constants section.
|
35
|
+
#
|
36
|
+
# == Dependencies
|
37
|
+
#
|
38
|
+
# To use the ::bootstrapMeans method:
|
39
|
+
# * The Ruby gem parallel[rubygems.org/gems/parallel] must be installed.
|
40
|
+
# Tested version: 0.9.2.
|
41
|
+
# * The R[www.r-project.org] environment for statistical computing must be
|
42
|
+
# runnable from a shell command line. Tested version: 3.1.0.
|
43
|
+
# * The R package bootBCa[bootbca.r-forge.r-project.org] must be installed.
|
44
|
+
# Tested version: 1.0.
|
45
|
+
#
|
46
|
+
# To use the ::quickMeans method:
|
47
|
+
# * The Ruby gem statistics2[rubygems.org/gems/statistics2] must be installed.
|
48
|
+
# Tested version: 0.54.
|
49
|
+
#
|
50
|
+
# Both ::bootstrapMeans and ::quickMeans use set, a standard
|
51
|
+
# pre-installed class.
|
52
|
+
#
|
53
|
+
# == Bad behaviors
|
54
|
+
#
|
55
|
+
# ::bootstrapMeans writes temporary files into the current working directory.
|
56
|
+
# Normally, they will be deleted when no longer in use.
|
57
|
+
#
|
58
|
+
# The inverse t distribution in statistics2 that is used by ::quickMeans
|
59
|
+
# agrees with R and Octave only to 4 decimals or so.
|
60
|
+
|
61
|
+
module GECS
|
62
|
+
|
63
|
+
# Integer version of GECS that has been loaded. It is a plain counter that
# just increments; there is no major/minor/patchlevel encoding.
Version = 1
|
66
|
+
|
67
|
+
# Pseudo-enum (array of character strings) naming parameters. An enum value
# is the index of the name within this array.
#
# Parameters are theoretically fixed but unknown metrics of a population
# that is bigger than the sample. They can only be estimated, and because
# estimation may be computationally expensive and complex, it can be
# important to preserve the estimates.
#
# - mean
# - standard deviation
# - variance
#
# This is a non-prescriptive definition for example or default use.
# Each export file encapsulates its own definitions.
Parms = ["mean", "standard deviation", "variance"]
|
86
|
+
|
87
|
+
# Pseudo-enum (array of character strings) naming estimation methods. An
# enum value is the index of the name within this array.
#
# - original
# - bootstrap, percentile interval
# - bootstrap, BCa interval
# - bootstrap-t interval
#
# This is a non-prescriptive definition for example or default use.
# Each export file encapsulates its own definitions.
EstMethods = [
  # Standard formulae using normal approximations.
  "original",
  "bootstrap, percentile interval",
  "bootstrap, BCa interval",
  # A.k.a. studentized bootstrap.
  "bootstrap-t interval"
]
|
103
|
+
|
104
|
+
# Pseudo-enum (array of character strings) naming the parameters of
# estimation methods. An enum value is the index of the name within this
# array.
#
# Different estimation methods have different parameters:
# * Interval type matters for asymmetrical distributions.
# * Nested bootstrap replica count is used for bootstrap-t.
# * An adaptive bootstrap may vary the replica count to achieve a precision
#   specified as a numerical tolerance.
#
# - coverage probability
# - interval type
# - bootstrap replica count
# - nested bootstrap replica count
# - numerical tolerance of adaptive bootstrap
# - estimated attained precision of adaptive bootstrap
#
# This is a non-prescriptive definition for example or default use.
# Each export file encapsulates its own definitions.
EstMethodParms = [
  "coverage probability", "interval type",
  "bootstrap replica count", "nested bootstrap replica count",
  "numerical tolerance of adaptive bootstrap",
  "estimated attained precision of adaptive bootstrap"
]
|
130
|
+
|
131
|
+
# Pseudo-enum (array of character strings) naming types of confidence
# intervals. An enum value is the index of the name within this array.
#
# - probabilistically symmetric (a.k.a. equi-tailed)
# - shortest
#
# This is a non-prescriptive definition for example or default use.
# Each export file encapsulates its own definitions.
InvTypes = ["probabilistically symmetric", "shortest"]
|
143
|
+
|
144
|
+
# --------------------------------------------------------------------------
|
145
|
+
|
146
|
+
# Normative struct definition.
#
# A ParmDef is a "parameterized parameter": it adds the context needed to
# tell apart alternative ways of estimating the same parameter. Additional
# result-specific context can be included in the ParmData if necessary.
#
# parm::           Index into parms.
# estMethod::      Index into estMethods.
# estMethodParms:: Hash from estMethodParms index to values.
ParmDef = Struct.new(:parm, :estMethod, :estMethodParms)
|
160
|
+
|
161
|
+
# Normative struct definition.
#
# est::            Value (or, if necessary, an array of values).
# lo::             Lower bound; presumed inclusive unless infinite (or nil).
# hi::             Upper bound; presumed inclusive unless infinite (or nil).
# estMethodParms:: Result-specific context (or nil).
ParmData = Struct.new(:est, :lo, :hi, :estMethodParms)
|
172
|
+
|
173
|
+
# Normative struct definition.
#
# id::          Primary key.
# indVars::     Array of factor identifiers (enum style).
# depVars::     Array of output variable identifiers (enum style).
# description:: Everything else as verbose text.
Experiment = Struct.new(:id, :indVars, :depVars, :description) do
  # Three-line summary: id and description, then the independent and the
  # dependent variables.
  def to_s
    [
      "Experiment id #{id}: #{description}",
      " Independent variables: #{indVars}",
      " Dependent variables: #{depVars}"
    ].join("\n")
  end
end
|
191
|
+
|
192
|
+
# Normative struct definition.
#
# A Key is used to retrieve either data or parameter estimates.
#
# experimentId:: References experiments.
# treat:: Array of factor values ordered per indVars. For data, all values
#         must be specified. For parms, nil works like a wildcard; e.g.,
#         for main effects only one factor has a specified level and all
#         others are nil. Factor values (levels) are not necessarily
#         numeric.
Key = Struct.new(:experimentId, :treat)
|
206
|
+
|
207
|
+
# Normative struct definition.
#
# BagOfHolding is a bag containing everything that is loaded from or saved
# to an export file except for the GECS version number.
#
# parms::          Array of strings defining pseudo-enum as used in a given data file. See Parms.
# estMethods::     Array of strings defining pseudo-enum as used in a given data file. See EstMethods.
# estMethodParms:: Array of strings defining pseudo-enum as used in a given data file. See EstMethodParms.
# invTypes::       Array of strings defining pseudo-enum as used in a given data file. See InvTypes.
# parmDefs::       Array of ParmDef structs defining pseudo-enum as used in a given data file.
# experiments::    Array of Experiment structs (index by experiment id).
# data::           Hash from Key to array (per depVars) of arrays (measurement values in chronological order).
# ests::           Hash from Key to array (per depVars) of hashes (from parmDefs index to ParmData).
BagOfHolding = Struct.new(
  :parms, :estMethods, :estMethodParms, :invTypes,
  :parmDefs, :experiments, :data, :ests
) do
  # The experiments, one per line.
  def to_s
    experiments.join("\n")
  end

  # Find-or-create lookup shared by the getOrAdd* methods below: returns
  # the index of name within array, appending name first if it is not
  # there yet. Raises if array is nil.
  def getOrAdd(array,name)
    raise "Can't add to nil" if array.nil?
    found = array.index(name)
    return found unless found.nil?
    array.push(name)
    array.length-1
  end

  # Enum value for a parameter name; parms is created on first use.
  def getOrAddParm(name)
    getOrAdd(self.parms ||= Array.new, name)
  end

  # Enum value for an estimation method name; estMethods is created on
  # first use.
  def getOrAddEstMethod(name)
    getOrAdd(self.estMethods ||= Array.new, name)
  end

  # Enum value for an estimation method parameter name; estMethodParms is
  # created on first use.
  def getOrAddEstMethodParm(name)
    getOrAdd(self.estMethodParms ||= Array.new, name)
  end

  # Enum value for an interval type name; invTypes is created on first use.
  def getOrAddInvType(name)
    getOrAdd(self.invTypes ||= Array.new, name)
  end

  # Enum value for a ParmDef; parmDefs is created on first use.
  def getOrAddParmDef(parmDef)
    getOrAdd(self.parmDefs ||= Array.new, parmDef)
  end
end
|
264
|
+
|
265
|
+
# Normative struct definition.
#
# DoubleBag pairs a GECS version number with a BagOfHolding. The format
# version identifier is added/removed by save/load.
DoubleBag = Struct.new(
  :version,
  :bagOfHolding
)
|
270
|
+
|
271
|
+
# --------------------------------------------------------------------------
|
272
|
+
|
273
|
+
# Convenience methods.
|
274
|
+
|
275
|
+
# Save a GECS database to a file.
#
# The bag is wrapped in a DoubleBag together with the current Version and
# serialized with Marshal.
#
# filename::     Path of the file to (over)write.
# bagOfHolding:: A BagOfHolding.
def GECS.save(filename, bagOfHolding)
  # Marshal output is binary, so the file is opened in binary mode. The
  # block form closes (and therefore flushes) the file before returning, so
  # a GECS.load that follows immediately sees the complete dump. File.open
  # is used rather than Kernel#open so that a filename beginning with "|"
  # is never run as a command.
  File.open(filename, "wb") do |file|
    Marshal.dump(DoubleBag.new(Version, bagOfHolding), file)
  end
end
|
281
|
+
|
282
|
+
# Load a GECS database from a file. Returns a BagOfHolding.
#
# Raises if the file does not contain a DoubleBag or if it was written by a
# later version of GECS than the one loaded.
#
# SECURITY NOTE: Marshal.load can instantiate arbitrary objects. Only load
# export files that come from a trusted source.
#
# filename:: Path of a file previously written by GECS.save.
def GECS.load(filename)
  # Binary mode matches the Marshal format; the block form closes the file.
  temp = File.open(filename, "rb") { |file| Marshal.load(file) }
  raise "File does not contain a GECS database" unless temp.is_a?(DoubleBag)
  if temp.version > Version
    raise "File format version is later than GECS.rb version"
  end
  temp.bagOfHolding
end
|
290
|
+
|
291
|
+
# Simplify creation of a new database by leaving the enums (and ests) nil.
# Since enums are looked up using a find-or-create pattern, there is no
# harm in starting with nils.
#
# experiments:: Array of Experiment structs (index by experiment id).
# data::        Hash from Key to array (per depVars) of arrays (measurement values in chronological order).
def GECS.newBag(experiments,data)
  # A Struct built without arguments has every member nil.
  bag = BagOfHolding.new
  bag.experiments = experiments
  bag.data = data
  bag
end
|
300
|
+
|
301
|
+
# Dump an experiment's raw data (not parameter estimates) as an R table
# (with header) with a column for each independent and dependent variable
# and N rows for each treatment, where N is the length of the longest
# series of that treatment. Short and missing series are padded with NAs.
#
# Returns the table as a String.
#
# Raises ArgumentError if the bag or its experiments are nil or the id is
# out of range, and RuntimeError if the experiment has no variables or a
# treatment's data is nil or does not have one entry per depVar.
#
# bagOfHolding:: A BagOfHolding.
# id::           Experiment id.
def GECS.dumpData(bagOfHolding,id)
  # These used to be `throw "..."`. With no matching catch a throw only
  # surfaces as an "uncaught throw" error; raise ArgumentError states the
  # problem directly and is still rescued by `rescue ArgumentError`.
  raise ArgumentError, "Bag is nil" if bagOfHolding.nil?
  raise ArgumentError, "Experiments are nil" if bagOfHolding.experiments.nil?
  raise ArgumentError, "No such experiment" if id >= bagOfHolding.experiments.length
  exp = bagOfHolding.experiments[id]
  raise "Experiment has no independent variables!" if exp.indVars.empty?
  raise "Experiment has no dependent variables!" if exp.depVars.empty?

  dump = quotesome(exp.indVars) + " " + quotesome(exp.depVars) + "\n"
  results = bagOfHolding.data.select { |k, _v| k.experimentId == id }
  results.each do |k, cellarray|
    raise "Null treatment data" if cellarray.nil?
    raise "Bad treatment data" if cellarray.length != exp.depVars.length
    # A missing (nil) series still gets one row of NA.
    maxlen = cellarray.map { |cell| cell.nil? ? 1 : cell.length }.max
    maxlen.times do |iteration|
      dump << quotesome(k.treat)
      cellarray.each do |cell|
        # Indexing past the end of a short series yields nil, i.e. NA.
        dump << " " << quotemaybe(cell.nil? ? nil : cell[iteration]).to_s
      end
      dump << "\n"
    end
  end
  dump
end
|
327
|
+
|
328
|
+
# Dump parameter estimates for an experiment as an R table (with header)
# with crudely constructed column names: depVar X parmDefId X [est, lo,
# hi, optionally prec]. Estimates are assumed to be scalars. If prec is
# true, add a prec column for each parameter containing the value of the
# estMethodParm "estimated attained precision of adaptive bootstrap".
#
# The table is preceded by "#" comment lines that explain each parmDef id.
# The columns cover every parmDef id that occurs in any cell of the
# experiment; a cell that is nil or lacks a given parameter is written as
# NA. An experiment without estimates produces just the header lines.
#
# Returns the table as a String. Raises ArgumentError if the bag or its
# experiments are nil or the id is out of range.
#
# bagOfHolding:: A BagOfHolding.
# id::           Experiment id.
# prec::         Whether to add the prec columns.
def GECS.dumpParms(bagOfHolding,id,prec=false)
  raise ArgumentError, "Bag is nil" if bagOfHolding.nil?
  raise ArgumentError, "Experiments are nil" if bagOfHolding.experiments.nil?
  raise ArgumentError, "No such experiment" if id >= bagOfHolding.experiments.length
  experiment = bagOfHolding.experiments[id]
  ests = (bagOfHolding.ests || {}).select { |k, _v| k.experimentId == id }

  # Union of the parmDef ids over all cells. (Looking only at the first
  # cell fails when there are no estimates or when that cell is nil.)
  parms = ests.values.flatten.compact.flat_map { |cell| cell.keys }.uniq.sort
  precparm = bagOfHolding.getOrAddEstMethodParm("estimated attained precision of adaptive bootstrap") if prec

  # Collect the lines in an array and join once at the end.
  lines = ["# Key to parameter ID numbers:"]
  parms.each do |x|
    parmDef = bagOfHolding.parmDefs[x]
    line = "# #{x} = #{bagOfHolding.parms[parmDef.parm]}, #{bagOfHolding.estMethods[parmDef.estMethod]}"
    (parmDef.estMethodParms || {}).each do |k, v|
      line << ", #{bagOfHolding.estMethodParms[k]}=#{v}"
    end
    lines << line
  end

  header = [quotesome(experiment.indVars)]
  experiment.depVars.each do |d|
    parms.each do |p|
      header << "\"#{d} #{p} est\"" << "\"#{d} #{p} lo\"" << "\"#{d} #{p} hi\""
      header << "\"#{d} #{p} prec\"" if prec
    end
  end
  lines << header.join(" ")

  ests.each do |k, cells|
    row = [quotesome(k.treat)]
    (cells || []).each do |cell|
      parms.each do |p|
        parm = cell.nil? ? nil : cell[p]
        if parm.nil?
          row.concat(["NA", "NA", "NA"])
          row << "NA" if prec
        else
          row << quotemaybe(parm.est).to_s << quotemaybe(parm.lo).to_s << quotemaybe(parm.hi).to_s
          if prec
            row << (parm.estMethodParms.nil? ? "NA" : quotemaybe(parm.estMethodParms[precparm]).to_s)
          end
        end
      end
    end
    lines << row.join(" ")
  end

  lines.join("\n") + "\n"
end
|
385
|
+
|
386
|
+
# (Private) Helper method to quote values that aren't numeric.
#
# Nil becomes the string NA, a String is escaped and wrapped in double
# quotes, and any other value is returned unchanged.
#
# oneval:: A single value to be quoted or not.
def GECS.quotemaybe(oneval)
  case oneval
  when nil
    "NA"
  when String
    # R 3.0.2 looks like it is re-escaping strings on input so that \\
    # turns into \\\\, yet this is the minimum amount of escaping that gets
    # everything through read.table without choking. allowEscapes=F only
    # makes it choke even more. The worst case seems to be when a string
    # ends with a backslash.
    #
    # Every backslash and every double quote gets one backslash in front.
    escaped = oneval.gsub(/[\\"]/) { |ch| "\\" + ch }
    "\"#{escaped}\""
  else
    oneval
  end
end
|
403
|
+
|
404
|
+
# (Private) Helper method to quote the values of an array that aren't
# numeric and join the results with single spaces. Were they always
# well-behaved values, k.treat.join(" ") would suffice.
#
# treat:: An array of values that might need to be quoted.
def GECS.quotesome(treat)
  quoted = treat.map { |level| quotemaybe(level) }
  quoted.join(" ")
end
|
411
|
+
|
412
|
+
# Print out parameter metadata for an experiment: for every treatment with
# estimates and every depvar, the parameter name, the estimation method,
# and the global (ParmDef) and local (ParmData) estimation method
# parameters.
#
# bagOfHolding:: A BagOfHolding.
# id::           Experiment id.
def GECS.describeParms(bagOfHolding,id)
  experiment = bagOfHolding.experiments[id]
  selected = bagOfHolding.ests.select { |k, _v| k.experimentId == id }
  selected.each do |k, v|
    print "Treatment " + k.treat.to_s + "\n"
    v.each_with_index do |cell, di|
      print " Depvar ", experiment.depVars[di], "\n"
      if cell.nil?
        print " Not applicable\n"
        next
      end
      cell.each do |pk, parmData|
        parmDef = bagOfHolding.parmDefs[pk]
        print " Parameter: ", bagOfHolding.parms[parmDef.parm], "\n"
        print " Estimation method: ", bagOfHolding.estMethods[parmDef.estMethod], "\n"
        print " Global estimation method parameters:", "\n"
        printEstMethodParms(bagOfHolding, parmDef.estMethodParms)
        print " Local estimation method parameters:", "\n"
        printEstMethodParms(bagOfHolding, parmData.estMethodParms)
      end
    end
  end
end
|
436
|
+
|
437
|
+
# (Private) Helper method for ::describeParms. Prints one "name = value"
# line per estimation method parameter, or " nil" when there are none.
#
# bagOfHolding::   A BagOfHolding (supplies the parameter names).
# estMethodParms:: Hash from estMethodParms index to value, or nil.
def GECS.printEstMethodParms(bagOfHolding,estMethodParms)
  return puts(" nil") if estMethodParms.nil?
  estMethodParms.each do |k, v|
    print " ", bagOfHolding.estMethodParms[k], " = ", v, "\n"
  end
end
|
447
|
+
|
448
|
+
# --------------------------------------------------------------------------
|
449
|
+
|
450
|
+
# Helper functions to deal with main effects and interactions.
|
451
|
+
|
452
|
+
# (Private) Equality test for treatments that implements nil as wildcard:
# two treatments match when, at every position, the levels are equal or at
# least one of them is nil.
#
# Raises ArgumentError if either treatment is nil or if their lengths
# differ.
#
# a:: Array of factor levels.
# b:: Array of factor levels.
def GECS.treatEq(a,b)
  # These used to be `throw "..."`, which only surfaces as an "uncaught
  # throw" error; raise ArgumentError is still rescued by
  # `rescue ArgumentError`.
  raise ArgumentError, "Nil treatment passed to treatEq" if a.nil? || b.nil?
  raise ArgumentError, "Length mismatch" if a.length != b.length
  a.zip(b).all? { |x, y| x.nil? || y.nil? || x == y }
end
|
461
|
+
|
462
|
+
# (Private) Refactor the data of an experiment according to a specified
# effect and extract the data from the specified cell: the series of depvar
# di are concatenated over every treatment of the experiment that matches
# key.treat (nil acting as a wildcard). If there are no nils in key, this
# is just a slow way of doing bagOfHolding.data[key][di].
#
# Returns an Array, or nil if nothing matched or every match was nil.
#
# key:: Key whose treat may contain nils.
# di::  depvar index
def GECS.refactorExtract(bagOfHolding,key,di)
  id = key.experimentId
  pooled = nil
  bagOfHolding.data.each do |k, v|
    next unless k.experimentId == id
    next unless treatEq(key.treat, k.treat)
    next if v.nil? || v[di].nil?
    pooled ||= Array.new
    pooled.concat(v[di])
  end
  pooled
end
|
482
|
+
|
483
|
+
# (Private) Return true if a given key matches any data at all, i.e. if
# some treatment of the same experiment matches key.treat (nil acting as a
# wildcard) and has a non-nil data entry. The individual depvar series are
# not inspected.
def GECS.matchesSomething(bagOfHolding,key)
  id = key.experimentId
  bagOfHolding.data.any? do |k, v|
    k.experimentId == id && treatEq(key.treat, k.treat) && !v.nil?
  end
end
|
497
|
+
|
498
|
+
# (Private) Make a list of the Keys for all treatments, main effects, and
# 2-way interactions for an experiment. An attempt is made to suppress
# interactions for which there are no data at all (combinations of levels
# that don't occur).
#
# The returned Array holds the treatment keys first (in data order), then
# the main effects, then the 2-way interactions. Single-factor experiments
# yield just their treatment keys, and with two factors the 2-way
# interactions are omitted because they would duplicate the treatments.
#
# Raises ArgumentError if the bag, its experiments, the experiment or its
# variable lists are nil or empty, or if the id is out of range.
#
# bagOfHolding:: A BagOfHolding.
# id::           Experiment id.
def GECS.enumerateKeys(bagOfHolding,id)
  require 'set'
  # These used to be `throw "..."`, which only surfaces as an "uncaught
  # throw" error; raise ArgumentError is still rescued by
  # `rescue ArgumentError`.
  raise ArgumentError, "Bag is nil" if bagOfHolding.nil?
  raise ArgumentError, "Experiments are nil" if bagOfHolding.experiments.nil?
  raise ArgumentError, "No such experiment" if id >= bagOfHolding.experiments.length

  exp = bagOfHolding.experiments[id]
  raise ArgumentError, "Null experiment" if exp.nil?
  raise ArgumentError, "Null indvars" if exp.indVars.nil?
  raise ArgumentError, "Null depvars" if exp.depVars.nil?
  numfacs = exp.indVars.length
  numdeps = exp.depVars.length
  raise ArgumentError, "Not enough indvars" if numfacs < 1
  raise ArgumentError, "Not enough depvars" if numdeps < 1

  # All treatments of this experiment, in data order.
  r = bagOfHolding.data.select { |k, _v| k.experimentId == id }.keys
  # Short cut for single-factor experiments.
  return r if numfacs == 1

  # Enumerate the levels of all of the factors.
  levels = Array.new(numfacs) { Set.new }
  r.each do |k|
    numfacs.times { |fac| levels[fac].add(k.treat[fac]) }
  end

  # Main effects: one specified level, wildcards everywhere else. Every
  # key gets its own treat array.
  levels.each_with_index do |facLevels, fac|
    facLevels.each do |lvl|
      treat = Array.new(numfacs, nil)
      treat[fac] = lvl
      r.push(Key.new(id, treat))
    end
  end

  # 2-way interactions. Don't duplicate the treatments when numfacs==2.
  if numfacs > 2
    (0..numfacs-2).each do |fac1|
      (fac1+1..numfacs-1).each do |fac2|
        levels[fac1].each do |lvl1|
          levels[fac2].each do |lvl2|
            treat = Array.new(numfacs, nil)
            treat[fac1] = lvl1
            treat[fac2] = lvl2
            key = Key.new(id, treat)
            # Skip combinations of levels that never occur in the data.
            r.push(key) if matchesSomething(bagOfHolding, key)
          end
        end
      end
    end
  end
  r
end
|
562
|
+
|
563
|
+
# --------------------------------------------------------------------------
|
564
|
+
|
565
|
+
# rubygems.org/gems/statistics2 is required for the inverse t distribution.
|
566
|
+
# Unfortunately, as of version 0.54, its agreement with R and Octave is
|
567
|
+
# only to 4 decimals or so:
|
568
|
+
|
569
|
+
# R version 3.0.2 (2013-09-25) -- "Frisbee Sailing"
|
570
|
+
# > qt(0.975,5)
|
571
|
+
# [1] 2.570582
|
572
|
+
|
573
|
+
# GNU Octave, version 3.6.3
|
574
|
+
# octave:1> printf ("%f\n", tinv(0.975,5))
|
575
|
+
# 2.570582
|
576
|
+
|
577
|
+
# irb(main):002:0> puts Gem.loaded_specs["statistics2"].version,
|
578
|
+
# irb(main):003:0* Statistics2::ptdist(5,0.975)
|
579
|
+
# 0.54
|
580
|
+
# 2.57051
|
581
|
+
|
582
|
+
# (Private) Calculate the quick interval for the mean of some data: the
# sample mean plus/minus the 0.975 quantile of the t distribution with
# count-1 degrees of freedom times the standard error of the mean.
#
# Returns a ParmData, or nil if data is nil. Degenerate samples do not
# crash: for empty data every field of the ParmData is nil, and for a
# single observation est is that value while lo and hi are nil, because
# the sample variance needs at least two observations.
#
# data:: Array of numbers, or nil.
def GECS.quickInterval(data)
  return nil if data.nil?
  count = data.length
  return ParmData.new(nil, nil, nil, nil) if count == 0
  mean = data.reduce(:+).to_f/count
  return ParmData.new(mean, nil, nil, nil) if count < 2

  # Loaded only when an interval is actually computed.
  require 'statistics2'
  variance = data.map { |x| (mean-x)**2 }.reduce(:+)/(count-1.0)
  meanU = Math.sqrt(variance/count)*Statistics2::ptdist(count-1,0.975)
  ParmData.new(mean, mean-meanU, mean+meanU, nil)
end
|
597
|
+
|
598
|
+
# Add the following parameter to the data and effects of a specified
# experiment: mean, original method, 95% confidence. This is a quick way
# to summarize results when more complicated options are not needed. All
# values are computed as Floats with no respect for the original data type
# or its precision.
#
# The estimates are stored in bagOfHolding.ests, which is created if
# necessary; the enums and parmDefs of the bag are extended as needed.
#
# Raises ArgumentError if the bag, its experiments, the experiment or its
# variable lists are nil or empty, or if the id is out of range.
#
# bagOfHolding:: A BagOfHolding.
# id::           Experiment id.
def GECS.quickMeans(bagOfHolding,id)
  # These used to be `throw "..."`, which only surfaces as an "uncaught
  # throw" error; raise ArgumentError is still rescued by
  # `rescue ArgumentError`.
  raise ArgumentError, "Bag is nil" if bagOfHolding.nil?
  raise ArgumentError, "Experiments are nil" if bagOfHolding.experiments.nil?
  raise ArgumentError, "No such experiment" if id >= bagOfHolding.experiments.length

  exp = bagOfHolding.experiments[id]
  raise ArgumentError, "Null experiment" if exp.nil?
  raise ArgumentError, "Null indvars" if exp.indVars.nil?
  raise ArgumentError, "Null depvars" if exp.depVars.nil?
  numfacs = exp.indVars.length
  numdeps = exp.depVars.length
  raise ArgumentError, "Not enough indvars" if numfacs < 1
  raise ArgumentError, "Not enough depvars" if numdeps < 1

  parmDef = ParmDef.new(bagOfHolding.getOrAddParm("mean"),
                        bagOfHolding.getOrAddEstMethod("original"),
                        {bagOfHolding.getOrAddEstMethodParm("coverage probability")=>0.95})
  meanId = bagOfHolding.getOrAddParmDef(parmDef)
  bagOfHolding.ests ||= Hash.new
  enumerateKeys(bagOfHolding,id).each do |key|
    bagOfHolding.ests[key] ||= Array.new(numdeps,nil)
    numdeps.times do |di|
      cell = refactorExtract(bagOfHolding,key,di)
      # No data for this depvar under this key: leave the slot untouched.
      next if cell.nil?
      bagOfHolding.ests[key][di] ||= Hash.new
      bagOfHolding.ests[key][di][meanId] = quickInterval(cell)
    end
  end
end
|
636
|
+
|
637
|
+
# --------------------------------------------------------------------------
|
638
|
+
|
639
|
+
# (Private) Unit of parallelization.
#
# key::    A Key to data or ests, as applicable.
# di::     Depvar index.
# sernum:: Unique integer.
# out::    Returned interval.
ParPod = Struct.new(:key, :di, :sernum, :out)
|
646
|
+
|
647
|
+
# (Private) Create ParmData from ParPod return. Returns nil when the pod
# has no output.
#
# pod.out is read positionally: [0] bootstrap replica count, [1] estimated
# attained precision, [2] estimate, [3] lower bound, [4] upper bound.
#
# bagOfHolding:: A BagOfHolding (its estMethodParms enum may be extended).
# delta::        Array of requested numerical tolerances per depvar.
# pod::          A ParPod.
def GECS.parmRet(bagOfHolding,delta,pod)
  out = pod.out
  return nil if out.nil?

  # The enum lookups are done in this order so that new enum values are
  # assigned the same indices as before.
  context = Hash.new
  context[bagOfHolding.getOrAddEstMethodParm("bootstrap replica count")] = out[0].to_i
  context[bagOfHolding.getOrAddEstMethodParm("numerical tolerance of adaptive bootstrap")] = delta[pod.di]
  context[bagOfHolding.getOrAddEstMethodParm("estimated attained precision of adaptive bootstrap")] = out[1].to_f
  ParmData.new(out[2].to_f, out[3].to_f, out[4].to_f, context)
end
|
658
|
+
|
659
|
+
# Add the following parameter to the results and effects of a specified
# experiment: mean, bootstrap method, BCa interval, 95% confidence,
# adaptive determination of bootstrap replica count to achieve the
# requested numerical tolerances for each depvar. The estimate provided is
# the sample mean, not the bootstrap estimate. Count will be at least
# 50000.
#
# This function is parallelized. All available CPUs will be used to run
# bootstrap calculations in R.
#
# Temporary files named bootstrap-in-<n>.txt are written into the current
# working directory and are deleted again even if the R run fails.
#
# Raises ArgumentError (or TypeError) for invalid arguments, including
# deltas that are not numeric, and RuntimeError ("Bootstrap failure") if R
# does not return the expected five values.
#
# bagOfHolding:: A BagOfHolding.
# id::           Experiment id.
# delta:: Array of numbers specifying desired numerical tolerances for each
#         depvar. To reduce variation below the resolution of a typical
#         plot of height 1000 pixels, you'd want delta something like
#         (max(y)-min(y))/2000 for whatever range of y is being plotted.
def GECS.bootstrapMeans(bagOfHolding,id,delta)
  require 'parallel'
  # These used to be `throw "..."`, which only surfaces as an "uncaught
  # throw" error; raise ArgumentError is still rescued by
  # `rescue ArgumentError`.
  raise ArgumentError, "Bag is nil" if bagOfHolding.nil?
  raise ArgumentError, "Experiments are nil" if bagOfHolding.experiments.nil?
  raise ArgumentError, "No such experiment" if id >= bagOfHolding.experiments.length

  exp = bagOfHolding.experiments[id]
  raise ArgumentError, "Null experiment" if exp.nil?
  raise ArgumentError, "Null indvars" if exp.indVars.nil?
  raise ArgumentError, "Null depvars" if exp.depVars.nil?
  numfacs = exp.indVars.length
  numdeps = exp.depVars.length
  raise ArgumentError, "Not enough indvars" if numfacs < 1
  raise ArgumentError, "Not enough depvars" if numdeps < 1
  raise ArgumentError, "Wrong number of deltas" if delta.length != numdeps

  # The tolerances are spliced into a shell command below. Converting them
  # with Float() up front rejects anything that is not a number before any
  # work is started and guarantees that only numeric text reaches the
  # shell.
  tolerances = delta.map { |d| Float(d) }

  parmDef = ParmDef.new(bagOfHolding.getOrAddParm("mean"),
                        bagOfHolding.getOrAddEstMethod("bootstrap, BCa interval"),
                        {bagOfHolding.getOrAddEstMethodParm("coverage probability")=>0.95})
  meanId = bagOfHolding.getOrAddParmDef(parmDef)

  # Make a list of the data and effects that need fixing with a unique
  # serial number assigned to each.
  sernum = 0
  pods = Array.new
  enumerateKeys(bagOfHolding,id).each do |key|
    numdeps.times do |di|
      pods.push(ParPod.new(key,di,(sernum+=1),nil))
    end
  end

  # Run numCPUs instances of BCa.R in parallel.
  pods = Parallel.map(pods) do |pod|
    poddata = refactorExtract(bagOfHolding,pod.key,pod.di)
    unless poddata.nil?
      descript = "Treatment " + pod.key.treat.to_s + " depvar " + pod.di.to_s
      fnam = "bootstrap-in-" + pod.sernum.to_s + ".txt"
      begin
        # Block form closes the file even if the write fails.
        File.open(fnam,"w") { |fp| fp.puts(poddata) }

        # This R script that has been mangled onto the command line to avoid
        # adding another external dependency is mostly just a wrapper for
        # BCa.R, but the bootstrap estimate of the mean is replaced by the
        # sample mean.
        cmd = "Rscript " +
          "-e 'library(\"bootBCa\")' " +
          "-e 'data <- unlist(read.table(\"" + fnam + "\",header=F,colClasses=\"numeric\"))' " +
          "-e 'out <- BCa(data,as.numeric(" + tolerances[pod.di].to_s + "),mean)' " +
          "-e 'cat(sprintf(\"%d %0.16f %0.16f %0.16f %0.16f\\n\",out[1],out[2],mean(data),out[4],out[5]))'"

        pod.out = `#{cmd}`.split
      ensure
        # Never leave the temporary input file behind.
        File.delete(fnam) if File.exist?(fnam)
      end
      if pod.out.length < 5
        # This should never happen.
        print "Bootstrap failure\n"
        print " ", descript, "\n"
        print " Depvar: ", bagOfHolding.experiments[id].depVars[pod.di], "\n"
        raise "Bootstrap failure"
      end

      # Verbose progress reporting
      print descript + ", " + pod.out[0] + " iterations done\n"
    end
    pod
  end

  # Copy back parameter estimates.
  bagOfHolding.ests ||= Hash.new
  pods.each do |pod|
    pd = parmRet(bagOfHolding,delta,pod)
    next if pd.nil?
    bagOfHolding.ests[pod.key] ||= Array.new(numdeps,nil)
    bagOfHolding.ests[pod.key][pod.di] ||= Hash.new
    bagOfHolding.ests[pod.key][pod.di][meanId] = pd
  end
end
|
753
|
+
|
754
|
+
end
|
data/test/test_GECS.rb
ADDED
@@ -0,0 +1,46 @@
|
|
1
|
+
require 'test/unit'
|
2
|
+
require 'GECS'
|
3
|
+
|
4
|
+
class GECSTest < Test::Unit::TestCase

  # End-to-end smoke test: bootstrap the means of two 30-value samples and
  # check that each interval contains the mean the sample was drawn with.
  # Needs R and the bootBCa package.
  def test1
    assert_nothing_raised do
      assert_nothing_thrown do
        puts "This test requires you to install R and the bootBCa package."
        puts "R: www.r-project.org"
        puts "bootBCa: bootbca.r-forge.r-project.org\n\n"
        xid = 0
        experiment = GECS::Experiment.new(
          xid, ["input"], ["output"],
          "A silly, simple, small-sample test is better than nothing")
        key5 = GECS::Key.new(xid,[5])
        key10 = GECS::Key.new(xid,[10])
        results = {
          # rnorm(n=30,mean=5,sd=5)
          key5 => [[
            8.1239862, 0.7075978, 18.3996805, 1.3803859, 8.1125496, 0.5903406,
            0.8996330, 4.8486252, -2.8038801, 13.7444565, 6.5451796, 6.8989424,
            7.2075758, 19.0412270, 2.6637848, -1.7874951, 9.7959101, -0.4751056,
            9.1401820, 9.8164959, 12.0606408, 6.0108636, -0.5488287, 12.0728692,
            6.7253890, 6.4999955, 1.9586040, 2.2619639, 2.2761585, 3.5316891]],
          # rnorm(n=30,mean=10,sd=5)
          key10 => [[
            10.225934, 7.991545, 8.665374, 5.550484, 6.315061, 17.274522,
            11.291662, 19.324338, 8.760387, 5.561073, 10.225028, 7.939054,
            13.094538, 7.893948, 2.026159, 19.918737, 11.955975, 7.056023,
            2.883466, -2.054165, 8.938372, 3.319929, 18.280414, 5.495548,
            4.556700, 1.883278, 7.677032, 7.741751, 15.817214, 16.886633]]
        }
        bag = GECS.newBag([experiment],results)
        # Fail here because they didn't install bootBCa, or R.
        GECS.bootstrapMeans(bag, xid, [0.1])
        puts "\n", bag, "\n", GECS.dumpParms(bag,xid,true)
        # [key][depvar index][parmDef index]
        # We have only one depvar and only one parmDef.
        parm = bag.ests[key5][0][0]
        assert(parm.lo < 5 && 5 < parm.hi)
        parm = bag.ests[key10][0][0]
        assert(parm.lo < 10 && 10 < parm.hi)
      end
    end
  end

end
|
metadata
ADDED
@@ -0,0 +1,84 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: GECS
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: '1'
|
5
|
+
prerelease:
|
6
|
+
platform: ruby
|
7
|
+
authors:
|
8
|
+
- David Flater
|
9
|
+
autorequire:
|
10
|
+
bindir: bin
|
11
|
+
cert_chain: []
|
12
|
+
date: 2014-07-31 00:00:00.000000000 Z
|
13
|
+
dependencies:
|
14
|
+
- !ruby/object:Gem::Dependency
|
15
|
+
name: parallel
|
16
|
+
requirement: !ruby/object:Gem::Requirement
|
17
|
+
none: false
|
18
|
+
requirements:
|
19
|
+
- - ! '>='
|
20
|
+
- !ruby/object:Gem::Version
|
21
|
+
version: 0.9.2
|
22
|
+
type: :runtime
|
23
|
+
prerelease: false
|
24
|
+
version_requirements: !ruby/object:Gem::Requirement
|
25
|
+
none: false
|
26
|
+
requirements:
|
27
|
+
- - ! '>='
|
28
|
+
- !ruby/object:Gem::Version
|
29
|
+
version: 0.9.2
|
30
|
+
- !ruby/object:Gem::Dependency
|
31
|
+
name: statistics2
|
32
|
+
requirement: !ruby/object:Gem::Requirement
|
33
|
+
none: false
|
34
|
+
requirements:
|
35
|
+
- - ! '>='
|
36
|
+
- !ruby/object:Gem::Version
|
37
|
+
version: '0.54'
|
38
|
+
type: :runtime
|
39
|
+
prerelease: false
|
40
|
+
version_requirements: !ruby/object:Gem::Requirement
|
41
|
+
none: false
|
42
|
+
requirements:
|
43
|
+
- - ! '>='
|
44
|
+
- !ruby/object:Gem::Version
|
45
|
+
version: '0.54'
|
46
|
+
description: This software is experimental. NIST assumes no responsibility whatsoever
|
47
|
+
for its use by other parties and makes no guarantees, expressed or implied, about
|
48
|
+
its quality, reliability, or any other characteristic.
|
49
|
+
email: dflater@nist.gov
|
50
|
+
executables: []
|
51
|
+
extensions: []
|
52
|
+
extra_rdoc_files: []
|
53
|
+
files:
|
54
|
+
- lib/GECS.rb
|
55
|
+
- test/test_GECS.rb
|
56
|
+
- Rakefile
|
57
|
+
homepage: http://www.nist.gov/itl/ssd/cs/software-performance.cfm
|
58
|
+
licenses:
|
59
|
+
- Unlicense
|
60
|
+
post_install_message:
|
61
|
+
rdoc_options: []
|
62
|
+
require_paths:
|
63
|
+
- lib
|
64
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
65
|
+
none: false
|
66
|
+
requirements:
|
67
|
+
- - ! '>='
|
68
|
+
- !ruby/object:Gem::Version
|
69
|
+
version: '0'
|
70
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
71
|
+
none: false
|
72
|
+
requirements:
|
73
|
+
- - ! '>='
|
74
|
+
- !ruby/object:Gem::Version
|
75
|
+
version: '0'
|
76
|
+
requirements:
|
77
|
+
- ! 'R: www.r-project.org'
|
78
|
+
- ! 'bootBCa: bootbca.r-forge.r-project.org'
|
79
|
+
rubyforge_project:
|
80
|
+
rubygems_version: 1.8.23
|
81
|
+
signing_key:
|
82
|
+
specification_version: 3
|
83
|
+
summary: Gem for Experimental Computer Science
|
84
|
+
test_files: []
|