GECS 1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (4) hide show
  1. data/Rakefile +10 -0
  2. data/lib/GECS.rb +754 -0
  3. data/test/test_GECS.rb +46 -0
  4. metadata +84 -0
@@ -0,0 +1,10 @@
1
# Adapted from the RubyGems Guide, "Make your own gem," section "Writing tests."

require 'rake/testtask'

# Define the test task and append 'test' to the task's list of library
# directories.
Rake::TestTask.new { |t| t.libs << 'test' }

# Running rake with no arguments runs the test task.
desc "Run tests"
task :default => :test
@@ -0,0 +1,754 @@
1
+ #!/usr/bin/ruby
2
+ # Gem for Experimental Computer Science
3
+
4
+ # Gem for Experimental Computer Science
5
+ #
6
+ # Author:: David Flater <dflater@nist.gov>
7
+ # Copyright:: Public domain
8
+ # License:: Unlicense
9
+ #
10
+ # This software is experimental. NIST assumes no responsibility whatsoever
11
+ # for its use by other parties and makes no guarantees, expressed or
12
+ # implied, about its quality, reliability, or any other characteristic.
13
+ #
14
+ # == Conventions used within the GECS module
15
+ #
16
+ # The following abbreviations are used:
17
+ # Parm:: parameter
18
+ # Est:: estimation or estimate
19
+ # Inv:: interval
20
+ # Ind:: independent
21
+ # Dep:: dependent
22
+ # Var:: variable
23
+ # Treat:: treatment (a vector of factor levels)
24
+ #
25
+ # Arrays of character strings (or in one case, ParmDef structs) are used as a
26
+ # kind of "enum." Values of the enum pseudo-type are unsigned integers that
27
+ # simply index the array to provide a concise identifier for whatever the
28
+ # referenced character string (or ParmDef) describes.
29
+ #
30
+ # A good many helper methods that should not be exposed through the GECS API
31
+ # unfortunately are exposed and get listed by rdoc. These have been tagged
32
+ # (Private) in their comments.
33
+ #
34
+ # Rdoc lists struct definitions in the Constants section.
35
+ #
36
+ # == Dependencies
37
+ #
38
+ # To use the ::bootstrapMeans method:
39
+ # * The Ruby gem parallel[rubygems.org/gems/parallel] must be installed.
40
+ # Tested version: 0.9.2.
41
+ # * The R[www.r-project.org] environment for statistical computing must be
42
+ # runnable from a shell command line. Tested version: 3.1.0.
43
+ # * The R package bootBCa[bootbca.r-forge.r-project.org] must be installed.
44
+ # Tested version: 1.0.
45
+ #
46
+ # To use the ::quickMeans method:
47
+ # * The Ruby gem statistics2[rubygems.org/gems/statistics2] must be installed.
48
+ # Tested version: 0.54.
49
+ #
50
+ # Both ::bootstrapMeans and ::quickMeans use set, a standard
51
+ # pre-installed class.
52
+ #
53
+ # == Bad behaviors
54
+ #
55
+ # ::bootstrapMeans writes temporary files into the current working directory.
56
+ # Normally, they will be deleted when no longer in use.
57
+ #
58
+ # The inverse t distribution in statistics2 that is used by ::quickMeans
59
+ # agrees with R and Octave only to 4 decimals or so.
60
+
61
module GECS

  # Integer constant indicating the version of GECS that has been loaded.
  # There is no major/minor/patchlevel encoding.  It just increments.
  Version = 1

  # Array of character strings (pseudo-enum definition) used to identify
  # parameters.
  #
  # Parameters are theoretically fixed but unknown metrics of a population
  # that is bigger than the sample.  They can only be estimated, and with
  # estimation methods potentially being computationally expensive and
  # complex, it may be important to preserve those estimates.
  #
  # - mean
  # - standard deviation
  # - variance
  #
  # This is a non-prescriptive definition for example or default use.
  # Each export file encapsulates its own definitions.
  Parms = [
    "mean",
    "standard deviation",
    "variance"
  ]

  # Array of character strings (pseudo-enum definition) used to identify
  # estimation methods.
  #
  # - original
  # - bootstrap, percentile interval
  # - bootstrap, BCa interval
  # - bootstrap-t interval
  #
  # This is a non-prescriptive definition for example or default use.
  # Each export file encapsulates its own definitions.
  EstMethods = [
    "original",                       # Standard formulae using normal approximations.
    "bootstrap, percentile interval",
    "bootstrap, BCa interval",
    "bootstrap-t interval"            # A.k.a. studentized bootstrap
  ]

  # Array of character strings (pseudo-enum definition) used to identify
  # parameters of estimation methods.
  #
  # Different estimation methods will have different parameters.
  # Interval type matters for asymmetrical distributions.
  # Nested bootstrap replica count is used for bootstrap-t.
  # An adaptive bootstrap may vary the replica count to achieve a precision
  # specified as a numerical tolerance.
  #
  # - coverage probability
  # - interval type
  # - bootstrap replica count
  # - nested bootstrap replica count
  # - numerical tolerance of adaptive bootstrap
  # - estimated attained precision of adaptive bootstrap
  #
  # This is a non-prescriptive definition for example or default use.
  # Each export file encapsulates its own definitions.
  EstMethodParms = [
    "coverage probability",
    "interval type",
    "bootstrap replica count",
    "nested bootstrap replica count",
    "numerical tolerance of adaptive bootstrap",
    "estimated attained precision of adaptive bootstrap"
  ]

  # Array of character strings (pseudo-enum definition) used to identify
  # types of confidence intervals.
  #
  # - probabilistically symmetric
  # - shortest
  #
  # This is a non-prescriptive definition for example or default use.
  # Each export file encapsulates its own definitions.
  InvTypes = [
    "probabilistically symmetric",    # A.k.a. equi-tailed.
    "shortest"
  ]

  # --------------------------------------------------------------------------

  # Normative struct definition.
  #
  # A ParmDef is a "parameterized parameter" providing additional context to
  # disambiguate alternative ways of estimating the parameter.  Additional
  # result-specific context can be included in the ParmData if necessary.
  #
  # parm::            Index into parms.
  # estMethod::       Index into estMethods.
  # estMethodParms::  Hash from estMethodParms index to values.
  ParmDef = Struct.new(
    :parm,
    :estMethod,
    :estMethodParms
  )

  # Normative struct definition.
  #
  # est::             Value (or, if necessary, an array of values).
  # lo::              Bounds are presumed inclusive unless infinite (or nil).
  # hi::              Bounds are presumed inclusive unless infinite (or nil).
  # estMethodParms::  Result-specific context (or nil).
  ParmData = Struct.new(
    :est,
    :lo, :hi,
    :estMethodParms
  )

  # Normative struct definition.
  #
  # id::           Primary key.
  # indVars::      Array of factor identifiers (enum style).
  # depVars::      Array of output variable identifiers (enum style).
  # description::  Everything else as verbose text.
  Experiment = Struct.new(
    :id,
    :indVars,
    :depVars,
    :description
  ) do
    # Three-line human-readable summary of the experiment.
    def to_s
      "Experiment id #{id}: #{description}\n" \
        " Independent variables: #{indVars}\n" \
        " Dependent variables: #{depVars}"
    end
  end

  # Normative struct definition.
  #
  # A Key is used to retrieve either data or parameter estimates.
  #
  # experimentId::  References experiments.
  # treat::  Array of factor values ordered per indVars.  For data, all values
  #          must be specified.  For parms, nil works like a wildcard.  E.g.,
  #          for main effects, only one factor will have a specified level and
  #          all others will be nil.  Factor values (levels) are not
  #          necessarily numeric.
  Key = Struct.new(
    :experimentId,
    :treat
  )

  # Normative struct definition.
  #
  # BagOfHolding is a bag containing everything that is loaded from or saved
  # to an export file except for the GECS version number.
  #
  # parms::           Array of strings defining pseudo-enum as used in a given data file.  See Parms.
  # estMethods::      Array of strings defining pseudo-enum as used in a given data file.  See EstMethods.
  # estMethodParms::  Array of strings defining pseudo-enum as used in a given data file.  See EstMethodParms.
  # invTypes::        Array of strings defining pseudo-enum as used in a given data file.  See InvTypes.
  # parmDefs::        Array of ParmDef structs defining pseudo-enum as used in a given data file.
  # experiments::     Array of Experiment structs (index by experiment id).
  # data::            Hash from Key to array (per depVars) of arrays (measurement values in chronological order).
  # ests::            Hash from Key to array (per depVars) of hashes (from parmDefs index to ParmData).
  BagOfHolding = Struct.new(
    :parms, :estMethods, :estMethodParms, :invTypes,
    :parmDefs,
    :experiments,
    :data,
    :ests
  ) do
    # One summary per experiment, newline-separated.  A bag without
    # experiments renders as the empty string.
    def to_s
      (experiments || []).join("\n")
    end

    # Methods to retrieve or create enum values.  If a requested value does
    # not already exist, it is added without fanfare.
    #
    # Returns the index of name within array.  Raises RuntimeError if array
    # is nil.
    def getOrAdd(array, name)
      raise "Can't add to nil" if array.nil?
      # An index of 0 is truthy in Ruby, so || only fires for a missing name.
      array.index(name) || (array.push(name).length - 1)
    end

    def getOrAddParm(name)
      self.parms ||= Array.new
      getOrAdd(parms, name)
    end

    def getOrAddEstMethod(name)
      self.estMethods ||= Array.new
      getOrAdd(estMethods, name)
    end

    def getOrAddEstMethodParm(name)
      self.estMethodParms ||= Array.new
      getOrAdd(estMethodParms, name)
    end

    def getOrAddInvType(name)
      self.invTypes ||= Array.new
      getOrAdd(invTypes, name)
    end

    def getOrAddParmDef(parmDef)
      self.parmDefs ||= Array.new
      getOrAdd(parmDefs, parmDef)
    end
  end

  # Normative struct definition.
  #
  # DoubleBag is a bag containing a GECS version number and a BagOfHolding.
  # The format version identifier is added/removed by save/load.
  DoubleBag = Struct.new(:version, :bagOfHolding)

  # --------------------------------------------------------------------------

  # Convenience methods.

  # Save a GECS database to a file.
  #
  # filename::      Path of the file to (over)write.
  # bagOfHolding::  A BagOfHolding.
  #
  # The file is opened in binary mode (Marshal output is not text) and is
  # closed before this method returns.
  def GECS.save(filename, bagOfHolding)
    File.open(filename, "wb") do |file|
      Marshal.dump(DoubleBag.new(Version, bagOfHolding), file)
    end
  end

  # Load a GECS database from a file.  Returns a BagOfHolding.
  #
  # Raises RuntimeError if the file does not contain a DoubleBag or if its
  # format version is later than Version.
  #
  # SECURITY: Marshal.load can instantiate arbitrary objects.  Only load
  # export files that come from a trusted source.
  def GECS.load(filename)
    doubleBag = File.open(filename, "rb") { |file| Marshal.load(file) }
    raise "File does not contain a GECS database" unless doubleBag.is_a?(DoubleBag)
    if doubleBag.version > Version
      raise "File format version is later than GECS.rb version"
    end
    doubleBag.bagOfHolding
  end

  # Simplify creation of a new database by nilling out the enums.  Since
  # enums are looked up using a find-or-create pattern, there is no harm in
  # starting with nils.
  #
  # experiments::  Array of Experiment structs (index by experiment id).
  # data::         Hash from Key to array (per depVars) of arrays (measurement values in chronological order).
  def GECS.newBag(experiments, data)
    BagOfHolding.new(nil, nil, nil, nil, nil, experiments, data, nil)
  end

  # (Private) Validate a bag and an experiment id and return the Experiment.
  #
  # Raises ArgumentError if the bag, its experiments, the experiment itself
  # or its variable lists are missing.  (These conditions were formerly
  # signalled with throw and a string tag, which can only ever surface as an
  # uncaught-throw ArgumentError.)
  def GECS.checkedExperiment(bagOfHolding, id)
    raise ArgumentError, "Bag is nil" if bagOfHolding.nil?
    raise ArgumentError, "Experiments are nil" if bagOfHolding.experiments.nil?
    raise ArgumentError, "No such experiment" if id >= bagOfHolding.experiments.length
    exp = bagOfHolding.experiments[id]
    raise ArgumentError, "Null experiment" if exp.nil?
    raise ArgumentError, "Null indvars" if exp.indVars.nil?
    raise ArgumentError, "Null depvars" if exp.depVars.nil?
    exp
  end
  private_class_method :checkedExperiment

  # (Private) Return [number of factors, number of depvars] for an
  # experiment, raising ArgumentError if either is zero.
  def GECS.checkedDimensions(exp)
    numfacs = exp.indVars.length
    numdeps = exp.depVars.length
    raise ArgumentError, "Not enough indvars" if numfacs < 1
    raise ArgumentError, "Not enough depvars" if numdeps < 1
    [numfacs, numdeps]
  end
  private_class_method :checkedDimensions

  # Dump an experiment's raw data (not parameter estimates) as an R table
  # (with header) with a column for each dependent variable and N rows for
  # each treatment.  Short and missing series are padded with NAs.
  #
  # Returns the table as a String.  Raises ArgumentError for a bad bag or
  # id, and RuntimeError for an experiment without variables or for
  # malformed treatment data.
  def GECS.dumpData(bagOfHolding, id)
    exp = checkedExperiment(bagOfHolding, id)
    raise "Experiment has no independent variables!" if exp.indVars.empty?
    raise "Experiment has no dependent variables!" if exp.depVars.empty?
    dump = quotesome(exp.indVars) + " " + quotesome(exp.depVars) + "\n"
    (bagOfHolding.data || {}).each do |key, cellarray|
      next unless key.experimentId == id
      raise "Null treatment data" if cellarray.nil?
      raise "Bad treatment data" if cellarray.length != exp.depVars.length
      # A missing (nil) series still gets one row of NA.
      maxlen = cellarray.map { |cell| cell.nil? ? 1 : cell.length }.max
      maxlen.times do |iteration|
        dump << quotesome(key.treat)
        cellarray.each do |cell|
          dump << " " << quotemaybe(cell.nil? ? nil : cell[iteration]).to_s
        end
        dump << "\n"
      end
    end
    dump
  end

  # Dump parameter estimates for an experiment as an R table (with header)
  # with crudely constructed column names:  depVar X parmDefId X [est, lo,
  # hi, optionally prec].  Estimates are assumed to be scalars.  If prec is
  # true, add a prec column for each parameter containing the value of the
  # estMethodParm "estimated attained precision of adaptive bootstrap".
  #
  # The parameter columns are the union of the parmDef ids found in any cell
  # of the experiment; a cell that is nil or lacks a parameter is rendered
  # as NA.  The bag is not modified.
  def GECS.dumpParms(bagOfHolding, id, prec=false)
    experiment = checkedExperiment(bagOfHolding, id)
    ests = (bagOfHolding.ests || {}).select { |k, _| k.experimentId == id }
    parms = ests.values.flat_map { |cells|
      (cells || []).compact.flat_map { |cell| cell.keys }
    }.uniq.sort
    # Look the enum up without adding it; an unknown name simply yields NA.
    precparm = nil
    if prec
      precparm = (bagOfHolding.estMethodParms || []).index(
        "estimated attained precision of adaptive bootstrap")
    end
    columns = prec ? %w[est lo hi prec] : %w[est lo hi]

    dump = "# Key to parameter ID numbers:\n".dup
    parms.each do |parmId|
      parmDef = bagOfHolding.parmDefs[parmId]
      dump << "# #{parmId} = #{bagOfHolding.parms[parmDef.parm]}" \
              ", #{bagOfHolding.estMethods[parmDef.estMethod]}"
      (parmDef.estMethodParms || {}).each do |k, v|
        dump << ", #{bagOfHolding.estMethodParms[k]}=#{v}"
      end
      dump << "\n"
    end

    # Header row.
    dump << quotesome(experiment.indVars)
    experiment.depVars.each do |depVar|
      parms.each do |parmId|
        columns.each { |column| dump << " \"#{depVar} #{parmId} #{column}\"" }
      end
    end
    dump << "\n"

    # One row per treatment or effect.
    ests.each do |key, cells|
      dump << quotesome(key.treat)
      (cells || []).each do |cell|
        parms.each do |parmId|
          parmData = cell.nil? ? nil : cell[parmId]
          if parmData.nil?
            dump << " NA NA NA"
            dump << " NA" if prec
          else
            dump << " #{quotemaybe(parmData.est)}" \
                    " #{quotemaybe(parmData.lo)}" \
                    " #{quotemaybe(parmData.hi)}"
            if prec
              localParms = parmData.estMethodParms
              dump << " " << (localParms.nil? ? "NA" : quotemaybe(localParms[precparm]).to_s)
            end
          end
        end
      end
      dump << "\n"
    end
    dump
  end

  # (Private) Helper method to quote values that aren't numeric.
  #
  # oneval::  A single value to be quoted or not.  Nil becomes NA.
  #
  # Strings come back wrapped in double quotes with backslashes doubled and
  # embedded double quotes backslash-escaped; any other value is returned
  # unchanged.
  def GECS.quotemaybe(oneval)
    return "NA" if oneval.nil?
    return oneval unless oneval.is_a?(String)
    # R 3.0.2 looks like it is re-escaping strings on input so that \\
    # turns into \\\\, yet this is the minimum amount of escaping that gets
    # everything through read.table without choking.  allowEscapes=F only
    # makes it choke even more.  The worst case seems to be when a string
    # ends with a backslash.
    "\"" + oneval.gsub('\\') { '\\\\' }.gsub("\"", "\\\"") + "\""
  end

  # (Private) Helper method to quote values that aren't numeric.  Were they
  # always well-behaved values, k.treat.join(" ") would suffice.
  #
  # treat::  An array of values that might need to be quoted.
  def GECS.quotesome(treat)
    treat.map { |level| quotemaybe(level) }.join(" ")
  end

  # Print out parameter metadata for an experiment.  Returns nil.
  def GECS.describeParms(bagOfHolding, id)
    experiment = checkedExperiment(bagOfHolding, id)
    (bagOfHolding.ests || {}).each do |key, cells|
      next unless key.experimentId == id
      print "Treatment " + key.treat.to_s + "\n"
      cells.each_with_index do |cell, di|
        print " Depvar ", experiment.depVars[di], "\n"
        if cell.nil?
          print " Not applicable\n"
        else
          cell.each do |parmId, parmData|
            parmDef = bagOfHolding.parmDefs[parmId]
            print " Parameter: ", bagOfHolding.parms[parmDef.parm], "\n"
            print " Estimation method: ", bagOfHolding.estMethods[parmDef.estMethod], "\n"
            print " Global estimation method parameters:", "\n"
            printEstMethodParms(bagOfHolding, parmDef.estMethodParms)
            print " Local estimation method parameters:", "\n"
            printEstMethodParms(bagOfHolding, parmData.nil? ? nil : parmData.estMethodParms)
          end
        end
      end
    end
    nil
  end

  # (Private) Helper method for ::describeParms.
  def GECS.printEstMethodParms(bagOfHolding, estMethodParms)
    if estMethodParms.nil?
      puts " nil"
    else
      estMethodParms.each do |k, v|
        print " ", bagOfHolding.estMethodParms[k], " = ", v, "\n"
      end
    end
  end

  # --------------------------------------------------------------------------

  # Helper functions to deal with main effects and interactions.

  # (Private) Equality test for treatments that implements nil as wildcard.
  #
  # Raises ArgumentError if either treatment is nil or the lengths differ.
  def GECS.treatEq(a, b)
    raise ArgumentError, "Nil treatment passed to treatEq" if a.nil? || b.nil?
    raise ArgumentError, "Length mismatch" if a.length != b.length
    a.zip(b).all? { |x, y| x.nil? || y.nil? || x == y }
  end

  # (Private) Refactor the data of an experiment according to a specified
  # effect and extract the data from the specified cell.  If there are no
  # nils in key, this is just a slow way of doing bagOfHolding.data[key][di].
  #
  # di::  depvar index
  #
  # Returns the concatenated measurements of every matching treatment, or
  # nil if nothing matched.
  def GECS.refactorExtract(bagOfHolding, key, di)
    extracted = nil
    bagOfHolding.data.each do |k, v|
      next unless k.experimentId == key.experimentId
      next unless treatEq(key.treat, k.treat)
      next if v.nil? || v[di].nil?
      extracted ||= Array.new
      extracted.concat(v[di])
    end
    extracted
  end

  # (Private) Return true if a given key matches any data at all.
  def GECS.matchesSomething(bagOfHolding, key)
    bagOfHolding.data.any? do |k, v|
      # Need to check every v[di] too?
      k.experimentId == key.experimentId && treatEq(key.treat, k.treat) && !v.nil?
    end
  end

  # (Private) Make a list of the Keys for all treatments, main effects, and
  # 2-way interactions for an experiment.  An attempt is made to suppress
  # interactions for which there are no data at all (combinations of levels
  # that don't occur).
  def GECS.enumerateKeys(bagOfHolding, id)
    require 'set'
    exp = checkedExperiment(bagOfHolding, id)
    numfacs, _numdeps = checkedDimensions(exp)

    keys = bagOfHolding.data.keys.select { |k| k.experimentId == id }
    # Short cut for single-factor experiments.
    return keys if numfacs == 1

    # Enumerate the levels of all of the factors.
    levels = Array.new(numfacs) { Set.new }
    keys.each do |k|
      numfacs.times { |fac| levels[fac].add(k.treat[fac]) }
    end

    # Main effects.  Single-factor experiments were already excluded.
    numfacs.times do |fac|
      levels[fac].each do |lvl|
        treat = Array.new(numfacs, nil)
        treat[fac] = lvl
        keys.push(Key.new(id, treat))
      end
    end

    # 2-way interactions.
    if numfacs > 2   # Don't duplicate the treatments when numfacs==2.
      (0...numfacs).to_a.combination(2) do |fac1, fac2|
        levels[fac1].each do |lvl1|
          levels[fac2].each do |lvl2|
            treat = Array.new(numfacs, nil)
            treat[fac1] = lvl1
            treat[fac2] = lvl2
            key = Key.new(id, treat)
            keys.push(key) if matchesSomething(bagOfHolding, key)
          end
        end
      end
    end
    keys
  end

  # --------------------------------------------------------------------------

  # rubygems.org/gems/statistics2 is required for the inverse t distribution.
  # Unfortunately, as of version 0.54, its agreement with R and Octave is
  # only to 4 decimals or so:

  # R version 3.0.2 (2013-09-25) -- "Frisbee Sailing"
  # > qt(0.975,5)
  # [1] 2.570582

  # GNU Octave, version 3.6.3
  # octave:1> printf ("%f\n", tinv(0.975,5))
  # 2.570582

  # irb(main):002:0> puts Gem.loaded_specs["statistics2"].version,
  # irb(main):003:0*   Statistics2::ptdist(5,0.975)
  # 0.54
  # 2.57051

  # (Private) Calculate the quick interval for the mean of some data.
  # Returns a ParmData.
  #
  # Returns nil for nil or empty data.  A single observation has no sample
  # variance, so it yields the mean with nil bounds.
  def GECS.quickInterval(data)
    return nil if data.nil? || data.empty?
    count = data.length
    mean = data.reduce(:+).to_f / count
    return ParmData.new(mean, nil, nil, nil) if count < 2
    require 'statistics2'
    variance = data.map { |x| (mean - x)**2 }.reduce(:+) / (count - 1.0)
    meanU = Math.sqrt(variance / count) * Statistics2::ptdist(count - 1, 0.975)
    ParmData.new(mean, mean - meanU, mean + meanU, nil)
  end

  # Add the following parameter to the data and effects of a specified
  # experiment:  mean, original method, 95% confidence.  This is a quick way
  # to summarize results when more complicated options are not needed.  All
  # values are computed as Floats with no respect for the original data type
  # or its precision.
  #
  # bagOfHolding::  A BagOfHolding.
  # id::            Experiment id.
  #
  # Raises ArgumentError for a bad bag, id or experiment definition.
  def GECS.quickMeans(bagOfHolding, id)
    exp = checkedExperiment(bagOfHolding, id)
    _numfacs, numdeps = checkedDimensions(exp)

    parmDef = ParmDef.new(bagOfHolding.getOrAddParm("mean"),
                          bagOfHolding.getOrAddEstMethod("original"),
                          {bagOfHolding.getOrAddEstMethodParm("coverage probability")=>0.95})
    meanId = bagOfHolding.getOrAddParmDef(parmDef)
    bagOfHolding.ests ||= Hash.new
    enumerateKeys(bagOfHolding, id).each do |key|
      bagOfHolding.ests[key] ||= Array.new(numdeps, nil)
      numdeps.times do |di|
        interval = quickInterval(refactorExtract(bagOfHolding, key, di))
        next if interval.nil?   # No data for this cell.
        bagOfHolding.ests[key][di] ||= Hash.new
        bagOfHolding.ests[key][di][meanId] = interval
      end
    end
  end

  # --------------------------------------------------------------------------

  # (Private) Unit of parallelization.
  ParPod = Struct.new(
    :key,     # A Key to data or ests, as applicable.
    :di,      # Depvar index.
    :sernum,  # Unique integer.
    :out      # Returned interval.
  )

  # (Private) Create ParmData from ParPod return.
  #
  # pod.out holds the whitespace-split output of the R script:  replica
  # count, attained precision, estimate, lower bound, upper bound.
  def GECS.parmRet(bagOfHolding, delta, pod)
    return nil if pod.out.nil?
    ParmData.new(pod.out[2].to_f, pod.out[3].to_f, pod.out[4].to_f, {
      bagOfHolding.getOrAddEstMethodParm("bootstrap replica count")=>pod.out[0].to_i,
      bagOfHolding.getOrAddEstMethodParm("numerical tolerance of adaptive bootstrap")=>delta[pod.di],
      bagOfHolding.getOrAddEstMethodParm("estimated attained precision of adaptive bootstrap")=>pod.out[1].to_f})
  end

  # Add the following parameter to the results and effects of a specified
  # experiment:  mean, bootstrap method, BCa interval, 95% confidence,
  # adaptive determination of bootstrap replica count to achieve the
  # requested numerical tolerances for each depvar.  The estimate provided is
  # the sample mean, not the bootstrap estimate.  Count will be at least
  # 50000.
  #
  # This function is parallelized.  All available CPUs will be used to run
  # bootstrap calculations in R.
  #
  # bagOfHolding::  A BagOfHolding.
  # id::            Experiment id.
  # delta::  Array of numbers specifying desired numerical tolerances for each
  #          depvar.  To reduce variation below the resolution of a typical
  #          plot of height 1000 pixels, you'd want delta something like
  #          (max(y)-min(y))/2000 for whatever range of y is being plotted.
  #
  # Raises ArgumentError (or TypeError) for bad arguments, including deltas
  # that are not numeric, and RuntimeError if R does not return a usable
  # result.
  def GECS.bootstrapMeans(bagOfHolding, id, delta)
    exp = checkedExperiment(bagOfHolding, id)
    _numfacs, numdeps = checkedDimensions(exp)
    raise ArgumentError, "Wrong number of deltas" if delta.length != numdeps
    # The tolerances are pasted into a shell command line below, so insist
    # that they really are numbers before going any further.
    tolerances = delta.map { |d| Float(d) }
    require 'parallel'

    parmDef = ParmDef.new(bagOfHolding.getOrAddParm("mean"),
                          bagOfHolding.getOrAddEstMethod("bootstrap, BCa interval"),
                          {bagOfHolding.getOrAddEstMethodParm("coverage probability")=>0.95})
    meanId = bagOfHolding.getOrAddParmDef(parmDef)

    # Make a list of the data and effects that need fixing with a unique
    # serial number assigned to each.
    sernum = 0
    pods = Array.new
    enumerateKeys(bagOfHolding, id).each do |key|
      numdeps.times do |di|
        pods.push(ParPod.new(key, di, (sernum += 1), nil))
      end
    end

    # Run numCPUs instances of BCa.R in parallel.
    pods = Parallel.map(pods) do |pod|
      poddata = refactorExtract(bagOfHolding, pod.key, pod.di)
      unless poddata.nil?
        descript = "Treatment " + pod.key.treat.to_s + " depvar " + pod.di.to_s
        fnam = "bootstrap-in-" + pod.sernum.to_s + ".txt"
        begin
          File.open(fnam, "w") { |fp| fp.puts(poddata) }

          # This R script that has been mangled onto the command line to avoid
          # adding another external dependency is mostly just a wrapper for
          # BCa.R, but the bootstrap estimate of the mean is replaced by the
          # sample mean.
          cmd = "Rscript " +
            "-e 'library(\"bootBCa\")' " +
            "-e 'data <- unlist(read.table(\"" + fnam + "\",header=F,colClasses=\"numeric\"))' " +
            "-e 'out <- BCa(data,as.numeric(" + tolerances[pod.di].to_s + "),mean)' " +
            "-e 'cat(sprintf(\"%d %0.16f %0.16f %0.16f %0.16f\\n\",out[1],out[2],mean(data),out[4],out[5]))'"

          pod.out = `#{cmd}`.split
        ensure
          # Remove the temporary input file even if Rscript could not be run.
          File.delete(fnam) if File.exist?(fnam)
        end
        if pod.out.length < 5
          # This should never happen.
          print "Bootstrap failure\n"
          print " ", descript, "\n"
          print " Depvar: ", bagOfHolding.experiments[id].depVars[pod.di], "\n"
          raise "Bootstrap failure"
        end

        # Verbose progress reporting
        print descript + ", " + pod.out[0] + " iterations done\n"
      end
      pod
    end

    # Copy back parameter estimates.
    bagOfHolding.ests ||= Hash.new
    pods.each do |pod|
      pd = parmRet(bagOfHolding, delta, pod)
      next if pd.nil?
      bagOfHolding.ests[pod.key] ||= Array.new(numdeps, nil)
      bagOfHolding.ests[pod.key][pod.di] ||= Hash.new
      bagOfHolding.ests[pod.key][pod.di][meanId] = pd
    end
  end

end
@@ -0,0 +1,46 @@
1
+ require 'test/unit'
2
+ require 'GECS'
3
+
4
class GECSTest < Test::Unit::TestCase

  # End-to-end smoke test:  builds a one-factor, one-depvar experiment with
  # two treatments of 30 samples each, runs GECS.bootstrapMeans on it, and
  # checks that each resulting interval brackets the mean that was used to
  # generate the samples.  Needs a working R installation with bootBCa.
  def test1
    assert_nothing_raised do
      assert_nothing_thrown do
        puts "This test requires you to install R and the bootBCa package."
        puts "R:        www.r-project.org"
        puts "bootBCa:  bootbca.r-forge.r-project.org\n\n"
        xid = 0
        experiment = GECS::Experiment.new(xid, ["input"], ["output"],
          "A silly, simple, small-sample test is better than nothing")
        results = Hash.new
        key5 = GECS::Key.new(xid,[5])
        key10 = GECS::Key.new(xid,[10])
        # rnorm(n=30,mean=5,sd=5)
        results[key5] = [[
           8.1239862,  0.7075978, 18.3996805,  1.3803859,  8.1125496,  0.5903406,
           0.8996330,  4.8486252, -2.8038801, 13.7444565,  6.5451796,  6.8989424,
           7.2075758, 19.0412270,  2.6637848, -1.7874951,  9.7959101, -0.4751056,
           9.1401820,  9.8164959, 12.0606408,  6.0108636, -0.5488287, 12.0728692,
           6.7253890,  6.4999955,  1.9586040,  2.2619639,  2.2761585,  3.5316891]]
        # rnorm(n=30,mean=10,sd=5)
        results[key10] = [[
          10.225934,  7.991545,  8.665374,  5.550484,  6.315061, 17.274522,
          11.291662, 19.324338,  8.760387,  5.561073, 10.225028,  7.939054,
          13.094538,  7.893948,  2.026159, 19.918737, 11.955975,  7.056023,
           2.883466, -2.054165,  8.938372,  3.319929, 18.280414,  5.495548,
           4.556700,  1.883278,  7.677032,  7.741751, 15.817214, 16.886633]]
        bag = GECS.newBag([experiment],results)
        # Fail here because they didn't install bootBCa, or R.
        GECS.bootstrapMeans(bag, xid, [0.1])
        puts "\n", bag, "\n", GECS.dumpParms(bag,xid,true)
        # [key][depvar index][parmDef index]
        # We have only one depvar and only one parmDef.
        parm = bag.ests[key5][0][0]
        # No space between assert and its parenthesis:  "assert (...)" makes
        # Ruby warn that the parentheses are a grouped expression.
        assert(parm.lo < 5 && 5 < parm.hi, "interval for treatment 5 should contain 5")
        parm = bag.ests[key10][0][0]
        assert(parm.lo < 10 && 10 < parm.hi, "interval for treatment 10 should contain 10")
      end
    end
  end

end
metadata ADDED
@@ -0,0 +1,84 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: GECS
3
+ version: !ruby/object:Gem::Version
4
+ version: '1'
5
+ prerelease:
6
+ platform: ruby
7
+ authors:
8
+ - David Flater
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+ date: 2014-07-31 00:00:00.000000000 Z
13
+ dependencies:
14
+ - !ruby/object:Gem::Dependency
15
+ name: parallel
16
+ requirement: !ruby/object:Gem::Requirement
17
+ none: false
18
+ requirements:
19
+ - - ! '>='
20
+ - !ruby/object:Gem::Version
21
+ version: 0.9.2
22
+ type: :runtime
23
+ prerelease: false
24
+ version_requirements: !ruby/object:Gem::Requirement
25
+ none: false
26
+ requirements:
27
+ - - ! '>='
28
+ - !ruby/object:Gem::Version
29
+ version: 0.9.2
30
+ - !ruby/object:Gem::Dependency
31
+ name: statistics2
32
+ requirement: !ruby/object:Gem::Requirement
33
+ none: false
34
+ requirements:
35
+ - - ! '>='
36
+ - !ruby/object:Gem::Version
37
+ version: '0.54'
38
+ type: :runtime
39
+ prerelease: false
40
+ version_requirements: !ruby/object:Gem::Requirement
41
+ none: false
42
+ requirements:
43
+ - - ! '>='
44
+ - !ruby/object:Gem::Version
45
+ version: '0.54'
46
+ description: This software is experimental. NIST assumes no responsibility whatsoever
47
+ for its use by other parties and makes no guarantees, expressed or implied, about
48
+ its quality, reliability, or any other characteristic.
49
+ email: dflater@nist.gov
50
+ executables: []
51
+ extensions: []
52
+ extra_rdoc_files: []
53
+ files:
54
+ - lib/GECS.rb
55
+ - test/test_GECS.rb
56
+ - Rakefile
57
+ homepage: http://www.nist.gov/itl/ssd/cs/software-performance.cfm
58
+ licenses:
59
+ - Unlicense
60
+ post_install_message:
61
+ rdoc_options: []
62
+ require_paths:
63
+ - lib
64
+ required_ruby_version: !ruby/object:Gem::Requirement
65
+ none: false
66
+ requirements:
67
+ - - ! '>='
68
+ - !ruby/object:Gem::Version
69
+ version: '0'
70
+ required_rubygems_version: !ruby/object:Gem::Requirement
71
+ none: false
72
+ requirements:
73
+ - - ! '>='
74
+ - !ruby/object:Gem::Version
75
+ version: '0'
76
+ requirements:
77
+ - ! 'R: www.r-project.org'
78
+ - ! 'bootBCa: bootbca.r-forge.r-project.org'
79
+ rubyforge_project:
80
+ rubygems_version: 1.8.23
81
+ signing_key:
82
+ specification_version: 3
83
+ summary: Gem for Experimental Computer Science
84
+ test_files: []