statsample 1.4.0 → 1.4.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +14 -0
- data/Gemfile +1 -16
- data/History.txt +51 -46
- data/LICENSE.txt +7 -82
- data/README.md +145 -150
- data/Rakefile +20 -102
- data/lib/spss.rb +17 -14
- data/lib/statsample/crosstab.rb +2 -2
- data/lib/statsample/dataset.rb +82 -81
- data/lib/statsample/matrix.rb +43 -43
- data/lib/statsample/reliability.rb +1 -2
- data/lib/statsample/vector.rb +157 -124
- data/lib/statsample/version.rb +1 -1
- data/lib/statsample.rb +91 -91
- data/references.txt +2 -1
- data/statsample.gemspec +89 -0
- data/test/test_awesome_print_bug.rb +16 -0
- data/test/test_crosstab.rb +8 -0
- data/test/test_histogram.rb +7 -0
- data/test/test_vector.rb +62 -48
- metadata +109 -120
- data/.gemtest +0 -0
- data/Gemfile.lock +0 -78
- data/Manifest.txt +0 -157
- data/setup.rb +0 -1585
data/lib/statsample/vector.rb
CHANGED
@@ -5,8 +5,9 @@ module Statsample::VectorShorthands
|
|
5
5
|
# Creates a new Statsample::Vector object
|
6
6
|
# Argument should be equal to Vector.new
|
7
7
|
def to_vector(*args)
|
8
|
-
|
9
|
-
|
8
|
+
Statsample::Vector.new(self,*args)
|
9
|
+
end
|
10
|
+
|
10
11
|
# Creates a new Statsample::Vector object of type :scale
|
11
12
|
def to_scale(*args)
|
12
13
|
Statsample::Vector.new(self, :scale, *args)
|
@@ -26,27 +27,29 @@ if Statsample.has_gsl?
|
|
26
27
|
end
|
27
28
|
module Statsample
|
28
29
|
|
29
|
-
|
30
|
+
|
30
31
|
# Collection of values on one dimension. Works as a column on a Spreadsheet.
|
31
|
-
#
|
32
|
+
#
|
32
33
|
# == Usage
|
33
34
|
# The fast way to create a vector uses Array.to_vector or Array.to_scale.
|
34
35
|
#
|
35
36
|
# v=[1,2,3,4].to_vector(:scale)
|
36
37
|
# v=[1,2,3,4].to_scale
|
37
|
-
#
|
38
|
+
#
|
38
39
|
class Vector
|
39
40
|
include Enumerable
|
40
41
|
include Writable
|
41
42
|
include Summarizable
|
43
|
+
include Statsample::VectorShorthands
|
44
|
+
|
42
45
|
# Level of measurement. Could be :nominal, :ordinal or :scale
|
43
46
|
attr_reader :type
|
44
|
-
# Original data.
|
47
|
+
# Original data.
|
45
48
|
attr_reader :data
|
46
49
|
# Valid data. Equal to data, minus values assigned as missing values
|
47
50
|
attr_reader :valid_data
|
48
51
|
# Array of values considered as missing. Nil is a missing value, by default
|
49
|
-
attr_reader :missing_values
|
52
|
+
attr_reader :missing_values
|
50
53
|
# Array of values considered as "Today", with date type. "NOW", "TODAY", :NOW and :TODAY are 'today' values, by default
|
51
54
|
attr_reader :today_values
|
52
55
|
# Missing values array
|
@@ -59,7 +62,7 @@ module Statsample
|
|
59
62
|
attr_accessor :labels
|
60
63
|
# Name of vector. Should be used for output by many classes
|
61
64
|
attr_accessor :name
|
62
|
-
|
65
|
+
|
63
66
|
# Creates a new Vector object.
|
64
67
|
# * <tt>data</tt> Any data which can be converted on Array
|
65
68
|
# * <tt>type</tt> Level of meausurement. See Vector#type
|
@@ -123,7 +126,7 @@ module Statsample
|
|
123
126
|
# Parameters
|
124
127
|
# [n] Size
|
125
128
|
# [val] Value of each value
|
126
|
-
# [&block] If block provided, is used to set the values of vector
|
129
|
+
# [&block] If block provided, is used to set the values of vector
|
127
130
|
def self.new_scale(n,val=nil, &block)
|
128
131
|
if block
|
129
132
|
vector=n.times.map {|i| block.call(i)}.to_scale
|
@@ -144,7 +147,7 @@ module Statsample
|
|
144
147
|
def dup_empty
|
145
148
|
Vector.new([],@type, :missing_values => @missing_values.dup, :labels => @labels.dup, :name=> @name)
|
146
149
|
end
|
147
|
-
|
150
|
+
|
148
151
|
if Statsample::STATSAMPLE__.respond_to?(:check_type)
|
149
152
|
# Raises an exception if type of vector is inferior to t type
|
150
153
|
def check_type(t)
|
@@ -155,8 +158,8 @@ module Statsample
|
|
155
158
|
_check_type(t)
|
156
159
|
end
|
157
160
|
end
|
158
|
-
|
159
|
-
|
161
|
+
|
162
|
+
|
160
163
|
def _check_type(t) #:nodoc:
|
161
164
|
raise NoMethodError if (t==:scale and @type!=:scale) or (t==:ordinal and @type==:nominal) or (t==:date) or (:date==@type)
|
162
165
|
end
|
@@ -167,12 +170,12 @@ module Statsample
|
|
167
170
|
# Return a vector usign the standarized values for data
|
168
171
|
# with sd with denominator n-1. With variance=0 or mean nil,
|
169
172
|
# returns a vector of equal size full of nils
|
170
|
-
#
|
173
|
+
#
|
171
174
|
def vector_standarized(use_population=false)
|
172
175
|
check_type :scale
|
173
176
|
m=mean
|
174
177
|
sd=use_population ? sdp : sds
|
175
|
-
return ([nil]*size).to_scale if mean.nil? or sd==0.0
|
178
|
+
return ([nil]*size).to_scale if mean.nil? or sd==0.0
|
176
179
|
vector=vector_standarized_compute(m,sd)
|
177
180
|
vector.name=_("%s(standarized)") % @name
|
178
181
|
vector
|
@@ -189,7 +192,7 @@ module Statsample
|
|
189
192
|
vector.name=_("%s(centered)") % @name
|
190
193
|
vector
|
191
194
|
end
|
192
|
-
|
195
|
+
|
193
196
|
alias_method :standarized, :vector_standarized
|
194
197
|
alias_method :centered, :vector_centered
|
195
198
|
# Return a vector with values replaced with the percentiles
|
@@ -215,24 +218,24 @@ module Statsample
|
|
215
218
|
end
|
216
219
|
}.to_vector(:scale)
|
217
220
|
end
|
218
|
-
|
221
|
+
|
219
222
|
# Vector equality.
|
220
223
|
# Two vector will be the same if their data, missing values, type, labels are equals
|
221
224
|
def ==(v2)
|
222
|
-
|
225
|
+
return false unless v2.instance_of? Statsample::Vector
|
223
226
|
@data==v2.data and @missing_values==v2.missing_values and @type==v2.type and @labels==v2.labels
|
224
227
|
end
|
225
|
-
|
228
|
+
|
226
229
|
def _dump(i) # :nodoc:
|
227
230
|
Marshal.dump({'data'=>@data,'missing_values'=>@missing_values, 'labels'=>@labels, 'type'=>@type,'name'=>@name})
|
228
231
|
end
|
229
|
-
|
232
|
+
|
230
233
|
def self._load(data) # :nodoc:
|
231
234
|
h=Marshal.load(data)
|
232
235
|
Vector.new(h['data'], h['type'], :missing_values=> h['missing_values'], :labels=>h['labels'], :name=>h['name'])
|
233
236
|
end
|
234
237
|
# Returns a new vector, with data modified by block.
|
235
|
-
# Equivalent to create a Vector after #collect on data
|
238
|
+
# Equivalent to create a Vector after #collect on data
|
236
239
|
def recode(type=nil)
|
237
240
|
type||=@type
|
238
241
|
@data.collect{|x|
|
@@ -240,7 +243,7 @@ module Statsample
|
|
240
243
|
}.to_vector(type)
|
241
244
|
end
|
242
245
|
# Modifies current vector, with data modified by block.
|
243
|
-
# Equivalent to #collect! on @data
|
246
|
+
# Equivalent to #collect! on @data
|
244
247
|
def recode!
|
245
248
|
@data.collect!{|x|
|
246
249
|
yield x
|
@@ -251,21 +254,22 @@ module Statsample
|
|
251
254
|
@data.push(v)
|
252
255
|
set_valid_data
|
253
256
|
end
|
257
|
+
|
254
258
|
# Dicotomize the vector with 0 and 1, based on lowest value
|
255
259
|
# If parameter if defined, this value and lower
|
256
260
|
# will be 0 and higher, 1
|
257
|
-
def dichotomize(low=nil)
|
258
|
-
|
259
|
-
|
260
|
-
@data_with_nils.collect
|
261
|
+
def dichotomize(low = nil)
|
262
|
+
low ||= factors.min
|
263
|
+
|
264
|
+
@data_with_nils.collect do |x|
|
261
265
|
if x.nil?
|
262
266
|
nil
|
263
|
-
elsif x>low
|
267
|
+
elsif x > low
|
264
268
|
1
|
265
269
|
else
|
266
270
|
0
|
267
271
|
end
|
268
|
-
|
272
|
+
end.to_scale
|
269
273
|
end
|
270
274
|
# Iterate on each item.
|
271
275
|
# Equivalent to
|
@@ -273,7 +277,7 @@ module Statsample
|
|
273
277
|
def each
|
274
278
|
@data.each{|x| yield(x) }
|
275
279
|
end
|
276
|
-
|
280
|
+
|
277
281
|
# Iterate on each item, retrieving index
|
278
282
|
def each_index
|
279
283
|
(0...@data.size).each {|i|
|
@@ -318,7 +322,7 @@ module Statsample
|
|
318
322
|
end
|
319
323
|
else
|
320
324
|
def set_valid_data_intern #:nodoc:
|
321
|
-
_set_valid_data_intern
|
325
|
+
_set_valid_data_intern
|
322
326
|
end
|
323
327
|
end
|
324
328
|
def _set_valid_data_intern #:nodoc:
|
@@ -333,19 +337,19 @@ module Statsample
|
|
333
337
|
end
|
334
338
|
@has_missing_data=@missing_data.size>0
|
335
339
|
end
|
336
|
-
|
340
|
+
|
337
341
|
# Retrieves true if data has one o more missing values
|
338
342
|
def has_missing_data?
|
339
343
|
@has_missing_data
|
340
344
|
end
|
341
|
-
alias :flawed? :has_missing_data?
|
342
|
-
|
345
|
+
alias :flawed? :has_missing_data?
|
346
|
+
|
343
347
|
# Retrieves label for value x. Retrieves x if
|
344
348
|
# no label defined.
|
345
349
|
def labeling(x)
|
346
350
|
@labels.has_key?(x) ? @labels[x].to_s : x.to_s
|
347
351
|
end
|
348
|
-
alias :label :labeling
|
352
|
+
alias :label :labeling
|
349
353
|
# Returns a Vector with data with labels replaced by the label.
|
350
354
|
def vector_labeled
|
351
355
|
d=@data.collect{|x|
|
@@ -362,12 +366,12 @@ module Statsample
|
|
362
366
|
@data.size
|
363
367
|
end
|
364
368
|
alias_method :n, :size
|
365
|
-
|
369
|
+
|
366
370
|
# Retrieves i element of data
|
367
371
|
def [](i)
|
368
372
|
@data[i]
|
369
373
|
end
|
370
|
-
# Set i element of data.
|
374
|
+
# Set i element of data.
|
371
375
|
# Note: Use set_valid_data if you include missing values
|
372
376
|
def []=(i,v)
|
373
377
|
@data[i]=v
|
@@ -387,9 +391,9 @@ module Statsample
|
|
387
391
|
@today_values = vals
|
388
392
|
set_valid_data
|
389
393
|
end
|
390
|
-
# Set level of measurement.
|
394
|
+
# Set level of measurement.
|
391
395
|
def type=(t)
|
392
|
-
@type=t
|
396
|
+
@type=t
|
393
397
|
set_scale_data if(t==:scale)
|
394
398
|
set_date_data if (t==:date)
|
395
399
|
end
|
@@ -400,9 +404,9 @@ module Statsample
|
|
400
404
|
@data.to_a
|
401
405
|
end
|
402
406
|
end
|
403
|
-
alias_method :to_ary, :to_a
|
404
|
-
|
405
|
-
# Vector sum.
|
407
|
+
alias_method :to_ary, :to_a
|
408
|
+
|
409
|
+
# Vector sum.
|
406
410
|
# - If v is a scalar, add this value to all elements
|
407
411
|
# - If v is a Array or a Vector, should be of the same size of this vector
|
408
412
|
# every item of this vector will be added to the value of the
|
@@ -410,17 +414,17 @@ module Statsample
|
|
410
414
|
def +(v)
|
411
415
|
_vector_ari("+",v)
|
412
416
|
end
|
413
|
-
# Vector rest.
|
417
|
+
# Vector rest.
|
414
418
|
# - If v is a scalar, rest this value to all elements
|
415
|
-
# - If v is a Array or a Vector, should be of the same
|
419
|
+
# - If v is a Array or a Vector, should be of the same
|
416
420
|
# size of this vector
|
417
421
|
# every item of this vector will be rested to the value of the
|
418
422
|
# item at the same position on the other vector
|
419
|
-
|
423
|
+
|
420
424
|
def -(v)
|
421
425
|
_vector_ari("-",v)
|
422
426
|
end
|
423
|
-
|
427
|
+
|
424
428
|
def *(v)
|
425
429
|
_vector_ari("*",v)
|
426
430
|
end
|
@@ -459,7 +463,7 @@ module Statsample
|
|
459
463
|
else
|
460
464
|
raise TypeError,"You should pass a scalar or a array/vector"
|
461
465
|
end
|
462
|
-
|
466
|
+
|
463
467
|
end
|
464
468
|
# Return an array with the data splitted by a separator.
|
465
469
|
# a=Vector.new(["a,b","c,d","a,b","d"])
|
@@ -483,11 +487,11 @@ module Statsample
|
|
483
487
|
#
|
484
488
|
# a=Vector.new(["a,b","c,d","a,b"])
|
485
489
|
# a.split_by_separator
|
486
|
-
# => {"a"=>#<Statsample::Type::Nominal:0x7f2dbcc09d88
|
487
|
-
# @data=[1, 0, 1]>,
|
488
|
-
# "b"=>#<Statsample::Type::Nominal:0x7f2dbcc09c48
|
489
|
-
# @data=[1, 1, 0]>,
|
490
|
-
# "c"=>#<Statsample::Type::Nominal:0x7f2dbcc09b08
|
490
|
+
# => {"a"=>#<Statsample::Type::Nominal:0x7f2dbcc09d88
|
491
|
+
# @data=[1, 0, 1]>,
|
492
|
+
# "b"=>#<Statsample::Type::Nominal:0x7f2dbcc09c48
|
493
|
+
# @data=[1, 1, 0]>,
|
494
|
+
# "c"=>#<Statsample::Type::Nominal:0x7f2dbcc09b08
|
491
495
|
# @data=[0, 1, 1]>}
|
492
496
|
#
|
493
497
|
def split_by_separator(sep=Statsample::SPLIT_TOKEN)
|
@@ -504,7 +508,7 @@ module Statsample
|
|
504
508
|
end
|
505
509
|
else
|
506
510
|
factors.each do |f|
|
507
|
-
out[f].push(r.include?(f) ? 1:0)
|
511
|
+
out[f].push(r.include?(f) ? 1:0)
|
508
512
|
end
|
509
513
|
end
|
510
514
|
end
|
@@ -519,11 +523,11 @@ module Statsample
|
|
519
523
|
a
|
520
524
|
}
|
521
525
|
end
|
522
|
-
|
526
|
+
|
523
527
|
# == Bootstrap
|
524
528
|
# Generate +nr+ resamples (with replacement) of size +s+
|
525
529
|
# from vector, computing each estimate from +estimators+
|
526
|
-
# over each resample.
|
530
|
+
# over each resample.
|
527
531
|
# +estimators+ could be
|
528
532
|
# a) Hash with variable names as keys and lambdas as values
|
529
533
|
# a.bootstrap(:log_s2=>lambda {|v| Math.log(v.variance)},1000)
|
@@ -532,33 +536,33 @@ module Statsample
|
|
532
536
|
# c) A single method to bootstrap
|
533
537
|
# a.jacknife(:mean, 1000)
|
534
538
|
# If s is nil, is set to vector size by default.
|
535
|
-
#
|
539
|
+
#
|
536
540
|
# Returns a dataset where each vector is an vector
|
537
541
|
# of length +nr+ containing the computed resample estimates.
|
538
542
|
def bootstrap(estimators, nr, s=nil)
|
539
543
|
s||=n
|
540
|
-
|
544
|
+
|
541
545
|
h_est, es, bss= prepare_bootstrap(estimators)
|
542
|
-
|
543
|
-
|
546
|
+
|
547
|
+
|
544
548
|
nr.times do |i|
|
545
549
|
bs=sample_with_replacement(s)
|
546
|
-
es.each do |estimator|
|
550
|
+
es.each do |estimator|
|
547
551
|
# Add bootstrap
|
548
552
|
bss[estimator].push(h_est[estimator].call(bs))
|
549
553
|
end
|
550
554
|
end
|
551
|
-
|
555
|
+
|
552
556
|
es.each do |est|
|
553
557
|
bss[est]=bss[est].to_scale
|
554
558
|
bss[est].type=:scale
|
555
559
|
end
|
556
560
|
bss.to_dataset
|
557
|
-
|
561
|
+
|
558
562
|
end
|
559
|
-
|
563
|
+
|
560
564
|
# == Jacknife
|
561
|
-
# Returns a dataset with jacknife delete-+k+ +estimators+
|
565
|
+
# Returns a dataset with jacknife delete-+k+ +estimators+
|
562
566
|
# +estimators+ could be:
|
563
567
|
# a) Hash with variable names as keys and lambdas as values
|
564
568
|
# a.jacknife(:log_s2=>lambda {|v| Math.log(v.variance)})
|
@@ -571,23 +575,23 @@ module Statsample
|
|
571
575
|
#
|
572
576
|
# Returns a dataset where each vector is an vector
|
573
577
|
# of length +cases+/+k+ containing the computed jacknife estimates.
|
574
|
-
#
|
578
|
+
#
|
575
579
|
# == Reference:
|
576
580
|
# * Sawyer, S. (2005). Resampling Data: Using a Statistical Jacknife.
|
577
581
|
def jacknife(estimators, k=1)
|
578
582
|
raise "n should be divisible by k:#{k}" unless n%k==0
|
579
|
-
|
583
|
+
|
580
584
|
nb=(n / k).to_i
|
581
|
-
|
582
|
-
|
585
|
+
|
586
|
+
|
583
587
|
h_est, es, ps= prepare_bootstrap(estimators)
|
584
588
|
|
585
589
|
est_n=es.inject({}) {|h,v|
|
586
590
|
h[v]=h_est[v].call(self)
|
587
591
|
h
|
588
592
|
}
|
589
|
-
|
590
|
-
|
593
|
+
|
594
|
+
|
591
595
|
nb.times do |i|
|
592
596
|
other=@data_with_nils.dup
|
593
597
|
other.slice!(i*k,k)
|
@@ -597,16 +601,16 @@ module Statsample
|
|
597
601
|
ps[estimator].push( nb * est_n[estimator] - (nb-1) * h_est[estimator].call(other))
|
598
602
|
end
|
599
603
|
end
|
600
|
-
|
601
|
-
|
604
|
+
|
605
|
+
|
602
606
|
es.each do |est|
|
603
607
|
ps[est]=ps[est].to_scale
|
604
608
|
ps[est].type=:scale
|
605
609
|
end
|
606
610
|
ps.to_dataset
|
607
611
|
end
|
608
|
-
|
609
|
-
|
612
|
+
|
613
|
+
|
610
614
|
# For an array or hash of estimators methods, returns
|
611
615
|
# an array with three elements
|
612
616
|
# 1.- A hash with estimators names as keys and lambdas as values
|
@@ -614,23 +618,23 @@ module Statsample
|
|
614
618
|
# 3.- A Hash with estimators names as keys and empty arrays as values
|
615
619
|
def prepare_bootstrap(estimators)
|
616
620
|
h_est=estimators
|
617
|
-
|
621
|
+
|
618
622
|
h_est=[h_est] unless h_est.is_a? Array or h_est.is_a? Hash
|
619
|
-
|
623
|
+
|
620
624
|
if h_est.is_a? Array
|
621
625
|
h_est=h_est.inject({}) {|h,est|
|
622
626
|
h[est]=lambda {|v| v.send(est)}
|
623
627
|
h
|
624
628
|
}
|
625
629
|
end
|
626
|
-
|
630
|
+
|
627
631
|
bss=h_est.keys.inject({}) {|h,v| h[v]=[];h}
|
628
|
-
|
632
|
+
|
629
633
|
[h_est,h_est.keys, bss]
|
630
|
-
|
634
|
+
|
631
635
|
end
|
632
636
|
private :prepare_bootstrap
|
633
|
-
|
637
|
+
|
634
638
|
# Returns an random sample of size n, with replacement,
|
635
639
|
# only with valid data.
|
636
640
|
#
|
@@ -644,9 +648,9 @@ module Statsample
|
|
644
648
|
# only with valid data.
|
645
649
|
#
|
646
650
|
# Every element could only be selected once.
|
647
|
-
#
|
651
|
+
#
|
648
652
|
# A sample of the same size of the vector is the vector itself.
|
649
|
-
|
653
|
+
|
650
654
|
def sample_without_replacement(sample=1)
|
651
655
|
raise ArgumentError, "Sample size couldn't be greater than n" if sample>@valid_data.size
|
652
656
|
out=[]
|
@@ -673,9 +677,9 @@ module Statsample
|
|
673
677
|
frequencies[x].nil? ? 0 : frequencies[x]
|
674
678
|
end
|
675
679
|
end
|
676
|
-
|
680
|
+
|
677
681
|
# Returns the database type for the vector, according to its content
|
678
|
-
|
682
|
+
|
679
683
|
def db_type(dbs='mysql')
|
680
684
|
# first, detect any character not number
|
681
685
|
if @data.find {|v| v.to_s=~/\d{2,2}-\d{2,2}-\d{4,4}/} or @data.find {|v| v.to_s=~/\d{4,4}-\d{2,2}-\d{2,2}/}
|
@@ -690,7 +694,7 @@ module Statsample
|
|
690
694
|
end
|
691
695
|
# Return true if all data is Date, "today" values or nil
|
692
696
|
def can_be_date?
|
693
|
-
if @data.find {|v|
|
697
|
+
if @data.find {|v|
|
694
698
|
!v.nil? and !v.is_a? Date and !v.is_a? Time and (v.is_a? String and !@today_values.include? v) and (v.is_a? String and !(v=~/\d{4,4}[-\/]\d{1,2}[-\/]\d{1,2}/))}
|
695
699
|
false
|
696
700
|
else
|
@@ -705,7 +709,7 @@ module Statsample
|
|
705
709
|
true
|
706
710
|
end
|
707
711
|
end
|
708
|
-
|
712
|
+
|
709
713
|
def to_s
|
710
714
|
sprintf("Vector(type:%s, n:%d)[%s]",@type.to_s,@data.size, @data.collect{|d| d.nil? ? "nil":d}.join(","))
|
711
715
|
end
|
@@ -734,7 +738,7 @@ module Statsample
|
|
734
738
|
end
|
735
739
|
if Statsample::STATSAMPLE__.respond_to?(:frequencies)
|
736
740
|
# Returns a hash with the distribution of frecuencies for
|
737
|
-
# the sample
|
741
|
+
# the sample
|
738
742
|
def frequencies
|
739
743
|
Statsample::STATSAMPLE__.frequencies(@valid_data)
|
740
744
|
end
|
@@ -743,8 +747,8 @@ module Statsample
|
|
743
747
|
_frequencies
|
744
748
|
end
|
745
749
|
end
|
746
|
-
|
747
|
-
|
750
|
+
|
751
|
+
|
748
752
|
def _frequencies #:nodoc:
|
749
753
|
@valid_data.inject(Hash.new) {|a,x|
|
750
754
|
a[x]||=0
|
@@ -752,7 +756,7 @@ module Statsample
|
|
752
756
|
a
|
753
757
|
}
|
754
758
|
end
|
755
|
-
|
759
|
+
|
756
760
|
# Returns the most frequent item.
|
757
761
|
def mode
|
758
762
|
frequencies.max{|a,b| a[1]<=>b[1]}.first
|
@@ -775,12 +779,12 @@ module Statsample
|
|
775
779
|
end
|
776
780
|
def report_building(b)
|
777
781
|
b.section(:name=>name) do |s|
|
778
|
-
s.text _("n :%d") % n
|
782
|
+
s.text _("n :%d") % n
|
779
783
|
s.text _("n valid:%d") % n_valid
|
780
784
|
if @type==:nominal
|
781
|
-
s.text _("factors:%s") % factors.join(",")
|
782
|
-
s.text _("mode: %s") % mode
|
783
|
-
|
785
|
+
s.text _("factors:%s") % factors.join(",")
|
786
|
+
s.text _("mode: %s") % mode
|
787
|
+
|
784
788
|
s.table(:name=>_("Distribution")) do |t|
|
785
789
|
frequencies.sort.each do |k,v|
|
786
790
|
key=labels.has_key?(k) ? labels[k]:k
|
@@ -788,7 +792,7 @@ module Statsample
|
|
788
792
|
end
|
789
793
|
end
|
790
794
|
end
|
791
|
-
|
795
|
+
|
792
796
|
s.text _("median: %s") % median.to_s if(@type==:ordinal or @type==:scale)
|
793
797
|
if(@type==:scale)
|
794
798
|
s.text _("mean: %0.4f") % mean
|
@@ -801,7 +805,7 @@ module Statsample
|
|
801
805
|
end
|
802
806
|
end
|
803
807
|
end
|
804
|
-
|
808
|
+
|
805
809
|
# Variance of p, according to poblation size
|
806
810
|
def variance_proportion(n_poblation, v=1)
|
807
811
|
Statsample::proportion_variance_sample(self.proportion(v), @valid_data.size, n_poblation)
|
@@ -816,29 +820,58 @@ module Statsample
|
|
816
820
|
def proportion_confidence_interval_z(n_poblation,margin=0.95,v=1)
|
817
821
|
Statsample::proportion_confidence_interval_z(proportion(v), @valid_data.size, n_poblation, margin)
|
818
822
|
end
|
819
|
-
|
823
|
+
|
820
824
|
self.instance_methods.find_all{|met| met=~/_slow$/}.each do |met|
|
821
825
|
met_or=met.gsub("_slow","")
|
822
826
|
if !self.method_defined?(met_or)
|
823
827
|
alias_method met_or, met
|
824
828
|
end
|
825
829
|
end
|
826
|
-
|
830
|
+
|
827
831
|
######
|
828
832
|
### Ordinal Methods
|
829
833
|
######
|
830
|
-
|
831
|
-
#
|
832
|
-
|
834
|
+
|
835
|
+
# == Percentil
|
836
|
+
# Returns the value of the percentile q
|
837
|
+
#
|
838
|
+
# Accepts an optional second argument specifying the strategy to interpolate
|
839
|
+
# when the requested percentile lies between two data points a and b
|
840
|
+
# Valid strategies are:
|
841
|
+
# * :midpoint (Default): (a + b) / 2
|
842
|
+
# * :linear : a + (b - a) * d where d is the decimal part of the index between a and b.
|
843
|
+
# This is the NIST recommended method (http://en.wikipedia.org/wiki/Percentile#NIST_method)
|
844
|
+
#
|
845
|
+
def percentil(q, strategy = :midpoint)
|
833
846
|
check_type :ordinal
|
834
847
|
sorted=@valid_data.sort
|
835
|
-
|
836
|
-
|
837
|
-
|
848
|
+
|
849
|
+
case strategy
|
850
|
+
when :midpoint
|
851
|
+
v = (n_valid * q).quo(100)
|
852
|
+
if(v.to_i!=v)
|
853
|
+
sorted[v.to_i]
|
854
|
+
else
|
855
|
+
(sorted[(v-0.5).to_i].to_f + sorted[(v+0.5).to_i]).quo(2)
|
856
|
+
end
|
857
|
+
when :linear
|
858
|
+
index = (q / 100.0) * (n_valid + 1)
|
859
|
+
|
860
|
+
k = index.truncate
|
861
|
+
d = index % 1
|
862
|
+
|
863
|
+
if k == 0
|
864
|
+
sorted[0]
|
865
|
+
elsif k >= sorted.size
|
866
|
+
sorted[-1]
|
867
|
+
else
|
868
|
+
sorted[k - 1] + d * (sorted[k] - sorted[k - 1])
|
869
|
+
end
|
838
870
|
else
|
839
|
-
|
871
|
+
raise NotImplementedError.new "Unknown strategy #{strategy.to_s}"
|
840
872
|
end
|
841
873
|
end
|
874
|
+
|
842
875
|
# Returns a ranked vector.
|
843
876
|
def ranked(type=:ordinal)
|
844
877
|
check_type :ordinal
|
@@ -856,7 +889,7 @@ module Statsample
|
|
856
889
|
percentil(50)
|
857
890
|
end
|
858
891
|
# Minimun value
|
859
|
-
def min
|
892
|
+
def min
|
860
893
|
check_type :ordinal
|
861
894
|
@valid_data.min
|
862
895
|
end
|
@@ -865,7 +898,7 @@ module Statsample
|
|
865
898
|
check_type :ordinal
|
866
899
|
@valid_data.max
|
867
900
|
end
|
868
|
-
|
901
|
+
|
869
902
|
def set_date_data
|
870
903
|
@date_data_with_nils=@data.collect do|x|
|
871
904
|
if x.is_a? Date
|
@@ -881,7 +914,7 @@ module Statsample
|
|
881
914
|
end
|
882
915
|
end
|
883
916
|
end
|
884
|
-
|
917
|
+
|
885
918
|
def set_scale_data
|
886
919
|
@scale_data=@valid_data.collect do|x|
|
887
920
|
if x.is_a? Numeric
|
@@ -893,18 +926,18 @@ module Statsample
|
|
893
926
|
end
|
894
927
|
end
|
895
928
|
end
|
896
|
-
|
929
|
+
|
897
930
|
private :set_date_data, :set_scale_data
|
898
|
-
|
931
|
+
|
899
932
|
# The range of the data (max - min)
|
900
|
-
def range;
|
933
|
+
def range;
|
901
934
|
check_type :scale
|
902
935
|
@scale_data.max - @scale_data.min
|
903
936
|
end
|
904
937
|
# The sum of values for the data
|
905
938
|
def sum
|
906
939
|
check_type :scale
|
907
|
-
@scale_data.inject(0){|a,x|x+a} ;
|
940
|
+
@scale_data.inject(0){|a,x|x+a} ;
|
908
941
|
end
|
909
942
|
# The arithmetical mean of data
|
910
943
|
def mean
|
@@ -914,7 +947,7 @@ module Statsample
|
|
914
947
|
# Sum of squares for the data around a value.
|
915
948
|
# By default, this value is the mean
|
916
949
|
# ss= sum{(xi-m)^2}
|
917
|
-
#
|
950
|
+
#
|
918
951
|
def sum_of_squares(m=nil)
|
919
952
|
check_type :scale
|
920
953
|
m||=mean
|
@@ -925,7 +958,7 @@ module Statsample
|
|
925
958
|
check_type :scale
|
926
959
|
@scale_data.inject(0) {|a,x| x.square+a} - (sum.square.quo(n_valid))
|
927
960
|
end
|
928
|
-
|
961
|
+
|
929
962
|
# Population variance (denominator N)
|
930
963
|
def variance_population(m=nil)
|
931
964
|
check_type :scale
|
@@ -933,8 +966,8 @@ module Statsample
|
|
933
966
|
squares=@scale_data.inject(0){|a,x| x.square+a}
|
934
967
|
squares.quo(n_valid) - m.square
|
935
968
|
end
|
936
|
-
|
937
|
-
|
969
|
+
|
970
|
+
|
938
971
|
# Population Standard deviation (denominator N)
|
939
972
|
def standard_deviation_population(m=nil)
|
940
973
|
check_type :scale
|
@@ -943,7 +976,7 @@ module Statsample
|
|
943
976
|
|
944
977
|
# Population average deviation (denominator N)
|
945
978
|
# author: Al Chou
|
946
|
-
|
979
|
+
|
947
980
|
def average_deviation_population( m = nil )
|
948
981
|
check_type :scale
|
949
982
|
m ||= mean
|
@@ -960,7 +993,7 @@ module Statsample
|
|
960
993
|
m||=mean
|
961
994
|
sum_of_squares(m).quo(n_valid - 1)
|
962
995
|
end
|
963
|
-
|
996
|
+
|
964
997
|
# Sample Standard deviation (denominator n-1)
|
965
998
|
def standard_deviation_sample(m=nil)
|
966
999
|
check_type :scale
|
@@ -980,23 +1013,23 @@ module Statsample
|
|
980
1013
|
m||=mean
|
981
1014
|
fo=@scale_data.inject(0){|a,x| a+((x-m)**4)}
|
982
1015
|
fo.quo((@scale_data.size)*sd(m)**4)-3
|
983
|
-
|
1016
|
+
|
984
1017
|
end
|
985
1018
|
# Product of all values on the sample
|
986
|
-
#
|
1019
|
+
#
|
987
1020
|
def product
|
988
1021
|
check_type :scale
|
989
1022
|
@scale_data.inject(1){|a,x| a*x }
|
990
1023
|
end
|
991
|
-
|
1024
|
+
|
992
1025
|
# With a fixnum, creates X bins within the range of data
|
993
1026
|
# With an Array, each value will be a cut point
|
994
1027
|
def histogram(bins=10)
|
995
1028
|
check_type :scale
|
996
|
-
|
1029
|
+
|
997
1030
|
if bins.is_a? Array
|
998
1031
|
#h=Statsample::Histogram.new(self, bins)
|
999
|
-
h=Statsample::Histogram.alloc(bins)
|
1032
|
+
h=Statsample::Histogram.alloc(bins)
|
1000
1033
|
else
|
1001
1034
|
# ugly patch. The upper limit for a bin has the form
|
1002
1035
|
# x < range
|
@@ -1013,7 +1046,7 @@ module Statsample
|
|
1013
1046
|
h.increment(@valid_data)
|
1014
1047
|
h
|
1015
1048
|
end
|
1016
|
-
|
1049
|
+
|
1017
1050
|
# Coefficient of variation
|
1018
1051
|
# Calculed with the sample standard deviation
|
1019
1052
|
def coefficient_of_variation
|
@@ -1026,12 +1059,12 @@ module Statsample
|
|
1026
1059
|
standard_deviation_sample.quo(Math.sqrt(valid_data.size))
|
1027
1060
|
end
|
1028
1061
|
alias :se :standard_error
|
1029
|
-
|
1062
|
+
|
1030
1063
|
alias_method :sdp, :standard_deviation_population
|
1031
1064
|
alias_method :sds, :standard_deviation_sample
|
1032
1065
|
alias_method :adp, :average_deviation_population
|
1033
1066
|
alias_method :cov, :coefficient_of_variation
|
1034
|
-
alias_method :variance, :variance_sample
|
1067
|
+
alias_method :variance, :variance_sample
|
1035
1068
|
alias_method :sd, :standard_deviation_sample
|
1036
1069
|
alias_method :ss, :sum_of_squares
|
1037
1070
|
include_aliasing Statsample::Vector::GSL_ if Statsample.has_gsl?
|