statsample 0.5.1 → 0.6.0
Sign up to get free protection for your applications and to get access to all the features.
- data/History.txt +12 -0
- data/Manifest.txt +13 -0
- data/README.txt +2 -1
- data/demo/pca.rb +29 -0
- data/demo/umann.rb +8 -0
- data/lib/distribution.rb +0 -1
- data/lib/matrix_extension.rb +35 -21
- data/lib/statsample.rb +31 -28
- data/lib/statsample/anova.rb +7 -2
- data/lib/statsample/bivariate.rb +17 -11
- data/lib/statsample/codification.rb +136 -87
- data/lib/statsample/combination.rb +0 -2
- data/lib/statsample/converter/csv18.rb +1 -1
- data/lib/statsample/converter/csv19.rb +1 -1
- data/lib/statsample/converters.rb +176 -171
- data/lib/statsample/crosstab.rb +227 -154
- data/lib/statsample/dataset.rb +94 -12
- data/lib/statsample/dominanceanalysis.rb +69 -62
- data/lib/statsample/dominanceanalysis/bootstrap.rb +25 -21
- data/lib/statsample/factor.rb +18 -0
- data/lib/statsample/factor/pca.rb +128 -0
- data/lib/statsample/factor/principalaxis.rb +133 -0
- data/lib/statsample/factor/rotation.rb +125 -0
- data/lib/statsample/histogram.rb +99 -0
- data/lib/statsample/mle.rb +125 -126
- data/lib/statsample/mle/logit.rb +91 -91
- data/lib/statsample/mle/probit.rb +84 -85
- data/lib/statsample/multiset.rb +1 -1
- data/lib/statsample/permutation.rb +96 -0
- data/lib/statsample/regression.rb +1 -1
- data/lib/statsample/regression/binomial.rb +89 -89
- data/lib/statsample/regression/binomial/logit.rb +9 -9
- data/lib/statsample/regression/binomial/probit.rb +9 -9
- data/lib/statsample/regression/multiple.rb +8 -14
- data/lib/statsample/regression/multiple/gslengine.rb +1 -1
- data/lib/statsample/regression/multiple/rubyengine.rb +55 -55
- data/lib/statsample/resample.rb +12 -17
- data/lib/statsample/srs.rb +4 -1
- data/lib/statsample/test.rb +23 -22
- data/lib/statsample/test/umannwhitney.rb +182 -0
- data/lib/statsample/vector.rb +854 -815
- data/test/test_bivariate.rb +132 -132
- data/test/test_codification.rb +71 -50
- data/test/test_dataset.rb +19 -1
- data/test/test_factor.rb +44 -0
- data/test/test_histogram.rb +26 -0
- data/test/test_permutation.rb +37 -0
- data/test/test_statistics.rb +74 -63
- data/test/test_umannwhitney.rb +17 -0
- data/test/test_vector.rb +46 -30
- metadata +31 -4
data/lib/statsample/resample.rb
CHANGED
@@ -1,20 +1,15 @@
|
|
1
1
|
module Statsample
|
2
|
-
|
3
|
-
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
Vector.new((0...size).collect {|x|
|
14
|
-
rand(range)+low
|
15
|
-
},:scale)
|
16
|
-
end
|
17
|
-
|
18
|
-
end
|
2
|
+
module Resample
|
3
|
+
class << self
|
4
|
+
# Calls the given block +times+ times and returns an Array holding the
# value produced by each call, in call order.
# Returns an empty Array when +times+ is lower than 1.
def repeat_and_save(times,&action)
  (1..times).map { action.call }
end
|
7
|
+
|
8
|
+
# Builds a :scale Vector with +size+ values, each computed as
# rand(upper-low+1)+low.
# With Integer bounds and upper >= low, every value falls in low..upper.
def generate(size,low,upper)
  range=upper-low+1
  Vector.new((0...size).map { rand(range)+low }, :scale)
end
|
12
|
+
|
19
13
|
end
|
14
|
+
end
|
20
15
|
end
|
data/lib/statsample/srs.rb
CHANGED
@@ -35,6 +35,8 @@ module Statsample
|
|
35
35
|
n0=estimation_n0(d,prop,margin)
|
36
36
|
n0.quo( 1 + ((n0 - 1).quo(n_pobl)))
|
37
37
|
end
|
38
|
+
|
39
|
+
|
38
40
|
# Proportion confidence interval with t values
|
39
41
|
# Uses estimated proportion, sample without replacement.
|
40
42
|
|
@@ -42,6 +44,7 @@ module Statsample
|
|
42
44
|
t = Distribution::T.p_value(1-((1-margin).quo(2)) , n_sample-1)
|
43
45
|
proportion_confidence_interval(prop,n_sample,n_population, t)
|
44
46
|
end
|
47
|
+
|
45
48
|
# Proportion confidence interval with z values
|
46
49
|
# Uses estimated proportion, sample without replacement.
|
47
50
|
def proportion_confidence_interval_z(p, n_sample, n_population, margin=0.95)
|
@@ -53,7 +56,7 @@ module Statsample
|
|
53
56
|
|
54
57
|
# Confidence interval for a proportion.
# [p]   estimated proportion
# [sam] sample size
# [pop] population size
# [x]   multiplier for the square root term (the t variant in this file passes a t value)
#
# Returns [p - one_range, p + one_range], where
#   one_range = x * sqrt(qf(sam, pop) * p * (1 - p) / (sam - 1)) + 1 / (2 * sam)
def proportion_confidence_interval(p, sam,pop , x)
  squared_se=(qf(sam, pop) * p * (1-p)).quo(sam-1)
  one_range=x * Math::sqrt(squared_se) + (1.quo(sam * 2.0))
  [p-one_range, p+one_range]
end
|
59
62
|
# Standard deviation for sample distribution of a proportion
|
data/lib/statsample/test.rb
CHANGED
@@ -1,25 +1,26 @@
|
|
1
1
|
module Statsample
|
2
|
-
|
3
|
-
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
2
|
+
# Module for several statistical tests
|
3
|
+
|
4
|
+
module Test
|
5
|
+
autoload(:UMannWhitney, 'statsample/test/umannwhitney')
|
6
|
+
# Calculate chi square for two Matrix
|
7
|
+
class << self
|
8
|
+
# Chi square statistic for two matrices: the sum, over every cell, of
# (real - expected)**2 / expected.
# [real]     Matrix with the observed values
# [expected] Matrix with the expected values
# Cells are converted with to_f and visited over the dimensions of +real+.
# Raises TypeError unless both arguments are Matrix objects.
def chi_square(real,expected)
  raise TypeError, "Both argument should be Matrix" unless real.is_a?(Matrix) && expected.is_a?(Matrix)
  sum=0
  (0...real.row_size).each do |row_i|
    (0...real.column_size).each do |col_i|
      # The expected cell is read once: it is both subtracted and used as divisor
      exp_val=expected[row_i,col_i].to_f
      sum+=((real[row_i,col_i].to_f - exp_val)**2) / exp_val
    end
  end
  sum
end
|
21
|
+
def u_mannwhitney(v1p,v2p)
|
22
|
+
Statsample::Test::UMannWhitney.new(v1p,v2p)
|
23
|
+
end
|
24
24
|
end
|
25
|
+
end
|
25
26
|
end
|
@@ -0,0 +1,182 @@
|
|
1
|
+
module Statsample
|
2
|
+
module Test
|
3
|
+
#
|
4
|
+
# = U Mann-Whitney test
|
5
|
+
#
|
6
|
+
# Non-parametric test for assessing whether two independent samples
|
7
|
+
# of observations come from the same distribution.
|
8
|
+
#
|
9
|
+
# == Assumptions
|
10
|
+
#
|
11
|
+
# * The two samples under investigation in the test are independent of each other and the observations within each sample are independent.
|
12
|
+
# * The observations are comparable (i.e., for any two observations, one can assess whether they are equal or, if not, which one is greater).
|
13
|
+
# * The variances in the two groups are approximately equal.
|
14
|
+
#
|
15
|
+
# Higher differences of distributions correspond to
|
16
|
+
# to lower values of U.
|
17
|
+
#
|
18
|
+
class UMannWhitney
|
19
|
+
# Max for m*n allowed for exact calculation of probability
|
20
|
+
MAX_MN_EXACT=10000
|
21
|
+
|
22
|
+
# Exact probability based on Dinneen & Blakesley (1973) algorithm
|
23
|
+
# This is the algorithm used on SPSS
|
24
|
+
#
|
25
|
+
# Reference: Dinneen, L., & Blakesley, B. (1973). Algorithm AS 62: A Generator for the Sampling Distribution of the Mann- Whitney U Statistic. Journal of the Royal Statistical Society, 22(2), 269-273
|
26
|
+
#
|
27
|
+
# [n1] size of the first sample
# [n2] size of the second sample
#
# Returns an Array indexed from 0 to (n1*n2)/2. Element i is the frequency
# computed for U=i, doubled unless i == n1*n2-i, divided by the sum of
# all frequencies.
def self.exact_probability_distribution_as62(n1,n2)
  max_u=n1*n2
  minmn=n1<n2 ? n1 : n2
  maxmn=n1>n2 ? n1 : n2
  # freq and work are filled from index 1; freq[0] stays unset until it
  # is shifted away below.
  freq=[]
  work=[]
  (1..maxmn+1).each {|i| freq[i]=1}
  (maxmn+2..max_u+1).each {|i| freq[i]=0}
  work[1]=0
  xin=maxmn
  (2..minmn).each do |i|
    work[i]=0
    xin+=maxmn
    hi=xin+2
    k=i
    # Integer division keeps the loop bound an Integer
    (1..1+xin/2).each do |j|
      k+=1
      hi-=1
      sum=freq[j]+work[j]
      freq[j]=sum
      # freq[hi] must be read before it is overwritten on the next line
      work[k]=sum-freq[hi]
      freq[hi]=sum
    end
  end

  # Generate percentages for normal U
  dist=(1+max_u/2).to_i
  freq.shift
  total=freq.inject(0) {|a,v| a+v }
  (0...dist).collect do |i|
    ues=(i==max_u-i) ? freq[i] : freq[i]*2
    ues.quo(total)
  end
end
|
70
|
+
|
71
|
+
# Generate distribution for permutations
|
72
|
+
|
73
|
+
# For every arrangement yielded by Statsample::Permutation over n1 zeros
# and n2 ones, stores the smaller of the two U values of that arrangement.
# The 1-based position of each zero is used as its rank.
# Returns a Hash that maps each yielded permutation to its U value.
def self.distribution_permutations(n1,n2)
  base=[0]*n1+[1]*n2
  total=n1*n2
  req={}
  Statsample::Permutation.new(base).each do |perm|
    r0,s0=0,0
    perm.each_index do |c_i|
      next unless perm[c_i]==0
      r0+=c_i+1
      s0+=1
    end
    u1=r0-((s0*(s0+1)).quo(2))
    u2=total-u1
    req[perm]=(u1 <= u2) ? u1 : u2
  end
  req
end
|
94
|
+
# Sample 1 Rank sum
|
95
|
+
attr_reader :r1
|
96
|
+
# Sample 2 Rank sum
|
97
|
+
attr_reader :r2
|
98
|
+
# Sample 1 U
|
99
|
+
attr_reader :u1
|
100
|
+
# Sample 2 U
|
101
|
+
attr_reader :u2
|
102
|
+
# U Value
|
103
|
+
attr_reader :u
|
104
|
+
# Compensation for ties
|
105
|
+
attr_reader :t
|
106
|
+
def initialize(v1,v2)
|
107
|
+
@n1=v1.valid_data.size
|
108
|
+
@n2=v2.valid_data.size
|
109
|
+
|
110
|
+
data=(v1.valid_data+v2.valid_data).to_scale
|
111
|
+
groups=(([0]*@n1)+([1]*@n2)).to_vector
|
112
|
+
ds={'g'=>groups, 'data'=>data}.to_dataset
|
113
|
+
@t=nil
|
114
|
+
@ties=data.data.size!=data.data.uniq.size
|
115
|
+
if(@ties)
|
116
|
+
adjust_for_ties(ds['data'])
|
117
|
+
end
|
118
|
+
ds['ranked']=ds['data'].ranked(:scale)
|
119
|
+
|
120
|
+
@n=ds.cases
|
121
|
+
|
122
|
+
@r1=ds.filter{|r| r['g']==0}['ranked'].sum
|
123
|
+
@r2=((ds.cases*(ds.cases+1)).quo(2))-r1
|
124
|
+
@u1=r1-((@n1*(@n1+1)).quo(2))
|
125
|
+
@u2=r2-((@n2*(@n2+1)).quo(2))
|
126
|
+
@u=(u1<u2) ? u1 : u2
|
127
|
+
end
|
128
|
+
# Text report with both rank sums, U, Z and its p value.
# The exact p is appended only when n1*n2 is lower than MAX_MN_EXACT.
def summary
  out=<<-HEREDOC
Mann-Whitney U
Sum of ranks v1: #{@r1.to_f}
Sum of ranks v2: #{@r2.to_f}
U Value: #{@u.to_f}
Z: #{sprintf("%0.3f",z)} (p: #{sprintf("%0.3f",z_probability)})
HEREDOC
  if @n1*@n2<MAX_MN_EXACT
    out+="Exact p (Dinneen & Blakesley): #{sprintf("%0.3f",exact_probability)}"
  end
  out
end
|
141
|
+
# Exact probability of finding values of U lower or equal to sample on U distribution. Use with caution with m*n>100000
|
142
|
+
# Reference: Dinneen & Blakesley (1973)
|
143
|
+
# Sums the entries 0..U (U truncated with to_i) of the distribution
# returned by UMannWhitney.exact_probability_distribution_as62.
def exact_probability
  dist=UMannWhitney.exact_probability_distribution_as62(@n1,@n2)
  (0..@u.to_i).inject(0) {|sum,i| sum+dist[i] }
end
|
151
|
+
# Reference: http://europe.isixsigma.com/library/content/c080806a.asp
|
152
|
+
def adjust_for_ties(data)
|
153
|
+
@t=data.frequencies.find_all{|k,v| v>1}.inject(0) {|a,v|
|
154
|
+
a+(v[1]**3-v[1]).quo(12)
|
155
|
+
}
|
156
|
+
end
|
157
|
+
# Z value for U, with adjust for ties.
|
158
|
+
# For large samples, U is approximately normally distributed.
|
159
|
+
# In that case, you can use z to obtain the probability for U.
|
160
|
+
# Reference: SPSS Manual
|
161
|
+
# Returns (U - mu) / sigma, with mu = n1*n2/2.
# Without ties, sigma = sqrt(n1*n2*(n1+n2+1)/12).
# With ties, sigma = sqrt((n1*n2/(n*(n-1))) * ((n**3-n)/12 - t)),
# where n = n1+n2 and t is the value set by adjust_for_ties.
def z
  mu=(@n1*@n2).quo(2)
  if @ties
    n=@n1+@n2
    first=(@n1*@n2).quo(n*(n-1))
    second=((n**3-n).quo(12))-@t
    ou=Math::sqrt(first*second)
  else
    ou=Math::sqrt(((@n1*@n2)*(@n1+@n2+1)).quo(12))
  end
  (@u-mu).quo(ou)
end
|
173
|
+
# Assuming H_0, the proportion of cdf with values of U lower
|
174
|
+
# than the sample.
|
175
|
+
# Use with more than 30 cases per group.
|
176
|
+
def z_probability
|
177
|
+
(1-Distribution::Normal.cdf(z.abs()))*2
|
178
|
+
end
|
179
|
+
end
|
180
|
+
|
181
|
+
end
|
182
|
+
end
|
data/lib/statsample/vector.rb
CHANGED
@@ -1,3 +1,4 @@
|
|
1
|
+
require 'date'
|
1
2
|
class Array
|
2
3
|
# Creates a new Statsample::Vector object
|
3
4
|
# Argument should be equal to Vector.new
|
@@ -9,25 +10,24 @@ class Array
|
|
9
10
|
Statsample::Vector.new(self,:scale,*args)
|
10
11
|
end
|
11
12
|
end
|
12
|
-
|
13
13
|
module Statsample
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
14
|
+
class << self
|
15
|
+
# Create a matrix using vectors as columns.
|
16
|
+
# Use:
|
17
|
+
#
|
18
|
+
# matrix=Statsample.vector_cols_matrix(v1,v2)
|
19
|
+
# [vs] one or more Statsample::Vector objects, all of the same size
# Row i of the returned Matrix holds element i of every vector.
# Raises ArgumentError if an argument is not a Statsample::Vector or if
# the sizes differ from the size of the first argument.
def vector_cols_matrix(*vs)
  size=vs[0].size
  vs.each do |v|
    raise ArgumentError,"Arguments should be Vector" unless v.instance_of? Statsample::Vector
    raise ArgumentError,"Vectors size should be the same" if v.size!=size
  end
  Matrix.rows((0...size).map {|i| vs.map {|v| v[i] } })
end
|
30
|
+
end
|
31
31
|
# Returns a duplicate of the input vectors, without missing data
|
32
32
|
# for any of the vectors.
|
33
33
|
#
|
@@ -46,834 +46,873 @@ module Statsample
|
|
46
46
|
ds.vectors.values
|
47
47
|
end
|
48
48
|
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
49
|
+
class Vector
|
50
|
+
include Enumerable
|
51
|
+
include Writable
|
52
|
+
DEFAULT_OPTIONS={
|
53
|
+
:missing_values=>[],
|
54
|
+
:today_values=>['NOW','TODAY', :NOW, :TODAY],
|
55
|
+
:labels=>{}
|
56
|
+
}
|
57
|
+
# Level of measurement. Could be :nominal, :ordinal or :scale
|
58
|
+
attr_reader :type
|
59
|
+
# Original data.
|
60
|
+
attr_reader :data
|
61
|
+
# Valid data. Equal to data, minus values assigned as missing values
|
62
|
+
attr_reader :valid_data
|
63
|
+
# Array of values considered as missing. Nil is a missing value, by default
|
64
|
+
attr_reader :missing_values
|
65
|
+
# Array of values considered as "Today", with date type. "NOW", "TODAY", :NOW and :TODAY are 'today' values, by default
|
66
|
+
attr_reader :today_values
|
67
|
+
# Missing values array
|
68
|
+
attr_reader :missing_data
|
69
|
+
# Original data, with all missing values replaced by nils
|
70
|
+
attr_reader :data_with_nils
|
71
|
+
# Date data, with all missing values replaced by nils
|
72
|
+
attr_reader :date_data_with_nils
|
73
|
+
# GSL Object, only available with rbgsl extension and type==:scale
|
74
|
+
attr_reader :gsl
|
75
|
+
# Change label for specific values
|
76
|
+
attr_accessor :labels
|
77
|
+
# Creates a new Vector object.
|
78
|
+
# [data] Array of data.
|
79
|
+
# [type] Level of measurement. See Vector#type
|
80
|
+
# [opts] Options
|
81
|
+
# [:missing_values] Array of missing values. See Vector#missing_values
|
82
|
+
# [:today_values] Array of 'today' values. See Vector#today_values
|
83
|
+
# [:labels] Labels for data values
|
84
|
+
#
|
85
|
+
# The fast way to create a vector uses Array.to_vector or Array.to_scale.
|
86
|
+
#
|
87
|
+
# v=[1,2,3,4].to_vector(:scale)
|
88
|
+
# v=[1,2,3,4].to_scale
|
89
|
+
#
|
90
|
+
|
91
|
+
def initialize(data=[], t=:nominal, opts=Hash.new)
|
92
|
+
raise "Data should be an array" unless data.is_a? Array
|
93
|
+
@data=data
|
94
|
+
@type=t
|
95
|
+
opts=DEFAULT_OPTIONS.merge(opts)
|
96
|
+
@missing_values=opts[:missing_values]
|
97
|
+
@labels=opts[:labels]
|
98
|
+
@today_values=opts[:today_values]
|
99
|
+
@valid_data=[]
|
100
|
+
@data_with_nils=[]
|
101
|
+
@date_data_with_nils=[]
|
102
|
+
@missing_data=[]
|
103
|
+
@has_missing_data=nil
|
104
|
+
@scale_data=nil
|
105
|
+
set_valid_data_intern
|
106
|
+
self.type=t
|
107
|
+
end
|
108
|
+
# Creates a duplicate of the Vector.
|
109
|
+
# Note: data, missing_values and labels are duplicated, so
|
110
|
+
# changes on the original vector don't propagate to copies.
|
111
|
+
# Builds a new Vector of the same type from copies of the data,
# missing values, today values and labels of this one.
def dup
  Vector.new(@data.dup,@type, :missing_values => @missing_values.dup, :today_values => @today_values.dup, :labels => @labels.dup)
end
|
114
|
+
# Returns an empty duplicate of the vector. Maintains the type,
|
115
|
+
# missing values and labels.
|
116
|
+
# Builds a new Vector without data, of the same type, from copies of the
# missing values, today values and labels of this one.
def dup_empty
  Vector.new([],@type, :missing_values => @missing_values.dup, :today_values => @today_values.dup, :labels => @labels.dup)
end
|
119
|
+
# Raises an exception if type of vector is inferior to t type
|
120
|
+
def check_type(t)
|
121
|
+
raise NoMethodError if (t==:scale and @type!=:scale) or (t==:ordinal and @type==:nominal) or (t==:date)
|
122
|
+
end
|
123
|
+
private :check_type
|
124
|
+
|
125
|
+
# Return a vector using the standardized values for data
|
126
|
+
# with sd with denominator N
|
127
|
+
def vector_standarized_pop
|
128
|
+
vector_standarized(true)
|
129
|
+
end
|
130
|
+
# Return a vector using the standardized values for data
|
131
|
+
# with sd with denominator n-1
|
132
|
+
|
133
|
+
def vector_standarized(use_population=false)
|
134
|
+
raise "Should be a scale" unless @type==:scale
|
135
|
+
m=mean
|
136
|
+
sd=use_population ? sdp : sds
|
137
|
+
@data_with_nils.collect{|x|
|
138
|
+
if !x.nil?
|
139
|
+
(x.to_f - m).quo(sd)
|
140
|
+
else
|
141
|
+
nil
|
142
|
+
end
|
143
|
+
}.to_vector(:scale)
|
144
|
+
end
|
145
|
+
|
146
|
+
alias_method :standarized, :vector_standarized
|
147
|
+
|
148
|
+
def box_cox_transformation(lambda) # :nodoc:
|
149
|
+
raise "Should be a scale" unless @type==:scale
|
150
|
+
@data_with_nils.collect{|x|
|
151
|
+
if !x.nil?
|
152
|
+
if(lambda==0)
|
153
|
+
Math.log(x)
|
154
|
+
else
|
155
|
+
(x**lambda-1).quo(lambda)
|
156
|
+
end
|
157
|
+
else
|
158
|
+
nil
|
159
|
+
end
|
160
|
+
}.to_vector(:scale)
|
161
|
+
end
|
162
|
+
|
163
|
+
# Vector equality.
|
164
|
+
# Two vector will be the same if their data, missing values, type, labels are equals
|
165
|
+
# Returns true or false; labels are compared, never assigned.
# Raises TypeError if v2 is not exactly a Statsample::Vector.
def ==(v2)
  raise TypeError,"Argument should be a Vector" unless v2.instance_of? Statsample::Vector
  @data==v2.data && @missing_values==v2.missing_values && @type==v2.type && @labels==v2.labels
end
|
169
|
+
|
170
|
+
def _dump(i) # :nodoc:
|
171
|
+
Marshal.dump({'data'=>@data,'missing_values'=>@missing_values, 'labels'=>@labels, 'type'=>@type})
|
172
|
+
end
|
173
|
+
|
174
|
+
def self._load(data) # :nodoc:
|
175
|
+
h=Marshal.load(data)
|
176
|
+
Vector.new(h['data'], h['type'],:missing_values=> h['missing_values'], :labels=>h['labels'])
|
177
|
+
end
|
178
|
+
# Returns a new vector, with data modified by block.
|
179
|
+
# Equivalent to create a Vector after #collect on data
|
180
|
+
def recode
|
181
|
+
@data.collect{|x|
|
182
|
+
yield x
|
183
|
+
}.to_vector(@type)
|
184
|
+
end
|
185
|
+
# Modifies current vector, with data modified by block.
|
186
|
+
# Equivalent to #collect! on @data
|
187
|
+
def recode!
|
188
|
+
@data.collect!{|x|
|
189
|
+
yield x
|
55
190
|
}
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
|
119
|
-
|
120
|
-
|
121
|
-
|
122
|
-
|
123
|
-
|
124
|
-
|
125
|
-
|
126
|
-
|
127
|
-
|
128
|
-
|
129
|
-
|
130
|
-
|
131
|
-
|
132
|
-
|
133
|
-
|
134
|
-
|
135
|
-
|
136
|
-
|
137
|
-
|
138
|
-
|
139
|
-
|
140
|
-
|
141
|
-
|
142
|
-
|
143
|
-
|
144
|
-
|
145
|
-
|
146
|
-
|
147
|
-
|
148
|
-
|
149
|
-
|
191
|
+
set_valid_data
|
192
|
+
end
|
193
|
+
# Dichotomize the vector with 0 and 1, based on lowest value
|
194
|
+
# If the parameter is defined, this value and lower
|
195
|
+
# will be 0 and higher, 1
|
196
|
+
# [low] cut point; defaults to the minimum of factors
# Returns a :scale vector (via to_scale) with nil for missing values,
# 1 for values greater than +low+ and 0 for the rest.
def dichotomize(low=nil)
  low||=factors.min
  @data_with_nils.collect{|x|
    if x.nil?
      nil
    else
      x>low ? 1 : 0
    end
  }.to_scale
end
|
209
|
+
# Iterate on each item.
|
210
|
+
# Equivalent to
|
211
|
+
# @data.each{|x| yield x}
|
212
|
+
def each
|
213
|
+
@data.each{|x| yield(x) }
|
214
|
+
end
|
215
|
+
|
216
|
+
# Iterate on each item, retrieving index
|
217
|
+
def each_index
|
218
|
+
(0...@data.size).each {|i|
|
219
|
+
yield(i)
|
220
|
+
}
|
221
|
+
end
|
222
|
+
# Add a value at the end of the vector.
|
223
|
+
# If the second argument is set to false, you should update the Vector using
|
224
|
+
# Vector.set_valid_data at the end of your insertion cycle
|
225
|
+
#
|
226
|
+
def add(v,update_valid=true)
|
227
|
+
@data.push(v)
|
228
|
+
set_valid_data if update_valid
|
229
|
+
end
|
230
|
+
# Update valid_data, missing_data, data_with_nils and gsl
|
231
|
+
# at the end of an insertion.
|
232
|
+
#
|
233
|
+
# Use after Vector.add(v,false)
|
234
|
+
# Usage:
|
235
|
+
# v=Statsample::Vector.new
|
236
|
+
# v.add(2,false)
|
237
|
+
# v.add(4,false)
|
238
|
+
# v.data
|
239
|
+
# => [2,4]
|
240
|
+
# v.valid_data
|
241
|
+
# => []
|
242
|
+
# v.set_valid_data
|
243
|
+
# v.valid_data
|
244
|
+
# => [2,4]
|
245
|
+
def set_valid_data
|
246
|
+
@valid_data.clear
|
247
|
+
@missing_data.clear
|
248
|
+
@data_with_nils.clear
|
249
|
+
@date_data_with_nils.clear
|
250
|
+
@gsl=nil
|
251
|
+
set_valid_data_intern
|
252
|
+
set_scale_data if(@type==:scale)
|
253
|
+
set_date_data if(@type==:date)
|
254
|
+
end
|
255
|
+
|
256
|
+
if Statsample::STATSAMPLE__.respond_to?(:set_valid_data_intern)
|
257
|
+
def set_valid_data_intern #:nodoc:
|
258
|
+
Statsample::STATSAMPLE__.set_valid_data_intern(self)
|
259
|
+
end
|
260
|
+
else
|
261
|
+
def set_valid_data_intern #:nodoc:
|
262
|
+
_set_valid_data_intern
|
263
|
+
end
|
264
|
+
end
|
265
|
+
def _set_valid_data_intern #:nodoc:
|
266
|
+
@data.each do |n|
|
267
|
+
if is_valid? n
|
268
|
+
@valid_data.push(n)
|
269
|
+
@data_with_nils.push(n)
|
270
|
+
else
|
271
|
+
@data_with_nils.push(nil)
|
272
|
+
@missing_data.push(n)
|
273
|
+
end
|
274
|
+
end
|
275
|
+
@has_missing_data=@missing_data.size>0
|
276
|
+
end
|
277
|
+
|
278
|
+
# Returns true if data has one or more missing values
|
279
|
+
def has_missing_data?
|
280
|
+
@has_missing_data
|
281
|
+
end
|
282
|
+
# Retrieves label for value x. Retrieves x if
|
283
|
+
# no label defined.
|
284
|
+
def labeling(x)
|
285
|
+
@labels.has_key?(x) ? @labels[x].to_s : x.to_s
|
286
|
+
end
|
287
|
+
# Returns a Vector with data with labels replaced by the label.
|
288
|
+
def vector_labeled
|
289
|
+
d=@data.collect{|x|
|
290
|
+
if @labels.has_key? x
|
291
|
+
@labels[x]
|
292
|
+
else
|
293
|
+
x
|
294
|
+
end
|
295
|
+
}
|
296
|
+
Vector.new(d,@type)
|
297
|
+
end
|
298
|
+
# Size of total data
|
299
|
+
def size
|
300
|
+
@data.size
|
301
|
+
end
|
302
|
+
alias_method :n, :size
|
303
|
+
|
304
|
+
# Retrieves i element of data
|
305
|
+
def [](i)
|
306
|
+
@data[i]
|
307
|
+
end
|
308
|
+
# Set i element of data.
|
309
|
+
# Note: Use set_valid_data if you include missing values
|
310
|
+
def []=(i,v)
|
311
|
+
@data[i]=v
|
312
|
+
end
|
313
|
+
# Return true if a value is valid (not nil and not included on missing values)
|
314
|
+
def is_valid?(x)
|
315
|
+
!(x.nil? or @missing_values.include? x)
|
316
|
+
end
|
317
|
+
# Set missing_values.
|
318
|
+
# if update_valid = false, you should use
|
319
|
+
# set_valid_data after all changes
|
320
|
+
def missing_values=(vals)
|
321
|
+
@missing_values = vals
|
322
|
+
set_valid_data
|
323
|
+
end
|
324
|
+
def today_values=(vals)
|
325
|
+
@today_values = vals
|
326
|
+
set_valid_data
|
327
|
+
end
|
328
|
+
# Set level of measurement.
|
329
|
+
def type=(t)
|
330
|
+
@type=t
|
331
|
+
set_scale_data if(t==:scale)
|
332
|
+
set_date_data if (t==:date)
|
333
|
+
end
|
334
|
+
def to_a
|
335
|
+
@data.dup
|
336
|
+
end
|
337
|
+
alias_method :to_ary, :to_a
|
338
|
+
|
339
|
+
# Vector sum.
|
340
|
+
# - If v is a scalar, add this value to all elements
|
341
|
+
# - If v is a Array or a Vector, should be of the same size of this vector
|
342
|
+
# every item of this vector will be added to the value of the
|
343
|
+
# item at the same position on the other vector
|
344
|
+
def +(v)
|
345
|
+
_vector_ari("+",v)
|
346
|
+
end
|
347
|
+
# Vector subtraction.
|
348
|
+
# - If v is a scalar, subtract this value from all elements
|
349
|
+
# - If v is a Array or a Vector, should be of the same
|
350
|
+
# size of this vector
|
351
|
+
# from every item of this vector will be subtracted the value of the
|
352
|
+
# item at the same position on the other vector
|
353
|
+
|
354
|
+
def -(v)
|
355
|
+
_vector_ari("-",v)
|
356
|
+
end
|
357
|
+
# Reports all values that do not comply with a condition.
|
358
|
+
# Returns a hash with the index of data and the invalid data.
|
359
|
+
def verify
|
360
|
+
h={}
|
361
|
+
(0...@data.size).to_a.each{|i|
|
362
|
+
if !(yield @data[i])
|
363
|
+
h[i]=@data[i]
|
364
|
+
end
|
365
|
+
}
|
366
|
+
h
|
367
|
+
end
|
368
|
+
def _vector_ari(method,v) # :nodoc:
|
369
|
+
if(v.is_a? Vector or v.is_a? Array)
|
370
|
+
if v.size==@data.size
|
371
|
+
# i=0
|
372
|
+
sum=[]
|
373
|
+
0.upto(v.size-1) {|i|
|
374
|
+
if((v.is_a? Vector and v.is_valid?(v[i]) and is_valid?(@data[i])) or (v.is_a? Array and !v[i].nil? and !data[i].nil?))
|
375
|
+
sum.push(@data[i].send(method,v[i]))
|
376
|
+
else
|
377
|
+
sum.push(nil)
|
378
|
+
end
|
379
|
+
}
|
380
|
+
Statsample::Vector.new(sum)
|
381
|
+
else
|
382
|
+
raise ArgumentError, "The array/vector parameter should be of the same size of the original vector"
|
383
|
+
end
|
384
|
+
elsif(v.respond_to? method )
|
385
|
+
Statsample::Vector.new(
|
386
|
+
@data.collect {|x|
|
387
|
+
if(!x.nil?)
|
388
|
+
x.send(method,v)
|
389
|
+
else
|
150
390
|
nil
|
151
|
-
|
152
|
-
}.to_vector(:scale)
|
153
|
-
end
|
154
|
-
|
155
|
-
# Vector equality.
|
156
|
-
# Two vector will be the same if their data, missing values, type, labels are equals
|
157
|
-
def ==(v2)
|
158
|
-
raise TypeError,"Argument should be a Vector" unless v2.instance_of? Statsample::Vector
|
159
|
-
@data==v2.data and @missing_values==v2.missing_values and @type==v2.type and @labels=v2.labels
|
160
|
-
end
|
161
|
-
|
162
|
-
def _dump(i) # :nodoc:
|
163
|
-
Marshal.dump({'data'=>@data,'missing_values'=>@missing_values, 'labels'=>@labels, 'type'=>@type})
|
164
|
-
end
|
165
|
-
|
166
|
-
def self._load(data) # :nodoc:
|
167
|
-
h=Marshal.load(data)
|
168
|
-
Vector.new(h['data'], h['type'],:missing_values=> h['missing_values'], :labels=>h['labels'])
|
169
|
-
end
|
170
|
-
# Returns a new vector, with data modified by block.
|
171
|
-
# Equivalent to create a Vector after #collect on data
|
172
|
-
def recode
|
173
|
-
@data.collect{|x|
|
174
|
-
yield x
|
175
|
-
}.to_vector(@type)
|
176
|
-
end
|
177
|
-
# Modifies current vector, with data modified by block.
|
178
|
-
# Equivalent to #collect! on @data
|
179
|
-
def recode!
|
180
|
-
@data.collect!{|x|
|
181
|
-
yield x
|
182
|
-
}
|
183
|
-
set_valid_data
|
184
|
-
end
|
185
|
-
# Dicotomize the vector with 0 and 1, based on lowest value
|
186
|
-
# If parameter if defined, this value and lower
|
187
|
-
# will be 0 and higher, 1
|
188
|
-
def dichotomize(low=nil)
|
189
|
-
fs=factors
|
190
|
-
low||=factors.min
|
191
|
-
@data_with_nils.collect{|x|
|
192
|
-
if x.nil?
|
193
|
-
nil
|
194
|
-
elsif x>low
|
195
|
-
1
|
196
|
-
else
|
197
|
-
0
|
198
|
-
end
|
199
|
-
}.to_scale
|
200
|
-
end
|
201
|
-
# Iterate on each item.
|
202
|
-
# Equivalent to
|
203
|
-
# @data.each{|x| yield x}
|
204
|
-
def each
|
205
|
-
@data.each{|x| yield(x) }
|
206
|
-
end
|
207
|
-
|
208
|
-
# Iterate on each item, retrieving index
|
209
|
-
def each_index
|
210
|
-
(0...@data.size).each {|i|
|
211
|
-
yield(i)
|
391
|
+
end
|
212
392
|
}
|
213
|
-
|
214
|
-
# Add a value at the end of the vector.
|
215
|
-
# If second argument set to false, you should update the Vector usign
|
216
|
-
# Vector.set_valid_data at the end of your insertion cycle
|
217
|
-
#
|
218
|
-
def add(v,update_valid=true)
|
219
|
-
@data.push(v)
|
220
|
-
set_valid_data if update_valid
|
221
|
-
end
|
222
|
-
# Update valid_data, missing_data, data_with_nils and gsl
|
223
|
-
# at the end of an insertion.
|
224
|
-
#
|
225
|
-
# Use after Vector.add(v,false)
|
226
|
-
# Usage:
|
227
|
-
# v=Statsample::Vector.new
|
228
|
-
# v.add(2,false)
|
229
|
-
# v.add(4,false)
|
230
|
-
# v.data
|
231
|
-
# => [2,3]
|
232
|
-
# v.valid_data
|
233
|
-
# => []
|
234
|
-
# v.set_valid_data
|
235
|
-
# v.valid_data
|
236
|
-
# => [2,3]
|
237
|
-
def set_valid_data
|
238
|
-
@valid_data.clear
|
239
|
-
@missing_data.clear
|
240
|
-
@data_with_nils.clear
|
241
|
-
@gsl=nil
|
242
|
-
set_valid_data_intern
|
243
|
-
set_scale_data if(@type==:scale)
|
244
|
-
end
|
245
|
-
|
246
|
-
if Statsample::STATSAMPLE__.respond_to?(:set_valid_data_intern)
|
247
|
-
def set_valid_data_intern #:nodoc:
|
248
|
-
Statsample::STATSAMPLE__.set_valid_data_intern(self)
|
249
|
-
end
|
393
|
+
)
|
250
394
|
else
|
251
|
-
|
252
|
-
_set_valid_data_intern
|
253
|
-
end
|
254
|
-
end
|
255
|
-
def _set_valid_data_intern #:nodoc:
|
256
|
-
@data.each do |n|
|
257
|
-
if is_valid? n
|
258
|
-
@valid_data.push(n)
|
259
|
-
@data_with_nils.push(n)
|
260
|
-
else
|
261
|
-
@data_with_nils.push(nil)
|
262
|
-
@missing_data.push(n)
|
263
|
-
end
|
264
|
-
end
|
265
|
-
@has_missing_data=@missing_data.size>0
|
395
|
+
raise TypeError,"You should pass a scalar or a array/vector"
|
266
396
|
end
|
267
|
-
|
268
|
-
# Retrieves true if data has one o more missing values
|
269
|
-
def has_missing_data?
|
270
|
-
@has_missing_data
|
271
|
-
end
|
272
|
-
# Retrieves label for value x. Retrieves x if
|
273
|
-
# no label defined.
|
274
|
-
def labeling(x)
|
275
|
-
@labels.has_key?(x) ? @labels[x].to_s : x.to_s
|
276
|
-
end
|
277
|
-
# Returns a Vector with data with labels replaced by the label.
|
278
|
-
def vector_labeled
|
279
|
-
d=@data.collect{|x|
|
280
|
-
if @labels.has_key? x
|
281
|
-
@labels[x]
|
282
|
-
else
|
283
|
-
x
|
284
|
-
end
|
285
|
-
}
|
286
|
-
Vector.new(d,@type)
|
287
|
-
end
|
288
|
-
# Size of total data
|
289
|
-
def size
|
290
|
-
@data.size
|
291
|
-
end
|
292
|
-
alias_method :n, :size
|
293
|
-
|
294
|
-
# Retrieves i element of data
|
295
|
-
def [](i)
|
296
|
-
@data[i]
|
297
|
-
end
|
298
|
-
# Set i element of data.
|
299
|
-
# Note: Use set_valid_data if you include missing values
|
300
|
-
def []=(i,v)
|
301
|
-
@data[i]=v
|
302
|
-
end
|
303
|
-
# Return true if a value is valid (not nil and not included on missing values)
|
304
|
-
def is_valid?(x)
|
305
|
-
!(x.nil? or @missing_values.include? x)
|
306
|
-
end
|
307
|
-
# Set missing_values.
|
308
|
-
# if update_valid = false, you should use
|
309
|
-
# set_valid_data after all changes
|
310
|
-
def missing_values=(vals)
|
311
|
-
@missing_values = vals
|
312
|
-
set_valid_data
|
313
|
-
end
|
314
|
-
# Set level of measurement.
|
315
|
-
def type=(t)
|
316
|
-
@type=t
|
317
|
-
set_scale_data if(t==:scale)
|
318
|
-
end
|
319
|
-
def to_a
|
320
|
-
@data.dup
|
321
|
-
end
|
322
|
-
alias_method :to_ary, :to_a
|
323
|
-
|
324
|
-
# Vector sum.
|
325
|
-
# - If v is a scalar, add this value to all elements
|
326
|
-
# - If v is a Array or a Vector, should be of the same size of this vector
|
327
|
-
# every item of this vector will be added to the value of the
|
328
|
-
# item at the same position on the other vector
|
329
|
-
def +(v)
|
330
|
-
_vector_ari("+",v)
|
331
|
-
end
|
332
|
-
# Vector rest.
|
333
|
-
# - If v is a scalar, rest this value to all elements
|
334
|
-
# - If v is a Array or a Vector, should be of the same
|
335
|
-
# size of this vector
|
336
|
-
# every item of this vector will be rested to the value of the
|
337
|
-
# item at the same position on the other vector
|
338
397
|
|
339
|
-
|
340
|
-
|
341
|
-
|
342
|
-
|
343
|
-
|
344
|
-
|
345
|
-
|
346
|
-
|
347
|
-
|
348
|
-
|
349
|
-
|
350
|
-
|
351
|
-
|
352
|
-
|
353
|
-
|
354
|
-
|
355
|
-
|
356
|
-
#
|
357
|
-
|
358
|
-
|
359
|
-
|
360
|
-
|
361
|
-
|
362
|
-
|
363
|
-
|
364
|
-
|
365
|
-
|
366
|
-
|
367
|
-
|
368
|
-
|
369
|
-
|
370
|
-
|
371
|
-
|
372
|
-
|
373
|
-
|
374
|
-
|
375
|
-
|
376
|
-
|
377
|
-
|
378
|
-
)
|
379
|
-
else
|
380
|
-
raise TypeError,"You should pass a scalar or a array/vector"
|
381
|
-
end
|
382
|
-
|
383
|
-
end
|
384
|
-
# Return an array with the data splitted by a separator.
|
385
|
-
# a=Vector.new(["a,b","c,d","a,b","d"])
|
386
|
-
# a.splitted
|
387
|
-
# =>
|
388
|
-
# [["a","b"],["c","d"],["a","b"],["d"]]
|
389
|
-
def splitted(sep=Statsample::SPLIT_TOKEN)
|
390
|
-
@data.collect{|x|
|
391
|
-
if x.nil?
|
392
|
-
nil
|
393
|
-
elsif (x.respond_to? :split)
|
394
|
-
x.split(sep)
|
395
|
-
else
|
396
|
-
[x]
|
397
|
-
end
|
398
|
-
}
|
399
|
-
end
|
400
|
-
# Returns a hash of Vectors, defined by the different values
|
401
|
-
# defined on the fields
|
402
|
-
# Example:
|
403
|
-
#
|
404
|
-
# a=Vector.new(["a,b","c,d","a,b"])
|
405
|
-
# a.split_by_separator
|
406
|
-
# => {"a"=>#<Statsample::Type::Nominal:0x7f2dbcc09d88
|
407
|
-
# @data=[1, 0, 1]>,
|
408
|
-
# "b"=>#<Statsample::Type::Nominal:0x7f2dbcc09c48
|
409
|
-
# @data=[1, 1, 0]>,
|
410
|
-
# "c"=>#<Statsample::Type::Nominal:0x7f2dbcc09b08
|
411
|
-
# @data=[0, 1, 1]>}
|
412
|
-
#
|
413
|
-
def split_by_separator(sep=Statsample::SPLIT_TOKEN)
|
414
|
-
split_data=splitted(sep)
|
415
|
-
factors=split_data.flatten.uniq.compact
|
416
|
-
out=factors.inject({}) {|a,x|
|
417
|
-
a[x]=[]
|
418
|
-
a
|
419
|
-
}
|
420
|
-
split_data.each{|r|
|
421
|
-
if r.nil?
|
422
|
-
factors.each{|f|
|
423
|
-
out[f].push(nil)
|
424
|
-
}
|
425
|
-
else
|
398
|
+
end
|
399
|
+
# Return an array with the data splitted by a separator.
|
400
|
+
# a=Vector.new(["a,b","c,d","a,b","d"])
|
401
|
+
# a.splitted
|
402
|
+
# =>
|
403
|
+
# [["a","b"],["c","d"],["a","b"],["d"]]
|
404
|
+
def splitted(sep=Statsample::SPLIT_TOKEN)
|
405
|
+
@data.collect{|x|
|
406
|
+
if x.nil?
|
407
|
+
nil
|
408
|
+
elsif (x.respond_to? :split)
|
409
|
+
x.split(sep)
|
410
|
+
else
|
411
|
+
[x]
|
412
|
+
end
|
413
|
+
}
|
414
|
+
end
|
415
|
+
# Returns a hash of Vectors, defined by the different values
|
416
|
+
# defined on the fields
|
417
|
+
# Example:
|
418
|
+
#
|
419
|
+
# a=Vector.new(["a,b","c,d","a,b"])
|
420
|
+
# a.split_by_separator
|
421
|
+
# => {"a"=>#<Statsample::Type::Nominal:0x7f2dbcc09d88
|
422
|
+
# @data=[1, 0, 1]>,
|
423
|
+
# "b"=>#<Statsample::Type::Nominal:0x7f2dbcc09c48
|
424
|
+
# @data=[1, 1, 0]>,
|
425
|
+
# "c"=>#<Statsample::Type::Nominal:0x7f2dbcc09b08
|
426
|
+
# @data=[0, 1, 1]>}
|
427
|
+
#
|
428
|
+
def split_by_separator(sep=Statsample::SPLIT_TOKEN)
|
429
|
+
split_data=splitted(sep)
|
430
|
+
factors=split_data.flatten.uniq.compact
|
431
|
+
out=factors.inject({}) {|a,x|
|
432
|
+
a[x]=[]
|
433
|
+
a
|
434
|
+
}
|
435
|
+
split_data.each{|r|
|
436
|
+
if r.nil?
|
426
437
|
factors.each{|f|
|
427
|
-
out[f].push(
|
438
|
+
out[f].push(nil)
|
428
439
|
}
|
429
|
-
|
430
|
-
|
431
|
-
|
432
|
-
s[v[0]]=Vector.new(v[1],:nominal)
|
433
|
-
s
|
440
|
+
else
|
441
|
+
factors.each{|f|
|
442
|
+
out[f].push(r.include?(f) ? 1:0)
|
434
443
|
}
|
444
|
+
end
|
445
|
+
}
|
446
|
+
out.inject({}){|s,v|
|
447
|
+
s[v[0]]=Vector.new(v[1],:nominal)
|
448
|
+
s
|
449
|
+
}
|
450
|
+
end
|
451
|
+
def split_by_separator_freq(sep=Statsample::SPLIT_TOKEN)
|
452
|
+
split_by_separator(sep).inject({}) {|a,v|
|
453
|
+
a[v[0]]=v[1].inject {|s,x| s+x.to_i}
|
454
|
+
a
|
455
|
+
}
|
456
|
+
end
|
457
|
+
|
458
|
+
# Returns an random sample of size n, with replacement,
|
459
|
+
# only with valid data.
|
460
|
+
#
|
461
|
+
# In all the trails, every item have the same probability
|
462
|
+
# of been selected.
|
463
|
+
def sample_with_replacement(sample=1)
|
464
|
+
if(@type!=:scale or !HAS_GSL)
|
465
|
+
vds=@valid_data.size
|
466
|
+
(0...sample).collect{ @valid_data[rand(vds)] }
|
467
|
+
else
|
468
|
+
r = GSL::Rng.alloc(GSL::Rng::MT19937,rand(10000))
|
469
|
+
r.sample(@gsl, sample).to_a
|
470
|
+
end
|
471
|
+
end
|
472
|
+
# Returns an random sample of size n, without replacement,
|
473
|
+
# only with valid data.
|
474
|
+
#
|
475
|
+
# Every element could only be selected once.
|
476
|
+
#
|
477
|
+
# A sample of the same size of the vector is the vector itself.
|
478
|
+
|
479
|
+
def sample_without_replacement(sample=1)
|
480
|
+
if(@type!=:scale or !HAS_GSL)
|
481
|
+
raise ArgumentError, "Sample size couldn't be greater than n" if sample>@valid_data.size
|
482
|
+
out=[]
|
483
|
+
size=@valid_data.size
|
484
|
+
while out.size<sample
|
485
|
+
value=rand(size)
|
486
|
+
out.push(value) if !out.include?value
|
487
|
+
end
|
488
|
+
out.collect{|i|@data[i]}
|
489
|
+
else
|
490
|
+
r = GSL::Rng.alloc(GSL::Rng::MT19937,rand(10000))
|
491
|
+
r.choose(@gsl, sample).to_a
|
492
|
+
end
|
493
|
+
end
|
494
|
+
# Retrieves number of cases which comply condition.
|
495
|
+
# If block given, retrieves number of instances where
|
496
|
+
# block returns true.
|
497
|
+
# If other values given, retrieves the frequency for
|
498
|
+
# this value.
|
499
|
+
def count(x=false)
|
500
|
+
if block_given?
|
501
|
+
r=@data.inject(0) {|s, i|
|
502
|
+
r=yield i
|
503
|
+
s+(r ? 1 : 0)
|
504
|
+
}
|
505
|
+
r.nil? ? 0 : r
|
506
|
+
else
|
507
|
+
frequencies[x].nil? ? 0 : frequencies[x]
|
508
|
+
end
|
509
|
+
end
|
510
|
+
|
511
|
+
# Returns the database type for the vector, according to its content
|
512
|
+
|
513
|
+
def db_type(dbs='mysql')
|
514
|
+
# first, detect any character not number
|
515
|
+
if @data.find {|v| v.to_s=~/\d{2,2}-\d{2,2}-\d{4,4}/} or @data.find {|v| v.to_s=~/\d{4,4}-\d{2,2}-\d{2,2}/}
|
516
|
+
return "DATE"
|
517
|
+
elsif @data.find {|v| v.to_s=~/[^0-9e.-]/ }
|
518
|
+
return "VARCHAR (255)"
|
519
|
+
elsif @data.find {|v| v.to_s=~/\./}
|
520
|
+
return "DOUBLE"
|
521
|
+
else
|
522
|
+
return "INTEGER"
|
523
|
+
end
|
524
|
+
end
|
525
|
+
# Return true if all data is Date, "today" values or nil
|
526
|
+
def can_be_date?
|
527
|
+
if @data.find {|v|
|
528
|
+
!v.nil? and !v.is_a? Date and !v.is_a? Time and (v.is_a? String and !@today_values.include? v) and (v.is_a? String and !(v=~/\d{4,4}[-\/]\d{1,2}[-\/]\d{1,2}/))}
|
529
|
+
false
|
530
|
+
else
|
531
|
+
true
|
435
532
|
end
|
436
|
-
|
437
|
-
|
438
|
-
|
533
|
+
end
|
534
|
+
# Return true if all data is Numeric or nil
|
535
|
+
def can_be_scale?
|
536
|
+
if @data.find {|v| !v.nil? and !v.is_a? Numeric and !@missing_values.include? v}
|
537
|
+
false
|
538
|
+
else
|
539
|
+
true
|
540
|
+
end
|
541
|
+
end
|
542
|
+
|
543
|
+
def to_s
|
544
|
+
sprintf("Vector(type:%s, n:%d)[%s]",@type.to_s,@data.size, @data.collect{|d| d.nil? ? "nil":d}.join(","))
|
545
|
+
end
|
546
|
+
# Ugly name. Really, create a Vector for standard 'matrix' package.
|
547
|
+
# <tt>dir</tt> could be :horizontal or :vertical
|
548
|
+
def to_matrix(dir=:horizontal)
|
549
|
+
case dir
|
550
|
+
when :horizontal
|
551
|
+
Matrix[@data]
|
552
|
+
when :vertical
|
553
|
+
Matrix.columns([@data])
|
554
|
+
end
|
555
|
+
end
|
556
|
+
def inspect
|
557
|
+
self.to_s
|
558
|
+
end
|
559
|
+
# Retrieves uniques values for data.
|
560
|
+
def factors
|
561
|
+
if @type==:scale
|
562
|
+
@scale_data.uniq.sort
|
563
|
+
elsif @type==:date
|
564
|
+
@date_data_with_nils.uniq.sort
|
565
|
+
else
|
566
|
+
@valid_data.uniq.sort
|
567
|
+
end
|
568
|
+
end
|
569
|
+
if Statsample::STATSAMPLE__.respond_to?(:frequencies)
|
570
|
+
# Returns a hash with the distribution of frecuencies for
|
571
|
+
# the sample
|
572
|
+
def frequencies
|
573
|
+
Statsample::STATSAMPLE__.frequencies(@valid_data)
|
574
|
+
end
|
575
|
+
else
|
576
|
+
def frequencies #:nodoc:
|
577
|
+
_frequencies
|
578
|
+
end
|
579
|
+
end
|
580
|
+
def _frequencies #:nodoc:
|
581
|
+
@valid_data.inject(Hash.new) {|a,x|
|
582
|
+
a[x]||=0
|
583
|
+
a[x]=a[x]+1
|
584
|
+
a
|
585
|
+
}
|
586
|
+
end
|
587
|
+
# Plot frequencies on a chart, using gnuplot
|
588
|
+
def plot_frequencies
|
589
|
+
require 'gnuplot'
|
590
|
+
x=[]
|
591
|
+
y=[]
|
592
|
+
self.frequencies.sort.each{|k,v|
|
593
|
+
x.push(k)
|
594
|
+
y.push(v)
|
595
|
+
}
|
596
|
+
Gnuplot.open do |gp|
|
597
|
+
Gnuplot::Plot.new( gp ) do |plot|
|
598
|
+
plot.boxwidth("0.9 absolute")
|
599
|
+
plot.yrange("[0:#{y.max}]")
|
600
|
+
plot.style("fill solid 1.00 border -1")
|
601
|
+
plot.set("xtics border in scale 1,0.5 nomirror rotate by -45 offset character 0, 0, 0")
|
602
|
+
plot.style("histogram")
|
603
|
+
plot.style("data histogram")
|
604
|
+
i=-1
|
605
|
+
plot.set("xtics","("+x.collect{|v| i+=1; sprintf("\"%s\" %d",v,i)}.join(",")+")")
|
606
|
+
plot.data << Gnuplot::DataSet.new( [y] ) do |ds|
|
607
|
+
end
|
608
|
+
end
|
609
|
+
end
|
610
|
+
|
611
|
+
end
|
612
|
+
|
613
|
+
|
614
|
+
# Returns the most frequent item.
|
615
|
+
def mode
|
616
|
+
frequencies.max{|a,b| a[1]<=>b[1]}[0]
|
617
|
+
end
|
618
|
+
# The numbers of item with valid data.
|
619
|
+
def n_valid
|
620
|
+
@valid_data.size
|
621
|
+
end
|
622
|
+
# Returns a hash with the distribution of proportions of
|
623
|
+
# the sample.
|
624
|
+
def proportions
|
625
|
+
frequencies.inject({}){|a,v|
|
626
|
+
a[v[0]] = v[1].quo(n_valid)
|
439
627
|
a
|
440
628
|
}
|
441
629
|
end
|
442
|
-
|
443
|
-
|
444
|
-
|
445
|
-
|
446
|
-
|
447
|
-
|
448
|
-
|
449
|
-
|
450
|
-
|
451
|
-
|
452
|
-
|
453
|
-
|
454
|
-
|
455
|
-
|
456
|
-
|
457
|
-
# Returns an random sample of size n, without replacement,
|
458
|
-
# only with valid data.
|
459
|
-
#
|
460
|
-
# Every element could only be selected once.
|
461
|
-
#
|
462
|
-
# A sample of the same size of the vector is the vector itself.
|
463
|
-
|
464
|
-
def sample_without_replacement(sample=1)
|
465
|
-
if(@type!=:scale or !HAS_GSL)
|
466
|
-
raise ArgumentError, "Sample size couldn't be greater than n" if sample>@valid_data.size
|
467
|
-
out=[]
|
468
|
-
size=@valid_data.size
|
469
|
-
while out.size<sample
|
470
|
-
value=rand(size)
|
471
|
-
out.push(value) if !out.include?value
|
472
|
-
end
|
473
|
-
out.collect{|i|@data[i]}
|
474
|
-
else
|
475
|
-
r = GSL::Rng.alloc(GSL::Rng::MT19937,rand(10000))
|
476
|
-
r.choose(@gsl, sample).to_a
|
630
|
+
# Proportion of a given value.
|
631
|
+
def proportion(v=1)
|
632
|
+
frequencies[v].quo(@valid_data.size)
|
633
|
+
end
|
634
|
+
def summary(out="")
|
635
|
+
out << sprintf("n valid:%d\n",n_valid)
|
636
|
+
out << sprintf("factors:%s\n",factors.join(","))
|
637
|
+
out << "mode:"+mode.to_s+"\n"
|
638
|
+
out << "Distribution:\n"
|
639
|
+
frequencies.sort.each{|k,v|
|
640
|
+
key=labels.has_key?(k) ? labels[k]:k
|
641
|
+
out << sprintf("%s : %s (%0.2f%%)\n",key,v, (v.quo(n_valid))*100)
|
642
|
+
}
|
643
|
+
if(@type==:ordinal)
|
644
|
+
out << "median:"+median.to_s+"\n"
|
477
645
|
end
|
478
|
-
|
479
|
-
|
480
|
-
|
481
|
-
|
482
|
-
# If other values given, retrieves the frequency for
|
483
|
-
# this value.
|
484
|
-
def count(x=false)
|
485
|
-
if block_given?
|
486
|
-
r=@data.inject(0) {|s, i|
|
487
|
-
r=yield i
|
488
|
-
s+(r ? 1 : 0)
|
489
|
-
}
|
490
|
-
r.nil? ? 0 : r
|
491
|
-
else
|
492
|
-
frequencies[x].nil? ? 0 : frequencies[x]
|
646
|
+
if(@type==:scale)
|
647
|
+
out << "mean:"+mean.to_s+"\n"
|
648
|
+
out << "sd:"+sd.to_s+"\n"
|
649
|
+
|
493
650
|
end
|
651
|
+
out
|
494
652
|
end
|
495
|
-
|
496
|
-
|
497
|
-
|
498
|
-
|
499
|
-
|
500
|
-
|
501
|
-
|
502
|
-
|
503
|
-
|
504
|
-
|
505
|
-
|
506
|
-
|
507
|
-
|
653
|
+
|
654
|
+
# Variance of p, according to poblation size
|
655
|
+
def variance_proportion(n_poblation, v=1)
|
656
|
+
Statsample::proportion_variance_sample(self.proportion(v), @valid_data.size, n_poblation)
|
657
|
+
end
|
658
|
+
# Variance of p, according to poblation size
|
659
|
+
def variance_total(n_poblation, v=1)
|
660
|
+
Statsample::total_variance_sample(self.proportion(v), @valid_data.size, n_poblation)
|
661
|
+
end
|
662
|
+
def proportion_confidence_interval_t(n_poblation,margin=0.95,v=1)
|
663
|
+
Statsample::proportion_confidence_interval_t(proportion(v), @valid_data.size, n_poblation, margin)
|
664
|
+
end
|
665
|
+
def proportion_confidence_interval_z(n_poblation,margin=0.95,v=1)
|
666
|
+
Statsample::proportion_confidence_interval_z(proportion(v), @valid_data.size, n_poblation, margin)
|
667
|
+
end
|
668
|
+
|
669
|
+
self.instance_methods.find_all{|met| met=~/_slow$/}.each do |met|
|
670
|
+
met_or=met.gsub("_slow","")
|
671
|
+
if !self.method_defined?(met_or)
|
672
|
+
alias_method met_or, met
|
673
|
+
end
|
674
|
+
end
|
675
|
+
######
|
676
|
+
### Ordinal Methods
|
677
|
+
######
|
678
|
+
|
679
|
+
# Return the value of the percentil q
|
680
|
+
def percentil(q)
|
681
|
+
check_type :ordinal
|
682
|
+
sorted=@valid_data.sort
|
683
|
+
v= (n_valid * q).quo(100)
|
684
|
+
if(v.to_i!=v)
|
685
|
+
sorted[v.to_i]
|
686
|
+
else
|
687
|
+
(sorted[(v-0.5).to_i].to_f + sorted[(v+0.5).to_i]).quo(2)
|
688
|
+
end
|
689
|
+
end
|
690
|
+
# Returns a ranked vector.
|
691
|
+
def ranked(type=:ordinal)
|
692
|
+
check_type :ordinal
|
693
|
+
i=0
|
694
|
+
r=frequencies.sort.inject({}){|a,v|
|
695
|
+
a[v[0]]=(i+1 + i+v[1]).quo(2)
|
696
|
+
i+=v[1]
|
697
|
+
a
|
698
|
+
}
|
699
|
+
@data.collect {|c| r[c] }.to_vector(type)
|
700
|
+
end
|
701
|
+
# Return the median (percentil 50)
|
702
|
+
def median
|
703
|
+
check_type :ordinal
|
704
|
+
if HAS_GSL and @type==:scale
|
705
|
+
sorted=GSL::Vector.alloc(@scale_data.sort)
|
706
|
+
GSL::Stats::median_from_sorted_data(sorted)
|
707
|
+
else
|
708
|
+
percentil(50)
|
709
|
+
end
|
710
|
+
end
|
711
|
+
# Minimun value
|
712
|
+
def min;
|
713
|
+
check_type :ordinal
|
714
|
+
@valid_data.min;
|
715
|
+
end
|
716
|
+
# Maximum value
|
717
|
+
def max;
|
718
|
+
check_type :ordinal
|
719
|
+
@valid_data.max;
|
720
|
+
end
|
721
|
+
def set_date_data # :nodoc:
|
722
|
+
@date_data_with_nils=@data.collect do|x|
|
723
|
+
if x.is_a? Date
|
724
|
+
x
|
725
|
+
elsif x.is_a? Time
|
726
|
+
Date.new(x.year, x.month, x.day)
|
727
|
+
elsif x.is_a? String and x=~/(\d{4,4})[-\/](\d{1,2})[-\/](\d{1,2})/
|
728
|
+
Date.new($1.to_i,$2.to_i,$3.to_i)
|
729
|
+
elsif @today_values.include? x
|
730
|
+
Date.today()
|
731
|
+
elsif @missing_values.include? x or x.nil?
|
732
|
+
nil
|
508
733
|
end
|
734
|
+
end
|
509
735
|
end
|
510
|
-
|
511
|
-
|
512
|
-
if
|
513
|
-
|
736
|
+
def set_scale_data # :nodoc
|
737
|
+
@scale_data=@valid_data.collect do|x|
|
738
|
+
if x.is_a? Numeric
|
739
|
+
x
|
740
|
+
elsif x.is_a? String and x.to_i==x.to_f
|
741
|
+
x.to_i
|
514
742
|
else
|
515
|
-
|
516
|
-
end
|
743
|
+
x.to_f
|
744
|
+
end
|
745
|
+
end
|
746
|
+
if HAS_GSL
|
747
|
+
@gsl=GSL::Vector.alloc(@scale_data) if @scale_data.size>0
|
748
|
+
end
|
749
|
+
end
|
750
|
+
private :set_scale_data
|
751
|
+
|
752
|
+
# The range of the data (max - min)
|
753
|
+
def range;
|
754
|
+
check_type :scale
|
755
|
+
@scale_data.max - @scale_data.min
|
756
|
+
end
|
757
|
+
# The sum of values for the data
|
758
|
+
def sum
|
759
|
+
check_type :scale
|
760
|
+
@scale_data.inject(0){|a,x|x+a} ;
|
761
|
+
end
|
762
|
+
# The arithmetical mean of data
|
763
|
+
def mean
|
764
|
+
check_type :scale
|
765
|
+
sum.to_f.quo(n_valid)
|
766
|
+
end
|
767
|
+
# Sum of squares for the data around a value.
|
768
|
+
# By default, this value is the mean
|
769
|
+
# ss= sum{(xi-m)^2}
|
770
|
+
#
|
771
|
+
def sum_of_squares(m=nil)
|
772
|
+
check_type :scale
|
773
|
+
m||=mean
|
774
|
+
@scale_data.inject(0){|a,x| a+(x-m).square}
|
517
775
|
end
|
518
776
|
|
519
|
-
|
520
|
-
|
521
|
-
|
522
|
-
|
523
|
-
# <tt>dir</tt> could be :horizontal or :vertical
|
524
|
-
def to_matrix(dir=:horizontal)
|
525
|
-
case dir
|
526
|
-
when :horizontal
|
527
|
-
Matrix[@data]
|
528
|
-
when :vertical
|
529
|
-
Matrix.columns([@data])
|
530
|
-
end
|
531
|
-
end
|
532
|
-
def inspect
|
533
|
-
self.to_s
|
777
|
+
# Sum of squared deviation
|
778
|
+
def sum_of_squared_deviation
|
779
|
+
check_type :scale
|
780
|
+
@scale_data.inject(0) {|a,x| x.square+a} - (sum.square.quo(n_valid))
|
534
781
|
end
|
535
|
-
|
536
|
-
|
537
|
-
|
538
|
-
|
539
|
-
|
540
|
-
|
541
|
-
|
782
|
+
|
783
|
+
# Population variance (denominator N)
|
784
|
+
def variance_population(m=nil)
|
785
|
+
check_type :scale
|
786
|
+
m||=mean
|
787
|
+
squares=@scale_data.inject(0){|a,x| x.square+a}
|
788
|
+
squares.quo(n_valid) - m.square
|
542
789
|
end
|
543
|
-
|
544
|
-
|
545
|
-
|
546
|
-
|
547
|
-
|
548
|
-
|
549
|
-
else
|
550
|
-
def frequencies #:nodoc:
|
551
|
-
_frequencies
|
552
|
-
end
|
790
|
+
|
791
|
+
|
792
|
+
# Population Standard deviation (denominator N)
|
793
|
+
def standard_deviation_population(m=nil)
|
794
|
+
check_type :scale
|
795
|
+
Math::sqrt( variance_population(m) )
|
553
796
|
end
|
554
|
-
|
555
|
-
|
556
|
-
|
557
|
-
|
558
|
-
|
559
|
-
|
797
|
+
# Sample Variance (denominator n-1)
|
798
|
+
|
799
|
+
def variance_sample(m=nil)
|
800
|
+
check_type :scale
|
801
|
+
m||=mean
|
802
|
+
sum_of_squares(m).quo(n_valid - 1)
|
560
803
|
end
|
561
|
-
# Plot frequencies on a chart, using gnuplot
|
562
|
-
def plot_frequencies
|
563
|
-
require 'gnuplot'
|
564
|
-
x=[]
|
565
|
-
y=[]
|
566
|
-
self.frequencies.sort.each{|k,v|
|
567
|
-
x.push(k)
|
568
|
-
y.push(v)
|
569
|
-
}
|
570
|
-
Gnuplot.open do |gp|
|
571
|
-
Gnuplot::Plot.new( gp ) do |plot|
|
572
|
-
plot.boxwidth("0.9 absolute")
|
573
|
-
plot.yrange("[0:#{y.max}]")
|
574
|
-
plot.style("fill solid 1.00 border -1")
|
575
|
-
plot.set("xtics border in scale 1,0.5 nomirror rotate by -45 offset character 0, 0, 0")
|
576
|
-
plot.style("histogram")
|
577
|
-
plot.style("data histogram")
|
578
|
-
i=-1
|
579
|
-
plot.set("xtics","("+x.collect{|v| i+=1; sprintf("\"%s\" %d",v,i)}.join(",")+")")
|
580
|
-
plot.data << Gnuplot::DataSet.new( [y] ) do |ds|
|
581
|
-
end
|
582
|
-
end
|
583
|
-
end
|
584
804
|
|
585
|
-
|
586
|
-
|
587
|
-
|
588
|
-
|
589
|
-
def mode
|
590
|
-
frequencies.max{|a,b| a[1]<=>b[1]}[0]
|
591
|
-
end
|
592
|
-
# The numbers of item with valid data.
|
593
|
-
def n_valid
|
594
|
-
@valid_data.size
|
595
|
-
end
|
596
|
-
# Returns a hash with the distribution of proportions of
|
597
|
-
# the sample.
|
598
|
-
def proportions
|
599
|
-
frequencies.inject({}){|a,v|
|
600
|
-
a[v[0]] = v[1].quo(n_valid)
|
601
|
-
a
|
602
|
-
}
|
603
|
-
end
|
604
|
-
# Proportion of a given value.
|
605
|
-
def proportion(v=1)
|
606
|
-
frequencies[v].quo(@valid_data.size)
|
607
|
-
end
|
608
|
-
def summary(out="")
|
609
|
-
out << sprintf("n valid:%d\n",n_valid)
|
610
|
-
out << sprintf("factors:%s\n",factors.join(","))
|
611
|
-
out << "mode:"+mode.to_s+"\n"
|
612
|
-
out << "Distribution:\n"
|
613
|
-
frequencies.sort.each{|k,v|
|
614
|
-
key=labels.has_key?(k) ? labels[k]:k
|
615
|
-
out << sprintf("%s : %s (%0.2f%%)\n",key,v, (v.quo(n_valid))*100)
|
616
|
-
}
|
617
|
-
if(@type==:ordinal)
|
618
|
-
out << "median:"+median.to_s+"\n"
|
619
|
-
end
|
620
|
-
if(@type==:scale)
|
621
|
-
out << "mean:"+mean.to_s+"\n"
|
622
|
-
out << "sd:"+sd.to_s+"\n"
|
623
|
-
|
624
|
-
end
|
625
|
-
out
|
626
|
-
end
|
805
|
+
# Sample Standard deviation (denominator n-1)
|
806
|
+
|
807
|
+
def standard_deviation_sample(m=nil)
|
808
|
+
check_type :scale
|
627
809
|
|
628
|
-
|
629
|
-
|
630
|
-
|
631
|
-
|
632
|
-
|
633
|
-
|
634
|
-
|
635
|
-
|
636
|
-
|
637
|
-
|
638
|
-
|
639
|
-
|
640
|
-
|
641
|
-
|
810
|
+
m||=mean
|
811
|
+
Math::sqrt(variance_sample(m))
|
812
|
+
end
|
813
|
+
# Skewness of the sample
|
814
|
+
def skew(m=nil)
|
815
|
+
check_type :scale
|
816
|
+
m||=mean
|
817
|
+
th=@scale_data.inject(0){|a,x| a+((x-m)**3)}
|
818
|
+
th.quo((@scale_data.size)*sd(m)**3)
|
819
|
+
end
|
820
|
+
# Kurtosis of the sample
|
821
|
+
def kurtosis(m=nil)
|
822
|
+
check_type :scale
|
823
|
+
m||=mean
|
824
|
+
fo=@scale_data.inject(0){|a,x| a+((x-m)**4)}
|
825
|
+
fo.quo((@scale_data.size)*sd(m)**4)-3
|
642
826
|
|
643
|
-
|
644
|
-
|
645
|
-
|
646
|
-
|
647
|
-
|
648
|
-
|
649
|
-
|
650
|
-
|
651
|
-
|
652
|
-
|
653
|
-
|
654
|
-
|
655
|
-
|
656
|
-
|
657
|
-
|
658
|
-
|
659
|
-
|
660
|
-
|
661
|
-
|
662
|
-
|
663
|
-
|
664
|
-
|
665
|
-
|
666
|
-
|
667
|
-
|
668
|
-
|
669
|
-
|
670
|
-
|
671
|
-
|
672
|
-
|
673
|
-
|
674
|
-
|
675
|
-
|
676
|
-
end
|
677
|
-
# Return the median (percentil 50)
|
678
|
-
def median
|
679
|
-
check_type :ordinal
|
680
|
-
if HAS_GSL and @type==:scale
|
681
|
-
GSL::Stats::median_from_sorted_data(@gsl)
|
682
|
-
else
|
683
|
-
percentil(50)
|
684
|
-
end
|
685
|
-
end
|
686
|
-
# Minimun value
|
687
|
-
def min;
|
688
|
-
check_type :ordinal
|
689
|
-
@valid_data.min;
|
690
|
-
end
|
691
|
-
# Maximum value
|
692
|
-
def max;
|
693
|
-
check_type :ordinal
|
694
|
-
@valid_data.max;
|
695
|
-
end
|
696
|
-
|
697
|
-
def set_scale_data # :nodoc
|
698
|
-
@scale_data=@valid_data.collect do|x|
|
699
|
-
if x.is_a? Numeric
|
700
|
-
x
|
701
|
-
elsif x.is_a? String and x.to_i==x.to_f
|
702
|
-
x.to_i
|
703
|
-
else
|
704
|
-
x.to_f
|
705
|
-
end
|
706
|
-
end
|
707
|
-
if HAS_GSL
|
708
|
-
@gsl=GSL::Vector.alloc(@scale_data) if @scale_data.size>0
|
709
|
-
end
|
710
|
-
end
|
711
|
-
private :set_scale_data
|
827
|
+
end
|
828
|
+
# Product of all values on the sample
|
829
|
+
#
|
830
|
+
def product
|
831
|
+
check_type :scale
|
832
|
+
@scale_data.inject(1){|a,x| a*x }
|
833
|
+
end
|
834
|
+
if HAS_GSL
|
835
|
+
%w{skew kurtosis variance_sample standard_deviation_sample variance_population standard_deviation_population mean sum}.each{|m|
|
836
|
+
m_nuevo=(m+"_slow").intern
|
837
|
+
alias_method m_nuevo, m.intern
|
838
|
+
}
|
839
|
+
def sum # :nodoc:
|
840
|
+
check_type :scale
|
841
|
+
|
842
|
+
@gsl.sum
|
843
|
+
end
|
844
|
+
def mean # :nodoc:
|
845
|
+
check_type :scale
|
846
|
+
|
847
|
+
@gsl.mean
|
848
|
+
end
|
849
|
+
def variance_sample(m=nil) # :nodoc:
|
850
|
+
check_type :scale
|
851
|
+
|
852
|
+
m||=mean
|
853
|
+
@gsl.variance_m
|
854
|
+
end
|
855
|
+
def standard_deviation_sample(m=nil) # :nodoc:
|
856
|
+
check_type :scale
|
857
|
+
m||=mean
|
858
|
+
@gsl.sd(m)
|
859
|
+
end
|
712
860
|
|
713
|
-
|
714
|
-
|
715
|
-
|
716
|
-
|
717
|
-
|
718
|
-
|
719
|
-
|
720
|
-
|
721
|
-
|
722
|
-
|
723
|
-
|
724
|
-
|
725
|
-
|
726
|
-
|
727
|
-
|
728
|
-
|
729
|
-
|
730
|
-
|
731
|
-
|
732
|
-
|
733
|
-
|
734
|
-
|
735
|
-
|
736
|
-
end
|
737
|
-
|
738
|
-
# Sum of squared deviation
|
739
|
-
def sum_of_squared_deviation
|
740
|
-
check_type :scale
|
741
|
-
@scale_data.inject(0) {|a,x| x.square+a} - (sum.square.quo(n_valid))
|
742
|
-
end
|
861
|
+
def variance_population(m=nil) # :nodoc:
|
862
|
+
check_type :scale
|
863
|
+
m||=mean
|
864
|
+
@gsl.variance_with_fixed_mean(m)
|
865
|
+
end
|
866
|
+
def standard_deviation_population(m=nil) # :nodoc:
|
867
|
+
check_type :scale
|
868
|
+
m||=mean
|
869
|
+
@gsl.sd_with_fixed_mean(m)
|
870
|
+
end
|
871
|
+
def skew # :nodoc:
|
872
|
+
check_type :scale
|
873
|
+
@gsl.skew
|
874
|
+
end
|
875
|
+
def kurtosis # :nodoc:
|
876
|
+
check_type :scale
|
877
|
+
@gsl.kurtosis
|
878
|
+
end
|
879
|
+
# Create a GSL::Histogram
|
880
|
+
# With a fixnum, creates X bins within the range of data
|
881
|
+
# With an Array, each value will be a cut point
|
882
|
+
def histogram(bins=10)
|
883
|
+
check_type :scale
|
743
884
|
|
744
|
-
|
745
|
-
|
746
|
-
|
747
|
-
|
748
|
-
|
749
|
-
|
885
|
+
if bins.is_a? Array
|
886
|
+
#h=Statsample::Histogram.new(self, bins)
|
887
|
+
h=GSL::Histogram.alloc(bins)
|
888
|
+
else
|
889
|
+
# ugly patch. The upper limit for a bin has the form
|
890
|
+
# x < range
|
891
|
+
#h=Statsample::Histogram.new(self, bins)
|
892
|
+
h=GSL::Histogram.alloc(bins,[@valid_data.min,@valid_data.max+0.0001])
|
750
893
|
end
|
751
|
-
|
894
|
+
h.increment(@gsl)
|
895
|
+
h
|
896
|
+
end
|
897
|
+
def plot_histogram(bins=10,options="")
|
898
|
+
check_type :scale
|
899
|
+
self.histogram(bins).graph(options)
|
900
|
+
end
|
752
901
|
|
753
|
-
# Population Standard deviation (denominator N)
|
754
|
-
def standard_deviation_population(m=nil)
|
755
|
-
check_type :scale
|
756
|
-
|
757
|
-
Math::sqrt( variance_population(m) )
|
758
|
-
end
|
759
|
-
# Sample Variance (denominator n-1)
|
760
|
-
|
761
|
-
def variance_sample(m=nil)
|
762
|
-
check_type :scale
|
763
|
-
|
764
|
-
m||=mean
|
765
|
-
sum_of_squares(m).quo(n_valid - 1)
|
766
|
-
end
|
767
|
-
|
768
|
-
# Sample Standard deviation (denominator n-1)
|
769
|
-
|
770
|
-
def standard_deviation_sample(m=nil)
|
771
|
-
check_type :scale
|
772
|
-
|
773
|
-
m||=m
|
774
|
-
Math::sqrt(variance_sample(m))
|
775
|
-
end
|
776
|
-
# Skewness of the sample
|
777
|
-
def skew
|
778
|
-
check_type :scale
|
779
|
-
m=mean
|
780
|
-
thirds=@scale_data.inject(0){|a,x| a+((x-mean)**3)}
|
781
|
-
thirds.quo((@scale_data.size-1)*sd**3)
|
782
|
-
end
|
783
|
-
# Kurtosis of the sample
|
784
|
-
def kurtosis
|
785
|
-
check_type :scale
|
786
|
-
|
787
|
-
m=mean
|
788
|
-
thirds=@scale_data.inject(0){|a,x| a+((x-mean)**4)}
|
789
|
-
thirds.quo((@scale_data.size-1)*sd**4)
|
790
|
-
|
791
|
-
end
|
792
|
-
# Product of all values on the sample
|
793
|
-
#
|
794
|
-
def product
|
795
|
-
check_type :scale
|
796
|
-
@scale_data.inject(1){|a,x| a*x }
|
797
|
-
end
|
798
|
-
if HAS_GSL
|
799
|
-
%w{skew kurtosis variance_sample standard_deviation_sample variance_population standard_deviation_population mean sum}.each{|m|
|
800
|
-
m_nuevo=(m+"_slow").intern
|
801
|
-
alias_method m_nuevo, m.intern
|
802
|
-
}
|
803
|
-
def sum # :nodoc:
|
804
|
-
check_type :scale
|
805
|
-
|
806
|
-
@gsl.sum
|
807
|
-
end
|
808
|
-
def mean # :nodoc:
|
809
|
-
check_type :scale
|
810
|
-
|
811
|
-
@gsl.mean
|
812
|
-
end
|
813
|
-
def variance_sample(m=nil) # :nodoc:
|
814
|
-
check_type :scale
|
815
|
-
|
816
|
-
m||=mean
|
817
|
-
@gsl.variance_m
|
818
|
-
end
|
819
|
-
def standard_deviation_sample(m=nil) # :nodoc:
|
820
|
-
check_type :scale
|
821
|
-
m||=mean
|
822
|
-
@gsl.sd(m)
|
823
|
-
end
|
824
|
-
|
825
|
-
def variance_population(m=nil) # :nodoc:
|
826
|
-
check_type :scale
|
827
|
-
m||=mean
|
828
|
-
@gsl.variance_with_fixed_mean(m)
|
829
|
-
end
|
830
|
-
def standard_deviation_population(m=nil) # :nodoc:
|
831
|
-
check_type :scale
|
832
|
-
m||=mean
|
833
|
-
@gsl.sd_with_fixed_mean(m)
|
834
|
-
end
|
835
|
-
def skew # :nodoc:
|
836
|
-
check_type :scale
|
837
|
-
@gsl.skew
|
838
|
-
end
|
839
|
-
def kurtosis # :nodoc:
|
840
|
-
check_type :scale
|
841
|
-
@gsl.kurtosis
|
842
|
-
end
|
843
|
-
# Create a GSL::Histogram
|
844
|
-
# With a fixnum, creates X bins within the range of data
|
845
|
-
# With an Array, each value will be a cut point
|
846
|
-
def histogram(bins=10)
|
847
|
-
check_type :scale
|
848
|
-
if bins.is_a? Array
|
849
|
-
h=GSL::Histogram.alloc(bins)
|
850
|
-
else
|
851
|
-
# ugly patch. The upper limit for a bin has the form
|
852
|
-
# x < range
|
853
|
-
h=GSL::Histogram.alloc(bins,[@valid_data.min,@valid_data.max+0.0001])
|
854
|
-
end
|
855
|
-
h.increment(@gsl)
|
856
|
-
h
|
857
|
-
end
|
858
|
-
def plot_histogram(bins=10,options="")
|
859
|
-
check_type :scale
|
860
|
-
self.histogram(bins).graph(options)
|
861
|
-
end
|
862
|
-
|
863
|
-
end
|
864
|
-
|
865
|
-
# Coefficient of variation
|
866
|
-
# Calculed with the sample standard deviation
|
867
|
-
def coefficient_of_variation
|
868
|
-
check_type :scale
|
869
|
-
standard_deviation_sample.quo(mean)
|
870
|
-
end
|
871
|
-
|
872
|
-
alias_method :sdp, :standard_deviation_population
|
873
|
-
alias_method :sds, :standard_deviation_sample
|
874
|
-
alias_method :cov, :coefficient_of_variation
|
875
|
-
alias_method :variance, :variance_sample
|
876
|
-
alias_method :sd, :standard_deviation_sample
|
877
|
-
alias_method :ss, :sum_of_squares
|
878
902
|
end
|
903
|
+
|
904
|
+
# Coefficient of variation
|
905
|
+
# Calculed with the sample standard deviation
|
906
|
+
def coefficient_of_variation
|
907
|
+
check_type :scale
|
908
|
+
standard_deviation_sample.quo(mean)
|
909
|
+
end
|
910
|
+
|
911
|
+
alias_method :sdp, :standard_deviation_population
|
912
|
+
alias_method :sds, :standard_deviation_sample
|
913
|
+
alias_method :cov, :coefficient_of_variation
|
914
|
+
alias_method :variance, :variance_sample
|
915
|
+
alias_method :sd, :standard_deviation_sample
|
916
|
+
alias_method :ss, :sum_of_squares
|
917
|
+
end
|
879
918
|
end
|