statsample 0.5.1 → 0.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/History.txt +12 -0
- data/Manifest.txt +13 -0
- data/README.txt +2 -1
- data/demo/pca.rb +29 -0
- data/demo/umann.rb +8 -0
- data/lib/distribution.rb +0 -1
- data/lib/matrix_extension.rb +35 -21
- data/lib/statsample.rb +31 -28
- data/lib/statsample/anova.rb +7 -2
- data/lib/statsample/bivariate.rb +17 -11
- data/lib/statsample/codification.rb +136 -87
- data/lib/statsample/combination.rb +0 -2
- data/lib/statsample/converter/csv18.rb +1 -1
- data/lib/statsample/converter/csv19.rb +1 -1
- data/lib/statsample/converters.rb +176 -171
- data/lib/statsample/crosstab.rb +227 -154
- data/lib/statsample/dataset.rb +94 -12
- data/lib/statsample/dominanceanalysis.rb +69 -62
- data/lib/statsample/dominanceanalysis/bootstrap.rb +25 -21
- data/lib/statsample/factor.rb +18 -0
- data/lib/statsample/factor/pca.rb +128 -0
- data/lib/statsample/factor/principalaxis.rb +133 -0
- data/lib/statsample/factor/rotation.rb +125 -0
- data/lib/statsample/histogram.rb +99 -0
- data/lib/statsample/mle.rb +125 -126
- data/lib/statsample/mle/logit.rb +91 -91
- data/lib/statsample/mle/probit.rb +84 -85
- data/lib/statsample/multiset.rb +1 -1
- data/lib/statsample/permutation.rb +96 -0
- data/lib/statsample/regression.rb +1 -1
- data/lib/statsample/regression/binomial.rb +89 -89
- data/lib/statsample/regression/binomial/logit.rb +9 -9
- data/lib/statsample/regression/binomial/probit.rb +9 -9
- data/lib/statsample/regression/multiple.rb +8 -14
- data/lib/statsample/regression/multiple/gslengine.rb +1 -1
- data/lib/statsample/regression/multiple/rubyengine.rb +55 -55
- data/lib/statsample/resample.rb +12 -17
- data/lib/statsample/srs.rb +4 -1
- data/lib/statsample/test.rb +23 -22
- data/lib/statsample/test/umannwhitney.rb +182 -0
- data/lib/statsample/vector.rb +854 -815
- data/test/test_bivariate.rb +132 -132
- data/test/test_codification.rb +71 -50
- data/test/test_dataset.rb +19 -1
- data/test/test_factor.rb +44 -0
- data/test/test_histogram.rb +26 -0
- data/test/test_permutation.rb +37 -0
- data/test/test_statistics.rb +74 -63
- data/test/test_umannwhitney.rb +17 -0
- data/test/test_vector.rb +46 -30
- metadata +31 -4
data/lib/statsample/resample.rb
CHANGED
@@ -1,20 +1,15 @@
|
|
1
1
|
module Statsample
|
2
|
-
|
3
|
-
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
Vector.new((0...size).collect {|x|
|
14
|
-
rand(range)+low
|
15
|
-
},:scale)
|
16
|
-
end
|
17
|
-
|
18
|
-
end
|
2
|
+
module Resample
|
3
|
+
class << self
|
4
|
+
def repeat_and_save(times,&action)
|
5
|
+
(1..times).inject([]) {|a,x| a.push(action.call); a}
|
6
|
+
end
|
7
|
+
|
8
|
+
def generate (size,low,upper)
|
9
|
+
range=upper-low+1
|
10
|
+
Vector.new((0...size).collect {|x| rand(range)+low },:scale)
|
11
|
+
end
|
12
|
+
|
19
13
|
end
|
14
|
+
end
|
20
15
|
end
|
data/lib/statsample/srs.rb
CHANGED
@@ -35,6 +35,8 @@ module Statsample
|
|
35
35
|
n0=estimation_n0(d,prop,margin)
|
36
36
|
n0.quo( 1 + ((n0 - 1).quo(n_pobl)))
|
37
37
|
end
|
38
|
+
|
39
|
+
|
38
40
|
# Proportion confidence interval with t values
|
39
41
|
# Uses estimated proportion, sample without replacement.
|
40
42
|
|
@@ -42,6 +44,7 @@ module Statsample
|
|
42
44
|
t = Distribution::T.p_value(1-((1-margin).quo(2)) , n_sample-1)
|
43
45
|
proportion_confidence_interval(prop,n_sample,n_population, t)
|
44
46
|
end
|
47
|
+
|
45
48
|
# Proportion confidence interval with z values
|
46
49
|
# Uses estimated proportion, sample without replacement.
|
47
50
|
def proportion_confidence_interval_z(p, n_sample, n_population, margin=0.95)
|
@@ -53,7 +56,7 @@ module Statsample
|
|
53
56
|
|
54
57
|
def proportion_confidence_interval(p, sam,pop , x)
|
55
58
|
f=sam.quo(pop)
|
56
|
-
one_range=x * Math::sqrt((qf(sam, pop) * p * (1-p))
|
59
|
+
one_range=x * Math::sqrt((qf(sam, pop) * p * (1-p)).quo (sam-1)) + (1.quo(sam * 2.0))
|
57
60
|
[p-one_range, p+one_range]
|
58
61
|
end
|
59
62
|
# Standard deviation for sample distribution of a proportion
|
data/lib/statsample/test.rb
CHANGED
@@ -1,25 +1,26 @@
|
|
1
1
|
module Statsample
|
2
|
-
|
3
|
-
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
2
|
+
# Module for several statistical tests
|
3
|
+
|
4
|
+
module Test
|
5
|
+
autoload(:UMannWhitney, 'statsample/test/umannwhitney')
|
6
|
+
# Calculate chi square for two Matrix
|
7
|
+
class << self
|
8
|
+
def chi_square(real,expected)
|
9
|
+
raise TypeError, "Both argument should be Matrix" unless real.is_a? Matrix and expected.is_a?Matrix
|
10
|
+
sum=0
|
11
|
+
(0...real.row_size).each {|row_i|
|
12
|
+
(0...real.column_size).each {|col_i|
|
13
|
+
val=((real[row_i,col_i].to_f - expected[row_i,col_i].to_f)**2) / expected[row_i,col_i].to_f
|
14
|
+
# puts "Real: #{real[row_i,col_i].to_f} ; esperado: #{expected[row_i,col_i].to_f}"
|
15
|
+
# puts "Diferencial al cuadrado: #{(real[row_i,col_i].to_f - expected[row_i,col_i].to_f)**2}"
|
16
|
+
sum+=val
|
17
|
+
}
|
18
|
+
}
|
19
|
+
sum
|
20
|
+
end
|
21
|
+
def u_mannwhitney(v1p,v2p)
|
22
|
+
Statsample::Test::UMannWhitney.new(v1p,v2p)
|
23
|
+
end
|
24
24
|
end
|
25
|
+
end
|
25
26
|
end
|
@@ -0,0 +1,182 @@
|
|
1
|
+
module Statsample
|
2
|
+
module Test
|
3
|
+
#
|
4
|
+
# = U Mann-Whitney test
|
5
|
+
#
|
6
|
+
# Non-parametric test for assessing whether two independent samples
|
7
|
+
# of observations come from the same distribution.
|
8
|
+
#
|
9
|
+
# == Assumptions
|
10
|
+
#
|
11
|
+
# * The two samples under investigation in the test are independent of each other and the observations within each sample are independent.
|
12
|
+
# * The observations are comparable (i.e., for any two observations, one can assess whether they are equal or, if not, which one is greater).
|
13
|
+
# * The variances in the two groups are approximately equal.
|
14
|
+
#
|
15
|
+
# Higher differences of distributions correspond to
|
16
|
+
# lower values of U.
|
17
|
+
#
|
18
|
+
class UMannWhitney
|
19
|
+
# Max for m*n allowed for exact calculation of probability
|
20
|
+
MAX_MN_EXACT=10000
|
21
|
+
|
22
|
+
# Exact probability based on Dinneen & Blakesley (1973) algorithm
|
23
|
+
# This is the algorithm used on SPSS
|
24
|
+
#
|
25
|
+
# Reference: Dinneen, L., & Blakesley, B. (1973). Algorithm AS 62: A Generator for the Sampling Distribution of the Mann-Whitney U Statistic. Journal of the Royal Statistical Society, 22(2), 269-273
|
26
|
+
#
|
27
|
+
def self.exact_probability_distribution_as62(n1,n2)
|
28
|
+
|
29
|
+
freq=[]
|
30
|
+
work=[]
|
31
|
+
mn1=n1*n2+1
|
32
|
+
max_u=n1*n2
|
33
|
+
minmn=n1<n2 ? n1 : n2
|
34
|
+
maxmn=n1>n2 ? n1 : n2
|
35
|
+
n1=maxmn+1
|
36
|
+
(1..n1).each{|i| freq[i]=1}
|
37
|
+
n1+=1
|
38
|
+
(n1..mn1).each{|i| freq[i]=0}
|
39
|
+
work[1]=0
|
40
|
+
xin=maxmn
|
41
|
+
(2..minmn).each do |i|
|
42
|
+
work[i]=0
|
43
|
+
xin=xin+maxmn
|
44
|
+
n1=xin+2
|
45
|
+
l=1+xin.quo(2)
|
46
|
+
k=i
|
47
|
+
(1..l).each do |j|
|
48
|
+
k=k+1
|
49
|
+
n1=n1-1
|
50
|
+
sum=freq[j]+work[j]
|
51
|
+
freq[j]=sum
|
52
|
+
work[k]=sum-freq[n1]
|
53
|
+
freq[n1]=sum
|
54
|
+
end
|
55
|
+
end
|
56
|
+
|
57
|
+
# Generate percentages for normal U
|
58
|
+
dist=(1+max_u/2).to_i
|
59
|
+
freq.shift
|
60
|
+
total=freq.inject(0) {|a,v| a+v }
|
61
|
+
(0...dist).collect {|i|
|
62
|
+
if i!=max_u-i
|
63
|
+
ues=freq[i]*2
|
64
|
+
else
|
65
|
+
ues=freq[i]
|
66
|
+
end
|
67
|
+
ues.quo(total)
|
68
|
+
}
|
69
|
+
end
|
70
|
+
|
71
|
+
# Generate distribution for permutations
|
72
|
+
|
73
|
+
def self.distribution_permutations(n1,n2)
|
74
|
+
base=[0]*n1+[1]*n2
|
75
|
+
po=Statsample::Permutation.new(base)
|
76
|
+
upper=0
|
77
|
+
total=n1*n2
|
78
|
+
req={}
|
79
|
+
po.each do |perm|
|
80
|
+
r0,s0=0,0
|
81
|
+
perm.each_index {|c_i|
|
82
|
+
if perm[c_i]==0
|
83
|
+
r0+=c_i+1
|
84
|
+
s0+=1
|
85
|
+
end
|
86
|
+
}
|
87
|
+
u1=r0-((s0*(s0+1)).quo(2))
|
88
|
+
u2=total-u1
|
89
|
+
temp_u= (u1 <= u2) ? u1 : u2
|
90
|
+
req[perm]=temp_u
|
91
|
+
end
|
92
|
+
req
|
93
|
+
end
|
94
|
+
# Sample 1 Rank sum
|
95
|
+
attr_reader :r1
|
96
|
+
# Sample 2 Rank sum
|
97
|
+
attr_reader :r2
|
98
|
+
# Sample 1 U
|
99
|
+
attr_reader :u1
|
100
|
+
# Sample 2 U
|
101
|
+
attr_reader :u2
|
102
|
+
# U Value
|
103
|
+
attr_reader :u
|
104
|
+
# Compensation for ties
|
105
|
+
attr_reader :t
|
106
|
+
def initialize(v1,v2)
|
107
|
+
@n1=v1.valid_data.size
|
108
|
+
@n2=v2.valid_data.size
|
109
|
+
|
110
|
+
data=(v1.valid_data+v2.valid_data).to_scale
|
111
|
+
groups=(([0]*@n1)+([1]*@n2)).to_vector
|
112
|
+
ds={'g'=>groups, 'data'=>data}.to_dataset
|
113
|
+
@t=nil
|
114
|
+
@ties=data.data.size!=data.data.uniq.size
|
115
|
+
if(@ties)
|
116
|
+
adjust_for_ties(ds['data'])
|
117
|
+
end
|
118
|
+
ds['ranked']=ds['data'].ranked(:scale)
|
119
|
+
|
120
|
+
@n=ds.cases
|
121
|
+
|
122
|
+
@r1=ds.filter{|r| r['g']==0}['ranked'].sum
|
123
|
+
@r2=((ds.cases*(ds.cases+1)).quo(2))-r1
|
124
|
+
@u1=r1-((@n1*(@n1+1)).quo(2))
|
125
|
+
@u2=r2-((@n2*(@n2+1)).quo(2))
|
126
|
+
@u=(u1<u2) ? u1 : u2
|
127
|
+
end
|
128
|
+
def summary
|
129
|
+
out=<<-HEREDOC
|
130
|
+
Mann-Whitney U
|
131
|
+
Sum of ranks v1: #{@r1.to_f}
|
132
|
+
Sum of ranks v1: #{@r2.to_f}
|
133
|
+
U Value: #{@u.to_f}
|
134
|
+
Z: #{sprintf("%0.3f",z)} (p: #{sprintf("%0.3f",z_probability)})
|
135
|
+
HEREDOC
|
136
|
+
if @n1*@n2<MAX_MN_EXACT
|
137
|
+
out+="Exact p (Dinneen & Blakesley): #{sprintf("%0.3f",exact_probability)}"
|
138
|
+
end
|
139
|
+
out
|
140
|
+
end
|
141
|
+
# Exact probability of finding values of U lower or equal to sample on U distribution. Use with caution with m*n>100000
|
142
|
+
# Reference: Dinneen & Blakesley (1973)
|
143
|
+
def exact_probability
|
144
|
+
dist=UMannWhitney.exact_probability_distribution_as62(@n1,@n2)
|
145
|
+
sum=0
|
146
|
+
(0..@u.to_i).each {|i|
|
147
|
+
sum+=dist[i]
|
148
|
+
}
|
149
|
+
sum
|
150
|
+
end
|
151
|
+
# Reference: http://europe.isixsigma.com/library/content/c080806a.asp
|
152
|
+
def adjust_for_ties(data)
|
153
|
+
@t=data.frequencies.find_all{|k,v| v>1}.inject(0) {|a,v|
|
154
|
+
a+(v[1]**3-v[1]).quo(12)
|
155
|
+
}
|
156
|
+
end
|
157
|
+
# Z value for U, with adjust for ties.
|
158
|
+
# For large samples, U is approximately normally distributed.
|
159
|
+
# In that case, you can use z to obtain the probability for U.
|
160
|
+
# Reference: SPSS Manual
|
161
|
+
def z
|
162
|
+
mu=(@n1*@n2).quo(2)
|
163
|
+
if(!@ties)
|
164
|
+
ou=Math::sqrt(((@n1*@n2)*(@n1+@n2+1)).quo(12))
|
165
|
+
else
|
166
|
+
n=@n1+@n2
|
167
|
+
first=(@n1*@n2).quo(n*(n-1))
|
168
|
+
second=((n**3-n).quo(12))-@t
|
169
|
+
ou=Math::sqrt(first*second)
|
170
|
+
end
|
171
|
+
(@u-mu).quo(ou)
|
172
|
+
end
|
173
|
+
# Assuming H_0, the proportion of cdf with values of U lower
|
174
|
+
# than the sample.
|
175
|
+
# Use with more than 30 cases per group.
|
176
|
+
def z_probability
|
177
|
+
(1-Distribution::Normal.cdf(z.abs()))*2
|
178
|
+
end
|
179
|
+
end
|
180
|
+
|
181
|
+
end
|
182
|
+
end
|
data/lib/statsample/vector.rb
CHANGED
@@ -1,3 +1,4 @@
|
|
1
|
+
require 'date'
|
1
2
|
class Array
|
2
3
|
# Creates a new Statsample::Vector object
|
3
4
|
# Argument should be equal to Vector.new
|
@@ -9,25 +10,24 @@ class Array
|
|
9
10
|
Statsample::Vector.new(self,:scale,*args)
|
10
11
|
end
|
11
12
|
end
|
12
|
-
|
13
13
|
module Statsample
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
14
|
+
class << self
|
15
|
+
# Create a matrix using vectors as columns.
|
16
|
+
# Use:
|
17
|
+
#
|
18
|
+
# matrix=Statsample.vector_cols_matrix(v1,v2)
|
19
|
+
def vector_cols_matrix(*vs)
|
20
|
+
# test
|
21
|
+
size=vs[0].size
|
22
|
+
vs.each{|v|
|
23
|
+
raise ArgumentError,"Arguments should be Vector" unless v.instance_of? Statsample::Vector
|
24
|
+
raise ArgumentError,"Vectors size should be the same" if v.size!=size
|
25
|
+
}
|
26
|
+
Matrix.rows((0...size).to_a.collect() {|i|
|
27
|
+
vs.collect{|v| v[i]}
|
28
|
+
})
|
29
|
+
end
|
30
|
+
end
|
31
31
|
# Returns a duplicate of the input vectors, without missing data
|
32
32
|
# for any of the vectors.
|
33
33
|
#
|
@@ -46,834 +46,873 @@ module Statsample
|
|
46
46
|
ds.vectors.values
|
47
47
|
end
|
48
48
|
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
49
|
+
class Vector
|
50
|
+
include Enumerable
|
51
|
+
include Writable
|
52
|
+
DEFAULT_OPTIONS={
|
53
|
+
:missing_values=>[],
|
54
|
+
:today_values=>['NOW','TODAY', :NOW, :TODAY],
|
55
|
+
:labels=>{}
|
56
|
+
}
|
57
|
+
# Level of measurement. Could be :nominal, :ordinal or :scale
|
58
|
+
attr_reader :type
|
59
|
+
# Original data.
|
60
|
+
attr_reader :data
|
61
|
+
# Valid data. Equal to data, minus values assigned as missing values
|
62
|
+
attr_reader :valid_data
|
63
|
+
# Array of values considered as missing. Nil is a missing value, by default
|
64
|
+
attr_reader :missing_values
|
65
|
+
# Array of values considered as "Today", with date type. "NOW", "TODAY", :NOW and :TODAY are 'today' values, by default
|
66
|
+
attr_reader :today_values
|
67
|
+
# Missing values array
|
68
|
+
attr_reader :missing_data
|
69
|
+
# Original data, with all missing values replaced by nils
|
70
|
+
attr_reader :data_with_nils
|
71
|
+
# Date data, with all missing values replaced by nils
|
72
|
+
attr_reader :date_data_with_nils
|
73
|
+
# GSL Object, only available with rbgsl extension and type==:scale
|
74
|
+
attr_reader :gsl
|
75
|
+
# Change label for specific values
|
76
|
+
attr_accessor :labels
|
77
|
+
# Creates a new Vector object.
|
78
|
+
# [data] Array of data.
|
79
|
+
# [type] Level of measurement. See Vector#type
|
80
|
+
# [opts] Options
|
81
|
+
# [:missing_values] Array of missing values. See Vector#missing_values
|
82
|
+
# [:today_values] Array of 'today' values. See Vector#today_values
|
83
|
+
# [:labels] Labels for data values
|
84
|
+
#
|
85
|
+
# The fast way to create a vector uses Array.to_vector or Array.to_scale.
|
86
|
+
#
|
87
|
+
# v=[1,2,3,4].to_vector(:scale)
|
88
|
+
# v=[1,2,3,4].to_scale
|
89
|
+
#
|
90
|
+
|
91
|
+
def initialize(data=[], t=:nominal, opts=Hash.new)
|
92
|
+
raise "Data should be an array" unless data.is_a? Array
|
93
|
+
@data=data
|
94
|
+
@type=t
|
95
|
+
opts=DEFAULT_OPTIONS.merge(opts)
|
96
|
+
@missing_values=opts[:missing_values]
|
97
|
+
@labels=opts[:labels]
|
98
|
+
@today_values=opts[:today_values]
|
99
|
+
@valid_data=[]
|
100
|
+
@data_with_nils=[]
|
101
|
+
@date_data_with_nils=[]
|
102
|
+
@missing_data=[]
|
103
|
+
@has_missing_data=nil
|
104
|
+
@scale_data=nil
|
105
|
+
set_valid_data_intern
|
106
|
+
self.type=t
|
107
|
+
end
|
108
|
+
# Creates a duplicate of the Vector.
|
109
|
+
# Note: data, missing_values and labels are duplicated, so
|
110
|
+
# changes on the original vector don't propagate to copies.
|
111
|
+
def dup
|
112
|
+
Vector.new(@data.dup,@type, :missing_values => @missing_values.dup, :labels => @labels.dup)
|
113
|
+
end
|
114
|
+
# Returns an empty duplicate of the vector. Maintains the type,
|
115
|
+
# missing values and labels.
|
116
|
+
def dup_empty
|
117
|
+
Vector.new([],@type, :missing_values => @missing_values.dup, :labels => @labels.dup)
|
118
|
+
end
|
119
|
+
# Raises an exception if type of vector is inferior to t type
|
120
|
+
def check_type(t)
|
121
|
+
raise NoMethodError if (t==:scale and @type!=:scale) or (t==:ordinal and @type==:nominal) or (t==:date)
|
122
|
+
end
|
123
|
+
private :check_type
|
124
|
+
|
125
|
+
# Return a vector using the standardized values for data
|
126
|
+
# with sd with denominator N
|
127
|
+
def vector_standarized_pop
|
128
|
+
vector_standarized(true)
|
129
|
+
end
|
130
|
+
# Return a vector using the standardized values for data
|
131
|
+
# with sd with denominator n-1
|
132
|
+
|
133
|
+
def vector_standarized(use_population=false)
|
134
|
+
raise "Should be a scale" unless @type==:scale
|
135
|
+
m=mean
|
136
|
+
sd=use_population ? sdp : sds
|
137
|
+
@data_with_nils.collect{|x|
|
138
|
+
if !x.nil?
|
139
|
+
(x.to_f - m).quo(sd)
|
140
|
+
else
|
141
|
+
nil
|
142
|
+
end
|
143
|
+
}.to_vector(:scale)
|
144
|
+
end
|
145
|
+
|
146
|
+
alias_method :standarized, :vector_standarized
|
147
|
+
|
148
|
+
def box_cox_transformation(lambda) # :nodoc:
|
149
|
+
raise "Should be a scale" unless @type==:scale
|
150
|
+
@data_with_nils.collect{|x|
|
151
|
+
if !x.nil?
|
152
|
+
if(lambda==0)
|
153
|
+
Math.log(x)
|
154
|
+
else
|
155
|
+
(x**lambda-1).quo(lambda)
|
156
|
+
end
|
157
|
+
else
|
158
|
+
nil
|
159
|
+
end
|
160
|
+
}.to_vector(:scale)
|
161
|
+
end
|
162
|
+
|
163
|
+
# Vector equality.
|
164
|
+
# Two vector will be the same if their data, missing values, type, labels are equals
|
165
|
+
def ==(v2)
|
166
|
+
raise TypeError,"Argument should be a Vector" unless v2.instance_of? Statsample::Vector
|
167
|
+
@data==v2.data and @missing_values==v2.missing_values and @type==v2.type and @labels=v2.labels
|
168
|
+
end
|
169
|
+
|
170
|
+
def _dump(i) # :nodoc:
|
171
|
+
Marshal.dump({'data'=>@data,'missing_values'=>@missing_values, 'labels'=>@labels, 'type'=>@type})
|
172
|
+
end
|
173
|
+
|
174
|
+
def self._load(data) # :nodoc:
|
175
|
+
h=Marshal.load(data)
|
176
|
+
Vector.new(h['data'], h['type'],:missing_values=> h['missing_values'], :labels=>h['labels'])
|
177
|
+
end
|
178
|
+
# Returns a new vector, with data modified by block.
|
179
|
+
# Equivalent to create a Vector after #collect on data
|
180
|
+
def recode
|
181
|
+
@data.collect{|x|
|
182
|
+
yield x
|
183
|
+
}.to_vector(@type)
|
184
|
+
end
|
185
|
+
# Modifies current vector, with data modified by block.
|
186
|
+
# Equivalent to #collect! on @data
|
187
|
+
def recode!
|
188
|
+
@data.collect!{|x|
|
189
|
+
yield x
|
55
190
|
}
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
|
119
|
-
|
120
|
-
|
121
|
-
|
122
|
-
|
123
|
-
|
124
|
-
|
125
|
-
|
126
|
-
|
127
|
-
|
128
|
-
|
129
|
-
|
130
|
-
|
131
|
-
|
132
|
-
|
133
|
-
|
134
|
-
|
135
|
-
|
136
|
-
|
137
|
-
|
138
|
-
|
139
|
-
|
140
|
-
|
141
|
-
|
142
|
-
|
143
|
-
|
144
|
-
|
145
|
-
|
146
|
-
|
147
|
-
|
148
|
-
|
149
|
-
|
191
|
+
set_valid_data
|
192
|
+
end
|
193
|
+
# Dichotomize the vector into 0 and 1, based on the lowest value
|
194
|
+
# If the parameter is defined, this value and lower
|
195
|
+
# will be 0 and higher, 1
|
196
|
+
def dichotomize(low=nil)
|
197
|
+
fs=factors
|
198
|
+
low||=factors.min
|
199
|
+
@data_with_nils.collect{|x|
|
200
|
+
if x.nil?
|
201
|
+
nil
|
202
|
+
elsif x>low
|
203
|
+
1
|
204
|
+
else
|
205
|
+
0
|
206
|
+
end
|
207
|
+
}.to_scale
|
208
|
+
end
|
209
|
+
# Iterate on each item.
|
210
|
+
# Equivalent to
|
211
|
+
# @data.each{|x| yield x}
|
212
|
+
def each
|
213
|
+
@data.each{|x| yield(x) }
|
214
|
+
end
|
215
|
+
|
216
|
+
# Iterate on each item, retrieving index
|
217
|
+
def each_index
|
218
|
+
(0...@data.size).each {|i|
|
219
|
+
yield(i)
|
220
|
+
}
|
221
|
+
end
|
222
|
+
# Add a value at the end of the vector.
|
223
|
+
# If the second argument is set to false, you should update the Vector using
|
224
|
+
# Vector.set_valid_data at the end of your insertion cycle
|
225
|
+
#
|
226
|
+
def add(v,update_valid=true)
|
227
|
+
@data.push(v)
|
228
|
+
set_valid_data if update_valid
|
229
|
+
end
|
230
|
+
# Update valid_data, missing_data, data_with_nils and gsl
|
231
|
+
# at the end of an insertion.
|
232
|
+
#
|
233
|
+
# Use after Vector.add(v,false)
|
234
|
+
# Usage:
|
235
|
+
# v=Statsample::Vector.new
|
236
|
+
# v.add(2,false)
|
237
|
+
# v.add(4,false)
|
238
|
+
# v.data
|
239
|
+
# => [2,4]
|
240
|
+
# v.valid_data
|
241
|
+
# => []
|
242
|
+
# v.set_valid_data
|
243
|
+
# v.valid_data
|
244
|
+
# => [2,4]
|
245
|
+
def set_valid_data
|
246
|
+
@valid_data.clear
|
247
|
+
@missing_data.clear
|
248
|
+
@data_with_nils.clear
|
249
|
+
@date_data_with_nils.clear
|
250
|
+
@gsl=nil
|
251
|
+
set_valid_data_intern
|
252
|
+
set_scale_data if(@type==:scale)
|
253
|
+
set_date_data if(@type==:date)
|
254
|
+
end
|
255
|
+
|
256
|
+
if Statsample::STATSAMPLE__.respond_to?(:set_valid_data_intern)
|
257
|
+
def set_valid_data_intern #:nodoc:
|
258
|
+
Statsample::STATSAMPLE__.set_valid_data_intern(self)
|
259
|
+
end
|
260
|
+
else
|
261
|
+
def set_valid_data_intern #:nodoc:
|
262
|
+
_set_valid_data_intern
|
263
|
+
end
|
264
|
+
end
|
265
|
+
def _set_valid_data_intern #:nodoc:
|
266
|
+
@data.each do |n|
|
267
|
+
if is_valid? n
|
268
|
+
@valid_data.push(n)
|
269
|
+
@data_with_nils.push(n)
|
270
|
+
else
|
271
|
+
@data_with_nils.push(nil)
|
272
|
+
@missing_data.push(n)
|
273
|
+
end
|
274
|
+
end
|
275
|
+
@has_missing_data=@missing_data.size>0
|
276
|
+
end
|
277
|
+
|
278
|
+
# Returns true if data has one or more missing values
|
279
|
+
def has_missing_data?
|
280
|
+
@has_missing_data
|
281
|
+
end
|
282
|
+
# Retrieves label for value x. Retrieves x if
|
283
|
+
# no label defined.
|
284
|
+
def labeling(x)
|
285
|
+
@labels.has_key?(x) ? @labels[x].to_s : x.to_s
|
286
|
+
end
|
287
|
+
# Returns a Vector with data with labels replaced by the label.
|
288
|
+
def vector_labeled
|
289
|
+
d=@data.collect{|x|
|
290
|
+
if @labels.has_key? x
|
291
|
+
@labels[x]
|
292
|
+
else
|
293
|
+
x
|
294
|
+
end
|
295
|
+
}
|
296
|
+
Vector.new(d,@type)
|
297
|
+
end
|
298
|
+
# Size of total data
|
299
|
+
def size
|
300
|
+
@data.size
|
301
|
+
end
|
302
|
+
alias_method :n, :size
|
303
|
+
|
304
|
+
# Retrieves i element of data
|
305
|
+
def [](i)
|
306
|
+
@data[i]
|
307
|
+
end
|
308
|
+
# Set i element of data.
|
309
|
+
# Note: Use set_valid_data if you include missing values
|
310
|
+
def []=(i,v)
|
311
|
+
@data[i]=v
|
312
|
+
end
|
313
|
+
# Return true if a value is valid (not nil and not included on missing values)
|
314
|
+
def is_valid?(x)
|
315
|
+
!(x.nil? or @missing_values.include? x)
|
316
|
+
end
|
317
|
+
# Set missing_values.
|
318
|
+
# if update_valid = false, you should use
|
319
|
+
# set_valid_data after all changes
|
320
|
+
def missing_values=(vals)
|
321
|
+
@missing_values = vals
|
322
|
+
set_valid_data
|
323
|
+
end
|
324
|
+
def today_values=(vals)
|
325
|
+
@today_values = vals
|
326
|
+
set_valid_data
|
327
|
+
end
|
328
|
+
# Set level of measurement.
|
329
|
+
def type=(t)
|
330
|
+
@type=t
|
331
|
+
set_scale_data if(t==:scale)
|
332
|
+
set_date_data if (t==:date)
|
333
|
+
end
|
334
|
+
def to_a
|
335
|
+
@data.dup
|
336
|
+
end
|
337
|
+
alias_method :to_ary, :to_a
|
338
|
+
|
339
|
+
# Vector sum.
|
340
|
+
# - If v is a scalar, add this value to all elements
|
341
|
+
# - If v is a Array or a Vector, should be of the same size of this vector
|
342
|
+
# every item of this vector will be added to the value of the
|
343
|
+
# item at the same position on the other vector
|
344
|
+
def +(v)
|
345
|
+
_vector_ari("+",v)
|
346
|
+
end
|
347
|
+
# Vector subtraction.
|
348
|
+
# - If v is a scalar, subtract this value from all elements
|
349
|
+
# - If v is a Array or a Vector, should be of the same
|
350
|
+
# size of this vector
|
351
|
+
# every item of this vector will be reduced by the value of the
|
352
|
+
# item at the same position on the other vector
|
353
|
+
|
354
|
+
def -(v)
|
355
|
+
_vector_ari("-",v)
|
356
|
+
end
|
357
|
+
# Reports all values that don't comply with a condition.
|
358
|
+
# Returns a hash with the index of data and the invalid data.
|
359
|
+
def verify
|
360
|
+
h={}
|
361
|
+
(0...@data.size).to_a.each{|i|
|
362
|
+
if !(yield @data[i])
|
363
|
+
h[i]=@data[i]
|
364
|
+
end
|
365
|
+
}
|
366
|
+
h
|
367
|
+
end
|
368
|
+
def _vector_ari(method,v) # :nodoc:
|
369
|
+
if(v.is_a? Vector or v.is_a? Array)
|
370
|
+
if v.size==@data.size
|
371
|
+
# i=0
|
372
|
+
sum=[]
|
373
|
+
0.upto(v.size-1) {|i|
|
374
|
+
if((v.is_a? Vector and v.is_valid?(v[i]) and is_valid?(@data[i])) or (v.is_a? Array and !v[i].nil? and !data[i].nil?))
|
375
|
+
sum.push(@data[i].send(method,v[i]))
|
376
|
+
else
|
377
|
+
sum.push(nil)
|
378
|
+
end
|
379
|
+
}
|
380
|
+
Statsample::Vector.new(sum)
|
381
|
+
else
|
382
|
+
raise ArgumentError, "The array/vector parameter should be of the same size of the original vector"
|
383
|
+
end
|
384
|
+
elsif(v.respond_to? method )
|
385
|
+
Statsample::Vector.new(
|
386
|
+
@data.collect {|x|
|
387
|
+
if(!x.nil?)
|
388
|
+
x.send(method,v)
|
389
|
+
else
|
150
390
|
nil
|
151
|
-
|
152
|
-
}.to_vector(:scale)
|
153
|
-
end
|
154
|
-
|
155
|
-
# Vector equality.
|
156
|
-
# Two vector will be the same if their data, missing values, type, labels are equals
|
157
|
-
def ==(v2)
|
158
|
-
raise TypeError,"Argument should be a Vector" unless v2.instance_of? Statsample::Vector
|
159
|
-
@data==v2.data and @missing_values==v2.missing_values and @type==v2.type and @labels=v2.labels
|
160
|
-
end
|
161
|
-
|
162
|
-
def _dump(i) # :nodoc:
|
163
|
-
Marshal.dump({'data'=>@data,'missing_values'=>@missing_values, 'labels'=>@labels, 'type'=>@type})
|
164
|
-
end
|
165
|
-
|
166
|
-
def self._load(data) # :nodoc:
|
167
|
-
h=Marshal.load(data)
|
168
|
-
Vector.new(h['data'], h['type'],:missing_values=> h['missing_values'], :labels=>h['labels'])
|
169
|
-
end
|
170
|
-
# Returns a new vector, with data modified by block.
|
171
|
-
# Equivalent to create a Vector after #collect on data
|
172
|
-
def recode
|
173
|
-
@data.collect{|x|
|
174
|
-
yield x
|
175
|
-
}.to_vector(@type)
|
176
|
-
end
|
177
|
-
# Modifies current vector, with data modified by block.
|
178
|
-
# Equivalent to #collect! on @data
|
179
|
-
def recode!
|
180
|
-
@data.collect!{|x|
|
181
|
-
yield x
|
182
|
-
}
|
183
|
-
set_valid_data
|
184
|
-
end
|
185
|
-
# Dicotomize the vector with 0 and 1, based on lowest value
|
186
|
-
# If parameter if defined, this value and lower
|
187
|
-
# will be 0 and higher, 1
|
188
|
-
def dichotomize(low=nil)
|
189
|
-
fs=factors
|
190
|
-
low||=factors.min
|
191
|
-
@data_with_nils.collect{|x|
|
192
|
-
if x.nil?
|
193
|
-
nil
|
194
|
-
elsif x>low
|
195
|
-
1
|
196
|
-
else
|
197
|
-
0
|
198
|
-
end
|
199
|
-
}.to_scale
|
200
|
-
end
|
201
|
-
# Iterate on each item.
|
202
|
-
# Equivalent to
|
203
|
-
# @data.each{|x| yield x}
|
204
|
-
def each
|
205
|
-
@data.each{|x| yield(x) }
|
206
|
-
end
|
207
|
-
|
208
|
-
# Iterate on each item, retrieving index
|
209
|
-
def each_index
|
210
|
-
(0...@data.size).each {|i|
|
211
|
-
yield(i)
|
391
|
+
end
|
212
392
|
}
|
213
|
-
|
214
|
-
# Add a value at the end of the vector.
|
215
|
-
# If second argument set to false, you should update the Vector usign
|
216
|
-
# Vector.set_valid_data at the end of your insertion cycle
|
217
|
-
#
|
218
|
-
def add(v,update_valid=true)
|
219
|
-
@data.push(v)
|
220
|
-
set_valid_data if update_valid
|
221
|
-
end
|
222
|
-
# Update valid_data, missing_data, data_with_nils and gsl
|
223
|
-
# at the end of an insertion.
|
224
|
-
#
|
225
|
-
# Use after Vector.add(v,false)
|
226
|
-
# Usage:
|
227
|
-
# v=Statsample::Vector.new
|
228
|
-
# v.add(2,false)
|
229
|
-
# v.add(4,false)
|
230
|
-
# v.data
|
231
|
-
# => [2,3]
|
232
|
-
# v.valid_data
|
233
|
-
# => []
|
234
|
-
# v.set_valid_data
|
235
|
-
# v.valid_data
|
236
|
-
# => [2,3]
|
237
|
-
def set_valid_data
|
238
|
-
@valid_data.clear
|
239
|
-
@missing_data.clear
|
240
|
-
@data_with_nils.clear
|
241
|
-
@gsl=nil
|
242
|
-
set_valid_data_intern
|
243
|
-
set_scale_data if(@type==:scale)
|
244
|
-
end
|
245
|
-
|
246
|
-
if Statsample::STATSAMPLE__.respond_to?(:set_valid_data_intern)
|
247
|
-
def set_valid_data_intern #:nodoc:
|
248
|
-
Statsample::STATSAMPLE__.set_valid_data_intern(self)
|
249
|
-
end
|
393
|
+
)
|
250
394
|
else
|
251
|
-
|
252
|
-
_set_valid_data_intern
|
253
|
-
end
|
254
|
-
end
|
255
|
-
def _set_valid_data_intern #:nodoc:
|
256
|
-
@data.each do |n|
|
257
|
-
if is_valid? n
|
258
|
-
@valid_data.push(n)
|
259
|
-
@data_with_nils.push(n)
|
260
|
-
else
|
261
|
-
@data_with_nils.push(nil)
|
262
|
-
@missing_data.push(n)
|
263
|
-
end
|
264
|
-
end
|
265
|
-
@has_missing_data=@missing_data.size>0
|
395
|
+
raise TypeError,"You should pass a scalar or a array/vector"
|
266
396
|
end
|
267
|
-
|
268
|
-
# Retrieves true if data has one o more missing values
|
269
|
-
def has_missing_data?
|
270
|
-
@has_missing_data
|
271
|
-
end
|
272
|
-
# Retrieves label for value x. Retrieves x if
|
273
|
-
# no label defined.
|
274
|
-
def labeling(x)
|
275
|
-
@labels.has_key?(x) ? @labels[x].to_s : x.to_s
|
276
|
-
end
|
277
|
-
# Returns a Vector with data with labels replaced by the label.
|
278
|
-
def vector_labeled
|
279
|
-
d=@data.collect{|x|
|
280
|
-
if @labels.has_key? x
|
281
|
-
@labels[x]
|
282
|
-
else
|
283
|
-
x
|
284
|
-
end
|
285
|
-
}
|
286
|
-
Vector.new(d,@type)
|
287
|
-
end
|
288
|
-
# Size of total data
|
289
|
-
def size
|
290
|
-
@data.size
|
291
|
-
end
|
292
|
-
alias_method :n, :size
|
293
|
-
|
294
|
-
# Retrieves i element of data
|
295
|
-
def [](i)
|
296
|
-
@data[i]
|
297
|
-
end
|
298
|
-
# Set i element of data.
|
299
|
-
# Note: Use set_valid_data if you include missing values
|
300
|
-
def []=(i,v)
|
301
|
-
@data[i]=v
|
302
|
-
end
|
303
|
-
# Return true if a value is valid (not nil and not included on missing values)
|
304
|
-
def is_valid?(x)
|
305
|
-
!(x.nil? or @missing_values.include? x)
|
306
|
-
end
|
307
|
-
# Set missing_values.
|
308
|
-
# if update_valid = false, you should use
|
309
|
-
# set_valid_data after all changes
|
310
|
-
def missing_values=(vals)
|
311
|
-
@missing_values = vals
|
312
|
-
set_valid_data
|
313
|
-
end
|
314
|
-
# Set level of measurement.
|
315
|
-
def type=(t)
|
316
|
-
@type=t
|
317
|
-
set_scale_data if(t==:scale)
|
318
|
-
end
|
319
|
-
def to_a
|
320
|
-
@data.dup
|
321
|
-
end
|
322
|
-
alias_method :to_ary, :to_a
|
323
|
-
|
324
|
-
# Vector sum.
|
325
|
-
# - If v is a scalar, add this value to all elements
|
326
|
-
# - If v is a Array or a Vector, should be of the same size of this vector
|
327
|
-
# every item of this vector will be added to the value of the
|
328
|
-
# item at the same position on the other vector
|
329
|
-
def +(v)
|
330
|
-
_vector_ari("+",v)
|
331
|
-
end
|
332
|
-
# Vector rest.
|
333
|
-
# - If v is a scalar, rest this value to all elements
|
334
|
-
# - If v is a Array or a Vector, should be of the same
|
335
|
-
# size of this vector
|
336
|
-
# every item of this vector will be rested to the value of the
|
337
|
-
# item at the same position on the other vector
|
338
397
|
|
339
|
-
|
340
|
-
|
341
|
-
|
342
|
-
|
343
|
-
|
344
|
-
|
345
|
-
|
346
|
-
|
347
|
-
|
348
|
-
|
349
|
-
|
350
|
-
|
351
|
-
|
352
|
-
|
353
|
-
|
354
|
-
|
355
|
-
|
356
|
-
#
|
357
|
-
|
358
|
-
|
359
|
-
|
360
|
-
|
361
|
-
|
362
|
-
|
363
|
-
|
364
|
-
|
365
|
-
|
366
|
-
|
367
|
-
|
368
|
-
|
369
|
-
|
370
|
-
|
371
|
-
|
372
|
-
|
373
|
-
|
374
|
-
|
375
|
-
|
376
|
-
|
377
|
-
|
378
|
-
)
|
379
|
-
else
|
380
|
-
raise TypeError,"You should pass a scalar or a array/vector"
|
381
|
-
end
|
382
|
-
|
383
|
-
end
|
384
|
-
# Return an array with the data splitted by a separator.
|
385
|
-
# a=Vector.new(["a,b","c,d","a,b","d"])
|
386
|
-
# a.splitted
|
387
|
-
# =>
|
388
|
-
# [["a","b"],["c","d"],["a","b"],["d"]]
|
389
|
-
def splitted(sep=Statsample::SPLIT_TOKEN)
|
390
|
-
@data.collect{|x|
|
391
|
-
if x.nil?
|
392
|
-
nil
|
393
|
-
elsif (x.respond_to? :split)
|
394
|
-
x.split(sep)
|
395
|
-
else
|
396
|
-
[x]
|
397
|
-
end
|
398
|
-
}
|
399
|
-
end
|
400
|
-
# Returns a hash of Vectors, defined by the different values
|
401
|
-
# defined on the fields
|
402
|
-
# Example:
|
403
|
-
#
|
404
|
-
# a=Vector.new(["a,b","c,d","a,b"])
|
405
|
-
# a.split_by_separator
|
406
|
-
# => {"a"=>#<Statsample::Type::Nominal:0x7f2dbcc09d88
|
407
|
-
# @data=[1, 0, 1]>,
|
408
|
-
# "b"=>#<Statsample::Type::Nominal:0x7f2dbcc09c48
|
409
|
-
# @data=[1, 1, 0]>,
|
410
|
-
# "c"=>#<Statsample::Type::Nominal:0x7f2dbcc09b08
|
411
|
-
# @data=[0, 1, 1]>}
|
412
|
-
#
|
413
|
-
def split_by_separator(sep=Statsample::SPLIT_TOKEN)
|
414
|
-
split_data=splitted(sep)
|
415
|
-
factors=split_data.flatten.uniq.compact
|
416
|
-
out=factors.inject({}) {|a,x|
|
417
|
-
a[x]=[]
|
418
|
-
a
|
419
|
-
}
|
420
|
-
split_data.each{|r|
|
421
|
-
if r.nil?
|
422
|
-
factors.each{|f|
|
423
|
-
out[f].push(nil)
|
424
|
-
}
|
425
|
-
else
|
398
|
+
end
|
399
|
+
# Return an array with the data split by a separator.
|
400
|
+
# a=Vector.new(["a,b","c,d","a,b","d"])
|
401
|
+
# a.splitted
|
402
|
+
# =>
|
403
|
+
# [["a","b"],["c","d"],["a","b"],["d"]]
|
404
|
+
def splitted(sep=Statsample::SPLIT_TOKEN)
|
405
|
+
@data.collect{|x|
|
406
|
+
if x.nil?
|
407
|
+
nil
|
408
|
+
elsif (x.respond_to? :split)
|
409
|
+
x.split(sep)
|
410
|
+
else
|
411
|
+
[x]
|
412
|
+
end
|
413
|
+
}
|
414
|
+
end
|
415
|
+
# Returns a hash of Vectors, defined by the different values
|
416
|
+
# defined on the fields
|
417
|
+
# Example:
|
418
|
+
#
|
419
|
+
# a=Vector.new(["a,b","c,d","a,b"])
|
420
|
+
# a.split_by_separator
|
421
|
+
# => {"a"=>#<Statsample::Type::Nominal:0x7f2dbcc09d88
|
422
|
+
# @data=[1, 0, 1]>,
|
423
|
+
# "b"=>#<Statsample::Type::Nominal:0x7f2dbcc09c48
|
424
|
+
# @data=[1, 1, 0]>,
|
425
|
+
# "c"=>#<Statsample::Type::Nominal:0x7f2dbcc09b08
|
426
|
+
# @data=[0, 1, 1]>}
|
427
|
+
#
|
428
|
+
def split_by_separator(sep=Statsample::SPLIT_TOKEN)
|
429
|
+
split_data=splitted(sep)
|
430
|
+
factors=split_data.flatten.uniq.compact
|
431
|
+
out=factors.inject({}) {|a,x|
|
432
|
+
a[x]=[]
|
433
|
+
a
|
434
|
+
}
|
435
|
+
split_data.each{|r|
|
436
|
+
if r.nil?
|
426
437
|
factors.each{|f|
|
427
|
-
out[f].push(
|
438
|
+
out[f].push(nil)
|
428
439
|
}
|
429
|
-
|
430
|
-
|
431
|
-
|
432
|
-
s[v[0]]=Vector.new(v[1],:nominal)
|
433
|
-
s
|
440
|
+
else
|
441
|
+
factors.each{|f|
|
442
|
+
out[f].push(r.include?(f) ? 1:0)
|
434
443
|
}
|
444
|
+
end
|
445
|
+
}
|
446
|
+
out.inject({}){|s,v|
|
447
|
+
s[v[0]]=Vector.new(v[1],:nominal)
|
448
|
+
s
|
449
|
+
}
|
450
|
+
end
|
451
|
+
def split_by_separator_freq(sep=Statsample::SPLIT_TOKEN)
|
452
|
+
split_by_separator(sep).inject({}) {|a,v|
|
453
|
+
a[v[0]]=v[1].inject {|s,x| s+x.to_i}
|
454
|
+
a
|
455
|
+
}
|
456
|
+
end
|
457
|
+
|
458
|
+
# Returns a random sample of size n, with replacement,
|
459
|
+
# only with valid data.
|
460
|
+
#
|
461
|
+
# In all the trials, every item has the same probability
|
462
|
+
# of being selected.
|
463
|
+
def sample_with_replacement(sample=1)
|
464
|
+
if(@type!=:scale or !HAS_GSL)
|
465
|
+
vds=@valid_data.size
|
466
|
+
(0...sample).collect{ @valid_data[rand(vds)] }
|
467
|
+
else
|
468
|
+
r = GSL::Rng.alloc(GSL::Rng::MT19937,rand(10000))
|
469
|
+
r.sample(@gsl, sample).to_a
|
470
|
+
end
|
471
|
+
end
|
472
|
+
# Returns a random sample of size n, without replacement,
|
473
|
+
# only with valid data.
|
474
|
+
#
|
475
|
+
# Every element could only be selected once.
|
476
|
+
#
|
477
|
+
# A sample of the same size of the vector is the vector itself.
|
478
|
+
|
479
|
+
def sample_without_replacement(sample=1)
|
480
|
+
if(@type!=:scale or !HAS_GSL)
|
481
|
+
raise ArgumentError, "Sample size couldn't be greater than n" if sample>@valid_data.size
|
482
|
+
out=[]
|
483
|
+
size=@valid_data.size
|
484
|
+
while out.size<sample
|
485
|
+
value=rand(size)
|
486
|
+
out.push(value) if !out.include?value
|
487
|
+
end
|
488
|
+
out.collect{|i|@data[i]}
|
489
|
+
else
|
490
|
+
r = GSL::Rng.alloc(GSL::Rng::MT19937,rand(10000))
|
491
|
+
r.choose(@gsl, sample).to_a
|
492
|
+
end
|
493
|
+
end
|
494
|
+
# Retrieves the number of cases which satisfy a condition.
|
495
|
+
# If block given, retrieves number of instances where
|
496
|
+
# block returns true.
|
497
|
+
# If other values given, retrieves the frequency for
|
498
|
+
# this value.
|
499
|
+
def count(x=false)
|
500
|
+
if block_given?
|
501
|
+
r=@data.inject(0) {|s, i|
|
502
|
+
r=yield i
|
503
|
+
s+(r ? 1 : 0)
|
504
|
+
}
|
505
|
+
r.nil? ? 0 : r
|
506
|
+
else
|
507
|
+
frequencies[x].nil? ? 0 : frequencies[x]
|
508
|
+
end
|
509
|
+
end
|
510
|
+
|
511
|
+
# Returns the database type for the vector, according to its content
|
512
|
+
|
513
|
+
def db_type(dbs='mysql')
|
514
|
+
# first, detect any character not number
|
515
|
+
if @data.find {|v| v.to_s=~/\d{2,2}-\d{2,2}-\d{4,4}/} or @data.find {|v| v.to_s=~/\d{4,4}-\d{2,2}-\d{2,2}/}
|
516
|
+
return "DATE"
|
517
|
+
elsif @data.find {|v| v.to_s=~/[^0-9e.-]/ }
|
518
|
+
return "VARCHAR (255)"
|
519
|
+
elsif @data.find {|v| v.to_s=~/\./}
|
520
|
+
return "DOUBLE"
|
521
|
+
else
|
522
|
+
return "INTEGER"
|
523
|
+
end
|
524
|
+
end
|
525
|
+
# Return true if all data is Date, "today" values or nil
|
526
|
+
def can_be_date?
|
527
|
+
if @data.find {|v|
|
528
|
+
!v.nil? and !v.is_a? Date and !v.is_a? Time and (v.is_a? String and !@today_values.include? v) and (v.is_a? String and !(v=~/\d{4,4}[-\/]\d{1,2}[-\/]\d{1,2}/))}
|
529
|
+
false
|
530
|
+
else
|
531
|
+
true
|
435
532
|
end
|
436
|
-
|
437
|
-
|
438
|
-
|
533
|
+
end
|
534
|
+
# Return true if all data is Numeric or nil
|
535
|
+
def can_be_scale?
|
536
|
+
if @data.find {|v| !v.nil? and !v.is_a? Numeric and !@missing_values.include? v}
|
537
|
+
false
|
538
|
+
else
|
539
|
+
true
|
540
|
+
end
|
541
|
+
end
|
542
|
+
|
543
|
+
def to_s
|
544
|
+
sprintf("Vector(type:%s, n:%d)[%s]",@type.to_s,@data.size, @data.collect{|d| d.nil? ? "nil":d}.join(","))
|
545
|
+
end
|
546
|
+
# Ugly name. Really, creates a Matrix (from the standard 'matrix' package) out of the vector data.
|
547
|
+
# <tt>dir</tt> could be :horizontal or :vertical
|
548
|
+
def to_matrix(dir=:horizontal)
|
549
|
+
case dir
|
550
|
+
when :horizontal
|
551
|
+
Matrix[@data]
|
552
|
+
when :vertical
|
553
|
+
Matrix.columns([@data])
|
554
|
+
end
|
555
|
+
end
|
556
|
+
def inspect
|
557
|
+
self.to_s
|
558
|
+
end
|
559
|
+
# Retrieves unique values for data.
|
560
|
+
def factors
|
561
|
+
if @type==:scale
|
562
|
+
@scale_data.uniq.sort
|
563
|
+
elsif @type==:date
|
564
|
+
@date_data_with_nils.uniq.sort
|
565
|
+
else
|
566
|
+
@valid_data.uniq.sort
|
567
|
+
end
|
568
|
+
end
|
569
|
+
if Statsample::STATSAMPLE__.respond_to?(:frequencies)
|
570
|
+
# Returns a hash with the distribution of frequencies for
|
571
|
+
# the sample
|
572
|
+
def frequencies
|
573
|
+
Statsample::STATSAMPLE__.frequencies(@valid_data)
|
574
|
+
end
|
575
|
+
else
|
576
|
+
def frequencies #:nodoc:
|
577
|
+
_frequencies
|
578
|
+
end
|
579
|
+
end
|
580
|
+
def _frequencies #:nodoc:
|
581
|
+
@valid_data.inject(Hash.new) {|a,x|
|
582
|
+
a[x]||=0
|
583
|
+
a[x]=a[x]+1
|
584
|
+
a
|
585
|
+
}
|
586
|
+
end
|
587
|
+
# Plot frequencies on a chart, using gnuplot
|
588
|
+
def plot_frequencies
|
589
|
+
require 'gnuplot'
|
590
|
+
x=[]
|
591
|
+
y=[]
|
592
|
+
self.frequencies.sort.each{|k,v|
|
593
|
+
x.push(k)
|
594
|
+
y.push(v)
|
595
|
+
}
|
596
|
+
Gnuplot.open do |gp|
|
597
|
+
Gnuplot::Plot.new( gp ) do |plot|
|
598
|
+
plot.boxwidth("0.9 absolute")
|
599
|
+
plot.yrange("[0:#{y.max}]")
|
600
|
+
plot.style("fill solid 1.00 border -1")
|
601
|
+
plot.set("xtics border in scale 1,0.5 nomirror rotate by -45 offset character 0, 0, 0")
|
602
|
+
plot.style("histogram")
|
603
|
+
plot.style("data histogram")
|
604
|
+
i=-1
|
605
|
+
plot.set("xtics","("+x.collect{|v| i+=1; sprintf("\"%s\" %d",v,i)}.join(",")+")")
|
606
|
+
plot.data << Gnuplot::DataSet.new( [y] ) do |ds|
|
607
|
+
end
|
608
|
+
end
|
609
|
+
end
|
610
|
+
|
611
|
+
end
|
612
|
+
|
613
|
+
|
614
|
+
# Returns the most frequent item.
|
615
|
+
def mode
|
616
|
+
frequencies.max{|a,b| a[1]<=>b[1]}[0]
|
617
|
+
end
|
618
|
+
# The number of items with valid data.
|
619
|
+
def n_valid
|
620
|
+
@valid_data.size
|
621
|
+
end
|
622
|
+
# Returns a hash with the distribution of proportions of
|
623
|
+
# the sample.
|
624
|
+
def proportions
|
625
|
+
frequencies.inject({}){|a,v|
|
626
|
+
a[v[0]] = v[1].quo(n_valid)
|
439
627
|
a
|
440
628
|
}
|
441
629
|
end
|
442
|
-
|
443
|
-
|
444
|
-
|
445
|
-
|
446
|
-
|
447
|
-
|
448
|
-
|
449
|
-
|
450
|
-
|
451
|
-
|
452
|
-
|
453
|
-
|
454
|
-
|
455
|
-
|
456
|
-
|
457
|
-
# Returns a random sample of size n, without replacement,
|
458
|
-
# only with valid data.
|
459
|
-
#
|
460
|
-
# Every element could only be selected once.
|
461
|
-
#
|
462
|
-
# A sample of the same size of the vector is the vector itself.
|
463
|
-
|
464
|
-
def sample_without_replacement(sample=1)
|
465
|
-
if(@type!=:scale or !HAS_GSL)
|
466
|
-
raise ArgumentError, "Sample size couldn't be greater than n" if sample>@valid_data.size
|
467
|
-
out=[]
|
468
|
-
size=@valid_data.size
|
469
|
-
while out.size<sample
|
470
|
-
value=rand(size)
|
471
|
-
out.push(value) if !out.include?value
|
472
|
-
end
|
473
|
-
out.collect{|i|@data[i]}
|
474
|
-
else
|
475
|
-
r = GSL::Rng.alloc(GSL::Rng::MT19937,rand(10000))
|
476
|
-
r.choose(@gsl, sample).to_a
|
630
|
+
# Proportion of a given value.
|
631
|
+
def proportion(v=1)
|
632
|
+
frequencies[v].quo(@valid_data.size)
|
633
|
+
end
|
634
|
+
def summary(out="")
|
635
|
+
out << sprintf("n valid:%d\n",n_valid)
|
636
|
+
out << sprintf("factors:%s\n",factors.join(","))
|
637
|
+
out << "mode:"+mode.to_s+"\n"
|
638
|
+
out << "Distribution:\n"
|
639
|
+
frequencies.sort.each{|k,v|
|
640
|
+
key=labels.has_key?(k) ? labels[k]:k
|
641
|
+
out << sprintf("%s : %s (%0.2f%%)\n",key,v, (v.quo(n_valid))*100)
|
642
|
+
}
|
643
|
+
if(@type==:ordinal)
|
644
|
+
out << "median:"+median.to_s+"\n"
|
477
645
|
end
|
478
|
-
|
479
|
-
|
480
|
-
|
481
|
-
|
482
|
-
# If other values given, retrieves the frequency for
|
483
|
-
# this value.
|
484
|
-
def count(x=false)
|
485
|
-
if block_given?
|
486
|
-
r=@data.inject(0) {|s, i|
|
487
|
-
r=yield i
|
488
|
-
s+(r ? 1 : 0)
|
489
|
-
}
|
490
|
-
r.nil? ? 0 : r
|
491
|
-
else
|
492
|
-
frequencies[x].nil? ? 0 : frequencies[x]
|
646
|
+
if(@type==:scale)
|
647
|
+
out << "mean:"+mean.to_s+"\n"
|
648
|
+
out << "sd:"+sd.to_s+"\n"
|
649
|
+
|
493
650
|
end
|
651
|
+
out
|
494
652
|
end
|
495
|
-
|
496
|
-
|
497
|
-
|
498
|
-
|
499
|
-
|
500
|
-
|
501
|
-
|
502
|
-
|
503
|
-
|
504
|
-
|
505
|
-
|
506
|
-
|
507
|
-
|
653
|
+
|
654
|
+
# Variance of p, according to population size
|
655
|
+
def variance_proportion(n_poblation, v=1)
|
656
|
+
Statsample::proportion_variance_sample(self.proportion(v), @valid_data.size, n_poblation)
|
657
|
+
end
|
658
|
+
# Variance of the total, according to population size
|
659
|
+
def variance_total(n_poblation, v=1)
|
660
|
+
Statsample::total_variance_sample(self.proportion(v), @valid_data.size, n_poblation)
|
661
|
+
end
|
662
|
+
def proportion_confidence_interval_t(n_poblation,margin=0.95,v=1)
|
663
|
+
Statsample::proportion_confidence_interval_t(proportion(v), @valid_data.size, n_poblation, margin)
|
664
|
+
end
|
665
|
+
def proportion_confidence_interval_z(n_poblation,margin=0.95,v=1)
|
666
|
+
Statsample::proportion_confidence_interval_z(proportion(v), @valid_data.size, n_poblation, margin)
|
667
|
+
end
|
668
|
+
|
669
|
+
self.instance_methods.find_all{|met| met=~/_slow$/}.each do |met|
|
670
|
+
met_or=met.gsub("_slow","")
|
671
|
+
if !self.method_defined?(met_or)
|
672
|
+
alias_method met_or, met
|
673
|
+
end
|
674
|
+
end
|
675
|
+
######
|
676
|
+
### Ordinal Methods
|
677
|
+
######
|
678
|
+
|
679
|
+
# Return the value of the percentile q
|
680
|
+
def percentil(q)
|
681
|
+
check_type :ordinal
|
682
|
+
sorted=@valid_data.sort
|
683
|
+
v= (n_valid * q).quo(100)
|
684
|
+
if(v.to_i!=v)
|
685
|
+
sorted[v.to_i]
|
686
|
+
else
|
687
|
+
(sorted[(v-0.5).to_i].to_f + sorted[(v+0.5).to_i]).quo(2)
|
688
|
+
end
|
689
|
+
end
|
690
|
+
# Returns a ranked vector.
|
691
|
+
def ranked(type=:ordinal)
|
692
|
+
check_type :ordinal
|
693
|
+
i=0
|
694
|
+
r=frequencies.sort.inject({}){|a,v|
|
695
|
+
a[v[0]]=(i+1 + i+v[1]).quo(2)
|
696
|
+
i+=v[1]
|
697
|
+
a
|
698
|
+
}
|
699
|
+
@data.collect {|c| r[c] }.to_vector(type)
|
700
|
+
end
|
701
|
+
# Return the median (percentil 50)
|
702
|
+
def median
|
703
|
+
check_type :ordinal
|
704
|
+
if HAS_GSL and @type==:scale
|
705
|
+
sorted=GSL::Vector.alloc(@scale_data.sort)
|
706
|
+
GSL::Stats::median_from_sorted_data(sorted)
|
707
|
+
else
|
708
|
+
percentil(50)
|
709
|
+
end
|
710
|
+
end
|
711
|
+
# Minimum value
|
712
|
+
def min;
|
713
|
+
check_type :ordinal
|
714
|
+
@valid_data.min;
|
715
|
+
end
|
716
|
+
# Maximum value
|
717
|
+
def max;
|
718
|
+
check_type :ordinal
|
719
|
+
@valid_data.max;
|
720
|
+
end
|
721
|
+
def set_date_data # :nodoc:
|
722
|
+
@date_data_with_nils=@data.collect do|x|
|
723
|
+
if x.is_a? Date
|
724
|
+
x
|
725
|
+
elsif x.is_a? Time
|
726
|
+
Date.new(x.year, x.month, x.day)
|
727
|
+
elsif x.is_a? String and x=~/(\d{4,4})[-\/](\d{1,2})[-\/](\d{1,2})/
|
728
|
+
Date.new($1.to_i,$2.to_i,$3.to_i)
|
729
|
+
elsif @today_values.include? x
|
730
|
+
Date.today()
|
731
|
+
elsif @missing_values.include? x or x.nil?
|
732
|
+
nil
|
508
733
|
end
|
734
|
+
end
|
509
735
|
end
|
510
|
-
|
511
|
-
|
512
|
-
if
|
513
|
-
|
736
|
+
def set_scale_data # :nodoc
|
737
|
+
@scale_data=@valid_data.collect do|x|
|
738
|
+
if x.is_a? Numeric
|
739
|
+
x
|
740
|
+
elsif x.is_a? String and x.to_i==x.to_f
|
741
|
+
x.to_i
|
514
742
|
else
|
515
|
-
|
516
|
-
end
|
743
|
+
x.to_f
|
744
|
+
end
|
745
|
+
end
|
746
|
+
if HAS_GSL
|
747
|
+
@gsl=GSL::Vector.alloc(@scale_data) if @scale_data.size>0
|
748
|
+
end
|
749
|
+
end
|
750
|
+
private :set_scale_data
|
751
|
+
|
752
|
+
# The range of the data (max - min)
|
753
|
+
def range;
|
754
|
+
check_type :scale
|
755
|
+
@scale_data.max - @scale_data.min
|
756
|
+
end
|
757
|
+
# The sum of values for the data
|
758
|
+
def sum
|
759
|
+
check_type :scale
|
760
|
+
@scale_data.inject(0){|a,x|x+a} ;
|
761
|
+
end
|
762
|
+
# The arithmetical mean of data
|
763
|
+
def mean
|
764
|
+
check_type :scale
|
765
|
+
sum.to_f.quo(n_valid)
|
766
|
+
end
|
767
|
+
# Sum of squares for the data around a value.
|
768
|
+
# By default, this value is the mean
|
769
|
+
# ss= sum{(xi-m)^2}
|
770
|
+
#
|
771
|
+
def sum_of_squares(m=nil)
|
772
|
+
check_type :scale
|
773
|
+
m||=mean
|
774
|
+
@scale_data.inject(0){|a,x| a+(x-m).square}
|
517
775
|
end
|
518
776
|
|
519
|
-
|
520
|
-
|
521
|
-
|
522
|
-
|
523
|
-
# <tt>dir</tt> could be :horizontal or :vertical
|
524
|
-
def to_matrix(dir=:horizontal)
|
525
|
-
case dir
|
526
|
-
when :horizontal
|
527
|
-
Matrix[@data]
|
528
|
-
when :vertical
|
529
|
-
Matrix.columns([@data])
|
530
|
-
end
|
531
|
-
end
|
532
|
-
def inspect
|
533
|
-
self.to_s
|
777
|
+
# Sum of squared deviation
|
778
|
+
def sum_of_squared_deviation
|
779
|
+
check_type :scale
|
780
|
+
@scale_data.inject(0) {|a,x| x.square+a} - (sum.square.quo(n_valid))
|
534
781
|
end
|
535
|
-
|
536
|
-
|
537
|
-
|
538
|
-
|
539
|
-
|
540
|
-
|
541
|
-
|
782
|
+
|
783
|
+
# Population variance (denominator N)
|
784
|
+
def variance_population(m=nil)
|
785
|
+
check_type :scale
|
786
|
+
m||=mean
|
787
|
+
squares=@scale_data.inject(0){|a,x| x.square+a}
|
788
|
+
squares.quo(n_valid) - m.square
|
542
789
|
end
|
543
|
-
|
544
|
-
|
545
|
-
|
546
|
-
|
547
|
-
|
548
|
-
|
549
|
-
else
|
550
|
-
def frequencies #:nodoc:
|
551
|
-
_frequencies
|
552
|
-
end
|
790
|
+
|
791
|
+
|
792
|
+
# Population Standard deviation (denominator N)
|
793
|
+
def standard_deviation_population(m=nil)
|
794
|
+
check_type :scale
|
795
|
+
Math::sqrt( variance_population(m) )
|
553
796
|
end
|
554
|
-
|
555
|
-
|
556
|
-
|
557
|
-
|
558
|
-
|
559
|
-
|
797
|
+
# Sample Variance (denominator n-1)
|
798
|
+
|
799
|
+
def variance_sample(m=nil)
|
800
|
+
check_type :scale
|
801
|
+
m||=mean
|
802
|
+
sum_of_squares(m).quo(n_valid - 1)
|
560
803
|
end
|
561
|
-
# Plot frequencies on a chart, using gnuplot
|
562
|
-
def plot_frequencies
|
563
|
-
require 'gnuplot'
|
564
|
-
x=[]
|
565
|
-
y=[]
|
566
|
-
self.frequencies.sort.each{|k,v|
|
567
|
-
x.push(k)
|
568
|
-
y.push(v)
|
569
|
-
}
|
570
|
-
Gnuplot.open do |gp|
|
571
|
-
Gnuplot::Plot.new( gp ) do |plot|
|
572
|
-
plot.boxwidth("0.9 absolute")
|
573
|
-
plot.yrange("[0:#{y.max}]")
|
574
|
-
plot.style("fill solid 1.00 border -1")
|
575
|
-
plot.set("xtics border in scale 1,0.5 nomirror rotate by -45 offset character 0, 0, 0")
|
576
|
-
plot.style("histogram")
|
577
|
-
plot.style("data histogram")
|
578
|
-
i=-1
|
579
|
-
plot.set("xtics","("+x.collect{|v| i+=1; sprintf("\"%s\" %d",v,i)}.join(",")+")")
|
580
|
-
plot.data << Gnuplot::DataSet.new( [y] ) do |ds|
|
581
|
-
end
|
582
|
-
end
|
583
|
-
end
|
584
804
|
|
585
|
-
|
586
|
-
|
587
|
-
|
588
|
-
|
589
|
-
def mode
|
590
|
-
frequencies.max{|a,b| a[1]<=>b[1]}[0]
|
591
|
-
end
|
592
|
-
# The number of items with valid data.
|
593
|
-
def n_valid
|
594
|
-
@valid_data.size
|
595
|
-
end
|
596
|
-
# Returns a hash with the distribution of proportions of
|
597
|
-
# the sample.
|
598
|
-
def proportions
|
599
|
-
frequencies.inject({}){|a,v|
|
600
|
-
a[v[0]] = v[1].quo(n_valid)
|
601
|
-
a
|
602
|
-
}
|
603
|
-
end
|
604
|
-
# Proportion of a given value.
|
605
|
-
def proportion(v=1)
|
606
|
-
frequencies[v].quo(@valid_data.size)
|
607
|
-
end
|
608
|
-
def summary(out="")
|
609
|
-
out << sprintf("n valid:%d\n",n_valid)
|
610
|
-
out << sprintf("factors:%s\n",factors.join(","))
|
611
|
-
out << "mode:"+mode.to_s+"\n"
|
612
|
-
out << "Distribution:\n"
|
613
|
-
frequencies.sort.each{|k,v|
|
614
|
-
key=labels.has_key?(k) ? labels[k]:k
|
615
|
-
out << sprintf("%s : %s (%0.2f%%)\n",key,v, (v.quo(n_valid))*100)
|
616
|
-
}
|
617
|
-
if(@type==:ordinal)
|
618
|
-
out << "median:"+median.to_s+"\n"
|
619
|
-
end
|
620
|
-
if(@type==:scale)
|
621
|
-
out << "mean:"+mean.to_s+"\n"
|
622
|
-
out << "sd:"+sd.to_s+"\n"
|
623
|
-
|
624
|
-
end
|
625
|
-
out
|
626
|
-
end
|
805
|
+
# Sample Standard deviation (denominator n-1)
|
806
|
+
|
807
|
+
def standard_deviation_sample(m=nil)
|
808
|
+
check_type :scale
|
627
809
|
|
628
|
-
|
629
|
-
|
630
|
-
|
631
|
-
|
632
|
-
|
633
|
-
|
634
|
-
|
635
|
-
|
636
|
-
|
637
|
-
|
638
|
-
|
639
|
-
|
640
|
-
|
641
|
-
|
810
|
+
m||=mean
|
811
|
+
Math::sqrt(variance_sample(m))
|
812
|
+
end
|
813
|
+
# Skewness of the sample
|
814
|
+
def skew(m=nil)
|
815
|
+
check_type :scale
|
816
|
+
m||=mean
|
817
|
+
th=@scale_data.inject(0){|a,x| a+((x-m)**3)}
|
818
|
+
th.quo((@scale_data.size)*sd(m)**3)
|
819
|
+
end
|
820
|
+
# Kurtosis of the sample
|
821
|
+
def kurtosis(m=nil)
|
822
|
+
check_type :scale
|
823
|
+
m||=mean
|
824
|
+
fo=@scale_data.inject(0){|a,x| a+((x-m)**4)}
|
825
|
+
fo.quo((@scale_data.size)*sd(m)**4)-3
|
642
826
|
|
643
|
-
|
644
|
-
|
645
|
-
|
646
|
-
|
647
|
-
|
648
|
-
|
649
|
-
|
650
|
-
|
651
|
-
|
652
|
-
|
653
|
-
|
654
|
-
|
655
|
-
|
656
|
-
|
657
|
-
|
658
|
-
|
659
|
-
|
660
|
-
|
661
|
-
|
662
|
-
|
663
|
-
|
664
|
-
|
665
|
-
|
666
|
-
|
667
|
-
|
668
|
-
|
669
|
-
|
670
|
-
|
671
|
-
|
672
|
-
|
673
|
-
|
674
|
-
|
675
|
-
|
676
|
-
end
|
677
|
-
# Return the median (percentil 50)
|
678
|
-
def median
|
679
|
-
check_type :ordinal
|
680
|
-
if HAS_GSL and @type==:scale
|
681
|
-
GSL::Stats::median_from_sorted_data(@gsl)
|
682
|
-
else
|
683
|
-
percentil(50)
|
684
|
-
end
|
685
|
-
end
|
686
|
-
# Minimum value
|
687
|
-
def min;
|
688
|
-
check_type :ordinal
|
689
|
-
@valid_data.min;
|
690
|
-
end
|
691
|
-
# Maximum value
|
692
|
-
def max;
|
693
|
-
check_type :ordinal
|
694
|
-
@valid_data.max;
|
695
|
-
end
|
696
|
-
|
697
|
-
def set_scale_data # :nodoc
|
698
|
-
@scale_data=@valid_data.collect do|x|
|
699
|
-
if x.is_a? Numeric
|
700
|
-
x
|
701
|
-
elsif x.is_a? String and x.to_i==x.to_f
|
702
|
-
x.to_i
|
703
|
-
else
|
704
|
-
x.to_f
|
705
|
-
end
|
706
|
-
end
|
707
|
-
if HAS_GSL
|
708
|
-
@gsl=GSL::Vector.alloc(@scale_data) if @scale_data.size>0
|
709
|
-
end
|
710
|
-
end
|
711
|
-
private :set_scale_data
|
827
|
+
end
|
828
|
+
# Product of all values on the sample
|
829
|
+
#
|
830
|
+
def product
|
831
|
+
check_type :scale
|
832
|
+
@scale_data.inject(1){|a,x| a*x }
|
833
|
+
end
|
834
|
+
if HAS_GSL
|
835
|
+
%w{skew kurtosis variance_sample standard_deviation_sample variance_population standard_deviation_population mean sum}.each{|m|
|
836
|
+
m_nuevo=(m+"_slow").intern
|
837
|
+
alias_method m_nuevo, m.intern
|
838
|
+
}
|
839
|
+
def sum # :nodoc:
|
840
|
+
check_type :scale
|
841
|
+
|
842
|
+
@gsl.sum
|
843
|
+
end
|
844
|
+
def mean # :nodoc:
|
845
|
+
check_type :scale
|
846
|
+
|
847
|
+
@gsl.mean
|
848
|
+
end
|
849
|
+
def variance_sample(m=nil) # :nodoc:
|
850
|
+
check_type :scale
|
851
|
+
|
852
|
+
m||=mean
|
853
|
+
@gsl.variance_m
|
854
|
+
end
|
855
|
+
def standard_deviation_sample(m=nil) # :nodoc:
|
856
|
+
check_type :scale
|
857
|
+
m||=mean
|
858
|
+
@gsl.sd(m)
|
859
|
+
end
|
712
860
|
|
713
|
-
|
714
|
-
|
715
|
-
|
716
|
-
|
717
|
-
|
718
|
-
|
719
|
-
|
720
|
-
|
721
|
-
|
722
|
-
|
723
|
-
|
724
|
-
|
725
|
-
|
726
|
-
|
727
|
-
|
728
|
-
|
729
|
-
|
730
|
-
|
731
|
-
|
732
|
-
|
733
|
-
|
734
|
-
|
735
|
-
|
736
|
-
end
|
737
|
-
|
738
|
-
# Sum of squared deviation
|
739
|
-
def sum_of_squared_deviation
|
740
|
-
check_type :scale
|
741
|
-
@scale_data.inject(0) {|a,x| x.square+a} - (sum.square.quo(n_valid))
|
742
|
-
end
|
861
|
+
def variance_population(m=nil) # :nodoc:
|
862
|
+
check_type :scale
|
863
|
+
m||=mean
|
864
|
+
@gsl.variance_with_fixed_mean(m)
|
865
|
+
end
|
866
|
+
def standard_deviation_population(m=nil) # :nodoc:
|
867
|
+
check_type :scale
|
868
|
+
m||=mean
|
869
|
+
@gsl.sd_with_fixed_mean(m)
|
870
|
+
end
|
871
|
+
def skew # :nodoc:
|
872
|
+
check_type :scale
|
873
|
+
@gsl.skew
|
874
|
+
end
|
875
|
+
def kurtosis # :nodoc:
|
876
|
+
check_type :scale
|
877
|
+
@gsl.kurtosis
|
878
|
+
end
|
879
|
+
# Create a GSL::Histogram
|
880
|
+
# With a fixnum, creates X bins within the range of data
|
881
|
+
# With an Array, each value will be a cut point
|
882
|
+
def histogram(bins=10)
|
883
|
+
check_type :scale
|
743
884
|
|
744
|
-
|
745
|
-
|
746
|
-
|
747
|
-
|
748
|
-
|
749
|
-
|
885
|
+
if bins.is_a? Array
|
886
|
+
#h=Statsample::Histogram.new(self, bins)
|
887
|
+
h=GSL::Histogram.alloc(bins)
|
888
|
+
else
|
889
|
+
# ugly patch. The upper limit for a bin has the form
|
890
|
+
# x < range
|
891
|
+
#h=Statsample::Histogram.new(self, bins)
|
892
|
+
h=GSL::Histogram.alloc(bins,[@valid_data.min,@valid_data.max+0.0001])
|
750
893
|
end
|
751
|
-
|
894
|
+
h.increment(@gsl)
|
895
|
+
h
|
896
|
+
end
|
897
|
+
def plot_histogram(bins=10,options="")
|
898
|
+
check_type :scale
|
899
|
+
self.histogram(bins).graph(options)
|
900
|
+
end
|
752
901
|
|
753
|
-
# Population Standard deviation (denominator N)
|
754
|
-
def standard_deviation_population(m=nil)
|
755
|
-
check_type :scale
|
756
|
-
|
757
|
-
Math::sqrt( variance_population(m) )
|
758
|
-
end
|
759
|
-
# Sample Variance (denominator n-1)
|
760
|
-
|
761
|
-
def variance_sample(m=nil)
|
762
|
-
check_type :scale
|
763
|
-
|
764
|
-
m||=mean
|
765
|
-
sum_of_squares(m).quo(n_valid - 1)
|
766
|
-
end
|
767
|
-
|
768
|
-
# Sample Standard deviation (denominator n-1)
|
769
|
-
|
770
|
-
def standard_deviation_sample(m=nil)
|
771
|
-
check_type :scale
|
772
|
-
|
773
|
-
m||=m
|
774
|
-
Math::sqrt(variance_sample(m))
|
775
|
-
end
|
776
|
-
# Skewness of the sample
|
777
|
-
def skew
|
778
|
-
check_type :scale
|
779
|
-
m=mean
|
780
|
-
thirds=@scale_data.inject(0){|a,x| a+((x-mean)**3)}
|
781
|
-
thirds.quo((@scale_data.size-1)*sd**3)
|
782
|
-
end
|
783
|
-
# Kurtosis of the sample
|
784
|
-
def kurtosis
|
785
|
-
check_type :scale
|
786
|
-
|
787
|
-
m=mean
|
788
|
-
thirds=@scale_data.inject(0){|a,x| a+((x-mean)**4)}
|
789
|
-
thirds.quo((@scale_data.size-1)*sd**4)
|
790
|
-
|
791
|
-
end
|
792
|
-
# Product of all values on the sample
|
793
|
-
#
|
794
|
-
def product
|
795
|
-
check_type :scale
|
796
|
-
@scale_data.inject(1){|a,x| a*x }
|
797
|
-
end
|
798
|
-
if HAS_GSL
|
799
|
-
%w{skew kurtosis variance_sample standard_deviation_sample variance_population standard_deviation_population mean sum}.each{|m|
|
800
|
-
m_nuevo=(m+"_slow").intern
|
801
|
-
alias_method m_nuevo, m.intern
|
802
|
-
}
|
803
|
-
def sum # :nodoc:
|
804
|
-
check_type :scale
|
805
|
-
|
806
|
-
@gsl.sum
|
807
|
-
end
|
808
|
-
def mean # :nodoc:
|
809
|
-
check_type :scale
|
810
|
-
|
811
|
-
@gsl.mean
|
812
|
-
end
|
813
|
-
def variance_sample(m=nil) # :nodoc:
|
814
|
-
check_type :scale
|
815
|
-
|
816
|
-
m||=mean
|
817
|
-
@gsl.variance_m
|
818
|
-
end
|
819
|
-
def standard_deviation_sample(m=nil) # :nodoc:
|
820
|
-
check_type :scale
|
821
|
-
m||=mean
|
822
|
-
@gsl.sd(m)
|
823
|
-
end
|
824
|
-
|
825
|
-
def variance_population(m=nil) # :nodoc:
|
826
|
-
check_type :scale
|
827
|
-
m||=mean
|
828
|
-
@gsl.variance_with_fixed_mean(m)
|
829
|
-
end
|
830
|
-
def standard_deviation_population(m=nil) # :nodoc:
|
831
|
-
check_type :scale
|
832
|
-
m||=mean
|
833
|
-
@gsl.sd_with_fixed_mean(m)
|
834
|
-
end
|
835
|
-
def skew # :nodoc:
|
836
|
-
check_type :scale
|
837
|
-
@gsl.skew
|
838
|
-
end
|
839
|
-
def kurtosis # :nodoc:
|
840
|
-
check_type :scale
|
841
|
-
@gsl.kurtosis
|
842
|
-
end
|
843
|
-
# Create a GSL::Histogram
|
844
|
-
# With a fixnum, creates X bins within the range of data
|
845
|
-
# With an Array, each value will be a cut point
|
846
|
-
def histogram(bins=10)
|
847
|
-
check_type :scale
|
848
|
-
if bins.is_a? Array
|
849
|
-
h=GSL::Histogram.alloc(bins)
|
850
|
-
else
|
851
|
-
# ugly patch. The upper limit for a bin has the form
|
852
|
-
# x < range
|
853
|
-
h=GSL::Histogram.alloc(bins,[@valid_data.min,@valid_data.max+0.0001])
|
854
|
-
end
|
855
|
-
h.increment(@gsl)
|
856
|
-
h
|
857
|
-
end
|
858
|
-
def plot_histogram(bins=10,options="")
|
859
|
-
check_type :scale
|
860
|
-
self.histogram(bins).graph(options)
|
861
|
-
end
|
862
|
-
|
863
|
-
end
|
864
|
-
|
865
|
-
# Coefficient of variation
|
866
|
-
# Calculated with the sample standard deviation
|
867
|
-
def coefficient_of_variation
|
868
|
-
check_type :scale
|
869
|
-
standard_deviation_sample.quo(mean)
|
870
|
-
end
|
871
|
-
|
872
|
-
alias_method :sdp, :standard_deviation_population
|
873
|
-
alias_method :sds, :standard_deviation_sample
|
874
|
-
alias_method :cov, :coefficient_of_variation
|
875
|
-
alias_method :variance, :variance_sample
|
876
|
-
alias_method :sd, :standard_deviation_sample
|
877
|
-
alias_method :ss, :sum_of_squares
|
878
902
|
end
|
903
|
+
|
904
|
+
# Coefficient of variation
|
905
|
+
# Calculated with the sample standard deviation
|
906
|
+
def coefficient_of_variation
|
907
|
+
check_type :scale
|
908
|
+
standard_deviation_sample.quo(mean)
|
909
|
+
end
|
910
|
+
|
911
|
+
alias_method :sdp, :standard_deviation_population
|
912
|
+
alias_method :sds, :standard_deviation_sample
|
913
|
+
alias_method :cov, :coefficient_of_variation
|
914
|
+
alias_method :variance, :variance_sample
|
915
|
+
alias_method :sd, :standard_deviation_sample
|
916
|
+
alias_method :ss, :sum_of_squares
|
917
|
+
end
|
879
918
|
end
|