statsample 1.5.0 → 2.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.build.sh +15 -0
- data/.gitignore +1 -0
- data/.travis.yml +19 -7
- data/CONTRIBUTING.md +33 -0
- data/History.txt +5 -0
- data/README.md +41 -53
- data/benchmarks/correlation_matrix_15_variables.rb +6 -5
- data/benchmarks/correlation_matrix_5_variables.rb +6 -5
- data/benchmarks/correlation_matrix_methods/correlation_matrix.rb +23 -26
- data/examples/boxplot.rb +17 -5
- data/examples/correlation_matrix.rb +36 -7
- data/examples/dataset.rb +25 -5
- data/examples/dominance_analysis.rb +8 -7
- data/examples/dominance_analysis_bootstrap.rb +16 -11
- data/examples/histogram.rb +16 -2
- data/examples/icc.rb +5 -6
- data/examples/levene.rb +17 -3
- data/examples/multiple_regression.rb +6 -3
- data/examples/parallel_analysis.rb +11 -6
- data/examples/polychoric.rb +26 -13
- data/examples/principal_axis.rb +8 -4
- data/examples/reliability.rb +10 -10
- data/examples/scatterplot.rb +8 -0
- data/examples/t_test.rb +7 -0
- data/examples/u_test.rb +10 -2
- data/examples/vector.rb +9 -6
- data/examples/velicer_map_test.rb +12 -8
- data/lib/statsample.rb +13 -47
- data/lib/statsample/analysis/suite.rb +1 -1
- data/lib/statsample/anova/oneway.rb +6 -6
- data/lib/statsample/anova/twoway.rb +26 -24
- data/lib/statsample/bivariate.rb +78 -61
- data/lib/statsample/bivariate/pearson.rb +2 -2
- data/lib/statsample/codification.rb +45 -32
- data/lib/statsample/converter/csv.rb +15 -53
- data/lib/statsample/converter/spss.rb +6 -5
- data/lib/statsample/converters.rb +50 -211
- data/lib/statsample/crosstab.rb +26 -25
- data/lib/statsample/daru.rb +117 -0
- data/lib/statsample/dataset.rb +70 -942
- data/lib/statsample/dominanceanalysis.rb +16 -17
- data/lib/statsample/dominanceanalysis/bootstrap.rb +26 -28
- data/lib/statsample/factor/parallelanalysis.rb +17 -19
- data/lib/statsample/factor/pca.rb +21 -20
- data/lib/statsample/factor/principalaxis.rb +3 -3
- data/lib/statsample/graph/boxplot.rb +8 -16
- data/lib/statsample/graph/histogram.rb +4 -4
- data/lib/statsample/graph/scatterplot.rb +8 -7
- data/lib/statsample/histogram.rb +128 -119
- data/lib/statsample/matrix.rb +20 -16
- data/lib/statsample/multiset.rb +39 -38
- data/lib/statsample/regression.rb +3 -3
- data/lib/statsample/regression/multiple.rb +8 -10
- data/lib/statsample/regression/multiple/alglibengine.rb +96 -89
- data/lib/statsample/regression/multiple/baseengine.rb +32 -32
- data/lib/statsample/regression/multiple/gslengine.rb +33 -36
- data/lib/statsample/regression/multiple/matrixengine.rb +7 -9
- data/lib/statsample/regression/multiple/rubyengine.rb +39 -41
- data/lib/statsample/reliability.rb +23 -25
- data/lib/statsample/reliability/icc.rb +8 -7
- data/lib/statsample/reliability/multiscaleanalysis.rb +14 -12
- data/lib/statsample/reliability/scaleanalysis.rb +58 -60
- data/lib/statsample/reliability/skillscaleanalysis.rb +34 -29
- data/lib/statsample/resample.rb +1 -1
- data/lib/statsample/shorthand.rb +29 -25
- data/lib/statsample/test/kolmogorovsmirnov.rb +5 -3
- data/lib/statsample/test/levene.rb +28 -27
- data/lib/statsample/test/t.rb +7 -9
- data/lib/statsample/test/umannwhitney.rb +28 -28
- data/lib/statsample/test/wilcoxonsignedrank.rb +45 -43
- data/lib/statsample/vector.rb +70 -1013
- data/lib/statsample/version.rb +1 -1
- data/statsample.gemspec +12 -16
- data/test/helpers_tests.rb +1 -1
- data/test/test_analysis.rb +17 -17
- data/test/test_anova_contrast.rb +6 -6
- data/test/test_anovatwowaywithdataset.rb +8 -8
- data/test/test_anovawithvectors.rb +8 -8
- data/test/test_awesome_print_bug.rb +1 -1
- data/test/test_bartlettsphericity.rb +4 -4
- data/test/test_bivariate.rb +48 -43
- data/test/test_codification.rb +33 -33
- data/test/test_crosstab.rb +9 -9
- data/test/test_dataset.rb +28 -458
- data/test/test_factor.rb +46 -38
- data/test/test_factor_pa.rb +22 -13
- data/test/test_ggobi.rb +4 -4
- data/test/test_gsl.rb +4 -4
- data/test/test_histogram.rb +3 -3
- data/test/test_matrix.rb +13 -13
- data/test/test_multiset.rb +103 -91
- data/test/test_regression.rb +57 -52
- data/test/test_reliability.rb +55 -45
- data/test/test_reliability_icc.rb +8 -8
- data/test/test_reliability_skillscale.rb +26 -24
- data/test/test_resample.rb +1 -1
- data/test/test_statistics.rb +3 -13
- data/test/test_stest.rb +9 -9
- data/test/test_stratified.rb +3 -3
- data/test/test_test_t.rb +12 -12
- data/test/test_umannwhitney.rb +2 -2
- data/test/test_vector.rb +76 -613
- data/test/test_wilcoxonsignedrank.rb +4 -4
- metadata +57 -28
- data/lib/statsample/rserve_extension.rb +0 -20
- data/lib/statsample/vector/gsl.rb +0 -106
- data/test/fixtures/repeated_fields.csv +0 -7
- data/test/fixtures/scientific_notation.csv +0 -4
- data/test/fixtures/test_csv.csv +0 -7
- data/test/fixtures/test_xls.xls +0 -0
- data/test/test_csv.rb +0 -63
- data/test/test_rserve_extension.rb +0 -42
- data/test/test_xls.rb +0 -52
@@ -8,13 +8,13 @@ module Statsample
|
|
8
8
|
|
9
9
|
# Name of F analysis
|
10
10
|
attr_accessor :name
|
11
|
-
|
12
|
-
|
13
|
-
|
11
|
+
attr_reader :w
|
12
|
+
attr_reader :nr
|
13
|
+
attr_writer :tails
|
14
14
|
# Parameters:
|
15
15
|
def initialize(v1,v2, opts=Hash.new)
|
16
|
-
|
17
|
-
|
16
|
+
@v1 = v1
|
17
|
+
@v2 = v2
|
18
18
|
opts_default={:name=>_("Wilcoxon Signed Rank Test"),:tails=>:both}
|
19
19
|
@opts=opts_default.merge(opts)
|
20
20
|
opts_default.keys.each {|k|
|
@@ -22,66 +22,68 @@ module Statsample
|
|
22
22
|
}
|
23
23
|
calculate
|
24
24
|
end
|
25
|
+
|
25
26
|
def calculate
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
}.sum
|
27
|
+
df = Daru::DataFrame.new({:v1 => @v1,:v2 => @v2})
|
28
|
+
# df[:abs]=df.collect(:row) { |row| (row[:v2] - row[:v1]).abs }
|
29
|
+
df[:abs] = (df[:v2] - df[:v1]).abs
|
30
|
+
df[:sgn] = df.collect(:row) { |row|
|
31
|
+
r = row[:v2] - row[:v1]
|
32
|
+
r == 0 ? 0 : r/r.abs
|
33
|
+
}
|
34
|
+
df = df.filter_rows { |row| row[:sgn] != 0}
|
35
|
+
df[:rank] = df[:abs].ranked
|
36
|
+
@nr = df.nrows
|
37
|
+
|
38
|
+
@w = df.collect(:row) { |row|
|
39
|
+
row[:sgn] * row[:rank]
|
40
|
+
}.sum
|
41
41
|
end
|
42
|
+
|
42
43
|
def report_building(generator) # :nodoc:
|
43
44
|
generator.section(:name=>@name) do |s|
|
44
45
|
s.table(:name=>_("%s results") % @name) do |t|
|
45
46
|
t.row([_("W Value"), "%0.3f" % @w])
|
46
47
|
t.row([_("Z"), "%0.3f (p: %0.3f)" % [z, probability_z]])
|
47
48
|
if(nr<=10)
|
48
|
-
|
49
|
+
t.row([_("Exact probability"), "p-exact: %0.3f" % [probability_exact]])
|
49
50
|
end
|
50
51
|
end
|
51
52
|
end
|
52
53
|
end
|
53
54
|
def z
|
54
|
-
|
55
|
-
|
55
|
+
sigma=Math.sqrt((nr*(nr+1)*(2*nr+1))/6)
|
56
|
+
(w-0.5)/sigma
|
56
57
|
end
|
57
58
|
# Assuming normal distribution of W, this calculates
|
58
59
|
# the probability of samples with Z equal or higher than
|
59
60
|
# obtained on sample
|
60
61
|
def probability_z
|
61
|
-
|
62
|
+
(1-Distribution::Normal.cdf(z))*(@tails==:both ? 2:1)
|
62
63
|
end
|
63
64
|
# Calculate exact probability.
|
64
65
|
# Don't calculate for large Nr, please!
|
65
66
|
def probability_exact
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
67
|
+
str_format="%0#{nr}b"
|
68
|
+
combinations=2**nr
|
69
|
+
#p str_format
|
70
|
+
total_w=combinations.times.map do |i|
|
71
|
+
comb=sprintf(str_format,i)
|
72
|
+
w_local=comb.length.times.inject(0) do |ac,j|
|
73
|
+
sgn=comb[j]=="0" ? -1 : 1
|
74
|
+
ac+(j+1)*sgn
|
75
|
+
end
|
76
|
+
end.sort
|
77
|
+
|
78
|
+
total_w.find_all do |v|
|
79
|
+
if @tails==:both
|
80
|
+
v<=-w.abs or v>=w.abs
|
81
|
+
elsif @tails==:left
|
82
|
+
v<=w
|
83
|
+
elsif @tails==:right
|
84
|
+
v>=w
|
85
|
+
end
|
86
|
+
end.count/(combinations.to_f)
|
85
87
|
end
|
86
88
|
end
|
87
89
|
end
|
data/lib/statsample/vector.rb
CHANGED
@@ -1,22 +1,18 @@
|
|
1
|
-
require 'date'
|
2
|
-
require 'statsample/vector/gsl'
|
3
|
-
|
4
1
|
module Statsample::VectorShorthands
|
5
2
|
# Creates a new Statsample::Vector object
|
6
3
|
# Argument should be equal to Vector.new
|
7
4
|
def to_vector(*args)
|
8
|
-
Statsample::Vector.new(self
|
5
|
+
Statsample::Vector.new(self)
|
9
6
|
end
|
10
7
|
|
11
|
-
# Creates a new
|
8
|
+
# Creates a new Daru::Vector object of type :scale.
|
12
9
|
# Deprecated. Use to_numeric instead.
|
13
10
|
def to_scale(*args)
|
14
|
-
|
15
|
-
Statsample::Vector.new(self, :numeric, *args)
|
11
|
+
Statsample::Vector.new(self, *args)
|
16
12
|
end
|
17
13
|
|
18
14
|
def to_numeric(*args)
|
19
|
-
Statsample::Vector.new(self
|
15
|
+
Statsample::Vector.new(self)
|
20
16
|
end
|
21
17
|
end
|
22
18
|
|
@@ -39,1057 +35,118 @@ module Statsample
|
|
39
35
|
# == Usage
|
40
36
|
# The fast way to create a vector uses Array.to_vector or Array.to_numeric.
|
41
37
|
#
|
42
|
-
#
|
43
|
-
#
|
44
|
-
#
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
38
|
+
# == Deprecation Warning
|
39
|
+
#
|
40
|
+
# Statsample::Vector has been deprecated in favour of Daru::Vector. Daru is
|
41
|
+
# a dedicated data analysis and manipulation library that brings awesome
|
42
|
+
# data analysis functionality to ruby. Check out the daru docs at
|
43
|
+
# https://github.com/v0dro/daru#notebooks
|
44
|
+
class Vector < Daru::Vector
|
49
45
|
include Statsample::VectorShorthands
|
50
46
|
|
51
|
-
#
|
52
|
-
|
53
|
-
#
|
54
|
-
|
55
|
-
#
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
attr_reader :today_values
|
47
|
+
# Valid data. Equal to data, minus values assigned as missing values.
|
48
|
+
#
|
49
|
+
# == Deprecation Warning
|
50
|
+
#
|
51
|
+
# Use Daru::Vector#only_valid instead of this method.
|
52
|
+
def valid_data
|
53
|
+
$stderr.puts "WARNING: valid_data in Statsample::Vector has been deprecated in favor of only_valid in Daru::Vector. Please use that.\n"
|
54
|
+
only_valid.to_a
|
55
|
+
end
|
61
56
|
# Missing values array
|
62
|
-
|
63
|
-
#
|
64
|
-
|
65
|
-
#
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
#
|
70
|
-
|
57
|
+
#
|
58
|
+
# == Deprecation Warning
|
59
|
+
#
|
60
|
+
# Use Daru::Vector#only_missing instead of this method.
|
61
|
+
def missing_data
|
62
|
+
only_missing.to_a
|
63
|
+
end
|
64
|
+
# Original data.
|
65
|
+
#
|
66
|
+
# == Deprecation Warning
|
67
|
+
#
|
68
|
+
# Use Daru::Vector#to_a instead of this method.
|
69
|
+
def data_with_nils
|
70
|
+
to_a
|
71
|
+
end
|
72
|
+
|
73
|
+
def type= val
|
74
|
+
raise NoMethodError, "Daru::Vector automatically figures the type of data. There is no need to assign it anymore."
|
75
|
+
end
|
76
|
+
|
77
|
+
def initialize(data=[], type=:object, opts=Hash.new)
|
78
|
+
$stderr.puts "WARNING: Statsample::Dataset and Statsample::Vector have been deprecated in favor of Daru::DataFrame and Daru::Vector. Please switch to using that."
|
71
79
|
|
72
|
-
# Creates a new Vector object.
|
73
|
-
# * <tt>data</tt> Any data which can be converted on Array
|
74
|
-
# * <tt>type</tt> Level of meausurement. See Vector#type
|
75
|
-
# * <tt>opts</tt> Hash of options
|
76
|
-
# * <tt>:missing_values</tt> Array of missing values. See Vector#missing_values
|
77
|
-
# * <tt>:today_values</tt> Array of 'today' values. See Vector#today_values
|
78
|
-
# * <tt>:labels</tt> Labels for data values
|
79
|
-
# * <tt>:name</tt> Name of vector
|
80
|
-
def initialize(data=[], type=:object, opts=Hash.new)
|
81
80
|
if type == :ordinal or type == :scale
|
82
|
-
$stderr.puts "WARNING: #{type} has been deprecated.
|
83
|
-
type = :numeric
|
81
|
+
$stderr.puts "WARNING: #{type} has been deprecated."
|
84
82
|
end
|
85
83
|
|
86
84
|
if type == :nominal
|
87
|
-
$stderr.puts "WARNING: nominal has been deprecated.
|
88
|
-
type = :object
|
85
|
+
$stderr.puts "WARNING: nominal has been deprecated."
|
89
86
|
end
|
90
87
|
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
:labels=>{},
|
97
|
-
:name=>nil
|
98
|
-
}
|
99
|
-
@opts=opts_default.merge(opts)
|
100
|
-
if @opts[:name].nil?
|
88
|
+
if opts[:today_values]
|
89
|
+
raise ArgumentError, "This option is no longer supported in Vector. Watch out for the next version of Daru::Vector that will have full time series support"
|
90
|
+
end
|
91
|
+
|
92
|
+
if opts[:name].nil?
|
101
93
|
@@n_table||=0
|
102
94
|
@@n_table+=1
|
103
|
-
|
95
|
+
opts[:name] = "Vector #{@@n_table}"
|
104
96
|
end
|
105
|
-
|
106
|
-
|
107
|
-
@today_values=@opts[:today_values]
|
108
|
-
@name=@opts[:name]
|
109
|
-
@valid_data=[]
|
110
|
-
@data_with_nils=[]
|
111
|
-
@date_data_with_nils=[]
|
112
|
-
@missing_data=[]
|
113
|
-
@has_missing_data=nil
|
114
|
-
@numeric_data=nil
|
115
|
-
set_valid_data
|
116
|
-
self.type=type
|
97
|
+
|
98
|
+
super(data, opts)
|
117
99
|
end
|
100
|
+
|
118
101
|
# Create a vector using (almost) any object
|
119
102
|
# * Array: flattened
|
120
103
|
# * Range: transformed using to_a
|
121
104
|
# * Statsample::Vector
|
122
105
|
# * Numeric and string values
|
106
|
+
#
|
107
|
+
# == Deprecation Warning
|
108
|
+
#
|
109
|
+
# Statsample::Vector is to be replaced by Daru::Vector soon. Use the
|
110
|
+
# equivalent method Daru::Vector.[] for this purpose.
|
123
111
|
def self.[](*args)
|
124
|
-
|
125
|
-
args
|
126
|
-
case a
|
127
|
-
when Array
|
128
|
-
values.concat a.flatten
|
129
|
-
when Statsample::Vector
|
130
|
-
values.concat a.to_a
|
131
|
-
when Range
|
132
|
-
values.concat a.to_a
|
133
|
-
else
|
134
|
-
values << a
|
135
|
-
end
|
136
|
-
end
|
137
|
-
vector=new(values)
|
138
|
-
vector.type=:numeric if vector.can_be_numeric?
|
139
|
-
vector
|
112
|
+
$stderr.puts "WARNING: Statsample::Dataset and Statsample::Vector have been deprecated in favor of Daru::DataFrame and Daru::Vector. Please switch to using that."
|
113
|
+
super *args
|
140
114
|
end
|
115
|
+
|
141
116
|
# Create a new numeric type vector
|
142
117
|
# Parameters
|
143
118
|
# [n] Size
|
144
119
|
# [val] Value of each value
|
145
120
|
# [&block] If block provided, is used to set the values of vector
|
121
|
+
#
|
122
|
+
# == Deprecation Warning
|
123
|
+
#
|
124
|
+
# Statsample::Vector is to be replaced by Daru::Vector soon. Use the
|
125
|
+
# equivalent method Daru::Vector.[] for this purpose.
|
146
126
|
def self.new_numeric(n,val=nil, &block)
|
147
127
|
if block
|
148
|
-
|
128
|
+
Statsample::Vector.new(n.times.map {|i| block.call(i)})
|
149
129
|
else
|
150
|
-
|
130
|
+
Statsample::Vector.new(n.times.map { val })
|
151
131
|
end
|
152
|
-
vector.type=:numeric
|
153
|
-
vector
|
154
132
|
end
|
155
133
|
|
156
134
|
# Deprecated. Use new_numeric instead.
|
157
135
|
def self.new_scale(n, val=nil,&block)
|
158
|
-
$stderr.puts "WARNING: .new_scale has been deprecated. Use .new_numeric instead."
|
159
136
|
new_numeric n, val, &block
|
160
137
|
end
|
161
|
-
# Creates a duplicate of the Vector.
|
162
|
-
# Note: data, missing_values and labels are duplicated, so
|
163
|
-
# changes on original vector doesn't propages to copies.
|
164
|
-
def dup
|
165
|
-
Vector.new(@data.dup,@type, :missing_values => @missing_values.dup, :labels => @labels.dup, :name=>@name)
|
166
|
-
end
|
167
|
-
# Returns an empty duplicate of the vector. Maintains the type,
|
168
|
-
# missing values and labels.
|
169
|
-
def dup_empty
|
170
|
-
Vector.new([],@type, :missing_values => @missing_values.dup, :labels => @labels.dup, :name=> @name)
|
171
|
-
end
|
172
|
-
|
173
|
-
if Statsample::STATSAMPLE__.respond_to?(:check_type)
|
174
|
-
# Raises an exception if type of vector is inferior to t type
|
175
|
-
def check_type(t)
|
176
|
-
Statsample::STATSAMPLE__.check_type(self,t)
|
177
|
-
end
|
178
|
-
else
|
179
|
-
def check_type(t) #:nodoc:
|
180
|
-
_check_type(t)
|
181
|
-
end
|
182
|
-
end
|
183
|
-
|
184
|
-
|
185
|
-
def _check_type(t) #:nodoc:
|
186
|
-
raise NoMethodError if (t == :numeric and @type == :object) or
|
187
|
-
(t == :date) or (:date == @type)
|
188
|
-
end
|
189
|
-
|
190
|
-
def vector_standarized_compute(m,sd) # :nodoc:
|
191
|
-
@data_with_nils.collect{|x| x.nil? ? nil : (x.to_f - m).quo(sd) }.to_vector(:numeric)
|
192
|
-
end
|
193
|
-
# Return a vector usign the standarized values for data
|
194
|
-
# with sd with denominator n-1. With variance=0 or mean nil,
|
195
|
-
# returns a vector of equal size full of nils
|
196
|
-
#
|
197
|
-
def vector_standarized(use_population=false)
|
198
|
-
check_type :numeric
|
199
|
-
m=mean
|
200
|
-
sd=use_population ? sdp : sds
|
201
|
-
return ([nil]*size).to_numeric if mean.nil? or sd==0.0
|
202
|
-
vector=vector_standarized_compute(m,sd)
|
203
|
-
vector.name=_("%s(standarized)") % @name
|
204
|
-
vector
|
205
|
-
end
|
206
|
-
def vector_centered_compute(m) #:nodoc:
|
207
|
-
@data_with_nils.collect {|x| x.nil? ? nil : x.to_f-m }.to_numeric
|
208
|
-
end
|
209
|
-
# Return a centered vector
|
210
|
-
def vector_centered
|
211
|
-
check_type :numeric
|
212
|
-
m=mean
|
213
|
-
return ([nil]*size).to_numeric if mean.nil?
|
214
|
-
vector=vector_centered_compute(m)
|
215
|
-
vector.name=_("%s(centered)") % @name
|
216
|
-
vector
|
217
|
-
end
|
218
|
-
|
219
|
-
alias_method :standarized, :vector_standarized
|
220
|
-
alias_method :centered, :vector_centered
|
221
|
-
# Return a vector with values replaced with the percentiles
|
222
|
-
# of each values
|
223
|
-
def vector_percentil
|
224
|
-
check_type :numeric
|
225
|
-
c=@valid_data.size
|
226
|
-
vector=ranked.map {|i| i.nil? ? nil : (i.quo(c)*100).to_f }.to_vector(@type)
|
227
|
-
vector.name=_("%s(percentil)") % @name
|
228
|
-
vector
|
229
|
-
end
|
230
|
-
def box_cox_transformation(lambda) # :nodoc:
|
231
|
-
raise "Should be a numeric" unless @type==:numeric
|
232
|
-
@data_with_nils.collect{|x|
|
233
|
-
if !x.nil?
|
234
|
-
if(lambda==0)
|
235
|
-
Math.log(x)
|
236
|
-
else
|
237
|
-
(x**lambda-1).quo(lambda)
|
238
|
-
end
|
239
|
-
else
|
240
|
-
nil
|
241
|
-
end
|
242
|
-
}.to_vector(:numeric)
|
243
|
-
end
|
244
|
-
|
245
|
-
# Vector equality.
|
246
|
-
# Two vector will be the same if their data, missing values, type, labels are equals
|
247
|
-
def ==(v2)
|
248
|
-
return false unless v2.instance_of? Statsample::Vector
|
249
|
-
@data==v2.data and @missing_values==v2.missing_values and @type==v2.type and @labels==v2.labels
|
250
|
-
end
|
251
|
-
|
252
|
-
def _dump(i) # :nodoc:
|
253
|
-
Marshal.dump({'data'=>@data,'missing_values'=>@missing_values, 'labels'=>@labels, 'type'=>@type,'name'=>@name})
|
254
|
-
end
|
255
|
-
|
256
|
-
def self._load(data) # :nodoc:
|
257
|
-
h=Marshal.load(data)
|
258
|
-
Vector.new(h['data'], h['type'], :missing_values=> h['missing_values'], :labels=>h['labels'], :name=>h['name'])
|
259
|
-
end
|
260
|
-
# Returns a new vector, with data modified by block.
|
261
|
-
# Equivalent to create a Vector after #collect on data
|
262
|
-
def recode(type=nil)
|
263
|
-
type||=@type
|
264
|
-
@data.collect{|x|
|
265
|
-
yield x
|
266
|
-
}.to_vector(type)
|
267
|
-
end
|
268
|
-
# Modifies current vector, with data modified by block.
|
269
|
-
# Equivalent to #collect! on @data
|
270
|
-
def recode!
|
271
|
-
@data.collect!{|x|
|
272
|
-
yield x
|
273
|
-
}
|
274
|
-
set_valid_data
|
275
|
-
end
|
276
|
-
def push(v)
|
277
|
-
@data.push(v)
|
278
|
-
set_valid_data
|
279
|
-
end
|
280
|
-
|
281
|
-
# Dicotomize the vector with 0 and 1, based on lowest value
|
282
|
-
# If parameter if defined, this value and lower
|
283
|
-
# will be 0 and higher, 1
|
284
|
-
def dichotomize(low = nil)
|
285
|
-
low ||= factors.min
|
286
|
-
|
287
|
-
@data_with_nils.collect do |x|
|
288
|
-
if x.nil?
|
289
|
-
nil
|
290
|
-
elsif x > low
|
291
|
-
1
|
292
|
-
else
|
293
|
-
0
|
294
|
-
end
|
295
|
-
end.to_numeric
|
296
|
-
end
|
297
|
-
# Iterate on each item.
|
298
|
-
# Equivalent to
|
299
|
-
# @data.each{|x| yield x}
|
300
|
-
def each
|
301
|
-
@data.each{|x| yield(x) }
|
302
|
-
end
|
303
|
-
|
304
|
-
# Iterate on each item, retrieving index
|
305
|
-
def each_index
|
306
|
-
(0...@data.size).each {|i|
|
307
|
-
yield(i)
|
308
|
-
}
|
309
|
-
end
|
310
|
-
# Add a value at the end of the vector.
|
311
|
-
# If second argument set to false, you should update the Vector usign
|
312
|
-
# Vector.set_valid_data at the end of your insertion cycle
|
313
|
-
#
|
314
|
-
def add(v,update_valid=true)
|
315
|
-
@data.push(v)
|
316
|
-
set_valid_data if update_valid
|
317
|
-
end
|
318
|
-
# Update valid_data, missing_data, data_with_nils and gsl
|
319
|
-
# at the end of an insertion.
|
320
|
-
#
|
321
|
-
# Use after Vector.add(v,false)
|
322
|
-
# Usage:
|
323
|
-
# v=Statsample::Vector.new
|
324
|
-
# v.add(2,false)
|
325
|
-
# v.add(4,false)
|
326
|
-
# v.data
|
327
|
-
# => [2,3]
|
328
|
-
# v.valid_data
|
329
|
-
# => []
|
330
|
-
# v.set_valid_data
|
331
|
-
# v.valid_data
|
332
|
-
# => [2,3]
|
333
|
-
def set_valid_data
|
334
|
-
@valid_data.clear
|
335
|
-
@missing_data.clear
|
336
|
-
@data_with_nils.clear
|
337
|
-
@date_data_with_nils.clear
|
338
|
-
set_valid_data_intern
|
339
|
-
set_numeric_data if(@type==:numeric)
|
340
|
-
set_date_data if(@type==:date)
|
341
|
-
end
|
342
|
-
if Statsample::STATSAMPLE__.respond_to?(:set_valid_data_intern)
|
343
|
-
def set_valid_data_intern #:nodoc:
|
344
|
-
Statsample::STATSAMPLE__.set_valid_data_intern(self)
|
345
|
-
end
|
346
|
-
else
|
347
|
-
def set_valid_data_intern #:nodoc:
|
348
|
-
_set_valid_data_intern
|
349
|
-
end
|
350
|
-
end
|
351
|
-
def _set_valid_data_intern #:nodoc:
|
352
|
-
@data.each do |n|
|
353
|
-
if is_valid? n
|
354
|
-
@valid_data.push(n)
|
355
|
-
@data_with_nils.push(n)
|
356
|
-
else
|
357
|
-
@data_with_nils.push(nil)
|
358
|
-
@missing_data.push(n)
|
359
|
-
end
|
360
|
-
end
|
361
|
-
@has_missing_data=@missing_data.size>0
|
362
|
-
end
|
363
|
-
|
364
|
-
# Retrieves true if data has one o more missing values
|
365
|
-
def has_missing_data?
|
366
|
-
@has_missing_data
|
367
|
-
end
|
368
|
-
alias :flawed? :has_missing_data?
|
369
|
-
|
370
|
-
# Retrieves label for value x. Retrieves x if
|
371
|
-
# no label defined.
|
372
|
-
def labeling(x)
|
373
|
-
@labels.has_key?(x) ? @labels[x].to_s : x.to_s
|
374
|
-
end
|
375
|
-
alias :label :labeling
|
376
|
-
# Returns a Vector with data with labels replaced by the label.
|
377
|
-
def vector_labeled
|
378
|
-
d=@data.collect{|x|
|
379
|
-
if @labels.has_key? x
|
380
|
-
@labels[x]
|
381
|
-
else
|
382
|
-
x
|
383
|
-
end
|
384
|
-
}
|
385
|
-
Vector.new(d,@type)
|
386
|
-
end
|
387
|
-
# Size of total data
|
388
|
-
def size
|
389
|
-
@data.size
|
390
|
-
end
|
391
|
-
alias_method :n, :size
|
392
|
-
|
393
|
-
# Retrieves i element of data
|
394
|
-
def [](i)
|
395
|
-
@data[i]
|
396
|
-
end
|
397
|
-
# Set i element of data.
|
398
|
-
# Note: Use set_valid_data if you include missing values
|
399
|
-
def []=(i,v)
|
400
|
-
@data[i]=v
|
401
|
-
end
|
402
|
-
# Return true if a value is valid (not nil and not included on missing values)
|
403
|
-
def is_valid?(x)
|
404
|
-
!(x.nil? or @missing_values.include? x)
|
405
|
-
end
|
406
|
-
# Set missing_values.
|
407
|
-
# set_valid_data is called after changes
|
408
|
-
def missing_values=(vals)
|
409
|
-
@missing_values = vals
|
410
|
-
set_valid_data
|
411
|
-
end
|
412
|
-
# Set data considered as "today" on data vectors
|
413
|
-
def today_values=(vals)
|
414
|
-
@today_values = vals
|
415
|
-
set_valid_data
|
416
|
-
end
|
417
|
-
# Set level of measurement.
|
418
|
-
def type=(t)
|
419
|
-
@type=t
|
420
|
-
set_numeric_data if(t==:numeric)
|
421
|
-
set_date_data if (t==:date)
|
422
|
-
end
|
423
|
-
def to_a
|
424
|
-
if @data.is_a? Array
|
425
|
-
@data.dup
|
426
|
-
else
|
427
|
-
@data.to_a
|
428
|
-
end
|
429
|
-
end
|
430
|
-
alias_method :to_ary, :to_a
|
431
|
-
|
432
|
-
# Vector sum.
|
433
|
-
# - If v is a scalar, add this value to all elements
|
434
|
-
# - If v is a Array or a Vector, should be of the same size of this vector
|
435
|
-
# every item of this vector will be added to the value of the
|
436
|
-
# item at the same position on the other vector
|
437
|
-
def +(v)
|
438
|
-
_vector_ari("+",v)
|
439
|
-
end
|
440
|
-
# Vector rest.
|
441
|
-
# - If v is a scalar, rest this value to all elements
|
442
|
-
# - If v is a Array or a Vector, should be of the same
|
443
|
-
# size of this vector
|
444
|
-
# every item of this vector will be rested to the value of the
|
445
|
-
# item at the same position on the other vector
|
446
|
-
|
447
|
-
def -(v)
|
448
|
-
_vector_ari("-",v)
|
449
|
-
end
|
450
|
-
|
451
|
-
def *(v)
|
452
|
-
_vector_ari("*",v)
|
453
|
-
end
|
454
|
-
# Reports all values that doesn't comply with a condition.
|
455
|
-
# Returns a hash with the index of data and the invalid data.
|
456
|
-
def verify
|
457
|
-
h={}
|
458
|
-
(0...@data.size).to_a.each{|i|
|
459
|
-
if !(yield @data[i])
|
460
|
-
h[i]=@data[i]
|
461
|
-
end
|
462
|
-
}
|
463
|
-
h
|
464
|
-
end
|
465
|
-
def _vector_ari(method,v) # :nodoc:
|
466
|
-
if(v.is_a? Vector or v.is_a? Array)
|
467
|
-
raise ArgumentError, "The array/vector parameter (#{v.size}) should be of the same size of the original vector (#{@data.size})" unless v.size==@data.size
|
468
|
-
sum=[]
|
469
|
-
v.size.times {|i|
|
470
|
-
if((v.is_a? Vector and v.is_valid?(v[i]) and is_valid?(@data[i])) or (v.is_a? Array and !v[i].nil? and !data[i].nil?))
|
471
|
-
sum.push(@data[i].send(method,v[i]))
|
472
|
-
else
|
473
|
-
sum.push(nil)
|
474
|
-
end
|
475
|
-
}
|
476
|
-
Statsample::Vector.new(sum, :numeric)
|
477
|
-
elsif(v.respond_to? method )
|
478
|
-
Statsample::Vector.new(
|
479
|
-
@data.collect {|x|
|
480
|
-
if(!x.nil?)
|
481
|
-
x.send(method,v)
|
482
|
-
else
|
483
|
-
nil
|
484
|
-
end
|
485
|
-
} , :numeric)
|
486
|
-
else
|
487
|
-
raise TypeError,"You should pass a scalar or a array/vector"
|
488
|
-
end
|
489
|
-
|
490
|
-
end
|
491
|
-
# Return an array with the data splitted by a separator.
|
492
|
-
# a=Vector.new(["a,b","c,d","a,b","d"])
|
493
|
-
# a.splitted
|
494
|
-
# =>
|
495
|
-
# [["a","b"],["c","d"],["a","b"],["d"]]
|
496
|
-
def splitted(sep=Statsample::SPLIT_TOKEN)
|
497
|
-
@data.collect{|x|
|
498
|
-
if x.nil?
|
499
|
-
nil
|
500
|
-
elsif (x.respond_to? :split)
|
501
|
-
x.split(sep)
|
502
|
-
else
|
503
|
-
[x]
|
504
|
-
end
|
505
|
-
}
|
506
|
-
end
|
507
|
-
# Returns a hash of Vectors, defined by the different values
|
508
|
-
# defined on the fields
|
509
|
-
# Example:
|
510
|
-
#
|
511
|
-
# a=Vector.new(["a,b","c,d","a,b"])
|
512
|
-
# a.split_by_separator
|
513
|
-
# => {"a"=>#<Statsample::Type::object:0x7f2dbcc09d88
|
514
|
-
# @data=[1, 0, 1]>,
|
515
|
-
# "b"=>#<Statsample::Type::object:0x7f2dbcc09c48
|
516
|
-
# @data=[1, 1, 0]>,
|
517
|
-
# "c"=>#<Statsample::Type::object:0x7f2dbcc09b08
|
518
|
-
# @data=[0, 1, 1]>}
|
519
|
-
#
|
520
|
-
def split_by_separator(sep=Statsample::SPLIT_TOKEN)
|
521
|
-
split_data=splitted(sep)
|
522
|
-
factors=split_data.flatten.uniq.compact
|
523
|
-
out=factors.inject({}) {|a,x|
|
524
|
-
a[x]=[]
|
525
|
-
a
|
526
|
-
}
|
527
|
-
split_data.each do |r|
|
528
|
-
if r.nil?
|
529
|
-
factors.each do |f|
|
530
|
-
out[f].push(nil)
|
531
|
-
end
|
532
|
-
else
|
533
|
-
factors.each do |f|
|
534
|
-
out[f].push(r.include?(f) ? 1:0)
|
535
|
-
end
|
536
|
-
end
|
537
|
-
end
|
538
|
-
out.inject({}){|s,v|
|
539
|
-
s[v[0]]=Vector.new(v[1],:object)
|
540
|
-
s
|
541
|
-
}
|
542
|
-
end
|
543
|
-
def split_by_separator_freq(sep=Statsample::SPLIT_TOKEN)
|
544
|
-
split_by_separator(sep).inject({}) {|a,v|
|
545
|
-
a[v[0]]=v[1].inject {|s,x| s+x.to_i}
|
546
|
-
a
|
547
|
-
}
|
548
|
-
end
|
549
|
-
|
550
|
-
# == Bootstrap
|
551
|
-
# Generate +nr+ resamples (with replacement) of size +s+
|
552
|
-
# from vector, computing each estimate from +estimators+
|
553
|
-
# over each resample.
|
554
|
-
# +estimators+ could be
|
555
|
-
# a) Hash with variable names as keys and lambdas as values
|
556
|
-
# a.bootstrap(:log_s2=>lambda {|v| Math.log(v.variance)},1000)
|
557
|
-
# b) Array with names of method to bootstrap
|
558
|
-
# a.bootstrap([:mean, :sd],1000)
|
559
|
-
# c) A single method to bootstrap
|
560
|
-
# a.jacknife(:mean, 1000)
|
561
|
-
# If s is nil, is set to vector size by default.
|
562
|
-
#
|
563
|
-
# Returns a dataset where each vector is an vector
|
564
|
-
# of length +nr+ containing the computed resample estimates.
|
565
|
-
def bootstrap(estimators, nr, s=nil)
|
566
|
-
s||=n
|
567
|
-
|
568
|
-
h_est, es, bss= prepare_bootstrap(estimators)
|
569
|
-
|
570
|
-
|
571
|
-
nr.times do |i|
|
572
|
-
bs=sample_with_replacement(s)
|
573
|
-
es.each do |estimator|
|
574
|
-
# Add bootstrap
|
575
|
-
bss[estimator].push(h_est[estimator].call(bs))
|
576
|
-
end
|
577
|
-
end
|
578
|
-
|
579
|
-
es.each do |est|
|
580
|
-
bss[est]=bss[est].to_numeric
|
581
|
-
bss[est].type=:numeric
|
582
|
-
end
|
583
|
-
bss.to_dataset
|
584
|
-
|
585
|
-
end
|
586
|
-
|
587
|
-
# == Jacknife
|
588
|
-
# Returns a dataset with jacknife delete-+k+ +estimators+
|
589
|
-
# +estimators+ could be:
|
590
|
-
# a) Hash with variable names as keys and lambdas as values
|
591
|
-
# a.jacknife(:log_s2=>lambda {|v| Math.log(v.variance)})
|
592
|
-
# b) Array with method names to jacknife
|
593
|
-
# a.jacknife([:mean, :sd])
|
594
|
-
# c) A single method to jacknife
|
595
|
-
# a.jacknife(:mean)
|
596
|
-
# +k+ represent the block size for block jacknife. By default
|
597
|
-
# is set to 1, for classic delete-one jacknife.
|
598
|
-
#
|
599
|
-
# Returns a dataset where each vector is an vector
|
600
|
-
# of length +cases+/+k+ containing the computed jacknife estimates.
|
601
|
-
#
|
602
|
-
# == Reference:
|
603
|
-
# * Sawyer, S. (2005). Resampling Data: Using a Statistical Jacknife.
|
604
|
-
def jacknife(estimators, k=1)
|
605
|
-
raise "n should be divisible by k:#{k}" unless n%k==0
|
606
|
-
|
607
|
-
nb=(n / k).to_i
|
608
|
-
|
609
|
-
|
610
|
-
h_est, es, ps= prepare_bootstrap(estimators)
|
611
|
-
|
612
|
-
est_n=es.inject({}) {|h,v|
|
613
|
-
h[v]=h_est[v].call(self)
|
614
|
-
h
|
615
|
-
}
|
616
|
-
|
617
138
|
|
618
|
-
nb.times do |i|
|
619
|
-
other=@data_with_nils.dup
|
620
|
-
other.slice!(i*k,k)
|
621
|
-
other=other.to_numeric
|
622
|
-
es.each do |estimator|
|
623
|
-
# Add pseudovalue
|
624
|
-
ps[estimator].push( nb * est_n[estimator] - (nb-1) * h_est[estimator].call(other))
|
625
|
-
end
|
626
|
-
end
|
627
|
-
|
628
|
-
|
629
|
-
es.each do |est|
|
630
|
-
ps[est]=ps[est].to_numeric
|
631
|
-
ps[est].type=:numeric
|
632
|
-
end
|
633
|
-
ps.to_dataset
|
634
|
-
end
|
635
|
-
|
636
|
-
|
637
|
-
# For an array or hash of estimators methods, returns
|
638
|
-
# an array with three elements
|
639
|
-
# 1.- A hash with estimators names as keys and lambdas as values
|
640
|
-
# 2.- An array with estimators names
|
641
|
-
# 3.- A Hash with estimators names as keys and empty arrays as values
|
642
|
-
def prepare_bootstrap(estimators)
|
643
|
-
h_est=estimators
|
644
|
-
|
645
|
-
h_est=[h_est] unless h_est.is_a? Array or h_est.is_a? Hash
|
646
|
-
|
647
|
-
if h_est.is_a? Array
|
648
|
-
h_est=h_est.inject({}) {|h,est|
|
649
|
-
h[est]=lambda {|v| v.send(est)}
|
650
|
-
h
|
651
|
-
}
|
652
|
-
end
|
653
|
-
|
654
|
-
bss=h_est.keys.inject({}) {|h,v| h[v]=[];h}
|
655
|
-
|
656
|
-
[h_est,h_est.keys, bss]
|
657
|
-
|
658
|
-
end
|
659
|
-
private :prepare_bootstrap
|
660
|
-
|
661
|
-
# Returns an random sample of size n, with replacement,
|
662
|
-
# only with valid data.
|
663
|
-
#
|
664
|
-
# In all the trails, every item have the same probability
|
665
|
-
# of been selected.
|
666
|
-
def sample_with_replacement(sample=1)
|
667
|
-
vds=@valid_data.size
|
668
|
-
(0...sample).collect{ @valid_data[rand(vds)] }
|
669
|
-
end
|
670
|
-
# Returns an random sample of size n, without replacement,
|
671
|
-
# only with valid data.
|
672
|
-
#
|
673
|
-
# Every element could only be selected once.
|
674
|
-
#
|
675
|
-
# A sample of the same size of the vector is the vector itself.
|
676
|
-
|
677
|
-
def sample_without_replacement(sample=1)
|
678
|
-
raise ArgumentError, "Sample size couldn't be greater than n" if sample>@valid_data.size
|
679
|
-
out=[]
|
680
|
-
size=@valid_data.size
|
681
|
-
while out.size<sample
|
682
|
-
value=rand(size)
|
683
|
-
out.push(value) if !out.include?value
|
684
|
-
end
|
685
|
-
out.collect{|i| @data[i]}
|
686
|
-
end
|
687
|
-
# Retrieves number of cases which comply condition.
|
688
|
-
# If block given, retrieves number of instances where
|
689
|
-
# block returns true.
|
690
|
-
# If other values given, retrieves the frequency for
|
691
|
-
# this value.
|
692
|
-
def count(x=false)
|
693
|
-
if block_given?
|
694
|
-
r=@data.inject(0) {|s, i|
|
695
|
-
r=yield i
|
696
|
-
s+(r ? 1 : 0)
|
697
|
-
}
|
698
|
-
r.nil? ? 0 : r
|
699
|
-
else
|
700
|
-
frequencies[x].nil? ? 0 : frequencies[x]
|
701
|
-
end
|
702
|
-
end
|
703
|
-
|
704
|
-
# Returns the database type for the vector, according to its content
|
705
|
-
|
706
|
-
def db_type(dbs='mysql')
|
707
|
-
# first, detect any character not number
|
708
|
-
if @data.find {|v| v.to_s=~/\d{2,2}-\d{2,2}-\d{4,4}/} or @data.find {|v| v.to_s=~/\d{4,4}-\d{2,2}-\d{2,2}/}
|
709
|
-
return "DATE"
|
710
|
-
elsif @data.find {|v| v.to_s=~/[^0-9e.-]/ }
|
711
|
-
return "VARCHAR (255)"
|
712
|
-
elsif @data.find {|v| v.to_s=~/\./}
|
713
|
-
return "DOUBLE"
|
714
|
-
else
|
715
|
-
return "INTEGER"
|
716
|
-
end
|
717
|
-
end
|
718
139
|
# Return true if all data is Date, "today" values or nil
|
719
140
|
def can_be_date?
|
720
|
-
|
721
|
-
!v.nil? and !v.is_a? Date and !v.is_a? Time and (v.is_a? String and !@today_values.include? v) and (v.is_a? String and !(v=~/\d{4,4}[-\/]\d{1,2}[-\/]\d{1,2}/))}
|
722
|
-
false
|
723
|
-
else
|
724
|
-
true
|
725
|
-
end
|
141
|
+
raise NoMethodError, "This method is no longer supported."
|
726
142
|
end
|
727
143
|
# Return true if all data is Numeric or nil
|
728
144
|
def can_be_numeric?
|
729
|
-
|
730
|
-
false
|
731
|
-
else
|
732
|
-
true
|
733
|
-
end
|
145
|
+
type == :numeric
|
734
146
|
end
|
735
147
|
|
736
148
|
def to_s
|
737
149
|
sprintf("Vector(type:%s, n:%d)[%s]",@type.to_s,@data.size, @data.collect{|d| d.nil? ? "nil":d}.join(","))
|
738
150
|
end
|
739
|
-
# Ugly name. Really, create a Vector for standard 'matrix' package.
|
740
|
-
# <tt>dir</tt> could be :horizontal or :vertical
|
741
|
-
def to_matrix(dir=:horizontal)
|
742
|
-
case dir
|
743
|
-
when :horizontal
|
744
|
-
Matrix[@data]
|
745
|
-
when :vertical
|
746
|
-
Matrix.columns([@data])
|
747
|
-
end
|
748
|
-
end
|
749
|
-
def inspect
|
750
|
-
self.to_s
|
751
|
-
end
|
752
|
-
# Retrieves uniques values for data.
|
753
|
-
def factors
|
754
|
-
if @type==:numeric
|
755
|
-
@numeric_data.uniq.sort
|
756
|
-
elsif @type==:date
|
757
|
-
@date_data_with_nils.uniq.sort
|
758
|
-
else
|
759
|
-
@valid_data.uniq.sort
|
760
|
-
end
|
761
|
-
end
|
762
|
-
if Statsample::STATSAMPLE__.respond_to?(:frequencies)
|
763
|
-
# Returns a hash with the distribution of frecuencies for
|
764
|
-
# the sample
|
765
|
-
def frequencies
|
766
|
-
Statsample::STATSAMPLE__.frequencies(@valid_data)
|
767
|
-
end
|
768
|
-
else
|
769
|
-
def frequencies #:nodoc:
|
770
|
-
_frequencies
|
771
|
-
end
|
772
|
-
end
|
773
|
-
|
774
|
-
|
775
|
-
def _frequencies #:nodoc:
|
776
|
-
@valid_data.inject(Hash.new) {|a,x|
|
777
|
-
a[x]||=0
|
778
|
-
a[x]=a[x]+1
|
779
|
-
a
|
780
|
-
}
|
781
|
-
end
|
782
|
-
|
783
|
-
# Returns the most frequent item.
|
784
|
-
def mode
|
785
|
-
frequencies.max{|a,b| a[1]<=>b[1]}.first
|
786
|
-
end
|
787
|
-
# The numbers of item with valid data.
|
788
|
-
def n_valid
|
789
|
-
@valid_data.size
|
790
|
-
end
|
791
|
-
# Returns a hash with the distribution of proportions of
|
792
|
-
# the sample.
|
793
|
-
def proportions
|
794
|
-
frequencies.inject({}){|a,v|
|
795
|
-
a[v[0]] = v[1].quo(n_valid)
|
796
|
-
a
|
797
|
-
}
|
798
|
-
end
|
799
|
-
# Proportion of a given value.
|
800
|
-
def proportion(v=1)
|
801
|
-
frequencies[v].quo(@valid_data.size)
|
802
|
-
end
|
803
|
-
def report_building(b)
|
804
|
-
b.section(:name=>name) do |s|
|
805
|
-
s.text _("n :%d") % n
|
806
|
-
s.text _("n valid:%d") % n_valid
|
807
|
-
if @type==:object
|
808
|
-
s.text _("factors:%s") % factors.join(",")
|
809
|
-
s.text _("mode: %s") % mode
|
810
|
-
|
811
|
-
s.table(:name=>_("Distribution")) do |t|
|
812
|
-
frequencies.sort.each do |k,v|
|
813
|
-
key=labels.has_key?(k) ? labels[k]:k
|
814
|
-
t.row [key, v , ("%0.2f%%" % (v.quo(n_valid)*100))]
|
815
|
-
end
|
816
|
-
end
|
817
|
-
end
|
818
|
-
|
819
|
-
s.text _("median: %s") % median.to_s if(@type==:numeric or @type==:numeric)
|
820
|
-
if(@type==:numeric)
|
821
|
-
s.text _("mean: %0.4f") % mean
|
822
|
-
if sd
|
823
|
-
s.text _("std.dev.: %0.4f") % sd
|
824
|
-
s.text _("std.err.: %0.4f") % se
|
825
|
-
s.text _("skew: %0.4f") % skew
|
826
|
-
s.text _("kurtosis: %0.4f") % kurtosis
|
827
|
-
end
|
828
|
-
end
|
829
|
-
end
|
830
|
-
end
|
831
|
-
|
832
|
-
# Variance of p, according to poblation size
|
833
|
-
def variance_proportion(n_poblation, v=1)
|
834
|
-
Statsample::proportion_variance_sample(self.proportion(v), @valid_data.size, n_poblation)
|
835
|
-
end
|
836
|
-
# Variance of p, according to poblation size
|
837
|
-
def variance_total(n_poblation, v=1)
|
838
|
-
Statsample::total_variance_sample(self.proportion(v), @valid_data.size, n_poblation)
|
839
|
-
end
|
840
|
-
def proportion_confidence_interval_t(n_poblation,margin=0.95,v=1)
|
841
|
-
Statsample::proportion_confidence_interval_t(proportion(v), @valid_data.size, n_poblation, margin)
|
842
|
-
end
|
843
|
-
def proportion_confidence_interval_z(n_poblation,margin=0.95,v=1)
|
844
|
-
Statsample::proportion_confidence_interval_z(proportion(v), @valid_data.size, n_poblation, margin)
|
845
|
-
end
|
846
|
-
|
847
|
-
self.instance_methods.find_all{|met| met=~/_slow$/}.each do |met|
|
848
|
-
met_or=met.gsub("_slow","")
|
849
|
-
if !self.method_defined?(met_or)
|
850
|
-
alias_method met_or, met
|
851
|
-
end
|
852
|
-
end
|
853
|
-
|
854
|
-
######
|
855
|
-
### numeric Methods
|
856
|
-
######
|
857
|
-
|
858
|
-
# == Percentil
|
859
|
-
# Returns the value of the percentile q
|
860
|
-
#
|
861
|
-
# Accepts an optional second argument specifying the strategy to interpolate
|
862
|
-
# when the requested percentile lies between two data points a and b
|
863
|
-
# Valid strategies are:
|
864
|
-
# * :midpoint (Default): (a + b) / 2
|
865
|
-
# * :linear : a + (b - a) * d where d is the decimal part of the index between a and b.
|
866
|
-
# This is the NIST recommended method (http://en.wikipedia.org/wiki/Percentile#NIST_method)
|
867
|
-
#
|
868
|
-
def percentil(q, strategy = :midpoint)
|
869
|
-
check_type :numeric
|
870
|
-
sorted=@valid_data.sort
|
871
|
-
|
872
|
-
case strategy
|
873
|
-
when :midpoint
|
874
|
-
v = (n_valid * q).quo(100)
|
875
|
-
if(v.to_i!=v)
|
876
|
-
sorted[v.to_i]
|
877
|
-
else
|
878
|
-
(sorted[(v-0.5).to_i].to_f + sorted[(v+0.5).to_i]).quo(2)
|
879
|
-
end
|
880
|
-
when :linear
|
881
|
-
index = (q / 100.0) * (n_valid + 1)
|
882
|
-
|
883
|
-
k = index.truncate
|
884
|
-
d = index % 1
|
885
|
-
|
886
|
-
if k == 0
|
887
|
-
sorted[0]
|
888
|
-
elsif k >= sorted.size
|
889
|
-
sorted[-1]
|
890
|
-
else
|
891
|
-
sorted[k - 1] + d * (sorted[k] - sorted[k - 1])
|
892
|
-
end
|
893
|
-
else
|
894
|
-
raise NotImplementedError.new "Unknown strategy #{strategy.to_s}"
|
895
|
-
end
|
896
|
-
end
|
897
|
-
|
898
|
-
# Returns a ranked vector.
|
899
|
-
def ranked(type=:numeric)
|
900
|
-
check_type :numeric
|
901
|
-
i=0
|
902
|
-
r=frequencies.sort.inject({}){|a,v|
|
903
|
-
a[v[0]]=(i+1 + i+v[1]).quo(2)
|
904
|
-
i+=v[1]
|
905
|
-
a
|
906
|
-
}
|
907
|
-
@data.collect {|c| r[c] }.to_vector(type)
|
908
|
-
end
|
909
|
-
# Return the median (percentil 50)
|
910
|
-
def median
|
911
|
-
check_type :numeric
|
912
|
-
percentil(50)
|
913
|
-
end
|
914
|
-
# Minimun value
|
915
|
-
def min
|
916
|
-
check_type :numeric
|
917
|
-
@valid_data.min
|
918
|
-
end
|
919
|
-
# Maximum value
|
920
|
-
def max
|
921
|
-
check_type :numeric
|
922
|
-
@valid_data.max
|
923
|
-
end
|
924
|
-
|
925
|
-
def set_date_data
|
926
|
-
@date_data_with_nils=@data.collect do|x|
|
927
|
-
if x.is_a? Date
|
928
|
-
x
|
929
|
-
elsif x.is_a? Time
|
930
|
-
Date.new(x.year, x.month, x.day)
|
931
|
-
elsif x.is_a? String and x=~/(\d{4,4})[-\/](\d{1,2})[-\/](\d{1,2})/
|
932
|
-
Date.new($1.to_i,$2.to_i,$3.to_i)
|
933
|
-
elsif @today_values.include? x
|
934
|
-
Date.today()
|
935
|
-
elsif @missing_values.include? x or x.nil?
|
936
|
-
nil
|
937
|
-
end
|
938
|
-
end
|
939
|
-
end
|
940
|
-
|
941
|
-
def set_numeric_data
|
942
|
-
@numeric_data=@valid_data.collect do|x|
|
943
|
-
if x.is_a? Numeric
|
944
|
-
x
|
945
|
-
elsif x.is_a? String and x.to_i==x.to_f
|
946
|
-
x.to_i
|
947
|
-
else
|
948
|
-
x.to_f
|
949
|
-
end
|
950
|
-
end
|
951
|
-
end
|
952
|
-
|
953
|
-
private :set_date_data, :set_numeric_data
|
954
|
-
|
955
|
-
# The range of the data (max - min)
|
956
|
-
def range;
|
957
|
-
check_type :numeric
|
958
|
-
@numeric_data.max - @numeric_data.min
|
959
|
-
end
|
960
|
-
# The sum of values for the data
|
961
|
-
def sum
|
962
|
-
check_type :numeric
|
963
|
-
@numeric_data.inject(0){|a,x|x+a} ;
|
964
|
-
end
|
965
|
-
# The arithmetical mean of data
|
966
|
-
def mean
|
967
|
-
check_type :numeric
|
968
|
-
sum.to_f.quo(n_valid)
|
969
|
-
end
|
970
|
-
# Sum of squares for the data around a value.
|
971
|
-
# By default, this value is the mean
|
972
|
-
# ss= sum{(xi-m)^2}
|
973
|
-
#
|
974
|
-
def sum_of_squares(m=nil)
|
975
|
-
check_type :numeric
|
976
|
-
m||=mean
|
977
|
-
@numeric_data.inject(0){|a,x| a+(x-m).square}
|
978
|
-
end
|
979
|
-
# Sum of squared deviation
|
980
|
-
def sum_of_squared_deviation
|
981
|
-
check_type :numeric
|
982
|
-
@numeric_data.inject(0) {|a,x| x.square+a} - (sum.square.quo(n_valid))
|
983
|
-
end
|
984
|
-
|
985
|
-
# Population variance (denominator N)
|
986
|
-
def variance_population(m=nil)
|
987
|
-
check_type :numeric
|
988
|
-
m||=mean
|
989
|
-
squares=@numeric_data.inject(0){|a,x| x.square+a}
|
990
|
-
squares.quo(n_valid) - m.square
|
991
|
-
end
|
992
|
-
|
993
|
-
|
994
|
-
# Population Standard deviation (denominator N)
|
995
|
-
def standard_deviation_population(m=nil)
|
996
|
-
check_type :numeric
|
997
|
-
Math::sqrt( variance_population(m) )
|
998
|
-
end
|
999
|
-
|
1000
|
-
# Population average deviation (denominator N)
|
1001
|
-
# author: Al Chou
|
1002
|
-
|
1003
|
-
def average_deviation_population( m = nil )
|
1004
|
-
check_type :numeric
|
1005
|
-
m ||= mean
|
1006
|
-
( @numeric_data.inject( 0 ) { |a, x| ( x - m ).abs + a } ).quo( n_valid )
|
1007
|
-
end
|
1008
|
-
def median_absolute_deviation
|
1009
|
-
med=median
|
1010
|
-
recode {|x| (x-med).abs}.median
|
1011
|
-
end
|
1012
|
-
alias :mad :median_absolute_deviation
|
1013
|
-
# Sample Variance (denominator n-1)
|
1014
|
-
def variance_sample(m=nil)
|
1015
|
-
check_type :numeric
|
1016
|
-
m||=mean
|
1017
|
-
sum_of_squares(m).quo(n_valid - 1)
|
1018
|
-
end
|
1019
|
-
|
1020
|
-
# Sample Standard deviation (denominator n-1)
|
1021
|
-
def standard_deviation_sample(m=nil)
|
1022
|
-
check_type :numeric
|
1023
|
-
m||=mean
|
1024
|
-
Math::sqrt(variance_sample(m))
|
1025
|
-
end
|
1026
|
-
# Skewness of the sample
|
1027
|
-
def skew(m=nil)
|
1028
|
-
check_type :numeric
|
1029
|
-
m||=mean
|
1030
|
-
th=@numeric_data.inject(0){|a,x| a+((x-m)**3)}
|
1031
|
-
th.quo((@numeric_data.size)*sd(m)**3)
|
1032
|
-
end
|
1033
|
-
# Kurtosis of the sample
|
1034
|
-
def kurtosis(m=nil)
|
1035
|
-
check_type :numeric
|
1036
|
-
m||=mean
|
1037
|
-
fo=@numeric_data.inject(0){|a,x| a+((x-m)**4)}
|
1038
|
-
fo.quo((@numeric_data.size)*sd(m)**4)-3
|
1039
|
-
|
1040
|
-
end
|
1041
|
-
# Product of all values on the sample
|
1042
|
-
#
|
1043
|
-
def product
|
1044
|
-
check_type :numeric
|
1045
|
-
@numeric_data.inject(1){|a,x| a*x }
|
1046
|
-
end
|
1047
|
-
|
1048
|
-
# With a fixnum, creates X bins within the range of data
|
1049
|
-
# With an Array, each value will be a cut point
|
1050
|
-
def histogram(bins=10)
|
1051
|
-
check_type :numeric
|
1052
|
-
|
1053
|
-
if bins.is_a? Array
|
1054
|
-
#h=Statsample::Histogram.new(self, bins)
|
1055
|
-
h=Statsample::Histogram.alloc(bins)
|
1056
|
-
else
|
1057
|
-
# ugly patch. The upper limit for a bin has the form
|
1058
|
-
# x < range
|
1059
|
-
#h=Statsample::Histogram.new(self, bins)
|
1060
|
-
min,max=Statsample::Util.nice(@valid_data.min,@valid_data.max)
|
1061
|
-
# fix last data
|
1062
|
-
if max==@valid_data.max
|
1063
|
-
max+=1e-10
|
1064
|
-
end
|
1065
|
-
h=Statsample::Histogram.alloc(bins,[min,max])
|
1066
|
-
# Fix last bin
|
1067
|
-
|
1068
|
-
end
|
1069
|
-
h.increment(@valid_data)
|
1070
|
-
h
|
1071
|
-
end
|
1072
|
-
|
1073
|
-
# Coefficient of variation
|
1074
|
-
# Calculed with the sample standard deviation
|
1075
|
-
def coefficient_of_variation
|
1076
|
-
check_type :numeric
|
1077
|
-
standard_deviation_sample.quo(mean)
|
1078
|
-
end
|
1079
|
-
# Standard error of the distribution mean
|
1080
|
-
# Calculated using sd/sqrt(n)
|
1081
|
-
def standard_error
|
1082
|
-
standard_deviation_sample.quo(Math.sqrt(valid_data.size))
|
1083
|
-
end
|
1084
|
-
alias :se :standard_error
|
1085
|
-
|
1086
|
-
alias_method :sdp, :standard_deviation_population
|
1087
|
-
alias_method :sds, :standard_deviation_sample
|
1088
|
-
alias_method :adp, :average_deviation_population
|
1089
|
-
alias_method :cov, :coefficient_of_variation
|
1090
|
-
alias_method :variance, :variance_sample
|
1091
|
-
alias_method :sd, :standard_deviation_sample
|
1092
|
-
alias_method :ss, :sum_of_squares
|
1093
|
-
include_aliasing Statsample::Vector::GSL_ if Statsample.has_gsl?
|
1094
151
|
end
|
1095
152
|
end
|