statsample 0.6.5 → 0.6.7
Sign up to get free protection for your applications and to get access to all the features.
- data/History.txt +15 -0
- data/Manifest.txt +6 -0
- data/README.txt +30 -12
- data/Rakefile +91 -0
- data/demo/levene.rb +9 -0
- data/demo/multiple_regression.rb +1 -7
- data/demo/polychoric.rb +1 -0
- data/demo/principal_axis.rb +8 -0
- data/lib/distribution/f.rb +22 -22
- data/lib/spss.rb +99 -99
- data/lib/statsample/bivariate/polychoric.rb +32 -22
- data/lib/statsample/bivariate/tetrachoric.rb +212 -207
- data/lib/statsample/bivariate.rb +6 -6
- data/lib/statsample/codification.rb +65 -65
- data/lib/statsample/combination.rb +60 -59
- data/lib/statsample/converter/csv19.rb +12 -12
- data/lib/statsample/converters.rb +1 -1
- data/lib/statsample/dataset.rb +93 -36
- data/lib/statsample/dominanceanalysis/bootstrap.rb +66 -3
- data/lib/statsample/dominanceanalysis.rb +5 -6
- data/lib/statsample/factor/pca.rb +41 -11
- data/lib/statsample/factor/principalaxis.rb +105 -29
- data/lib/statsample/factor/rotation.rb +20 -3
- data/lib/statsample/factor.rb +1 -1
- data/lib/statsample/graph/gdchart.rb +13 -13
- data/lib/statsample/graph/svggraph.rb +166 -167
- data/lib/statsample/matrix.rb +22 -12
- data/lib/statsample/mle/logit.rb +3 -2
- data/lib/statsample/mle/probit.rb +7 -5
- data/lib/statsample/mle.rb +4 -2
- data/lib/statsample/multiset.rb +125 -124
- data/lib/statsample/permutation.rb +2 -1
- data/lib/statsample/regression/binomial/logit.rb +4 -3
- data/lib/statsample/regression/binomial/probit.rb +2 -1
- data/lib/statsample/regression/binomial.rb +62 -81
- data/lib/statsample/regression/multiple/baseengine.rb +1 -1
- data/lib/statsample/regression/multiple/gslengine.rb +1 -1
- data/lib/statsample/regression/multiple/matrixengine.rb +12 -6
- data/lib/statsample/regression/multiple.rb +15 -42
- data/lib/statsample/regression/simple.rb +93 -78
- data/lib/statsample/regression.rb +74 -2
- data/lib/statsample/reliability.rb +117 -120
- data/lib/statsample/srs.rb +156 -153
- data/lib/statsample/test/levene.rb +90 -0
- data/lib/statsample/test/umannwhitney.rb +25 -9
- data/lib/statsample/test.rb +2 -0
- data/lib/statsample/vector.rb +388 -413
- data/lib/statsample.rb +74 -30
- data/po/es/statsample.mo +0 -0
- data/test/test_bivariate.rb +5 -4
- data/test/test_combination.rb +1 -1
- data/test/test_dataset.rb +2 -2
- data/test/test_factor.rb +53 -6
- data/test/test_gsl.rb +1 -1
- data/test/test_mle.rb +1 -1
- data/test/test_regression.rb +18 -33
- data/test/test_statistics.rb +15 -33
- data/test/test_stest.rb +35 -0
- data/test/test_svg_graph.rb +2 -2
- data/test/test_vector.rb +331 -333
- metadata +38 -11
data/lib/statsample/vector.rb
CHANGED
@@ -10,127 +10,98 @@ class Array
|
|
10
10
|
Statsample::Vector.new(self,:scale,*args)
|
11
11
|
end
|
12
12
|
end
|
13
|
+
|
13
14
|
module Statsample
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
# matrix=Statsample.vector_cols_matrix(v1,v2)
|
19
|
-
def vector_cols_matrix(*vs)
|
20
|
-
# test
|
21
|
-
size=vs[0].size
|
22
|
-
vs.each{|v|
|
23
|
-
raise ArgumentError,"Arguments should be Vector" unless v.instance_of? Statsample::Vector
|
24
|
-
raise ArgumentError,"Vectors size should be the same" if v.size!=size
|
25
|
-
}
|
26
|
-
Matrix.rows((0...size).to_a.collect() {|i|
|
27
|
-
vs.collect{|v| v[i]}
|
28
|
-
})
|
29
|
-
end
|
30
|
-
end
|
31
|
-
# Returns a duplicate of the input vectors, without missing data
|
32
|
-
# for any of the vectors.
|
33
|
-
#
|
34
|
-
# a=[1,2,3,6,7,nil,3,5].to_scale
|
35
|
-
# b=[nil,nil,5,6,4,5,10,2].to_scale
|
36
|
-
# c=[2,4,6,7,4,5,6,7].to_scale
|
37
|
-
# a2,b2,c2=Statsample.only_valid(a,b,c)
|
38
|
-
# => [#<Statsample::Scale:0xb748c8c8 @data=[3, 6, 7, 3, 5]>,
|
39
|
-
# #<Statsample::Scale:0xb748c814 @data=[5, 6, 4, 10, 2]>,
|
40
|
-
# #<Statsample::Scale:0xb748c760 @data=[6, 7, 4, 6, 7]>]
|
41
|
-
#
|
42
|
-
def self.only_valid(*vs)
|
43
|
-
i=1
|
44
|
-
h=vs.inject({}) {|a,v| a["v#{i}"]=v;i+=1;a}
|
45
|
-
ds=Statsample::Dataset.new(h).dup_only_valid
|
46
|
-
ds.vectors.values
|
47
|
-
end
|
48
|
-
|
49
|
-
class Vector
|
50
|
-
include Enumerable
|
51
|
-
include Writable
|
52
|
-
DEFAULT_OPTIONS={
|
53
|
-
:missing_values=>[],
|
54
|
-
:today_values=>['NOW','TODAY', :NOW, :TODAY],
|
55
|
-
:labels=>{}
|
56
|
-
}
|
57
|
-
# Level of measurement. Could be :nominal, :ordinal or :scale
|
58
|
-
attr_reader :type
|
59
|
-
# Original data.
|
60
|
-
attr_reader :data
|
61
|
-
# Valid data. Equal to data, minus values assigned as missing values
|
62
|
-
attr_reader :valid_data
|
63
|
-
# Array of values considered as missing. Nil is a missing value, by default
|
64
|
-
attr_reader :missing_values
|
65
|
-
# Array of values considered as "Today", with date type. "NOW", "TODAY", :NOW and :TODAY are 'today' values, by default
|
66
|
-
attr_reader :today_values
|
67
|
-
# Missing values array
|
68
|
-
attr_reader :missing_data
|
69
|
-
# Original data, with all missing values replaced by nils
|
70
|
-
attr_reader :data_with_nils
|
71
|
-
# Date data, with all missing values replaced by nils
|
72
|
-
attr_reader :date_data_with_nils
|
73
|
-
# GSL Object, only available with rbgsl extension and type==:scale
|
74
|
-
attr_reader :gsl
|
75
|
-
# Change label for specific values
|
76
|
-
attr_accessor :labels
|
77
|
-
# Creates a new Vector object.
|
78
|
-
# [data] Array of data.
|
79
|
-
# [type] Level of measurement. See Vector#type
|
80
|
-
# [opts] Options
|
81
|
-
# [:missing_values] Array of missing values. See Vector#missing_values
|
82
|
-
# [:today_values] Array of 'today' values. See Vector#today_values
|
83
|
-
# [:labels] Labels for data values
|
84
|
-
#
|
15
|
+
|
16
|
+
# Collection of values on one dimension. Works as a column on a Spreadsheet.
|
17
|
+
#
|
18
|
+
# == Usage
|
85
19
|
# The fast way to create a vector uses Array.to_vector or Array.to_scale.
|
86
20
|
#
|
87
21
|
# v=[1,2,3,4].to_vector(:scale)
|
88
22
|
# v=[1,2,3,4].to_scale
|
89
|
-
#
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
|
23
|
+
#
|
24
|
+
class Vector
|
25
|
+
include Enumerable
|
26
|
+
include Writable
|
27
|
+
# DEFAULT OPTIONS
|
28
|
+
DEFAULT_OPTIONS={
|
29
|
+
:missing_values=>[],
|
30
|
+
:today_values=>['NOW','TODAY', :NOW, :TODAY],
|
31
|
+
:labels=>{}
|
32
|
+
}
|
33
|
+
# Level of measurement. Could be :nominal, :ordinal or :scale
|
34
|
+
attr_reader :type
|
35
|
+
# Original data.
|
36
|
+
attr_reader :data
|
37
|
+
# Valid data. Equal to data, minus values assigned as missing values
|
38
|
+
attr_reader :valid_data
|
39
|
+
# Array of values considered as missing. Nil is a missing value, by default
|
40
|
+
attr_reader :missing_values
|
41
|
+
# Array of values considered as "Today", with date type. "NOW", "TODAY", :NOW and :TODAY are 'today' values, by default
|
42
|
+
attr_reader :today_values
|
43
|
+
# Missing values array
|
44
|
+
attr_reader :missing_data
|
45
|
+
# Original data, with all missing values replaced by nils
|
46
|
+
attr_reader :data_with_nils
|
47
|
+
# Date data, with all missing values replaced by nils
|
48
|
+
attr_reader :date_data_with_nils
|
49
|
+
# GSL Object, only available with rbgsl extension and type==:scale
|
50
|
+
attr_reader :gsl
|
51
|
+
# Change label for specific values
|
52
|
+
attr_accessor :labels
|
53
|
+
#
|
54
|
+
# Creates a new Vector object.
|
55
|
+
# [data] Array of data.
|
56
|
+
# [type] Level of measurement. See Vector#type
|
57
|
+
# [opts] Options
|
58
|
+
# [:missing_values] Array of missing values. See Vector#missing_values
|
59
|
+
# [:today_values] Array of 'today' values. See Vector#today_values
|
60
|
+
# [:labels] Labels for data values
|
61
|
+
#
|
62
|
+
def initialize(data=[], type=:nominal, opts=Hash.new)
|
63
|
+
raise "Data should be an array" unless data.is_a? Array
|
64
|
+
@data=data
|
65
|
+
@type=type
|
66
|
+
opts=DEFAULT_OPTIONS.merge(opts)
|
67
|
+
@missing_values=opts[:missing_values]
|
68
|
+
@labels=opts[:labels]
|
69
|
+
@today_values=opts[:today_values]
|
70
|
+
@valid_data=[]
|
71
|
+
@data_with_nils=[]
|
72
|
+
@date_data_with_nils=[]
|
73
|
+
@missing_data=[]
|
74
|
+
@has_missing_data=nil
|
75
|
+
@scale_data=nil
|
76
|
+
set_valid_data_intern
|
77
|
+
self.type=type
|
78
|
+
end
|
79
|
+
# Creates a duplicate of the Vector.
|
80
|
+
# Note: data, missing_values and labels are duplicated, so
|
81
|
+
# changes on the original vector don't propagate to copies.
|
82
|
+
def dup
|
112
83
|
Vector.new(@data.dup,@type, :missing_values => @missing_values.dup, :labels => @labels.dup)
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
84
|
+
end
|
85
|
+
# Returns an empty duplicate of the vector. Maintains the type,
|
86
|
+
# missing values and labels.
|
87
|
+
def dup_empty
|
117
88
|
Vector.new([],@type, :missing_values => @missing_values.dup, :labels => @labels.dup)
|
118
|
-
|
119
|
-
|
120
|
-
|
89
|
+
end
|
90
|
+
# Raises an exception if type of vector is inferior to t type
|
91
|
+
def check_type(t)
|
121
92
|
raise NoMethodError if (t==:scale and @type!=:scale) or (t==:ordinal and @type==:nominal) or (t==:date)
|
122
|
-
|
123
|
-
|
124
|
-
|
125
|
-
|
126
|
-
|
127
|
-
|
93
|
+
end
|
94
|
+
private :check_type
|
95
|
+
|
96
|
+
# Return a vector using the standardized values for data
|
97
|
+
# with sd with denominator N
|
98
|
+
def vector_standarized_pop
|
128
99
|
vector_standarized(true)
|
129
|
-
|
130
|
-
|
131
|
-
|
132
|
-
|
133
|
-
|
100
|
+
end
|
101
|
+
# Return a vector using the standardized values for data
|
102
|
+
# with sd with denominator n-1
|
103
|
+
|
104
|
+
def vector_standarized(use_population=false)
|
134
105
|
raise "Should be a scale" unless @type==:scale
|
135
106
|
m=mean
|
136
107
|
sd=use_population ? sdp : sds
|
@@ -141,11 +112,10 @@ module Statsample
|
|
141
112
|
nil
|
142
113
|
end
|
143
114
|
}.to_vector(:scale)
|
144
|
-
|
145
|
-
|
146
|
-
|
147
|
-
|
148
|
-
def box_cox_transformation(lambda) # :nodoc:
|
115
|
+
end
|
116
|
+
alias_method :standarized, :vector_standarized
|
117
|
+
|
118
|
+
def box_cox_transformation(lambda) # :nodoc:
|
149
119
|
raise "Should be a scale" unless @type==:scale
|
150
120
|
@data_with_nils.collect{|x|
|
151
121
|
if !x.nil?
|
@@ -158,42 +128,42 @@ module Statsample
|
|
158
128
|
nil
|
159
129
|
end
|
160
130
|
}.to_vector(:scale)
|
161
|
-
|
162
|
-
|
163
|
-
|
164
|
-
|
165
|
-
|
131
|
+
end
|
132
|
+
|
133
|
+
# Vector equality.
|
134
|
+
# Two vectors are the same if their data, missing values, type and labels are equal
|
135
|
+
def ==(v2)
|
166
136
|
raise TypeError,"Argument should be a Vector" unless v2.instance_of? Statsample::Vector
|
167
137
|
@data==v2.data and @missing_values==v2.missing_values and @type==v2.type and @labels=v2.labels
|
168
|
-
|
169
|
-
|
170
|
-
|
138
|
+
end
|
139
|
+
|
140
|
+
def _dump(i) # :nodoc:
|
171
141
|
Marshal.dump({'data'=>@data,'missing_values'=>@missing_values, 'labels'=>@labels, 'type'=>@type})
|
172
|
-
|
173
|
-
|
174
|
-
|
142
|
+
end
|
143
|
+
|
144
|
+
def self._load(data) # :nodoc:
|
175
145
|
h=Marshal.load(data)
|
176
146
|
Vector.new(h['data'], h['type'],:missing_values=> h['missing_values'], :labels=>h['labels'])
|
177
|
-
|
178
|
-
|
179
|
-
|
180
|
-
|
147
|
+
end
|
148
|
+
# Returns a new vector, with data modified by block.
|
149
|
+
# Equivalent to creating a Vector after #collect on data
|
150
|
+
def recode
|
181
151
|
@data.collect{|x|
|
182
152
|
yield x
|
183
153
|
}.to_vector(@type)
|
184
|
-
|
185
|
-
|
186
|
-
|
187
|
-
|
154
|
+
end
|
155
|
+
# Modifies current vector, with data modified by block.
|
156
|
+
# Equivalent to #collect! on @data
|
157
|
+
def recode!
|
188
158
|
@data.collect!{|x|
|
189
159
|
yield x
|
190
160
|
}
|
191
161
|
set_valid_data
|
192
|
-
|
193
|
-
|
194
|
-
|
195
|
-
|
196
|
-
|
162
|
+
end
|
163
|
+
# Dichotomize the vector with 0 and 1, based on lowest value
|
164
|
+
# If parameter is defined, this value and lower
|
165
|
+
# will be 0 and higher, 1
|
166
|
+
def dichotomize(low=nil)
|
197
167
|
fs=factors
|
198
168
|
low||=factors.min
|
199
169
|
@data_with_nils.collect{|x|
|
@@ -205,44 +175,44 @@ module Statsample
|
|
205
175
|
0
|
206
176
|
end
|
207
177
|
}.to_scale
|
208
|
-
|
209
|
-
|
210
|
-
|
211
|
-
|
212
|
-
|
178
|
+
end
|
179
|
+
# Iterate on each item.
|
180
|
+
# Equivalent to
|
181
|
+
# @data.each{|x| yield x}
|
182
|
+
def each
|
213
183
|
@data.each{|x| yield(x) }
|
214
|
-
|
215
|
-
|
216
|
-
|
217
|
-
|
184
|
+
end
|
185
|
+
|
186
|
+
# Iterate on each item, retrieving index
|
187
|
+
def each_index
|
218
188
|
(0...@data.size).each {|i|
|
219
189
|
yield(i)
|
220
190
|
}
|
221
|
-
|
222
|
-
|
223
|
-
|
224
|
-
|
225
|
-
|
226
|
-
|
191
|
+
end
|
192
|
+
# Add a value at the end of the vector.
|
193
|
+
# If second argument is set to false, you should update the Vector using
|
194
|
+
# Vector.set_valid_data at the end of your insertion cycle
|
195
|
+
#
|
196
|
+
def add(v,update_valid=true)
|
227
197
|
@data.push(v)
|
228
198
|
set_valid_data if update_valid
|
229
|
-
|
230
|
-
|
231
|
-
|
232
|
-
|
233
|
-
|
234
|
-
|
235
|
-
|
236
|
-
|
237
|
-
|
238
|
-
|
239
|
-
|
240
|
-
|
241
|
-
|
242
|
-
|
243
|
-
|
244
|
-
|
245
|
-
|
199
|
+
end
|
200
|
+
# Update valid_data, missing_data, data_with_nils and gsl
|
201
|
+
# at the end of an insertion.
|
202
|
+
#
|
203
|
+
# Use after Vector.add(v,false)
|
204
|
+
# Usage:
|
205
|
+
# v=Statsample::Vector.new
|
206
|
+
# v.add(2,false)
|
207
|
+
# v.add(4,false)
|
208
|
+
# v.data
|
209
|
+
# => [2,4]
|
210
|
+
# v.valid_data
|
211
|
+
# => []
|
212
|
+
# v.set_valid_data
|
213
|
+
# v.valid_data
|
214
|
+
# => [2,4]
|
215
|
+
def set_valid_data
|
246
216
|
@valid_data.clear
|
247
217
|
@missing_data.clear
|
248
218
|
@data_with_nils.clear
|
@@ -251,18 +221,18 @@ module Statsample
|
|
251
221
|
set_valid_data_intern
|
252
222
|
set_scale_data if(@type==:scale)
|
253
223
|
set_date_data if(@type==:date)
|
254
|
-
|
255
|
-
|
256
|
-
|
224
|
+
end
|
225
|
+
|
226
|
+
if Statsample::STATSAMPLE__.respond_to?(:set_valid_data_intern)
|
257
227
|
def set_valid_data_intern #:nodoc:
|
258
228
|
Statsample::STATSAMPLE__.set_valid_data_intern(self)
|
259
229
|
end
|
260
|
-
|
230
|
+
else
|
261
231
|
def set_valid_data_intern #:nodoc:
|
262
232
|
_set_valid_data_intern
|
263
233
|
end
|
264
|
-
|
265
|
-
|
234
|
+
end
|
235
|
+
def _set_valid_data_intern #:nodoc:
|
266
236
|
@data.each do |n|
|
267
237
|
if is_valid? n
|
268
238
|
@valid_data.push(n)
|
@@ -273,19 +243,19 @@ module Statsample
|
|
273
243
|
end
|
274
244
|
end
|
275
245
|
@has_missing_data=@missing_data.size>0
|
276
|
-
|
277
|
-
|
278
|
-
|
279
|
-
|
246
|
+
end
|
247
|
+
|
248
|
+
# Returns true if data has one or more missing values
|
249
|
+
def has_missing_data?
|
280
250
|
@has_missing_data
|
281
|
-
|
282
|
-
|
283
|
-
|
284
|
-
|
251
|
+
end
|
252
|
+
# Retrieves label for value x. Retrieves x if
|
253
|
+
# no label defined.
|
254
|
+
def labeling(x)
|
285
255
|
@labels.has_key?(x) ? @labels[x].to_s : x.to_s
|
286
|
-
|
287
|
-
|
288
|
-
|
256
|
+
end
|
257
|
+
# Returns a Vector with data with labels replaced by the label.
|
258
|
+
def vector_labeled
|
289
259
|
d=@data.collect{|x|
|
290
260
|
if @labels.has_key? x
|
291
261
|
@labels[x]
|
@@ -294,69 +264,70 @@ module Statsample
|
|
294
264
|
end
|
295
265
|
}
|
296
266
|
Vector.new(d,@type)
|
297
|
-
|
298
|
-
|
299
|
-
|
267
|
+
end
|
268
|
+
# Size of total data
|
269
|
+
def size
|
300
270
|
@data.size
|
301
|
-
|
302
|
-
|
303
|
-
|
304
|
-
|
305
|
-
|
271
|
+
end
|
272
|
+
alias_method :n, :size
|
273
|
+
|
274
|
+
# Retrieves i element of data
|
275
|
+
def [](i)
|
306
276
|
@data[i]
|
307
|
-
|
308
|
-
|
309
|
-
|
310
|
-
|
277
|
+
end
|
278
|
+
# Set i element of data.
|
279
|
+
# Note: Use set_valid_data if you include missing values
|
280
|
+
def []=(i,v)
|
311
281
|
@data[i]=v
|
312
|
-
|
313
|
-
|
314
|
-
|
282
|
+
end
|
283
|
+
# Return true if a value is valid (not nil and not included on missing values)
|
284
|
+
def is_valid?(x)
|
315
285
|
!(x.nil? or @missing_values.include? x)
|
316
|
-
|
317
|
-
|
318
|
-
|
319
|
-
|
320
|
-
|
286
|
+
end
|
287
|
+
# Set missing_values.
|
288
|
+
# if update_valid = false, you should use
|
289
|
+
# set_valid_data after all changes
|
290
|
+
def missing_values=(vals)
|
321
291
|
@missing_values = vals
|
322
292
|
set_valid_data
|
323
|
-
|
324
|
-
|
293
|
+
end
|
294
|
+
# Set data considered as "today" on data vectors
|
295
|
+
def today_values=(vals)
|
325
296
|
@today_values = vals
|
326
297
|
set_valid_data
|
327
|
-
|
328
|
-
|
329
|
-
|
298
|
+
end
|
299
|
+
# Set level of measurement.
|
300
|
+
def type=(t)
|
330
301
|
@type=t
|
331
302
|
set_scale_data if(t==:scale)
|
332
303
|
set_date_data if (t==:date)
|
333
|
-
|
334
|
-
|
304
|
+
end
|
305
|
+
def to_a
|
335
306
|
@data.dup
|
336
|
-
|
337
|
-
|
338
|
-
|
339
|
-
|
340
|
-
|
341
|
-
|
342
|
-
|
343
|
-
|
344
|
-
|
307
|
+
end
|
308
|
+
alias_method :to_ary, :to_a
|
309
|
+
|
310
|
+
# Vector sum.
|
311
|
+
# - If v is a scalar, add this value to all elements
|
312
|
+
# - If v is a Array or a Vector, should be of the same size of this vector
|
313
|
+
# every item of this vector will be added to the value of the
|
314
|
+
# item at the same position on the other vector
|
315
|
+
def +(v)
|
345
316
|
_vector_ari("+",v)
|
346
|
-
|
347
|
-
|
348
|
-
|
349
|
-
|
350
|
-
|
351
|
-
|
352
|
-
|
353
|
-
|
354
|
-
|
317
|
+
end
|
318
|
+
# Vector subtraction.
|
319
|
+
# - If v is a scalar, subtract this value from all elements
|
320
|
+
# - If v is a Array or a Vector, should be of the same
|
321
|
+
# size of this vector
|
322
|
+
# every item of this vector will have subtracted from it the value of the
|
323
|
+
# item at the same position on the other vector
|
324
|
+
|
325
|
+
def -(v)
|
355
326
|
_vector_ari("-",v)
|
356
|
-
|
357
|
-
|
358
|
-
|
359
|
-
|
327
|
+
end
|
328
|
+
# Reports all values that don't comply with a condition.
|
329
|
+
# Returns a hash with the index of data and the invalid data.
|
330
|
+
def verify
|
360
331
|
h={}
|
361
332
|
(0...@data.size).to_a.each{|i|
|
362
333
|
if !(yield @data[i])
|
@@ -364,8 +335,8 @@ module Statsample
|
|
364
335
|
end
|
365
336
|
}
|
366
337
|
h
|
367
|
-
|
368
|
-
|
338
|
+
end
|
339
|
+
def _vector_ari(method,v) # :nodoc:
|
369
340
|
if(v.is_a? Vector or v.is_a? Array)
|
370
341
|
if v.size==@data.size
|
371
342
|
# i=0
|
@@ -395,13 +366,13 @@ module Statsample
|
|
395
366
|
raise TypeError,"You should pass a scalar or a array/vector"
|
396
367
|
end
|
397
368
|
|
398
|
-
|
399
|
-
|
400
|
-
|
401
|
-
|
402
|
-
|
403
|
-
|
404
|
-
|
369
|
+
end
|
370
|
+
# Return an array with the data split by a separator.
|
371
|
+
# a=Vector.new(["a,b","c,d","a,b","d"])
|
372
|
+
# a.splitted
|
373
|
+
# =>
|
374
|
+
# [["a","b"],["c","d"],["a","b"],["d"]]
|
375
|
+
def splitted(sep=Statsample::SPLIT_TOKEN)
|
405
376
|
@data.collect{|x|
|
406
377
|
if x.nil?
|
407
378
|
nil
|
@@ -411,73 +382,73 @@ module Statsample
|
|
411
382
|
[x]
|
412
383
|
end
|
413
384
|
}
|
414
|
-
|
415
|
-
|
416
|
-
|
417
|
-
|
418
|
-
|
419
|
-
|
420
|
-
|
421
|
-
|
422
|
-
|
423
|
-
|
424
|
-
|
425
|
-
|
426
|
-
|
427
|
-
|
428
|
-
|
385
|
+
end
|
386
|
+
# Returns a hash of Vectors, defined by the different values
|
387
|
+
# defined on the fields
|
388
|
+
# Example:
|
389
|
+
#
|
390
|
+
# a=Vector.new(["a,b","c,d","a,b"])
|
391
|
+
# a.split_by_separator
|
392
|
+
# => {"a"=>#<Statsample::Type::Nominal:0x7f2dbcc09d88
|
393
|
+
# @data=[1, 0, 1]>,
|
394
|
+
# "b"=>#<Statsample::Type::Nominal:0x7f2dbcc09c48
|
395
|
+
# @data=[1, 0, 1]>,
|
396
|
+
# "c"=>#<Statsample::Type::Nominal:0x7f2dbcc09b08
|
397
|
+
# @data=[0, 1, 0]>}
|
398
|
+
#
|
399
|
+
def split_by_separator(sep=Statsample::SPLIT_TOKEN)
|
429
400
|
split_data=splitted(sep)
|
430
401
|
factors=split_data.flatten.uniq.compact
|
431
402
|
out=factors.inject({}) {|a,x|
|
432
|
-
|
433
|
-
|
403
|
+
a[x]=[]
|
404
|
+
a
|
434
405
|
}
|
435
|
-
split_data.each
|
436
|
-
|
437
|
-
|
438
|
-
|
439
|
-
}
|
440
|
-
else
|
441
|
-
factors.each{|f|
|
442
|
-
out[f].push(r.include?(f) ? 1:0)
|
443
|
-
}
|
406
|
+
split_data.each do |r|
|
407
|
+
if r.nil?
|
408
|
+
factors.each do |f|
|
409
|
+
out[f].push(nil)
|
444
410
|
end
|
445
|
-
|
411
|
+
else
|
412
|
+
factors.each do |f|
|
413
|
+
out[f].push(r.include?(f) ? 1:0)
|
414
|
+
end
|
415
|
+
end
|
416
|
+
end
|
446
417
|
out.inject({}){|s,v|
|
447
|
-
|
448
|
-
|
418
|
+
s[v[0]]=Vector.new(v[1],:nominal)
|
419
|
+
s
|
449
420
|
}
|
450
|
-
|
451
|
-
|
452
|
-
|
453
|
-
|
454
|
-
|
455
|
-
|
456
|
-
|
457
|
-
|
458
|
-
|
459
|
-
|
460
|
-
|
461
|
-
|
462
|
-
|
463
|
-
|
464
|
-
if(@type!=:scale or !
|
421
|
+
end
|
422
|
+
def split_by_separator_freq(sep=Statsample::SPLIT_TOKEN)
|
423
|
+
split_by_separator(sep).inject({}) {|a,v|
|
424
|
+
a[v[0]]=v[1].inject {|s,x| s+x.to_i}
|
425
|
+
a
|
426
|
+
}
|
427
|
+
end
|
428
|
+
|
429
|
+
# Returns a random sample of size n, with replacement,
|
430
|
+
# only with valid data.
|
431
|
+
#
|
432
|
+
# In all the trials, every item has the same probability
|
433
|
+
# of being selected.
|
434
|
+
def sample_with_replacement(sample=1)
|
435
|
+
if(@type!=:scale or !Statsample.has_gsl?)
|
465
436
|
vds=@valid_data.size
|
466
437
|
(0...sample).collect{ @valid_data[rand(vds)] }
|
467
438
|
else
|
468
439
|
r = GSL::Rng.alloc(GSL::Rng::MT19937,rand(10000))
|
469
440
|
r.sample(@gsl, sample).to_a
|
470
441
|
end
|
471
|
-
|
472
|
-
|
473
|
-
|
474
|
-
|
475
|
-
|
476
|
-
|
477
|
-
|
442
|
+
end
|
443
|
+
# Returns a random sample of size n, without replacement,
|
444
|
+
# only with valid data.
|
445
|
+
#
|
446
|
+
# Every element could only be selected once.
|
447
|
+
#
|
448
|
+
# A sample of the same size of the vector is the vector itself.
|
478
449
|
|
479
|
-
|
480
|
-
if(@type!=:scale or !
|
450
|
+
def sample_without_replacement(sample=1)
|
451
|
+
if(@type!=:scale or !Statsample.has_gsl?)
|
481
452
|
raise ArgumentError, "Sample size couldn't be greater than n" if sample>@valid_data.size
|
482
453
|
out=[]
|
483
454
|
size=@valid_data.size
|
@@ -490,13 +461,13 @@ module Statsample
|
|
490
461
|
r = GSL::Rng.alloc(GSL::Rng::MT19937,rand(10000))
|
491
462
|
r.choose(@gsl, sample).to_a
|
492
463
|
end
|
493
|
-
|
494
|
-
|
495
|
-
|
496
|
-
|
497
|
-
|
498
|
-
|
499
|
-
|
464
|
+
end
|
465
|
+
# Retrieves number of cases which comply condition.
|
466
|
+
# If block given, retrieves number of instances where
|
467
|
+
# block returns true.
|
468
|
+
# If other values given, retrieves the frequency for
|
469
|
+
# this value.
|
470
|
+
def count(x=false)
|
500
471
|
if block_given?
|
501
472
|
r=@data.inject(0) {|s, i|
|
502
473
|
r=yield i
|
@@ -506,11 +477,11 @@ module Statsample
|
|
506
477
|
else
|
507
478
|
frequencies[x].nil? ? 0 : frequencies[x]
|
508
479
|
end
|
509
|
-
|
510
|
-
|
511
|
-
|
512
|
-
|
513
|
-
|
480
|
+
end
|
481
|
+
|
482
|
+
# Returns the database type for the vector, according to its content
|
483
|
+
|
484
|
+
def db_type(dbs='mysql')
|
514
485
|
# first, detect any character not number
|
515
486
|
if @data.find {|v| v.to_s=~/\d{2,2}-\d{2,2}-\d{4,4}/} or @data.find {|v| v.to_s=~/\d{4,4}-\d{2,2}-\d{2,2}/}
|
516
487
|
return "DATE"
|
@@ -521,43 +492,43 @@ module Statsample
|
|
521
492
|
else
|
522
493
|
return "INTEGER"
|
523
494
|
end
|
524
|
-
|
525
|
-
|
526
|
-
|
495
|
+
end
|
496
|
+
# Return true if all data is Date, "today" values or nil
|
497
|
+
def can_be_date?
|
527
498
|
if @data.find {|v|
|
528
499
|
!v.nil? and !v.is_a? Date and !v.is_a? Time and (v.is_a? String and !@today_values.include? v) and (v.is_a? String and !(v=~/\d{4,4}[-\/]\d{1,2}[-\/]\d{1,2}/))}
|
529
500
|
false
|
530
501
|
else
|
531
502
|
true
|
532
503
|
end
|
533
|
-
|
534
|
-
|
535
|
-
|
504
|
+
end
|
505
|
+
# Return true if all data is Numeric or nil
|
506
|
+
def can_be_scale?
|
536
507
|
if @data.find {|v| !v.nil? and !v.is_a? Numeric and !@missing_values.include? v}
|
537
508
|
false
|
538
509
|
else
|
539
510
|
true
|
540
511
|
end
|
541
|
-
|
542
|
-
|
543
|
-
|
512
|
+
end
|
513
|
+
|
514
|
+
def to_s
|
544
515
|
sprintf("Vector(type:%s, n:%d)[%s]",@type.to_s,@data.size, @data.collect{|d| d.nil? ? "nil":d}.join(","))
|
545
|
-
|
546
|
-
|
547
|
-
|
548
|
-
|
516
|
+
end
|
517
|
+
# Ugly name. Really, creates a Matrix (from the standard 'matrix' package) out of the vector's data.
|
518
|
+
# <tt>dir</tt> could be :horizontal or :vertical
|
519
|
+
def to_matrix(dir=:horizontal)
|
549
520
|
case dir
|
550
521
|
when :horizontal
|
551
522
|
Matrix[@data]
|
552
523
|
when :vertical
|
553
524
|
Matrix.columns([@data])
|
554
525
|
end
|
555
|
-
|
556
|
-
|
526
|
+
end
|
527
|
+
def inspect
|
557
528
|
self.to_s
|
558
|
-
|
559
|
-
|
560
|
-
|
529
|
+
end
|
530
|
+
# Retrieves unique values for data.
|
531
|
+
def factors
|
561
532
|
if @type==:scale
|
562
533
|
@scale_data.uniq.sort
|
563
534
|
elsif @type==:date
|
@@ -565,26 +536,26 @@ module Statsample
|
|
565
536
|
else
|
566
537
|
@valid_data.uniq.sort
|
567
538
|
end
|
568
|
-
|
569
|
-
|
539
|
+
end
|
540
|
+
if Statsample::STATSAMPLE__.respond_to?(:frequencies)
|
570
541
|
# Returns a hash with the distribution of frequencies for
|
571
542
|
# the sample
|
572
543
|
def frequencies
|
573
544
|
Statsample::STATSAMPLE__.frequencies(@valid_data)
|
574
545
|
end
|
575
|
-
|
546
|
+
else
|
576
547
|
def frequencies #:nodoc:
|
577
548
|
_frequencies
|
578
549
|
end
|
579
|
-
|
580
|
-
|
550
|
+
end
|
551
|
+
def _frequencies #:nodoc:
|
581
552
|
@valid_data.inject(Hash.new) {|a,x|
|
582
553
|
a[x]||=0
|
583
554
|
a[x]=a[x]+1
|
584
555
|
a
|
585
556
|
}
|
586
|
-
|
587
|
-
|
557
|
+
end
|
558
|
+
# Plot frequencies on a chart, using gnuplot
|
588
559
|
def plot_frequencies
|
589
560
|
require 'gnuplot'
|
590
561
|
x=[]
|
@@ -594,30 +565,30 @@ module Statsample
|
|
594
565
|
y.push(v)
|
595
566
|
}
|
596
567
|
Gnuplot.open do |gp|
|
597
|
-
|
598
|
-
|
599
|
-
|
600
|
-
|
601
|
-
|
602
|
-
|
603
|
-
|
604
|
-
|
605
|
-
|
606
|
-
|
607
|
-
end
|
568
|
+
Gnuplot::Plot.new( gp ) do |plot|
|
569
|
+
plot.boxwidth("0.9 absolute")
|
570
|
+
plot.yrange("[0:#{y.max}]")
|
571
|
+
plot.style("fill solid 1.00 border -1")
|
572
|
+
plot.set("xtics border in scale 1,0.5 nomirror rotate by -45 offset character 0, 0, 0")
|
573
|
+
plot.style("histogram")
|
574
|
+
plot.style("data histogram")
|
575
|
+
i=-1
|
576
|
+
plot.set("xtics","("+x.collect{|v| i+=1; sprintf("\"%s\" %d",v,i)}.join(",")+")")
|
577
|
+
plot.data << Gnuplot::DataSet.new( [y] ) do |ds|
|
608
578
|
end
|
609
579
|
end
|
610
|
-
|
580
|
+
end
|
581
|
+
|
611
582
|
end
|
612
|
-
|
613
|
-
|
583
|
+
|
584
|
+
|
614
585
|
# Returns the most frequent item.
|
615
586
|
def mode
|
616
|
-
|
587
|
+
frequencies.max{|a,b| a[1]<=>b[1]}[0]
|
617
588
|
end
|
618
589
|
# The number of items with valid data.
|
619
590
|
def n_valid
|
620
|
-
|
591
|
+
@valid_data.size
|
621
592
|
end
|
622
593
|
# Returns a hash with the distribution of proportions of
|
623
594
|
# the sample.
|
@@ -632,38 +603,38 @@ module Statsample
|
|
632
603
|
frequencies[v].quo(@valid_data.size)
|
633
604
|
end
|
634
605
|
def summary(out="")
|
635
|
-
|
636
|
-
|
637
|
-
|
638
|
-
|
639
|
-
|
640
|
-
|
641
|
-
|
642
|
-
|
643
|
-
|
644
|
-
|
645
|
-
|
646
|
-
|
647
|
-
|
648
|
-
|
649
|
-
|
650
|
-
|
651
|
-
|
606
|
+
out << sprintf("n valid:%d\n",n_valid)
|
607
|
+
out << sprintf("factors:%s\n",factors.join(","))
|
608
|
+
out << "mode:"+mode.to_s+"\n"
|
609
|
+
out << "Distribution:\n"
|
610
|
+
frequencies.sort.each{|k,v|
|
611
|
+
key=labels.has_key?(k) ? labels[k]:k
|
612
|
+
out << sprintf("%s : %s (%0.2f%%)\n",key,v, (v.quo(n_valid))*100)
|
613
|
+
}
|
614
|
+
if(@type==:ordinal)
|
615
|
+
out << "median:"+median.to_s+"\n"
|
616
|
+
end
|
617
|
+
if(@type==:scale)
|
618
|
+
out << "mean:"+mean.to_s+"\n"
|
619
|
+
out << "sd:"+sd.to_s+"\n"
|
620
|
+
|
621
|
+
end
|
622
|
+
out
|
652
623
|
end
|
653
624
|
|
654
625
|
# Variance of p, according to population size
|
655
626
|
def variance_proportion(n_poblation, v=1)
|
656
|
-
|
627
|
+
Statsample::proportion_variance_sample(self.proportion(v), @valid_data.size, n_poblation)
|
657
628
|
end
|
658
629
|
# Variance of p, according to population size
|
659
630
|
def variance_total(n_poblation, v=1)
|
660
|
-
|
631
|
+
Statsample::total_variance_sample(self.proportion(v), @valid_data.size, n_poblation)
|
661
632
|
end
|
662
633
|
def proportion_confidence_interval_t(n_poblation,margin=0.95,v=1)
|
663
|
-
|
634
|
+
Statsample::proportion_confidence_interval_t(proportion(v), @valid_data.size, n_poblation, margin)
|
664
635
|
end
|
665
636
|
def proportion_confidence_interval_z(n_poblation,margin=0.95,v=1)
|
666
|
-
|
637
|
+
Statsample::proportion_confidence_interval_z(proportion(v), @valid_data.size, n_poblation, margin)
|
667
638
|
end
|
668
639
|
|
669
640
|
self.instance_methods.find_all{|met| met=~/_slow$/}.each do |met|
|
@@ -672,20 +643,21 @@ module Statsample
|
|
672
643
|
alias_method met_or, met
|
673
644
|
end
|
674
645
|
end
|
675
|
-
|
676
|
-
|
677
|
-
|
678
|
-
|
646
|
+
|
647
|
+
######
|
648
|
+
### Ordinal Methods
|
649
|
+
######
|
650
|
+
|
679
651
|
# Return the value of the q-th percentile
|
680
652
|
def percentil(q)
|
681
|
-
|
682
|
-
|
683
|
-
|
684
|
-
|
685
|
-
|
686
|
-
|
687
|
-
|
688
|
-
|
653
|
+
check_type :ordinal
|
654
|
+
sorted=@valid_data.sort
|
655
|
+
v= (n_valid * q).quo(100)
|
656
|
+
if(v.to_i!=v)
|
657
|
+
sorted[v.to_i]
|
658
|
+
else
|
659
|
+
(sorted[(v-0.5).to_i].to_f + sorted[(v+0.5).to_i]).quo(2)
|
660
|
+
end
|
689
661
|
end
|
690
662
|
# Returns a ranked vector.
|
691
663
|
def ranked(type=:ordinal)
|
@@ -698,27 +670,28 @@ module Statsample
|
|
698
670
|
}
|
699
671
|
@data.collect {|c| r[c] }.to_vector(type)
|
700
672
|
end
|
701
|
-
|
702
|
-
|
703
|
-
|
704
|
-
|
705
|
-
|
706
|
-
|
707
|
-
|
708
|
-
|
673
|
+
# Return the median (50th percentile)
|
674
|
+
def median
|
675
|
+
check_type :ordinal
|
676
|
+
if Statsample.has_gsl? and @type==:scale
|
677
|
+
sorted=GSL::Vector.alloc(@scale_data.sort)
|
678
|
+
GSL::Stats::median_from_sorted_data(sorted)
|
679
|
+
else
|
680
|
+
percentil(50)
|
681
|
+
end
|
682
|
+
end
|
683
|
+
# Minimum value
|
684
|
+
def min
|
685
|
+
check_type :ordinal
|
686
|
+
@valid_data.min;
|
709
687
|
end
|
710
|
-
end
|
711
|
-
# Minimum value
|
712
|
-
def min;
|
713
|
-
check_type :ordinal
|
714
|
-
@valid_data.min;
|
715
|
-
end
|
716
688
|
# Maximum value
|
717
|
-
|
718
|
-
|
719
|
-
|
720
|
-
|
721
|
-
|
689
|
+
def max
|
690
|
+
check_type :ordinal
|
691
|
+
@valid_data.max;
|
692
|
+
end
|
693
|
+
|
694
|
+
def set_date_data
|
722
695
|
@date_data_with_nils=@data.collect do|x|
|
723
696
|
if x.is_a? Date
|
724
697
|
x
|
@@ -733,7 +706,8 @@ module Statsample
|
|
733
706
|
end
|
734
707
|
end
|
735
708
|
end
|
736
|
-
|
709
|
+
|
710
|
+
def set_scale_data
|
737
711
|
@scale_data=@valid_data.collect do|x|
|
738
712
|
if x.is_a? Numeric
|
739
713
|
x
|
@@ -743,12 +717,13 @@ module Statsample
|
|
743
717
|
x.to_f
|
744
718
|
end
|
745
719
|
end
|
746
|
-
if
|
720
|
+
if Statsample.has_gsl?
|
747
721
|
@gsl=GSL::Vector.alloc(@scale_data) if @scale_data.size>0
|
748
722
|
end
|
749
723
|
end
|
750
|
-
|
751
|
-
|
724
|
+
|
725
|
+
private :set_date_data, :set_scale_data
|
726
|
+
|
752
727
|
# The range of the data (max - min)
|
753
728
|
def range;
|
754
729
|
check_type :scale
|
@@ -788,7 +763,7 @@ module Statsample
|
|
788
763
|
squares.quo(n_valid) - m.square
|
789
764
|
end
|
790
765
|
|
791
|
-
|
766
|
+
|
792
767
|
# Population Standard deviation (denominator N)
|
793
768
|
def standard_deviation_population(m=nil)
|
794
769
|
check_type :scale
|
@@ -801,7 +776,7 @@ module Statsample
|
|
801
776
|
m||=mean
|
802
777
|
sum_of_squares(m).quo(n_valid - 1)
|
803
778
|
end
|
804
|
-
|
779
|
+
|
805
780
|
# Sample Standard deviation (denominator n-1)
|
806
781
|
|
807
782
|
def standard_deviation_sample(m=nil)
|
@@ -831,7 +806,7 @@ module Statsample
|
|
831
806
|
check_type :scale
|
832
807
|
@scale_data.inject(1){|a,x| a*x }
|
833
808
|
end
|
834
|
-
if
|
809
|
+
if Statsample.has_gsl?
|
835
810
|
%w{skew kurtosis variance_sample standard_deviation_sample variance_population standard_deviation_population mean sum}.each{|m|
|
836
811
|
m_nuevo=(m+"_slow").intern
|
837
812
|
alias_method m_nuevo, m.intern
|