statsample 0.6.5 → 0.6.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/History.txt +15 -0
- data/Manifest.txt +6 -0
- data/README.txt +30 -12
- data/Rakefile +91 -0
- data/demo/levene.rb +9 -0
- data/demo/multiple_regression.rb +1 -7
- data/demo/polychoric.rb +1 -0
- data/demo/principal_axis.rb +8 -0
- data/lib/distribution/f.rb +22 -22
- data/lib/spss.rb +99 -99
- data/lib/statsample/bivariate/polychoric.rb +32 -22
- data/lib/statsample/bivariate/tetrachoric.rb +212 -207
- data/lib/statsample/bivariate.rb +6 -6
- data/lib/statsample/codification.rb +65 -65
- data/lib/statsample/combination.rb +60 -59
- data/lib/statsample/converter/csv19.rb +12 -12
- data/lib/statsample/converters.rb +1 -1
- data/lib/statsample/dataset.rb +93 -36
- data/lib/statsample/dominanceanalysis/bootstrap.rb +66 -3
- data/lib/statsample/dominanceanalysis.rb +5 -6
- data/lib/statsample/factor/pca.rb +41 -11
- data/lib/statsample/factor/principalaxis.rb +105 -29
- data/lib/statsample/factor/rotation.rb +20 -3
- data/lib/statsample/factor.rb +1 -1
- data/lib/statsample/graph/gdchart.rb +13 -13
- data/lib/statsample/graph/svggraph.rb +166 -167
- data/lib/statsample/matrix.rb +22 -12
- data/lib/statsample/mle/logit.rb +3 -2
- data/lib/statsample/mle/probit.rb +7 -5
- data/lib/statsample/mle.rb +4 -2
- data/lib/statsample/multiset.rb +125 -124
- data/lib/statsample/permutation.rb +2 -1
- data/lib/statsample/regression/binomial/logit.rb +4 -3
- data/lib/statsample/regression/binomial/probit.rb +2 -1
- data/lib/statsample/regression/binomial.rb +62 -81
- data/lib/statsample/regression/multiple/baseengine.rb +1 -1
- data/lib/statsample/regression/multiple/gslengine.rb +1 -1
- data/lib/statsample/regression/multiple/matrixengine.rb +12 -6
- data/lib/statsample/regression/multiple.rb +15 -42
- data/lib/statsample/regression/simple.rb +93 -78
- data/lib/statsample/regression.rb +74 -2
- data/lib/statsample/reliability.rb +117 -120
- data/lib/statsample/srs.rb +156 -153
- data/lib/statsample/test/levene.rb +90 -0
- data/lib/statsample/test/umannwhitney.rb +25 -9
- data/lib/statsample/test.rb +2 -0
- data/lib/statsample/vector.rb +388 -413
- data/lib/statsample.rb +74 -30
- data/po/es/statsample.mo +0 -0
- data/test/test_bivariate.rb +5 -4
- data/test/test_combination.rb +1 -1
- data/test/test_dataset.rb +2 -2
- data/test/test_factor.rb +53 -6
- data/test/test_gsl.rb +1 -1
- data/test/test_mle.rb +1 -1
- data/test/test_regression.rb +18 -33
- data/test/test_statistics.rb +15 -33
- data/test/test_stest.rb +35 -0
- data/test/test_svg_graph.rb +2 -2
- data/test/test_vector.rb +331 -333
- metadata +38 -11
data/lib/statsample/vector.rb
CHANGED
@@ -10,127 +10,98 @@ class Array
|
|
10
10
|
Statsample::Vector.new(self,:scale,*args)
|
11
11
|
end
|
12
12
|
end
|
13
|
+
|
13
14
|
module Statsample
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
# matrix=Statsample.vector_cols_matrix(v1,v2)
|
19
|
-
def vector_cols_matrix(*vs)
|
20
|
-
# test
|
21
|
-
size=vs[0].size
|
22
|
-
vs.each{|v|
|
23
|
-
raise ArgumentError,"Arguments should be Vector" unless v.instance_of? Statsample::Vector
|
24
|
-
raise ArgumentError,"Vectors size should be the same" if v.size!=size
|
25
|
-
}
|
26
|
-
Matrix.rows((0...size).to_a.collect() {|i|
|
27
|
-
vs.collect{|v| v[i]}
|
28
|
-
})
|
29
|
-
end
|
30
|
-
end
|
31
|
-
# Returns a duplicate of the input vectors, without missing data
|
32
|
-
# for any of the vectors.
|
33
|
-
#
|
34
|
-
# a=[1,2,3,6,7,nil,3,5].to_scale
|
35
|
-
# b=[nil,nil,5,6,4,5,10,2].to_scale
|
36
|
-
# c=[2,4,6,7,4,5,6,7].to_scale
|
37
|
-
# a2,b2,c2=Statsample.only_valid(a,b,c)
|
38
|
-
# => [#<Statsample::Scale:0xb748c8c8 @data=[3, 6, 7, 3, 5]>,
|
39
|
-
# #<Statsample::Scale:0xb748c814 @data=[5, 6, 4, 10, 2]>,
|
40
|
-
# #<Statsample::Scale:0xb748c760 @data=[6, 7, 4, 6, 7]>]
|
41
|
-
#
|
42
|
-
def self.only_valid(*vs)
|
43
|
-
i=1
|
44
|
-
h=vs.inject({}) {|a,v| a["v#{i}"]=v;i+=1;a}
|
45
|
-
ds=Statsample::Dataset.new(h).dup_only_valid
|
46
|
-
ds.vectors.values
|
47
|
-
end
|
48
|
-
|
49
|
-
class Vector
|
50
|
-
include Enumerable
|
51
|
-
include Writable
|
52
|
-
DEFAULT_OPTIONS={
|
53
|
-
:missing_values=>[],
|
54
|
-
:today_values=>['NOW','TODAY', :NOW, :TODAY],
|
55
|
-
:labels=>{}
|
56
|
-
}
|
57
|
-
# Level of measurement. Could be :nominal, :ordinal or :scale
|
58
|
-
attr_reader :type
|
59
|
-
# Original data.
|
60
|
-
attr_reader :data
|
61
|
-
# Valid data. Equal to data, minus values assigned as missing values
|
62
|
-
attr_reader :valid_data
|
63
|
-
# Array of values considered as missing. Nil is a missing value, by default
|
64
|
-
attr_reader :missing_values
|
65
|
-
# Array of values considered as "Today", with date type. "NOW", "TODAY", :NOW and :TODAY are 'today' values, by default
|
66
|
-
attr_reader :today_values
|
67
|
-
# Missing values array
|
68
|
-
attr_reader :missing_data
|
69
|
-
# Original data, with all missing values replaced by nils
|
70
|
-
attr_reader :data_with_nils
|
71
|
-
# Date date, with all missing values replaced by nils
|
72
|
-
attr_reader :date_data_with_nils
|
73
|
-
# GSL Object, only available with rbgsl extension and type==:scale
|
74
|
-
attr_reader :gsl
|
75
|
-
# Change label for specific values
|
76
|
-
attr_accessor :labels
|
77
|
-
# Creates a new Vector object.
|
78
|
-
# [data] Array of data.
|
79
|
-
# [type] Level of meausurement. See Vector#type
|
80
|
-
# [opts] Options
|
81
|
-
# [:missing_values] Array of missing values. See Vector#missing_values
|
82
|
-
# [:today_values] Array of 'today' values. See Vector#today_values
|
83
|
-
# [:labels] Labels for data values
|
84
|
-
#
|
15
|
+
|
16
|
+
# Collection of values on one dimension. Works as a column on a Spreadsheet.
|
17
|
+
#
|
18
|
+
# == Usage
|
85
19
|
# The fast way to create a vector uses Array.to_vector or Array.to_scale.
|
86
20
|
#
|
87
21
|
# v=[1,2,3,4].to_vector(:scale)
|
88
22
|
# v=[1,2,3,4].to_scale
|
89
|
-
#
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
|
23
|
+
#
|
24
|
+
class Vector
|
25
|
+
include Enumerable
|
26
|
+
include Writable
|
27
|
+
# DEFAULT OPTIONS
|
28
|
+
DEFAULT_OPTIONS={
|
29
|
+
:missing_values=>[],
|
30
|
+
:today_values=>['NOW','TODAY', :NOW, :TODAY],
|
31
|
+
:labels=>{}
|
32
|
+
}
|
33
|
+
# Level of measurement. Could be :nominal, :ordinal or :scale
|
34
|
+
attr_reader :type
|
35
|
+
# Original data.
|
36
|
+
attr_reader :data
|
37
|
+
# Valid data. Equal to data, minus values assigned as missing values
|
38
|
+
attr_reader :valid_data
|
39
|
+
# Array of values considered as missing. Nil is a missing value, by default
|
40
|
+
attr_reader :missing_values
|
41
|
+
# Array of values considered as "Today", with date type. "NOW", "TODAY", :NOW and :TODAY are 'today' values, by default
|
42
|
+
attr_reader :today_values
|
43
|
+
# Missing values array
|
44
|
+
attr_reader :missing_data
|
45
|
+
# Original data, with all missing values replaced by nils
|
46
|
+
attr_reader :data_with_nils
|
47
|
+
# Date date, with all missing values replaced by nils
|
48
|
+
attr_reader :date_data_with_nils
|
49
|
+
# GSL Object, only available with rbgsl extension and type==:scale
|
50
|
+
attr_reader :gsl
|
51
|
+
# Change label for specific values
|
52
|
+
attr_accessor :labels
|
53
|
+
#
|
54
|
+
# Creates a new Vector object.
|
55
|
+
# [data] Array of data.
|
56
|
+
# [type] Level of meausurement. See Vector#type
|
57
|
+
# [opts] Options
|
58
|
+
# [:missing_values] Array of missing values. See Vector#missing_values
|
59
|
+
# [:today_values] Array of 'today' values. See Vector#today_values
|
60
|
+
# [:labels] Labels for data values
|
61
|
+
#
|
62
|
+
def initialize(data=[], type=:nominal, opts=Hash.new)
|
63
|
+
raise "Data should be an array" unless data.is_a? Array
|
64
|
+
@data=data
|
65
|
+
@type=type
|
66
|
+
opts=DEFAULT_OPTIONS.merge(opts)
|
67
|
+
@missing_values=opts[:missing_values]
|
68
|
+
@labels=opts[:labels]
|
69
|
+
@today_values=opts[:today_values]
|
70
|
+
@valid_data=[]
|
71
|
+
@data_with_nils=[]
|
72
|
+
@date_data_with_nils=[]
|
73
|
+
@missing_data=[]
|
74
|
+
@has_missing_data=nil
|
75
|
+
@scale_data=nil
|
76
|
+
set_valid_data_intern
|
77
|
+
self.type=type
|
78
|
+
end
|
79
|
+
# Creates a duplicate of the Vector.
|
80
|
+
# Note: data, missing_values and labels are duplicated, so
|
81
|
+
# changes on original vector doesn't propages to copies.
|
82
|
+
def dup
|
112
83
|
Vector.new(@data.dup,@type, :missing_values => @missing_values.dup, :labels => @labels.dup)
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
84
|
+
end
|
85
|
+
# Returns an empty duplicate of the vector. Maintains the type,
|
86
|
+
# missing values and labels.
|
87
|
+
def dup_empty
|
117
88
|
Vector.new([],@type, :missing_values => @missing_values.dup, :labels => @labels.dup)
|
118
|
-
|
119
|
-
|
120
|
-
|
89
|
+
end
|
90
|
+
# Raises an exception if type of vector is inferior to t type
|
91
|
+
def check_type(t)
|
121
92
|
raise NoMethodError if (t==:scale and @type!=:scale) or (t==:ordinal and @type==:nominal) or (t==:date)
|
122
|
-
|
123
|
-
|
124
|
-
|
125
|
-
|
126
|
-
|
127
|
-
|
93
|
+
end
|
94
|
+
private :check_type
|
95
|
+
|
96
|
+
# Return a vector usign the standarized values for data
|
97
|
+
# with sd with denominator N
|
98
|
+
def vector_standarized_pop
|
128
99
|
vector_standarized(true)
|
129
|
-
|
130
|
-
|
131
|
-
|
132
|
-
|
133
|
-
|
100
|
+
end
|
101
|
+
# Return a vector usign the standarized values for data
|
102
|
+
# with sd with denominator n-1
|
103
|
+
|
104
|
+
def vector_standarized(use_population=false)
|
134
105
|
raise "Should be a scale" unless @type==:scale
|
135
106
|
m=mean
|
136
107
|
sd=use_population ? sdp : sds
|
@@ -141,11 +112,10 @@ module Statsample
|
|
141
112
|
nil
|
142
113
|
end
|
143
114
|
}.to_vector(:scale)
|
144
|
-
|
145
|
-
|
146
|
-
|
147
|
-
|
148
|
-
def box_cox_transformation(lambda) # :nodoc:
|
115
|
+
end
|
116
|
+
alias_method :standarized, :vector_standarized
|
117
|
+
|
118
|
+
def box_cox_transformation(lambda) # :nodoc:
|
149
119
|
raise "Should be a scale" unless @type==:scale
|
150
120
|
@data_with_nils.collect{|x|
|
151
121
|
if !x.nil?
|
@@ -158,42 +128,42 @@ module Statsample
|
|
158
128
|
nil
|
159
129
|
end
|
160
130
|
}.to_vector(:scale)
|
161
|
-
|
162
|
-
|
163
|
-
|
164
|
-
|
165
|
-
|
131
|
+
end
|
132
|
+
|
133
|
+
# Vector equality.
|
134
|
+
# Two vector will be the same if their data, missing values, type, labels are equals
|
135
|
+
def ==(v2)
|
166
136
|
raise TypeError,"Argument should be a Vector" unless v2.instance_of? Statsample::Vector
|
167
137
|
@data==v2.data and @missing_values==v2.missing_values and @type==v2.type and @labels=v2.labels
|
168
|
-
|
169
|
-
|
170
|
-
|
138
|
+
end
|
139
|
+
|
140
|
+
def _dump(i) # :nodoc:
|
171
141
|
Marshal.dump({'data'=>@data,'missing_values'=>@missing_values, 'labels'=>@labels, 'type'=>@type})
|
172
|
-
|
173
|
-
|
174
|
-
|
142
|
+
end
|
143
|
+
|
144
|
+
def self._load(data) # :nodoc:
|
175
145
|
h=Marshal.load(data)
|
176
146
|
Vector.new(h['data'], h['type'],:missing_values=> h['missing_values'], :labels=>h['labels'])
|
177
|
-
|
178
|
-
|
179
|
-
|
180
|
-
|
147
|
+
end
|
148
|
+
# Returns a new vector, with data modified by block.
|
149
|
+
# Equivalent to create a Vector after #collect on data
|
150
|
+
def recode
|
181
151
|
@data.collect{|x|
|
182
152
|
yield x
|
183
153
|
}.to_vector(@type)
|
184
|
-
|
185
|
-
|
186
|
-
|
187
|
-
|
154
|
+
end
|
155
|
+
# Modifies current vector, with data modified by block.
|
156
|
+
# Equivalent to #collect! on @data
|
157
|
+
def recode!
|
188
158
|
@data.collect!{|x|
|
189
159
|
yield x
|
190
160
|
}
|
191
161
|
set_valid_data
|
192
|
-
|
193
|
-
|
194
|
-
|
195
|
-
|
196
|
-
|
162
|
+
end
|
163
|
+
# Dicotomize the vector with 0 and 1, based on lowest value
|
164
|
+
# If parameter if defined, this value and lower
|
165
|
+
# will be 0 and higher, 1
|
166
|
+
def dichotomize(low=nil)
|
197
167
|
fs=factors
|
198
168
|
low||=factors.min
|
199
169
|
@data_with_nils.collect{|x|
|
@@ -205,44 +175,44 @@ module Statsample
|
|
205
175
|
0
|
206
176
|
end
|
207
177
|
}.to_scale
|
208
|
-
|
209
|
-
|
210
|
-
|
211
|
-
|
212
|
-
|
178
|
+
end
|
179
|
+
# Iterate on each item.
|
180
|
+
# Equivalent to
|
181
|
+
# @data.each{|x| yield x}
|
182
|
+
def each
|
213
183
|
@data.each{|x| yield(x) }
|
214
|
-
|
215
|
-
|
216
|
-
|
217
|
-
|
184
|
+
end
|
185
|
+
|
186
|
+
# Iterate on each item, retrieving index
|
187
|
+
def each_index
|
218
188
|
(0...@data.size).each {|i|
|
219
189
|
yield(i)
|
220
190
|
}
|
221
|
-
|
222
|
-
|
223
|
-
|
224
|
-
|
225
|
-
|
226
|
-
|
191
|
+
end
|
192
|
+
# Add a value at the end of the vector.
|
193
|
+
# If second argument set to false, you should update the Vector usign
|
194
|
+
# Vector.set_valid_data at the end of your insertion cycle
|
195
|
+
#
|
196
|
+
def add(v,update_valid=true)
|
227
197
|
@data.push(v)
|
228
198
|
set_valid_data if update_valid
|
229
|
-
|
230
|
-
|
231
|
-
|
232
|
-
|
233
|
-
|
234
|
-
|
235
|
-
|
236
|
-
|
237
|
-
|
238
|
-
|
239
|
-
|
240
|
-
|
241
|
-
|
242
|
-
|
243
|
-
|
244
|
-
|
245
|
-
|
199
|
+
end
|
200
|
+
# Update valid_data, missing_data, data_with_nils and gsl
|
201
|
+
# at the end of an insertion.
|
202
|
+
#
|
203
|
+
# Use after Vector.add(v,false)
|
204
|
+
# Usage:
|
205
|
+
# v=Statsample::Vector.new
|
206
|
+
# v.add(2,false)
|
207
|
+
# v.add(4,false)
|
208
|
+
# v.data
|
209
|
+
# => [2,3]
|
210
|
+
# v.valid_data
|
211
|
+
# => []
|
212
|
+
# v.set_valid_data
|
213
|
+
# v.valid_data
|
214
|
+
# => [2,3]
|
215
|
+
def set_valid_data
|
246
216
|
@valid_data.clear
|
247
217
|
@missing_data.clear
|
248
218
|
@data_with_nils.clear
|
@@ -251,18 +221,18 @@ module Statsample
|
|
251
221
|
set_valid_data_intern
|
252
222
|
set_scale_data if(@type==:scale)
|
253
223
|
set_date_data if(@type==:date)
|
254
|
-
|
255
|
-
|
256
|
-
|
224
|
+
end
|
225
|
+
|
226
|
+
if Statsample::STATSAMPLE__.respond_to?(:set_valid_data_intern)
|
257
227
|
def set_valid_data_intern #:nodoc:
|
258
228
|
Statsample::STATSAMPLE__.set_valid_data_intern(self)
|
259
229
|
end
|
260
|
-
|
230
|
+
else
|
261
231
|
def set_valid_data_intern #:nodoc:
|
262
232
|
_set_valid_data_intern
|
263
233
|
end
|
264
|
-
|
265
|
-
|
234
|
+
end
|
235
|
+
def _set_valid_data_intern #:nodoc:
|
266
236
|
@data.each do |n|
|
267
237
|
if is_valid? n
|
268
238
|
@valid_data.push(n)
|
@@ -273,19 +243,19 @@ module Statsample
|
|
273
243
|
end
|
274
244
|
end
|
275
245
|
@has_missing_data=@missing_data.size>0
|
276
|
-
|
277
|
-
|
278
|
-
|
279
|
-
|
246
|
+
end
|
247
|
+
|
248
|
+
# Retrieves true if data has one o more missing values
|
249
|
+
def has_missing_data?
|
280
250
|
@has_missing_data
|
281
|
-
|
282
|
-
|
283
|
-
|
284
|
-
|
251
|
+
end
|
252
|
+
# Retrieves label for value x. Retrieves x if
|
253
|
+
# no label defined.
|
254
|
+
def labeling(x)
|
285
255
|
@labels.has_key?(x) ? @labels[x].to_s : x.to_s
|
286
|
-
|
287
|
-
|
288
|
-
|
256
|
+
end
|
257
|
+
# Returns a Vector with data with labels replaced by the label.
|
258
|
+
def vector_labeled
|
289
259
|
d=@data.collect{|x|
|
290
260
|
if @labels.has_key? x
|
291
261
|
@labels[x]
|
@@ -294,69 +264,70 @@ module Statsample
|
|
294
264
|
end
|
295
265
|
}
|
296
266
|
Vector.new(d,@type)
|
297
|
-
|
298
|
-
|
299
|
-
|
267
|
+
end
|
268
|
+
# Size of total data
|
269
|
+
def size
|
300
270
|
@data.size
|
301
|
-
|
302
|
-
|
303
|
-
|
304
|
-
|
305
|
-
|
271
|
+
end
|
272
|
+
alias_method :n, :size
|
273
|
+
|
274
|
+
# Retrieves i element of data
|
275
|
+
def [](i)
|
306
276
|
@data[i]
|
307
|
-
|
308
|
-
|
309
|
-
|
310
|
-
|
277
|
+
end
|
278
|
+
# Set i element of data.
|
279
|
+
# Note: Use set_valid_data if you include missing values
|
280
|
+
def []=(i,v)
|
311
281
|
@data[i]=v
|
312
|
-
|
313
|
-
|
314
|
-
|
282
|
+
end
|
283
|
+
# Return true if a value is valid (not nil and not included on missing values)
|
284
|
+
def is_valid?(x)
|
315
285
|
!(x.nil? or @missing_values.include? x)
|
316
|
-
|
317
|
-
|
318
|
-
|
319
|
-
|
320
|
-
|
286
|
+
end
|
287
|
+
# Set missing_values.
|
288
|
+
# if update_valid = false, you should use
|
289
|
+
# set_valid_data after all changes
|
290
|
+
def missing_values=(vals)
|
321
291
|
@missing_values = vals
|
322
292
|
set_valid_data
|
323
|
-
|
324
|
-
|
293
|
+
end
|
294
|
+
# Set data considered as "today" on data vectors
|
295
|
+
def today_values=(vals)
|
325
296
|
@today_values = vals
|
326
297
|
set_valid_data
|
327
|
-
|
328
|
-
|
329
|
-
|
298
|
+
end
|
299
|
+
# Set level of measurement.
|
300
|
+
def type=(t)
|
330
301
|
@type=t
|
331
302
|
set_scale_data if(t==:scale)
|
332
303
|
set_date_data if (t==:date)
|
333
|
-
|
334
|
-
|
304
|
+
end
|
305
|
+
def to_a
|
335
306
|
@data.dup
|
336
|
-
|
337
|
-
|
338
|
-
|
339
|
-
|
340
|
-
|
341
|
-
|
342
|
-
|
343
|
-
|
344
|
-
|
307
|
+
end
|
308
|
+
alias_method :to_ary, :to_a
|
309
|
+
|
310
|
+
# Vector sum.
|
311
|
+
# - If v is a scalar, add this value to all elements
|
312
|
+
# - If v is a Array or a Vector, should be of the same size of this vector
|
313
|
+
# every item of this vector will be added to the value of the
|
314
|
+
# item at the same position on the other vector
|
315
|
+
def +(v)
|
345
316
|
_vector_ari("+",v)
|
346
|
-
|
347
|
-
|
348
|
-
|
349
|
-
|
350
|
-
|
351
|
-
|
352
|
-
|
353
|
-
|
354
|
-
|
317
|
+
end
|
318
|
+
# Vector rest.
|
319
|
+
# - If v is a scalar, rest this value to all elements
|
320
|
+
# - If v is a Array or a Vector, should be of the same
|
321
|
+
# size of this vector
|
322
|
+
# every item of this vector will be rested to the value of the
|
323
|
+
# item at the same position on the other vector
|
324
|
+
|
325
|
+
def -(v)
|
355
326
|
_vector_ari("-",v)
|
356
|
-
|
357
|
-
|
358
|
-
|
359
|
-
|
327
|
+
end
|
328
|
+
# Reports all values that doesn't comply with a condition.
|
329
|
+
# Returns a hash with the index of data and the invalid data.
|
330
|
+
def verify
|
360
331
|
h={}
|
361
332
|
(0...@data.size).to_a.each{|i|
|
362
333
|
if !(yield @data[i])
|
@@ -364,8 +335,8 @@ module Statsample
|
|
364
335
|
end
|
365
336
|
}
|
366
337
|
h
|
367
|
-
|
368
|
-
|
338
|
+
end
|
339
|
+
def _vector_ari(method,v) # :nodoc:
|
369
340
|
if(v.is_a? Vector or v.is_a? Array)
|
370
341
|
if v.size==@data.size
|
371
342
|
# i=0
|
@@ -395,13 +366,13 @@ module Statsample
|
|
395
366
|
raise TypeError,"You should pass a scalar or a array/vector"
|
396
367
|
end
|
397
368
|
|
398
|
-
|
399
|
-
|
400
|
-
|
401
|
-
|
402
|
-
|
403
|
-
|
404
|
-
|
369
|
+
end
|
370
|
+
# Return an array with the data splitted by a separator.
|
371
|
+
# a=Vector.new(["a,b","c,d","a,b","d"])
|
372
|
+
# a.splitted
|
373
|
+
# =>
|
374
|
+
# [["a","b"],["c","d"],["a","b"],["d"]]
|
375
|
+
def splitted(sep=Statsample::SPLIT_TOKEN)
|
405
376
|
@data.collect{|x|
|
406
377
|
if x.nil?
|
407
378
|
nil
|
@@ -411,73 +382,73 @@ module Statsample
|
|
411
382
|
[x]
|
412
383
|
end
|
413
384
|
}
|
414
|
-
|
415
|
-
|
416
|
-
|
417
|
-
|
418
|
-
|
419
|
-
|
420
|
-
|
421
|
-
|
422
|
-
|
423
|
-
|
424
|
-
|
425
|
-
|
426
|
-
|
427
|
-
|
428
|
-
|
385
|
+
end
|
386
|
+
# Returns a hash of Vectors, defined by the different values
|
387
|
+
# defined on the fields
|
388
|
+
# Example:
|
389
|
+
#
|
390
|
+
# a=Vector.new(["a,b","c,d","a,b"])
|
391
|
+
# a.split_by_separator
|
392
|
+
# => {"a"=>#<Statsample::Type::Nominal:0x7f2dbcc09d88
|
393
|
+
# @data=[1, 0, 1]>,
|
394
|
+
# "b"=>#<Statsample::Type::Nominal:0x7f2dbcc09c48
|
395
|
+
# @data=[1, 1, 0]>,
|
396
|
+
# "c"=>#<Statsample::Type::Nominal:0x7f2dbcc09b08
|
397
|
+
# @data=[0, 1, 1]>}
|
398
|
+
#
|
399
|
+
def split_by_separator(sep=Statsample::SPLIT_TOKEN)
|
429
400
|
split_data=splitted(sep)
|
430
401
|
factors=split_data.flatten.uniq.compact
|
431
402
|
out=factors.inject({}) {|a,x|
|
432
|
-
|
433
|
-
|
403
|
+
a[x]=[]
|
404
|
+
a
|
434
405
|
}
|
435
|
-
split_data.each
|
436
|
-
|
437
|
-
|
438
|
-
|
439
|
-
}
|
440
|
-
else
|
441
|
-
factors.each{|f|
|
442
|
-
out[f].push(r.include?(f) ? 1:0)
|
443
|
-
}
|
406
|
+
split_data.each do |r|
|
407
|
+
if r.nil?
|
408
|
+
factors.each do |f|
|
409
|
+
out[f].push(nil)
|
444
410
|
end
|
445
|
-
|
411
|
+
else
|
412
|
+
factors.each do |f|
|
413
|
+
out[f].push(r.include?(f) ? 1:0)
|
414
|
+
end
|
415
|
+
end
|
416
|
+
end
|
446
417
|
out.inject({}){|s,v|
|
447
|
-
|
448
|
-
|
418
|
+
s[v[0]]=Vector.new(v[1],:nominal)
|
419
|
+
s
|
449
420
|
}
|
450
|
-
|
451
|
-
|
452
|
-
|
453
|
-
|
454
|
-
|
455
|
-
|
456
|
-
|
457
|
-
|
458
|
-
|
459
|
-
|
460
|
-
|
461
|
-
|
462
|
-
|
463
|
-
|
464
|
-
if(@type!=:scale or !
|
421
|
+
end
|
422
|
+
def split_by_separator_freq(sep=Statsample::SPLIT_TOKEN)
|
423
|
+
split_by_separator(sep).inject({}) {|a,v|
|
424
|
+
a[v[0]]=v[1].inject {|s,x| s+x.to_i}
|
425
|
+
a
|
426
|
+
}
|
427
|
+
end
|
428
|
+
|
429
|
+
# Returns an random sample of size n, with replacement,
|
430
|
+
# only with valid data.
|
431
|
+
#
|
432
|
+
# In all the trails, every item have the same probability
|
433
|
+
# of been selected.
|
434
|
+
def sample_with_replacement(sample=1)
|
435
|
+
if(@type!=:scale or !Statsample.has_gsl?)
|
465
436
|
vds=@valid_data.size
|
466
437
|
(0...sample).collect{ @valid_data[rand(vds)] }
|
467
438
|
else
|
468
439
|
r = GSL::Rng.alloc(GSL::Rng::MT19937,rand(10000))
|
469
440
|
r.sample(@gsl, sample).to_a
|
470
441
|
end
|
471
|
-
|
472
|
-
|
473
|
-
|
474
|
-
|
475
|
-
|
476
|
-
|
477
|
-
|
442
|
+
end
|
443
|
+
# Returns an random sample of size n, without replacement,
|
444
|
+
# only with valid data.
|
445
|
+
#
|
446
|
+
# Every element could only be selected once.
|
447
|
+
#
|
448
|
+
# A sample of the same size of the vector is the vector itself.
|
478
449
|
|
479
|
-
|
480
|
-
if(@type!=:scale or !
|
450
|
+
def sample_without_replacement(sample=1)
|
451
|
+
if(@type!=:scale or !Statsample.has_gsl?)
|
481
452
|
raise ArgumentError, "Sample size couldn't be greater than n" if sample>@valid_data.size
|
482
453
|
out=[]
|
483
454
|
size=@valid_data.size
|
@@ -490,13 +461,13 @@ module Statsample
|
|
490
461
|
r = GSL::Rng.alloc(GSL::Rng::MT19937,rand(10000))
|
491
462
|
r.choose(@gsl, sample).to_a
|
492
463
|
end
|
493
|
-
|
494
|
-
|
495
|
-
|
496
|
-
|
497
|
-
|
498
|
-
|
499
|
-
|
464
|
+
end
|
465
|
+
# Retrieves number of cases which comply condition.
|
466
|
+
# If block given, retrieves number of instances where
|
467
|
+
# block returns true.
|
468
|
+
# If other values given, retrieves the frequency for
|
469
|
+
# this value.
|
470
|
+
def count(x=false)
|
500
471
|
if block_given?
|
501
472
|
r=@data.inject(0) {|s, i|
|
502
473
|
r=yield i
|
@@ -506,11 +477,11 @@ module Statsample
|
|
506
477
|
else
|
507
478
|
frequencies[x].nil? ? 0 : frequencies[x]
|
508
479
|
end
|
509
|
-
|
510
|
-
|
511
|
-
|
512
|
-
|
513
|
-
|
480
|
+
end
|
481
|
+
|
482
|
+
# Returns the database type for the vector, according to its content
|
483
|
+
|
484
|
+
def db_type(dbs='mysql')
|
514
485
|
# first, detect any character not number
|
515
486
|
if @data.find {|v| v.to_s=~/\d{2,2}-\d{2,2}-\d{4,4}/} or @data.find {|v| v.to_s=~/\d{4,4}-\d{2,2}-\d{2,2}/}
|
516
487
|
return "DATE"
|
@@ -521,43 +492,43 @@ module Statsample
|
|
521
492
|
else
|
522
493
|
return "INTEGER"
|
523
494
|
end
|
524
|
-
|
525
|
-
|
526
|
-
|
495
|
+
end
|
496
|
+
# Return true if all data is Date, "today" values or nil
|
497
|
+
def can_be_date?
|
527
498
|
if @data.find {|v|
|
528
499
|
!v.nil? and !v.is_a? Date and !v.is_a? Time and (v.is_a? String and !@today_values.include? v) and (v.is_a? String and !(v=~/\d{4,4}[-\/]\d{1,2}[-\/]\d{1,2}/))}
|
529
500
|
false
|
530
501
|
else
|
531
502
|
true
|
532
503
|
end
|
533
|
-
|
534
|
-
|
535
|
-
|
504
|
+
end
|
505
|
+
# Return true if all data is Numeric or nil
|
506
|
+
def can_be_scale?
|
536
507
|
if @data.find {|v| !v.nil? and !v.is_a? Numeric and !@missing_values.include? v}
|
537
508
|
false
|
538
509
|
else
|
539
510
|
true
|
540
511
|
end
|
541
|
-
|
542
|
-
|
543
|
-
|
512
|
+
end
|
513
|
+
|
514
|
+
def to_s
|
544
515
|
sprintf("Vector(type:%s, n:%d)[%s]",@type.to_s,@data.size, @data.collect{|d| d.nil? ? "nil":d}.join(","))
|
545
|
-
|
546
|
-
|
547
|
-
|
548
|
-
|
516
|
+
end
|
517
|
+
# Ugly name. Really, create a Vector for standard 'matrix' package.
|
518
|
+
# <tt>dir</tt> could be :horizontal or :vertical
|
519
|
+
def to_matrix(dir=:horizontal)
|
549
520
|
case dir
|
550
521
|
when :horizontal
|
551
522
|
Matrix[@data]
|
552
523
|
when :vertical
|
553
524
|
Matrix.columns([@data])
|
554
525
|
end
|
555
|
-
|
556
|
-
|
526
|
+
end
|
527
|
+
def inspect
|
557
528
|
self.to_s
|
558
|
-
|
559
|
-
|
560
|
-
|
529
|
+
end
|
530
|
+
# Retrieves uniques values for data.
|
531
|
+
def factors
|
561
532
|
if @type==:scale
|
562
533
|
@scale_data.uniq.sort
|
563
534
|
elsif @type==:date
|
@@ -565,26 +536,26 @@ module Statsample
|
|
565
536
|
else
|
566
537
|
@valid_data.uniq.sort
|
567
538
|
end
|
568
|
-
|
569
|
-
|
539
|
+
end
|
540
|
+
if Statsample::STATSAMPLE__.respond_to?(:frequencies)
|
570
541
|
# Returns a hash with the distribution of frecuencies for
|
571
542
|
# the sample
|
572
543
|
def frequencies
|
573
544
|
Statsample::STATSAMPLE__.frequencies(@valid_data)
|
574
545
|
end
|
575
|
-
|
546
|
+
else
|
576
547
|
def frequencies #:nodoc:
|
577
548
|
_frequencies
|
578
549
|
end
|
579
|
-
|
580
|
-
|
550
|
+
end
|
551
|
+
def _frequencies #:nodoc:
|
581
552
|
@valid_data.inject(Hash.new) {|a,x|
|
582
553
|
a[x]||=0
|
583
554
|
a[x]=a[x]+1
|
584
555
|
a
|
585
556
|
}
|
586
|
-
|
587
|
-
|
557
|
+
end
|
558
|
+
# Plot frequencies on a chart, using gnuplot
|
588
559
|
def plot_frequencies
|
589
560
|
require 'gnuplot'
|
590
561
|
x=[]
|
@@ -594,30 +565,30 @@ module Statsample
|
|
594
565
|
y.push(v)
|
595
566
|
}
|
596
567
|
Gnuplot.open do |gp|
|
597
|
-
|
598
|
-
|
599
|
-
|
600
|
-
|
601
|
-
|
602
|
-
|
603
|
-
|
604
|
-
|
605
|
-
|
606
|
-
|
607
|
-
end
|
568
|
+
Gnuplot::Plot.new( gp ) do |plot|
|
569
|
+
plot.boxwidth("0.9 absolute")
|
570
|
+
plot.yrange("[0:#{y.max}]")
|
571
|
+
plot.style("fill solid 1.00 border -1")
|
572
|
+
plot.set("xtics border in scale 1,0.5 nomirror rotate by -45 offset character 0, 0, 0")
|
573
|
+
plot.style("histogram")
|
574
|
+
plot.style("data histogram")
|
575
|
+
i=-1
|
576
|
+
plot.set("xtics","("+x.collect{|v| i+=1; sprintf("\"%s\" %d",v,i)}.join(",")+")")
|
577
|
+
plot.data << Gnuplot::DataSet.new( [y] ) do |ds|
|
608
578
|
end
|
609
579
|
end
|
610
|
-
|
580
|
+
end
|
581
|
+
|
611
582
|
end
|
612
|
-
|
613
|
-
|
583
|
+
|
584
|
+
|
614
585
|
# Returns the most frequent item.
|
615
586
|
def mode
|
616
|
-
|
587
|
+
frequencies.max{|a,b| a[1]<=>b[1]}[0]
|
617
588
|
end
|
618
589
|
# The numbers of item with valid data.
|
619
590
|
def n_valid
|
620
|
-
|
591
|
+
@valid_data.size
|
621
592
|
end
|
622
593
|
# Returns a hash with the distribution of proportions of
|
623
594
|
# the sample.
|
@@ -632,38 +603,38 @@ module Statsample
|
|
632
603
|
frequencies[v].quo(@valid_data.size)
|
633
604
|
end
|
634
605
|
def summary(out="")
|
635
|
-
|
636
|
-
|
637
|
-
|
638
|
-
|
639
|
-
|
640
|
-
|
641
|
-
|
642
|
-
|
643
|
-
|
644
|
-
|
645
|
-
|
646
|
-
|
647
|
-
|
648
|
-
|
649
|
-
|
650
|
-
|
651
|
-
|
606
|
+
out << sprintf("n valid:%d\n",n_valid)
|
607
|
+
out << sprintf("factors:%s\n",factors.join(","))
|
608
|
+
out << "mode:"+mode.to_s+"\n"
|
609
|
+
out << "Distribution:\n"
|
610
|
+
frequencies.sort.each{|k,v|
|
611
|
+
key=labels.has_key?(k) ? labels[k]:k
|
612
|
+
out << sprintf("%s : %s (%0.2f%%)\n",key,v, (v.quo(n_valid))*100)
|
613
|
+
}
|
614
|
+
if(@type==:ordinal)
|
615
|
+
out << "median:"+median.to_s+"\n"
|
616
|
+
end
|
617
|
+
if(@type==:scale)
|
618
|
+
out << "mean:"+mean.to_s+"\n"
|
619
|
+
out << "sd:"+sd.to_s+"\n"
|
620
|
+
|
621
|
+
end
|
622
|
+
out
|
652
623
|
end
|
653
624
|
|
654
625
|
# Variance of p, according to poblation size
|
655
626
|
def variance_proportion(n_poblation, v=1)
|
656
|
-
|
627
|
+
Statsample::proportion_variance_sample(self.proportion(v), @valid_data.size, n_poblation)
|
657
628
|
end
|
658
629
|
# Variance of p, according to poblation size
|
659
630
|
def variance_total(n_poblation, v=1)
|
660
|
-
|
631
|
+
Statsample::total_variance_sample(self.proportion(v), @valid_data.size, n_poblation)
|
661
632
|
end
|
662
633
|
def proportion_confidence_interval_t(n_poblation,margin=0.95,v=1)
|
663
|
-
|
634
|
+
Statsample::proportion_confidence_interval_t(proportion(v), @valid_data.size, n_poblation, margin)
|
664
635
|
end
|
665
636
|
def proportion_confidence_interval_z(n_poblation,margin=0.95,v=1)
|
666
|
-
|
637
|
+
Statsample::proportion_confidence_interval_z(proportion(v), @valid_data.size, n_poblation, margin)
|
667
638
|
end
|
668
639
|
|
669
640
|
self.instance_methods.find_all{|met| met=~/_slow$/}.each do |met|
|
@@ -672,20 +643,21 @@ module Statsample
|
|
672
643
|
alias_method met_or, met
|
673
644
|
end
|
674
645
|
end
|
675
|
-
|
676
|
-
|
677
|
-
|
678
|
-
|
646
|
+
|
647
|
+
######
|
648
|
+
### Ordinal Methods
|
649
|
+
######
|
650
|
+
|
679
651
|
# Return the value of the percentil q
|
680
652
|
def percentil(q)
|
681
|
-
|
682
|
-
|
683
|
-
|
684
|
-
|
685
|
-
|
686
|
-
|
687
|
-
|
688
|
-
|
653
|
+
check_type :ordinal
|
654
|
+
sorted=@valid_data.sort
|
655
|
+
v= (n_valid * q).quo(100)
|
656
|
+
if(v.to_i!=v)
|
657
|
+
sorted[v.to_i]
|
658
|
+
else
|
659
|
+
(sorted[(v-0.5).to_i].to_f + sorted[(v+0.5).to_i]).quo(2)
|
660
|
+
end
|
689
661
|
end
|
690
662
|
# Returns a ranked vector.
|
691
663
|
def ranked(type=:ordinal)
|
@@ -698,27 +670,28 @@ module Statsample
|
|
698
670
|
}
|
699
671
|
@data.collect {|c| r[c] }.to_vector(type)
|
700
672
|
end
|
701
|
-
|
702
|
-
|
703
|
-
|
704
|
-
|
705
|
-
|
706
|
-
|
707
|
-
|
708
|
-
|
673
|
+
# Return the median (percentil 50)
|
674
|
+
def median
|
675
|
+
check_type :ordinal
|
676
|
+
if Statsample.has_gsl? and @type==:scale
|
677
|
+
sorted=GSL::Vector.alloc(@scale_data.sort)
|
678
|
+
GSL::Stats::median_from_sorted_data(sorted)
|
679
|
+
else
|
680
|
+
percentil(50)
|
681
|
+
end
|
682
|
+
end
|
683
|
+
# Minimun value
|
684
|
+
def min
|
685
|
+
check_type :ordinal
|
686
|
+
@valid_data.min;
|
709
687
|
end
|
710
|
-
end
|
711
|
-
# Minimun value
|
712
|
-
def min;
|
713
|
-
check_type :ordinal
|
714
|
-
@valid_data.min;
|
715
|
-
end
|
716
688
|
# Maximum value
|
717
|
-
|
718
|
-
|
719
|
-
|
720
|
-
|
721
|
-
|
689
|
+
def max
|
690
|
+
check_type :ordinal
|
691
|
+
@valid_data.max;
|
692
|
+
end
|
693
|
+
|
694
|
+
def set_date_data
|
722
695
|
@date_data_with_nils=@data.collect do|x|
|
723
696
|
if x.is_a? Date
|
724
697
|
x
|
@@ -733,7 +706,8 @@ module Statsample
|
|
733
706
|
end
|
734
707
|
end
|
735
708
|
end
|
736
|
-
|
709
|
+
|
710
|
+
def set_scale_data
|
737
711
|
@scale_data=@valid_data.collect do|x|
|
738
712
|
if x.is_a? Numeric
|
739
713
|
x
|
@@ -743,12 +717,13 @@ module Statsample
|
|
743
717
|
x.to_f
|
744
718
|
end
|
745
719
|
end
|
746
|
-
if
|
720
|
+
if Statsample.has_gsl?
|
747
721
|
@gsl=GSL::Vector.alloc(@scale_data) if @scale_data.size>0
|
748
722
|
end
|
749
723
|
end
|
750
|
-
|
751
|
-
|
724
|
+
|
725
|
+
private :set_date_data, :set_scale_data
|
726
|
+
|
752
727
|
# The range of the data (max - min)
|
753
728
|
def range;
|
754
729
|
check_type :scale
|
@@ -788,7 +763,7 @@ module Statsample
|
|
788
763
|
squares.quo(n_valid) - m.square
|
789
764
|
end
|
790
765
|
|
791
|
-
|
766
|
+
|
792
767
|
# Population Standard deviation (denominator N)
|
793
768
|
def standard_deviation_population(m=nil)
|
794
769
|
check_type :scale
|
@@ -801,7 +776,7 @@ module Statsample
|
|
801
776
|
m||=mean
|
802
777
|
sum_of_squares(m).quo(n_valid - 1)
|
803
778
|
end
|
804
|
-
|
779
|
+
|
805
780
|
# Sample Standard deviation (denominator n-1)
|
806
781
|
|
807
782
|
def standard_deviation_sample(m=nil)
|
@@ -831,7 +806,7 @@ module Statsample
|
|
831
806
|
check_type :scale
|
832
807
|
@scale_data.inject(1){|a,x| a*x }
|
833
808
|
end
|
834
|
-
if
|
809
|
+
if Statsample.has_gsl?
|
835
810
|
%w{skew kurtosis variance_sample standard_deviation_sample variance_population standard_deviation_population mean sum}.each{|m|
|
836
811
|
m_nuevo=(m+"_slow").intern
|
837
812
|
alias_method m_nuevo, m.intern
|