statsample 0.4.1 → 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -34,8 +34,8 @@ class GslEngine < BaseEngine
34
34
  j=0
35
35
  @ds.fields.each{|f|
36
36
  if f!=@y_var
37
- @ds[f].each_index{|i|
38
- max_deps.set(i,j,@ds[f][i])
37
+ @ds[f].each_index{|i1|
38
+ max_deps.set(i1,j,@ds[f][i1])
39
39
  }
40
40
  columns.push(@ds[f].to_a)
41
41
  @fields.push(f)
@@ -1,33 +1,39 @@
1
1
  class Array
2
+ # Creates a new Statsample::Vector object
3
+ # Argument should be equal to Vector.new
2
4
  def to_vector(*args)
3
5
  Statsample::Vector.new(self,*args)
4
6
  end
7
+ # Creates a new Statsample::Vector object of type :scale
8
+ def to_scale(*args)
9
+ Statsample::Vector.new(self,:scale,*args)
10
+ end
5
11
  end
6
12
 
7
13
  module Statsample
8
14
  class << self
9
- # Create a matrix using vectors as columns.
10
- # Use:
11
- #
12
- # matrix=Statsample.vector_cols_matrix(v1,v2)
13
- def vector_cols_matrix(*vs)
14
- # test
15
- size=vs[0].size
16
- vs.each{|v|
17
- raise ArgumentError,"Arguments should be Vector" unless v.instance_of? Statsample::Vector
18
- raise ArgumentError,"Vectors size should be the same" if v.size!=size
19
- }
20
- Matrix.rows((0...size).to_a.collect() {|i|
21
- vs.collect{|v| v[i]}
22
- })
23
- end
24
- end
15
+ # Create a matrix using vectors as columns.
16
+ # Use:
17
+ #
18
+ # matrix=Statsample.vector_cols_matrix(v1,v2)
19
+ def vector_cols_matrix(*vs)
20
+ # test
21
+ size=vs[0].size
22
+ vs.each{|v|
23
+ raise ArgumentError,"Arguments should be Vector" unless v.instance_of? Statsample::Vector
24
+ raise ArgumentError,"Vectors size should be the same" if v.size!=size
25
+ }
26
+ Matrix.rows((0...size).to_a.collect() {|i|
27
+ vs.collect{|v| v[i]}
28
+ })
29
+ end
30
+ end
25
31
  # Returns a duplicate of the input vectors, without missing data
26
32
  # for any of the vectors.
27
33
  #
28
- # a=[1,2,3,6,7,nil,3,5].to_vector(:scale)
29
- # b=[nil,nil,5,6,4,5,10,2].to_vector(:scale)
30
- # c=[2,4,6,7,4,5,6,7].to_vector(:scale)
34
+ # a=[1,2,3,6,7,nil,3,5].to_scale
35
+ # b=[nil,nil,5,6,4,5,10,2].to_scale
36
+ # c=[2,4,6,7,4,5,6,7].to_scale
31
37
  # a2,b2,c2=Statsample.only_valid(a,b,c)
32
38
  # => [#<Statsample::Scale:0xb748c8c8 @data=[3, 6, 7, 3, 5]>,
33
39
  # #<Statsample::Scale:0xb748c814 @data=[5, 6, 4, 10, 2]>,
@@ -40,8 +46,13 @@ module Statsample
40
46
  ds.vectors.values
41
47
  end
42
48
 
43
- class Vector
49
+ class Vector
44
50
  include Enumerable
51
+ include Writable
52
+ DEFAULT_OPTIONS={
53
+ :missing_values=>[],
54
+ :labels=>{}
55
+ }
45
56
  # Level of measurement. Could be :nominal, :ordinal or :scale
46
57
  attr_reader :type
47
58
  # Original data.
@@ -60,788 +71,795 @@ class Vector
60
71
  attr_accessor :labels
61
72
  # Creates a new Vector object.
62
73
  # [data] Array of data.
63
- # [type] Level of meausurement. See Vector#type
64
- # [missing_values] Array of missing values. See Vector#missing_values
65
- # [labels] Labels for data values
74
+ # [type] Level of meausurement. See Vector#type
75
+ # [opts] Options
76
+ # [:missing_values] Array of missing values. See Vector#missing_values
77
+ # [:labels] Labels for data values
66
78
  #
67
- # The fast way to create a vector uses Array#to_vector. Remember
68
- # to include as the first argument the level of measurement
79
+ # The fast way to create a vector uses Array.to_vector or Array.to_scale.
69
80
  #
70
81
  # v=[1,2,3,4].to_vector(:scale)
82
+ # v=[1,2,3,4].to_scale
71
83
  #
72
- def initialize(data=[], t=:nominal,missing_values=[],labels={})
73
- raise "Data should be an array" unless data.is_a? Array
74
- @data=data
75
- @missing_values=missing_values
76
- @labels=labels
77
- @type=t
78
- @valid_data=[]
79
- @data_with_nils=[]
80
- @missing_data=[]
81
- @has_missing_data=nil
82
- @scale_data=nil
83
- set_valid_data_intern
84
- self.type=t
85
- end
86
- # Creates a duplicate of the Vector.
87
- # Note: data, missing_values and labels are duplicated, so
88
- # changes on original vector doesn't propages to copies.
89
- def dup
90
- Vector.new(@data.dup,@type,@missing_values.dup,@labels.dup)
91
- end
92
- # Returns an empty duplicate of the vector. Maintains the type,
93
- # missing values and labels.
94
- def dup_empty
95
- Vector.new([],@type,@missing_values.dup,@labels.dup)
96
- end
97
- # Raises an exception if type of vector is inferior to t type
98
- def check_type(t)
99
- raise NoMethodError if (t==:scale and @type!=:scale) or (t==:ordinal and @type==:nominal)
100
- end
101
- private :check_type
102
-
103
- # Return a vector usign the standarized values for data
104
- # with sd with denominator N
105
- def vector_standarized_pop
106
- vector_standarized(true)
107
- end
108
- # Return a vector usign the standarized values for data
109
- # with sd with denominator n-1
110
-
111
- def vector_standarized(use_population=false)
112
- raise "Should be a scale" unless @type==:scale
113
- m=mean
114
- sd=use_population ? sdp : sds
115
- @data_with_nils.collect{|x|
116
- if !x.nil?
117
- (x.to_f - m).quo(sd)
118
- else
119
- nil
120
- end
121
- }.to_vector(:scale)
122
- end
123
-
124
- alias_method :standarized, :vector_standarized
125
-
126
- def box_cox_transformation(lambda) # :nodoc:
127
- raise "Should be a scale" unless @type==:scale
128
- @data_with_nils.collect{|x|
84
+
85
+ def initialize(data=[], t=:nominal, opts=Hash.new)
86
+ raise "Data should be an array" unless data.is_a? Array
87
+ @data=data
88
+ @type=t
89
+ opts=DEFAULT_OPTIONS.merge(opts)
90
+ @missing_values=opts[:missing_values]
91
+ @labels=opts[:labels]
92
+ @valid_data=[]
93
+ @data_with_nils=[]
94
+ @missing_data=[]
95
+ @has_missing_data=nil
96
+ @scale_data=nil
97
+ set_valid_data_intern
98
+ self.type=t
99
+ end
100
+ # Creates a duplicate of the Vector.
101
+ # Note: data, missing_values and labels are duplicated, so
102
+ # changes on original vector doesn't propages to copies.
103
+ def dup
104
+ Vector.new(@data.dup,@type, :missing_values => @missing_values.dup, :labels => @labels.dup)
105
+ end
106
+ # Returns an empty duplicate of the vector. Maintains the type,
107
+ # missing values and labels.
108
+ def dup_empty
109
+ Vector.new([],@type, :missing_values => @missing_values.dup, :labels => @labels.dup)
110
+ end
111
+ # Raises an exception if type of vector is inferior to t type
112
+ def check_type(t)
113
+ raise NoMethodError if (t==:scale and @type!=:scale) or (t==:ordinal and @type==:nominal)
114
+ end
115
+ private :check_type
116
+
117
+ # Return a vector usign the standarized values for data
118
+ # with sd with denominator N
119
+ def vector_standarized_pop
120
+ vector_standarized(true)
121
+ end
122
+ # Return a vector usign the standarized values for data
123
+ # with sd with denominator n-1
124
+
125
+ def vector_standarized(use_population=false)
126
+ raise "Should be a scale" unless @type==:scale
127
+ m=mean
128
+ sd=use_population ? sdp : sds
129
+ @data_with_nils.collect{|x|
129
130
  if !x.nil?
130
- if(lambda==0)
131
- Math.log(x)
132
- else
133
- (x**lambda-1).quo(lambda)
134
- end
131
+ (x.to_f - m).quo(sd)
135
132
  else
136
133
  nil
137
134
  end
138
- }.to_vector(:scale)
135
+ }.to_vector(:scale)
136
+ end
137
+
138
+ alias_method :standarized, :vector_standarized
139
+
140
+ def box_cox_transformation(lambda) # :nodoc:
141
+ raise "Should be a scale" unless @type==:scale
142
+ @data_with_nils.collect{|x|
143
+ if !x.nil?
144
+ if(lambda==0)
145
+ Math.log(x)
146
+ else
147
+ (x**lambda-1).quo(lambda)
148
+ end
149
+ else
150
+ nil
139
151
  end
140
-
141
- # Vector equality.
142
- # Two vector will be the same if their data, missing values, type, labels are equals
143
- def ==(v2)
144
- raise TypeError,"Argument should be a Vector" unless v2.instance_of? Statsample::Vector
145
- @data==v2.data and @missing_values==v2.missing_values and @type==v2.type and @labels=v2.labels
152
+ }.to_vector(:scale)
153
+ end
154
+
155
+ # Vector equality.
156
+ # Two vector will be the same if their data, missing values, type, labels are equals
157
+ def ==(v2)
158
+ raise TypeError,"Argument should be a Vector" unless v2.instance_of? Statsample::Vector
159
+ @data==v2.data and @missing_values==v2.missing_values and @type==v2.type and @labels=v2.labels
160
+ end
161
+
162
+ def _dump(i) # :nodoc:
163
+ Marshal.dump({'data'=>@data,'missing_values'=>@missing_values, 'labels'=>@labels, 'type'=>@type})
164
+ end
165
+
166
+ def self._load(data) # :nodoc:
167
+ h=Marshal.load(data)
168
+ Vector.new(h['data'], h['type'],:missing_values=> h['missing_values'], :labels=>h['labels'])
169
+ end
170
+ # Returns a new vector, with data modified by block.
171
+ # Equivalent to create a Vector after #collect on data
172
+ def recode
173
+ @data.collect{|x|
174
+ yield x
175
+ }.to_vector(@type)
176
+ end
177
+ # Modifies current vector, with data modified by block.
178
+ # Equivalent to #collect! on @data
179
+ def recode!
180
+ @data.collect!{|x|
181
+ yield x
182
+ }
183
+ set_valid_data
184
+ end
185
+ # Iterate on each item.
186
+ # Equivalent to
187
+ # @data.each{|x| yield x}
188
+ def each
189
+ @data.each{|x| yield(x) }
190
+ end
191
+
192
+ # Iterate on each item, retrieving index
193
+
194
+ def each_index
195
+ (0...@data.size).each {|i|
196
+ yield(i)
197
+ }
198
+ end
199
+ # Add a value at the end of the vector.
200
+ # If second argument set to false, you should update the Vector usign
201
+ # Vector.set_valid_data at the end of your insertion cycle
202
+ #
203
+ def add(v,update_valid=true)
204
+ @data.push(v)
205
+ set_valid_data if update_valid
206
+ end
207
+ # Update valid_data, missing_data, data_with_nils and gsl
208
+ # at the end of an insertion.
209
+ #
210
+ # Use after Vector.add(v,false)
211
+ # Usage:
212
+ # v=Statsample::Vector.new
213
+ # v.add(2,false)
214
+ # v.add(4,false)
215
+ # v.data
216
+ # => [2,3]
217
+ # v.valid_data
218
+ # => []
219
+ # v.set_valid_data
220
+ # v.valid_data
221
+ # => [2,3]
222
+ def set_valid_data
223
+ @valid_data.clear
224
+ @missing_data.clear
225
+ @data_with_nils.clear
226
+ @gsl=nil
227
+ set_valid_data_intern
228
+ set_scale_data if(@type==:scale)
229
+ end
230
+
231
+ if Statsample::STATSAMPLE__.respond_to?(:set_valid_data_intern)
232
+ def set_valid_data_intern #:nodoc:
233
+ Statsample::STATSAMPLE__.set_valid_data_intern(self)
234
+ end
235
+ else
236
+ def set_valid_data_intern #:nodoc:
237
+ _set_valid_data_intern
238
+ end
239
+ end
240
+ def _set_valid_data_intern #:nodoc:
241
+ @data.each do |n|
242
+ if is_valid? n
243
+ @valid_data.push(n)
244
+ @data_with_nils.push(n)
245
+ else
246
+ @data_with_nils.push(nil)
247
+ @missing_data.push(n)
248
+ end
146
249
  end
147
-
148
- def _dump(i) # :nodoc:
149
- Marshal.dump({'data'=>@data,'missing_values'=>@missing_values, 'labels'=>@labels, 'type'=>@type})
250
+ @has_missing_data=@missing_data.size>0
251
+ end
252
+
253
+ # Retrieves true if data has one o more missing values
254
+ def has_missing_data?
255
+ @has_missing_data
256
+ end
257
+ # Retrieves label for value x. Retrieves x if
258
+ # no label defined.
259
+ def labeling(x)
260
+ @labels.has_key?(x) ? @labels[x].to_s : x.to_s
261
+ end
262
+ # Returns a Vector with data with labels replaced by the label.
263
+ def vector_labeled
264
+ d=@data.collect{|x|
265
+ if @labels.has_key? x
266
+ @labels[x]
267
+ else
268
+ x
269
+ end
270
+ }
271
+ Vector.new(d,@type)
272
+ end
273
+ # Size of total data
274
+ def size
275
+ @data.size
276
+ end
277
+ alias_method :n, :size
278
+
279
+ # Retrieves i element of data
280
+ def [](i)
281
+ @data[i]
282
+ end
283
+ # Set i element of data.
284
+ # Note: Use set_valid_data if you include missing values
285
+ def []=(i,v)
286
+ @data[i]=v
287
+ end
288
+ # Return true if a value is valid (not nil and not included on missing values)
289
+ def is_valid?(x)
290
+ !(x.nil? or @missing_values.include? x)
291
+ end
292
+ # Set missing_values.
293
+ # if update_valid = false, you should use
294
+ # set_valid_data after all changes
295
+ def missing_values=(vals)
296
+ @missing_values = vals
297
+ set_valid_data
298
+ end
299
+ # Set level of measurement.
300
+ def type=(t)
301
+ @type=t
302
+ set_scale_data if(t==:scale)
303
+ end
304
+ def to_a
305
+ @data.dup
306
+ end
307
+ alias_method :to_ary, :to_a
308
+
309
+ # Vector sum.
310
+ # - If v is a scalar, add this value to all elements
311
+ # - If v is a Array or a Vector, should be of the same size of this vector
312
+ # every item of this vector will be added to the value of the
313
+ # item at the same position on the other vector
314
+ def +(v)
315
+ _vector_ari("+",v)
316
+ end
317
+ # Vector rest.
318
+ # - If v is a scalar, rest this value to all elements
319
+ # - If v is a Array or a Vector, should be of the same
320
+ # size of this vector
321
+ # every item of this vector will be rested to the value of the
322
+ # item at the same position on the other vector
323
+
324
+ def -(v)
325
+ _vector_ari("-",v)
326
+ end
327
+ # Reports all values that doesn't comply with a condition.
328
+ # Returns a hash with the index of data and the invalid data.
329
+ def verify
330
+ h={}
331
+ (0...@data.size).to_a.each{|i|
332
+ if !(yield @data[i])
333
+ h[i]=@data[i]
334
+ end
335
+ }
336
+ h
337
+ end
338
+ def _vector_ari(method,v) # :nodoc:
339
+ if(v.is_a? Vector or v.is_a? Array)
340
+ if v.size==@data.size
341
+ # i=0
342
+ sum=[]
343
+ 0.upto(v.size-1) {|i|
344
+ if((v.is_a? Vector and v.is_valid?(v[i]) and is_valid?(@data[i])) or (v.is_a? Array and !v[i].nil? and !data[i].nil?))
345
+ sum.push(@data[i].send(method,v[i]))
346
+ else
347
+ sum.push(nil)
348
+ end
349
+ }
350
+ Statsample::Vector.new(sum)
351
+ else
352
+ raise ArgumentError, "The array/vector parameter should be of the same size of the original vector"
353
+ end
354
+ elsif(v.respond_to? method )
355
+ Statsample::Vector.new(
356
+ @data.collect {|x|
357
+ if(!x.nil?)
358
+ x.send(method,v)
359
+ else
360
+ nil
361
+ end
362
+ }
363
+ )
364
+ else
365
+ raise TypeError,"You should pass a scalar or a array/vector"
150
366
  end
151
367
 
152
- def self._load(data) # :nodoc:
153
- h=Marshal.load(data)
154
- Vector.new(h['data'], h['type'], h['missing_values'], h['labels'])
155
- end
156
- # Returns a new vector, with data modified by block.
157
- # Equivalent to create a Vector after #collect on data
158
- def recode
159
- @data.collect{|x|
160
- yield x
161
- }.to_vector(@type)
162
- end
163
- # Modifies current vector, with data modified by block.
164
- # Equivalent to #collect! on @data
165
- def recode!
166
- @data.collect!{|x|
167
- yield x
368
+ end
369
+ # Return an array with the data splitted by a separator.
370
+ # a=Vector.new(["a,b","c,d","a,b","d"])
371
+ # a.splitted
372
+ # =>
373
+ # [["a","b"],["c","d"],["a","b"],["d"]]
374
+ def splitted(sep=Statsample::SPLIT_TOKEN)
375
+ @data.collect{|x|
376
+ if x.nil?
377
+ nil
378
+ elsif (x.respond_to? :split)
379
+ x.split(sep)
380
+ else
381
+ [x]
382
+ end
383
+ }
384
+ end
385
+ # Returns a hash of Vectors, defined by the different values
386
+ # defined on the fields
387
+ # Example:
388
+ #
389
+ # a=Vector.new(["a,b","c,d","a,b"])
390
+ # a.split_by_separator
391
+ # => {"a"=>#<Statsample::Type::Nominal:0x7f2dbcc09d88
392
+ # @data=[1, 0, 1]>,
393
+ # "b"=>#<Statsample::Type::Nominal:0x7f2dbcc09c48
394
+ # @data=[1, 1, 0]>,
395
+ # "c"=>#<Statsample::Type::Nominal:0x7f2dbcc09b08
396
+ # @data=[0, 1, 1]>}
397
+ #
398
+ def split_by_separator(sep=Statsample::SPLIT_TOKEN)
399
+ split_data=splitted(sep)
400
+ factors=split_data.flatten.uniq.compact
401
+ out=factors.inject({}) {|a,x|
402
+ a[x]=[]
403
+ a
404
+ }
405
+ split_data.each{|r|
406
+ if r.nil?
407
+ factors.each{|f|
408
+ out[f].push(nil)
409
+ }
410
+ else
411
+ factors.each{|f|
412
+ out[f].push(r.include?(f) ? 1:0)
168
413
  }
169
- set_valid_data
170
- end
171
- # Iterate on each item.
172
- # Equivalent to
173
- # @data.each{|x| yield x}
174
- def each
175
- @data.each{|x| yield(x) }
414
+ end
415
+ }
416
+ out.inject({}){|s,v|
417
+ s[v[0]]=Vector.new(v[1],:nominal)
418
+ s
419
+ }
420
+ end
421
+ def split_by_separator_freq(sep=Statsample::SPLIT_TOKEN)
422
+ split_by_separator(sep).inject({}) {|a,v|
423
+ a[v[0]]=v[1].inject {|s,x| s+x.to_i}
424
+ a
425
+ }
426
+ end
427
+
428
+ # Returns an random sample of size n, with replacement,
429
+ # only with valid data.
430
+ #
431
+ # In all the trails, every item have the same probability
432
+ # of been selected.
433
+ def sample_with_replacement(sample=1)
434
+ if(@type!=:scale or !HAS_GSL)
435
+ vds=@valid_data.size
436
+ (0...sample).collect{ @valid_data[rand(vds)] }
437
+ else
438
+ r = GSL::Rng.alloc(GSL::Rng::MT19937,rand(10000))
439
+ r.sample(@gsl, sample).to_a
176
440
  end
441
+ end
442
+ # Returns an random sample of size n, without replacement,
443
+ # only with valid data.
444
+ #
445
+ # Every element could only be selected once.
446
+ #
447
+ # A sample of the same size of the vector is the vector itself.
177
448
 
178
- # Iterate on each item, retrieving index
179
-
180
- def each_index
181
- (0...@data.size).each {|i|
182
- yield(i)
449
+ def sample_without_replacement(sample=1)
450
+ if(@type!=:scale or !HAS_GSL)
451
+ raise ArgumentError, "Sample size couldn't be greater than n" if sample>@valid_data.size
452
+ out=[]
453
+ size=@valid_data.size
454
+ while out.size<sample
455
+ value=rand(size)
456
+ out.push(value) if !out.include?value
457
+ end
458
+ out.collect{|i|@data[i]}
459
+ else
460
+ r = GSL::Rng.alloc(GSL::Rng::MT19937,rand(10000))
461
+ r.choose(@gsl, sample).to_a
462
+ end
463
+ end
464
+ # Retrieves number of cases which comply condition.
465
+ # If block given, retrieves number of instances where
466
+ # block returns true.
467
+ # If other values given, retrieves the frequency for
468
+ # this value.
469
+ def count(x=false)
470
+ if block_given?
471
+ r=@data.inject(0) {|s, i|
472
+ r=yield i
473
+ s+(r ? 1 : 0)
183
474
  }
475
+ r.nil? ? 0 : r
476
+ else
477
+ frequencies[x].nil? ? 0 : frequencies[x]
184
478
  end
185
- # Add a value at the end of the vector.
186
- # If second argument set to false, you should update the Vector usign
187
- # Vector.set_valid_data at the end of your insertion cycle
188
- #
189
- def add(v,update_valid=true)
190
- @data.push(v)
191
- set_valid_data if update_valid
479
+ end
480
+
481
+ # Returns the database type for the vector, according to its content
482
+
483
+ def db_type(dbs='mysql')
484
+ # first, detect any character not number
485
+ if @data.find {|v| v.to_s=~/\d{2,2}-\d{2,2}-\d{4,4}/} or @data.find {|v| v.to_s=~/\d{4,4}-\d{2,2}-\d{2,2}/}
486
+ return "DATE"
487
+ elsif @data.find {|v| v.to_s=~/[^0-9e.-]/ }
488
+ return "VARCHAR (255)"
489
+ elsif @data.find {|v| v.to_s=~/\./}
490
+ return "DOUBLE"
491
+ else
492
+ return "INTEGER"
192
493
  end
193
- # Update valid_data, missing_data, data_with_nils and gsl
194
- # at the end of an insertion.
195
- #
196
- # Use after Vector.add(v,false)
197
- # Usage:
198
- # v=Statsample::Vector.new
199
- # v.add(2,false)
200
- # v.add(4,false)
201
- # v.data
202
- # => [2,3]
203
- # v.valid_data
204
- # => []
205
- # v.set_valid_data
206
- # v.valid_data
207
- # => [2,3]
208
- def set_valid_data
209
- @valid_data.clear
210
- @missing_data.clear
211
- @data_with_nils.clear
212
- @gsl=nil
213
- set_valid_data_intern
214
- set_scale_data if(@type==:scale)
215
- end
216
-
217
- if Statsample::STATSAMPLE__.respond_to?(:set_valid_data_intern)
218
- def set_valid_data_intern #:nodoc:
219
- Statsample::STATSAMPLE__.set_valid_data_intern(self)
220
- end
494
+ end
495
+ # Return true if all data is Numeric or nil
496
+ def can_be_scale?
497
+ if @data.find {|v| !v.nil? and !v.is_a? Numeric}
498
+ false
221
499
  else
222
- def set_valid_data_intern #:nodoc:
223
- _set_valid_data_intern
224
- end
500
+ true
225
501
  end
226
- def _set_valid_data_intern #:nodoc:
227
- @data.each do |n|
228
- if is_valid? n
229
- @valid_data.push(n)
230
- @data_with_nils.push(n)
231
- else
232
- @data_with_nils.push(nil)
233
- @missing_data.push(n)
502
+ end
503
+
504
+ def to_s
505
+ sprintf("Vector(type:%s, n:%d)[%s]",@type.to_s,@data.size, @data.collect{|d| d.nil? ? "nil":d}.join(","))
506
+ end
507
+ # Ugly name. Really, create a Vector for standard 'matrix' package.
508
+ # <tt>dir</tt> could be :horizontal or :vertical
509
+ def to_matrix(dir=:horizontal)
510
+ case dir
511
+ when :horizontal
512
+ Matrix[@data]
513
+ when :vertical
514
+ Matrix.columns([@data])
515
+ end
516
+ end
517
+ def inspect
518
+ self.to_s
519
+ end
520
+ # Retrieves uniques values for data.
521
+ def factors
522
+ if @type==:scale
523
+
524
+ @scale_data.uniq.sort
525
+ else
526
+ @valid_data.uniq.sort
527
+ end
528
+ end
529
+ if Statsample::STATSAMPLE__.respond_to?(:frequencies)
530
+ # Returns a hash with the distribution of frecuencies for
531
+ # the sample
532
+ def frequencies
533
+ Statsample::STATSAMPLE__.frequencies(@valid_data)
534
+ end
535
+ else
536
+ def frequencies #:nodoc:
537
+ _frequencies
538
+ end
539
+ end
540
+ def _frequencies #:nodoc:
541
+ @valid_data.inject(Hash.new) {|a,x|
542
+ a[x]||=0
543
+ a[x]=a[x]+1
544
+ a
545
+ }
546
+ end
547
+ # Plot frequencies on a chart, using gnuplot
548
+ def plot_frequencies
549
+ require 'gnuplot'
550
+ x=[]
551
+ y=[]
552
+ self.frequencies.sort.each{|k,v|
553
+ x.push(k)
554
+ y.push(v)
555
+ }
556
+ Gnuplot.open do |gp|
557
+ Gnuplot::Plot.new( gp ) do |plot|
558
+ plot.boxwidth("0.9 absolute")
559
+ plot.yrange("[0:#{y.max}]")
560
+ plot.style("fill solid 1.00 border -1")
561
+ plot.set("xtics border in scale 1,0.5 nomirror rotate by -45 offset character 0, 0, 0")
562
+ plot.style("histogram")
563
+ plot.style("data histogram")
564
+ i=-1
565
+ plot.set("xtics","("+x.collect{|v| i+=1; sprintf("\"%s\" %d",v,i)}.join(",")+")")
566
+ plot.data << Gnuplot::DataSet.new( [y] ) do |ds|
567
+ end
234
568
  end
235
569
  end
236
- @has_missing_data=@missing_data.size>0
237
- end
238
570
 
239
- # Retrieves true if data has one o more missing values
240
- def has_missing_data?
241
- @has_missing_data
242
- end
243
- # Retrieves label for value x. Retrieves x if
244
- # no label defined.
245
- def labeling(x)
246
- @labels.has_key?(x) ? @labels[x].to_s : x.to_s
247
- end
248
- # Returns a Vector with data with labels replaced by the label.
249
- def vector_labeled
250
- d=@data.collect{|x|
251
- if @labels.has_key? x
252
- @labels[x]
253
- else
254
- x
255
- end
256
- }
257
- Vector.new(d,@type)
258
571
  end
259
- # Size of total data
260
- def size
261
- @data.size
262
- end
263
- alias_method :n, :size
264
572
 
265
- # Retrieves i element of data
266
- def [](i)
267
- @data[i]
268
- end
269
- # Set i element of data.
270
- # Note: Use set_valid_data if you include missing values
271
- def []=(i,v)
272
- @data[i]=v
273
- end
274
- # Return true if a value is valid (not nil and not included on missing values)
275
- def is_valid?(x)
276
- !(x.nil? or @missing_values.include? x)
277
- end
278
- # Set missing_values
279
- def missing_values=(vals)
280
- @missing_values = vals
281
- set_valid_data
282
- end
283
- # Set level of measurement.
284
- def type=(t)
285
- @type=t
286
- set_scale_data if(t==:scale)
287
- end
288
- def to_a
289
- @data.dup
290
- end
291
- alias_method :to_ary, :to_a
292
-
293
- # Vector sum.
294
- # - If v is a scalar, add this value to all elements
295
- # - If v is a Array or a Vector, should be of the same size of this vector
296
- # every item of this vector will be added to the value of the
297
- # item at the same position on the other vector
298
- def +(v)
299
- _vector_ari("+",v)
300
- end
301
- # Vector rest.
302
- # - If v is a scalar, rest this value to all elements
303
- # - If v is a Array or a Vector, should be of the same
304
- # size of this vector
305
- # every item of this vector will be rested to the value of the
306
- # item at the same position on the other vector
307
-
308
- def -(v)
309
- _vector_ari("-",v)
310
- end
311
- # Reports all values that doesn't comply with a condition.
312
- # Returns a hash with the index of data and the invalid data.
313
- def verify
314
- h={}
315
- (0...@data.size).to_a.each{|i|
316
- if !(yield @data[i])
317
- h[i]=@data[i]
318
- end
319
- }
320
- h
321
- end
322
- def _vector_ari(method,v) # :nodoc:
323
- if(v.is_a? Vector or v.is_a? Array)
324
- if v.size==@data.size
325
- i=0
326
- sum=[]
327
- 0.upto(v.size-1) {|i|
328
- if((v.is_a? Vector and v.is_valid?(v[i]) and is_valid?(@data[i])) or (v.is_a? Array and !v[i].nil? and !data[i].nil?))
329
- sum.push(@data[i].send(method,v[i]))
330
- else
331
- sum.push(nil)
332
- end
333
- }
334
- Statsample::Vector.new(sum)
335
- else
336
- raise ArgumentError, "The array/vector parameter should be of the same size of the original vector"
337
- end
338
- elsif(v.respond_to? method )
339
- Statsample::Vector.new(
340
- @data.collect {|x|
341
- if(!x.nil?)
342
- x.send(method,v)
343
- else
344
- nil
345
- end
346
- }
347
- )
348
- else
349
- raise TypeError,"You should pass a scalar or a array/vector"
350
- end
351
-
352
- end
353
- # Return an array with the data splitted by a separator.
354
- # a=Vector.new(["a,b","c,d","a,b","d"])
355
- # a.splitted
356
- # =>
357
- # [["a","b"],["c","d"],["a","b"],["d"]]
358
- def splitted(sep=Statsample::SPLIT_TOKEN)
359
- @data.collect{|x|
360
- if x.nil?
361
- nil
362
- elsif (x.respond_to? :split)
363
- x.split(sep)
364
- else
365
- [x]
366
- end
367
- }
368
- end
369
- # Returns a hash of Vectors, defined by the different values
370
- # defined on the fields
371
- # Example:
372
- #
373
- # a=Vector.new(["a,b","c,d","a,b"])
374
- # a.split_by_separator
375
- # => {"a"=>#<Statsample::Type::Nominal:0x7f2dbcc09d88
376
- # @data=[1, 0, 1]>,
377
- # "b"=>#<Statsample::Type::Nominal:0x7f2dbcc09c48
378
- # @data=[1, 1, 0]>,
379
- # "c"=>#<Statsample::Type::Nominal:0x7f2dbcc09b08
380
- # @data=[0, 1, 1]>}
381
- #
382
- def split_by_separator(sep=Statsample::SPLIT_TOKEN)
383
- split_data=splitted(sep)
384
- factors=split_data.flatten.uniq.compact
385
- out=factors.inject({}) {|a,x|
386
- a[x]=[]
573
+
574
+ # Returns the most frequent item.
575
+ def mode
576
+ frequencies.max{|a,b| a[1]<=>b[1]}[0]
577
+ end
578
+ # The numbers of item with valid data.
579
+ def n_valid
580
+ @valid_data.size
581
+ end
582
+ # Returns a hash with the distribution of proportions of
583
+ # the sample.
584
+ def proportions
585
+ frequencies.inject({}){|a,v|
586
+ a[v[0]] = v[1].quo(n_valid)
387
587
  a
388
588
  }
389
- split_data.each{|r|
390
- if r.nil?
391
- factors.each{|f|
392
- out[f].push(nil)
393
- }
394
- else
395
- factors.each{|f|
396
- out[f].push(r.include?(f) ? 1:0)
397
- }
398
- end
399
- }
400
- out.inject({}){|s,v|
401
- s[v[0]]=Vector.new(v[1],:nominal)
402
- s
403
- }
404
589
  end
405
- def split_by_separator_freq(sep=Statsample::SPLIT_TOKEN)
406
- split_by_separator(sep).inject({}) {|a,v|
407
- a[v[0]]=v[1].inject {|s,x| s+x.to_i}
408
- a
590
+ # Proportion of a given value.
591
+ def proportion(v=1)
592
+ frequencies[v].quo(@valid_data.size)
593
+ end
594
+ def summary(out="")
595
+ out << sprintf("n valid:%d\n",n_valid)
596
+ out << sprintf("factors:%s\n",factors.join(","))
597
+ out << "mode:"+mode.to_s+"\n"
598
+ out << "Distribution:\n"
599
+ frequencies.sort.each{|k,v|
600
+ key=labels.has_key?(k) ? labels[k]:k
601
+ out << sprintf("%s : %s (%0.2f%%)\n",key,v, (v.quo(n_valid))*100)
409
602
  }
410
- end
411
-
412
- # Returns an random sample of size n, with replacement,
413
- # only with valid data.
414
- #
415
- # In all the trails, every item have the same probability
416
- # of been selected.
417
- def sample_with_replacement(sample=1)
418
- if(@type!=:scale or !HAS_GSL)
419
- vds=@valid_data.size
420
- (0...sample).collect{ @valid_data[rand(vds)] }
421
- else
422
- r = GSL::Rng.alloc(GSL::Rng::MT19937,rand(10000))
423
- r.sample(@gsl, sample).to_a
424
- end
425
- end
426
- # Returns an random sample of size n, without replacement,
427
- # only with valid data.
428
- #
429
- # Every element could only be selected once.
430
- #
431
- # A sample of the same size of the vector is the vector itself.
432
-
433
- def sample_without_replacement(sample=1)
434
- if(@type!=:scale or !HAS_GSL)
435
- raise ArgumentError, "Sample size couldn't be greater than n" if sample>@valid_data.size
436
- out=[]
437
- size=@valid_data.size
438
- while out.size<sample
439
- value=rand(size)
440
- out.push(value) if !out.include?value
441
- end
442
- out.collect{|i|@data[i]}
443
- else
444
- r = GSL::Rng.alloc(GSL::Rng::MT19937,rand(10000))
445
- r.choose(@gsl, sample).to_a
603
+ if(@type==:ordinal)
604
+ out << "median:"+median.to_s+"\n"
446
605
  end
447
- end
448
- # Retrieves number of cases which comply condition.
449
- # If block given, retrieves number of instances where
450
- # block returns true.
451
- # If other values given, retrieves the frequency for
452
- # this value.
453
- def count(x=false)
454
- if block_given?
455
- r=@data.inject(0) {|s, i|
456
- r=yield i
457
- s+(r ? 1 : 0)
458
- }
459
- r.nil? ? 0 : r
460
- else
461
- frequencies[x].nil? ? 0 : frequencies[x]
606
+ if(@type==:scale)
607
+ out << "mean:"+mean.to_s+"\n"
608
+ out << "sd:"+sd.to_s+"\n"
609
+
462
610
  end
611
+ out
463
612
  end
464
613
 
465
- # Returns the database type for the vector, according to its content
466
-
467
- def db_type(dbs='mysql')
468
- # first, detect any character not number
469
- if @data.find {|v| v.to_s=~/\d{2,2}-\d{2,2}-\d{4,4}/} or @data.find {|v| v.to_s=~/\d{4,4}-\d{2,2}-\d{2,2}/}
470
- return "DATE"
471
- elsif @data.find {|v| v.to_s=~/[^0-9e.-]/ }
472
- return "VARCHAR (255)"
473
- elsif @data.find {|v| v.to_s=~/\./}
474
- return "DOUBLE"
475
- else
476
- return "INTEGER"
477
- end
614
+ # Variance of p, according to poblation size
615
+ def variance_proportion(n_poblation, v=1)
616
+ Statsample::proportion_variance_sample(self.proportion(v), @valid_data.size, n_poblation)
478
617
  end
479
- # Return true if all data is Numeric or nil
480
- def can_be_scale?
481
- if @data.find {|v| !v.nil? and !v.is_a? Numeric}
482
- false
483
- else
484
- true
485
- end
618
+ # Variance of p, according to poblation size
619
+ def variance_total(n_poblation, v=1)
620
+ Statsample::total_variance_sample(self.proportion(v), @valid_data.size, n_poblation)
621
+ end
622
+ def proportion_confidence_interval_t(n_poblation,margin=0.95,v=1)
623
+ Statsample::proportion_confidence_interval_t(proportion(v), @valid_data.size, n_poblation, margin)
624
+ end
625
+ def proportion_confidence_interval_z(n_poblation,margin=0.95,v=1)
626
+ Statsample::proportion_confidence_interval_z(proportion(v), @valid_data.size, n_poblation, margin)
486
627
  end
487
628
 
488
- def to_s
489
- sprintf("Vector(type:%s, n:%d)[%s]",@type.to_s,@data.size, @data.collect{|d| d.nil? ? "nil":d}.join(","))
490
- end
491
- # Ugly name. Really, create a Vector for standard 'matrix' package.
492
- # <tt>dir</tt> could be :horizontal or :vertical
493
- def to_matrix(dir=:horizontal)
494
- case dir
495
- when :horizontal
496
- Matrix[@data]
497
- when :vertical
498
- Matrix.columns([@data])
629
+ self.instance_methods.find_all{|met| met=~/_slow$/}.each do |met|
630
+ met_or=met.gsub("_slow","")
631
+ if !self.method_defined?(met_or)
632
+ alias_method met_or, met
499
633
  end
500
634
  end
501
- def inspect
502
- self.to_s
503
- end
504
- # Retrieves uniques values for data.
505
- def factors
506
- if @type==:scale
507
- @scale_data.uniq.sort
635
+ ######
636
+ ### Ordinal Methods
637
+ ######
638
+
639
+ # Return the value of the percentil q
640
+ def percentil(q)
641
+ check_type :ordinal
642
+ sorted=@valid_data.sort
643
+ v= (n_valid * q).quo(100)
644
+ if(v.to_i!=v)
645
+ sorted[v.to_i]
508
646
  else
509
- @valid_data.uniq.sort
647
+ (sorted[(v-0.5).to_i].to_f + sorted[(v+0.5).to_i]).quo(2)
510
648
  end
511
649
  end
512
- if Statsample::STATSAMPLE__.respond_to?(:frequencies)
513
- # Returns a hash with the distribution of frecuencies for
514
- # the sample
515
- def frequencies
516
- Statsample::STATSAMPLE__.frequencies(@valid_data)
517
- end
518
- else
519
- def frequencies #:nodoc:
520
- _frequencies
521
- end
522
- end
523
- def _frequencies #:nodoc:
524
- @valid_data.inject(Hash.new) {|a,x|
525
- a[x]||=0
526
- a[x]=a[x]+1
650
+ # Returns a ranked vector.
651
+ def ranked(type=:ordinal)
652
+ check_type :ordinal
653
+ i=0
654
+ r=frequencies.sort.inject({}){|a,v|
655
+ a[v[0]]=(i+1 + i+v[1]).quo(2)
656
+ i+=v[1]
527
657
  a
528
658
  }
529
- end
530
- # Plot frequencies on a chart, using gnuplot
531
- def plot_frequencies
532
- require 'gnuplot'
533
- x=[]
534
- y=[]
535
- self.frequencies.sort.each{|k,v|
536
- x.push(k)
537
- y.push(v)
538
- }
539
- Gnuplot.open do |gp|
540
- Gnuplot::Plot.new( gp ) do |plot|
541
- plot.boxwidth("0.9 absolute")
542
- plot.yrange("[0:#{y.max}]")
543
- plot.style("fill solid 1.00 border -1")
544
- plot.set("xtics border in scale 1,0.5 nomirror rotate by -45 offset character 0, 0, 0")
545
- plot.style("histogram")
546
- plot.style("data histogram")
547
- i=-1
548
- plot.set("xtics","("+x.collect{|v| i+=1; sprintf("\"%s\" %d",v,i)}.join(",")+")")
549
- plot.data << Gnuplot::DataSet.new( [y] ) do |ds|
550
- end
551
- end
552
- end
553
-
659
+ @data.collect {|c|
660
+ r[c]
661
+ }.to_vector(type)
662
+ end
663
+ # Return the median (percentil 50)
664
+ def median
665
+ check_type :ordinal
666
+ if HAS_GSL and @type==:scale
667
+ GSL::Stats::median_from_sorted_data(@gsl)
668
+ else
669
+ percentil(50)
554
670
  end
671
+ end
672
+ # Minimum value
673
+ def min;
674
+ check_type :ordinal
675
+ @valid_data.min;
676
+ end
677
+ # Maximum value
678
+ def max;
679
+ check_type :ordinal
680
+ @valid_data.max;
681
+ end
555
682
 
556
-
557
- # Returns the most frequent item.
558
- def mode
559
- frequencies.max{|a,b| a[1]<=>b[1]}[0]
560
- end
561
- # The numbers of item with valid data.
562
- def n_valid
563
- @valid_data.size
564
- end
565
- # Returns a hash with the distribution of proportions of
566
- # the sample.
567
- def proportions
568
- frequencies.inject({}){|a,v|
569
- a[v[0]] = v[1].quo(n_valid)
570
- a
571
- }
572
- end
573
- # Proportion of a given value.
574
- def proportion(v=1)
575
- frequencies[v].quo(@valid_data.size)
576
- end
577
- def summary(out="")
578
- out << sprintf("n valid:%d\n",n_valid)
579
- out << sprintf("factors:%s\n",factors.join(","))
580
- out << "mode:"+mode.to_s+"\n"
581
- out << "Distribution:\n"
582
- frequencies.sort.each{|k,v|
583
- key=labels.has_key?(k) ? labels[k]:k
584
- out << sprintf("%s : %s (%0.2f%%)\n",key,v, (v.quo(n_valid))*100)
585
- }
586
- if(@type==:ordinal)
587
- out << "median:"+median.to_s+"\n"
588
- end
589
- if(@type==:scale)
590
- out << "mean:"+mean.to_s+"\n"
591
- out << "sd:"+sd.to_s+"\n"
592
-
683
+ def set_scale_data # :nodoc
684
+ @scale_data=@valid_data.collect do|x|
685
+ if x.is_a? Numeric
686
+ x
687
+ elsif x.is_a? String and x.to_i==x.to_f
688
+ x.to_i
689
+ else
690
+ x.to_f
593
691
  end
594
- out
595
692
  end
596
-
597
- # Variance of p, according to poblation size
598
- def variance_proportion(n_poblation, v=1)
599
- Statsample::proportion_variance_sample(self.proportion(v), @valid_data.size, n_poblation)
600
- end
601
- # Variance of p, according to poblation size
602
- def variance_total(n_poblation, v=1)
603
- Statsample::total_variance_sample(self.proportion(v), @valid_data.size, n_poblation)
604
- end
605
- def proportion_confidence_interval_t(n_poblation,margin=0.95,v=1)
606
- Statsample::proportion_confidence_interval_t(proportion(v), @valid_data.size, n_poblation, margin)
607
- end
608
- def proportion_confidence_interval_z(n_poblation,margin=0.95,v=1)
609
- Statsample::proportion_confidence_interval_z(proportion(v), @valid_data.size, n_poblation, margin)
693
+ if HAS_GSL
694
+ @gsl=GSL::Vector.alloc(@scale_data) if @scale_data.size>0
610
695
  end
696
+ end
697
+ private :set_scale_data
698
+
699
+ # The range of the data (max - min)
700
+ def range;
701
+ check_type :scale
702
+ @scale_data.max - @scale_data.min
703
+ end
704
+ # The sum of values for the data
705
+ def sum
706
+ check_type :scale
707
+ @scale_data.inject(0){|a,x|x+a} ;
708
+ end
709
+ # The arithmetical mean of data
710
+ def mean
711
+ check_type :scale
712
+ sum.to_f.quo(n_valid)
713
+ end
714
+ # Sum of squares for the data around a value.
715
+ # By default, this value is the mean
716
+ # ss= sum{(xi-m)^2}
717
+ #
718
+ def sum_of_squares(m=nil)
719
+ check_type :scale
720
+ m||=mean
721
+ @scale_data.inject(0){|a,x| a+(x-m).square}
722
+ end
723
+
724
+ # Sum of squared deviation
725
+ def sum_of_squared_deviation
726
+ check_type :scale
727
+ @scale_data.inject(0) {|a,x| x.square+a} - (sum.square.quo(n_valid))
728
+ end
729
+
730
+ # Population variance (denominator N)
731
+ def variance_population(m=nil)
732
+ check_type :scale
733
+ m||=mean
734
+ squares=@scale_data.inject(0){|a,x| x.square+a}
735
+ squares.quo(n_valid) - m.square
736
+ end
737
+
738
+
739
+ # Population Standard deviation (denominator N)
740
+ def standard_deviation_population(m=nil)
741
+ check_type :scale
611
742
 
612
- self.instance_methods.find_all{|met| met=~/_slow$/}.each{|met|
613
- met_or=met.gsub("_slow","")
614
- if !self.method_defined?(met_or)
615
- alias_method met_or, met
616
- end
617
- }
618
- ######
619
- ### Ordinal Methods
620
- ######
743
+ Math::sqrt( variance_population(m) )
744
+ end
745
+ # Sample Variance (denominator n-1)
621
746
 
622
- # Return the value of the percentil q
623
- def percentil(q)
624
- check_type :ordinal
625
- sorted=@valid_data.sort
626
- v= (n_valid * q).quo(100)
627
- if(v.to_i!=v)
628
- sorted[v.to_i]
629
- else
630
- (sorted[(v-0.5).to_i].to_f + sorted[(v+0.5).to_i]).quo(2)
631
- end
632
- end
633
- # Returns a ranked vector.
634
- def ranked(type=:ordinal)
635
- check_type :ordinal
636
- i=0
637
- r=frequencies.sort.inject({}){|a,v|
638
- a[v[0]]=(i+1 + i+v[1]).quo(2)
639
- i+=v[1]
640
- a
641
- }
642
- @data.collect {|c|
643
- r[c]
644
- }.to_vector(type)
645
- end
646
- # Return the median (percentil 50)
647
- def median
648
- check_type :ordinal
649
- if HAS_GSL and @type==:scale
650
- GSL::Stats::median_from_sorted_data(@gsl)
651
- else
652
- percentil(50)
653
- end
654
- end
655
- # Minimun value
656
- def min;
657
- check_type :ordinal
658
- @valid_data.min;
659
- end
660
- # Maximum value
661
- def max;
662
- check_type :ordinal
663
- @valid_data.max;
664
- end
747
+ def variance_sample(m=nil)
748
+ check_type :scale
749
+
750
+ m||=mean
751
+ sum_of_squares(m).quo(n_valid - 1)
752
+ end
665
753
 
666
- def set_scale_data # :nodoc
667
- @scale_data=@valid_data.collect{|x|
668
- if x.is_a? Numeric
669
- x
670
- elsif x.is_a? String and x.to_i==x.to_f
671
- x.to_i
672
- else
673
- x.to_f
674
- end
754
+ # Sample Standard deviation (denominator n-1)
755
+
756
+ def standard_deviation_sample(m=nil)
757
+ check_type :scale
758
+
759
+ m||=m
760
+ Math::sqrt(variance_sample(m))
761
+ end
762
+ # Skewness of the sample
763
+ def skew
764
+ check_type :scale
765
+ m=mean
766
+ thirds=@scale_data.inject(0){|a,x| a+((x-mean)**3)}
767
+ thirds.quo((@scale_data.size-1)*sd**3)
768
+ end
769
+ # Kurtosis of the sample
770
+ def kurtosis
771
+ check_type :scale
772
+
773
+ m=mean
774
+ thirds=@scale_data.inject(0){|a,x| a+((x-mean)**4)}
775
+ thirds.quo((@scale_data.size-1)*sd**4)
776
+
777
+ end
778
+ # Product of all values on the sample
779
+ #
780
+ def product
781
+ check_type :scale
782
+ @scale_data.inject(1){|a,x| a*x }
783
+ end
784
+ if HAS_GSL
785
+ %w{skew kurtosis variance_sample standard_deviation_sample variance_population standard_deviation_population mean sum}.each{|m|
786
+ m_nuevo=(m+"_slow").intern
787
+ alias_method m_nuevo, m.intern
675
788
  }
676
- if HAS_GSL
677
- @gsl=GSL::Vector.alloc(@scale_data) if @scale_data.size>0
789
+ def sum # :nodoc:
790
+ check_type :scale
791
+
792
+ @gsl.sum
678
793
  end
679
- end
680
- private :set_scale_data
681
-
682
- # The range of the data (max - min)
683
- def range;
684
- check_type :scale
685
- @scale_data.max - @scale_data.min
794
+ def mean # :nodoc:
795
+ check_type :scale
796
+
797
+ @gsl.mean
798
+ end
799
+ def variance_sample(m=nil) # :nodoc:
800
+ check_type :scale
801
+
802
+ m||=mean
803
+ @gsl.variance_m(m) # pass the mean through so a caller-supplied m is honoured
686
804
  end
687
- # The sum of values for the data
688
- def sum
689
- check_type :scale
690
- @scale_data.inject(0){|a,x|x+a} ; end
691
- # The arithmetical mean of data
692
- def mean
693
- check_type :scale
694
- sum.to_f.quo(n_valid)
695
- end
696
- # Sum of squares for the data around a value.
697
- # By default, this value is the mean
698
- # ss= sum{(xi-m)^2}
699
- #
700
- def sum_of_squares(m=nil)
805
+ def standard_deviation_sample(m=nil) # :nodoc:
701
806
  check_type :scale
702
807
  m||=mean
703
- @scale_data.inject(0){|a,x| a+(x-m).square}
808
+ @gsl.sd(m)
704
809
  end
705
810
 
706
- # Sum of squared deviation
707
- def sum_of_squared_deviation
708
- check_type :scale
709
- @scale_data.inject(0) {|a,x| x.square+a} - (sum.square.quo(n_valid))
710
- end
711
-
712
- # Population variance (denominator N)
713
- def variance_population(m=nil)
714
- check_type :scale
811
+ def variance_population(m=nil) # :nodoc:
812
+ check_type :scale
715
813
  m||=mean
716
- squares=@scale_data.inject(0){|a,x| x.square+a}
717
- squares.quo(n_valid) - m.square
814
+ @gsl.variance_with_fixed_mean(m)
718
815
  end
719
-
720
-
721
- # Population Standard deviation (denominator N)
722
- def standard_deviation_population(m=nil)
816
+ def standard_deviation_population(m=nil) # :nodoc:
723
817
  check_type :scale
724
-
725
- Math::sqrt( variance_population(m) )
818
+ m||=mean
819
+ @gsl.sd_with_fixed_mean(m)
726
820
  end
727
- # Sample Variance (denominator n-1)
728
-
729
- def variance_sample(m=nil)
821
+ def skew # :nodoc:
730
822
  check_type :scale
731
-
732
- m||=mean
733
- sum_of_squares(m).quo(n_valid - 1)
734
- end
735
-
736
- # Sample Standard deviation (denominator n-1)
737
-
738
- def standard_deviation_sample(m=nil)
739
- check_type :scale
740
-
741
- m||=m
742
- Math::sqrt(variance_sample(m))
743
- end
744
- # Skewness of the sample
745
- def skew
746
- check_type :scale
747
- m=mean
748
- thirds=@scale_data.inject(0){|a,x| a+((x-mean)**3)}
749
- thirds.quo((@scale_data.size-1)*sd**3)
750
- end
751
- # Kurtosis of the sample
752
- def kurtosis
753
- check_type :scale
754
-
755
- m=mean
756
- thirds=@scale_data.inject(0){|a,x| a+((x-mean)**4)}
757
- thirds.quo((@scale_data.size-1)*sd**4)
758
-
759
- end
760
- # Product of all values on the sample
761
- #
762
- def product
763
- check_type :scale
764
- @scale_data.inject(1){|a,x| a*x }
823
+ @gsl.skew
765
824
  end
766
- if HAS_GSL
767
- %w{skew kurtosis variance_sample standard_deviation_sample variance_population standard_deviation_population mean sum}.each{|m|
768
- m_nuevo=(m+"_slow").intern
769
- alias_method m_nuevo, m.intern
770
- }
771
- def sum # :nodoc:
825
+ def kurtosis # :nodoc:
772
826
  check_type :scale
773
-
774
- @gsl.sum
775
- end
776
- def mean # :nodoc:
777
- check_type :scale
778
-
779
- @gsl.mean
780
- end
781
- def variance_sample(m=nil) # :nodoc:
827
+ @gsl.kurtosis
828
+ end
829
+ # Create a GSL::Histogram
830
+ # With a fixnum, creates X bins within the range of data
831
+ # With an Array, each value will be a cut point
832
+ def histogram(bins=10)
782
833
  check_type :scale
783
-
784
- m||=mean
785
- @gsl.variance_m
786
- end
787
- def standard_deviation_sample(m=nil) # :nodoc:
788
- check_type :scale
789
- m||=mean
790
- @gsl.sd(m)
791
- end
792
-
793
- def variance_population(m=nil) # :nodoc:
794
- check_type :scale
795
- m||=mean
796
- @gsl.variance_with_fixed_mean(m)
797
- end
798
- def standard_deviation_population(m=nil) # :nodoc:
799
- check_type :scale
800
- m||=mean
801
- @gsl.sd_with_fixed_mean(m)
802
- end
803
- def skew # :nodoc:
804
- check_type :scale
805
- @gsl.skew
806
- end
807
- def kurtosis # :nodoc:
808
- check_type :scale
809
- @gsl.kurtosis
810
- end
811
- # Create a GSL::Histogram
812
- # With a fixnum, creates X bins within the range of data
813
- # With an Array, each value will be a cut point
814
- def histogram(bins=10)
815
- check_type :scale
816
- if bins.is_a? Array
817
- h=GSL::Histogram.alloc(bins)
818
- else
819
- # ugly patch. The upper limit for a bin has the form
820
- # x < range
821
- h=GSL::Histogram.alloc(bins,[@data.min,@data.max+0.0001])
822
- end
823
- h.increment(@gsl)
824
- h
825
- end
826
- def plot_histogram(bins=10,options="")
827
- check_type :scale
828
- self.histogram(bins).graph(options)
834
+ if bins.is_a? Array
835
+ h=GSL::Histogram.alloc(bins)
836
+ else
837
+ # ugly patch. The upper limit for a bin has the form
838
+ # x < range
839
+ h=GSL::Histogram.alloc(bins,[@valid_data.min,@valid_data.max+0.0001])
829
840
  end
830
-
831
- end
832
-
833
- # Coefficient of variation
834
- # Calculed with the sample standard deviation
835
- def coefficient_of_variation
841
+ h.increment(@gsl)
842
+ h
843
+ end
844
+ def plot_histogram(bins=10,options="")
836
845
  check_type :scale
837
- standard_deviation_sample.quo(mean)
838
- end
839
-
840
- alias_method :sdp, :standard_deviation_population
841
- alias_method :sds, :standard_deviation_sample
842
- alias_method :cov, :coefficient_of_variation
843
- alias_method :variance, :variance_sample
844
- alias_method :sd, :standard_deviation_sample
845
- alias_method :ss, :sum_of_squares
846
- end
846
+ self.histogram(bins).graph(options)
847
+ end
848
+
849
+ end
850
+
851
+ # Coefficient of variation
852
+ # Calculated with the sample standard deviation
853
+ def coefficient_of_variation
854
+ check_type :scale
855
+ standard_deviation_sample.quo(mean)
856
+ end
857
+
858
+ alias_method :sdp, :standard_deviation_population
859
+ alias_method :sds, :standard_deviation_sample
860
+ alias_method :cov, :coefficient_of_variation
861
+ alias_method :variance, :variance_sample
862
+ alias_method :sd, :standard_deviation_sample
863
+ alias_method :ss, :sum_of_squares
864
+ end
847
865
  end