statsample 0.6.5 → 0.6.7

Sign up to get free protection for your applications and to get access to all the features.
Files changed (61) hide show
  1. data/History.txt +15 -0
  2. data/Manifest.txt +6 -0
  3. data/README.txt +30 -12
  4. data/Rakefile +91 -0
  5. data/demo/levene.rb +9 -0
  6. data/demo/multiple_regression.rb +1 -7
  7. data/demo/polychoric.rb +1 -0
  8. data/demo/principal_axis.rb +8 -0
  9. data/lib/distribution/f.rb +22 -22
  10. data/lib/spss.rb +99 -99
  11. data/lib/statsample/bivariate/polychoric.rb +32 -22
  12. data/lib/statsample/bivariate/tetrachoric.rb +212 -207
  13. data/lib/statsample/bivariate.rb +6 -6
  14. data/lib/statsample/codification.rb +65 -65
  15. data/lib/statsample/combination.rb +60 -59
  16. data/lib/statsample/converter/csv19.rb +12 -12
  17. data/lib/statsample/converters.rb +1 -1
  18. data/lib/statsample/dataset.rb +93 -36
  19. data/lib/statsample/dominanceanalysis/bootstrap.rb +66 -3
  20. data/lib/statsample/dominanceanalysis.rb +5 -6
  21. data/lib/statsample/factor/pca.rb +41 -11
  22. data/lib/statsample/factor/principalaxis.rb +105 -29
  23. data/lib/statsample/factor/rotation.rb +20 -3
  24. data/lib/statsample/factor.rb +1 -1
  25. data/lib/statsample/graph/gdchart.rb +13 -13
  26. data/lib/statsample/graph/svggraph.rb +166 -167
  27. data/lib/statsample/matrix.rb +22 -12
  28. data/lib/statsample/mle/logit.rb +3 -2
  29. data/lib/statsample/mle/probit.rb +7 -5
  30. data/lib/statsample/mle.rb +4 -2
  31. data/lib/statsample/multiset.rb +125 -124
  32. data/lib/statsample/permutation.rb +2 -1
  33. data/lib/statsample/regression/binomial/logit.rb +4 -3
  34. data/lib/statsample/regression/binomial/probit.rb +2 -1
  35. data/lib/statsample/regression/binomial.rb +62 -81
  36. data/lib/statsample/regression/multiple/baseengine.rb +1 -1
  37. data/lib/statsample/regression/multiple/gslengine.rb +1 -1
  38. data/lib/statsample/regression/multiple/matrixengine.rb +12 -6
  39. data/lib/statsample/regression/multiple.rb +15 -42
  40. data/lib/statsample/regression/simple.rb +93 -78
  41. data/lib/statsample/regression.rb +74 -2
  42. data/lib/statsample/reliability.rb +117 -120
  43. data/lib/statsample/srs.rb +156 -153
  44. data/lib/statsample/test/levene.rb +90 -0
  45. data/lib/statsample/test/umannwhitney.rb +25 -9
  46. data/lib/statsample/test.rb +2 -0
  47. data/lib/statsample/vector.rb +388 -413
  48. data/lib/statsample.rb +74 -30
  49. data/po/es/statsample.mo +0 -0
  50. data/test/test_bivariate.rb +5 -4
  51. data/test/test_combination.rb +1 -1
  52. data/test/test_dataset.rb +2 -2
  53. data/test/test_factor.rb +53 -6
  54. data/test/test_gsl.rb +1 -1
  55. data/test/test_mle.rb +1 -1
  56. data/test/test_regression.rb +18 -33
  57. data/test/test_statistics.rb +15 -33
  58. data/test/test_stest.rb +35 -0
  59. data/test/test_svg_graph.rb +2 -2
  60. data/test/test_vector.rb +331 -333
  61. metadata +38 -11
@@ -10,127 +10,98 @@ class Array
10
10
  Statsample::Vector.new(self,:scale,*args)
11
11
  end
12
12
  end
13
+
13
14
  module Statsample
14
- class << self
15
- # Create a matrix using vectors as columns.
16
- # Use:
17
- #
18
- # matrix=Statsample.vector_cols_matrix(v1,v2)
19
- def vector_cols_matrix(*vs)
20
- # test
21
- size=vs[0].size
22
- vs.each{|v|
23
- raise ArgumentError,"Arguments should be Vector" unless v.instance_of? Statsample::Vector
24
- raise ArgumentError,"Vectors size should be the same" if v.size!=size
25
- }
26
- Matrix.rows((0...size).to_a.collect() {|i|
27
- vs.collect{|v| v[i]}
28
- })
29
- end
30
- end
31
- # Returns a duplicate of the input vectors, without missing data
32
- # for any of the vectors.
33
- #
34
- # a=[1,2,3,6,7,nil,3,5].to_scale
35
- # b=[nil,nil,5,6,4,5,10,2].to_scale
36
- # c=[2,4,6,7,4,5,6,7].to_scale
37
- # a2,b2,c2=Statsample.only_valid(a,b,c)
38
- # => [#<Statsample::Scale:0xb748c8c8 @data=[3, 6, 7, 3, 5]>,
39
- # #<Statsample::Scale:0xb748c814 @data=[5, 6, 4, 10, 2]>,
40
- # #<Statsample::Scale:0xb748c760 @data=[6, 7, 4, 6, 7]>]
41
- #
42
- def self.only_valid(*vs)
43
- i=1
44
- h=vs.inject({}) {|a,v| a["v#{i}"]=v;i+=1;a}
45
- ds=Statsample::Dataset.new(h).dup_only_valid
46
- ds.vectors.values
47
- end
48
-
49
- class Vector
50
- include Enumerable
51
- include Writable
52
- DEFAULT_OPTIONS={
53
- :missing_values=>[],
54
- :today_values=>['NOW','TODAY', :NOW, :TODAY],
55
- :labels=>{}
56
- }
57
- # Level of measurement. Could be :nominal, :ordinal or :scale
58
- attr_reader :type
59
- # Original data.
60
- attr_reader :data
61
- # Valid data. Equal to data, minus values assigned as missing values
62
- attr_reader :valid_data
63
- # Array of values considered as missing. Nil is a missing value, by default
64
- attr_reader :missing_values
65
- # Array of values considered as "Today", with date type. "NOW", "TODAY", :NOW and :TODAY are 'today' values, by default
66
- attr_reader :today_values
67
- # Missing values array
68
- attr_reader :missing_data
69
- # Original data, with all missing values replaced by nils
70
- attr_reader :data_with_nils
71
- # Date date, with all missing values replaced by nils
72
- attr_reader :date_data_with_nils
73
- # GSL Object, only available with rbgsl extension and type==:scale
74
- attr_reader :gsl
75
- # Change label for specific values
76
- attr_accessor :labels
77
- # Creates a new Vector object.
78
- # [data] Array of data.
79
- # [type] Level of meausurement. See Vector#type
80
- # [opts] Options
81
- # [:missing_values] Array of missing values. See Vector#missing_values
82
- # [:today_values] Array of 'today' values. See Vector#today_values
83
- # [:labels] Labels for data values
84
- #
15
+
16
+ # Collection of values on one dimension. Works as a column on a Spreadsheet.
17
+ #
18
+ # == Usage
85
19
  # The fast way to create a vector uses Array.to_vector or Array.to_scale.
86
20
  #
87
21
  # v=[1,2,3,4].to_vector(:scale)
88
22
  # v=[1,2,3,4].to_scale
89
- #
90
-
91
- def initialize(data=[], t=:nominal, opts=Hash.new)
92
- raise "Data should be an array" unless data.is_a? Array
93
- @data=data
94
- @type=t
95
- opts=DEFAULT_OPTIONS.merge(opts)
96
- @missing_values=opts[:missing_values]
97
- @labels=opts[:labels]
98
- @today_values=opts[:today_values]
99
- @valid_data=[]
100
- @data_with_nils=[]
101
- @date_data_with_nils=[]
102
- @missing_data=[]
103
- @has_missing_data=nil
104
- @scale_data=nil
105
- set_valid_data_intern
106
- self.type=t
107
- end
108
- # Creates a duplicate of the Vector.
109
- # Note: data, missing_values and labels are duplicated, so
110
- # changes on original vector doesn't propages to copies.
111
- def dup
23
+ #
24
+ class Vector
25
+ include Enumerable
26
+ include Writable
27
+ # DEFAULT OPTIONS
28
+ DEFAULT_OPTIONS={
29
+ :missing_values=>[],
30
+ :today_values=>['NOW','TODAY', :NOW, :TODAY],
31
+ :labels=>{}
32
+ }
33
+ # Level of measurement. Could be :nominal, :ordinal or :scale
34
+ attr_reader :type
35
+ # Original data.
36
+ attr_reader :data
37
+ # Valid data. Equal to data, minus values assigned as missing values
38
+ attr_reader :valid_data
39
+ # Array of values considered as missing. Nil is a missing value, by default
40
+ attr_reader :missing_values
41
+ # Array of values considered as "Today", with date type. "NOW", "TODAY", :NOW and :TODAY are 'today' values, by default
42
+ attr_reader :today_values
43
+ # Missing values array
44
+ attr_reader :missing_data
45
+ # Original data, with all missing values replaced by nils
46
+ attr_reader :data_with_nils
47
+ # Date data, with all missing values replaced by nils
48
+ attr_reader :date_data_with_nils
49
+ # GSL Object, only available with rbgsl extension and type==:scale
50
+ attr_reader :gsl
51
+ # Change label for specific values
52
+ attr_accessor :labels
53
+ #
54
+ # Creates a new Vector object.
55
+ # [data] Array of data.
56
+ # [type] Level of measurement. See Vector#type
57
+ # [opts] Options
58
+ # [:missing_values] Array of missing values. See Vector#missing_values
59
+ # [:today_values] Array of 'today' values. See Vector#today_values
60
+ # [:labels] Labels for data values
61
+ #
62
+ def initialize(data=[], type=:nominal, opts=Hash.new)
63
+ raise "Data should be an array" unless data.is_a? Array
64
+ @data=data
65
+ @type=type
66
+ opts=DEFAULT_OPTIONS.merge(opts)
67
+ @missing_values=opts[:missing_values]
68
+ @labels=opts[:labels]
69
+ @today_values=opts[:today_values]
70
+ @valid_data=[]
71
+ @data_with_nils=[]
72
+ @date_data_with_nils=[]
73
+ @missing_data=[]
74
+ @has_missing_data=nil
75
+ @scale_data=nil
76
+ set_valid_data_intern
77
+ self.type=type
78
+ end
79
+ # Creates a duplicate of the Vector.
80
+ # Note: data, missing_values and labels are duplicated, so
81
+ # changes on the original vector don't propagate to copies.
82
+ def dup
112
83
  Vector.new(@data.dup,@type, :missing_values => @missing_values.dup, :labels => @labels.dup)
113
- end
114
- # Returns an empty duplicate of the vector. Maintains the type,
115
- # missing values and labels.
116
- def dup_empty
84
+ end
85
+ # Returns an empty duplicate of the vector. Maintains the type,
86
+ # missing values and labels.
87
+ def dup_empty
117
88
  Vector.new([],@type, :missing_values => @missing_values.dup, :labels => @labels.dup)
118
- end
119
- # Raises an exception if type of vector is inferior to t type
120
- def check_type(t)
89
+ end
90
+ # Raises an exception if type of vector is inferior to t type
91
+ def check_type(t)
121
92
  raise NoMethodError if (t==:scale and @type!=:scale) or (t==:ordinal and @type==:nominal) or (t==:date)
122
- end
123
- private :check_type
124
-
125
- # Return a vector usign the standarized values for data
126
- # with sd with denominator N
127
- def vector_standarized_pop
93
+ end
94
+ private :check_type
95
+
96
+ # Return a vector using the standardized values for data
97
+ # with sd with denominator N
98
+ def vector_standarized_pop
128
99
  vector_standarized(true)
129
- end
130
- # Return a vector usign the standarized values for data
131
- # with sd with denominator n-1
132
-
133
- def vector_standarized(use_population=false)
100
+ end
101
+ # Return a vector using the standardized values for data
102
+ # with sd with denominator n-1
103
+
104
+ def vector_standarized(use_population=false)
134
105
  raise "Should be a scale" unless @type==:scale
135
106
  m=mean
136
107
  sd=use_population ? sdp : sds
@@ -141,11 +112,10 @@ module Statsample
141
112
  nil
142
113
  end
143
114
  }.to_vector(:scale)
144
- end
145
-
146
- alias_method :standarized, :vector_standarized
147
-
148
- def box_cox_transformation(lambda) # :nodoc:
115
+ end
116
+ alias_method :standarized, :vector_standarized
117
+
118
+ def box_cox_transformation(lambda) # :nodoc:
149
119
  raise "Should be a scale" unless @type==:scale
150
120
  @data_with_nils.collect{|x|
151
121
  if !x.nil?
@@ -158,42 +128,42 @@ module Statsample
158
128
  nil
159
129
  end
160
130
  }.to_vector(:scale)
161
- end
162
-
163
- # Vector equality.
164
- # Two vector will be the same if their data, missing values, type, labels are equals
165
- def ==(v2)
131
+ end
132
+
133
+ # Vector equality.
134
+ # Two vector will be the same if their data, missing values, type, labels are equals
135
+ def ==(v2)
166
136
  raise TypeError,"Argument should be a Vector" unless v2.instance_of? Statsample::Vector
167
137
  @data==v2.data and @missing_values==v2.missing_values and @type==v2.type and @labels=v2.labels
168
- end
169
-
170
- def _dump(i) # :nodoc:
138
+ end
139
+
140
+ def _dump(i) # :nodoc:
171
141
  Marshal.dump({'data'=>@data,'missing_values'=>@missing_values, 'labels'=>@labels, 'type'=>@type})
172
- end
173
-
174
- def self._load(data) # :nodoc:
142
+ end
143
+
144
+ def self._load(data) # :nodoc:
175
145
  h=Marshal.load(data)
176
146
  Vector.new(h['data'], h['type'],:missing_values=> h['missing_values'], :labels=>h['labels'])
177
- end
178
- # Returns a new vector, with data modified by block.
179
- # Equivalent to create a Vector after #collect on data
180
- def recode
147
+ end
148
+ # Returns a new vector, with data modified by block.
149
+ # Equivalent to create a Vector after #collect on data
150
+ def recode
181
151
  @data.collect{|x|
182
152
  yield x
183
153
  }.to_vector(@type)
184
- end
185
- # Modifies current vector, with data modified by block.
186
- # Equivalent to #collect! on @data
187
- def recode!
154
+ end
155
+ # Modifies current vector, with data modified by block.
156
+ # Equivalent to #collect! on @data
157
+ def recode!
188
158
  @data.collect!{|x|
189
159
  yield x
190
160
  }
191
161
  set_valid_data
192
- end
193
- # Dicotomize the vector with 0 and 1, based on lowest value
194
- # If parameter if defined, this value and lower
195
- # will be 0 and higher, 1
196
- def dichotomize(low=nil)
162
+ end
163
+ # Dichotomize the vector with 0 and 1, based on the lowest value.
164
+ # If the parameter is defined, this value and lower
165
+ # will be 0 and higher, 1
166
+ def dichotomize(low=nil)
197
167
  fs=factors
198
168
  low||=factors.min
199
169
  @data_with_nils.collect{|x|
@@ -205,44 +175,44 @@ module Statsample
205
175
  0
206
176
  end
207
177
  }.to_scale
208
- end
209
- # Iterate on each item.
210
- # Equivalent to
211
- # @data.each{|x| yield x}
212
- def each
178
+ end
179
+ # Iterate on each item.
180
+ # Equivalent to
181
+ # @data.each{|x| yield x}
182
+ def each
213
183
  @data.each{|x| yield(x) }
214
- end
215
-
216
- # Iterate on each item, retrieving index
217
- def each_index
184
+ end
185
+
186
+ # Iterate on each item, retrieving index
187
+ def each_index
218
188
  (0...@data.size).each {|i|
219
189
  yield(i)
220
190
  }
221
- end
222
- # Add a value at the end of the vector.
223
- # If second argument set to false, you should update the Vector usign
224
- # Vector.set_valid_data at the end of your insertion cycle
225
- #
226
- def add(v,update_valid=true)
191
+ end
192
+ # Add a value at the end of the vector.
193
+ # If second argument is set to false, you should update the Vector using
194
+ # Vector.set_valid_data at the end of your insertion cycle
195
+ #
196
+ def add(v,update_valid=true)
227
197
  @data.push(v)
228
198
  set_valid_data if update_valid
229
- end
230
- # Update valid_data, missing_data, data_with_nils and gsl
231
- # at the end of an insertion.
232
- #
233
- # Use after Vector.add(v,false)
234
- # Usage:
235
- # v=Statsample::Vector.new
236
- # v.add(2,false)
237
- # v.add(4,false)
238
- # v.data
239
- # => [2,3]
240
- # v.valid_data
241
- # => []
242
- # v.set_valid_data
243
- # v.valid_data
244
- # => [2,3]
245
- def set_valid_data
199
+ end
200
+ # Update valid_data, missing_data, data_with_nils and gsl
201
+ # at the end of an insertion.
202
+ #
203
+ # Use after Vector.add(v,false)
204
+ # Usage:
205
+ # v=Statsample::Vector.new
206
+ # v.add(2,false)
207
+ # v.add(4,false)
208
+ # v.data
209
+ # => [2,4]
210
+ # v.valid_data
211
+ # => []
212
+ # v.set_valid_data
213
+ # v.valid_data
214
+ # => [2,4]
215
+ def set_valid_data
246
216
  @valid_data.clear
247
217
  @missing_data.clear
248
218
  @data_with_nils.clear
@@ -251,18 +221,18 @@ module Statsample
251
221
  set_valid_data_intern
252
222
  set_scale_data if(@type==:scale)
253
223
  set_date_data if(@type==:date)
254
- end
255
-
256
- if Statsample::STATSAMPLE__.respond_to?(:set_valid_data_intern)
224
+ end
225
+
226
+ if Statsample::STATSAMPLE__.respond_to?(:set_valid_data_intern)
257
227
  def set_valid_data_intern #:nodoc:
258
228
  Statsample::STATSAMPLE__.set_valid_data_intern(self)
259
229
  end
260
- else
230
+ else
261
231
  def set_valid_data_intern #:nodoc:
262
232
  _set_valid_data_intern
263
233
  end
264
- end
265
- def _set_valid_data_intern #:nodoc:
234
+ end
235
+ def _set_valid_data_intern #:nodoc:
266
236
  @data.each do |n|
267
237
  if is_valid? n
268
238
  @valid_data.push(n)
@@ -273,19 +243,19 @@ module Statsample
273
243
  end
274
244
  end
275
245
  @has_missing_data=@missing_data.size>0
276
- end
277
-
278
- # Retrieves true if data has one o more missing values
279
- def has_missing_data?
246
+ end
247
+
248
+ # Returns true if data has one or more missing values
249
+ def has_missing_data?
280
250
  @has_missing_data
281
- end
282
- # Retrieves label for value x. Retrieves x if
283
- # no label defined.
284
- def labeling(x)
251
+ end
252
+ # Retrieves label for value x. Retrieves x if
253
+ # no label defined.
254
+ def labeling(x)
285
255
  @labels.has_key?(x) ? @labels[x].to_s : x.to_s
286
- end
287
- # Returns a Vector with data with labels replaced by the label.
288
- def vector_labeled
256
+ end
257
+ # Returns a Vector with data with labels replaced by the label.
258
+ def vector_labeled
289
259
  d=@data.collect{|x|
290
260
  if @labels.has_key? x
291
261
  @labels[x]
@@ -294,69 +264,70 @@ module Statsample
294
264
  end
295
265
  }
296
266
  Vector.new(d,@type)
297
- end
298
- # Size of total data
299
- def size
267
+ end
268
+ # Size of total data
269
+ def size
300
270
  @data.size
301
- end
302
- alias_method :n, :size
303
-
304
- # Retrieves i element of data
305
- def [](i)
271
+ end
272
+ alias_method :n, :size
273
+
274
+ # Retrieves i element of data
275
+ def [](i)
306
276
  @data[i]
307
- end
308
- # Set i element of data.
309
- # Note: Use set_valid_data if you include missing values
310
- def []=(i,v)
277
+ end
278
+ # Set i element of data.
279
+ # Note: Use set_valid_data if you include missing values
280
+ def []=(i,v)
311
281
  @data[i]=v
312
- end
313
- # Return true if a value is valid (not nil and not included on missing values)
314
- def is_valid?(x)
282
+ end
283
+ # Return true if a value is valid (not nil and not included on missing values)
284
+ def is_valid?(x)
315
285
  !(x.nil? or @missing_values.include? x)
316
- end
317
- # Set missing_values.
318
- # if update_valid = false, you should use
319
- # set_valid_data after all changes
320
- def missing_values=(vals)
286
+ end
287
+ # Set missing_values.
288
+ # if update_valid = false, you should use
289
+ # set_valid_data after all changes
290
+ def missing_values=(vals)
321
291
  @missing_values = vals
322
292
  set_valid_data
323
- end
324
- def today_values=(vals)
293
+ end
294
+ # Set data considered as "today" on data vectors
295
+ def today_values=(vals)
325
296
  @today_values = vals
326
297
  set_valid_data
327
- end
328
- # Set level of measurement.
329
- def type=(t)
298
+ end
299
+ # Set level of measurement.
300
+ def type=(t)
330
301
  @type=t
331
302
  set_scale_data if(t==:scale)
332
303
  set_date_data if (t==:date)
333
- end
334
- def to_a
304
+ end
305
+ def to_a
335
306
  @data.dup
336
- end
337
- alias_method :to_ary, :to_a
338
-
339
- # Vector sum.
340
- # - If v is a scalar, add this value to all elements
341
- # - If v is a Array or a Vector, should be of the same size of this vector
342
- # every item of this vector will be added to the value of the
343
- # item at the same position on the other vector
344
- def +(v)
307
+ end
308
+ alias_method :to_ary, :to_a
309
+
310
+ # Vector sum.
311
+ # - If v is a scalar, add this value to all elements
312
+ # - If v is a Array or a Vector, should be of the same size of this vector
313
+ # every item of this vector will be added to the value of the
314
+ # item at the same position on the other vector
315
+ def +(v)
345
316
  _vector_ari("+",v)
346
- end
347
- # Vector rest.
348
- # - If v is a scalar, rest this value to all elements
349
- # - If v is a Array or a Vector, should be of the same
350
- # size of this vector
351
- # every item of this vector will be rested to the value of the
352
- # item at the same position on the other vector
353
-
354
- def -(v)
317
+ end
318
+ # Vector subtraction.
319
+ # - If v is a scalar, this value is subtracted on all elements
320
+ # - If v is an Array or a Vector, it should be of the same
321
+ # size as this vector;
322
+ # the subtraction is applied between every item of this vector and the
323
+ # item at the same position on the other vector
324
+
325
+ def -(v)
355
326
  _vector_ari("-",v)
356
- end
357
- # Reports all values that doesn't comply with a condition.
358
- # Returns a hash with the index of data and the invalid data.
359
- def verify
327
+ end
328
+ # Reports all values that don't comply with a condition.
329
+ # Returns a hash with the index of data and the invalid data.
330
+ def verify
360
331
  h={}
361
332
  (0...@data.size).to_a.each{|i|
362
333
  if !(yield @data[i])
@@ -364,8 +335,8 @@ module Statsample
364
335
  end
365
336
  }
366
337
  h
367
- end
368
- def _vector_ari(method,v) # :nodoc:
338
+ end
339
+ def _vector_ari(method,v) # :nodoc:
369
340
  if(v.is_a? Vector or v.is_a? Array)
370
341
  if v.size==@data.size
371
342
  # i=0
@@ -395,13 +366,13 @@ module Statsample
395
366
  raise TypeError,"You should pass a scalar or a array/vector"
396
367
  end
397
368
 
398
- end
399
- # Return an array with the data splitted by a separator.
400
- # a=Vector.new(["a,b","c,d","a,b","d"])
401
- # a.splitted
402
- # =>
403
- # [["a","b"],["c","d"],["a","b"],["d"]]
404
- def splitted(sep=Statsample::SPLIT_TOKEN)
369
+ end
370
+ # Return an array with the data split by a separator.
371
+ # a=Vector.new(["a,b","c,d","a,b","d"])
372
+ # a.splitted
373
+ # =>
374
+ # [["a","b"],["c","d"],["a","b"],["d"]]
375
+ def splitted(sep=Statsample::SPLIT_TOKEN)
405
376
  @data.collect{|x|
406
377
  if x.nil?
407
378
  nil
@@ -411,73 +382,73 @@ module Statsample
411
382
  [x]
412
383
  end
413
384
  }
414
- end
415
- # Returns a hash of Vectors, defined by the different values
416
- # defined on the fields
417
- # Example:
418
- #
419
- # a=Vector.new(["a,b","c,d","a,b"])
420
- # a.split_by_separator
421
- # => {"a"=>#<Statsample::Type::Nominal:0x7f2dbcc09d88
422
- # @data=[1, 0, 1]>,
423
- # "b"=>#<Statsample::Type::Nominal:0x7f2dbcc09c48
424
- # @data=[1, 1, 0]>,
425
- # "c"=>#<Statsample::Type::Nominal:0x7f2dbcc09b08
426
- # @data=[0, 1, 1]>}
427
- #
428
- def split_by_separator(sep=Statsample::SPLIT_TOKEN)
385
+ end
386
+ # Returns a hash of Vectors, defined by the different values
387
+ # defined on the fields
388
+ # Example:
389
+ #
390
+ # a=Vector.new(["a,b","c,d","a,b"])
391
+ # a.split_by_separator
392
+ # => {"a"=>#<Statsample::Type::Nominal:0x7f2dbcc09d88
393
+ # @data=[1, 0, 1]>,
394
+ # "b"=>#<Statsample::Type::Nominal:0x7f2dbcc09c48
395
+ # @data=[1, 0, 1]>,
396
+ # "c"=>#<Statsample::Type::Nominal:0x7f2dbcc09b08
397
+ # @data=[0, 1, 0]>, "d"=>#<Statsample::Type::Nominal @data=[0, 1, 0]>}
398
+ #
399
+ def split_by_separator(sep=Statsample::SPLIT_TOKEN)
429
400
  split_data=splitted(sep)
430
401
  factors=split_data.flatten.uniq.compact
431
402
  out=factors.inject({}) {|a,x|
432
- a[x]=[]
433
- a
403
+ a[x]=[]
404
+ a
434
405
  }
435
- split_data.each{|r|
436
- if r.nil?
437
- factors.each{|f|
438
- out[f].push(nil)
439
- }
440
- else
441
- factors.each{|f|
442
- out[f].push(r.include?(f) ? 1:0)
443
- }
406
+ split_data.each do |r|
407
+ if r.nil?
408
+ factors.each do |f|
409
+ out[f].push(nil)
444
410
  end
445
- }
411
+ else
412
+ factors.each do |f|
413
+ out[f].push(r.include?(f) ? 1:0)
414
+ end
415
+ end
416
+ end
446
417
  out.inject({}){|s,v|
447
- s[v[0]]=Vector.new(v[1],:nominal)
448
- s
418
+ s[v[0]]=Vector.new(v[1],:nominal)
419
+ s
449
420
  }
450
- end
451
- def split_by_separator_freq(sep=Statsample::SPLIT_TOKEN)
452
- split_by_separator(sep).inject({}) {|a,v|
453
- a[v[0]]=v[1].inject {|s,x| s+x.to_i}
454
- a
455
- }
456
- end
457
-
458
- # Returns an random sample of size n, with replacement,
459
- # only with valid data.
460
- #
461
- # In all the trails, every item have the same probability
462
- # of been selected.
463
- def sample_with_replacement(sample=1)
464
- if(@type!=:scale or !HAS_GSL)
421
+ end
422
+ def split_by_separator_freq(sep=Statsample::SPLIT_TOKEN)
423
+ split_by_separator(sep).inject({}) {|a,v|
424
+ a[v[0]]=v[1].inject {|s,x| s+x.to_i}
425
+ a
426
+ }
427
+ end
428
+
429
+ # Returns a random sample of size n, with replacement,
430
+ # only with valid data.
431
+ #
432
+ # In all the trials, every item has the same probability
432
+ # of being selected.
434
+ def sample_with_replacement(sample=1)
435
+ if(@type!=:scale or !Statsample.has_gsl?)
465
436
  vds=@valid_data.size
466
437
  (0...sample).collect{ @valid_data[rand(vds)] }
467
438
  else
468
439
  r = GSL::Rng.alloc(GSL::Rng::MT19937,rand(10000))
469
440
  r.sample(@gsl, sample).to_a
470
441
  end
471
- end
472
- # Returns an random sample of size n, without replacement,
473
- # only with valid data.
474
- #
475
- # Every element could only be selected once.
476
- #
477
- # A sample of the same size of the vector is the vector itself.
442
+ end
443
+ # Returns a random sample of size n, without replacement,
444
+ # only with valid data.
445
+ #
446
+ # Every element could only be selected once.
447
+ #
448
+ # A sample of the same size of the vector is the vector itself.
478
449
 
479
- def sample_without_replacement(sample=1)
480
- if(@type!=:scale or !HAS_GSL)
450
+ def sample_without_replacement(sample=1)
451
+ if(@type!=:scale or !Statsample.has_gsl?)
481
452
  raise ArgumentError, "Sample size couldn't be greater than n" if sample>@valid_data.size
482
453
  out=[]
483
454
  size=@valid_data.size
@@ -490,13 +461,13 @@ module Statsample
490
461
  r = GSL::Rng.alloc(GSL::Rng::MT19937,rand(10000))
491
462
  r.choose(@gsl, sample).to_a
492
463
  end
493
- end
494
- # Retrieves number of cases which comply condition.
495
- # If block given, retrieves number of instances where
496
- # block returns true.
497
- # If other values given, retrieves the frequency for
498
- # this value.
499
- def count(x=false)
464
+ end
465
+ # Retrieves the number of cases which comply with a condition.
466
+ # If block given, retrieves number of instances where
467
+ # block returns true.
468
+ # If other values given, retrieves the frequency for
469
+ # this value.
470
+ def count(x=false)
500
471
  if block_given?
501
472
  r=@data.inject(0) {|s, i|
502
473
  r=yield i
@@ -506,11 +477,11 @@ module Statsample
506
477
  else
507
478
  frequencies[x].nil? ? 0 : frequencies[x]
508
479
  end
509
- end
510
-
511
- # Returns the database type for the vector, according to its content
512
-
513
- def db_type(dbs='mysql')
480
+ end
481
+
482
+ # Returns the database type for the vector, according to its content
483
+
484
+ def db_type(dbs='mysql')
514
485
  # first, detect any character not number
515
486
  if @data.find {|v| v.to_s=~/\d{2,2}-\d{2,2}-\d{4,4}/} or @data.find {|v| v.to_s=~/\d{4,4}-\d{2,2}-\d{2,2}/}
516
487
  return "DATE"
@@ -521,43 +492,43 @@ module Statsample
521
492
  else
522
493
  return "INTEGER"
523
494
  end
524
- end
525
- # Return true if all data is Date, "today" values or nil
526
- def can_be_date?
495
+ end
496
+ # Return true if all data is Date, "today" values or nil
497
+ def can_be_date?
527
498
  if @data.find {|v|
528
499
  !v.nil? and !v.is_a? Date and !v.is_a? Time and (v.is_a? String and !@today_values.include? v) and (v.is_a? String and !(v=~/\d{4,4}[-\/]\d{1,2}[-\/]\d{1,2}/))}
529
500
  false
530
501
  else
531
502
  true
532
503
  end
533
- end
534
- # Return true if all data is Numeric or nil
535
- def can_be_scale?
504
+ end
505
+ # Return true if all data is Numeric or nil
506
+ def can_be_scale?
536
507
  if @data.find {|v| !v.nil? and !v.is_a? Numeric and !@missing_values.include? v}
537
508
  false
538
509
  else
539
510
  true
540
511
  end
541
- end
542
-
543
- def to_s
512
+ end
513
+
514
+ def to_s
544
515
  sprintf("Vector(type:%s, n:%d)[%s]",@type.to_s,@data.size, @data.collect{|d| d.nil? ? "nil":d}.join(","))
545
- end
546
- # Ugly name. Really, create a Vector for standard 'matrix' package.
547
- # <tt>dir</tt> could be :horizontal or :vertical
548
- def to_matrix(dir=:horizontal)
516
+ end
517
+ # Ugly name. Really, create a Vector for standard 'matrix' package.
518
+ # <tt>dir</tt> could be :horizontal or :vertical
519
+ def to_matrix(dir=:horizontal)
549
520
  case dir
550
521
  when :horizontal
551
522
  Matrix[@data]
552
523
  when :vertical
553
524
  Matrix.columns([@data])
554
525
  end
555
- end
556
- def inspect
526
+ end
527
+ def inspect
557
528
  self.to_s
558
- end
559
- # Retrieves uniques values for data.
560
- def factors
529
+ end
530
+ # Retrieves uniques values for data.
531
+ def factors
561
532
  if @type==:scale
562
533
  @scale_data.uniq.sort
563
534
  elsif @type==:date
@@ -565,26 +536,26 @@ module Statsample
565
536
  else
566
537
  @valid_data.uniq.sort
567
538
  end
568
- end
569
- if Statsample::STATSAMPLE__.respond_to?(:frequencies)
539
+ end
540
+ if Statsample::STATSAMPLE__.respond_to?(:frequencies)
570
541
  # Returns a hash with the distribution of frequencies for
571
542
  # the sample
572
543
  def frequencies
573
544
  Statsample::STATSAMPLE__.frequencies(@valid_data)
574
545
  end
575
- else
546
+ else
576
547
  def frequencies #:nodoc:
577
548
  _frequencies
578
549
  end
579
- end
580
- def _frequencies #:nodoc:
550
+ end
551
+ def _frequencies #:nodoc:
581
552
  @valid_data.inject(Hash.new) {|a,x|
582
553
  a[x]||=0
583
554
  a[x]=a[x]+1
584
555
  a
585
556
  }
586
- end
587
- # Plot frequencies on a chart, using gnuplot
557
+ end
558
+ # Plot frequencies on a chart, using gnuplot
588
559
  def plot_frequencies
589
560
  require 'gnuplot'
590
561
  x=[]
@@ -594,30 +565,30 @@ module Statsample
594
565
  y.push(v)
595
566
  }
596
567
  Gnuplot.open do |gp|
597
- Gnuplot::Plot.new( gp ) do |plot|
598
- plot.boxwidth("0.9 absolute")
599
- plot.yrange("[0:#{y.max}]")
600
- plot.style("fill solid 1.00 border -1")
601
- plot.set("xtics border in scale 1,0.5 nomirror rotate by -45 offset character 0, 0, 0")
602
- plot.style("histogram")
603
- plot.style("data histogram")
604
- i=-1
605
- plot.set("xtics","("+x.collect{|v| i+=1; sprintf("\"%s\" %d",v,i)}.join(",")+")")
606
- plot.data << Gnuplot::DataSet.new( [y] ) do |ds|
607
- end
568
+ Gnuplot::Plot.new( gp ) do |plot|
569
+ plot.boxwidth("0.9 absolute")
570
+ plot.yrange("[0:#{y.max}]")
571
+ plot.style("fill solid 1.00 border -1")
572
+ plot.set("xtics border in scale 1,0.5 nomirror rotate by -45 offset character 0, 0, 0")
573
+ plot.style("histogram")
574
+ plot.style("data histogram")
575
+ i=-1
576
+ plot.set("xtics","("+x.collect{|v| i+=1; sprintf("\"%s\" %d",v,i)}.join(",")+")")
577
+ plot.data << Gnuplot::DataSet.new( [y] ) do |ds|
608
578
  end
609
579
  end
610
-
580
+ end
581
+
611
582
  end
612
-
613
-
583
+
584
+
614
585
  # Returns the most frequent item.
615
586
  def mode
616
- frequencies.max{|a,b| a[1]<=>b[1]}[0]
587
+ frequencies.max{|a,b| a[1]<=>b[1]}[0]
617
588
  end
618
589
  # The numbers of item with valid data.
619
590
  def n_valid
620
- @valid_data.size
591
+ @valid_data.size
621
592
  end
622
593
  # Returns a hash with the distribution of proportions of
623
594
  # the sample.
@@ -632,38 +603,38 @@ module Statsample
632
603
  frequencies[v].quo(@valid_data.size)
633
604
  end
634
605
  def summary(out="")
635
- out << sprintf("n valid:%d\n",n_valid)
636
- out << sprintf("factors:%s\n",factors.join(","))
637
- out << "mode:"+mode.to_s+"\n"
638
- out << "Distribution:\n"
639
- frequencies.sort.each{|k,v|
640
- key=labels.has_key?(k) ? labels[k]:k
641
- out << sprintf("%s : %s (%0.2f%%)\n",key,v, (v.quo(n_valid))*100)
642
- }
643
- if(@type==:ordinal)
644
- out << "median:"+median.to_s+"\n"
645
- end
646
- if(@type==:scale)
647
- out << "mean:"+mean.to_s+"\n"
648
- out << "sd:"+sd.to_s+"\n"
649
-
650
- end
651
- out
606
+ out << sprintf("n valid:%d\n",n_valid)
607
+ out << sprintf("factors:%s\n",factors.join(","))
608
+ out << "mode:"+mode.to_s+"\n"
609
+ out << "Distribution:\n"
610
+ frequencies.sort.each{|k,v|
611
+ key=labels.has_key?(k) ? labels[k]:k
612
+ out << sprintf("%s : %s (%0.2f%%)\n",key,v, (v.quo(n_valid))*100)
613
+ }
614
+ if(@type==:ordinal)
615
+ out << "median:"+median.to_s+"\n"
616
+ end
617
+ if(@type==:scale)
618
+ out << "mean:"+mean.to_s+"\n"
619
+ out << "sd:"+sd.to_s+"\n"
620
+
621
+ end
622
+ out
652
623
  end
653
624
 
654
625
  # Variance of p, according to population size
655
626
  def variance_proportion(n_poblation, v=1)
656
- Statsample::proportion_variance_sample(self.proportion(v), @valid_data.size, n_poblation)
627
+ Statsample::proportion_variance_sample(self.proportion(v), @valid_data.size, n_poblation)
657
628
  end
658
629
  # Variance of the total, according to population size
659
630
  def variance_total(n_poblation, v=1)
660
- Statsample::total_variance_sample(self.proportion(v), @valid_data.size, n_poblation)
631
+ Statsample::total_variance_sample(self.proportion(v), @valid_data.size, n_poblation)
661
632
  end
662
633
  def proportion_confidence_interval_t(n_poblation,margin=0.95,v=1)
663
- Statsample::proportion_confidence_interval_t(proportion(v), @valid_data.size, n_poblation, margin)
634
+ Statsample::proportion_confidence_interval_t(proportion(v), @valid_data.size, n_poblation, margin)
664
635
  end
665
636
  def proportion_confidence_interval_z(n_poblation,margin=0.95,v=1)
666
- Statsample::proportion_confidence_interval_z(proportion(v), @valid_data.size, n_poblation, margin)
637
+ Statsample::proportion_confidence_interval_z(proportion(v), @valid_data.size, n_poblation, margin)
667
638
  end
668
639
 
669
640
  self.instance_methods.find_all{|met| met=~/_slow$/}.each do |met|
@@ -672,20 +643,21 @@ module Statsample
672
643
  alias_method met_or, met
673
644
  end
674
645
  end
675
- ######
676
- ### Ordinal Methods
677
- ######
678
-
646
+
647
+ ######
648
+ ### Ordinal Methods
649
+ ######
650
+
679
651
  # Return the value of the percentil q
680
652
  def percentil(q)
681
- check_type :ordinal
682
- sorted=@valid_data.sort
683
- v= (n_valid * q).quo(100)
684
- if(v.to_i!=v)
685
- sorted[v.to_i]
686
- else
687
- (sorted[(v-0.5).to_i].to_f + sorted[(v+0.5).to_i]).quo(2)
688
- end
653
+ check_type :ordinal
654
+ sorted=@valid_data.sort
655
+ v= (n_valid * q).quo(100)
656
+ if(v.to_i!=v)
657
+ sorted[v.to_i]
658
+ else
659
+ (sorted[(v-0.5).to_i].to_f + sorted[(v+0.5).to_i]).quo(2)
660
+ end
689
661
  end
690
662
  # Returns a ranked vector.
691
663
  def ranked(type=:ordinal)
@@ -698,27 +670,28 @@ module Statsample
698
670
  }
699
671
  @data.collect {|c| r[c] }.to_vector(type)
700
672
  end
701
- # Return the median (percentil 50)
702
- def median
703
- check_type :ordinal
704
- if HAS_GSL and @type==:scale
705
- sorted=GSL::Vector.alloc(@scale_data.sort)
706
- GSL::Stats::median_from_sorted_data(sorted)
707
- else
708
- percentil(50)
673
+ # Return the median (percentil 50)
674
+ def median
675
+ check_type :ordinal
676
+ if Statsample.has_gsl? and @type==:scale
677
+ sorted=GSL::Vector.alloc(@scale_data.sort)
678
+ GSL::Stats::median_from_sorted_data(sorted)
679
+ else
680
+ percentil(50)
681
+ end
682
+ end
683
+ # Minimun value
684
+ def min
685
+ check_type :ordinal
686
+ @valid_data.min;
709
687
  end
710
- end
711
- # Minimun value
712
- def min;
713
- check_type :ordinal
714
- @valid_data.min;
715
- end
716
688
  # Maximum value
717
- def max;
718
- check_type :ordinal
719
- @valid_data.max;
720
- end
721
- def set_date_data # :nodoc:
689
+ def max
690
+ check_type :ordinal
691
+ @valid_data.max;
692
+ end
693
+
694
+ def set_date_data
722
695
  @date_data_with_nils=@data.collect do|x|
723
696
  if x.is_a? Date
724
697
  x
@@ -733,7 +706,8 @@ module Statsample
733
706
  end
734
707
  end
735
708
  end
736
- def set_scale_data # :nodoc
709
+
710
+ def set_scale_data
737
711
  @scale_data=@valid_data.collect do|x|
738
712
  if x.is_a? Numeric
739
713
  x
@@ -743,12 +717,13 @@ module Statsample
743
717
  x.to_f
744
718
  end
745
719
  end
746
- if HAS_GSL
720
+ if Statsample.has_gsl?
747
721
  @gsl=GSL::Vector.alloc(@scale_data) if @scale_data.size>0
748
722
  end
749
723
  end
750
- private :set_scale_data
751
-
724
+
725
+ private :set_date_data, :set_scale_data
726
+
752
727
  # The range of the data (max - min)
753
728
  def range;
754
729
  check_type :scale
@@ -788,7 +763,7 @@ module Statsample
788
763
  squares.quo(n_valid) - m.square
789
764
  end
790
765
 
791
-
766
+
792
767
  # Population Standard deviation (denominator N)
793
768
  def standard_deviation_population(m=nil)
794
769
  check_type :scale
@@ -801,7 +776,7 @@ module Statsample
801
776
  m||=mean
802
777
  sum_of_squares(m).quo(n_valid - 1)
803
778
  end
804
-
779
+
805
780
  # Sample Standard deviation (denominator n-1)
806
781
 
807
782
  def standard_deviation_sample(m=nil)
@@ -831,7 +806,7 @@ module Statsample
831
806
  check_type :scale
832
807
  @scale_data.inject(1){|a,x| a*x }
833
808
  end
834
- if HAS_GSL
809
+ if Statsample.has_gsl?
835
810
  %w{skew kurtosis variance_sample standard_deviation_sample variance_population standard_deviation_population mean sum}.each{|m|
836
811
  m_nuevo=(m+"_slow").intern
837
812
  alias_method m_nuevo, m.intern