statsample 0.6.5 → 0.6.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (61) hide show
  1. data/History.txt +15 -0
  2. data/Manifest.txt +6 -0
  3. data/README.txt +30 -12
  4. data/Rakefile +91 -0
  5. data/demo/levene.rb +9 -0
  6. data/demo/multiple_regression.rb +1 -7
  7. data/demo/polychoric.rb +1 -0
  8. data/demo/principal_axis.rb +8 -0
  9. data/lib/distribution/f.rb +22 -22
  10. data/lib/spss.rb +99 -99
  11. data/lib/statsample/bivariate/polychoric.rb +32 -22
  12. data/lib/statsample/bivariate/tetrachoric.rb +212 -207
  13. data/lib/statsample/bivariate.rb +6 -6
  14. data/lib/statsample/codification.rb +65 -65
  15. data/lib/statsample/combination.rb +60 -59
  16. data/lib/statsample/converter/csv19.rb +12 -12
  17. data/lib/statsample/converters.rb +1 -1
  18. data/lib/statsample/dataset.rb +93 -36
  19. data/lib/statsample/dominanceanalysis/bootstrap.rb +66 -3
  20. data/lib/statsample/dominanceanalysis.rb +5 -6
  21. data/lib/statsample/factor/pca.rb +41 -11
  22. data/lib/statsample/factor/principalaxis.rb +105 -29
  23. data/lib/statsample/factor/rotation.rb +20 -3
  24. data/lib/statsample/factor.rb +1 -1
  25. data/lib/statsample/graph/gdchart.rb +13 -13
  26. data/lib/statsample/graph/svggraph.rb +166 -167
  27. data/lib/statsample/matrix.rb +22 -12
  28. data/lib/statsample/mle/logit.rb +3 -2
  29. data/lib/statsample/mle/probit.rb +7 -5
  30. data/lib/statsample/mle.rb +4 -2
  31. data/lib/statsample/multiset.rb +125 -124
  32. data/lib/statsample/permutation.rb +2 -1
  33. data/lib/statsample/regression/binomial/logit.rb +4 -3
  34. data/lib/statsample/regression/binomial/probit.rb +2 -1
  35. data/lib/statsample/regression/binomial.rb +62 -81
  36. data/lib/statsample/regression/multiple/baseengine.rb +1 -1
  37. data/lib/statsample/regression/multiple/gslengine.rb +1 -1
  38. data/lib/statsample/regression/multiple/matrixengine.rb +12 -6
  39. data/lib/statsample/regression/multiple.rb +15 -42
  40. data/lib/statsample/regression/simple.rb +93 -78
  41. data/lib/statsample/regression.rb +74 -2
  42. data/lib/statsample/reliability.rb +117 -120
  43. data/lib/statsample/srs.rb +156 -153
  44. data/lib/statsample/test/levene.rb +90 -0
  45. data/lib/statsample/test/umannwhitney.rb +25 -9
  46. data/lib/statsample/test.rb +2 -0
  47. data/lib/statsample/vector.rb +388 -413
  48. data/lib/statsample.rb +74 -30
  49. data/po/es/statsample.mo +0 -0
  50. data/test/test_bivariate.rb +5 -4
  51. data/test/test_combination.rb +1 -1
  52. data/test/test_dataset.rb +2 -2
  53. data/test/test_factor.rb +53 -6
  54. data/test/test_gsl.rb +1 -1
  55. data/test/test_mle.rb +1 -1
  56. data/test/test_regression.rb +18 -33
  57. data/test/test_statistics.rb +15 -33
  58. data/test/test_stest.rb +35 -0
  59. data/test/test_svg_graph.rb +2 -2
  60. data/test/test_vector.rb +331 -333
  61. metadata +38 -11
@@ -10,127 +10,98 @@ class Array
10
10
  Statsample::Vector.new(self,:scale,*args)
11
11
  end
12
12
  end
13
+
13
14
  module Statsample
14
- class << self
15
- # Create a matrix using vectors as columns.
16
- # Use:
17
- #
18
- # matrix=Statsample.vector_cols_matrix(v1,v2)
19
- def vector_cols_matrix(*vs)
20
- # test
21
- size=vs[0].size
22
- vs.each{|v|
23
- raise ArgumentError,"Arguments should be Vector" unless v.instance_of? Statsample::Vector
24
- raise ArgumentError,"Vectors size should be the same" if v.size!=size
25
- }
26
- Matrix.rows((0...size).to_a.collect() {|i|
27
- vs.collect{|v| v[i]}
28
- })
29
- end
30
- end
31
- # Returns a duplicate of the input vectors, without missing data
32
- # for any of the vectors.
33
- #
34
- # a=[1,2,3,6,7,nil,3,5].to_scale
35
- # b=[nil,nil,5,6,4,5,10,2].to_scale
36
- # c=[2,4,6,7,4,5,6,7].to_scale
37
- # a2,b2,c2=Statsample.only_valid(a,b,c)
38
- # => [#<Statsample::Scale:0xb748c8c8 @data=[3, 6, 7, 3, 5]>,
39
- # #<Statsample::Scale:0xb748c814 @data=[5, 6, 4, 10, 2]>,
40
- # #<Statsample::Scale:0xb748c760 @data=[6, 7, 4, 6, 7]>]
41
- #
42
- def self.only_valid(*vs)
43
- i=1
44
- h=vs.inject({}) {|a,v| a["v#{i}"]=v;i+=1;a}
45
- ds=Statsample::Dataset.new(h).dup_only_valid
46
- ds.vectors.values
47
- end
48
-
49
- class Vector
50
- include Enumerable
51
- include Writable
52
- DEFAULT_OPTIONS={
53
- :missing_values=>[],
54
- :today_values=>['NOW','TODAY', :NOW, :TODAY],
55
- :labels=>{}
56
- }
57
- # Level of measurement. Could be :nominal, :ordinal or :scale
58
- attr_reader :type
59
- # Original data.
60
- attr_reader :data
61
- # Valid data. Equal to data, minus values assigned as missing values
62
- attr_reader :valid_data
63
- # Array of values considered as missing. Nil is a missing value, by default
64
- attr_reader :missing_values
65
- # Array of values considered as "Today", with date type. "NOW", "TODAY", :NOW and :TODAY are 'today' values, by default
66
- attr_reader :today_values
67
- # Missing values array
68
- attr_reader :missing_data
69
- # Original data, with all missing values replaced by nils
70
- attr_reader :data_with_nils
71
- # Date date, with all missing values replaced by nils
72
- attr_reader :date_data_with_nils
73
- # GSL Object, only available with rbgsl extension and type==:scale
74
- attr_reader :gsl
75
- # Change label for specific values
76
- attr_accessor :labels
77
- # Creates a new Vector object.
78
- # [data] Array of data.
79
- # [type] Level of meausurement. See Vector#type
80
- # [opts] Options
81
- # [:missing_values] Array of missing values. See Vector#missing_values
82
- # [:today_values] Array of 'today' values. See Vector#today_values
83
- # [:labels] Labels for data values
84
- #
15
+
16
+ # Collection of values on one dimension. Works as a column on a Spreadsheet.
17
+ #
18
+ # == Usage
85
19
  # The fast way to create a vector uses Array.to_vector or Array.to_scale.
86
20
  #
87
21
  # v=[1,2,3,4].to_vector(:scale)
88
22
  # v=[1,2,3,4].to_scale
89
- #
90
-
91
- def initialize(data=[], t=:nominal, opts=Hash.new)
92
- raise "Data should be an array" unless data.is_a? Array
93
- @data=data
94
- @type=t
95
- opts=DEFAULT_OPTIONS.merge(opts)
96
- @missing_values=opts[:missing_values]
97
- @labels=opts[:labels]
98
- @today_values=opts[:today_values]
99
- @valid_data=[]
100
- @data_with_nils=[]
101
- @date_data_with_nils=[]
102
- @missing_data=[]
103
- @has_missing_data=nil
104
- @scale_data=nil
105
- set_valid_data_intern
106
- self.type=t
107
- end
108
- # Creates a duplicate of the Vector.
109
- # Note: data, missing_values and labels are duplicated, so
110
- # changes on original vector doesn't propages to copies.
111
- def dup
23
+ #
24
+ class Vector
25
+ include Enumerable
26
+ include Writable
27
+ # DEFAULT OPTIONS
28
+ DEFAULT_OPTIONS={
29
+ :missing_values=>[],
30
+ :today_values=>['NOW','TODAY', :NOW, :TODAY],
31
+ :labels=>{}
32
+ }
33
+ # Level of measurement. Could be :nominal, :ordinal or :scale
34
+ attr_reader :type
35
+ # Original data.
36
+ attr_reader :data
37
+ # Valid data. Equal to data, minus values assigned as missing values
38
+ attr_reader :valid_data
39
+ # Array of values considered as missing. Nil is a missing value, by default
40
+ attr_reader :missing_values
41
+ # Array of values considered as "Today", with date type. "NOW", "TODAY", :NOW and :TODAY are 'today' values, by default
42
+ attr_reader :today_values
43
+ # Missing values array
44
+ attr_reader :missing_data
45
+ # Original data, with all missing values replaced by nils
46
+ attr_reader :data_with_nils
47
+ # Date date, with all missing values replaced by nils
48
+ attr_reader :date_data_with_nils
49
+ # GSL Object, only available with rbgsl extension and type==:scale
50
+ attr_reader :gsl
51
+ # Change label for specific values
52
+ attr_accessor :labels
53
+ #
54
+ # Creates a new Vector object.
55
+ # [data] Array of data.
56
+ # [type] Level of meausurement. See Vector#type
57
+ # [opts] Options
58
+ # [:missing_values] Array of missing values. See Vector#missing_values
59
+ # [:today_values] Array of 'today' values. See Vector#today_values
60
+ # [:labels] Labels for data values
61
+ #
62
+ def initialize(data=[], type=:nominal, opts=Hash.new)
63
+ raise "Data should be an array" unless data.is_a? Array
64
+ @data=data
65
+ @type=type
66
+ opts=DEFAULT_OPTIONS.merge(opts)
67
+ @missing_values=opts[:missing_values]
68
+ @labels=opts[:labels]
69
+ @today_values=opts[:today_values]
70
+ @valid_data=[]
71
+ @data_with_nils=[]
72
+ @date_data_with_nils=[]
73
+ @missing_data=[]
74
+ @has_missing_data=nil
75
+ @scale_data=nil
76
+ set_valid_data_intern
77
+ self.type=type
78
+ end
79
+ # Creates a duplicate of the Vector.
80
+ # Note: data, missing_values and labels are duplicated, so
81
+ # changes on original vector doesn't propages to copies.
82
+ def dup
112
83
  Vector.new(@data.dup,@type, :missing_values => @missing_values.dup, :labels => @labels.dup)
113
- end
114
- # Returns an empty duplicate of the vector. Maintains the type,
115
- # missing values and labels.
116
- def dup_empty
84
+ end
85
+ # Returns an empty duplicate of the vector. Maintains the type,
86
+ # missing values and labels.
87
+ def dup_empty
117
88
  Vector.new([],@type, :missing_values => @missing_values.dup, :labels => @labels.dup)
118
- end
119
- # Raises an exception if type of vector is inferior to t type
120
- def check_type(t)
89
+ end
90
+ # Raises an exception if type of vector is inferior to t type
91
+ def check_type(t)
121
92
  raise NoMethodError if (t==:scale and @type!=:scale) or (t==:ordinal and @type==:nominal) or (t==:date)
122
- end
123
- private :check_type
124
-
125
- # Return a vector usign the standarized values for data
126
- # with sd with denominator N
127
- def vector_standarized_pop
93
+ end
94
+ private :check_type
95
+
96
+ # Return a vector usign the standarized values for data
97
+ # with sd with denominator N
98
+ def vector_standarized_pop
128
99
  vector_standarized(true)
129
- end
130
- # Return a vector usign the standarized values for data
131
- # with sd with denominator n-1
132
-
133
- def vector_standarized(use_population=false)
100
+ end
101
+ # Return a vector usign the standarized values for data
102
+ # with sd with denominator n-1
103
+
104
+ def vector_standarized(use_population=false)
134
105
  raise "Should be a scale" unless @type==:scale
135
106
  m=mean
136
107
  sd=use_population ? sdp : sds
@@ -141,11 +112,10 @@ module Statsample
141
112
  nil
142
113
  end
143
114
  }.to_vector(:scale)
144
- end
145
-
146
- alias_method :standarized, :vector_standarized
147
-
148
- def box_cox_transformation(lambda) # :nodoc:
115
+ end
116
+ alias_method :standarized, :vector_standarized
117
+
118
+ def box_cox_transformation(lambda) # :nodoc:
149
119
  raise "Should be a scale" unless @type==:scale
150
120
  @data_with_nils.collect{|x|
151
121
  if !x.nil?
@@ -158,42 +128,42 @@ module Statsample
158
128
  nil
159
129
  end
160
130
  }.to_vector(:scale)
161
- end
162
-
163
- # Vector equality.
164
- # Two vector will be the same if their data, missing values, type, labels are equals
165
- def ==(v2)
131
+ end
132
+
133
+ # Vector equality.
134
+ # Two vector will be the same if their data, missing values, type, labels are equals
135
+ def ==(v2)
166
136
  raise TypeError,"Argument should be a Vector" unless v2.instance_of? Statsample::Vector
167
137
  @data==v2.data and @missing_values==v2.missing_values and @type==v2.type and @labels=v2.labels
168
- end
169
-
170
- def _dump(i) # :nodoc:
138
+ end
139
+
140
+ def _dump(i) # :nodoc:
171
141
  Marshal.dump({'data'=>@data,'missing_values'=>@missing_values, 'labels'=>@labels, 'type'=>@type})
172
- end
173
-
174
- def self._load(data) # :nodoc:
142
+ end
143
+
144
+ def self._load(data) # :nodoc:
175
145
  h=Marshal.load(data)
176
146
  Vector.new(h['data'], h['type'],:missing_values=> h['missing_values'], :labels=>h['labels'])
177
- end
178
- # Returns a new vector, with data modified by block.
179
- # Equivalent to create a Vector after #collect on data
180
- def recode
147
+ end
148
+ # Returns a new vector, with data modified by block.
149
+ # Equivalent to create a Vector after #collect on data
150
+ def recode
181
151
  @data.collect{|x|
182
152
  yield x
183
153
  }.to_vector(@type)
184
- end
185
- # Modifies current vector, with data modified by block.
186
- # Equivalent to #collect! on @data
187
- def recode!
154
+ end
155
+ # Modifies current vector, with data modified by block.
156
+ # Equivalent to #collect! on @data
157
+ def recode!
188
158
  @data.collect!{|x|
189
159
  yield x
190
160
  }
191
161
  set_valid_data
192
- end
193
- # Dicotomize the vector with 0 and 1, based on lowest value
194
- # If parameter if defined, this value and lower
195
- # will be 0 and higher, 1
196
- def dichotomize(low=nil)
162
+ end
163
+ # Dicotomize the vector with 0 and 1, based on lowest value
164
+ # If parameter if defined, this value and lower
165
+ # will be 0 and higher, 1
166
+ def dichotomize(low=nil)
197
167
  fs=factors
198
168
  low||=factors.min
199
169
  @data_with_nils.collect{|x|
@@ -205,44 +175,44 @@ module Statsample
205
175
  0
206
176
  end
207
177
  }.to_scale
208
- end
209
- # Iterate on each item.
210
- # Equivalent to
211
- # @data.each{|x| yield x}
212
- def each
178
+ end
179
+ # Iterate on each item.
180
+ # Equivalent to
181
+ # @data.each{|x| yield x}
182
+ def each
213
183
  @data.each{|x| yield(x) }
214
- end
215
-
216
- # Iterate on each item, retrieving index
217
- def each_index
184
+ end
185
+
186
+ # Iterate on each item, retrieving index
187
+ def each_index
218
188
  (0...@data.size).each {|i|
219
189
  yield(i)
220
190
  }
221
- end
222
- # Add a value at the end of the vector.
223
- # If second argument set to false, you should update the Vector usign
224
- # Vector.set_valid_data at the end of your insertion cycle
225
- #
226
- def add(v,update_valid=true)
191
+ end
192
+ # Add a value at the end of the vector.
193
+ # If second argument set to false, you should update the Vector usign
194
+ # Vector.set_valid_data at the end of your insertion cycle
195
+ #
196
+ def add(v,update_valid=true)
227
197
  @data.push(v)
228
198
  set_valid_data if update_valid
229
- end
230
- # Update valid_data, missing_data, data_with_nils and gsl
231
- # at the end of an insertion.
232
- #
233
- # Use after Vector.add(v,false)
234
- # Usage:
235
- # v=Statsample::Vector.new
236
- # v.add(2,false)
237
- # v.add(4,false)
238
- # v.data
239
- # => [2,3]
240
- # v.valid_data
241
- # => []
242
- # v.set_valid_data
243
- # v.valid_data
244
- # => [2,3]
245
- def set_valid_data
199
+ end
200
+ # Update valid_data, missing_data, data_with_nils and gsl
201
+ # at the end of an insertion.
202
+ #
203
+ # Use after Vector.add(v,false)
204
+ # Usage:
205
+ # v=Statsample::Vector.new
206
+ # v.add(2,false)
207
+ # v.add(4,false)
208
+ # v.data
209
+ # => [2,3]
210
+ # v.valid_data
211
+ # => []
212
+ # v.set_valid_data
213
+ # v.valid_data
214
+ # => [2,3]
215
+ def set_valid_data
246
216
  @valid_data.clear
247
217
  @missing_data.clear
248
218
  @data_with_nils.clear
@@ -251,18 +221,18 @@ module Statsample
251
221
  set_valid_data_intern
252
222
  set_scale_data if(@type==:scale)
253
223
  set_date_data if(@type==:date)
254
- end
255
-
256
- if Statsample::STATSAMPLE__.respond_to?(:set_valid_data_intern)
224
+ end
225
+
226
+ if Statsample::STATSAMPLE__.respond_to?(:set_valid_data_intern)
257
227
  def set_valid_data_intern #:nodoc:
258
228
  Statsample::STATSAMPLE__.set_valid_data_intern(self)
259
229
  end
260
- else
230
+ else
261
231
  def set_valid_data_intern #:nodoc:
262
232
  _set_valid_data_intern
263
233
  end
264
- end
265
- def _set_valid_data_intern #:nodoc:
234
+ end
235
+ def _set_valid_data_intern #:nodoc:
266
236
  @data.each do |n|
267
237
  if is_valid? n
268
238
  @valid_data.push(n)
@@ -273,19 +243,19 @@ module Statsample
273
243
  end
274
244
  end
275
245
  @has_missing_data=@missing_data.size>0
276
- end
277
-
278
- # Retrieves true if data has one o more missing values
279
- def has_missing_data?
246
+ end
247
+
248
+ # Retrieves true if data has one o more missing values
249
+ def has_missing_data?
280
250
  @has_missing_data
281
- end
282
- # Retrieves label for value x. Retrieves x if
283
- # no label defined.
284
- def labeling(x)
251
+ end
252
+ # Retrieves label for value x. Retrieves x if
253
+ # no label defined.
254
+ def labeling(x)
285
255
  @labels.has_key?(x) ? @labels[x].to_s : x.to_s
286
- end
287
- # Returns a Vector with data with labels replaced by the label.
288
- def vector_labeled
256
+ end
257
+ # Returns a Vector with data with labels replaced by the label.
258
+ def vector_labeled
289
259
  d=@data.collect{|x|
290
260
  if @labels.has_key? x
291
261
  @labels[x]
@@ -294,69 +264,70 @@ module Statsample
294
264
  end
295
265
  }
296
266
  Vector.new(d,@type)
297
- end
298
- # Size of total data
299
- def size
267
+ end
268
+ # Size of total data
269
+ def size
300
270
  @data.size
301
- end
302
- alias_method :n, :size
303
-
304
- # Retrieves i element of data
305
- def [](i)
271
+ end
272
+ alias_method :n, :size
273
+
274
+ # Retrieves i element of data
275
+ def [](i)
306
276
  @data[i]
307
- end
308
- # Set i element of data.
309
- # Note: Use set_valid_data if you include missing values
310
- def []=(i,v)
277
+ end
278
+ # Set i element of data.
279
+ # Note: Use set_valid_data if you include missing values
280
+ def []=(i,v)
311
281
  @data[i]=v
312
- end
313
- # Return true if a value is valid (not nil and not included on missing values)
314
- def is_valid?(x)
282
+ end
283
+ # Return true if a value is valid (not nil and not included on missing values)
284
+ def is_valid?(x)
315
285
  !(x.nil? or @missing_values.include? x)
316
- end
317
- # Set missing_values.
318
- # if update_valid = false, you should use
319
- # set_valid_data after all changes
320
- def missing_values=(vals)
286
+ end
287
+ # Set missing_values.
288
+ # if update_valid = false, you should use
289
+ # set_valid_data after all changes
290
+ def missing_values=(vals)
321
291
  @missing_values = vals
322
292
  set_valid_data
323
- end
324
- def today_values=(vals)
293
+ end
294
+ # Set data considered as "today" on data vectors
295
+ def today_values=(vals)
325
296
  @today_values = vals
326
297
  set_valid_data
327
- end
328
- # Set level of measurement.
329
- def type=(t)
298
+ end
299
+ # Set level of measurement.
300
+ def type=(t)
330
301
  @type=t
331
302
  set_scale_data if(t==:scale)
332
303
  set_date_data if (t==:date)
333
- end
334
- def to_a
304
+ end
305
+ def to_a
335
306
  @data.dup
336
- end
337
- alias_method :to_ary, :to_a
338
-
339
- # Vector sum.
340
- # - If v is a scalar, add this value to all elements
341
- # - If v is a Array or a Vector, should be of the same size of this vector
342
- # every item of this vector will be added to the value of the
343
- # item at the same position on the other vector
344
- def +(v)
307
+ end
308
+ alias_method :to_ary, :to_a
309
+
310
+ # Vector sum.
311
+ # - If v is a scalar, add this value to all elements
312
+ # - If v is a Array or a Vector, should be of the same size of this vector
313
+ # every item of this vector will be added to the value of the
314
+ # item at the same position on the other vector
315
+ def +(v)
345
316
  _vector_ari("+",v)
346
- end
347
- # Vector rest.
348
- # - If v is a scalar, rest this value to all elements
349
- # - If v is a Array or a Vector, should be of the same
350
- # size of this vector
351
- # every item of this vector will be rested to the value of the
352
- # item at the same position on the other vector
353
-
354
- def -(v)
317
+ end
318
+ # Vector rest.
319
+ # - If v is a scalar, rest this value to all elements
320
+ # - If v is a Array or a Vector, should be of the same
321
+ # size of this vector
322
+ # every item of this vector will be rested to the value of the
323
+ # item at the same position on the other vector
324
+
325
+ def -(v)
355
326
  _vector_ari("-",v)
356
- end
357
- # Reports all values that doesn't comply with a condition.
358
- # Returns a hash with the index of data and the invalid data.
359
- def verify
327
+ end
328
+ # Reports all values that doesn't comply with a condition.
329
+ # Returns a hash with the index of data and the invalid data.
330
+ def verify
360
331
  h={}
361
332
  (0...@data.size).to_a.each{|i|
362
333
  if !(yield @data[i])
@@ -364,8 +335,8 @@ module Statsample
364
335
  end
365
336
  }
366
337
  h
367
- end
368
- def _vector_ari(method,v) # :nodoc:
338
+ end
339
+ def _vector_ari(method,v) # :nodoc:
369
340
  if(v.is_a? Vector or v.is_a? Array)
370
341
  if v.size==@data.size
371
342
  # i=0
@@ -395,13 +366,13 @@ module Statsample
395
366
  raise TypeError,"You should pass a scalar or a array/vector"
396
367
  end
397
368
 
398
- end
399
- # Return an array with the data splitted by a separator.
400
- # a=Vector.new(["a,b","c,d","a,b","d"])
401
- # a.splitted
402
- # =>
403
- # [["a","b"],["c","d"],["a","b"],["d"]]
404
- def splitted(sep=Statsample::SPLIT_TOKEN)
369
+ end
370
+ # Return an array with the data splitted by a separator.
371
+ # a=Vector.new(["a,b","c,d","a,b","d"])
372
+ # a.splitted
373
+ # =>
374
+ # [["a","b"],["c","d"],["a","b"],["d"]]
375
+ def splitted(sep=Statsample::SPLIT_TOKEN)
405
376
  @data.collect{|x|
406
377
  if x.nil?
407
378
  nil
@@ -411,73 +382,73 @@ module Statsample
411
382
  [x]
412
383
  end
413
384
  }
414
- end
415
- # Returns a hash of Vectors, defined by the different values
416
- # defined on the fields
417
- # Example:
418
- #
419
- # a=Vector.new(["a,b","c,d","a,b"])
420
- # a.split_by_separator
421
- # => {"a"=>#<Statsample::Type::Nominal:0x7f2dbcc09d88
422
- # @data=[1, 0, 1]>,
423
- # "b"=>#<Statsample::Type::Nominal:0x7f2dbcc09c48
424
- # @data=[1, 1, 0]>,
425
- # "c"=>#<Statsample::Type::Nominal:0x7f2dbcc09b08
426
- # @data=[0, 1, 1]>}
427
- #
428
- def split_by_separator(sep=Statsample::SPLIT_TOKEN)
385
+ end
386
+ # Returns a hash of Vectors, defined by the different values
387
+ # defined on the fields
388
+ # Example:
389
+ #
390
+ # a=Vector.new(["a,b","c,d","a,b"])
391
+ # a.split_by_separator
392
+ # => {"a"=>#<Statsample::Type::Nominal:0x7f2dbcc09d88
393
+ # @data=[1, 0, 1]>,
394
+ # "b"=>#<Statsample::Type::Nominal:0x7f2dbcc09c48
395
+ # @data=[1, 1, 0]>,
396
+ # "c"=>#<Statsample::Type::Nominal:0x7f2dbcc09b08
397
+ # @data=[0, 1, 1]>}
398
+ #
399
+ def split_by_separator(sep=Statsample::SPLIT_TOKEN)
429
400
  split_data=splitted(sep)
430
401
  factors=split_data.flatten.uniq.compact
431
402
  out=factors.inject({}) {|a,x|
432
- a[x]=[]
433
- a
403
+ a[x]=[]
404
+ a
434
405
  }
435
- split_data.each{|r|
436
- if r.nil?
437
- factors.each{|f|
438
- out[f].push(nil)
439
- }
440
- else
441
- factors.each{|f|
442
- out[f].push(r.include?(f) ? 1:0)
443
- }
406
+ split_data.each do |r|
407
+ if r.nil?
408
+ factors.each do |f|
409
+ out[f].push(nil)
444
410
  end
445
- }
411
+ else
412
+ factors.each do |f|
413
+ out[f].push(r.include?(f) ? 1:0)
414
+ end
415
+ end
416
+ end
446
417
  out.inject({}){|s,v|
447
- s[v[0]]=Vector.new(v[1],:nominal)
448
- s
418
+ s[v[0]]=Vector.new(v[1],:nominal)
419
+ s
449
420
  }
450
- end
451
- def split_by_separator_freq(sep=Statsample::SPLIT_TOKEN)
452
- split_by_separator(sep).inject({}) {|a,v|
453
- a[v[0]]=v[1].inject {|s,x| s+x.to_i}
454
- a
455
- }
456
- end
457
-
458
- # Returns an random sample of size n, with replacement,
459
- # only with valid data.
460
- #
461
- # In all the trails, every item have the same probability
462
- # of been selected.
463
- def sample_with_replacement(sample=1)
464
- if(@type!=:scale or !HAS_GSL)
421
+ end
422
+ def split_by_separator_freq(sep=Statsample::SPLIT_TOKEN)
423
+ split_by_separator(sep).inject({}) {|a,v|
424
+ a[v[0]]=v[1].inject {|s,x| s+x.to_i}
425
+ a
426
+ }
427
+ end
428
+
429
+ # Returns an random sample of size n, with replacement,
430
+ # only with valid data.
431
+ #
432
+ # In all the trails, every item have the same probability
433
+ # of been selected.
434
+ def sample_with_replacement(sample=1)
435
+ if(@type!=:scale or !Statsample.has_gsl?)
465
436
  vds=@valid_data.size
466
437
  (0...sample).collect{ @valid_data[rand(vds)] }
467
438
  else
468
439
  r = GSL::Rng.alloc(GSL::Rng::MT19937,rand(10000))
469
440
  r.sample(@gsl, sample).to_a
470
441
  end
471
- end
472
- # Returns an random sample of size n, without replacement,
473
- # only with valid data.
474
- #
475
- # Every element could only be selected once.
476
- #
477
- # A sample of the same size of the vector is the vector itself.
442
+ end
443
+ # Returns an random sample of size n, without replacement,
444
+ # only with valid data.
445
+ #
446
+ # Every element could only be selected once.
447
+ #
448
+ # A sample of the same size of the vector is the vector itself.
478
449
 
479
- def sample_without_replacement(sample=1)
480
- if(@type!=:scale or !HAS_GSL)
450
+ def sample_without_replacement(sample=1)
451
+ if(@type!=:scale or !Statsample.has_gsl?)
481
452
  raise ArgumentError, "Sample size couldn't be greater than n" if sample>@valid_data.size
482
453
  out=[]
483
454
  size=@valid_data.size
@@ -490,13 +461,13 @@ module Statsample
490
461
  r = GSL::Rng.alloc(GSL::Rng::MT19937,rand(10000))
491
462
  r.choose(@gsl, sample).to_a
492
463
  end
493
- end
494
- # Retrieves number of cases which comply condition.
495
- # If block given, retrieves number of instances where
496
- # block returns true.
497
- # If other values given, retrieves the frequency for
498
- # this value.
499
- def count(x=false)
464
+ end
465
+ # Retrieves number of cases which comply condition.
466
+ # If block given, retrieves number of instances where
467
+ # block returns true.
468
+ # If other values given, retrieves the frequency for
469
+ # this value.
470
+ def count(x=false)
500
471
  if block_given?
501
472
  r=@data.inject(0) {|s, i|
502
473
  r=yield i
@@ -506,11 +477,11 @@ module Statsample
506
477
  else
507
478
  frequencies[x].nil? ? 0 : frequencies[x]
508
479
  end
509
- end
510
-
511
- # Returns the database type for the vector, according to its content
512
-
513
- def db_type(dbs='mysql')
480
+ end
481
+
482
+ # Returns the database type for the vector, according to its content
483
+
484
+ def db_type(dbs='mysql')
514
485
  # first, detect any character not number
515
486
  if @data.find {|v| v.to_s=~/\d{2,2}-\d{2,2}-\d{4,4}/} or @data.find {|v| v.to_s=~/\d{4,4}-\d{2,2}-\d{2,2}/}
516
487
  return "DATE"
@@ -521,43 +492,43 @@ module Statsample
521
492
  else
522
493
  return "INTEGER"
523
494
  end
524
- end
525
- # Return true if all data is Date, "today" values or nil
526
- def can_be_date?
495
+ end
496
+ # Return true if all data is Date, "today" values or nil
497
+ def can_be_date?
527
498
  if @data.find {|v|
528
499
  !v.nil? and !v.is_a? Date and !v.is_a? Time and (v.is_a? String and !@today_values.include? v) and (v.is_a? String and !(v=~/\d{4,4}[-\/]\d{1,2}[-\/]\d{1,2}/))}
529
500
  false
530
501
  else
531
502
  true
532
503
  end
533
- end
534
- # Return true if all data is Numeric or nil
535
- def can_be_scale?
504
+ end
505
+ # Return true if all data is Numeric or nil
506
+ def can_be_scale?
536
507
  if @data.find {|v| !v.nil? and !v.is_a? Numeric and !@missing_values.include? v}
537
508
  false
538
509
  else
539
510
  true
540
511
  end
541
- end
542
-
543
- def to_s
512
+ end
513
+
514
+ def to_s
544
515
  sprintf("Vector(type:%s, n:%d)[%s]",@type.to_s,@data.size, @data.collect{|d| d.nil? ? "nil":d}.join(","))
545
- end
546
- # Ugly name. Really, create a Vector for standard 'matrix' package.
547
- # <tt>dir</tt> could be :horizontal or :vertical
548
- def to_matrix(dir=:horizontal)
516
+ end
517
+ # Ugly name. Really, create a Vector for standard 'matrix' package.
518
+ # <tt>dir</tt> could be :horizontal or :vertical
519
+ def to_matrix(dir=:horizontal)
549
520
  case dir
550
521
  when :horizontal
551
522
  Matrix[@data]
552
523
  when :vertical
553
524
  Matrix.columns([@data])
554
525
  end
555
- end
556
- def inspect
526
+ end
527
+ def inspect
557
528
  self.to_s
558
- end
559
- # Retrieves uniques values for data.
560
- def factors
529
+ end
530
+ # Retrieves uniques values for data.
531
+ def factors
561
532
  if @type==:scale
562
533
  @scale_data.uniq.sort
563
534
  elsif @type==:date
@@ -565,26 +536,26 @@ module Statsample
565
536
  else
566
537
  @valid_data.uniq.sort
567
538
  end
568
- end
569
- if Statsample::STATSAMPLE__.respond_to?(:frequencies)
539
+ end
540
+ if Statsample::STATSAMPLE__.respond_to?(:frequencies)
570
541
  # Returns a hash with the distribution of frecuencies for
571
542
  # the sample
572
543
  def frequencies
573
544
  Statsample::STATSAMPLE__.frequencies(@valid_data)
574
545
  end
575
- else
546
+ else
576
547
  def frequencies #:nodoc:
577
548
  _frequencies
578
549
  end
579
- end
580
- def _frequencies #:nodoc:
550
+ end
551
+ def _frequencies #:nodoc:
581
552
  @valid_data.inject(Hash.new) {|a,x|
582
553
  a[x]||=0
583
554
  a[x]=a[x]+1
584
555
  a
585
556
  }
586
- end
587
- # Plot frequencies on a chart, using gnuplot
557
+ end
558
+ # Plot frequencies on a chart, using gnuplot
588
559
  def plot_frequencies
589
560
  require 'gnuplot'
590
561
  x=[]
@@ -594,30 +565,30 @@ module Statsample
594
565
  y.push(v)
595
566
  }
596
567
  Gnuplot.open do |gp|
597
- Gnuplot::Plot.new( gp ) do |plot|
598
- plot.boxwidth("0.9 absolute")
599
- plot.yrange("[0:#{y.max}]")
600
- plot.style("fill solid 1.00 border -1")
601
- plot.set("xtics border in scale 1,0.5 nomirror rotate by -45 offset character 0, 0, 0")
602
- plot.style("histogram")
603
- plot.style("data histogram")
604
- i=-1
605
- plot.set("xtics","("+x.collect{|v| i+=1; sprintf("\"%s\" %d",v,i)}.join(",")+")")
606
- plot.data << Gnuplot::DataSet.new( [y] ) do |ds|
607
- end
568
+ Gnuplot::Plot.new( gp ) do |plot|
569
+ plot.boxwidth("0.9 absolute")
570
+ plot.yrange("[0:#{y.max}]")
571
+ plot.style("fill solid 1.00 border -1")
572
+ plot.set("xtics border in scale 1,0.5 nomirror rotate by -45 offset character 0, 0, 0")
573
+ plot.style("histogram")
574
+ plot.style("data histogram")
575
+ i=-1
576
+ plot.set("xtics","("+x.collect{|v| i+=1; sprintf("\"%s\" %d",v,i)}.join(",")+")")
577
+ plot.data << Gnuplot::DataSet.new( [y] ) do |ds|
608
578
  end
609
579
  end
610
-
580
+ end
581
+
611
582
  end
612
-
613
-
583
+
584
+
614
585
  # Returns the most frequent item.
615
586
  def mode
616
- frequencies.max{|a,b| a[1]<=>b[1]}[0]
587
+ frequencies.max{|a,b| a[1]<=>b[1]}[0]
617
588
  end
618
589
  # The numbers of item with valid data.
619
590
  def n_valid
620
- @valid_data.size
591
+ @valid_data.size
621
592
  end
622
593
  # Returns a hash with the distribution of proportions of
623
594
  # the sample.
@@ -632,38 +603,38 @@ module Statsample
632
603
  frequencies[v].quo(@valid_data.size)
633
604
  end
634
605
  def summary(out="")
635
- out << sprintf("n valid:%d\n",n_valid)
636
- out << sprintf("factors:%s\n",factors.join(","))
637
- out << "mode:"+mode.to_s+"\n"
638
- out << "Distribution:\n"
639
- frequencies.sort.each{|k,v|
640
- key=labels.has_key?(k) ? labels[k]:k
641
- out << sprintf("%s : %s (%0.2f%%)\n",key,v, (v.quo(n_valid))*100)
642
- }
643
- if(@type==:ordinal)
644
- out << "median:"+median.to_s+"\n"
645
- end
646
- if(@type==:scale)
647
- out << "mean:"+mean.to_s+"\n"
648
- out << "sd:"+sd.to_s+"\n"
649
-
650
- end
651
- out
606
+ out << sprintf("n valid:%d\n",n_valid)
607
+ out << sprintf("factors:%s\n",factors.join(","))
608
+ out << "mode:"+mode.to_s+"\n"
609
+ out << "Distribution:\n"
610
+ frequencies.sort.each{|k,v|
611
+ key=labels.has_key?(k) ? labels[k]:k
612
+ out << sprintf("%s : %s (%0.2f%%)\n",key,v, (v.quo(n_valid))*100)
613
+ }
614
+ if(@type==:ordinal)
615
+ out << "median:"+median.to_s+"\n"
616
+ end
617
+ if(@type==:scale)
618
+ out << "mean:"+mean.to_s+"\n"
619
+ out << "sd:"+sd.to_s+"\n"
620
+
621
+ end
622
+ out
652
623
  end
653
624
 
654
625
  # Variance of the proportion p, according to population size
655
626
  def variance_proportion(n_poblation, v=1)
656
- Statsample::proportion_variance_sample(self.proportion(v), @valid_data.size, n_poblation)
627
+ Statsample::proportion_variance_sample(self.proportion(v), @valid_data.size, n_poblation)
657
628
  end
658
629
  # Variance of the total, according to population size
659
630
  def variance_total(n_poblation, v=1)
660
- Statsample::total_variance_sample(self.proportion(v), @valid_data.size, n_poblation)
631
+ Statsample::total_variance_sample(self.proportion(v), @valid_data.size, n_poblation)
661
632
  end
662
633
  def proportion_confidence_interval_t(n_poblation,margin=0.95,v=1)
663
- Statsample::proportion_confidence_interval_t(proportion(v), @valid_data.size, n_poblation, margin)
634
+ Statsample::proportion_confidence_interval_t(proportion(v), @valid_data.size, n_poblation, margin)
664
635
  end
665
636
  def proportion_confidence_interval_z(n_poblation,margin=0.95,v=1)
666
- Statsample::proportion_confidence_interval_z(proportion(v), @valid_data.size, n_poblation, margin)
637
+ Statsample::proportion_confidence_interval_z(proportion(v), @valid_data.size, n_poblation, margin)
667
638
  end
668
639
 
669
640
  self.instance_methods.find_all{|met| met=~/_slow$/}.each do |met|
@@ -672,20 +643,21 @@ module Statsample
672
643
  alias_method met_or, met
673
644
  end
674
645
  end
675
- ######
676
- ### Ordinal Methods
677
- ######
678
-
646
+
647
+ ######
648
+ ### Ordinal Methods
649
+ ######
650
+
679
651
  # Return the value of the percentile q
680
652
  def percentil(q)
681
- check_type :ordinal
682
- sorted=@valid_data.sort
683
- v= (n_valid * q).quo(100)
684
- if(v.to_i!=v)
685
- sorted[v.to_i]
686
- else
687
- (sorted[(v-0.5).to_i].to_f + sorted[(v+0.5).to_i]).quo(2)
688
- end
653
+ check_type :ordinal
654
+ sorted=@valid_data.sort
655
+ v= (n_valid * q).quo(100)
656
+ if(v.to_i!=v)
657
+ sorted[v.to_i]
658
+ else
659
+ (sorted[(v-0.5).to_i].to_f + sorted[(v+0.5).to_i]).quo(2)
660
+ end
689
661
  end
690
662
  # Returns a ranked vector.
691
663
  def ranked(type=:ordinal)
@@ -698,27 +670,28 @@ module Statsample
698
670
  }
699
671
  @data.collect {|c| r[c] }.to_vector(type)
700
672
  end
701
- # Return the median (percentil 50)
702
- def median
703
- check_type :ordinal
704
- if HAS_GSL and @type==:scale
705
- sorted=GSL::Vector.alloc(@scale_data.sort)
706
- GSL::Stats::median_from_sorted_data(sorted)
707
- else
708
- percentil(50)
673
+ # Return the median (50th percentile)
674
+ def median
675
+ check_type :ordinal
676
+ if Statsample.has_gsl? and @type==:scale
677
+ sorted=GSL::Vector.alloc(@scale_data.sort)
678
+ GSL::Stats::median_from_sorted_data(sorted)
679
+ else
680
+ percentil(50)
681
+ end
682
+ end
683
+ # Minimum value
684
+ def min
685
+ check_type :ordinal
686
+ @valid_data.min;
709
687
  end
710
- end
711
- # Minimun value
712
- def min;
713
- check_type :ordinal
714
- @valid_data.min;
715
- end
716
688
  # Maximum value
717
- def max;
718
- check_type :ordinal
719
- @valid_data.max;
720
- end
721
- def set_date_data # :nodoc:
689
+ def max
690
+ check_type :ordinal
691
+ @valid_data.max;
692
+ end
693
+
694
+ def set_date_data
722
695
  @date_data_with_nils=@data.collect do|x|
723
696
  if x.is_a? Date
724
697
  x
@@ -733,7 +706,8 @@ module Statsample
733
706
  end
734
707
  end
735
708
  end
736
- def set_scale_data # :nodoc
709
+
710
+ def set_scale_data
737
711
  @scale_data=@valid_data.collect do|x|
738
712
  if x.is_a? Numeric
739
713
  x
@@ -743,12 +717,13 @@ module Statsample
743
717
  x.to_f
744
718
  end
745
719
  end
746
- if HAS_GSL
720
+ if Statsample.has_gsl?
747
721
  @gsl=GSL::Vector.alloc(@scale_data) if @scale_data.size>0
748
722
  end
749
723
  end
750
- private :set_scale_data
751
-
724
+
725
+ private :set_date_data, :set_scale_data
726
+
752
727
  # The range of the data (max - min)
753
728
  def range;
754
729
  check_type :scale
@@ -788,7 +763,7 @@ module Statsample
788
763
  squares.quo(n_valid) - m.square
789
764
  end
790
765
 
791
-
766
+
792
767
  # Population Standard deviation (denominator N)
793
768
  def standard_deviation_population(m=nil)
794
769
  check_type :scale
@@ -801,7 +776,7 @@ module Statsample
801
776
  m||=mean
802
777
  sum_of_squares(m).quo(n_valid - 1)
803
778
  end
804
-
779
+
805
780
  # Sample Standard deviation (denominator n-1)
806
781
 
807
782
  def standard_deviation_sample(m=nil)
@@ -831,7 +806,7 @@ module Statsample
831
806
  check_type :scale
832
807
  @scale_data.inject(1){|a,x| a*x }
833
808
  end
834
- if HAS_GSL
809
+ if Statsample.has_gsl?
835
810
  %w{skew kurtosis variance_sample standard_deviation_sample variance_population standard_deviation_population mean sum}.each{|m|
836
811
  m_nuevo=(m+"_slow").intern
837
812
  alias_method m_nuevo, m.intern