statsample 0.3.4 → 0.4.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (49) hide show
  1. data/History.txt +8 -0
  2. data/Manifest.txt +20 -2
  3. data/data/crime.txt +47 -0
  4. data/data/test_binomial.csv +201 -0
  5. data/demo/distribution_t.rb +2 -2
  6. data/demo/regression.rb +2 -1
  7. data/lib/distribution.rb +8 -0
  8. data/lib/distribution/chisquare.rb +24 -0
  9. data/lib/distribution/f.rb +25 -0
  10. data/lib/distribution/normal.rb +25 -0
  11. data/lib/distribution/t.rb +22 -0
  12. data/lib/matrix_extension.rb +78 -0
  13. data/lib/statistics2.rb +531 -0
  14. data/lib/statsample.rb +12 -9
  15. data/lib/statsample/anova.rb +1 -5
  16. data/lib/statsample/bivariate.rb +24 -20
  17. data/lib/statsample/combination.rb +14 -4
  18. data/lib/statsample/converters.rb +17 -1
  19. data/lib/statsample/dataset.rb +66 -10
  20. data/lib/statsample/dominanceanalysis/bootstrap.rb +1 -3
  21. data/lib/statsample/graph/gdchart.rb +2 -3
  22. data/lib/statsample/graph/svggraph.rb +8 -4
  23. data/lib/statsample/mle.rb +137 -0
  24. data/lib/statsample/mle/logit.rb +95 -0
  25. data/lib/statsample/mle/normal.rb +83 -0
  26. data/lib/statsample/mle/probit.rb +93 -0
  27. data/lib/statsample/regression.rb +3 -1
  28. data/lib/statsample/regression/binomial.rb +65 -0
  29. data/lib/statsample/regression/binomial/logit.rb +13 -0
  30. data/lib/statsample/regression/binomial/probit.rb +13 -0
  31. data/lib/statsample/regression/multiple.rb +61 -58
  32. data/lib/statsample/regression/multiple/rubyengine.rb +1 -1
  33. data/lib/statsample/srs.rb +5 -5
  34. data/lib/statsample/vector.rb +129 -59
  35. data/test/test_anova.rb +0 -5
  36. data/test/test_dataset.rb +13 -1
  37. data/test/test_distribution.rb +57 -0
  38. data/test/test_gsl.rb +22 -0
  39. data/test/test_logit.rb +22 -0
  40. data/test/test_mle.rb +140 -0
  41. data/test/test_r.rb +9 -0
  42. data/test/test_regression.rb +12 -4
  43. data/test/test_srs.rb +0 -4
  44. data/test/test_stata.rb +11 -0
  45. data/test/test_statistics.rb +0 -15
  46. data/test/test_vector.rb +11 -0
  47. metadata +28 -4
  48. data/lib/statsample/chidistribution.rb +0 -39
  49. data/lib/statsample/regression/logit.rb +0 -35
data/lib/statsample.rb CHANGED
@@ -21,9 +21,8 @@
21
21
  $:.unshift(File.dirname(__FILE__))
22
22
  $:.unshift(File.expand_path(File.dirname(__FILE__)+"/../ext"))
23
23
 
24
- require 'delegate'
25
24
  require 'matrix'
26
-
25
+ require 'distribution'
27
26
 
28
27
  class Numeric
29
28
  def square ; self * self ; end
@@ -44,6 +43,7 @@ def create_test(*args,&proc)
44
43
  fields=args
45
44
  [description, fields, Proc.new]
46
45
  end
46
+ #--
47
47
  # Test extensions
48
48
  begin
49
49
  require 'gettext'
@@ -59,7 +59,7 @@ begin
59
59
  end
60
60
  end
61
61
  end
62
-
62
+
63
63
  begin
64
64
  require 'rbgsl'
65
65
  HAS_GSL=true
@@ -72,7 +72,7 @@ end
72
72
  rescue LoadError
73
73
  HAS_ALGIB=false
74
74
  end
75
- #
75
+ # ++
76
76
  # Modules for statistical analysis
77
77
  # See first:
78
78
  # * Converter : several modules to import and export data
@@ -80,12 +80,14 @@ end
80
80
  # * Dataset: An union of vectors.
81
81
  #
82
82
  module Statsample
83
- VERSION = '0.3.4'
83
+
84
+ VERSION = '0.4.0'
84
85
  SPLIT_TOKEN = ","
85
86
  autoload(:Database, 'statsample/converters')
86
87
  autoload(:Anova, 'statsample/anova')
87
88
  autoload(:Combination, 'statsample/combination')
88
89
  autoload(:CSV, 'statsample/converters')
90
+ autoload(:PlainText, 'statsample/converters')
89
91
  autoload(:Excel, 'statsample/converters')
90
92
  autoload(:GGobi, 'statsample/converters')
91
93
  autoload(:DominanceAnalysis, 'statsample/dominanceanalysis')
@@ -99,8 +101,7 @@ module Statsample
99
101
  autoload(:Multivariate, 'statsample/multivariate')
100
102
  autoload(:Multiset, 'statsample/multiset')
101
103
  autoload(:StratifiedSample, 'statsample/multiset')
102
-
103
-
104
+ autoload(:MLE, 'statsample/mle')
104
105
  autoload(:Regression, 'statsample/regression')
105
106
  autoload(:Test, 'statsample/test')
106
107
  def self.load(filename)
@@ -240,16 +241,18 @@ module Statsample
240
241
  end
241
242
  end
242
243
 
243
- module STATSAMPLE__
244
+ module STATSAMPLE__ #:nodoc:
244
245
  end
245
246
 
246
247
  end
247
248
 
248
249
 
250
+
251
+ #--
249
252
  begin
250
253
  require 'statsamplert'
251
254
  rescue LoadError
252
- module Statsample
255
+ module Statsample
253
256
  OPTIMIZED=false
254
257
  end
255
258
  end
@@ -63,11 +63,7 @@ module Statsample
63
63
  end
64
64
  # Significance of Fisher
65
65
  def significance
66
- if HAS_GSL
67
- GSL::Cdf.fdist_Q(f,df_bg,df_wg)
68
- else
69
- raise "Need Ruby/GSL"
70
- end
66
+ 1.0-Distribution::F.cdf(f,df_bg,df_wg)
71
67
  end
72
68
  end
73
69
  end
@@ -20,8 +20,8 @@ module Statsample
20
20
  }
21
21
  sum
22
22
  end
23
- # Covariance. The denominator is n-1
24
- def covariance_slow(v1a,v2a)
23
+
24
+ def covariance_slow(v1a,v2a) # :nodoc:
25
25
  t=0
26
26
  m1=v1a.mean
27
27
  m2=v1a.mean
@@ -40,8 +40,8 @@ module Statsample
40
40
  pearson_slow(v1a,v2a)
41
41
  end
42
42
  end
43
- #:nodoc:
44
- def pearson_slow(v1a,v2a)
43
+ def pearson_slow(v1a,v2a) # :nodoc:
44
+
45
45
  v1s,v2s=v1a.vector_standarized_pop,v2a.vector_standarized_pop
46
46
  t=0
47
47
  siz=v1s.size
@@ -62,7 +62,7 @@ module Statsample
62
62
  # Retrieves the value for t test for a pearson correlation
63
63
  # giving r and vector size
64
64
  def t_r(r,size)
65
- r*Math::sqrt(((size)-2).to_f / (1 - r**2))
65
+ r * Math::sqrt(((size)-2).to_f / (1 - r**2))
66
66
  end
67
67
  # Retrieves the probability value (a la SPSS)
68
68
  # for a given t, size and number of tails.
@@ -71,7 +71,7 @@ module Statsample
71
71
  # * :right, :positive or 1 : for r > 0
72
72
  # * :left, :negative : for r < 0
73
73
 
74
- def prop_pearson(t,size, tails=:both)
74
+ def prop_pearson(t, size, tails=:both)
75
75
  tails=:both if tails==2
76
76
  tails=:right if tails==1 or tails==:positive
77
77
  tails=:left if tails==:negative
@@ -82,16 +82,12 @@ module Statsample
82
82
  else
83
83
  1
84
84
  end
85
- if HAS_GSL
86
- t=-t if t>0 and (tails==:both)
87
- cdf=GSL::Cdf::tdist_P(t,size-2)
88
- if(tails==:right)
89
- 1.0-(cdf*n_tails)
90
- else
91
- cdf*n_tails
92
- end
85
+ t=-t if t>0 and (tails==:both)
86
+ cdf=Distribution::T.cdf(t, size-2)
87
+ if(tails==:right)
88
+ 1.0-(cdf*n_tails)
93
89
  else
94
- raise "Needs ruby-gsl"
90
+ cdf*n_tails
95
91
  end
96
92
  end
97
93
  # Returns residual score after delete variance
@@ -110,6 +106,8 @@ module Statsample
110
106
  }
111
107
  nv.to_vector(:scale)
112
108
  end
109
+ # Correlation between v1 and v2, controlling the effect of
110
+ # control on both.
113
111
  def partial_correlation(v1,v2,control)
114
112
  v1a,v2a,cona=Statsample.only_valid(v1,v2,control)
115
113
  rv1v2=pearson(v1a,v2a)
@@ -119,7 +117,9 @@ module Statsample
119
117
  (rv1v2-(rv1con*rv2con)).quo(Math::sqrt(1-rv1con**2) * Math::sqrt(1-rv2con**2))
120
118
 
121
119
  end
122
- # Covariance matrix
120
+ # Covariance matrix.
121
+ # Order of rows and columns depends on Dataset#fields order
122
+
123
123
  def covariance_matrix(ds)
124
124
  ds.collect_matrix do |row,col|
125
125
  if (ds[row].type!=:scale or ds[col].type!=:scale)
@@ -130,7 +130,8 @@ module Statsample
130
130
  end
131
131
  end
132
132
 
133
- # The classic correlation matrix for all fields of a dataset
133
+ # Correlation matrix.
134
+ # Order of rows and columns depends on Dataset#fields order
134
135
 
135
136
  def correlation_matrix(ds)
136
137
  ds.collect_matrix {|row,col|
@@ -154,16 +155,19 @@ module Statsample
154
155
  end
155
156
  }
156
157
  end
157
- def correlation_probability_matrix(ds)
158
+ # Matrix of correlation probability
159
+ # Order of rows and columns depends on Dataset#fields order
160
+
161
+ def correlation_probability_matrix(ds, tails=:both)
158
162
  rows=ds.fields.collect{|row|
159
163
  ds.fields.collect{|col|
160
164
  v1a,v2a=Statsample.only_valid(ds[row],ds[col])
161
- (row==col or ds[row].type!=:scale or ds[col].type!=:scale) ? nil : prop_pearson(t_pearson(ds[row],ds[col]), v1a.size)
165
+ (row==col or ds[row].type!=:scale or ds[col].type!=:scale) ? nil : prop_pearson(t_pearson(ds[row],ds[col]), v1a.size, tails)
162
166
  }
163
167
  }
164
168
  Matrix.rows(rows)
165
169
  end
166
- # Calculate Spearman correlation coefficient between 2 vectors
170
+ # Spearman ranked correlation coefficient between 2 vectors
167
171
  def spearman(v1,v2)
168
172
  v1a,v2a=Statsample.only_valid(v1,v2)
169
173
  v1r,v2r=v1a.ranked(:scale),v2a.ranked(:scale)
@@ -1,12 +1,22 @@
1
1
  module Statsample
2
2
  # Combination class systematically generates all combinations of n elements, taken r at a time.
3
- # Use GSL::Combination is available for extra speed
3
+ # With rbgsl, GSL::Combination is available for extra speed
4
4
  # Source: http://snippets.dzone.com/posts/show/4666
5
5
  # Use:
6
6
  # comb=Statsample::Combination.new(3,5)
7
- # comb.each{|c|
8
- # p c
9
- # }
7
+ # => #<Statsample::Combination:0x7f6323804e08 @n=5, @d=#<Statsample::Combination::CombinationGsl:0x7f63237ff7f0 @n=5, @k=3, @c=GSL::Combination>, @k=3>
8
+ # comb.each{|c| p c }
9
+ # [0, 1, 2]
10
+ # [0, 1, 3]
11
+ # [0, 1, 4]
12
+ # [0, 2, 3]
13
+ # [0, 2, 4]
14
+ # [0, 3, 4]
15
+ # [1, 2, 3]
16
+ # [1, 2, 4]
17
+ # [1, 3, 4]
18
+ # [2, 3, 4]
19
+ #
10
20
  class Combination
11
21
  attr_reader :d
12
22
  def initialize(k,n,only_ruby=false)
@@ -117,6 +117,21 @@ module Statsample
117
117
 
118
118
  end
119
119
  end
120
+ class PlainText < SpreadsheetBase
121
+ class << self
122
+ def read(filename, fields)
123
+ ds=Statsample::Dataset.new(fields)
124
+ fp=File.open(filename,"r")
125
+ fp.each_line do |line|
126
+ row=process_row(line.strip.split(/\s+/),[""])
127
+ ds.add_case_array(row)
128
+ end
129
+ convert_to_scale(ds,fields)
130
+ ds.update_valid_data
131
+ ds
132
+ end
133
+ end
134
+ end
120
135
  class Excel < SpreadsheetBase
121
136
  class << self
122
137
  def write(dataset,filename)
@@ -157,7 +172,7 @@ module Statsample
157
172
  }
158
173
  line_number+=1
159
174
  if(line_number<=ignore_lines)
160
- #puts "Skip line"
175
+ #puts "Skip line #{line_number}:#{row.to_s}"
161
176
  next
162
177
  end
163
178
  # This should be fixed.
@@ -235,6 +250,7 @@ module Statsample
235
250
  # USE:
236
251
  # Statsample::CSV.write(ds,"test_csv.csv")
237
252
  def write(dataset,filename, convert_comma=false,*opts)
253
+ require 'csv'
238
254
  writer=::CSV.open(filename,'w',*opts)
239
255
  writer << dataset.fields
240
256
  dataset.each_array{|row|
@@ -36,11 +36,26 @@ module Statsample
36
36
  include Writable
37
37
  attr_reader :vectors, :fields, :cases, :i
38
38
  attr_accessor :labels
39
- # To create a dataset
40
- # * Dataset.new()
41
- # * Dataset.new(%w{v1 v2 v3})
42
- # * Dataset.new({'v1'=>%w{1 2 3}.to_vector, 'v2'=>%w{4 5 6}.to_vector})
43
- # * Dataset.new({'v2'=>v2,'v1'=>v1},['v1','v2'])
39
+ # Creates a new dataset. A dataset is a set of ordered named vectors
40
+ # of the same size.
41
+ #
42
+ # [vectors] With an array, creates a set of empty vectors named as
43
+ # values on the array. With a hash, each Vector is assigned as
44
+ # a variable of the Dataset named as its key
45
+ # [fields] Array of names for vectors. Is only used to set the
46
+ # order of variables. If empty, vector keys in alphabetical order are
47
+ # used as fields
48
+ # [labels] Hash to set names for fields.
49
+ #
50
+ #
51
+ # Dataset.new()
52
+ # Dataset.new(%w{v1 v2 v3})
53
+ # Dataset.new({'v1'=>%w{1 2 3}.to_vector, 'v2'=>%w{4 5 6}.to_vector})
54
+ # Dataset.new({'v2'=>v2,'v1'=>v1},['v1','v2'])
55
+ #
56
+ # The fast way to create a dataset uses Hash#to_dataset, with
57
+ # fields and labels as arguments
58
+ # ds = {'v1'=>[1,2,3].to_vector}.to_dataset
44
59
  #
45
60
  def initialize(vectors={}, fields=[], labels={})
46
61
  if vectors.instance_of? Array
@@ -296,7 +311,7 @@ module Statsample
296
311
  }
297
312
  end
298
313
  if Statsample::STATSAMPLE__.respond_to?(:case_as_hash)
299
- def case_as_hash(c)
314
+ def case_as_hash(c) # :nodoc:
300
315
  Statsample::STATSAMPLE__.case_as_hash(self,c)
301
316
  end
302
317
  else
@@ -306,7 +321,7 @@ module Statsample
306
321
  end
307
322
 
308
323
  if Statsample::STATSAMPLE__.respond_to?(:case_as_array)
309
- def case_as_array(c)
324
+ def case_as_array(c) # :nodoc:
310
325
  Statsample::STATSAMPLE__.case_as_array(self,c)
311
326
  end
312
327
  else
@@ -314,16 +329,16 @@ module Statsample
314
329
  _case_as_array(c)
315
330
  end
316
331
  end
317
- def _case_as_hash(c)
332
+ def _case_as_hash(c) # :nodoc:
318
333
  @fields.inject({}) {|a,x|
319
334
  a[x]=@vectors[x][c]
320
335
  a
321
336
  }
322
337
  end
323
- def _case_as_array(c)
338
+ def _case_as_array(c) # :nodoc:
324
339
  @fields.collect {|x| @vectors[x][c]}
325
340
  end
326
-
341
+ # Returns each case as a hash
327
342
  def each
328
343
  begin
329
344
  @i=0
@@ -337,6 +352,7 @@ module Statsample
337
352
  raise DatasetException.new(self,e)
338
353
  end
339
354
  end
355
+ # Returns each case as index and hash
340
356
  def each_with_index
341
357
  begin
342
358
  @i=0
@@ -350,6 +366,7 @@ module Statsample
350
366
  raise DatasetException.new(self,e)
351
367
  end
352
368
  end
369
+ # Returns each case as an array
353
370
  def each_array
354
371
  @cases.times {|i|
355
372
  @i=i
@@ -495,6 +512,40 @@ module Statsample
495
512
  ms
496
513
 
497
514
  end
515
+ # Returns a vector, based on a string with a calculation based
516
+ # on the dataset's vectors.
517
+ # The calculation will be eval'ed, so you can put any variable
518
+ # or expression valid in Ruby.
519
+ # For example:
520
+ # a=[1,2].to_vector(:scale)
521
+ # b=[3,4].to_vector(:scale)
522
+ # ds={'a'=>a,'b'=>b}.to_dataset
523
+ # ds.compute("a+b")
524
+ # => Vector [4,6]
525
+ def compute(text)
526
+ @fields.each{|f|
527
+ if @vectors[f].type=:scale
528
+ text.gsub!(f,"row['#{f}'].to_f")
529
+ else
530
+ text.gsub!(f,"row['#{f}']")
531
+
532
+ end
533
+
534
+ }
535
+ collect_with_index {|i,row|
536
+ invalid=false
537
+ @fields.each{|f|
538
+ if @vectors[f].data_with_nils[i].nil?
539
+ invalid=true
540
+ end
541
+ }
542
+ if invalid
543
+ nil
544
+ else
545
+ eval(text)
546
+ end
547
+ }
548
+ end
498
549
  # Test each row with one or more tests
499
550
  # each test is a Proc with the form
500
551
  # Proc.new {|row| row['age']>0}
@@ -540,5 +591,10 @@ module Statsample
540
591
  }
541
592
  out
542
593
  end
594
+ def as_r
595
+ require 'rsruby/dataframe'
596
+ r=RSRuby.instance
597
+
598
+ end
543
599
  end
544
600
  end
@@ -69,10 +69,8 @@ class DominanceAnalysis
69
69
  out.extend report_type
70
70
  out.add _("Summary for Bootstrap Dominance Analysis of %s on %s\n") % [@fields.join(", "), @y_var]
71
71
  out.add _("Sample size: %d\n") % @n_samples
72
- if HAS_GSL
73
- t=GSL::Cdf.tdist_Pinv(1-((1-alfa) / 2),@n_samples - 1)
72
+ t=Distribution::T.p_value(1-((1-alfa) / 2),@n_samples - 1)
74
73
  out.add "t:#{t}\n"
75
- end
76
74
  out.add "Linear Regression Engine: #{@lr_class.name}"
77
75
  out.nl
78
76
  table=ReportTable.new
@@ -17,7 +17,7 @@ module Statsample
17
17
  end
18
18
  end
19
19
  end
20
- class Nominal
20
+ class Vector
21
21
  # Creates a barchart using ruby-gdchart
22
22
  def gdchart_frequencies(file, width=300, height=150, chart_type=GDChart::BAR, options={})
23
23
  labels,data=[],[]
@@ -28,9 +28,8 @@ module Statsample
28
28
  options['ext_color']=[0xFF3399,0xFF9933,0xFFEE33,0x33FF33, 0x9966FF]
29
29
  Statsample::Util.chart_gdchart(file,width,height,chart_type, labels,options,1,data)
30
30
  end
31
- end
32
- class Scale < Ordinal
33
31
  def gdchart_histogram(bins,file, width=300, height=150, chart_type=GDChart::BAR, options={})
32
+ check_type :scale
34
33
  labels=[]
35
34
  h=histogram(bins)
36
35
  data=[]
@@ -27,6 +27,7 @@ module Statsample
27
27
  }
28
28
  end
29
29
  def svggraph_histogram(bins, options={})
30
+ check_type :scale
30
31
  options={:graph_title=>"Histogram", :show_graph_title=>true,:show_normal=>true, :mean=>self.mean, :sigma=>sdp }.merge! options
31
32
  graph = Statsample::Graph::SvgHistogram.new(options)
32
33
  graph.histogram=histogram(bins)
@@ -35,6 +36,7 @@ module Statsample
35
36
  # Returns a Run-Sequence Plot
36
37
  # Reference: http://www.itl.nist.gov/div898/handbook/eda/section3/runseqpl.htm
37
38
  def svggraph_runsequence_plot(options={})
39
+ check_type :scale
38
40
  options={:graph_title=>"Run-Sequence Plot", :show_graph_title=>true, :scale_x_integers => true, :add_popups=>true }.merge! options
39
41
  vx=(1..@data.size).to_a.to_vector(:scale)
40
42
  vy=@data.to_vector(:scale)
@@ -45,6 +47,7 @@ module Statsample
45
47
  graph
46
48
  end
47
49
  def svggraph_boxplot(options={})
50
+ check_type :scale
48
51
  options={:graph_title=>"Boxplot", :fields=>['vector'], :show_graph_title=>true}.merge! options
49
52
  vx=@data.to_a.to_vector(:scale)
50
53
  graph = Statsample::Graph::SvgBoxplot.new(options)
@@ -53,6 +56,7 @@ module Statsample
53
56
  end
54
57
 
55
58
  def svggraph_lag_plot(options={})
59
+ check_type :scale
56
60
  options={:graph_title=>"Lag Plot", :show_graph_title=>true}.merge! options
57
61
  vx=@data[0...(@data.size-1)].to_vector(:scale)
58
62
  vy=@data[1...@data.size].to_vector(:scale)
@@ -66,12 +70,12 @@ module Statsample
66
70
  # Returns a Normal Probability Plot
67
71
  # Reference: http://www.itl.nist.gov/div898/handbook/eda/section3/normprpl.htm
68
72
  def svggraph_normalprobability_plot(options={})
69
- extend Statsample::Util
70
-
71
- options={:graph_title=>"Normal Probability Plot", :show_graph_title=>true}.merge! options
73
+ extend Statsample::Util
74
+ check_type :scale
75
+ options={:graph_title=>"Normal Probability Plot", :show_graph_title=>true}.merge! options
72
76
  n=@data.size
73
77
  vx=(1..@data.size).to_a.collect{|i|
74
- GSL::Cdf.gaussian_Pinv(normal_order_statistic_medians(i,n))
78
+ Distribution::Normal.p_value(normal_order_statistic_medians(i,n))
75
79
  }.to_vector(:scale)
76
80
  vy=@data.sort.to_vector(:scale)
77
81
  ds={'normal_order_statistics_medians'=>vx, 'ordered_response'=>vy}.to_dataset