statsample 0.3.4 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (49)
  1. data/History.txt +8 -0
  2. data/Manifest.txt +20 -2
  3. data/data/crime.txt +47 -0
  4. data/data/test_binomial.csv +201 -0
  5. data/demo/distribution_t.rb +2 -2
  6. data/demo/regression.rb +2 -1
  7. data/lib/distribution.rb +8 -0
  8. data/lib/distribution/chisquare.rb +24 -0
  9. data/lib/distribution/f.rb +25 -0
  10. data/lib/distribution/normal.rb +25 -0
  11. data/lib/distribution/t.rb +22 -0
  12. data/lib/matrix_extension.rb +78 -0
  13. data/lib/statistics2.rb +531 -0
  14. data/lib/statsample.rb +12 -9
  15. data/lib/statsample/anova.rb +1 -5
  16. data/lib/statsample/bivariate.rb +24 -20
  17. data/lib/statsample/combination.rb +14 -4
  18. data/lib/statsample/converters.rb +17 -1
  19. data/lib/statsample/dataset.rb +66 -10
  20. data/lib/statsample/dominanceanalysis/bootstrap.rb +1 -3
  21. data/lib/statsample/graph/gdchart.rb +2 -3
  22. data/lib/statsample/graph/svggraph.rb +8 -4
  23. data/lib/statsample/mle.rb +137 -0
  24. data/lib/statsample/mle/logit.rb +95 -0
  25. data/lib/statsample/mle/normal.rb +83 -0
  26. data/lib/statsample/mle/probit.rb +93 -0
  27. data/lib/statsample/regression.rb +3 -1
  28. data/lib/statsample/regression/binomial.rb +65 -0
  29. data/lib/statsample/regression/binomial/logit.rb +13 -0
  30. data/lib/statsample/regression/binomial/probit.rb +13 -0
  31. data/lib/statsample/regression/multiple.rb +61 -58
  32. data/lib/statsample/regression/multiple/rubyengine.rb +1 -1
  33. data/lib/statsample/srs.rb +5 -5
  34. data/lib/statsample/vector.rb +129 -59
  35. data/test/test_anova.rb +0 -5
  36. data/test/test_dataset.rb +13 -1
  37. data/test/test_distribution.rb +57 -0
  38. data/test/test_gsl.rb +22 -0
  39. data/test/test_logit.rb +22 -0
  40. data/test/test_mle.rb +140 -0
  41. data/test/test_r.rb +9 -0
  42. data/test/test_regression.rb +12 -4
  43. data/test/test_srs.rb +0 -4
  44. data/test/test_stata.rb +11 -0
  45. data/test/test_statistics.rb +0 -15
  46. data/test/test_vector.rb +11 -0
  47. metadata +28 -4
  48. data/lib/statsample/chidistribution.rb +0 -39
  49. data/lib/statsample/regression/logit.rb +0 -35
data/lib/statsample.rb CHANGED
@@ -21,9 +21,8 @@
21
21
  $:.unshift(File.dirname(__FILE__))
22
22
  $:.unshift(File.expand_path(File.dirname(__FILE__)+"/../ext"))
23
23
 
24
- require 'delegate'
25
24
  require 'matrix'
26
-
25
+ require 'distribution'
27
26
 
28
27
  class Numeric
29
28
  def square ; self * self ; end
@@ -44,6 +43,7 @@ def create_test(*args,&proc)
44
43
  fields=args
45
44
  [description, fields, Proc.new]
46
45
  end
46
+ #--
47
47
  # Test extensions
48
48
  begin
49
49
  require 'gettext'
@@ -59,7 +59,7 @@ begin
59
59
  end
60
60
  end
61
61
  end
62
-
62
+
63
63
  begin
64
64
  require 'rbgsl'
65
65
  HAS_GSL=true
@@ -72,7 +72,7 @@ end
72
72
  rescue LoadError
73
73
  HAS_ALGIB=false
74
74
  end
75
- #
75
+ # ++
76
76
  # Modules for statistical analysis
77
77
  # See first:
78
78
  # * Converter : several modules to import and export data
@@ -80,12 +80,14 @@ end
80
80
  # * Dataset: An union of vectors.
81
81
  #
82
82
  module Statsample
83
- VERSION = '0.3.4'
83
+
84
+ VERSION = '0.4.0'
84
85
  SPLIT_TOKEN = ","
85
86
  autoload(:Database, 'statsample/converters')
86
87
  autoload(:Anova, 'statsample/anova')
87
88
  autoload(:Combination, 'statsample/combination')
88
89
  autoload(:CSV, 'statsample/converters')
90
+ autoload(:PlainText, 'statsample/converters')
89
91
  autoload(:Excel, 'statsample/converters')
90
92
  autoload(:GGobi, 'statsample/converters')
91
93
  autoload(:DominanceAnalysis, 'statsample/dominanceanalysis')
@@ -99,8 +101,7 @@ module Statsample
99
101
  autoload(:Multivariate, 'statsample/multivariate')
100
102
  autoload(:Multiset, 'statsample/multiset')
101
103
  autoload(:StratifiedSample, 'statsample/multiset')
102
-
103
-
104
+ autoload(:MLE, 'statsample/mle')
104
105
  autoload(:Regression, 'statsample/regression')
105
106
  autoload(:Test, 'statsample/test')
106
107
  def self.load(filename)
@@ -240,16 +241,18 @@ module Statsample
240
241
  end
241
242
  end
242
243
 
243
- module STATSAMPLE__
244
+ module STATSAMPLE__ #:nodoc:
244
245
  end
245
246
 
246
247
  end
247
248
 
248
249
 
250
+
251
+ #--
249
252
  begin
250
253
  require 'statsamplert'
251
254
  rescue LoadError
252
- module Statsample
255
+ module Statsample
253
256
  OPTIMIZED=false
254
257
  end
255
258
  end
@@ -63,11 +63,7 @@ module Statsample
63
63
  end
64
64
  # Significance of Fisher
65
65
  def significance
66
- if HAS_GSL
67
- GSL::Cdf.fdist_Q(f,df_bg,df_wg)
68
- else
69
- raise "Need Ruby/GSL"
70
- end
66
+ 1.0-Distribution::F.cdf(f,df_bg,df_wg)
71
67
  end
72
68
  end
73
69
  end
@@ -20,8 +20,8 @@ module Statsample
20
20
  }
21
21
  sum
22
22
  end
23
- # Covariance. The denominator is n-1
24
- def covariance_slow(v1a,v2a)
23
+
24
+ def covariance_slow(v1a,v2a) # :nodoc:
25
25
  t=0
26
26
  m1=v1a.mean
27
27
  m2=v1a.mean
@@ -40,8 +40,8 @@ module Statsample
40
40
  pearson_slow(v1a,v2a)
41
41
  end
42
42
  end
43
- #:nodoc:
44
- def pearson_slow(v1a,v2a)
43
+ def pearson_slow(v1a,v2a) # :nodoc:
44
+
45
45
  v1s,v2s=v1a.vector_standarized_pop,v2a.vector_standarized_pop
46
46
  t=0
47
47
  siz=v1s.size
@@ -62,7 +62,7 @@ module Statsample
62
62
  # Retrieves the value for t test for a pearson correlation
63
63
  # giving r and vector size
64
64
  def t_r(r,size)
65
- r*Math::sqrt(((size)-2).to_f / (1 - r**2))
65
+ r * Math::sqrt(((size)-2).to_f / (1 - r**2))
66
66
  end
67
67
  # Retrieves the probability value (a la SPSS)
68
68
  # for a given t, size and number of tails.
@@ -71,7 +71,7 @@ module Statsample
71
71
  # * :right, :positive or 1 : for r > 0
72
72
  # * :left, :negative : for r < 0
73
73
 
74
- def prop_pearson(t,size, tails=:both)
74
+ def prop_pearson(t, size, tails=:both)
75
75
  tails=:both if tails==2
76
76
  tails=:right if tails==1 or tails==:positive
77
77
  tails=:left if tails==:negative
@@ -82,16 +82,12 @@ module Statsample
82
82
  else
83
83
  1
84
84
  end
85
- if HAS_GSL
86
- t=-t if t>0 and (tails==:both)
87
- cdf=GSL::Cdf::tdist_P(t,size-2)
88
- if(tails==:right)
89
- 1.0-(cdf*n_tails)
90
- else
91
- cdf*n_tails
92
- end
85
+ t=-t if t>0 and (tails==:both)
86
+ cdf=Distribution::T.cdf(t, size-2)
87
+ if(tails==:right)
88
+ 1.0-(cdf*n_tails)
93
89
  else
94
- raise "Needs ruby-gsl"
90
+ cdf*n_tails
95
91
  end
96
92
  end
97
93
  # Returns residual score after delete variance
@@ -110,6 +106,8 @@ module Statsample
110
106
  }
111
107
  nv.to_vector(:scale)
112
108
  end
109
 + # Correlation between v1 and v2, controlling the effect of
110
+ # control on both.
113
111
  def partial_correlation(v1,v2,control)
114
112
  v1a,v2a,cona=Statsample.only_valid(v1,v2,control)
115
113
  rv1v2=pearson(v1a,v2a)
@@ -119,7 +117,9 @@ module Statsample
119
117
  (rv1v2-(rv1con*rv2con)).quo(Math::sqrt(1-rv1con**2) * Math::sqrt(1-rv2con**2))
120
118
 
121
119
  end
122
- # Covariance matrix
120
+ # Covariance matrix.
121
+ # Order of rows and columns depends on Dataset#fields order
122
+
123
123
  def covariance_matrix(ds)
124
124
  ds.collect_matrix do |row,col|
125
125
  if (ds[row].type!=:scale or ds[col].type!=:scale)
@@ -130,7 +130,8 @@ module Statsample
130
130
  end
131
131
  end
132
132
 
133
- # The classic correlation matrix for all fields of a dataset
133
+ # Correlation matrix.
134
+ # Order of rows and columns depends on Dataset#fields order
134
135
 
135
136
  def correlation_matrix(ds)
136
137
  ds.collect_matrix {|row,col|
@@ -154,16 +155,19 @@ module Statsample
154
155
  end
155
156
  }
156
157
  end
157
- def correlation_probability_matrix(ds)
158
+ # Matrix of correlation probability
159
+ # Order of rows and columns depends on Dataset#fields order
160
+
161
+ def correlation_probability_matrix(ds, tails=:both)
158
162
  rows=ds.fields.collect{|row|
159
163
  ds.fields.collect{|col|
160
164
  v1a,v2a=Statsample.only_valid(ds[row],ds[col])
161
- (row==col or ds[row].type!=:scale or ds[col].type!=:scale) ? nil : prop_pearson(t_pearson(ds[row],ds[col]), v1a.size)
165
+ (row==col or ds[row].type!=:scale or ds[col].type!=:scale) ? nil : prop_pearson(t_pearson(ds[row],ds[col]), v1a.size, tails)
162
166
  }
163
167
  }
164
168
  Matrix.rows(rows)
165
169
  end
166
- # Calculate Spearman correlation coefficient between 2 vectors
170
+ # Spearman ranked correlation coefficient between 2 vectors
167
171
  def spearman(v1,v2)
168
172
  v1a,v2a=Statsample.only_valid(v1,v2)
169
173
  v1r,v2r=v1a.ranked(:scale),v2a.ranked(:scale)
@@ -1,12 +1,22 @@
1
1
  module Statsample
2
2
  # Combination class systematically generates all combinations of n elements, taken r at a time.
3
- # Use GSL::Combination is available for extra speed
3
+ # With rbgsl, GSL::Combination is available for extra speed
4
4
  # Source: http://snippets.dzone.com/posts/show/4666
5
5
  # Use:
6
6
  # comb=Statsample::Combination.new(3,5)
7
- # comb.each{|c|
8
- # p c
9
- # }
7
+ # => #<Statsample::Combination:0x7f6323804e08 @n=5, @d=#<Statsample::Combination::CombinationGsl:0x7f63237ff7f0 @n=5, @k=3, @c=GSL::Combination>, @k=3>
8
+ # comb.each{|c| p c }
9
+ # [0, 1, 2]
10
+ # [0, 1, 3]
11
+ # [0, 1, 4]
12
+ # [0, 2, 3]
13
+ # [0, 2, 4]
14
+ # [0, 3, 4]
15
+ # [1, 2, 3]
16
+ # [1, 2, 4]
17
+ # [1, 3, 4]
18
+ # [2, 3, 4]
19
+ #
10
20
  class Combination
11
21
  attr_reader :d
12
22
  def initialize(k,n,only_ruby=false)
@@ -117,6 +117,21 @@ module Statsample
117
117
 
118
118
  end
119
119
  end
120
+ class PlainText < SpreadsheetBase
121
+ class << self
122
+ def read(filename, fields)
123
+ ds=Statsample::Dataset.new(fields)
124
+ fp=File.open(filename,"r")
125
+ fp.each_line do |line|
126
+ row=process_row(line.strip.split(/\s+/),[""])
127
+ ds.add_case_array(row)
128
+ end
129
+ convert_to_scale(ds,fields)
130
+ ds.update_valid_data
131
+ ds
132
+ end
133
+ end
134
+ end
120
135
  class Excel < SpreadsheetBase
121
136
  class << self
122
137
  def write(dataset,filename)
@@ -157,7 +172,7 @@ module Statsample
157
172
  }
158
173
  line_number+=1
159
174
  if(line_number<=ignore_lines)
160
- #puts "Skip line"
175
+ #puts "Skip line #{line_number}:#{row.to_s}"
161
176
  next
162
177
  end
163
178
  # This should be fixed.
@@ -235,6 +250,7 @@ module Statsample
235
250
  # USE:
236
251
  # Statsample::CSV.write(ds,"test_csv.csv")
237
252
  def write(dataset,filename, convert_comma=false,*opts)
253
+ require 'csv'
238
254
  writer=::CSV.open(filename,'w',*opts)
239
255
  writer << dataset.fields
240
256
  dataset.each_array{|row|
@@ -36,11 +36,26 @@ module Statsample
36
36
  include Writable
37
37
  attr_reader :vectors, :fields, :cases, :i
38
38
  attr_accessor :labels
39
- # To create a dataset
40
- # * Dataset.new()
41
- # * Dataset.new(%w{v1 v2 v3})
42
- # * Dataset.new({'v1'=>%w{1 2 3}.to_vector, 'v2'=>%w{4 5 6}.to_vector})
43
- # * Dataset.new({'v2'=>v2,'v1'=>v1},['v1','v2'])
39
+ # Creates a new dataset. A dataset is a set of ordered named vectors
40
+ # of the same size.
41
+ #
42
+ # [vectors] With an array, creates a set of empty vectors named as
43
+ # values on the array. With a hash, each Vector is assigned as
44
+ # a variable of the Dataset named as its key
45
 + # [fields] Array of names for vectors. Is only used to set the
46
 + # order of variables. If empty, vectors keys in alphabetic order are
47
+ # used as fields
48
+ # [labels] Hash to set names for fields.
49
+ #
50
+ #
51
+ # Dataset.new()
52
+ # Dataset.new(%w{v1 v2 v3})
53
+ # Dataset.new({'v1'=>%w{1 2 3}.to_vector, 'v2'=>%w{4 5 6}.to_vector})
54
+ # Dataset.new({'v2'=>v2,'v1'=>v1},['v1','v2'])
55
+ #
56
+ # The fast way to create a dataset uses Hash#to_dataset, with
57
+ # fields and labels as arguments
58
+ # ds = {'v1'=>[1,2,3].to_vector}.to_dataset
44
59
  #
45
60
  def initialize(vectors={}, fields=[], labels={})
46
61
  if vectors.instance_of? Array
@@ -296,7 +311,7 @@ module Statsample
296
311
  }
297
312
  end
298
313
  if Statsample::STATSAMPLE__.respond_to?(:case_as_hash)
299
- def case_as_hash(c)
314
+ def case_as_hash(c) # :nodoc:
300
315
  Statsample::STATSAMPLE__.case_as_hash(self,c)
301
316
  end
302
317
  else
@@ -306,7 +321,7 @@ module Statsample
306
321
  end
307
322
 
308
323
  if Statsample::STATSAMPLE__.respond_to?(:case_as_array)
309
- def case_as_array(c)
324
+ def case_as_array(c) # :nodoc:
310
325
  Statsample::STATSAMPLE__.case_as_array(self,c)
311
326
  end
312
327
  else
@@ -314,16 +329,16 @@ module Statsample
314
329
  _case_as_array(c)
315
330
  end
316
331
  end
317
- def _case_as_hash(c)
332
+ def _case_as_hash(c) # :nodoc:
318
333
  @fields.inject({}) {|a,x|
319
334
  a[x]=@vectors[x][c]
320
335
  a
321
336
  }
322
337
  end
323
- def _case_as_array(c)
338
+ def _case_as_array(c) # :nodoc:
324
339
  @fields.collect {|x| @vectors[x][c]}
325
340
  end
326
-
341
+ # Returns each case as a hash
327
342
  def each
328
343
  begin
329
344
  @i=0
@@ -337,6 +352,7 @@ module Statsample
337
352
  raise DatasetException.new(self,e)
338
353
  end
339
354
  end
355
+ # Returns each case as index and hash
340
356
  def each_with_index
341
357
  begin
342
358
  @i=0
@@ -350,6 +366,7 @@ module Statsample
350
366
  raise DatasetException.new(self,e)
351
367
  end
352
368
  end
369
+ # Returns each case as an array
353
370
  def each_array
354
371
  @cases.times {|i|
355
372
  @i=i
@@ -495,6 +512,40 @@ module Statsample
495
512
  ms
496
513
 
497
514
  end
515
+ # Returns a vector, based on a string with a calculation based
516
+ # on vector
517
+ # The calculation will be eval'ed, so you can put any variable
518
 + # or expression valid in Ruby
519
+ # For example:
520
 + # a=[1,2].to_vector(:scale)
521
 + # b=[3,4].to_vector(:scale)
522
+ # ds={'a'=>a,'b'=>b}.to_dataset
523
 + # ds.compute("a+b")
524
+ # => Vector [4,6]
525
+ def compute(text)
526
+ @fields.each{|f|
527
+ if @vectors[f].type=:scale
528
+ text.gsub!(f,"row['#{f}'].to_f")
529
+ else
530
+ text.gsub!(f,"row['#{f}']")
531
+
532
+ end
533
+
534
+ }
535
+ collect_with_index {|i,row|
536
+ invalid=false
537
+ @fields.each{|f|
538
+ if @vectors[f].data_with_nils[i].nil?
539
+ invalid=true
540
+ end
541
+ }
542
+ if invalid
543
+ nil
544
+ else
545
+ eval(text)
546
+ end
547
+ }
548
+ end
498
549
  # Test each row with one or more tests
499
550
  # each test is a Proc with the form
500
551
  # Proc.new {|row| row['age']>0}
@@ -540,5 +591,10 @@ module Statsample
540
591
  }
541
592
  out
542
593
  end
594
+ def as_r
595
+ require 'rsruby/dataframe'
596
+ r=RSRuby.instance
597
+
598
+ end
543
599
  end
544
600
  end
@@ -69,10 +69,8 @@ class DominanceAnalysis
69
69
  out.extend report_type
70
70
  out.add _("Summary for Bootstrap Dominance Analysis of %s on %s\n") % [@fields.join(", "), @y_var]
71
71
  out.add _("Sample size: %d\n") % @n_samples
72
- if HAS_GSL
73
- t=GSL::Cdf.tdist_Pinv(1-((1-alfa) / 2),@n_samples - 1)
72
+ t=Distribution::T.p_value(1-((1-alfa) / 2),@n_samples - 1)
74
73
  out.add "t:#{t}\n"
75
- end
76
74
  out.add "Linear Regression Engine: #{@lr_class.name}"
77
75
  out.nl
78
76
  table=ReportTable.new
@@ -17,7 +17,7 @@ module Statsample
17
17
  end
18
18
  end
19
19
  end
20
- class Nominal
20
+ class Vector
21
21
  # Creates a barchart using ruby-gdchart
22
22
  def gdchart_frequencies(file, width=300, height=150, chart_type=GDChart::BAR, options={})
23
23
  labels,data=[],[]
@@ -28,9 +28,8 @@ module Statsample
28
28
  options['ext_color']=[0xFF3399,0xFF9933,0xFFEE33,0x33FF33, 0x9966FF]
29
29
  Statsample::Util.chart_gdchart(file,width,height,chart_type, labels,options,1,data)
30
30
  end
31
- end
32
- class Scale < Ordinal
33
31
  def gdchart_histogram(bins,file, width=300, height=150, chart_type=GDChart::BAR, options={})
32
+ check_type :scale
34
33
  labels=[]
35
34
  h=histogram(bins)
36
35
  data=[]
@@ -27,6 +27,7 @@ module Statsample
27
27
  }
28
28
  end
29
29
  def svggraph_histogram(bins, options={})
30
+ check_type :scale
30
31
  options={:graph_title=>"Histogram", :show_graph_title=>true,:show_normal=>true, :mean=>self.mean, :sigma=>sdp }.merge! options
31
32
  graph = Statsample::Graph::SvgHistogram.new(options)
32
33
  graph.histogram=histogram(bins)
@@ -35,6 +36,7 @@ module Statsample
35
36
  # Returns a Run-Sequence Plot
36
37
  # Reference: http://www.itl.nist.gov/div898/handbook/eda/section3/runseqpl.htm
37
38
  def svggraph_runsequence_plot(options={})
39
+ check_type :scale
38
40
  options={:graph_title=>"Run-Sequence Plot", :show_graph_title=>true, :scale_x_integers => true, :add_popups=>true }.merge! options
39
41
  vx=(1..@data.size).to_a.to_vector(:scale)
40
42
  vy=@data.to_vector(:scale)
@@ -45,6 +47,7 @@ module Statsample
45
47
  graph
46
48
  end
47
49
  def svggraph_boxplot(options={})
50
+ check_type :scale
48
51
  options={:graph_title=>"Boxplot", :fields=>['vector'], :show_graph_title=>true}.merge! options
49
52
  vx=@data.to_a.to_vector(:scale)
50
53
  graph = Statsample::Graph::SvgBoxplot.new(options)
@@ -53,6 +56,7 @@ module Statsample
53
56
  end
54
57
 
55
58
  def svggraph_lag_plot(options={})
59
+ check_type :scale
56
60
  options={:graph_title=>"Lag Plot", :show_graph_title=>true}.merge! options
57
61
  vx=@data[0...(@data.size-1)].to_vector(:scale)
58
62
  vy=@data[1...@data.size].to_vector(:scale)
@@ -66,12 +70,12 @@ module Statsample
66
70
  # Returns a Normal Probability Plot
67
71
  # Reference: http://www.itl.nist.gov/div898/handbook/eda/section3/normprpl.htm
68
72
  def svggraph_normalprobability_plot(options={})
69
- extend Statsample::Util
70
-
71
- options={:graph_title=>"Normal Probability Plot", :show_graph_title=>true}.merge! options
73
+ extend Statsample::Util
74
+ check_type :scale
75
+ options={:graph_title=>"Normal Probability Plot", :show_graph_title=>true}.merge! options
72
76
  n=@data.size
73
77
  vx=(1..@data.size).to_a.collect{|i|
74
- GSL::Cdf.gaussian_Pinv(normal_order_statistic_medians(i,n))
78
+ Distribution::Normal.p_value(normal_order_statistic_medians(i,n))
75
79
  }.to_vector(:scale)
76
80
  vy=@data.sort.to_vector(:scale)
77
81
  ds={'normal_order_statistics_medians'=>vx, 'ordered_response'=>vy}.to_dataset