statsample 1.0.1 → 1.1.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (39) hide show
  1. data/.gemtest +0 -0
  2. data/History.txt +14 -0
  3. data/Manifest.txt +4 -0
  4. data/README.txt +49 -13
  5. data/data/locale/es/LC_MESSAGES/statsample.mo +0 -0
  6. data/lib/statsample.rb +1 -23
  7. data/lib/statsample/analysis.rb +49 -28
  8. data/lib/statsample/analysis/suite.rb +18 -5
  9. data/lib/statsample/analysis/suitereportbuilder.rb +9 -3
  10. data/lib/statsample/anova.rb +2 -0
  11. data/lib/statsample/anova/contrast.rb +79 -0
  12. data/lib/statsample/anova/oneway.rb +39 -5
  13. data/lib/statsample/converter/csv.rb +2 -5
  14. data/lib/statsample/converters.rb +1 -0
  15. data/lib/statsample/dataset.rb +31 -1
  16. data/lib/statsample/graph/histogram.rb +1 -1
  17. data/lib/statsample/regression/multiple/baseengine.rb +5 -0
  18. data/lib/statsample/reliability/multiscaleanalysis.rb +3 -1
  19. data/lib/statsample/reliability/scaleanalysis.rb +3 -4
  20. data/lib/statsample/shorthand.rb +41 -1
  21. data/lib/statsample/test.rb +10 -0
  22. data/lib/statsample/test/kolmogorovsmirnov.rb +61 -0
  23. data/lib/statsample/test/t.rb +92 -9
  24. data/lib/statsample/vector.rb +143 -10
  25. data/po/es/statsample.mo +0 -0
  26. data/po/es/statsample.po +109 -110
  27. data/po/statsample.pot +108 -60
  28. data/test/helpers_tests.rb +1 -0
  29. data/test/test_analysis.rb +70 -11
  30. data/test/test_anova_contrast.rb +36 -0
  31. data/test/test_anovawithvectors.rb +8 -0
  32. data/test/test_dataset.rb +12 -0
  33. data/test/test_factor_pa.rb +1 -3
  34. data/test/test_test_kolmogorovsmirnov.rb +34 -0
  35. data/test/test_test_t.rb +16 -0
  36. data/test/test_vector.rb +40 -2
  37. metadata +44 -118
  38. data.tar.gz.sig +0 -0
  39. metadata.gz.sig +0 -0
@@ -35,10 +35,10 @@ module Statsample
35
35
  :name_denominator=>_("Explained variance"),
36
36
  :name_numerator=>_("Unexplained variance")}
37
37
  @opts=opts_default.merge(opts)
38
- opts_default.keys.each {|k|
39
- send("#{k}=", @opts[k])
38
+ opts.keys.each {|k|
39
+ send("#{k}=", @opts[k]) if self.respond_to? "#{k}="
40
40
  }
41
- @f_object=Statsample::Test::F.new(@ms_num,@ms_den,@df_num,@df_den)
41
+ @f_object=Statsample::Test::F.new(@ms_num, @ms_den, @df_num,@df_den)
42
42
  end
43
43
  # F value
44
44
  def f
@@ -62,6 +62,7 @@ module Statsample
62
62
  end
63
63
 
64
64
  end
65
+
65
66
  # One Way Anova with vectors
66
67
  # Example:
67
68
  # v1=[2,3,4,5,6].to_scale
@@ -80,6 +81,11 @@ module Statsample
80
81
  attr_accessor :summary_levene
81
82
  # Show on summary descriptives for vectors
82
83
  attr_accessor :summary_descriptives
84
+ # Show contrasts on summary
85
+ attr_accessor :summary_contrasts
86
+ # Array with stored contrasts
87
+ attr_reader :contrasts
88
+
83
89
  def initialize(*args)
84
90
  if args[0].is_a? Array
85
91
  @vectors=args.shift
@@ -92,11 +98,31 @@ module Statsample
92
98
  :name_numerator=>_("Between Groups"),
93
99
  :name_denominator=>_("Within Groups"),
94
100
  :summary_descriptives=>false,
95
- :summary_levene=>true}
101
+ :summary_levene=>true,
102
+ :summary_contrasts=>true
103
+ }
96
104
  @opts=opts_default.merge(opts).merge(:ss_num=>ssbg, :ss_den=>sswg, :df_num=>df_bg, :df_den=>df_wg)
105
+ @contrasts=[]
97
106
  super(@opts)
98
107
  end
99
- alias :sst :ss_total
108
+ alias :sst :ss_total
109
+ alias :msb :ms_num
110
+ alias :msw :ms_den
111
+
112
+ # Generates and stores a contrast.
113
+ # Options should be provided as a hash
114
+ # [:c]=>contrast vector
115
+ # [:c1 - :c2]=>index for automatic construction of contrast
116
+ # [:name]=>contrast name
117
+
118
+ def contrast(opts=Hash.new)
119
+ name=opts[:name] || _("Contrast for %s") % @name
120
+ opts=opts.merge({:vectors=>@vectors, :name=>name})
121
+ c=Statsample::Anova::Contrast.new(opts)
122
+ @contrasts.push(c)
123
+ c
124
+ end
125
+
100
126
  def levene
101
127
  Statsample::Test.levene(@vectors, :name=>_("Test of Homogeneity of variances (Levene)"))
102
128
  end
@@ -140,10 +166,18 @@ module Statsample
140
166
  end
141
167
  end
142
168
  end
169
+
143
170
  if summary_levene
144
171
  s.parse_element(levene)
145
172
  end
146
173
  report_building_table(s)
174
+ if summary_contrasts and @contrasts.size>0
175
+
176
+ @contrasts.each do |c|
177
+ s.parse_element(c)
178
+ end
179
+ end
180
+
147
181
  end
148
182
  end
149
183
  end
@@ -12,16 +12,13 @@ module Statsample
12
12
  #
13
13
  # USE:
14
14
  # ds=Statsample::CSV.read("test_csv.csv")
15
- def read(filename, empty=[''],ignore_lines=0,fs=nil,rs=nil)
15
+ def read(filename, empty=[''],ignore_lines=0,csv_opts=Hash.new)
16
16
  first_row=true
17
17
  fields=[]
18
18
  fields_data={}
19
19
  ds=nil
20
20
  line_number=0
21
- opts={}
22
- opts[:col_sep]=fs unless fs.nil?
23
- opts[:row_sep]=rs unless rs.nil?
24
- csv=CSV_klass.open(filename,'r',opts)
21
+ csv=CSV_klass.open(filename,'rb', csv_opts)
25
22
  csv.each do |row|
26
23
  line_number+=1
27
24
  if(line_number<=ignore_lines)
@@ -184,6 +184,7 @@ module Statsample
184
184
  #
185
185
  def read(filename, opts=Hash.new)
186
186
  require 'spreadsheet'
187
+ raise "options should be Hash" unless opts.is_a? Hash
187
188
  opts_default={
188
189
  :worksheet_id=>0,
189
190
  :ignore_lines=>0,
@@ -119,6 +119,33 @@ module Statsample
119
119
  def has_missing_data?
120
120
  @vectors.any? {|k,v| v.has_missing_data?}
121
121
  end
122
+ # Return a nested hash using fields as keys and
123
+ # an array constructed of hashes with other values.
124
+ # If a block is provided, it is used to provide the
125
+ # values, with parameters +row+ of dataset,
126
+ # +current+ last hash on hierarchy and
127
+ # +name+ of the key to include
128
+ def nest(*tree_keys,&block)
129
+ tree_keys=tree_keys[0] if tree_keys[0].is_a? Array
130
+ out=Hash.new
131
+ each do |row|
132
+ current=out
133
+ # Create tree
134
+ tree_keys[0,tree_keys.size-1].each do |f|
135
+ root=row[f]
136
+ current[root]||=Hash.new
137
+ current=current[root]
138
+ end
139
+ name=row[tree_keys.last]
140
+ if !block
141
+ current[name]||=Array.new
142
+ current[name].push(row.delete_if{|key,value| tree_keys.include? key})
143
+ else
144
+ current[name]=block.call(row, current,name)
145
+ end
146
+ end
147
+ out
148
+ end
122
149
  # Creates a new dataset. A dataset is a set of ordered named vectors
123
150
  # of the same size.
124
151
  #
@@ -170,6 +197,7 @@ module Statsample
170
197
  else
171
198
  ds=dup fields_to_include
172
199
  end
200
+ ds.name= self.name
173
201
  ds
174
202
  end
175
203
  #
@@ -192,7 +220,9 @@ module Statsample
192
220
  vectors[f]=@vectors[f].dup
193
221
  fields.push(f)
194
222
  }
195
- Dataset.new(vectors,fields)
223
+ ds=Dataset.new(vectors,fields)
224
+ ds.name= self.name
225
+ ds
196
226
  end
197
227
 
198
228
 
@@ -44,7 +44,7 @@ module Statsample
44
44
  # Add a line showing normal distribution
45
45
  attr_accessor :line_normal_distribution
46
46
  # data could be a vector or a histogram
47
- def initialize(data,opts=Hash.new)
47
+ def initialize(data, opts=Hash.new)
48
48
  prov_name=(data.respond_to?(:name)) ? data.name : ""
49
49
  opts_default={
50
50
  :name=>_("Histograma (%s)") % prov_name,
@@ -79,6 +79,11 @@ module Statsample
79
79
  def sst
80
80
  raise "You should implement this"
81
81
  end
82
+ # R^2 Adjusted.
83
+ # Estimate Population R^2 using Ezekiel formula.
84
+ # Always lower than sample R^2
85
+ # == Reference:
86
+ # * Leach, L. & Henson, R. (2007). The Use and Impact of Adjusted R2 Effects in Published Regression Research. Multiple Linear Regression Viewpoints, 33(1), 1-11.
82
87
  def r2_adjusted
83
88
  r2-((1-r2)*@predictors_n).quo(df_e)
84
89
  end
@@ -110,6 +110,8 @@ module Statsample
110
110
  opts||=pca_options
111
111
  Statsample::Factor::PCA.new(correlation_matrix, opts)
112
112
  end
113
+ # Retrieve Velicer's MAP
114
+ # using all scales.
113
115
  def map(opts=nil)
114
116
  opts||=map_options
115
117
  Statsample::Factor::MAP.new(correlation_matrix, opts)
@@ -141,7 +143,7 @@ module Statsample
141
143
  def report_building(b) # :nodoc:
142
144
  b.section(:name=>name) do |s|
143
145
  s.section(:name=>_("Reliability analysis of scales")) do |s2|
144
- @scales.each_pair do |k,scale|
146
+ @scales.each_pair do |k, scale|
145
147
  s2.parse_element(scale)
146
148
  end
147
149
  end
@@ -22,11 +22,10 @@ module Statsample
22
22
 
23
23
  @ods=ds
24
24
  @ds=ds.dup_only_valid(ds.fields - @dumped)
25
-
25
+ @ds.name=ds.name
26
26
 
27
27
  @k=@ds.fields.size
28
28
  @total=@ds.vector_sum
29
-
30
29
  @o_total=@dumped.size > 0 ? @ods.vector_sum : nil
31
30
 
32
31
  @vector_mean=@ds.vector_mean
@@ -165,7 +164,7 @@ module Statsample
165
164
  t.row(["#{@ods[f].name}(#{f})", "%0.5f" % @ods[f].mean])
166
165
  end
167
166
  end
168
- s.parse_element(Statsample::Graph::Histogram.new(@o_total)) if @summary_histogram
167
+ s.parse_element(Statsample::Graph::Histogram.new(@o_total, :name=>"Histogram (complete data) for %s" % @name)) if @summary_histogram
169
168
  end
170
169
  end
171
170
 
@@ -229,7 +228,7 @@ module Statsample
229
228
  t.row row
230
229
  end # end each
231
230
  end # table
232
- s.parse_element(Statsample::Graph::Histogram.new(@total)) if @summary_histogram
231
+ s.parse_element(Statsample::Graph::Histogram.new(@total, :name=>"Histogram (valid data) for %s" % @name)) if @summary_histogram
233
232
  end # section
234
233
  end # def
235
234
  end # class
@@ -1,9 +1,36 @@
1
+ class Object
2
+ # Shorthand for Statsample::Analysis.store(*args,&block)
3
+ def ss_analysis(*args,&block)
4
+ Statsample::Analysis.store(*args,&block)
5
+ end
6
+ end
7
+
1
8
  module Statsample
2
9
  # Module which provide shorthands for many methods.
3
10
  module Shorthand
4
11
  ###
5
12
  # :section: R like methods
6
13
  ###
14
+ def read_with_cache(klass, filename,opts=Hash.new, cache=true)
15
+ file_ds=filename+".ds"
16
+ if cache and (File.exists? file_ds and File.mtime(file_ds)>File.mtime(filename))
17
+ ds=Statsample.load(file_ds)
18
+ else
19
+ ds=klass.read(filename)
20
+ ds.save(file_ds) if cache
21
+ end
22
+ ds
23
+ end
24
+ # Import an Excel file. Cache result by default
25
+ def read_excel(filename, opts=Hash.new, cache=true)
26
+ read_with_cache(Statsample::Excel, filename, opts, cache)
27
+
28
+ end
29
+ # Import a CSV file. Cache result by default
30
+
31
+ def read_csv
32
+ read_with_cache(Statsample::CSV, filename, opts, cache)
33
+ end
7
34
 
8
35
  # Retrieve names (fields) from dataset
9
36
  def names(ds)
@@ -19,7 +46,7 @@ module Statsample
19
46
  end
20
47
  # Create a Statsample::Vector
21
48
  # Analog to R's c
22
- def c(*args)
49
+ def vector(*args)
23
50
  Statsample::Vector[*args]
24
51
  end
25
52
  # Random generation for the normal distribution
@@ -77,5 +104,18 @@ module Statsample
77
104
  def dominance_analysis_bootstrap(*args)
78
105
  Statsample::DominanceAnalysis::Bootstrap.new(*args)
79
106
  end
107
+ def scale_analysis(*args)
108
+ Statsample::Reliability::ScaleAnalysis.new(*args)
109
+ end
110
+ def skill_scale_analysis(*args)
111
+ Statsample::Reliability::SkillScaleAnalysis.new(*args)
112
+ end
113
+ def multiscale_analysis(*args,&block)
114
+ Statsample::Reliability::MultiScaleAnalysis.new(*args,&block)
115
+ end
116
+ def test_u(*args)
117
+ Statsample::Test::UMannWhitney.new(*args)
118
+ end
119
+ module_function :test_u, :rnorm
80
120
  end
81
121
  end
@@ -8,6 +8,7 @@ module Statsample
8
8
  autoload(:F, 'statsample/test/f')
9
9
  autoload(:ChiSquare, 'statsample/test/chisquare')
10
10
  autoload(:BartlettSphericity, 'statsample/test/bartlettsphericity')
11
+ autoload(:KolmogorovSmirnov, 'statsample/test/kolmogorovsmirnov')
11
12
 
12
13
  # Returns probability of getting a value lower or higher
13
14
  # than sample, using cdf and number of tails.
@@ -29,6 +30,15 @@ module Statsample
29
30
  2*cdf
30
31
  end
31
32
  end
33
+ # Get critical t to create confidence interval
34
+ def t_critical(confidence_level, df)
35
+ -Distribution::T.p_value((1-confidence_level) / 2.0, df)
36
+ end
37
+ # Get critical z to create confidence interval
38
+ def z_critical(confidence_level)
39
+ -Distribution::Z.p_value((1-confidence_level) / 2.0)
40
+ end
41
+
32
42
  extend self
33
43
  # Calculate chi square for two Matrix
34
44
  class << self
@@ -0,0 +1,61 @@
1
+ module Statsample
2
+ module Test
3
+ # == Kolmogorov-Smirnov's test of equality of distributions.
4
+ class KolmogorovSmirnov
5
+
6
+ attr_reader :d
7
+ include Statsample::Test
8
+ include Summarizable
9
+ # Creates a new Kolmogorov-Smirnov test
10
+ # d1 should have each method
11
+ # d2 could be a Distribution class, with a cdf method,
12
+ # a vector or a lambda
13
+ def initialize(d1,d2)
14
+ raise "First argument should have each method" unless d1.respond_to? :each
15
+ @d1=make_cdf(d1)
16
+ if d2.respond_to? :cdf or d2.is_a? Proc
17
+ @d2=d2
18
+ elsif d2.respond_to? :each
19
+ @d2=make_cdf(d2)
20
+ else
21
+ raise "Second argument should respond to cdf or each"
22
+ end
23
+ calculate
24
+ end
25
+ def calculate
26
+ d=0
27
+ @d1.each {|x|
28
+ v1=@d1.cdf(x);
29
+ v2=@d2.is_a?(Proc) ? @d2.call(x) : @d2.cdf(x)
30
+ d=(v1-v2).to_f.abs if (v1-v2).abs>d
31
+ }
32
+ @d=d
33
+ end
34
+ # Make a wrapper EmpiricDistribution for any object which implements
35
+ # each
36
+ # On Statsample::Vector, only uses #valid_data
37
+ def make_cdf(v)
38
+ v.is_a?(Statsample::Vector) ? EmpiricDistribution.new(v.valid_data) : EmpiricDistribution.new(v)
39
+ end
40
+ class EmpiricDistribution
41
+ def initialize(data)
42
+ @min=data.min
43
+ @max=data.max
44
+ @data=data.sort
45
+ @n=data.size
46
+ end
47
+ def each
48
+ @data.each {|x|
49
+ yield x
50
+ }
51
+ end
52
+ def cdf(x)
53
+ return 0 if x<@min
54
+ return 1 if x>=@max
55
+ v=@data.index{|v1| v1>=x}
56
+ v.nil? ? 0 : (v+(x==@data[v]? 1 : 0)).quo(@n)
57
+ end
58
+ end
59
+ end
60
+ end
61
+ end
@@ -1,6 +1,12 @@
1
1
  module Statsample
2
2
  module Test
3
- module T
3
+
4
+
5
+
6
+
7
+ # A t-test is any statistical hypothesis test in which the test statistic follows a Student's t distribution, if the null hypothesis is supported
8
+ class T
9
+
4
10
  class << self
5
11
  include Math
6
12
  # Test the null hypothesis that the population mean is equal to a specified value u, one uses the statistic.
@@ -10,7 +16,7 @@ module Statsample
10
16
  # * <tt>s</tt>: sample/differences standard deviation
11
17
  # * <tt>n</tt>: sample size
12
18
  def one_sample(x,u,s,n)
13
- (x-u).quo(s.quo(Math::sqrt(n)))
19
+ (x-u)*Math::sqrt(n).quo(s)
14
20
  end
15
21
  # Test if means of two samples are different.
16
22
  # * <tt>x1</tt>: sample 1 mean
@@ -50,6 +56,73 @@ module Statsample
50
56
  num.quo(den)
51
57
  end
52
58
  end
59
+
60
+ include Statsample::Test
61
+ include Summarizable
62
+ attr_reader :standard_error, :estimate, :df
63
+ # Tails for p-value (:both, :left or :right). Default :both
64
+ attr_accessor :tails
65
+ # Name of t analysis
66
+ attr_accessor :name
67
+ attr_accessor :confidence_level
68
+ attr_reader :t
69
+ attr_accessor :estimate_name, :standard_error_name
70
+ # Creates a generic t test. Use OneSample or TwoSamplesIndependent
71
+ # classes for better summaries.
72
+ # Parameters:
73
+ # * estimate: estimate
74
+ # * standard_error: standard error of estimate
75
+ # * df: degrees of freedom
76
+ def initialize(estimate, standard_error, df, opts=Hash.new)
77
+ @estimate=estimate
78
+ @standard_error=standard_error
79
+ @df=df
80
+ @t = @estimate / @standard_error.to_f
81
+ opts_default={ :tails=>:both,
82
+ :name=>_("T Test"),
83
+ :estimate_name=>_("Estimate"),
84
+ :standard_error_name=>_("Std.Err.of Estimate"),
85
+ :confidence_level=>0.95}
86
+ @opts = opts_default.merge(opts)
87
+
88
+ @opts.keys.each {|k|
89
+ send("#{k}=", @opts[k]) if respond_to? k
90
+ }
91
+ end
92
+
93
+ alias :se :standard_error
94
+
95
+ def to_f
96
+ t
97
+ end
98
+
99
+ # probability
100
+ def probability
101
+ p_using_cdf(Distribution::T.cdf(t, df), tails)
102
+ end
103
+
104
+ def confidence_interval(cl=nil)
105
+ cl||=confidence_level
106
+ t_crit = t_critical(cl, df)
107
+ [estimate - se*t_crit, estimate + se*t_crit]
108
+ end
109
+ alias :ci :confidence_interval
110
+
111
+
112
+ def report_building(builder) #:nodoc:
113
+ builder.section(:name=>@name) do |section|
114
+ section.text _("%s: %0.4f | %s: %0.4f") % [@estimate_name, @estimate, @standard_error_name, se]
115
+ report_building_t(section)
116
+ end
117
+ end
118
+ def report_building_t(s)
119
+ df_f=@df.is_a?(Integer) ? "%d" : "%0.4f"
120
+ s.text _("t(%d) = %0.4f, p=%0.4f (%s tails)") % [df, t,probability, tails]
121
+ s.text _("CI(%d%%): %0.4f - %0.4f") % [confidence_level*100, ci[0],ci[1]]
122
+
123
+ end
124
+
125
+
53
126
  # One Sample t-test
54
127
  # == Usage
55
128
  # a=1000.times.map {rand(100)}.to_scale
@@ -91,22 +164,32 @@ module Statsample
91
164
  @name=@opts[:name]
92
165
  @u=@opts[:u]
93
166
  @tails=@opts[:tails]
167
+ @confidence_level=@opts[:confidence_level] || 0.95
94
168
  @df= @vector.n_valid-1
95
169
  @t=nil
96
170
  end
171
+ def t_object
172
+ T.new(@vector.mean-u, @vector.se, @vector.n_valid-1, opts)
173
+ end
97
174
  def t
98
- T.one_sample(@vector.mean, @u, @vector.sd, @vector.n_valid)
175
+ t_object.t
99
176
  end
100
-
101
177
  def probability
102
- p_using_cdf(Distribution::T.cdf(t, @df), tails)
178
+ t_object.probability
179
+ end
180
+ def standard_error
181
+ t_object.standard_error
182
+ end
183
+ alias :se :standard_error
184
+ def confidence_interval(cl=nil)
185
+ t_object.confidence_interval(cl)
103
186
  end
187
+ alias :ci :confidence_interval
104
188
  def report_building(b) # :nodoc:
105
189
  b.section(:name=>@name) {|s|
106
- s.text "Sample mean: #{@vector.mean}"
107
- s.text "Population mean:#{u}"
108
- s.text "Tails: #{tails}"
109
- s.text sprintf("t = %0.4f, p=%0.4f, d.f=%d", t, probability, df)
190
+ s.text _("Sample mean: %0.4f | Sample sd: %0.4f | se : %0.4f") % [@vector.mean, @vector.sd, se]
191
+ s.text _("Population mean: %0.4f") % u if u!=0
192
+ t_object.report_building_t(s)
110
193
  }
111
194
  end
112
195
  end