statsample 1.0.1 → 1.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (39) hide show
  1. data/.gemtest +0 -0
  2. data/History.txt +14 -0
  3. data/Manifest.txt +4 -0
  4. data/README.txt +49 -13
  5. data/data/locale/es/LC_MESSAGES/statsample.mo +0 -0
  6. data/lib/statsample.rb +1 -23
  7. data/lib/statsample/analysis.rb +49 -28
  8. data/lib/statsample/analysis/suite.rb +18 -5
  9. data/lib/statsample/analysis/suitereportbuilder.rb +9 -3
  10. data/lib/statsample/anova.rb +2 -0
  11. data/lib/statsample/anova/contrast.rb +79 -0
  12. data/lib/statsample/anova/oneway.rb +39 -5
  13. data/lib/statsample/converter/csv.rb +2 -5
  14. data/lib/statsample/converters.rb +1 -0
  15. data/lib/statsample/dataset.rb +31 -1
  16. data/lib/statsample/graph/histogram.rb +1 -1
  17. data/lib/statsample/regression/multiple/baseengine.rb +5 -0
  18. data/lib/statsample/reliability/multiscaleanalysis.rb +3 -1
  19. data/lib/statsample/reliability/scaleanalysis.rb +3 -4
  20. data/lib/statsample/shorthand.rb +41 -1
  21. data/lib/statsample/test.rb +10 -0
  22. data/lib/statsample/test/kolmogorovsmirnov.rb +61 -0
  23. data/lib/statsample/test/t.rb +92 -9
  24. data/lib/statsample/vector.rb +143 -10
  25. data/po/es/statsample.mo +0 -0
  26. data/po/es/statsample.po +109 -110
  27. data/po/statsample.pot +108 -60
  28. data/test/helpers_tests.rb +1 -0
  29. data/test/test_analysis.rb +70 -11
  30. data/test/test_anova_contrast.rb +36 -0
  31. data/test/test_anovawithvectors.rb +8 -0
  32. data/test/test_dataset.rb +12 -0
  33. data/test/test_factor_pa.rb +1 -3
  34. data/test/test_test_kolmogorovsmirnov.rb +34 -0
  35. data/test/test_test_t.rb +16 -0
  36. data/test/test_vector.rb +40 -2
  37. metadata +44 -118
  38. data.tar.gz.sig +0 -0
  39. metadata.gz.sig +0 -0
@@ -35,10 +35,10 @@ module Statsample
35
35
  :name_denominator=>_("Explained variance"),
36
36
  :name_numerator=>_("Unexplained variance")}
37
37
  @opts=opts_default.merge(opts)
38
- opts_default.keys.each {|k|
39
- send("#{k}=", @opts[k])
38
+ opts.keys.each {|k|
39
+ send("#{k}=", @opts[k]) if self.respond_to? "#{k}="
40
40
  }
41
- @f_object=Statsample::Test::F.new(@ms_num,@ms_den,@df_num,@df_den)
41
+ @f_object=Statsample::Test::F.new(@ms_num, @ms_den, @df_num,@df_den)
42
42
  end
43
43
  # F value
44
44
  def f
@@ -62,6 +62,7 @@ module Statsample
62
62
  end
63
63
 
64
64
  end
65
+
65
66
  # One Way Anova with vectors
66
67
  # Example:
67
68
  # v1=[2,3,4,5,6].to_scale
@@ -80,6 +81,11 @@ module Statsample
80
81
  attr_accessor :summary_levene
81
82
  # Show on summary descriptives for vectors
82
83
  attr_accessor :summary_descriptives
84
+ # Show on summary of contrasts
85
+ attr_accessor :summary_contrasts
86
+ # Array with stored contrasts
87
+ attr_reader :contrasts
88
+
83
89
  def initialize(*args)
84
90
  if args[0].is_a? Array
85
91
  @vectors=args.shift
@@ -92,11 +98,31 @@ module Statsample
92
98
  :name_numerator=>_("Between Groups"),
93
99
  :name_denominator=>_("Within Groups"),
94
100
  :summary_descriptives=>false,
95
- :summary_levene=>true}
101
+ :summary_levene=>true,
102
+ :summary_contrasts=>true
103
+ }
96
104
  @opts=opts_default.merge(opts).merge(:ss_num=>ssbg, :ss_den=>sswg, :df_num=>df_bg, :df_den=>df_wg)
105
+ @contrasts=[]
97
106
  super(@opts)
98
107
  end
99
- alias :sst :ss_total
108
+ alias :sst :ss_total
109
+ alias :msb :ms_num
110
+ alias :msw :ms_den
111
+
112
+ # Generates and stores a contrast.
113
+ # Options should be provided as a hash
114
+ # [:c]=>contrast vector
115
+ # [:c1 - :c2]=>index for automatic construction of contrast
116
+ # [:name]=>contrast name
117
+
118
+ def contrast(opts=Hash.new)
119
+ name=opts[:name] || _("Contrast for %s") % @name
120
+ opts=opts.merge({:vectors=>@vectors, :name=>name})
121
+ c=Statsample::Anova::Contrast.new(opts)
122
+ @contrasts.push(c)
123
+ c
124
+ end
125
+
100
126
  def levene
101
127
  Statsample::Test.levene(@vectors, :name=>_("Test of Homogeneity of variances (Levene)"))
102
128
  end
@@ -140,10 +166,18 @@ module Statsample
140
166
  end
141
167
  end
142
168
  end
169
+
143
170
  if summary_levene
144
171
  s.parse_element(levene)
145
172
  end
146
173
  report_building_table(s)
174
+ if summary_contrasts and @contrasts.size>0
175
+
176
+ @contrasts.each do |c|
177
+ s.parse_element(c)
178
+ end
179
+ end
180
+
147
181
  end
148
182
  end
149
183
  end
@@ -12,16 +12,13 @@ module Statsample
12
12
  #
13
13
  # USE:
14
14
  # ds=Statsample::CSV.read("test_csv.csv")
15
- def read(filename, empty=[''],ignore_lines=0,fs=nil,rs=nil)
15
+ def read(filename, empty=[''],ignore_lines=0,csv_opts=Hash.new)
16
16
  first_row=true
17
17
  fields=[]
18
18
  fields_data={}
19
19
  ds=nil
20
20
  line_number=0
21
- opts={}
22
- opts[:col_sep]=fs unless fs.nil?
23
- opts[:row_sep]=rs unless rs.nil?
24
- csv=CSV_klass.open(filename,'r',opts)
21
+ csv=CSV_klass.open(filename,'rb', csv_opts)
25
22
  csv.each do |row|
26
23
  line_number+=1
27
24
  if(line_number<=ignore_lines)
@@ -184,6 +184,7 @@ module Statsample
184
184
  #
185
185
  def read(filename, opts=Hash.new)
186
186
  require 'spreadsheet'
187
+ raise "options should be Hash" unless opts.is_a? Hash
187
188
  opts_default={
188
189
  :worksheet_id=>0,
189
190
  :ignore_lines=>0,
@@ -119,6 +119,33 @@ module Statsample
119
119
  def has_missing_data?
120
120
  @vectors.any? {|k,v| v.has_missing_data?}
121
121
  end
122
+ # Return a nested hash using fields as keys and
123
+ # an array constructed of hashes with other values.
124
+ # If block provided, is used to provide the
125
+ # values, with parameters +row+ of dataset,
126
+ # +current+ last hash on hierarchy and
127
+ # +name+ of the key to include
128
+ def nest(*tree_keys,&block)
129
+ tree_keys=tree_keys[0] if tree_keys[0].is_a? Array
130
+ out=Hash.new
131
+ each do |row|
132
+ current=out
133
+ # Create tree
134
+ tree_keys[0,tree_keys.size-1].each do |f|
135
+ root=row[f]
136
+ current[root]||=Hash.new
137
+ current=current[root]
138
+ end
139
+ name=row[tree_keys.last]
140
+ if !block
141
+ current[name]||=Array.new
142
+ current[name].push(row.delete_if{|key,value| tree_keys.include? key})
143
+ else
144
+ current[name]=block.call(row, current,name)
145
+ end
146
+ end
147
+ out
148
+ end
122
149
  # Creates a new dataset. A dataset is a set of ordered named vectors
123
150
  # of the same size.
124
151
  #
@@ -170,6 +197,7 @@ module Statsample
170
197
  else
171
198
  ds=dup fields_to_include
172
199
  end
200
+ ds.name= self.name
173
201
  ds
174
202
  end
175
203
  #
@@ -192,7 +220,9 @@ module Statsample
192
220
  vectors[f]=@vectors[f].dup
193
221
  fields.push(f)
194
222
  }
195
- Dataset.new(vectors,fields)
223
+ ds=Dataset.new(vectors,fields)
224
+ ds.name= self.name
225
+ ds
196
226
  end
197
227
 
198
228
 
@@ -44,7 +44,7 @@ module Statsample
44
44
  # Add a line showing normal distribution
45
45
  attr_accessor :line_normal_distribution
46
46
  # data could be a vector or a histogram
47
- def initialize(data,opts=Hash.new)
47
+ def initialize(data, opts=Hash.new)
48
48
  prov_name=(data.respond_to?(:name)) ? data.name : ""
49
49
  opts_default={
50
50
  :name=>_("Histograma (%s)") % prov_name,
@@ -79,6 +79,11 @@ module Statsample
79
79
  def sst
80
80
  raise "You should implement this"
81
81
  end
82
+ # R^2 Adjusted.
83
+ # Estimate Population R^2 usign Ezequiel formula.
84
+ # Always lower than sample R^2
85
+ # == Reference:
86
+ # * Leach, L. & Henson, R. (2007). The Use and Impact of Adjusted R2 Effects in Published Regression Research. Multiple Linear Regression Viewpoints, 33(1), 1-11.
82
87
  def r2_adjusted
83
88
  r2-((1-r2)*@predictors_n).quo(df_e)
84
89
  end
@@ -110,6 +110,8 @@ module Statsample
110
110
  opts||=pca_options
111
111
  Statsample::Factor::PCA.new(correlation_matrix, opts)
112
112
  end
113
+ # Retrieve Velicer's MAP
114
+ # using all scales.
113
115
  def map(opts=nil)
114
116
  opts||=map_options
115
117
  Statsample::Factor::MAP.new(correlation_matrix, opts)
@@ -141,7 +143,7 @@ module Statsample
141
143
  def report_building(b) # :nodoc:
142
144
  b.section(:name=>name) do |s|
143
145
  s.section(:name=>_("Reliability analysis of scales")) do |s2|
144
- @scales.each_pair do |k,scale|
146
+ @scales.each_pair do |k, scale|
145
147
  s2.parse_element(scale)
146
148
  end
147
149
  end
@@ -22,11 +22,10 @@ module Statsample
22
22
 
23
23
  @ods=ds
24
24
  @ds=ds.dup_only_valid(ds.fields - @dumped)
25
-
25
+ @ds.name=ds.name
26
26
 
27
27
  @k=@ds.fields.size
28
28
  @total=@ds.vector_sum
29
-
30
29
  @o_total=@dumped.size > 0 ? @ods.vector_sum : nil
31
30
 
32
31
  @vector_mean=@ds.vector_mean
@@ -165,7 +164,7 @@ module Statsample
165
164
  t.row(["#{@ods[f].name}(#{f})", "%0.5f" % @ods[f].mean])
166
165
  end
167
166
  end
168
- s.parse_element(Statsample::Graph::Histogram.new(@o_total)) if @summary_histogram
167
+ s.parse_element(Statsample::Graph::Histogram.new(@o_total, :name=>"Histogram (complete data) for %s" % @name)) if @summary_histogram
169
168
  end
170
169
  end
171
170
 
@@ -229,7 +228,7 @@ module Statsample
229
228
  t.row row
230
229
  end # end each
231
230
  end # table
232
- s.parse_element(Statsample::Graph::Histogram.new(@total)) if @summary_histogram
231
+ s.parse_element(Statsample::Graph::Histogram.new(@total, :name=>"Histogram (valid data) for %s" % @name)) if @summary_histogram
233
232
  end # section
234
233
  end # def
235
234
  end # class
@@ -1,9 +1,36 @@
1
+ class Object
2
+ # Shorthand for Statsample::Analysis.store(*args,&block)
3
+ def ss_analysis(*args,&block)
4
+ Statsample::Analysis.store(*args,&block)
5
+ end
6
+ end
7
+
1
8
  module Statsample
2
9
  # Module which provide shorthands for many methods.
3
10
  module Shorthand
4
11
  ###
5
12
  # :section: R like methods
6
13
  ###
14
# Read a dataset through +klass+, optionally caching it next to the source.
#
# The cache lives at "<filename>.ds". It is used only when +cache+ is
# truthy, the cache file exists and it is strictly newer than +filename+;
# otherwise the source is read with klass.read(filename) and, when +cache+
# is truthy, the result is saved to the cache file.
#
# * <tt>klass</tt>: reader responding to +read(filename)+
# * <tt>filename</tt>: path of the source file
# * <tt>opts</tt>: accepted for a uniform signature with the read_* helpers.
#   NOTE: it is not forwarded to klass.read, because the readers do not
#   share a common options signature.
# * <tt>cache</tt>: set to false to bypass reading and writing the cache
#
# Returns whatever Statsample.load or klass.read returned.
def read_with_cache(klass, filename, opts=Hash.new, cache=true)
  file_ds = filename + ".ds"
  # File.exist? replaces File.exists?, which was removed in Ruby 3.2.
  cache_is_fresh = cache &&
                   File.exist?(file_ds) &&
                   File.mtime(file_ds) > File.mtime(filename)
  return Statsample.load(file_ds) if cache_is_fresh

  ds = klass.read(filename)
  ds.save(file_ds) if cache
  ds
end
24
+ # Import an Excel file. Cache result by default
25
+ def read_excel(filename, opts=Hash.new, cache=true)
26
+ read_with_cache(Statsample::Excel, filename, opts, cache)
27
+
28
+ end
29
+ # Import a CSV file. Cache result by default
30
+
31
# Import a CSV file through read_with_cache, using Statsample::CSV as reader.
#
# * <tt>filename</tt>: path of the CSV file
# * <tt>opts</tt>: options hash handed to read_with_cache
# * <tt>cache</tt>: cache the result (default true)
#
# The method previously declared no parameters while its body referenced
# filename, opts and cache, so every call raised NameError. The signature
# now mirrors read_excel.
def read_csv(filename, opts=Hash.new, cache=true)
  read_with_cache(Statsample::CSV, filename, opts, cache)
end
7
34
 
8
35
  # Retrieve names (fields) from dataset
9
36
  def names(ds)
@@ -19,7 +46,7 @@ module Statsample
19
46
  end
20
47
  # Create a Statsample::Vector
21
48
  # Analog to R's c
22
- def c(*args)
49
+ def vector(*args)
23
50
  Statsample::Vector[*args]
24
51
  end
25
52
  # Random generation for the normal distribution
@@ -77,5 +104,18 @@ module Statsample
77
104
  def dominance_analysis_bootstrap(*args)
78
105
  Statsample::DominanceAnalysis::Bootstrap.new(*args)
79
106
  end
107
+ def scale_analysis(*args)
108
+ Statsample::Reliability::ScaleAnalysis.new(*args)
109
+ end
110
+ def skill_scale_analysis(*args)
111
+ Statsample::Reliability::SkillScaleAnalysis.new(*args)
112
+ end
113
+ def multiscale_analysis(*args,&block)
114
+ Statsample::Reliability::MultiScaleAnalysis.new(*args,&block)
115
+ end
116
+ def test_u(*args)
117
+ Statsample::Test::UMannWhitney.new(*args)
118
+ end
119
+ module_function :test_u, :rnorm
80
120
  end
81
121
  end
@@ -8,6 +8,7 @@ module Statsample
8
8
  autoload(:F, 'statsample/test/f')
9
9
  autoload(:ChiSquare, 'statsample/test/chisquare')
10
10
  autoload(:BartlettSphericity, 'statsample/test/bartlettsphericity')
11
+ autoload(:KolmogorovSmirnov, 'statsample/test/kolmogorovsmirnov')
11
12
 
12
13
  # Returns probability of getting a value lower or higher
13
14
  # than sample, using cdf and number of tails.
@@ -29,6 +30,15 @@ module Statsample
29
30
  2*cdf
30
31
  end
31
32
  end
33
+ # Get critical t to create confidence interval
34
+ def t_critical(confidence_level, df)
35
+ -Distribution::T.p_value((1-confidence_level) / 2.0, df)
36
+ end
37
+ # Get critical z to create confidence interval
38
+ def z_critical(confidence_level)
39
+ -Distribution::Z.p_value((1-confidence_level) / 2.0)
40
+ end
41
+
32
42
  extend self
33
43
  # Calculate chi square for two Matrix
34
44
  class << self
@@ -0,0 +1,61 @@
1
+ module Statsample
2
+ module Test
3
+ # == Kolmogorov-Smirnov's test of equality of distributions.
4
+ class KolmogorovSmirnov
5
+
6
+ attr_reader :d
7
+ include Statsample::Test
8
+ include Summarizable
9
+ # Creates a new Kolmogorov-Smirnov test
10
+ # d1 should have each method
11
+ # d2 could be a Distribution class, with a cdf method,
12
+ # a vector or a lambda
13
+ def initialize(d1,d2)
14
+ raise "First argument should have each method" unless d1.respond_to? :each
15
+ @d1=make_cdf(d1)
16
+ if d2.respond_to? :cdf or d2.is_a? Proc
17
+ @d2=d2
18
+ elsif d2.respond_to? :each
19
+ @d2=make_cdf(d2)
20
+ else
21
+ raise "Second argument should respond to cdf or each"
22
+ end
23
+ calculate
24
+ end
25
+ def calculate
26
+ d=0
27
+ @d1.each {|x|
28
+ v1=@d1.cdf(x);
29
+ v2=@d2.is_a?(Proc) ? @d2.call(x) : @d2.cdf(x)
30
+ d=(v1-v2).to_f.abs if (v1-v2).abs>d
31
+ }
32
+ @d=d
33
+ end
34
+ # Make a wrapper EmpiricDistribution for any object which implements
35
+ # each
36
+ # On Statsample::Vector, only uses #valid_data
37
+ def make_cdf(v)
38
+ v.is_a?(Statsample::Vector) ? EmpiricDistribution.new(v.valid_data) : EmpiricDistribution.new(v)
39
+ end
40
# Empirical distribution built from a sample of comparable values.
#
# Keeps a sorted copy of the sample and exposes +each+ (sorted order) and
# +cdf+, the proportion of observations lower than or equal to a value.
class EmpiricDistribution
  # * <tt>data</tt>: collection responding to +sort+ (e.g. an Array)
  def initialize(data)
    @data = data.sort
    @min = @data.first
    @max = @data.last
    @n = @data.size
  end

  # Yields every observation, in ascending order.
  def each(&block)
    @data.each(&block)
  end

  # Proportion of observations lower than or equal to +x+.
  #
  # Returns Integer 0 below the minimum, Integer 1 at or above the
  # maximum, and a Rational otherwise.
  def cdf(x)
    return 0 if x < @min
    return 1 if x >= @max
    # Count every observation <= x. Taking the index of the first match
    # and adding one, as before, under-counts when the sample has ties.
    @data.count { |value| value <= x }.quo(@n)
  end
end
59
+ end
60
+ end
61
+ end
@@ -1,6 +1,12 @@
1
1
  module Statsample
2
2
  module Test
3
- module T
3
+
4
+
5
+
6
+
7
+ # A t-test is any statistical hypothesis test in which the test statistic follows a Student's t distribution, if the null hypothesis is supported
8
+ class T
9
+
4
10
  class << self
5
11
  include Math
6
12
  # Test the null hypothesis that the population mean is equal to a specified value u, one uses the statistic.
@@ -10,7 +16,7 @@ module Statsample
10
16
  # * <tt>s</tt>: sample/differences standard deviation
11
17
  # * <tt>n</tt>: sample size
12
18
  def one_sample(x,u,s,n)
13
- (x-u).quo(s.quo(Math::sqrt(n)))
19
+ (x-u)*Math::sqrt(n).quo(s)
14
20
  end
15
21
  # Test if means of two samples are different.
16
22
  # * <tt>x1</tt>: sample 1 mean
@@ -50,6 +56,73 @@ module Statsample
50
56
  num.quo(den)
51
57
  end
52
58
  end
59
+
60
+ include Statsample::Test
61
+ include Summarizable
62
+ attr_reader :standard_error, :estimate, :df
63
+ # Tails for p-value (:both, :left or :right). Default :both
64
+ attr_accessor :tails
65
+ # Name of t analysis
66
+ attr_accessor :name
67
+ attr_accessor :confidence_level
68
+ attr_reader :t
69
+ attr_accessor :estimate_name, :standard_error_name
70
+ # Creates a generic t test. Use OneSample or TwoSamplesIndependent
71
+ # classes for better summaries.
72
+ # Parameters:
73
+ # * estimate: estimate
74
+ # * standard_error: standard error of estimate
75
+ # * df: degrees of freedom
76
# Creates a generic t test.
#
# * <tt>estimate</tt>: estimate
# * <tt>standard_error</tt>: standard error of estimate
# * <tt>df</tt>: degrees of freedom
# * <tt>opts</tt>: hash merged over the defaults :tails (:both), :name,
#   :estimate_name, :standard_error_name and :confidence_level (0.95)
#
# The t statistic is computed as estimate / standard_error. Each merged
# option is assigned through its writer when the object has one; keys
# without a writer are ignored.
def initialize(estimate, standard_error, df, opts=Hash.new)
  @estimate = estimate
  @standard_error = standard_error
  @df = df
  @t = @estimate / @standard_error.to_f
  opts_default = {
    :tails => :both,
    :name => _("T Test"),
    :estimate_name => _("Estimate"),
    :standard_error_name => _("Std.Err.of Estimate"),
    :confidence_level => 0.95
  }
  @opts = opts_default.merge(opts)

  @opts.each do |key, value|
    # Check for the writer, not the reader: a key that only has a reader
    # (e.g. :df or :t) would otherwise pass the guard and raise NoMethodError.
    setter = "#{key}="
    send(setter, value) if respond_to?(setter)
  end
end
92
+
93
+ alias :se :standard_error
94
+
95
+ def to_f
96
+ t
97
+ end
98
+
99
+ # probability
100
+ def probability
101
+ p_using_cdf(Distribution::T.cdf(t, df), tails)
102
+ end
103
+
104
+ def confidence_interval(cl=nil)
105
+ cl||=confidence_level
106
+ t_crit = t_critical(cl, df)
107
+ [estimate - se*t_crit, estimate + se*t_crit]
108
+ end
109
+ alias :ci :confidence_interval
110
+
111
+
112
+ def report_building(builder) #:nodoc:
113
+ builder.section(:name=>@name) do |section|
114
+ section.text _("%s: %0.4f | %s: %0.4f") % [@estimate_name, @estimate, @standard_error_name, se]
115
+ report_building_t(section)
116
+ end
117
+ end
118
# Adds the t statistic line and the confidence interval line to +s+.
#
# * <tt>s</tt>: report section responding to +text+
#
# Degrees of freedom are printed with "%d" when @df is an Integer and with
# "%0.4f" otherwise, so fractional degrees of freedom are not truncated.
# For Integer degrees of freedom the message string is the same as before.
def report_building_t(s)
  df_format = @df.is_a?(Integer) ? "%d" : "%0.4f"
  s.text _("t(#{df_format}) = %0.4f, p=%0.4f (%s tails)") % [df, t, probability, tails]
  s.text _("CI(%d%%): %0.4f - %0.4f") % [confidence_level * 100, ci[0], ci[1]]
end
124
+
125
+
53
126
  # One Sample t-test
54
127
  # == Usage
55
128
  # a=1000.times.map {rand(100)}.to_scale
@@ -91,22 +164,32 @@ module Statsample
91
164
  @name=@opts[:name]
92
165
  @u=@opts[:u]
93
166
  @tails=@opts[:tails]
167
+ @confidence_level=@opts[:confidence_level] || 0.95
94
168
  @df= @vector.n_valid-1
95
169
  @t=nil
96
170
  end
171
+ def t_object
172
+ T.new(@vector.mean-u, @vector.se, @vector.n_valid-1, opts)
173
+ end
97
174
  def t
98
- T.one_sample(@vector.mean, @u, @vector.sd, @vector.n_valid)
175
+ t_object.t
99
176
  end
100
-
101
177
  def probability
102
- p_using_cdf(Distribution::T.cdf(t, @df), tails)
178
+ t_object.probability
179
+ end
180
+ def standard_error
181
+ t_object.standard_error
182
+ end
183
+ alias :se :standard_error
184
+ def confidence_interval(cl=nil)
185
+ t_object.confidence_interval(cl)
103
186
  end
187
+ alias :ci :confidence_interval
104
188
  def report_building(b) # :nodoc:
105
189
  b.section(:name=>@name) {|s|
106
- s.text "Sample mean: #{@vector.mean}"
107
- s.text "Population mean:#{u}"
108
- s.text "Tails: #{tails}"
109
- s.text sprintf("t = %0.4f, p=%0.4f, d.f=%d", t, probability, df)
190
+ s.text _("Sample mean: %0.4f | Sample sd: %0.4f | se : %0.4f") % [@vector.mean, @vector.sd, se]
191
+ s.text _("Population mean: %0.4f") % u if u!=0
192
+ t_object.report_building_t(s)
110
193
  }
111
194
  end
112
195
  end