statsample 0.11.2 → 0.12.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -39,7 +39,7 @@ module Factor
  include GetText
  bindtextdomain("statsample")
 
- def initialize(matrix ,opts=Hash.new)
+ def initialize(matrix, opts=Hash.new)
  @use_gsl=nil
  @name=_("Principal Component Analysis")
  @matrix=matrix
@@ -1,3 +1,4 @@
+ require 'tempfile'
  module Statsample
  module Graph
  class SvgScatterplot < SVG::Graph::Plot # :nodoc:
@@ -7,6 +8,14 @@ module Statsample
  @ds=ds
  set_x(@ds.fields[0])
  end
+ def report_building_html(g)
+ self.parse()
+ tf=Tempfile.new(['image','.svg'])
+ tf.write self.burn
+ tf.close
+ image=ReportBuilder::Image.new(tf.path)
+ g.parse_element(image)
+ end
  def set_defaults
  super
  init_with(
@@ -27,7 +36,7 @@ module Statsample
  }
  data.each{|y,d|
  add_data({
- :data=>d, :title=>@ds.vector_label(y)
+ :data=>d, :title=>@ds[y].name
  })
  }
  end
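
The new report_building_html hook makes SvgScatterplot embeddable in ReportBuilder HTML output: the plot serializes itself to a temporary .svg file and hands it to the report as a ReportBuilder::Image. A minimal sketch of how that might be driven, assuming ReportBuilder dispatches to report_building_html when rendering HTML and that the plot is constructed directly from a dataset (both assumptions, not confirmed by this diff):

    require 'statsample'
    require 'statsample/graph/svggraph'
    require 'reportbuilder'

    # Illustrative dataset: first field is x, remaining fields are series.
    ds = { 'x' => [1, 2, 3, 4].to_vector(:scale),
           'y' => [2, 4, 6, 8].to_vector(:scale) }.to_dataset

    plot = Statsample::Graph::SvgScatterplot.new(ds)   # assumed constructor form
    rb = ReportBuilder.new(:name => 'Scatterplot example')
    rb.add(plot)   # on HTML output, report_building_html writes the SVG to a tempfile
    puts rb.to_html
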
@@ -21,13 +21,44 @@ module Statsample
  }.to_dataset
  cronbach_alpha(ds)
  end
+ def cronbach_alpha_from_n_s2_cov(n,s2,cov)
+ (n.quo(n-1)) * (1-(s2.quo(s2+(n-1)*cov)))
+ end
+ # Returns n necessary to obtain specific alpha
+ # given variance and covariance mean of items
+ def n_for_desired_alpha(alpha,s2,cov)
+ # Start with a regular test : 50 items
+ min=2
+ max=1000
+ n=50
+ prev_n=0
+ epsilon=0.0001
+ dif=1000
+ c_a=cronbach_alpha_from_n_s2_cov(n,s2,cov)
+ dif=c_a - alpha
+ while(dif.abs>epsilon and n!=prev_n)
+ prev_n=n
+ if dif<0
+ min=n
+ n=(n+(max-min).quo(2)).to_i
+ else
+ max=n
+ n=(n-(max-min).quo(2)).to_i
+ end
+ c_a=cronbach_alpha_from_n_s2_cov(n,s2,cov)
+ dif=c_a - alpha
+ #puts "#{n} , #{c_a}"
+
+ end
+ n
+ end
  # First derivative for alfa
  # Parameters
  # <tt>n</tt>: Number of items
  # <tt>sx</tt>: mean of variances
  # <tt>sxy</tt>: mean of covariances
 
- def alfa_first_derivative(n,sx,sxy)
+ def alpha_first_derivative(n,sx,sxy)
  (sxy*(sx-sxy)).quo(((sxy*(n-1))+sx)**2)
  end
  # Second derivative for alfa
@@ -75,193 +106,9 @@ module Statsample
  out[value]=count_value.quo(n)
  end
  out
- end
- end
- class ItemAnalysis
- attr_reader :mean, :sd,:valid_n, :alpha , :alpha_standarized, :variances_mean, :covariances_mean
- attr_accessor :name
- def initialize(ds,opts=Hash.new)
- @ds=ds.dup_only_valid
- @k=@ds.fields.size
- @total=@ds.vector_sum
- @item_mean=@ds.vector_mean.mean
- @mean=@total.mean
- @median=@total.median
- @skew=@total.skew
- @kurtosis=@total.kurtosis
- @sd = @total.sd
- @variance=@total.variance
- @valid_n = @total.size
- opts_default={:name=>"Reliability Analisis"}
- @opts=opts_default.merge(opts)
- @name=@opts[:name]
- # Mean for covariances and variances
- @variances=@ds.fields.map {|f| @ds[f].variance}.to_scale
- @variances_mean=@variances.mean
- @covariances_mean=(@variance-@variances.sum).quo(@k**2-@k)
- begin
- @alpha = Statsample::Reliability.cronbach_alpha(ds)
- @alpha_standarized = Statsample::Reliability.cronbach_alpha_standarized(ds)
- rescue => e
- raise DatasetException.new(@ds,e), "Error calculating alpha"
- end
- end
- # Returns a hash with structure
- def item_characteristic_curve
- i=0
- out={}
- total={}
- @ds.each do |row|
- tot=@total[i]
- @ds.fields.each do |f|
- out[f]||= {}
- total[f]||={}
- out[f][tot]||= 0
- total[f][tot]||=0
- out[f][tot]+= row[f]
- total[f][tot]+=1
- end
- i+=1
- end
- total.each do |f,var|
- var.each do |tot,v|
- out[f][tot]=out[f][tot].to_f / total[f][tot]
- end
- end
- out
- end
- def gnuplot_item_characteristic_curve(directory, base="crd",options={})
- require 'gnuplot'
-
- crd=item_characteristic_curve
- @ds.fields.each do |f|
- x=[]
- y=[]
- Gnuplot.open do |gp|
- Gnuplot::Plot.new( gp ) do |plot|
- crd[f].sort.each do |tot,prop|
- x.push(tot)
- y.push((prop*100).to_i.to_f/100)
- end
- plot.data << Gnuplot::DataSet.new( [x, y] ) do |ds|
- ds.with = "linespoints"
- ds.notitle
- end
-
- end
- end
- end
- end
- def svggraph_item_characteristic_curve(directory, base="icc",options={})
- require 'statsample/graph/svggraph'
- crd=ItemCharacteristicCurve.new(@ds)
- @ds.fields.each do |f|
- factors=@ds[f].factors.sort
- options={
- :height=>500,
- :width=>800,
- :key=>true
- }.update(options)
- graph = ::SVG::Graph::Plot.new(options)
- factors.each do |factor|
- factor=factor.to_s
- dataset=[]
- crd.curve_field(f, factor).each do |tot,prop|
- dataset.push(tot)
- dataset.push((prop*100).to_i.to_f/100)
- end
- graph.add_data({
- :title=>"#{factor}",
- :data=>dataset
- })
- end
- File.open(directory+"/"+base+"_#{f}.svg","w") {|fp|
- fp.puts(graph.burn())
- }
- end
- end
- def item_total_correlation
- @ds.fields.inject({}) do |a,v|
- vector=@ds[v].dup
- ds2=@ds.dup
- ds2.delete_vector(v)
- total=ds2.vector_sum
- a[v]=Statsample::Bivariate.pearson(vector,total)
- a
- end
- end
- def item_statistics
- @ds.fields.inject({}) do |a,v|
- a[v]={:mean=>@ds[v].mean,:sds=>@ds[v].sds}
- a
- end
- end
- # Returns a dataset with cases ordered by score
- # and variables ordered by difficulty
-
- def item_difficulty_analysis
- dif={}
- @ds.fields.each{|f| dif[f]=@ds[f].mean }
- dif_sort=dif.sort{|a,b| -(a[1]<=>b[1])}
- scores_sort={}
- scores=@ds.vector_mean
- scores.each_index{|i| scores_sort[i]=scores[i] }
- scores_sort=scores_sort.sort{|a,b| a[1]<=>b[1]}
- ds_new=Statsample::Dataset.new(['case','score'] + dif_sort.collect{|a,b| a})
- scores_sort.each do |i,score|
- row=[i, score]
- case_row=@ds.case_as_hash(i)
- dif_sort.each{|variable,dif_value| row.push(case_row[variable]) }
- ds_new.add_case_array(row)
- end
- ds_new.update_valid_data
- ds_new
- end
- def stats_if_deleted
- @ds.fields.inject({}) do |a,v|
- ds2=@ds.dup
- ds2.delete_vector(v)
- total=ds2.vector_sum
- a[v]={}
- a[v][:mean]=total.mean
- a[v][:sds]=total.sds
- a[v][:variance_sample]=total.variance_sample
- a[v][:alpha]=Statsample::Reliability.cronbach_alpha(ds2)
- a
- end
- end
- def summary
- ReportBuilder.new(:no_title=>true).add(self).to_text
- end
- def report_building(builder)
- builder.section(:name=>@name) do |s|
- s.table(:name=>"Summary") do |t|
- t.row ["Items", @ds.fields.size]
- t.row ["Total Mean", @mean]
- t.row ["Total S.D.", @sd]
- t.row ["Total Variance", @variance]
- t.row ["Item Mean", @item_mean]
- t.row ["Median", @median]
- t.row ["Skewness", "%0.4f" % @skew]
- t.row ["Kurtosis", "%0.4f" % @kurtosis]
- t.row ["Valid n", @valid_n]
- t.row ["Cronbach's alpha", "%0.4f" % @alpha]
- t.row ["Standarized Cronbach's alpha", "%0.4f" % @alpha_standarized]
- t.row ["Variances mean", "%g" % @variances_mean]
- t.row ["Covariances mean" , "%g" % @covariances_mean]
- end
-
- itc=item_total_correlation
- sid=stats_if_deleted
- is=item_statistics
-
- s.table(:name=>"Items report", :header=>["item","mean","sd", "mean if deleted", "var if deleted", "sd if deleted"," item-total correl.", "alpha if deleted"]) do |t|
- @ds.fields.each do |f|
- t.row(["#{@ds[f].name}(#{f})", sprintf("%0.5f",is[f][:mean]), sprintf("%0.5f",is[f][:sds]), sprintf("%0.5f",sid[f][:mean]), sprintf("%0.5f",sid[f][:variance_sample]), sprintf("%0.5f",sid[f][:sds]), sprintf("%0.5f",itc[f]), sprintf("%0.5f",sid[f][:alpha])])
- end
- end
- end
- end
- end
- end
- end
+ end # def
+ end # self
+ end # Reliability
+ end # Statsample
+ require 'statsample/reliability/scaleanalysis.rb'
+ require 'statsample/reliability/multiscaleanalysis.rb'
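
Besides renaming alfa_first_derivative, this file now exposes the scale-projection helpers as module methods: cronbach_alpha_from_n_s2_cov computes alpha from the number of items n, the mean of item variances s2 and the mean of inter-item covariances cov, and n_for_desired_alpha runs a bisection search (between 2 and 1000 items) for the n that reaches a target alpha. A rough sketch of calling them, with illustrative (made-up) variance and covariance means:

    require 'statsample'

    s2  = 1.50   # mean of item variances (illustrative)
    cov = 0.25   # mean of inter-item covariances (illustrative)

    # Alpha expected for a 10-item scale with these item characteristics
    a10 = Statsample::Reliability.cronbach_alpha_from_n_s2_cov(10, s2, cov)

    # Smallest number of comparable items needed to reach alpha = 0.9
    n90 = Statsample::Reliability.n_for_desired_alpha(0.9, s2, cov)

    puts "alpha(10 items) = #{'%0.4f' % a10}, items for alpha 0.9 = #{n90}"

ScaleAnalysis#report_building (added later in this diff) uses the same call to report how many items would be needed for alpha of 0.8 and 0.9.
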
@@ -0,0 +1,87 @@
+ module Statsample
+ module Reliability
+ # DSL for analysis of multiple scales analysis. Analoge of Scale Reliability analysis on SPSS.
+ # Returns several statistics for complete scale and each item
+ # == Usage
+ # @x1=[1,1,1,1,2,2,2,2,3,3,3,30].to_vector(:scale)
+ # @x2=[1,1,1,2,2,3,3,3,3,4,4,50].to_vector(:scale)
+ # @x3=[2,2,1,1,1,2,2,2,3,4,5,40].to_vector(:scale)
+ # @x4=[1,2,3,4,4,4,4,3,4,4,5,30].to_vector(:scale)
+ # ds={'x1'=>@x1,'x2'=>@x2,'x3'=>@x3,'x4'=>@x4}.to_dataset
+ # msa=Statsample::Reliability::MultiScaleAnalysis.new(:name=>"Scales") do |m|
+ # m.scale :s1, "Section 1", ds.clone(%w{x1 x2})
+ # m.scale :s2, "Section 2", ds.clone(%w{x3 x4})
+ # m.correlation_matrix
+ # m.factor_analysis
+ # end
+ # puts msa.summary
+ class MultiScaleAnalysis
+ include Statsample::Summarizable
+ attr_reader :scales
+ attr_accessor :name
+ attr_accessor :summary_correlation_matrix
+ attr_accessor :summary_pca
+ attr_accessor :pca_options
+ def initialize(opts=Hash.new, &block)
+ @scales=Hash.new
+ opts_default={ :name=>_("Multiple Scale analysis"),
+ :summary_correlation_matrix=>false,
+ :summary_pca=>false,
+ :pca_options=>Hash.new}
+ @opts=opts_default.merge(opts)
+ @opts.each{|k,v|
+ self.send("#{k}=",v) if self.respond_to? k
+ }
+
+ if block
+ block.arity<1 ? instance_eval(&block) : block.call(self)
+ end
+ end
+ def scale(code,ds=nil, opts=nil)
+ if ds.nil?
+ @scales[code]
+ else
+ opts={:name=>_("Scale %s") % code} if opts.nil?
+ @scales[code]=ScaleAnalysis.new(ds, opts)
+ end
+ end
+ def delete_scale(code)
+ @scales.delete code
+ end
+ def pca(opts=Hash.new)
+ Statsample::Factor::PCA.new(correlation_matrix,opts)
+ end
+ def factor_analysis(opts=nil)
+ opts||=pca_options
+ Statsample::Factor::FactorAnalysis.new(correlation_matrix,opts)
+ end
+
+ def correlation_matrix
+ vectors=Hash.new
+ @scales.each_pair do |code,scale|
+ vectors[code.to_s]=scale.ds.vector_sum
+ end
+ Statsample::Bivariate.correlation_matrix(vectors.to_dataset)
+ end
+ def report_building(b)
+ b.section(:name=>name) do |s|
+ s.section(:name=>_("Reliability analysis of scales")) do |s2|
+ @scales.each_pair do |k,scale|
+ s2.parse_element(scale)
+ end
+ end
+ if summary_correlation_matrix
+ s.section(:name=>_("Correlation matrix for %s") % name) do |s2|
+ s2.parse_element(correlation_matrix)
+ end
+ end
+ if summary_pca
+ s.section(:name=>_("PCA for %s") % name) do |s2|
+ s2.parse_element(pca)
+ end
+ end
+ end
+ end
+ end
+ end
+ end
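
In addition to the block DSL shown in the class's usage comment, scale doubles as setter and getter, so individual ScaleAnalysis objects can be pulled back out, replaced, or removed. A brief sketch (ds stands for a dataset with fields x1..x4 as in the usage comment; the option hash is optional):

    msa = Statsample::Reliability::MultiScaleAnalysis.new(:name => 'Scales')
    msa.scale(:s1, ds.clone(%w{x1 x2}), :name => 'Section 1')  # build and store a ScaleAnalysis
    puts msa.scale(:s1).alpha     # getter form: retrieve the stored analysis
    msa.delete_scale(:s1)         # drop it again
    puts msa.summary              # text report via Statsample::Summarizable
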
@@ -0,0 +1,204 @@
+ module Statsample
+ module Reliability
+ # Analysis of a Scale. Analoge of Scale Reliability analysis on SPSS.
+ # Returns several statistics for complete scale and each item
+ # == Usage
+ # @x1=[1,1,1,1,2,2,2,2,3,3,3,30].to_vector(:scale)
+ # @x2=[1,1,1,2,2,3,3,3,3,4,4,50].to_vector(:scale)
+ # @x3=[2,2,1,1,1,2,2,2,3,4,5,40].to_vector(:scale)
+ # @x4=[1,2,3,4,4,4,4,3,4,4,5,30].to_vector(:scale)
+ # ds={'x1'=>@x1,'x2'=>@x2,'x3'=>@x3,'x4'=>@x4}.to_dataset
+ # ia=Statsample::Reliability::ScaleAnalysis.new(ds)
+ # puts ia.summary
+ class ScaleAnalysis
+ include Summarizable
+ attr_reader :ds,:mean, :sd,:valid_n, :alpha , :alpha_standarized, :variances_mean, :covariances_mean
+ attr_accessor :name
+ def initialize(ds, opts=Hash.new)
+ @ds=ds.dup_only_valid
+ @k=@ds.fields.size
+ @total=@ds.vector_sum
+ @item_mean=@ds.vector_mean.mean
+ @mean=@total.mean
+ @median=@total.median
+ @skew=@total.skew
+ @kurtosis=@total.kurtosis
+ @sd = @total.sd
+ @variance=@total.variance
+ @valid_n = @total.size
+ opts_default={:name=>"Reliability Analisis"}
+ @opts=opts_default.merge(opts)
+ @name=@opts[:name]
+ # Mean for covariances and variances
+ @variances=@ds.fields.map {|f| @ds[f].variance}.to_scale
+ @variances_mean=@variances.mean
+ @covariances_mean=(@variance-@variances.sum).quo(@k**2-@k)
+ begin
+ @alpha = Statsample::Reliability.cronbach_alpha(ds)
+ @alpha_standarized = Statsample::Reliability.cronbach_alpha_standarized(ds)
+ rescue => e
+ raise DatasetException.new(@ds,e), "Error calculating alpha"
+ end
+ end
+ # Returns a hash with structure
+ def item_characteristic_curve
+ i=0
+ out={}
+ total={}
+ @ds.each do |row|
+ tot=@total[i]
+ @ds.fields.each do |f|
+ out[f]||= {}
+ total[f]||={}
+ out[f][tot]||= 0
+ total[f][tot]||=0
+ out[f][tot]+= row[f]
+ total[f][tot]+=1
+ end
+ i+=1
+ end
+ total.each do |f,var|
+ var.each do |tot,v|
+ out[f][tot]=out[f][tot].to_f / total[f][tot]
+ end
+ end
+ out
+ end
+ def gnuplot_item_characteristic_curve(directory, base="crd",options={})
+ require 'gnuplot'
+
+ crd=item_characteristic_curve
+ @ds.fields.each do |f|
+ x=[]
+ y=[]
+ Gnuplot.open do |gp|
+ Gnuplot::Plot.new( gp ) do |plot|
+ crd[f].sort.each do |tot,prop|
+ x.push(tot)
+ y.push((prop*100).to_i.to_f/100)
+ end
+ plot.data << Gnuplot::DataSet.new( [x, y] ) do |ds|
+ ds.with = "linespoints"
+ ds.notitle
+ end
+
+ end
+ end
+ end
+ end
+ def svggraph_item_characteristic_curve(directory, base="icc",options={})
+ require 'statsample/graph/svggraph'
+ crd=ItemCharacteristicCurve.new(@ds)
+ @ds.fields.each do |f|
+ factors=@ds[f].factors.sort
+ options={
+ :height=>500,
+ :width=>800,
+ :key=>true
+ }.update(options)
+ graph = ::SVG::Graph::Plot.new(options)
+ factors.each do |factor|
+ factor=factor.to_s
+ dataset=[]
+ crd.curve_field(f, factor).each do |tot,prop|
+ dataset.push(tot)
+ dataset.push((prop*100).to_i.to_f/100)
+ end
+ graph.add_data({
+ :title=>"#{factor}",
+ :data=>dataset
+ })
+ end
+ File.open(directory+"/"+base+"_#{f}.svg","w") {|fp|
+ fp.puts(graph.burn())
+ }
+ end
+ end
+ def item_total_correlation
+ @ds.fields.inject({}) do |a,v|
+ vector=@ds[v].dup
+ ds2=@ds.dup
+ ds2.delete_vector(v)
+ total=ds2.vector_sum
+ a[v]=Statsample::Bivariate.pearson(vector,total)
+ a
+ end
+ end
+ def item_statistics
+ @ds.fields.inject({}) do |a,v|
+ a[v]={:mean=>@ds[v].mean,:sds=>@ds[v].sds}
+ a
+ end
+ end
+ # Returns a dataset with cases ordered by score
+ # and variables ordered by difficulty
+
+ def item_difficulty_analysis
+ dif={}
+ @ds.fields.each{|f| dif[f]=@ds[f].mean }
+ dif_sort=dif.sort{|a,b| -(a[1]<=>b[1])}
+ scores_sort={}
+ scores=@ds.vector_mean
+ scores.each_index{|i| scores_sort[i]=scores[i] }
+ scores_sort=scores_sort.sort{|a,b| a[1]<=>b[1]}
+ ds_new=Statsample::Dataset.new(['case','score'] + dif_sort.collect{|a,b| a})
+ scores_sort.each do |i,score|
+ row=[i, score]
+ case_row=@ds.case_as_hash(i)
+ dif_sort.each{|variable,dif_value| row.push(case_row[variable]) }
+ ds_new.add_case_array(row)
+ end
+ ds_new.update_valid_data
+ ds_new
+ end
+ def stats_if_deleted
+ @ds.fields.inject({}) do |a,v|
+ ds2=@ds.dup
+ ds2.delete_vector(v)
+ total=ds2.vector_sum
+ a[v]={}
+ a[v][:mean]=total.mean
+ a[v][:sds]=total.sds
+ a[v][:variance_sample]=total.variance_sample
+ a[v][:alpha]=Statsample::Reliability.cronbach_alpha(ds2)
+ a
+ end
+ end
+ def report_building(builder)
+ builder.section(:name=>@name) do |s|
+ s.table(:name=>_("Summary for %s") % @name) do |t|
+ t.row [_("Items"), @ds.fields.size]
+ t.row [_("Sum mean"), @mean]
+ t.row [_("Sum sd"), @sd]
+ t.row [_("Sum variance"), @variance]
+ t.row [_("Sum median"), @median]
+ t.hr
+ t.row [_("Item mean"), @item_mean]
+ t.row [_("Skewness"), "%0.4f" % @skew]
+ t.row [_("Kurtosis"), "%0.4f" % @kurtosis]
+ t.hr
+ t.row [_("Valid n"), @valid_n]
+ t.row [_("Cronbach's alpha"), "%0.4f" % @alpha]
+ t.row [_("Standarized Cronbach's alpha"), "%0.4f" % @alpha_standarized]
+ t.hr
+ t.row [_("Variances mean"), "%g" % @variances_mean]
+ t.row [_("Covariances mean") , "%g" % @covariances_mean]
+ end
+ s.text _("items for obtain alpha(0.8) : %d" % Statsample::Reliability::n_for_desired_alpha(0.8, @variances_mean,@covariances_mean))
+ s.text _("items for obtain alpha(0.9) : %d" % Statsample::Reliability::n_for_desired_alpha(0.9, @variances_mean,@covariances_mean))
+ itc=item_total_correlation
+ sid=stats_if_deleted
+ is=item_statistics
+
+
+
+ s.table(:name=>_("Items report for %s") % @name, :header=>["item","mean","sd", "mean if deleted", "var if deleted", "sd if deleted"," item-total correl.", "alpha if deleted"]) do |t|
+ @ds.fields.each do |f|
+ t.row(["#{@ds[f].name}(#{f})", sprintf("%0.5f",is[f][:mean]), sprintf("%0.5f",is[f][:sds]), sprintf("%0.5f",sid[f][:mean]), sprintf("%0.5f",sid[f][:variance_sample]), sprintf("%0.5f",sid[f][:sds]), sprintf("%0.5f",itc[f]), sprintf("%0.5f",sid[f][:alpha])])
+ end # end each
+ end # table
+ end # section
+ end # def
+ end # class
+ end # module
+ end # module
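
For scripted use, the per-item statistics behind the report are also available directly as hashes keyed by field name. A short sketch, reusing a dataset ds like the one in the usage comment above:

    ia = Statsample::Reliability::ScaleAnalysis.new(ds, :name => 'My scale')

    puts ia.alpha             # Cronbach's alpha for the whole scale
    puts ia.variances_mean    # mean of item variances
    puts ia.covariances_mean  # mean of inter-item covariances

    itc = ia.item_total_correlation  # {field => r between item and sum of the other items}
    sid = ia.stats_if_deleted        # {field => {:mean, :sds, :variance_sample, :alpha}}
    itc.each do |f, r|
      puts "#{f}: item-total r=#{'%0.3f' % r}, alpha if deleted=#{'%0.3f' % sid[f][:alpha]}"
    end
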