statsample 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (59) hide show
  1. data/History.txt +79 -0
  2. data/Manifest.txt +56 -0
  3. data/README.txt +77 -0
  4. data/Rakefile +22 -0
  5. data/bin/statsample +2 -0
  6. data/demo/benchmark.rb +52 -0
  7. data/demo/chi-square.rb +44 -0
  8. data/demo/dice.rb +13 -0
  9. data/demo/distribution_t.rb +95 -0
  10. data/demo/graph.rb +9 -0
  11. data/demo/item_analysis.rb +30 -0
  12. data/demo/mean.rb +81 -0
  13. data/demo/proportion.rb +57 -0
  14. data/demo/sample_test.csv +113 -0
  15. data/demo/strata_proportion.rb +152 -0
  16. data/demo/stratum.rb +141 -0
  17. data/lib/spss.rb +131 -0
  18. data/lib/statsample.rb +216 -0
  19. data/lib/statsample/anova.rb +74 -0
  20. data/lib/statsample/bivariate.rb +255 -0
  21. data/lib/statsample/chidistribution.rb +39 -0
  22. data/lib/statsample/codification.rb +120 -0
  23. data/lib/statsample/converters.rb +338 -0
  24. data/lib/statsample/crosstab.rb +122 -0
  25. data/lib/statsample/dataset.rb +526 -0
  26. data/lib/statsample/dominanceanalysis.rb +259 -0
  27. data/lib/statsample/dominanceanalysis/bootstrap.rb +126 -0
  28. data/lib/statsample/graph/gdchart.rb +45 -0
  29. data/lib/statsample/graph/svgboxplot.rb +108 -0
  30. data/lib/statsample/graph/svggraph.rb +181 -0
  31. data/lib/statsample/graph/svghistogram.rb +208 -0
  32. data/lib/statsample/graph/svgscatterplot.rb +111 -0
  33. data/lib/statsample/htmlreport.rb +232 -0
  34. data/lib/statsample/multiset.rb +281 -0
  35. data/lib/statsample/regression.rb +522 -0
  36. data/lib/statsample/reliability.rb +235 -0
  37. data/lib/statsample/resample.rb +20 -0
  38. data/lib/statsample/srs.rb +159 -0
  39. data/lib/statsample/test.rb +25 -0
  40. data/lib/statsample/vector.rb +759 -0
  41. data/test/_test_chart.rb +58 -0
  42. data/test/test_anova.rb +31 -0
  43. data/test/test_codification.rb +59 -0
  44. data/test/test_crosstab.rb +55 -0
  45. data/test/test_csv.csv +7 -0
  46. data/test/test_csv.rb +27 -0
  47. data/test/test_dataset.rb +293 -0
  48. data/test/test_ggobi.rb +42 -0
  49. data/test/test_multiset.rb +98 -0
  50. data/test/test_regression.rb +108 -0
  51. data/test/test_reliability.rb +32 -0
  52. data/test/test_resample.rb +23 -0
  53. data/test/test_srs.rb +14 -0
  54. data/test/test_statistics.rb +152 -0
  55. data/test/test_stratified.rb +19 -0
  56. data/test/test_svg_graph.rb +63 -0
  57. data/test/test_vector.rb +265 -0
  58. data/test/test_xls.rb +32 -0
  59. metadata +158 -0
data/demo/stratum.rb ADDED
@@ -0,0 +1,141 @@
1
+ require File.dirname(__FILE__)+"/../lib/statsample"
2
+ require 'statsample/multiset'
3
+ require 'statsample/srs'
4
+ require 'statsample/resample'
5
+ require 'gnuplot'
6
+
7
# Monte Carlo demo: compares the theoretical standard error of the mean with
# the spread observed over repeated samples, for simple random sampling and
# for a two-stratum design, each with and without replacement.
tests = 3000
sample_size = 50
per_stratum = sample_size / 2

# Two synthetic strata with very different scales.
a = [10] * 50 + [12] * 10 + [14] * 20 + [16] * 10 + [19] * 10
b = [11000] * 50 + [11050] * 10 + [11100] * 20 + [11300] * 10 + [11240] * 10
a_size = a.size
b_size = b.size
av = a.to_vector(:scale)
bv = b.to_vector(:scale)

# Wraps a vector in a one-field dataset named 'data'.
as_dataset = lambda { |vector| {'data' => vector}.to_dataset }

# Builds a stratified sample from one dataset per stratum and the given
# stratum sizes.
stratify = lambda { |a_ds, b_ds, a_n, b_n|
  multiset = Statsample::Multiset.new(['data'])
  multiset.add_dataset('a', a_ds)
  multiset.add_dataset('b', b_ds)
  Statsample::StratifiedSample.new(multiset, {'a' => a_n, 'b' => b_n})
}

ads = as_dataset.call(a.to_vector(:scale))
bds = as_dataset.call(b.to_vector(:scale))
ss = stratify.call(ads, bds, a.size, b.size)

es = [
  {'N' => a_size, 'n' => per_stratum, 's' => av.standard_deviation_population},
  {'N' => b_size, 'n' => per_stratum, 's' => bv.standard_deviation_population}
]

sd_estimated_wr = Statsample::StratifiedSample.standard_error_ksd_wr(es)
sd_estimated_wor = Statsample::StratifiedSample.standard_error_ksd_wor(es)

pop = (a + b).to_vector(:scale)
s = pop.standard_deviation_population

# Known-parameter (theoretical) figures.
puts "-------------"
puts "Estadísticos:"
puts "Mean:#{pop.mean}"
puts "SD:#{s}"
puts "EE con reemplazo:#{Statsample::SRS.standard_error_ksd_wr(s, sample_size, pop.size)}"
puts "EE sin reemplazo:#{Statsample::SRS.standard_error_ksd_wor(s, sample_size, pop.size)}"
puts "EE estratified con reemplazo:#{sd_estimated_wr}"
puts "EE estratified sin reemplazo:#{sd_estimated_wor}"

# Per-iteration standard error estimates, filled in by the simulations below.
sd_with = []
sd_without = []
sd_strat_wr = []
sd_strat_wor = []

# Simple random sampling, with replacement.
monte_with = Statsample::Resample.repeat_and_save(tests) {
  drawn = pop.sample_with_replacement(sample_size)
  sd_with.push(Statsample::SRS.standard_error_esd_wr(drawn.sds, sample_size, pop.size))
  drawn.mean
}

# Simple random sampling, without replacement.
monte_without = Statsample::Resample.repeat_and_save(tests) {
  drawn = pop.sample_without_replacement(sample_size)
  sd_without.push(Statsample::SRS.standard_error_esd_wor(drawn.sds, sample_size, pop.size))
  drawn.mean
}

# Stratified sampling, without replacement (stratum 'a' is drawn first).
stratum_wor = Statsample::Resample.repeat_and_save(tests) {
  a_sample = as_dataset.call(av.sample_without_replacement(per_stratum))
  b_sample = as_dataset.call(bv.sample_without_replacement(per_stratum))
  strata = stratify.call(a_sample, b_sample, a_size, b_size)
  sd_strat_wor.push(strata.standard_error_wor('data'))
  strata.mean('data')
}.to_vector(:scale)

# Stratified sampling, with replacement (stratum 'a' is drawn first).
stratum_wr = Statsample::Resample.repeat_and_save(tests) {
  a_sample = as_dataset.call(av.sample_with_replacement(per_stratum))
  b_sample = as_dataset.call(bv.sample_with_replacement(per_stratum))
  strata = stratify.call(a_sample, b_sample, a_size, b_size)
  sd_strat_wr.push(strata.standard_error_wr('data'))
  strata.mean('data')
}.to_vector(:scale)

v_sd_with = sd_with.to_vector(:scale)
v_sd_without = sd_without.to_vector(:scale)
v_sd_strat_wr = sd_strat_wr.to_vector(:scale)
v_sd_strat_wor = sd_strat_wor.to_vector(:scale)

v_with = monte_with.to_vector(:scale)
v_without = monte_without.to_vector(:scale)

# Prints one result section: the observed mean and spread of the sample means
# next to the average of the per-sample standard error estimates.
report = lambda { |title, means, estimates|
  puts "=============="
  puts title
  puts "Mean:#{means.mean}"
  puts "Sd:#{means.sds}"
  puts "Sd (estimated):#{estimates.mean}"
}

report.call("Con reemplazo", v_with, v_sd_with)
report.call("Sin reemplazo", v_without, v_sd_without)
report.call("Estratificado Con reemplazo", stratum_wr, v_sd_strat_wr)
report.call("Estratificado Sin reemplazo", stratum_wor, v_sd_strat_wor)

p v_without.plot_histogram

=begin



x=[]
y=[]
y2=[]
prev=0
prev_chi=0
v.frequencies.sort.each{|k,v1|
x.push(k)
y.push(prev+v1)
prev=prev+v1
}
GSL::graph(GSL::Vector.alloc(x), GSL::Vector.alloc(y))
=end
data/lib/spss.rb ADDED
@@ -0,0 +1,131 @@
1
+ # = spss.rb -
2
+ #
3
+ # Provides utilities for working with SPSS files
4
+ #
5
+ # Copyright (C) 2009 Claudio Bustos
6
+ #
7
+ # Claudio Bustos mailto:clbustos@gmail.com
8
+
9
# Builders for SPSS data dictionaries, renderable either as dictionary XML
# (+to_xml+) or as SPSS command syntax (+to_spss+).
module SPSS
  module Dictionary
    # Common base: holds an ordered list of child elements and applies
    # configuration hashes through attribute writers.
    class Element
      # Appends a child element.
      def add(a)
        @elements.push(a)
      end

      # Renders every child by calling +func+ on it; each rendering is
      # prefixed with one space and the results are joined with newlines.
      def parse_elements(func = :to_s)
        @elements.collect { |e| " " + e.send(func) }.join("\n")
      end

      # Assigns each +key+ => +value+ pair through the matching writer
      # (<tt>key=</tt>). Keys without a public writer are ignored.
      #
      # The writer is looked up with respond_to? because +methods+ returns
      # symbols on Ruby >= 1.9, so comparing against a String never matched
      # and no default or user setting was ever applied.
      def init_with(config)
        config.each do |key, value|
          writer = "#{key}="
          send(writer, value) if respond_to?(writer)
        end
      end

      # +config+ is stored as given; subclasses apply it with init_with.
      def initialize(config = {})
        @config = config
        @elements = []
      end
    end

    # Root of the dictionary; its children are usually Variable objects.
    class Dictionary < Element
      attr_accessor :locale, :date_time, :row_count

      # Applies the defaults first, then +config+ on top of them.
      def initialize(config = {})
        super
        init_with({
          :locale => "en_US",
          :date_time => Time.new.strftime("%Y-%m-%dT%H:%M:%S"),
          :row_count => 1
        })
        init_with config
      end

      # XML rendering: a <dictionary> element wrapping the children's XML.
      def to_xml
        "<dictionary locale='#{@locale}' creationDateTime='#{@date_time}' rowCount='#{@row_count}' xmlns='http://xml.spss.com/spss/data'>\n" +
          parse_elements(:to_xml) + "\n</dictionary>"
      end

      # SPSS syntax rendering: the children's syntax, one after another.
      def to_spss
        parse_elements(:to_spss)
      end
    end

    # A single user-missing value, optionally marked as a range bound.
    class MissingValue < Element
      attr_accessor :data, :type, :from, :to

      # +type+ must be nil, "lowerBound" or "upperBound".
      #
      # Raises ArgumentError for any other +type+.
      def initialize(data, type = nil)
        super() # set up @elements so the inherited add/parse_elements work
        unless type.nil? || type == "lowerBound" || type == "upperBound"
          raise ArgumentError, "Incorrect value for type"
        end
        @data = data
        @type = type
      end

      def to_xml
        "<missingValue data='#{@data}' " + (type.nil? ? "" : "type='#{type}'") + "/>"
      end
    end

    # Value labels for one variable, stored as a hash of coded value => label.
    class LabelSet
      attr_accessor :labels

      def initialize(labels)
        @labels = labels
      end

      # XML rendering for the variable called +name+. The hash key is the
      # coded value and the hash value is its label, matching parse_spss.
      def parse_xml(name)
        "<valueLabelSet>\n " +
          @labels.collect { |value, label| "<valueLabel label='#{label}' value='#{value}' />" }.join("\n ") +
          "\n <valueLabelVariable name='#{name}' />\n</valueLabelSet>"
      end

      # SPSS syntax rendering: one <tt>value 'label'</tt> pair per line.
      def parse_spss
        @labels.collect { |value, label| "#{value} '#{label}'" }.join("\n ")
      end
    end

    # One dictionary variable. Instances created without a name or label get
    # numbered defaults ("var1"/"Variable 1", "var2"/"Variable 2", ...).
    class Variable < Element
      # NOTE: the attribute is spelled +aligment+ (sic); the name is kept
      # because it is part of the public interface.
      attr_accessor :aligment, :display_width, :label, :measurement_level, :name, :type, :decimals, :width, :type_format, :labelset, :missing_values

      # Applies the defaults first, then +config+ on top of them.
      def initialize(config = {})
        super
        @@var_number ||= 1
        # Must be set before +config+ is applied, otherwise a configured
        # :missing_values would be thrown away.
        @missing_values = []
        init_with({
          :aligment => "left",
          :display_width => 8,
          :label => "Variable #{@@var_number}",
          :measurement_level => "SCALE",
          :name => "var#{@@var_number}",
          :type => 0,
          :decimals => 2,
          :width => 10,
          :type_format => "F",
          :labelset => nil
        })
        init_with config
        @@var_number += 1
      end

      # XML rendering: the <variable> element, followed by the value label
      # set when one is attached.
      #
      # NOTE(review): the attribute is emitted as 'aligment'; confirm the
      # spelling against the SPSS dictionary XML schema before relying on it.
      def to_xml
        labelset_s = @labelset.nil? ? "" : "\n" + @labelset.parse_xml(@name)
        missing_values = @missing_values.size > 0 ? @missing_values.collect { |m| m.to_xml }.join("\n") : ""
        "<variable aligment='#{@aligment}' displayWidth='#{@display_width}' label='#{@label}' measurementLevel='#{@measurement_level}' name='#{@name}' type='#{@type}'>\n<variableFormat decimals='#{@decimals}' width='#{@width}' type='#{@type_format}' />\n" +
          parse_elements(:to_xml) + missing_values + "</variable>" + labelset_s
      end

      # SPSS syntax rendering: one command per line, each terminated by
      # " ." and a newline so consecutive commands never run together.
      def to_spss
        commands = [
          "VARIABLE LABELS #{@name} '#{label}' .",
          "VARIABLE ALIGNMENT #{@name} (#{@aligment.upcase}) .",
          "VARIABLE WIDTH #{@name} (#{@display_width}) .",
          "VARIABLE LEVEL #{@name} (#{@measurement_level.upcase}) ."
        ]
        unless @labelset.nil?
          commands << "VALUE LABELS #{@name} " + labelset.parse_spss + " ."
        end
        if @missing_values.size > 0
          commands << "MISSING VALUES #{@name} (" + @missing_values.collect { |m| m.data }.join(",") + ") ."
        end
        commands.join("\n") + "\n"
      end
    end
  end
end
119
# Demo: builds a two-variable dictionary and writes its SPSS syntax to
# "dic_spss.sps" in the current directory.
#
# Guarded so it only runs when this file is executed directly; without the
# guard every `require` of this library wrote that file as a side effect.
if __FILE__ == $0
  n = SPSS::Dictionary::Dictionary.new
  ls = SPSS::Dictionary::LabelSet.new({1 => "Si", 2 => "No"})
  var1 = SPSS::Dictionary::Variable.new
  var1.labelset = ls
  mv1 = SPSS::Dictionary::MissingValue.new("-99")
  var2 = SPSS::Dictionary::Variable.new
  n.add(var1)
  n.add(var2)
  var2.missing_values = [mv1]

  File.open("dic_spss.sps", "wb") { |f|
    f.puts n.to_spss
  }
end
data/lib/statsample.rb ADDED
@@ -0,0 +1,216 @@
1
+ # = statsample.rb -
2
+
3
+ # Process files and databases for statistical purposes, with focus on
4
+ # estimation of parameters for several types of samples (simple random,
5
+ # stratified and multistage sampling).
6
+ #
7
+ # Copyright (C) 2008-2009 Claudio Bustos
8
+ #
9
+ # Claudio Bustos mailto:clbustos_AT_gmail.com
10
+
11
+ # :stopdoc:
12
+
13
+ $:.unshift(File.dirname(__FILE__))
14
+ $:.unshift(File.expand_path(File.dirname(__FILE__)+"/../ext"))
15
+
16
+ require 'delegate'
17
+ require 'matrix'
18
+
19
+
20
# Core extension: gives every Numeric a squaring helper.
class Numeric
  # Returns the receiver multiplied by itself.
  def square
    self * self
  end
end
23
+
24
+
25
# Packs a test definition into a three-element array.
#
# The first argument is the description, the remaining arguments are the
# fields, and the block given to the call is the test body.
#
# Returns [description, fields, block].
# Raises ArgumentError when no block is given.
def create_test(*args, &proc)
  # Block-less Proc.new no longer captures the caller's block (it raises on
  # Ruby >= 3.0), so use the block already bound to +proc+ and keep the
  # historical error for the missing-block case.
  raise ArgumentError, "tried to create Proc object without a block" if proc.nil?
  description = args.shift
  [description, args, proc]
end
30
+
31
+ # Process files and databases for statistical purposes, with focus on
32
+ # estimation of parameters for several types of samples (simple random,
33
+ # stratified and multistage sampling).
34
+
35
# Optional dependencies: each flag records whether the library could be
# loaded; a LoadError is not fatal.

# true when 'rbgsl' could be required.
HAS_GSL = begin
  require 'rbgsl'
  true
rescue LoadError
  false
end

# true when 'alglib' could be required (constant name spelled as in the
# original source).
HAS_ALGIB = begin
  require 'alglib'
  true
rescue LoadError
  false
end

# When 'statsample/optimization' cannot be required, record that in
# Statsample::OPTIMIZED.
begin
  require 'statsample/optimization'
rescue LoadError
  module Statsample
    OPTIMIZED = false
  end
end
56
+
57
+ #
58
+ # :startdoc:
59
+ #
60
# Namespace of the library: version constants, lazily loaded components,
# Marshal-based persistence helpers and small report-building utilities.
module Statsample
  VERSION = '0.3.0'
  SPLIT_TOKEN = ","
  autoload(:Database, 'statsample/converters')
  autoload(:Anova, 'statsample/anova')
  autoload(:CSV, 'statsample/converters')
  autoload(:Excel, 'statsample/converters')
  autoload(:GGobi, 'statsample/converters')
  autoload(:DominanceAnalysis, 'statsample/dominanceanalysis')
  autoload(:HtmlReport, 'statsample/htmlreport')
  autoload(:Mx, 'statsample/converters')
  autoload(:Resample, 'statsample/resample')
  autoload(:SRS, 'statsample/srs')
  autoload(:Codification, 'statsample/codification')
  autoload(:Reliability, 'statsample/reliability')
  autoload(:Bivariate, 'statsample/bivariate')
  autoload(:Multivariate, 'statsample/multivariate')

  autoload(:Regression, 'statsample/regression')
  autoload(:Test, 'statsample/test')

  # Restores and returns the object stored in +filename+ by Writable#save.
  #
  # The file is read in binary mode and closed even when loading fails.
  #
  # SECURITY: Marshal.load can instantiate arbitrary objects; only load
  # files that come from a trusted source.
  def self.load(filename)
    File.open(filename, "rb") { |fp| Marshal.load(fp) }
  end

  module Util
    # Uniform order statistic median for position +i+ (1-based) out of +n+.
    #
    # Reference: http://www.itl.nist.gov/div898/handbook/eda/section3/normprpl.htm
    def normal_order_statistic_medians(i, n)
      # Median of the largest order statistic. Computed directly rather than
      # by recursion, which never terminated for i == n == 1.
      last = 0.5**(1 / n.to_f)
      if i == 1
        1.0 - last
      elsif i == n
        last
      else
        (i - 0.3175) / (n + 0.365)
      end
    end
  end

  # Mixin that lets an object persist itself with Marshal.
  module Writable
    # Dumps the receiver to +filename+ (binary mode). The file is closed
    # even when dumping fails. Returns nil.
    def save(filename)
      File.open(filename, "wb") { |fp| Marshal.dump(self, fp) }
      nil
    end
  end

  # Mixin for objects that respond to <tt><<</tt>: appends HTML fragments.
  module HtmlSummary
    # Appends a horizontal rule; +n+ is accepted and ignored.
    def add_line(n = nil)
      self << "<hr />"
    end

    # Appends a line break.
    def nl
      self << "<br />"
    end

    # Appends +text+ as a paragraph, turning newlines into line breaks.
    def add(text)
      self << ("<p>" + text.gsub("\n", "<br />") + "</p>")
    end

    # Appends the HTML rendering of +table+.
    def parse_table(table)
      self << table.parse_html
    end
  end

  # Mixin for objects that respond to <tt><<</tt>: appends plain text.
  module ConsoleSummary
    # Appends a rule made of +n+ dashes followed by a newline.
    def add_line(n = 80)
      self << "-" * n + "\n"
    end

    # Appends a newline.
    def nl
      self << "\n"
    end

    # Appends +text+ unchanged.
    def add(text)
      self << text
    end

    # Appends the console rendering of +table+.
    def parse_table(table)
      self << table.parse_console
    end
  end

  # A table with a header row that renders as aligned console text or HTML.
  class ReportTable
    attr_reader :header

    # +header+ is an array of column titles.
    def initialize(header = [])
      @rows = []
      @max_cols = []
      # Go through the writer so header cells count towards column widths;
      # otherwise a header wider than every data cell produced a negative
      # padding length when rendering.
      self.header = header
    end

    # Appends a data row (an array of cells).
    def add_row(row)
      update_widths(row)
      @rows.push(row)
    end

    # Appends a separator before the next data row.
    def add_horizontal_line
      @rows.push(:hr)
    end

    # Replaces the header row.
    def header=(h)
      update_widths(h)
      @header = h
    end

    # Renders one row padded to the column widths; a missing cell becomes
    # blanks.
    def parse_console_row(row)
      out = "| "
      @max_cols.each_index { |i|
        if row[i].nil?
          out << " " * (@max_cols[i] + 2) + "|"
        else
          t = row[i].to_s
          out << " " + t + " " * (@max_cols[i] - t.size + 1) + "|"
        end
      }
      out << "\n"
      out
    end

    # Renders a rule exactly as wide as a row: every column takes its width
    # plus 3 characters, and the row starts with "| ".
    def parse_console_hr
      "-" * (@max_cols.inject(0) { |a, v| a + v + 3 } + 2) + "\n"
    end

    # Renders the whole table as text framed by rules.
    def parse_console
      out = "\n"
      out << parse_console_hr
      out << parse_console_row(header)
      out << parse_console_hr

      @rows.each { |row|
        if row == :hr
          out << parse_console_hr
        else
          out << parse_console_row(row)
        end
      }
      out << parse_console_hr

      out
    end

    # Renders the table as HTML. A row that follows a separator gets
    # class='line'. Cells are inserted as given (no HTML escaping).
    def parse_html
      out = "<table>\n"
      if header.size > 0
        out << "<thead><tr><th>" + header.join("</th><th>") + "</th></tr></thead>\n"
      end
      out << "<tbody>\n"
      row_with_line = false
      @rows.each { |row|
        if row == :hr
          row_with_line = true
        else
          out << "<tr class='" + (row_with_line ? 'line' : '') + "'><td>"
          out << row.join("</td><td>") + "</td>"
          out << "</tr>\n"
          row_with_line = false
        end
      }
      out << "</tbody></table>\n"
      out
    end

    private

    # Widens each column so it can hold the matching cell of +cells+.
    def update_widths(cells)
      cells.each_index { |i|
        size = cells[i].to_s.size
        @max_cols[i] = size if @max_cols[i].nil? or size > @max_cols[i]
      }
    end
  end
end
212
+
213
+ require 'statsample/vector'
214
+ require 'statsample/dataset'
215
+ require 'statsample/crosstab'
216
+