statsample 0.3.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (59) hide show
  1. data/History.txt +79 -0
  2. data/Manifest.txt +56 -0
  3. data/README.txt +77 -0
  4. data/Rakefile +22 -0
  5. data/bin/statsample +2 -0
  6. data/demo/benchmark.rb +52 -0
  7. data/demo/chi-square.rb +44 -0
  8. data/demo/dice.rb +13 -0
  9. data/demo/distribution_t.rb +95 -0
  10. data/demo/graph.rb +9 -0
  11. data/demo/item_analysis.rb +30 -0
  12. data/demo/mean.rb +81 -0
  13. data/demo/proportion.rb +57 -0
  14. data/demo/sample_test.csv +113 -0
  15. data/demo/strata_proportion.rb +152 -0
  16. data/demo/stratum.rb +141 -0
  17. data/lib/spss.rb +131 -0
  18. data/lib/statsample.rb +216 -0
  19. data/lib/statsample/anova.rb +74 -0
  20. data/lib/statsample/bivariate.rb +255 -0
  21. data/lib/statsample/chidistribution.rb +39 -0
  22. data/lib/statsample/codification.rb +120 -0
  23. data/lib/statsample/converters.rb +338 -0
  24. data/lib/statsample/crosstab.rb +122 -0
  25. data/lib/statsample/dataset.rb +526 -0
  26. data/lib/statsample/dominanceanalysis.rb +259 -0
  27. data/lib/statsample/dominanceanalysis/bootstrap.rb +126 -0
  28. data/lib/statsample/graph/gdchart.rb +45 -0
  29. data/lib/statsample/graph/svgboxplot.rb +108 -0
  30. data/lib/statsample/graph/svggraph.rb +181 -0
  31. data/lib/statsample/graph/svghistogram.rb +208 -0
  32. data/lib/statsample/graph/svgscatterplot.rb +111 -0
  33. data/lib/statsample/htmlreport.rb +232 -0
  34. data/lib/statsample/multiset.rb +281 -0
  35. data/lib/statsample/regression.rb +522 -0
  36. data/lib/statsample/reliability.rb +235 -0
  37. data/lib/statsample/resample.rb +20 -0
  38. data/lib/statsample/srs.rb +159 -0
  39. data/lib/statsample/test.rb +25 -0
  40. data/lib/statsample/vector.rb +759 -0
  41. data/test/_test_chart.rb +58 -0
  42. data/test/test_anova.rb +31 -0
  43. data/test/test_codification.rb +59 -0
  44. data/test/test_crosstab.rb +55 -0
  45. data/test/test_csv.csv +7 -0
  46. data/test/test_csv.rb +27 -0
  47. data/test/test_dataset.rb +293 -0
  48. data/test/test_ggobi.rb +42 -0
  49. data/test/test_multiset.rb +98 -0
  50. data/test/test_regression.rb +108 -0
  51. data/test/test_reliability.rb +32 -0
  52. data/test/test_resample.rb +23 -0
  53. data/test/test_srs.rb +14 -0
  54. data/test/test_statistics.rb +152 -0
  55. data/test/test_stratified.rb +19 -0
  56. data/test/test_svg_graph.rb +63 -0
  57. data/test/test_vector.rb +265 -0
  58. data/test/test_xls.rb +32 -0
  59. metadata +158 -0
@@ -0,0 +1,338 @@
1
+ module Statsample
2
+ # Creates Datasets from database queries and dumps Datasets into database tables
3
+ module Database
4
+ require 'dbi'
5
+ class << self
6
+ # Read a database query and returns a Dataset
7
+ #
8
+ # USE:
9
+ #
10
+ # dbh = DBI.connect("DBI:Mysql:database:localhost", "user", "password")
11
+ # Statsample::Database.read(dbh, "SELECT * FROM test")
12
+ #
13
# Runs +query+ on +dbh+ and loads the whole result set into a Dataset.
#
# dbh::   database handle responding to +execute(query)+. The statement
#         handle it returns must respond to +column_info+ and +fetch+.
# query:: SQL string, handed to +dbh.execute+ unchanged.
#
# One vector is created per result column. Columns whose 'type_name' is
# 'INTEGER' or 'DOUBLE' are typed :scale, every other column :nominal.
#
# The statement handle is finished before returning, also when an error
# is raised while reading, so it is not leaked.
#
# Returns the populated Statsample::Dataset.
def read(dbh, query)
  sth = dbh.execute(query)
  begin
    vectors = {}
    fields = []
    sth.column_info.each do |column|
      name = column['name']
      vector = Statsample::Vector.new([])
      vector.type = %w[INTEGER DOUBLE].include?(column['type_name']) ? :scale : :nominal
      vectors[name] = vector
      fields.push(name)
    end
    ds = Statsample::Dataset.new(vectors, fields)
    # Rows are added with the second argument set to false;
    # update_valid_data is then called once after the last row.
    sth.fetch { |row| ds.add_case(row.to_a, false) }
    ds.update_valid_data
    ds
  ensure
    # Release the statement handle unless the driver already closed it.
    already_closed = sth.respond_to?(:finished?) && sth.finished?
    sth.finish if sth.respond_to?(:finish) && !already_closed
  end
end
29
+ # Insert each case of the Dataset on the selected table
30
+ #
31
+ # USE:
32
+ #
33
+ # ds={'id'=>[1,2,3].to_vector, 'name'=>["a","b","c"].to_vector}.to_dataset
34
+ # dbh = DBI.connect("DBI:Mysql:database:localhost", "user", "password")
35
+ # Statsample::Database.insert(ds,dbh,"test")
36
+ #
37
# Inserts every case of +ds+ into +table+ through one prepared statement.
#
# ds::    object responding to +fields+ (array of column names) and
#         +each_array+ (yields one array of values per case).
# dbh::   database handle responding to +prepare(sql)+.
# table:: name of the target table.
#
# Values are bound through "?" placeholders. The prepared statement is
# finished when the method exits, also if an insert raises.
#
# Returns whatever +ds.each_array+ returns.
def insert(ds, dbh, table)
  columns = ds.fields.join(",")
  placeholders = (["?"] * ds.fields.size).join(",")
  # NOTE(review): table and column names are interpolated verbatim because
  # identifiers cannot be bound as parameters. Pass trusted names only.
  query = "INSERT INTO #{table} (#{columns}) VALUES (#{placeholders})"
  sth = dbh.prepare(query)
  begin
    ds.each_array { |values| sth.execute(*values) }
  ensure
    sth.finish if sth.respond_to?(:finish)
  end
end
44
+ # Creates a SQL CREATE TABLE statement based on a given Dataset
45
+ #
46
+ # USE:
47
+ #
48
+ # ds={'id'=>[1,2,3,4,5].to_vector,'name'=>%w{Alex Peter Susan Mary John}.to_vector}.to_dataset
49
+ # Statsample::Database.create_sql(ds,'names')
50
+ # ==>"CREATE TABLE names (id INTEGER,\n name VARCHAR (255)) CHARACTER SET=UTF8;"
51
+ #
52
# Builds the CREATE TABLE statement for +ds+.
#
# ds::      object responding to +fields+ and to +[]+, where +ds[field]+
#           responds to +db_type+ (the SQL column type as a String).
# table::   name of the table to create.
# charset:: value written after "CHARACTER SET=" (default "UTF8").
#
# Returns the SQL as a String, one column definition per line.
def create_sql(ds, table, charset="UTF8")
  column_defs = ds.fields.map do |name|
    name + " " + ds[name].db_type
  end
  opening = "CREATE TABLE #{table} ("
  closing = ") CHARACTER SET=#{charset};"
  opening + column_defs.join(",\n ") + closing
end
60
+ end
61
+ end
62
+ module Mondrian
63
+ class << self
64
# Writes +dataset+ to +filename+ as tab-separated text.
#
# The first line holds the field names. Each following line holds one
# case; a value that the field's vector does not consider valid
# (+is_valid?+) is written as an empty cell.
def write(dataset, filename)
  File.open(filename, "wb") do |io|
    io.puts dataset.fields.join("\t")
    dataset.each do |row|
      cells = dataset.fields.map do |field|
        value = row[field]
        dataset[field].is_valid?(value) ? value : ""
      end
      io.puts(cells.join("\t"))
    end
  end
end
79
+ end
80
+ end
81
+ module Excel
82
+ class << self
83
# Saves +dataset+ as a spreadsheet file at +filename+ using the
# 'spreadsheet' library (required on first use).
#
# Row 0 holds the field names, formatted bold and blue. Every case
# yielded by +dataset.each_array+ goes on the next row, starting at row 1.
def write(dataset, filename)
  require 'spreadsheet'
  workbook = Spreadsheet::Workbook.new
  worksheet = workbook.create_worksheet
  header_format = Spreadsheet::Format.new(:color => :blue, :weight => :bold)
  worksheet.row(0).concat(dataset.fields)
  worksheet.row(0).default_format = header_format
  row_index = 0
  dataset.each_array do |values|
    row_index += 1
    worksheet.row(row_index).concat(values)
  end
  workbook.write(filename)
end
98
+ # Returns a dataset based on a xls file
99
+ # USE:
100
+ # ds = Statsample::Excel.read("test.xls")
101
+ #
102
# Reads one worksheet of a spreadsheet file into a Dataset.
#
# filename::     path handed to Spreadsheet.open.
# worksheet_id:: worksheet to read (default 0).
# ignore_lines:: number of leading rows to skip before the header row.
# empty::        cell texts that are stored as nil (default ['']).
#
# The first row that is not skipped is the header; its cells, downcased,
# become the field names. A repeated field name raises a RuntimeError.
# Every cell is converted with +to_s+. Rows shorter than the header are
# padded with nil.
def read(filename, worksheet_id=0, ignore_lines=0, empty=[''])
  require 'spreadsheet'

  fields = []
  ds = nil
  rows_seen = 0
  workbook = Spreadsheet.open filename
  worksheet = workbook.worksheet worksheet_id
  worksheet.each do |row|
    rows_seen += 1
    next if rows_seen <= ignore_lines

    # FIXME: formulas are not resolved; a formula cell is read as nil.
    row.collect! { |cell| cell.is_a?(Spreadsheet::Formula) ? nil : cell.to_s }
    cells = row.to_a
    if ds.nil?
      # Header row: ds is only created once the names are known to be unique.
      fields = cells.collect { |cell| cell.downcase }
      if fields.size != fields.uniq.size
        repeated = fields.select { |name| fields.count(name) > 1 }.uniq.join(",")
        raise "There are some repeated fields on the header:#{repeated}. Please, fix"
      end
      ds = Statsample::Dataset.new(fields)
    else
      values = cells.collect { |cell| empty.include?(cell) ? nil : cell }
      values << nil while values.size < fields.size
      ds.add_case(values, false)
    end
  end
  ds.update_valid_data
  ds
end
150
+ end
151
+ end
152
+ module CSV
153
+ class << self
154
+ # Returns a Dataset based on a csv file
155
+ #
156
+ # USE:
157
+ # ds=Statsample::CSV.read("test_csv.csv")
158
# Reads a csv file into a Dataset.
#
# filename::     path of the csv file.
# empty::        cell texts that are stored as nil (default ['']).
# ignore_lines:: number of leading lines to skip before the header line.
# fs::           field separator; nil keeps the CSV library default.
# rs::           record separator; nil keeps the CSV library default.
#
# The first line that is not skipped is the header; its cells, downcased,
# become the field names. A repeated field name raises a RuntimeError.
# Rows shorter than the header are padded with nil.
#
# Uses the option-hash API of the standard CSV library (Ruby >= 1.9);
# the Ruby 1.8 positional form CSV.open(path, 'r', fs, rs) is not
# available there.
def read(filename, empty=[''], ignore_lines=0, fs=nil, rs=nil)
  require 'csv'

  csv_options = {}
  csv_options[:col_sep] = fs unless fs.nil?
  csv_options[:row_sep] = rs unless rs.nil?

  fields = []
  ds = nil
  line_number = 0
  ::CSV.foreach(filename, **csv_options) do |row|
    line_number += 1
    next if line_number <= ignore_lines

    # Unquoted empty fields arrive as nil; to_s turns them into "".
    cells = row.collect { |c| c.to_s }
    if ds.nil?
      fields = cells.collect { |c| c.downcase }
      repeated = fields.select { |name| fields.count(name) > 1 }.uniq
      unless repeated.empty?
        raise "There are some repeated fields on the header:#{repeated.join(",")}. Please, fix"
      end
      ds = Statsample::Dataset.new(fields)
    else
      values = cells.collect { |c| empty.include?(c) ? nil : c }
      values << nil while values.size < fields.size
      ds.add_case(values, false)
    end
  end
  ds.update_valid_data
  ds
end
196
+ # Save a Dataset on a csv file
197
+ #
198
+ # USE:
199
+ # Statsample::CSV.write(ds,"test_csv.csv")
200
# Saves a Dataset to a csv file.
#
# dataset::       object responding to +fields+ and +each_array+.
# filename::      path of the file to write; it is overwritten.
# convert_comma:: when true, every value is converted with +to_s+ and
#                 each "." is replaced by ",".
# opts::          optional field separator and record separator, in that
#                 order, and/or a trailing Hash of CSV options
#                 (e.g. :col_sep => ";").
#
# The file is opened in block form, so it is closed even if writing a
# row raises. Rows yielded by +each_array+ are not modified.
#
# Uses the option-hash API of the standard CSV library (Ruby >= 1.9).
# Returns nil.
def write(dataset, filename, convert_comma=false, *opts)
  require 'csv'

  csv_options = opts.last.is_a?(Hash) ? opts.pop.dup : {}
  csv_options[:col_sep] = opts[0] unless opts[0].nil?
  csv_options[:row_sep] = opts[1] unless opts[1].nil?

  ::CSV.open(filename, 'w', **csv_options) do |writer|
    writer << dataset.fields
    dataset.each_array do |row|
      row = row.collect { |v| v.to_s.gsub(".", ",") } if convert_comma
      writer << row
    end
  end
  nil
end
211
+ end
212
+ end
213
+ module Mx
214
+ class << self
215
# Writes +dataset+ to +filename+ as an Mx data file and prints
# "Writing MX File" to standard output.
#
# type:: :raw writes every case as a tab-separated "Rectangular" block,
#        with "." for values the field's vector does not consider valid.
#        :covariance (default) writes the matrix returned by
#        Statsample::Bivariate.covariance_matrix with three decimals,
#        "." for nil cells.
#        Any other value writes only the header lines.
def write(dataset, filename, type=:covariance)
  puts "Writing MX File"
  File.open(filename, "w") do |io|
    io.puts "! #{filename}"
    io.puts "! Output generated by Statsample"
    io.puts "Data Ninput=#{dataset.fields.size} Nobservations=#{dataset.cases}"
    io.puts "Labels " + dataset.fields.join(" ")
    if type == :raw
      io.puts "Rectangular"
      dataset.each do |row|
        cells = dataset.fields.map do |field|
          value = row[field]
          dataset[field].is_valid?(value) ? value : "."
        end
        io.puts cells.join("\t")
      end
      io.puts "End Rectangular"
    elsif type == :covariance
      io.puts " CMatrix Full"
      matrix = Statsample::Bivariate.covariance_matrix(dataset)
      lines = (0...(matrix.row_size)).map do |i|
        (0...(matrix.column_size)).map do |j|
          cell = matrix[i, j]
          cell.nil? ? "." : sprintf("%0.3f", cell)
        end.join(" ")
      end
      io.puts lines.join("\n")
    end
  end
end
248
+ end
249
+ end
250
+ module GGobi
251
+ class << self
252
# Writes the GGobi XML document produced by +out+ for +dataset+ to
# +filename+. +opt+ is passed to +out+ unchanged.
def write(dataset, filename, opt={})
  File.open(filename, "w") do |file|
    file.write(out(dataset, opt))
  end
end
257
# Builds the GGobi XML document for +dataset+ and returns it as a String.
#
# dataset:: object responding to +fields+, +vectors+ (Hash of field name
#           to vector), +each_array+ and +cases+.
# opt::     Hash; :dataname (default "Default") and :description
#           (default "") are written into the document.
#
# Variables are declared in +dataset.fields+ order, the same order in
# which +each_array+ lays out the record values, so every record column
# lines up with its declaration. Values of categorical variables are
# replaced by the integer level recorded by +variable_definition+; this
# is done on a copy, so the arrays yielded by +each_array+ are untouched.
def out(dataset, opt={})
  require 'ostruct'
  options = {:dataname => "Default", :description => ""}.merge(opt)
  # carrier collects, as a side effect of variable_definition, which
  # fields are categorical and the value => level mapping for each.
  carrier = OpenStruct.new
  carrier.categorials = []
  carrier.conversions = {}
  variables_def = dataset.fields.collect { |name|
    variable_definition(carrier, dataset.vectors[name], name)
  }.join("\n")

  # column position => field name, for categorical fields only
  indexes = carrier.categorials.inject({}) { |acc, name|
    acc[dataset.fields.index(name)] = name
    acc
  }
  records = ""
  dataset.each_array { |row|
    values = row.dup
    indexes.each { |position, name|
      values[position] = carrier.conversions[name][values[position]]
    }
    records << "<record>#{values_definition(values)}</record>\n"
  }

  <<EOC
<?xml version="1.0"?>
<!DOCTYPE ggobidata SYSTEM "ggobi.dtd">
<ggobidata count="1">
<data name="#{options[:dataname]}">
<description>#{options[:description]}</description>
<variables count="#{dataset.fields.size}">
#{variables_def}
</variables>
<records count="#{dataset.cases}">
#{records}
</records>

</data>
</ggobidata>
EOC
end
300
# Renders the values of one record as GGobi XML elements joined by " ".
#
# c:: array of values. A Float becomes <real>, an Integer <int>, and
#     anything else <string> holding its +to_s+.
#
# In string values, "&", "<" and ">" are replaced by XML entities so the
# resulting document stays well-formed.
def values_definition(c)
  xml_entities = {"&" => "&amp;", "<" => "&lt;", ">" => "&gt;"}
  c.collect { |v|
    case v
    when Float
      "<real>#{v}</real>"
    when Integer
      "<int>#{v}</int>"
    else
      text = v.to_s.gsub(/[&<>]/) { |ch| xml_entities[ch] }
      "<string>#{text}</string>"
    end
  }.join(" ")
end
311
+ # Outputs a string for a variable definition
312
+ # v = vector
313
+ # name = name of the variable
314
+ # nickname = nickname
315
# Returns the GGobi XML declaration for one variable.
#
# carrier::  object with +categorials+ (Array) and +conversions+ (Hash);
#            both are updated when the variable is categorical.
# v::        vector responding to +type+, +data+ and +labeling+.
# name::     name of the variable.
# nickname:: optional nickname, written as a nickname attribute.
#
# A vector of type :nominal, or one holding any String, is declared as a
# categorical variable: its distinct values, sorted, are numbered from 1
# and that value => level mapping is stored in carrier.conversions[name].
# Otherwise the variable is real if the data holds any Float, else integer.
def variable_definition(carrier, v, name, nickname=nil)
  nickname_attr = nickname.nil? ? "" : "nickname=\"#{nickname}\""
  categorical = (v.type == :nominal) || v.data.any? { |d| d.is_a? String }
  if categorical
    carrier.categorials.push(name)
    codes = carrier.conversions[name] = {}
    levels = v.data.uniq.sort
    xml = "<categoricalvariable name=\"#{name}\" #{nickname_attr}>\n"
    xml << "<levels count=\"#{levels.size}\">\n"
    level_tags = []
    levels.each_with_index do |level, idx|
      codes[level] = idx + 1
      level_tags << "<level value=\"#{idx + 1}\">#{v.labeling(level)}</level>"
    end
    xml << level_tags.join("\n")
    xml << "</levels>\n</categoricalvariable>\n"
    xml
  elsif v.data.any? { |d| d.is_a? Float }
    "<realvariable name=\"#{name}\" #{nickname_attr} />"
  else
    "<integervariable name=\"#{name}\" #{nickname_attr} />"
  end
end
335
+
336
+ end
337
+ end
338
+ end
@@ -0,0 +1,122 @@
1
module Statsample
  # Class to create a crosstab of two vectors of the same size.
  # With this, you can create reports and do chi square tests.
  # The first vector will be at the rows and the second will be the columns.
  #
  class Crosstab
    attr_reader :v_rows, :v_cols

    # v1:: Vector shown on the rows.
    # v2:: Vector shown on the columns.
    #
    # Raises ArgumentError unless both are Vector instances of equal size.
    def initialize(v1, v2)
      unless v1.instance_of?(Vector) && v2.instance_of?(Vector)
        raise ArgumentError, "Both arguments should be Vectors"
      end
      raise ArgumentError, "Vectors should be the same size" unless v1.size == v2.size
      @v_rows, @v_cols = v1, v2
    end

    # Sorted factors of the row vector.
    def rows_names
      @v_rows.factors.sort
    end

    # Sorted factors of the column vector.
    def cols_names
      @v_cols.factors.sort
    end

    # Frequencies of the row vector (the row margins).
    def rows_total
      @v_rows.frequencies
    end

    # Frequencies of the column vector (the column margins).
    def cols_total
      @v_cols.frequencies
    end

    # Hash of [row, col] => count with an entry for every combination of
    # row and column name; combinations that never occur are 0.
    def frequencies
      cn = cols_names
      base = {}
      rows_names.each do |row|
        cn.each { |col| base[[row, col]] = 0 }
      end
      base.update(Statsample::vector_cols_matrix(@v_rows, @v_cols).to_a.to_vector.frequencies)
    end

    # Matrix of counts, one row per row name and one column per column name.
    def to_matrix
      f = frequencies
      cn = cols_names
      Matrix.rows(rows_names.collect { |row|
        cn.collect { |col| f[[row, col]] }
      })
    end

    # Hash of row name => { column name => count }.
    def frequencies_by_row
      f = frequencies
      cn = cols_names
      rows_names.inject({}) { |by_row, row|
        by_row[row] = cn.inject({}) { |counts, col|
          counts[col] = f[[row, col]]
          counts
        }
        by_row
      }
    end

    # Hash of column name => { row name => count }.
    def frequencies_by_col
      f = frequencies
      rn = rows_names
      cols_names.inject({}) { |by_col, col|
        by_col[col] = rn.inject({}) { |counts, row|
          counts[row] = f[[row, col]]
          counts
        }
        by_col
      }
    end

    # Chi square, based on expected and real matrix.
    # Loads 'statsample/test' on first use.
    def chi_square
      require 'statsample/test'
      Statsample::Test.chi_square(self.to_matrix, matrix_expected)
    end

    # Matrix of expected counts, (row total * column total) / size of the
    # row vector. Useful to obtain chi square.
    def matrix_expected
      cn = cols_names
      rt = rows_total
      ct = cols_total
      t = @v_rows.size.to_f
      m = rows_names.collect { |row|
        cn.collect { |col| (rt[row] * ct[col]) / t }
      }
      Matrix.rows(m)
    end

    # Plain-text table of counts with row, column and grand totals.
    # Rows and columns are headed by the vectors' labels for each name.
    def to_s
      fq = frequencies
      rn = rows_names
      cn = cols_names
      # Labels are coerced to String so they can be padded and concatenated.
      row_labels = rn.collect { |row| @v_rows.labeling(row).to_s }
      col_labels = cn.collect { |col| @v_cols.labeling(col).to_s }
      # The row header column is at least 6 wide so " Total " fits.
      max_row_size = ([6] + row_labels.collect { |l| l.size }).max
      max_col_size = ([0] + col_labels.collect { |l| l.size } +
                      fq.values.collect { |n| n.to_s.size }).max

      total = 0
      total_cols = Hash.new(0)
      out = ""
      out << " " * (max_row_size + 2) << "|"
      out << col_labels.collect { |l| " " + l.ljust(max_col_size) + " " }.join("|") << "| Total\n"
      linea = "-" * (max_row_size + 2) + "|" + ("-" * (max_col_size + 2) + "|") * cn.size + "-" * 7 + "\n"
      out << linea
      rn.each_with_index { |row, i|
        total_row = 0
        out << " " << row_labels[i].ljust(max_row_size) << " | "
        cn.each { |col|
          count = fq[[row, col]]
          total_row += count
          total_cols[col] += count
          out << " " << count.to_s.ljust(max_col_size) << "| "
        }
        total += total_row
        out << " " << total_row.to_s
        out << "\n"
      }
      out << linea
      out << " Total " << " " * (max_row_size - 5) << "| "
      cn.each { |col|
        out << " " << total_cols[col].to_s.ljust(max_col_size) << "| "
      }
      out << " " << total.to_s
      out
    end
  end
end