statsample 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (59) hide show
  1. data/History.txt +79 -0
  2. data/Manifest.txt +56 -0
  3. data/README.txt +77 -0
  4. data/Rakefile +22 -0
  5. data/bin/statsample +2 -0
  6. data/demo/benchmark.rb +52 -0
  7. data/demo/chi-square.rb +44 -0
  8. data/demo/dice.rb +13 -0
  9. data/demo/distribution_t.rb +95 -0
  10. data/demo/graph.rb +9 -0
  11. data/demo/item_analysis.rb +30 -0
  12. data/demo/mean.rb +81 -0
  13. data/demo/proportion.rb +57 -0
  14. data/demo/sample_test.csv +113 -0
  15. data/demo/strata_proportion.rb +152 -0
  16. data/demo/stratum.rb +141 -0
  17. data/lib/spss.rb +131 -0
  18. data/lib/statsample.rb +216 -0
  19. data/lib/statsample/anova.rb +74 -0
  20. data/lib/statsample/bivariate.rb +255 -0
  21. data/lib/statsample/chidistribution.rb +39 -0
  22. data/lib/statsample/codification.rb +120 -0
  23. data/lib/statsample/converters.rb +338 -0
  24. data/lib/statsample/crosstab.rb +122 -0
  25. data/lib/statsample/dataset.rb +526 -0
  26. data/lib/statsample/dominanceanalysis.rb +259 -0
  27. data/lib/statsample/dominanceanalysis/bootstrap.rb +126 -0
  28. data/lib/statsample/graph/gdchart.rb +45 -0
  29. data/lib/statsample/graph/svgboxplot.rb +108 -0
  30. data/lib/statsample/graph/svggraph.rb +181 -0
  31. data/lib/statsample/graph/svghistogram.rb +208 -0
  32. data/lib/statsample/graph/svgscatterplot.rb +111 -0
  33. data/lib/statsample/htmlreport.rb +232 -0
  34. data/lib/statsample/multiset.rb +281 -0
  35. data/lib/statsample/regression.rb +522 -0
  36. data/lib/statsample/reliability.rb +235 -0
  37. data/lib/statsample/resample.rb +20 -0
  38. data/lib/statsample/srs.rb +159 -0
  39. data/lib/statsample/test.rb +25 -0
  40. data/lib/statsample/vector.rb +759 -0
  41. data/test/_test_chart.rb +58 -0
  42. data/test/test_anova.rb +31 -0
  43. data/test/test_codification.rb +59 -0
  44. data/test/test_crosstab.rb +55 -0
  45. data/test/test_csv.csv +7 -0
  46. data/test/test_csv.rb +27 -0
  47. data/test/test_dataset.rb +293 -0
  48. data/test/test_ggobi.rb +42 -0
  49. data/test/test_multiset.rb +98 -0
  50. data/test/test_regression.rb +108 -0
  51. data/test/test_reliability.rb +32 -0
  52. data/test/test_resample.rb +23 -0
  53. data/test/test_srs.rb +14 -0
  54. data/test/test_statistics.rb +152 -0
  55. data/test/test_stratified.rb +19 -0
  56. data/test/test_svg_graph.rb +63 -0
  57. data/test/test_vector.rb +265 -0
  58. data/test/test_xls.rb +32 -0
  59. metadata +158 -0
@@ -0,0 +1,338 @@
1
+ module Statsample
2
+ # Reads Datasets from a database and writes Datasets to it
3
+ module Database
4
+ require 'dbi'
5
+ class << self
6
# Runs +query+ on the DBI handle +dbh+ and returns the result set as a
# Dataset.
#
# One vector is created per result column, in column order. Columns whose
# DBI type name is INTEGER or DOUBLE become :scale vectors; every other
# column becomes :nominal.
#
# dbh::   an open DBI database handle (must respond to +execute+)
# query:: SQL text to execute
#
# Returns the populated Statsample::Dataset. The statement handle is always
# finished, even if building the dataset raises.
#
# USE:
#
#   dbh = DBI.connect("DBI:Mysql:database:localhost", "user", "password")
#   Statsample::Database.read(dbh, "SELECT * FROM test")
#
def read(dbh, query)
  sth = dbh.execute(query)
  begin
    vectors = {}
    fields = []
    sth.column_info.each do |column|
      name = column['name']
      vector = Statsample::Vector.new([])
      vector.type = %w[INTEGER DOUBLE].include?(column['type_name']) ? :scale : :nominal
      vectors[name] = vector
      fields.push(name)
    end
    ds = Statsample::Dataset.new(vectors, fields)
    # NOTE(review): the +false+ flag presumably defers the valid-data refresh,
    # which is why update_valid_data is called once after the loop - confirm
    # against Dataset#add_case.
    sth.fetch { |row| ds.add_case(row.to_a, false) }
    ds.update_valid_data
    ds
  ensure
    # Release the statement handle; the original never did.
    sth.finish
  end
end
29
# Inserts every case of the Dataset +ds+ into +table+ through the DBI handle
# +dbh+, using one prepared INSERT statement with a placeholder per field.
#
# ds::    dataset; must respond to +fields+ and +each_array+
# dbh::   an open DBI database handle (must respond to +prepare+)
# table:: name of the destination table
#
# The prepared statement is always finished, even when an insert fails.
#
# NOTE(review): the table and field names are interpolated into the SQL text
# as-is; only the case values are bound as parameters. Do not pass untrusted
# table or field names.
#
# USE:
#
#   ds={'id'=>[1,2,3].to_vector, 'name'=>["a","b","c"].to_vector}.to_dataset
#   dbh = DBI.connect("DBI:Mysql:database:localhost", "user", "password")
#   Statsample::Database.insert(ds,dbh,"test")
#
def insert(ds, dbh, table)
  columns = ds.fields.join(",")
  placeholders = (["?"] * ds.fields.size).join(",")
  sth = dbh.prepare("INSERT INTO #{table} (#{columns}) VALUES (#{placeholders})")
  begin
    ds.each_array { |values| sth.execute(*values) }
  ensure
    # Release the prepared statement; the original never did.
    sth.finish
  end
end
44
# Builds a CREATE TABLE statement for +table+ based on the given Dataset.
# Each field contributes one column, typed with the +db_type+ of its vector.
#
# ds::      dataset; must respond to +fields+ and +[]+
# table::   name of the table to create
# charset:: value placed in the CHARACTER SET clause (default "UTF8")
#
# USE:
#
#   ds={'id'=>[1,2,3,4,5].to_vector,'name'=>%w{Alex Peter Susan Mary John}.to_vector}.to_dataset
#   Statsample::Database.create_sql(ds,'names')
#   ==>"CREATE TABLE names (id INTEGER,\n name VARCHAR (255)) CHARACTER SET=UTF8;"
#
def create_sql(ds, table, charset="UTF8")
  column_defs = ds.fields.collect { |field| "#{field} #{ds[field].db_type}" }
  "CREATE TABLE #{table} (#{column_defs.join(",\n ")}) CHARACTER SET=#{charset};"
end
60
+ end
61
+ end
62
+ module Mondrian
63
+ class << self
64
# Writes +dataset+ to +filename+ as tab-separated text: one header line with
# the field names, then one line per case. A value that its vector reports
# as not valid is written as an empty string.
def write(dataset, filename)
  File.open(filename, "wb") do |io|
    io.puts dataset.fields.join("\t")
    dataset.each do |record|
      cells = dataset.fields.collect do |name|
        dataset[name].is_valid?(record[name]) ? record[name] : ""
      end
      io.puts(cells.join("\t"))
    end
  end
end
79
+ end
80
+ end
81
+ module Excel
82
+ class << self
83
# Writes +dataset+ to +filename+ as a spreadsheet through the 'spreadsheet'
# library. Row 0 holds the field names, formatted bold and blue; each case
# is written to the following rows in the order +each_array+ yields them.
def write(dataset, filename)
  require 'spreadsheet'
  workbook = Spreadsheet::Workbook.new
  worksheet = workbook.create_worksheet
  header_format = Spreadsheet::Format.new :color => :blue,
                                          :weight => :bold
  worksheet.row(0).concat(dataset.fields)
  worksheet.row(0).default_format = header_format
  row_index = 0
  dataset.each_array do |values|
    row_index += 1
    worksheet.row(row_index).concat(values)
  end
  workbook.write(filename)
end
98
# Returns a Dataset based on an xls file.
#
# filename::     path of the spreadsheet to open
# worksheet_id:: worksheet to read, passed to Spreadsheet's +worksheet+
# ignore_lines:: number of leading rows to skip before the header row
# empty::        cell strings that are stored as nil (missing)
#
# The first row after the skipped ones is the header; field names are
# downcased. Every cell is read as a String. Rows shorter than the header
# are padded with nil.
#
# Raises RuntimeError if the header has repeated field names or if no header
# row is found.
#
# USE:
#   ds = Statsample::Excel.read("test.xls")
#
def read(filename, worksheet_id=0, ignore_lines=0, empty=[''])
  require 'spreadsheet'

  fields = []
  ds = nil
  line_number = 0
  book = Spreadsheet.open filename
  sheet = book.worksheet worksheet_id
  sheet.each do |row|
    line_number += 1
    next if line_number <= ignore_lines
    # This should be fixed.
    # Formulas are not resolved: a formula cell is read as nil.
    # A new array is built so the worksheet row itself is left untouched.
    cells = row.to_a.collect { |c| c.is_a?(Spreadsheet::Formula) ? nil : c.to_s }
    if ds.nil?
      # to_s guards against nil header cells (formulas), which used to crash
      # on nil.downcase.
      fields = cells.collect { |c| c.to_s.downcase }
      if fields.size != fields.uniq.size
        counts = Hash.new(0)
        fields.each { |f| counts[f] += 1 }
        repeated = fields.uniq.find_all { |f| counts[f] > 1 }.join(",")
        raise "There are some repeated fields on the header:#{repeated}. Please, fix"
      end
      ds = Statsample::Dataset.new(fields)
    else
      rowa = cells.collect { |c| empty.include?(c) ? nil : c }
      (fields.size - rowa.size).times { rowa << nil }
      ds.add_case(rowa, false)
    end
  end
  # Without a header row there is no dataset to return; fail with a clear
  # message instead of a NoMethodError on nil.
  raise "No header row found on #{filename}" if ds.nil?
  ds.update_valid_data
  ds
end
150
+ end
151
+ end
152
+ module CSV
153
+ class << self
154
# Returns a Dataset based on a csv file.
#
# filename::     path of the CSV file
# empty::        field strings that are stored as nil (missing)
# ignore_lines:: number of leading lines to skip before the header line
# fs::           field separator; nil uses the CSV library default
# rs::           record separator; nil uses the CSV library default
#
# The first line after the skipped ones is the header; field names are
# downcased. Every field is read as a String. Rows shorter than the header
# are padded with nil, as Excel.read does.
#
# Uses ::CSV.foreach with :col_sep / :row_sep, i.e. the stdlib CSV shipped
# with Ruby 1.9 and later. The previous CSV.open(path, 'r', fs, rs) { |row| }
# form only worked with the Ruby 1.8 csv library.
#
# Raises RuntimeError if the header has repeated field names or if no header
# line is found.
#
# USE:
#   ds=Statsample::CSV.read("test_csv.csv")
def read(filename, empty=[''], ignore_lines=0, fs=nil, rs=nil)
  require 'csv'

  options = {}
  options[:col_sep] = fs unless fs.nil?
  options[:row_sep] = rs unless rs.nil?
  fields = []
  ds = nil
  line_number = 0
  ::CSV.foreach(filename, **options) do |row|
    line_number += 1
    next if line_number <= ignore_lines
    cells = row.collect { |c| c.to_s }
    if ds.nil?
      fields = cells.collect { |c| c.downcase }
      if fields.size != fields.uniq.size
        counts = Hash.new(0)
        fields.each { |f| counts[f] += 1 }
        repeated = fields.uniq.find_all { |f| counts[f] > 1 }.join(",")
        raise "There are some repeated fields on the header:#{repeated}. Please, fix"
      end
      ds = Statsample::Dataset.new(fields)
    else
      rowa = cells.collect { |c| empty.include?(c) ? nil : c }
      (fields.size - rowa.size).times { rowa << nil }
      ds.add_case(rowa, false)
    end
  end
  # Without a header line there is no dataset to return; fail with a clear
  # message instead of a NoMethodError on nil.
  raise "No header row found on #{filename}" if ds.nil?
  ds.update_valid_data
  ds
end
196
# Saves a Dataset to a csv file.
#
# dataset::       dataset; must respond to +fields+ and +each_array+
# filename::      path of the file to write
# convert_comma:: when true, every value is converted to a String and each
#                 "." is replaced by "," (decimal comma)
# opts::          extra CSV options: either a trailing Hash of CSV options
#                 (e.g. :col_sep => ";") and/or the legacy positional field
#                 separator and record separator
#
# The header line holds the field names; one line per case follows.
# The file is closed even if writing a row raises. Returns nil.
#
# USE:
#   Statsample::CSV.write(ds,"test_csv.csv")
def write(dataset, filename, convert_comma=false, *opts)
  # The original relied on read having loaded the library first.
  require 'csv'

  positional = opts.dup
  options = positional.last.is_a?(Hash) ? positional.pop.dup : {}
  # Legacy call style passed the separators positionally; translate them to
  # the keyword options the current CSV library expects.
  fs, rs = positional
  options[:col_sep] = fs unless fs.nil? || options.key?(:col_sep)
  options[:row_sep] = rs unless rs.nil? || options.key?(:row_sep)

  ::CSV.open(filename, 'w', **options) do |writer|
    writer << dataset.fields
    dataset.each_array do |row|
      # Build a new array so the yielded case is not modified.
      row = row.collect { |v| v.to_s.tr(".", ",") } if convert_comma
      writer << row
    end
  end
  nil
end
211
+ end
212
+ end
213
+ module Mx
214
+ class << self
215
# Writes +dataset+ to +filename+ as an Mx input file and prints a progress
# message to standard output.
#
# type:: :raw writes every case inside a Rectangular section, using "." for
#        values the vector reports as not valid; :covariance (default)
#        writes the matrix returned by Statsample::Bivariate.covariance_matrix
#        with three decimals, using "." for nil cells. Any other value writes
#        only the header lines.
def write(dataset, filename, type=:covariance)
  puts "Writing MX File"
  File.open(filename, "w") do |io|
    io.puts "! #{filename}"
    io.puts "! Output generated by Statsample"
    io.puts "Data Ninput=#{dataset.fields.size} Nobservations=#{dataset.cases}"
    io.puts "Labels "+dataset.fields.join(" ")
    if type == :raw
      io.puts "Rectangular"
      dataset.each do |record|
        cells = dataset.fields.collect do |name|
          dataset[name].is_valid?(record[name]) ? record[name] : "."
        end
        io.puts cells.join("\t")
      end
      io.puts "End Rectangular"
    elsif type == :covariance
      io.puts " CMatrix Full"
      cov = Statsample::Bivariate.covariance_matrix(dataset)
      matrix_lines = (0...(cov.row_size)).collect do |i|
        line_cells = (0...(cov.column_size)).collect do |j|
          cov[i,j].nil? ? "." : sprintf("%0.3f", cov[i,j])
        end
        line_cells.join(" ")
      end
      io.puts matrix_lines.join("\n")
    end
  end
end
248
+ end
249
+ end
250
# Exports a Dataset as a GGobi XML document.
module GGobi
  class << self
    # Writes the GGobi XML for +dataset+ (see +out+) to +filename+.
    def write(dataset, filename, opt={})
      File.open(filename, "w") { |fp| fp.write(out(dataset, opt)) }
    end

    # Returns the GGobi XML document for +dataset+ as a String.
    #
    # opt:: :dataname (default "Default") and :description (default "")
    #
    # Variables are declared in +dataset.fields+ order, which is the order
    # of the values inside each record. The original iterated
    # +dataset.vectors+ (a Hash), whose order is not guaranteed to match.
    # Values of categorical variables are replaced in the records by their
    # 1-based level number.
    def out(dataset, opt={})
      require 'ostruct'
      options = {:dataname => "Default", :description => ""}.merge(opt)
      # The carrier collects, while variables are defined, which fields are
      # categorical and the value => level number table of each one.
      carrier = OpenStruct.new
      carrier.categorials = []
      carrier.conversions = {}
      variables_def = dataset.fields.collect { |name|
        variable_definition(carrier, dataset.vectors[name], name)
      }.join("\n")

      # column index => field name, for categorical fields only
      indexes = carrier.categorials.inject({}) { |s, name|
        s[dataset.fields.index(name)] = name
        s
      }
      records = ""
      dataset.each_array { |c|
        # Work on a copy so the yielded case is not modified.
        values = c.dup
        indexes.each { |ik, iv|
          values[ik] = carrier.conversions[iv][values[ik]]
        }
        records << "<record>#{values_definition(values)}</record>\n"
      }

      out = <<EOC
<?xml version="1.0"?>
<!DOCTYPE ggobidata SYSTEM "ggobi.dtd">
<ggobidata count="1">
<data name="#{xml_escape(options[:dataname])}">
<description>#{xml_escape(options[:description])}</description>
<variables count="#{dataset.fields.size}">
#{variables_def}
</variables>
<records count="#{dataset.cases}">
#{records}
</records>

</data>
</ggobidata>
EOC

      out
    end

    # Returns the XML for the values of one record: Floats as <real>,
    # Integers as <int>, anything else as an XML-escaped <string>.
    def values_definition(c)
      c.collect { |v|
        if v.is_a? Float
          "<real>#{v}</real>"
        elsif v.is_a? Integer
          "<int>#{v}</int>"
        else
          "<string>#{xml_escape(v)}</string>"
        end
      }.join(" ")
    end

    # Outputs a string for a variable definition
    # carrier = object with +categorials+ (Array) and +conversions+ (Hash);
    #           both are updated when the variable is categorical
    # v = vector
    # name = name of the variable
    # nickname = nickname
    #
    # A vector is categorical when its type is :nominal or any of its data
    # is a String; otherwise it is real if any datum is a Float, and
    # integer in every other case.
    def variable_definition(carrier, v, name, nickname=nil)
      nickname = (nickname.nil? ? "" : "nickname=\"#{xml_escape(nickname)}\"")
      if v.type == :nominal || v.data.any? { |d| d.is_a? String }
        carrier.categorials.push(name)
        carrier.conversions[name] = {}
        # NOTE(review): sort raises if the data mixes nil with other values;
        # confirm how missing data should be exported.
        factors = v.data.uniq.sort
        out = "<categoricalvariable name=\"#{xml_escape(name)}\" #{nickname}>\n"
        out << "<levels count=\"#{factors.size}\">\n"
        out << (1..factors.size).to_a.collect { |i|
          carrier.conversions[name][factors[i-1]] = i
          "<level value=\"#{i}\">#{xml_escape(v.labeling(factors[i-1]))}</level>"
        }.join("\n")
        out << "</levels>\n</categoricalvariable>\n"
        out
      elsif v.data.any? { |d| d.is_a? Float }
        "<realvariable name=\"#{xml_escape(name)}\" #{nickname} />"
      else
        "<integervariable name=\"#{xml_escape(name)}\" #{nickname} />"
      end
    end

    private

    # Escapes the characters that are not allowed verbatim in XML text and
    # attribute values.
    def xml_escape(value)
      value.to_s.gsub("&", "&amp;").gsub("<", "&lt;").gsub(">", "&gt;").gsub('"', "&quot;")
    end
  end
end
338
+ end
@@ -0,0 +1,122 @@
1
+ module Statsample
2
# Class to create a crosstab of data.
# With this, you can create reports and do a chi square test.
# The first vector will be at the rows and the second will be at the columns.
#
class Crosstab
  attr_reader :v_rows, :v_cols

  # v1:: Vector placed on the rows
  # v2:: Vector placed on the columns
  #
  # Raises ArgumentError unless both arguments are Vector instances of the
  # same size.
  def initialize(v1,v2)
    raise ArgumentError, "Both arguments should be Vectors" unless v1.instance_of? Vector and v2.instance_of? Vector
    raise ArgumentError, "Vectors should be the same size" unless v1.size==v2.size
    @v_rows,@v_cols=v1,v2
  end

  # Sorted factors of the row vector.
  def rows_names
    @v_rows.factors.sort
  end

  # Sorted factors of the column vector.
  def cols_names
    @v_cols.factors.sort
  end

  # Frequencies of the row vector.
  def rows_total
    @v_rows.frequencies
  end

  # Frequencies of the column vector.
  def cols_total
    @v_cols.frequencies
  end

  # Hash of [row, col] => count. Every combination of row and column names
  # starts at 0 and is then updated with the observed frequencies of the
  # paired values.
  def frequencies
    cn = cols_names
    base = {}
    rows_names.each { |row|
      cn.each { |col| base[[row, col]] = 0 }
    }
    base.update(Statsample::vector_cols_matrix(@v_rows,@v_cols).to_a.to_vector.frequencies)
  end

  # Matrix of observed counts, rows and columns in sorted name order.
  def to_matrix
    f = frequencies
    cn = cols_names
    Matrix.rows(rows_names.collect { |row|
      cn.collect { |col| f[[row, col]] }
    })
  end

  # Hash of row name => { column name => count }.
  def frequencies_by_row
    f = frequencies
    cn = cols_names
    rows_names.inject({}) { |sr, row|
      sr[row] = cn.inject({}) { |sc, col|
        sc[col] = f[[row, col]]
        sc
      }
      sr
    }
  end

  # Hash of column name => { row name => count }.
  def frequencies_by_col
    f = frequencies
    rn = rows_names
    cols_names.inject({}) { |sc, col|
      sc[col] = rn.inject({}) { |sr, row|
        sr[row] = f[[row, col]]
        sr
      }
      sc
    }
  end

  # Chi square, based on expected and real matrix
  def chi_square
    require 'statsample/test'
    Statsample::Test.chi_square(self.to_matrix,matrix_expected)
  end

  # Useful to obtain chi square.
  # Each cell is (row total * column total) / size of the row vector.
  def matrix_expected
    cn = cols_names
    rt = rows_total
    ct = cols_total
    t = @v_rows.size.to_f
    Matrix.rows(rows_names.collect { |row|
      cn.collect { |col| (rt[row]*ct[col]) / t }
    })
  end

  # Text table of the counts with row, column and grand totals.
  #
  # Column width also takes the column totals into account: the original
  # sized columns from labels and cell counts only, and raised ArgumentError
  # (negative repeat count) when a total needed more characters than that.
  def to_s
    fq = frequencies
    rn = rows_names
    cn = cols_names
    # to_s keeps the layout working when labeling returns a non-String.
    row_labels = rn.collect { |r| @v_rows.labeling(r).to_s }
    col_labels = cn.collect { |c| @v_cols.labeling(c).to_s }
    col_totals = cn.collect { |c| rn.inject(0) { |sum, r| sum + fq[[r, c]] } }
    grand_total = col_totals.inject(0) { |sum, x| sum + x }

    # Row labels share a column with the " Total" caption, hence at least 6.
    max_row_size = row_labels.inject(6) { |s, l| l.size > s ? l.size : s }
    cell_texts = col_labels + fq.values.collect { |v| v.to_s } + col_totals.collect { |v| v.to_s }
    max_col_size = cell_texts.inject(0) { |s, x| x.size > s ? x.size : s }

    out = ""
    out << " " * (max_row_size+2) << "|" << col_labels.collect { |name| " " + name.ljust(max_col_size) + " " }.join("|") << "| Total\n"
    linea = "-" * (max_row_size+2) << "|" << ("-"*(max_col_size+2) + "|")*cn.size << "-"*7 << "\n"
    out << linea
    rn.each_with_index { |row, i|
      total_row = 0
      out << " " << row_labels[i].ljust(max_row_size) << " | "
      cn.each { |col|
        total_row += fq[[row, col]]
        out << " " << fq[[row, col]].to_s.ljust(max_col_size) << "| "
      }
      out << " " << total_row.to_s
      out << "\n"
    }
    out << linea
    out << " Total " << " "*(max_row_size-5) << "| "
    col_totals.each { |v|
      out << " " << v.to_s.ljust(max_col_size) << "| "
    }
    out << " " << grand_total.to_s
    out
  end
end
122
+ end