statsample 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/History.txt +79 -0
- data/Manifest.txt +56 -0
- data/README.txt +77 -0
- data/Rakefile +22 -0
- data/bin/statsample +2 -0
- data/demo/benchmark.rb +52 -0
- data/demo/chi-square.rb +44 -0
- data/demo/dice.rb +13 -0
- data/demo/distribution_t.rb +95 -0
- data/demo/graph.rb +9 -0
- data/demo/item_analysis.rb +30 -0
- data/demo/mean.rb +81 -0
- data/demo/proportion.rb +57 -0
- data/demo/sample_test.csv +113 -0
- data/demo/strata_proportion.rb +152 -0
- data/demo/stratum.rb +141 -0
- data/lib/spss.rb +131 -0
- data/lib/statsample.rb +216 -0
- data/lib/statsample/anova.rb +74 -0
- data/lib/statsample/bivariate.rb +255 -0
- data/lib/statsample/chidistribution.rb +39 -0
- data/lib/statsample/codification.rb +120 -0
- data/lib/statsample/converters.rb +338 -0
- data/lib/statsample/crosstab.rb +122 -0
- data/lib/statsample/dataset.rb +526 -0
- data/lib/statsample/dominanceanalysis.rb +259 -0
- data/lib/statsample/dominanceanalysis/bootstrap.rb +126 -0
- data/lib/statsample/graph/gdchart.rb +45 -0
- data/lib/statsample/graph/svgboxplot.rb +108 -0
- data/lib/statsample/graph/svggraph.rb +181 -0
- data/lib/statsample/graph/svghistogram.rb +208 -0
- data/lib/statsample/graph/svgscatterplot.rb +111 -0
- data/lib/statsample/htmlreport.rb +232 -0
- data/lib/statsample/multiset.rb +281 -0
- data/lib/statsample/regression.rb +522 -0
- data/lib/statsample/reliability.rb +235 -0
- data/lib/statsample/resample.rb +20 -0
- data/lib/statsample/srs.rb +159 -0
- data/lib/statsample/test.rb +25 -0
- data/lib/statsample/vector.rb +759 -0
- data/test/_test_chart.rb +58 -0
- data/test/test_anova.rb +31 -0
- data/test/test_codification.rb +59 -0
- data/test/test_crosstab.rb +55 -0
- data/test/test_csv.csv +7 -0
- data/test/test_csv.rb +27 -0
- data/test/test_dataset.rb +293 -0
- data/test/test_ggobi.rb +42 -0
- data/test/test_multiset.rb +98 -0
- data/test/test_regression.rb +108 -0
- data/test/test_reliability.rb +32 -0
- data/test/test_resample.rb +23 -0
- data/test/test_srs.rb +14 -0
- data/test/test_statistics.rb +152 -0
- data/test/test_stratified.rb +19 -0
- data/test/test_svg_graph.rb +63 -0
- data/test/test_vector.rb +265 -0
- data/test/test_xls.rb +32 -0
- metadata +158 -0
@@ -0,0 +1,338 @@
|
|
1
|
+
module Statsample
|
2
|
+
# Creates and dumps Datasets on a database
|
3
|
+
module Database
|
4
|
+
require 'dbi'
|
5
|
+
class << self
|
6
|
+
# Runs +query+ on a database handle and returns the result as a Dataset.
#
# One vector is created per result column. Columns whose reported type
# name is INTEGER or DOUBLE are typed :scale; every other column is
# typed :nominal. The statement handle is always released with +finish+,
# even when building the dataset fails.
#
# dbh::   handle responding to +execute+ (e.g. a DBI database handle)
# query:: SQL text passed unchanged to +dbh.execute+
#
# USE:
#
#  dbh = DBI.connect("DBI:Mysql:database:localhost", "user", "password")
#  Statsample::Database.read(dbh, "SELECT * FROM test")
#
def read(dbh,query)
  sth=dbh.execute(query)
  begin
    vectors={}
    fields=[]
    sth.column_info.each do |column|
      name=column['name']
      vector=Statsample::Vector.new([])
      vector.type=(%w{INTEGER DOUBLE}.include?(column['type_name']) ? :scale : :nominal)
      vectors[name]=vector
      fields.push(name)
    end
    ds=Statsample::Dataset.new(vectors,fields)
    # NOTE(review): the +false+ argument presumably postpones the valid-data
    # refresh until update_valid_data below - confirm against Dataset#add_case
    sth.fetch do |row|
      ds.add_case(row.to_a, false)
    end
    ds.update_valid_data
    ds
  ensure
    # Release the statement even if reading the rows failed
    sth.finish
  end
end
|
29
|
+
# Inserts every case of the Dataset into the given table.
#
# A single parameterized INSERT statement is prepared, using the dataset
# fields as column names, and executed once per case. The prepared
# statement is always released with +finish+, also when an insert fails.
#
# ds::    dataset responding to +fields+ and +each_array+
# dbh::   handle responding to +prepare+ (e.g. a DBI database handle)
# table:: table name, interpolated as-is into the SQL text
#
# USE:
#
#  ds={'id'=>[1,2,3].to_vector, 'name'=>["a","b","c"].to_vector}.to_dataset
#  dbh = DBI.connect("DBI:Mysql:database:localhost", "user", "password")
#  Statsample::Database.insert(ds,dbh,"test")
#
def insert(ds, dbh,table)
  columns=ds.fields
  placeholders=(["?"]*columns.size).join(",")
  query="INSERT INTO #{table} (#{columns.join(",")}) VALUES (#{placeholders})"
  sth=dbh.prepare(query)
  begin
    ds.each_array do |c|
      sth.execute(*c)
    end
  ensure
    # Release the prepared statement even if one of the inserts failed
    sth.finish
  end
end
|
44
|
+
# Builds a CREATE TABLE statement based on a given Dataset.
#
# Each field becomes one column whose SQL type is whatever the field's
# vector reports through +db_type+.
#
# ds::      dataset responding to +fields+ and +[]+
# table::   table name
# charset:: value for the CHARACTER SET clause
#
# USE:
#
#  ds={'id'=>[1,2,3,4,5].to_vector,'name'=>%w{Alex Peter Susan Mary John}.to_vector}.to_dataset
#  Statsample::Database.create_sql(ds,'names')
#  ==>"CREATE TABLE names (id INTEGER,\n name VARCHAR (255)) CHARACTER SET=UTF8;"
#
def create_sql(ds,table,charset="UTF8")
  column_defs=ds.fields.map do |name|
    column=ds[name]
    name+" "+column.db_type
  end
  "CREATE TABLE #{table} (" + column_defs.join(",\n ") + ") CHARACTER SET=#{charset};"
end
|
60
|
+
end
|
61
|
+
end
|
62
|
+
module Mondrian
  class << self
    # Writes +dataset+ to +filename+ as tab-separated text.
    #
    # The first line holds the field names; each following line holds one
    # case. A value that the field's vector reports as not valid
    # (+is_valid?+) is written as an empty string.
    def write(dataset,filename)
      File.open(filename,"wb") do |fp|
        fp.puts dataset.fields.join("\t")
        dataset.each do |row|
          cells=dataset.fields.map do |field|
            dataset[field].is_valid?(row[field]) ? row[field] : ""
          end
          fp.puts(cells.join("\t"))
        end
      end
    end
  end
end
|
81
|
+
module Excel
|
82
|
+
class << self
|
83
|
+
# Saves +dataset+ as an xls file using the spreadsheet library.
#
# Row 0 receives the field names and a blue, bold default format; every
# case yielded by +each_array+ is appended on the following rows.
def write(dataset,filename)
  require 'spreadsheet'
  workbook = Spreadsheet::Workbook.new
  worksheet = workbook.create_worksheet
  header_format = Spreadsheet::Format.new(:color => :blue, :weight => :bold)
  worksheet.row(0).concat(dataset.fields)
  worksheet.row(0).default_format = header_format
  row_number = 0
  dataset.each_array do |values|
    row_number += 1
    worksheet.row(row_number).concat(values)
  end
  workbook.write(filename)
end
|
98
|
+
# Returns a Dataset built from one worksheet of an xls file.
#
# The first row that is not skipped is used as the header: field names
# are lower-cased and must be unique. Every following row becomes a case;
# rows shorter than the header are padded with nil.
#
# filename::     path of the xls file
# worksheet_id:: worksheet selector, passed to the workbook's +worksheet+
# ignore_lines:: number of leading rows to skip before the header
# empty::        cell values (after +to_s+) that are stored as nil
#
# Raises RuntimeError when the header has repeated names or when no
# header row could be read.
#
# USE:
#  ds = Statsample::Excel.read("test.xls")
#
def read(filename, worksheet_id=0, ignore_lines=0, empty=[''])
  require 'spreadsheet'
  fields=[]
  ds=nil
  line_number=0
  book = Spreadsheet.open filename
  sheet= book.worksheet worksheet_id
  sheet.each do |row|
    line_number+=1
    next if line_number<=ignore_lines
    # This should be fixed.
    # If we have a Formula, it should be resolved first
    row.collect!{|c|
      c.is_a?(Spreadsheet::Formula) ? nil : c.to_s
    }
    if ds.nil?
      # Header row
      fields=row.to_a.collect{|c| c.downcase}
      if fields.size!=fields.uniq.size
        counts=fields.inject({}) {|a,v| a[v]=(a[v]||0)+1; a}
        repeated=counts.find_all{|k,v| v>1}.collect{|k,v| k}.join(",")
        raise "There are some repeated fields on the header:#{repeated}. Please, fix"
      end
      ds=Statsample::Dataset.new(fields)
    else
      rowa=row.to_a.collect{|c|
        empty.include?(c) ? nil : c
      }
      # Pad short rows so every case has one value per field
      (fields.size - rowa.size).times { rowa << nil }
      ds.add_case(rowa,false)
    end
  end
  # Without a header there is no dataset to return
  raise "No header row found in #{filename}" if ds.nil?
  ds.update_valid_data
  ds
end
|
150
|
+
end
|
151
|
+
end
|
152
|
+
module CSV
|
153
|
+
class << self
|
154
|
+
# Returns a Dataset based on a csv file.
#
# The first line that is not skipped is used as the header: field names
# are lower-cased and must be unique. Every following line becomes a case.
#
# filename::     path of the csv file
# empty::        values (after +to_s+) that are stored as nil
# ignore_lines:: number of leading lines to skip before the header
# fs, rs::       passed unchanged to ::CSV.open
#
# Raises RuntimeError when the header has repeated names or when no
# header line could be read.
#
# USE:
#  ds=Statsample::CSV.read("test_csv.csv")
def read(filename, empty=[''],ignore_lines=0,fs=nil,rs=nil)
  require 'csv'
  fields=[]
  ds=nil
  line_number=0
  # NOTE(review): the positional fs/rs arguments match the Ruby 1.8
  # CSV.open(path, mode, fs, rs) signature; later CSV versions take an
  # options hash instead - confirm the targeted Ruby version.
  ::CSV.open(filename,'r',fs,rs) do |row|
    line_number+=1
    next if line_number<=ignore_lines
    row.collect!{|c| c.to_s }
    if ds.nil?
      # Header line
      fields=row.to_a.collect{|c| c.downcase}
      if fields.size!=fields.uniq.size
        counts=fields.inject({}) {|a,v| a[v]=(a[v]||0)+1; a}
        repeated=counts.find_all{|k,v| v>1}.collect{|k,v| k}.join(",")
        raise "There are some repeated fields on the header:#{repeated}. Please, fix"
      end
      ds=Statsample::Dataset.new(fields)
    else
      rowa=row.to_a.collect{|c|
        empty.include?(c) ? nil : c
      }
      ds.add_case(rowa,false)
    end
  end
  # Without a header there is no dataset to return
  raise "No header row found in #{filename}" if ds.nil?
  ds.update_valid_data
  ds
end
|
196
|
+
# Saves a Dataset on a csv file.
#
# The first line holds the field names, followed by one line per case.
#
# dataset::       dataset responding to +fields+ and +each_array+
# filename::      path of the file to write
# convert_comma:: when true, every value is converted to a String and
#                 each "." in it is replaced by ","
# opts::          extra arguments passed unchanged to ::CSV.open
#
# The csv library is loaded here as well, so +write+ works even when
# +read+ was never called. The writer is closed even if a row fails.
# Returns nil.
#
# USE:
#  Statsample::CSV.write(ds,"test_csv.csv")
def write(dataset,filename, convert_comma=false,*opts)
  require 'csv'
  writer=::CSV.open(filename,'w',*opts)
  begin
    writer << dataset.fields
    dataset.each_array do |row|
      row.collect!{|v| v.to_s.gsub(".",",")} if convert_comma
      writer << row
    end
  ensure
    writer.close
  end
  nil
end
|
211
|
+
end
|
212
|
+
end
|
213
|
+
module Mx
  class << self
    # Writes +dataset+ to +filename+ as a text data file and prints
    # "Writing MX File" on standard output.
    #
    # The file starts with two "!" comment lines, a "Data" line with the
    # number of fields and cases, and a "Labels" line. What follows
    # depends on +type+:
    # :raw::        a "Rectangular" section with one tab-separated line
    #               per case; values the field's vector reports as not
    #               valid are written as "."
    # :covariance:: a " CMatrix Full" section with the covariance matrix
    #               from Statsample::Bivariate, three decimals per cell
    #               and "." for nil cells
    # Any other +type+ writes only the header lines.
    def write(dataset,filename,type=:covariance)
      puts "Writing MX File"
      File.open(filename,"w") do |fp|
        fp.puts "! #{filename}"
        fp.puts "! Output generated by Statsample"
        fp.puts "Data Ninput=#{dataset.fields.size} Nobservations=#{dataset.cases}"
        fp.puts "Labels "+dataset.fields.join(" ")
        case type
        when :raw
          fp.puts "Rectangular"
          dataset.each do |row|
            cells=dataset.fields.map {|field|
              dataset[field].is_valid?(row[field]) ? row[field] : "."
            }
            fp.puts cells.join("\t")
          end
          fp.puts "End Rectangular"
        when :covariance
          fp.puts " CMatrix Full"
          cm=Statsample::Bivariate.covariance_matrix(dataset)
          lines=(0...cm.row_size).map {|i|
            (0...cm.column_size).map {|j|
              cell=cm[i,j]
              cell.nil? ? "." : sprintf("%0.3f", cell)
            }.join(" ")
          }
          fp.puts lines.join("\n")
        end
      end
    end
  end
end
|
250
|
+
module GGobi
  class << self
    # Writes the XML document produced by +out+ to +filename+.
    def write(dataset,filename,opt={})
      File.open(filename,"w") {|fp|
        fp.write(out(dataset,opt))
      }
    end
    # Returns a String with +dataset+ serialized as a ggobidata XML document.
    #
    # dataset:: dataset responding to +fields+, +vectors+, +cases+ and
    #           +each_array+
    # opt::     :dataname (default "Default") and :description (default "")
    #
    # Variable definitions are emitted in +fields+ order. The record
    # columns come from +each_array+, which is assumed to yield values in
    # that same order (the +fields.index+ lookup below relies on it).
    # Categorical values are replaced in each record by their level number.
    # Text placed in the document is XML-escaped.
    def out(dataset,opt={})
      require 'ostruct'
      default_opt = {:dataname => "Default", :description=>""}
      default_opt.merge! opt
      # The carrier collects, while the variables are defined, which fields
      # are categorical and how their values map to level numbers
      carrier=OpenStruct.new
      carrier.categorials=[]
      carrier.conversions={}
      variables_def=dataset.fields.collect{|field|
        variable_definition(carrier,dataset.vectors[field],field)
      }.join("\n")

      # column index => field name, for every categorical field
      indexes=carrier.categorials.inject({}) {|s,c|
        s[dataset.fields.index(c)]=c
        s
      }
      records=""
      dataset.each_array {|c|
        indexes.each{|ik,iv|
          c[ik]=carrier.conversions[iv][c[ik]]
        }
        records << "<record>#{values_definition(c)}</record>\n"
      }

      out=<<EOC
<?xml version="1.0"?>
<!DOCTYPE ggobidata SYSTEM "ggobi.dtd">
<ggobidata count="1">
<data name="#{xml_escape(default_opt[:dataname])}">
<description>#{xml_escape(default_opt[:description])}</description>
<variables count="#{dataset.fields.size}">
#{variables_def}
</variables>
<records count="#{dataset.cases}">
#{records}
</records>

</data>
</ggobidata>
EOC

      out
    end
    # Returns the XML elements for the values of one record, separated by
    # a space: <real> for Float, <int> for Integer and <string> (escaped)
    # for anything else.
    def values_definition(c)
      c.collect{|v|
        case v
        when Float
          "<real>#{v}</real>"
        when Integer
          "<int>#{v}</int>"
        else
          "<string>#{xml_escape(v)}</string>"
        end
      }.join(" ")
    end
    # Outputs a string for a variable definition
    # carrier = object collecting categorical field names (+categorials+)
    #           and value => level number maps (+conversions+)
    # v = vector
    # name = name of the variable
    # nickname = nickname
    #
    # A vector is categorical when its type is :nominal or its data holds
    # a String; otherwise it is real when its data holds a Float, and
    # integer in any other case.
    def variable_definition(carrier,v,name,nickname=nil)
      nickname = (nickname.nil? ? "" : "nickname=\"#{xml_escape(nickname)}\"" )
      if v.type==:nominal || v.data.any? {|d| d.is_a? String }
        carrier.categorials.push(name)
        carrier.conversions[name]={}
        factors=v.data.uniq.sort
        levels=[]
        factors.each_with_index do |factor,i|
          # Level numbers start at 1
          carrier.conversions[name][factor]=i+1
          levels << "<level value=\"#{i+1}\">#{xml_escape(v.labeling(factor))}</level>"
        end
        out ="<categoricalvariable name=\"#{xml_escape(name)}\" #{nickname}>\n"
        out << "<levels count=\"#{factors.size}\">\n"
        out << levels.join("\n")
        out << "</levels>\n</categoricalvariable>\n"
        out
      elsif v.data.any? {|d| d.is_a? Float}
        "<realvariable name=\"#{xml_escape(name)}\" #{nickname} />"
      else
        "<integervariable name=\"#{xml_escape(name)}\" #{nickname} />"
      end
    end

    private

    # Escapes the characters that would break XML text or a double-quoted
    # attribute value.
    def xml_escape(value)
      value.to_s.gsub("&","&amp;").gsub("<","&lt;").gsub(">","&gt;").gsub('"',"&quot;")
    end
  end
end
|
338
|
+
end
|
@@ -0,0 +1,122 @@
|
|
1
|
+
module Statsample
  # Class to create a crosstab of data.
  # With this, you can create reports and do a chi square test.
  # The first vector will be at the rows and the second will be at the columns.
  #
  class Crosstab
    attr_reader :v_rows, :v_cols
    # v1:: Vector tabulated on the rows
    # v2:: Vector tabulated on the columns
    #
    # Raises ArgumentError unless both arguments are Vectors (subclasses
    # included) of the same size.
    def initialize(v1,v2)
      raise ArgumentError, "Both arguments should be Vectors" unless v1.is_a?(Vector) && v2.is_a?(Vector)
      raise ArgumentError, "Vectors should be the same size" unless v1.size==v2.size
      @v_rows,@v_cols=v1,v2
    end
    # Sorted factors of the rows vector
    def rows_names
      @v_rows.factors.sort
    end
    # Sorted factors of the columns vector
    def cols_names
      @v_cols.factors.sort
    end
    # Frequencies of the rows vector
    def rows_total
      @v_rows.frequencies
    end
    # Frequencies of the columns vector
    def cols_total
      @v_cols.frequencies
    end
    # Hash with one entry per [row, col] pair. Every combination of row
    # and column names starts at 0 and is then updated with the
    # frequencies of the pairs built from both vectors.
    def frequencies
      cn=cols_names
      base={}
      rows_names.each do |row|
        cn.each {|col| base[[row,col]]=0 }
      end
      base.update(Statsample::vector_cols_matrix(@v_rows,@v_cols).to_a.to_vector.frequencies)
    end
    # Matrix of frequencies, rows and columns in sorted name order
    def to_matrix
      f=frequencies
      rn=rows_names
      cn=cols_names
      Matrix.rows(rn.collect{|row|
        cn.collect{|col| f[[row,col]]}
      })
    end
    # Nested hash: row name => {col name => frequency}
    def frequencies_by_row
      f=frequencies
      rows_names.inject({}){|sr,row|
        sr[row]=cols_names.inject({}) {|sc,col|
          sc[col]=f[[row,col]]
          sc
        }
        sr
      }
    end
    # Nested hash: col name => {row name => frequency}
    def frequencies_by_col
      f=frequencies
      cols_names.inject({}){|sc,col|
        sc[col]=rows_names.inject({}) {|sr,row|
          sr[row]=f[[row,col]]
          sr
        }
        sc
      }
    end
    # Chi square, based on expected and real matrix
    def chi_square
      require 'statsample/test'
      Statsample::Test.chi_square(self.to_matrix,matrix_expected)
    end
    # Matrix of expected frequencies: (row total * column total) / size
    # of the rows vector. Useful to obtain chi square.
    def matrix_expected
      rn=rows_names
      cn=cols_names
      rt=rows_total
      ct=cols_total
      t=@v_rows.size.to_f
      m=rn.collect{|row|
        cn.collect{|col|
          (rt[row]*ct[col]) / t
        }
      }
      Matrix.rows(m)
    end
    # Text table with the frequencies, a Total column and a Total row.
    # The frequency table is computed once and reused for both the column
    # widths and the cells.
    def to_s
      fq=frequencies
      rn=rows_names
      cn=cols_names
      total=0
      total_cols=cn.inject({}) {|a,x| a[x]=0;a}
      # Row label column is at least 6 wide (presumably so " Total " fits)
      max_row_size=rn.collect{|x| @v_rows.labeling(x).size}.push(6).max
      # Column width: widest column label or widest frequency
      label_sizes=cn.collect{|x| @v_cols.labeling(x).size}
      value_sizes=fq.values.collect{|x| x.to_s.size}
      max_col_size=(label_sizes+value_sizes).push(0).max

      out=""
      out << " " * (max_row_size+2) << "|" << cn.collect{|c| name=@v_cols.labeling(c); " "+name+(" "*(max_col_size-name.size))+" "}.join("|") << "| Total\n"
      linea="-" * (max_row_size+2) << "|" << ("-"*(max_col_size+2) +"|")*cn.size << "-"*7 << "\n"
      out << linea
      rn.each{|row|
        total_row=0
        name=@v_rows.labeling(row)
        out << " " +name << " "*(max_row_size-name.size) << " | "
        cn.each{|col|
          count=fq[[row,col]]
          data=count.to_s
          total_row+=count
          total+=count
          total_cols[col]+=count
          out << " " << data << " "*(max_col_size-data.size) << "| "
        }
        out << " " << total_row.to_s
        out << "\n"
      }
      out << linea
      out << " Total " << " "*(max_row_size-5) << "| "
      cn.each{|v|
        data=total_cols[v].to_s
        out << " " << data << " "*(max_col_size-data.size) << "| "
      }
      out << " " << total.to_s
      out
    end
  end
end
|