statsample 0.4.1 → 0.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/History.txt +4 -0
- data/Manifest.txt +3 -1
- data/lib/statsample.rb +175 -179
- data/lib/statsample/codification.rb +1 -1
- data/lib/statsample/converter/csv18.rb +56 -0
- data/lib/statsample/converter/csv19.rb +60 -0
- data/lib/statsample/converters.rb +26 -75
- data/lib/statsample/dataset.rb +38 -29
- data/lib/statsample/dominanceanalysis.rb +6 -6
- data/lib/statsample/graph/gdchart.rb +2 -1
- data/lib/statsample/graph/svggraph.rb +10 -9
- data/lib/statsample/multiset.rb +3 -3
- data/lib/statsample/regression/multiple.rb +43 -271
- data/lib/statsample/regression/multiple/baseengine.rb +235 -0
- data/lib/statsample/regression/multiple/gslengine.rb +2 -2
- data/lib/statsample/vector.rb +754 -736
- data/test/test_csv.rb +3 -4
- data/test/test_dataset.rb +22 -3
- data/test/test_distribution.rb +4 -3
- data/test/test_ggobi.rb +2 -2
- data/test/test_regression.rb +11 -2
- data/test/test_svg_graph.rb +0 -1
- data/test/test_vector.rb +50 -5
- data/test/test_xls.rb +2 -4
- metadata +5 -3
- data/test/_test_chart.rb +0 -58
@@ -0,0 +1,60 @@
|
|
1
|
+
module Statsample
|
2
|
+
class CSV < SpreadsheetBase
|
3
|
+
class << self
|
4
|
+
# Returns a Dataset based on a csv file
|
5
|
+
#
|
6
|
+
# USE:
|
7
|
+
# ds=Statsample::CSV.read("test_csv.csv")
|
8
|
+
def read(filename, empty=[''],ignore_lines=0,fs=nil,rs=nil)
|
9
|
+
require 'csv'
|
10
|
+
first_row=true
|
11
|
+
fields=[]
|
12
|
+
fields_data={}
|
13
|
+
ds=nil
|
14
|
+
line_number=0
|
15
|
+
opts={}
|
16
|
+
opts[:col_sep]=fs unless fs.nil?
|
17
|
+
opts[:row_sep]=rs unless rs.nil?
|
18
|
+
csv=::CSV.open(filename,'r',opts)
|
19
|
+
|
20
|
+
csv.each do |row|
|
21
|
+
line_number+=1
|
22
|
+
if(line_number<=ignore_lines)
|
23
|
+
#puts "Skip line"
|
24
|
+
next
|
25
|
+
end
|
26
|
+
row.collect!{|c|
|
27
|
+
c.to_s
|
28
|
+
}
|
29
|
+
if first_row
|
30
|
+
fields=extract_fields(row)
|
31
|
+
ds=Statsample::Dataset.new(fields)
|
32
|
+
first_row=false
|
33
|
+
else
|
34
|
+
rowa=process_row(row,empty)
|
35
|
+
ds.add_case(rowa,false)
|
36
|
+
end
|
37
|
+
end
|
38
|
+
convert_to_scale(ds,fields)
|
39
|
+
ds.update_valid_data
|
40
|
+
ds
|
41
|
+
end
|
42
|
+
# Save a Dataset on a csv file
|
43
|
+
#
|
44
|
+
# USE:
|
45
|
+
# Statsample::CSV.write(ds,"test_csv.csv")
|
46
|
+
def write(dataset,filename, convert_comma=false,*opts)
|
47
|
+
require 'csv'
|
48
|
+
writer=::CSV.open(filename,'w',*opts)
|
49
|
+
writer << dataset.fields
|
50
|
+
dataset.each_array{|row|
|
51
|
+
if(convert_comma)
|
52
|
+
row.collect!{|v| v.to_s.gsub(".",",")}
|
53
|
+
end
|
54
|
+
writer << row
|
55
|
+
}
|
56
|
+
writer.close
|
57
|
+
end
|
58
|
+
end
|
59
|
+
end
|
60
|
+
end
|
@@ -1,7 +1,6 @@
|
|
1
1
|
module Statsample
|
2
2
|
# Create and dumps Datasets on a database
|
3
3
|
module Database
|
4
|
-
require 'dbi'
|
5
4
|
class << self
|
6
5
|
# Read a database query and returns a Dataset
|
7
6
|
#
|
@@ -11,6 +10,7 @@ module Statsample
|
|
11
10
|
# Statsample.read(dbh, "SELECT * FROM test")
|
12
11
|
#
|
13
12
|
def read(dbh,query)
|
13
|
+
require 'dbi'
|
14
14
|
sth=dbh.execute(query)
|
15
15
|
vectors={}
|
16
16
|
fields=[]
|
@@ -35,6 +35,7 @@ module Statsample
|
|
35
35
|
# Statsample::Database.insert(ds,dbh,"test")
|
36
36
|
#
|
37
37
|
def insert(ds, dbh,table)
|
38
|
+
require 'dbi'
|
38
39
|
query="INSERT INTO #{table} ("+ds.fields.join(",")+") VALUES ("+((["?"]*ds.fields.size).join(","))+")"
|
39
40
|
sth=dbh.prepare(query)
|
40
41
|
ds.each_array{|c|
|
@@ -64,15 +65,11 @@ module Statsample
|
|
64
65
|
def write(dataset,filename)
|
65
66
|
File.open(filename,"wb") do |fp|
|
66
67
|
fp.puts dataset.fields.join("\t")
|
67
|
-
dataset.
|
68
|
-
|
69
|
-
|
70
|
-
row[f]
|
71
|
-
else
|
72
|
-
""
|
73
|
-
end
|
68
|
+
dataset.each_array_with_nils{|row|
|
69
|
+
row2=row.collect{|v|
|
70
|
+
v.nil? ? "NA" : v.to_s.gsub(/\s+/,"_")
|
74
71
|
}
|
75
|
-
fp.puts
|
72
|
+
fp.puts row2.join("\t")
|
76
73
|
}
|
77
74
|
end
|
78
75
|
end
|
@@ -194,7 +191,7 @@ module Statsample
|
|
194
191
|
first_row=false
|
195
192
|
else
|
196
193
|
rowa=process_row(row,empty)
|
197
|
-
(fields.size - rowa.size).times {
|
194
|
+
(fields.size - rowa.size).times {
|
198
195
|
rowa << nil
|
199
196
|
}
|
200
197
|
ds.add_case(rowa,false)
|
@@ -210,59 +207,6 @@ module Statsample
|
|
210
207
|
end
|
211
208
|
end
|
212
209
|
end
|
213
|
-
class CSV < SpreadsheetBase
|
214
|
-
class << self
|
215
|
-
# Returns a Dataset based on a csv file
|
216
|
-
#
|
217
|
-
# USE:
|
218
|
-
# ds=Statsample::CSV.read("test_csv.csv")
|
219
|
-
def read(filename, empty=[''],ignore_lines=0,fs=nil,rs=nil)
|
220
|
-
require 'csv'
|
221
|
-
first_row=true
|
222
|
-
fields=[]
|
223
|
-
fields_data={}
|
224
|
-
ds=nil
|
225
|
-
line_number=0
|
226
|
-
::CSV.open(filename,'r',fs,rs) do |row|
|
227
|
-
line_number+=1
|
228
|
-
if(line_number<=ignore_lines)
|
229
|
-
#puts "Skip line"
|
230
|
-
next
|
231
|
-
end
|
232
|
-
row.collect!{|c|
|
233
|
-
c.to_s
|
234
|
-
}
|
235
|
-
if first_row
|
236
|
-
fields=extract_fields(row)
|
237
|
-
ds=Statsample::Dataset.new(fields)
|
238
|
-
first_row=false
|
239
|
-
else
|
240
|
-
rowa=process_row(row,empty)
|
241
|
-
ds.add_case(rowa,false)
|
242
|
-
end
|
243
|
-
end
|
244
|
-
convert_to_scale(ds,fields)
|
245
|
-
ds.update_valid_data
|
246
|
-
ds
|
247
|
-
end
|
248
|
-
# Save a Dataset on a csv file
|
249
|
-
#
|
250
|
-
# USE:
|
251
|
-
# Statsample::CSV.write(ds,"test_csv.csv")
|
252
|
-
def write(dataset,filename, convert_comma=false,*opts)
|
253
|
-
require 'csv'
|
254
|
-
writer=::CSV.open(filename,'w',*opts)
|
255
|
-
writer << dataset.fields
|
256
|
-
dataset.each_array{|row|
|
257
|
-
if(convert_comma)
|
258
|
-
row.collect!{|v| v.to_s.gsub(".",",")}
|
259
|
-
end
|
260
|
-
writer << row
|
261
|
-
}
|
262
|
-
writer.close
|
263
|
-
end
|
264
|
-
end
|
265
|
-
end
|
266
210
|
module Mx
|
267
211
|
class << self
|
268
212
|
def write(dataset,filename,type=:covariance)
|
@@ -309,13 +253,13 @@ module Statsample
|
|
309
253
|
end
|
310
254
|
def out(dataset,opt={})
|
311
255
|
require 'ostruct'
|
312
|
-
default_opt = {:dataname => "Default", :description=>""}
|
256
|
+
default_opt = {:dataname => "Default", :description=>"", :missing=>"NA"}
|
313
257
|
default_opt.merge! opt
|
314
258
|
carrier=OpenStruct.new
|
315
259
|
carrier.categorials=[]
|
316
260
|
carrier.conversions={}
|
317
|
-
variables_def=dataset.
|
318
|
-
variable_definition(carrier,
|
261
|
+
variables_def=dataset.fields.collect{|k|
|
262
|
+
variable_definition(carrier,dataset[k],k)
|
319
263
|
}.join("\n")
|
320
264
|
|
321
265
|
indexes=carrier.categorials.inject({}) {|s,c|
|
@@ -327,7 +271,7 @@ module Statsample
|
|
327
271
|
indexes.each{|ik,iv|
|
328
272
|
c[ik]=carrier.conversions[iv][c[ik]]
|
329
273
|
}
|
330
|
-
records << "<record>#{values_definition(c)}</record>\n"
|
274
|
+
records << "<record>#{values_definition(c, default_opt[:missing])}</record>\n"
|
331
275
|
}
|
332
276
|
|
333
277
|
out=<<EOC
|
@@ -339,7 +283,7 @@ out=<<EOC
|
|
339
283
|
<variables count="#{dataset.fields.size}">
|
340
284
|
#{variables_def}
|
341
285
|
</variables>
|
342
|
-
<records count="#{dataset.cases}">
|
286
|
+
<records count="#{dataset.cases}" missingValue="#{default_opt[:missing]}">
|
343
287
|
#{records}
|
344
288
|
</records>
|
345
289
|
|
@@ -350,14 +294,14 @@ EOC
|
|
350
294
|
out
|
351
295
|
|
352
296
|
end
|
353
|
-
def values_definition(c)
|
297
|
+
def values_definition(c,missing)
|
354
298
|
c.collect{|v|
|
355
|
-
|
356
|
-
|
357
|
-
|
358
|
-
"
|
299
|
+
if v.nil?
|
300
|
+
"#{missing}"
|
301
|
+
elsif v.is_a? Numeric
|
302
|
+
"#{v}"
|
359
303
|
else
|
360
|
-
"
|
304
|
+
"#{v.gsub(/\s+/,"_")}"
|
361
305
|
end
|
362
306
|
}.join(" ")
|
363
307
|
end
|
@@ -370,7 +314,7 @@ out
|
|
370
314
|
if v.type==:nominal or v.data.find {|d| d.is_a? String }
|
371
315
|
carrier.categorials.push(name)
|
372
316
|
carrier.conversions[name]={}
|
373
|
-
factors=v.
|
317
|
+
factors=v.factors
|
374
318
|
out ="<categoricalvariable name=\"#{name}\" #{nickname}>\n"
|
375
319
|
out << "<levels count=\"#{factors.size}\">\n"
|
376
320
|
out << (1..factors.size).to_a.collect{|i|
|
@@ -389,3 +333,10 @@ out
|
|
389
333
|
end
|
390
334
|
end
|
391
335
|
end
|
336
|
+
|
337
|
+
if RUBY_VERSION<"1.9"
|
338
|
+
require 'statsample/converter/csv18.rb'
|
339
|
+
else
|
340
|
+
require 'statsample/converter/csv19.rb'
|
341
|
+
end
|
342
|
+
|
data/lib/statsample/dataset.rb
CHANGED
@@ -292,7 +292,7 @@ module Statsample
|
|
292
292
|
end
|
293
293
|
def check_length
|
294
294
|
size=nil
|
295
|
-
@vectors.each
|
295
|
+
@vectors.each do |k,v|
|
296
296
|
raise Exception, "Data #{v.class} is not a vector on key #{k}" if !v.is_a? Statsample::Vector
|
297
297
|
if size.nil?
|
298
298
|
size=v.size
|
@@ -302,38 +302,33 @@ module Statsample
|
|
302
302
|
raise Exception, "Vector #{k} have size #{v.size} and dataset have size #{size}"
|
303
303
|
end
|
304
304
|
end
|
305
|
-
}
|
306
|
-
@cases=size
|
307
305
|
end
|
308
|
-
|
309
|
-
|
310
|
-
|
311
|
-
|
306
|
+
@cases=size
|
307
|
+
end
|
308
|
+
def each_vector
|
309
|
+
@fields.each{|k| yield k,@vectors[k]}
|
310
|
+
end
|
311
|
+
if Statsample::STATSAMPLE__.respond_to?(:case_as_hash)
|
312
|
+
def case_as_hash(c) # :nodoc:
|
313
|
+
Statsample::STATSAMPLE__.case_as_hash(self,c)
|
312
314
|
end
|
313
|
-
|
314
|
-
|
315
|
-
|
316
|
-
|
317
|
-
|
318
|
-
|
319
|
-
|
320
|
-
|
315
|
+
else
|
316
|
+
def case_as_hash(c)
|
317
|
+
_case_as_hash(c)
|
318
|
+
end
|
319
|
+
end
|
320
|
+
|
321
|
+
if Statsample::STATSAMPLE__.respond_to?(:case_as_array)
|
322
|
+
def case_as_array(c) # :nodoc:
|
323
|
+
Statsample::STATSAMPLE__.case_as_array(self,c)
|
321
324
|
end
|
322
|
-
|
323
|
-
|
324
|
-
|
325
|
-
Statsample::STATSAMPLE__.case_as_array(self,c)
|
326
|
-
end
|
327
|
-
else
|
328
|
-
def case_as_array(c)
|
329
|
-
_case_as_array(c)
|
330
|
-
end
|
325
|
+
else
|
326
|
+
def case_as_array(c)
|
327
|
+
_case_as_array(c)
|
331
328
|
end
|
332
|
-
|
333
|
-
|
334
|
-
|
335
|
-
a
|
336
|
-
}
|
329
|
+
end
|
330
|
+
def _case_as_hash(c) # :nodoc:
|
331
|
+
@fields.inject({}) {|a,x| a[x]=@vectors[x][c];a }
|
337
332
|
end
|
338
333
|
def _case_as_array(c) # :nodoc:
|
339
334
|
@fields.collect {|x| @vectors[x][c]}
|
@@ -366,6 +361,20 @@ module Statsample
|
|
366
361
|
raise DatasetException.new(self,e)
|
367
362
|
end
|
368
363
|
end
|
364
|
+
# Returns each case as an array, coding missing values as nils
|
365
|
+
def each_array_with_nils
|
366
|
+
m=fields.size
|
367
|
+
@cases.times {|i|
|
368
|
+
@i=i
|
369
|
+
row=Array.new(m)
|
370
|
+
fields.each_index{|j|
|
371
|
+
f=fields[j]
|
372
|
+
row[j]=@vectors[f].data_with_nils[i]
|
373
|
+
}
|
374
|
+
yield row
|
375
|
+
}
|
376
|
+
@i=nil
|
377
|
+
end
|
369
378
|
# Returns each case as an array
|
370
379
|
def each_array
|
371
380
|
@cases.times {|i|
|
@@ -124,8 +124,8 @@ module Statsample
|
|
124
124
|
}
|
125
125
|
}
|
126
126
|
out={}
|
127
|
-
averages.each{|
|
128
|
-
out[
|
127
|
+
averages.each{|key,val|
|
128
|
+
out[key]=val.to_vector(:scale).mean
|
129
129
|
}
|
130
130
|
out
|
131
131
|
end
|
@@ -139,8 +139,8 @@ module Statsample
|
|
139
139
|
}
|
140
140
|
end
|
141
141
|
out={}
|
142
|
-
averages.each{|
|
143
|
-
out[
|
142
|
+
averages.each{|key,val|
|
143
|
+
out[key]=val.to_vector(:scale).mean
|
144
144
|
}
|
145
145
|
@general_averages=out
|
146
146
|
end
|
@@ -152,8 +152,8 @@ module Statsample
|
|
152
152
|
for i in 1..@fields.size
|
153
153
|
c=Statsample::Combination.new(i,@fields.size)
|
154
154
|
c.each{|data|
|
155
|
-
convert=data.collect {|
|
156
|
-
@fields[
|
155
|
+
convert=data.collect {|i1|
|
156
|
+
@fields[i1]
|
157
157
|
}
|
158
158
|
@models.push(convert)
|
159
159
|
ds_prev=@ds.dup(convert+[@y_var])
|
@@ -9,19 +9,20 @@ module Statsample
|
|
9
9
|
class Vector
|
10
10
|
# Creates a barchart using ruby-gdchart
|
11
11
|
def svggraph_frequencies(file, width=600, height=300, chart_type=SVG::Graph::BarNoOp, options={})
|
12
|
-
labels,
|
12
|
+
labels, data1=[],[]
|
13
13
|
self.frequencies.sort.each{|k,v|
|
14
14
|
labels.push(k.to_s)
|
15
|
-
|
15
|
+
data1.push(v)
|
16
16
|
}
|
17
17
|
options[:height]=height
|
18
18
|
options[:width]=width
|
19
19
|
options[:fields]=labels
|
20
20
|
graph = chart_type.new(options)
|
21
21
|
graph.add_data(
|
22
|
-
:data =>
|
22
|
+
:data => data1,
|
23
23
|
:title => "Frequencies"
|
24
24
|
)
|
25
|
+
|
25
26
|
File.open(file,"w") {|f|
|
26
27
|
f.puts(graph.burn)
|
27
28
|
}
|
@@ -49,7 +50,7 @@ module Statsample
|
|
49
50
|
def svggraph_boxplot(options={})
|
50
51
|
check_type :scale
|
51
52
|
options={:graph_title=>"Boxplot", :fields=>['vector'], :show_graph_title=>true}.merge! options
|
52
|
-
vx=@
|
53
|
+
vx=@valid_data.to_a.to_vector(:scale)
|
53
54
|
graph = Statsample::Graph::SvgBoxplot.new(options)
|
54
55
|
graph.add_data(:title=>"vector", :data=>@data.to_a)
|
55
56
|
graph
|
@@ -58,8 +59,8 @@ module Statsample
|
|
58
59
|
def svggraph_lag_plot(options={})
|
59
60
|
check_type :scale
|
60
61
|
options={:graph_title=>"Lag Plot", :show_graph_title=>true}.merge! options
|
61
|
-
vx=@
|
62
|
-
vy=@
|
62
|
+
vx=@valid_data[0...(@valid_data.size-1)].to_vector(:scale)
|
63
|
+
vy=@valid_data[1...@valid_data.size].to_vector(:scale)
|
63
64
|
ds={'x_minus_1'=>vx,'x'=>vy}.to_dataset
|
64
65
|
graph = Statsample::Graph::SvgScatterplot.new(ds,options)
|
65
66
|
graph.set_x('x_minus_1')
|
@@ -73,11 +74,11 @@ module Statsample
|
|
73
74
|
extend Statsample::Util
|
74
75
|
check_type :scale
|
75
76
|
options={:graph_title=>"Normal Probability Plot", :show_graph_title=>true}.merge! options
|
76
|
-
n=@
|
77
|
-
vx=(1..@
|
77
|
+
n=@valid_data.size
|
78
|
+
vx=(1..@valid_data.size).to_a.collect{|i|
|
78
79
|
Distribution::Normal.p_value(normal_order_statistic_medians(i,n))
|
79
80
|
}.to_vector(:scale)
|
80
|
-
vy=@
|
81
|
+
vy=@valid_data.sort.to_vector(:scale)
|
81
82
|
ds={'normal_order_statistics_medians'=>vx, 'ordered_response'=>vy}.to_dataset
|
82
83
|
graph = Statsample::Graph::SvgScatterplot.new(ds,options)
|
83
84
|
graph.set_x('normal_order_statistics_medians')
|
data/lib/statsample/multiset.rb
CHANGED
@@ -50,13 +50,13 @@ module Statsample
|
|
50
50
|
class StratifiedSample
|
51
51
|
class << self
|
52
52
|
# mean for an array of vectors
|
53
|
-
def mean(*
|
53
|
+
def mean(*vectors)
|
54
54
|
n_total=0
|
55
|
-
|
55
|
+
means=vectors.inject(0){|a,v|
|
56
56
|
n_total+=v.size
|
57
57
|
a+v.sum
|
58
58
|
}
|
59
|
-
|
59
|
+
means.to_f/n_total
|
60
60
|
end
|
61
61
|
|
62
62
|
def standard_error_ksd_wr(es)
|