statsample 0.16.0 → 0.17.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data.tar.gz.sig +0 -0
- data/History.txt +7 -0
- data/Manifest.txt +6 -4
- data/README.txt +5 -1
- data/Rakefile +1 -1
- data/examples/boxplot.rb +17 -0
- data/examples/dominance_analysis_bootstrap.rb +5 -0
- data/examples/histogram.rb +14 -0
- data/examples/scatterplot.rb +4 -3
- data/lib/distribution/normalbivariate.rb +1 -1
- data/lib/statsample.rb +16 -3
- data/lib/statsample/bivariate.rb +4 -2
- data/lib/statsample/converter/csv.rb +0 -2
- data/lib/statsample/converters.rb +13 -1
- data/lib/statsample/dataset.rb +23 -15
- data/lib/statsample/dominanceanalysis.rb +3 -2
- data/lib/statsample/dominanceanalysis/bootstrap.rb +2 -1
- data/lib/statsample/factor/parallelanalysis.rb +1 -1
- data/lib/statsample/factor/principalaxis.rb +1 -1
- data/lib/statsample/graph.rb +2 -0
- data/lib/statsample/graph/boxplot.rb +234 -0
- data/lib/statsample/graph/histogram.rb +133 -0
- data/lib/statsample/graph/scatterplot.rb +1 -9
- data/lib/statsample/histogram.rb +47 -11
- data/lib/statsample/mle.rb +4 -4
- data/lib/statsample/mle/normal.rb +3 -3
- data/lib/statsample/regression/multiple/baseengine.rb +1 -1
- data/lib/statsample/regression/multiple/gslengine.rb +0 -1
- data/lib/statsample/regression/multiple/matrixengine.rb +1 -1
- data/lib/statsample/reliability.rb +1 -0
- data/lib/statsample/reliability/scaleanalysis.rb +3 -51
- data/lib/statsample/reliability/skillscaleanalysis.rb +93 -0
- data/lib/statsample/srs.rb +1 -1
- data/lib/statsample/test/umannwhitney.rb +1 -1
- data/lib/statsample/vector.rb +13 -36
- data/test/test_factor.rb +1 -1
- data/test/test_ggobi.rb +0 -5
- data/test/test_histogram.rb +75 -18
- data/test/test_mle.rb +0 -44
- data/test/test_reliability_skillscale.rb +41 -0
- data/test/test_statistics.rb +3 -3
- data/test/test_stest.rb +2 -2
- data/test/test_vector.rb +13 -8
- metadata +36 -18
- metadata.gz.sig +0 -0
- data/lib/statsample/combination.rb +0 -114
- data/lib/statsample/permutation.rb +0 -98
- data/test/test_combination.rb +0 -37
- data/test/test_permutation.rb +0 -42
data.tar.gz.sig
CHANGED
Binary file
|
data/History.txt
CHANGED
@@ -1,3 +1,9 @@
|
|
1
|
+
=== 0.17.0 / 2010-12-09
|
2
|
+
* Added Statsample::Graph::Histogram and Statsample::Graph::Boxplot
|
3
|
+
* Added Statsample::Reliability::SkillScaleAnalysis for analysis of skill based scales.
|
4
|
+
* Deleted combination and permutation classes. Backport for ruby 1.8.7 widely available
|
5
|
+
* Deleted unused variables (thanks, ruby-head)
|
6
|
+
|
1
7
|
=== 0.16.0 / 2010-11-13
|
2
8
|
* Works on ruby 1.9.2 and HEAD. Updated Rakefile and manifest
|
3
9
|
* Removed all graph based on Svg::Graph.
|
@@ -6,6 +12,7 @@
|
|
6
12
|
* Added reference on references.txt
|
7
13
|
* Ruby-based random gaussian distribution generator when gsl not available
|
8
14
|
* Added population average deviation [Al Chou]
|
15
|
+
|
9
16
|
=== 0.15.1 / 2010-10-20
|
10
17
|
* Statsample::Excel and Statsample::PlainText add name to vectors equal to field name
|
11
18
|
* Statsample::Dataset.delete_vector accept multiple fields.
|
data/Manifest.txt
CHANGED
@@ -12,10 +12,12 @@ data/test_binomial.csv
|
|
12
12
|
data/tetmat_matrix.txt
|
13
13
|
data/tetmat_test.txt
|
14
14
|
doc_latex/manual/equations.tex
|
15
|
+
examples/boxplot.rb
|
15
16
|
examples/correlation_matrix.rb
|
16
17
|
examples/dataset.rb
|
17
18
|
examples/dominance_analysis.rb
|
18
19
|
examples/dominance_analysis_bootstrap.rb
|
20
|
+
examples/histogram.rb
|
19
21
|
examples/icc.rb
|
20
22
|
examples/levene.rb
|
21
23
|
examples/multiple_regression.rb
|
@@ -47,7 +49,6 @@ lib/statsample/anova/twoway.rb
|
|
47
49
|
lib/statsample/bivariate.rb
|
48
50
|
lib/statsample/bivariate/pearson.rb
|
49
51
|
lib/statsample/codification.rb
|
50
|
-
lib/statsample/combination.rb
|
51
52
|
lib/statsample/converter/csv.rb
|
52
53
|
lib/statsample/converter/spss.rb
|
53
54
|
lib/statsample/converters.rb
|
@@ -62,6 +63,8 @@ lib/statsample/factor/pca.rb
|
|
62
63
|
lib/statsample/factor/principalaxis.rb
|
63
64
|
lib/statsample/factor/rotation.rb
|
64
65
|
lib/statsample/graph.rb
|
66
|
+
lib/statsample/graph/boxplot.rb
|
67
|
+
lib/statsample/graph/histogram.rb
|
65
68
|
lib/statsample/graph/scatterplot.rb
|
66
69
|
lib/statsample/histogram.rb
|
67
70
|
lib/statsample/matrix.rb
|
@@ -70,7 +73,6 @@ lib/statsample/mle/logit.rb
|
|
70
73
|
lib/statsample/mle/normal.rb
|
71
74
|
lib/statsample/mle/probit.rb
|
72
75
|
lib/statsample/multiset.rb
|
73
|
-
lib/statsample/permutation.rb
|
74
76
|
lib/statsample/regression.rb
|
75
77
|
lib/statsample/regression/binomial.rb
|
76
78
|
lib/statsample/regression/binomial/logit.rb
|
@@ -86,6 +88,7 @@ lib/statsample/reliability.rb
|
|
86
88
|
lib/statsample/reliability/icc.rb
|
87
89
|
lib/statsample/reliability/multiscaleanalysis.rb
|
88
90
|
lib/statsample/reliability/scaleanalysis.rb
|
91
|
+
lib/statsample/reliability/skillscaleanalysis.rb
|
89
92
|
lib/statsample/resample.rb
|
90
93
|
lib/statsample/rserve_extension.rb
|
91
94
|
lib/statsample/srs.rb
|
@@ -111,7 +114,6 @@ test/test_anovawithvectors.rb
|
|
111
114
|
test/test_bartlettsphericity.rb
|
112
115
|
test/test_bivariate.rb
|
113
116
|
test/test_codification.rb
|
114
|
-
test/test_combination.rb
|
115
117
|
test/test_crosstab.rb
|
116
118
|
test/test_csv.csv
|
117
119
|
test/test_csv.rb
|
@@ -126,10 +128,10 @@ test/test_logit.rb
|
|
126
128
|
test/test_matrix.rb
|
127
129
|
test/test_mle.rb
|
128
130
|
test/test_multiset.rb
|
129
|
-
test/test_permutation.rb
|
130
131
|
test/test_regression.rb
|
131
132
|
test/test_reliability.rb
|
132
133
|
test/test_reliability_icc.rb
|
134
|
+
test/test_reliability_skillscale.rb
|
133
135
|
test/test_resample.rb
|
134
136
|
test/test_rserve_extension.rb
|
135
137
|
test/test_srs.rb
|
data/README.txt
CHANGED
@@ -21,6 +21,7 @@ Include:
|
|
21
21
|
* Sample calculation related formulas
|
22
22
|
* Structural Equation Modeling (SEM), using R libraries +sem+ and +OpenMx+
|
23
23
|
* Creates reports on text, html and rtf, using ReportBuilder gem
|
24
|
+
* Graphics: Histogram, Boxplot and Scatterplot
|
24
25
|
|
25
26
|
== FEATURES:
|
26
27
|
|
@@ -69,8 +70,11 @@ Include:
|
|
69
70
|
* Statsample::Test::UMannWhitney
|
70
71
|
* Statsample::Test::T
|
71
72
|
* Statsample::Test::F
|
73
|
+
* Module Graph provides several classes to create beautiful graphs using rubyvis
|
74
|
+
* Statsample::Graph::Boxplot
|
75
|
+
* Statsample::Graph::Histogram
|
76
|
+
* Statsample::Graph::Scatterplot
|
72
77
|
* Gem +statsample-sem+ provides a DSL to R libraries +sem+ and +OpenMx+
|
73
|
-
* Interfaces to gdchart, gnuplot and SVG::Graph (experimental)
|
74
78
|
* Close integration with gem <tt>reportbuilder</tt>, to easily create reports on text, html and rtf formats.
|
75
79
|
|
76
80
|
== Examples of use:
|
data/Rakefile
CHANGED
@@ -41,7 +41,7 @@ h=Hoe.spec('statsample') do
|
|
41
41
|
#self.testlib=:minitest
|
42
42
|
self.rubyforge_name = "ruby-statsample"
|
43
43
|
self.developer('Claudio Bustos', 'clbustos@gmail.com')
|
44
|
-
self.extra_deps << ["spreadsheet","~>0.6.0"] << ["reportbuilder", "~>1.
|
44
|
+
self.extra_deps << ["spreadsheet","~>0.6.0"] << ["reportbuilder", "~>1.4"] << ["minimization", "~>0.2.0"] << ["fastercsv", ">0"] << ["dirty-memoize", "~>0.0"] << ["extendmatrix","~>0.3.1"] << ["statsample-bivariate-extension", ">0"] << ["rserve-client", "~>0.2.5"] << ["rubyvis", "~>0.3.3"]
|
45
45
|
|
46
46
|
self.extra_dev_deps << ["shoulda"] << ["minitest", "~>2.0"]
|
47
47
|
self.clean_globs << "test/images/*" << "demo/item_analysis/*" << "demo/Regression"
|
data/examples/boxplot.rb
ADDED
@@ -0,0 +1,17 @@
|
|
1
|
+
#!/usr/bin/ruby
|
2
|
+
$:.unshift(File.dirname(__FILE__)+'/../lib/')
|
3
|
+
$:.unshift('/home/cdx/dev/reportbuilder/lib/')
|
4
|
+
|
5
|
+
require 'benchmark'
|
6
|
+
require 'statsample'
|
7
|
+
n=100
|
8
|
+
a=(n-1).times.map {|i| rand()*20+50}
|
9
|
+
b=n.times.map {|i| rand()*10+50}.to_scale
|
10
|
+
c=n.times.map {|i| rand()*5+50}.to_scale
|
11
|
+
|
12
|
+
a.push(30)
|
13
|
+
a=a.to_scale
|
14
|
+
sp=Statsample::Graph::Boxplot.new(:vectors=>[a,b,c],:width=>300, :height=>300, :groups=>%w{first first second}, :minimum=>0)
|
15
|
+
rb=ReportBuilder.new
|
16
|
+
rb.add(sp)
|
17
|
+
puts rb.to_text
|
@@ -8,6 +8,11 @@ b=100.times.collect {rand}.to_scale
|
|
8
8
|
c=100.times.collect {rand}.to_scale
|
9
9
|
d=100.times.collect {rand}.to_scale
|
10
10
|
|
11
|
+
a.name="a"
|
12
|
+
b.name="b"
|
13
|
+
c.name="c"
|
14
|
+
d.name="d"
|
15
|
+
|
11
16
|
ds={'a'=>a,'b'=>b,'c'=>c,'d'=>d}.to_dataset
|
12
17
|
|
13
18
|
ds['y1']=ds.collect{|row| row['a']*5+row['b']*2+row['c']*2+row['d']*2+10*rand()}
|
@@ -0,0 +1,14 @@
|
|
1
|
+
#!/usr/bin/ruby
|
2
|
+
$:.unshift(File.dirname(__FILE__)+'/../lib/')
|
3
|
+
$:.unshift('/home/cdx/dev/reportbuilder/lib/')
|
4
|
+
|
5
|
+
require 'benchmark'
|
6
|
+
require 'statsample'
|
7
|
+
n=1000
|
8
|
+
a=n.times.map {|i| rand()*20}.to_scale
|
9
|
+
hg=Statsample::Graph::Histogram.new(a, :bins=>15)
|
10
|
+
|
11
|
+
rb=ReportBuilder.new
|
12
|
+
rb.add(a.histogram)
|
13
|
+
rb.add(hg)
|
14
|
+
puts rb.to_text
|
data/examples/scatterplot.rb
CHANGED
@@ -1,12 +1,13 @@
|
|
1
1
|
#!/usr/bin/ruby
|
2
2
|
$:.unshift(File.dirname(__FILE__)+'/../lib/')
|
3
|
+
$:.unshift('/home/cdx/dev/reportbuilder/lib/')
|
4
|
+
|
3
5
|
require 'benchmark'
|
4
6
|
require 'statsample'
|
5
7
|
n=100
|
6
8
|
a=n.times.map {|i| rand(10)+i}.to_scale
|
7
9
|
b=n.times.map {|i| rand(10)+i}.to_scale
|
8
10
|
sp=Statsample::Graph::Scatterplot.new(a,b, :width=>200, :height=>200)
|
9
|
-
rb=ReportBuilder.new
|
10
|
-
|
11
|
-
end
|
11
|
+
rb=ReportBuilder.new
|
12
|
+
rb.add(sp)
|
12
13
|
puts rb.to_text
|
@@ -220,7 +220,7 @@ module Distribution
|
|
220
220
|
asr = Math::asin(r)
|
221
221
|
(1..lg).each do |i|
|
222
222
|
[-1,1].each do |is|
|
223
|
-
sn = Math::sin(
|
223
|
+
sn = Math::sin(asr*(is* x[i][ng]+1).quo(2) )
|
224
224
|
bvn = bvn + w[i][ng] * Math::exp( ( sn*hk-hs ).quo( 1-sn*sn ) )
|
225
225
|
end # do
|
226
226
|
end # do
|
data/lib/statsample.rb
CHANGED
@@ -118,12 +118,10 @@ module Statsample
|
|
118
118
|
@@has_gsl
|
119
119
|
end
|
120
120
|
|
121
|
-
VERSION = '0.
|
121
|
+
VERSION = '0.17.0'
|
122
122
|
SPLIT_TOKEN = ","
|
123
123
|
autoload(:Database, 'statsample/converters')
|
124
124
|
autoload(:Anova, 'statsample/anova')
|
125
|
-
autoload(:Combination, 'statsample/combination')
|
126
|
-
autoload(:Permutation, 'statsample/permutation')
|
127
125
|
autoload(:CSV, 'statsample/converters')
|
128
126
|
autoload(:PlainText, 'statsample/converters')
|
129
127
|
autoload(:Excel, 'statsample/converters')
|
@@ -219,6 +217,21 @@ module Statsample
|
|
219
217
|
end
|
220
218
|
u
|
221
219
|
end
|
220
|
+
|
221
|
+
def self.nice(s,e) # :nodoc:
|
222
|
+
reverse = e<s
|
223
|
+
min = reverse ? e : s
|
224
|
+
max = reverse ? s : e
|
225
|
+
span=max-min
|
226
|
+
return [s, e] if (!span or (span.respond_to? :infinite? and span.infinite?))
|
227
|
+
|
228
|
+
step=10**((Math::log(span).quo(Math::log(10))).round - 1).to_f
|
229
|
+
out=[(min.quo(step)).floor * step, (max.quo(step)).ceil * step]
|
230
|
+
out.reverse! if reverse
|
231
|
+
out
|
232
|
+
end
|
233
|
+
|
234
|
+
|
222
235
|
end
|
223
236
|
|
224
237
|
|
data/lib/statsample/bivariate.rb
CHANGED
@@ -1,5 +1,7 @@
|
|
1
1
|
require 'statsample/bivariate/pearson'
|
2
2
|
|
3
|
+
|
4
|
+
|
3
5
|
module Statsample
|
4
6
|
# Diverse methods and classes to calculate bivariate relations
|
5
7
|
# Specific classes:
|
@@ -7,6 +9,8 @@ module Statsample
|
|
7
9
|
# * Statsample::Bivariate::Tetrachoric : Tetrachoric correlation
|
8
10
|
# * Statsample::Bivariate::Polychoric : Polychoric correlation (using joint, two-step and polychoric series)
|
9
11
|
module Bivariate
|
12
|
+
autoload(:Polychoric, 'statsample/bivariate/polychoric')
|
13
|
+
autoload(:Tetrachoric, 'statsample/bivariate/tetrachoric')
|
10
14
|
|
11
15
|
class << self
|
12
16
|
# Covariance between two vectors
|
@@ -335,6 +339,4 @@ module Statsample
|
|
335
339
|
end
|
336
340
|
end
|
337
341
|
|
338
|
-
require 'statsample/bivariate/polychoric'
|
339
|
-
require 'statsample/bivariate/tetrachoric'
|
340
342
|
|
@@ -181,8 +181,20 @@ module Statsample
|
|
181
181
|
# USE:
|
182
182
|
# ds = Statsample::Excel.read("test.xls")
|
183
183
|
#
|
184
|
-
def read(filename,
|
184
|
+
def read(filename, opts=Hash.new)
|
185
185
|
require 'spreadsheet'
|
186
|
+
opts_default={
|
187
|
+
:worksheet_id=>0,
|
188
|
+
:ignore_lines=>0,
|
189
|
+
:empty=>['']
|
190
|
+
}
|
191
|
+
|
192
|
+
opts=opts_default.merge opts
|
193
|
+
|
194
|
+
worksheet_id=opts[:worksheet_id]
|
195
|
+
ignore_lines=opts[:ignore_lines]
|
196
|
+
empty=opts[:empty]
|
197
|
+
|
186
198
|
first_row=true
|
187
199
|
fields=[]
|
188
200
|
fields_data={}
|
data/lib/statsample/dataset.rb
CHANGED
@@ -331,7 +331,7 @@ module Statsample
|
|
331
331
|
def bootstrap(n=nil)
|
332
332
|
n||=@cases
|
333
333
|
ds_boot=dup_empty
|
334
|
-
|
334
|
+
n.times do
|
335
335
|
ds_boot.add_case_array(case_as_array(rand(n)))
|
336
336
|
end
|
337
337
|
ds_boot.update_valid_data
|
@@ -418,7 +418,6 @@ module Statsample
|
|
418
418
|
# Returns a vector with the sum of fields
|
419
419
|
# if fields parameter is empty, sum all fields
|
420
420
|
def vector_sum(fields=nil)
|
421
|
-
a=[]
|
422
421
|
fields||=@fields
|
423
422
|
collect_with_index do |row, i|
|
424
423
|
if(fields.find{|f| !@vectors[f].data_with_nils[i]})
|
@@ -557,6 +556,7 @@ module Statsample
|
|
557
556
|
raise DatasetException.new(self, e)
|
558
557
|
end
|
559
558
|
end
|
559
|
+
|
560
560
|
# Returns each case as an array, coding missing values as nils
|
561
561
|
def each_array_with_nils
|
562
562
|
m=fields.size
|
@@ -586,8 +586,9 @@ module Statsample
|
|
586
586
|
@fields=f
|
587
587
|
check_order
|
588
588
|
end
|
589
|
-
|
590
|
-
|
589
|
+
# Check congruence between +fields+ attribute
|
590
|
+
# and keys on +vectors+
|
591
|
+
def check_order #:nodoc:
|
591
592
|
if(@vectors.keys.sort!=@fields.sort)
|
592
593
|
@fields=@fields&@vectors.keys
|
593
594
|
@fields+=@vectors.keys.sort-@fields
|
@@ -598,7 +599,7 @@ module Statsample
|
|
598
599
|
if i.is_a? Range
|
599
600
|
fields=from_to(i.begin,i.end)
|
600
601
|
vectors=fields.inject({}) {|a,v| a[v]=@vectors[v];a}
|
601
|
-
|
602
|
+
Dataset.new(vectors,fields)
|
602
603
|
else
|
603
604
|
raise Exception,"Vector '#{i}' doesn't exists on dataset" unless @vectors.has_key?(i)
|
604
605
|
@vectors[i]
|
@@ -613,7 +614,7 @@ module Statsample
|
|
613
614
|
}
|
614
615
|
Statsample::Vector.new(data,type)
|
615
616
|
end
|
616
|
-
# Same as
|
617
|
+
# Same as Statsample::Vector.collect, but giving case index as second parameter on yield.
|
617
618
|
def collect_with_index(type=:scale)
|
618
619
|
data=[]
|
619
620
|
each_with_index {|row, i|
|
@@ -661,14 +662,7 @@ module Statsample
|
|
661
662
|
end
|
662
663
|
end
|
663
664
|
|
664
|
-
|
665
|
-
require 'statsample/multiset'
|
666
|
-
if fields.size==1
|
667
|
-
to_multiset_by_split_one_field(fields[0])
|
668
|
-
else
|
669
|
-
to_multiset_by_split_multiple_fields(*fields)
|
670
|
-
end
|
671
|
-
end
|
665
|
+
|
672
666
|
|
673
667
|
# Create a new dataset with all cases which the block returns true
|
674
668
|
def filter
|
@@ -689,6 +683,20 @@ module Statsample
|
|
689
683
|
a.to_vector(@vectors[field].type)
|
690
684
|
end
|
691
685
|
|
686
|
+
# Creates a Statsample::Multiset, using one or more fields
|
687
|
+
# to split the dataset.
|
688
|
+
|
689
|
+
|
690
|
+
def to_multiset_by_split(*fields)
|
691
|
+
require 'statsample/multiset'
|
692
|
+
if fields.size==1
|
693
|
+
to_multiset_by_split_one_field(fields[0])
|
694
|
+
else
|
695
|
+
to_multiset_by_split_multiple_fields(*fields)
|
696
|
+
end
|
697
|
+
end
|
698
|
+
# Creates a Statsample::Multiset, using one field
|
699
|
+
|
692
700
|
def to_multiset_by_split_one_field(field)
|
693
701
|
raise ArgumentError,"Should use a correct field name" if !@fields.include? field
|
694
702
|
factors=@vectors[field].factors
|
@@ -831,7 +839,7 @@ module Statsample
|
|
831
839
|
# ]
|
832
840
|
#
|
833
841
|
def one_to_many(parent_fields, pattern)
|
834
|
-
base_pattern=pattern.gsub(/%v|%n/,"")
|
842
|
+
#base_pattern=pattern.gsub(/%v|%n/,"")
|
835
843
|
re=Regexp.new pattern.gsub("%v","(.+?)").gsub("%n","(\\d+?)")
|
836
844
|
ds_vars=parent_fields
|
837
845
|
vars=[]
|
@@ -156,8 +156,9 @@ module Statsample
|
|
156
156
|
@models=[]
|
157
157
|
@models_data={}
|
158
158
|
for i in 1..@predictors.size
|
159
|
-
c=
|
159
|
+
c=(0...@predictors.size).to_a.combination(i)
|
160
160
|
c.each do |data|
|
161
|
+
|
161
162
|
independent=data.collect {|i1| @predictors[i1] }
|
162
163
|
@models.push(independent)
|
163
164
|
if (@build_from_dataset)
|
@@ -268,7 +269,7 @@ module Statsample
|
|
268
269
|
# Get all model of size k
|
269
270
|
def md_k(k)
|
270
271
|
out=[]
|
271
|
-
models
|
272
|
+
@models.each{|m| out.push(md(m)) if m.size==k }
|
272
273
|
out
|
273
274
|
end
|
274
275
|
|
@@ -158,8 +158,9 @@ module Statsample
|
|
158
158
|
@samples_cd={}
|
159
159
|
@samples_gd={}
|
160
160
|
@pairs=[]
|
161
|
-
c=
|
161
|
+
c=(0...@fields.size).to_a.combination(2)
|
162
162
|
c.each do |data|
|
163
|
+
p data
|
163
164
|
convert=data.collect {|i| @fields[i] }
|
164
165
|
@pairs.push(convert)
|
165
166
|
[@samples_td, @samples_cd, @samples_gd].each{|s|
|
@@ -132,7 +132,7 @@ module Factor
|
|
132
132
|
@communalities=pca.communalities(m)
|
133
133
|
@eigenvalues=pca.eigenvalues
|
134
134
|
com_sum = @communalities.inject(0) {|ac,v| ac+v}
|
135
|
-
jump=true
|
135
|
+
#jump=true
|
136
136
|
|
137
137
|
break if (com_sum-prev_sum).abs < @delta
|
138
138
|
@communalities.each_with_index do |v2,i2|
|
data/lib/statsample/graph.rb
CHANGED
@@ -0,0 +1,234 @@
|
|
1
|
+
require 'rubyvis'
|
2
|
+
module Statsample
|
3
|
+
module Graph
|
4
|
+
# = Boxplot
|
5
|
+
#
|
6
|
+
# From Wikipedia:
|
7
|
+
# In descriptive statistics, a box plot or boxplot (also known as a box-and-whisker diagram or plot) is a convenient way of graphically depicting groups of numerical data through their five-number summaries: the smallest observation (sample minimum), lower quartile (Q1), median (Q2), upper quartile (Q3), and largest observation (sample maximum). A boxplot may also indicate which observations, if any, might be considered outliers.
|
8
|
+
#
|
9
|
+
# == Usage
|
10
|
+
# === Svg output
|
11
|
+
# a=[1,2,3,4].to_scale
|
12
|
+
# b=[3,4,5,6].to_scale
|
13
|
+
# puts Statsample::Graph::Boxplot.new(:vectors=>[a,b]).to_svg
|
14
|
+
# === Using ReportBuilder
|
15
|
+
# a=[1,2,3,4].to_scale
|
16
|
+
# b=[3,4,5,6].to_scale
|
17
|
+
# rb=ReportBuilder.new
|
18
|
+
# rb.add(Statsample::Graph::Boxplot.new(:vectors=>[a,b]))
|
19
|
+
# rb.save_html('boxplot.html')
|
20
|
+
|
21
|
+
class Boxplot
|
22
|
+
include Summarizable
|
23
|
+
attr_accessor :name
|
24
|
+
# Total width of Boxplot
|
25
|
+
attr_accessor :width
|
26
|
+
# Total height of Boxplot
|
27
|
+
attr_accessor :height
|
28
|
+
# Top margin
|
29
|
+
attr_accessor :margin_top
|
30
|
+
# Bottom margin
|
31
|
+
attr_accessor :margin_bottom
|
32
|
+
# Left margin
|
33
|
+
attr_accessor :margin_left
|
34
|
+
# Right margin
|
35
|
+
attr_accessor :margin_right
|
36
|
+
# Array with assignation to groups of bars
|
37
|
+
# For example, for four vectors,
|
38
|
+
# boxplot.groups=[1,2,1,3]
|
39
|
+
# Assign same color to first and third element, and different to
|
40
|
+
# second and fourth
|
41
|
+
attr_accessor :groups
|
42
|
+
# Minimum value on y-axis. Automatically defined from data
|
43
|
+
attr_accessor :minimum
|
44
|
+
# Maximum value on y-axis. Automatically defined from data
|
45
|
+
attr_accessor :maximum
|
46
|
+
# Vectors to plot as boxplots
|
47
|
+
attr_accessor :vectors
|
48
|
+
|
49
|
+
attr_reader :x_scale, :y_scale
|
50
|
+
# Create a new Boxplot.
|
51
|
+
# Parameters: Hash of options
|
52
|
+
# * :vectors: Array of vectors
|
53
|
+
# * :groups: Array of same size as :vectors:, with name of groups
|
54
|
+
# to colorize vectors
|
55
|
+
def initialize(opts=Hash.new)
|
56
|
+
@vectors=opts.delete :vectors
|
57
|
+
raise "You should define vectors" if @vectors.nil?
|
58
|
+
|
59
|
+
opts_default={
|
60
|
+
:name=>_("Boxplot"),
|
61
|
+
:groups=>nil,
|
62
|
+
:width=>400,
|
63
|
+
:height=>300,
|
64
|
+
:margin_top=>10,
|
65
|
+
:margin_bottom=>20,
|
66
|
+
:margin_left=>20,
|
67
|
+
:margin_right=>20,
|
68
|
+
:minimum=>nil,
|
69
|
+
:maximum=>nil
|
70
|
+
}
|
71
|
+
@opts=opts_default.merge(opts)
|
72
|
+
opts_default.keys.each {|k| send("#{k}=", @opts[k]) }
|
73
|
+
end
|
74
|
+
|
75
|
+
# Returns a Rubyvis panel with boxplot
|
76
|
+
def rubyvis_panel # :nodoc:
|
77
|
+
that=self
|
78
|
+
|
79
|
+
min,max=@minimum, @maximum
|
80
|
+
|
81
|
+
min||=@vectors.map {|v| v.min}.min
|
82
|
+
max||=@vectors.map {|v| v.max}.max
|
83
|
+
|
84
|
+
|
85
|
+
|
86
|
+
margin_hor=margin_left + margin_right
|
87
|
+
margin_vert=margin_top + margin_bottom
|
88
|
+
x_scale = pv.Scale.ordinal(@vectors.size.times.map.to_a).split_banded(0, width-margin_hor, 4.0/5)
|
89
|
+
y_scale=Rubyvis::Scale.linear(min,max).range(0,height-margin_vert)
|
90
|
+
y_scale.nice
|
91
|
+
# cache data
|
92
|
+
|
93
|
+
colors=Rubyvis::Colors.category10
|
94
|
+
|
95
|
+
data=@vectors.map {|v|
|
96
|
+
out={:percentil_25=>v.percentil(25), :median=>v.median, :percentil_75=>v.percentil(75), :name=>v.name}
|
97
|
+
out[:iqr]=out[:percentil_75]-out[:percentil_25]
|
98
|
+
|
99
|
+
irq_max=out[:percentil_75]+out[:iqr]
|
100
|
+
irq_min=out[:percentil_25]-out[:iqr]
|
101
|
+
|
102
|
+
# Find the last data inside the margin
|
103
|
+
min=out[:percentil_25]
|
104
|
+
max=out[:percentil_75]
|
105
|
+
|
106
|
+
v.each {|d|
|
107
|
+
min=d if d<min and d>irq_min
|
108
|
+
max=d if d>max and d<irq_max
|
109
|
+
}
|
110
|
+
# Whiskers!
|
111
|
+
out[:low_whisker]=min
|
112
|
+
out[:high_whisker]=max
|
113
|
+
# And now, data outside whiskers
|
114
|
+
out[:outliers]=v.data_with_nils.find_all {|d|
|
115
|
+
d<min or d>max
|
116
|
+
}
|
117
|
+
out
|
118
|
+
}
|
119
|
+
|
120
|
+
|
121
|
+
|
122
|
+
vis=Rubyvis::Panel.new do |pan|
|
123
|
+
pan.width width - margin_hor
|
124
|
+
pan.height height - margin_vert
|
125
|
+
pan.bottom margin_bottom
|
126
|
+
pan.left margin_left
|
127
|
+
pan.right margin_right
|
128
|
+
pan.top margin_top
|
129
|
+
# Y axis
|
130
|
+
pan.rule do
|
131
|
+
data y_scale.ticks
|
132
|
+
bottom y_scale
|
133
|
+
stroke_style {|d| d!=0 ? "#eee" : "#000"}
|
134
|
+
label(:anchor=>'left') do
|
135
|
+
visible {|d| true}
|
136
|
+
text y_scale.tick_format
|
137
|
+
end
|
138
|
+
end
|
139
|
+
pan.rule do
|
140
|
+
bottom 0
|
141
|
+
stroke_style 'black'
|
142
|
+
end
|
143
|
+
pan.label do |l|
|
144
|
+
l.data data
|
145
|
+
l.left {|v| x_scale.scale(index)}
|
146
|
+
l.bottom -15
|
147
|
+
l.text {|v,x| v[:name]}
|
148
|
+
end
|
149
|
+
|
150
|
+
pan.panel do |bp|
|
151
|
+
bp.data data
|
152
|
+
bp.left {|v| x_scale.scale(index)}
|
153
|
+
bp.width x_scale.range_band
|
154
|
+
|
155
|
+
|
156
|
+
# Bar
|
157
|
+
bp.bar do |b|
|
158
|
+
b.bottom {|v| y_scale.scale(v[:percentil_25])}
|
159
|
+
b.height {|v| y_scale.scale(v[:percentil_75]) - y_scale.scale(v[:percentil_25]) }
|
160
|
+
b.line_width 1
|
161
|
+
b.stroke_style {|v|
|
162
|
+
if that.groups
|
163
|
+
colors.scale(that.groups[parent.index]).darker
|
164
|
+
else
|
165
|
+
colors.scale(index).darker
|
166
|
+
end
|
167
|
+
|
168
|
+
|
169
|
+
}
|
170
|
+
b.fill_style {|v|
|
171
|
+
if that.groups
|
172
|
+
colors.scale(that.groups[parent.index])
|
173
|
+
else
|
174
|
+
colors.scale(index)
|
175
|
+
end
|
176
|
+
}
|
177
|
+
end
|
178
|
+
# Median
|
179
|
+
bp.rule do |r|
|
180
|
+
r.bottom {|v| y_scale.scale(v[:median])}
|
181
|
+
r.width x_scale.range_band
|
182
|
+
r.line_width 2
|
183
|
+
end
|
184
|
+
|
185
|
+
# Whiskers
|
186
|
+
bp.rule do |r|
|
187
|
+
r.visible {|v| v[:percentil_25]>v[:low_whisker]}
|
188
|
+
r.bottom {|v| y_scale.scale(v[:low_whisker])}
|
189
|
+
end
|
190
|
+
bp.rule do |r|
|
191
|
+
r.visible {|v| v[:percentil_25]>v[:low_whisker]}
|
192
|
+
r.bottom {|v| y_scale.scale(v[:low_whisker])}
|
193
|
+
r.left {|v| x_scale.range_band / 2.0}
|
194
|
+
r.height {|v| y_scale.scale(v[:percentil_25])-y_scale.scale(v[:low_whisker])}
|
195
|
+
end
|
196
|
+
bp.rule do |r|
|
197
|
+
r.visible {|v| v[:percentil_75]<v[:high_whisker]}
|
198
|
+
r.bottom {|v| y_scale.scale(v[:high_whisker])}
|
199
|
+
end
|
200
|
+
|
201
|
+
bp.rule do |r|
|
202
|
+
r.visible {|v| v[:percentil_75]<v[:high_whisker]}
|
203
|
+
r.bottom {|v| y_scale.scale(v[:percentil_75])}
|
204
|
+
r.left {|v| x_scale.range_band / 2.0}
|
205
|
+
r.height {|v| y_scale.scale(v[:high_whisker])-y_scale.scale(v[:percentil_75])}
|
206
|
+
end
|
207
|
+
|
208
|
+
bp.dot do |dot|
|
209
|
+
dot.shape_size 4
|
210
|
+
dot.data {|v| v[:outliers]}
|
211
|
+
dot.left {|v| x_scale.range_band / 2.0}
|
212
|
+
dot.bottom {|v| y_scale.scale(v)}
|
213
|
+
dot.title {|v| v}
|
214
|
+
end
|
215
|
+
|
216
|
+
|
217
|
+
end
|
218
|
+
end
|
219
|
+
end
|
220
|
+
# Returns SVG with boxplot
|
221
|
+
def to_svg
|
222
|
+
rp=rubyvis_panel
|
223
|
+
rp.render
|
224
|
+
rp.to_svg
|
225
|
+
end
|
226
|
+
def report_building(builder) # :nodoc:
|
227
|
+
builder.section(:name=>name) do |b|
|
228
|
+
b.image(to_svg, :type=>'svg', :width=>width, :height=>height)
|
229
|
+
end
|
230
|
+
|
231
|
+
end
|
232
|
+
end
|
233
|
+
end
|
234
|
+
end
|