statsample 1.5.0 → 2.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.build.sh +15 -0
- data/.gitignore +1 -0
- data/.travis.yml +19 -7
- data/CONTRIBUTING.md +33 -0
- data/History.txt +5 -0
- data/README.md +41 -53
- data/benchmarks/correlation_matrix_15_variables.rb +6 -5
- data/benchmarks/correlation_matrix_5_variables.rb +6 -5
- data/benchmarks/correlation_matrix_methods/correlation_matrix.rb +23 -26
- data/examples/boxplot.rb +17 -5
- data/examples/correlation_matrix.rb +36 -7
- data/examples/dataset.rb +25 -5
- data/examples/dominance_analysis.rb +8 -7
- data/examples/dominance_analysis_bootstrap.rb +16 -11
- data/examples/histogram.rb +16 -2
- data/examples/icc.rb +5 -6
- data/examples/levene.rb +17 -3
- data/examples/multiple_regression.rb +6 -3
- data/examples/parallel_analysis.rb +11 -6
- data/examples/polychoric.rb +26 -13
- data/examples/principal_axis.rb +8 -4
- data/examples/reliability.rb +10 -10
- data/examples/scatterplot.rb +8 -0
- data/examples/t_test.rb +7 -0
- data/examples/u_test.rb +10 -2
- data/examples/vector.rb +9 -6
- data/examples/velicer_map_test.rb +12 -8
- data/lib/statsample.rb +13 -47
- data/lib/statsample/analysis/suite.rb +1 -1
- data/lib/statsample/anova/oneway.rb +6 -6
- data/lib/statsample/anova/twoway.rb +26 -24
- data/lib/statsample/bivariate.rb +78 -61
- data/lib/statsample/bivariate/pearson.rb +2 -2
- data/lib/statsample/codification.rb +45 -32
- data/lib/statsample/converter/csv.rb +15 -53
- data/lib/statsample/converter/spss.rb +6 -5
- data/lib/statsample/converters.rb +50 -211
- data/lib/statsample/crosstab.rb +26 -25
- data/lib/statsample/daru.rb +117 -0
- data/lib/statsample/dataset.rb +70 -942
- data/lib/statsample/dominanceanalysis.rb +16 -17
- data/lib/statsample/dominanceanalysis/bootstrap.rb +26 -28
- data/lib/statsample/factor/parallelanalysis.rb +17 -19
- data/lib/statsample/factor/pca.rb +21 -20
- data/lib/statsample/factor/principalaxis.rb +3 -3
- data/lib/statsample/graph/boxplot.rb +8 -16
- data/lib/statsample/graph/histogram.rb +4 -4
- data/lib/statsample/graph/scatterplot.rb +8 -7
- data/lib/statsample/histogram.rb +128 -119
- data/lib/statsample/matrix.rb +20 -16
- data/lib/statsample/multiset.rb +39 -38
- data/lib/statsample/regression.rb +3 -3
- data/lib/statsample/regression/multiple.rb +8 -10
- data/lib/statsample/regression/multiple/alglibengine.rb +96 -89
- data/lib/statsample/regression/multiple/baseengine.rb +32 -32
- data/lib/statsample/regression/multiple/gslengine.rb +33 -36
- data/lib/statsample/regression/multiple/matrixengine.rb +7 -9
- data/lib/statsample/regression/multiple/rubyengine.rb +39 -41
- data/lib/statsample/reliability.rb +23 -25
- data/lib/statsample/reliability/icc.rb +8 -7
- data/lib/statsample/reliability/multiscaleanalysis.rb +14 -12
- data/lib/statsample/reliability/scaleanalysis.rb +58 -60
- data/lib/statsample/reliability/skillscaleanalysis.rb +34 -29
- data/lib/statsample/resample.rb +1 -1
- data/lib/statsample/shorthand.rb +29 -25
- data/lib/statsample/test/kolmogorovsmirnov.rb +5 -3
- data/lib/statsample/test/levene.rb +28 -27
- data/lib/statsample/test/t.rb +7 -9
- data/lib/statsample/test/umannwhitney.rb +28 -28
- data/lib/statsample/test/wilcoxonsignedrank.rb +45 -43
- data/lib/statsample/vector.rb +70 -1013
- data/lib/statsample/version.rb +1 -1
- data/statsample.gemspec +12 -16
- data/test/helpers_tests.rb +1 -1
- data/test/test_analysis.rb +17 -17
- data/test/test_anova_contrast.rb +6 -6
- data/test/test_anovatwowaywithdataset.rb +8 -8
- data/test/test_anovawithvectors.rb +8 -8
- data/test/test_awesome_print_bug.rb +1 -1
- data/test/test_bartlettsphericity.rb +4 -4
- data/test/test_bivariate.rb +48 -43
- data/test/test_codification.rb +33 -33
- data/test/test_crosstab.rb +9 -9
- data/test/test_dataset.rb +28 -458
- data/test/test_factor.rb +46 -38
- data/test/test_factor_pa.rb +22 -13
- data/test/test_ggobi.rb +4 -4
- data/test/test_gsl.rb +4 -4
- data/test/test_histogram.rb +3 -3
- data/test/test_matrix.rb +13 -13
- data/test/test_multiset.rb +103 -91
- data/test/test_regression.rb +57 -52
- data/test/test_reliability.rb +55 -45
- data/test/test_reliability_icc.rb +8 -8
- data/test/test_reliability_skillscale.rb +26 -24
- data/test/test_resample.rb +1 -1
- data/test/test_statistics.rb +3 -13
- data/test/test_stest.rb +9 -9
- data/test/test_stratified.rb +3 -3
- data/test/test_test_t.rb +12 -12
- data/test/test_umannwhitney.rb +2 -2
- data/test/test_vector.rb +76 -613
- data/test/test_wilcoxonsignedrank.rb +4 -4
- metadata +57 -28
- data/lib/statsample/rserve_extension.rb +0 -20
- data/lib/statsample/vector/gsl.rb +0 -106
- data/test/fixtures/repeated_fields.csv +0 -7
- data/test/fixtures/scientific_notation.csv +0 -4
- data/test/fixtures/test_csv.csv +0 -7
- data/test/fixtures/test_xls.xls +0 -0
- data/test/test_csv.rb +0 -63
- data/test/test_rserve_extension.rb +0 -42
- data/test/test_xls.rb +0 -52
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 244a3b640d29832affc12304a77aa447f4f2df0d
|
4
|
+
data.tar.gz: 08261176b4763367e65036c486e699491e209e7d
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 125beac1e267c030c6d4b70b84543a7888f3b8fcb43e6475913b00f7332dd5ce01bb6330813c3c99f5d649add677a339dc8ef366b466f2710aeaaf4bf20c0cdf
|
7
|
+
data.tar.gz: 402c505e62785e2f1eacb28b8798fed26193ed54171f7dce2a106584800ffd3de657013559c55df9e20fe5827fe6309c53f526213fc9ec23bfa2a95bf086d38e
|
data/.build.sh
ADDED
@@ -0,0 +1,15 @@
|
|
1
|
+
#!/bin/bash
|
2
|
+
|
3
|
+
git clone https://github.com/SciRuby/nmatrix.git
|
4
|
+
cd nmatrix
|
5
|
+
gem build nmatrix.gemspec
|
6
|
+
gem install nmatrix-0.1.0.gem
|
7
|
+
cd ..
|
8
|
+
rm -rf nmatrix
|
9
|
+
git clone https://github.com/v0dro/gsl-nmatrix
|
10
|
+
cd gsl-nmatrix
|
11
|
+
gem build gsl-nmatrix.gemspec
|
12
|
+
gem install gsl-nmatrix-1.17.gem
|
13
|
+
cd ..
|
14
|
+
rm -rf gsl-nmatrix
|
15
|
+
|
data/.gitignore
CHANGED
data/.travis.yml
CHANGED
@@ -1,16 +1,28 @@
|
|
1
1
|
language:
|
2
2
|
ruby
|
3
3
|
|
4
|
+
env:
|
5
|
+
- CPLUS_INCLUDE_PATH=/usr/include/atlas C_INCLUDE_PATH=/usr/include/atlas
|
6
|
+
|
4
7
|
rvm:
|
5
|
-
- 1.9.3
|
6
|
-
- 2.0
|
7
|
-
- 2.1
|
8
|
-
- 2.2
|
8
|
+
- '1.9.3'
|
9
|
+
- '2.0'
|
10
|
+
- '2.1'
|
11
|
+
- '2.2'
|
12
|
+
|
13
|
+
matrix:
|
14
|
+
fast_finish:
|
15
|
+
true
|
16
|
+
|
17
|
+
script: "bundle exec rake test"
|
18
|
+
|
19
|
+
install:
|
20
|
+
- gem install bundler
|
21
|
+
- ./.build.sh
|
22
|
+
- bundle install
|
9
23
|
|
10
|
-
script:
|
11
|
-
bundle exec rake test
|
12
|
-
|
13
24
|
before_install:
|
14
25
|
- sudo apt-get update -qq
|
26
|
+
- sudo apt-get install -qq libatlas-base-dev
|
15
27
|
- sudo apt-get install -y libgsl0-dev r-base r-base-dev
|
16
28
|
- sudo Rscript -e "install.packages(c('Rserve','irr'),,'http://cran.us.r-project.org')"
|
data/CONTRIBUTING.md
ADDED
@@ -0,0 +1,33 @@
|
|
1
|
+
# Contributing guide
|
2
|
+
|
3
|
+
## Installing statsample development dependencies
|
4
|
+
|
5
|
+
If you want to run the full test suite, you will need the latest unreleased nmatrix and gsl-nmatrix ruby gems. They will be released upstream soon, but please follow this procedure for now.
|
6
|
+
|
7
|
+
Keep in mind that NEITHER nmatrix NOR gsl-nmatrix is necessary for using statsample. They are only required for an optional speed-up.
|
8
|
+
|
9
|
+
Statsample also works with [rb-gsl](https://github.com/blackwinter/rb-gsl), though installing that will cause a problem if you have any nmatrix dependent code because narray and nmatrix have a namespace problem.
|
10
|
+
|
11
|
+
To install dependencies, execute the following commands:
|
12
|
+
|
13
|
+
`export CPLUS_INCLUDE_PATH=/usr/include/atlas`
|
14
|
+
`export C_INCLUDE_PATH=/usr/include/atlas`
|
15
|
+
`sudo apt-get update -qq`
|
16
|
+
`sudo apt-get install -qq libatlas-base-dev`
|
17
|
+
`sudo apt-get --purge remove liblapack-dev liblapack3 liblapack3gf`
|
18
|
+
`sudo apt-get install -y libgsl0-dev r-base r-base-dev`
|
19
|
+
`sudo Rscript -e "install.packages(c('Rserve','irr'),,'http://cran.us.r-project.org')"`
|
20
|
+
|
21
|
+
Then execute the .build.sh script to clone and install the latest nmatrix and gsl-nmatrix on your system:
|
22
|
+
|
23
|
+
`./.build.sh`
|
24
|
+
|
25
|
+
Then finally install remaining dependencies:
|
26
|
+
|
27
|
+
`bundle install`
|
28
|
+
|
29
|
+
And run the test suite (should be all green):
|
30
|
+
|
31
|
+
`bundle exec rake test`
|
32
|
+
|
33
|
+
If you have problems installing nmatrix, please consult the [nmatrix installation wiki](https://github.com/SciRuby/nmatrix/wiki/Installation) or the [mailing list](https://groups.google.com/forum/#!forum/sciruby-dev).
|
data/History.txt
CHANGED
@@ -1,3 +1,8 @@
|
|
1
|
+
=== 2.0.0 / 2015-06-20
|
2
|
+
* Added dependency on daru and replaced Statsample::Vector and Dataset with
|
3
|
+
Daru::Vector and Daru::DataFrame.
|
4
|
+
* NMatrix and gsl-nmatrix are used as development dependencies.
|
5
|
+
|
1
6
|
=== 1.5.0 / 2015-06-11
|
2
7
|
* Made sure all methods work properly with and without GSL.
|
3
8
|
* Statsample works with either rb-gsl or gsl-nmatrix.
|
data/README.md
CHANGED
@@ -32,18 +32,56 @@ If you need to work on Structural Equation Modeling, you could see +statsample-s
|
|
32
32
|
```bash
|
33
33
|
$ [sudo] gem install statsample-sem
|
34
34
|
```
|
35
|
+
# Testing
|
36
|
+
|
37
|
+
See CONTRIBUTING for information on testing and contributing to statsample.
|
35
38
|
|
36
39
|
# Documentation
|
37
40
|
|
38
41
|
You can see the latest documentation in [rubydoc.info](http://www.rubydoc.info/github/sciruby/statsample/master).
|
39
42
|
|
43
|
+
# Usage
|
44
|
+
|
45
|
+
## Notebooks
|
46
|
+
|
47
|
+
You can see some iruby notebooks here:
|
48
|
+
|
49
|
+
### Statistics
|
50
|
+
|
51
|
+
* [Correlation Matrix with daru and statsample](http://nbviewer.ipython.org/github/SciRuby/sciruby-notebooks/blob/master/Statistics/Correlation%20Matrix%20with%20daru%20and%20statsample.ipynb)
|
52
|
+
* [Dominance Analysis with statsample](http://nbviewer.ipython.org/github/SciRuby/sciruby-notebooks/blob/master/Statistics/Dominance%20Analysis%20with%20statsample.ipynb)
|
53
|
+
* [Reliability ICC](http://nbviewer.ipython.org/github/v0dro/sciruby-notebooks/blob/master/Statistics/Reliability%20ICC%20with%20statsample.ipynb)
|
54
|
+
* [Levene Test](http://nbviewer.ipython.org/github/v0dro/sciruby-notebooks/blob/master/Statistics/Levene%20Test.ipynb)
|
55
|
+
* [Multiple Regression](http://nbviewer.ipython.org/github/v0dro/sciruby-notebooks/blob/master/Statistics/Multiple%20Regression.ipynb)
|
56
|
+
* [Parallel Analysis on PCA](http://nbviewer.ipython.org/github/SciRuby/sciruby-notebooks/blob/master/Statistics/Parallel%20Analysis%20on%20PCA.ipynb)
|
57
|
+
* [Polychoric Analysis](http://nbviewer.ipython.org/github/SciRuby/sciruby-notebooks/blob/master/Statistics/Polychoric%20Correlation.ipynb)
|
58
|
+
* [Reliability Scale and Multiscale Analysis](https://github.com/SciRuby/sciruby-notebooks/blob/master/Statistics/Reliability%20Scale%20Analysis.ipynb)
|
59
|
+
* [Velicer MAP Test](http://nbviewer.ipython.org/github/SciRuby/sciruby-notebooks/blob/master/Statistics/Velicer%20MAP%20test.ipynb)
|
60
|
+
|
61
|
+
### Visualizations
|
62
|
+
|
63
|
+
* [Creating Boxplots with daru and statsample](http://nbviewer.ipython.org/github/SciRuby/sciruby-notebooks/blob/master/Visualization/Boxplot%20with%20daru%20and%20statsample.ipynb)
|
64
|
+
* [Creating A Histogram](http://nbviewer.ipython.org/github/SciRuby/sciruby-notebooks/blob/master/Visualization/Creating%20a%20Histogram.ipynb)
|
65
|
+
* [Creating a Scatterplot](http://nbviewer.ipython.org/github/SciRuby/sciruby-notebooks/blob/master/Visualization/Scatterplot%20with%20statsample.ipynb)
|
66
|
+
|
67
|
+
### Working with DataFrame and Vector
|
68
|
+
|
69
|
+
* [Creating Vectors and DataFrames with daru](http://nbviewer.ipython.org/github/SciRuby/sciruby-notebooks/blob/master/Data%20Analysis/Creation%20of%20Vector%20and%20DataFrame.ipynb)
|
70
|
+
* [Detailed Usage of Daru::Vector](http://nbviewer.ipython.org/github/SciRuby/sciruby-notebooks/blob/master/Data%20Analysis/Usage%20of%20Vector.ipynb)
|
71
|
+
* [Detailed Usage of Daru::DataFrame](http://nbviewer.ipython.org/github/SciRuby/sciruby-notebooks/blob/master/Data%20Analysis/Usage%20of%20DataFrame.ipynb)
|
72
|
+
* [Visualizing Data with Daru::DataFrame](http://nbviewer.ipython.org/github/SciRuby/sciruby-notebooks/blob/master/Visualization/Visualizing%20data%20with%20daru%20DataFrame.ipynb)
|
73
|
+
|
74
|
+
## Examples
|
75
|
+
|
76
|
+
See the /examples directory for some use cases. The notebooks listed above have mostly
|
77
|
+
the same examples, and they look better, so you might want to see those first.
|
78
|
+
|
40
79
|
# Description
|
41
80
|
|
42
81
|
A suite for basic and advanced statistics on Ruby. Tested on CRuby 1.9.3, 2.0.0 and 2.1.1. See `.travis.yml` for more information.
|
43
82
|
|
44
83
|
Include:
|
45
84
|
- Descriptive statistics: frequencies, median, mean, standard error, skew, kurtosis (and many others).
|
46
|
-
- Imports and exports datasets from and to Excel, CSV and plain text files.
|
47
85
|
- Correlations: Pearson's r, Spearman's rank correlation (rho), point biserial, tau a, tau b and gamma. Tetrachoric and Polychoric correlations are provided by the +statsample-bivariate-extension+ gem.
|
48
86
|
- Intra-class correlation
|
49
87
|
- Anova: generic and vector-based One-way ANOVA and Two-way ANOVA, with contrasts for One-way ANOVA.
|
@@ -75,8 +113,7 @@ Include:
|
|
75
113
|
# Features
|
76
114
|
|
77
115
|
- Classes for manipulation and storage of data:
|
78
|
-
-
|
79
|
-
- Statsample::Dataset: a group of Statsample::Vector, analog to a excel spreadsheet or a dataframe on R. The base of almost all operations on statsample.
|
116
|
+
- Uses [daru](https://github.com/v0dro/daru) for storing data and basic statistics.
|
80
117
|
- Statsample::Multiset: multiple datasets with same fields and type of vectors
|
81
118
|
- Anova module provides generic Statsample::Anova::OneWay and vector based Statsample::Anova::OneWayWithVectors. Also you can create contrast using Statsample::Anova::Contrast
|
82
119
|
- Module Statsample::Bivariate provides covariance and pearson, spearman, point biserial, tau a, tau b, gamma, tetrachoric (see Bivariate::Tetrachoric) and polychoric (see Bivariate::Polychoric) correlations. Include methods to create correlation and covariance matrices
|
@@ -100,10 +137,7 @@ Include:
|
|
100
137
|
- Statsample::DominanceAnalysis class can report dominance analysis for a sample, using uni or multivariate dependent variables
|
101
138
|
- Statsample::DominanceAnalysis::Bootstrap can execute bootstrap analysis to determine dominance stability, as recommended by Azen & Budescu (2003) link[http://psycnet.apa.org/journals/met/8/2/129/].
|
102
139
|
- Module Statsample::Codification, to help to codify open questions
|
103
|
-
- Converters to
|
104
|
-
- Statsample::Database : Can create sql to create tables, read and insert data
|
105
|
-
- Statsample::CSV : Read and write CSV files
|
106
|
-
- Statsample::Excel : Read and write Excel files
|
140
|
+
- Converters to export data:
|
107
141
|
- Statsample::Mx : Write Mx Files
|
108
142
|
- Statsample::GGobi : Write Ggobi files
|
109
143
|
- Module Statsample::Crosstab provides function to create crosstab for categorical data
|
@@ -130,52 +164,6 @@ Include:
|
|
130
164
|
- Gem <tt>statsample-glm</tt> provides you with GLM methods to work with Logistic, Poisson and Gaussian regression, using ML or IRWLS.
|
131
165
|
- Close integration with gem <tt>reportbuilder</tt>, to easily create reports on text, html and rtf formats.
|
132
166
|
|
133
|
-
# Usage
|
134
|
-
|
135
|
-
See the [examples folder](https://github.com/clbustos/statsample/tree/master/examples/) too.
|
136
|
-
|
137
|
-
## Boxplot
|
138
|
-
|
139
|
-
```ruby
|
140
|
-
require 'statsample'
|
141
|
-
|
142
|
-
ss_analysis(Statsample::Graph::Boxplot) do
|
143
|
-
n = 30
|
144
|
-
a = rnorm(n-1, 50, 10)
|
145
|
-
b = rnorm(n, 30, 5)
|
146
|
-
c = rnorm(n, 5, 1)
|
147
|
-
a.push(2)
|
148
|
-
boxplot(vectors: [a, b, c],
|
149
|
-
width: 300,
|
150
|
-
height: 300,
|
151
|
-
groups: %w{first first second},
|
152
|
-
minimum: 0)
|
153
|
-
end
|
154
|
-
|
155
|
-
Statsample::Analysis.run # Open svg file on *nix application defined
|
156
|
-
```
|
157
|
-
|
158
|
-
## Correlation matrix
|
159
|
-
|
160
|
-
```ruby
|
161
|
-
require 'statsample'
|
162
|
-
# Note R like generation of random gaussian variable
|
163
|
-
# and correlation matrix
|
164
|
-
|
165
|
-
ss_analysis("Statsample::Bivariate.correlation_matrix") do
|
166
|
-
samples = 1000
|
167
|
-
ds = data_frame(
|
168
|
-
'a' => rnorm(samples),
|
169
|
-
'b' => rnorm(samples),
|
170
|
-
'c' => rnorm(samples),
|
171
|
-
'd' => rnorm(samples))
|
172
|
-
cm = cor(ds)
|
173
|
-
summary(cm)
|
174
|
-
end
|
175
|
-
|
176
|
-
Statsample::Analysis.run_batch # Echo output to console
|
177
|
-
```
|
178
|
-
|
179
167
|
# Resources
|
180
168
|
|
181
169
|
- Source code on github :: http://github.com/sciruby/statsample
|
@@ -4,7 +4,6 @@ extend BenchPress
|
|
4
4
|
cases=250
|
5
5
|
vars=20
|
6
6
|
|
7
|
-
|
8
7
|
name "gsl matrix based vs. manual ruby correlation matrix (#{vars} vars, #{cases} cases)"
|
9
8
|
author 'Clbustos'
|
10
9
|
date '2011-01-18'
|
@@ -17,10 +16,12 @@ In this test, we test the calculation using #{vars} variables with
|
|
17
16
|
|
18
17
|
reps 200 #number of repetitions
|
19
18
|
|
20
|
-
ds=
|
21
|
-
|
22
|
-
ac
|
23
|
-
|
19
|
+
ds = Daru::DataFrame.new(
|
20
|
+
vars.times.inject({}) do |ac,v|
|
21
|
+
ac["x#{v}".to_sym]=Daru::Vector.new_with_size(cases) {rand()}
|
22
|
+
ac
|
23
|
+
end
|
24
|
+
)
|
24
25
|
|
25
26
|
measure "Statsample::Bivariate.correlation_matrix_optimized" do
|
26
27
|
Statsample::Bivariate.correlation_matrix_optimized(ds)
|
@@ -17,11 +17,12 @@ In this test, we test the calculation using #{vars} variables with
|
|
17
17
|
|
18
18
|
reps 200 #number of repetitions
|
19
19
|
|
20
|
-
|
21
|
-
|
22
|
-
ac["x#{v}"]=
|
23
|
-
ac
|
24
|
-
|
20
|
+
ds = Daru::DataFrame.new(
|
21
|
+
vars.times.inject({}) do |ac,v|
|
22
|
+
ac["x#{v}".to_sym]=Daru::Vector.new_with_size(cases) {rand()}
|
23
|
+
ac
|
24
|
+
end
|
25
|
+
)
|
25
26
|
|
26
27
|
measure "Statsample::Bivariate.correlation_matrix_optimized" do
|
27
28
|
Statsample::Bivariate.correlation_matrix_optimized(ds)
|
@@ -5,11 +5,13 @@ require 'statsample'
|
|
5
5
|
require 'benchmark'
|
6
6
|
|
7
7
|
def create_dataset(vars,cases)
|
8
|
-
ran=Distribution::Normal.rng
|
9
|
-
ds=
|
10
|
-
|
11
|
-
|
12
|
-
|
8
|
+
ran = Distribution::Normal.rng
|
9
|
+
ds = Daru::DataFrame.new(
|
10
|
+
vars.times.inject({}) do |ac,v|
|
11
|
+
ac["x#{v}".to_sym] = Daru::Vector.new_with_size(cases) {ran.call}
|
12
|
+
ac
|
13
|
+
end
|
14
|
+
)
|
13
15
|
end
|
14
16
|
|
15
17
|
def prediction_pairwise(vars,cases)
|
@@ -19,19 +21,17 @@ def prediction_optimized(vars,cases)
|
|
19
21
|
Statsample::Bivariate.prediction_optimized(vars,cases) / 10
|
20
22
|
end
|
21
23
|
|
22
|
-
|
23
|
-
|
24
24
|
if !File.exists?("correlation_matrix.ds") or File.mtime(__FILE__) > File.mtime("correlation_matrix.ds")
|
25
25
|
reps=100 #number of repetitions
|
26
26
|
ds_sizes=[5,10,30,50,100,150,200,500,1000]
|
27
27
|
ds_vars=[3,4,5,10,20,30,40]
|
28
28
|
#ds_sizes=[5,10]
|
29
29
|
#ds_vars=[3,5,20]
|
30
|
-
rs=
|
30
|
+
rs = Daru::DataFrame.new({}, order: [:cases, :vars, :time_optimized, :time_pairwise])
|
31
31
|
|
32
32
|
ds_sizes.each do |cases|
|
33
33
|
ds_vars.each do |vars|
|
34
|
-
ds=create_dataset(vars,cases)
|
34
|
+
ds = create_dataset(vars,cases)
|
35
35
|
time_optimized= Benchmark.realtime do
|
36
36
|
reps.times {
|
37
37
|
Statsample::Bivariate.correlation_matrix_optimized(ds)
|
@@ -40,36 +40,33 @@ ds_sizes.each do |cases|
|
|
40
40
|
end
|
41
41
|
|
42
42
|
time_pairwise= Benchmark.realtime do
|
43
|
-
reps.times {
|
44
|
-
Statsample::Bivariate.correlation_matrix_pairwise(ds)
|
45
|
-
}
|
43
|
+
reps.times { Statsample::Bivariate.correlation_matrix_pairwise(ds) }
|
46
44
|
end
|
47
45
|
|
48
46
|
puts "Cases:#{cases}, vars:#{vars} -> opt:%0.3f (%0.3f) | pair: %0.3f (%0.3f)" % [time_optimized, prediction_optimized(vars,cases), time_pairwise, prediction_pairwise(vars,cases)]
|
49
47
|
|
50
|
-
rs.
|
48
|
+
rs.add_row(Daru::Vector.new({
|
49
|
+
:cases => cases,
|
50
|
+
:vars => vars,
|
51
|
+
:time_optimized => Math.sqrt(time_optimized*1000),
|
52
|
+
:time_pairwise =>Math.sqrt(time_pairwise*1000)
|
53
|
+
})
|
54
|
+
)
|
51
55
|
end
|
52
|
-
end
|
53
|
-
|
56
|
+
end
|
54
57
|
else
|
55
58
|
rs=Statsample.load("correlation_matrix.ds")
|
56
59
|
end
|
57
60
|
|
61
|
+
rs[:c_v] = rs.collect {|row| row[:cases]*row[:vars]}
|
58
62
|
|
59
|
-
rs.
|
60
|
-
|
61
|
-
rs['c_v']=rs.collect {|row| row['cases']*row['vars']}
|
62
|
-
|
63
|
-
rs.update_valid_data
|
63
|
+
rs.update
|
64
64
|
rs.save("correlation_matrix.ds")
|
65
65
|
Statsample::Excel.write(rs,"correlation_matrix.xls")
|
66
66
|
|
67
|
+
rb = ReportBuilder.new(:name=>"Correlation matrix analysis")
|
67
68
|
|
68
|
-
|
69
|
-
rb
|
70
|
-
|
71
|
-
rb.add(Statsample::Regression.multiple(rs[['cases','vars','time_optimized','c_v']],'time_optimized', :digits=>6))
|
72
|
-
rb.add(Statsample::Regression.multiple(rs[['cases','vars','time_pairwise','c_v']],'time_pairwise', :digits=>6))
|
73
|
-
|
69
|
+
rb.add(Statsample::Regression.multiple(rs[:cases,:vars,:time_optimized,:c_v],:time_optimized, :digits=>6))
|
70
|
+
rb.add(Statsample::Regression.multiple(rs[:cases,:vars,:time_pairwise,:c_v],:time_pairwise, :digits=>6))
|
74
71
|
|
75
72
|
rb.save_html("correlation_matrix.html")
|
data/examples/boxplot.rb
CHANGED
@@ -1,14 +1,26 @@
|
|
1
1
|
#!/usr/bin/ruby
|
2
|
+
# == Description
|
3
|
+
#
|
4
|
+
# This example illustrates how daru, combined with Statsample::Graph::Boxplot
|
5
|
+
# can be used for generating box plots of a normally distributed set of data.
|
6
|
+
#
|
7
|
+
# The 'rnorm' function, defined in statsample/shorthands generates a Daru::Vector
|
8
|
+
# object which contains the specified number of random variables in a normal distribution.
|
9
|
+
# It uses the 'distribution' gem for this purpose.
|
10
|
+
#
|
11
|
+
# Create a boxplot of the data by specifying the vectors a, b and c and providing
|
12
|
+
# necessary options to Statsample::Graph::Boxplot. The 'boxplot' function is shorthand
|
13
|
+
# for calling Statsample::Graph::Boxplot.
|
2
14
|
$:.unshift(File.dirname(__FILE__)+'/../lib/')
|
3
15
|
require 'statsample'
|
4
16
|
Statsample::Analysis.store(Statsample::Graph::Boxplot) do
|
5
|
-
n=30
|
6
|
-
a=rnorm(n-1,50,10)
|
7
|
-
b=rnorm(n, 30,5)
|
8
|
-
c=rnorm(n,5,1)
|
17
|
+
n = 30
|
18
|
+
a = rnorm(n-1,50,10)
|
19
|
+
b = rnorm(n, 30,5)
|
20
|
+
c = rnorm(n,5,1)
|
9
21
|
a.push(2)
|
22
|
+
|
10
23
|
boxplot(:vectors=>[a,b,c],:width=>300, :height=>300, :groups=>%w{first first second}, :minimum=>0)
|
11
|
-
|
12
24
|
end
|
13
25
|
|
14
26
|
if __FILE__==$0
|
@@ -1,16 +1,45 @@
|
|
1
1
|
#!/usr/bin/ruby
|
2
|
+
|
3
|
+
# == Description
|
4
|
+
#
|
5
|
+
# Creating and summarizing a correlation matrix with daru and statsample
|
2
6
|
$:.unshift(File.dirname(__FILE__)+'/../lib/')
|
3
|
-
require 'statsample'
|
4
7
|
|
8
|
+
require 'statsample'
|
5
9
|
Statsample::Analysis.store("Statsample::Bivariate.correlation_matrix") do
|
10
|
+
# It so happens that Daru::Vector and Daru::DataFrame must update metadata
|
11
|
+
# like positions of missing values every time they are created.
|
12
|
+
#
|
13
|
+
# Since we don't have any missing values in the data that we are creating,
|
14
|
+
# we set Daru.lazy_update = true so that missing data is not updated every
|
15
|
+
# time and things happen much faster.
|
16
|
+
#
|
17
|
+
# In case you do have missing data and lazy_update has been set to *true*,
|
18
|
+
# you _SHOULD_ call `#update` on the concerned Vector or DataFrame object
|
19
|
+
# every time an assignment or deletion cycle is complete.
|
20
|
+
Daru.lazy_update = true
|
21
|
+
|
22
|
+
# Create a Daru::DataFrame containing 4 vectors a, b, c and d.
|
23
|
+
#
|
24
|
+
# Notice that the `clone` option has been set to *false*. This tells Daru
|
25
|
+
# to not clone the Daru::Vectors being supplied by `rnorm`, since it would
|
26
|
+
# be unnecessarily counter productive to clone the vectors once they have
|
27
|
+
# been assigned to the dataframe.
|
6
28
|
samples=1000
|
7
|
-
ds=
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
29
|
+
ds = Daru::DataFrame.new({
|
30
|
+
:a => rnorm(samples),
|
31
|
+
:b => rnorm(samples),
|
32
|
+
:c => rnorm(samples),
|
33
|
+
:d => rnorm(samples)
|
34
|
+
}, clone: false)
|
35
|
+
|
36
|
+
# Calculate correlation matrix by calling the `cor` shorthand.
|
37
|
+
cm = cor(ds)
|
13
38
|
summary(cm)
|
39
|
+
|
40
|
+
# Set lazy_update to *false* once our job is done so that this analysis does
|
41
|
+
# not accidentally affect code elsewhere.
|
42
|
+
Daru.lazy_update = false
|
14
43
|
end
|
15
44
|
|
16
45
|
if __FILE__==$0
|