statsample 1.5.0 → 2.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (114) hide show
  1. checksums.yaml +4 -4
  2. data/.build.sh +15 -0
  3. data/.gitignore +1 -0
  4. data/.travis.yml +19 -7
  5. data/CONTRIBUTING.md +33 -0
  6. data/History.txt +5 -0
  7. data/README.md +41 -53
  8. data/benchmarks/correlation_matrix_15_variables.rb +6 -5
  9. data/benchmarks/correlation_matrix_5_variables.rb +6 -5
  10. data/benchmarks/correlation_matrix_methods/correlation_matrix.rb +23 -26
  11. data/examples/boxplot.rb +17 -5
  12. data/examples/correlation_matrix.rb +36 -7
  13. data/examples/dataset.rb +25 -5
  14. data/examples/dominance_analysis.rb +8 -7
  15. data/examples/dominance_analysis_bootstrap.rb +16 -11
  16. data/examples/histogram.rb +16 -2
  17. data/examples/icc.rb +5 -6
  18. data/examples/levene.rb +17 -3
  19. data/examples/multiple_regression.rb +6 -3
  20. data/examples/parallel_analysis.rb +11 -6
  21. data/examples/polychoric.rb +26 -13
  22. data/examples/principal_axis.rb +8 -4
  23. data/examples/reliability.rb +10 -10
  24. data/examples/scatterplot.rb +8 -0
  25. data/examples/t_test.rb +7 -0
  26. data/examples/u_test.rb +10 -2
  27. data/examples/vector.rb +9 -6
  28. data/examples/velicer_map_test.rb +12 -8
  29. data/lib/statsample.rb +13 -47
  30. data/lib/statsample/analysis/suite.rb +1 -1
  31. data/lib/statsample/anova/oneway.rb +6 -6
  32. data/lib/statsample/anova/twoway.rb +26 -24
  33. data/lib/statsample/bivariate.rb +78 -61
  34. data/lib/statsample/bivariate/pearson.rb +2 -2
  35. data/lib/statsample/codification.rb +45 -32
  36. data/lib/statsample/converter/csv.rb +15 -53
  37. data/lib/statsample/converter/spss.rb +6 -5
  38. data/lib/statsample/converters.rb +50 -211
  39. data/lib/statsample/crosstab.rb +26 -25
  40. data/lib/statsample/daru.rb +117 -0
  41. data/lib/statsample/dataset.rb +70 -942
  42. data/lib/statsample/dominanceanalysis.rb +16 -17
  43. data/lib/statsample/dominanceanalysis/bootstrap.rb +26 -28
  44. data/lib/statsample/factor/parallelanalysis.rb +17 -19
  45. data/lib/statsample/factor/pca.rb +21 -20
  46. data/lib/statsample/factor/principalaxis.rb +3 -3
  47. data/lib/statsample/graph/boxplot.rb +8 -16
  48. data/lib/statsample/graph/histogram.rb +4 -4
  49. data/lib/statsample/graph/scatterplot.rb +8 -7
  50. data/lib/statsample/histogram.rb +128 -119
  51. data/lib/statsample/matrix.rb +20 -16
  52. data/lib/statsample/multiset.rb +39 -38
  53. data/lib/statsample/regression.rb +3 -3
  54. data/lib/statsample/regression/multiple.rb +8 -10
  55. data/lib/statsample/regression/multiple/alglibengine.rb +96 -89
  56. data/lib/statsample/regression/multiple/baseengine.rb +32 -32
  57. data/lib/statsample/regression/multiple/gslengine.rb +33 -36
  58. data/lib/statsample/regression/multiple/matrixengine.rb +7 -9
  59. data/lib/statsample/regression/multiple/rubyengine.rb +39 -41
  60. data/lib/statsample/reliability.rb +23 -25
  61. data/lib/statsample/reliability/icc.rb +8 -7
  62. data/lib/statsample/reliability/multiscaleanalysis.rb +14 -12
  63. data/lib/statsample/reliability/scaleanalysis.rb +58 -60
  64. data/lib/statsample/reliability/skillscaleanalysis.rb +34 -29
  65. data/lib/statsample/resample.rb +1 -1
  66. data/lib/statsample/shorthand.rb +29 -25
  67. data/lib/statsample/test/kolmogorovsmirnov.rb +5 -3
  68. data/lib/statsample/test/levene.rb +28 -27
  69. data/lib/statsample/test/t.rb +7 -9
  70. data/lib/statsample/test/umannwhitney.rb +28 -28
  71. data/lib/statsample/test/wilcoxonsignedrank.rb +45 -43
  72. data/lib/statsample/vector.rb +70 -1013
  73. data/lib/statsample/version.rb +1 -1
  74. data/statsample.gemspec +12 -16
  75. data/test/helpers_tests.rb +1 -1
  76. data/test/test_analysis.rb +17 -17
  77. data/test/test_anova_contrast.rb +6 -6
  78. data/test/test_anovatwowaywithdataset.rb +8 -8
  79. data/test/test_anovawithvectors.rb +8 -8
  80. data/test/test_awesome_print_bug.rb +1 -1
  81. data/test/test_bartlettsphericity.rb +4 -4
  82. data/test/test_bivariate.rb +48 -43
  83. data/test/test_codification.rb +33 -33
  84. data/test/test_crosstab.rb +9 -9
  85. data/test/test_dataset.rb +28 -458
  86. data/test/test_factor.rb +46 -38
  87. data/test/test_factor_pa.rb +22 -13
  88. data/test/test_ggobi.rb +4 -4
  89. data/test/test_gsl.rb +4 -4
  90. data/test/test_histogram.rb +3 -3
  91. data/test/test_matrix.rb +13 -13
  92. data/test/test_multiset.rb +103 -91
  93. data/test/test_regression.rb +57 -52
  94. data/test/test_reliability.rb +55 -45
  95. data/test/test_reliability_icc.rb +8 -8
  96. data/test/test_reliability_skillscale.rb +26 -24
  97. data/test/test_resample.rb +1 -1
  98. data/test/test_statistics.rb +3 -13
  99. data/test/test_stest.rb +9 -9
  100. data/test/test_stratified.rb +3 -3
  101. data/test/test_test_t.rb +12 -12
  102. data/test/test_umannwhitney.rb +2 -2
  103. data/test/test_vector.rb +76 -613
  104. data/test/test_wilcoxonsignedrank.rb +4 -4
  105. metadata +57 -28
  106. data/lib/statsample/rserve_extension.rb +0 -20
  107. data/lib/statsample/vector/gsl.rb +0 -106
  108. data/test/fixtures/repeated_fields.csv +0 -7
  109. data/test/fixtures/scientific_notation.csv +0 -4
  110. data/test/fixtures/test_csv.csv +0 -7
  111. data/test/fixtures/test_xls.xls +0 -0
  112. data/test/test_csv.rb +0 -63
  113. data/test/test_rserve_extension.rb +0 -42
  114. data/test/test_xls.rb +0 -52
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 2bdb8a5e29a75f62af49b094ac026bbc01597707
4
- data.tar.gz: 9459ac7bf01e6d20e81a23286dab63f63bdbcea3
3
+ metadata.gz: 244a3b640d29832affc12304a77aa447f4f2df0d
4
+ data.tar.gz: 08261176b4763367e65036c486e699491e209e7d
5
5
  SHA512:
6
- metadata.gz: de03429dbc2c7a1bf0e0acfd0afcebcb55f53630688d635bdfa549eca988bfde40f601e056ae10946f064e86e769bc3be0b50888e1a3209ab243d0253e5b9ed6
7
- data.tar.gz: dd71de3f239d8202f3716e4e268f5199cce24e6436e1b017d82970a3bf27da1749bc00a0ab35d030fbd1f63b717aab49c513bb51f8728807a565206cf81d30f1
6
+ metadata.gz: 125beac1e267c030c6d4b70b84543a7888f3b8fcb43e6475913b00f7332dd5ce01bb6330813c3c99f5d649add677a339dc8ef366b466f2710aeaaf4bf20c0cdf
7
+ data.tar.gz: 402c505e62785e2f1eacb28b8798fed26193ed54171f7dce2a106584800ffd3de657013559c55df9e20fe5827fe6309c53f526213fc9ec23bfa2a95bf086d38e
@@ -0,0 +1,15 @@
1
+ #!/bin/bash
2
+
3
+ git clone https://github.com/SciRuby/nmatrix.git
4
+ cd nmatrix
5
+ gem build nmatrix.gemspec
6
+ gem install nmatrix-0.1.0.gem
7
+ cd ..
8
+ rm -rf nmatrix
9
+ git clone https://github.com/v0dro/gsl-nmatrix
10
+ cd gsl-nmatrix
11
+ gem build gsl-nmatrix.gemspec
12
+ gem install gsl-nmatrix-1.17.gem
13
+ cd ..
14
+ rm -rf gsl-nmatrix
15
+
data/.gitignore CHANGED
@@ -12,3 +12,4 @@ examples/images/*
12
12
  examples/*.html
13
13
  web/upload_task.rb
14
14
  .idea
15
+ *.gem
@@ -1,16 +1,28 @@
1
1
  language:
2
2
  ruby
3
3
 
4
+ env:
5
+ - CPLUS_INCLUDE_PATH=/usr/include/atlas C_INCLUDE_PATH=/usr/include/atlas
6
+
4
7
  rvm:
5
- - 1.9.3
6
- - 2.0
7
- - 2.1
8
- - 2.2
8
+ - '1.9.3'
9
+ - '2.0'
10
+ - '2.1'
11
+ - '2.2'
12
+
13
+ matrix:
14
+ fast_finish:
15
+ true
16
+
17
+ script: "bundle exec rake test"
18
+
19
+ install:
20
+ - gem install bundler
21
+ - ./.build.sh
22
+ - bundle install
9
23
 
10
- script:
11
- bundle exec rake test
12
-
13
24
  before_install:
14
25
  - sudo apt-get update -qq
26
+ - sudo apt-get install -qq libatlas-base-dev
15
27
  - sudo apt-get install -y libgsl0-dev r-base r-base-dev
16
28
  - sudo Rscript -e "install.packages(c('Rserve','irr'),,'http://cran.us.r-project.org')"
@@ -0,0 +1,33 @@
1
+ # Contributing guide
2
+
3
+ ## Installing statsample development dependencies
4
+
5
+ If you want to run the full test suite, you will need the latest unreleased nmatrix and gsl-nmatrix ruby gems. They will be released upstream soon but please follow this procedure for now.
6
+
7
+ Keep in mind that NEITHER nmatrix NOR gsl-nmatrix is necessary for using statsample. They are only required for an optional speed-up.
8
+
9
+ Statsample also works with [rb-gsl](https://github.com/blackwinter/rb-gsl), though installing that will cause a problem if you have any nmatrix dependent code because narray and nmatrix have a namespace problem.
10
+
11
+ To install dependencies, execute the following commands:
12
+
13
+ `export CPLUS_INCLUDE_PATH=/usr/include/atlas`
14
+ `export C_INCLUDE_PATH=/usr/include/atlas`
15
+ `sudo apt-get update -qq`
16
+ `sudo apt-get install -qq libatlas-base-dev`
17
+ `sudo apt-get --purge remove liblapack-dev liblapack3 liblapack3gf`
18
+ `sudo apt-get install -y libgsl0-dev r-base r-base-dev`
19
+ `sudo Rscript -e "install.packages(c('Rserve','irr'),,'http://cran.us.r-project.org')"`
20
+
21
+ Then execute the .build.sh script to clone and install the latest nmatrix and gsl-nmatrix on your system:
22
+
23
+ `./.build.sh`
24
+
25
+ Then finally install remaining dependencies:
26
+
27
+ `bundle install`
28
+
29
+ And run the test suite (should be all green):
30
+
31
+ `bundle exec rake test`
32
+
33
+ If you have problems installing nmatrix, please consult the [nmatrix installation wiki](https://github.com/SciRuby/nmatrix/wiki/Installation) or the [mailing list](https://groups.google.com/forum/#!forum/sciruby-dev).
@@ -1,3 +1,8 @@
1
+ === 2.0.0 / 2015-06-20
2
+ * Added dependency on daru and replaced Statsample::Vector and Dataset with
3
+ Daru::Vector and Daru::DataFrame.
4
+ * NMatrix and gsl-nmatrix are used as development dependencies.
5
+
1
6
  === 1.5.0 / 2015-06-11
2
7
  * Made sure all methods work properly with and without GSL.
3
8
  * Statsample works with either rb-gsl or gsl-nmatrix.
data/README.md CHANGED
@@ -32,18 +32,56 @@ If you need to work on Structural Equation Modeling, you could see +statsample-s
32
32
  ```bash
33
33
  $ [sudo] gem install statsample-sem
34
34
  ```
35
+ # Testing
36
+
37
+ See CONTRIBUTING for information on testing and contributing to statsample.
35
38
 
36
39
  # Documentation
37
40
 
38
41
  You can see the latest documentation in [rubydoc.info](http://www.rubydoc.info/github/sciruby/statsample/master).
39
42
 
43
+ # Usage
44
+
45
+ ## Notebooks
46
+
47
+ You can see some iruby notebooks here:
48
+
49
+ ### Statistics
50
+
51
+ * [Correlation Matrix with daru and statsample](http://nbviewer.ipython.org/github/SciRuby/sciruby-notebooks/blob/master/Statistics/Correlation%20Matrix%20with%20daru%20and%20statsample.ipynb)
52
+ * [Dominance Analysis with statsample](http://nbviewer.ipython.org/github/SciRuby/sciruby-notebooks/blob/master/Statistics/Dominance%20Analysis%20with%20statsample.ipynb)
53
+ * [Reliability ICC](http://nbviewer.ipython.org/github/v0dro/sciruby-notebooks/blob/master/Statistics/Reliability%20ICC%20with%20statsample.ipynb)
54
+ * [Levene Test](http://nbviewer.ipython.org/github/v0dro/sciruby-notebooks/blob/master/Statistics/Levene%20Test.ipynb)
55
+ * [Multiple Regression](http://nbviewer.ipython.org/github/v0dro/sciruby-notebooks/blob/master/Statistics/Multiple%20Regression.ipynb)
56
+ * [Parallel Analysis on PCA](http://nbviewer.ipython.org/github/SciRuby/sciruby-notebooks/blob/master/Statistics/Parallel%20Analysis%20on%20PCA.ipynb)
57
+ * [Polychoric Analysis](http://nbviewer.ipython.org/github/SciRuby/sciruby-notebooks/blob/master/Statistics/Polychoric%20Correlation.ipynb)
58
+ * [Reliability Scale and Multiscale Analysis](https://github.com/SciRuby/sciruby-notebooks/blob/master/Statistics/Reliability%20Scale%20Analysis.ipynb)
59
+ * [Velicer MAP Test](http://nbviewer.ipython.org/github/SciRuby/sciruby-notebooks/blob/master/Statistics/Velicer%20MAP%20test.ipynb)
60
+
61
+ ### Visualizations
62
+
63
+ * [Creating Boxplots with daru and statsample](http://nbviewer.ipython.org/github/SciRuby/sciruby-notebooks/blob/master/Visualization/Boxplot%20with%20daru%20and%20statsample.ipynb)
64
+ * [Creating A Histogram](http://nbviewer.ipython.org/github/SciRuby/sciruby-notebooks/blob/master/Visualization/Creating%20a%20Histogram.ipynb)
65
+ * [Creating a Scatterplot](http://nbviewer.ipython.org/github/SciRuby/sciruby-notebooks/blob/master/Visualization/Scatterplot%20with%20statsample.ipynb)
66
+
67
+ ### Working with DataFrame and Vector
68
+
69
+ * [Creating Vectors and DataFrames with daru](http://nbviewer.ipython.org/github/SciRuby/sciruby-notebooks/blob/master/Data%20Analysis/Creation%20of%20Vector%20and%20DataFrame.ipynb)
70
+ * [Detailed Usage of Daru::Vector](http://nbviewer.ipython.org/github/SciRuby/sciruby-notebooks/blob/master/Data%20Analysis/Usage%20of%20Vector.ipynb)
71
+ * [Detailed Usage of Daru::DataFrame](http://nbviewer.ipython.org/github/SciRuby/sciruby-notebooks/blob/master/Data%20Analysis/Usage%20of%20DataFrame.ipynb)
72
+ * [Visualizing Data with Daru::DataFrame](http://nbviewer.ipython.org/github/SciRuby/sciruby-notebooks/blob/master/Visualization/Visualizing%20data%20with%20daru%20DataFrame.ipynb)
73
+
74
+ ## Examples
75
+
76
+ See the /examples directory for some use cases. The notebooks listed above have mostly
77
+ the same examples, and they look better so you might want to see those first.
78
+
40
79
  # Description
41
80
 
42
81
  A suite for basic and advanced statistics on Ruby. Tested on CRuby 1.9.3, 2.0.0 and 2.1.1. See `.travis.yml` for more information.
43
82
 
44
83
  Include:
45
84
  - Descriptive statistics: frequencies, median, mean, standard error, skew, kurtosis (and many others).
46
- - Imports and exports datasets from and to Excel, CSV and plain text files.
47
85
  - Correlations: Pearson's r, Spearman's rank correlation (rho), point biserial, tau a, tau b and gamma. Tetrachoric and Polychoric correlations are provided by the +statsample-bivariate-extension+ gem.
48
86
  - Intra-class correlation
49
87
  - Anova: generic and vector-based One-way ANOVA and Two-way ANOVA, with contrasts for One-way ANOVA.
@@ -75,8 +113,7 @@ Include:
75
113
  # Features
76
114
 
77
115
  - Classes for manipulation and storage of data:
78
- - Statsample::Vector: An extension of an array, with statistical methods like sum, mean and standard deviation
79
- - Statsample::Dataset: a group of Statsample::Vector, analog to a excel spreadsheet or a dataframe on R. The base of almost all operations on statsample.
116
+ - Uses [daru](https://github.com/v0dro/daru) for storing data and basic statistics.
80
117
  - Statsample::Multiset: multiple datasets with same fields and type of vectors
81
118
  - Anova module provides generic Statsample::Anova::OneWay and vector based Statsample::Anova::OneWayWithVectors. Also you can create contrast using Statsample::Anova::Contrast
82
119
  - Module Statsample::Bivariate provides covariance and pearson, spearman, point biserial, tau a, tau b, gamma, tetrachoric (see Bivariate::Tetrachoric) and polychoric (see Bivariate::Polychoric) correlations. Include methods to create correlation and covariance matrices
@@ -100,10 +137,7 @@ Include:
100
137
  - Statsample::DominanceAnalysis class can report dominance analysis for a sample, using uni or multivariate dependent variables
101
138
  - Statsample::DominanceAnalysis::Bootstrap can execute bootstrap analysis to determine dominance stability, as recommended by Azen & Budescu (2003) link[http://psycnet.apa.org/journals/met/8/2/129/].
102
139
  - Module Statsample::Codification, to help to codify open questions
103
- - Converters to import and export data:
104
- - Statsample::Database : Can create sql to create tables, read and insert data
105
- - Statsample::CSV : Read and write CSV files
106
- - Statsample::Excel : Read and write Excel files
140
+ - Converters to export data:
107
141
  - Statsample::Mx : Write Mx Files
108
142
  - Statsample::GGobi : Write Ggobi files
109
143
  - Module Statsample::Crosstab provides function to create crosstab for categorical data
@@ -130,52 +164,6 @@ Include:
130
164
  - Gem <tt>statsample-glm</tt> provides you with GML method, to work with Logistic, Poisson and Gaussian regression ,using ML or IRWLS.
131
165
  - Close integration with gem <tt>reportbuilder</tt>, to easily create reports on text, html and rtf formats.
132
166
 
133
- # Usage
134
-
135
- See the [examples folder](https://github.com/clbustos/statsample/tree/master/examples/) too.
136
-
137
- ## Boxplot
138
-
139
- ```ruby
140
- require 'statsample'
141
-
142
- ss_analysis(Statsample::Graph::Boxplot) do
143
- n = 30
144
- a = rnorm(n-1, 50, 10)
145
- b = rnorm(n, 30, 5)
146
- c = rnorm(n, 5, 1)
147
- a.push(2)
148
- boxplot(vectors: [a, b, c],
149
- width: 300,
150
- height: 300,
151
- groups: %w{first first second},
152
- minimum: 0)
153
- end
154
-
155
- Statsample::Analysis.run # Open svg file on *nix application defined
156
- ```
157
-
158
- ## Correlation matrix
159
-
160
- ```ruby
161
- require 'statsample'
162
- # Note R like generation of random gaussian variable
163
- # and correlation matrix
164
-
165
- ss_analysis("Statsample::Bivariate.correlation_matrix") do
166
- samples = 1000
167
- ds = data_frame(
168
- 'a' => rnorm(samples),
169
- 'b' => rnorm(samples),
170
- 'c' => rnorm(samples),
171
- 'd' => rnorm(samples))
172
- cm = cor(ds)
173
- summary(cm)
174
- end
175
-
176
- Statsample::Analysis.run_batch # Echo output to console
177
- ```
178
-
179
167
  # Resources
180
168
 
181
169
  - Source code on github :: http://github.com/sciruby/statsample
@@ -4,7 +4,6 @@ extend BenchPress
4
4
  cases=250
5
5
  vars=20
6
6
 
7
-
8
7
  name "gsl matrix based vs. manual ruby correlation matrix (#{vars} vars, #{cases} cases)"
9
8
  author 'Clbustos'
10
9
  date '2011-01-18'
@@ -17,10 +16,12 @@ In this test, we test the calculation using #{vars} variables with
17
16
 
18
17
  reps 200 #number of repetitions
19
18
 
20
- ds=vars.times.inject({}) {|ac,v|
21
- ac["x#{v}"]=Statsample::Vector.new_numeric(cases) {rand()}
22
- ac
23
- }.to_dataset
19
+ ds = Daru::DataFrame.new(
20
+ vars.times.inject({}) do |ac,v|
21
+ ac["x#{v}".to_sym]=Daru::Vector.new_with_size(cases) {rand()}
22
+ ac
23
+ end
24
+ )
24
25
 
25
26
  measure "Statsample::Bivariate.correlation_matrix_optimized" do
26
27
  Statsample::Bivariate.correlation_matrix_optimized(ds)
@@ -17,11 +17,12 @@ In this test, we test the calculation using #{vars} variables with
17
17
 
18
18
  reps 200 #number of repetitions
19
19
 
20
-
21
- ds=vars.times.inject({}) {|ac,v|
22
- ac["x#{v}"]=Statsample::Vector.new_numeric(cases) {rand()}
23
- ac
24
- }.to_dataset
20
+ ds = Daru::DataFrame.new(
21
+ vars.times.inject({}) do |ac,v|
22
+ ac["x#{v}".to_sym]=Daru::Vector.new_with_size(cases) {rand()}
23
+ ac
24
+ end
25
+ )
25
26
 
26
27
  measure "Statsample::Bivariate.correlation_matrix_optimized" do
27
28
  Statsample::Bivariate.correlation_matrix_optimized(ds)
@@ -5,11 +5,13 @@ require 'statsample'
5
5
  require 'benchmark'
6
6
 
7
7
  def create_dataset(vars,cases)
8
- ran=Distribution::Normal.rng
9
- ds=vars.times.inject({}) {|ac,v|
10
- ac["x#{v}"]=Statsample::Vector.new_numeric(cases) {ran.call}
11
- ac
12
- }.to_dataset
8
+ ran = Distribution::Normal.rng
9
+ ds = Daru::DataFrame.new(
10
+ vars.times.inject({}) do |ac,v|
11
+ ac["x#{v}".to_sym] = Daru::Vector.new_with_size(cases) {ran.call}
12
+ ac
13
+ end
14
+ )
13
15
  end
14
16
 
15
17
  def prediction_pairwise(vars,cases)
@@ -19,19 +21,17 @@ def prediction_optimized(vars,cases)
19
21
  Statsample::Bivariate.prediction_optimized(vars,cases) / 10
20
22
  end
21
23
 
22
-
23
-
24
24
  if !File.exists?("correlation_matrix.ds") or File.mtime(__FILE__) > File.mtime("correlation_matrix.ds")
25
25
  reps=100 #number of repetitions
26
26
  ds_sizes=[5,10,30,50,100,150,200,500,1000]
27
27
  ds_vars=[3,4,5,10,20,30,40]
28
28
  #ds_sizes=[5,10]
29
29
  #ds_vars=[3,5,20]
30
- rs=Statsample::Dataset.new(%w{cases vars time_optimized time_pairwise})
30
+ rs = Daru::DataFrame.new({}, order: [:cases, :vars, :time_optimized, :time_pairwise])
31
31
 
32
32
  ds_sizes.each do |cases|
33
33
  ds_vars.each do |vars|
34
- ds=create_dataset(vars,cases)
34
+ ds = create_dataset(vars,cases)
35
35
  time_optimized= Benchmark.realtime do
36
36
  reps.times {
37
37
  Statsample::Bivariate.correlation_matrix_optimized(ds)
@@ -40,36 +40,33 @@ ds_sizes.each do |cases|
40
40
  end
41
41
 
42
42
  time_pairwise= Benchmark.realtime do
43
- reps.times {
44
- Statsample::Bivariate.correlation_matrix_pairwise(ds)
45
- }
43
+ reps.times { Statsample::Bivariate.correlation_matrix_pairwise(ds) }
46
44
  end
47
45
 
48
46
  puts "Cases:#{cases}, vars:#{vars} -> opt:%0.3f (%0.3f) | pair: %0.3f (%0.3f)" % [time_optimized, prediction_optimized(vars,cases), time_pairwise, prediction_pairwise(vars,cases)]
49
47
 
50
- rs.add_case({'cases'=>cases,'vars'=>vars,'time_optimized'=>Math.sqrt(time_optimized*1000),'time_pairwise'=>Math.sqrt(time_pairwise*1000)})
48
+ rs.add_row(Daru::Vector.new({
49
+ :cases => cases,
50
+ :vars => vars,
51
+ :time_optimized => Math.sqrt(time_optimized*1000),
52
+ :time_pairwise =>Math.sqrt(time_pairwise*1000)
53
+ })
54
+ )
51
55
  end
52
- end
53
-
56
+ end
54
57
  else
55
58
  rs=Statsample.load("correlation_matrix.ds")
56
59
  end
57
60
 
61
+ rs[:c_v] = rs.collect {|row| row[:cases]*row[:vars]}
58
62
 
59
- rs.fields.each {|f| rs[f].type=:numeric}
60
-
61
- rs['c_v']=rs.collect {|row| row['cases']*row['vars']}
62
-
63
- rs.update_valid_data
63
+ rs.update
64
64
  rs.save("correlation_matrix.ds")
65
65
  Statsample::Excel.write(rs,"correlation_matrix.xls")
66
66
 
67
+ rb = ReportBuilder.new(:name=>"Correlation matrix analysis")
67
68
 
68
-
69
- rb=ReportBuilder.new(:name=>"Correlation matrix analysis")
70
-
71
- rb.add(Statsample::Regression.multiple(rs[['cases','vars','time_optimized','c_v']],'time_optimized', :digits=>6))
72
- rb.add(Statsample::Regression.multiple(rs[['cases','vars','time_pairwise','c_v']],'time_pairwise', :digits=>6))
73
-
69
+ rb.add(Statsample::Regression.multiple(rs[:cases,:vars,:time_optimized,:c_v],:time_optimized, :digits=>6))
70
+ rb.add(Statsample::Regression.multiple(rs[:cases,:vars,:time_pairwise,:c_v],:time_pairwise, :digits=>6))
74
71
 
75
72
  rb.save_html("correlation_matrix.html")
@@ -1,14 +1,26 @@
1
1
  #!/usr/bin/ruby
2
+ # == Description
3
+ #
4
+ # This example illustrates how daru, combined with Statsample::Graph::Boxplot
5
+ # can be used for generating box plots of a normally distributed set of data.
6
+ #
7
+ # The 'rnorm' function, defined in statsample/shorthands generates a Daru::Vector
8
+ # object which contains the specified number of random variables in a normal distribution.
9
+ # It uses the 'distribution' gem for this purpose.
10
+ #
11
+ # Create a boxplot of the data by specifying the vectors a, b and c and providing
12
+ # necessary options to Statsample::Graph::Boxplot. The 'boxplot' function is shorthand
13
+ # for calling Statsample::Graph::Boxplot.
2
14
  $:.unshift(File.dirname(__FILE__)+'/../lib/')
3
15
  require 'statsample'
4
16
  Statsample::Analysis.store(Statsample::Graph::Boxplot) do
5
- n=30
6
- a=rnorm(n-1,50,10)
7
- b=rnorm(n, 30,5)
8
- c=rnorm(n,5,1)
17
+ n = 30
18
+ a = rnorm(n-1,50,10)
19
+ b = rnorm(n, 30,5)
20
+ c = rnorm(n,5,1)
9
21
  a.push(2)
22
+
10
23
  boxplot(:vectors=>[a,b,c],:width=>300, :height=>300, :groups=>%w{first first second}, :minimum=>0)
11
-
12
24
  end
13
25
 
14
26
  if __FILE__==$0
@@ -1,16 +1,45 @@
1
1
  #!/usr/bin/ruby
2
+
3
+ # == Description
4
+ #
5
+ # Creating and summarizing a correlation matrix with daru and statsample
2
6
  $:.unshift(File.dirname(__FILE__)+'/../lib/')
3
- require 'statsample'
4
7
 
8
+ require 'statsample'
5
9
  Statsample::Analysis.store("Statsample::Bivariate.correlation_matrix") do
10
+ # It so happens that Daru::Vector and Daru::DataFrame must update metadata
11
+ # like positions of missing values every time they are created.
12
+ #
13
+ # Since we don't have any missing values in the data that we are creating,
14
+ # we set Daru.lazy_update = true so that missing data is not updated every
15
+ # time and things happen much faster.
16
+ #
17
+ # In case you do have missing data and lazy_update has been set to *true*,
18
+ # you _SHOULD_ call `#update` on the concerned Vector or DataFrame object
19
+ # every time an assignment or deletion cycle is complete.
20
+ Daru.lazy_update = true
21
+
22
+ # Create a Daru::DataFrame containing 4 vectors a, b, c and d.
23
+ #
24
+ # Notice that the `clone` option has been set to *false*. This tells Daru
25
+ # to not clone the Daru::Vectors being supplied by `rnorm`, since it would
26
+ # be unnecessarily counter productive to clone the vectors once they have
27
+ # been assigned to the dataframe.
6
28
  samples=1000
7
- ds=data_frame(
8
- 'a'=>rnorm(samples),
9
- 'b'=>rnorm(samples),
10
- 'c'=>rnorm(samples),
11
- 'd'=>rnorm(samples))
12
- cm=cor(ds)
29
+ ds = Daru::DataFrame.new({
30
+ :a => rnorm(samples),
31
+ :b => rnorm(samples),
32
+ :c => rnorm(samples),
33
+ :d => rnorm(samples)
34
+ }, clone: false)
35
+
36
+ # Calculate correlation matrix by calling the `cor` shorthand.
37
+ cm = cor(ds)
13
38
  summary(cm)
39
+
40
+ # Set lazy_update to *false* once our job is done so that this analysis does
41
+ # not accidentally affect code elsewhere.
42
+ Daru.lazy_update = false
14
43
  end
15
44
 
16
45
  if __FILE__==$0