statsample 1.5.0 → 2.0.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (114) hide show
  1. checksums.yaml +4 -4
  2. data/.build.sh +15 -0
  3. data/.gitignore +1 -0
  4. data/.travis.yml +19 -7
  5. data/CONTRIBUTING.md +33 -0
  6. data/History.txt +5 -0
  7. data/README.md +41 -53
  8. data/benchmarks/correlation_matrix_15_variables.rb +6 -5
  9. data/benchmarks/correlation_matrix_5_variables.rb +6 -5
  10. data/benchmarks/correlation_matrix_methods/correlation_matrix.rb +23 -26
  11. data/examples/boxplot.rb +17 -5
  12. data/examples/correlation_matrix.rb +36 -7
  13. data/examples/dataset.rb +25 -5
  14. data/examples/dominance_analysis.rb +8 -7
  15. data/examples/dominance_analysis_bootstrap.rb +16 -11
  16. data/examples/histogram.rb +16 -2
  17. data/examples/icc.rb +5 -6
  18. data/examples/levene.rb +17 -3
  19. data/examples/multiple_regression.rb +6 -3
  20. data/examples/parallel_analysis.rb +11 -6
  21. data/examples/polychoric.rb +26 -13
  22. data/examples/principal_axis.rb +8 -4
  23. data/examples/reliability.rb +10 -10
  24. data/examples/scatterplot.rb +8 -0
  25. data/examples/t_test.rb +7 -0
  26. data/examples/u_test.rb +10 -2
  27. data/examples/vector.rb +9 -6
  28. data/examples/velicer_map_test.rb +12 -8
  29. data/lib/statsample.rb +13 -47
  30. data/lib/statsample/analysis/suite.rb +1 -1
  31. data/lib/statsample/anova/oneway.rb +6 -6
  32. data/lib/statsample/anova/twoway.rb +26 -24
  33. data/lib/statsample/bivariate.rb +78 -61
  34. data/lib/statsample/bivariate/pearson.rb +2 -2
  35. data/lib/statsample/codification.rb +45 -32
  36. data/lib/statsample/converter/csv.rb +15 -53
  37. data/lib/statsample/converter/spss.rb +6 -5
  38. data/lib/statsample/converters.rb +50 -211
  39. data/lib/statsample/crosstab.rb +26 -25
  40. data/lib/statsample/daru.rb +117 -0
  41. data/lib/statsample/dataset.rb +70 -942
  42. data/lib/statsample/dominanceanalysis.rb +16 -17
  43. data/lib/statsample/dominanceanalysis/bootstrap.rb +26 -28
  44. data/lib/statsample/factor/parallelanalysis.rb +17 -19
  45. data/lib/statsample/factor/pca.rb +21 -20
  46. data/lib/statsample/factor/principalaxis.rb +3 -3
  47. data/lib/statsample/graph/boxplot.rb +8 -16
  48. data/lib/statsample/graph/histogram.rb +4 -4
  49. data/lib/statsample/graph/scatterplot.rb +8 -7
  50. data/lib/statsample/histogram.rb +128 -119
  51. data/lib/statsample/matrix.rb +20 -16
  52. data/lib/statsample/multiset.rb +39 -38
  53. data/lib/statsample/regression.rb +3 -3
  54. data/lib/statsample/regression/multiple.rb +8 -10
  55. data/lib/statsample/regression/multiple/alglibengine.rb +96 -89
  56. data/lib/statsample/regression/multiple/baseengine.rb +32 -32
  57. data/lib/statsample/regression/multiple/gslengine.rb +33 -36
  58. data/lib/statsample/regression/multiple/matrixengine.rb +7 -9
  59. data/lib/statsample/regression/multiple/rubyengine.rb +39 -41
  60. data/lib/statsample/reliability.rb +23 -25
  61. data/lib/statsample/reliability/icc.rb +8 -7
  62. data/lib/statsample/reliability/multiscaleanalysis.rb +14 -12
  63. data/lib/statsample/reliability/scaleanalysis.rb +58 -60
  64. data/lib/statsample/reliability/skillscaleanalysis.rb +34 -29
  65. data/lib/statsample/resample.rb +1 -1
  66. data/lib/statsample/shorthand.rb +29 -25
  67. data/lib/statsample/test/kolmogorovsmirnov.rb +5 -3
  68. data/lib/statsample/test/levene.rb +28 -27
  69. data/lib/statsample/test/t.rb +7 -9
  70. data/lib/statsample/test/umannwhitney.rb +28 -28
  71. data/lib/statsample/test/wilcoxonsignedrank.rb +45 -43
  72. data/lib/statsample/vector.rb +70 -1013
  73. data/lib/statsample/version.rb +1 -1
  74. data/statsample.gemspec +12 -16
  75. data/test/helpers_tests.rb +1 -1
  76. data/test/test_analysis.rb +17 -17
  77. data/test/test_anova_contrast.rb +6 -6
  78. data/test/test_anovatwowaywithdataset.rb +8 -8
  79. data/test/test_anovawithvectors.rb +8 -8
  80. data/test/test_awesome_print_bug.rb +1 -1
  81. data/test/test_bartlettsphericity.rb +4 -4
  82. data/test/test_bivariate.rb +48 -43
  83. data/test/test_codification.rb +33 -33
  84. data/test/test_crosstab.rb +9 -9
  85. data/test/test_dataset.rb +28 -458
  86. data/test/test_factor.rb +46 -38
  87. data/test/test_factor_pa.rb +22 -13
  88. data/test/test_ggobi.rb +4 -4
  89. data/test/test_gsl.rb +4 -4
  90. data/test/test_histogram.rb +3 -3
  91. data/test/test_matrix.rb +13 -13
  92. data/test/test_multiset.rb +103 -91
  93. data/test/test_regression.rb +57 -52
  94. data/test/test_reliability.rb +55 -45
  95. data/test/test_reliability_icc.rb +8 -8
  96. data/test/test_reliability_skillscale.rb +26 -24
  97. data/test/test_resample.rb +1 -1
  98. data/test/test_statistics.rb +3 -13
  99. data/test/test_stest.rb +9 -9
  100. data/test/test_stratified.rb +3 -3
  101. data/test/test_test_t.rb +12 -12
  102. data/test/test_umannwhitney.rb +2 -2
  103. data/test/test_vector.rb +76 -613
  104. data/test/test_wilcoxonsignedrank.rb +4 -4
  105. metadata +57 -28
  106. data/lib/statsample/rserve_extension.rb +0 -20
  107. data/lib/statsample/vector/gsl.rb +0 -106
  108. data/test/fixtures/repeated_fields.csv +0 -7
  109. data/test/fixtures/scientific_notation.csv +0 -4
  110. data/test/fixtures/test_csv.csv +0 -7
  111. data/test/fixtures/test_xls.xls +0 -0
  112. data/test/test_csv.rb +0 -63
  113. data/test/test_rserve_extension.rb +0 -42
  114. data/test/test_xls.rb +0 -52
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 2bdb8a5e29a75f62af49b094ac026bbc01597707
4
- data.tar.gz: 9459ac7bf01e6d20e81a23286dab63f63bdbcea3
3
+ metadata.gz: 244a3b640d29832affc12304a77aa447f4f2df0d
4
+ data.tar.gz: 08261176b4763367e65036c486e699491e209e7d
5
5
  SHA512:
6
- metadata.gz: de03429dbc2c7a1bf0e0acfd0afcebcb55f53630688d635bdfa549eca988bfde40f601e056ae10946f064e86e769bc3be0b50888e1a3209ab243d0253e5b9ed6
7
- data.tar.gz: dd71de3f239d8202f3716e4e268f5199cce24e6436e1b017d82970a3bf27da1749bc00a0ab35d030fbd1f63b717aab49c513bb51f8728807a565206cf81d30f1
6
+ metadata.gz: 125beac1e267c030c6d4b70b84543a7888f3b8fcb43e6475913b00f7332dd5ce01bb6330813c3c99f5d649add677a339dc8ef366b466f2710aeaaf4bf20c0cdf
7
+ data.tar.gz: 402c505e62785e2f1eacb28b8798fed26193ed54171f7dce2a106584800ffd3de657013559c55df9e20fe5827fe6309c53f526213fc9ec23bfa2a95bf086d38e
@@ -0,0 +1,15 @@
1
+ #!/bin/bash
2
+
3
+ git clone https://github.com/SciRuby/nmatrix.git
4
+ cd nmatrix
5
+ gem build nmatrix.gemspec
6
+ gem install nmatrix-0.1.0.gem
7
+ cd ..
8
+ rm -rf nmatrix
9
+ git clone https://github.com/v0dro/gsl-nmatrix
10
+ cd gsl-nmatrix
11
+ gem build gsl-nmatrix.gemspec
12
+ gem install gsl-nmatrix-1.17.gem
13
+ cd ..
14
+ rm -rf gsl-nmatrix
15
+
data/.gitignore CHANGED
@@ -12,3 +12,4 @@ examples/images/*
12
12
  examples/*.html
13
13
  web/upload_task.rb
14
14
  .idea
15
+ *.gem
@@ -1,16 +1,28 @@
1
1
  language:
2
2
  ruby
3
3
 
4
+ env:
5
+ - CPLUS_INCLUDE_PATH=/usr/include/atlas C_INCLUDE_PATH=/usr/include/atlas
6
+
4
7
  rvm:
5
- - 1.9.3
6
- - 2.0
7
- - 2.1
8
- - 2.2
8
+ - '1.9.3'
9
+ - '2.0'
10
+ - '2.1'
11
+ - '2.2'
12
+
13
+ matrix:
14
+ fast_finish:
15
+ true
16
+
17
+ script: "bundle exec rake test"
18
+
19
+ install:
20
+ - gem install bundler
21
+ - ./.build.sh
22
+ - bundle install
9
23
 
10
- script:
11
- bundle exec rake test
12
-
13
24
  before_install:
14
25
  - sudo apt-get update -qq
26
+ - sudo apt-get install -qq libatlas-base-dev
15
27
  - sudo apt-get install -y libgsl0-dev r-base r-base-dev
16
28
  - sudo Rscript -e "install.packages(c('Rserve','irr'),,'http://cran.us.r-project.org')"
@@ -0,0 +1,33 @@
1
+ # Contributing guide
2
+
3
+ ## Installing statsample development dependencies
4
+
5
+ If you want to run the full test suite, you will need the latest unreleased nmatrix and gsl-nmatrix ruby gems. They will be released upstream soon, but please follow this procedure for now.
6
+
7
+ Keep in mind that neither nmatrix NOR gsl-nmatrix is NECESSARY for using statsample. They are only required for an optional speed-up.
8
+
9
+ Statsample also works with [rb-gsl](https://github.com/blackwinter/rb-gsl), though installing that will cause a problem if you have any nmatrix dependent code because narray and nmatrix have a namespace problem.
10
+
11
+ To install dependencies, execute the following commands:
12
+
13
+ `export CPLUS_INCLUDE_PATH=/usr/include/atlas`
14
+ `export C_INCLUDE_PATH=/usr/include/atlas`
15
+ `sudo apt-get update -qq`
16
+ `sudo apt-get install -qq libatlas-base-dev`
17
+ `sudo apt-get --purge remove liblapack-dev liblapack3 liblapack3gf`
18
+ `sudo apt-get install -y libgsl0-dev r-base r-base-dev`
19
+ `sudo Rscript -e "install.packages(c('Rserve','irr'),,'http://cran.us.r-project.org')"`
20
+
21
+ Then execute the .build.sh script to clone and install the latest nmatrix and gsl-nmatrix on your system:
22
+
23
+ `./.build.sh`
24
+
25
+ Then finally install remaining dependencies:
26
+
27
+ `bundle install`
28
+
29
+ And run the test suite (should be all green):
30
+
31
+ `bundle exec rake test`
32
+
33
+ If you have problems installing nmatrix, please consult the [nmatrix installation wiki](https://github.com/SciRuby/nmatrix/wiki/Installation) or the [mailing list](https://groups.google.com/forum/#!forum/sciruby-dev).
@@ -1,3 +1,8 @@
1
+ === 2.0.0 / 2015-06-20
2
+ * Added dependency on daru and replaced Statsample::Vector and Dataset with
3
+ Daru::Vector and Daru::DataFrame.
4
+ * NMatrix and gsl-nmatrix are used as development dependencies.
5
+
1
6
  === 1.5.0 / 2015-06-11
2
7
  * Made sure all methods work properly with and without GSL.
3
8
  * Statsample works with either rb-gsl or gsl-nmatrix.
data/README.md CHANGED
@@ -32,18 +32,56 @@ If you need to work on Structural Equation Modeling, you could see +statsample-s
32
32
  ```bash
33
33
  $ [sudo] gem install statsample-sem
34
34
  ```
35
+ # Testing
36
+
37
+ See CONTRIBUTING for information on testing and contributing to statsample.
35
38
 
36
39
  # Documentation
37
40
 
38
41
  You can see the latest documentation in [rubydoc.info](http://www.rubydoc.info/github/sciruby/statsample/master).
39
42
 
43
+ # Usage
44
+
45
+ ## Notebooks
46
+
47
+ You can see some iruby notebooks here:
48
+
49
+ ### Statistics
50
+
51
+ * [Correlation Matrix with daru and statsample](http://nbviewer.ipython.org/github/SciRuby/sciruby-notebooks/blob/master/Statistics/Correlation%20Matrix%20with%20daru%20and%20statsample.ipynb)
52
+ * [Dominance Analysis with statsample](http://nbviewer.ipython.org/github/SciRuby/sciruby-notebooks/blob/master/Statistics/Dominance%20Analysis%20with%20statsample.ipynb)
53
+ * [Reliability ICC](http://nbviewer.ipython.org/github/v0dro/sciruby-notebooks/blob/master/Statistics/Reliability%20ICC%20with%20statsample.ipynb)
54
+ * [Levene Test](http://nbviewer.ipython.org/github/v0dro/sciruby-notebooks/blob/master/Statistics/Levene%20Test.ipynb)
55
+ * [Multiple Regression](http://nbviewer.ipython.org/github/v0dro/sciruby-notebooks/blob/master/Statistics/Multiple%20Regression.ipynb)
56
+ * [Parallel Analysis on PCA](http://nbviewer.ipython.org/github/SciRuby/sciruby-notebooks/blob/master/Statistics/Parallel%20Analysis%20on%20PCA.ipynb)
57
+ * [Polychoric Analysis](http://nbviewer.ipython.org/github/SciRuby/sciruby-notebooks/blob/master/Statistics/Polychoric%20Correlation.ipynb)
58
+ * [Reliability Scale and Multiscale Analysis](https://github.com/SciRuby/sciruby-notebooks/blob/master/Statistics/Reliability%20Scale%20Analysis.ipynb)
59
+ * [Velicer MAP Test](http://nbviewer.ipython.org/github/SciRuby/sciruby-notebooks/blob/master/Statistics/Velicer%20MAP%20test.ipynb)
60
+
61
+ ### Visualizations
62
+
63
+ * [Creating Boxplots with daru and statsample](http://nbviewer.ipython.org/github/SciRuby/sciruby-notebooks/blob/master/Visualization/Boxplot%20with%20daru%20and%20statsample.ipynb)
64
+ * [Creating A Histogram](http://nbviewer.ipython.org/github/SciRuby/sciruby-notebooks/blob/master/Visualization/Creating%20a%20Histogram.ipynb)
65
+ * [Creating a Scatterplot](http://nbviewer.ipython.org/github/SciRuby/sciruby-notebooks/blob/master/Visualization/Scatterplot%20with%20statsample.ipynb)
66
+
67
+ ### Working with DataFrame and Vector
68
+
69
+ * [Creating Vectors and DataFrames with daru](http://nbviewer.ipython.org/github/SciRuby/sciruby-notebooks/blob/master/Data%20Analysis/Creation%20of%20Vector%20and%20DataFrame.ipynb)
70
+ * [Detailed Usage of Daru::Vector](http://nbviewer.ipython.org/github/SciRuby/sciruby-notebooks/blob/master/Data%20Analysis/Usage%20of%20Vector.ipynb)
71
+ * [Detailed Usage of Daru::DataFrame](http://nbviewer.ipython.org/github/SciRuby/sciruby-notebooks/blob/master/Data%20Analysis/Usage%20of%20DataFrame.ipynb)
72
+ * [Visualizing Data with Daru::DataFrame](http://nbviewer.ipython.org/github/SciRuby/sciruby-notebooks/blob/master/Visualization/Visualizing%20data%20with%20daru%20DataFrame.ipynb)
73
+
74
+ ## Examples
75
+
76
+ See the /examples directory for some use cases. The notebooks listed above have mostly
77
+ the same examples, and they look better so you might want to see those first.
78
+
40
79
  # Description
41
80
 
42
81
  A suite for basic and advanced statistics on Ruby. Tested on CRuby 1.9.3, 2.0.0 and 2.1.1. See `.travis.yml` for more information.
43
82
 
44
83
  Include:
45
84
  - Descriptive statistics: frequencies, median, mean, standard error, skew, kurtosis (and many others).
46
- - Imports and exports datasets from and to Excel, CSV and plain text files.
47
85
  - Correlations: Pearson's r, Spearman's rank correlation (rho), point biserial, tau a, tau b and gamma. Tetrachoric and Polychoric correlation provided by +statsample-bivariate-extension+ gem.
48
86
  - Intra-class correlation
49
87
  - Anova: generic and vector-based One-way ANOVA and Two-way ANOVA, with contrasts for One-way ANOVA.
@@ -75,8 +113,7 @@ Include:
75
113
  # Features
76
114
 
77
115
  - Classes for manipulation and storage of data:
78
- - Statsample::Vector: An extension of an array, with statistical methods like sum, mean and standard deviation
79
- - Statsample::Dataset: a group of Statsample::Vector, analog to a excel spreadsheet or a dataframe on R. The base of almost all operations on statsample.
116
+ - Uses [daru](https://github.com/v0dro/daru) for storing data and basic statistics.
80
117
  - Statsample::Multiset: multiple datasets with same fields and type of vectors
81
118
  - Anova module provides generic Statsample::Anova::OneWay and vector based Statsample::Anova::OneWayWithVectors. Also you can create contrast using Statsample::Anova::Contrast
82
119
  - Module Statsample::Bivariate provides covariance and pearson, spearman, point biserial, tau a, tau b, gamma, tetrachoric (see Bivariate::Tetrachoric) and polychoric (see Bivariate::Polychoric) correlations. Include methods to create correlation and covariance matrices
@@ -100,10 +137,7 @@ Include:
100
137
  - Statsample::DominanceAnalysis class can report dominance analysis for a sample, using uni or multivariate dependent variables
101
138
  - Statsample::DominanceAnalysis::Bootstrap can execute bootstrap analysis to determine dominance stability, as recommended by Azen & Budescu (2003) link[http://psycnet.apa.org/journals/met/8/2/129/].
102
139
  - Module Statsample::Codification, to help to codify open questions
103
- - Converters to import and export data:
104
- - Statsample::Database : Can create sql to create tables, read and insert data
105
- - Statsample::CSV : Read and write CSV files
106
- - Statsample::Excel : Read and write Excel files
140
+ - Converters to export data:
107
141
  - Statsample::Mx : Write Mx Files
108
142
  - Statsample::GGobi : Write Ggobi files
109
143
  - Module Statsample::Crosstab provides function to create crosstab for categorical data
@@ -130,52 +164,6 @@ Include:
130
164
  - Gem <tt>statsample-glm</tt> provides you with GLM methods, to work with Logistic, Poisson and Gaussian regression, using ML or IRWLS.
131
165
  - Close integration with gem <tt>reportbuilder</tt>, to easily create reports on text, html and rtf formats.
132
166
 
133
- # Usage
134
-
135
- See the [examples folder](https://github.com/clbustos/statsample/tree/master/examples/) too.
136
-
137
- ## Boxplot
138
-
139
- ```ruby
140
- require 'statsample'
141
-
142
- ss_analysis(Statsample::Graph::Boxplot) do
143
- n = 30
144
- a = rnorm(n-1, 50, 10)
145
- b = rnorm(n, 30, 5)
146
- c = rnorm(n, 5, 1)
147
- a.push(2)
148
- boxplot(vectors: [a, b, c],
149
- width: 300,
150
- height: 300,
151
- groups: %w{first first second},
152
- minimum: 0)
153
- end
154
-
155
- Statsample::Analysis.run # Open svg file on *nix application defined
156
- ```
157
-
158
- ## Correlation matrix
159
-
160
- ```ruby
161
- require 'statsample'
162
- # Note R like generation of random gaussian variable
163
- # and correlation matrix
164
-
165
- ss_analysis("Statsample::Bivariate.correlation_matrix") do
166
- samples = 1000
167
- ds = data_frame(
168
- 'a' => rnorm(samples),
169
- 'b' => rnorm(samples),
170
- 'c' => rnorm(samples),
171
- 'd' => rnorm(samples))
172
- cm = cor(ds)
173
- summary(cm)
174
- end
175
-
176
- Statsample::Analysis.run_batch # Echo output to console
177
- ```
178
-
179
167
  # Resources
180
168
 
181
169
  - Source code on github :: http://github.com/sciruby/statsample
@@ -4,7 +4,6 @@ extend BenchPress
4
4
  cases=250
5
5
  vars=20
6
6
 
7
-
8
7
  name "gsl matrix based vs. manual ruby correlation matrix (#{vars} vars, #{cases} cases)"
9
8
  author 'Clbustos'
10
9
  date '2011-01-18'
@@ -17,10 +16,12 @@ In this test, we test the calculation using #{vars} variables with
17
16
 
18
17
  reps 200 #number of repetitions
19
18
 
20
- ds=vars.times.inject({}) {|ac,v|
21
- ac["x#{v}"]=Statsample::Vector.new_numeric(cases) {rand()}
22
- ac
23
- }.to_dataset
19
+ ds = Daru::DataFrame.new(
20
+ vars.times.inject({}) do |ac,v|
21
+ ac["x#{v}".to_sym]=Daru::Vector.new_with_size(cases) {rand()}
22
+ ac
23
+ end
24
+ )
24
25
 
25
26
  measure "Statsample::Bivariate.correlation_matrix_optimized" do
26
27
  Statsample::Bivariate.correlation_matrix_optimized(ds)
@@ -17,11 +17,12 @@ In this test, we test the calculation using #{vars} variables with
17
17
 
18
18
  reps 200 #number of repetitions
19
19
 
20
-
21
- ds=vars.times.inject({}) {|ac,v|
22
- ac["x#{v}"]=Statsample::Vector.new_numeric(cases) {rand()}
23
- ac
24
- }.to_dataset
20
+ ds = Daru::DataFrame.new(
21
+ vars.times.inject({}) do |ac,v|
22
+ ac["x#{v}".to_sym]=Daru::Vector.new_with_size(cases) {rand()}
23
+ ac
24
+ end
25
+ )
25
26
 
26
27
  measure "Statsample::Bivariate.correlation_matrix_optimized" do
27
28
  Statsample::Bivariate.correlation_matrix_optimized(ds)
@@ -5,11 +5,13 @@ require 'statsample'
5
5
  require 'benchmark'
6
6
 
7
7
  def create_dataset(vars,cases)
8
- ran=Distribution::Normal.rng
9
- ds=vars.times.inject({}) {|ac,v|
10
- ac["x#{v}"]=Statsample::Vector.new_numeric(cases) {ran.call}
11
- ac
12
- }.to_dataset
8
+ ran = Distribution::Normal.rng
9
+ ds = Daru::DataFrame.new(
10
+ vars.times.inject({}) do |ac,v|
11
+ ac["x#{v}".to_sym] = Daru::Vector.new_with_size(cases) {ran.call}
12
+ ac
13
+ end
14
+ )
13
15
  end
14
16
 
15
17
  def prediction_pairwise(vars,cases)
@@ -19,19 +21,17 @@ def prediction_optimized(vars,cases)
19
21
  Statsample::Bivariate.prediction_optimized(vars,cases) / 10
20
22
  end
21
23
 
22
-
23
-
24
24
  if !File.exists?("correlation_matrix.ds") or File.mtime(__FILE__) > File.mtime("correlation_matrix.ds")
25
25
  reps=100 #number of repetitions
26
26
  ds_sizes=[5,10,30,50,100,150,200,500,1000]
27
27
  ds_vars=[3,4,5,10,20,30,40]
28
28
  #ds_sizes=[5,10]
29
29
  #ds_vars=[3,5,20]
30
- rs=Statsample::Dataset.new(%w{cases vars time_optimized time_pairwise})
30
+ rs = Daru::DataFrame.new({}, order: [:cases, :vars, :time_optimized, :time_pairwise])
31
31
 
32
32
  ds_sizes.each do |cases|
33
33
  ds_vars.each do |vars|
34
- ds=create_dataset(vars,cases)
34
+ ds = create_dataset(vars,cases)
35
35
  time_optimized= Benchmark.realtime do
36
36
  reps.times {
37
37
  Statsample::Bivariate.correlation_matrix_optimized(ds)
@@ -40,36 +40,33 @@ ds_sizes.each do |cases|
40
40
  end
41
41
 
42
42
  time_pairwise= Benchmark.realtime do
43
- reps.times {
44
- Statsample::Bivariate.correlation_matrix_pairwise(ds)
45
- }
43
+ reps.times { Statsample::Bivariate.correlation_matrix_pairwise(ds) }
46
44
  end
47
45
 
48
46
  puts "Cases:#{cases}, vars:#{vars} -> opt:%0.3f (%0.3f) | pair: %0.3f (%0.3f)" % [time_optimized, prediction_optimized(vars,cases), time_pairwise, prediction_pairwise(vars,cases)]
49
47
 
50
- rs.add_case({'cases'=>cases,'vars'=>vars,'time_optimized'=>Math.sqrt(time_optimized*1000),'time_pairwise'=>Math.sqrt(time_pairwise*1000)})
48
+ rs.add_row(Daru::Vector.new({
49
+ :cases => cases,
50
+ :vars => vars,
51
+ :time_optimized => Math.sqrt(time_optimized*1000),
52
+ :time_pairwise =>Math.sqrt(time_pairwise*1000)
53
+ })
54
+ )
51
55
  end
52
- end
53
-
56
+ end
54
57
  else
55
58
  rs=Statsample.load("correlation_matrix.ds")
56
59
  end
57
60
 
61
+ rs[:c_v] = rs.collect {|row| row[:cases]*row[:vars]}
58
62
 
59
- rs.fields.each {|f| rs[f].type=:numeric}
60
-
61
- rs['c_v']=rs.collect {|row| row['cases']*row['vars']}
62
-
63
- rs.update_valid_data
63
+ rs.update
64
64
  rs.save("correlation_matrix.ds")
65
65
  Statsample::Excel.write(rs,"correlation_matrix.xls")
66
66
 
67
+ rb = ReportBuilder.new(:name=>"Correlation matrix analysis")
67
68
 
68
-
69
- rb=ReportBuilder.new(:name=>"Correlation matrix analysis")
70
-
71
- rb.add(Statsample::Regression.multiple(rs[['cases','vars','time_optimized','c_v']],'time_optimized', :digits=>6))
72
- rb.add(Statsample::Regression.multiple(rs[['cases','vars','time_pairwise','c_v']],'time_pairwise', :digits=>6))
73
-
69
+ rb.add(Statsample::Regression.multiple(rs[:cases,:vars,:time_optimized,:c_v],:time_optimized, :digits=>6))
70
+ rb.add(Statsample::Regression.multiple(rs[:cases,:vars,:time_pairwise,:c_v],:time_pairwise, :digits=>6))
74
71
 
75
72
  rb.save_html("correlation_matrix.html")
@@ -1,14 +1,26 @@
1
1
  #!/usr/bin/ruby
2
+ # == Description
3
+ #
4
+ # This example illustrates how daru, combined with Statsample::Graph::Boxplot
5
+ # can be used for generating box plots of a normally distributed set of data.
6
+ #
7
+ # The 'rnorm' function, defined in statsample/shorthand, generates a Daru::Vector
8
+ # object which contains the specified number of random variables in a normal distribution.
9
+ # It uses the 'distribution' gem for this purpose.
10
+ #
11
+ # Create a boxplot of the data by specifying the vectors a, b and c and providing
12
+ # necessary options to Statsample::Graph::Boxplot. The 'boxplot' function is shorthand
13
+ # for calling Statsample::Graph::Boxplot.
2
14
  $:.unshift(File.dirname(__FILE__)+'/../lib/')
3
15
  require 'statsample'
4
16
  Statsample::Analysis.store(Statsample::Graph::Boxplot) do
5
- n=30
6
- a=rnorm(n-1,50,10)
7
- b=rnorm(n, 30,5)
8
- c=rnorm(n,5,1)
17
+ n = 30
18
+ a = rnorm(n-1,50,10)
19
+ b = rnorm(n, 30,5)
20
+ c = rnorm(n,5,1)
9
21
  a.push(2)
22
+
10
23
  boxplot(:vectors=>[a,b,c],:width=>300, :height=>300, :groups=>%w{first first second}, :minimum=>0)
11
-
12
24
  end
13
25
 
14
26
  if __FILE__==$0
@@ -1,16 +1,45 @@
1
1
  #!/usr/bin/ruby
2
+
3
+ # == Description
4
+ #
5
+ # Creating and summarizing a correlation matrix with daru and statsample
2
6
  $:.unshift(File.dirname(__FILE__)+'/../lib/')
3
- require 'statsample'
4
7
 
8
+ require 'statsample'
5
9
  Statsample::Analysis.store("Statsample::Bivariate.correlation_matrix") do
10
+ # It so happens that Daru::Vector and Daru::DataFrame must update metadata
11
+ # like positions of missing values every time they are created.
12
+ #
13
+ # Since we don't have any missing values in the data that we are creating,
14
+ # we set Daru.lazy_update = true so that missing data is not updated every
15
+ # time and things happen much faster.
16
+ #
17
+ # In case you do have missing data and lazy_update has been set to *true*,
18
+ # you _SHOULD_ call `#update` on the concerned Vector or DataFrame object
19
+ # every time an assignment or deletion cycle is complete.
20
+ Daru.lazy_update = true
21
+
22
+ # Create a Daru::DataFrame containing 4 vectors a, b, c and d.
23
+ #
24
+ # Notice that the `clone` option has been set to *false*. This tells Daru
25
+ # to not clone the Daru::Vectors being supplied by `rnorm`, since it would
26
+ # be unnecessarily counter productive to clone the vectors once they have
27
+ # been assigned to the dataframe.
6
28
  samples=1000
7
- ds=data_frame(
8
- 'a'=>rnorm(samples),
9
- 'b'=>rnorm(samples),
10
- 'c'=>rnorm(samples),
11
- 'd'=>rnorm(samples))
12
- cm=cor(ds)
29
+ ds = Daru::DataFrame.new({
30
+ :a => rnorm(samples),
31
+ :b => rnorm(samples),
32
+ :c => rnorm(samples),
33
+ :d => rnorm(samples)
34
+ }, clone: false)
35
+
36
+ # Calculate correlation matrix by calling the `cor` shorthand.
37
+ cm = cor(ds)
13
38
  summary(cm)
39
+
40
+ # Set lazy_update to *false* once our job is done so that this analysis does
41
+ # not accidentally affect code elsewhere.
42
+ Daru.lazy_update = false
14
43
  end
15
44
 
16
45
  if __FILE__==$0