statsample 0.5.1 → 0.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (51) hide show
  1. data/History.txt +12 -0
  2. data/Manifest.txt +13 -0
  3. data/README.txt +2 -1
  4. data/demo/pca.rb +29 -0
  5. data/demo/umann.rb +8 -0
  6. data/lib/distribution.rb +0 -1
  7. data/lib/matrix_extension.rb +35 -21
  8. data/lib/statsample.rb +31 -28
  9. data/lib/statsample/anova.rb +7 -2
  10. data/lib/statsample/bivariate.rb +17 -11
  11. data/lib/statsample/codification.rb +136 -87
  12. data/lib/statsample/combination.rb +0 -2
  13. data/lib/statsample/converter/csv18.rb +1 -1
  14. data/lib/statsample/converter/csv19.rb +1 -1
  15. data/lib/statsample/converters.rb +176 -171
  16. data/lib/statsample/crosstab.rb +227 -154
  17. data/lib/statsample/dataset.rb +94 -12
  18. data/lib/statsample/dominanceanalysis.rb +69 -62
  19. data/lib/statsample/dominanceanalysis/bootstrap.rb +25 -21
  20. data/lib/statsample/factor.rb +18 -0
  21. data/lib/statsample/factor/pca.rb +128 -0
  22. data/lib/statsample/factor/principalaxis.rb +133 -0
  23. data/lib/statsample/factor/rotation.rb +125 -0
  24. data/lib/statsample/histogram.rb +99 -0
  25. data/lib/statsample/mle.rb +125 -126
  26. data/lib/statsample/mle/logit.rb +91 -91
  27. data/lib/statsample/mle/probit.rb +84 -85
  28. data/lib/statsample/multiset.rb +1 -1
  29. data/lib/statsample/permutation.rb +96 -0
  30. data/lib/statsample/regression.rb +1 -1
  31. data/lib/statsample/regression/binomial.rb +89 -89
  32. data/lib/statsample/regression/binomial/logit.rb +9 -9
  33. data/lib/statsample/regression/binomial/probit.rb +9 -9
  34. data/lib/statsample/regression/multiple.rb +8 -14
  35. data/lib/statsample/regression/multiple/gslengine.rb +1 -1
  36. data/lib/statsample/regression/multiple/rubyengine.rb +55 -55
  37. data/lib/statsample/resample.rb +12 -17
  38. data/lib/statsample/srs.rb +4 -1
  39. data/lib/statsample/test.rb +23 -22
  40. data/lib/statsample/test/umannwhitney.rb +182 -0
  41. data/lib/statsample/vector.rb +854 -815
  42. data/test/test_bivariate.rb +132 -132
  43. data/test/test_codification.rb +71 -50
  44. data/test/test_dataset.rb +19 -1
  45. data/test/test_factor.rb +44 -0
  46. data/test/test_histogram.rb +26 -0
  47. data/test/test_permutation.rb +37 -0
  48. data/test/test_statistics.rb +74 -63
  49. data/test/test_umannwhitney.rb +17 -0
  50. data/test/test_vector.rb +46 -30
  51. metadata +31 -4
data/History.txt CHANGED
@@ -1,3 +1,14 @@
1
+ === 0.6.0 / 2010-02-05
2
+ * New Statsample::Factor module. Include classes for extracting factors (Statsample::Factor::PCA and Statsample::Factor::PrincipalAxis) and rotate component matrix ( Statsample::Factor::Rotation subclasses). For now, only orthogonal rotations
3
+ * New Statsample::Dataset.crosstab_with_asignation, Statsample::Dataset.one_to_many
4
+ * New class Statsample::Permutation to produce permutations of a given array
5
+ * New class Statsample::Histogram, with same interface as GSL one
6
+ * New class Statsample::Test::UMannWhitney, to perform Mann-Whitney's U test. Gives z based and exact calculation of probability
7
+ * Improved support for ReportBuilder
8
+ * Statsample::Codification module reworked
9
+ * Fixed bugs on Dominance Analysis classes
10
+ * Fixed bugs on Statsample::Vector.kurtosis and Statsample::Vector.skew
11
+
1
12
  === 0.5.1 / 2009-10-06
2
13
 
3
14
  * New class Statsample::Bivariate::Tetrachoric, for calculation of tetrachoric correlations. See http://www.john-uebersax.com/stat/tetra.htm for information.
@@ -18,6 +29,7 @@
18
29
  * Logit tests
19
30
  * Bug fix: rescue for requires doesn't specify LoadError
20
31
  * Binomial::BaseEngine new methods: coeffs_se, coeffs, constant and constant_se
32
+
21
33
  === 0.4.0 / 2009-09-10
22
34
  * New Distribution module, based on statistics2.rb by Shin-ichiro HARA. Replaces all instances of GSL distributions pdf and cdf calculations for native calculation.
23
35
  * New Maximum Likehood Estimation for Logit, Probit and Normal Distribution using Von Tessin(2005) algorithm. See MLE class and subclasses for more information.
data/Manifest.txt CHANGED
@@ -18,6 +18,7 @@ demo/graph.rb
18
18
  demo/item_analysis.rb
19
19
  demo/mean.rb
20
20
  demo/nunnally_6.rb
21
+ demo/pca.rb
21
22
  demo/proportion.rb
22
23
  demo/regression.rb
23
24
  demo/sample_test.csv
@@ -25,6 +26,7 @@ demo/spss_matrix.rb
25
26
  demo/strata_proportion.rb
26
27
  demo/stratum.rb
27
28
  demo/t-student.rb
29
+ demo/umann.rb
28
30
  lib/distribution.rb
29
31
  lib/distribution/chisquare.rb
30
32
  lib/distribution/f.rb
@@ -47,17 +49,23 @@ lib/statsample/crosstab.rb
47
49
  lib/statsample/dataset.rb
48
50
  lib/statsample/dominanceanalysis.rb
49
51
  lib/statsample/dominanceanalysis/bootstrap.rb
52
+ lib/statsample/factor.rb
53
+ lib/statsample/factor/pca.rb
54
+ lib/statsample/factor/principalaxis.rb
55
+ lib/statsample/factor/rotation.rb
50
56
  lib/statsample/graph/gdchart.rb
51
57
  lib/statsample/graph/svgboxplot.rb
52
58
  lib/statsample/graph/svggraph.rb
53
59
  lib/statsample/graph/svghistogram.rb
54
60
  lib/statsample/graph/svgscatterplot.rb
61
+ lib/statsample/histogram.rb
55
62
  lib/statsample/htmlreport.rb
56
63
  lib/statsample/mle.rb
57
64
  lib/statsample/mle/logit.rb
58
65
  lib/statsample/mle/normal.rb
59
66
  lib/statsample/mle/probit.rb
60
67
  lib/statsample/multiset.rb
68
+ lib/statsample/permutation.rb
61
69
  lib/statsample/regression.rb
62
70
  lib/statsample/regression/binomial.rb
63
71
  lib/statsample/regression/binomial/logit.rb
@@ -72,6 +80,7 @@ lib/statsample/reliability.rb
72
80
  lib/statsample/resample.rb
73
81
  lib/statsample/srs.rb
74
82
  lib/statsample/test.rb
83
+ lib/statsample/test/umannwhitney.rb
75
84
  lib/statsample/vector.rb
76
85
  po/es/statsample.po
77
86
  po/statsample.pot
@@ -85,11 +94,14 @@ test/test_csv.csv
85
94
  test/test_csv.rb
86
95
  test/test_dataset.rb
87
96
  test/test_distribution.rb
97
+ test/test_factor.rb
88
98
  test/test_ggobi.rb
89
99
  test/test_gsl.rb
100
+ test/test_histogram.rb
90
101
  test/test_logit.rb
91
102
  test/test_mle.rb
92
103
  test/test_multiset.rb
104
+ test/test_permutation.rb
93
105
  test/test_regression.rb
94
106
  test/test_reliability.rb
95
107
  test/test_resample.rb
@@ -97,6 +109,7 @@ test/test_srs.rb
97
109
  test/test_statistics.rb
98
110
  test/test_stratified.rb
99
111
  test/test_svg_graph.rb
112
+ test/test_umannwhitney.rb
100
113
  test/test_vector.rb
101
114
  test/test_xls.rb
102
115
  test/test_xls.xls
data/README.txt CHANGED
@@ -5,10 +5,11 @@ http://ruby-statsample.rubyforge.org/
5
5
 
6
6
  == DESCRIPTION:
7
7
 
8
- A suite for your basic and advanced statistics needs. Descriptive statistics, multiple regression, dominance analysis, scale's reliability analysis, bivariate statistics and others procedures.
8
+ A suite for your basic and advanced statistics needs. Descriptive statistics, multiple regression, factorial analysis, dominance analysis, scale's reliability analysis, bivariate statistics and others procedures.
9
9
 
10
10
  == FEATURES:
11
11
 
12
+ * Factorial Analysis. Principal Component Analysis and Principal Axis extraction, with orthogonal rotations (Varimax, Equimax, Quartimax)
12
13
  * Multiple Regression. Listwise analysis optimized with use of Alglib library. Pairwise analysis is executed on pure ruby and reports same values as SPSS
13
14
  * Dominance Analysis. Based on Budescu and Azen papers, <strong>DominanceAnalysis</strong> class can report dominance analysis for a sample and <strong>DominanceAnalysisBootstrap</strong> can execute bootstrap analysis to determine dominance stability, as recomended by Azen & Budescu (2003) link[http://psycnet.apa.org/journals/met/8/2/129/].
14
15
  * Classes for Vector, Datasets (set of Vectors) and Multisets (multiple datasets with same fields and type of vectors), and multiple methods to manipulate them
data/demo/pca.rb ADDED
@@ -0,0 +1,29 @@
1
+ require File.dirname(__FILE__)+"/../lib/statsample"
2
+ require 'matrix_extension'
3
+ require 'reportbuilder'
4
+ require 'gsl'
5
+ ds=Statsample.load("/home/cdx/trabajo/sepade/pdie/2008_ntic/analisis_c1/tesis.ds")
6
+ ds2=ds['ac_gen'..'ac_db'].dup_only_valid
7
+
8
+ cm=Statsample::Bivariate.correlation_matrix(ds2)
9
+
10
+ pca=Statsample::Factor::PCA.new(cm, :m=>2)
11
+ rb=ReportBuilder.new()
12
+ rb.add(pca)
13
+
14
+ varimax=Statsample::Factor::Quartimax.new(pca.component_matrix.to_matrix)
15
+ varimax.iterate
16
+ rb.add(varimax.rotated)
17
+ rb.add(varimax.iterations)
18
+ rb.add(varimax.component_transformation_matrix)
19
+ rb.add(varimax.h2)
20
+ =begin
21
+ fa=Statsample::Factor::PrincipalAxis.new(cm, :m=>1)
22
+ rb=ReportBuilder.new()
23
+ rb.add(fa)
24
+
25
+ =end
26
+ puts rb.to_text
27
+
28
+
29
+
data/demo/umann.rb ADDED
@@ -0,0 +1,8 @@
1
+ require File.dirname(__FILE__)+'/../lib/statsample'
2
+ v1=[1,2,3,4,7,8,9,10,14,15].to_scale
3
+ v2=[5,6,11,12,13,16,17,18,19].to_scale
4
+ u=Statsample::Test::UMannWhitney.new(v1,v2)
5
+
6
+ puts u.summary
7
+
8
+ #p Statsample::Test::UMannWhitney.exact_probability_as62(100,100)
data/lib/distribution.rb CHANGED
@@ -12,5 +12,4 @@ module Distribution
12
12
  autoload(:T, 'distribution/t')
13
13
  autoload(:F, 'distribution/f')
14
14
  autoload(:Normal, 'distribution/normal')
15
-
16
15
  end
@@ -1,4 +1,18 @@
1
1
  require 'matrix'
2
+
3
+ if RUBY_VERSION<="1.9.0"
4
+ class Vector
5
+ alias_method :old_coerce, :coerce
6
+ def coerce(other)
7
+ case other
8
+ when Numeric
9
+ return Matrix::Scalar.new(other), self
10
+ else
11
+ raise TypeError, "#{self.class} can't be coerced into #{other.class}"
12
+ end
13
+ end
14
+ end
15
+ end
2
16
  class Matrix
3
17
  def rows_sum
4
18
  (0...row_size).collect {|i|
@@ -37,31 +51,31 @@ class Matrix
37
51
  end
38
52
  # Test if a Matrix is a identity one
39
53
  def identity?
40
- if regular?
41
- rows=(0...row_size).each{|i|
42
- (0...column_size).each {|j|
43
- v = self[i,j]
44
- return false if (i==j and v!=1) or (i!=j and v!=0)
45
- }
46
- }
47
- true
48
- else
49
- false
50
- end
54
+ if regular?
55
+ rows=(0...row_size).each{|i|
56
+ (0...column_size).each {|j|
57
+ v = self[i,j]
58
+ return false if (i==j and v!=1) or (i!=j and v!=0)
59
+ }
60
+ }
61
+ true
62
+ else
63
+ false
64
+ end
51
65
  end
52
66
  def to_gsl
53
- out=[]
54
- self.row_size.times{|i|
55
- out[i]=self.row(i).to_a
56
- }
57
- GSL::Matrix[*out]
67
+ out=[]
68
+ self.row_size.times{|i|
69
+ out[i]=self.row(i).to_a
70
+ }
71
+ GSL::Matrix[*out]
58
72
  end
59
73
  def orthogonal?
60
- if regular?
61
- (self * self.t).identity?
62
- else
63
- false
64
- end
74
+ if regular?
75
+ (self * self.t).identity?
76
+ else
77
+ false
78
+ end
65
79
  end
66
80
  end
67
81
 
data/lib/statsample.rb CHANGED
@@ -108,16 +108,18 @@ end
108
108
  # * Dataset: An union of vectors.
109
109
  #
110
110
  module Statsample
111
- VERSION = '0.5.1'
111
+ VERSION = '0.6.0'
112
112
  SPLIT_TOKEN = ","
113
113
  autoload(:Database, 'statsample/converters')
114
114
  autoload(:Anova, 'statsample/anova')
115
115
  autoload(:Combination, 'statsample/combination')
116
+ autoload(:Permutation, 'statsample/permutation')
116
117
  autoload(:CSV, 'statsample/converters')
117
118
  autoload(:PlainText, 'statsample/converters')
118
119
  autoload(:Excel, 'statsample/converters')
119
120
  autoload(:GGobi, 'statsample/converters')
120
121
  autoload(:SPSS, 'statsample/converter/spss')
122
+ autoload(:Histogram, 'statsample/histogram')
121
123
  autoload(:DominanceAnalysis, 'statsample/dominanceanalysis')
122
124
  autoload(:HtmlReport, 'statsample/htmlreport')
123
125
  autoload(:Mx, 'statsample/converters')
@@ -132,6 +134,7 @@ module Statsample
132
134
  autoload(:MLE, 'statsample/mle')
133
135
  autoload(:Regression, 'statsample/regression')
134
136
  autoload(:Test, 'statsample/test')
137
+ autoload(:Factor, 'statsample/factor')
135
138
  def self.load(filename)
136
139
  if File.exists? filename
137
140
  o=false
@@ -157,38 +160,38 @@ module Statsample
157
160
  end
158
161
  module Writable
159
162
  def save(filename)
160
- fp=File.open(filename,"w")
161
- Marshal.dump(self,fp)
162
- fp.close
163
+ fp=File.open(filename,"w")
164
+ Marshal.dump(self,fp)
165
+ fp.close
163
166
  end
164
167
  end
165
168
  module HtmlSummary
166
- def add_line(n=nil)
167
- self << "<hr />"
168
- end
169
- def nl
170
- self << "<br />"
171
- end
172
- def add(text)
173
- self << ("<p>"+text.gsub("\n","<br />")+"</p>")
174
- end
175
- def parse_table(table)
176
- self << table.parse_html
177
- end
169
+ def add_line(n=nil)
170
+ self << "<hr />"
171
+ end
172
+ def nl
173
+ self << "<br />"
174
+ end
175
+ def add(text)
176
+ self << ("<p>"+text.gsub("\n","<br />")+"</p>")
177
+ end
178
+ def parse_table(table)
179
+ self << table.parse_html
180
+ end
178
181
  end
179
182
  module ConsoleSummary
180
- def add_line(n=80)
181
- self << "-"*n+"\n"
182
- end
183
- def nl
184
- self << "\n"
185
- end
186
- def add(text)
187
- self << text
188
- end
189
- def parse_table(table)
190
- self << table.parse_console
191
- end
183
+ def add_line(n=80)
184
+ self << "-"*n+"\n"
185
+ end
186
+ def nl
187
+ self << "\n"
188
+ end
189
+ def add(text)
190
+ self << text
191
+ end
192
+ def parse_table(table)
193
+ self << table.parse_console
194
+ end
192
195
  end
193
196
  class ReportTable
194
197
  attr_reader :header
@@ -6,8 +6,13 @@ module Statsample
6
6
  # v2=[3,3,4,5,6].to_scale
7
7
  # v3=[5,3,1,5,6].to_scale
8
8
  # anova=Statsample::Anova::OneWay.new([v1,v2,v3])
9
- # puts anova.f
10
- # puts anova.significance
9
+ # anova.f
10
+ # => 0.0243902439024391
11
+ # anova.significance
12
+ # => 0.975953044203438
13
+ # anova.sst
14
+ # => 32.9333333333333
15
+ #
11
16
  class OneWay
12
17
  def initialize(vectors)
13
18
  @vectors=vectors
@@ -13,6 +13,7 @@ module Statsample
13
13
  covariance_slow(v1a,v2a)
14
14
  end
15
15
  end
16
+ # Estimate the ML between two dichotomic vectors
16
17
  def maximum_likehood_dichotomic(pred,real)
17
18
  preda,reala=Statsample.only_valid(pred,real)
18
19
  sum=0
@@ -59,13 +60,14 @@ module Statsample
59
60
  end
60
61
  # Retrieves the value for t test for a pearson correlation
61
62
  # giving r and vector size
63
+ # Source : http://faculty.chass.ncsu.edu/garson/PA765/correl.htm
62
64
  def t_r(r,size)
63
65
  r * Math::sqrt(((size)-2).to_f / (1 - r**2))
64
66
  end
65
67
  # Retrieves the probability value (a la SPSS)
66
68
  # for a given t, size and number of tails.
67
69
  # Uses a second parameter
68
- # * :both or 2 : for r!=0
70
+ # * :both or 2 : for r!=0 (default)
69
71
  # * :right, :positive or 1 : for r > 0
70
72
  # * :left, :negative : for r < 0
71
73
 
@@ -112,6 +114,7 @@ module Statsample
112
114
  (rv1v2-(rv1con*rv2con)).quo(Math::sqrt(1-rv1con**2) * Math::sqrt(1-rv2con**2))
113
115
 
114
116
  end
117
+
115
118
  # Covariance matrix.
116
119
  # Order of rows and columns depends on Dataset#fields order
117
120
 
@@ -139,7 +142,8 @@ module Statsample
139
142
  end
140
143
  end
141
144
  end
142
- # Retrieves the n valid pairwise
145
+
146
+ # Retrieves the n valid pairwise.
143
147
  def n_valid_matrix(ds)
144
148
  ds.collect_matrix do |row,col|
145
149
  if row==col
@@ -150,7 +154,8 @@ module Statsample
150
154
  end
151
155
  end
152
156
  end
153
- # Matrix of correlation probability
157
+
158
+ # Matrix of correlation probabilities.
154
159
  # Order of rows and columns depends on Dataset#fields order
155
160
 
156
161
  def correlation_probability_matrix(ds, tails=:both)
@@ -162,6 +167,7 @@ module Statsample
162
167
  end
163
168
  Matrix.rows(rows)
164
169
  end
170
+
165
171
  # Spearman ranked correlation coefficient between 2 vectors
166
172
  def spearman(v1,v2)
167
173
  v1a,v2a=Statsample.only_valid(v1,v2)
@@ -218,16 +224,16 @@ module Statsample
218
224
  rs=matrix.row_size
219
225
  cs=matrix.column_size
220
226
  conc=disc=ties_x=ties_y=0
221
- (0...(rs-1)).each {|x|
222
- (0...(cs-1)).each{|y|
223
- ((x+1)...rs).each{|x2|
224
- ((y+1)...cs).each{|y2|
227
+ (0...(rs-1)).each do |x|
228
+ (0...(cs-1)).each do |y|
229
+ ((x+1)...rs).each do |x2|
230
+ ((y+1)...cs).each do |y2|
225
231
  # #p sprintf("%d:%d,%d:%d",x,y,x2,y2)
226
232
  conc+=matrix[x,y]*matrix[x2,y2]
227
- }
228
- }
229
- }
230
- }
233
+ end
234
+ end
235
+ end
236
+ end
231
237
  (0...(rs-1)).each {|x|
232
238
  (1...(cs)).each{|y|
233
239
  ((x+1)...rs).each{|x2|
@@ -27,94 +27,143 @@ module Statsample
27
27
  # }
28
28
  # end
29
29
  #
30
- module Codification
31
- class << self
32
- # Create a yaml dump for a hash, based on vectors
33
- # The keys will be vectors name on dataset and the values
34
- # will be hashes, with keys = values, for recodification
35
- #
36
- # v1=%w{a,b b,c d}.to_vector
37
- # ds={"v1"=>v1}.to_dataset
38
- # Statsample::Codification.create_yaml(ds,['v1'])
39
- # => "--- \nv1: \n a: a\n b: b\n c: c\n d: d\n"
40
- def create_yaml(dataset,vectors,sep=Statsample::SPLIT_TOKEN,io=nil)
41
- raise ArgumentError,"Array should't be empty" if vectors.size==0
42
- pro_hash=vectors.inject({}){|h,v_name|
43
- raise Exception, "Vector #{v_name} doesn't exists on Dataset" if !dataset.fields.include? v_name
44
- v=dataset[v_name]
45
- split_data=v.splitted(sep)
46
- factors=split_data.flatten.uniq.compact.sort.inject({}) {|ac,val| ac[val]=val;ac}
47
- h[v_name]=factors
48
- h
49
- }
50
- YAML.dump(pro_hash,io)
51
- end
52
- def inverse_hash(h,sep=Statsample::SPLIT_TOKEN)
53
- h.inject({}) {|a,v|
54
- v[1].split(sep).each {|val|
55
- a[val]||=[]
56
- a[val].push(v[0])
57
- }
58
- a
59
- }
60
- end
61
- def dictionary(h,sep=Statsample::SPLIT_TOKEN)
62
- h.inject({}) {|a,v|
63
- a[v[0]]=v[1].split(sep)
64
- a
65
- }
66
- end
67
- def recode_vector(v,h,sep=Statsample::SPLIT_TOKEN)
68
- dict=dictionary(h,sep)
69
- new_data=v.splitted(sep)
70
- recoded=new_data.collect{|c|
71
- if c.nil?
72
- nil
73
- else
74
- c.collect{|value|
75
- dict[value]
76
- }.flatten.uniq
77
- end
78
- }
79
- end
80
- def recode_dataset_simple!(dataset,yaml,sep=Statsample::SPLIT_TOKEN)
81
- _recode_dataset(dataset,yaml,sep,false)
82
- end
83
- def recode_dataset_split!(dataset,yaml,sep=Statsample::SPLIT_TOKEN)
84
- _recode_dataset(dataset,yaml,sep,true)
85
- end
86
-
87
- def _recode_dataset(dataset,yaml,sep=Statsample::SPLIT_TOKEN,split=false)
88
- h=YAML::load(yaml)
89
- v_names||=h.keys
90
- v_names.each do |v_name|
91
- raise Exception, "Vector #{v_name} doesn't exists on Dataset" if !dataset.fields.include? v_name
92
- recoded=recode_vector(dataset[v_name],h[v_name],sep).collect { |c|
93
- if c.nil?
94
- nil
95
- else
96
- c.join(sep)
97
- end
98
- }.to_vector
99
- if(split)
100
- recoded.split_by_separator(sep).each {|k,v|
101
- dataset[v_name+"_"+k]=v
102
- }
103
- else
104
- dataset[v_name+"_recoded"]=recoded
105
- end
106
- end
107
- end
108
- def verify(yaml,v_names=nil,sep=Statsample::SPLIT_TOKEN,io=$>)
109
- require 'pp'
110
- h=YAML::load(yaml)
111
- v_names||=h.keys
112
- v_names.each{|v_name|
113
- inverse=inverse_hash(h[v_name],sep)
114
- io.puts "Vector: #{v_name}"
115
- YAML.dump(inverse.sort,io)
116
- }
30
+ module Codification
31
+ class << self
32
+ # Create a hash, based on vectors, to create the dictionary.
33
+ # The keys will be vectors name on dataset and the values
34
+ # will be hashes, with keys = values, for recodification
35
+ def create_hash(dataset, vectors, sep=Statsample::SPLIT_TOKEN)
36
+ raise ArgumentError,"Array should't be empty" if vectors.size==0
37
+ pro_hash=vectors.inject({}){|h,v_name|
38
+ raise Exception, "Vector #{v_name} doesn't exists on Dataset" if !dataset.fields.include? v_name
39
+ v=dataset[v_name]
40
+ split_data=v.splitted(sep).flatten.collect {|c| c.to_s}.find_all {|c| !c.nil?}
41
+
42
+ factors=split_data.uniq.compact.sort.inject({}) {|ac,val| ac[val]=val;ac }
43
+ h[v_name]=factors
44
+ h
45
+ }
46
+ pro_hash
47
+ end
48
+ # Create a yaml to create a dictionary, based on vectors
49
+ # The keys will be vectors name on dataset and the values
50
+ # will be hashes, with keys = values, for recodification
51
+ #
52
+ # v1=%w{a,b b,c d}.to_vector
53
+ # ds={"v1"=>v1}.to_dataset
54
+ # Statsample::Codification.create_yaml(ds,['v1'])
55
+ # => "--- \nv1: \n a: a\n b: b\n c: c\n d: d\n"
56
+ def create_yaml(dataset, vectors, io=nil, sep=Statsample::SPLIT_TOKEN)
57
+ pro_hash=create_hash(dataset, vectors, sep)
58
+ YAML.dump(pro_hash,io)
59
+ end
60
+ # Create a excel to create a dictionary, based on vectors.
61
+ # Raises an error if filename exists
62
+ # The rows will be:
63
+ # * field: name of vector
64
+ # * original: original name
65
+ # * recoded: new code
66
+
67
+ def create_excel(dataset, vectors, filename, sep=Statsample::SPLIT_TOKEN)
68
+ require 'spreadsheet'
69
+ if File.exists?(filename)
70
 + raise "Exists a file named #{filename}. Delete ir before overwrite."
71
+ end
72
+ book = Spreadsheet::Workbook.new
73
+ sheet = book.create_worksheet
74
+ sheet.row(0).concat(%w{field original recoded})
75
+ i=1
76
+ create_hash(dataset, vectors, sep).sort.each do |field, inner_hash|
77
+ inner_hash.sort.each do |k,v|
78
+ sheet.row(i).concat([field.dup,k.dup,v.dup])
79
+ i+=1
80
+ end
81
+ end
82
+ book.write(filename)
83
+ end
84
+ # From a excel generates a dictionary hash
85
+ # to use on recode_dataset_simple!() or recode_dataset_split!().
86
+ #
87
+ def excel_to_recoded_hash(filename)
88
+ require 'spreadsheet'
89
+ h={}
90
+ book = Spreadsheet.open filename
91
+ sheet= book.worksheet 0
92
+ row_i=0
93
+ sheet.each do |row|
94
+ row_i+=1
95
+ next if row_i==1 or row[0].nil? or row[1].nil? or row[2].nil?
96
+ h[row[0]]={} if h[row[0]].nil?
97
+ h[row[0]][row[1]]=row[2]
98
+ end
99
+ h
100
+ end
101
+
102
+ def inverse_hash(h, sep=Statsample::SPLIT_TOKEN)
103
+ h.inject({}) do |a,v|
104
+ v[1].split(sep).each do |val|
105
+ a[val]||=[]
106
+ a[val].push(v[0])
107
+ end
108
+ a
109
+ end
110
+ end
111
+
112
+ def dictionary(h, sep=Statsample::SPLIT_TOKEN)
113
+ h.inject({}) {|a,v| a[v[0]]=v[1].split(sep); a }
114
+ end
115
+
116
+ def recode_vector(v,h,sep=Statsample::SPLIT_TOKEN)
117
+ dict=dictionary(h,sep)
118
+ new_data=v.splitted(sep)
119
+ recoded=new_data.collect do |c|
120
+ if c.nil?
121
+ nil
122
+ else
123
+ c.collect{|value| dict[value] }.flatten.uniq
124
+ end
125
+ end
126
+ end
127
+ def recode_dataset_simple!(dataset, dictionary_hash ,sep=Statsample::SPLIT_TOKEN)
128
+ _recode_dataset(dataset,dictionary_hash ,sep,false)
129
+ end
130
+ def recode_dataset_split!(dataset, dictionary_hash, sep=Statsample::SPLIT_TOKEN)
131
+ _recode_dataset(dataset, dictionary_hash, sep,true)
132
+ end
133
+
134
+ def _recode_dataset(dataset, h , sep=Statsample::SPLIT_TOKEN, split=false)
135
+ v_names||=h.keys
136
+ v_names.each do |v_name|
137
+ raise Exception, "Vector #{v_name} doesn't exists on Dataset" if !dataset.fields.include? v_name
138
+ recoded=recode_vector(dataset[v_name], h[v_name],sep).collect { |c|
139
+ if c.nil?
140
+ nil
141
+ else
142
+ c.join(sep)
143
+ end
144
+ }.to_vector
145
+ if(split)
146
+ recoded.split_by_separator(sep).each {|k,v|
147
+ dataset[v_name+"_"+k]=v
148
+ }
149
+ else
150
+ dataset[v_name+"_recoded"]=recoded
117
151
  end
152
+ end
153
+ end
154
+
155
+
156
+ def verify(h, v_names=nil,sep=Statsample::SPLIT_TOKEN,io=$>)
157
+ require 'pp'
158
+ v_names||=h.keys
159
+ v_names.each{|v_name|
160
+ inverse=inverse_hash(h[v_name],sep)
161
+ io.puts "- Field: #{v_name}"
162
+ inverse.sort{|a,b| -(a[1].count<=>b[1].count)}.each {|k,v|
163
+ io.puts " - \"#{k}\" (#{v.count}) :\n -'"+v.join("\n -'")+"'"
164
+ }
165
+ }
118
166
  end
119
167
  end
168
+ end
120
169
  end