statsample 0.5.1 → 0.6.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (51) hide show
  1. data/History.txt +12 -0
  2. data/Manifest.txt +13 -0
  3. data/README.txt +2 -1
  4. data/demo/pca.rb +29 -0
  5. data/demo/umann.rb +8 -0
  6. data/lib/distribution.rb +0 -1
  7. data/lib/matrix_extension.rb +35 -21
  8. data/lib/statsample.rb +31 -28
  9. data/lib/statsample/anova.rb +7 -2
  10. data/lib/statsample/bivariate.rb +17 -11
  11. data/lib/statsample/codification.rb +136 -87
  12. data/lib/statsample/combination.rb +0 -2
  13. data/lib/statsample/converter/csv18.rb +1 -1
  14. data/lib/statsample/converter/csv19.rb +1 -1
  15. data/lib/statsample/converters.rb +176 -171
  16. data/lib/statsample/crosstab.rb +227 -154
  17. data/lib/statsample/dataset.rb +94 -12
  18. data/lib/statsample/dominanceanalysis.rb +69 -62
  19. data/lib/statsample/dominanceanalysis/bootstrap.rb +25 -21
  20. data/lib/statsample/factor.rb +18 -0
  21. data/lib/statsample/factor/pca.rb +128 -0
  22. data/lib/statsample/factor/principalaxis.rb +133 -0
  23. data/lib/statsample/factor/rotation.rb +125 -0
  24. data/lib/statsample/histogram.rb +99 -0
  25. data/lib/statsample/mle.rb +125 -126
  26. data/lib/statsample/mle/logit.rb +91 -91
  27. data/lib/statsample/mle/probit.rb +84 -85
  28. data/lib/statsample/multiset.rb +1 -1
  29. data/lib/statsample/permutation.rb +96 -0
  30. data/lib/statsample/regression.rb +1 -1
  31. data/lib/statsample/regression/binomial.rb +89 -89
  32. data/lib/statsample/regression/binomial/logit.rb +9 -9
  33. data/lib/statsample/regression/binomial/probit.rb +9 -9
  34. data/lib/statsample/regression/multiple.rb +8 -14
  35. data/lib/statsample/regression/multiple/gslengine.rb +1 -1
  36. data/lib/statsample/regression/multiple/rubyengine.rb +55 -55
  37. data/lib/statsample/resample.rb +12 -17
  38. data/lib/statsample/srs.rb +4 -1
  39. data/lib/statsample/test.rb +23 -22
  40. data/lib/statsample/test/umannwhitney.rb +182 -0
  41. data/lib/statsample/vector.rb +854 -815
  42. data/test/test_bivariate.rb +132 -132
  43. data/test/test_codification.rb +71 -50
  44. data/test/test_dataset.rb +19 -1
  45. data/test/test_factor.rb +44 -0
  46. data/test/test_histogram.rb +26 -0
  47. data/test/test_permutation.rb +37 -0
  48. data/test/test_statistics.rb +74 -63
  49. data/test/test_umannwhitney.rb +17 -0
  50. data/test/test_vector.rb +46 -30
  51. metadata +31 -4
data/History.txt CHANGED
@@ -1,3 +1,14 @@
1
+ === 0.6.0 / 2010-02-05
2
+ * New Statsample::Factor module. Includes classes for extracting factors (Statsample::Factor::PCA and Statsample::Factor::PrincipalAxis) and rotating the component matrix (Statsample::Factor::Rotation subclasses). For now, only orthogonal rotations are supported
3
+ * New Statsample::Dataset.crosstab_with_asignation, Statsample::Dataset.one_to_many
4
+ * New class Statsample::Permutation to produce permutations of a given array
5
+ * New class Statsample::Histogram, with same interface as GSL one
6
+ * New class Statsample::Test::UMannWhitney, to perform Mann-Whitney's U test. Gives z-based and exact calculation of probability
7
+ * Improved support for ReportBuilder
8
+ * Statsample::Codification module reworked
9
+ * Fixed bugs on Dominance Analysis classes
10
+ * Fixed bugs on Statsample::Vector.kurtosis and Statsample::Vector.skew
11
+
1
12
  === 0.5.1 / 2009-10-06
2
13
 
3
14
  * New class Statsample::Bivariate::Tetrachoric, for calculation of tetrachoric correlations. See http://www.john-uebersax.com/stat/tetra.htm for information.
@@ -18,6 +29,7 @@
18
29
  * Logit tests
19
30
  * Bug fix: rescue for requires doesn't specify LoadError
20
31
  * Binomial::BaseEngine new methods: coeffs_se, coeffs, constant and constant_se
32
+
21
33
  === 0.4.0 / 2009-09-10
22
34
  * New Distribution module, based on statistics2.rb by Shin-ichiro HARA. Replaces all instances of GSL distributions pdf and cdf calculations for native calculation.
23
35
  * New Maximum Likelihood Estimation for Logit, Probit and Normal Distribution using Von Tessin (2005) algorithm. See MLE class and subclasses for more information.
data/Manifest.txt CHANGED
@@ -18,6 +18,7 @@ demo/graph.rb
18
18
  demo/item_analysis.rb
19
19
  demo/mean.rb
20
20
  demo/nunnally_6.rb
21
+ demo/pca.rb
21
22
  demo/proportion.rb
22
23
  demo/regression.rb
23
24
  demo/sample_test.csv
@@ -25,6 +26,7 @@ demo/spss_matrix.rb
25
26
  demo/strata_proportion.rb
26
27
  demo/stratum.rb
27
28
  demo/t-student.rb
29
+ demo/umann.rb
28
30
  lib/distribution.rb
29
31
  lib/distribution/chisquare.rb
30
32
  lib/distribution/f.rb
@@ -47,17 +49,23 @@ lib/statsample/crosstab.rb
47
49
  lib/statsample/dataset.rb
48
50
  lib/statsample/dominanceanalysis.rb
49
51
  lib/statsample/dominanceanalysis/bootstrap.rb
52
+ lib/statsample/factor.rb
53
+ lib/statsample/factor/pca.rb
54
+ lib/statsample/factor/principalaxis.rb
55
+ lib/statsample/factor/rotation.rb
50
56
  lib/statsample/graph/gdchart.rb
51
57
  lib/statsample/graph/svgboxplot.rb
52
58
  lib/statsample/graph/svggraph.rb
53
59
  lib/statsample/graph/svghistogram.rb
54
60
  lib/statsample/graph/svgscatterplot.rb
61
+ lib/statsample/histogram.rb
55
62
  lib/statsample/htmlreport.rb
56
63
  lib/statsample/mle.rb
57
64
  lib/statsample/mle/logit.rb
58
65
  lib/statsample/mle/normal.rb
59
66
  lib/statsample/mle/probit.rb
60
67
  lib/statsample/multiset.rb
68
+ lib/statsample/permutation.rb
61
69
  lib/statsample/regression.rb
62
70
  lib/statsample/regression/binomial.rb
63
71
  lib/statsample/regression/binomial/logit.rb
@@ -72,6 +80,7 @@ lib/statsample/reliability.rb
72
80
  lib/statsample/resample.rb
73
81
  lib/statsample/srs.rb
74
82
  lib/statsample/test.rb
83
+ lib/statsample/test/umannwhitney.rb
75
84
  lib/statsample/vector.rb
76
85
  po/es/statsample.po
77
86
  po/statsample.pot
@@ -85,11 +94,14 @@ test/test_csv.csv
85
94
  test/test_csv.rb
86
95
  test/test_dataset.rb
87
96
  test/test_distribution.rb
97
+ test/test_factor.rb
88
98
  test/test_ggobi.rb
89
99
  test/test_gsl.rb
100
+ test/test_histogram.rb
90
101
  test/test_logit.rb
91
102
  test/test_mle.rb
92
103
  test/test_multiset.rb
104
+ test/test_permutation.rb
93
105
  test/test_regression.rb
94
106
  test/test_reliability.rb
95
107
  test/test_resample.rb
@@ -97,6 +109,7 @@ test/test_srs.rb
97
109
  test/test_statistics.rb
98
110
  test/test_stratified.rb
99
111
  test/test_svg_graph.rb
112
+ test/test_umannwhitney.rb
100
113
  test/test_vector.rb
101
114
  test/test_xls.rb
102
115
  test/test_xls.xls
data/README.txt CHANGED
@@ -5,10 +5,11 @@ http://ruby-statsample.rubyforge.org/
5
5
 
6
6
  == DESCRIPTION:
7
7
 
8
- A suite for your basic and advanced statistics needs. Descriptive statistics, multiple regression, dominance analysis, scale's reliability analysis, bivariate statistics and others procedures.
8
+ A suite for your basic and advanced statistics needs. Descriptive statistics, multiple regression, factorial analysis, dominance analysis, scale's reliability analysis, bivariate statistics and other procedures.
9
9
 
10
10
  == FEATURES:
11
11
 
12
+ * Factorial Analysis. Principal Component Analysis and Principal Axis extraction, with orthogonal rotations (Varimax, Equimax, Quartimax)
12
13
  * Multiple Regression. Listwise analysis optimized with use of Alglib library. Pairwise analysis is executed on pure ruby and reports same values as SPSS
13
14
  * Dominance Analysis. Based on Budescu and Azen papers, <strong>DominanceAnalysis</strong> class can report dominance analysis for a sample and <strong>DominanceAnalysisBootstrap</strong> can execute bootstrap analysis to determine dominance stability, as recommended by Azen & Budescu (2003) link[http://psycnet.apa.org/journals/met/8/2/129/].
14
15
  * Classes for Vector, Datasets (set of Vectors) and Multisets (multiple datasets with same fields and type of vectors), and multiple methods to manipulate them
data/demo/pca.rb ADDED
@@ -0,0 +1,29 @@
1
+ require File.dirname(__FILE__)+"/../lib/statsample"
2
+ require 'matrix_extension'
3
+ require 'reportbuilder'
4
+ require 'gsl'
5
+ ds=Statsample.load("/home/cdx/trabajo/sepade/pdie/2008_ntic/analisis_c1/tesis.ds")
6
+ ds2=ds['ac_gen'..'ac_db'].dup_only_valid
7
+
8
+ cm=Statsample::Bivariate.correlation_matrix(ds2)
9
+
10
+ pca=Statsample::Factor::PCA.new(cm, :m=>2)
11
+ rb=ReportBuilder.new()
12
+ rb.add(pca)
13
+
14
+ varimax=Statsample::Factor::Quartimax.new(pca.component_matrix.to_matrix)
15
+ varimax.iterate
16
+ rb.add(varimax.rotated)
17
+ rb.add(varimax.iterations)
18
+ rb.add(varimax.component_transformation_matrix)
19
+ rb.add(varimax.h2)
20
+ =begin
21
+ fa=Statsample::Factor::PrincipalAxis.new(cm, :m=>1)
22
+ rb=ReportBuilder.new()
23
+ rb.add(fa)
24
+
25
+ =end
26
+ puts rb.to_text
27
+
28
+
29
+
data/demo/umann.rb ADDED
@@ -0,0 +1,8 @@
1
+ require File.dirname(__FILE__)+'/../lib/statsample'
2
+ v1=[1,2,3,4,7,8,9,10,14,15].to_scale
3
+ v2=[5,6,11,12,13,16,17,18,19].to_scale
4
+ u=Statsample::Test::UMannWhitney.new(v1,v2)
5
+
6
+ puts u.summary
7
+
8
+ #p Statsample::Test::UMannWhitney.exact_probability_as62(100,100)
data/lib/distribution.rb CHANGED
@@ -12,5 +12,4 @@ module Distribution
12
12
  autoload(:T, 'distribution/t')
13
13
  autoload(:F, 'distribution/f')
14
14
  autoload(:Normal, 'distribution/normal')
15
-
16
15
  end
@@ -1,4 +1,18 @@
1
1
  require 'matrix'
2
+
3
+ if RUBY_VERSION<="1.9.0"
4
+ class Vector
5
+ alias_method :old_coerce, :coerce
6
+ def coerce(other)
7
+ case other
8
+ when Numeric
9
+ return Matrix::Scalar.new(other), self
10
+ else
11
+ raise TypeError, "#{self.class} can't be coerced into #{other.class}"
12
+ end
13
+ end
14
+ end
15
+ end
2
16
  class Matrix
3
17
  def rows_sum
4
18
  (0...row_size).collect {|i|
@@ -37,31 +51,31 @@ class Matrix
37
51
  end
38
52
  # Test if a Matrix is a identity one
39
53
  def identity?
40
- if regular?
41
- rows=(0...row_size).each{|i|
42
- (0...column_size).each {|j|
43
- v = self[i,j]
44
- return false if (i==j and v!=1) or (i!=j and v!=0)
45
- }
46
- }
47
- true
48
- else
49
- false
50
- end
54
+ if regular?
55
+ rows=(0...row_size).each{|i|
56
+ (0...column_size).each {|j|
57
+ v = self[i,j]
58
+ return false if (i==j and v!=1) or (i!=j and v!=0)
59
+ }
60
+ }
61
+ true
62
+ else
63
+ false
64
+ end
51
65
  end
52
66
  def to_gsl
53
- out=[]
54
- self.row_size.times{|i|
55
- out[i]=self.row(i).to_a
56
- }
57
- GSL::Matrix[*out]
67
+ out=[]
68
+ self.row_size.times{|i|
69
+ out[i]=self.row(i).to_a
70
+ }
71
+ GSL::Matrix[*out]
58
72
  end
59
73
  def orthogonal?
60
- if regular?
61
- (self * self.t).identity?
62
- else
63
- false
64
- end
74
+ if regular?
75
+ (self * self.t).identity?
76
+ else
77
+ false
78
+ end
65
79
  end
66
80
  end
67
81
 
data/lib/statsample.rb CHANGED
@@ -108,16 +108,18 @@ end
108
108
  # * Dataset: An union of vectors.
109
109
  #
110
110
  module Statsample
111
- VERSION = '0.5.1'
111
+ VERSION = '0.6.0'
112
112
  SPLIT_TOKEN = ","
113
113
  autoload(:Database, 'statsample/converters')
114
114
  autoload(:Anova, 'statsample/anova')
115
115
  autoload(:Combination, 'statsample/combination')
116
+ autoload(:Permutation, 'statsample/permutation')
116
117
  autoload(:CSV, 'statsample/converters')
117
118
  autoload(:PlainText, 'statsample/converters')
118
119
  autoload(:Excel, 'statsample/converters')
119
120
  autoload(:GGobi, 'statsample/converters')
120
121
  autoload(:SPSS, 'statsample/converter/spss')
122
+ autoload(:Histogram, 'statsample/histogram')
121
123
  autoload(:DominanceAnalysis, 'statsample/dominanceanalysis')
122
124
  autoload(:HtmlReport, 'statsample/htmlreport')
123
125
  autoload(:Mx, 'statsample/converters')
@@ -132,6 +134,7 @@ module Statsample
132
134
  autoload(:MLE, 'statsample/mle')
133
135
  autoload(:Regression, 'statsample/regression')
134
136
  autoload(:Test, 'statsample/test')
137
+ autoload(:Factor, 'statsample/factor')
135
138
  def self.load(filename)
136
139
  if File.exists? filename
137
140
  o=false
@@ -157,38 +160,38 @@ module Statsample
157
160
  end
158
161
  module Writable
159
162
  def save(filename)
160
- fp=File.open(filename,"w")
161
- Marshal.dump(self,fp)
162
- fp.close
163
+ fp=File.open(filename,"w")
164
+ Marshal.dump(self,fp)
165
+ fp.close
163
166
  end
164
167
  end
165
168
  module HtmlSummary
166
- def add_line(n=nil)
167
- self << "<hr />"
168
- end
169
- def nl
170
- self << "<br />"
171
- end
172
- def add(text)
173
- self << ("<p>"+text.gsub("\n","<br />")+"</p>")
174
- end
175
- def parse_table(table)
176
- self << table.parse_html
177
- end
169
+ def add_line(n=nil)
170
+ self << "<hr />"
171
+ end
172
+ def nl
173
+ self << "<br />"
174
+ end
175
+ def add(text)
176
+ self << ("<p>"+text.gsub("\n","<br />")+"</p>")
177
+ end
178
+ def parse_table(table)
179
+ self << table.parse_html
180
+ end
178
181
  end
179
182
  module ConsoleSummary
180
- def add_line(n=80)
181
- self << "-"*n+"\n"
182
- end
183
- def nl
184
- self << "\n"
185
- end
186
- def add(text)
187
- self << text
188
- end
189
- def parse_table(table)
190
- self << table.parse_console
191
- end
183
+ def add_line(n=80)
184
+ self << "-"*n+"\n"
185
+ end
186
+ def nl
187
+ self << "\n"
188
+ end
189
+ def add(text)
190
+ self << text
191
+ end
192
+ def parse_table(table)
193
+ self << table.parse_console
194
+ end
192
195
  end
193
196
  class ReportTable
194
197
  attr_reader :header
@@ -6,8 +6,13 @@ module Statsample
6
6
  # v2=[3,3,4,5,6].to_scale
7
7
  # v3=[5,3,1,5,6].to_scale
8
8
  # anova=Statsample::Anova::OneWay.new([v1,v2,v3])
9
- # puts anova.f
10
- # puts anova.significance
9
+ # anova.f
10
+ # => 0.0243902439024391
11
+ # anova.significance
12
+ # => 0.975953044203438
13
+ # anova.sst
14
+ # => 32.9333333333333
15
+ #
11
16
  class OneWay
12
17
  def initialize(vectors)
13
18
  @vectors=vectors
@@ -13,6 +13,7 @@ module Statsample
13
13
  covariance_slow(v1a,v2a)
14
14
  end
15
15
  end
16
+ # Estimate the ML between two dichotomic vectors
16
17
  def maximum_likehood_dichotomic(pred,real)
17
18
  preda,reala=Statsample.only_valid(pred,real)
18
19
  sum=0
@@ -59,13 +60,14 @@ module Statsample
59
60
  end
60
61
  # Retrieves the value for t test for a pearson correlation
61
62
  # giving r and vector size
63
+ # Source : http://faculty.chass.ncsu.edu/garson/PA765/correl.htm
62
64
  def t_r(r,size)
63
65
  r * Math::sqrt(((size)-2).to_f / (1 - r**2))
64
66
  end
65
67
  # Retrieves the probability value (a la SPSS)
66
68
  # for a given t, size and number of tails.
67
69
  # Uses a second parameter
68
- # * :both or 2 : for r!=0
70
+ # * :both or 2 : for r!=0 (default)
69
71
  # * :right, :positive or 1 : for r > 0
70
72
  # * :left, :negative : for r < 0
71
73
 
@@ -112,6 +114,7 @@ module Statsample
112
114
  (rv1v2-(rv1con*rv2con)).quo(Math::sqrt(1-rv1con**2) * Math::sqrt(1-rv2con**2))
113
115
 
114
116
  end
117
+
115
118
  # Covariance matrix.
116
119
  # Order of rows and columns depends on Dataset#fields order
117
120
 
@@ -139,7 +142,8 @@ module Statsample
139
142
  end
140
143
  end
141
144
  end
142
- # Retrieves the n valid pairwise
145
+
146
+ # Retrieves the n valid pairwise.
143
147
  def n_valid_matrix(ds)
144
148
  ds.collect_matrix do |row,col|
145
149
  if row==col
@@ -150,7 +154,8 @@ module Statsample
150
154
  end
151
155
  end
152
156
  end
153
- # Matrix of correlation probability
157
+
158
+ # Matrix of correlation probabilities.
154
159
  # Order of rows and columns depends on Dataset#fields order
155
160
 
156
161
  def correlation_probability_matrix(ds, tails=:both)
@@ -162,6 +167,7 @@ module Statsample
162
167
  end
163
168
  Matrix.rows(rows)
164
169
  end
170
+
165
171
  # Spearman ranked correlation coefficient between 2 vectors
166
172
  def spearman(v1,v2)
167
173
  v1a,v2a=Statsample.only_valid(v1,v2)
@@ -218,16 +224,16 @@ module Statsample
218
224
  rs=matrix.row_size
219
225
  cs=matrix.column_size
220
226
  conc=disc=ties_x=ties_y=0
221
- (0...(rs-1)).each {|x|
222
- (0...(cs-1)).each{|y|
223
- ((x+1)...rs).each{|x2|
224
- ((y+1)...cs).each{|y2|
227
+ (0...(rs-1)).each do |x|
228
+ (0...(cs-1)).each do |y|
229
+ ((x+1)...rs).each do |x2|
230
+ ((y+1)...cs).each do |y2|
225
231
  # #p sprintf("%d:%d,%d:%d",x,y,x2,y2)
226
232
  conc+=matrix[x,y]*matrix[x2,y2]
227
- }
228
- }
229
- }
230
- }
233
+ end
234
+ end
235
+ end
236
+ end
231
237
  (0...(rs-1)).each {|x|
232
238
  (1...(cs)).each{|y|
233
239
  ((x+1)...rs).each{|x2|
@@ -27,94 +27,143 @@ module Statsample
27
27
  # }
28
28
  # end
29
29
  #
30
- module Codification
31
- class << self
32
- # Create a yaml dump for a hash, based on vectors
33
- # The keys will be vectors name on dataset and the values
34
- # will be hashes, with keys = values, for recodification
35
- #
36
- # v1=%w{a,b b,c d}.to_vector
37
- # ds={"v1"=>v1}.to_dataset
38
- # Statsample::Codification.create_yaml(ds,['v1'])
39
- # => "--- \nv1: \n a: a\n b: b\n c: c\n d: d\n"
40
- def create_yaml(dataset,vectors,sep=Statsample::SPLIT_TOKEN,io=nil)
41
- raise ArgumentError,"Array should't be empty" if vectors.size==0
42
- pro_hash=vectors.inject({}){|h,v_name|
43
- raise Exception, "Vector #{v_name} doesn't exists on Dataset" if !dataset.fields.include? v_name
44
- v=dataset[v_name]
45
- split_data=v.splitted(sep)
46
- factors=split_data.flatten.uniq.compact.sort.inject({}) {|ac,val| ac[val]=val;ac}
47
- h[v_name]=factors
48
- h
49
- }
50
- YAML.dump(pro_hash,io)
51
- end
52
- def inverse_hash(h,sep=Statsample::SPLIT_TOKEN)
53
- h.inject({}) {|a,v|
54
- v[1].split(sep).each {|val|
55
- a[val]||=[]
56
- a[val].push(v[0])
57
- }
58
- a
59
- }
60
- end
61
- def dictionary(h,sep=Statsample::SPLIT_TOKEN)
62
- h.inject({}) {|a,v|
63
- a[v[0]]=v[1].split(sep)
64
- a
65
- }
66
- end
67
- def recode_vector(v,h,sep=Statsample::SPLIT_TOKEN)
68
- dict=dictionary(h,sep)
69
- new_data=v.splitted(sep)
70
- recoded=new_data.collect{|c|
71
- if c.nil?
72
- nil
73
- else
74
- c.collect{|value|
75
- dict[value]
76
- }.flatten.uniq
77
- end
78
- }
79
- end
80
- def recode_dataset_simple!(dataset,yaml,sep=Statsample::SPLIT_TOKEN)
81
- _recode_dataset(dataset,yaml,sep,false)
82
- end
83
- def recode_dataset_split!(dataset,yaml,sep=Statsample::SPLIT_TOKEN)
84
- _recode_dataset(dataset,yaml,sep,true)
85
- end
86
-
87
- def _recode_dataset(dataset,yaml,sep=Statsample::SPLIT_TOKEN,split=false)
88
- h=YAML::load(yaml)
89
- v_names||=h.keys
90
- v_names.each do |v_name|
91
- raise Exception, "Vector #{v_name} doesn't exists on Dataset" if !dataset.fields.include? v_name
92
- recoded=recode_vector(dataset[v_name],h[v_name],sep).collect { |c|
93
- if c.nil?
94
- nil
95
- else
96
- c.join(sep)
97
- end
98
- }.to_vector
99
- if(split)
100
- recoded.split_by_separator(sep).each {|k,v|
101
- dataset[v_name+"_"+k]=v
102
- }
103
- else
104
- dataset[v_name+"_recoded"]=recoded
105
- end
106
- end
107
- end
108
- def verify(yaml,v_names=nil,sep=Statsample::SPLIT_TOKEN,io=$>)
109
- require 'pp'
110
- h=YAML::load(yaml)
111
- v_names||=h.keys
112
- v_names.each{|v_name|
113
- inverse=inverse_hash(h[v_name],sep)
114
- io.puts "Vector: #{v_name}"
115
- YAML.dump(inverse.sort,io)
116
- }
30
+ module Codification
31
+ class << self
32
+ # Create a hash, based on vectors, to create the dictionary.
33
+ # The keys will be vectors name on dataset and the values
34
+ # will be hashes, with keys = values, for recodification
35
+ def create_hash(dataset, vectors, sep=Statsample::SPLIT_TOKEN)
36
+ raise ArgumentError,"Array should't be empty" if vectors.size==0
37
+ pro_hash=vectors.inject({}){|h,v_name|
38
+ raise Exception, "Vector #{v_name} doesn't exists on Dataset" if !dataset.fields.include? v_name
39
+ v=dataset[v_name]
40
+ split_data=v.splitted(sep).flatten.collect {|c| c.to_s}.find_all {|c| !c.nil?}
41
+
42
+ factors=split_data.uniq.compact.sort.inject({}) {|ac,val| ac[val]=val;ac }
43
+ h[v_name]=factors
44
+ h
45
+ }
46
+ pro_hash
47
+ end
48
+ # Create a yaml to create a dictionary, based on vectors
49
+ # The keys will be vectors name on dataset and the values
50
+ # will be hashes, with keys = values, for recodification
51
+ #
52
+ # v1=%w{a,b b,c d}.to_vector
53
+ # ds={"v1"=>v1}.to_dataset
54
+ # Statsample::Codification.create_yaml(ds,['v1'])
55
+ # => "--- \nv1: \n a: a\n b: b\n c: c\n d: d\n"
56
+ def create_yaml(dataset, vectors, io=nil, sep=Statsample::SPLIT_TOKEN)
57
+ pro_hash=create_hash(dataset, vectors, sep)
58
+ YAML.dump(pro_hash,io)
59
+ end
60
+ # Create a excel to create a dictionary, based on vectors.
61
+ # Raises an error if filename exists
62
+ # The rows will be:
63
+ # * field: name of vector
64
+ # * original: original name
65
+ # * recoded: new code
66
+
67
+ def create_excel(dataset, vectors, filename, sep=Statsample::SPLIT_TOKEN)
68
+ require 'spreadsheet'
69
+ if File.exists?(filename)
70
+ raise "A file named #{filename} already exists. Delete it before overwriting."
71
+ end
72
+ book = Spreadsheet::Workbook.new
73
+ sheet = book.create_worksheet
74
+ sheet.row(0).concat(%w{field original recoded})
75
+ i=1
76
+ create_hash(dataset, vectors, sep).sort.each do |field, inner_hash|
77
+ inner_hash.sort.each do |k,v|
78
+ sheet.row(i).concat([field.dup,k.dup,v.dup])
79
+ i+=1
80
+ end
81
+ end
82
+ book.write(filename)
83
+ end
84
+ # From a excel generates a dictionary hash
85
+ # to use on recode_dataset_simple!() or recode_dataset_split!().
86
+ #
87
+ def excel_to_recoded_hash(filename)
88
+ require 'spreadsheet'
89
+ h={}
90
+ book = Spreadsheet.open filename
91
+ sheet= book.worksheet 0
92
+ row_i=0
93
+ sheet.each do |row|
94
+ row_i+=1
95
+ next if row_i==1 or row[0].nil? or row[1].nil? or row[2].nil?
96
+ h[row[0]]={} if h[row[0]].nil?
97
+ h[row[0]][row[1]]=row[2]
98
+ end
99
+ h
100
+ end
101
+
102
+ def inverse_hash(h, sep=Statsample::SPLIT_TOKEN)
103
+ h.inject({}) do |a,v|
104
+ v[1].split(sep).each do |val|
105
+ a[val]||=[]
106
+ a[val].push(v[0])
107
+ end
108
+ a
109
+ end
110
+ end
111
+
112
+ def dictionary(h, sep=Statsample::SPLIT_TOKEN)
113
+ h.inject({}) {|a,v| a[v[0]]=v[1].split(sep); a }
114
+ end
115
+
116
+ def recode_vector(v,h,sep=Statsample::SPLIT_TOKEN)
117
+ dict=dictionary(h,sep)
118
+ new_data=v.splitted(sep)
119
+ recoded=new_data.collect do |c|
120
+ if c.nil?
121
+ nil
122
+ else
123
+ c.collect{|value| dict[value] }.flatten.uniq
124
+ end
125
+ end
126
+ end
127
+ def recode_dataset_simple!(dataset, dictionary_hash ,sep=Statsample::SPLIT_TOKEN)
128
+ _recode_dataset(dataset,dictionary_hash ,sep,false)
129
+ end
130
+ def recode_dataset_split!(dataset, dictionary_hash, sep=Statsample::SPLIT_TOKEN)
131
+ _recode_dataset(dataset, dictionary_hash, sep,true)
132
+ end
133
+
134
+ def _recode_dataset(dataset, h , sep=Statsample::SPLIT_TOKEN, split=false)
135
+ v_names||=h.keys
136
+ v_names.each do |v_name|
137
+ raise Exception, "Vector #{v_name} doesn't exists on Dataset" if !dataset.fields.include? v_name
138
+ recoded=recode_vector(dataset[v_name], h[v_name],sep).collect { |c|
139
+ if c.nil?
140
+ nil
141
+ else
142
+ c.join(sep)
143
+ end
144
+ }.to_vector
145
+ if(split)
146
+ recoded.split_by_separator(sep).each {|k,v|
147
+ dataset[v_name+"_"+k]=v
148
+ }
149
+ else
150
+ dataset[v_name+"_recoded"]=recoded
117
151
  end
152
+ end
153
+ end
154
+
155
+
156
+ def verify(h, v_names=nil,sep=Statsample::SPLIT_TOKEN,io=$>)
157
+ require 'pp'
158
+ v_names||=h.keys
159
+ v_names.each{|v_name|
160
+ inverse=inverse_hash(h[v_name],sep)
161
+ io.puts "- Field: #{v_name}"
162
+ inverse.sort{|a,b| -(a[1].count<=>b[1].count)}.each {|k,v|
163
+ io.puts " - \"#{k}\" (#{v.count}) :\n -'"+v.join("\n -'")+"'"
164
+ }
165
+ }
118
166
  end
119
167
  end
168
+ end
120
169
  end