statsample 1.5.0 → 2.0.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (114) hide show
  1. checksums.yaml +4 -4
  2. data/.build.sh +15 -0
  3. data/.gitignore +1 -0
  4. data/.travis.yml +19 -7
  5. data/CONTRIBUTING.md +33 -0
  6. data/History.txt +5 -0
  7. data/README.md +41 -53
  8. data/benchmarks/correlation_matrix_15_variables.rb +6 -5
  9. data/benchmarks/correlation_matrix_5_variables.rb +6 -5
  10. data/benchmarks/correlation_matrix_methods/correlation_matrix.rb +23 -26
  11. data/examples/boxplot.rb +17 -5
  12. data/examples/correlation_matrix.rb +36 -7
  13. data/examples/dataset.rb +25 -5
  14. data/examples/dominance_analysis.rb +8 -7
  15. data/examples/dominance_analysis_bootstrap.rb +16 -11
  16. data/examples/histogram.rb +16 -2
  17. data/examples/icc.rb +5 -6
  18. data/examples/levene.rb +17 -3
  19. data/examples/multiple_regression.rb +6 -3
  20. data/examples/parallel_analysis.rb +11 -6
  21. data/examples/polychoric.rb +26 -13
  22. data/examples/principal_axis.rb +8 -4
  23. data/examples/reliability.rb +10 -10
  24. data/examples/scatterplot.rb +8 -0
  25. data/examples/t_test.rb +7 -0
  26. data/examples/u_test.rb +10 -2
  27. data/examples/vector.rb +9 -6
  28. data/examples/velicer_map_test.rb +12 -8
  29. data/lib/statsample.rb +13 -47
  30. data/lib/statsample/analysis/suite.rb +1 -1
  31. data/lib/statsample/anova/oneway.rb +6 -6
  32. data/lib/statsample/anova/twoway.rb +26 -24
  33. data/lib/statsample/bivariate.rb +78 -61
  34. data/lib/statsample/bivariate/pearson.rb +2 -2
  35. data/lib/statsample/codification.rb +45 -32
  36. data/lib/statsample/converter/csv.rb +15 -53
  37. data/lib/statsample/converter/spss.rb +6 -5
  38. data/lib/statsample/converters.rb +50 -211
  39. data/lib/statsample/crosstab.rb +26 -25
  40. data/lib/statsample/daru.rb +117 -0
  41. data/lib/statsample/dataset.rb +70 -942
  42. data/lib/statsample/dominanceanalysis.rb +16 -17
  43. data/lib/statsample/dominanceanalysis/bootstrap.rb +26 -28
  44. data/lib/statsample/factor/parallelanalysis.rb +17 -19
  45. data/lib/statsample/factor/pca.rb +21 -20
  46. data/lib/statsample/factor/principalaxis.rb +3 -3
  47. data/lib/statsample/graph/boxplot.rb +8 -16
  48. data/lib/statsample/graph/histogram.rb +4 -4
  49. data/lib/statsample/graph/scatterplot.rb +8 -7
  50. data/lib/statsample/histogram.rb +128 -119
  51. data/lib/statsample/matrix.rb +20 -16
  52. data/lib/statsample/multiset.rb +39 -38
  53. data/lib/statsample/regression.rb +3 -3
  54. data/lib/statsample/regression/multiple.rb +8 -10
  55. data/lib/statsample/regression/multiple/alglibengine.rb +96 -89
  56. data/lib/statsample/regression/multiple/baseengine.rb +32 -32
  57. data/lib/statsample/regression/multiple/gslengine.rb +33 -36
  58. data/lib/statsample/regression/multiple/matrixengine.rb +7 -9
  59. data/lib/statsample/regression/multiple/rubyengine.rb +39 -41
  60. data/lib/statsample/reliability.rb +23 -25
  61. data/lib/statsample/reliability/icc.rb +8 -7
  62. data/lib/statsample/reliability/multiscaleanalysis.rb +14 -12
  63. data/lib/statsample/reliability/scaleanalysis.rb +58 -60
  64. data/lib/statsample/reliability/skillscaleanalysis.rb +34 -29
  65. data/lib/statsample/resample.rb +1 -1
  66. data/lib/statsample/shorthand.rb +29 -25
  67. data/lib/statsample/test/kolmogorovsmirnov.rb +5 -3
  68. data/lib/statsample/test/levene.rb +28 -27
  69. data/lib/statsample/test/t.rb +7 -9
  70. data/lib/statsample/test/umannwhitney.rb +28 -28
  71. data/lib/statsample/test/wilcoxonsignedrank.rb +45 -43
  72. data/lib/statsample/vector.rb +70 -1013
  73. data/lib/statsample/version.rb +1 -1
  74. data/statsample.gemspec +12 -16
  75. data/test/helpers_tests.rb +1 -1
  76. data/test/test_analysis.rb +17 -17
  77. data/test/test_anova_contrast.rb +6 -6
  78. data/test/test_anovatwowaywithdataset.rb +8 -8
  79. data/test/test_anovawithvectors.rb +8 -8
  80. data/test/test_awesome_print_bug.rb +1 -1
  81. data/test/test_bartlettsphericity.rb +4 -4
  82. data/test/test_bivariate.rb +48 -43
  83. data/test/test_codification.rb +33 -33
  84. data/test/test_crosstab.rb +9 -9
  85. data/test/test_dataset.rb +28 -458
  86. data/test/test_factor.rb +46 -38
  87. data/test/test_factor_pa.rb +22 -13
  88. data/test/test_ggobi.rb +4 -4
  89. data/test/test_gsl.rb +4 -4
  90. data/test/test_histogram.rb +3 -3
  91. data/test/test_matrix.rb +13 -13
  92. data/test/test_multiset.rb +103 -91
  93. data/test/test_regression.rb +57 -52
  94. data/test/test_reliability.rb +55 -45
  95. data/test/test_reliability_icc.rb +8 -8
  96. data/test/test_reliability_skillscale.rb +26 -24
  97. data/test/test_resample.rb +1 -1
  98. data/test/test_statistics.rb +3 -13
  99. data/test/test_stest.rb +9 -9
  100. data/test/test_stratified.rb +3 -3
  101. data/test/test_test_t.rb +12 -12
  102. data/test/test_umannwhitney.rb +2 -2
  103. data/test/test_vector.rb +76 -613
  104. data/test/test_wilcoxonsignedrank.rb +4 -4
  105. metadata +57 -28
  106. data/lib/statsample/rserve_extension.rb +0 -20
  107. data/lib/statsample/vector/gsl.rb +0 -106
  108. data/test/fixtures/repeated_fields.csv +0 -7
  109. data/test/fixtures/scientific_notation.csv +0 -4
  110. data/test/fixtures/test_csv.csv +0 -7
  111. data/test/fixtures/test_xls.xls +0 -0
  112. data/test/test_csv.rb +0 -63
  113. data/test/test_rserve_extension.rb +0 -42
  114. data/test/test_xls.rb +0 -52
@@ -8,24 +8,25 @@ module Statsample
8
8
  attr_reader :v_rows, :v_cols
9
9
  attr_accessor :row_label, :column_label, :name, :percentage_row, :percentage_column, :percentage_total
10
10
  def initialize(v1, v2, opts=Hash.new)
11
- #raise ArgumentError, "Both arguments should be Vectors" unless v1.is_a? Statsample::Vector and v2.is_a? Statsample::Vector
12
11
  raise ArgumentError, "Vectors should be the same size" unless v1.size==v2.size
13
- @v_rows, @v_cols=Statsample.only_valid_clone(v1.to_vector,v2.to_vector)
14
- @cases=@v_rows.size
15
- @row_label=v1.name
16
- @column_label=v2.name
17
- @name=nil
12
+ @v_rows, @v_cols = Statsample.only_valid_clone(
13
+ Daru::Vector.new(v1),
14
+ Daru::Vector.new(v2))
15
+ @cases = @v_rows.size
16
+ @row_label = v1.name
17
+ @column_label = v2.name
18
+ @name = nil
18
19
  @percentage_row = @percentage_column = @percentage_total=false
19
- opts.each{|k,v|
20
+ opts.each do |k,v|
20
21
  self.send("#{k}=",v) if self.respond_to? k
21
- }
22
- @name||=_("Crosstab %s - %s") % [@row_label, @column_label]
22
+ end
23
+ @name ||= _("Crosstab %s - %s") % [@row_label, @column_label]
23
24
  end
24
25
  def rows_names
25
- @v_rows.factors.sort
26
+ @v_rows.factors.sort.reset_index!
26
27
  end
27
28
  def cols_names
28
- @v_cols.factors.sort
29
+ @v_cols.factors.sort.reset_index!
29
30
  end
30
31
  def rows_total
31
32
  @v_rows.frequencies
@@ -35,18 +36,18 @@ module Statsample
35
36
  end
36
37
 
37
38
  def frequencies
38
- base=rows_names.inject([]){|s,row|
39
- s+=cols_names.collect{|col| [row,col]}
40
- }.inject({}) {|s,par|
39
+ base = rows_names.inject([]) do |s,row|
40
+ s += cols_names.collect { |col| [row,col] }
41
+ end.inject({}) do |s,par|
41
42
  s[par]=0
42
43
  s
43
- }
44
- base.update(Statsample::vector_cols_matrix(@v_rows,@v_cols).to_a.to_vector.frequencies)
44
+ end
45
+ base.update(Daru::Vector.new(Statsample::vector_cols_matrix(@v_rows,@v_cols).to_a).frequencies)
45
46
  end
46
47
  def to_matrix
47
- f=frequencies
48
- rn=rows_names
49
- cn=cols_names
48
+ f = frequencies
49
+ rn = rows_names
50
+ cn = cols_names
50
51
  Matrix.rows(rn.collect{|row|
51
52
  cn.collect{|col| f[[row,col]]}
52
53
  })
@@ -67,8 +68,8 @@ module Statsample
67
68
  end
68
69
  # Chi square, based on expected and real matrix
69
70
  def chi_square
70
- require 'statsample/test'
71
- Statsample::Test.chi_square(self.to_matrix, matrix_expected)
71
+ require 'statsample/test'
72
+ Statsample::Test.chi_square(self.to_matrix, matrix_expected)
72
73
  end
73
74
  # Useful to obtain chi square
74
75
  def matrix_expected
@@ -98,10 +99,10 @@ module Statsample
98
99
  generator.text(_("Rows: %s") % @row_label) unless @row_label.nil?
99
100
  generator.text(_("Columns: %s") % @column_label) unless @column_label.nil?
100
101
 
101
- t=ReportBuilder::Table.new(:name=>@name+" - "+_("Raw"), :header=>[""]+cols_names.collect {|c| @v_cols.labeling(c)}+[_("Total")])
102
+ t=ReportBuilder::Table.new(:name=>@name+" - "+_("Raw"), :header=>[""]+cols_names.collect {|c| @v_cols.index_of(c)}+[_("Total")])
102
103
  rn.each do |row|
103
104
  total_row=0
104
- t_row=[@v_rows.labeling(row)]
105
+ t_row=[@v_rows.index_of(row)]
105
106
  cn.each do |col|
106
107
  data=fq[[row,col]]
107
108
  total_row+=fq[[row,col]]
@@ -148,9 +149,9 @@ module Statsample
148
149
  when :total then _("% Total")
149
150
  end
150
151
 
151
- t=ReportBuilder::Table.new(:name=>@name+" - "+_(type_name), :header=>[""]+cols_names.collect {|c| @v_cols.labeling(c) } + [_("Total")])
152
+ t=ReportBuilder::Table.new(:name=>@name+" - "+_(type_name), :header=>[""]+cols_names.collect {|c| @v_cols.index_of(c) } + [_("Total")])
152
153
  rn.each do |row|
153
- t_row=[@v_rows.labeling(row)]
154
+ t_row=[@v_rows.index_of(row)]
154
155
  cn.each do |col|
155
156
  total=case type
156
157
  when :row then rt[row]
@@ -0,0 +1,117 @@
1
+ # Opening the Daru::DataFrame class for adding methods to convert from
2
+ # data structures to specialized statsample data structures like Multiset.
3
+ module Daru
4
+ class Vector
5
+ def histogram(bins=10)
6
+ type == :numeric or raise TypeError, "Only numeric Vectors can do this operation."
7
+
8
+ if bins.is_a? Array
9
+ h = Statsample::Histogram.alloc(bins)
10
+ else
11
+ # ugly patch. The upper limit for a bin has the form
12
+ # x < range
13
+ #h=Statsample::Histogram.new(self, bins)
14
+ valid = only_valid
15
+ min,max=Statsample::Util.nice(valid.min,valid.max)
16
+ # fix last data
17
+ if max == valid.max
18
+ max += 1e-10
19
+ end
20
+ h = Statsample::Histogram.alloc(bins,[min,max])
21
+ # Fix last bin
22
+ end
23
+
24
+ h.increment(valid)
25
+ h
26
+ end
27
+
28
+ # Variance of p, according to population size
29
+ def variance_proportion(n_poblation, v=1)
30
+ Statsample::proportion_variance_sample(self.proportion(v), @valid_data.size, n_poblation)
31
+ end
32
+
33
+ # Variance of p, according to population size
34
+ def variance_total(n_poblation, v=1)
35
+ Statsample::total_variance_sample(self.proportion(v), @valid_data.size, n_poblation)
36
+ end
37
+
38
+ def proportion_confidence_interval_t(n_poblation,margin=0.95,v=1)
39
+ Statsample::proportion_confidence_interval_t(proportion(v), @valid_data.size, n_poblation, margin)
40
+ end
41
+
42
+ def proportion_confidence_interval_z(n_poblation,margin=0.95,v=1)
43
+ Statsample::proportion_confidence_interval_z(proportion(v), @valid_data.size, n_poblation, margin)
44
+ end
45
+ end
46
+
47
+ class DataFrame
48
+ def crosstab(v1,v2,opts={})
49
+ Statsample::Crosstab.new(self[v1], self[v2],opts)
50
+ end
51
+
52
+ # Functions for converting to Statsample::Multiset
53
+ def to_multiset_by_split(*vecs)
54
+ require 'statsample/multiset'
55
+
56
+ if vecs.size == 1
57
+ to_multiset_by_split_one_field(vecs[0])
58
+ else
59
+ to_multiset_by_split_multiple_fields(*vecs)
60
+ end
61
+ end
62
+ # Creates a Statsample::Multiset, using one field
63
+
64
+ def to_multiset_by_split_one_field(field)
65
+ raise ArgumentError,"Should use a correct field name" if
66
+ !@vectors.include? field
67
+
68
+ factors = self[field].factors
69
+ ms = Statsample::Multiset.new_empty_vectors(@vectors.to_a, factors)
70
+ each_row do |row|
71
+ ms[row[field]].add_row(row)
72
+ end
73
+ #puts "Ingreso a los dataset"
74
+ ms.datasets.each do |k,ds|
75
+ ds.update
76
+ ds.rename self[field].index_of(k)
77
+ end
78
+
79
+ ms
80
+ end
81
+
82
+ def to_multiset_by_split_multiple_fields(*fields)
83
+ fields.map!(&:to_sym)
84
+ factors_total=nil
85
+ fields.each do |f|
86
+ if factors_total.nil?
87
+ factors_total = self[f].factors.collect { |c| [c] }
88
+ else
89
+ suma = []
90
+ factors = self[f].factors
91
+ factors_total.each do |f1|
92
+ factors.each do |f2|
93
+ suma.push(f1+[f2])
94
+ end
95
+ end
96
+ factors_total = suma
97
+ end
98
+ end
99
+ ms = Statsample::Multiset.new_empty_vectors(vectors.to_a, factors_total)
100
+
101
+ p1 = eval "Proc.new {|c| ms[["+fields.collect{|f| "c['#{f}'.to_sym]"}.join(",")+"]].add_row(c) }"
102
+ each_row { |r| p1.call(r) }
103
+
104
+ ms.datasets.each do |k,ds|
105
+ ds.update
106
+ ds.rename(
107
+ fields.size.times.map do |i|
108
+ f = fields[i]
109
+ sk = k[i]
110
+ self[f].index_of(sk)
111
+ end.join("-")
112
+ )
113
+ end
114
+ ms
115
+ end
116
+ end
117
+ end
@@ -2,9 +2,11 @@ require 'statsample/vector'
2
2
 
3
3
  class Hash
4
4
  # Creates a Statsample::Dataset based on a Hash
5
- def to_dataset(*args)
5
+ def to_dataframe(*args)
6
6
  Statsample::Dataset.new(self, *args)
7
7
  end
8
+
9
+ alias :to_dataset :to_dataframe
8
10
  end
9
11
 
10
12
  class Array
@@ -17,990 +19,116 @@ class Array
17
19
  end
18
20
 
19
21
  module Statsample
20
- class DatasetException < RuntimeError # :nodoc:
21
- attr_reader :ds,:exp
22
- def initialize(ds,e)
23
- @ds=ds
24
- @exp=e
25
- end
26
- def to_s
27
- m="Error on iteration: "+@exp.message+"\n"+@exp.backtrace.join("\n")
28
- m+="\nRow ##{@ds.i}:#{@ds.case_as_hash(@ds.i)}" unless @ds.i.nil?
29
- m
22
+ # == Deprecation Warning
23
+ #
24
+ # This class will soon be replaced by Daru::DataFrame in the
25
+ # next release. Please see the daru docs at https://github.com/v0dro/daru
26
+ # for more details
27
+ class Dataset < Daru::DataFrame
28
+ # Ordered ids of vectors
29
+ def fields
30
+ $stderr.puts "WARNING: Statsample::Dataset and Statsample::Vector have been deprecated in favor of Daru::DataFrame and Daru::Vector. Please switch to using Daru::DataFrame#vectors.\n"
31
+ @vectors.to_a
30
32
  end
31
- end
32
- # Set of cases with values for one or more variables,
33
- # analog to a dataframe on R or a standard data file of SPSS.
34
- # Every vector has <tt>#field</tt> name, which represent it. By default,
35
- # the vectors are ordered by it field name, but you can change it
36
- # the fields order manually.
37
- # The Dataset work as a Hash, with keys are field names
38
- # and values are Statsample::Vector
39
- #
40
- #
41
- # ==Usage
42
- # Create a empty dataset:
43
- # Dataset.new()
44
- # Create a dataset with three empty vectors, called <tt>v1</tt>, <tt>v2</tt> and <tt>v3</tt>:
45
- # Dataset.new(%w{v1 v2 v3})
46
- # Create a dataset with two vectors, called <tt>v1</tt>
47
- # and <tt>v2</tt>:
48
- # Dataset.new({'v1'=>%w{1 2 3}.to_vector, 'v2'=>%w{4 5 6}.to_vector})
49
- # Create a dataset with two given vectors (v1 and v2),
50
- # with vectors on inverted order:
51
- # Dataset.new({'v2'=>v2,'v1'=>v1},['v2','v1'])
52
- #
53
- # The fast way to create a dataset uses Hash#to_dataset, with
54
- # field order as arguments
55
- # v1 = [1,2,3].to_numeric
56
- # v2 = [1,2,3].to_numeric
57
- # ds = {'v1'=>v2, 'v2'=>v2}.to_dataset(%w{v2 v1})
58
33
 
59
- class Dataset
60
- include Writable
61
- include Summarizable
62
- # Hash of Statsample::Vector
63
- attr_reader :vectors
64
- # Ordered ids of vectors
65
- attr_reader :fields
66
- # Name of dataset
67
- attr_accessor :name
68
- # Number of cases
69
- attr_reader :cases
70
- # Location of pointer on enumerations methods (like #each)
71
- attr_reader :i
34
+ def name= new_name
35
+ $stderr.puts "WARNING: Statsample::Dataset and Statsample::Vector have been deprecated in favor of Daru::DataFrame and Daru::Vector. Please switch to using Daru::DataFrame#rename.\n"
72
36
 
73
- # Generates a new dataset, using three vectors
74
- # - Rows
75
- # - Columns
76
- # - Values
77
- #
78
- # For example, you have these values
79
- #
80
- # x y v
81
- # a a 0
82
- # a b 1
83
- # b a 1
84
- # b b 0
85
- #
86
- # You obtain
87
- # id a b
88
- # a 0 1
89
- # b 1 0
90
- #
91
- # Useful to process outputs from databases
92
- def self.crosstab_by_asignation(rows,columns,values)
93
- raise "Three vectors should be equal size" if rows.size!=columns.size or rows.size!=values.size
94
- cols_values=columns.factors
95
- cols_n=cols_values.size
96
- h_rows=rows.factors.inject({}){|a,v| a[v]=cols_values.inject({}){
97
- |a1,v1| a1[v1]=nil; a1
98
- }
99
- ;a}
100
- values.each_index{|i|
101
- h_rows[rows[i]][columns[i]]=values[i]
102
- }
103
- ds=Dataset.new(["_id"]+cols_values)
104
- cols_values.each{|c|
105
- ds[c].type=values.type
106
- }
107
- rows.factors.each {|row|
108
- n_row=Array.new(cols_n+1)
109
- n_row[0]=row
110
- cols_values.each_index {|i|
111
- n_row[i+1]=h_rows[row][cols_values[i]]
112
- }
113
- ds.add_case_array(n_row)
114
- }
115
- ds.update_valid_data
116
- ds
37
+ rename new_name
117
38
  end
118
- # Return true if any vector has missing data
119
- def has_missing_data?
120
- @vectors.any? {|k,v| v.has_missing_data?}
39
+ # Number of cases
40
+ def cases
41
+ $stderr.puts "WARNING: Statsample::Dataset and Statsample::Vector have been deprecated in favor of Daru::DataFrame and Daru::Vector. Please switch to using Daru::DataFrame#nrows.\n"
42
+
43
+ nrows
121
44
  end
122
- # Return a nested hash using fields as keys and
123
- # an array constructed of hashes with other values.
124
- # If block provided, is used to provide the
125
- # values, with parameters +row+ of dataset,
126
- # +current+ last hash on hierarchy and
127
- # +name+ of the key to include
128
- def nest(*tree_keys,&block)
129
- tree_keys=tree_keys[0] if tree_keys[0].is_a? Array
130
- out=Hash.new
131
- each do |row|
132
- current=out
133
- # Create tree
134
- tree_keys[0,tree_keys.size-1].each do |f|
135
- root=row[f]
136
- current[root]||=Hash.new
137
- current=current[root]
138
- end
139
- name=row[tree_keys.last]
140
- if !block
141
- current[name]||=Array.new
142
- current[name].push(row.delete_if{|key,value| tree_keys.include? key})
143
- else
144
- current[name]=block.call(row, current,name)
145
- end
146
- end
147
- out
45
+
46
+ # == Deprecation Warning
47
+ #
48
+ # This class will soon be replaced by Daru::DataFrame in the
49
+ # next release. Use Daru::DataFrame.crosstab_by_assignation
50
+ # for the same effect. Please see the daru docs at
51
+ # https://github.com/v0dro/daru for more details.
52
+ def self.crosstab_by_assignation(rows,columns,values)
53
+ ds = super(rows, columns, values)
54
+ Dataset.new ds.to_hash
148
55
  end
149
- # Creates a new dataset. A dataset is a set of ordered named vectors
150
- # of the same size.
151
- #
152
- # [vectors] With an array, creates a set of empty vectors named as
153
- # values on the array. With a hash, each Vector is assigned as
154
- # a variable of the Dataset named as its key
155
- # [fields] Array of names for vectors. Is only used for set the
156
- # order of variables. If empty, vectors keys on alfabethic order as
157
- # used as fields.
56
+
57
+ # == Deprecation Warning
58
+ #
59
+ # This class will soon be replaced by Daru::DataFrame in the
60
+ # next release. Use Daru::DataFrame.new for the same effect.
61
+ # Please see the daru docs at https://github.com/v0dro/daru for more details.
158
62
  def initialize(vectors={}, fields=[])
159
- @@n_dataset||=0
160
- @@n_dataset+=1
161
- @name=_("Dataset %d") % @@n_dataset
162
- @cases=0
163
- @gsl=nil
164
- @i=nil
63
+ $stderr.puts "WARNING: Statsample::Dataset and Statsample::Vector have been deprecated in favor of Daru::DataFrame and Daru::Vector. Please switch to using that.\n"
165
64
 
166
65
  if vectors.instance_of? Array
167
66
  @fields=vectors.dup
168
- @vectors=vectors.inject({}){|a,x| a[x]=Statsample::Vector.new(); a}
67
+ super({}, order: @fields.map { |e| e.respond_to?(:to_sym) ? e.to_sym : e })
169
68
  else
170
69
  # Check vectors
171
- @vectors=vectors
172
- @fields=fields
173
- check_order
174
- check_length
175
- end
176
- end
177
- #
178
- # Creates a copy of the given dataset, deleting all the cases with
179
- # missing data on one of the vectors.
180
- #
181
- # @param array of fields to include. No value include all fields
182
- #
183
- def dup_only_valid(*fields_to_include)
184
- if fields_to_include.size==1 and fields_to_include[0].is_a? Array
185
- fields_to_include=fields_to_include[0]
186
- end
187
- fields_to_include=@fields if fields_to_include.size==0
188
- if fields_to_include.any? {|f| @vectors[f].has_missing_data?}
189
- ds=Dataset.new(fields_to_include)
190
- fields_to_include.each {|f| ds[f].type=@vectors[f].type}
191
- each {|row|
192
- unless fields_to_include.any? {|f| @vectors[f].has_missing_data? and !@vectors[f].is_valid? row[f]}
193
- row_2=fields_to_include.inject({}) {|ac,v| ac[v]=row[v]; ac}
194
- ds.add_case(row_2)
195
- end
196
- }
197
- else
198
- ds=dup fields_to_include
199
- end
200
- ds.name= self.name
201
- ds
202
- end
203
- #
204
- # Returns a duplicate of the Dataset.
205
- # All vectors are copied, so any modification on new
206
- # dataset doesn't affect original dataset's vectors.
207
- # If fields given as parameter, only include those vectors.
208
- #
209
- # @param array of fields to include. No value include all fields
210
- # @return {Statsample::Dataset}
211
- def dup(*fields_to_include)
212
- if fields_to_include.size==1 and fields_to_include[0].is_a? Array
213
- fields_to_include=fields_to_include[0]
214
- end
215
- fields_to_include=@fields if fields_to_include.size==0
216
- vectors={}
217
- fields=[]
218
- fields_to_include.each{|f|
219
- raise "Vector #{f} doesn't exists" unless @vectors.has_key? f
220
- vectors[f]=@vectors[f].dup
221
- fields.push(f)
222
- }
223
- ds=Dataset.new(vectors,fields)
224
- ds.name= self.name
225
- ds
226
- end
227
-
228
-
229
- # Returns an array with the fields from first argumen to last argument
230
- def from_to(from,to)
231
- raise ArgumentError, "Field #{from} should be on dataset" if !@fields.include? from
232
- raise ArgumentError, "Field #{to} should be on dataset" if !@fields.include? to
233
- @fields.slice(@fields.index(from)..@fields.index(to))
234
- end
235
-
236
- # Returns (when possible) a cheap copy of dataset.
237
- # If no vector have missing values, returns original vectors.
238
- # If missing values presents, uses Dataset.dup_only_valid.
239
- #
240
- # @param array of fields to include. No value include all fields
241
- # @return {Statsample::Dataset}
242
- def clone_only_valid(*fields_to_include)
243
- if fields_to_include.size==1 and fields_to_include[0].is_a? Array
244
- fields_to_include=fields_to_include[0]
245
- end
246
- fields_to_include=@fields.dup if fields_to_include.size==0
247
- if fields_to_include.any? {|v| @vectors[v].has_missing_data?}
248
- dup_only_valid(fields_to_include)
249
- else
250
- clone(fields_to_include)
251
- end
252
- end
253
- # Returns a shallow copy of Dataset.
254
- # Object id will be distinct, but @vectors will be the same.
255
- # @param array of fields to include. No value include all fields
256
- # @return {Statsample::Dataset}
257
- def clone(*fields_to_include)
258
- if fields_to_include.size==1 and fields_to_include[0].is_a? Array
259
- fields_to_include=fields_to_include[0]
260
- end
261
- fields_to_include=@fields.dup if fields_to_include.size==0
262
- ds=Dataset.new
263
- fields_to_include.each{|f|
264
- raise "Vector #{f} doesn't exists" unless @vectors.has_key? f
265
- ds[f]=@vectors[f]
266
- }
267
- ds.fields=fields_to_include
268
- ds.name=@name
269
- ds.update_valid_data
270
- ds
271
- end
272
- # Creates a copy of the given dataset, without data on vectors
273
- #
274
- # @return {Statsample::Dataset}
275
- def dup_empty
276
- vectors=@vectors.inject({}) {|a,v|
277
- a[v[0]]=v[1].dup_empty
278
- a
279
- }
280
- Dataset.new(vectors,@fields.dup)
281
- end
282
- # Merge vectors from two datasets
283
- # In case of name collition, the vectors names are changed to
284
- # x_1, x_2 ....
285
- #
286
- # @return {Statsample::Dataset}
287
- def merge(other_ds)
288
- raise "Cases should be equal (this:#{@cases}; other:#{other_ds.cases}" unless @cases==other_ds.cases
289
- types = @fields.collect{|f| @vectors[f].type} + other_ds.fields.collect{|f| other_ds[f].type}
290
- new_fields = (@fields+other_ds.fields).recode_repeated
291
- ds_new=Statsample::Dataset.new(new_fields)
292
- new_fields.each_index{|i|
293
- field=new_fields[i]
294
- ds_new[field].type=types[i]
295
- }
296
- @cases.times {|i|
297
- row=case_as_array(i)+other_ds.case_as_array(i)
298
- ds_new.add_case_array(row)
299
- }
300
- ds_new.update_valid_data
301
- ds_new
302
- end
303
-
304
- # Join 2 Datasets by given fields
305
- # type is one of :left and :inner, default is :left
306
- #
307
- # @return {Statsample::Dataset}
308
- def join(other_ds,fields_1=[],fields_2=[],type=:left)
309
- fields_new = other_ds.fields - fields_2
310
- fields = self.fields + fields_new
311
-
312
- other_ds_hash = {}
313
- other_ds.each do |row|
314
- key = row.select{|k,v| fields_2.include?(k)}.values
315
- value = row.select{|k,v| fields_new.include?(k)}
316
- if other_ds_hash[key].nil?
317
- other_ds_hash[key] = [value]
318
- else
319
- other_ds_hash[key] << value
70
+ @vectors = {}
71
+ vectors.each do |k,v|
72
+ @vectors[k.respond_to?(:to_sym) ? k.to_sym : k] = v
320
73
  end
74
+ @fields = fields
75
+ super @vectors, order: @fields.map { |e| e.respond_to?(:to_sym) ? e.to_sym : e }
321
76
  end
322
-
323
- new_ds = Dataset.new(fields)
324
-
325
- self.each do |row|
326
- key = row.select{|k,v| fields_1.include?(k)}.values
327
-
328
- new_case = row.dup
329
-
330
- if other_ds_hash[key].nil?
331
- if type == :left
332
- fields_new.each{|field| new_case[field] = nil}
333
- new_ds.add_case(new_case)
334
- end
335
- else
336
- other_ds_hash[key].each do |new_values|
337
- new_ds.add_case new_case.merge(new_values)
338
- end
339
- end
340
-
341
- end
342
- new_ds
343
77
  end
344
- # Returns a dataset with standarized data.
345
- #
346
- # @return {Statsample::Dataset}
347
- def standarize
348
- ds=dup()
349
- ds.fields.each do |f|
350
- ds[f]=ds[f].vector_standarized
351
- end
352
- ds
353
- end
354
- # Generate a matrix, based on fields of dataset
355
- #
356
- # @return {::Matrix}
357
78
 
358
- def collect_matrix
359
- rows=@fields.collect{|row|
360
- @fields.collect{|col|
361
- yield row,col
362
- }
363
- }
364
- Matrix.rows(rows)
79
+ def from_to(from,to)
80
+ raise NoMethodError, "This method is no longer supported. To see the vector index use Daru::DataFrame#vectors"
365
81
  end
366
82
 
367
- # We have the same datasets if +vectors+ and +fields+ are the same
368
- #
369
- # @return {Boolean}
370
- def ==(d2)
371
- @vectors==d2.vectors and @fields==d2.fields
372
- end
373
- # Returns vector <tt>c</tt>
374
- #
375
- # @return {Statsample::Vector}
376
- def col(c)
377
- @vectors[c]
378
- end
379
- alias_method :vector, :col
380
- # Equal to Dataset[<tt>name</tt>]=<tt>vector</tt>
381
- #
382
- # @return self
383
83
  def add_vector(name, vector)
384
- raise ArgumentError, "Vector have different size" if vector.size!=@cases
385
- @vectors[name]=vector
386
- check_order
387
- self
388
- end
389
- # Returns true if dataset have vector <tt>v</tt>.
390
- #
391
- # @return {Boolean}
392
- def has_vector? (v)
393
- return @vectors.has_key?(v)
394
- end
395
- # Creates a dataset with the random data, of a n size
396
- # If n not given, uses original number of cases.
397
- #
398
- # @return {Statsample::Dataset}
399
- def bootstrap(n=nil)
400
- n||=@cases
401
- ds_boot=dup_empty
402
- n.times do
403
- ds_boot.add_case_array(case_as_array(rand(n)))
404
- end
405
- ds_boot.update_valid_data
406
- ds_boot
84
+ raise NoMethodError, "Deprecated. Use Daru::DataFrame#[]= directly."
407
85
  end
408
- # Fast version of #add_case.
409
- # Can only add one case and no error check if performed
410
- # You SHOULD use #update_valid_data at the end of insertion cycle
411
- #
412
- #
86
+
413
87
  def add_case_array(v)
414
- v.each_index {|i| d=@vectors[@fields[i]].data; d.push(v[i])}
88
+ raise NoMethodError, "Deprecated. Use Daru::DataFrame#add_row instead."
415
89
  end
416
- # Insert a case, using:
417
- # * Array: size equal to number of vectors and values in the same order as fields
418
- # * Hash: keys equal to fields
419
- # If uvd is false, #update_valid_data is not executed after
420
- # inserting a case. This is very useful if you want to increase the
421
- # performance on inserting many cases, because #update_valid_data
422
- # performs check on vectors and on the dataset
423
90
 
424
91
  def add_case(v,uvd=true)
425
- case v
426
- when Array
427
- if (v[0].is_a? Array)
428
- v.each{|subv| add_case(subv,false)}
429
- else
430
- raise ArgumentError, "Input array size (#{v.size}) should be equal to fields number (#{@fields.size})" if @fields.size!=v.size
431
- v.each_index {|i| @vectors[@fields[i]].add(v[i],false)}
432
- end
433
- when Hash
434
- raise ArgumentError, "Hash keys should be equal to fields #{(v.keys - @fields).join(",")}" if @fields.sort!=v.keys.sort
435
- @fields.each{|f| @vectors[f].add(v[f],false)}
436
- else
437
- raise TypeError, 'Value must be a Array or a Hash'
438
- end
439
- if uvd
440
- update_valid_data
441
- end
442
- end
443
- # Check vectors and fields after inserting data. Use only
444
- # after #add_case_array or #add_case with second parameter to false
445
- def update_valid_data
446
- @gsl=nil
447
- @fields.each{|f| @vectors[f].set_valid_data}
448
- check_length
449
- end
450
- # Delete vector named +name+. Multiple fields accepted.
451
- def delete_vector(*args)
452
- if args.size==1 and args[0].is_a? Array
453
- names=args[0]
454
- else
455
- names=args
456
- end
457
- names.each do |name|
458
- @fields.delete(name)
459
- @vectors.delete(name)
460
- end
461
- end
462
-
463
- def add_vectors_by_split_recode(name_,join='-',sep=Statsample::SPLIT_TOKEN)
464
- split=@vectors[name_].split_by_separator(sep)
465
- i=1
466
- split.each{|k,v|
467
- new_field=name_+join+i.to_s
468
- v.name=name_+":"+k
469
- add_vector(new_field,v)
470
- i+=1
471
- }
472
- end
473
- def add_vectors_by_split(name,join='-',sep=Statsample::SPLIT_TOKEN)
474
- split=@vectors[name].split_by_separator(sep)
475
- split.each{|k,v|
476
- add_vector(name+join+k,v)
477
- }
478
- end
479
-
480
- def vector_by_calculation(type=:numeric)
481
- a=[]
482
- each do |row|
483
- a.push(yield(row))
484
- end
485
- a.to_vector(type)
486
- end
487
- # Returns a vector with sumatory of fields
488
- # if fields parameter is empty, sum all fields
489
- def vector_sum(fields=nil)
490
- fields||=@fields
491
- vector=collect_with_index do |row, i|
492
- if(fields.find{|f| !@vectors[f].data_with_nils[i]})
493
- nil
494
- else
495
- fields.inject(0) {|ac,v| ac + row[v].to_f}
496
- end
497
- end
498
- vector.name=_("Sum from %s") % @name
499
- vector
500
- end
501
- # Check if #fields attribute is correct, after inserting or deleting vectors
502
- def check_fields(fields)
503
- fields||=@fields
504
- raise "Fields #{(fields-@fields).join(", ")} doesn't exists on dataset" if (fields-@fields).size>0
505
- fields
506
- end
507
-
508
- # Returns a vector with the numbers of missing values for a case
509
- def vector_missing_values(fields=nil)
510
- fields=check_fields(fields)
511
- collect_with_index do |row, i|
512
- fields.inject(0) {|a,v|
513
- a+ ((@vectors[v].data_with_nils[i].nil?) ? 1: 0)
514
- }
515
- end
516
- end
517
- def vector_count_characters(fields=nil)
518
- fields=check_fields(fields)
519
- collect_with_index do |row, i|
520
- fields.inject(0){|a,v|
521
- a+((@vectors[v].data_with_nils[i].nil?) ? 0: row[v].to_s.size)
522
- }
523
- end
524
- end
525
- # Returns a vector with the mean for a set of fields
526
- # if fields parameter is empty, return the mean for all fields
527
- # if max invalid parameter > 0, returns the mean for all tuples
528
- # with 0 to max_invalid invalid fields
529
- def vector_mean(fields=nil, max_invalid=0)
530
- a=[]
531
- fields=check_fields(fields)
532
- size=fields.size
533
- each_with_index do |row, i |
534
- # numero de invalidos
535
- sum=0
536
- invalids=0
537
- fields.each{|f|
538
- if !@vectors[f].data_with_nils[i].nil?
539
- sum+=row[f].to_f
540
- else
541
- invalids+=1
542
- end
543
- }
544
- if(invalids>max_invalid)
545
- a.push(nil)
546
- else
547
- a.push(sum.quo(size-invalids))
548
- end
549
- end
550
- a=a.to_vector(:numeric)
551
- a.name=_("Means from %s") % @name
552
- a
553
- end
554
- # Check vectors for type and size.
555
- def check_length # :nodoc:
556
- size=nil
557
- @vectors.each do |k,v|
558
- raise Exception, "Data #{v.class} is not a vector on key #{k}" if !v.is_a? Statsample::Vector
559
- if size.nil?
560
- size=v.size
561
- else
562
- if v.size!=size
563
- raise Exception, "Vector #{k} have size #{v.size} and dataset have size #{size}"
564
- end
565
- end
566
- end
567
- @cases=size
568
- end
569
- # Retrieves each vector as [key, vector]
570
- def each_vector # :yield: |key, vector|
571
- @fields.each{|k| yield k, @vectors[k]}
92
+ raise NoMethodError, "Deprecated. Use Daru::DataFrame#add_row instead."
572
93
  end
573
94
 
574
- if Statsample::STATSAMPLE__.respond_to?(:case_as_hash)
575
- def case_as_hash(c) # :nodoc:
576
- Statsample::STATSAMPLE__.case_as_hash(self,c)
577
- end
578
- else
579
- # Retrieves case i as a hash
580
- def case_as_hash(i)
581
- _case_as_hash(i)
582
- end
95
+ def update_valid_data
96
+ raise NoMethodError, "Deprecated. Use Daru::DataFrame#update instead. Also see Daru.lazy_update in the daru docs."
583
97
  end
584
98
 
585
- if Statsample::STATSAMPLE__.respond_to?(:case_as_array)
586
- def case_as_array(c) # :nodoc:
587
- Statsample::STATSAMPLE__.case_as_array(self,c)
588
- end
589
- else
590
- # Retrieves case i as a array, ordered on #fields order
591
- def case_as_array(i)
592
- _case_as_array(i)
593
- end
594
- end
595
- def _case_as_hash(c) # :nodoc:
596
- @fields.inject({}) {|a,x| a[x]=@vectors[x][c];a }
597
- end
598
- def _case_as_array(c) # :nodoc:
599
- @fields.collect {|x| @vectors[x][c]}
99
+ def each_array
100
+ raise NoMethodError, "Deprecated. Use Daru::DataFrame#each_row instead."
600
101
  end
601
102
 
602
- # Returns each case as a hash
603
- def each
604
- begin
605
- @i=0
606
- @cases.times {|i|
607
- @i=i
608
- row=case_as_hash(i)
609
- yield row
610
- }
611
- @i=nil
612
- rescue =>e
613
- raise DatasetException.new(self, e)
614
- end
615
- end
103
+ def fields=(f)
104
+ $stderr.puts "WARNING: Deprecated. Use Daru::DataFrame#reindex_vectors! instead.\n"
616
105
 
617
- # Returns each case as hash and index
618
- def each_with_index # :yield: |case, i|
619
- begin
620
- @i=0
621
- @cases.times{|i|
622
- @i=i
623
- row=case_as_hash(i)
624
- yield row, i
625
- }
626
- @i=nil
627
- rescue =>e
628
- raise DatasetException.new(self, e)
629
- end
106
+ reindex_vectors! f
630
107
  end
631
108
 
632
- # Returns each case as an array, coding missing values as nils
633
- def each_array_with_nils
634
- m=fields.size
635
- @cases.times {|i|
636
- @i=i
637
- row=Array.new(m)
638
- fields.each_index{|j|
639
- f=fields[j]
640
- row[j]=@vectors[f].data_with_nils[i]
641
- }
642
- yield row
643
- }
644
- @i=nil
645
- end
646
- # Returns each case as an array
647
- def each_array
648
- @cases.times {|i|
649
- @i=i
650
- row=case_as_array(i)
651
- yield row
652
- }
653
- @i=nil
654
- end
655
- # Set fields order. If you omit one or more vectors, they are
656
- # ordered by alphabetic order.
657
- def fields=(f)
658
- @fields=f
659
- check_order
660
- end
661
- # Check congruence between +fields+ attribute
662
- # and keys on +vectors
663
- def check_order #:nodoc:
664
- if(@vectors.keys.sort!=@fields.sort)
665
- @fields=@fields&@vectors.keys
666
- @fields+=@vectors.keys.sort-@fields
667
- end
668
- end
669
109
  # Returns the vector named i
670
- def[](i)
110
+ def [](i)
111
+ $stderr.puts "WARNING: Daru uses symbols instead of strings for naming vectors. Please switch to symbols.\n"
112
+
671
113
  if i.is_a? Range
672
- fields=from_to(i.begin,i.end)
673
- clone(*fields)
674
- elsif i.is_a? Array
675
- clone(i)
114
+ beg = i.begin.respond_to?(:to_sym) ? i.to_sym : i
115
+ en = i.end.respond_to?(:to_sym) ? i.to_sym : i
116
+ super(beg..en)
676
117
  else
677
- raise Exception,"Vector '#{i}' doesn't exists on dataset" unless @vectors.has_key?(i)
678
- @vectors[i]
118
+ super i.to_sym
679
119
  end
680
120
  end
681
- # Retrieves a Statsample::Vector, based on the result
682
- # of calculation performed on each case.
683
- def collect(type=:numeric)
684
- data=[]
685
- each {|row|
686
- data.push yield(row)
687
- }
688
- Statsample::Vector.new(data,type)
689
- end
690
- # Same as Statsample::Vector.collect, but giving case index as second parameter on yield.
691
- def collect_with_index(type=:numeric)
692
- data=[]
693
- each_with_index {|row, i|
694
- data.push(yield(row, i))
695
- }
696
- Statsample::Vector.new(data,type)
697
- end
698
- # Recode a vector based on a block
699
- def recode!(vector_name)
700
- 0.upto(@cases-1) {|i|
701
- @vectors[vector_name].data[i]=yield case_as_hash(i)
702
- }
703
- @vectors[vector_name].set_valid_data
704
- end
705
121
 
706
- def crosstab(v1,v2,opts={})
707
- Statsample::Crosstab.new(@vectors[v1], @vectors[v2],opts)
708
- end
709
- def[]=(i,v)
710
- if v.instance_of? Statsample::Vector
711
- @vectors[i]=v
712
- check_order
713
- else
714
- raise ArgumentError,"Should pass a Statsample::Vector"
715
- end
716
- end
717
- # Return data as a matrix. Column are ordered by #fields and
718
- # rows by orden of insertion
719
- def to_matrix
720
- rows=[]
721
- self.each_array{|c|
722
- rows.push(c)
723
- }
724
- Matrix.rows(rows)
122
+ def []=(i,v)
123
+ $stderr.puts "WARNING: Daru uses symbols instead of strings for naming vectors. Please switch to symbols.\n"
124
+
125
+ super i, v
725
126
  end
726
127
 
727
128
  if Statsample.has_gsl?
728
129
  def clear_gsl
729
- @gsl=nil
130
+ raise NoMethodError, "This method is no longer needed/supported."
730
131
  end
731
-
732
- def to_gsl
733
- if @gsl.nil?
734
- if cases.nil?
735
- update_valid_data
736
- end
737
- @gsl=GSL::Matrix.alloc(cases,fields.size)
738
- self.each_array{|c|
739
- @gsl.set_row(@i,c)
740
- }
741
- end
742
- @gsl
743
- end
744
-
745
- end
746
-
747
- # Return a correlation matrix for fields included as parameters.
748
- # By default, uses all fields of dataset
749
- def correlation_matrix(fields = nil)
750
- if fields
751
- ds = clone(fields)
752
- else
753
- ds = self
754
- end
755
- Statsample::Bivariate.correlation_matrix(ds)
756
- end
757
-
758
- # Return a correlation matrix for fields included as parameters.
759
- # By default, uses all fields of dataset
760
- def covariance_matrix(fields = nil)
761
- if fields
762
- ds = clone(fields)
763
- else
764
- ds = self
765
- end
766
- Statsample::Bivariate.covariance_matrix(ds)
767
- end
768
-
769
- # Create a new dataset with all cases which the block returns true
770
- def filter
771
- ds=self.dup_empty
772
- each {|c|
773
- ds.add_case(c, false) if yield c
774
- }
775
- ds.update_valid_data
776
- ds.name=_("%s(filtered)") % @name
777
- ds
778
- end
779
-
780
- # creates a new vector with the data of a given field which the block returns true
781
- def filter_field(field)
782
- a=[]
783
- each do |c|
784
- a.push(c[field]) if yield c
785
- end
786
- a.to_vector(@vectors[field].type)
787
- end
788
-
789
- # Creates a Stastample::Multiset, using one or more fields
790
- # to split the dataset.
791
-
792
-
793
- def to_multiset_by_split(*fields)
794
- require 'statsample/multiset'
795
- if fields.size==1
796
- to_multiset_by_split_one_field(fields[0])
797
- else
798
- to_multiset_by_split_multiple_fields(*fields)
799
- end
800
- end
801
- # Creates a Statsample::Multiset, using one field
802
-
803
- def to_multiset_by_split_one_field(field)
804
- raise ArgumentError,"Should use a correct field name" if !@fields.include? field
805
- factors=@vectors[field].factors
806
- ms=Multiset.new_empty_vectors(@fields, factors)
807
- each {|c|
808
- ms[c[field]].add_case(c,false)
809
- }
810
- #puts "Ingreso a los dataset"
811
- ms.datasets.each {|k,ds|
812
- ds.update_valid_data
813
- ds.name=@vectors[field].labeling(k)
814
- ds.vectors.each{|k1,v1|
815
- # puts "Vector #{k1}:"+v1.to_s
816
- v1.type=@vectors[k1].type
817
- v1.name=@vectors[k1].name
818
- v1.labels=@vectors[k1].labels
819
-
820
- }
821
- }
822
- ms
823
- end
824
- def to_multiset_by_split_multiple_fields(*fields)
825
- factors_total=nil
826
- fields.each do |f|
827
- if factors_total.nil?
828
- factors_total=@vectors[f].factors.collect{|c|
829
- [c]
830
- }
831
- else
832
- suma=[]
833
- factors=@vectors[f].factors
834
- factors_total.each{|f1| factors.each{|f2| suma.push(f1+[f2]) } }
835
- factors_total=suma
836
- end
837
- end
838
- ms=Multiset.new_empty_vectors(@fields,factors_total)
839
-
840
- p1=eval "Proc.new {|c| ms[["+fields.collect{|f| "c['#{f}']"}.join(",")+"]].add_case(c,false) }"
841
- each{|c| p1.call(c)}
842
-
843
- ms.datasets.each do |k,ds|
844
- ds.update_valid_data
845
- ds.name=fields.size.times.map {|i|
846
- f=fields[i]
847
- sk=k[i]
848
- @vectors[f].labeling(sk)
849
- }.join("-")
850
- ds.vectors.each{|k1,v1|
851
- v1.type=@vectors[k1].type
852
- v1.name=@vectors[k1].name
853
- v1.labels=@vectors[k1].labels
854
-
855
- }
856
- end
857
- ms
858
-
859
- end
860
- # Returns a vector, based on a string with a calculation based
861
- # on vector
862
- # The calculation will be eval'ed, so you can put any variable
863
- # or expression valid on ruby
864
- # For example:
865
- # a=[1,2].to_vector(scale)
866
- # b=[3,4].to_vector(scale)
867
- # ds={'a'=>a,'b'=>b}.to_dataset
868
- # ds.compute("a+b")
869
- # => Vector [4,6]
870
- def compute(text)
871
- @fields.each{|f|
872
- if @vectors[f].type=:numeric
873
- text.gsub!(f,"row['#{f}'].to_f")
874
- else
875
- text.gsub!(f,"row['#{f}']")
876
- end
877
- }
878
- collect_with_index {|row, i|
879
- invalid=false
880
- @fields.each{|f|
881
- if @vectors[f].data_with_nils[i].nil?
882
- invalid=true
883
- end
884
- }
885
- if invalid
886
- nil
887
- else
888
- eval(text)
889
- end
890
- }
891
- end
892
- # Test each row with one or more tests
893
- # each test is a Proc with the form
894
- # Proc.new {|row| row['age']>0}
895
- # The function returns an array with all errors
896
- def verify(*tests)
897
- if(tests[0].is_a? String)
898
- id=tests[0]
899
- tests.shift
900
- else
901
- id=@fields[0]
902
- end
903
- vr=[]
904
- i=0
905
- each do |row|
906
- i+=1
907
- tests.each{|test|
908
- if ! test[2].call(row)
909
- values=""
910
- if test[1].size>0
911
- values=" ("+test[1].collect{|k| "#{k}=#{row[k]}"}.join(", ")+")"
912
- end
913
- vr.push("#{i} [#{row[id]}]: #{test[0]}#{values}")
914
- end
915
- }
916
- end
917
- vr
918
- end
919
- def to_s
920
- "#<"+self.class.to_s+":"+self.object_id.to_s+" @name=#{@name} @fields=["+@fields.join(",")+"] cases="+@vectors[@fields[0]].size.to_s
921
- end
922
- def inspect
923
- self.to_s
924
- end
925
- # Creates a new dataset for one to many relations
926
- # on a dataset, based on pattern of field names.
927
- #
928
- # for example, you have a survey for number of children
929
- # with this structure:
930
- # id, name, child_name_1, child_age_1, child_name_2, child_age_2
931
- # with
932
- # ds.one_to_many(%w{id}, "child_%v_%n"
933
- # the field of first parameters will be copied verbatim
934
- # to new dataset, and fields which responds to second
935
- # pattern will be added one case for each different %n.
936
- # For example
937
- # cases=[
938
- # ['1','george','red',10,'blue',20,nil,nil],
939
- # ['2','fred','green',15,'orange',30,'white',20],
940
- # ['3','alfred',nil,nil,nil,nil,nil,nil]
941
- # ]
942
- # ds=Statsample::Dataset.new(%w{id name car_color1 car_value1 car_color2 car_value2 car_color3 car_value3})
943
- # cases.each {|c| ds.add_case_array c }
944
- # ds.one_to_many(['id'],'car_%v%n').to_matrix
945
- # => Matrix[
946
- # ["red", "1", 10],
947
- # ["blue", "1", 20],
948
- # ["green", "2", 15],
949
- # ["orange", "2", 30],
950
- # ["white", "2", 20]
951
- # ]
952
- #
953
- def one_to_many(parent_fields, pattern)
954
- #base_pattern=pattern.gsub(/%v|%n/,"")
955
- re=Regexp.new pattern.gsub("%v","(.+?)").gsub("%n","(\\d+?)")
956
- ds_vars=parent_fields
957
- vars=[]
958
- max_n=0
959
- h=parent_fields.inject({}) {|a,v| a[v]=Statsample::Vector.new([], @vectors[v].type);a }
960
- # Adding _row_id
961
- h['_col_id']=[].to_numeric
962
- ds_vars.push("_col_id")
963
- @fields.each do |f|
964
- if f=~re
965
- if !vars.include? $1
966
- vars.push($1)
967
- h[$1]=Statsample::Vector.new([], @vectors[f].type)
968
- end
969
- max_n=$2.to_i if max_n < $2.to_i
970
- end
971
- end
972
- ds=Dataset.new(h,ds_vars+vars)
973
- each do |row|
974
- row_out={}
975
- parent_fields.each do |f|
976
- row_out[f]=row[f]
977
- end
978
- max_n.times do |n1|
979
- n=n1+1
980
- any_data=false
981
- vars.each do |v|
982
- data=row[pattern.gsub("%v",v.to_s).gsub("%n",n.to_s)]
983
- row_out[v]=data
984
- any_data=true if !data.nil?
985
- end
986
- if any_data
987
- row_out["_col_id"]=n
988
- ds.add_case(row_out,false)
989
- end
990
-
991
- end
992
- end
993
- ds.update_valid_data
994
- ds
995
- end
996
- def report_building(b)
997
- b.section(:name=>@name) do |g|
998
- g.text _"Cases: %d" % cases
999
- @fields.each do |f|
1000
- g.text "Element:[#{f}]"
1001
- g.parse_element(@vectors[f])
1002
- end
1003
- end
1004
- end
132
+ end
1005
133
  end
1006
134
  end