statsample 1.5.0 → 2.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (114) hide show
  1. checksums.yaml +4 -4
  2. data/.build.sh +15 -0
  3. data/.gitignore +1 -0
  4. data/.travis.yml +19 -7
  5. data/CONTRIBUTING.md +33 -0
  6. data/History.txt +5 -0
  7. data/README.md +41 -53
  8. data/benchmarks/correlation_matrix_15_variables.rb +6 -5
  9. data/benchmarks/correlation_matrix_5_variables.rb +6 -5
  10. data/benchmarks/correlation_matrix_methods/correlation_matrix.rb +23 -26
  11. data/examples/boxplot.rb +17 -5
  12. data/examples/correlation_matrix.rb +36 -7
  13. data/examples/dataset.rb +25 -5
  14. data/examples/dominance_analysis.rb +8 -7
  15. data/examples/dominance_analysis_bootstrap.rb +16 -11
  16. data/examples/histogram.rb +16 -2
  17. data/examples/icc.rb +5 -6
  18. data/examples/levene.rb +17 -3
  19. data/examples/multiple_regression.rb +6 -3
  20. data/examples/parallel_analysis.rb +11 -6
  21. data/examples/polychoric.rb +26 -13
  22. data/examples/principal_axis.rb +8 -4
  23. data/examples/reliability.rb +10 -10
  24. data/examples/scatterplot.rb +8 -0
  25. data/examples/t_test.rb +7 -0
  26. data/examples/u_test.rb +10 -2
  27. data/examples/vector.rb +9 -6
  28. data/examples/velicer_map_test.rb +12 -8
  29. data/lib/statsample.rb +13 -47
  30. data/lib/statsample/analysis/suite.rb +1 -1
  31. data/lib/statsample/anova/oneway.rb +6 -6
  32. data/lib/statsample/anova/twoway.rb +26 -24
  33. data/lib/statsample/bivariate.rb +78 -61
  34. data/lib/statsample/bivariate/pearson.rb +2 -2
  35. data/lib/statsample/codification.rb +45 -32
  36. data/lib/statsample/converter/csv.rb +15 -53
  37. data/lib/statsample/converter/spss.rb +6 -5
  38. data/lib/statsample/converters.rb +50 -211
  39. data/lib/statsample/crosstab.rb +26 -25
  40. data/lib/statsample/daru.rb +117 -0
  41. data/lib/statsample/dataset.rb +70 -942
  42. data/lib/statsample/dominanceanalysis.rb +16 -17
  43. data/lib/statsample/dominanceanalysis/bootstrap.rb +26 -28
  44. data/lib/statsample/factor/parallelanalysis.rb +17 -19
  45. data/lib/statsample/factor/pca.rb +21 -20
  46. data/lib/statsample/factor/principalaxis.rb +3 -3
  47. data/lib/statsample/graph/boxplot.rb +8 -16
  48. data/lib/statsample/graph/histogram.rb +4 -4
  49. data/lib/statsample/graph/scatterplot.rb +8 -7
  50. data/lib/statsample/histogram.rb +128 -119
  51. data/lib/statsample/matrix.rb +20 -16
  52. data/lib/statsample/multiset.rb +39 -38
  53. data/lib/statsample/regression.rb +3 -3
  54. data/lib/statsample/regression/multiple.rb +8 -10
  55. data/lib/statsample/regression/multiple/alglibengine.rb +96 -89
  56. data/lib/statsample/regression/multiple/baseengine.rb +32 -32
  57. data/lib/statsample/regression/multiple/gslengine.rb +33 -36
  58. data/lib/statsample/regression/multiple/matrixengine.rb +7 -9
  59. data/lib/statsample/regression/multiple/rubyengine.rb +39 -41
  60. data/lib/statsample/reliability.rb +23 -25
  61. data/lib/statsample/reliability/icc.rb +8 -7
  62. data/lib/statsample/reliability/multiscaleanalysis.rb +14 -12
  63. data/lib/statsample/reliability/scaleanalysis.rb +58 -60
  64. data/lib/statsample/reliability/skillscaleanalysis.rb +34 -29
  65. data/lib/statsample/resample.rb +1 -1
  66. data/lib/statsample/shorthand.rb +29 -25
  67. data/lib/statsample/test/kolmogorovsmirnov.rb +5 -3
  68. data/lib/statsample/test/levene.rb +28 -27
  69. data/lib/statsample/test/t.rb +7 -9
  70. data/lib/statsample/test/umannwhitney.rb +28 -28
  71. data/lib/statsample/test/wilcoxonsignedrank.rb +45 -43
  72. data/lib/statsample/vector.rb +70 -1013
  73. data/lib/statsample/version.rb +1 -1
  74. data/statsample.gemspec +12 -16
  75. data/test/helpers_tests.rb +1 -1
  76. data/test/test_analysis.rb +17 -17
  77. data/test/test_anova_contrast.rb +6 -6
  78. data/test/test_anovatwowaywithdataset.rb +8 -8
  79. data/test/test_anovawithvectors.rb +8 -8
  80. data/test/test_awesome_print_bug.rb +1 -1
  81. data/test/test_bartlettsphericity.rb +4 -4
  82. data/test/test_bivariate.rb +48 -43
  83. data/test/test_codification.rb +33 -33
  84. data/test/test_crosstab.rb +9 -9
  85. data/test/test_dataset.rb +28 -458
  86. data/test/test_factor.rb +46 -38
  87. data/test/test_factor_pa.rb +22 -13
  88. data/test/test_ggobi.rb +4 -4
  89. data/test/test_gsl.rb +4 -4
  90. data/test/test_histogram.rb +3 -3
  91. data/test/test_matrix.rb +13 -13
  92. data/test/test_multiset.rb +103 -91
  93. data/test/test_regression.rb +57 -52
  94. data/test/test_reliability.rb +55 -45
  95. data/test/test_reliability_icc.rb +8 -8
  96. data/test/test_reliability_skillscale.rb +26 -24
  97. data/test/test_resample.rb +1 -1
  98. data/test/test_statistics.rb +3 -13
  99. data/test/test_stest.rb +9 -9
  100. data/test/test_stratified.rb +3 -3
  101. data/test/test_test_t.rb +12 -12
  102. data/test/test_umannwhitney.rb +2 -2
  103. data/test/test_vector.rb +76 -613
  104. data/test/test_wilcoxonsignedrank.rb +4 -4
  105. metadata +57 -28
  106. data/lib/statsample/rserve_extension.rb +0 -20
  107. data/lib/statsample/vector/gsl.rb +0 -106
  108. data/test/fixtures/repeated_fields.csv +0 -7
  109. data/test/fixtures/scientific_notation.csv +0 -4
  110. data/test/fixtures/test_csv.csv +0 -7
  111. data/test/fixtures/test_xls.xls +0 -0
  112. data/test/test_csv.rb +0 -63
  113. data/test/test_rserve_extension.rb +0 -42
  114. data/test/test_xls.rb +0 -52
@@ -8,24 +8,25 @@ module Statsample
8
8
  attr_reader :v_rows, :v_cols
9
9
  attr_accessor :row_label, :column_label, :name, :percentage_row, :percentage_column, :percentage_total
10
10
  def initialize(v1, v2, opts=Hash.new)
11
- #raise ArgumentError, "Both arguments should be Vectors" unless v1.is_a? Statsample::Vector and v2.is_a? Statsample::Vector
12
11
  raise ArgumentError, "Vectors should be the same size" unless v1.size==v2.size
13
- @v_rows, @v_cols=Statsample.only_valid_clone(v1.to_vector,v2.to_vector)
14
- @cases=@v_rows.size
15
- @row_label=v1.name
16
- @column_label=v2.name
17
- @name=nil
12
+ @v_rows, @v_cols = Statsample.only_valid_clone(
13
+ Daru::Vector.new(v1),
14
+ Daru::Vector.new(v2))
15
+ @cases = @v_rows.size
16
+ @row_label = v1.name
17
+ @column_label = v2.name
18
+ @name = nil
18
19
  @percentage_row = @percentage_column = @percentage_total=false
19
- opts.each{|k,v|
20
+ opts.each do |k,v|
20
21
  self.send("#{k}=",v) if self.respond_to? k
21
- }
22
- @name||=_("Crosstab %s - %s") % [@row_label, @column_label]
22
+ end
23
+ @name ||= _("Crosstab %s - %s") % [@row_label, @column_label]
23
24
  end
24
25
  def rows_names
25
- @v_rows.factors.sort
26
+ @v_rows.factors.sort.reset_index!
26
27
  end
27
28
  def cols_names
28
- @v_cols.factors.sort
29
+ @v_cols.factors.sort.reset_index!
29
30
  end
30
31
  def rows_total
31
32
  @v_rows.frequencies
@@ -35,18 +36,18 @@ module Statsample
35
36
  end
36
37
 
37
38
  def frequencies
38
- base=rows_names.inject([]){|s,row|
39
- s+=cols_names.collect{|col| [row,col]}
40
- }.inject({}) {|s,par|
39
+ base = rows_names.inject([]) do |s,row|
40
+ s += cols_names.collect { |col| [row,col] }
41
+ end.inject({}) do |s,par|
41
42
  s[par]=0
42
43
  s
43
- }
44
- base.update(Statsample::vector_cols_matrix(@v_rows,@v_cols).to_a.to_vector.frequencies)
44
+ end
45
+ base.update(Daru::Vector.new(Statsample::vector_cols_matrix(@v_rows,@v_cols).to_a).frequencies)
45
46
  end
46
47
  def to_matrix
47
- f=frequencies
48
- rn=rows_names
49
- cn=cols_names
48
+ f = frequencies
49
+ rn = rows_names
50
+ cn = cols_names
50
51
  Matrix.rows(rn.collect{|row|
51
52
  cn.collect{|col| f[[row,col]]}
52
53
  })
@@ -67,8 +68,8 @@ module Statsample
67
68
  end
68
69
  # Chi square, based on expected and real matrix
69
70
  def chi_square
70
- require 'statsample/test'
71
- Statsample::Test.chi_square(self.to_matrix, matrix_expected)
71
+ require 'statsample/test'
72
+ Statsample::Test.chi_square(self.to_matrix, matrix_expected)
72
73
  end
73
74
  # Useful to obtain chi square
74
75
  def matrix_expected
@@ -98,10 +99,10 @@ module Statsample
98
99
  generator.text(_("Rows: %s") % @row_label) unless @row_label.nil?
99
100
  generator.text(_("Columns: %s") % @column_label) unless @column_label.nil?
100
101
 
101
- t=ReportBuilder::Table.new(:name=>@name+" - "+_("Raw"), :header=>[""]+cols_names.collect {|c| @v_cols.labeling(c)}+[_("Total")])
102
+ t=ReportBuilder::Table.new(:name=>@name+" - "+_("Raw"), :header=>[""]+cols_names.collect {|c| @v_cols.index_of(c)}+[_("Total")])
102
103
  rn.each do |row|
103
104
  total_row=0
104
- t_row=[@v_rows.labeling(row)]
105
+ t_row=[@v_rows.index_of(row)]
105
106
  cn.each do |col|
106
107
  data=fq[[row,col]]
107
108
  total_row+=fq[[row,col]]
@@ -148,9 +149,9 @@ module Statsample
148
149
  when :total then _("% Total")
149
150
  end
150
151
 
151
- t=ReportBuilder::Table.new(:name=>@name+" - "+_(type_name), :header=>[""]+cols_names.collect {|c| @v_cols.labeling(c) } + [_("Total")])
152
+ t=ReportBuilder::Table.new(:name=>@name+" - "+_(type_name), :header=>[""]+cols_names.collect {|c| @v_cols.index_of(c) } + [_("Total")])
152
153
  rn.each do |row|
153
- t_row=[@v_rows.labeling(row)]
154
+ t_row=[@v_rows.index_of(row)]
154
155
  cn.each do |col|
155
156
  total=case type
156
157
  when :row then rt[row]
@@ -0,0 +1,117 @@
1
+ # Opening the Daru::DataFrame class for adding methods to convert from
2
+ # data structures to specialized statsample data structues like Multiset.
3
+ module Daru
4
+ class Vector
5
+ def histogram(bins=10)
6
+ type == :numeric or raise TypeError, "Only numeric Vectors can do this operation."
7
+
8
+ if bins.is_a? Array
9
+ h = Statsample::Histogram.alloc(bins)
10
+ else
11
+ # ugly patch. The upper limit for a bin has the form
12
+ # x < range
13
+ #h=Statsample::Histogram.new(self, bins)
14
+ valid = only_valid
15
+ min,max=Statsample::Util.nice(valid.min,valid.max)
16
+ # fix last data
17
+ if max == valid.max
18
+ max += 1e-10
19
+ end
20
+ h = Statsample::Histogram.alloc(bins,[min,max])
21
+ # Fix last bin
22
+ end
23
+
24
+ h.increment(valid)
25
+ h
26
+ end
27
+
28
+ # Variance of p, according to poblation size
29
+ def variance_proportion(n_poblation, v=1)
30
+ Statsample::proportion_variance_sample(self.proportion(v), @valid_data.size, n_poblation)
31
+ end
32
+
33
+ # Variance of p, according to poblation size
34
+ def variance_total(n_poblation, v=1)
35
+ Statsample::total_variance_sample(self.proportion(v), @valid_data.size, n_poblation)
36
+ end
37
+
38
+ def proportion_confidence_interval_t(n_poblation,margin=0.95,v=1)
39
+ Statsample::proportion_confidence_interval_t(proportion(v), @valid_data.size, n_poblation, margin)
40
+ end
41
+
42
+ def proportion_confidence_interval_z(n_poblation,margin=0.95,v=1)
43
+ Statsample::proportion_confidence_interval_z(proportion(v), @valid_data.size, n_poblation, margin)
44
+ end
45
+ end
46
+
47
+ class DataFrame
48
+ def crosstab(v1,v2,opts={})
49
+ Statsample::Crosstab.new(self[v1], self[v2],opts)
50
+ end
51
+
52
+ # Functions for converting to Statsample::Multiset
53
+ def to_multiset_by_split(*vecs)
54
+ require 'statsample/multiset'
55
+
56
+ if vecs.size == 1
57
+ to_multiset_by_split_one_field(vecs[0])
58
+ else
59
+ to_multiset_by_split_multiple_fields(*vecs)
60
+ end
61
+ end
62
+ # Creates a Statsample::Multiset, using one field
63
+
64
+ def to_multiset_by_split_one_field(field)
65
+ raise ArgumentError,"Should use a correct field name" if
66
+ !@vectors.include? field
67
+
68
+ factors = self[field].factors
69
+ ms = Statsample::Multiset.new_empty_vectors(@vectors.to_a, factors)
70
+ each_row do |row|
71
+ ms[row[field]].add_row(row)
72
+ end
73
+ #puts "Ingreso a los dataset"
74
+ ms.datasets.each do |k,ds|
75
+ ds.update
76
+ ds.rename self[field].index_of(k)
77
+ end
78
+
79
+ ms
80
+ end
81
+
82
+ def to_multiset_by_split_multiple_fields(*fields)
83
+ fields.map!(&:to_sym)
84
+ factors_total=nil
85
+ fields.each do |f|
86
+ if factors_total.nil?
87
+ factors_total = self[f].factors.collect { |c| [c] }
88
+ else
89
+ suma = []
90
+ factors = self[f].factors
91
+ factors_total.each do |f1|
92
+ factors.each do |f2|
93
+ suma.push(f1+[f2])
94
+ end
95
+ end
96
+ factors_total = suma
97
+ end
98
+ end
99
+ ms = Statsample::Multiset.new_empty_vectors(vectors.to_a, factors_total)
100
+
101
+ p1 = eval "Proc.new {|c| ms[["+fields.collect{|f| "c['#{f}'.to_sym]"}.join(",")+"]].add_row(c) }"
102
+ each_row { |r| p1.call(r) }
103
+
104
+ ms.datasets.each do |k,ds|
105
+ ds.update
106
+ ds.rename(
107
+ fields.size.times.map do |i|
108
+ f = fields[i]
109
+ sk = k[i]
110
+ self[f].index_of(sk)
111
+ end.join("-")
112
+ )
113
+ end
114
+ ms
115
+ end
116
+ end
117
+ end
@@ -2,9 +2,11 @@ require 'statsample/vector'
2
2
 
3
3
  class Hash
4
4
  # Creates a Statsample::Dataset based on a Hash
5
- def to_dataset(*args)
5
+ def to_dataframe(*args)
6
6
  Statsample::Dataset.new(self, *args)
7
7
  end
8
+
9
+ alias :to_dataset :to_dataframe
8
10
  end
9
11
 
10
12
  class Array
@@ -17,990 +19,116 @@ class Array
17
19
  end
18
20
 
19
21
  module Statsample
20
- class DatasetException < RuntimeError # :nodoc:
21
- attr_reader :ds,:exp
22
- def initialize(ds,e)
23
- @ds=ds
24
- @exp=e
25
- end
26
- def to_s
27
- m="Error on iteration: "+@exp.message+"\n"+@exp.backtrace.join("\n")
28
- m+="\nRow ##{@ds.i}:#{@ds.case_as_hash(@ds.i)}" unless @ds.i.nil?
29
- m
22
+ # == Deprecation Warning
23
+ #
24
+ # This class will soon be replaced by Daru::DataFrame in the
25
+ # next release. Please see the daru docs at https://github.com/v0dro/daru
26
+ # for more details
27
+ class Dataset < Daru::DataFrame
28
+ # Ordered ids of vectors
29
+ def fields
30
+ $stderr.puts "WARNING: Statsample::Dataset and Statsample::Vector have been deprecated in favor of Daru::DataFrame and Daru::Vector. Please switch to using Daru::DataFrame#vectors.\n"
31
+ @vectors.to_a
30
32
  end
31
- end
32
- # Set of cases with values for one or more variables,
33
- # analog to a dataframe on R or a standard data file of SPSS.
34
- # Every vector has <tt>#field</tt> name, which represent it. By default,
35
- # the vectors are ordered by it field name, but you can change it
36
- # the fields order manually.
37
- # The Dataset work as a Hash, with keys are field names
38
- # and values are Statsample::Vector
39
- #
40
- #
41
- # ==Usage
42
- # Create a empty dataset:
43
- # Dataset.new()
44
- # Create a dataset with three empty vectors, called <tt>v1</tt>, <tt>v2</tt> and <tt>v3</tt>:
45
- # Dataset.new(%w{v1 v2 v3})
46
- # Create a dataset with two vectors, called <tt>v1</tt>
47
- # and <tt>v2</tt>:
48
- # Dataset.new({'v1'=>%w{1 2 3}.to_vector, 'v2'=>%w{4 5 6}.to_vector})
49
- # Create a dataset with two given vectors (v1 and v2),
50
- # with vectors on inverted order:
51
- # Dataset.new({'v2'=>v2,'v1'=>v1},['v2','v1'])
52
- #
53
- # The fast way to create a dataset uses Hash#to_dataset, with
54
- # field order as arguments
55
- # v1 = [1,2,3].to_numeric
56
- # v2 = [1,2,3].to_numeric
57
- # ds = {'v1'=>v2, 'v2'=>v2}.to_dataset(%w{v2 v1})
58
33
 
59
- class Dataset
60
- include Writable
61
- include Summarizable
62
- # Hash of Statsample::Vector
63
- attr_reader :vectors
64
- # Ordered ids of vectors
65
- attr_reader :fields
66
- # Name of dataset
67
- attr_accessor :name
68
- # Number of cases
69
- attr_reader :cases
70
- # Location of pointer on enumerations methods (like #each)
71
- attr_reader :i
34
+ def name= new_name
35
+ $stderr.puts "WARNING: Statsample::Dataset and Statsample::Vector have been deprecated in favor of Daru::DataFrame and Daru::Vector. Please switch to using Daru::DataFrame#rename.\n"
72
36
 
73
- # Generates a new dataset, using three vectors
74
- # - Rows
75
- # - Columns
76
- # - Values
77
- #
78
- # For example, you have these values
79
- #
80
- # x y v
81
- # a a 0
82
- # a b 1
83
- # b a 1
84
- # b b 0
85
- #
86
- # You obtain
87
- # id a b
88
- # a 0 1
89
- # b 1 0
90
- #
91
- # Useful to process outputs from databases
92
- def self.crosstab_by_asignation(rows,columns,values)
93
- raise "Three vectors should be equal size" if rows.size!=columns.size or rows.size!=values.size
94
- cols_values=columns.factors
95
- cols_n=cols_values.size
96
- h_rows=rows.factors.inject({}){|a,v| a[v]=cols_values.inject({}){
97
- |a1,v1| a1[v1]=nil; a1
98
- }
99
- ;a}
100
- values.each_index{|i|
101
- h_rows[rows[i]][columns[i]]=values[i]
102
- }
103
- ds=Dataset.new(["_id"]+cols_values)
104
- cols_values.each{|c|
105
- ds[c].type=values.type
106
- }
107
- rows.factors.each {|row|
108
- n_row=Array.new(cols_n+1)
109
- n_row[0]=row
110
- cols_values.each_index {|i|
111
- n_row[i+1]=h_rows[row][cols_values[i]]
112
- }
113
- ds.add_case_array(n_row)
114
- }
115
- ds.update_valid_data
116
- ds
37
+ rename new_name
117
38
  end
118
- # Return true if any vector has missing data
119
- def has_missing_data?
120
- @vectors.any? {|k,v| v.has_missing_data?}
39
+ # Number of cases
40
+ def cases
41
+ $stderr.puts "WARNING: Statsample::Dataset and Statsample::Vector have been deprecated in favor of Daru::DataFrame and Daru::Vector. Please switch to using Daru::DataFrame#nrows.\n"
42
+
43
+ nrows
121
44
  end
122
- # Return a nested hash using fields as keys and
123
- # an array constructed of hashes with other values.
124
- # If block provided, is used to provide the
125
- # values, with parameters +row+ of dataset,
126
- # +current+ last hash on hierarchy and
127
- # +name+ of the key to include
128
- def nest(*tree_keys,&block)
129
- tree_keys=tree_keys[0] if tree_keys[0].is_a? Array
130
- out=Hash.new
131
- each do |row|
132
- current=out
133
- # Create tree
134
- tree_keys[0,tree_keys.size-1].each do |f|
135
- root=row[f]
136
- current[root]||=Hash.new
137
- current=current[root]
138
- end
139
- name=row[tree_keys.last]
140
- if !block
141
- current[name]||=Array.new
142
- current[name].push(row.delete_if{|key,value| tree_keys.include? key})
143
- else
144
- current[name]=block.call(row, current,name)
145
- end
146
- end
147
- out
45
+
46
+ # == Deprecation Warning
47
+ #
48
+ # This class will soon be replaced by Daru::DataFrame in the
49
+ # next release. Use Daru::DataFrame.crosstab_by_assignation
50
+ # for the same effect. Please see the daru docs at
51
+ # https://github.com/v0dro/daru for more details.
52
+ def self.crosstab_by_assignation(rows,columns,values)
53
+ ds = super(rows, columns, values)
54
+ Dataset.new ds.to_hash
148
55
  end
149
- # Creates a new dataset. A dataset is a set of ordered named vectors
150
- # of the same size.
151
- #
152
- # [vectors] With an array, creates a set of empty vectors named as
153
- # values on the array. With a hash, each Vector is assigned as
154
- # a variable of the Dataset named as its key
155
- # [fields] Array of names for vectors. Is only used for set the
156
- # order of variables. If empty, vectors keys on alfabethic order as
157
- # used as fields.
56
+
57
+ # == Deprecation Warning
58
+ #
59
+ # This class will soon be replaced by Daru::DataFrame in the
60
+ # next release. Use Daru::DataFrame.new for the same effect.
61
+ # Please see the daru docs at https://github.com/v0dro/daru for more details.
158
62
  def initialize(vectors={}, fields=[])
159
- @@n_dataset||=0
160
- @@n_dataset+=1
161
- @name=_("Dataset %d") % @@n_dataset
162
- @cases=0
163
- @gsl=nil
164
- @i=nil
63
+ $stderr.puts "WARNING: Statsample::Dataset and Statsample::Vector have been deprecated in favor of Daru::DataFrame and Daru::Vector. Please switch to using that.\n"
165
64
 
166
65
  if vectors.instance_of? Array
167
66
  @fields=vectors.dup
168
- @vectors=vectors.inject({}){|a,x| a[x]=Statsample::Vector.new(); a}
67
+ super({}, order: @fields.map { |e| e.respond_to?(:to_sym) ? e.to_sym : e })
169
68
  else
170
69
  # Check vectors
171
- @vectors=vectors
172
- @fields=fields
173
- check_order
174
- check_length
175
- end
176
- end
177
- #
178
- # Creates a copy of the given dataset, deleting all the cases with
179
- # missing data on one of the vectors.
180
- #
181
- # @param array of fields to include. No value include all fields
182
- #
183
- def dup_only_valid(*fields_to_include)
184
- if fields_to_include.size==1 and fields_to_include[0].is_a? Array
185
- fields_to_include=fields_to_include[0]
186
- end
187
- fields_to_include=@fields if fields_to_include.size==0
188
- if fields_to_include.any? {|f| @vectors[f].has_missing_data?}
189
- ds=Dataset.new(fields_to_include)
190
- fields_to_include.each {|f| ds[f].type=@vectors[f].type}
191
- each {|row|
192
- unless fields_to_include.any? {|f| @vectors[f].has_missing_data? and !@vectors[f].is_valid? row[f]}
193
- row_2=fields_to_include.inject({}) {|ac,v| ac[v]=row[v]; ac}
194
- ds.add_case(row_2)
195
- end
196
- }
197
- else
198
- ds=dup fields_to_include
199
- end
200
- ds.name= self.name
201
- ds
202
- end
203
- #
204
- # Returns a duplicate of the Dataset.
205
- # All vectors are copied, so any modification on new
206
- # dataset doesn't affect original dataset's vectors.
207
- # If fields given as parameter, only include those vectors.
208
- #
209
- # @param array of fields to include. No value include all fields
210
- # @return {Statsample::Dataset}
211
- def dup(*fields_to_include)
212
- if fields_to_include.size==1 and fields_to_include[0].is_a? Array
213
- fields_to_include=fields_to_include[0]
214
- end
215
- fields_to_include=@fields if fields_to_include.size==0
216
- vectors={}
217
- fields=[]
218
- fields_to_include.each{|f|
219
- raise "Vector #{f} doesn't exists" unless @vectors.has_key? f
220
- vectors[f]=@vectors[f].dup
221
- fields.push(f)
222
- }
223
- ds=Dataset.new(vectors,fields)
224
- ds.name= self.name
225
- ds
226
- end
227
-
228
-
229
- # Returns an array with the fields from first argumen to last argument
230
- def from_to(from,to)
231
- raise ArgumentError, "Field #{from} should be on dataset" if !@fields.include? from
232
- raise ArgumentError, "Field #{to} should be on dataset" if !@fields.include? to
233
- @fields.slice(@fields.index(from)..@fields.index(to))
234
- end
235
-
236
- # Returns (when possible) a cheap copy of dataset.
237
- # If no vector have missing values, returns original vectors.
238
- # If missing values presents, uses Dataset.dup_only_valid.
239
- #
240
- # @param array of fields to include. No value include all fields
241
- # @return {Statsample::Dataset}
242
- def clone_only_valid(*fields_to_include)
243
- if fields_to_include.size==1 and fields_to_include[0].is_a? Array
244
- fields_to_include=fields_to_include[0]
245
- end
246
- fields_to_include=@fields.dup if fields_to_include.size==0
247
- if fields_to_include.any? {|v| @vectors[v].has_missing_data?}
248
- dup_only_valid(fields_to_include)
249
- else
250
- clone(fields_to_include)
251
- end
252
- end
253
- # Returns a shallow copy of Dataset.
254
- # Object id will be distinct, but @vectors will be the same.
255
- # @param array of fields to include. No value include all fields
256
- # @return {Statsample::Dataset}
257
- def clone(*fields_to_include)
258
- if fields_to_include.size==1 and fields_to_include[0].is_a? Array
259
- fields_to_include=fields_to_include[0]
260
- end
261
- fields_to_include=@fields.dup if fields_to_include.size==0
262
- ds=Dataset.new
263
- fields_to_include.each{|f|
264
- raise "Vector #{f} doesn't exists" unless @vectors.has_key? f
265
- ds[f]=@vectors[f]
266
- }
267
- ds.fields=fields_to_include
268
- ds.name=@name
269
- ds.update_valid_data
270
- ds
271
- end
272
- # Creates a copy of the given dataset, without data on vectors
273
- #
274
- # @return {Statsample::Dataset}
275
- def dup_empty
276
- vectors=@vectors.inject({}) {|a,v|
277
- a[v[0]]=v[1].dup_empty
278
- a
279
- }
280
- Dataset.new(vectors,@fields.dup)
281
- end
282
- # Merge vectors from two datasets
283
- # In case of name collition, the vectors names are changed to
284
- # x_1, x_2 ....
285
- #
286
- # @return {Statsample::Dataset}
287
- def merge(other_ds)
288
- raise "Cases should be equal (this:#{@cases}; other:#{other_ds.cases}" unless @cases==other_ds.cases
289
- types = @fields.collect{|f| @vectors[f].type} + other_ds.fields.collect{|f| other_ds[f].type}
290
- new_fields = (@fields+other_ds.fields).recode_repeated
291
- ds_new=Statsample::Dataset.new(new_fields)
292
- new_fields.each_index{|i|
293
- field=new_fields[i]
294
- ds_new[field].type=types[i]
295
- }
296
- @cases.times {|i|
297
- row=case_as_array(i)+other_ds.case_as_array(i)
298
- ds_new.add_case_array(row)
299
- }
300
- ds_new.update_valid_data
301
- ds_new
302
- end
303
-
304
- # Join 2 Datasets by given fields
305
- # type is one of :left and :inner, default is :left
306
- #
307
- # @return {Statsample::Dataset}
308
- def join(other_ds,fields_1=[],fields_2=[],type=:left)
309
- fields_new = other_ds.fields - fields_2
310
- fields = self.fields + fields_new
311
-
312
- other_ds_hash = {}
313
- other_ds.each do |row|
314
- key = row.select{|k,v| fields_2.include?(k)}.values
315
- value = row.select{|k,v| fields_new.include?(k)}
316
- if other_ds_hash[key].nil?
317
- other_ds_hash[key] = [value]
318
- else
319
- other_ds_hash[key] << value
70
+ @vectors = {}
71
+ vectors.each do |k,v|
72
+ @vectors[k.respond_to?(:to_sym) ? k.to_sym : k] = v
320
73
  end
74
+ @fields = fields
75
+ super @vectors, order: @fields.map { |e| e.respond_to?(:to_sym) ? e.to_sym : e }
321
76
  end
322
-
323
- new_ds = Dataset.new(fields)
324
-
325
- self.each do |row|
326
- key = row.select{|k,v| fields_1.include?(k)}.values
327
-
328
- new_case = row.dup
329
-
330
- if other_ds_hash[key].nil?
331
- if type == :left
332
- fields_new.each{|field| new_case[field] = nil}
333
- new_ds.add_case(new_case)
334
- end
335
- else
336
- other_ds_hash[key].each do |new_values|
337
- new_ds.add_case new_case.merge(new_values)
338
- end
339
- end
340
-
341
- end
342
- new_ds
343
77
  end
344
- # Returns a dataset with standarized data.
345
- #
346
- # @return {Statsample::Dataset}
347
- def standarize
348
- ds=dup()
349
- ds.fields.each do |f|
350
- ds[f]=ds[f].vector_standarized
351
- end
352
- ds
353
- end
354
- # Generate a matrix, based on fields of dataset
355
- #
356
- # @return {::Matrix}
357
78
 
358
- def collect_matrix
359
- rows=@fields.collect{|row|
360
- @fields.collect{|col|
361
- yield row,col
362
- }
363
- }
364
- Matrix.rows(rows)
79
+ def from_to(from,to)
80
+ raise NoMethodError, "This method is no longer supported. To see the vector index use Daru::DataFrame#vectors"
365
81
  end
366
82
 
367
- # We have the same datasets if +vectors+ and +fields+ are the same
368
- #
369
- # @return {Boolean}
370
- def ==(d2)
371
- @vectors==d2.vectors and @fields==d2.fields
372
- end
373
- # Returns vector <tt>c</tt>
374
- #
375
- # @return {Statsample::Vector}
376
- def col(c)
377
- @vectors[c]
378
- end
379
- alias_method :vector, :col
380
- # Equal to Dataset[<tt>name</tt>]=<tt>vector</tt>
381
- #
382
- # @return self
383
83
  def add_vector(name, vector)
384
- raise ArgumentError, "Vector have different size" if vector.size!=@cases
385
- @vectors[name]=vector
386
- check_order
387
- self
388
- end
389
- # Returns true if dataset have vector <tt>v</tt>.
390
- #
391
- # @return {Boolean}
392
- def has_vector? (v)
393
- return @vectors.has_key?(v)
394
- end
395
- # Creates a dataset with the random data, of a n size
396
- # If n not given, uses original number of cases.
397
- #
398
- # @return {Statsample::Dataset}
399
- def bootstrap(n=nil)
400
- n||=@cases
401
- ds_boot=dup_empty
402
- n.times do
403
- ds_boot.add_case_array(case_as_array(rand(n)))
404
- end
405
- ds_boot.update_valid_data
406
- ds_boot
84
+ raise NoMethodError, "Deprecated. Use Daru::DataFrame#[]= directly."
407
85
  end
408
- # Fast version of #add_case.
409
- # Can only add one case and no error check if performed
410
- # You SHOULD use #update_valid_data at the end of insertion cycle
411
- #
412
- #
86
+
413
87
  def add_case_array(v)
414
- v.each_index {|i| d=@vectors[@fields[i]].data; d.push(v[i])}
88
+ raise NoMethodError, "Deprecated. Use Daru::DataFrame#add_row instead."
415
89
  end
416
- # Insert a case, using:
417
- # * Array: size equal to number of vectors and values in the same order as fields
418
- # * Hash: keys equal to fields
419
- # If uvd is false, #update_valid_data is not executed after
420
- # inserting a case. This is very useful if you want to increase the
421
- # performance on inserting many cases, because #update_valid_data
422
- # performs check on vectors and on the dataset
423
90
 
424
91
  def add_case(v,uvd=true)
425
- case v
426
- when Array
427
- if (v[0].is_a? Array)
428
- v.each{|subv| add_case(subv,false)}
429
- else
430
- raise ArgumentError, "Input array size (#{v.size}) should be equal to fields number (#{@fields.size})" if @fields.size!=v.size
431
- v.each_index {|i| @vectors[@fields[i]].add(v[i],false)}
432
- end
433
- when Hash
434
- raise ArgumentError, "Hash keys should be equal to fields #{(v.keys - @fields).join(",")}" if @fields.sort!=v.keys.sort
435
- @fields.each{|f| @vectors[f].add(v[f],false)}
436
- else
437
- raise TypeError, 'Value must be a Array or a Hash'
438
- end
439
- if uvd
440
- update_valid_data
441
- end
442
- end
443
- # Check vectors and fields after inserting data. Use only
444
- # after #add_case_array or #add_case with second parameter to false
445
- def update_valid_data
446
- @gsl=nil
447
- @fields.each{|f| @vectors[f].set_valid_data}
448
- check_length
449
- end
450
- # Delete vector named +name+. Multiple fields accepted.
451
- def delete_vector(*args)
452
- if args.size==1 and args[0].is_a? Array
453
- names=args[0]
454
- else
455
- names=args
456
- end
457
- names.each do |name|
458
- @fields.delete(name)
459
- @vectors.delete(name)
460
- end
461
- end
462
-
463
- def add_vectors_by_split_recode(name_,join='-',sep=Statsample::SPLIT_TOKEN)
464
- split=@vectors[name_].split_by_separator(sep)
465
- i=1
466
- split.each{|k,v|
467
- new_field=name_+join+i.to_s
468
- v.name=name_+":"+k
469
- add_vector(new_field,v)
470
- i+=1
471
- }
472
- end
473
- def add_vectors_by_split(name,join='-',sep=Statsample::SPLIT_TOKEN)
474
- split=@vectors[name].split_by_separator(sep)
475
- split.each{|k,v|
476
- add_vector(name+join+k,v)
477
- }
478
- end
479
-
480
- def vector_by_calculation(type=:numeric)
481
- a=[]
482
- each do |row|
483
- a.push(yield(row))
484
- end
485
- a.to_vector(type)
486
- end
487
- # Returns a vector with sumatory of fields
488
- # if fields parameter is empty, sum all fields
489
- def vector_sum(fields=nil)
490
- fields||=@fields
491
- vector=collect_with_index do |row, i|
492
- if(fields.find{|f| !@vectors[f].data_with_nils[i]})
493
- nil
494
- else
495
- fields.inject(0) {|ac,v| ac + row[v].to_f}
496
- end
497
- end
498
- vector.name=_("Sum from %s") % @name
499
- vector
500
- end
501
- # Check if #fields attribute is correct, after inserting or deleting vectors
502
- def check_fields(fields)
503
- fields||=@fields
504
- raise "Fields #{(fields-@fields).join(", ")} doesn't exists on dataset" if (fields-@fields).size>0
505
- fields
506
- end
507
-
508
- # Returns a vector with the numbers of missing values for a case
509
- def vector_missing_values(fields=nil)
510
- fields=check_fields(fields)
511
- collect_with_index do |row, i|
512
- fields.inject(0) {|a,v|
513
- a+ ((@vectors[v].data_with_nils[i].nil?) ? 1: 0)
514
- }
515
- end
516
- end
517
- def vector_count_characters(fields=nil)
518
- fields=check_fields(fields)
519
- collect_with_index do |row, i|
520
- fields.inject(0){|a,v|
521
- a+((@vectors[v].data_with_nils[i].nil?) ? 0: row[v].to_s.size)
522
- }
523
- end
524
- end
525
# Returns a vector with the per-case mean of a set of fields.
#
# fields::      field names to average; nil means every field of the
#               dataset (validated with #check_fields).
# max_invalid:: highest number of nil fields tolerated in one case. A case
#               with more nil fields than that yields nil.
#
# The mean is taken over the non-nil fields only. A case without any
# non-nil field yields nil as well (there is nothing to average), instead
# of dividing by zero.
def vector_mean(fields = nil, max_invalid = 0)
  fields = check_fields(fields)
  means = []
  each_with_index do |row, i|
    valid = fields.reject { |f| @vectors[f].data_with_nils[i].nil? }
    invalids = fields.size - valid.size
    if invalids > max_invalid || valid.empty?
      means.push(nil)
    else
      total = valid.inject(0) { |acc, f| acc + row[f].to_f }
      means.push(total.quo(valid.size))
    end
  end
  means = means.to_vector(:numeric)
  means.name = _("Means from %s") % @name
  means
end
554
# Verifies that every entry of @vectors is a Statsample::Vector and that
# all of them have the same size, then stores that size in @cases.
# Raises Exception on the first offending entry.
def check_length # :nodoc:
  expected = nil
  @vectors.each do |key, vector|
    unless vector.is_a? Statsample::Vector
      raise Exception, "Data #{vector.class} is not a vector on key #{key}"
    end
    if expected.nil?
      # The first vector fixes the size the others are compared against.
      expected = vector.size
    elsif vector.size != expected
      raise Exception, "Vector #{key} have size #{vector.size} and dataset have size #{expected}"
    end
  end
  @cases = expected
end
569
- # Retrieves each vector as [key, vector]
570
- def each_vector # :yield: |key, vector|
571
- @fields.each{|k| yield k, @vectors[k]}
92
+ raise NoMethodError, "Deprecated. Use Daru::DataFrame#add_row instead."
572
93
  end
573
94
 
574
- if Statsample::STATSAMPLE__.respond_to?(:case_as_hash)
575
- def case_as_hash(c) # :nodoc:
576
- Statsample::STATSAMPLE__.case_as_hash(self,c)
577
- end
578
- else
579
- # Retrieves case i as a hash
580
- def case_as_hash(i)
581
- _case_as_hash(i)
582
- end
95
# Deprecated: always raises NoMethodError pointing callers at the Daru
# replacement.
def update_valid_data
  message = "Deprecated. Use Daru::DataFrame#update instead. Also see Daru.lazy_update in the daru docs."
  raise NoMethodError, message
end
584
98
 
585
# Picks the implementation of #case_as_array when this code is loaded:
# if Statsample::STATSAMPLE__ responds to :case_as_array the call is
# delegated to it (presumably an optional optimized implementation --
# confirm), otherwise the pure-Ruby #_case_as_array is used.
if Statsample::STATSAMPLE__.respond_to?(:case_as_array)
  def case_as_array(c) # :nodoc:
    Statsample::STATSAMPLE__.case_as_array(self,c)
  end
else
  # Retrieves case i as an array, ordered on #fields order
  def case_as_array(i)
    _case_as_array(i)
  end
end
595
# Pure-Ruby implementation of #case_as_hash: returns case +c+ as a Hash
# mapping every field name (in @fields order) to the value that field's
# vector holds at index +c+.
def _case_as_hash(c) # :nodoc:
  @fields.each_with_object({}) { |field, row| row[field] = @vectors[field][c] }
end
598
- def _case_as_array(c) # :nodoc:
599
- @fields.collect {|x| @vectors[x][c]}
99
# Deprecated: always raises NoMethodError pointing callers at the Daru
# replacement.
def each_array
  message = "Deprecated. Use Daru::DataFrame#each_row instead."
  raise NoMethodError, message
end
601
102
 
602
# Yields each case as a hash. @i tracks the index of the case being
# yielded and is reset to nil after a complete pass. Any StandardError
# raised along the way is re-raised wrapped in a DatasetException; @i is
# left untouched in that case.
def each
  @i = 0
  @cases.times do |index|
    @i = index
    yield case_as_hash(index)
  end
  @i = nil
rescue => e
  raise DatasetException.new(self, e)
end
103
+ def fields=(f)
104
+ $stderr.puts "WARNING: Deprecated. Use Daru::DataFrame#reindex_vectors! instead.\n"
616
105
 
617
- # Returns each case as hash and index
618
- def each_with_index # :yield: |case, i|
619
- begin
620
- @i=0
621
- @cases.times{|i|
622
- @i=i
623
- row=case_as_hash(i)
624
- yield row, i
625
- }
626
- @i=nil
627
- rescue =>e
628
- raise DatasetException.new(self, e)
629
- end
106
+ reindex_vectors! f
630
107
  end
631
108
 
632
# Yields each case as an array ordered like #fields, reading the values
# from each vector's +data_with_nils+ so missing values come out as nil.
# @i tracks the current case index and is reset to nil at the end.
def each_array_with_nils
  @cases.times do |i|
    @i = i
    yield fields.map { |f| @vectors[f].data_with_nils[i] }
  end
  @i = nil
end
646
# Yields each case as an array (see #case_as_array). @i tracks the
# current case index and is reset to nil at the end.
def each_array
  @cases.times do |index|
    @i = index
    yield case_as_array(index)
  end
  @i = nil
end
655
# Sets the order of the fields. The new list is reconciled with the
# vectors actually present by #check_order: names without a vector are
# dropped and vectors missing from +f+ are appended in sorted order.
def fields=(f)
  @fields=f
  check_order
end
661
# Reconciles @fields with the keys of @vectors. When both hold the same
# names nothing changes. Otherwise @fields keeps, in its current order,
# only the names that have a vector, followed by the remaining vector
# names in sorted order.
def check_order #:nodoc:
  names = @vectors.keys
  return if names.sort == @fields.sort
  kept = @fields & names
  @fields = kept + (names.sort - kept)
end
669
109
# Returns the vector named i.
#
# Backward-compatibility shim: names are converted to symbols before being
# handed to the superclass implementation, and a warning is printed on
# $stderr on every call.
#
# i:: a vector name, or a Range of vector names. Each name (or Range
#     endpoint) that responds to +to_sym+ is symbolized; anything else is
#     passed through unchanged. The exclusive/inclusive nature of a Range
#     is preserved.
def [](i)
  $stderr.puts "WARNING: Daru uses symbols instead of strings for naming vectors. Please switch to symbols.\n"

  if i.is_a? Range
    # Convert the endpoints, not the Range itself (a Range has no #to_sym).
    beg = i.begin.respond_to?(:to_sym) ? i.begin.to_sym : i.begin
    en  = i.end.respond_to?(:to_sym) ? i.end.to_sym : i.end
    super(Range.new(beg, en, i.exclude_end?))
  else
    super(i.respond_to?(:to_sym) ? i.to_sym : i)
  end
end
681
# Yields every case to the block and wraps the block results in a new
# Statsample::Vector of the given +type+.
def collect(type = :numeric)
  results = []
  each do |row|
    results << yield(row)
  end
  Statsample::Vector.new(results, type)
end
690
# Like #collect, but the block also receives the case index as its
# second argument.
def collect_with_index(type = :numeric)
  results = []
  each_with_index do |row, index|
    results << yield(row, index)
  end
  Statsample::Vector.new(results, type)
end
698
# Recodes vector +vector_name+ in place: every case is yielded as a hash
# and the block result replaces the vector's value for that case. The
# vector's #set_valid_data is called once all cases are rewritten.
def recode!(vector_name)
  @cases.times do |index|
    @vectors[vector_name].data[index] = yield case_as_hash(index)
  end
  @vectors[vector_name].set_valid_data
end
705
121
 
706
# Builds a Statsample::Crosstab from the vectors stored under field names
# +v1+ and +v2+; +opts+ is forwarded unchanged to Statsample::Crosstab.new.
def crosstab(v1,v2,opts={})
  Statsample::Crosstab.new(@vectors[v1], @vectors[v2],opts)
end
709
# Stores vector +v+ under field name +i+ and re-synchronizes the field
# order with #check_order. Raises ArgumentError unless +v+ is exactly a
# Statsample::Vector (subclasses are rejected by +instance_of?+).
def []=(i, v)
  unless v.instance_of? Statsample::Vector
    raise ArgumentError,"Should pass a Statsample::Vector"
  end
  @vectors[i] = v
  check_order
end
717
- # Return data as a matrix. Column are ordered by #fields and
718
- # rows by orden of insertion
719
- def to_matrix
720
- rows=[]
721
- self.each_array{|c|
722
- rows.push(c)
723
- }
724
- Matrix.rows(rows)
122
# Assigns +v+ under name +i+ through the superclass implementation.
#
# Backward-compatibility shim: a warning is printed on $stderr on every
# call, and a name that responds to +to_sym+ is symbolized first -- the
# same conversion #[] applies -- so a vector stored with a String name can
# be read back with that same name. Other keys are passed through as is.
def []=(i, v)
  $stderr.puts "WARNING: Daru uses symbols instead of strings for naming vectors. Please switch to symbols.\n"

  super(i.respond_to?(:to_sym) ? i.to_sym : i, v)
end
726
127
 
727
128
  if Statsample.has_gsl?
728
129
# No longer supported: always raises NoMethodError.
def clear_gsl
  message = "This method is no longer needed/supported."
  raise NoMethodError, message
end
731
-
732
# Returns the dataset as a GSL::Matrix (one row per case, one column per
# field). The matrix is built on first use and memoized in @gsl.
def to_gsl
  return @gsl unless @gsl.nil?

  update_valid_data if cases.nil?
  @gsl = GSL::Matrix.alloc(cases, fields.size)
  # each_array keeps @i at the index of the row being yielded.
  each_array do |row|
    @gsl.set_row(@i, row)
  end
  @gsl
end
744
-
745
- end
746
-
747
# Returns the correlation matrix computed by
# Statsample::Bivariate.correlation_matrix, either for the given +fields+
# (via a clone restricted to them) or, by default, for the whole dataset.
def correlation_matrix(fields = nil)
  source = fields ? clone(fields) : self
  Statsample::Bivariate.correlation_matrix(source)
end
757
-
758
# Returns the covariance matrix computed by
# Statsample::Bivariate.covariance_matrix, either for the given +fields+
# (via a clone restricted to them) or, by default, for the whole dataset.
def covariance_matrix(fields = nil)
  source = fields ? clone(fields) : self
  Statsample::Bivariate.covariance_matrix(source)
end
768
-
769
# Creates a new dataset holding only the cases for which the block
# returns a truthy value. The copy is named "<name>(filtered)".
def filter
  filtered = dup_empty
  each do |row|
    filtered.add_case(row, false) if yield row
  end
  filtered.update_valid_data
  filtered.name = _("%s(filtered)") % @name
  filtered
end
779
-
780
# Creates a new vector with the values of +field+ for the cases on which
# the block returns a truthy value. The result has the same type as the
# source vector.
def filter_field(field)
  selected = []
  each do |row|
    selected << row[field] if yield row
  end
  selected.to_vector(@vectors[field].type)
end
788
-
789
# Creates a Statsample::Multiset, using one or more fields to split the
# dataset. Dispatches to the single-field or multiple-field variant
# according to the number of arguments.
def to_multiset_by_split(*fields)
  require 'statsample/multiset'
  return to_multiset_by_split_one_field(fields[0]) if fields.size == 1

  to_multiset_by_split_multiple_fields(*fields)
end
801
# Creates a Statsample::Multiset with one dataset per factor of +field+.
# Every case is added to the dataset matching its value of +field+; each
# resulting dataset is named after the label of its factor and its
# vectors inherit type, name and labels from the source vectors.
# Raises ArgumentError when +field+ is not one of the dataset's fields.
def to_multiset_by_split_one_field(field)
  unless @fields.include? field
    raise ArgumentError,"Should use a correct field name"
  end
  splitter = @vectors[field]
  ms = Multiset.new_empty_vectors(@fields, splitter.factors)
  each do |row|
    ms[row[field]].add_case(row, false)
  end
  ms.datasets.each do |factor, ds|
    ds.update_valid_data
    ds.name = @vectors[field].labeling(factor)
    ds.vectors.each do |key, vector|
      source = @vectors[key]
      vector.type = source.type
      vector.name = source.name
      vector.labels = source.labels
    end
  end
  ms
end
824
# Creates a Statsample::Multiset with one dataset per combination of the
# factors of +fields+.
#
# The combinations are the cartesian product of each field's factors, in
# the order the fields were given. Every case is added to the dataset
# keyed by its own values for +fields+. Each resulting dataset is named
# with the labels of its key joined by "-", and its vectors inherit type,
# name and labels from the source vectors.
def to_multiset_by_split_multiple_fields(*fields)
  factors_total = nil
  fields.each do |f|
    factors = @vectors[f].factors
    factors_total =
      if factors_total.nil?
        factors.collect { |c| [c] }
      else
        factors_total.flat_map { |combo| factors.collect { |factor| combo + [factor] } }
      end
  end
  ms = Multiset.new_empty_vectors(@fields, factors_total)

  # Look the target dataset up directly from the case values; no need to
  # build and eval a Proc from interpolated field names.
  each do |c|
    ms[fields.collect { |f| c[f] }].add_case(c, false)
  end

  ms.datasets.each do |k, ds|
    ds.update_valid_data
    ds.name = fields.each_with_index.map { |f, i| @vectors[f].labeling(k[i]) }.join("-")
    ds.vectors.each do |k1, v1|
      v1.type = @vectors[k1].type
      v1.name = @vectors[k1].name
      v1.labels = @vectors[k1].labels
    end
  end
  ms
end
860
# Returns a vector with the result of evaluating +text+ on every case.
#
# Every occurrence of a field name in +text+ is replaced by an access to
# that field on the current case (converted with +to_f+ when the vector's
# type is :numeric). Longer field names are matched first and the
# substitution is done in a single pass, so a field whose name is a
# substring of another one cannot corrupt the expression. +text+ itself
# is not modified.
#
# A case yields nil when any field of the dataset is nil for that case.
#
# WARNING: the expression is run with +eval+, so any Ruby code in +text+
# is executed. Never pass untrusted input.
#
# For example:
#   a=[1,2].to_vector(:numeric)
#   b=[3,4].to_vector(:numeric)
#   ds={'a'=>a,'b'=>b}.to_dataset
#   ds.compute("a+b")
#   => Vector [4,6]
def compute(text)
  names = @fields.sort_by { |f| -f.size }
  expression = text.gsub(Regexp.union(names)) do |f|
    @vectors[f].type == :numeric ? "row['#{f}'].to_f" : "row['#{f}']"
  end
  # The block parameter must be called +row+: the expression refers to it.
  collect_with_index do |row, i|
    if @fields.any? { |f| @vectors[f].data_with_nils[i].nil? }
      nil
    else
      eval(expression)
    end
  end
end
892
# Tests each case with one or more tests and returns an array with a
# message for every failure.
#
# Each test is a three-element array [message, fields, proc]: the proc is
# called with the case and must return a truthy value for the case to
# pass; on failure the message is reported, followed by the values of the
# listed fields when that list is not empty.
#
# When the first argument is a String it is taken as the name of the
# field that identifies a case in the messages; otherwise the first field
# of the dataset is used. Cases are numbered from 1.
def verify(*tests)
  id = tests[0].is_a?(String) ? tests.shift : @fields[0]
  errors = []
  case_number = 0
  each do |row|
    case_number += 1
    tests.each do |test|
      next if test[2].call(row)

      values = ""
      if test[1].size > 0
        values = " (" + test[1].collect { |k| "#{k}=#{row[k]}" }.join(", ") + ")"
      end
      errors.push("#{case_number} [#{row[id]}]: #{test[0]}#{values}")
    end
  end
  errors
end
919
# Short textual description: class, object id, name, field list and the
# number of cases (size of the first field's vector, or 0 when the
# dataset has no fields).
def to_s
  first_vector = @vectors[@fields[0]]
  case_count = first_vector.nil? ? 0 : first_vector.size
  "#<" + self.class.to_s + ":" + self.object_id.to_s + " @name=#{@name} @fields=[" + @fields.join(",") + "] cases=" + case_count.to_s
end
922
# Same text as #to_s.
def inspect
  to_s
end
925
# Creates a new dataset for one to many relations on a dataset, based on
# a pattern of field names.
#
# For example, you have a survey for number of children with this
# structure:
#   id, name, child_name_1, child_age_1, child_name_2, child_age_2
# with
#   ds.one_to_many(%w{id}, "child_%v_%n")
# the fields of the first parameter are copied verbatim to the new
# dataset, and for the fields matching the pattern one case is added for
# each different %n that holds any data. The number %n is stored in the
# extra field "_col_id".
#
# In the pattern, %v stands for the variable name and %n for the
# (possibly multi-digit) index; %v must come before %n. Everything else
# is matched literally and the pattern must describe the whole field
# name. +parent_fields+ is not modified.
#
# For example
#   cases=[
#     ['1','george','red',10,'blue',20,nil,nil],
#     ['2','fred','green',15,'orange',30,'white',20],
#     ['3','alfred',nil,nil,nil,nil,nil,nil]
#   ]
#   ds=Statsample::Dataset.new(%w{id name car_color1 car_value1 car_color2 car_value2 car_color3 car_value3})
#   cases.each {|c| ds.add_case_array c }
#   ds.one_to_many(['id'],'car_%v%n').to_matrix
#   => Matrix[
#      ["red", "1", 10],
#      ["blue", "1", 20],
#      ["green", "2", 15],
#      ["orange", "2", 30],
#      ["white", "2", 20]
#      ]
#
def one_to_many(parent_fields, pattern)
  # Escape the literal parts, anchor the match and capture every digit of
  # the index (a lazy, unanchored \d+? stops after the first digit).
  source = Regexp.escape(pattern).gsub("%v") { "(.+?)" }.gsub("%n") { "(\\d+)" }
  re = Regexp.new('\A' + source + '\z')
  # Work on a copy so the caller's array is left alone.
  ds_vars = parent_fields + ["_col_id"]
  vars = []
  max_n = 0
  h = parent_fields.inject({}) { |a, v| a[v] = Statsample::Vector.new([], @vectors[v].type); a }
  # Adding _col_id
  h['_col_id'] = [].to_numeric
  @fields.each do |f|
    match = re.match(f)
    next unless match

    var = match[1]
    n = match[2].to_i
    unless vars.include? var
      vars.push(var)
      h[var] = Statsample::Vector.new([], @vectors[f].type)
    end
    max_n = n if max_n < n
  end
  ds = Dataset.new(h, ds_vars + vars)
  each do |row|
    row_out = {}
    parent_fields.each do |f|
      row_out[f] = row[f]
    end
    1.upto(max_n) do |n|
      any_data = false
      vars.each do |v|
        data = row[pattern.gsub("%v", v.to_s).gsub("%n", n.to_s)]
        row_out[v] = data
        any_data = true unless data.nil?
      end
      next unless any_data

      row_out["_col_id"] = n
      ds.add_case(row_out, false)
    end
  end
  ds.update_valid_data
  ds
end
996
# Adds a section named after the dataset to report builder +b+: a line
# with the number of cases, then, for every field, a heading line and the
# field's vector handed to the builder with +parse_element+.
def report_building(b)
  b.section(:name => @name) do |g|
    # Translate the template first, then format it, so the catalog lookup
    # uses the constant key "Cases: %d".
    g.text(_("Cases: %d") % cases)
    @fields.each do |f|
      g.text "Element:[#{f}]"
      g.parse_element(@vectors[f])
    end
  end
end
132
+ end
1005
133
  end
1006
134
  end