statsample 1.5.0 → 2.0.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.build.sh +15 -0
- data/.gitignore +1 -0
- data/.travis.yml +19 -7
- data/CONTRIBUTING.md +33 -0
- data/History.txt +5 -0
- data/README.md +41 -53
- data/benchmarks/correlation_matrix_15_variables.rb +6 -5
- data/benchmarks/correlation_matrix_5_variables.rb +6 -5
- data/benchmarks/correlation_matrix_methods/correlation_matrix.rb +23 -26
- data/examples/boxplot.rb +17 -5
- data/examples/correlation_matrix.rb +36 -7
- data/examples/dataset.rb +25 -5
- data/examples/dominance_analysis.rb +8 -7
- data/examples/dominance_analysis_bootstrap.rb +16 -11
- data/examples/histogram.rb +16 -2
- data/examples/icc.rb +5 -6
- data/examples/levene.rb +17 -3
- data/examples/multiple_regression.rb +6 -3
- data/examples/parallel_analysis.rb +11 -6
- data/examples/polychoric.rb +26 -13
- data/examples/principal_axis.rb +8 -4
- data/examples/reliability.rb +10 -10
- data/examples/scatterplot.rb +8 -0
- data/examples/t_test.rb +7 -0
- data/examples/u_test.rb +10 -2
- data/examples/vector.rb +9 -6
- data/examples/velicer_map_test.rb +12 -8
- data/lib/statsample.rb +13 -47
- data/lib/statsample/analysis/suite.rb +1 -1
- data/lib/statsample/anova/oneway.rb +6 -6
- data/lib/statsample/anova/twoway.rb +26 -24
- data/lib/statsample/bivariate.rb +78 -61
- data/lib/statsample/bivariate/pearson.rb +2 -2
- data/lib/statsample/codification.rb +45 -32
- data/lib/statsample/converter/csv.rb +15 -53
- data/lib/statsample/converter/spss.rb +6 -5
- data/lib/statsample/converters.rb +50 -211
- data/lib/statsample/crosstab.rb +26 -25
- data/lib/statsample/daru.rb +117 -0
- data/lib/statsample/dataset.rb +70 -942
- data/lib/statsample/dominanceanalysis.rb +16 -17
- data/lib/statsample/dominanceanalysis/bootstrap.rb +26 -28
- data/lib/statsample/factor/parallelanalysis.rb +17 -19
- data/lib/statsample/factor/pca.rb +21 -20
- data/lib/statsample/factor/principalaxis.rb +3 -3
- data/lib/statsample/graph/boxplot.rb +8 -16
- data/lib/statsample/graph/histogram.rb +4 -4
- data/lib/statsample/graph/scatterplot.rb +8 -7
- data/lib/statsample/histogram.rb +128 -119
- data/lib/statsample/matrix.rb +20 -16
- data/lib/statsample/multiset.rb +39 -38
- data/lib/statsample/regression.rb +3 -3
- data/lib/statsample/regression/multiple.rb +8 -10
- data/lib/statsample/regression/multiple/alglibengine.rb +96 -89
- data/lib/statsample/regression/multiple/baseengine.rb +32 -32
- data/lib/statsample/regression/multiple/gslengine.rb +33 -36
- data/lib/statsample/regression/multiple/matrixengine.rb +7 -9
- data/lib/statsample/regression/multiple/rubyengine.rb +39 -41
- data/lib/statsample/reliability.rb +23 -25
- data/lib/statsample/reliability/icc.rb +8 -7
- data/lib/statsample/reliability/multiscaleanalysis.rb +14 -12
- data/lib/statsample/reliability/scaleanalysis.rb +58 -60
- data/lib/statsample/reliability/skillscaleanalysis.rb +34 -29
- data/lib/statsample/resample.rb +1 -1
- data/lib/statsample/shorthand.rb +29 -25
- data/lib/statsample/test/kolmogorovsmirnov.rb +5 -3
- data/lib/statsample/test/levene.rb +28 -27
- data/lib/statsample/test/t.rb +7 -9
- data/lib/statsample/test/umannwhitney.rb +28 -28
- data/lib/statsample/test/wilcoxonsignedrank.rb +45 -43
- data/lib/statsample/vector.rb +70 -1013
- data/lib/statsample/version.rb +1 -1
- data/statsample.gemspec +12 -16
- data/test/helpers_tests.rb +1 -1
- data/test/test_analysis.rb +17 -17
- data/test/test_anova_contrast.rb +6 -6
- data/test/test_anovatwowaywithdataset.rb +8 -8
- data/test/test_anovawithvectors.rb +8 -8
- data/test/test_awesome_print_bug.rb +1 -1
- data/test/test_bartlettsphericity.rb +4 -4
- data/test/test_bivariate.rb +48 -43
- data/test/test_codification.rb +33 -33
- data/test/test_crosstab.rb +9 -9
- data/test/test_dataset.rb +28 -458
- data/test/test_factor.rb +46 -38
- data/test/test_factor_pa.rb +22 -13
- data/test/test_ggobi.rb +4 -4
- data/test/test_gsl.rb +4 -4
- data/test/test_histogram.rb +3 -3
- data/test/test_matrix.rb +13 -13
- data/test/test_multiset.rb +103 -91
- data/test/test_regression.rb +57 -52
- data/test/test_reliability.rb +55 -45
- data/test/test_reliability_icc.rb +8 -8
- data/test/test_reliability_skillscale.rb +26 -24
- data/test/test_resample.rb +1 -1
- data/test/test_statistics.rb +3 -13
- data/test/test_stest.rb +9 -9
- data/test/test_stratified.rb +3 -3
- data/test/test_test_t.rb +12 -12
- data/test/test_umannwhitney.rb +2 -2
- data/test/test_vector.rb +76 -613
- data/test/test_wilcoxonsignedrank.rb +4 -4
- metadata +57 -28
- data/lib/statsample/rserve_extension.rb +0 -20
- data/lib/statsample/vector/gsl.rb +0 -106
- data/test/fixtures/repeated_fields.csv +0 -7
- data/test/fixtures/scientific_notation.csv +0 -4
- data/test/fixtures/test_csv.csv +0 -7
- data/test/fixtures/test_xls.xls +0 -0
- data/test/test_csv.rb +0 -63
- data/test/test_rserve_extension.rb +0 -42
- data/test/test_xls.rb +0 -52
data/lib/statsample/crosstab.rb
CHANGED
@@ -8,24 +8,25 @@ module Statsample
|
|
8
8
|
attr_reader :v_rows, :v_cols
|
9
9
|
attr_accessor :row_label, :column_label, :name, :percentage_row, :percentage_column, :percentage_total
|
10
10
|
def initialize(v1, v2, opts=Hash.new)
|
11
|
-
#raise ArgumentError, "Both arguments should be Vectors" unless v1.is_a? Statsample::Vector and v2.is_a? Statsample::Vector
|
12
11
|
raise ArgumentError, "Vectors should be the same size" unless v1.size==v2.size
|
13
|
-
@v_rows, @v_cols=Statsample.only_valid_clone(
|
14
|
-
|
15
|
-
|
16
|
-
@
|
17
|
-
@name
|
12
|
+
@v_rows, @v_cols = Statsample.only_valid_clone(
|
13
|
+
Daru::Vector.new(v1),
|
14
|
+
Daru::Vector.new(v2))
|
15
|
+
@cases = @v_rows.size
|
16
|
+
@row_label = v1.name
|
17
|
+
@column_label = v2.name
|
18
|
+
@name = nil
|
18
19
|
@percentage_row = @percentage_column = @percentage_total=false
|
19
|
-
opts.each
|
20
|
+
opts.each do |k,v|
|
20
21
|
self.send("#{k}=",v) if self.respond_to? k
|
21
|
-
|
22
|
-
@name||=_("Crosstab %s - %s") % [@row_label, @column_label]
|
22
|
+
end
|
23
|
+
@name ||= _("Crosstab %s - %s") % [@row_label, @column_label]
|
23
24
|
end
|
24
25
|
def rows_names
|
25
|
-
@v_rows.factors.sort
|
26
|
+
@v_rows.factors.sort.reset_index!
|
26
27
|
end
|
27
28
|
def cols_names
|
28
|
-
@v_cols.factors.sort
|
29
|
+
@v_cols.factors.sort.reset_index!
|
29
30
|
end
|
30
31
|
def rows_total
|
31
32
|
@v_rows.frequencies
|
@@ -35,18 +36,18 @@ module Statsample
|
|
35
36
|
end
|
36
37
|
|
37
38
|
def frequencies
|
38
|
-
base=rows_names.inject([])
|
39
|
-
s+=cols_names.collect{|col| [row,col]}
|
40
|
-
|
39
|
+
base = rows_names.inject([]) do |s,row|
|
40
|
+
s += cols_names.collect { |col| [row,col] }
|
41
|
+
end.inject({}) do |s,par|
|
41
42
|
s[par]=0
|
42
43
|
s
|
43
|
-
|
44
|
-
base.update(Statsample::vector_cols_matrix(@v_rows,@v_cols).to_a.
|
44
|
+
end
|
45
|
+
base.update(Daru::Vector.new(Statsample::vector_cols_matrix(@v_rows,@v_cols).to_a).frequencies)
|
45
46
|
end
|
46
47
|
def to_matrix
|
47
|
-
f=frequencies
|
48
|
-
rn=rows_names
|
49
|
-
cn=cols_names
|
48
|
+
f = frequencies
|
49
|
+
rn = rows_names
|
50
|
+
cn = cols_names
|
50
51
|
Matrix.rows(rn.collect{|row|
|
51
52
|
cn.collect{|col| f[[row,col]]}
|
52
53
|
})
|
@@ -67,8 +68,8 @@ module Statsample
|
|
67
68
|
end
|
68
69
|
# Chi square, based on expected and real matrix
|
69
70
|
def chi_square
|
70
|
-
|
71
|
-
|
71
|
+
require 'statsample/test'
|
72
|
+
Statsample::Test.chi_square(self.to_matrix, matrix_expected)
|
72
73
|
end
|
73
74
|
# Useful to obtain chi square
|
74
75
|
def matrix_expected
|
@@ -98,10 +99,10 @@ module Statsample
|
|
98
99
|
generator.text(_("Rows: %s") % @row_label) unless @row_label.nil?
|
99
100
|
generator.text(_("Columns: %s") % @column_label) unless @column_label.nil?
|
100
101
|
|
101
|
-
t=ReportBuilder::Table.new(:name=>@name+" - "+_("Raw"), :header=>[""]+cols_names.collect {|c| @v_cols.
|
102
|
+
t=ReportBuilder::Table.new(:name=>@name+" - "+_("Raw"), :header=>[""]+cols_names.collect {|c| @v_cols.index_of(c)}+[_("Total")])
|
102
103
|
rn.each do |row|
|
103
104
|
total_row=0
|
104
|
-
t_row=[@v_rows.
|
105
|
+
t_row=[@v_rows.index_of(row)]
|
105
106
|
cn.each do |col|
|
106
107
|
data=fq[[row,col]]
|
107
108
|
total_row+=fq[[row,col]]
|
@@ -148,9 +149,9 @@ module Statsample
|
|
148
149
|
when :total then _("% Total")
|
149
150
|
end
|
150
151
|
|
151
|
-
t=ReportBuilder::Table.new(:name=>@name+" - "+_(type_name), :header=>[""]+cols_names.collect {|c| @v_cols.
|
152
|
+
t=ReportBuilder::Table.new(:name=>@name+" - "+_(type_name), :header=>[""]+cols_names.collect {|c| @v_cols.index_of(c) } + [_("Total")])
|
152
153
|
rn.each do |row|
|
153
|
-
t_row=[@v_rows.
|
154
|
+
t_row=[@v_rows.index_of(row)]
|
154
155
|
cn.each do |col|
|
155
156
|
total=case type
|
156
157
|
when :row then rt[row]
|
@@ -0,0 +1,117 @@
|
|
1
|
+
# Opening the Daru::Vector and Daru::DataFrame classes to add methods that convert from
|
2
|
+
# data structures to specialized statsample data structures like Multiset.
|
3
|
+
module Daru
|
4
|
+
class Vector
|
5
|
+
def histogram(bins=10)
|
6
|
+
type == :numeric or raise TypeError, "Only numeric Vectors can do this operation."
|
7
|
+
|
8
|
+
if bins.is_a? Array
|
9
|
+
h = Statsample::Histogram.alloc(bins)
|
10
|
+
else
|
11
|
+
# ugly patch. The upper limit for a bin has the form
|
12
|
+
# x < range
|
13
|
+
#h=Statsample::Histogram.new(self, bins)
|
14
|
+
valid = only_valid
|
15
|
+
min,max=Statsample::Util.nice(valid.min,valid.max)
|
16
|
+
# fix last data
|
17
|
+
if max == valid.max
|
18
|
+
max += 1e-10
|
19
|
+
end
|
20
|
+
h = Statsample::Histogram.alloc(bins,[min,max])
|
21
|
+
# Fix last bin
|
22
|
+
end
|
23
|
+
|
24
|
+
h.increment(valid)
|
25
|
+
h
|
26
|
+
end
|
27
|
+
|
28
|
+
# Variance of p, according to population size
|
29
|
+
def variance_proportion(n_poblation, v=1)
|
30
|
+
Statsample::proportion_variance_sample(self.proportion(v), @valid_data.size, n_poblation)
|
31
|
+
end
|
32
|
+
|
33
|
+
# Variance of the total, according to population size
|
34
|
+
def variance_total(n_poblation, v=1)
|
35
|
+
Statsample::total_variance_sample(self.proportion(v), @valid_data.size, n_poblation)
|
36
|
+
end
|
37
|
+
|
38
|
+
def proportion_confidence_interval_t(n_poblation,margin=0.95,v=1)
|
39
|
+
Statsample::proportion_confidence_interval_t(proportion(v), @valid_data.size, n_poblation, margin)
|
40
|
+
end
|
41
|
+
|
42
|
+
def proportion_confidence_interval_z(n_poblation,margin=0.95,v=1)
|
43
|
+
Statsample::proportion_confidence_interval_z(proportion(v), @valid_data.size, n_poblation, margin)
|
44
|
+
end
|
45
|
+
end
|
46
|
+
|
47
|
+
class DataFrame
|
48
|
+
def crosstab(v1,v2,opts={})
|
49
|
+
Statsample::Crosstab.new(self[v1], self[v2],opts)
|
50
|
+
end
|
51
|
+
|
52
|
+
# Functions for converting to Statsample::Multiset
|
53
|
+
def to_multiset_by_split(*vecs)
|
54
|
+
require 'statsample/multiset'
|
55
|
+
|
56
|
+
if vecs.size == 1
|
57
|
+
to_multiset_by_split_one_field(vecs[0])
|
58
|
+
else
|
59
|
+
to_multiset_by_split_multiple_fields(*vecs)
|
60
|
+
end
|
61
|
+
end
|
62
|
+
# Creates a Statsample::Multiset, using one field
|
63
|
+
|
64
|
+
def to_multiset_by_split_one_field(field)
|
65
|
+
raise ArgumentError,"Should use a correct field name" if
|
66
|
+
!@vectors.include? field
|
67
|
+
|
68
|
+
factors = self[field].factors
|
69
|
+
ms = Statsample::Multiset.new_empty_vectors(@vectors.to_a, factors)
|
70
|
+
each_row do |row|
|
71
|
+
ms[row[field]].add_row(row)
|
72
|
+
end
|
73
|
+
#puts "Ingreso a los dataset"
|
74
|
+
ms.datasets.each do |k,ds|
|
75
|
+
ds.update
|
76
|
+
ds.rename self[field].index_of(k)
|
77
|
+
end
|
78
|
+
|
79
|
+
ms
|
80
|
+
end
|
81
|
+
|
82
|
+
def to_multiset_by_split_multiple_fields(*fields)
|
83
|
+
fields.map!(&:to_sym)
|
84
|
+
factors_total=nil
|
85
|
+
fields.each do |f|
|
86
|
+
if factors_total.nil?
|
87
|
+
factors_total = self[f].factors.collect { |c| [c] }
|
88
|
+
else
|
89
|
+
suma = []
|
90
|
+
factors = self[f].factors
|
91
|
+
factors_total.each do |f1|
|
92
|
+
factors.each do |f2|
|
93
|
+
suma.push(f1+[f2])
|
94
|
+
end
|
95
|
+
end
|
96
|
+
factors_total = suma
|
97
|
+
end
|
98
|
+
end
|
99
|
+
ms = Statsample::Multiset.new_empty_vectors(vectors.to_a, factors_total)
|
100
|
+
|
101
|
+
p1 = eval "Proc.new {|c| ms[["+fields.collect{|f| "c['#{f}'.to_sym]"}.join(",")+"]].add_row(c) }"
|
102
|
+
each_row { |r| p1.call(r) }
|
103
|
+
|
104
|
+
ms.datasets.each do |k,ds|
|
105
|
+
ds.update
|
106
|
+
ds.rename(
|
107
|
+
fields.size.times.map do |i|
|
108
|
+
f = fields[i]
|
109
|
+
sk = k[i]
|
110
|
+
self[f].index_of(sk)
|
111
|
+
end.join("-")
|
112
|
+
)
|
113
|
+
end
|
114
|
+
ms
|
115
|
+
end
|
116
|
+
end
|
117
|
+
end
|
data/lib/statsample/dataset.rb
CHANGED
@@ -2,9 +2,11 @@ require 'statsample/vector'
|
|
2
2
|
|
3
3
|
class Hash
|
4
4
|
# Creates a Statsample::Dataset based on a Hash
|
5
|
-
def
|
5
|
+
def to_dataframe(*args)
|
6
6
|
Statsample::Dataset.new(self, *args)
|
7
7
|
end
|
8
|
+
|
9
|
+
alias :to_dataset :to_dataframe
|
8
10
|
end
|
9
11
|
|
10
12
|
class Array
|
@@ -17,990 +19,116 @@ class Array
|
|
17
19
|
end
|
18
20
|
|
19
21
|
module Statsample
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
22
|
+
# == Deprecation Warning
|
23
|
+
#
|
24
|
+
# This class will soon be replaced by Daru::DataFrame in the
|
25
|
+
# next release. Please see the daru docs at https://github.com/v0dro/daru
|
26
|
+
# for more details
|
27
|
+
class Dataset < Daru::DataFrame
|
28
|
+
# Ordered ids of vectors
|
29
|
+
def fields
|
30
|
+
$stderr.puts "WARNING: Statsample::Dataset and Statsample::Vector have been deprecated in favor of Daru::DataFrame and Daru::Vector. Please switch to using Daru::DataFrame#vectors.\n"
|
31
|
+
@vectors.to_a
|
30
32
|
end
|
31
|
-
end
|
32
|
-
# Set of cases with values for one or more variables,
|
33
|
-
# analog to a dataframe on R or a standard data file of SPSS.
|
34
|
-
# Every vector has <tt>#field</tt> name, which represent it. By default,
|
35
|
-
# the vectors are ordered by it field name, but you can change it
|
36
|
-
# the fields order manually.
|
37
|
-
# The Dataset work as a Hash, with keys are field names
|
38
|
-
# and values are Statsample::Vector
|
39
|
-
#
|
40
|
-
#
|
41
|
-
# ==Usage
|
42
|
-
# Create a empty dataset:
|
43
|
-
# Dataset.new()
|
44
|
-
# Create a dataset with three empty vectors, called <tt>v1</tt>, <tt>v2</tt> and <tt>v3</tt>:
|
45
|
-
# Dataset.new(%w{v1 v2 v3})
|
46
|
-
# Create a dataset with two vectors, called <tt>v1</tt>
|
47
|
-
# and <tt>v2</tt>:
|
48
|
-
# Dataset.new({'v1'=>%w{1 2 3}.to_vector, 'v2'=>%w{4 5 6}.to_vector})
|
49
|
-
# Create a dataset with two given vectors (v1 and v2),
|
50
|
-
# with vectors on inverted order:
|
51
|
-
# Dataset.new({'v2'=>v2,'v1'=>v1},['v2','v1'])
|
52
|
-
#
|
53
|
-
# The fast way to create a dataset uses Hash#to_dataset, with
|
54
|
-
# field order as arguments
|
55
|
-
# v1 = [1,2,3].to_numeric
|
56
|
-
# v2 = [1,2,3].to_numeric
|
57
|
-
# ds = {'v1'=>v2, 'v2'=>v2}.to_dataset(%w{v2 v1})
|
58
33
|
|
59
|
-
|
60
|
-
|
61
|
-
include Summarizable
|
62
|
-
# Hash of Statsample::Vector
|
63
|
-
attr_reader :vectors
|
64
|
-
# Ordered ids of vectors
|
65
|
-
attr_reader :fields
|
66
|
-
# Name of dataset
|
67
|
-
attr_accessor :name
|
68
|
-
# Number of cases
|
69
|
-
attr_reader :cases
|
70
|
-
# Location of pointer on enumerations methods (like #each)
|
71
|
-
attr_reader :i
|
34
|
+
def name= new_name
|
35
|
+
$stderr.puts "WARNING: Statsample::Dataset and Statsample::Vector have been deprecated in favor of Daru::DataFrame and Daru::Vector. Please switch to using Daru::DataFrame#rename.\n"
|
72
36
|
|
73
|
-
|
74
|
-
# - Rows
|
75
|
-
# - Columns
|
76
|
-
# - Values
|
77
|
-
#
|
78
|
-
# For example, you have these values
|
79
|
-
#
|
80
|
-
# x y v
|
81
|
-
# a a 0
|
82
|
-
# a b 1
|
83
|
-
# b a 1
|
84
|
-
# b b 0
|
85
|
-
#
|
86
|
-
# You obtain
|
87
|
-
# id a b
|
88
|
-
# a 0 1
|
89
|
-
# b 1 0
|
90
|
-
#
|
91
|
-
# Useful to process outputs from databases
|
92
|
-
def self.crosstab_by_asignation(rows,columns,values)
|
93
|
-
raise "Three vectors should be equal size" if rows.size!=columns.size or rows.size!=values.size
|
94
|
-
cols_values=columns.factors
|
95
|
-
cols_n=cols_values.size
|
96
|
-
h_rows=rows.factors.inject({}){|a,v| a[v]=cols_values.inject({}){
|
97
|
-
|a1,v1| a1[v1]=nil; a1
|
98
|
-
}
|
99
|
-
;a}
|
100
|
-
values.each_index{|i|
|
101
|
-
h_rows[rows[i]][columns[i]]=values[i]
|
102
|
-
}
|
103
|
-
ds=Dataset.new(["_id"]+cols_values)
|
104
|
-
cols_values.each{|c|
|
105
|
-
ds[c].type=values.type
|
106
|
-
}
|
107
|
-
rows.factors.each {|row|
|
108
|
-
n_row=Array.new(cols_n+1)
|
109
|
-
n_row[0]=row
|
110
|
-
cols_values.each_index {|i|
|
111
|
-
n_row[i+1]=h_rows[row][cols_values[i]]
|
112
|
-
}
|
113
|
-
ds.add_case_array(n_row)
|
114
|
-
}
|
115
|
-
ds.update_valid_data
|
116
|
-
ds
|
37
|
+
rename new_name
|
117
38
|
end
|
118
|
-
#
|
119
|
-
def
|
120
|
-
|
39
|
+
# Number of cases
|
40
|
+
def cases
|
41
|
+
$stderr.puts "WARNING: Statsample::Dataset and Statsample::Vector have been deprecated in favor of Daru::DataFrame and Daru::Vector. Please switch to using Daru::DataFrame#nrows.\n"
|
42
|
+
|
43
|
+
nrows
|
121
44
|
end
|
122
|
-
|
123
|
-
#
|
124
|
-
#
|
125
|
-
#
|
126
|
-
#
|
127
|
-
#
|
128
|
-
|
129
|
-
|
130
|
-
|
131
|
-
|
132
|
-
current=out
|
133
|
-
# Create tree
|
134
|
-
tree_keys[0,tree_keys.size-1].each do |f|
|
135
|
-
root=row[f]
|
136
|
-
current[root]||=Hash.new
|
137
|
-
current=current[root]
|
138
|
-
end
|
139
|
-
name=row[tree_keys.last]
|
140
|
-
if !block
|
141
|
-
current[name]||=Array.new
|
142
|
-
current[name].push(row.delete_if{|key,value| tree_keys.include? key})
|
143
|
-
else
|
144
|
-
current[name]=block.call(row, current,name)
|
145
|
-
end
|
146
|
-
end
|
147
|
-
out
|
45
|
+
|
46
|
+
# == Deprecation Warning
|
47
|
+
#
|
48
|
+
# This class will soon be replaced by Daru::DataFrame in the
|
49
|
+
# next release. Use Daru::DataFrame.crosstab_by_assignation
|
50
|
+
# for the same effect. Please see the daru docs at
|
51
|
+
# https://github.com/v0dro/daru for more details.
|
52
|
+
def self.crosstab_by_assignation(rows,columns,values)
|
53
|
+
ds = super(rows, columns, values)
|
54
|
+
Dataset.new ds.to_hash
|
148
55
|
end
|
149
|
-
|
150
|
-
#
|
151
|
-
#
|
152
|
-
#
|
153
|
-
#
|
154
|
-
#
|
155
|
-
# [fields] Array of names for vectors. Is only used for set the
|
156
|
-
# order of variables. If empty, vector keys in alphabetical order are
|
157
|
-
# used as fields.
|
56
|
+
|
57
|
+
# == Deprecation Warning
|
58
|
+
#
|
59
|
+
# This class will soon be replaced by Daru::DataFrame in the
|
60
|
+
# next release. Use Daru::DataFrame.new for the same effect.
|
61
|
+
# Please see the daru docs at https://github.com/v0dro/daru for more details.
|
158
62
|
def initialize(vectors={}, fields=[])
|
159
|
-
|
160
|
-
@@n_dataset+=1
|
161
|
-
@name=_("Dataset %d") % @@n_dataset
|
162
|
-
@cases=0
|
163
|
-
@gsl=nil
|
164
|
-
@i=nil
|
63
|
+
$stderr.puts "WARNING: Statsample::Dataset and Statsample::Vector have been deprecated in favor of Daru::DataFrame and Daru::Vector. Please switch to using that.\n"
|
165
64
|
|
166
65
|
if vectors.instance_of? Array
|
167
66
|
@fields=vectors.dup
|
168
|
-
|
67
|
+
super({}, order: @fields.map { |e| e.respond_to?(:to_sym) ? e.to_sym : e })
|
169
68
|
else
|
170
69
|
# Check vectors
|
171
|
-
@vectors=
|
172
|
-
|
173
|
-
|
174
|
-
check_length
|
175
|
-
end
|
176
|
-
end
|
177
|
-
#
|
178
|
-
# Creates a copy of the given dataset, deleting all the cases with
|
179
|
-
# missing data on one of the vectors.
|
180
|
-
#
|
181
|
-
# @param array of fields to include. No value include all fields
|
182
|
-
#
|
183
|
-
def dup_only_valid(*fields_to_include)
|
184
|
-
if fields_to_include.size==1 and fields_to_include[0].is_a? Array
|
185
|
-
fields_to_include=fields_to_include[0]
|
186
|
-
end
|
187
|
-
fields_to_include=@fields if fields_to_include.size==0
|
188
|
-
if fields_to_include.any? {|f| @vectors[f].has_missing_data?}
|
189
|
-
ds=Dataset.new(fields_to_include)
|
190
|
-
fields_to_include.each {|f| ds[f].type=@vectors[f].type}
|
191
|
-
each {|row|
|
192
|
-
unless fields_to_include.any? {|f| @vectors[f].has_missing_data? and !@vectors[f].is_valid? row[f]}
|
193
|
-
row_2=fields_to_include.inject({}) {|ac,v| ac[v]=row[v]; ac}
|
194
|
-
ds.add_case(row_2)
|
195
|
-
end
|
196
|
-
}
|
197
|
-
else
|
198
|
-
ds=dup fields_to_include
|
199
|
-
end
|
200
|
-
ds.name= self.name
|
201
|
-
ds
|
202
|
-
end
|
203
|
-
#
|
204
|
-
# Returns a duplicate of the Dataset.
|
205
|
-
# All vectors are copied, so any modification on new
|
206
|
-
# dataset doesn't affect original dataset's vectors.
|
207
|
-
# If fields given as parameter, only include those vectors.
|
208
|
-
#
|
209
|
-
# @param array of fields to include. No value include all fields
|
210
|
-
# @return {Statsample::Dataset}
|
211
|
-
def dup(*fields_to_include)
|
212
|
-
if fields_to_include.size==1 and fields_to_include[0].is_a? Array
|
213
|
-
fields_to_include=fields_to_include[0]
|
214
|
-
end
|
215
|
-
fields_to_include=@fields if fields_to_include.size==0
|
216
|
-
vectors={}
|
217
|
-
fields=[]
|
218
|
-
fields_to_include.each{|f|
|
219
|
-
raise "Vector #{f} doesn't exists" unless @vectors.has_key? f
|
220
|
-
vectors[f]=@vectors[f].dup
|
221
|
-
fields.push(f)
|
222
|
-
}
|
223
|
-
ds=Dataset.new(vectors,fields)
|
224
|
-
ds.name= self.name
|
225
|
-
ds
|
226
|
-
end
|
227
|
-
|
228
|
-
|
229
|
-
# Returns an array with the fields from first argument to last argument
|
230
|
-
def from_to(from,to)
|
231
|
-
raise ArgumentError, "Field #{from} should be on dataset" if !@fields.include? from
|
232
|
-
raise ArgumentError, "Field #{to} should be on dataset" if !@fields.include? to
|
233
|
-
@fields.slice(@fields.index(from)..@fields.index(to))
|
234
|
-
end
|
235
|
-
|
236
|
-
# Returns (when possible) a cheap copy of dataset.
|
237
|
-
# If no vector have missing values, returns original vectors.
|
238
|
-
# If missing values presents, uses Dataset.dup_only_valid.
|
239
|
-
#
|
240
|
-
# @param array of fields to include. No value include all fields
|
241
|
-
# @return {Statsample::Dataset}
|
242
|
-
def clone_only_valid(*fields_to_include)
|
243
|
-
if fields_to_include.size==1 and fields_to_include[0].is_a? Array
|
244
|
-
fields_to_include=fields_to_include[0]
|
245
|
-
end
|
246
|
-
fields_to_include=@fields.dup if fields_to_include.size==0
|
247
|
-
if fields_to_include.any? {|v| @vectors[v].has_missing_data?}
|
248
|
-
dup_only_valid(fields_to_include)
|
249
|
-
else
|
250
|
-
clone(fields_to_include)
|
251
|
-
end
|
252
|
-
end
|
253
|
-
# Returns a shallow copy of Dataset.
|
254
|
-
# Object id will be distinct, but @vectors will be the same.
|
255
|
-
# @param array of fields to include. No value include all fields
|
256
|
-
# @return {Statsample::Dataset}
|
257
|
-
def clone(*fields_to_include)
|
258
|
-
if fields_to_include.size==1 and fields_to_include[0].is_a? Array
|
259
|
-
fields_to_include=fields_to_include[0]
|
260
|
-
end
|
261
|
-
fields_to_include=@fields.dup if fields_to_include.size==0
|
262
|
-
ds=Dataset.new
|
263
|
-
fields_to_include.each{|f|
|
264
|
-
raise "Vector #{f} doesn't exists" unless @vectors.has_key? f
|
265
|
-
ds[f]=@vectors[f]
|
266
|
-
}
|
267
|
-
ds.fields=fields_to_include
|
268
|
-
ds.name=@name
|
269
|
-
ds.update_valid_data
|
270
|
-
ds
|
271
|
-
end
|
272
|
-
# Creates a copy of the given dataset, without data on vectors
|
273
|
-
#
|
274
|
-
# @return {Statsample::Dataset}
|
275
|
-
def dup_empty
|
276
|
-
vectors=@vectors.inject({}) {|a,v|
|
277
|
-
a[v[0]]=v[1].dup_empty
|
278
|
-
a
|
279
|
-
}
|
280
|
-
Dataset.new(vectors,@fields.dup)
|
281
|
-
end
|
282
|
-
# Merge vectors from two datasets
|
283
|
-
# In case of name collision, the vector names are changed to
|
284
|
-
# x_1, x_2 ....
|
285
|
-
#
|
286
|
-
# @return {Statsample::Dataset}
|
287
|
-
def merge(other_ds)
|
288
|
-
raise "Cases should be equal (this:#{@cases}; other:#{other_ds.cases}" unless @cases==other_ds.cases
|
289
|
-
types = @fields.collect{|f| @vectors[f].type} + other_ds.fields.collect{|f| other_ds[f].type}
|
290
|
-
new_fields = (@fields+other_ds.fields).recode_repeated
|
291
|
-
ds_new=Statsample::Dataset.new(new_fields)
|
292
|
-
new_fields.each_index{|i|
|
293
|
-
field=new_fields[i]
|
294
|
-
ds_new[field].type=types[i]
|
295
|
-
}
|
296
|
-
@cases.times {|i|
|
297
|
-
row=case_as_array(i)+other_ds.case_as_array(i)
|
298
|
-
ds_new.add_case_array(row)
|
299
|
-
}
|
300
|
-
ds_new.update_valid_data
|
301
|
-
ds_new
|
302
|
-
end
|
303
|
-
|
304
|
-
# Join 2 Datasets by given fields
|
305
|
-
# type is one of :left and :inner, default is :left
|
306
|
-
#
|
307
|
-
# @return {Statsample::Dataset}
|
308
|
-
def join(other_ds,fields_1=[],fields_2=[],type=:left)
|
309
|
-
fields_new = other_ds.fields - fields_2
|
310
|
-
fields = self.fields + fields_new
|
311
|
-
|
312
|
-
other_ds_hash = {}
|
313
|
-
other_ds.each do |row|
|
314
|
-
key = row.select{|k,v| fields_2.include?(k)}.values
|
315
|
-
value = row.select{|k,v| fields_new.include?(k)}
|
316
|
-
if other_ds_hash[key].nil?
|
317
|
-
other_ds_hash[key] = [value]
|
318
|
-
else
|
319
|
-
other_ds_hash[key] << value
|
70
|
+
@vectors = {}
|
71
|
+
vectors.each do |k,v|
|
72
|
+
@vectors[k.respond_to?(:to_sym) ? k.to_sym : k] = v
|
320
73
|
end
|
74
|
+
@fields = fields
|
75
|
+
super @vectors, order: @fields.map { |e| e.respond_to?(:to_sym) ? e.to_sym : e }
|
321
76
|
end
|
322
|
-
|
323
|
-
new_ds = Dataset.new(fields)
|
324
|
-
|
325
|
-
self.each do |row|
|
326
|
-
key = row.select{|k,v| fields_1.include?(k)}.values
|
327
|
-
|
328
|
-
new_case = row.dup
|
329
|
-
|
330
|
-
if other_ds_hash[key].nil?
|
331
|
-
if type == :left
|
332
|
-
fields_new.each{|field| new_case[field] = nil}
|
333
|
-
new_ds.add_case(new_case)
|
334
|
-
end
|
335
|
-
else
|
336
|
-
other_ds_hash[key].each do |new_values|
|
337
|
-
new_ds.add_case new_case.merge(new_values)
|
338
|
-
end
|
339
|
-
end
|
340
|
-
|
341
|
-
end
|
342
|
-
new_ds
|
343
77
|
end
|
344
|
-
# Returns a dataset with standardized data.
|
345
|
-
#
|
346
|
-
# @return {Statsample::Dataset}
|
347
|
-
def standarize
|
348
|
-
ds=dup()
|
349
|
-
ds.fields.each do |f|
|
350
|
-
ds[f]=ds[f].vector_standarized
|
351
|
-
end
|
352
|
-
ds
|
353
|
-
end
|
354
|
-
# Generate a matrix, based on fields of dataset
|
355
|
-
#
|
356
|
-
# @return {::Matrix}
|
357
78
|
|
358
|
-
def
|
359
|
-
|
360
|
-
@fields.collect{|col|
|
361
|
-
yield row,col
|
362
|
-
}
|
363
|
-
}
|
364
|
-
Matrix.rows(rows)
|
79
|
+
def from_to(from,to)
|
80
|
+
raise NoMethodError, "This method is no longer supported. To see the vector index use Daru::DataFrame#vectors"
|
365
81
|
end
|
366
82
|
|
367
|
-
# We have the same datasets if +vectors+ and +fields+ are the same
|
368
|
-
#
|
369
|
-
# @return {Boolean}
|
370
|
-
def ==(d2)
|
371
|
-
@vectors==d2.vectors and @fields==d2.fields
|
372
|
-
end
|
373
|
-
# Returns vector <tt>c</tt>
|
374
|
-
#
|
375
|
-
# @return {Statsample::Vector}
|
376
|
-
def col(c)
|
377
|
-
@vectors[c]
|
378
|
-
end
|
379
|
-
alias_method :vector, :col
|
380
|
-
# Equal to Dataset[<tt>name</tt>]=<tt>vector</tt>
|
381
|
-
#
|
382
|
-
# @return self
|
383
83
|
def add_vector(name, vector)
|
384
|
-
raise
|
385
|
-
@vectors[name]=vector
|
386
|
-
check_order
|
387
|
-
self
|
388
|
-
end
|
389
|
-
# Returns true if dataset have vector <tt>v</tt>.
|
390
|
-
#
|
391
|
-
# @return {Boolean}
|
392
|
-
def has_vector? (v)
|
393
|
-
return @vectors.has_key?(v)
|
394
|
-
end
|
395
|
-
# Creates a dataset with the random data, of a n size
|
396
|
-
# If n not given, uses original number of cases.
|
397
|
-
#
|
398
|
-
# @return {Statsample::Dataset}
|
399
|
-
def bootstrap(n=nil)
|
400
|
-
n||=@cases
|
401
|
-
ds_boot=dup_empty
|
402
|
-
n.times do
|
403
|
-
ds_boot.add_case_array(case_as_array(rand(n)))
|
404
|
-
end
|
405
|
-
ds_boot.update_valid_data
|
406
|
-
ds_boot
|
84
|
+
raise NoMethodError, "Deprecated. Use Daru::DataFrame#[]= directly."
|
407
85
|
end
|
408
|
-
|
409
|
-
# Can only add one case and no error check is performed
|
410
|
-
# You SHOULD use #update_valid_data at the end of insertion cycle
|
411
|
-
#
|
412
|
-
#
|
86
|
+
|
413
87
|
def add_case_array(v)
|
414
|
-
|
88
|
+
raise NoMethodError, "Deprecated. Use Daru::DataFrame#add_row instead."
|
415
89
|
end
|
416
|
-
# Insert a case, using:
|
417
|
-
# * Array: size equal to number of vectors and values in the same order as fields
|
418
|
-
# * Hash: keys equal to fields
|
419
|
-
# If uvd is false, #update_valid_data is not executed after
|
420
|
-
# inserting a case. This is very useful if you want to increase the
|
421
|
-
# performance on inserting many cases, because #update_valid_data
|
422
|
-
# performs check on vectors and on the dataset
|
423
90
|
|
424
91
|
def add_case(v,uvd=true)
|
425
|
-
|
426
|
-
when Array
|
427
|
-
if (v[0].is_a? Array)
|
428
|
-
v.each{|subv| add_case(subv,false)}
|
429
|
-
else
|
430
|
-
raise ArgumentError, "Input array size (#{v.size}) should be equal to fields number (#{@fields.size})" if @fields.size!=v.size
|
431
|
-
v.each_index {|i| @vectors[@fields[i]].add(v[i],false)}
|
432
|
-
end
|
433
|
-
when Hash
|
434
|
-
raise ArgumentError, "Hash keys should be equal to fields #{(v.keys - @fields).join(",")}" if @fields.sort!=v.keys.sort
|
435
|
-
@fields.each{|f| @vectors[f].add(v[f],false)}
|
436
|
-
else
|
437
|
-
raise TypeError, 'Value must be a Array or a Hash'
|
438
|
-
end
|
439
|
-
if uvd
|
440
|
-
update_valid_data
|
441
|
-
end
|
442
|
-
end
|
443
|
-
# Check vectors and fields after inserting data. Use only
|
444
|
-
# after #add_case_array or #add_case with second parameter to false
|
445
|
-
def update_valid_data
|
446
|
-
@gsl=nil
|
447
|
-
@fields.each{|f| @vectors[f].set_valid_data}
|
448
|
-
check_length
|
449
|
-
end
|
450
|
-
# Delete vector named +name+. Multiple fields accepted.
|
451
|
-
def delete_vector(*args)
|
452
|
-
if args.size==1 and args[0].is_a? Array
|
453
|
-
names=args[0]
|
454
|
-
else
|
455
|
-
names=args
|
456
|
-
end
|
457
|
-
names.each do |name|
|
458
|
-
@fields.delete(name)
|
459
|
-
@vectors.delete(name)
|
460
|
-
end
|
461
|
-
end
|
462
|
-
|
463
|
-
def add_vectors_by_split_recode(name_,join='-',sep=Statsample::SPLIT_TOKEN)
|
464
|
-
split=@vectors[name_].split_by_separator(sep)
|
465
|
-
i=1
|
466
|
-
split.each{|k,v|
|
467
|
-
new_field=name_+join+i.to_s
|
468
|
-
v.name=name_+":"+k
|
469
|
-
add_vector(new_field,v)
|
470
|
-
i+=1
|
471
|
-
}
|
472
|
-
end
|
473
|
-
def add_vectors_by_split(name,join='-',sep=Statsample::SPLIT_TOKEN)
|
474
|
-
split=@vectors[name].split_by_separator(sep)
|
475
|
-
split.each{|k,v|
|
476
|
-
add_vector(name+join+k,v)
|
477
|
-
}
|
478
|
-
end
|
479
|
-
|
480
|
-
def vector_by_calculation(type=:numeric)
|
481
|
-
a=[]
|
482
|
-
each do |row|
|
483
|
-
a.push(yield(row))
|
484
|
-
end
|
485
|
-
a.to_vector(type)
|
486
|
-
end
|
487
|
-
# Returns a vector with the sum of the fields
|
488
|
-
# if fields parameter is empty, sum all fields
|
489
|
-
def vector_sum(fields=nil)
|
490
|
-
fields||=@fields
|
491
|
-
vector=collect_with_index do |row, i|
|
492
|
-
if(fields.find{|f| !@vectors[f].data_with_nils[i]})
|
493
|
-
nil
|
494
|
-
else
|
495
|
-
fields.inject(0) {|ac,v| ac + row[v].to_f}
|
496
|
-
end
|
497
|
-
end
|
498
|
-
vector.name=_("Sum from %s") % @name
|
499
|
-
vector
|
500
|
-
end
|
501
|
-
# Check if #fields attribute is correct, after inserting or deleting vectors
|
502
|
-
def check_fields(fields)
|
503
|
-
fields||=@fields
|
504
|
-
raise "Fields #{(fields-@fields).join(", ")} doesn't exists on dataset" if (fields-@fields).size>0
|
505
|
-
fields
|
506
|
-
end
|
507
|
-
|
508
|
-
# Returns a vector holding, for each case, how many of +fields+ have a
# nil entry in +data_with_nils+. +fields+ defaults to all fields and is
# validated through check_fields.
def vector_missing_values(fields=nil)
  fields = check_fields(fields)
  collect_with_index do |_row, i|
    fields.count { |f| @vectors[f].data_with_nils[i].nil? }
  end
end
|
517
|
-
# Returns a vector holding, for each case, the total length of the
# string form (+to_s+) of every non-missing value among +fields+.
# Entries that are nil in +data_with_nils+ contribute 0.
def vector_count_characters(fields=nil)
  fields = check_fields(fields)
  collect_with_index do |row, i|
    fields.inject(0) do |total, f|
      if @vectors[f].data_with_nils[i].nil?
        total
      else
        total + row[f].to_s.size
      end
    end
  end
end
|
525
|
-
# Returns a vector with the per-case mean of a set of fields.
# +fields+::      names to average; nil means all fields.
# +max_invalid+:: a case with more than this many missing values (nil in
#                 +data_with_nils+) yields nil. Otherwise the mean is taken
#                 over the valid values only.
# A case with no valid value at all also yields nil.
def vector_mean(fields=nil, max_invalid=0)
  fields = check_fields(fields)
  size = fields.size
  means = []
  each_with_index do |row, i|
    sum = 0
    invalids = 0
    fields.each do |f|
      if @vectors[f].data_with_nils[i].nil?
        invalids += 1
      else
        sum += row[f].to_f
      end
    end
    valid = size - invalids
    # valid.zero? guard: with no valid value +sum+ is still Integer 0 and
    # 0.quo(0) would raise ZeroDivisionError.
    if invalids > max_invalid || valid.zero?
      means.push(nil)
    else
      means.push(sum.quo(valid))
    end
  end
  means = means.to_vector(:numeric)
  means.name = _("Means from %s") % @name
  means
end
|
554
|
-
# Verifies that every value in +@vectors+ is a Statsample::Vector and
# that all of them share the same size, then stores that size in
# +@cases+ (nil when there are no vectors). Raises Exception otherwise.
def check_length # :nodoc:
  expected = nil
  @vectors.each do |key, vector|
    unless vector.is_a? Statsample::Vector
      raise Exception, "Data #{vector.class} is not a vector on key #{key}"
    end
    # The first vector seen fixes the reference size.
    expected = vector.size if expected.nil?
    next if vector.size == expected
    raise Exception, "Vector #{key} have size #{vector.size} and dataset have size #{expected}"
  end
  @cases = expected
end
|
569
|
-
# Retrieves each vector as [key, vector]
|
570
|
-
def each_vector # :yield: |key, vector|
|
571
|
-
@fields.each{|k| yield k, @vectors[k]}
|
92
|
+
raise NoMethodError, "Deprecated. Use Daru::DataFrame#add_row instead."
|
572
93
|
end
|
573
94
|
|
574
|
-
|
575
|
-
|
576
|
-
Statsample::STATSAMPLE__.case_as_hash(self,c)
|
577
|
-
end
|
578
|
-
else
|
579
|
-
# Retrieves case i as a hash
|
580
|
-
def case_as_hash(i)
|
581
|
-
_case_as_hash(i)
|
582
|
-
end
|
95
|
+
# Deprecated entry point kept only to fail loudly: always raises
# NoMethodError with a message pointing at the Daru replacement.
def update_valid_data
  hint = "Deprecated. Use Daru::DataFrame#update instead. Also see Daru.lazy_update in the daru docs."
  raise NoMethodError, hint
end
|
584
98
|
|
585
|
-
|
586
|
-
|
587
|
-
Statsample::STATSAMPLE__.case_as_array(self,c)
|
588
|
-
end
|
589
|
-
else
|
590
|
-
# Retrieves case i as a array, ordered on #fields order
|
591
|
-
def case_as_array(i)
|
592
|
-
_case_as_array(i)
|
593
|
-
end
|
594
|
-
end
|
595
|
-
# Builds a Hash for case +c+, mapping each field name in +@fields+ to
# the value at index +c+ of the matching vector.
def _case_as_hash(c) # :nodoc:
  @fields.each_with_object({}) do |field, row|
    row[field] = @vectors[field][c]
  end
end
|
598
|
-
def _case_as_array(c) # :nodoc:
|
599
|
-
@fields.collect {|x| @vectors[x][c]}
|
99
|
+
# Deprecated entry point kept only to fail loudly: always raises
# NoMethodError with a message pointing at the Daru replacement.
def each_array
  hint = "Deprecated. Use Daru::DataFrame#each_row instead."
  raise NoMethodError, hint
end
|
601
102
|
|
602
|
-
|
603
|
-
|
604
|
-
begin
|
605
|
-
@i=0
|
606
|
-
@cases.times {|i|
|
607
|
-
@i=i
|
608
|
-
row=case_as_hash(i)
|
609
|
-
yield row
|
610
|
-
}
|
611
|
-
@i=nil
|
612
|
-
rescue =>e
|
613
|
-
raise DatasetException.new(self, e)
|
614
|
-
end
|
615
|
-
end
|
103
|
+
def fields=(f)
|
104
|
+
$stderr.puts "WARNING: Deprecated. Use Daru::DataFrame#reindex_vectors! instead.\n"
|
616
105
|
|
617
|
-
|
618
|
-
def each_with_index # :yield: |case, i|
|
619
|
-
begin
|
620
|
-
@i=0
|
621
|
-
@cases.times{|i|
|
622
|
-
@i=i
|
623
|
-
row=case_as_hash(i)
|
624
|
-
yield row, i
|
625
|
-
}
|
626
|
-
@i=nil
|
627
|
-
rescue =>e
|
628
|
-
raise DatasetException.new(self, e)
|
629
|
-
end
|
106
|
+
reindex_vectors! f
|
630
107
|
end
|
631
108
|
|
632
|
-
# Yields each case as an Array ordered like +fields+, reading values from
# each vector's +data_with_nils+ (so missing values arrive as nil).
# While iterating, +@i+ holds the current case index; it is reset to nil
# afterwards.
def each_array_with_nils
  width = fields.size
  @cases.times do |i|
    @i = i
    yield Array.new(width) { |j| @vectors[fields[j]].data_with_nils[i] }
  end
  @i = nil
end
|
646
|
-
# Yields each case as an Array (via case_as_array).
# While iterating, +@i+ holds the current case index; it is reset to nil
# afterwards.
def each_array
  (0...@cases).each do |index|
    @i = index
    yield case_as_array(index)
  end
  @i = nil
end
|
655
|
-
# Sets the field order to +f+, then runs check_order to reconcile the
# list with the keys of +@vectors+.
def fields=(f)
  @fields = f
  check_order
end
|
661
|
-
# Reconciles +@fields+ with the keys of +@vectors+.
# When both hold the same names nothing changes and nil is returned.
# Otherwise +@fields+ keeps, in its current order, the names that exist
# in +@vectors+, followed by the remaining vector keys in sorted order.
def check_order #:nodoc:
  keys = @vectors.keys
  return if keys.sort == @fields.sort
  known = @fields & keys
  @fields = known + (keys.sort - known)
end
|
669
109
|
# Returns the vector(s) named +i+ by delegating to the superclass.
# +i+:: a single name, or a Range of names.
# Names that respond to +to_sym+ (e.g. Strings) are converted to Symbols
# before delegating; other keys (e.g. Integers) are passed through as-is.
# A deprecation-style warning is written to $stderr on every call.
def [](i)
  $stderr.puts "WARNING: Daru uses symbols instead of strings for naming vectors. Please switch to symbols.\n"

  if i.is_a? Range
    # Convert the end points, not the Range itself: Range has no #to_sym,
    # so calling it on +i+ raised NoMethodError for every range lookup.
    first = i.begin.respond_to?(:to_sym) ? i.begin.to_sym : i.begin
    last  = i.end.respond_to?(:to_sym) ? i.end.to_sym : i.end
    super(i.exclude_end? ? (first...last) : (first..last))
  else
    super(i.respond_to?(:to_sym) ? i.to_sym : i)
  end
end
|
681
|
-
# Yields every case to the block and wraps the block results in a new
# Statsample::Vector of the given +type+.
def collect(type=:numeric)
  results = []
  each { |row| results << yield(row) }
  Statsample::Vector.new(results, type)
end
|
690
|
-
# Like #collect, but the block also receives the case index as its
# second argument. Returns a new Statsample::Vector of the given +type+.
def collect_with_index(type=:numeric)
  results = []
  each_with_index { |row, index| results << yield(row, index) }
  Statsample::Vector.new(results, type)
end
|
698
|
-
# Rewrites vector +vector_name+ in place: for every case the block gets
# the case as a Hash (case_as_hash) and its result replaces the value at
# that index in the vector's +data+. Finishes by calling set_valid_data
# on the vector.
def recode!(vector_name)
  @cases.times do |index|
    @vectors[vector_name].data[index] = yield case_as_hash(index)
  end
  @vectors[vector_name].set_valid_data
end
|
705
121
|
|
706
|
-
def
|
707
|
-
|
708
|
-
|
709
|
-
|
710
|
-
if v.instance_of? Statsample::Vector
|
711
|
-
@vectors[i]=v
|
712
|
-
check_order
|
713
|
-
else
|
714
|
-
raise ArgumentError,"Should pass a Statsample::Vector"
|
715
|
-
end
|
716
|
-
end
|
717
|
-
# Return data as a matrix. Column are ordered by #fields and
|
718
|
-
# rows by orden of insertion
|
719
|
-
def to_matrix
|
720
|
-
rows=[]
|
721
|
-
self.each_array{|c|
|
722
|
-
rows.push(c)
|
723
|
-
}
|
724
|
-
Matrix.rows(rows)
|
122
|
+
# Stores +v+ under name +i+ by delegating to the superclass.
# Keys that respond to +to_sym+ (e.g. Strings) are converted to Symbols
# first, mirroring the conversion done by #[]; without it a value written
# with a String key could not be read back through #[].
# A deprecation-style warning is written to $stderr on every call.
def []=(i,v)
  $stderr.puts "WARNING: Daru uses symbols instead of strings for naming vectors. Please switch to symbols.\n"

  super(i.respond_to?(:to_sym) ? i.to_sym : i, v)
end
|
726
127
|
|
727
128
|
if Statsample.has_gsl?
|
728
129
|
# Retired method kept only to fail loudly: always raises NoMethodError.
def clear_gsl
  hint = "This method is no longer needed/supported."
  raise NoMethodError, hint
end
|
731
|
-
|
732
|
-
# Returns the dataset as a GSL::Matrix (cases x fields), building it on
# first use and caching it in +@gsl+.
def to_gsl
  return @gsl unless @gsl.nil?
  update_valid_data if cases.nil?
  @gsl = GSL::Matrix.alloc(cases, fields.size)
  # each_array keeps the current case index in @i while yielding.
  each_array { |row| @gsl.set_row(@i, row) }
  @gsl
end
|
744
|
-
|
745
|
-
end
|
746
|
-
|
747
|
-
# Returns the correlation matrix computed by
# Statsample::Bivariate.correlation_matrix.
# +fields+:: names to include; when given, the matrix is computed on
#            <tt>clone(fields)</tt>, otherwise on the whole dataset.
def correlation_matrix(fields = nil)
  source = fields ? clone(fields) : self
  Statsample::Bivariate.correlation_matrix(source)
end
|
757
|
-
|
758
|
-
# Returns the covariance matrix computed by
# Statsample::Bivariate.covariance_matrix.
# +fields+:: names to include; when given, the matrix is computed on
#            <tt>clone(fields)</tt>, otherwise on the whole dataset.
def covariance_matrix(fields = nil)
  source = fields ? clone(fields) : self
  Statsample::Bivariate.covariance_matrix(source)
end
|
768
|
-
|
769
|
-
# Returns a new dataset (built from dup_empty) containing only the cases
# for which the block returns a true value. The copy is named
# "<name>(filtered)".
def filter
  filtered = dup_empty
  each do |row|
    filtered.add_case(row, false) if yield row
  end
  filtered.update_valid_data
  filtered.name = _("%s(filtered)") % @name
  filtered
end
|
779
|
-
|
780
|
-
# Returns a new vector with the values of +field+ taken from the cases
# for which the block returns a true value. The result has the same type
# as the source vector.
def filter_field(field)
  kept = []
  each do |row|
    kept << row[field] if yield row
  end
  kept.to_vector(@vectors[field].type)
end
|
788
|
-
|
789
|
-
# Creates a Statsample::Multiset, using one or more fields to split the
# dataset. Dispatches to the one-field or the multiple-fields
# implementation depending on how many names are given.
def to_multiset_by_split(*fields)
  require 'statsample/multiset'
  return to_multiset_by_split_one_field(fields[0]) if fields.size == 1
  to_multiset_by_split_multiple_fields(*fields)
end
|
801
|
-
# Creates a Statsample::Multiset with one sub-dataset per factor of
# +field+. Each case is added to the sub-dataset keyed by its value of
# +field+; every sub-dataset is then named with the label of its key and
# its vectors receive the type, name and labels of the source vectors.
# Raises ArgumentError when +field+ is not one of the dataset's fields.
def to_multiset_by_split_one_field(field)
  unless @fields.include? field
    raise ArgumentError,"Should use a correct field name"
  end
  splitter = @vectors[field]
  ms = Multiset.new_empty_vectors(@fields, splitter.factors)
  each do |row|
    ms[row[field]].add_case(row, false)
  end
  ms.datasets.each do |key, subset|
    subset.update_valid_data
    subset.name = @vectors[field].labeling(key)
    subset.vectors.each do |vname, vector|
      source = @vectors[vname]
      vector.type = source.type
      vector.name = source.name
      vector.labels = source.labels
    end
  end
  ms
end
|
824
|
-
# Creates a Statsample::Multiset split by several fields at once.
# The sub-dataset keys are all combinations of the factors of +fields+
# (first field varying slowest), each key being an Array with one value
# per field. Every sub-dataset is named with the labels of its key joined
# by "-", and its vectors receive the type, name and labels of the source
# vectors.
def to_multiset_by_split_multiple_fields(*fields)
  # Cartesian product of the factors; stays nil when no field is given,
  # as before.
  factors_total = fields.inject(nil) do |combos, f|
    factors = @vectors[f].factors
    if combos.nil?
      factors.collect { |value| [value] }
    else
      combos.flat_map { |combo| factors.collect { |value| combo + [value] } }
    end
  end
  ms = Multiset.new_empty_vectors(@fields, factors_total)

  # Build the key directly from the row. The previous version eval'ed a
  # string-built Proc, which broke on field names containing a quote.
  each do |row|
    ms[fields.collect { |f| row[f] }].add_case(row, false)
  end

  ms.datasets.each do |key, subset|
    subset.update_valid_data
    subset.name = fields.each_with_index.map { |f, i| @vectors[f].labeling(key[i]) }.join("-")
    subset.vectors.each do |vname, vector|
      source = @vectors[vname]
      vector.type = source.type
      vector.name = source.name
      vector.labels = source.labels
    end
  end
  ms
end
|
860
|
-
# Returns a vector with the result of evaluating +text+ for every case.
# Field names appearing in +text+ are replaced by the value of that
# field in the current case (converted with +to_f+ for :numeric
# vectors). A case with a nil in any field of the dataset yields nil.
#
# The expression is eval'ed, so any valid Ruby is accepted.
# SECURITY: never pass untrusted input as +text+.
#
# For example:
#   a=[1,2].to_vector(scale)
#   b=[3,4].to_vector(scale)
#   ds={'a'=>a,'b'=>b}.to_dataset
#   ds.compute("a+b")
#   => Vector [4,6]
def compute(text)
  # Longest names first, so a name that is a prefix of another one
  # cannot shadow it inside the alternation.
  names = @fields.sort_by { |f| -f.to_s.size }
  pattern = /(?<!\w)(?:#{Regexp.union(names).source})(?!\w)/
  # Single pass on a copy: the caller's string is left untouched and text
  # inserted for one field is never rescanned for another field's name.
  expression = text.gsub(pattern) do |f|
    # `==`, not `=`: the old test assigned :numeric to every vector.
    @vectors[f].type == :numeric ? "row['#{f}'].to_f" : "row['#{f}']"
  end
  collect_with_index do |row, i|
    if @fields.any? { |f| @vectors[f].data_with_nils[i].nil? }
      nil
    else
      eval(expression)
    end
  end
end
|
892
|
-
# Tests every case against one or more checks and returns an Array with
# one message per failed check.
#
# Each test is indexed as <tt>[message, keys, callable]</tt>:
# +callable+ receives the case and must return a true value when the
# case is valid; +keys+ lists the fields whose values are appended to the
# message (may be empty).
#
# If the first argument is a String it names the field used to identify
# the case in the message; otherwise the first field of the dataset is
# used. Messages look like "<case number> [<id value>]: <message> (k=v, ...)",
# with case numbers starting at 1.
def verify(*tests)
  id = tests[0].is_a?(String) ? tests.shift : @fields[0]
  failures = []
  case_number = 0
  each do |row|
    case_number += 1
    tests.each do |test|
      next if test[2].call(row)
      detail = ""
      if test[1].size > 0
        detail = " (" + test[1].collect { |k| "#{k}=#{row[k]}" }.join(", ") + ")"
      end
      failures.push("#{case_number} [#{row[id]}]: #{test[0]}#{detail}")
    end
  end
  failures
end
|
919
|
-
# Short textual description: class, object id, name, field list and the
# number of cases (size of the first field's vector).
# A dataset without fields reports 0 cases instead of raising
# NoMethodError on the missing first vector.
def to_s
  first = @vectors[@fields[0]]
  cases = first.nil? ? 0 : first.size
  "#<#{self.class}:#{object_id} @name=#{@name} @fields=[#{@fields.join(",")}] cases=#{cases}"
end
|
922
|
-
# Same text as #to_s.
def inspect
  to_s
end
|
925
|
-
# Creates a new dataset for one to many relations
# on a dataset, based on pattern of field names.
#
# for example, you have a survey for number of children
# with this structure:
#   id, name, child_name_1, child_age_1, child_name_2, child_age_2
# with
#   ds.one_to_many(%w{id}, "child_%v_%n")
# the fields of the first parameter will be copied verbatim
# to the new dataset, and for the fields which match the
# pattern one case is added for each different %n that has data.
# In the pattern, %v stands for the variable name and %n for the
# repetition number; everything else is matched literally and the whole
# field name has to match. The repetition number is stored in the extra
# field "_col_id".
# For example
#   cases=[
#     ['1','george','red',10,'blue',20,nil,nil],
#     ['2','fred','green',15,'orange',30,'white',20],
#     ['3','alfred',nil,nil,nil,nil,nil,nil]
#   ]
#   ds=Statsample::Dataset.new(%w{id name car_color1 car_value1 car_color2 car_value2 car_color3 car_value3})
#   cases.each {|c| ds.add_case_array c }
#   ds.one_to_many(['id'],'car_%v%n').to_matrix
#   => Matrix[
#      ["red", "1", 10],
#      ["blue", "1", 20],
#      ["green", "2", 15],
#      ["orange", "2", 30],
#      ["white", "2", 20]
#      ]
#
def one_to_many(parent_fields, pattern)
  # Escape the literal parts and anchor both ends. Unanchored, the lazy
  # (\d+?) captured only the first digit, so "car_color10" was read as
  # repetition 1.
  source = Regexp.escape(pattern).gsub('%v') { '(.+?)' }.gsub('%n') { '(\d+?)' }
  re = Regexp.new('\A' + source + '\z')
  # Build a new list: pushing onto parent_fields changed the caller's
  # array.
  ds_vars = parent_fields + ['_col_id']
  vars = []
  max_n = 0
  h = parent_fields.inject({}) { |acc, f| acc[f] = Statsample::Vector.new([], @vectors[f].type); acc }
  # Adding _col_id
  h['_col_id'] = [].to_numeric
  @fields.each do |f|
    match = re.match(f)
    next unless match
    var = match[1]
    n = match[2].to_i
    unless vars.include? var
      vars.push(var)
      h[var] = Statsample::Vector.new([], @vectors[f].type)
    end
    max_n = n if max_n < n
  end
  ds = Dataset.new(h, ds_vars + vars)
  each do |row|
    parent_values = parent_fields.inject({}) { |acc, f| acc[f] = row[f]; acc }
    1.upto(max_n) do |n|
      row_out = parent_values.dup
      any_data = false
      vars.each do |v|
        data = row[pattern.gsub("%v", v.to_s).gsub("%n", n.to_s)]
        row_out[v] = data
        any_data = true unless data.nil?
      end
      # Repetitions without any value do not produce a case.
      next unless any_data
      row_out["_col_id"] = n
      ds.add_case(row_out, false)
    end
  end
  ds.update_valid_data
  ds
end
|
996
|
-
# Adds a section named after the dataset to report builder +b+: a line
# with the number of cases, then, for every field, an "Element:[name]"
# line followed by the parsed vector.
def report_building(b)
  b.section(:name=>@name) do |g|
    # Translate the template, then format it. `_"..." % cases` parsed as
    # `_("..." % cases)`, which looked up the already formatted string.
    g.text _("Cases: %d") % cases
    @fields.each do |f|
      g.text "Element:[#{f}]"
      g.parse_element(@vectors[f])
    end
  end
end
|
132
|
+
end
|
1005
133
|
end
|
1006
134
|
end
|