statsample 1.5.0 → 2.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.build.sh +15 -0
- data/.gitignore +1 -0
- data/.travis.yml +19 -7
- data/CONTRIBUTING.md +33 -0
- data/History.txt +5 -0
- data/README.md +41 -53
- data/benchmarks/correlation_matrix_15_variables.rb +6 -5
- data/benchmarks/correlation_matrix_5_variables.rb +6 -5
- data/benchmarks/correlation_matrix_methods/correlation_matrix.rb +23 -26
- data/examples/boxplot.rb +17 -5
- data/examples/correlation_matrix.rb +36 -7
- data/examples/dataset.rb +25 -5
- data/examples/dominance_analysis.rb +8 -7
- data/examples/dominance_analysis_bootstrap.rb +16 -11
- data/examples/histogram.rb +16 -2
- data/examples/icc.rb +5 -6
- data/examples/levene.rb +17 -3
- data/examples/multiple_regression.rb +6 -3
- data/examples/parallel_analysis.rb +11 -6
- data/examples/polychoric.rb +26 -13
- data/examples/principal_axis.rb +8 -4
- data/examples/reliability.rb +10 -10
- data/examples/scatterplot.rb +8 -0
- data/examples/t_test.rb +7 -0
- data/examples/u_test.rb +10 -2
- data/examples/vector.rb +9 -6
- data/examples/velicer_map_test.rb +12 -8
- data/lib/statsample.rb +13 -47
- data/lib/statsample/analysis/suite.rb +1 -1
- data/lib/statsample/anova/oneway.rb +6 -6
- data/lib/statsample/anova/twoway.rb +26 -24
- data/lib/statsample/bivariate.rb +78 -61
- data/lib/statsample/bivariate/pearson.rb +2 -2
- data/lib/statsample/codification.rb +45 -32
- data/lib/statsample/converter/csv.rb +15 -53
- data/lib/statsample/converter/spss.rb +6 -5
- data/lib/statsample/converters.rb +50 -211
- data/lib/statsample/crosstab.rb +26 -25
- data/lib/statsample/daru.rb +117 -0
- data/lib/statsample/dataset.rb +70 -942
- data/lib/statsample/dominanceanalysis.rb +16 -17
- data/lib/statsample/dominanceanalysis/bootstrap.rb +26 -28
- data/lib/statsample/factor/parallelanalysis.rb +17 -19
- data/lib/statsample/factor/pca.rb +21 -20
- data/lib/statsample/factor/principalaxis.rb +3 -3
- data/lib/statsample/graph/boxplot.rb +8 -16
- data/lib/statsample/graph/histogram.rb +4 -4
- data/lib/statsample/graph/scatterplot.rb +8 -7
- data/lib/statsample/histogram.rb +128 -119
- data/lib/statsample/matrix.rb +20 -16
- data/lib/statsample/multiset.rb +39 -38
- data/lib/statsample/regression.rb +3 -3
- data/lib/statsample/regression/multiple.rb +8 -10
- data/lib/statsample/regression/multiple/alglibengine.rb +96 -89
- data/lib/statsample/regression/multiple/baseengine.rb +32 -32
- data/lib/statsample/regression/multiple/gslengine.rb +33 -36
- data/lib/statsample/regression/multiple/matrixengine.rb +7 -9
- data/lib/statsample/regression/multiple/rubyengine.rb +39 -41
- data/lib/statsample/reliability.rb +23 -25
- data/lib/statsample/reliability/icc.rb +8 -7
- data/lib/statsample/reliability/multiscaleanalysis.rb +14 -12
- data/lib/statsample/reliability/scaleanalysis.rb +58 -60
- data/lib/statsample/reliability/skillscaleanalysis.rb +34 -29
- data/lib/statsample/resample.rb +1 -1
- data/lib/statsample/shorthand.rb +29 -25
- data/lib/statsample/test/kolmogorovsmirnov.rb +5 -3
- data/lib/statsample/test/levene.rb +28 -27
- data/lib/statsample/test/t.rb +7 -9
- data/lib/statsample/test/umannwhitney.rb +28 -28
- data/lib/statsample/test/wilcoxonsignedrank.rb +45 -43
- data/lib/statsample/vector.rb +70 -1013
- data/lib/statsample/version.rb +1 -1
- data/statsample.gemspec +12 -16
- data/test/helpers_tests.rb +1 -1
- data/test/test_analysis.rb +17 -17
- data/test/test_anova_contrast.rb +6 -6
- data/test/test_anovatwowaywithdataset.rb +8 -8
- data/test/test_anovawithvectors.rb +8 -8
- data/test/test_awesome_print_bug.rb +1 -1
- data/test/test_bartlettsphericity.rb +4 -4
- data/test/test_bivariate.rb +48 -43
- data/test/test_codification.rb +33 -33
- data/test/test_crosstab.rb +9 -9
- data/test/test_dataset.rb +28 -458
- data/test/test_factor.rb +46 -38
- data/test/test_factor_pa.rb +22 -13
- data/test/test_ggobi.rb +4 -4
- data/test/test_gsl.rb +4 -4
- data/test/test_histogram.rb +3 -3
- data/test/test_matrix.rb +13 -13
- data/test/test_multiset.rb +103 -91
- data/test/test_regression.rb +57 -52
- data/test/test_reliability.rb +55 -45
- data/test/test_reliability_icc.rb +8 -8
- data/test/test_reliability_skillscale.rb +26 -24
- data/test/test_resample.rb +1 -1
- data/test/test_statistics.rb +3 -13
- data/test/test_stest.rb +9 -9
- data/test/test_stratified.rb +3 -3
- data/test/test_test_t.rb +12 -12
- data/test/test_umannwhitney.rb +2 -2
- data/test/test_vector.rb +76 -613
- data/test/test_wilcoxonsignedrank.rb +4 -4
- metadata +57 -28
- data/lib/statsample/rserve_extension.rb +0 -20
- data/lib/statsample/vector/gsl.rb +0 -106
- data/test/fixtures/repeated_fields.csv +0 -7
- data/test/fixtures/scientific_notation.csv +0 -4
- data/test/fixtures/test_csv.csv +0 -7
- data/test/fixtures/test_xls.xls +0 -0
- data/test/test_csv.rb +0 -63
- data/test/test_rserve_extension.rb +0 -42
- data/test/test_xls.rb +0 -52
data/lib/statsample/crosstab.rb
CHANGED
@@ -8,24 +8,25 @@ module Statsample
|
|
8
8
|
attr_reader :v_rows, :v_cols
|
9
9
|
attr_accessor :row_label, :column_label, :name, :percentage_row, :percentage_column, :percentage_total
|
10
10
|
def initialize(v1, v2, opts=Hash.new)
|
11
|
-
#raise ArgumentError, "Both arguments should be Vectors" unless v1.is_a? Statsample::Vector and v2.is_a? Statsample::Vector
|
12
11
|
raise ArgumentError, "Vectors should be the same size" unless v1.size==v2.size
|
13
|
-
@v_rows, @v_cols=Statsample.only_valid_clone(
|
14
|
-
|
15
|
-
|
16
|
-
@
|
17
|
-
@name
|
12
|
+
@v_rows, @v_cols = Statsample.only_valid_clone(
|
13
|
+
Daru::Vector.new(v1),
|
14
|
+
Daru::Vector.new(v2))
|
15
|
+
@cases = @v_rows.size
|
16
|
+
@row_label = v1.name
|
17
|
+
@column_label = v2.name
|
18
|
+
@name = nil
|
18
19
|
@percentage_row = @percentage_column = @percentage_total=false
|
19
|
-
opts.each
|
20
|
+
opts.each do |k,v|
|
20
21
|
self.send("#{k}=",v) if self.respond_to? k
|
21
|
-
|
22
|
-
@name||=_("Crosstab %s - %s") % [@row_label, @column_label]
|
22
|
+
end
|
23
|
+
@name ||= _("Crosstab %s - %s") % [@row_label, @column_label]
|
23
24
|
end
|
24
25
|
def rows_names
|
25
|
-
@v_rows.factors.sort
|
26
|
+
@v_rows.factors.sort.reset_index!
|
26
27
|
end
|
27
28
|
def cols_names
|
28
|
-
@v_cols.factors.sort
|
29
|
+
@v_cols.factors.sort.reset_index!
|
29
30
|
end
|
30
31
|
def rows_total
|
31
32
|
@v_rows.frequencies
|
@@ -35,18 +36,18 @@ module Statsample
|
|
35
36
|
end
|
36
37
|
|
37
38
|
def frequencies
|
38
|
-
base=rows_names.inject([])
|
39
|
-
s+=cols_names.collect{|col| [row,col]}
|
40
|
-
|
39
|
+
base = rows_names.inject([]) do |s,row|
|
40
|
+
s += cols_names.collect { |col| [row,col] }
|
41
|
+
end.inject({}) do |s,par|
|
41
42
|
s[par]=0
|
42
43
|
s
|
43
|
-
|
44
|
-
base.update(Statsample::vector_cols_matrix(@v_rows,@v_cols).to_a.
|
44
|
+
end
|
45
|
+
base.update(Daru::Vector.new(Statsample::vector_cols_matrix(@v_rows,@v_cols).to_a).frequencies)
|
45
46
|
end
|
46
47
|
def to_matrix
|
47
|
-
f=frequencies
|
48
|
-
rn=rows_names
|
49
|
-
cn=cols_names
|
48
|
+
f = frequencies
|
49
|
+
rn = rows_names
|
50
|
+
cn = cols_names
|
50
51
|
Matrix.rows(rn.collect{|row|
|
51
52
|
cn.collect{|col| f[[row,col]]}
|
52
53
|
})
|
@@ -67,8 +68,8 @@ module Statsample
|
|
67
68
|
end
|
68
69
|
# Chi square, based on expected and real matrix
|
69
70
|
def chi_square
|
70
|
-
|
71
|
-
|
71
|
+
require 'statsample/test'
|
72
|
+
Statsample::Test.chi_square(self.to_matrix, matrix_expected)
|
72
73
|
end
|
73
74
|
# Useful to obtain chi square
|
74
75
|
def matrix_expected
|
@@ -98,10 +99,10 @@ module Statsample
|
|
98
99
|
generator.text(_("Rows: %s") % @row_label) unless @row_label.nil?
|
99
100
|
generator.text(_("Columns: %s") % @column_label) unless @column_label.nil?
|
100
101
|
|
101
|
-
t=ReportBuilder::Table.new(:name=>@name+" - "+_("Raw"), :header=>[""]+cols_names.collect {|c| @v_cols.
|
102
|
+
t=ReportBuilder::Table.new(:name=>@name+" - "+_("Raw"), :header=>[""]+cols_names.collect {|c| @v_cols.index_of(c)}+[_("Total")])
|
102
103
|
rn.each do |row|
|
103
104
|
total_row=0
|
104
|
-
t_row=[@v_rows.
|
105
|
+
t_row=[@v_rows.index_of(row)]
|
105
106
|
cn.each do |col|
|
106
107
|
data=fq[[row,col]]
|
107
108
|
total_row+=fq[[row,col]]
|
@@ -148,9 +149,9 @@ module Statsample
|
|
148
149
|
when :total then _("% Total")
|
149
150
|
end
|
150
151
|
|
151
|
-
t=ReportBuilder::Table.new(:name=>@name+" - "+_(type_name), :header=>[""]+cols_names.collect {|c| @v_cols.
|
152
|
+
t=ReportBuilder::Table.new(:name=>@name+" - "+_(type_name), :header=>[""]+cols_names.collect {|c| @v_cols.index_of(c) } + [_("Total")])
|
152
153
|
rn.each do |row|
|
153
|
-
t_row=[@v_rows.
|
154
|
+
t_row=[@v_rows.index_of(row)]
|
154
155
|
cn.each do |col|
|
155
156
|
total=case type
|
156
157
|
when :row then rt[row]
|
@@ -0,0 +1,117 @@
|
|
1
|
+
# Opening the Daru::Vector and Daru::DataFrame classes for adding statistics helpers and methods to convert from
|
2
|
+
# data structures to specialized statsample data structures like Multiset.
|
3
|
+
module Daru
|
4
|
+
class Vector
|
5
|
+
# Builds a histogram of this vector through Statsample::Histogram and returns it.
#
# bins - either an Array, which is handed straight to
#        Statsample::Histogram.alloc, or a bin count (default 10) that is
#        combined with a [min, max] range obtained from
#        Statsample::Util.nice over the valid values.
#
# Raises TypeError unless the vector's type is :numeric.
def histogram(bins=10)
|
6
|
+
type == :numeric or raise TypeError, "Only numeric Vectors can do this operation."
|
7
|
+
|
8
|
+
if bins.is_a? Array
|
9
|
+
h = Statsample::Histogram.alloc(bins)
|
10
|
+
else
|
11
|
+
# ugly patch. The upper limit for a bin has the form
|
12
|
+
# x < range
|
13
|
+
#h=Statsample::Histogram.new(self, bins)
|
14
|
+
valid = only_valid
|
15
|
+
min,max=Statsample::Util.nice(valid.min,valid.max)
|
16
|
+
# fix last data
|
17
|
+
# When the computed upper limit equals the largest valid value, it is
|
18
|
+
# nudged up by 1e-10 -- presumably so that value still satisfies the
|
19
|
+
# "x < range" rule mentioned above and is counted; confirm against Histogram.
|
20
|
+
# Fix last bin
|
21
|
+
# NOTE(review): `valid` is assigned only in the else branch above, so when
|
22
|
+
# `bins` is an Array it is nil at this point and nil is passed to
|
23
|
+
# h.increment. Assigning `valid = only_valid` before the `if` would
|
24
|
+
# cover both paths.
|
25
|
+
# The populated histogram is the return value.
|
26
|
+
end
|
27
|
+
|
28
|
+
# Variance of p, according to population size
|
29
|
+
def variance_proportion(n_poblation, v=1)
|
30
|
+
Statsample::proportion_variance_sample(self.proportion(v), @valid_data.size, n_poblation)
|
31
|
+
end
|
32
|
+
|
33
|
+
# Variance of the total (computed by Statsample::total_variance_sample), according to population size
|
34
|
+
def variance_total(n_poblation, v=1)
|
35
|
+
Statsample::total_variance_sample(self.proportion(v), @valid_data.size, n_poblation)
|
36
|
+
end
|
37
|
+
|
38
|
+
def proportion_confidence_interval_t(n_poblation,margin=0.95,v=1)
|
39
|
+
Statsample::proportion_confidence_interval_t(proportion(v), @valid_data.size, n_poblation, margin)
|
40
|
+
end
|
41
|
+
|
42
|
+
def proportion_confidence_interval_z(n_poblation,margin=0.95,v=1)
|
43
|
+
Statsample::proportion_confidence_interval_z(proportion(v), @valid_data.size, n_poblation, margin)
|
44
|
+
end
|
45
|
+
end
|
46
|
+
|
47
|
+
class DataFrame
|
48
|
+
def crosstab(v1,v2,opts={})
|
49
|
+
Statsample::Crosstab.new(self[v1], self[v2],opts)
|
50
|
+
end
|
51
|
+
|
52
|
+
# Functions for converting to Statsample::Multiset
|
53
|
+
def to_multiset_by_split(*vecs)
|
54
|
+
require 'statsample/multiset'
|
55
|
+
|
56
|
+
if vecs.size == 1
|
57
|
+
to_multiset_by_split_one_field(vecs[0])
|
58
|
+
else
|
59
|
+
to_multiset_by_split_multiple_fields(*vecs)
|
60
|
+
end
|
61
|
+
end
|
62
|
+
# Creates a Statsample::Multiset, using one field
|
63
|
+
|
64
|
+
def to_multiset_by_split_one_field(field)
|
65
|
+
raise ArgumentError,"Should use a correct field name" if
|
66
|
+
!@vectors.include? field
|
67
|
+
|
68
|
+
factors = self[field].factors
|
69
|
+
ms = Statsample::Multiset.new_empty_vectors(@vectors.to_a, factors)
|
70
|
+
each_row do |row|
|
71
|
+
ms[row[field]].add_row(row)
|
72
|
+
end
|
73
|
+
#puts "Ingreso a los dataset"
|
74
|
+
ms.datasets.each do |k,ds|
|
75
|
+
ds.update
|
76
|
+
ds.rename self[field].index_of(k)
|
77
|
+
end
|
78
|
+
|
79
|
+
ms
|
80
|
+
end
|
81
|
+
|
82
|
+
def to_multiset_by_split_multiple_fields(*fields)
|
83
|
+
fields.map!(&:to_sym)
|
84
|
+
factors_total=nil
|
85
|
+
fields.each do |f|
|
86
|
+
if factors_total.nil?
|
87
|
+
factors_total = self[f].factors.collect { |c| [c] }
|
88
|
+
else
|
89
|
+
suma = []
|
90
|
+
factors = self[f].factors
|
91
|
+
factors_total.each do |f1|
|
92
|
+
factors.each do |f2|
|
93
|
+
suma.push(f1+[f2])
|
94
|
+
end
|
95
|
+
end
|
96
|
+
factors_total = suma
|
97
|
+
end
|
98
|
+
end
|
99
|
+
ms = Statsample::Multiset.new_empty_vectors(vectors.to_a, factors_total)
|
100
|
+
|
101
|
+
p1 = eval "Proc.new {|c| ms[["+fields.collect{|f| "c['#{f}'.to_sym]"}.join(",")+"]].add_row(c) }"
|
102
|
+
each_row { |r| p1.call(r) }
|
103
|
+
|
104
|
+
ms.datasets.each do |k,ds|
|
105
|
+
ds.update
|
106
|
+
ds.rename(
|
107
|
+
fields.size.times.map do |i|
|
108
|
+
f = fields[i]
|
109
|
+
sk = k[i]
|
110
|
+
self[f].index_of(sk)
|
111
|
+
end.join("-")
|
112
|
+
)
|
113
|
+
end
|
114
|
+
ms
|
115
|
+
end
|
116
|
+
end
|
117
|
+
end
|
data/lib/statsample/dataset.rb
CHANGED
@@ -2,9 +2,11 @@ require 'statsample/vector'
|
|
2
2
|
|
3
3
|
class Hash
|
4
4
|
# Creates a Statsample::Dataset based on a Hash
|
5
|
-
def
|
5
|
+
def to_dataframe(*args)
|
6
6
|
Statsample::Dataset.new(self, *args)
|
7
7
|
end
|
8
|
+
|
9
|
+
alias :to_dataset :to_dataframe
|
8
10
|
end
|
9
11
|
|
10
12
|
class Array
|
@@ -17,990 +19,116 @@ class Array
|
|
17
19
|
end
|
18
20
|
|
19
21
|
module Statsample
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
22
|
+
# == Deprecation Warning
|
23
|
+
#
|
24
|
+
# This class will soon be replaced by Daru::DataFrame in the
|
25
|
+
# next release. Please see the daru docs at https://github.com/v0dro/daru
|
26
|
+
# for more details
|
27
|
+
class Dataset < Daru::DataFrame
|
28
|
+
# Ordered ids of vectors
|
29
|
+
def fields
|
30
|
+
$stderr.puts "WARNING: Statsample::Dataset and Statsample::Vector have been deprecated in favor of Daru::DataFrame and Daru::Vector. Please switch to using Daru::DataFrame#vectors.\n"
|
31
|
+
@vectors.to_a
|
30
32
|
end
|
31
|
-
end
|
32
|
-
# Set of cases with values for one or more variables,
|
33
|
-
# analog to a dataframe on R or a standard data file of SPSS.
|
34
|
-
# Every vector has <tt>#field</tt> name, which represent it. By default,
|
35
|
-
# the vectors are ordered by it field name, but you can change it
|
36
|
-
# the fields order manually.
|
37
|
-
# The Dataset work as a Hash, with keys are field names
|
38
|
-
# and values are Statsample::Vector
|
39
|
-
#
|
40
|
-
#
|
41
|
-
# ==Usage
|
42
|
-
# Create a empty dataset:
|
43
|
-
# Dataset.new()
|
44
|
-
# Create a dataset with three empty vectors, called <tt>v1</tt>, <tt>v2</tt> and <tt>v3</tt>:
|
45
|
-
# Dataset.new(%w{v1 v2 v3})
|
46
|
-
# Create a dataset with two vectors, called <tt>v1</tt>
|
47
|
-
# and <tt>v2</tt>:
|
48
|
-
# Dataset.new({'v1'=>%w{1 2 3}.to_vector, 'v2'=>%w{4 5 6}.to_vector})
|
49
|
-
# Create a dataset with two given vectors (v1 and v2),
|
50
|
-
# with vectors on inverted order:
|
51
|
-
# Dataset.new({'v2'=>v2,'v1'=>v1},['v2','v1'])
|
52
|
-
#
|
53
|
-
# The fast way to create a dataset uses Hash#to_dataset, with
|
54
|
-
# field order as arguments
|
55
|
-
# v1 = [1,2,3].to_numeric
|
56
|
-
# v2 = [1,2,3].to_numeric
|
57
|
-
# ds = {'v1'=>v2, 'v2'=>v2}.to_dataset(%w{v2 v1})
|
58
33
|
|
59
|
-
|
60
|
-
|
61
|
-
include Summarizable
|
62
|
-
# Hash of Statsample::Vector
|
63
|
-
attr_reader :vectors
|
64
|
-
# Ordered ids of vectors
|
65
|
-
attr_reader :fields
|
66
|
-
# Name of dataset
|
67
|
-
attr_accessor :name
|
68
|
-
# Number of cases
|
69
|
-
attr_reader :cases
|
70
|
-
# Location of pointer on enumerations methods (like #each)
|
71
|
-
attr_reader :i
|
34
|
+
def name= new_name
|
35
|
+
$stderr.puts "WARNING: Statsample::Dataset and Statsample::Vector have been deprecated in favor of Daru::DataFrame and Daru::Vector. Please switch to using Daru::DataFrame#rename.\n"
|
72
36
|
|
73
|
-
|
74
|
-
# - Rows
|
75
|
-
# - Columns
|
76
|
-
# - Values
|
77
|
-
#
|
78
|
-
# For example, you have these values
|
79
|
-
#
|
80
|
-
# x y v
|
81
|
-
# a a 0
|
82
|
-
# a b 1
|
83
|
-
# b a 1
|
84
|
-
# b b 0
|
85
|
-
#
|
86
|
-
# You obtain
|
87
|
-
# id a b
|
88
|
-
# a 0 1
|
89
|
-
# b 1 0
|
90
|
-
#
|
91
|
-
# Useful to process outputs from databases
|
92
|
-
def self.crosstab_by_asignation(rows,columns,values)
|
93
|
-
raise "Three vectors should be equal size" if rows.size!=columns.size or rows.size!=values.size
|
94
|
-
cols_values=columns.factors
|
95
|
-
cols_n=cols_values.size
|
96
|
-
h_rows=rows.factors.inject({}){|a,v| a[v]=cols_values.inject({}){
|
97
|
-
|a1,v1| a1[v1]=nil; a1
|
98
|
-
}
|
99
|
-
;a}
|
100
|
-
values.each_index{|i|
|
101
|
-
h_rows[rows[i]][columns[i]]=values[i]
|
102
|
-
}
|
103
|
-
ds=Dataset.new(["_id"]+cols_values)
|
104
|
-
cols_values.each{|c|
|
105
|
-
ds[c].type=values.type
|
106
|
-
}
|
107
|
-
rows.factors.each {|row|
|
108
|
-
n_row=Array.new(cols_n+1)
|
109
|
-
n_row[0]=row
|
110
|
-
cols_values.each_index {|i|
|
111
|
-
n_row[i+1]=h_rows[row][cols_values[i]]
|
112
|
-
}
|
113
|
-
ds.add_case_array(n_row)
|
114
|
-
}
|
115
|
-
ds.update_valid_data
|
116
|
-
ds
|
37
|
+
rename new_name
|
117
38
|
end
|
118
|
-
#
|
119
|
-
def
|
120
|
-
|
39
|
+
# Number of cases
|
40
|
+
def cases
|
41
|
+
$stderr.puts "WARNING: Statsample::Dataset and Statsample::Vector have been deprecated in favor of Daru::DataFrame and Daru::Vector. Please switch to using Daru::DataFrame#nrows.\n"
|
42
|
+
|
43
|
+
nrows
|
121
44
|
end
|
122
|
-
|
123
|
-
#
|
124
|
-
#
|
125
|
-
#
|
126
|
-
#
|
127
|
-
#
|
128
|
-
|
129
|
-
|
130
|
-
|
131
|
-
|
132
|
-
current=out
|
133
|
-
# Create tree
|
134
|
-
tree_keys[0,tree_keys.size-1].each do |f|
|
135
|
-
root=row[f]
|
136
|
-
current[root]||=Hash.new
|
137
|
-
current=current[root]
|
138
|
-
end
|
139
|
-
name=row[tree_keys.last]
|
140
|
-
if !block
|
141
|
-
current[name]||=Array.new
|
142
|
-
current[name].push(row.delete_if{|key,value| tree_keys.include? key})
|
143
|
-
else
|
144
|
-
current[name]=block.call(row, current,name)
|
145
|
-
end
|
146
|
-
end
|
147
|
-
out
|
45
|
+
|
46
|
+
# == Deprecation Warning
|
47
|
+
#
|
48
|
+
# This class will soon be replaced by Daru::DataFrame in the
|
49
|
+
# next release. Use Daru::DataFrame.crosstab_by_assignation
|
50
|
+
# for the same effect. Please see the daru docs at
|
51
|
+
# https://github.com/v0dro/daru for more details.
|
52
|
+
def self.crosstab_by_assignation(rows,columns,values)
|
53
|
+
ds = super(rows, columns, values)
|
54
|
+
Dataset.new ds.to_hash
|
148
55
|
end
|
149
|
-
|
150
|
-
#
|
151
|
-
#
|
152
|
-
#
|
153
|
-
#
|
154
|
-
#
|
155
|
-
# [fields] Array of names for vectors. Is only used for set the
|
156
|
-
# order of variables. If empty, vectors keys on alfabethic order as
|
157
|
-
# used as fields.
|
56
|
+
|
57
|
+
# == Deprecation Warning
|
58
|
+
#
|
59
|
+
# This class will soon be replaced by Daru::DataFrame in the
|
60
|
+
# next release. Use Daru::DataFrame.new for the same effect.
|
61
|
+
# Please see the daru docs at https://github.com/v0dro/daru for more details.
|
158
62
|
def initialize(vectors={}, fields=[])
|
159
|
-
|
160
|
-
@@n_dataset+=1
|
161
|
-
@name=_("Dataset %d") % @@n_dataset
|
162
|
-
@cases=0
|
163
|
-
@gsl=nil
|
164
|
-
@i=nil
|
63
|
+
$stderr.puts "WARNING: Statsample::Dataset and Statsample::Vector have been deprecated in favor of Daru::DataFrame and Daru::Vector. Please switch to using that.\n"
|
165
64
|
|
166
65
|
if vectors.instance_of? Array
|
167
66
|
@fields=vectors.dup
|
168
|
-
|
67
|
+
super({}, order: @fields.map { |e| e.respond_to?(:to_sym) ? e.to_sym : e })
|
169
68
|
else
|
170
69
|
# Check vectors
|
171
|
-
@vectors=
|
172
|
-
|
173
|
-
|
174
|
-
check_length
|
175
|
-
end
|
176
|
-
end
|
177
|
-
#
|
178
|
-
# Creates a copy of the given dataset, deleting all the cases with
|
179
|
-
# missing data on one of the vectors.
|
180
|
-
#
|
181
|
-
# @param array of fields to include. No value include all fields
|
182
|
-
#
|
183
|
-
def dup_only_valid(*fields_to_include)
|
184
|
-
if fields_to_include.size==1 and fields_to_include[0].is_a? Array
|
185
|
-
fields_to_include=fields_to_include[0]
|
186
|
-
end
|
187
|
-
fields_to_include=@fields if fields_to_include.size==0
|
188
|
-
if fields_to_include.any? {|f| @vectors[f].has_missing_data?}
|
189
|
-
ds=Dataset.new(fields_to_include)
|
190
|
-
fields_to_include.each {|f| ds[f].type=@vectors[f].type}
|
191
|
-
each {|row|
|
192
|
-
unless fields_to_include.any? {|f| @vectors[f].has_missing_data? and !@vectors[f].is_valid? row[f]}
|
193
|
-
row_2=fields_to_include.inject({}) {|ac,v| ac[v]=row[v]; ac}
|
194
|
-
ds.add_case(row_2)
|
195
|
-
end
|
196
|
-
}
|
197
|
-
else
|
198
|
-
ds=dup fields_to_include
|
199
|
-
end
|
200
|
-
ds.name= self.name
|
201
|
-
ds
|
202
|
-
end
|
203
|
-
#
|
204
|
-
# Returns a duplicate of the Dataset.
|
205
|
-
# All vectors are copied, so any modification on new
|
206
|
-
# dataset doesn't affect original dataset's vectors.
|
207
|
-
# If fields given as parameter, only include those vectors.
|
208
|
-
#
|
209
|
-
# @param array of fields to include. No value include all fields
|
210
|
-
# @return {Statsample::Dataset}
|
211
|
-
def dup(*fields_to_include)
|
212
|
-
if fields_to_include.size==1 and fields_to_include[0].is_a? Array
|
213
|
-
fields_to_include=fields_to_include[0]
|
214
|
-
end
|
215
|
-
fields_to_include=@fields if fields_to_include.size==0
|
216
|
-
vectors={}
|
217
|
-
fields=[]
|
218
|
-
fields_to_include.each{|f|
|
219
|
-
raise "Vector #{f} doesn't exists" unless @vectors.has_key? f
|
220
|
-
vectors[f]=@vectors[f].dup
|
221
|
-
fields.push(f)
|
222
|
-
}
|
223
|
-
ds=Dataset.new(vectors,fields)
|
224
|
-
ds.name= self.name
|
225
|
-
ds
|
226
|
-
end
|
227
|
-
|
228
|
-
|
229
|
-
# Returns an array with the fields from first argumen to last argument
|
230
|
-
def from_to(from,to)
|
231
|
-
raise ArgumentError, "Field #{from} should be on dataset" if !@fields.include? from
|
232
|
-
raise ArgumentError, "Field #{to} should be on dataset" if !@fields.include? to
|
233
|
-
@fields.slice(@fields.index(from)..@fields.index(to))
|
234
|
-
end
|
235
|
-
|
236
|
-
# Returns (when possible) a cheap copy of dataset.
|
237
|
-
# If no vector have missing values, returns original vectors.
|
238
|
-
# If missing values presents, uses Dataset.dup_only_valid.
|
239
|
-
#
|
240
|
-
# @param array of fields to include. No value include all fields
|
241
|
-
# @return {Statsample::Dataset}
|
242
|
-
def clone_only_valid(*fields_to_include)
|
243
|
-
if fields_to_include.size==1 and fields_to_include[0].is_a? Array
|
244
|
-
fields_to_include=fields_to_include[0]
|
245
|
-
end
|
246
|
-
fields_to_include=@fields.dup if fields_to_include.size==0
|
247
|
-
if fields_to_include.any? {|v| @vectors[v].has_missing_data?}
|
248
|
-
dup_only_valid(fields_to_include)
|
249
|
-
else
|
250
|
-
clone(fields_to_include)
|
251
|
-
end
|
252
|
-
end
|
253
|
-
# Returns a shallow copy of Dataset.
|
254
|
-
# Object id will be distinct, but @vectors will be the same.
|
255
|
-
# @param array of fields to include. No value include all fields
|
256
|
-
# @return {Statsample::Dataset}
|
257
|
-
def clone(*fields_to_include)
|
258
|
-
if fields_to_include.size==1 and fields_to_include[0].is_a? Array
|
259
|
-
fields_to_include=fields_to_include[0]
|
260
|
-
end
|
261
|
-
fields_to_include=@fields.dup if fields_to_include.size==0
|
262
|
-
ds=Dataset.new
|
263
|
-
fields_to_include.each{|f|
|
264
|
-
raise "Vector #{f} doesn't exists" unless @vectors.has_key? f
|
265
|
-
ds[f]=@vectors[f]
|
266
|
-
}
|
267
|
-
ds.fields=fields_to_include
|
268
|
-
ds.name=@name
|
269
|
-
ds.update_valid_data
|
270
|
-
ds
|
271
|
-
end
|
272
|
-
# Creates a copy of the given dataset, without data on vectors
|
273
|
-
#
|
274
|
-
# @return {Statsample::Dataset}
|
275
|
-
def dup_empty
|
276
|
-
vectors=@vectors.inject({}) {|a,v|
|
277
|
-
a[v[0]]=v[1].dup_empty
|
278
|
-
a
|
279
|
-
}
|
280
|
-
Dataset.new(vectors,@fields.dup)
|
281
|
-
end
|
282
|
-
# Merge vectors from two datasets
|
283
|
-
# In case of name collition, the vectors names are changed to
|
284
|
-
# x_1, x_2 ....
|
285
|
-
#
|
286
|
-
# @return {Statsample::Dataset}
|
287
|
-
def merge(other_ds)
|
288
|
-
raise "Cases should be equal (this:#{@cases}; other:#{other_ds.cases}" unless @cases==other_ds.cases
|
289
|
-
types = @fields.collect{|f| @vectors[f].type} + other_ds.fields.collect{|f| other_ds[f].type}
|
290
|
-
new_fields = (@fields+other_ds.fields).recode_repeated
|
291
|
-
ds_new=Statsample::Dataset.new(new_fields)
|
292
|
-
new_fields.each_index{|i|
|
293
|
-
field=new_fields[i]
|
294
|
-
ds_new[field].type=types[i]
|
295
|
-
}
|
296
|
-
@cases.times {|i|
|
297
|
-
row=case_as_array(i)+other_ds.case_as_array(i)
|
298
|
-
ds_new.add_case_array(row)
|
299
|
-
}
|
300
|
-
ds_new.update_valid_data
|
301
|
-
ds_new
|
302
|
-
end
|
303
|
-
|
304
|
-
# Join 2 Datasets by given fields
|
305
|
-
# type is one of :left and :inner, default is :left
|
306
|
-
#
|
307
|
-
# @return {Statsample::Dataset}
|
308
|
-
def join(other_ds,fields_1=[],fields_2=[],type=:left)
|
309
|
-
fields_new = other_ds.fields - fields_2
|
310
|
-
fields = self.fields + fields_new
|
311
|
-
|
312
|
-
other_ds_hash = {}
|
313
|
-
other_ds.each do |row|
|
314
|
-
key = row.select{|k,v| fields_2.include?(k)}.values
|
315
|
-
value = row.select{|k,v| fields_new.include?(k)}
|
316
|
-
if other_ds_hash[key].nil?
|
317
|
-
other_ds_hash[key] = [value]
|
318
|
-
else
|
319
|
-
other_ds_hash[key] << value
|
70
|
+
@vectors = {}
|
71
|
+
vectors.each do |k,v|
|
72
|
+
@vectors[k.respond_to?(:to_sym) ? k.to_sym : k] = v
|
320
73
|
end
|
74
|
+
@fields = fields
|
75
|
+
super @vectors, order: @fields.map { |e| e.respond_to?(:to_sym) ? e.to_sym : e }
|
321
76
|
end
|
322
|
-
|
323
|
-
new_ds = Dataset.new(fields)
|
324
|
-
|
325
|
-
self.each do |row|
|
326
|
-
key = row.select{|k,v| fields_1.include?(k)}.values
|
327
|
-
|
328
|
-
new_case = row.dup
|
329
|
-
|
330
|
-
if other_ds_hash[key].nil?
|
331
|
-
if type == :left
|
332
|
-
fields_new.each{|field| new_case[field] = nil}
|
333
|
-
new_ds.add_case(new_case)
|
334
|
-
end
|
335
|
-
else
|
336
|
-
other_ds_hash[key].each do |new_values|
|
337
|
-
new_ds.add_case new_case.merge(new_values)
|
338
|
-
end
|
339
|
-
end
|
340
|
-
|
341
|
-
end
|
342
|
-
new_ds
|
343
77
|
end
|
344
|
-
# Returns a dataset with standarized data.
|
345
|
-
#
|
346
|
-
# @return {Statsample::Dataset}
|
347
|
-
def standarize
|
348
|
-
ds=dup()
|
349
|
-
ds.fields.each do |f|
|
350
|
-
ds[f]=ds[f].vector_standarized
|
351
|
-
end
|
352
|
-
ds
|
353
|
-
end
|
354
|
-
# Generate a matrix, based on fields of dataset
|
355
|
-
#
|
356
|
-
# @return {::Matrix}
|
357
78
|
|
358
|
-
def
|
359
|
-
|
360
|
-
@fields.collect{|col|
|
361
|
-
yield row,col
|
362
|
-
}
|
363
|
-
}
|
364
|
-
Matrix.rows(rows)
|
79
|
+
def from_to(from,to)
|
80
|
+
raise NoMethodError, "This method is no longer supported. To see the vector index use Daru::DataFrame#vectors"
|
365
81
|
end
|
366
82
|
|
367
|
-
# We have the same datasets if +vectors+ and +fields+ are the same
|
368
|
-
#
|
369
|
-
# @return {Boolean}
|
370
|
-
def ==(d2)
|
371
|
-
@vectors==d2.vectors and @fields==d2.fields
|
372
|
-
end
|
373
|
-
# Returns vector <tt>c</tt>
|
374
|
-
#
|
375
|
-
# @return {Statsample::Vector}
|
376
|
-
def col(c)
|
377
|
-
@vectors[c]
|
378
|
-
end
|
379
|
-
alias_method :vector, :col
|
380
|
-
# Equal to Dataset[<tt>name</tt>]=<tt>vector</tt>
|
381
|
-
#
|
382
|
-
# @return self
|
383
83
|
def add_vector(name, vector)
|
384
|
-
raise
|
385
|
-
@vectors[name]=vector
|
386
|
-
check_order
|
387
|
-
self
|
388
|
-
end
|
389
|
-
# Returns true if dataset have vector <tt>v</tt>.
|
390
|
-
#
|
391
|
-
# @return {Boolean}
|
392
|
-
def has_vector? (v)
|
393
|
-
return @vectors.has_key?(v)
|
394
|
-
end
|
395
|
-
# Creates a dataset with the random data, of a n size
|
396
|
-
# If n not given, uses original number of cases.
|
397
|
-
#
|
398
|
-
# @return {Statsample::Dataset}
|
399
|
-
def bootstrap(n=nil)
|
400
|
-
n||=@cases
|
401
|
-
ds_boot=dup_empty
|
402
|
-
n.times do
|
403
|
-
ds_boot.add_case_array(case_as_array(rand(n)))
|
404
|
-
end
|
405
|
-
ds_boot.update_valid_data
|
406
|
-
ds_boot
|
84
|
+
raise NoMethodError, "Deprecated. Use Daru::DataFrame#[]= directly."
|
407
85
|
end
|
408
|
-
|
409
|
-
# Can only add one case and no error check if performed
|
410
|
-
# You SHOULD use #update_valid_data at the end of insertion cycle
|
411
|
-
#
|
412
|
-
#
|
86
|
+
|
413
87
|
def add_case_array(v)
|
414
|
-
|
88
|
+
raise NoMethodError, "Deprecated. Use Daru::DataFrame#add_row instead."
|
415
89
|
end
|
416
|
-
# Insert a case, using:
|
417
|
-
# * Array: size equal to number of vectors and values in the same order as fields
|
418
|
-
# * Hash: keys equal to fields
|
419
|
-
# If uvd is false, #update_valid_data is not executed after
|
420
|
-
# inserting a case. This is very useful if you want to increase the
|
421
|
-
# performance on inserting many cases, because #update_valid_data
|
422
|
-
# performs check on vectors and on the dataset
|
423
90
|
|
424
91
|
def add_case(v,uvd=true)
|
425
|
-
|
426
|
-
when Array
|
427
|
-
if (v[0].is_a? Array)
|
428
|
-
v.each{|subv| add_case(subv,false)}
|
429
|
-
else
|
430
|
-
raise ArgumentError, "Input array size (#{v.size}) should be equal to fields number (#{@fields.size})" if @fields.size!=v.size
|
431
|
-
v.each_index {|i| @vectors[@fields[i]].add(v[i],false)}
|
432
|
-
end
|
433
|
-
when Hash
|
434
|
-
raise ArgumentError, "Hash keys should be equal to fields #{(v.keys - @fields).join(",")}" if @fields.sort!=v.keys.sort
|
435
|
-
@fields.each{|f| @vectors[f].add(v[f],false)}
|
436
|
-
else
|
437
|
-
raise TypeError, 'Value must be a Array or a Hash'
|
438
|
-
end
|
439
|
-
if uvd
|
440
|
-
update_valid_data
|
441
|
-
end
|
442
|
-
end
|
443
|
-
# Check vectors and fields after inserting data. Use only
|
444
|
-
# after #add_case_array or #add_case with second parameter to false
|
445
|
-
def update_valid_data
|
446
|
-
@gsl=nil
|
447
|
-
@fields.each{|f| @vectors[f].set_valid_data}
|
448
|
-
check_length
|
449
|
-
end
|
450
|
-
# Delete vector named +name+. Multiple fields accepted.
|
451
|
-
def delete_vector(*args)
|
452
|
-
if args.size==1 and args[0].is_a? Array
|
453
|
-
names=args[0]
|
454
|
-
else
|
455
|
-
names=args
|
456
|
-
end
|
457
|
-
names.each do |name|
|
458
|
-
@fields.delete(name)
|
459
|
-
@vectors.delete(name)
|
460
|
-
end
|
461
|
-
end
|
462
|
-
|
463
|
-
def add_vectors_by_split_recode(name_,join='-',sep=Statsample::SPLIT_TOKEN)
|
464
|
-
split=@vectors[name_].split_by_separator(sep)
|
465
|
-
i=1
|
466
|
-
split.each{|k,v|
|
467
|
-
new_field=name_+join+i.to_s
|
468
|
-
v.name=name_+":"+k
|
469
|
-
add_vector(new_field,v)
|
470
|
-
i+=1
|
471
|
-
}
|
472
|
-
end
|
473
|
-
def add_vectors_by_split(name,join='-',sep=Statsample::SPLIT_TOKEN)
|
474
|
-
split=@vectors[name].split_by_separator(sep)
|
475
|
-
split.each{|k,v|
|
476
|
-
add_vector(name+join+k,v)
|
477
|
-
}
|
478
|
-
end
|
479
|
-
|
480
|
-
def vector_by_calculation(type=:numeric)
|
481
|
-
a=[]
|
482
|
-
each do |row|
|
483
|
-
a.push(yield(row))
|
484
|
-
end
|
485
|
-
a.to_vector(type)
|
486
|
-
end
|
487
|
-
# Returns a vector with sumatory of fields
|
488
|
-
# if fields parameter is empty, sum all fields
|
489
|
-
def vector_sum(fields=nil)
|
490
|
-
fields||=@fields
|
491
|
-
vector=collect_with_index do |row, i|
|
492
|
-
if(fields.find{|f| !@vectors[f].data_with_nils[i]})
|
493
|
-
nil
|
494
|
-
else
|
495
|
-
fields.inject(0) {|ac,v| ac + row[v].to_f}
|
496
|
-
end
|
497
|
-
end
|
498
|
-
vector.name=_("Sum from %s") % @name
|
499
|
-
vector
|
500
|
-
end
|
501
|
-
# Check if #fields attribute is correct, after inserting or deleting vectors
|
502
|
-
def check_fields(fields)
|
503
|
-
fields||=@fields
|
504
|
-
raise "Fields #{(fields-@fields).join(", ")} doesn't exists on dataset" if (fields-@fields).size>0
|
505
|
-
fields
|
506
|
-
end
|
507
|
-
|
508
|
-
# Returns a vector with the numbers of missing values for a case
|
509
|
-
def vector_missing_values(fields=nil)
|
510
|
-
fields=check_fields(fields)
|
511
|
-
collect_with_index do |row, i|
|
512
|
-
fields.inject(0) {|a,v|
|
513
|
-
a+ ((@vectors[v].data_with_nils[i].nil?) ? 1: 0)
|
514
|
-
}
|
515
|
-
end
|
516
|
-
end
|
517
|
-
def vector_count_characters(fields=nil)
|
518
|
-
fields=check_fields(fields)
|
519
|
-
collect_with_index do |row, i|
|
520
|
-
fields.inject(0){|a,v|
|
521
|
-
a+((@vectors[v].data_with_nils[i].nil?) ? 0: row[v].to_s.size)
|
522
|
-
}
|
523
|
-
end
|
524
|
-
end
|
525
|
-
# Returns a vector with the mean for a set of fields
|
526
|
-
# if fields parameter is empty, return the mean for all fields
|
527
|
-
# if max invalid parameter > 0, returns the mean for all tuples
|
528
|
-
# with 0 to max_invalid invalid fields
|
529
|
-
def vector_mean(fields=nil, max_invalid=0)
|
530
|
-
a=[]
|
531
|
-
fields=check_fields(fields)
|
532
|
-
size=fields.size
|
533
|
-
each_with_index do |row, i |
|
534
|
-
# numero de invalidos
|
535
|
-
sum=0
|
536
|
-
invalids=0
|
537
|
-
fields.each{|f|
|
538
|
-
if !@vectors[f].data_with_nils[i].nil?
|
539
|
-
sum+=row[f].to_f
|
540
|
-
else
|
541
|
-
invalids+=1
|
542
|
-
end
|
543
|
-
}
|
544
|
-
if(invalids>max_invalid)
|
545
|
-
a.push(nil)
|
546
|
-
else
|
547
|
-
a.push(sum.quo(size-invalids))
|
548
|
-
end
|
549
|
-
end
|
550
|
-
a=a.to_vector(:numeric)
|
551
|
-
a.name=_("Means from %s") % @name
|
552
|
-
a
|
553
|
-
end
|
554
|
-
# Check vectors for type and size.
|
555
|
-
def check_length # :nodoc:
|
556
|
-
size=nil
|
557
|
-
@vectors.each do |k,v|
|
558
|
-
raise Exception, "Data #{v.class} is not a vector on key #{k}" if !v.is_a? Statsample::Vector
|
559
|
-
if size.nil?
|
560
|
-
size=v.size
|
561
|
-
else
|
562
|
-
if v.size!=size
|
563
|
-
raise Exception, "Vector #{k} have size #{v.size} and dataset have size #{size}"
|
564
|
-
end
|
565
|
-
end
|
566
|
-
end
|
567
|
-
@cases=size
|
568
|
-
end
|
569
|
-
# Retrieves each vector as [key, vector]
|
570
|
-
def each_vector # :yield: |key, vector|
|
571
|
-
@fields.each{|k| yield k, @vectors[k]}
|
92
|
+
raise NoMethodError, "Deprecated. Use Daru::DataFrame#add_row instead."
|
572
93
|
end
|
573
94
|
|
574
|
-
|
575
|
-
|
576
|
-
Statsample::STATSAMPLE__.case_as_hash(self,c)
|
577
|
-
end
|
578
|
-
else
|
579
|
-
# Retrieves case i as a hash
|
580
|
-
def case_as_hash(i)
|
581
|
-
_case_as_hash(i)
|
582
|
-
end
|
95
|
+
def update_valid_data
|
96
|
+
raise NoMethodError, "Deprecated. Use Daru::DataFrame#update instead. Also see Daru.lazy_update in the daru docs."
|
583
97
|
end
|
584
98
|
|
585
|
-
|
586
|
-
|
587
|
-
Statsample::STATSAMPLE__.case_as_array(self,c)
|
588
|
-
end
|
589
|
-
else
|
590
|
-
# Retrieves case i as a array, ordered on #fields order
|
591
|
-
def case_as_array(i)
|
592
|
-
_case_as_array(i)
|
593
|
-
end
|
594
|
-
end
|
595
|
-
def _case_as_hash(c) # :nodoc:
|
596
|
-
@fields.inject({}) {|a,x| a[x]=@vectors[x][c];a }
|
597
|
-
end
|
598
|
-
def _case_as_array(c) # :nodoc:
|
599
|
-
@fields.collect {|x| @vectors[x][c]}
|
99
|
+
def each_array
|
100
|
+
raise NoMethodError, "Deprecated. Use Daru::DataFrame#each_row instead."
|
600
101
|
end
|
601
102
|
|
602
|
-
|
603
|
-
|
604
|
-
begin
|
605
|
-
@i=0
|
606
|
-
@cases.times {|i|
|
607
|
-
@i=i
|
608
|
-
row=case_as_hash(i)
|
609
|
-
yield row
|
610
|
-
}
|
611
|
-
@i=nil
|
612
|
-
rescue =>e
|
613
|
-
raise DatasetException.new(self, e)
|
614
|
-
end
|
615
|
-
end
|
103
|
+
def fields=(f)
|
104
|
+
$stderr.puts "WARNING: Deprecated. Use Daru::DataFrame#reindex_vectors! instead.\n"
|
616
105
|
|
617
|
-
|
618
|
-
def each_with_index # :yield: |case, i|
|
619
|
-
begin
|
620
|
-
@i=0
|
621
|
-
@cases.times{|i|
|
622
|
-
@i=i
|
623
|
-
row=case_as_hash(i)
|
624
|
-
yield row, i
|
625
|
-
}
|
626
|
-
@i=nil
|
627
|
-
rescue =>e
|
628
|
-
raise DatasetException.new(self, e)
|
629
|
-
end
|
106
|
+
reindex_vectors! f
|
630
107
|
end
|
631
108
|
|
632
|
-
# Returns each case as an array, coding missing values as nils
|
633
|
-
def each_array_with_nils
|
634
|
-
m=fields.size
|
635
|
-
@cases.times {|i|
|
636
|
-
@i=i
|
637
|
-
row=Array.new(m)
|
638
|
-
fields.each_index{|j|
|
639
|
-
f=fields[j]
|
640
|
-
row[j]=@vectors[f].data_with_nils[i]
|
641
|
-
}
|
642
|
-
yield row
|
643
|
-
}
|
644
|
-
@i=nil
|
645
|
-
end
|
646
|
-
# Returns each case as an array
|
647
|
-
def each_array
|
648
|
-
@cases.times {|i|
|
649
|
-
@i=i
|
650
|
-
row=case_as_array(i)
|
651
|
-
yield row
|
652
|
-
}
|
653
|
-
@i=nil
|
654
|
-
end
|
655
|
-
# Set fields order. If you omit one or more vectors, they are
|
656
|
-
# ordered by alphabetic order.
|
657
|
-
def fields=(f)
|
658
|
-
@fields=f
|
659
|
-
check_order
|
660
|
-
end
|
661
|
-
# Check congruence between +fields+ attribute
|
662
|
-
# and keys on +vectors
|
663
|
-
def check_order #:nodoc:
|
664
|
-
if(@vectors.keys.sort!=@fields.sort)
|
665
|
-
@fields=@fields&@vectors.keys
|
666
|
-
@fields+=@vectors.keys.sort-@fields
|
667
|
-
end
|
668
|
-
end
|
669
109
|
# Returns the vector named i
|
670
|
-
def[](i)
|
110
|
+
def [](i)
|
111
|
+
$stderr.puts "WARNING: Daru uses symbols instead of strings for naming vectors. Please switch to symbols.\n"
|
112
|
+
|
671
113
|
if i.is_a? Range
|
672
|
-
|
673
|
-
|
674
|
-
|
675
|
-
clone(i)
|
114
|
+
beg = i.begin.respond_to?(:to_sym) ? i.to_sym : i
|
115
|
+
en = i.end.respond_to?(:to_sym) ? i.to_sym : i
|
116
|
+
super(beg..en)
|
676
117
|
else
|
677
|
-
|
678
|
-
@vectors[i]
|
118
|
+
super i.to_sym
|
679
119
|
end
|
680
120
|
end
|
681
|
-
# Retrieves a Statsample::Vector, based on the result
|
682
|
-
# of calculation performed on each case.
|
683
|
-
def collect(type=:numeric)
|
684
|
-
data=[]
|
685
|
-
each {|row|
|
686
|
-
data.push yield(row)
|
687
|
-
}
|
688
|
-
Statsample::Vector.new(data,type)
|
689
|
-
end
|
690
|
-
# Same as Statsample::Vector.collect, but giving case index as second parameter on yield.
|
691
|
-
def collect_with_index(type=:numeric)
|
692
|
-
data=[]
|
693
|
-
each_with_index {|row, i|
|
694
|
-
data.push(yield(row, i))
|
695
|
-
}
|
696
|
-
Statsample::Vector.new(data,type)
|
697
|
-
end
|
698
|
-
# Recode a vector based on a block
|
699
|
-
def recode!(vector_name)
|
700
|
-
0.upto(@cases-1) {|i|
|
701
|
-
@vectors[vector_name].data[i]=yield case_as_hash(i)
|
702
|
-
}
|
703
|
-
@vectors[vector_name].set_valid_data
|
704
|
-
end
|
705
121
|
|
706
|
-
def
|
707
|
-
|
708
|
-
|
709
|
-
|
710
|
-
if v.instance_of? Statsample::Vector
|
711
|
-
@vectors[i]=v
|
712
|
-
check_order
|
713
|
-
else
|
714
|
-
raise ArgumentError,"Should pass a Statsample::Vector"
|
715
|
-
end
|
716
|
-
end
|
717
|
-
# Return data as a matrix. Column are ordered by #fields and
|
718
|
-
# rows by orden of insertion
|
719
|
-
def to_matrix
|
720
|
-
rows=[]
|
721
|
-
self.each_array{|c|
|
722
|
-
rows.push(c)
|
723
|
-
}
|
724
|
-
Matrix.rows(rows)
|
122
|
+
def []=(i,v)
|
123
|
+
$stderr.puts "WARNING: Daru uses symbols instead of strings for naming vectors. Please switch to symbols.\n"
|
124
|
+
|
125
|
+
super i, v
|
725
126
|
end
|
726
127
|
|
727
128
|
if Statsample.has_gsl?
|
728
129
|
def clear_gsl
|
729
|
-
|
130
|
+
raise NoMethodError, "This method is no longer needed/supported."
|
730
131
|
end
|
731
|
-
|
732
|
-
def to_gsl
|
733
|
-
if @gsl.nil?
|
734
|
-
if cases.nil?
|
735
|
-
update_valid_data
|
736
|
-
end
|
737
|
-
@gsl=GSL::Matrix.alloc(cases,fields.size)
|
738
|
-
self.each_array{|c|
|
739
|
-
@gsl.set_row(@i,c)
|
740
|
-
}
|
741
|
-
end
|
742
|
-
@gsl
|
743
|
-
end
|
744
|
-
|
745
|
-
end
|
746
|
-
|
747
|
-
# Return a correlation matrix for fields included as parameters.
|
748
|
-
# By default, uses all fields of dataset
|
749
|
-
def correlation_matrix(fields = nil)
|
750
|
-
if fields
|
751
|
-
ds = clone(fields)
|
752
|
-
else
|
753
|
-
ds = self
|
754
|
-
end
|
755
|
-
Statsample::Bivariate.correlation_matrix(ds)
|
756
|
-
end
|
757
|
-
|
758
|
-
# Return a correlation matrix for fields included as parameters.
|
759
|
-
# By default, uses all fields of dataset
|
760
|
-
def covariance_matrix(fields = nil)
|
761
|
-
if fields
|
762
|
-
ds = clone(fields)
|
763
|
-
else
|
764
|
-
ds = self
|
765
|
-
end
|
766
|
-
Statsample::Bivariate.covariance_matrix(ds)
|
767
|
-
end
|
768
|
-
|
769
|
-
# Create a new dataset with all cases which the block returns true
|
770
|
-
def filter
|
771
|
-
ds=self.dup_empty
|
772
|
-
each {|c|
|
773
|
-
ds.add_case(c, false) if yield c
|
774
|
-
}
|
775
|
-
ds.update_valid_data
|
776
|
-
ds.name=_("%s(filtered)") % @name
|
777
|
-
ds
|
778
|
-
end
|
779
|
-
|
780
|
-
# creates a new vector with the data of a given field which the block returns true
|
781
|
-
def filter_field(field)
|
782
|
-
a=[]
|
783
|
-
each do |c|
|
784
|
-
a.push(c[field]) if yield c
|
785
|
-
end
|
786
|
-
a.to_vector(@vectors[field].type)
|
787
|
-
end
|
788
|
-
|
789
|
-
# Creates a Stastample::Multiset, using one or more fields
|
790
|
-
# to split the dataset.
|
791
|
-
|
792
|
-
|
793
|
-
def to_multiset_by_split(*fields)
|
794
|
-
require 'statsample/multiset'
|
795
|
-
if fields.size==1
|
796
|
-
to_multiset_by_split_one_field(fields[0])
|
797
|
-
else
|
798
|
-
to_multiset_by_split_multiple_fields(*fields)
|
799
|
-
end
|
800
|
-
end
|
801
|
-
# Creates a Statsample::Multiset, using one field
|
802
|
-
|
803
|
-
def to_multiset_by_split_one_field(field)
|
804
|
-
raise ArgumentError,"Should use a correct field name" if !@fields.include? field
|
805
|
-
factors=@vectors[field].factors
|
806
|
-
ms=Multiset.new_empty_vectors(@fields, factors)
|
807
|
-
each {|c|
|
808
|
-
ms[c[field]].add_case(c,false)
|
809
|
-
}
|
810
|
-
#puts "Ingreso a los dataset"
|
811
|
-
ms.datasets.each {|k,ds|
|
812
|
-
ds.update_valid_data
|
813
|
-
ds.name=@vectors[field].labeling(k)
|
814
|
-
ds.vectors.each{|k1,v1|
|
815
|
-
# puts "Vector #{k1}:"+v1.to_s
|
816
|
-
v1.type=@vectors[k1].type
|
817
|
-
v1.name=@vectors[k1].name
|
818
|
-
v1.labels=@vectors[k1].labels
|
819
|
-
|
820
|
-
}
|
821
|
-
}
|
822
|
-
ms
|
823
|
-
end
|
824
|
-
def to_multiset_by_split_multiple_fields(*fields)
|
825
|
-
factors_total=nil
|
826
|
-
fields.each do |f|
|
827
|
-
if factors_total.nil?
|
828
|
-
factors_total=@vectors[f].factors.collect{|c|
|
829
|
-
[c]
|
830
|
-
}
|
831
|
-
else
|
832
|
-
suma=[]
|
833
|
-
factors=@vectors[f].factors
|
834
|
-
factors_total.each{|f1| factors.each{|f2| suma.push(f1+[f2]) } }
|
835
|
-
factors_total=suma
|
836
|
-
end
|
837
|
-
end
|
838
|
-
ms=Multiset.new_empty_vectors(@fields,factors_total)
|
839
|
-
|
840
|
-
p1=eval "Proc.new {|c| ms[["+fields.collect{|f| "c['#{f}']"}.join(",")+"]].add_case(c,false) }"
|
841
|
-
each{|c| p1.call(c)}
|
842
|
-
|
843
|
-
ms.datasets.each do |k,ds|
|
844
|
-
ds.update_valid_data
|
845
|
-
ds.name=fields.size.times.map {|i|
|
846
|
-
f=fields[i]
|
847
|
-
sk=k[i]
|
848
|
-
@vectors[f].labeling(sk)
|
849
|
-
}.join("-")
|
850
|
-
ds.vectors.each{|k1,v1|
|
851
|
-
v1.type=@vectors[k1].type
|
852
|
-
v1.name=@vectors[k1].name
|
853
|
-
v1.labels=@vectors[k1].labels
|
854
|
-
|
855
|
-
}
|
856
|
-
end
|
857
|
-
ms
|
858
|
-
|
859
|
-
end
|
860
|
-
# Returns a vector, based on a string with a calculation based
|
861
|
-
# on vector
|
862
|
-
# The calculation will be eval'ed, so you can put any variable
|
863
|
-
# or expression valid on ruby
|
864
|
-
# For example:
|
865
|
-
# a=[1,2].to_vector(scale)
|
866
|
-
# b=[3,4].to_vector(scale)
|
867
|
-
# ds={'a'=>a,'b'=>b}.to_dataset
|
868
|
-
# ds.compute("a+b")
|
869
|
-
# => Vector [4,6]
|
870
|
-
def compute(text)
|
871
|
-
@fields.each{|f|
|
872
|
-
if @vectors[f].type=:numeric
|
873
|
-
text.gsub!(f,"row['#{f}'].to_f")
|
874
|
-
else
|
875
|
-
text.gsub!(f,"row['#{f}']")
|
876
|
-
end
|
877
|
-
}
|
878
|
-
collect_with_index {|row, i|
|
879
|
-
invalid=false
|
880
|
-
@fields.each{|f|
|
881
|
-
if @vectors[f].data_with_nils[i].nil?
|
882
|
-
invalid=true
|
883
|
-
end
|
884
|
-
}
|
885
|
-
if invalid
|
886
|
-
nil
|
887
|
-
else
|
888
|
-
eval(text)
|
889
|
-
end
|
890
|
-
}
|
891
|
-
end
|
892
|
-
# Test each row with one or more tests
|
893
|
-
# each test is a Proc with the form
|
894
|
-
# Proc.new {|row| row['age']>0}
|
895
|
-
# The function returns an array with all errors
|
896
|
-
def verify(*tests)
|
897
|
-
if(tests[0].is_a? String)
|
898
|
-
id=tests[0]
|
899
|
-
tests.shift
|
900
|
-
else
|
901
|
-
id=@fields[0]
|
902
|
-
end
|
903
|
-
vr=[]
|
904
|
-
i=0
|
905
|
-
each do |row|
|
906
|
-
i+=1
|
907
|
-
tests.each{|test|
|
908
|
-
if ! test[2].call(row)
|
909
|
-
values=""
|
910
|
-
if test[1].size>0
|
911
|
-
values=" ("+test[1].collect{|k| "#{k}=#{row[k]}"}.join(", ")+")"
|
912
|
-
end
|
913
|
-
vr.push("#{i} [#{row[id]}]: #{test[0]}#{values}")
|
914
|
-
end
|
915
|
-
}
|
916
|
-
end
|
917
|
-
vr
|
918
|
-
end
|
919
|
-
def to_s
|
920
|
-
"#<"+self.class.to_s+":"+self.object_id.to_s+" @name=#{@name} @fields=["+@fields.join(",")+"] cases="+@vectors[@fields[0]].size.to_s
|
921
|
-
end
|
922
|
-
def inspect
|
923
|
-
self.to_s
|
924
|
-
end
|
925
|
-
# Creates a new dataset for one to many relations
|
926
|
-
# on a dataset, based on pattern of field names.
|
927
|
-
#
|
928
|
-
# for example, you have a survey for number of children
|
929
|
-
# with this structure:
|
930
|
-
# id, name, child_name_1, child_age_1, child_name_2, child_age_2
|
931
|
-
# with
|
932
|
-
# ds.one_to_many(%w{id}, "child_%v_%n"
|
933
|
-
# the field of first parameters will be copied verbatim
|
934
|
-
# to new dataset, and fields which responds to second
|
935
|
-
# pattern will be added one case for each different %n.
|
936
|
-
# For example
|
937
|
-
# cases=[
|
938
|
-
# ['1','george','red',10,'blue',20,nil,nil],
|
939
|
-
# ['2','fred','green',15,'orange',30,'white',20],
|
940
|
-
# ['3','alfred',nil,nil,nil,nil,nil,nil]
|
941
|
-
# ]
|
942
|
-
# ds=Statsample::Dataset.new(%w{id name car_color1 car_value1 car_color2 car_value2 car_color3 car_value3})
|
943
|
-
# cases.each {|c| ds.add_case_array c }
|
944
|
-
# ds.one_to_many(['id'],'car_%v%n').to_matrix
|
945
|
-
# => Matrix[
|
946
|
-
# ["red", "1", 10],
|
947
|
-
# ["blue", "1", 20],
|
948
|
-
# ["green", "2", 15],
|
949
|
-
# ["orange", "2", 30],
|
950
|
-
# ["white", "2", 20]
|
951
|
-
# ]
|
952
|
-
#
|
953
|
-
def one_to_many(parent_fields, pattern)
|
954
|
-
#base_pattern=pattern.gsub(/%v|%n/,"")
|
955
|
-
re=Regexp.new pattern.gsub("%v","(.+?)").gsub("%n","(\\d+?)")
|
956
|
-
ds_vars=parent_fields
|
957
|
-
vars=[]
|
958
|
-
max_n=0
|
959
|
-
h=parent_fields.inject({}) {|a,v| a[v]=Statsample::Vector.new([], @vectors[v].type);a }
|
960
|
-
# Adding _row_id
|
961
|
-
h['_col_id']=[].to_numeric
|
962
|
-
ds_vars.push("_col_id")
|
963
|
-
@fields.each do |f|
|
964
|
-
if f=~re
|
965
|
-
if !vars.include? $1
|
966
|
-
vars.push($1)
|
967
|
-
h[$1]=Statsample::Vector.new([], @vectors[f].type)
|
968
|
-
end
|
969
|
-
max_n=$2.to_i if max_n < $2.to_i
|
970
|
-
end
|
971
|
-
end
|
972
|
-
ds=Dataset.new(h,ds_vars+vars)
|
973
|
-
each do |row|
|
974
|
-
row_out={}
|
975
|
-
parent_fields.each do |f|
|
976
|
-
row_out[f]=row[f]
|
977
|
-
end
|
978
|
-
max_n.times do |n1|
|
979
|
-
n=n1+1
|
980
|
-
any_data=false
|
981
|
-
vars.each do |v|
|
982
|
-
data=row[pattern.gsub("%v",v.to_s).gsub("%n",n.to_s)]
|
983
|
-
row_out[v]=data
|
984
|
-
any_data=true if !data.nil?
|
985
|
-
end
|
986
|
-
if any_data
|
987
|
-
row_out["_col_id"]=n
|
988
|
-
ds.add_case(row_out,false)
|
989
|
-
end
|
990
|
-
|
991
|
-
end
|
992
|
-
end
|
993
|
-
ds.update_valid_data
|
994
|
-
ds
|
995
|
-
end
|
996
|
-
def report_building(b)
|
997
|
-
b.section(:name=>@name) do |g|
|
998
|
-
g.text _"Cases: %d" % cases
|
999
|
-
@fields.each do |f|
|
1000
|
-
g.text "Element:[#{f}]"
|
1001
|
-
g.parse_element(@vectors[f])
|
1002
|
-
end
|
1003
|
-
end
|
1004
|
-
end
|
132
|
+
end
|
1005
133
|
end
|
1006
134
|
end
|