statsample-ekatena 2.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +15 -0
- data/.travis.yml +23 -0
- data/CONTRIBUTING.md +17 -0
- data/Gemfile +2 -0
- data/History.txt +457 -0
- data/LICENSE.txt +12 -0
- data/README.md +175 -0
- data/Rakefile +44 -0
- data/benchmarks/correlation_matrix_15_variables.rb +32 -0
- data/benchmarks/correlation_matrix_5_variables.rb +33 -0
- data/benchmarks/correlation_matrix_methods/correlation_matrix.ds +0 -0
- data/benchmarks/correlation_matrix_methods/correlation_matrix.html +93 -0
- data/benchmarks/correlation_matrix_methods/correlation_matrix.rb +71 -0
- data/benchmarks/correlation_matrix_methods/correlation_matrix.xls +0 -0
- data/benchmarks/correlation_matrix_methods/correlation_matrix_gsl_ruby.ods +0 -0
- data/benchmarks/correlation_matrix_methods/correlation_matrix_with_graphics.ods +0 -0
- data/benchmarks/correlation_matrix_methods/results.ds +0 -0
- data/benchmarks/factor_map.rb +37 -0
- data/benchmarks/helpers_benchmark.rb +5 -0
- data/data/locale/es/LC_MESSAGES/statsample.mo +0 -0
- data/doc_latex/manual/equations.tex +78 -0
- data/examples/boxplot.rb +28 -0
- data/examples/chisquare_test.rb +23 -0
- data/examples/correlation_matrix.rb +32 -0
- data/examples/dataset.rb +30 -0
- data/examples/dominance_analysis.rb +33 -0
- data/examples/dominance_analysis_bootstrap.rb +32 -0
- data/examples/histogram.rb +26 -0
- data/examples/icc.rb +24 -0
- data/examples/levene.rb +29 -0
- data/examples/multiple_regression.rb +20 -0
- data/examples/multivariate_correlation.rb +33 -0
- data/examples/parallel_analysis.rb +40 -0
- data/examples/polychoric.rb +40 -0
- data/examples/principal_axis.rb +26 -0
- data/examples/reliability.rb +31 -0
- data/examples/scatterplot.rb +25 -0
- data/examples/t_test.rb +27 -0
- data/examples/tetrachoric.rb +17 -0
- data/examples/u_test.rb +24 -0
- data/examples/vector.rb +20 -0
- data/examples/velicer_map_test.rb +46 -0
- data/grab_references.rb +29 -0
- data/lib/spss.rb +134 -0
- data/lib/statsample-ekatena/analysis.rb +100 -0
- data/lib/statsample-ekatena/analysis/suite.rb +89 -0
- data/lib/statsample-ekatena/analysis/suitereportbuilder.rb +44 -0
- data/lib/statsample-ekatena/anova.rb +24 -0
- data/lib/statsample-ekatena/anova/contrast.rb +79 -0
- data/lib/statsample-ekatena/anova/oneway.rb +187 -0
- data/lib/statsample-ekatena/anova/twoway.rb +207 -0
- data/lib/statsample-ekatena/bivariate.rb +406 -0
- data/lib/statsample-ekatena/bivariate/pearson.rb +54 -0
- data/lib/statsample-ekatena/codification.rb +182 -0
- data/lib/statsample-ekatena/converter/csv.rb +28 -0
- data/lib/statsample-ekatena/converter/spss.rb +48 -0
- data/lib/statsample-ekatena/converters.rb +211 -0
- data/lib/statsample-ekatena/crosstab.rb +188 -0
- data/lib/statsample-ekatena/daru.rb +115 -0
- data/lib/statsample-ekatena/dataset.rb +10 -0
- data/lib/statsample-ekatena/dominanceanalysis.rb +425 -0
- data/lib/statsample-ekatena/dominanceanalysis/bootstrap.rb +232 -0
- data/lib/statsample-ekatena/factor.rb +104 -0
- data/lib/statsample-ekatena/factor/map.rb +124 -0
- data/lib/statsample-ekatena/factor/parallelanalysis.rb +166 -0
- data/lib/statsample-ekatena/factor/pca.rb +242 -0
- data/lib/statsample-ekatena/factor/principalaxis.rb +243 -0
- data/lib/statsample-ekatena/factor/rotation.rb +198 -0
- data/lib/statsample-ekatena/formula/fit_model.rb +46 -0
- data/lib/statsample-ekatena/formula/formula.rb +306 -0
- data/lib/statsample-ekatena/graph.rb +11 -0
- data/lib/statsample-ekatena/graph/boxplot.rb +236 -0
- data/lib/statsample-ekatena/graph/histogram.rb +198 -0
- data/lib/statsample-ekatena/graph/scatterplot.rb +213 -0
- data/lib/statsample-ekatena/histogram.rb +180 -0
- data/lib/statsample-ekatena/matrix.rb +329 -0
- data/lib/statsample-ekatena/multiset.rb +310 -0
- data/lib/statsample-ekatena/regression.rb +65 -0
- data/lib/statsample-ekatena/regression/multiple.rb +89 -0
- data/lib/statsample-ekatena/regression/multiple/alglibengine.rb +128 -0
- data/lib/statsample-ekatena/regression/multiple/baseengine.rb +251 -0
- data/lib/statsample-ekatena/regression/multiple/gslengine.rb +129 -0
- data/lib/statsample-ekatena/regression/multiple/matrixengine.rb +205 -0
- data/lib/statsample-ekatena/regression/multiple/rubyengine.rb +86 -0
- data/lib/statsample-ekatena/regression/simple.rb +121 -0
- data/lib/statsample-ekatena/reliability.rb +150 -0
- data/lib/statsample-ekatena/reliability/icc.rb +415 -0
- data/lib/statsample-ekatena/reliability/multiscaleanalysis.rb +181 -0
- data/lib/statsample-ekatena/reliability/scaleanalysis.rb +233 -0
- data/lib/statsample-ekatena/reliability/skillscaleanalysis.rb +114 -0
- data/lib/statsample-ekatena/resample.rb +15 -0
- data/lib/statsample-ekatena/shorthand.rb +125 -0
- data/lib/statsample-ekatena/srs.rb +169 -0
- data/lib/statsample-ekatena/test.rb +82 -0
- data/lib/statsample-ekatena/test/bartlettsphericity.rb +45 -0
- data/lib/statsample-ekatena/test/chisquare.rb +73 -0
- data/lib/statsample-ekatena/test/f.rb +52 -0
- data/lib/statsample-ekatena/test/kolmogorovsmirnov.rb +63 -0
- data/lib/statsample-ekatena/test/levene.rb +88 -0
- data/lib/statsample-ekatena/test/t.rb +309 -0
- data/lib/statsample-ekatena/test/umannwhitney.rb +208 -0
- data/lib/statsample-ekatena/test/wilcoxonsignedrank.rb +90 -0
- data/lib/statsample-ekatena/vector.rb +19 -0
- data/lib/statsample-ekatena/version.rb +3 -0
- data/lib/statsample.rb +282 -0
- data/po/es/statsample.mo +0 -0
- data/po/es/statsample.po +959 -0
- data/po/statsample.pot +947 -0
- data/references.txt +24 -0
- data/statsample-ekatena.gemspec +49 -0
- data/test/fixtures/bank2.dat +200 -0
- data/test/fixtures/correlation_matrix.rb +17 -0
- data/test/fixtures/df.csv +15 -0
- data/test/fixtures/hartman_23.matrix +9 -0
- data/test/fixtures/stock_data.csv +500 -0
- data/test/fixtures/tetmat_matrix.txt +5 -0
- data/test/fixtures/tetmat_test.txt +1001 -0
- data/test/helpers_tests.rb +83 -0
- data/test/test_analysis.rb +176 -0
- data/test/test_anova_contrast.rb +36 -0
- data/test/test_anovaoneway.rb +26 -0
- data/test/test_anovatwoway.rb +37 -0
- data/test/test_anovatwowaywithdataset.rb +47 -0
- data/test/test_anovawithvectors.rb +102 -0
- data/test/test_awesome_print_bug.rb +16 -0
- data/test/test_bartlettsphericity.rb +25 -0
- data/test/test_bivariate.rb +164 -0
- data/test/test_codification.rb +78 -0
- data/test/test_crosstab.rb +67 -0
- data/test/test_dominance_analysis.rb +39 -0
- data/test/test_factor.rb +228 -0
- data/test/test_factor_map.rb +38 -0
- data/test/test_factor_pa.rb +56 -0
- data/test/test_fit_model.rb +88 -0
- data/test/test_ggobi.rb +35 -0
- data/test/test_gsl.rb +15 -0
- data/test/test_histogram.rb +109 -0
- data/test/test_matrix.rb +48 -0
- data/test/test_multiset.rb +176 -0
- data/test/test_regression.rb +231 -0
- data/test/test_reliability.rb +223 -0
- data/test/test_reliability_icc.rb +198 -0
- data/test/test_reliability_skillscale.rb +57 -0
- data/test/test_resample.rb +24 -0
- data/test/test_srs.rb +9 -0
- data/test/test_statistics.rb +69 -0
- data/test/test_stest.rb +69 -0
- data/test/test_stratified.rb +17 -0
- data/test/test_test_f.rb +33 -0
- data/test/test_test_kolmogorovsmirnov.rb +34 -0
- data/test/test_test_t.rb +62 -0
- data/test/test_umannwhitney.rb +27 -0
- data/test/test_vector.rb +12 -0
- data/test/test_wilcoxonsignedrank.rb +64 -0
- metadata +570 -0
|
@@ -0,0 +1,54 @@
|
|
|
1
|
+
module Statsample
  module Bivariate
    # = Pearson correlation coefficient (r)
    #
    # The product-moment Pearson correlation coefficient, known as 'r',
    # is a measure of bivariate association between two continuous
    # variables.
    #
    # == Usage
    #   a = Daru::Vector.new([1,2,3,4,5,6])
    #   b = Daru::Vector.new([2,3,4,5,6,7])
    #   pearson = Statsample::Bivariate::Pearson.new(a,b)
    #   puts pearson.r
    #   puts pearson.t
    #   puts pearson.probability
    #   puts pearson.summary
    #
    class Pearson

      include Statsample::Test
      include Summarizable
      # Name of correlation
      attr_accessor :name
      # Tails for probability (:both, :left or :right)
      attr_accessor :tails
      # Number of valid (pairwise-complete) cases
      attr_accessor :n
      # Build a Pearson correlation object from two vectors.
      # Invalid (missing) pairs are removed via Statsample.only_valid_clone.
      # Options: :name (report label), :tails (:both, :left or :right).
      def initialize(v1, v2, opts=Hash.new)
        @v1_name, @v2_name = v1.name, v2.name
        @v1, @v2 = Statsample.only_valid_clone(v1, v2)
        @n = @v1.size
        opts_default = {
          :name  => _("Correlation (%s - %s)") % [@v1_name, @v2_name],
          :tails => :both
        }
        # FIX: merge user options *over* the defaults. The original
        # `opts.merge(opts_default)` let the defaults clobber any :name
        # or :tails supplied by the caller.
        @opts = opts_default.merge(opts)
        @opts.each { |k, v|
          self.send("#{k}=", v) if self.respond_to? k
        }
      end
      # Pearson's r for the two valid-only vectors.
      def r
        Statsample::Bivariate.pearson(@v1, @v2)
      end
      # t statistic associated with r (df = n - 2).
      def t
        Statsample::Bivariate.t_pearson(@v1, @v2)
      end
      # Probability of the t statistic under Student's t with n-2 df,
      # adjusted for the configured +tails+.
      def probability
        p_using_cdf(Distribution::T.cdf(t, @v1.size - 2), tails)
      end
      # Append a one-line textual summary to a ReportBuilder object.
      def report_building(builder)
        builder.text(_("%s : r=%0.3f (t:%0.3f, g.l.=%d, p:%0.3f / %s tails)") % [@name, r, t, (n - 2), probability, tails])
      end
    end
  end
end
|
@@ -0,0 +1,182 @@
|
|
|
1
|
+
require 'yaml'

module Statsample
  # This module aids in coding open questions:
  # * Select one or more vectors of a dataset to create a YAML file, in which
  #   each vector is a hash whose keys and values are the vector's factors.
  #   If data contain Statsample::SPLIT_TOKEN in a value, the value is split
  #   into two or more hash keys.
  # * Edit the YAML and replace the hash values with your codes. If you need
  #   two or more codes for an answer, use the separator
  #   (default Statsample::SPLIT_TOKEN).
  # * Recode the vectors, loading the YAML file with:
  #   * recode_dataset_simple!() : new vectors keep the original name plus "_recoded"
  #   * recode_dataset_split!()  : creates as many vectors as values.
  #     See Vector.add_vectors_by_split() for arguments.
  #
  # Usage:
  #   recode_file = "recodification.yaml"
  #   phase = :first # flag
  #   if phase == :first
  #     File.open(recode_file,"w") {|fp|
  #       Statsample::Codification.create_yaml(ds,%w{vector1 vector2}, ",",fp)
  #     }
  #     # Edit the file recodification.yaml and verify changes
  #   elsif phase == :second
  #     File.open(recode_file,"r") {|fp|
  #       Statsample::Codification.verify(fp,['vector1'])
  #     }
  #     # Add new vectors to the dataset
  #   elsif phase == :third
  #     File.open(recode_file,"r") {|fp|
  #       Statsample::Codification.recode_dataset_split!(ds,fp,"*")
  #     }
  #   end
  #
  module Codification
    class << self
      # Create a hash, based on vectors, to create the dictionary.
      # The keys will be vector names on the dataset and the values
      # will be hashes, with keys = values, for recodification.
      def create_hash(dataset, vectors, sep=Statsample::SPLIT_TOKEN)
        raise ArgumentError, "Array should't be empty" if vectors.size == 0
        pro_hash = vectors.inject({}) do |h, v_name|
          v_name = v_name.is_a?(Numeric) ? v_name : v_name.to_sym
          raise Exception, "Vector #{v_name} doesn't exists on Dataset" if
            !dataset.vectors.include?(v_name)
          v = dataset[v_name]
          split_data = v.splitted(sep)
                        .flatten
                        .collect { |c| c.to_s }
                        .find_all { |c| !c.nil? }

          # Identity mapping (value => value) the user will later edit.
          factors = split_data.uniq
                              .compact
                              .sort
                              .inject({}) { |ac, val| ac[val] = val; ac }
          h[v_name] = factors
          h
        end

        pro_hash
      end
      # Create a YAML to create a dictionary, based on vectors.
      # The keys will be vector names on the dataset and the values
      # will be hashes, with keys = values, for recodification.
      #
      #   v1 = Daru::Vector.new(%w{a,b b,c d})
      #   ds = Daru::DataFrame.new({:v1 => v1})
      #   Statsample::Codification.create_yaml(ds,[:v1])
      #   => "--- \nv1: \n a: a\n b: b\n c: c\n d: d\n"
      def create_yaml(dataset, vectors, io=nil, sep=Statsample::SPLIT_TOKEN)
        pro_hash = create_hash(dataset, vectors, sep)
        YAML.dump(pro_hash, io)
      end
      # Create an Excel file to create a dictionary, based on vectors.
      # Raises an error if filename exists.
      # The rows will be:
      # * field: name of vector
      # * original: original name
      # * recoded: new code
      def create_excel(dataset, vectors, filename, sep=Statsample::SPLIT_TOKEN)
        require 'spreadsheet'
        if File.exist?(filename)
          # FIX: the message previously interpolated a garbled "#(unknown)"
          # placeholder instead of the filename (and read "Delete ir").
          raise "Exists a file named #{filename}. Delete it before overwrite."
        end
        book = Spreadsheet::Workbook.new
        sheet = book.create_worksheet
        sheet.row(0).concat(%w(field original recoded))
        i = 1
        create_hash(dataset, vectors, sep).sort.each do |field, inner_hash|
          inner_hash.sort.each do |k, v|
            sheet.row(i).concat([field.to_s, k.to_s, v.to_s])
            i += 1
          end
        end

        book.write(filename)
      end
      # From an Excel file generates a dictionary hash
      # to use on recode_dataset_simple!() or recode_dataset_split!().
      # Skips the header row and any row with a blank cell.
      def excel_to_recoded_hash(filename)
        require 'spreadsheet'
        h = {}
        book = Spreadsheet.open filename
        sheet = book.worksheet 0
        row_i = 0
        sheet.each do |row|
          row_i += 1
          next if row_i == 1 or row[0].nil? or row[1].nil? or row[2].nil?
          key = row[0].to_sym
          h[key] ||= {}
          h[key][row[1]] = row[2]
        end
        h
      end

      # Invert a recodification hash: maps each code (from the value side,
      # split by +sep+) to the array of original keys that produce it.
      def inverse_hash(h, sep=Statsample::SPLIT_TOKEN)
        h.inject({}) do |a, v|
          v[1].split(sep).each do |val|
            a[val] ||= []
            a[val].push(v[0])
          end
          a
        end
      end

      # Maps each key of +h+ to the array of codes in its value,
      # split by +sep+.
      def dictionary(h, sep=Statsample::SPLIT_TOKEN)
        h.inject({}) { |a, v| a[v[0]] = v[1].split(sep); a }
      end

      # Recode vector +v+ using dictionary hash +h+.
      # Returns an array of arrays of unique codes (nil kept as nil).
      def recode_vector(v, h, sep=Statsample::SPLIT_TOKEN)
        dict = dictionary(h, sep)
        new_data = v.splitted(sep)
        new_data.collect do |c|
          if c.nil?
            nil
          else
            c.collect { |value| dict[value] }.flatten.uniq
          end
        end
      end
      # Add recoded vectors named "<name>_recoded" to +dataset+.
      def recode_dataset_simple!(dataset, dictionary_hash, sep=Statsample::SPLIT_TOKEN)
        _recode_dataset(dataset, dictionary_hash, sep, false)
      end
      # Add one vector per code value to +dataset+ (split mode).
      def recode_dataset_split!(dataset, dictionary_hash, sep=Statsample::SPLIT_TOKEN)
        _recode_dataset(dataset, dictionary_hash, sep, true)
      end

      # Shared implementation of recode_dataset_simple!/recode_dataset_split!.
      def _recode_dataset(dataset, h, sep=Statsample::SPLIT_TOKEN, split=false)
        v_names ||= h.keys
        v_names.each do |v_name|
          raise Exception, "Vector #{v_name} doesn't exists on Dataset" if !dataset.vectors.include? v_name
          recoded = Daru::Vector.new(
            recode_vector(dataset[v_name], h[v_name], sep).collect do |c|
              if c.nil?
                nil
              else
                c.join(sep)
              end
            end
          )
          if split
            recoded.split_by_separator(sep).each { |k, v|
              dataset[(v_name.to_s + "_" + k).to_sym] = v
            }
          else
            dataset[(v_name.to_s + "_recoded").to_sym] = recoded
          end
        end
      end


      # Print to +io+ a per-field report of each code and the original
      # answers that map to it, sorted by frequency (descending).
      def verify(h, v_names=nil, sep=Statsample::SPLIT_TOKEN, io=$>)
        require 'pp'
        v_names ||= h.keys
        v_names.each { |v_name|
          inverse = inverse_hash(h[v_name], sep)
          io.puts "- Field: #{v_name}"
          inverse.sort { |a, b| -(a[1].count <=> b[1].count) }.each { |k, v|
            io.puts " - \"#{k}\" (#{v.count}) :\n -'" + v.join("\n -'") + "'"
          }
        }
      end
    end
  end
end
|
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
# This module will be removed in the next release.
# Please shift to using Daru::DataFrame.from_csv and #write_csv for CSV
# related operations.
module Statsample
  # Deprecated CSV shim: both entry points only raise, directing callers
  # to the Daru::DataFrame equivalents.
  class CSV
    # Formerly returned a Dataset created from a CSV file.
    #
    # == NOTE
    #
    # This method has been DEPRECATED in favour of Daru::DataFrame.from_csv.
    # Please switch to using that. Always raises NoMethodError.
    def self.read(filename, empty = [''], ignore_lines = 0, opts = {})
      raise NoMethodError, "Deprecated. Use Daru::DataFrame.from_csv instead."
    end

    # Formerly saved a Dataset to a CSV file.
    #
    # == NOTE
    #
    # This method has BEEN DEPRECATED in favor of Daru::DataFrame#write_csv.
    # Please use that instead. Always raises NoMethodError.
    def self.write(dataset, filename, convert_comma = false, opts = {})
      raise NoMethodError, "Deprecated. Use Daru::DataFrame#write_csv instead."
    end
  end
end
|
|
@@ -0,0 +1,48 @@
|
|
|
1
|
+
module Statsample
  module SPSS
    class << self
      # Export a SPSS Matrix with tetrachoric correlations.
      #
      # Use:
      #   ds=Daru::DataFrame.from_excel("my_data.xls")
      #   puts Statsample::SPSS.tetrachoric_correlation_matrix(ds)
      def tetrachoric_correlation_matrix(ds)
        dsv = ds.reject_values(*Daru::MISSING_VALUES)
        # Drop vectors with no variation; dichotomize the rest.
        # NOTE(review): this deletes vectors while iterating dsv.vectors —
        # confirm Daru tolerates mutation during iteration here.
        dsv.vectors.each do |f|
          if dsv[f].factors.size == 1
            dsv.delete_vector(f)
          else
            dsv[f] = dsv[f].dichotomize
          end
        end

        tcm = Statsample::Bivariate.tetrachoric_correlation_matrix(dsv)
        # Per-variable N, mean and sd rows for the MATRIX DATA header.
        names      = dsv.vectors.to_a
        n          = names.collect { |f| sprintf("%d", dsv[f].size) }
        meanlist   = names.collect { |f| sprintf("%0.3f", dsv[f].mean) }
        stddevlist = names.collect { |f| sprintf("%0.3f", dsv[f].sd) }
        out = <<-HEREDOC
MATRIX DATA VARIABLES=ROWTYPE_ #{dsv.fields.join(",")}.
BEGIN DATA
N #{n.join(" ")}
MEAN #{meanlist.join(" ")}
STDDEV #{stddevlist.join(" ")}
        HEREDOC
        # Emit the lower triangle of the correlation matrix, row by row.
        tcm.row_size.times do |i|
          out += "CORR "
          (i + 1).times { |j| out += sprintf("%0.3f", tcm[i, j]) + " " }
          out += "\n"
        end
        out += "END DATA.\nEXECUTE.\n"
      end
    end
  end
end
|
|
@@ -0,0 +1,211 @@
|
|
|
1
|
+
require 'statsample/converter/spss'
|
|
2
|
+
module Statsample
  # Create and dumps Datasets on a database
  #
  # == NOTE
  #
  # Deprecated. Use Daru::DataFrame.from_sql and Daru::DataFrame#write_sql
  module Database
    class << self
      # Read a database query and returns a Dataset
      #
      # == NOTE
      #
      # Deprecated. Use Daru::DataFrame.from_sql instead. Always raises.
      def read(dbh, query)
        raise NoMethodError, "Deprecated. Use Daru::DataFrame.from_sql instead."
      end

      # Insert each case of the Dataset on the selected table
      #
      # == NOTE
      #
      # Deprecated. Use Daru::DataFrame#write_sql instead. Always raises.
      def insert(ds, dbh, table)
        raise NoMethodError, "Deprecated. Use Daru::DataFrame#write_sql instead."
      end
      # Create a sql, based on a given Dataset
      #
      # == NOTE
      #
      # Deprecated. Use Daru::DataFrame#create_sql instead. Always raises.
      def create_sql(ds, table, charset="UTF8")
        raise NoMethodError, "Deprecated. Use Daru::DataFrame#create_sql instead."
      end
    end
  end

  # Write a dataset as a tab-separated file readable by Mondrian:
  # nil values become "NA", internal whitespace becomes "_".
  module Mondrian
    class << self
      def write(dataset, filename)
        File.open(filename, "wb") do |fp|
          fp.puts dataset.vectors.to_a.join("\t")
          dataset.each_row do |row|
            row2 = row.map { |v| v.nil? ? "NA" : v.to_s.gsub(/\s+/, "_") }
            fp.puts row2.join("\t")
          end
        end
      end
    end
  end

  # Deprecated plain-text reader shim.
  class PlainText
    class << self
      # Deprecated. Use Daru::DataFrame.from_plaintext instead. Always raises.
      def read(filename, fields)
        raise NoMethodError, "Deprecated. Use Daru::DataFrame.from_plaintext instead."
      end
    end
  end

  # This class has been DEPRECATED. Use Daru::DataFrame::from_excel
  # Daru::DataFrame#write_excel for XLS file operations.
  class Excel
    class << self
      # Write a Excel spreadsheet based on a dataset
      # * TODO: Format nicely date values
      #
      # == NOTE
      #
      # Deprecated. Use Daru::DataFrame#write_excel. Always raises.
      def write(dataset, filename)
        raise NoMethodError, "Deprecated. Use Daru::DataFrame#write_excel instead."
      end

      # Returns a dataset based on a xls file
      #
      # == NOTE
      #
      # Deprecated. Use Daru::DataFrame.from_excel instead. Always raises.
      def read(filename, opts=Hash.new)
        raise NoMethodError, "Deprecated. Use Daru::DataFrame.from_excel instead."
      end
    end
  end

  # Export a dataset to the Mx statistical-modelling data format,
  # either as raw rectangular data or as a covariance matrix.
  module Mx
    class << self
      def write(dataset, filename, type=:covariance)
        puts "Writing MX File"
        File.open(filename, "w") do |fp|
          # FIX: the header line previously wrote a garbled "#(unknown)"
          # placeholder; interpolate the output filename instead.
          fp.puts "! #{filename}"
          fp.puts "! Output generated by Statsample"
          fp.puts "Data Ninput=#{dataset.fields.size} Nobservations=#{dataset.cases}"
          fp.puts "Labels " + dataset.vectors.to_a.join(" ")
          case type
          when :raw
            fp.puts "Rectangular"
            dataset.each do |row|
              # Missing values are written as "." per Mx convention.
              out = dataset.vectors.to_a.collect do |f|
                dataset[f].is_valid?(row[f]) ? row[f] : "."
              end
              fp.puts out.join("\t")
            end
            fp.puts "End Rectangular"
          when :covariance
            fp.puts " CMatrix Full"
            cm = Statsample::Bivariate.covariance_matrix(dataset)
            d = (0...(cm.row_size)).collect { |row|
              (0...(cm.column_size)).collect { |col|
                cm[row, col].nil? ? "." : sprintf("%0.3f", cm[row, col])
              }.join(" ")
            }.join("\n")
            fp.puts d
          end
        end
      end
    end
  end

  # Export a dataset to GGobi's XML data format.
  module GGobi
    class << self
      # Write the GGobi XML representation of +dataset+ to +filename+.
      def write(dataset, filename, opt={})
        File.open(filename, "w") { |fp|
          fp.write(self.out(dataset, opt))
        }
      end
      # Build the GGobi XML document as a String.
      # Options: :dataname, :description, :missing (placeholder for nils).
      def out(dataset, opt={})
        require 'ostruct'
        default_opt = { :dataname => "Default", :description => "", :missing => "NA" }
        default_opt.merge! opt
        # Carrier collects which variables are categorical and the
        # value -> level-number conversions built by variable_definition.
        carrier = OpenStruct.new
        carrier.categorials = []
        carrier.conversions = {}
        variables_def = dataset.vectors.to_a.collect { |k|
          variable_definition(carrier, dataset[k], k)
        }.join("\n")

        # Map column index -> categorical variable name.
        indexes = carrier.categorials.inject({}) { |s, c|
          s[dataset.vectors.to_a.index(c)] = c
          s
        }
        records = ""
        dataset.each_row { |c|
          # Replace categorical values with their numeric level codes.
          indexes.each { |ik, iv|
            c[ik] = carrier.conversions[iv][c[ik]]
          }
          records << "<record>#{values_definition(c, default_opt[:missing])}</record>\n"
        }

        out = <<EOC
<?xml version="1.0"?>
<!DOCTYPE ggobidata SYSTEM "ggobi.dtd">
<ggobidata count="1">
<data name="#{default_opt[:dataname]}">
<description>#{default_opt[:description]}</description>
<variables count="#{dataset.fields.size}">
#{variables_def}
</variables>
<records count="#{dataset.cases}" missingValue="#{default_opt[:missing]}">
#{records}
</records>

</data>
</ggobidata>
EOC

        out

      end
      # Render one record's values: nil -> missing placeholder,
      # numbers as-is, strings with whitespace collapsed to "_".
      def values_definition(c, missing)
        c.collect { |v|
          if v.nil?
            "#{missing}"
          elsif v.is_a? Numeric
            "#{v}"
          else
            "#{v.gsub(/\s+/, "_")}"
          end
        }.join(" ")
      end
      # Outputs a string for a variable definition
      # v = vector
      # name = name of the variable
      # nickname = nickname
      def variable_definition(carrier, v, name, nickname=nil)
        nickname = (nickname.nil? ? "" : "nickname=\"#{nickname}\"")
        if v.type == :object or v.to_a.find { |d| d.is_a? String }
          carrier.categorials.push(name)
          carrier.conversions[name] = {}
          factors = v.factors
          out = "<categoricalvariable name=\"#{name}\" #{nickname}>\n"
          out << "<levels count=\"#{factors.size}\">\n"
          out << (1..factors.size).to_a.collect { |i|
            carrier.conversions[name][factors[i - 1]] = i
            "<level value=\"#{i}\">#{(v.labels[factors[i - 1]] || factors[i - 1])}</level>"
          }.join("\n")
          out << "</levels>\n</categoricalvariable>\n"
          out
        elsif v.to_a.find { |d| d.is_a? Float }
          "<realvariable name=\"#{name}\" #{nickname} />"
        else
          "<integervariable name=\"#{name}\" #{nickname} />"
        end
      end
    end
  end
end
|
|
209
|
+
|
|
210
|
+
require 'statsample/converter/csv.rb'
|
|
211
|
+
|