statsample 0.6.5 → 0.6.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/History.txt +15 -0
- data/Manifest.txt +6 -0
- data/README.txt +30 -12
- data/Rakefile +91 -0
- data/demo/levene.rb +9 -0
- data/demo/multiple_regression.rb +1 -7
- data/demo/polychoric.rb +1 -0
- data/demo/principal_axis.rb +8 -0
- data/lib/distribution/f.rb +22 -22
- data/lib/spss.rb +99 -99
- data/lib/statsample/bivariate/polychoric.rb +32 -22
- data/lib/statsample/bivariate/tetrachoric.rb +212 -207
- data/lib/statsample/bivariate.rb +6 -6
- data/lib/statsample/codification.rb +65 -65
- data/lib/statsample/combination.rb +60 -59
- data/lib/statsample/converter/csv19.rb +12 -12
- data/lib/statsample/converters.rb +1 -1
- data/lib/statsample/dataset.rb +93 -36
- data/lib/statsample/dominanceanalysis/bootstrap.rb +66 -3
- data/lib/statsample/dominanceanalysis.rb +5 -6
- data/lib/statsample/factor/pca.rb +41 -11
- data/lib/statsample/factor/principalaxis.rb +105 -29
- data/lib/statsample/factor/rotation.rb +20 -3
- data/lib/statsample/factor.rb +1 -1
- data/lib/statsample/graph/gdchart.rb +13 -13
- data/lib/statsample/graph/svggraph.rb +166 -167
- data/lib/statsample/matrix.rb +22 -12
- data/lib/statsample/mle/logit.rb +3 -2
- data/lib/statsample/mle/probit.rb +7 -5
- data/lib/statsample/mle.rb +4 -2
- data/lib/statsample/multiset.rb +125 -124
- data/lib/statsample/permutation.rb +2 -1
- data/lib/statsample/regression/binomial/logit.rb +4 -3
- data/lib/statsample/regression/binomial/probit.rb +2 -1
- data/lib/statsample/regression/binomial.rb +62 -81
- data/lib/statsample/regression/multiple/baseengine.rb +1 -1
- data/lib/statsample/regression/multiple/gslengine.rb +1 -1
- data/lib/statsample/regression/multiple/matrixengine.rb +12 -6
- data/lib/statsample/regression/multiple.rb +15 -42
- data/lib/statsample/regression/simple.rb +93 -78
- data/lib/statsample/regression.rb +74 -2
- data/lib/statsample/reliability.rb +117 -120
- data/lib/statsample/srs.rb +156 -153
- data/lib/statsample/test/levene.rb +90 -0
- data/lib/statsample/test/umannwhitney.rb +25 -9
- data/lib/statsample/test.rb +2 -0
- data/lib/statsample/vector.rb +388 -413
- data/lib/statsample.rb +74 -30
- data/po/es/statsample.mo +0 -0
- data/test/test_bivariate.rb +5 -4
- data/test/test_combination.rb +1 -1
- data/test/test_dataset.rb +2 -2
- data/test/test_factor.rb +53 -6
- data/test/test_gsl.rb +1 -1
- data/test/test_mle.rb +1 -1
- data/test/test_regression.rb +18 -33
- data/test/test_statistics.rb +15 -33
- data/test/test_stest.rb +35 -0
- data/test/test_svg_graph.rb +2 -2
- data/test/test_vector.rb +331 -333
- metadata +38 -11
@@ -1,32 +1,32 @@
|
|
1
1
|
require 'yaml'
|
2
2
|
|
3
3
|
module Statsample
|
4
|
-
# This module aids to code open questions
|
5
|
-
# * Select one or more vectors of a dataset, to create a yaml files, on which each vector is a hash, which keys and values are the vector's factors . If data have Statsample::SPLIT_TOKEN on a value, each value will be separated on two or more hash keys.
|
6
|
-
# * Edit the yaml and replace the values of hashes with your codes. If you need to create two or mores codes for an answer, use the separator (default Statsample::SPLIT_TOKEN)
|
7
|
-
# * Recode the vectors, loading the yaml file:
|
8
|
-
# * recode_dataset_simple!() : The new vectors have the same name of the original plus "_recoded"
|
9
|
-
# * recode_dataset_split!() : Create equal number of vectors as values. See Vector.add_vectors_by_split() for arguments
|
10
|
-
#
|
11
|
-
# Usage:
|
12
|
-
# recode_file="recodification.yaml"
|
13
|
-
# phase=:first # flag
|
14
|
-
# if phase==:first
|
15
|
-
# File.open(recode_file,"w") {|fp|
|
16
|
-
# Statsample::Codification.create_yaml(ds,%w{vector1 vector2}, ",",fp)
|
17
|
-
# }
|
18
|
-
# # Edit the file recodification.yaml and verify changes
|
19
|
-
# elsif phase==:second
|
20
|
-
# File.open(recode_file,"r") {|fp|
|
21
|
-
# Statsample::Codification.verify(fp,['vector1'])
|
22
|
-
# }
|
23
|
-
# # Add new vectors to the dataset
|
24
|
-
# elsif phase==:third
|
25
|
-
# File.open(recode_file,"r") {|fp|
|
26
|
-
# Statsample::Codification.recode_dataset_split!(ds,fp,"*")
|
27
|
-
# }
|
28
|
-
# end
|
29
|
-
#
|
4
|
+
# This module aids to code open questions
|
5
|
+
# * Select one or more vectors of a dataset, to create a yaml files, on which each vector is a hash, which keys and values are the vector's factors . If data have Statsample::SPLIT_TOKEN on a value, each value will be separated on two or more hash keys.
|
6
|
+
# * Edit the yaml and replace the values of hashes with your codes. If you need to create two or mores codes for an answer, use the separator (default Statsample::SPLIT_TOKEN)
|
7
|
+
# * Recode the vectors, loading the yaml file:
|
8
|
+
# * recode_dataset_simple!() : The new vectors have the same name of the original plus "_recoded"
|
9
|
+
# * recode_dataset_split!() : Create equal number of vectors as values. See Vector.add_vectors_by_split() for arguments
|
10
|
+
#
|
11
|
+
# Usage:
|
12
|
+
# recode_file="recodification.yaml"
|
13
|
+
# phase=:first # flag
|
14
|
+
# if phase==:first
|
15
|
+
# File.open(recode_file,"w") {|fp|
|
16
|
+
# Statsample::Codification.create_yaml(ds,%w{vector1 vector2}, ",",fp)
|
17
|
+
# }
|
18
|
+
# # Edit the file recodification.yaml and verify changes
|
19
|
+
# elsif phase==:second
|
20
|
+
# File.open(recode_file,"r") {|fp|
|
21
|
+
# Statsample::Codification.verify(fp,['vector1'])
|
22
|
+
# }
|
23
|
+
# # Add new vectors to the dataset
|
24
|
+
# elsif phase==:third
|
25
|
+
# File.open(recode_file,"r") {|fp|
|
26
|
+
# Statsample::Codification.recode_dataset_split!(ds,fp,"*")
|
27
|
+
# }
|
28
|
+
# end
|
29
|
+
#
|
30
30
|
module Codification
|
31
31
|
class << self
|
32
32
|
# Create a hash, based on vectors, to create the dictionary.
|
@@ -38,7 +38,7 @@ module Statsample
|
|
38
38
|
raise Exception, "Vector #{v_name} doesn't exists on Dataset" if !dataset.fields.include? v_name
|
39
39
|
v=dataset[v_name]
|
40
40
|
split_data=v.splitted(sep).flatten.collect {|c| c.to_s}.find_all {|c| !c.nil?}
|
41
|
-
|
41
|
+
|
42
42
|
factors=split_data.uniq.compact.sort.inject({}) {|ac,val| ac[val]=val;ac }
|
43
43
|
h[v_name]=factors
|
44
44
|
h
|
@@ -48,7 +48,7 @@ module Statsample
|
|
48
48
|
# Create a yaml to create a dictionary, based on vectors
|
49
49
|
# The keys will be vectors name on dataset and the values
|
50
50
|
# will be hashes, with keys = values, for recodification
|
51
|
-
#
|
51
|
+
#
|
52
52
|
# v1=%w{a,b b,c d}.to_vector
|
53
53
|
# ds={"v1"=>v1}.to_dataset
|
54
54
|
# Statsample::Codification.create_yaml(ds,['v1'])
|
@@ -63,7 +63,7 @@ module Statsample
|
|
63
63
|
# * field: name of vector
|
64
64
|
# * original: original name
|
65
65
|
# * recoded: new code
|
66
|
-
|
66
|
+
|
67
67
|
def create_excel(dataset, vectors, filename, sep=Statsample::SPLIT_TOKEN)
|
68
68
|
require 'spreadsheet'
|
69
69
|
if File.exists?(filename)
|
@@ -98,7 +98,7 @@ module Statsample
|
|
98
98
|
end
|
99
99
|
h
|
100
100
|
end
|
101
|
-
|
101
|
+
|
102
102
|
def inverse_hash(h, sep=Statsample::SPLIT_TOKEN)
|
103
103
|
h.inject({}) do |a,v|
|
104
104
|
v[1].split(sep).each do |val|
|
@@ -108,11 +108,11 @@ module Statsample
|
|
108
108
|
a
|
109
109
|
end
|
110
110
|
end
|
111
|
-
|
111
|
+
|
112
112
|
def dictionary(h, sep=Statsample::SPLIT_TOKEN)
|
113
113
|
h.inject({}) {|a,v| a[v[0]]=v[1].split(sep); a }
|
114
114
|
end
|
115
|
-
|
115
|
+
|
116
116
|
def recode_vector(v,h,sep=Statsample::SPLIT_TOKEN)
|
117
117
|
dict=dictionary(h,sep)
|
118
118
|
new_data=v.splitted(sep)
|
@@ -125,45 +125,45 @@ module Statsample
|
|
125
125
|
end
|
126
126
|
end
|
127
127
|
def recode_dataset_simple!(dataset, dictionary_hash ,sep=Statsample::SPLIT_TOKEN)
|
128
|
-
|
129
|
-
|
130
|
-
|
131
|
-
|
132
|
-
|
133
|
-
|
134
|
-
|
135
|
-
|
136
|
-
|
137
|
-
|
138
|
-
|
139
|
-
|
140
|
-
|
141
|
-
|
142
|
-
|
143
|
-
|
144
|
-
|
145
|
-
|
128
|
+
_recode_dataset(dataset,dictionary_hash ,sep,false)
|
129
|
+
end
|
130
|
+
def recode_dataset_split!(dataset, dictionary_hash, sep=Statsample::SPLIT_TOKEN)
|
131
|
+
_recode_dataset(dataset, dictionary_hash, sep,true)
|
132
|
+
end
|
133
|
+
|
134
|
+
def _recode_dataset(dataset, h , sep=Statsample::SPLIT_TOKEN, split=false)
|
135
|
+
v_names||=h.keys
|
136
|
+
v_names.each do |v_name|
|
137
|
+
raise Exception, "Vector #{v_name} doesn't exists on Dataset" if !dataset.fields.include? v_name
|
138
|
+
recoded=recode_vector(dataset[v_name], h[v_name],sep).collect { |c|
|
139
|
+
if c.nil?
|
140
|
+
nil
|
141
|
+
else
|
142
|
+
c.join(sep)
|
143
|
+
end
|
144
|
+
}.to_vector
|
145
|
+
if(split)
|
146
146
|
recoded.split_by_separator(sep).each {|k,v|
|
147
147
|
dataset[v_name+"_"+k]=v
|
148
148
|
}
|
149
|
-
|
150
|
-
|
151
|
-
end
|
149
|
+
else
|
150
|
+
dataset[v_name+"_recoded"]=recoded
|
152
151
|
end
|
153
152
|
end
|
154
|
-
|
155
|
-
|
156
|
-
|
157
|
-
|
158
|
-
|
159
|
-
|
160
|
-
|
161
|
-
|
162
|
-
|
163
|
-
|
164
|
-
}
|
153
|
+
end
|
154
|
+
|
155
|
+
|
156
|
+
def verify(h, v_names=nil,sep=Statsample::SPLIT_TOKEN,io=$>)
|
157
|
+
require 'pp'
|
158
|
+
v_names||=h.keys
|
159
|
+
v_names.each{|v_name|
|
160
|
+
inverse=inverse_hash(h[v_name],sep)
|
161
|
+
io.puts "- Field: #{v_name}"
|
162
|
+
inverse.sort{|a,b| -(a[1].count<=>b[1].count)}.each {|k,v|
|
163
|
+
io.puts " - \"#{k}\" (#{v.count}) :\n -'"+v.join("\n -'")+"'"
|
165
164
|
}
|
166
|
-
|
165
|
+
}
|
166
|
+
end
|
167
167
|
end
|
168
168
|
end
|
169
169
|
end
|
@@ -1,8 +1,7 @@
|
|
1
1
|
module Statsample
|
2
2
|
# Combination class systematically generates all combinations of n elements, taken r at a time.
|
3
3
|
# With rbgsl, GSL::Combination is available for extra speed
|
4
|
-
#
|
5
|
-
# Use:
|
4
|
+
# == Use:
|
6
5
|
# comb=Statsample::Combination.new(3,5)
|
7
6
|
# => #<Statsample::Combination:0x7f6323804e08 @n=5, @d=#<Statsample::Combination::CombinationGsl:0x7f63237ff7f0 @n=5, @k=3, @c=GSL::Combination>, @k=3>
|
8
7
|
# comb.each{|c| p c }
|
@@ -16,23 +15,25 @@ module Statsample
|
|
16
15
|
# [1, 2, 4]
|
17
16
|
# [1, 3, 4]
|
18
17
|
# [2, 3, 4]
|
18
|
+
# == Reference:
|
19
|
+
# * http://snippets.dzone.com/posts/show/4666
|
19
20
|
#
|
20
21
|
class Combination
|
21
22
|
attr_reader :d
|
22
23
|
def initialize(k,n,only_ruby=false)
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
24
|
+
@k=k
|
25
|
+
@n=n
|
26
|
+
if Statsample.has_gsl? and !only_ruby
|
27
|
+
@d=CombinationGsl.new(@k,@n)
|
28
|
+
else
|
29
|
+
@d=CombinationRuby.new(@k,@n)
|
30
|
+
end
|
30
31
|
end
|
31
32
|
def each
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
33
|
+
reset
|
34
|
+
while a=next_value
|
35
|
+
yield a
|
36
|
+
end
|
36
37
|
end
|
37
38
|
def reset
|
38
39
|
@d.reset
|
@@ -43,70 +44,70 @@ module Statsample
|
|
43
44
|
class CombinationRuby # :nodoc:
|
44
45
|
attr_reader :data
|
45
46
|
def initialize(k,n)
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
47
|
+
raise "k<=n" if k>n
|
48
|
+
@k=k
|
49
|
+
@n=n
|
50
|
+
reset
|
50
51
|
end
|
51
52
|
def reset
|
52
|
-
|
53
|
-
|
53
|
+
@data=[]
|
54
|
+
(0...@k).each {|i| @data[i] = i }
|
54
55
|
end
|
55
56
|
def each
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
57
|
+
reset
|
58
|
+
while a=next_value
|
59
|
+
yield a
|
60
|
+
end
|
60
61
|
end
|
61
62
|
def next_value
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
63
|
+
return false if !@data
|
64
|
+
old_comb=@data.dup
|
65
|
+
i = @k - 1;
|
66
|
+
@data[i]+=1
|
67
|
+
while ((i >= 0) and (@data[i] >= @n - @k + 1 + i)) do
|
68
|
+
i-=1;
|
69
|
+
@data[i]+=1;
|
70
|
+
end
|
71
|
+
|
72
|
+
if (@data[0] > @n - @k) # Combination (n-k, n-k+1, ..., n) reached */
|
73
|
+
@data=false # No more combinations can be generated
|
74
|
+
else
|
75
|
+
# comb now looks like (..., x, n, n, n, ..., n).
|
76
|
+
# Turn it into (..., x, x + 1, x + 2, ...)
|
77
|
+
i = i+1
|
78
|
+
(i...@k).each{ |i1|
|
79
|
+
@data[i1] = @data[i1 - 1] + 1
|
80
|
+
}
|
81
|
+
end
|
82
|
+
return old_comb
|
82
83
|
end
|
83
84
|
end
|
84
85
|
|
85
86
|
# rb-gsl engine for Combinations
|
86
87
|
class CombinationGsl # :nodoc:
|
87
88
|
def initialize(k,n)
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
|
89
|
+
require 'gsl'
|
90
|
+
raise "k<=n" if k>n
|
91
|
+
@k=k
|
92
|
+
@n=n
|
93
|
+
reset
|
93
94
|
end
|
94
95
|
def reset
|
95
|
-
|
96
|
+
@c= ::GSL::Combination.calloc(@n, @k);
|
96
97
|
end
|
97
98
|
def next_value
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
99
|
+
return false if !@c
|
100
|
+
data=@c.data.to_a
|
101
|
+
if @c.next != GSL::SUCCESS
|
102
|
+
@c=false
|
103
|
+
end
|
104
|
+
return data
|
104
105
|
end
|
105
106
|
def each
|
106
|
-
|
107
|
-
|
108
|
-
|
109
|
-
|
107
|
+
reset
|
108
|
+
begin
|
109
|
+
yield @c.data.to_a
|
110
|
+
end while @c.next == GSL::SUCCESS
|
110
111
|
end
|
111
112
|
end
|
112
113
|
end
|
@@ -1,10 +1,10 @@
|
|
1
1
|
module Statsample
|
2
2
|
class CSV < SpreadsheetBase
|
3
|
-
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
3
|
+
class << self
|
4
|
+
# Returns a Dataset based on a csv file
|
5
|
+
#
|
6
|
+
# USE:
|
7
|
+
# ds=Statsample::CSV.read("test_csv.csv")
|
8
8
|
def read(filename, empty=[''],ignore_lines=0,fs=nil,rs=nil)
|
9
9
|
require 'csv'
|
10
10
|
first_row=true
|
@@ -36,17 +36,17 @@ module Statsample
|
|
36
36
|
ds.update_valid_data
|
37
37
|
ds
|
38
38
|
end
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
39
|
+
# Save a Dataset on a csv file
|
40
|
+
#
|
41
|
+
# USE:
|
42
|
+
# Statsample::CSV.write(ds,"test_csv.csv")
|
43
43
|
def write(dataset,filename, convert_comma=false,*opts)
|
44
|
-
require 'csv'
|
44
|
+
require 'csv'
|
45
45
|
writer=::CSV.open(filename,'w',*opts)
|
46
46
|
writer << dataset.fields
|
47
47
|
dataset.each_array do|row|
|
48
48
|
if(convert_comma)
|
49
|
-
|
49
|
+
row.collect!{|v| v.to_s.gsub(".",",")}
|
50
50
|
end
|
51
51
|
writer << row
|
52
52
|
end
|
@@ -54,4 +54,4 @@ module Statsample
|
|
54
54
|
end
|
55
55
|
end
|
56
56
|
end
|
57
|
-
end
|
57
|
+
end
|