daru 0.0.5 → 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.build.sh +14 -0
- data/.travis.yml +26 -4
- data/CONTRIBUTING.md +31 -0
- data/Gemfile +1 -2
- data/{History.txt → History.md} +110 -44
- data/README.md +21 -288
- data/Rakefile +1 -0
- data/daru.gemspec +12 -8
- data/lib/daru.rb +36 -1
- data/lib/daru/accessors/array_wrapper.rb +8 -3
- data/lib/daru/accessors/gsl_wrapper.rb +113 -0
- data/lib/daru/accessors/nmatrix_wrapper.rb +6 -17
- data/lib/daru/core/group_by.rb +0 -1
- data/lib/daru/dataframe.rb +1192 -83
- data/lib/daru/extensions/rserve.rb +21 -0
- data/lib/daru/index.rb +14 -0
- data/lib/daru/io/io.rb +170 -8
- data/lib/daru/maths/arithmetic/dataframe.rb +4 -3
- data/lib/daru/maths/arithmetic/vector.rb +4 -4
- data/lib/daru/maths/statistics/dataframe.rb +48 -27
- data/lib/daru/maths/statistics/vector.rb +215 -33
- data/lib/daru/monkeys.rb +53 -7
- data/lib/daru/multi_index.rb +21 -4
- data/lib/daru/plotting/dataframe.rb +83 -25
- data/lib/daru/plotting/vector.rb +9 -10
- data/lib/daru/vector.rb +596 -61
- data/lib/daru/version.rb +3 -0
- data/spec/accessors/wrappers_spec.rb +51 -0
- data/spec/core/group_by_spec.rb +0 -2
- data/spec/daru_spec.rb +58 -0
- data/spec/dataframe_spec.rb +768 -73
- data/spec/extensions/rserve_spec.rb +52 -0
- data/spec/fixtures/bank2.dat +200 -0
- data/spec/fixtures/repeated_fields.csv +7 -0
- data/spec/fixtures/scientific_notation.csv +4 -0
- data/spec/fixtures/test_xls.xls +0 -0
- data/spec/io/io_spec.rb +161 -24
- data/spec/math/arithmetic/dataframe_spec.rb +26 -7
- data/spec/math/arithmetic/vector_spec.rb +8 -0
- data/spec/math/statistics/dataframe_spec.rb +16 -1
- data/spec/math/statistics/vector_spec.rb +215 -47
- data/spec/spec_helper.rb +21 -2
- data/spec/vector_spec.rb +368 -12
- metadata +99 -16
- data/lib/version.rb +0 -3
- data/notebooks/grouping_splitting_pivots.ipynb +0 -529
- data/notebooks/intro_with_music_data_.ipynb +0 -303
@@ -0,0 +1,21 @@
|
|
1
|
+
# Support for converting data to R data structures to support rserve-client
|
2
|
+
|
3
|
+
module Daru
  class DataFrame
    # Builds an Rserve data-frame REXP from this DataFrame. Each vector is
    # converted to an Array and wrapped with Rserve::REXP::Wrapper; the
    # vector names are passed along as strings.
    def to_REXP
      vector_names = @vectors.to_a
      wrapped_columns = vector_names.map do |vector_name|
        Rserve::REXP::Wrapper.wrap(self[vector_name].to_a)
      end
      labels = vector_names.map(&:to_s)

      Rserve::REXP.create_data_frame(Rserve::Rlist.new(wrapped_columns, labels))
    end
  end

  class Vector
    # Wraps the vector's elements (as a plain Array) into an Rserve REXP.
    def to_REXP
      Rserve::REXP::Wrapper.wrap(to_a)
    end
  end
end
|
data/lib/daru/index.rb
CHANGED
@@ -101,8 +101,22 @@ module Daru
|
|
101
101
|
@relation_hash.has_key? index
|
102
102
|
end
|
103
103
|
|
104
|
+
# Reports whether the index has no entries; the answer is delegated to
# @relation_hash.empty?.
def empty?
|
105
|
+
@relation_hash.empty?
|
106
|
+
end
|
107
|
+
|
104
108
|
# Returns a new Daru::Index built from the keys of @relation_hash.
def dup
|
105
109
|
Daru::Index.new @relation_hash.keys
|
106
110
|
end
|
111
|
+
|
112
|
+
def _dump depth
|
113
|
+
Marshal.dump({relation_hash: @relation_hash})
|
114
|
+
end
|
115
|
+
|
116
|
+
# Marshal hook: rebuilds an index from the string produced by _dump, handing
# the stored hash's keys and values to Daru::Index.new.
def self._load data
  relation_hash = Marshal.load(data)[:relation_hash]

  Daru::Index.new(relation_hash.keys, relation_hash.values)
end
|
107
121
|
end
|
108
122
|
end
|
data/lib/daru/io/io.rb
CHANGED
@@ -1,22 +1,184 @@
|
|
1
1
|
module Daru
|
2
|
+
module IOHelpers
  class << self
    # Normalizes one row of raw cells and returns a new Array.
    #
    # row   - anything responding to to_a.
    # empty - collection of cell values that should become nil.
    #
    # Cells listed in +empty+ become nil. Strings for which is_number? holds
    # become an Integer when made only of digits, otherwise a Float (with ","
    # read as the decimal separator). Every other cell is passed through.
    def process_row(row,empty)
      row.to_a.map do |cell|
        next nil if empty.include?(cell)
        next cell unless cell.is_a?(String) && cell.is_number?

        cell =~ /^\d+$/ ? cell.to_i : cell.gsub(",",".").to_f
      end
    end
  end
end
|
19
|
+
|
2
20
|
module IO
|
3
21
|
class << self
|
22
|
+
# Functions for loading/writing Excel files.
|
23
|
+
|
24
|
+
# Reads one worksheet of an Excel file into a Daru::DataFrame.
#
# The first row supplies the vector names (passed through recode_repeated and
# converted to symbols); the cells below it in each column become that
# vector's data.
#
# path - file handed to Spreadsheet.open.
# opts - :worksheet_id selects the worksheet (defaults to 0).
def from_excel path, opts={}
  settings  = { :worksheet_id => 0 }.merge(opts)
  sheet     = Spreadsheet.open(path).worksheet(settings[:worksheet_id])
  col_names = sheet.row(0).recode_repeated.map(&:to_sym)

  frame = Daru::DataFrame.new({})
  col_names.each_with_index do |col_name, position|
    values = sheet.column(position).to_a
    values.delete_at 0 # drop the header cell
    frame[col_name] = values
  end

  frame
end
|
43
|
+
|
44
|
+
# Writes a dataframe to an Excel file at +path+.
#
# Row 0 receives the vector names (as strings) in a bold blue format; every
# dataframe row is then appended below it. The opts argument is accepted but
# not used.
def dataframe_write_excel dataframe, path, opts={}
  workbook     = Spreadsheet::Workbook.new
  worksheet    = workbook.create_worksheet
  header_style = Spreadsheet::Format.new :color => :blue, :weight => :bold

  worksheet.row(0).concat(dataframe.vectors.to_a.map(&:to_s)) # Unfreeze strings
  worksheet.row(0).default_format = header_style

  next_row = 1
  dataframe.each_row do |record|
    worksheet.row(next_row).concat(record.to_a)
    next_row += 1
  end

  workbook.write(path)
end
|
59
|
+
|
60
|
+
# Functions for loading/writing CSV files
|
61
|
+
|
4
62
|
# Loads a CSV file into a Daru::DataFrame.
#
# path - CSV file to read.
# opts - options hash. :clone, :order, :index and :name are removed and
#        forwarded to Daru::DataFrame.new; everything else goes to CSV.
#        :col_sep defaults to ',' and :converters to :numeric.
#
# When :headers is not given, the first row is used as the header row and
# passed through recode_repeated before being symbolized. When :headers is
# given, CSV handles the headers (:header_converters defaults to :symbol).
#
# If a block is given it is yielded the CSV object (or parsed table) before
# the data is converted.
def from_csv path, opts={}
  # Work on a copy so the defaults and deletions below do not leak into the
  # caller's hash.
  opts = opts.dup
  opts[:col_sep]    ||= ','
  opts[:converters] ||= :numeric

  daru_options = opts.keys.each_with_object({}) do |key, picked|
    picked[key] = opts.delete(key) if [:clone, :order, :index, :name].include?(key)
  end

  hsh = {}
  if opts[:headers]
    opts[:header_converters] ||= :symbol

    csv = ::CSV.read(path, 'rb',opts)
    yield csv if block_given?

    csv.by_col.each { |col_name, values| hsh[col_name] = values }
  else
    # Preprocess headers for detecting and correcting repetition in
    # case the :headers option is not specified. The block form of CSV.open
    # closes the file handle even when parsing raises.
    csv_as_arrays = ::CSV.open(path, 'rb', opts) do |csv|
      yield csv if block_given?
      csv.to_a
    end
    headers = csv_as_arrays[0].recode_repeated.map(&:to_sym)
    csv_as_arrays.delete_at 0
    csv_as_arrays = csv_as_arrays.transpose

    headers.each_with_index { |h, i| hsh[h] = csv_as_arrays[i] }
  end

  Daru::DataFrame.new(hsh,daru_options)
end
|
104
|
+
|
105
|
+
# Writes a dataframe to a CSV file at +path+.
#
# The first line holds the vector names; each dataframe row follows.
#
# opts - options forwarded to CSV.open (:converters defaults to :numeric),
#        plus the Daru-specific :convert_comma. When :convert_comma is
#        truthy every value is written as a string with '.' replaced by ','.
#
# Returns nil.
def dataframe_write_csv dataframe, path, opts={}
  options = { converters: :numeric }.merge(opts)
  # :convert_comma is consumed here and must not be forwarded to CSV, which
  # does not define such an option.
  convert_comma = options.delete(:convert_comma)

  # Block form closes the writer even if a row raises while being written.
  ::CSV.open(path, 'w', options) do |writer|
    writer << dataframe.vectors.to_a

    dataframe.each_row do |row|
      writer << (convert_comma ? row.map { |v| v.to_s.gsub('.', ',') } : row.to_a)
    end
  end

  nil
end
|
123
|
+
|
124
|
+
# Loading/writing from SQL databases
|
125
|
+
|
126
|
+
# Executes +query+ on the DBI database handle +dbh+ and collects the result
# into a Daru::DataFrame.
#
# One empty vector is created per result column (named after the column);
# the column names, as symbols, give the vector order. Every fetched row is
# appended with add_row, then the dataframe is updated and returned.
def from_sql dbh, query
  require 'dbi'
  statement = dbh.execute(query)

  columns = {}
  order   = []
  statement.column_info.each do |info|
    column_name = info[:name]
    columns[column_name] = Daru::Vector.new([])
    columns[column_name].rename column_name
    order.push(column_name.to_sym)
  end

  frame = Daru::DataFrame.new(columns, order: order)
  statement.fetch { |record| frame.add_row(record.to_a) }
  frame.update
  frame
end
|
143
|
+
|
144
|
+
# Inserts every row of +ds+ into +table+ through the DBI handle +dbh+.
#
# A single INSERT statement with one "?" placeholder per vector is prepared
# and executed once per row. Returns true.
#
# NOTE(review): the table name and vector names are concatenated straight
# into the SQL text; only the row values are bound as parameters.
def dataframe_write_sql ds, dbh, table
  require 'dbi'
  column_list  = ds.vectors.to_a.join(",")
  placeholders = (["?"]*ds.vectors.size).join(",")
  statement    = dbh.prepare("INSERT INTO #{table} (" + column_list + ") VALUES (" + placeholders + ")")

  ds.each_row { |record| statement.execute(*record.to_a) }
  true
end
|
151
|
+
|
152
|
+
# Loading data from plain text files
|
153
|
+
|
154
|
+
# Loads a whitespace-separated plain text file into a Daru::DataFrame.
#
# filename - file to read.
# fields   - vector names, used as the dataframe order.
#
# Each line is stripped, split on runs of whitespace and normalized with
# Daru::IOHelpers.process_row (empty strings become nil). A row consisting
# only of the "\x1A" character is skipped. After loading, every vector is
# renamed to its field name.
def from_plaintext filename, fields
  ds = Daru::DataFrame.new({}, order: fields)

  # Block form guarantees the file handle is closed, even if a row raises.
  File.open(filename,"r") do |fp|
    fp.each_line do |line|
      row = Daru::IOHelpers.process_row(line.strip.split(/\s+/),[""])
      next if row == ["\x1A"]
      ds.add_row(row)
    end
  end

  ds.update
  fields.each { |f| ds[f].rename f }
  ds
end
|
166
|
+
|
167
|
+
# Loading and writing Marshalled DataFrame/Vector
|
168
|
+
# Marshals +klass+ (the object to persist) into the file +filename+.
#
# The file is opened in binary mode because Marshal output is binary data,
# and the block form closes the handle even when dumping raises.
#
# Returns nil.
def save klass, filename
  File.open(filename, 'wb') { |fp| Marshal.dump(klass, fp) }
  nil
end
|
173
|
+
|
174
|
+
# Loads a marshalled object from +filename+.
#
# Returns the unmarshalled object, or false when the file does not exist.
# The file is read in binary mode because Marshal data is binary.
#
# NOTE(review): Marshal.load can instantiate arbitrary objects; only use this
# on files from a trusted source.
def load filename
  return false unless File.exist? filename

  File.open(filename, 'rb') { |fp| Marshal.load(fp) }
end
|
21
183
|
end
|
22
184
|
end
|
@@ -36,15 +36,16 @@ module Daru
|
|
36
36
|
|
37
37
|
# Calculate the exponential of all vectors with numeric values.
# Works on only_numerics(clone: false) and recodes each vector with its exp.
def exp
  numeric_frame = only_numerics(clone: false)
  numeric_frame.recode { |vector| vector.exp }
end
|
41
41
|
|
42
|
+
# Calculate the square root of the numeric vectors.
# Works on only_numerics(clone: false) and recodes each vector with its sqrt.
def sqrt
  numeric_frame = only_numerics(clone: false)
  numeric_frame.recode { |vector| vector.sqrt }
end
|
45
46
|
|
46
47
|
# Round the numeric vectors to +precision+ decimal places (default 0).
# Works on only_numerics(clone: false) and recodes each vector with its round.
def round precision=0
  numeric_frame = only_numerics(clone: false)
  numeric_frame.recode { |vector| vector.round(precision) }
end
|
49
50
|
private
|
50
51
|
|
@@ -35,17 +35,17 @@ module Daru
|
|
35
35
|
end
|
36
36
|
|
37
37
|
# Recodes every element to its absolute value; nil elements stay nil.
def abs
  recode { |element| element.nil? ? nil : element.abs }
end
|
40
40
|
|
41
41
|
# Recodes every element rounded to +precision+ decimal places (default 0);
# nil elements stay nil.
def round precision=0
  recode { |element| element.nil? ? nil : element.round(precision) }
end
|
44
44
|
|
45
45
|
private
|
46
46
|
|
47
47
|
# Recodes every element through the Math function named by +operation+
# (dispatched with Math.send); nil elements stay nil.
def math_unary_op operation
  recode { |element| element.nil? ? nil : Math.send(operation, element) }
end
|
50
50
|
|
51
51
|
def binary_op operation, other
|
@@ -65,7 +65,7 @@ module Daru
|
|
65
65
|
def v2v_binary operation, other
|
66
66
|
common_idxs = []
|
67
67
|
elements = []
|
68
|
-
index = (@index.to_a
|
68
|
+
index = (@index.to_a | other.index.to_a).sort
|
69
69
|
|
70
70
|
index.each do |idx|
|
71
71
|
this = self[idx]
|
@@ -37,6 +37,15 @@ module Daru
|
|
37
37
|
compute_stats :product
|
38
38
|
end
|
39
39
|
|
40
|
+
# Returns a copy of the numeric part of the dataframe (only_numerics with
# clone: true) in which every vector has been replaced, via map!, by its
# standardized version.
def standardize
  numeric_copy = only_numerics(clone: true)
  numeric_copy.map! { |vector| vector.standardize }

  numeric_copy
end
|
48
|
+
|
40
49
|
# Create a summary of mean, standard deviation, count, max and min of
|
41
50
|
# each numeric vector in the dataframe in one shot.
|
42
51
|
#
|
@@ -55,40 +64,39 @@ module Daru
|
|
55
64
|
Daru::DataFrame.new(description_hash, index: methods)
|
56
65
|
end
|
57
66
|
|
58
|
-
# Calculate variance-covariance between the numeric vectors.
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
67
|
+
# Calculate sample variance-covariance between the numeric vectors.
#
# The diagonal holds each vector's variance; off-diagonal cells hold
# vector_cov of the two vectors. A value already computed for the mirrored
# (col, row) pair is reused instead of being recomputed.
#
# Returns a Daru::DataFrame whose index and order are numeric_vectors.
def covariance
  computed = {}
  names    = self.numeric_vectors

  matrix = names.collect do |row_name|
    names.collect do |col_name|
      next self[row_name].variance if row_name == col_name

      mirrored = computed[[col_name, row_name]]
      next mirrored unless mirrored.nil?

      computed[[row_name, col_name]] = vector_cov(self[row_name], self[col_name])
    end
  end

  Daru::DataFrame.rows(matrix, index: numeric_vectors, order: numeric_vectors)
end
|
79
90
|
|
80
91
|
alias :cov :covariance
|
81
92
|
|
82
93
|
# Calculate the correlation between the numeric vectors.
#
# The covariance matrix is divided element-wise by the product of the
# transposed standard-deviation matrix and the standard-deviation matrix.
#
# Returns a Daru::DataFrame whose index and order are numeric_vectors.
def correlation
  std_matrix   = std.to_matrix
  cov_matrix   = cov.to_matrix
  denominators = std_matrix.transpose * std_matrix
  corr_arry    = cov_matrix.elementwise_division(denominators).to_a

  Daru::DataFrame.rows(corr_arry, index: numeric_vectors, order: numeric_vectors)
end
|
@@ -97,12 +105,25 @@ module Daru
|
|
97
105
|
|
98
106
|
private
|
99
107
|
|
108
|
+
# Covariance of two vectors: the sum_of_squares cross-product of v1a and v2a
# divided by (v1a.size - 1).
# NOTE(review): the divisor uses v1a.size while sum_of_squares works on
# only_valid data; confirm this is intended when v1a contains missing values.
def vector_cov v1a, v2a
|
109
|
+
sum_of_squares(v1a,v2a) / (v1a.size - 1)
|
110
|
+
end
|
111
|
+
|
112
|
+
# Sum of the products of the paired deviations from the mean of v1 and v2.
#
# Both vectors are first reduced with only_valid and re-indexed with
# reset_index!; the element at each position of the first is then paired
# with the element at the same position of the second.
def sum_of_squares v1, v2
  left  = v1.only_valid
  right = v2.only_valid
  left.reset_index!
  right.reset_index!
  left_mean  = left.mean
  right_mean = right.mean

  total = 0
  left.size.times do |i|
    total += (left[i] - left_mean) * (right[i] - right_mean)
  end
  total
end
|
120
|
+
|
100
121
|
# Applies the statistic named by +method+ to every numeric vector and returns
# a Daru::Vector (named after the method) mapping vector name to result.
def compute_stats method
  per_vector = numeric_vectors.inject({}) do |results, vec|
    results.merge!(vec => self[vec].send(method))
  end

  Daru::Vector.new(per_vector, name: method)
end
|
108
129
|
end
|
@@ -1,8 +1,8 @@
|
|
1
1
|
module Daru
|
2
2
|
module Maths
|
3
3
|
# Encapsulates statistics methods for vectors. Most basic stuff like mean, etc.
|
4
|
-
#
|
5
|
-
#
|
4
|
+
# is done inside the wrapper, so that native methods can be used for most of
|
5
|
+
# the computationally intensive tasks.
|
6
6
|
module Statistics
|
7
7
|
module Vector
|
8
8
|
def mean
|
@@ -26,7 +26,7 @@ module Daru
|
|
26
26
|
end
|
27
27
|
|
28
28
|
# Median of the vector: uses the data wrapper's own median when it provides
# one, otherwise falls back to the 50th percentile.
def median
  return @data.median if @data.respond_to?(:median)

  percentile(50)
end
|
31
31
|
|
32
32
|
def mode
|
@@ -36,15 +36,21 @@ module Daru
|
|
36
36
|
|
37
37
|
# Median of the absolute deviations of each element from the vector's median.
def median_absolute_deviation
  center_value = median
  deviations   = recode { |val| (val - center_value).abs }
  deviations.median
end
|
41
|
+
alias :mad :median_absolute_deviation
|
41
42
|
|
42
43
|
# Standard error: the sample standard deviation divided by the square root
# of the number of valid elements.
def standard_error
  root_n = Math.sqrt(n_valid)
  standard_deviation_sample / root_n
end
|
45
46
|
|
46
47
|
# Sum of squared deviations, computed as the sum of the squared elements
# minus (sum squared / n_valid). Returns a Float.
def sum_of_squared_deviation
  squares_total = @data.inject(0) { |acc, value| value.square + acc }
  correction    = sum.square.quo(n_valid).to_f

  (squares_total - correction).to_f
end
|
50
|
+
|
51
|
+
# Retrieve the unique values of the non-nil data: takes only_valid, removes
# duplicates with uniq and re-indexes the result with reset_index!.
|
52
|
+
def factors
|
53
|
+
only_valid.uniq.reset_index!
|
48
54
|
end
|
49
55
|
|
50
56
|
# Maximum element of the vector.
|
@@ -69,12 +75,18 @@ module Daru
|
|
69
75
|
|
70
76
|
# Counts how many times each non-nil element occurs.
# Returns a Hash mapping element => number of occurrences.
def frequencies
  @data.inject({}) do |counts, element|
    next counts if element.nil?

    counts[element] = counts.fetch(element, 0) + 1
    counts
  end
end
|
77
85
|
|
86
|
+
# Same counts as frequencies, returned as a Daru::Vector built from that Hash.
def freqs
|
87
|
+
Daru::Vector.new(frequencies)
|
88
|
+
end
|
89
|
+
|
78
90
|
def proportions
|
79
91
|
len = n_valid
|
80
92
|
frequencies.inject({}) { |hash, arr| hash[arr[0]] = arr[1] / len; hash }
|
@@ -83,13 +95,12 @@ module Daru
|
|
83
95
|
# Replaces every element by its rank among the sorted distinct values.
# Tied values share the average of the positions they occupy; elements with
# no entry in frequencies are recoded to nil.
def ranked
  seen    = 0
  rank_of = {}
  frequencies.sort.each do |value, count|
    rank_of[value] = ((seen + 1) + (seen + count)).quo(2)
    seen += count
  end

  recode { |element| rank_of[element] }
end
|
94
105
|
|
95
106
|
def coefficient_of_variation
|
@@ -107,69 +118,239 @@ module Daru
|
|
107
118
|
val = frequencies[value]
|
108
119
|
val.nil? ? 0 : val
|
109
120
|
else
|
110
|
-
size - @
|
121
|
+
size - @missing_positions.size
|
111
122
|
end
|
112
123
|
end
|
113
124
|
|
114
125
|
# Proportion of valid elements equal to +value+ (default 1), as a Float.
#
# A value that never occurs counts as 0 occurrences, so the result is 0.0
# instead of a NoMethodError on nil.
def proportion value=1
  frequencies.fetch(value, 0).quo(n_valid).to_f
end
|
117
128
|
|
118
129
|
# Sample variance with denominator (N-1).
#
# m - mean to use; computed from the vector when omitted.
#
# Delegates to the data wrapper when it implements variance_sample.
def variance_sample m=nil
  m ||= self.mean
  return @data.variance_sample(m) if @data.respond_to?(:variance_sample)

  sum_of_squares(m).quo(n_valid - 1)
end
|
123
138
|
|
124
139
|
# Population variance with denominator (N).
#
# m - mean to use; computed from the vector when omitted.
#
# Delegates to the data wrapper when it implements variance_population.
def variance_population m=nil
  m ||= mean
  return @data.variance_population(m) if @data.respond_to?(:variance_population)

  sum_of_squares(m).quo(n_valid).to_f
end
|
129
148
|
|
130
149
|
# Sum of squared deviations from +m+ (the vector's mean when omitted).
# Elements that are keys of @missing_values are skipped.
def sum_of_squares(m=nil)
  m ||= mean
  @data.inject(0) do |total, val|
    next total if @missing_values.has_key?(val)

    total + (val - m)**2
  end
end
|
134
155
|
|
135
156
|
# Population standard deviation: the square root of variance_population.
#
# m - mean to use; computed from the vector when omitted.
#
# Delegates to the data wrapper when it implements
# standard_deviation_population.
def standard_deviation_population m=nil
  m ||= mean
  return @data.standard_deviation_population(m) if @data.respond_to?(:standard_deviation_population)

  Math.sqrt(variance_population(m))
end
|
139
164
|
|
140
165
|
# Sample standard deviation: the square root of variance_sample.
#
# m - mean to use; computed from the vector when omitted.
#
# Delegates to the data wrapper when it implements standard_deviation_sample.
def standard_deviation_sample m=nil
  m ||= mean
  return @data.standard_deviation_sample(m) if @data.respond_to?(:standard_deviation_sample)

  Math.sqrt(variance_sample(m))
end
|
143
173
|
|
144
174
|
# Calculate skewness using (sigma(xi - mean)^3)/((N)*std_dev_sample^3),
# where N is the number of non-missing elements.
#
# m - mean to use; computed from the vector when omitted.
#
# Delegates to the data wrapper when it implements skew. Elements that are
# keys of @missing_values are skipped, matching sum_of_squares and the
# denominator (which already excludes missing positions).
def skew m=nil
  return @data.skew if @data.respond_to?(:skew)

  m ||= mean
  th = @data.inject(0) do |memo, val|
    @missing_values.has_key?(val) ? memo : memo + ((val - m)**3)
  end

  th.quo((@size - @missing_positions.size) * (standard_deviation_sample(m)**3))
end
|
150
184
|
|
151
185
|
# Calculate kurtosis using (sigma(xi - mean)^4)/((N)*std_dev_sample^4) - 3,
# where N is the number of non-missing elements.
#
# m - mean to use; computed from the vector when omitted.
#
# Delegates to the data wrapper when it implements kurtosis. Elements that
# are keys of @missing_values are skipped, matching sum_of_squares and the
# denominator (which already excludes missing positions).
def kurtosis m=nil
  return @data.kurtosis if @data.respond_to?(:kurtosis)

  m ||= mean
  fo = @data.inject(0) do |memo, val|
    @missing_values.has_key?(val) ? memo : memo + ((val - m)**4)
  end

  fo.quo((@size - @missing_positions.size) * standard_deviation_sample(m)**4) - 3
end
|
156
194
|
|
157
195
|
# Average absolute deviation from +m+ (the vector's mean when omitted),
# divided by the number of valid elements. Elements that are keys of
# @missing_values are skipped.
#
# Raises TypeError unless the vector's type is :numeric.
def average_deviation_population m=nil
  raise TypeError, "Vector must be numeric" unless type == :numeric

  m ||= mean
  total = @data.inject(0) do |memo, val|
    next memo if @missing_values.has_key?(val)

    (val - m).abs + memo
  end
  total.quo(n_valid)
end
|
202
|
+
|
203
|
+
# Returns the value of the percentile q, computed over the valid (non-missing)
# data only.
#
# Accepts an optional second argument specifying the strategy to interpolate
# when the requested percentile lies between two data points a and b.
# Valid strategies are:
# * :midpoint (Default): (a + b) / 2
# * :linear : a + (b - a) * d where d is the decimal part of the index between a and b.
#
# Returns nil when there is no valid data. Raises NotImplementedError for an
# unknown strategy.
#
# == References
#
# This is the NIST recommended method (http://en.wikipedia.org/wiki/Percentile#NIST_method)
def percentile(q, strategy = :midpoint)
  sorted = only_valid(:array).sort

  case strategy
  when :midpoint
    # Nothing to pick from; the :linear branch also yields nil in this case.
    return nil if sorted.empty?

    v = (n_valid * q).quo(100)
    if v.to_i != v
      sorted[v.to_i]
    else
      lower = (v - 0.5).to_i
      # For q == 100 the upper neighbour would fall one past the last
      # element, so clamp it to the last valid index.
      upper = [(v + 0.5).to_i, sorted.size - 1].min
      (sorted[lower].to_f + sorted[upper]).quo(2)
    end
  when :linear
    index = (q / 100.0) * (n_valid + 1)

    k = index.truncate
    d = index % 1

    if k == 0
      sorted[0]
    elsif k >= sorted.size
      sorted[-1]
    else
      sorted[k - 1] + d * (sorted[k] - sorted[k - 1])
    end
  else
    raise NotImplementedError, "Unknown strategy #{strategy.to_s}"
  end
end
|
241
|
+
|
242
|
+
# Dichotomize the vector into 0 and 1.
#
# low - threshold; values greater than it become 1, all other non-nil values
#       become 0. Defaults to the smallest of the vector's factors.
#
# nil elements stay nil.
def dichotomize(low = nil)
  low ||= factors.min

  self.recode do |x|
    next nil if x.nil?

    x > low ? 1 : 0
  end
end
|
258
|
+
|
259
|
+
# Center data by subtracting the mean: returns the result of `self - mean`.
# NOTE(review): how nil elements are treated depends on the vector's `-`
# operator, which is not visible here - confirm.
|
260
|
+
def center
|
261
|
+
self - mean
|
262
|
+
end
|
263
|
+
|
264
|
+
# Standardize data.
#
# == Arguments
#
# * use_population - Pass as *true* to divide by the population standard
#   deviation (sdp) instead of the sample standard deviation (sds).
#
# Returns a vector of @size nils when the mean is nil or the chosen standard
# deviation is 0.0; otherwise the result of vector_standardized_compute.
def standardize use_population=false
  m  = mean
  sd = use_population ? sdp : sds
  return Daru::Vector.new([nil]*@size) if m.nil? || sd == 0.0

  vector_standardized_compute m, sd
end
|
161
277
|
|
162
|
-
def
|
163
|
-
@
|
278
|
+
# Recodes every non-nil element x as log(x) when lambda is 0, otherwise as
# (x ** lambda - 1) / lambda. nil elements stay nil.
#
# Raises RuntimeError unless @type is :numeric.
def box_cox_transformation lambda # :nodoc:
  raise "Should be a numeric" unless @type == :numeric

  self.recode do |x|
    if x.nil?
      nil
    elsif lambda == 0
      Math.log(x)
    else
      (x ** lambda - 1).quo(lambda)
    end
  end
end
|
165
293
|
|
166
|
-
|
167
|
-
|
168
|
-
|
169
|
-
|
170
|
-
|
294
|
+
# Replace each non-nil value in the vector with its percentile: its rank
# divided by the number of non-missing elements, times 100, as a Float.
# The recoding is applied in place (recode!) to the vector returned by ranked.
def vector_percentile
  valid_count = size - missing_positions.size

  ranked.recode! do |rank|
    next nil if rank.nil?

    (rank.quo(valid_count) * 100).to_f
  end
end
|
299
|
+
|
300
|
+
# Builds the standardized vector: (x - m) / sd for every non-nil element,
# nil otherwise, keeping this vector's index, name and dtype.
#
# Delegates to the data wrapper when it implements the same method.
def vector_standardized_compute(m,sd)
  return @data.vector_standardized_compute(m,sd) if @data.respond_to? :vector_standardized_compute

  standardized = @data.collect { |x| x.nil? ? nil : (x.to_f - m).quo(sd) }
  Daru::Vector.new standardized, index: index, name: name, dtype: dtype
end
|
308
|
+
|
309
|
+
# Builds the centered vector: x - m for every non-nil element, nil otherwise,
# keeping this vector's index, name and dtype.
#
# Delegates to the data wrapper when it implements the same method.
def vector_centered_compute(m)
  return @data.vector_centered_compute(m) if @data.respond_to? :vector_centered_compute

  centered = @data.collect { |x| x.nil? ? nil : x.to_f-m }
  Daru::Vector.new centered, index: index, name: name, dtype: dtype
end
|
317
|
+
|
318
|
+
# Returns a random sample of size n, with replacement,
# only with non-nil data.
#
# In all the trials, every item has the same probability
# of being selected.
#
# Delegates to the data wrapper when it implements the same method.
def sample_with_replacement(sample=1)
  return @data.sample_with_replacement(sample) if @data.respond_to? :sample_with_replacement

  valid = missing_positions.empty? ? self : self.only_valid
  vds   = valid.size
  (0...sample).map { valid[rand(vds)] }
end
|
332
|
+
|
333
|
+
# Returns a random sample of size n, without replacement,
# only with valid data.
#
# Every element can only be selected once.
#
# A sample of the same size as the vector is a reordering of the vector's
# valid elements.
#
# Delegates to the data wrapper when it implements the same method.
# Raises ArgumentError when the sample size exceeds the number of valid
# elements.
def sample_without_replacement(sample=1)
  return @data.sample_without_replacement(sample) if @data.respond_to? :sample_without_replacement

  valid = missing_positions.empty? ? self : self.only_valid
  raise ArgumentError, "Sample size couldn't be greater than n" if sample > valid.size

  picked     = []
  population = valid.size
  # Draw positions until enough distinct ones have been collected.
  until picked.size >= sample
    candidate = rand(population)
    picked.push(candidate) unless picked.include?(candidate)
  end

  picked.collect { |i| valid[i] }
end
|
175
356
|
|
@@ -182,6 +363,7 @@ module Daru
|
|
182
363
|
alias :sd :standard_deviation_sample
|
183
364
|
alias :ss :sum_of_squares
|
184
365
|
alias :percentil :percentile
|
366
|
+
alias :se :standard_error
|
185
367
|
end
|
186
368
|
end
|
187
369
|
end
|