daru 0.0.5 → 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.build.sh +14 -0
- data/.travis.yml +26 -4
- data/CONTRIBUTING.md +31 -0
- data/Gemfile +1 -2
- data/{History.txt → History.md} +110 -44
- data/README.md +21 -288
- data/Rakefile +1 -0
- data/daru.gemspec +12 -8
- data/lib/daru.rb +36 -1
- data/lib/daru/accessors/array_wrapper.rb +8 -3
- data/lib/daru/accessors/gsl_wrapper.rb +113 -0
- data/lib/daru/accessors/nmatrix_wrapper.rb +6 -17
- data/lib/daru/core/group_by.rb +0 -1
- data/lib/daru/dataframe.rb +1192 -83
- data/lib/daru/extensions/rserve.rb +21 -0
- data/lib/daru/index.rb +14 -0
- data/lib/daru/io/io.rb +170 -8
- data/lib/daru/maths/arithmetic/dataframe.rb +4 -3
- data/lib/daru/maths/arithmetic/vector.rb +4 -4
- data/lib/daru/maths/statistics/dataframe.rb +48 -27
- data/lib/daru/maths/statistics/vector.rb +215 -33
- data/lib/daru/monkeys.rb +53 -7
- data/lib/daru/multi_index.rb +21 -4
- data/lib/daru/plotting/dataframe.rb +83 -25
- data/lib/daru/plotting/vector.rb +9 -10
- data/lib/daru/vector.rb +596 -61
- data/lib/daru/version.rb +3 -0
- data/spec/accessors/wrappers_spec.rb +51 -0
- data/spec/core/group_by_spec.rb +0 -2
- data/spec/daru_spec.rb +58 -0
- data/spec/dataframe_spec.rb +768 -73
- data/spec/extensions/rserve_spec.rb +52 -0
- data/spec/fixtures/bank2.dat +200 -0
- data/spec/fixtures/repeated_fields.csv +7 -0
- data/spec/fixtures/scientific_notation.csv +4 -0
- data/spec/fixtures/test_xls.xls +0 -0
- data/spec/io/io_spec.rb +161 -24
- data/spec/math/arithmetic/dataframe_spec.rb +26 -7
- data/spec/math/arithmetic/vector_spec.rb +8 -0
- data/spec/math/statistics/dataframe_spec.rb +16 -1
- data/spec/math/statistics/vector_spec.rb +215 -47
- data/spec/spec_helper.rb +21 -2
- data/spec/vector_spec.rb +368 -12
- metadata +99 -16
- data/lib/version.rb +0 -3
- data/notebooks/grouping_splitting_pivots.ipynb +0 -529
- data/notebooks/intro_with_music_data_.ipynb +0 -303
@@ -0,0 +1,21 @@
|
|
1
|
+
# Support for converting data to R data structures to support rserve-client
|
2
|
+
|
3
|
+
module Daru
|
4
|
+
class DataFrame
|
5
|
+
def to_REXP
|
6
|
+
names = @vectors.to_a
|
7
|
+
data = names.map do |f|
|
8
|
+
Rserve::REXP::Wrapper.wrap(self[f].to_a)
|
9
|
+
end
|
10
|
+
l = Rserve::Rlist.new(data, names.map(&:to_s))
|
11
|
+
|
12
|
+
Rserve::REXP.create_data_frame(l)
|
13
|
+
end
|
14
|
+
end
|
15
|
+
|
16
|
+
class Vector
|
17
|
+
def to_REXP
|
18
|
+
Rserve::REXP::Wrapper.wrap(self.to_a)
|
19
|
+
end
|
20
|
+
end
|
21
|
+
end
|
data/lib/daru/index.rb
CHANGED
@@ -101,8 +101,22 @@ module Daru
|
|
101
101
|
@relation_hash.has_key? index
|
102
102
|
end
|
103
103
|
|
104
|
+
# True when the index holds no entries; delegates to @relation_hash.empty?.
def empty?
|
105
|
+
@relation_hash.empty?
|
106
|
+
end
|
107
|
+
|
104
108
|
def dup
|
105
109
|
Daru::Index.new @relation_hash.keys
|
106
110
|
end
|
111
|
+
|
112
|
+
# Custom Marshal serialization hook: returns a Marshal dump of a Hash that
# holds @relation_hash under the :relation_hash key. +depth+ is not used.
def _dump depth
|
113
|
+
Marshal.dump({relation_hash: @relation_hash})
|
114
|
+
end
|
115
|
+
|
116
|
+
# Counterpart of _dump: unmarshals +data+ and builds a new Daru::Index from
# the keys and values of the stored :relation_hash.
def self._load data
|
117
|
+
h = Marshal.load data
|
118
|
+
|
119
|
+
Daru::Index.new(h[:relation_hash].keys, h[:relation_hash].values)
|
120
|
+
end
|
107
121
|
end
|
108
122
|
end
|
data/lib/daru/io/io.rb
CHANGED
@@ -1,22 +1,184 @@
|
|
1
1
|
module Daru
|
2
|
+
module IOHelpers
|
3
|
+
class << self
|
4
|
+
def process_row(row,empty)
|
5
|
+
row.to_a.map do |c|
|
6
|
+
if empty.include?(c)
|
7
|
+
nil
|
8
|
+
else
|
9
|
+
if c.is_a? String and c.is_number?
|
10
|
+
c =~ /^\d+$/ ? c.to_i : c.gsub(",",".").to_f
|
11
|
+
else
|
12
|
+
c
|
13
|
+
end
|
14
|
+
end
|
15
|
+
end
|
16
|
+
end
|
17
|
+
end
|
18
|
+
end
|
19
|
+
|
2
20
|
module IO
|
3
21
|
class << self
|
22
|
+
# Functions for loading/writing Excel files.
|
23
|
+
|
24
|
+
def from_excel path, opts={}
|
25
|
+
opts = {
|
26
|
+
:worksheet_id => 0,
|
27
|
+
}.merge opts
|
28
|
+
|
29
|
+
worksheet_id = opts[:worksheet_id]
|
30
|
+
book = Spreadsheet.open path
|
31
|
+
worksheet = book.worksheet worksheet_id
|
32
|
+
headers = worksheet.row(0).recode_repeated.map(&:to_sym)
|
33
|
+
|
34
|
+
df = Daru::DataFrame.new({})
|
35
|
+
headers.each_with_index do |h,i|
|
36
|
+
col = worksheet.column(i).to_a
|
37
|
+
col.delete_at 0
|
38
|
+
df[h] = col
|
39
|
+
end
|
40
|
+
|
41
|
+
df
|
42
|
+
end
|
43
|
+
|
44
|
+
def dataframe_write_excel dataframe, path, opts={}
|
45
|
+
book = Spreadsheet::Workbook.new
|
46
|
+
sheet = book.create_worksheet
|
47
|
+
format = Spreadsheet::Format.new :color => :blue, :weight => :bold
|
48
|
+
|
49
|
+
sheet.row(0).concat(dataframe.vectors.to_a.map(&:to_s)) # Unfreeze strings
|
50
|
+
sheet.row(0).default_format = format
|
51
|
+
i = 1
|
52
|
+
dataframe.each_row do |row|
|
53
|
+
sheet.row(i).concat(row.to_a)
|
54
|
+
i += 1
|
55
|
+
end
|
56
|
+
|
57
|
+
book.write(path)
|
58
|
+
end
|
59
|
+
|
60
|
+
# Functions for loading/writing CSV files
|
61
|
+
|
4
62
|
def from_csv path, opts={}
|
5
63
|
opts[:col_sep] ||= ','
|
6
|
-
opts[:headers] ||= true
|
7
64
|
opts[:converters] ||= :numeric
|
8
|
-
opts[:header_converters] ||= :symbol
|
9
65
|
|
10
|
-
|
66
|
+
daru_options = opts.keys.inject({}) do |hash, k|
|
67
|
+
if [:clone, :order, :index, :name].include?(k)
|
68
|
+
hash[k] = opts[k]
|
69
|
+
opts.delete k
|
70
|
+
end
|
71
|
+
|
72
|
+
hash
|
73
|
+
end
|
74
|
+
|
75
|
+
# Preprocess headers for detecting and correcting repetition in
|
76
|
+
# case the :headers option is not specified.
|
77
|
+
unless opts[:headers]
|
78
|
+
csv = ::CSV.open(path, 'rb', opts)
|
79
|
+
yield csv if block_given?
|
80
|
+
|
81
|
+
csv_as_arrays = csv.to_a
|
82
|
+
headers = csv_as_arrays[0].recode_repeated.map(&:to_sym)
|
83
|
+
csv_as_arrays.delete_at 0
|
84
|
+
csv_as_arrays = csv_as_arrays.transpose
|
11
85
|
|
12
|
-
|
86
|
+
hsh = {}
|
87
|
+
headers.each_with_index do |h, i|
|
88
|
+
hsh[h] = csv_as_arrays[i]
|
89
|
+
end
|
90
|
+
else
|
91
|
+
opts[:header_converters] ||= :symbol
|
92
|
+
|
93
|
+
csv = ::CSV.read(path, 'rb',opts)
|
94
|
+
yield csv if block_given?
|
13
95
|
|
14
|
-
|
15
|
-
|
16
|
-
|
96
|
+
hsh = {}
|
97
|
+
csv.by_col.each do |col_name, values|
|
98
|
+
hsh[col_name] = values
|
99
|
+
end
|
17
100
|
end
|
18
101
|
|
19
|
-
Daru::DataFrame.new(hsh)
|
102
|
+
Daru::DataFrame.new(hsh,daru_options)
|
103
|
+
end
|
104
|
+
|
105
|
+
# Write a DataFrame to a CSV file.
#
# == Arguments
#
# * dataframe - Object responding to +vectors+ (written as the header row)
#               and +each_row+.
# * path      - Destination path; the file is opened with mode 'w'.
# * opts      - Options forwarded to CSV.open (:converters defaults to
#               :numeric). The extra key :convert_comma, when truthy, writes
#               every value as a String with '.' replaced by ','.
#
# Returns nil.
def dataframe_write_csv dataframe, path, opts={}
  options = {
    converters: :numeric
  }.merge(opts)

  # :convert_comma is consumed here; CSV rejects option keys it does not know.
  convert_comma = options.delete(:convert_comma)

  # The block form closes the file even when writing a row raises. Options
  # are splatted so the call also works where CSV.open takes keywords.
  ::CSV.open(path, 'w', **options) do |writer|
    writer << dataframe.vectors.to_a

    dataframe.each_row do |row|
      writer << (convert_comma ? row.map { |v| v.to_s.gsub('.', ',') } : row.to_a)
    end
  end

  nil
end
|
123
|
+
|
124
|
+
# Loading/writing from SQL databases
|
125
|
+
|
126
|
+
def from_sql dbh, query
|
127
|
+
require 'dbi'
|
128
|
+
sth = dbh.execute(query)
|
129
|
+
vectors = {}
|
130
|
+
fields = []
|
131
|
+
sth.column_info.each do |c|
|
132
|
+
vectors[c[:name]] = Daru::Vector.new([])
|
133
|
+
vectors[c[:name]].rename c[:name]
|
134
|
+
fields.push(c[:name].to_sym)
|
135
|
+
end
|
136
|
+
ds=Daru::DataFrame.new(vectors,order: fields)
|
137
|
+
sth.fetch do |row|
|
138
|
+
ds.add_row(row.to_a)
|
139
|
+
end
|
140
|
+
ds.update
|
141
|
+
ds
|
142
|
+
end
|
143
|
+
|
144
|
+
def dataframe_write_sql ds, dbh, table
|
145
|
+
require 'dbi'
|
146
|
+
query = "INSERT INTO #{table} ("+ds.vectors.to_a.join(",")+") VALUES ("+((["?"]*ds.vectors.size).join(","))+")"
|
147
|
+
sth = dbh.prepare(query)
|
148
|
+
ds.each_row { |c| sth.execute(*c.to_a) }
|
149
|
+
return true
|
150
|
+
end
|
151
|
+
|
152
|
+
# Loading data from plain text files
|
153
|
+
|
154
|
+
# Build a DataFrame from a whitespace-delimited plain text file.
#
# == Arguments
#
# * filename - Path of the text file; every line becomes one row.
# * fields   - Vector names, passed as the :order of the new DataFrame.
def from_plaintext filename, fields
  ds = Daru::DataFrame.new({}, order: fields)

  # The block form closes the handle even when processing a line raises.
  File.open(filename,"r") do |fp|
    fp.each_line do |line|
      row = Daru::IOHelpers.process_row(line.strip.split(/\s+/),[""])
      # Skip a line that consists solely of the \x1A control character.
      next if row == ["\x1A"]
      ds.add_row(row)
    end
  end

  ds.update
  fields.each { |f| ds[f].rename f }
  ds
end
|
166
|
+
|
167
|
+
# Loading and writing Marshalled DataFrame/Vector
|
168
|
+
# Serialize +klass+ with Marshal and write the result to +filename+.
#
# Returns nil.
def save klass, filename
  # Marshal output is binary: 'wb' prevents newline/encoding translation,
  # and the block form closes the file even if dumping raises.
  File.open(filename, 'wb') { |fp| Marshal.dump(klass, fp) }
  nil
end
|
173
|
+
|
174
|
+
# Read back an object previously written with Marshal to +filename+.
#
# Returns the unmarshalled object, or false when +filename+ does not exist.
def load filename
  return false unless File.exist? filename

  # NOTE(review): Marshal.load can instantiate arbitrary objects; only call
  # this on files from a trusted source.
  # 'rb' reads the Marshal stream without newline/encoding translation.
  File.open(filename, 'rb') { |fp| Marshal.load(fp) }
end
|
21
183
|
end
|
22
184
|
end
|
@@ -36,15 +36,16 @@ module Daru
|
|
36
36
|
|
37
37
|
# Calculate the exponential of all vectors with numeric values.
|
38
38
|
def exp
|
39
|
-
|
39
|
+
only_numerics(clone: false).recode { |v| v.exp }
|
40
40
|
end
|
41
41
|
|
42
|
+
# Calculate the square root of numeric vectors.
|
42
43
|
def sqrt
|
43
|
-
|
44
|
+
only_numerics(clone: false).recode { |v| v.sqrt }
|
44
45
|
end
|
45
46
|
|
46
47
|
def round precision=0
|
47
|
-
|
48
|
+
only_numerics(clone: false).recode { |v| v.round(precision) }
|
48
49
|
end
|
49
50
|
private
|
50
51
|
|
@@ -35,17 +35,17 @@ module Daru
|
|
35
35
|
end
|
36
36
|
|
37
37
|
def abs
|
38
|
-
self.
|
38
|
+
self.recode { |e| e.abs unless e.nil? }
|
39
39
|
end
|
40
40
|
|
41
41
|
def round precision=0
|
42
|
-
self.
|
42
|
+
self.recode { |e| e.round(precision) unless e.nil? }
|
43
43
|
end
|
44
44
|
|
45
45
|
private
|
46
46
|
|
47
47
|
def math_unary_op operation
|
48
|
-
self.
|
48
|
+
self.recode { |e| Math.send(operation, e) unless e.nil? }
|
49
49
|
end
|
50
50
|
|
51
51
|
def binary_op operation, other
|
@@ -65,7 +65,7 @@ module Daru
|
|
65
65
|
def v2v_binary operation, other
|
66
66
|
common_idxs = []
|
67
67
|
elements = []
|
68
|
-
index = (@index.to_a
|
68
|
+
index = (@index.to_a | other.index.to_a).sort
|
69
69
|
|
70
70
|
index.each do |idx|
|
71
71
|
this = self[idx]
|
@@ -37,6 +37,15 @@ module Daru
|
|
37
37
|
compute_stats :product
|
38
38
|
end
|
39
39
|
|
40
|
+
def standardize
|
41
|
+
df = self.only_numerics clone: true
|
42
|
+
df.map! do |v|
|
43
|
+
v.standardize
|
44
|
+
end
|
45
|
+
|
46
|
+
df
|
47
|
+
end
|
48
|
+
|
40
49
|
# Create a summary of mean, standard deviation, count, max and min of
|
41
50
|
# each numeric vector in the dataframe in one shot.
|
42
51
|
#
|
@@ -55,40 +64,39 @@ module Daru
|
|
55
64
|
Daru::DataFrame.new(description_hash, index: methods)
|
56
65
|
end
|
57
66
|
|
58
|
-
# Calculate variance-covariance between the numeric vectors.
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
67
|
+
# Calculate sample variance-covariance between the numeric vectors.
|
68
|
+
def covariance
|
69
|
+
cache={}
|
70
|
+
vectors = self.numeric_vectors
|
71
|
+
|
72
|
+
mat_rows = vectors.collect do |row|
|
73
|
+
vectors.collect do |col|
|
74
|
+
if row == col
|
75
|
+
self[row].variance
|
76
|
+
else
|
77
|
+
if cache[[col,row]].nil?
|
78
|
+
cov = vector_cov(self[row],self[col])
|
79
|
+
cache[[row,col]] = cov
|
80
|
+
cov
|
81
|
+
else
|
82
|
+
cache[[col,row]]
|
83
|
+
end
|
84
|
+
end
|
85
|
+
end
|
75
86
|
end
|
76
87
|
|
77
|
-
Daru::DataFrame.rows(
|
88
|
+
Daru::DataFrame.rows(mat_rows, index: numeric_vectors, order: numeric_vectors)
|
78
89
|
end
|
79
90
|
|
80
91
|
alias :cov :covariance
|
81
92
|
|
82
93
|
# Calculate the correlation between the numeric vectors.
|
83
94
|
def correlation
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
standard_deviation
|
89
|
-
(cov.to_matrix.elementwise_division(standard_deviation.transpose *
|
90
|
-
standard_deviation)).to_a
|
91
|
-
end
|
95
|
+
standard_deviation = std.to_matrix
|
96
|
+
corr_arry = (cov
|
97
|
+
.to_matrix
|
98
|
+
.elementwise_division(standard_deviation.transpose *
|
99
|
+
standard_deviation)).to_a
|
92
100
|
|
93
101
|
Daru::DataFrame.rows(corr_arry, index: numeric_vectors, order: numeric_vectors)
|
94
102
|
end
|
@@ -97,12 +105,25 @@ module Daru
|
|
97
105
|
|
98
106
|
private
|
99
107
|
|
108
|
+
# Covariance of two vectors: the cross-product sum returned by
# sum_of_squares divided by (v1a.size - 1).
# NOTE(review): sum_of_squares works on only_valid data while the divisor
# uses v1a.size — confirm both agree when the vectors contain missing values.
def vector_cov v1a, v2a
|
109
|
+
sum_of_squares(v1a,v2a) / (v1a.size - 1)
|
110
|
+
end
|
111
|
+
|
112
|
+
# Sum over positions i of (v1a[i] - mean(v1a)) * (v2a[i] - mean(v2a)), where
# v1a and v2a are the only_valid parts of +v1+ and +v2+ after reset_index!.
# Iterates v1a.size positions.
# NOTE(review): presumably both vectors have valid values at the same
# positions; otherwise the pairing by position shifts — confirm with callers.
def sum_of_squares v1, v2
|
113
|
+
v1a,v2a = v1.only_valid ,v2.only_valid
|
114
|
+
v1a.reset_index!
|
115
|
+
v2a.reset_index!
|
116
|
+
m1 = v1a.mean
|
117
|
+
m2 = v2a.mean
|
118
|
+
(v1a.size).times.inject(0) {|ac,i| ac+(v1a[i]-m1)*(v2a[i]-m2)}
|
119
|
+
end
|
120
|
+
|
100
121
|
def compute_stats method
|
101
122
|
Daru::Vector.new(
|
102
123
|
numeric_vectors.inject({}) do |hash, vec|
|
103
124
|
hash[vec] = self[vec].send(method)
|
104
125
|
hash
|
105
|
-
end
|
126
|
+
end, name: method
|
106
127
|
)
|
107
128
|
end
|
108
129
|
end
|
@@ -1,8 +1,8 @@
|
|
1
1
|
module Daru
|
2
2
|
module Maths
|
3
3
|
# Encapsulates statistics methods for vectors. Most basic stuff like mean, etc.
|
4
|
-
#
|
5
|
-
#
|
4
|
+
# is done inside the wrapper, so that native methods can be used for most of
|
5
|
+
# the computationally intensive tasks.
|
6
6
|
module Statistics
|
7
7
|
module Vector
|
8
8
|
def mean
|
@@ -26,7 +26,7 @@ module Daru
|
|
26
26
|
end
|
27
27
|
|
28
28
|
def median
|
29
|
-
percentile
|
29
|
+
@data.respond_to?(:median) ? @data.median : percentile(50)
|
30
30
|
end
|
31
31
|
|
32
32
|
def mode
|
@@ -36,15 +36,21 @@ module Daru
|
|
36
36
|
|
37
37
|
def median_absolute_deviation
|
38
38
|
m = median
|
39
|
-
|
39
|
+
recode {|val| (val - m).abs }.median
|
40
40
|
end
|
41
|
+
alias :mad :median_absolute_deviation
|
41
42
|
|
42
43
|
def standard_error
|
43
|
-
standard_deviation_sample/(Math::sqrt((
|
44
|
+
standard_deviation_sample/(Math::sqrt((n_valid)))
|
44
45
|
end
|
45
46
|
|
46
47
|
def sum_of_squared_deviation
|
47
|
-
(@data.
|
48
|
+
(@data.inject(0) { |a,x| x.square + a } - (sum.square.quo(n_valid)).to_f).to_f
|
49
|
+
end
|
50
|
+
|
51
|
+
# Retrieve unique values of non-nil data
|
52
|
+
def factors
|
53
|
+
only_valid.uniq.reset_index!
|
48
54
|
end
|
49
55
|
|
50
56
|
# Maximum element of the vector.
|
@@ -69,12 +75,18 @@ module Daru
|
|
69
75
|
|
70
76
|
def frequencies
|
71
77
|
@data.inject({}) do |hash, element|
|
72
|
-
|
73
|
-
|
78
|
+
unless element.nil?
|
79
|
+
hash[element] ||= 0
|
80
|
+
hash[element] += 1
|
81
|
+
end
|
74
82
|
hash
|
75
83
|
end
|
76
84
|
end
|
77
85
|
|
86
|
+
# Same counts as #frequencies, wrapped in a Daru::Vector built from that Hash.
def freqs
|
87
|
+
Daru::Vector.new(frequencies)
|
88
|
+
end
|
89
|
+
|
78
90
|
def proportions
|
79
91
|
len = n_valid
|
80
92
|
frequencies.inject({}) { |hash, arr| hash[arr[0]] = arr[1] / len; hash }
|
@@ -83,13 +95,12 @@ module Daru
|
|
83
95
|
def ranked
|
84
96
|
sum = 0
|
85
97
|
r = frequencies.sort.inject( {} ) do |memo, val|
|
86
|
-
memo[val[0]] = ((sum + 1) + (sum + val[1]))
|
98
|
+
memo[val[0]] = ((sum + 1) + (sum + val[1])).quo(2)
|
87
99
|
sum += val[1]
|
88
100
|
memo
|
89
101
|
end
|
90
102
|
|
91
|
-
|
92
|
-
name: self.name, dtype: self.dtype, nm_dtype: self.nm_dtype
|
103
|
+
recode { |e| r[e] }
|
93
104
|
end
|
94
105
|
|
95
106
|
def coefficient_of_variation
|
@@ -107,69 +118,239 @@ module Daru
|
|
107
118
|
val = frequencies[value]
|
108
119
|
val.nil? ? 0 : val
|
109
120
|
else
|
110
|
-
size - @
|
121
|
+
size - @missing_positions.size
|
111
122
|
end
|
112
123
|
end
|
113
124
|
|
114
125
|
# Fraction of the valid (non-missing) elements that are equal to +value+.
#
# Returns a Float; 0.0 when +value+ does not occur in the vector.
def proportion value=1
  # frequencies has no entry for an absent value; count it as zero instead
  # of calling #quo on nil.
  (frequencies[value] || 0).quo(n_valid).to_f
end
|
117
128
|
|
118
129
|
# Sample variance with denominator (N-1)
|
119
130
|
def variance_sample m=nil
|
120
131
|
m ||= self.mean
|
121
|
-
|
132
|
+
if @data.respond_to? :variance_sample
|
133
|
+
@data.variance_sample m
|
134
|
+
else
|
135
|
+
sum_of_squares(m).quo((n_valid) - 1)
|
136
|
+
end
|
122
137
|
end
|
123
138
|
|
124
139
|
# Population variance with denominator (N)
|
125
140
|
def variance_population m=nil
|
126
141
|
m ||= mean
|
127
|
-
|
142
|
+
if @data.respond_to? :variance_population
|
143
|
+
@data.variance_population m
|
144
|
+
else
|
145
|
+
sum_of_squares(m).quo((n_valid)).to_f
|
146
|
+
end
|
128
147
|
end
|
129
148
|
|
130
149
|
def sum_of_squares(m=nil)
|
131
150
|
m ||= mean
|
132
|
-
@data.inject(0) { |memo, val|
|
151
|
+
@data.inject(0) { |memo, val|
|
152
|
+
@missing_values.has_key?(val) ? memo : (memo + (val - m)**2)
|
153
|
+
}
|
133
154
|
end
|
134
155
|
|
135
156
|
def standard_deviation_population m=nil
|
136
157
|
m ||= mean
|
137
|
-
|
158
|
+
if @data.respond_to? :standard_deviation_population
|
159
|
+
@data.standard_deviation_population(m)
|
160
|
+
else
|
161
|
+
Math::sqrt(variance_population(m))
|
162
|
+
end
|
138
163
|
end
|
139
164
|
|
140
165
|
def standard_deviation_sample m=nil
|
141
|
-
|
166
|
+
m ||= mean
|
167
|
+
if @data.respond_to? :standard_deviation_sample
|
168
|
+
@data.standard_deviation_sample m
|
169
|
+
else
|
170
|
+
Math::sqrt(variance_sample(m))
|
171
|
+
end
|
142
172
|
end
|
143
173
|
|
144
174
|
# Calculate skewness using (sigma(xi - mean)^3)/((N)*std_dev_sample^3)
|
145
175
|
# +m+ is an optional precomputed mean; it is only used when the data wrapper
# does not provide a native +skew+. N is the number of non-missing values.
def skew m=nil
  if @data.respond_to? :skew
    @data.skew
  else
    m ||= mean
    # Skip missing values the same way sum_of_squares does; subtracting the
    # mean from a missing value would raise.
    th = @data.inject(0) { |memo, val|
      @missing_values.has_key?(val) ? memo : memo + ((val - m)**3)
    }
    th.quo((@size - @missing_positions.size) * (standard_deviation_sample(m)**3))
  end
end
|
150
184
|
|
151
185
|
# Calculate excess kurtosis using
# (sigma(xi - mean)^4)/((N)*std_dev_sample^4) - 3, where N is the number of
# non-missing values.
#
# +m+ is an optional precomputed mean; it is only used when the data wrapper
# does not provide a native +kurtosis+.
def kurtosis m=nil
  if @data.respond_to? :kurtosis
    @data.kurtosis
  else
    m ||= mean
    # Skip missing values the same way sum_of_squares does; subtracting the
    # mean from a missing value would raise.
    fo = @data.inject(0) { |a, x|
      @missing_values.has_key?(x) ? a : a + ((x - m) ** 4)
    }
    fo.quo((@size - @missing_positions.size) * standard_deviation_sample(m) ** 4) - 3
  end
end
|
156
194
|
|
157
195
|
def average_deviation_population m=nil
|
196
|
+
type == :numeric or raise TypeError, "Vector must be numeric"
|
197
|
+
m ||= mean
|
198
|
+
(@data.inject( 0 ) { |memo, val|
|
199
|
+
@missing_values.has_key?(val) ? memo : ( val - m ).abs + memo
|
200
|
+
}).quo( n_valid )
|
201
|
+
end
|
202
|
+
|
203
|
+
# Returns the value of the percentile q
|
204
|
+
#
|
205
|
+
# Accepts an optional second argument specifying the strategy to interpolate
|
206
|
+
# when the requested percentile lies between two data points a and b
|
207
|
+
# Valid strategies are:
|
208
|
+
# * :midpoint (Default): (a + b) / 2
|
209
|
+
# * :linear : a + (b - a) * d where d is the decimal part of the index between a and b.
|
210
|
+
# == References
|
211
|
+
#
|
212
|
+
# This is the NIST recommended method (http://en.wikipedia.org/wiki/Percentile#NIST_method)
|
213
|
+
def percentile(q, strategy = :midpoint)
  sorted = only_valid(:array).sort

  case strategy
  when :midpoint
    # No valid data: there is nothing to take a percentile of.
    return nil if sorted.empty?

    v = (n_valid * q).quo(100)
    if(v.to_i!=v)
      sorted[v.to_i]
    else
      # v falls exactly between two ranks: average the two neighbours. The
      # upper neighbour is clamped to the last element so that q = 100
      # yields the maximum instead of reading past the end of the data.
      lower = (v-0.5).to_i
      upper = [(v+0.5).to_i, sorted.size - 1].min
      (sorted[lower].to_f + sorted[upper]).quo(2)
    end
  when :linear
    # Rank position on a 1-based scale over n_valid + 1 slots.
    index = (q / 100.0) * (n_valid + 1)

    k = index.truncate
    d = index % 1

    if k == 0
      sorted[0]
    elsif k >= sorted.size
      sorted[-1]
    else
      # Interpolate between the k-th and (k+1)-th smallest values.
      sorted[k - 1] + d * (sorted[k] - sorted[k - 1])
    end
  else
    raise NotImplementedError.new "Unknown strategy #{strategy.to_s}"
  end
end
|
241
|
+
|
242
|
+
# Dichotomize the vector with 0 and 1, based on lowest value.
|
243
|
+
# If parameter is defined, this value and lower will be 0
|
244
|
+
# and higher, 1.
|
245
|
+
def dichotomize(low = nil)
|
246
|
+
low ||= factors.min
|
247
|
+
|
248
|
+
self.recode do |x|
|
249
|
+
if x.nil?
|
250
|
+
nil
|
251
|
+
elsif x > low
|
252
|
+
1
|
253
|
+
else
|
254
|
+
0
|
255
|
+
end
|
256
|
+
end
|
257
|
+
end
|
258
|
+
|
259
|
+
# Center data by subtracting the mean from each non-nil value.
|
260
|
+
def center
|
261
|
+
self - mean
|
262
|
+
end
|
263
|
+
|
264
|
+
# Standardize data.
|
265
|
+
#
|
266
|
+
# == Arguments
|
267
|
+
#
|
268
|
+
# * use_population - Pass as *true* if you want to use population
|
269
|
+
# standard deviation instead of sample standard deviation.
|
270
|
+
def standardize use_population=false
|
158
271
|
m ||= mean
|
159
|
-
|
272
|
+
sd = use_population ? sdp : sds
|
273
|
+
return Daru::Vector.new([nil]*@size) if m.nil? or sd == 0.0
|
274
|
+
|
275
|
+
vector_standardized_compute m, sd
|
160
276
|
end
|
161
277
|
|
162
|
-
def
|
163
|
-
@
|
278
|
+
def box_cox_transformation lambda # :nodoc:
|
279
|
+
raise "Should be a numeric" unless @type == :numeric
|
280
|
+
|
281
|
+
self.recode do |x|
|
282
|
+
if !x.nil?
|
283
|
+
if(lambda == 0)
|
284
|
+
Math.log(x)
|
285
|
+
else
|
286
|
+
(x ** lambda - 1).quo(lambda)
|
287
|
+
end
|
288
|
+
else
|
289
|
+
nil
|
290
|
+
end
|
291
|
+
end
|
164
292
|
end
|
165
293
|
|
166
|
-
|
167
|
-
|
168
|
-
|
169
|
-
|
170
|
-
|
294
|
+
# Replace each non-nil value in the vector with its percentile.
|
295
|
+
def vector_percentile
|
296
|
+
c = size - missing_positions.size
|
297
|
+
ranked.recode! { |i| i.nil? ? nil : (i.quo(c)*100).to_f }
|
298
|
+
end
|
299
|
+
|
300
|
+
def vector_standardized_compute(m,sd)
|
301
|
+
if @data.respond_to? :vector_standardized_compute
|
302
|
+
@data.vector_standardized_compute(m,sd)
|
171
303
|
else
|
172
|
-
|
304
|
+
Daru::Vector.new @data.collect { |x| x.nil? ? nil : (x.to_f - m).quo(sd) },
|
305
|
+
index: index, name: name, dtype: dtype
|
306
|
+
end
|
307
|
+
end
|
308
|
+
|
309
|
+
def vector_centered_compute(m)
|
310
|
+
if @data.respond_to? :vector_centered_compute
|
311
|
+
@data.vector_centered_compute(m)
|
312
|
+
else
|
313
|
+
Daru::Vector.new @data.collect { |x| x.nil? ? nil : x.to_f-m },
|
314
|
+
index: index, name: name, dtype: dtype
|
315
|
+
end
|
316
|
+
end
|
317
|
+
|
318
|
+
# Returns a random sample of size n, with replacement,
|
319
|
+
# only with non-nil data.
|
320
|
+
#
|
321
|
+
# In every trial, each item has the same probability
|
322
|
+
# of being selected.
|
323
|
+
def sample_with_replacement(sample=1)
|
324
|
+
if @data.respond_to? :sample_with_replacement
|
325
|
+
@data.sample_with_replacement sample
|
326
|
+
else
|
327
|
+
valid = missing_positions.empty? ? self : self.only_valid
|
328
|
+
vds = valid.size
|
329
|
+
(0...sample).collect{ valid[rand(vds)] }
|
330
|
+
end
|
331
|
+
end
|
332
|
+
|
333
|
+
# Returns a random sample of size n, without replacement,
|
334
|
+
# only with valid data.
|
335
|
+
#
|
336
|
+
# Every element could only be selected once.
|
337
|
+
#
|
338
|
+
# A sample of the same size of the vector is the vector itself.
|
339
|
+
def sample_without_replacement(sample=1)
|
340
|
+
if @data.respond_to? :sample_without_replacement
|
341
|
+
@data.sample_without_replacement sample
|
342
|
+
else
|
343
|
+
valid = missing_positions.empty? ? self : self.only_valid
|
344
|
+
raise ArgumentError, "Sample size couldn't be greater than n" if
|
345
|
+
sample > valid.size
|
346
|
+
out = []
|
347
|
+
size = valid.size
|
348
|
+
while out.size < sample
|
349
|
+
value = rand(size)
|
350
|
+
out.push(value) if !out.include?(value)
|
351
|
+
end
|
352
|
+
|
353
|
+
out.collect{|i| valid[i]}
|
173
354
|
end
|
174
355
|
end
|
175
356
|
|
@@ -182,6 +363,7 @@ module Daru
|
|
182
363
|
alias :sd :standard_deviation_sample
|
183
364
|
alias :ss :sum_of_squares
|
184
365
|
alias :percentil :percentile
|
366
|
+
alias :se :standard_error
|
185
367
|
end
|
186
368
|
end
|
187
369
|
end
|