daru 0.1.3.1 → 0.1.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +2 -0
- data/.rspec +2 -1
- data/.rspec_formatter.rb +33 -0
- data/.rubocop.yml +26 -2
- data/History.md +38 -0
- data/README.md +22 -13
- data/Rakefile +50 -2
- data/benchmarks/csv_reading.rb +22 -0
- data/daru.gemspec +9 -2
- data/lib/daru.rb +36 -4
- data/lib/daru/accessors/array_wrapper.rb +6 -1
- data/lib/daru/accessors/dataframe_by_row.rb +10 -2
- data/lib/daru/accessors/gsl_wrapper.rb +1 -3
- data/lib/daru/accessors/nmatrix_wrapper.rb +9 -0
- data/lib/daru/category.rb +935 -0
- data/lib/daru/core/group_by.rb +29 -38
- data/lib/daru/core/merge.rb +186 -145
- data/lib/daru/core/query.rb +22 -11
- data/lib/daru/dataframe.rb +976 -885
- data/lib/daru/date_time/index.rb +166 -166
- data/lib/daru/date_time/offsets.rb +66 -77
- data/lib/daru/formatters/table.rb +54 -0
- data/lib/daru/helpers/array.rb +40 -0
- data/lib/daru/index.rb +476 -73
- data/lib/daru/io/io.rb +66 -45
- data/lib/daru/io/sql_data_source.rb +33 -62
- data/lib/daru/iruby/helpers.rb +38 -0
- data/lib/daru/iruby/templates/dataframe.html.erb +52 -0
- data/lib/daru/iruby/templates/dataframe_mi.html.erb +58 -0
- data/lib/daru/iruby/templates/multi_index.html.erb +12 -0
- data/lib/daru/iruby/templates/vector.html.erb +27 -0
- data/lib/daru/iruby/templates/vector_mi.html.erb +36 -0
- data/lib/daru/maths/arithmetic/dataframe.rb +16 -18
- data/lib/daru/maths/arithmetic/vector.rb +4 -6
- data/lib/daru/maths/statistics/dataframe.rb +8 -15
- data/lib/daru/maths/statistics/vector.rb +120 -98
- data/lib/daru/monkeys.rb +12 -40
- data/lib/daru/plotting/gruff.rb +3 -0
- data/lib/daru/plotting/gruff/category.rb +49 -0
- data/lib/daru/plotting/gruff/dataframe.rb +91 -0
- data/lib/daru/plotting/gruff/vector.rb +57 -0
- data/lib/daru/plotting/nyaplot.rb +3 -0
- data/lib/daru/plotting/nyaplot/category.rb +34 -0
- data/lib/daru/plotting/nyaplot/dataframe.rb +187 -0
- data/lib/daru/plotting/nyaplot/vector.rb +46 -0
- data/lib/daru/vector.rb +694 -421
- data/lib/daru/version.rb +1 -1
- data/profile/_base.rb +23 -0
- data/profile/df_to_a.rb +10 -0
- data/profile/filter.rb +13 -0
- data/profile/joining.rb +13 -0
- data/profile/sorting.rb +12 -0
- data/profile/vector_each_with_index.rb +9 -0
- data/spec/accessors/wrappers_spec.rb +2 -4
- data/spec/categorical_spec.rb +1734 -0
- data/spec/core/group_by_spec.rb +52 -2
- data/spec/core/merge_spec.rb +63 -2
- data/spec/core/query_spec.rb +236 -80
- data/spec/dataframe_spec.rb +1373 -79
- data/spec/date_time/data_spec.rb +3 -5
- data/spec/date_time/index_spec.rb +154 -17
- data/spec/date_time/offsets_spec.rb +3 -4
- data/spec/fixtures/empties.dat +2 -0
- data/spec/fixtures/strings.dat +2 -0
- data/spec/formatters/table_formatter_spec.rb +99 -0
- data/spec/helpers_spec.rb +8 -0
- data/spec/index/categorical_index_spec.rb +168 -0
- data/spec/index/index_spec.rb +283 -0
- data/spec/index/multi_index_spec.rb +570 -0
- data/spec/io/io_spec.rb +31 -4
- data/spec/io/sql_data_source_spec.rb +0 -1
- data/spec/iruby/dataframe_spec.rb +172 -0
- data/spec/iruby/helpers_spec.rb +49 -0
- data/spec/iruby/multi_index_spec.rb +37 -0
- data/spec/iruby/vector_spec.rb +107 -0
- data/spec/math/arithmetic/dataframe_spec.rb +71 -13
- data/spec/math/arithmetic/vector_spec.rb +8 -10
- data/spec/math/statistics/dataframe_spec.rb +3 -5
- data/spec/math/statistics/vector_spec.rb +45 -55
- data/spec/monkeys_spec.rb +32 -9
- data/spec/plotting/dataframe_spec.rb +386 -0
- data/spec/plotting/vector_spec.rb +230 -0
- data/spec/shared/vector_display_spec.rb +215 -0
- data/spec/spec_helper.rb +23 -0
- data/spec/vector_spec.rb +905 -138
- metadata +143 -11
- data/.rubocop_todo.yml +0 -44
- data/lib/daru/plotting/dataframe.rb +0 -104
- data/lib/daru/plotting/vector.rb +0 -38
- data/spec/daru_spec.rb +0 -58
- data/spec/index_spec.rb +0 -375
data/lib/daru/core/query.rb
CHANGED
@@ -33,7 +33,7 @@ module Daru
|
|
33
33
|
end
|
34
34
|
|
35
35
|
def inspect
|
36
|
-
"
|
36
|
+
"#<#{self.class}:#{object_id} bool_arry=#{@barry}>"
|
37
37
|
end
|
38
38
|
end
|
39
39
|
|
@@ -56,17 +56,28 @@ module Daru
|
|
56
56
|
)
|
57
57
|
end
|
58
58
|
|
59
|
-
def vector_where
|
60
|
-
new_data =
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
59
|
+
def vector_where dv, bool_array
|
60
|
+
new_data, new_index = fetch_new_data_and_index dv, bool_array
|
61
|
+
|
62
|
+
resultant_dv = Daru::Vector.new new_data,
|
63
|
+
index: dv.index.class.new(new_index),
|
64
|
+
dtype: dv.dtype,
|
65
|
+
type: dv.type,
|
66
|
+
name: dv.name
|
67
|
+
|
68
|
+
# Preserve categories order for category vector
|
69
|
+
resultant_dv.categories = dv.categories if dv.category?
|
70
|
+
resultant_dv
|
71
|
+
end
|
72
|
+
|
73
|
+
private
|
68
74
|
|
69
|
-
|
75
|
+
def fetch_new_data_and_index dv, bool_array
|
76
|
+
barry = bool_array.to_a
|
77
|
+
positions = dv.size.times.select { |i| barry[i] }
|
78
|
+
new_data = dv.to_a.values_at(*positions)
|
79
|
+
new_index = dv.index.to_a.values_at(*positions)
|
80
|
+
[new_data, new_index]
|
70
81
|
end
|
71
82
|
end
|
72
83
|
end
|
data/lib/daru/dataframe.rb
CHANGED
@@ -1,14 +1,17 @@
|
|
1
1
|
require 'daru/accessors/dataframe_by_row.rb'
|
2
2
|
require 'daru/maths/arithmetic/dataframe.rb'
|
3
3
|
require 'daru/maths/statistics/dataframe.rb'
|
4
|
-
require 'daru/plotting/
|
4
|
+
require 'daru/plotting/gruff.rb'
|
5
|
+
require 'daru/plotting/nyaplot.rb'
|
5
6
|
require 'daru/io/io.rb'
|
6
7
|
|
7
8
|
module Daru
|
8
|
-
class DataFrame
|
9
|
+
class DataFrame # rubocop:disable Metrics/ClassLength
|
9
10
|
include Daru::Maths::Arithmetic::DataFrame
|
10
11
|
include Daru::Maths::Statistics::DataFrame
|
11
|
-
|
12
|
+
# TODO: Remove this line, but it's causing errors due to an unknown reason
|
13
|
+
include Daru::Plotting::DataFrame::NyaplotLibrary if Daru.has_nyaplot?
|
14
|
+
extend Gem::Deprecate
|
12
15
|
|
13
16
|
class << self
|
14
17
|
# Load data from a CSV file. Specify an optional block to grab the CSV
|
@@ -112,29 +115,17 @@ module Daru
|
|
112
115
|
# Create DataFrame by specifying rows as an Array of Arrays or Array of
|
113
116
|
# Daru::Vector objects.
|
114
117
|
def rows source, opts={}
|
115
|
-
first = source.first
|
116
|
-
|
117
118
|
raise SizeError, 'All vectors must have same length' \
|
118
|
-
unless source.all? { |v| v.size == first.size }
|
119
|
-
|
120
|
-
index = []
|
121
|
-
opts[:order] ||=
|
122
|
-
case first
|
123
|
-
when Daru::Vector # assume that all are Vectors
|
124
|
-
index = source.map(&:name)
|
125
|
-
first.index.to_a
|
126
|
-
when Array
|
127
|
-
Array.new(first.size, &:to_s)
|
128
|
-
end
|
119
|
+
unless source.all? { |v| v.size == source.first.size }
|
129
120
|
|
130
|
-
|
131
|
-
|
132
|
-
|
133
|
-
|
134
|
-
|
135
|
-
|
136
|
-
|
137
|
-
|
121
|
+
opts[:order] ||= guess_order(source)
|
122
|
+
|
123
|
+
if ArrayHelper.array_of?(source, Array)
|
124
|
+
DataFrame.new(source.transpose, opts)
|
125
|
+
elsif ArrayHelper.array_of?(source, Vector)
|
126
|
+
from_vector_rows(source, opts)
|
127
|
+
else
|
128
|
+
raise ArgumentError, "Can't create DataFrame from #{source}"
|
138
129
|
end
|
139
130
|
end
|
140
131
|
|
@@ -161,36 +152,47 @@ module Daru
|
|
161
152
|
raise 'Three vectors should be equal size' if
|
162
153
|
rows.size != columns.size || rows.size!=values.size
|
163
154
|
|
164
|
-
|
165
|
-
|
155
|
+
data = Hash.new { |h, col|
|
156
|
+
h[col] = rows.factors.map { |r| [r, nil] }.to_h
|
157
|
+
}
|
158
|
+
columns.zip(rows, values).each { |c, r, v| data[c][r] = v }
|
166
159
|
|
167
|
-
|
168
|
-
|
169
|
-
|
170
|
-
|
171
|
-
|
160
|
+
# FIXME: in fact, WITHOUT this line you'll obtain more "right"
|
161
|
+
# data: with vectors having "rows" as an index...
|
162
|
+
data = data.map { |c, r| [c, r.values] }.to_h
|
163
|
+
data[:_id] = rows.factors
|
164
|
+
|
165
|
+
DataFrame.new(data)
|
166
|
+
end
|
167
|
+
|
168
|
+
private
|
172
169
|
|
173
|
-
|
174
|
-
|
170
|
+
def guess_order source
|
171
|
+
case source.first
|
172
|
+
when Vector # assume that all are Vectors
|
173
|
+
source.first.index.to_a
|
174
|
+
when Array
|
175
|
+
Array.new(source.first.size, &:to_s)
|
175
176
|
end
|
176
|
-
|
177
|
+
end
|
177
178
|
|
178
|
-
|
179
|
-
|
180
|
-
|
181
|
-
|
182
|
-
n_row[i+1] = h_rows[row][cols_values[i]]
|
183
|
-
end
|
179
|
+
def from_vector_rows source, opts
|
180
|
+
index = source.map(&:name)
|
181
|
+
.each_with_index.map { |n, i| n || i }
|
182
|
+
index = ArrayHelper.recode_repeated(index)
|
184
183
|
|
185
|
-
|
184
|
+
DataFrame.new({}, opts).tap do |df|
|
185
|
+
source.each_with_index do |row, idx|
|
186
|
+
df[index[idx] || idx, :row] = row
|
187
|
+
end
|
186
188
|
end
|
187
|
-
df.update
|
188
|
-
df
|
189
189
|
end
|
190
190
|
end
|
191
191
|
|
192
192
|
# The vectors (columns) index of the DataFrame
|
193
193
|
attr_reader :vectors
|
194
|
+
# TOREMOVE
|
195
|
+
attr_reader :data
|
194
196
|
|
195
197
|
# The index of the rows of the DataFrame
|
196
198
|
attr_reader :index
|
@@ -237,135 +239,181 @@ module Daru
|
|
237
239
|
# # b 7 2
|
238
240
|
# # c 8 3
|
239
241
|
# # d 9 4
|
240
|
-
def initialize source, opts={}
|
241
|
-
vectors = opts[:order]
|
242
|
-
|
243
|
-
|
244
|
-
|
245
|
-
|
246
|
-
|
247
|
-
|
248
|
-
|
249
|
-
if source.empty?
|
250
|
-
@vectors = try_create_index vectors
|
251
|
-
@index = try_create_index index
|
242
|
+
def initialize source, opts={} # rubocop:disable Metrics/MethodLength
|
243
|
+
vectors, index = opts[:order], opts[:index] # FIXME: just keyword arges after Ruby 2.1
|
244
|
+
@data = []
|
245
|
+
@name = opts[:name]
|
246
|
+
|
247
|
+
case source
|
248
|
+
when ->(s) { s.empty? }
|
249
|
+
@vectors = Index.coerce vectors
|
250
|
+
@index = Index.coerce index
|
252
251
|
create_empty_vectors
|
253
|
-
|
254
|
-
|
255
|
-
|
256
|
-
|
257
|
-
raise ArgumentError, "Number of vectors (#{vectors.size}) should \
|
258
|
-
equal order size (#{source.size})" if source.size != vectors.size
|
259
|
-
|
260
|
-
@index = try_create_index(index || source[0].size)
|
261
|
-
@vectors = try_create_index(vectors)
|
262
|
-
|
263
|
-
@vectors.each_with_index do |_vec,idx|
|
264
|
-
@data << Daru::Vector.new(source[idx], index: @index)
|
265
|
-
end
|
266
|
-
elsif source.all? { |s| s.is_a?(Daru::Vector) }
|
267
|
-
hsh = {}
|
268
|
-
vectors.each_with_index do |name, idx|
|
269
|
-
hsh[name] = source[idx]
|
270
|
-
end
|
271
|
-
initialize(hsh, index: index, order: vectors, name: @name, clone: clone)
|
272
|
-
else # array of hashes
|
273
|
-
@vectors =
|
274
|
-
if vectors.nil?
|
275
|
-
Daru::Index.new source[0].keys
|
276
|
-
else
|
277
|
-
Daru::Index.new((vectors + (source[0].keys - vectors)).uniq)
|
278
|
-
end
|
279
|
-
@index = Daru::Index.new(index || source.size)
|
280
|
-
|
281
|
-
@vectors.each do |name|
|
282
|
-
v = []
|
283
|
-
source.each do |h|
|
284
|
-
v << (h[name] || h[name.to_s])
|
285
|
-
end
|
286
|
-
|
287
|
-
@data << Daru::Vector.new(v, name: set_name(name), index: @index)
|
288
|
-
end
|
289
|
-
end
|
290
|
-
when Hash
|
291
|
-
create_vectors_index_with vectors, source
|
292
|
-
if all_daru_vectors_in_source? source
|
293
|
-
vectors_have_same_index = all_vectors_have_equal_indexes?(source)
|
294
|
-
if !index.nil?
|
295
|
-
@index = try_create_index index
|
296
|
-
elsif vectors_have_same_index
|
297
|
-
@index = source.values[0].index.dup
|
298
|
-
else
|
299
|
-
all_indexes = []
|
300
|
-
source.each_value do |vector|
|
301
|
-
all_indexes << vector.index.to_a
|
302
|
-
end
|
303
|
-
# sort only if missing indexes detected
|
304
|
-
all_indexes.flatten!.uniq!.sort!
|
305
|
-
|
306
|
-
@index = Daru::Index.new all_indexes
|
307
|
-
clone = true
|
308
|
-
end
|
309
|
-
|
310
|
-
if clone
|
311
|
-
@vectors.each do |vector|
|
312
|
-
# avoids matching indexes of vectors if all the supplied vectors
|
313
|
-
# have the same index.
|
314
|
-
if vectors_have_same_index
|
315
|
-
v = source[vector].dup
|
316
|
-
else
|
317
|
-
v = Daru::Vector.new([], name: vector, metadata: source[vector].metadata.dup, index: @index)
|
318
|
-
|
319
|
-
@index.each do |idx|
|
320
|
-
v[idx] = source[vector].index.include?(idx) ? source[vector][idx] : nil
|
321
|
-
end
|
322
|
-
end
|
323
|
-
@data << v
|
324
|
-
end
|
325
|
-
else
|
326
|
-
@data.concat source.values
|
327
|
-
end
|
328
|
-
else
|
329
|
-
@index = try_create_index(index || source.values[0].size)
|
330
|
-
|
331
|
-
@vectors.each do |name|
|
332
|
-
meta_opt = source[name].respond_to?(:metadata) ? {metadata: source[name].metadata.dup} : {}
|
333
|
-
@data << Daru::Vector.new(source[name].dup, name: set_name(name), **meta_opt, index: @index)
|
334
|
-
end
|
335
|
-
end
|
336
|
-
end
|
252
|
+
when Array
|
253
|
+
initialize_from_array source, vectors, index, opts
|
254
|
+
when Hash
|
255
|
+
initialize_from_hash source, vectors, index, opts
|
337
256
|
end
|
338
257
|
|
339
258
|
set_size
|
340
259
|
validate
|
341
260
|
update
|
261
|
+
self.plotting_library = Daru.plotting_library
|
342
262
|
end
|
343
263
|
|
344
|
-
def
|
345
|
-
|
346
|
-
|
264
|
+
def plotting_library= lib
|
265
|
+
case lib
|
266
|
+
when :gruff, :nyaplot
|
267
|
+
@plotting_library = lib
|
268
|
+
extend Module.const_get(
|
269
|
+
"Daru::Plotting::DataFrame::#{lib.to_s.capitalize}Library"
|
270
|
+
) if Daru.send("has_#{lib}?".to_sym)
|
271
|
+
else
|
272
|
+
raise ArguementError, "Plotting library #{lib} not supported. "\
|
273
|
+
'Supported libraries are :nyaplot and :gruff'
|
274
|
+
end
|
347
275
|
end
|
348
276
|
|
349
277
|
# Access row or vector. Specify name of row/vector followed by axis(:row, :vector).
|
350
278
|
# Defaults to *:vector*. Use of this method is not recommended for accessing
|
351
279
|
# rows. Use df.row[:a] for accessing row with index ':a'.
|
352
280
|
def [](*names)
|
353
|
-
|
354
|
-
|
355
|
-
|
281
|
+
axis = extract_axis(names, :vector)
|
282
|
+
dispatch_to_axis axis, :access, *names
|
283
|
+
end
|
284
|
+
|
285
|
+
# Retrieve rows by positions
|
286
|
+
# @param [Array<Integer>] *positions positions of rows to retrieve
|
287
|
+
# @return [Daru::Vector, Daru::DataFrame] vector for single position and dataframe for multiple positions
|
288
|
+
# @example
|
289
|
+
# df = Daru::DataFrame.new({
|
290
|
+
# a: [1, 2, 3],
|
291
|
+
# b: ['a', 'b', 'c']
|
292
|
+
# })
|
293
|
+
# df.row_at 1, 2
|
294
|
+
# # => #<Daru::DataFrame(2x2)>
|
295
|
+
# # a b
|
296
|
+
# # 1 2 b
|
297
|
+
# # 2 3 c
|
298
|
+
def row_at *positions
|
299
|
+
original_positions = positions
|
300
|
+
positions = coerce_positions(*positions, nrows)
|
301
|
+
validate_positions(*positions, nrows)
|
302
|
+
|
303
|
+
if positions.is_a? Integer
|
304
|
+
return Daru::Vector.new @data.map { |vec| vec.at(*positions) },
|
305
|
+
index: @vectors
|
356
306
|
else
|
357
|
-
|
307
|
+
new_rows = @data.map { |vec| vec.at(*original_positions) }
|
308
|
+
return Daru::DataFrame.new new_rows,
|
309
|
+
index: @index.at(*original_positions),
|
310
|
+
order: @vectors
|
358
311
|
end
|
312
|
+
end
|
359
313
|
|
360
|
-
|
361
|
-
|
362
|
-
|
363
|
-
|
314
|
+
# Set rows by positions
|
315
|
+
# @param [Array<Integer>] positions positions of rows to set
|
316
|
+
# @param [Array, Daru::Vector] vector vector to be assigned
|
317
|
+
# @example
|
318
|
+
# df = Daru::DataFrame.new({
|
319
|
+
# a: [1, 2, 3],
|
320
|
+
# b: ['a', 'b', 'c']
|
321
|
+
# })
|
322
|
+
# df.set_row_at [0, 1], ['x', 'x']
|
323
|
+
# df
|
324
|
+
# #=> #<Daru::DataFrame(3x2)>
|
325
|
+
# # a b
|
326
|
+
# # 0 x x
|
327
|
+
# # 1 x x
|
328
|
+
# # 2 3 c
|
329
|
+
def set_row_at positions, vector
|
330
|
+
validate_positions(*positions, nrows)
|
331
|
+
vector =
|
332
|
+
if vector.is_a? Daru::Vector
|
333
|
+
vector.reindex @vectors
|
334
|
+
else
|
335
|
+
Daru::Vector.new vector
|
336
|
+
end
|
337
|
+
|
338
|
+
raise SizeError, 'Vector length should match row length' if
|
339
|
+
vector.size != @vectors.size
|
340
|
+
|
341
|
+
@data.each_with_index do |vec, pos|
|
342
|
+
vec.set_at(positions, vector.at(pos))
|
343
|
+
end
|
344
|
+
@index = @data[0].index
|
345
|
+
set_size
|
346
|
+
end
|
347
|
+
|
348
|
+
# Retrieve vectors by positions
|
349
|
+
# @param [Array<Integer>] *positions positions of vectors to retrieve
|
350
|
+
# @return [Daru::Vector, Daru::DataFrame] vector for single position and dataframe for multiple positions
|
351
|
+
# @example
|
352
|
+
# df = Daru::DataFrame.new({
|
353
|
+
# a: [1, 2, 3],
|
354
|
+
# b: ['a', 'b', 'c']
|
355
|
+
# })
|
356
|
+
# df.at 0
|
357
|
+
# # => #<Daru::Vector(3)>
|
358
|
+
# # a
|
359
|
+
# # 0 1
|
360
|
+
# # 1 2
|
361
|
+
# # 2 3
|
362
|
+
def at *positions
|
363
|
+
if AXES.include? positions.last
|
364
|
+
axis = positions.pop
|
365
|
+
return row_at(*positions) if axis == :row
|
366
|
+
end
|
367
|
+
|
368
|
+
original_positions = positions
|
369
|
+
positions = coerce_positions(*positions, ncols)
|
370
|
+
validate_positions(*positions, ncols)
|
371
|
+
|
372
|
+
if positions.is_a? Integer
|
373
|
+
@data[positions].dup
|
364
374
|
else
|
365
|
-
|
375
|
+
Daru::DataFrame.new positions.map { |pos| @data[pos].dup },
|
376
|
+
index: @index,
|
377
|
+
order: @vectors.at(*original_positions),
|
378
|
+
name: @name
|
366
379
|
end
|
367
380
|
end
|
368
381
|
|
382
|
+
# Set vectors by positions
|
383
|
+
# @param [Array<Integer>] positions positions of vectors to set
|
384
|
+
# @param [Array, Daru::Vector] vector vector to be assigned
|
385
|
+
# @example
|
386
|
+
# df = Daru::DataFrame.new({
|
387
|
+
# a: [1, 2, 3],
|
388
|
+
# b: ['a', 'b', 'c']
|
389
|
+
# })
|
390
|
+
# df.set_at [0], ['x', 'y', 'z']
|
391
|
+
# df
|
392
|
+
# #=> #<Daru::DataFrame(3x2)>
|
393
|
+
# # a b
|
394
|
+
# # 0 x a
|
395
|
+
# # 1 y b
|
396
|
+
# # 2 z c
|
397
|
+
def set_at positions, vector
|
398
|
+
if positions.last == :row
|
399
|
+
positions.pop
|
400
|
+
return set_row_at(positions, vector)
|
401
|
+
end
|
402
|
+
|
403
|
+
validate_positions(*positions, ncols)
|
404
|
+
vector =
|
405
|
+
if vector.is_a? Daru::Vector
|
406
|
+
vector.reindex @index
|
407
|
+
else
|
408
|
+
Daru::Vector.new vector
|
409
|
+
end
|
410
|
+
|
411
|
+
raise SizeError, 'Vector length should match index length' if
|
412
|
+
vector.size != @index.size
|
413
|
+
|
414
|
+
positions.each { |pos| @data[pos] = vector }
|
415
|
+
end
|
416
|
+
|
369
417
|
# Insert a new row/vector of the specified name or modify a previous row.
|
370
418
|
# Instead of using this method directly, use df.row[:a] = [1,2,3] to set/create
|
371
419
|
# a row ':a' to [1,2,3], or df.vector[:vec] = [1,2,3] for vectors.
|
@@ -374,25 +422,11 @@ module Daru
|
|
374
422
|
# of the vector will be matched against the row/vector indexes of the DataFrame
|
375
423
|
# before an insertion is performed. Unmatched indexes will be set to nil.
|
376
424
|
def []=(*args)
|
377
|
-
|
378
|
-
args
|
379
|
-
args
|
425
|
+
vector = args.pop
|
426
|
+
axis = extract_axis(args)
|
427
|
+
names = args
|
380
428
|
|
381
|
-
|
382
|
-
vector = args[-1]
|
383
|
-
|
384
|
-
if axis == :vector
|
385
|
-
insert_or_modify_vector name, vector
|
386
|
-
elsif axis == :row
|
387
|
-
insert_or_modify_row name, vector
|
388
|
-
else
|
389
|
-
raise IndexError, "Expected axis to be row or vector, not #{axis}."
|
390
|
-
end
|
391
|
-
end
|
392
|
-
|
393
|
-
# Access a vector by name.
|
394
|
-
def column name
|
395
|
-
vector[name]
|
429
|
+
dispatch_to_axis axis, :insert_or_modify, names, vector
|
396
430
|
end
|
397
431
|
|
398
432
|
def add_row row, index=nil
|
@@ -421,10 +455,7 @@ module Daru
|
|
421
455
|
def dup vectors_to_dup=nil
|
422
456
|
vectors_to_dup = @vectors.to_a unless vectors_to_dup
|
423
457
|
|
424
|
-
src = []
|
425
|
-
vectors_to_dup.each do |vec|
|
426
|
-
src << @data[@vectors[vec]].dup
|
427
|
-
end
|
458
|
+
src = vectors_to_dup.map { |vec| @data[@vectors[vec]].dup }
|
428
459
|
new_order = Daru::Index.new(vectors_to_dup)
|
429
460
|
|
430
461
|
Daru::DataFrame.new src, order: new_order, index: @index.dup, name: @name, clone: true
|
@@ -443,20 +474,18 @@ module Daru
|
|
443
474
|
# +vectors_to_clone+ - Names of vectors to clone. Optional. Will return
|
444
475
|
# a view of the whole data frame otherwise.
|
445
476
|
def clone *vectors_to_clone
|
446
|
-
vectors_to_clone.flatten!
|
477
|
+
vectors_to_clone.flatten! if ArrayHelper.array_of?(vectors_to_clone, Array)
|
447
478
|
vectors_to_clone = @vectors.to_a if vectors_to_clone.empty?
|
448
479
|
|
449
|
-
h = vectors_to_clone.
|
450
|
-
|
451
|
-
end
|
452
|
-
Daru::DataFrame.new(h, clone: false)
|
480
|
+
h = vectors_to_clone.map { |vec| [vec, self[vec]] }.to_h
|
481
|
+
Daru::DataFrame.new(h, clone: false, order: vectors_to_clone, name: @name)
|
453
482
|
end
|
454
483
|
|
455
484
|
# Returns a 'shallow' copy of DataFrame if missing data is not present,
|
456
485
|
# or a full copy of only valid data if missing data is present.
|
457
486
|
def clone_only_valid
|
458
|
-
if
|
459
|
-
|
487
|
+
if include_values?(*Daru::MISSING_VALUES)
|
488
|
+
reject_values(*Daru::MISSING_VALUES)
|
460
489
|
else
|
461
490
|
clone
|
462
491
|
end
|
@@ -465,19 +494,76 @@ module Daru
|
|
465
494
|
# Creates a new duplicate dataframe containing only rows
|
466
495
|
# without a single missing value.
|
467
496
|
def dup_only_valid vecs=nil
|
468
|
-
rows_with_nil = @data.
|
469
|
-
|
470
|
-
|
497
|
+
rows_with_nil = @data.map { |vec| vec.indexes(*Daru::MISSING_VALUES) }
|
498
|
+
.inject(&:concat)
|
499
|
+
.uniq
|
471
500
|
|
472
501
|
row_indexes = @index.to_a
|
473
502
|
(vecs.nil? ? self : dup(vecs)).row[*(row_indexes - rows_with_nil)]
|
474
503
|
end
|
504
|
+
deprecate :dup_only_valid, :reject_values, 2016, 10
|
505
|
+
|
506
|
+
# Returns a dataframe in which rows with any of the mentioned values
|
507
|
+
# are ignored.
|
508
|
+
# @param [Array] *values values to reject to form the new dataframe
|
509
|
+
# @return [Daru::DataFrame] Data Frame with only rows which don't
|
510
|
+
# contain the mentioned values
|
511
|
+
# @example
|
512
|
+
# df = Daru::DataFrame.new({
|
513
|
+
# a: [1, 2, 3, nil, Float::NAN, nil, 1, 7],
|
514
|
+
# b: [:a, :b, nil, Float::NAN, nil, 3, 5, 8],
|
515
|
+
# c: ['a', Float::NAN, 3, 4, 3, 5, nil, 7]
|
516
|
+
# }, index: 11..18)
|
517
|
+
# df.reject_values nil, Float::NAN
|
518
|
+
# # => #<Daru::DataFrame(2x3)>
|
519
|
+
# # a b c
|
520
|
+
# # 11 1 a a
|
521
|
+
# # 18 7 8 7
|
522
|
+
def reject_values(*values)
|
523
|
+
positions =
|
524
|
+
size.times.to_a - @data.flat_map { |vec| vec.positions(*values) }
|
525
|
+
# Handle the case when positions size is 1 and #row_at wouldn't return a df
|
526
|
+
if positions.size == 1
|
527
|
+
pos = positions.first
|
528
|
+
row_at(pos..pos)
|
529
|
+
else
|
530
|
+
row_at(*positions)
|
531
|
+
end
|
532
|
+
end
|
533
|
+
|
534
|
+
# Replace specified values with given value
|
535
|
+
# @param [Array] old_values values to replace with new value
|
536
|
+
# @param [object] new_value new value to replace with
|
537
|
+
# @return [Daru::DataFrame] Data Frame itself with old values replaced
|
538
|
+
# with new value
|
539
|
+
# @example
|
540
|
+
# df = Daru::DataFrame.new({
|
541
|
+
# a: [1, 2, 3, nil, Float::NAN, nil, 1, 7],
|
542
|
+
# b: [:a, :b, nil, Float::NAN, nil, 3, 5, 8],
|
543
|
+
# c: ['a', Float::NAN, 3, 4, 3, 5, nil, 7]
|
544
|
+
# }, index: 11..18)
|
545
|
+
# df
|
546
|
+
# # => #<Daru::DataFrame(8x3)>
|
547
|
+
# # a b c
|
548
|
+
# # 11 1 a a
|
549
|
+
# # 12 2 b NaN
|
550
|
+
# # 13 3 NaN 3
|
551
|
+
# # 14 NaN NaN 4
|
552
|
+
# # 15 NaN NaN 3
|
553
|
+
# # 16 NaN 3 5
|
554
|
+
# # 17 1 5 NaN
|
555
|
+
# # 18 7 8 7
|
556
|
+
def replace_values old_values, new_value
|
557
|
+
@data.each { |vec| vec.replace_values old_values, new_value }
|
558
|
+
self
|
559
|
+
end
|
475
560
|
|
476
561
|
# Iterate over each index of the DataFrame.
|
477
562
|
def each_index &block
|
478
563
|
return to_enum(:each_index) unless block_given?
|
479
564
|
|
480
565
|
@index.each(&block)
|
566
|
+
|
481
567
|
self
|
482
568
|
end
|
483
569
|
|
@@ -509,8 +595,8 @@ module Daru
|
|
509
595
|
def each_row
|
510
596
|
return to_enum(:each_row) unless block_given?
|
511
597
|
|
512
|
-
@index.
|
513
|
-
yield
|
598
|
+
@index.size.times do |pos|
|
599
|
+
yield row_at(pos)
|
514
600
|
end
|
515
601
|
|
516
602
|
self
|
@@ -540,13 +626,7 @@ module Daru
|
|
540
626
|
# * +axis+ - The axis to iterate over. Can be :vector (or :column)
|
541
627
|
# or :row. Default to :vector.
|
542
628
|
def each axis=:vector, &block
|
543
|
-
|
544
|
-
each_vector(&block)
|
545
|
-
elsif axis == :row
|
546
|
-
each_row(&block)
|
547
|
-
else
|
548
|
-
raise ArgumentError, "Unknown axis #{axis}"
|
549
|
-
end
|
629
|
+
dispatch_to_axis axis, :each, &block
|
550
630
|
end
|
551
631
|
|
552
632
|
# Iterate over a row or vector and return results in a Daru::Vector.
|
@@ -565,13 +645,7 @@ module Daru
|
|
565
645
|
# * +axis+ - The axis to iterate over. Can be :vector (or :column)
|
566
646
|
# or :row. Default to :vector.
|
567
647
|
def collect axis=:vector, &block
|
568
|
-
|
569
|
-
collect_vectors(&block)
|
570
|
-
elsif axis == :row
|
571
|
-
collect_rows(&block)
|
572
|
-
else
|
573
|
-
raise ArgumentError, "Unknown axis #{axis}"
|
574
|
-
end
|
648
|
+
dispatch_to_axis_pl axis, :collect, &block
|
575
649
|
end
|
576
650
|
|
577
651
|
# Map over each vector or row of the data frame according to
|
@@ -591,13 +665,7 @@ module Daru
|
|
591
665
|
# * +axis+ - The axis to map over. Can be :vector (or :column) or :row.
|
592
666
|
# Default to :vector.
|
593
667
|
def map axis=:vector, &block
|
594
|
-
|
595
|
-
map_vectors(&block)
|
596
|
-
elsif axis == :row
|
597
|
-
map_rows(&block)
|
598
|
-
else
|
599
|
-
raise ArgumentError, "Unknown axis #{axis}"
|
600
|
-
end
|
668
|
+
dispatch_to_axis_pl axis, :map, &block
|
601
669
|
end
|
602
670
|
|
603
671
|
# Destructive map. Modifies the DataFrame. Each run of the block
|
@@ -634,11 +702,7 @@ module Daru
|
|
634
702
|
# * +axis+ - The axis to map over. Can be :vector (or :column) or :row.
|
635
703
|
# Default to :vector.
|
636
704
|
def recode axis=:vector, &block
|
637
|
-
|
638
|
-
recode_vectors(&block)
|
639
|
-
elsif axis == :row
|
640
|
-
recode_rows(&block)
|
641
|
-
end
|
705
|
+
dispatch_to_axis_pl axis, :recode, &block
|
642
706
|
end
|
643
707
|
|
644
708
|
# Retain vectors or rows if the block returns a truthy value.
|
@@ -670,50 +734,34 @@ module Daru
|
|
670
734
|
# row[:a] + row[:d] < 100
|
671
735
|
# end
|
672
736
|
def filter axis=:vector, &block
|
673
|
-
|
674
|
-
filter_vectors(&block)
|
675
|
-
elsif axis == :row
|
676
|
-
filter_rows(&block)
|
677
|
-
end
|
737
|
+
dispatch_to_axis_pl axis, :filter, &block
|
678
738
|
end
|
679
739
|
|
680
740
|
def recode_vectors
|
681
741
|
block_given? or return to_enum(:recode_vectors)
|
682
742
|
|
683
|
-
|
684
|
-
|
685
|
-
|
686
|
-
|
687
|
-
raise TypeError, "Every iteration must return Daru::Vector not #{ret.class}"
|
688
|
-
df[*i] = ret
|
743
|
+
dup.tap do |df|
|
744
|
+
df.each_vector_with_index do |v, i|
|
745
|
+
df[*i] = should_be_vector!(yield(v))
|
746
|
+
end
|
689
747
|
end
|
690
|
-
|
691
|
-
df
|
692
748
|
end
|
693
749
|
|
694
750
|
def recode_rows
|
695
751
|
block_given? or return to_enum(:recode_rows)
|
696
752
|
|
697
|
-
|
698
|
-
|
699
|
-
|
700
|
-
|
701
|
-
df.row[i] = ret
|
753
|
+
dup.tap do |df|
|
754
|
+
df.each_row_with_index do |r, i|
|
755
|
+
df.row[i] = should_be_vector!(yield(r))
|
756
|
+
end
|
702
757
|
end
|
703
|
-
|
704
|
-
df
|
705
758
|
end
|
706
759
|
|
707
760
|
# Map each vector and return an Array.
|
708
|
-
def map_vectors
|
761
|
+
def map_vectors &block
|
709
762
|
return to_enum(:map_vectors) unless block_given?
|
710
763
|
|
711
|
-
|
712
|
-
@data.each do |vec|
|
713
|
-
arry << yield(vec)
|
714
|
-
end
|
715
|
-
|
716
|
-
arry
|
764
|
+
@data.map(&block)
|
717
765
|
end
|
718
766
|
|
719
767
|
# Destructive form of #map_vectors
|
@@ -721,56 +769,37 @@ module Daru
|
|
721
769
|
return to_enum(:map_vectors!) unless block_given?
|
722
770
|
|
723
771
|
vectors.dup.each do |n|
|
724
|
-
|
725
|
-
v.is_a?(Daru::Vector) or raise TypeError, "Must return a Daru::Vector not #{v.class}"
|
726
|
-
self[n] = v
|
772
|
+
self[n] = should_be_vector!(yield(self[n]))
|
727
773
|
end
|
728
774
|
|
729
775
|
self
|
730
776
|
end
|
731
777
|
|
732
778
|
# Map vectors along with the index.
|
733
|
-
def map_vectors_with_index
|
779
|
+
def map_vectors_with_index &block
|
734
780
|
return to_enum(:map_vectors_with_index) unless block_given?
|
735
781
|
|
736
|
-
|
737
|
-
each_vector_with_index do |vector, name|
|
738
|
-
dt << yield(vector, name)
|
739
|
-
end
|
740
|
-
|
741
|
-
dt
|
782
|
+
each_vector_with_index.map(&block)
|
742
783
|
end
|
743
784
|
|
744
785
|
# Map each row
|
745
|
-
def map_rows
|
786
|
+
def map_rows &block
|
746
787
|
return to_enum(:map_rows) unless block_given?
|
747
788
|
|
748
|
-
|
749
|
-
each_row do |row|
|
750
|
-
dt << yield(row)
|
751
|
-
end
|
752
|
-
|
753
|
-
dt
|
789
|
+
each_row.map(&block)
|
754
790
|
end
|
755
791
|
|
756
|
-
def map_rows_with_index
|
792
|
+
def map_rows_with_index &block
|
757
793
|
return to_enum(:map_rows_with_index) unless block_given?
|
758
794
|
|
759
|
-
|
760
|
-
each_row_with_index do |row, index|
|
761
|
-
dt << yield(row, index)
|
762
|
-
end
|
763
|
-
|
764
|
-
dt
|
795
|
+
each_row_with_index.map(&block)
|
765
796
|
end
|
766
797
|
|
767
798
|
def map_rows!
|
768
799
|
return to_enum(:map_rows!) unless block_given?
|
769
800
|
|
770
801
|
index.dup.each do |i|
|
771
|
-
|
772
|
-
r.is_a?(Daru::Vector) or raise TypeError, "Returned object must be Daru::Vector not #{r.class}"
|
773
|
-
row[i] = r
|
802
|
+
row[i] = should_be_vector!(yield(row[i]))
|
774
803
|
end
|
775
804
|
|
776
805
|
self
|
@@ -778,55 +807,38 @@ module Daru
|
|
778
807
|
|
779
808
|
# Retrieves a Daru::Vector, based on the result of calculation
|
780
809
|
# performed on each row.
|
781
|
-
def collect_rows
|
810
|
+
def collect_rows &block
|
782
811
|
return to_enum(:collect_rows) unless block_given?
|
783
812
|
|
784
|
-
|
785
|
-
each_row do |row|
|
786
|
-
data.push yield(row)
|
787
|
-
end
|
788
|
-
|
789
|
-
Daru::Vector.new(data, index: @index)
|
813
|
+
Daru::Vector.new(each_row.map(&block), index: @index)
|
790
814
|
end
|
791
815
|
|
792
|
-
def collect_row_with_index
|
816
|
+
def collect_row_with_index &block
|
793
817
|
return to_enum(:collect_row_with_index) unless block_given?
|
794
818
|
|
795
|
-
|
796
|
-
each_row_with_index do |row, i|
|
797
|
-
data.push yield(row, i)
|
798
|
-
end
|
799
|
-
|
800
|
-
Daru::Vector.new(data, index: @index)
|
819
|
+
Daru::Vector.new(each_row_with_index.map(&block), index: @index)
|
801
820
|
end
|
802
821
|
|
803
822
|
# Retrieves a Daru::Vector, based on the result of calculation
|
804
823
|
# performed on each vector.
|
805
|
-
def collect_vectors
|
824
|
+
def collect_vectors &block
|
806
825
|
return to_enum(:collect_vectors) unless block_given?
|
807
826
|
|
808
|
-
|
809
|
-
each_vector do |vec|
|
810
|
-
data.push yield(vec)
|
811
|
-
end
|
812
|
-
|
813
|
-
Daru::Vector.new(data, index: @vectors)
|
827
|
+
Daru::Vector.new(each_vector.map(&block), index: @vectors)
|
814
828
|
end
|
815
829
|
|
816
|
-
def collect_vector_with_index
|
830
|
+
def collect_vector_with_index &block
|
817
831
|
return to_enum(:collect_vector_with_index) unless block_given?
|
818
832
|
|
819
|
-
|
820
|
-
each_vector_with_index do |vec, i|
|
821
|
-
data.push yield(vec, i)
|
822
|
-
end
|
823
|
-
|
824
|
-
Daru::Vector.new(data, index: @vectors)
|
833
|
+
Daru::Vector.new(each_vector_with_index.map(&block), index: @vectors)
|
825
834
|
end
|
826
835
|
|
827
836
|
# Generate a matrix, based on vector names of the DataFrame.
|
828
837
|
#
|
829
838
|
# @return {::Matrix}
|
839
|
+
# :nocov:
|
840
|
+
# FIXME: Even not trying to cover this: I can't get, how it is expected
|
841
|
+
# to work.... -- zverok
|
830
842
|
def collect_matrix
|
831
843
|
return to_enum(:collect_matrix) unless block_given?
|
832
844
|
|
@@ -839,6 +851,7 @@ module Daru
|
|
839
851
|
|
840
852
|
Matrix.rows(rows)
|
841
853
|
end
|
854
|
+
# :nocov:
|
842
855
|
|
843
856
|
# Delete a vector
|
844
857
|
def delete_vector vector
|
@@ -876,43 +889,29 @@ module Daru
|
|
876
889
|
# @return {Daru::DataFrame}
|
877
890
|
def bootstrap(n=nil)
|
878
891
|
n ||= nrows
|
879
|
-
|
880
|
-
|
881
|
-
|
892
|
+
Daru::DataFrame.new({}, order: @vectors).tap do |df_boot|
|
893
|
+
n.times do
|
894
|
+
df_boot.add_row(row[rand(n)])
|
895
|
+
end
|
896
|
+
df_boot.update
|
882
897
|
end
|
883
|
-
ds_boot.update
|
884
|
-
ds_boot
|
885
898
|
end
|
886
899
|
|
887
900
|
def keep_row_if
|
888
|
-
|
889
|
-
|
890
|
-
|
891
|
-
keep_row = yield access_row(index)
|
892
|
-
|
893
|
-
deletion << index unless keep_row
|
894
|
-
end
|
895
|
-
deletion.each { |idx|
|
896
|
-
delete_row idx
|
897
|
-
}
|
901
|
+
@index
|
902
|
+
.reject { |idx| yield access_row(idx) }
|
903
|
+
.each { |idx| delete_row idx }
|
898
904
|
end
|
899
905
|
|
900
906
|
def keep_vector_if
|
901
907
|
@vectors.each do |vector|
|
902
|
-
|
903
|
-
|
904
|
-
delete_vector vector unless keep_vector
|
908
|
+
delete_vector(vector) unless yield(@data[@vectors[vector]], vector)
|
905
909
|
end
|
906
910
|
end
|
907
911
|
|
908
912
|
# creates a new vector with the data of a given field which the block returns true
|
909
|
-
def filter_vector vec
|
910
|
-
|
911
|
-
each_row do |row|
|
912
|
-
d.push(row[vec]) if yield row
|
913
|
-
end
|
914
|
-
|
915
|
-
Daru::Vector.new(d, metadata: self[vec].metadata.dup)
|
913
|
+
def filter_vector vec, &block
|
914
|
+
Daru::Vector.new each_row.select(&block).map { |row| row[vec] }
|
916
915
|
end
|
917
916
|
|
918
917
|
# Iterates over each row and retains it in a new DataFrame if the block returns
|
@@ -930,38 +929,24 @@ module Daru
|
|
930
929
|
def filter_vectors &block
|
931
930
|
return to_enum(:filter_vectors) unless block_given?
|
932
931
|
|
933
|
-
df
|
934
|
-
df.keep_vector_if(&block)
|
935
|
-
|
936
|
-
df
|
932
|
+
dup.tap { |df| df.keep_vector_if(&block) }
|
937
933
|
end
|
938
934
|
|
939
935
|
# Test each row with one or more tests. Each test is a Proc with the form
|
940
936
|
# *Proc.new {|row| row[:age] > 0}*
|
941
937
|
#
|
942
938
|
# The function returns an array with all errors.
|
939
|
+
#
|
940
|
+
# FIXME: description here is too sparse. As far as I can get,
|
941
|
+
# it should tell something about that each test is [descr, fields, block],
|
942
|
+
# and that first value may be column name to output. - zverok, 2016-05-18
|
943
943
|
def verify(*tests)
|
944
|
-
|
945
|
-
|
946
|
-
|
947
|
-
|
948
|
-
|
949
|
-
end
|
950
|
-
|
951
|
-
vr = []
|
952
|
-
i = 0
|
953
|
-
each(:row) do |row|
|
954
|
-
i += 1
|
955
|
-
tests.each do |test|
|
956
|
-
next if test[2].call(row)
|
957
|
-
values = ''
|
958
|
-
unless test[1].empty?
|
959
|
-
values = ' (' + test[1].collect { |k| "#{k}=#{row[k]}" }.join(', ') + ')'
|
960
|
-
end
|
961
|
-
vr.push("#{i} [#{row[id]}]: #{test[0]}#{values}")
|
962
|
-
end
|
963
|
-
end
|
964
|
-
vr
|
944
|
+
id = tests.first.is_a?(Symbol) ? tests.shift : @vectors.first
|
945
|
+
|
946
|
+
each_row_with_index.map do |row, i|
|
947
|
+
tests.reject { |*_, block| block.call(row) }
|
948
|
+
.map { |test| verify_error_message row, test, id, i }
|
949
|
+
end.flatten
|
965
950
|
end
|
966
951
|
|
967
952
|
# DSL for yielding each row and returning a Daru::Vector based on the
|
@@ -984,10 +969,7 @@ module Daru
|
|
984
969
|
# # 5 666
|
985
970
|
# # 6 777
|
986
971
|
def vector_by_calculation &block
|
987
|
-
a =
|
988
|
-
each_row do |r|
|
989
|
-
a.push r.instance_eval(&block)
|
990
|
-
end
|
972
|
+
a = each_row.map { |r| r.instance_eval(&block) }
|
991
973
|
|
992
974
|
Daru::Vector.new a, index: @index
|
993
975
|
end
|
@@ -1016,10 +998,8 @@ module Daru
|
|
1016
998
|
# * +missing_values+ - An Array of the values that should be
|
1017
999
|
# treated as 'missing'. The default missing value is *nil*.
|
1018
1000
|
def missing_values_rows missing_values=[nil]
|
1019
|
-
number_of_missing =
|
1020
|
-
|
1021
|
-
row.missing_values = missing_values
|
1022
|
-
number_of_missing << row.missing_positions.size
|
1001
|
+
number_of_missing = each_row.map do |row|
|
1002
|
+
row.indexes(*missing_values).size
|
1023
1003
|
end
|
1024
1004
|
|
1025
1005
|
Daru::Vector.new number_of_missing, index: @index, name: "#{@name}_missing_rows"
|
@@ -1029,67 +1009,77 @@ module Daru
|
|
1029
1009
|
alias :vector_missing_values :missing_values_rows
|
1030
1010
|
|
1031
1011
|
def has_missing_data?
|
1032
|
-
!!@data.any?(
|
1012
|
+
!!@data.any? { |vec| vec.include_values?(*Daru::MISSING_VALUES) }
|
1033
1013
|
end
|
1034
|
-
|
1035
1014
|
alias :flawed? :has_missing_data?
|
1015
|
+
deprecate :has_missing_data?, :include_values?, 2016, 10
|
1016
|
+
deprecate :flawed?, :include_values?, 2016, 10
|
1017
|
+
|
1018
|
+
# Check if any of given values occur in the data frame
|
1019
|
+
# @param [Array] *values values to check for
|
1020
|
+
# @return [true, false] true if any of the given values occur in the
|
1021
|
+
# dataframe, false otherwise
|
1022
|
+
# @example
|
1023
|
+
# df = Daru::DataFrame.new({
|
1024
|
+
# a: [1, 2, 3, nil, Float::NAN, nil, 1, 7],
|
1025
|
+
# b: [:a, :b, nil, Float::NAN, nil, 3, 5, 8],
|
1026
|
+
# c: ['a', Float::NAN, 3, 4, 3, 5, nil, 7]
|
1027
|
+
# }, index: 11..18)
|
1028
|
+
# df.include_values? nil
|
1029
|
+
# # => true
|
1030
|
+
def include_values?(*values)
|
1031
|
+
@data.any? { |vec| vec.include_values?(*values) }
|
1032
|
+
end
|
1036
1033
|
|
1037
1034
|
# Return a nested hash using vector names as keys and an array constructed of
|
1038
1035
|
# hashes with other values. If block provided, is used to provide the
|
1039
1036
|
# values, with parameters +row+ of dataset, +current+ last hash on
|
1040
1037
|
# hierarchy and +name+ of the key to include
|
1041
|
-
def nest *tree_keys, &
|
1038
|
+
def nest *tree_keys, &_block
|
1042
1039
|
tree_keys = tree_keys[0] if tree_keys[0].is_a? Array
|
1043
|
-
out = {}
|
1044
1040
|
|
1045
|
-
each_row do |row|
|
1046
|
-
current = out
|
1041
|
+
each_row.each_with_object({}) do |row, current|
|
1047
1042
|
# Create tree
|
1048
|
-
|
1049
|
-
|
1050
|
-
|
1051
|
-
|
1052
|
-
|
1053
|
-
|
1054
|
-
|
1043
|
+
*keys, last = tree_keys
|
1044
|
+
current = keys.inject(current) { |c, f| c[row[f]] ||= {} }
|
1045
|
+
name = row[last]
|
1046
|
+
|
1047
|
+
if block_given?
|
1048
|
+
current[name] = yield(row, current, name)
|
1049
|
+
else
|
1055
1050
|
current[name] ||= []
|
1056
1051
|
current[name].push(row.to_h.delete_if { |key,_value| tree_keys.include? key })
|
1057
|
-
else
|
1058
|
-
current[name] = yield(row, current, name)
|
1059
1052
|
end
|
1060
1053
|
end
|
1061
|
-
|
1062
|
-
out
|
1063
1054
|
end
|
1064
1055
|
|
1065
1056
|
def vector_count_characters vecs=nil
|
1066
1057
|
vecs ||= @vectors.to_a
|
1067
1058
|
|
1068
1059
|
collect_rows do |row|
|
1069
|
-
vecs.
|
1070
|
-
memo + (row[vec].nil? ? 0 : row[vec].to_s.size)
|
1071
|
-
end
|
1060
|
+
vecs.map { |v| row[v].to_s.size }.inject(:+)
|
1072
1061
|
end
|
1073
1062
|
end
|
1074
1063
|
|
1075
1064
|
def add_vectors_by_split(name,join='-',sep=Daru::SPLIT_TOKEN)
|
1076
|
-
|
1077
|
-
|
1065
|
+
self[name]
|
1066
|
+
.split_by_separator(sep)
|
1067
|
+
.each { |k,v| self["#{name}#{join}#{k}".to_sym] = v }
|
1078
1068
|
end
|
1079
1069
|
|
1080
1070
|
# Return the number of rows and columns of the DataFrame in an Array.
|
1081
1071
|
def shape
|
1082
|
-
[
|
1072
|
+
[nrows, ncols]
|
1083
1073
|
end
|
1084
1074
|
|
1085
1075
|
# The number of rows
|
1086
1076
|
def nrows
|
1087
|
-
|
1077
|
+
@index.size
|
1088
1078
|
end
|
1089
1079
|
|
1090
1080
|
# The number of vectors
|
1091
1081
|
def ncols
|
1092
|
-
|
1082
|
+
@vectors.size
|
1093
1083
|
end
|
1094
1084
|
|
1095
1085
|
# Check if a vector is present
|
@@ -1132,10 +1122,7 @@ module Daru
|
|
1132
1122
|
if axis == :vector || axis == :column
|
1133
1123
|
@data.all?(&block)
|
1134
1124
|
elsif axis == :row
|
1135
|
-
each_row
|
1136
|
-
return false unless yield(row)
|
1137
|
-
end
|
1138
|
-
return true
|
1125
|
+
each_row.all?(&block)
|
1139
1126
|
else
|
1140
1127
|
raise ArgumentError, "Unidentified axis #{axis}"
|
1141
1128
|
end
|
@@ -1145,7 +1132,7 @@ module Daru
|
|
1145
1132
|
#
|
1146
1133
|
# @param [Fixnum] quantity (10) The number of elements to display from the top.
|
1147
1134
|
def head quantity=10
|
1148
|
-
|
1135
|
+
row.at 0..(quantity-1)
|
1149
1136
|
end
|
1150
1137
|
|
1151
1138
|
alias :first :head
|
@@ -1154,22 +1141,19 @@ module Daru
|
|
1154
1141
|
#
|
1155
1142
|
# @param [Fixnum] quantity (10) The number of elements to display from the bottom.
|
1156
1143
|
def tail quantity=10
|
1157
|
-
|
1144
|
+
start = [-quantity, -size].max
|
1145
|
+
row.at start..-1
|
1158
1146
|
end
|
1159
1147
|
|
1160
1148
|
alias :last :tail
|
1161
1149
|
|
1162
1150
|
# Returns a vector with sum of all vectors specified in the argument.
|
1163
|
-
#
|
1151
|
+
# If vecs parameter is empty, sum all numeric vector.
|
1164
1152
|
def vector_sum vecs=nil
|
1165
1153
|
vecs ||= numeric_vectors
|
1166
1154
|
sum = Daru::Vector.new [0]*@size, index: @index, name: @name, dtype: @dtype
|
1167
1155
|
|
1168
|
-
vecs.
|
1169
|
-
sum += self[n]
|
1170
|
-
end
|
1171
|
-
|
1172
|
-
sum
|
1156
|
+
vecs.inject(sum) { |memo, n| memo + self[n] }
|
1173
1157
|
end
|
1174
1158
|
|
1175
1159
|
# Calculate mean of the rows of the dataframe.
|
@@ -1179,13 +1163,13 @@ module Daru
|
|
1179
1163
|
# * +max_missing+ - The maximum number of elements in the row that can be
|
1180
1164
|
# zero for the mean calculation to happen. Default to 0.
|
1181
1165
|
def vector_mean max_missing=0
|
1166
|
+
# FIXME: in vector_sum we preserve created vector dtype, but
|
1167
|
+
# here we are not. Is this by design or ...? - zverok, 2016-05-18
|
1182
1168
|
mean_vec = Daru::Vector.new [0]*@size, index: @index, name: "mean_#{@name}"
|
1183
1169
|
|
1184
|
-
each_row_with_index do |row, i|
|
1185
|
-
|
1170
|
+
each_row_with_index.each_with_object(mean_vec) do |(row, i), memo|
|
1171
|
+
memo[i] = row.indexes(*Daru::MISSING_VALUES).size > max_missing ? nil : row.mean
|
1186
1172
|
end
|
1187
|
-
|
1188
|
-
mean_vec
|
1189
1173
|
end
|
1190
1174
|
|
1191
1175
|
# Group elements by vector to perform operations on them. Returns a
|
@@ -1214,6 +1198,8 @@ module Daru
|
|
1214
1198
|
# # ["foo", "two", 3]=>[2, 4]}
|
1215
1199
|
def group_by *vectors
|
1216
1200
|
vectors.flatten!
|
1201
|
+
# FIXME: wouldn't it better to do vectors - @vectors here and
|
1202
|
+
# raise one error with all non-existent vector names?.. - zverok, 2016-05-18
|
1217
1203
|
vectors.each { |v|
|
1218
1204
|
raise(ArgumentError, "Vector #{v} does not exist") unless has_vector?(v)
|
1219
1205
|
}
|
@@ -1226,28 +1212,22 @@ module Daru
|
|
1226
1212
|
"subclasses, not #{new_index.class}" unless new_vectors.is_a?(Daru::Index)
|
1227
1213
|
|
1228
1214
|
cl = Daru::DataFrame.new({}, order: new_vectors, index: @index, name: @name)
|
1229
|
-
new_vectors.
|
1230
|
-
|
1215
|
+
new_vectors.each_with_object(cl) do |vec, memo|
|
1216
|
+
memo[vec] = @vectors.include?(vec) ? self[vec] : [nil]*nrows
|
1231
1217
|
end
|
1218
|
+
end
|
1232
1219
|
|
1233
|
-
|
1220
|
+
def get_vector_anyways(v)
|
1221
|
+
@vectors.include?(v) ? self[v].to_a : [nil] * size
|
1234
1222
|
end
|
1235
1223
|
|
1236
1224
|
# Concatenate another DataFrame along corresponding columns.
|
1237
1225
|
# If columns do not exist in both dataframes, they are filled with nils
|
1238
1226
|
def concat other_df
|
1239
|
-
vectors = @vectors.to_a
|
1240
|
-
data = []
|
1227
|
+
vectors = (@vectors.to_a + other_df.vectors.to_a).uniq
|
1241
1228
|
|
1242
|
-
vectors.
|
1243
|
-
|
1244
|
-
data << self[v].dup.to_a.concat(other_vec)
|
1245
|
-
end
|
1246
|
-
|
1247
|
-
other_df.vectors.each do |v|
|
1248
|
-
next if vectors.include?(v)
|
1249
|
-
vectors << v
|
1250
|
-
data << ([nil] * size).concat(other_df[v].to_a)
|
1229
|
+
data = vectors.map do |v|
|
1230
|
+
get_vector_anyways(v).dup.concat(other_df.get_vector_anyways(v))
|
1251
1231
|
end
|
1252
1232
|
|
1253
1233
|
Daru::DataFrame.new(data, order: vectors)
|
@@ -1291,11 +1271,9 @@ module Daru
|
|
1291
1271
|
"subclasses, not #{new_index.class}" unless new_index.is_a?(Daru::Index)
|
1292
1272
|
|
1293
1273
|
cl = Daru::DataFrame.new({}, order: @vectors, index: new_index, name: @name)
|
1294
|
-
new_index.
|
1295
|
-
|
1274
|
+
new_index.each_with_object(cl) do |idx, memo|
|
1275
|
+
memo.row[idx] = @index.include?(idx) ? row[idx] : [nil]*ncols
|
1296
1276
|
end
|
1297
|
-
|
1298
|
-
cl
|
1299
1277
|
end
|
1300
1278
|
|
1301
1279
|
# Reassign index with a new index of type Daru::Index or any of its subclasses.
|
@@ -1310,8 +1288,8 @@ module Daru
|
|
1310
1288
|
# df.index.to_a #=> ['a','b','c','d']
|
1311
1289
|
# df.row['a'].to_a #=> [1,11]
|
1312
1290
|
def index= idx
|
1313
|
-
@
|
1314
|
-
@index =
|
1291
|
+
@index = Index.coerce idx
|
1292
|
+
@data.each { |vec| vec.index = @index }
|
1315
1293
|
|
1316
1294
|
self
|
1317
1295
|
end
|
@@ -1361,21 +1339,14 @@ module Daru
|
|
1361
1339
|
# Return the indexes of all the numeric vectors. Will include vectors with nils
|
1362
1340
|
# alongwith numbers.
|
1363
1341
|
def numeric_vectors
|
1364
|
-
|
1365
|
-
|
1366
|
-
|
1367
|
-
|
1368
|
-
end
|
1369
|
-
numerics
|
1342
|
+
# FIXME: Why _with_index ?..
|
1343
|
+
each_vector_with_index
|
1344
|
+
.select { |vec, _i| vec.numeric? }
|
1345
|
+
.map(&:last)
|
1370
1346
|
end
|
1371
1347
|
|
1372
1348
|
def numeric_vector_names
|
1373
|
-
|
1374
|
-
|
1375
|
-
@vectors.each do |v|
|
1376
|
-
numerics << v if self[v].type == :numeric
|
1377
|
-
end
|
1378
|
-
numerics
|
1349
|
+
@vectors.select { |v| self[v].numeric? }
|
1379
1350
|
end
|
1380
1351
|
|
1381
1352
|
# Return a DataFrame of only the numerical Vectors. If clone: false
|
@@ -1383,12 +1354,9 @@ module Daru
|
|
1383
1354
|
# returned. Defaults to clone: true.
|
1384
1355
|
def only_numerics opts={}
|
1385
1356
|
cln = opts[:clone] == false ? false : true
|
1386
|
-
|
1387
|
-
arry = nv.each_with_object([]) do |v, arr|
|
1388
|
-
arr << self[v]
|
1389
|
-
end
|
1357
|
+
arry = numeric_vectors.map { |v| self[v] }
|
1390
1358
|
|
1391
|
-
order = Index.new(
|
1359
|
+
order = Index.new(numeric_vectors)
|
1392
1360
|
Daru::DataFrame.new(arry, clone: cln, order: order, index: @index)
|
1393
1361
|
end
|
1394
1362
|
|
@@ -1492,39 +1460,24 @@ module Daru
|
|
1492
1460
|
|
1493
1461
|
def sort! vector_order, opts={}
|
1494
1462
|
raise ArgumentError, 'Required atleast one vector name' if vector_order.empty?
|
1495
|
-
opts = {
|
1496
|
-
ascending: true,
|
1497
|
-
handle_nils: false,
|
1498
|
-
by: {}
|
1499
|
-
}.merge(opts)
|
1500
1463
|
|
1501
|
-
|
1502
|
-
|
1503
|
-
|
1464
|
+
# To enable sorting with categorical data,
|
1465
|
+
# map categories to integers preserving their order
|
1466
|
+
old = convert_categorical_vectors vector_order
|
1467
|
+
block = sort_prepare_block vector_order, opts
|
1504
1468
|
|
1505
|
-
|
1506
|
-
|
1507
|
-
left = build_array_from_blocks vector_order, opts, blocks, r1, r2
|
1508
|
-
right = build_array_from_blocks vector_order, opts, blocks, r2, r1
|
1469
|
+
order = @index.size.times.sort(&block)
|
1470
|
+
new_index = @index.reorder order
|
1509
1471
|
|
1510
|
-
|
1511
|
-
|
1512
|
-
right << r2
|
1513
|
-
left <=> right
|
1514
|
-
end
|
1472
|
+
# To reverse map mapping of categorical data to integers
|
1473
|
+
restore_categorical_vectors old
|
1515
1474
|
|
1516
|
-
|
1517
|
-
|
1518
|
-
old_index = @index.to_a
|
1519
|
-
self.index = Daru::Index.new(idx.map { |i| old_index[i] })
|
1520
|
-
|
1521
|
-
vectors.each do |v|
|
1522
|
-
@data[@vectors[v]] = Daru::Vector.new(
|
1523
|
-
idx.map { |i| @data[@vectors[v]].data[i] },
|
1524
|
-
name: self[v].name, metadata: self[v].metadata.dup, index: index
|
1525
|
-
)
|
1475
|
+
@data.each do |vector|
|
1476
|
+
vector.reorder! order
|
1526
1477
|
end
|
1527
1478
|
|
1479
|
+
self.index = new_index
|
1480
|
+
|
1528
1481
|
self
|
1529
1482
|
end
|
1530
1483
|
|
@@ -1568,90 +1521,41 @@ module Daru
|
|
1568
1521
|
# # [:bar] 18 26
|
1569
1522
|
# # [:foo] 10 12
|
1570
1523
|
def pivot_table opts={}
|
1571
|
-
raise ArgumentError,
|
1572
|
-
'Specify grouping index' if !opts[:index] || opts[:index].empty?
|
1573
|
-
|
1574
|
-
index = opts[:index]
|
1575
|
-
vectors = opts[:vectors] || []
|
1576
|
-
aggregate_function = opts[:agg] || :mean
|
1577
|
-
values =
|
1578
|
-
if opts[:values].is_a?(Symbol)
|
1579
|
-
[opts[:values]]
|
1580
|
-
elsif opts[:values].is_a?(Array)
|
1581
|
-
opts[:values]
|
1582
|
-
else # nil
|
1583
|
-
(@vectors.to_a - (index | vectors)) & numeric_vector_names
|
1584
|
-
end
|
1524
|
+
raise ArgumentError, 'Specify grouping index' if opts[:index].to_a.empty?
|
1585
1525
|
|
1526
|
+
index = opts[:index]
|
1527
|
+
vectors = opts[:vectors] || []
|
1528
|
+
aggregate_function = opts[:agg] || :mean
|
1529
|
+
values = prepare_pivot_values index, vectors, opts
|
1586
1530
|
raise IndexError, 'No numeric vectors to aggregate' if values.empty?
|
1587
1531
|
|
1588
1532
|
grouped = group_by(index)
|
1533
|
+
return grouped.send(aggregate_function) if vectors.empty?
|
1589
1534
|
|
1590
|
-
|
1591
|
-
grouped.send(aggregate_function)
|
1592
|
-
else
|
1593
|
-
super_hash = {}
|
1594
|
-
values.each do |value|
|
1595
|
-
grouped.groups.each do |group_name, row_numbers|
|
1596
|
-
super_hash[group_name] ||= {}
|
1535
|
+
super_hash = make_pivot_hash grouped, vectors, values, aggregate_function
|
1597
1536
|
|
1598
|
-
|
1599
|
-
arry = []
|
1600
|
-
arry << value
|
1601
|
-
vectors.each { |v| arry << self[v][num] }
|
1602
|
-
sub_hash = super_hash[group_name]
|
1603
|
-
sub_hash[arry] ||= []
|
1604
|
-
|
1605
|
-
sub_hash[arry] << self[value][num]
|
1606
|
-
end
|
1607
|
-
end
|
1608
|
-
end
|
1609
|
-
|
1610
|
-
super_hash.each_value do |sub_hash|
|
1611
|
-
sub_hash.each do |group_name, aggregates|
|
1612
|
-
sub_hash[group_name] = Daru::Vector.new(aggregates).send(aggregate_function)
|
1613
|
-
end
|
1614
|
-
end
|
1615
|
-
|
1616
|
-
df_index = Daru::MultiIndex.from_tuples super_hash.keys
|
1617
|
-
|
1618
|
-
vector_indexes = []
|
1619
|
-
super_hash.each_value do |sub_hash|
|
1620
|
-
vector_indexes.concat sub_hash.keys
|
1621
|
-
end
|
1622
|
-
|
1623
|
-
df_vectors = Daru::MultiIndex.from_tuples vector_indexes.uniq
|
1624
|
-
pivoted_dataframe = Daru::DataFrame.new({}, index: df_index, order: df_vectors)
|
1625
|
-
|
1626
|
-
super_hash.each do |row_index, sub_h|
|
1627
|
-
sub_h.each do |vector_index, val|
|
1628
|
-
# pivoted_dataframe[symbolize(vector_index)][symbolize(row_index)] = val
|
1629
|
-
pivoted_dataframe[vector_index][row_index] = val
|
1630
|
-
end
|
1631
|
-
end
|
1632
|
-
return pivoted_dataframe
|
1633
|
-
end
|
1537
|
+
pivot_dataframe super_hash
|
1634
1538
|
end
|
1635
1539
|
|
1636
1540
|
# Merge vectors from two DataFrames. In case of name collision,
|
1637
1541
|
# the vectors names are changed to x_1, x_2 ....
|
1638
1542
|
#
|
1639
1543
|
# @return {Daru::DataFrame}
|
1640
|
-
def merge other_df
|
1641
|
-
raise
|
1544
|
+
def merge other_df # rubocop:disable Metrics/AbcSize
|
1545
|
+
raise ArgumentError,
|
1546
|
+
"Number of rows must be equal in this: #{nrows} and other: #{other_df.nrows}" \
|
1547
|
+
unless nrows == other_df.nrows
|
1642
1548
|
|
1643
1549
|
new_fields = (@vectors.to_a + other_df.vectors.to_a)
|
1644
|
-
|
1645
|
-
.map(&:to_sym)
|
1646
|
-
df_new = DataFrame.new({}, order: new_fields)
|
1550
|
+
new_fields = ArrayHelper.recode_repeated(new_fields)
|
1647
1551
|
|
1648
|
-
(
|
1649
|
-
|
1650
|
-
|
1651
|
-
|
1552
|
+
DataFrame.new({}, order: new_fields).tap do |df_new|
|
1553
|
+
(0...nrows).each do |i|
|
1554
|
+
df_new.add_row row[i].to_a + other_df.row[i].to_a
|
1555
|
+
end
|
1652
1556
|
|
1653
|
-
|
1654
|
-
|
1557
|
+
df_new.update
|
1558
|
+
end
|
1655
1559
|
end
|
1656
1560
|
|
1657
1561
|
# Join 2 DataFrames with SQL style joins. Currently supports inner, left
|
@@ -1701,7 +1605,11 @@ module Daru
|
|
1701
1605
|
# ['2','fred','green',15,'orange',30,'white',20],
|
1702
1606
|
# ['3','alfred',nil,nil,nil,nil,nil,nil]
|
1703
1607
|
# ]
|
1704
|
-
# ds=Daru::DataFrame.rows(cases, order:
|
1608
|
+
# ds=Daru::DataFrame.rows(cases, order:
|
1609
|
+
# [:id, :name,
|
1610
|
+
# :car_color1, :car_value1,
|
1611
|
+
# :car_color2, :car_value2,
|
1612
|
+
# :car_color3, :car_value3])
|
1705
1613
|
# ds.one_to_many([:id],'car_%v%n').to_matrix
|
1706
1614
|
# #=> Matrix[
|
1707
1615
|
# # ["red", "1", 10],
|
@@ -1711,62 +1619,29 @@ module Daru
|
|
1711
1619
|
# # ["white", "2", 20]
|
1712
1620
|
# # ]
|
1713
1621
|
def one_to_many(parent_fields, pattern)
|
1714
|
-
|
1715
|
-
ds_vars = parent_fields.dup
|
1716
|
-
vars = []
|
1717
|
-
max_n = 0
|
1718
|
-
h = parent_fields.each_with_object({}) { |v, a|
|
1719
|
-
a[v] = Daru::Vector.new([])
|
1720
|
-
}
|
1721
|
-
# Adding _row_id
|
1722
|
-
h['_col_id'] = Daru::Vector.new([])
|
1723
|
-
ds_vars.push('_col_id')
|
1724
|
-
|
1725
|
-
@vectors.each do |f|
|
1726
|
-
next unless f =~ re
|
1727
|
-
unless vars.include? $1
|
1728
|
-
vars.push($1)
|
1729
|
-
h[$1] = Daru::Vector.new([])
|
1730
|
-
end
|
1622
|
+
vars, numbers = one_to_many_components(pattern)
|
1731
1623
|
|
1732
|
-
|
1733
|
-
|
1734
|
-
|
1735
|
-
|
1736
|
-
|
1737
|
-
|
1738
|
-
parent_fields.each do |f|
|
1739
|
-
row_out[f] = row[f]
|
1740
|
-
end
|
1741
|
-
|
1742
|
-
max_n.times do |n1|
|
1743
|
-
n = n1+1
|
1744
|
-
any_data = false
|
1745
|
-
vars.each do |v|
|
1746
|
-
data = row[pattern.gsub('%v',v.to_s).gsub('%n',n.to_s)]
|
1747
|
-
row_out[v] = data
|
1748
|
-
any_data = true unless data.nil?
|
1749
|
-
end
|
1624
|
+
DataFrame.new([], order: [*parent_fields, '_col_id', *vars]).tap do |ds|
|
1625
|
+
each_row do |row|
|
1626
|
+
verbatim = parent_fields.map { |f| [f, row[f]] }.to_h
|
1627
|
+
numbers.each do |n|
|
1628
|
+
generated = one_to_many_row row, n, vars, pattern
|
1629
|
+
next if generated.values.all?(&:nil?)
|
1750
1630
|
|
1751
|
-
|
1752
|
-
row_out['_col_id'] = n
|
1753
|
-
ds.add_row(row_out)
|
1631
|
+
ds.add_row(verbatim.merge(generated).merge('_col_id' => n))
|
1754
1632
|
end
|
1755
1633
|
end
|
1634
|
+
ds.update
|
1756
1635
|
end
|
1757
|
-
ds.update
|
1758
|
-
ds
|
1759
1636
|
end
|
1760
1637
|
|
1761
|
-
def add_vectors_by_split_recode(
|
1762
|
-
|
1763
|
-
|
1764
|
-
|
1765
|
-
|
1766
|
-
|
1767
|
-
|
1768
|
-
i += 1
|
1769
|
-
}
|
1638
|
+
def add_vectors_by_split_recode(nm, join='-', sep=Daru::SPLIT_TOKEN)
|
1639
|
+
self[nm]
|
1640
|
+
.split_by_separator(sep)
|
1641
|
+
.each_with_index do |(k, v), i|
|
1642
|
+
v.rename "#{nm}:#{k}"
|
1643
|
+
self["#{nm}#{join}#{i + 1}".to_sym] = v
|
1644
|
+
end
|
1770
1645
|
end
|
1771
1646
|
|
1772
1647
|
# Create a sql, basen on a given Dataset
|
@@ -1795,40 +1670,37 @@ module Daru
|
|
1795
1670
|
sql + fields.join(",\n ")+") CHARACTER SET=#{charset};"
|
1796
1671
|
end
|
1797
1672
|
|
1673
|
+
# Returns the dataframe. This can be convenient when the user does not
|
1674
|
+
# know whether the object is a vector or a dataframe.
|
1675
|
+
# @return [self] the dataframe
|
1676
|
+
def to_df
|
1677
|
+
self
|
1678
|
+
end
|
1679
|
+
|
1798
1680
|
# Convert all numeric vectors to GSL::Matrix
|
1799
1681
|
def to_gsl
|
1800
|
-
numerics_as_arrays = []
|
1801
|
-
numeric_vectors.each do |n|
|
1802
|
-
numerics_as_arrays << self[n].to_a
|
1803
|
-
end
|
1682
|
+
numerics_as_arrays = numeric_vectors.map { |n| self[n].to_a }
|
1804
1683
|
|
1805
1684
|
GSL::Matrix.alloc(*numerics_as_arrays.transpose)
|
1806
1685
|
end
|
1807
1686
|
|
1808
1687
|
# Convert all vectors of type *:numeric* into a Matrix.
|
1809
1688
|
def to_matrix
|
1810
|
-
|
1811
|
-
each_vector do |vector|
|
1812
|
-
numerics_as_arrays << vector.to_a if vector.type == :numeric
|
1813
|
-
end
|
1814
|
-
|
1815
|
-
Matrix.columns numerics_as_arrays
|
1689
|
+
Matrix.columns each_vector.select(&:numeric?).map(&:to_a)
|
1816
1690
|
end
|
1817
1691
|
|
1818
1692
|
# Return a Nyaplot::DataFrame from the data of this DataFrame.
|
1693
|
+
# :nocov:
|
1819
1694
|
def to_nyaplotdf
|
1820
1695
|
Nyaplot::DataFrame.new(to_a[0])
|
1821
1696
|
end
|
1697
|
+
# :nocov:
|
1822
1698
|
|
1823
1699
|
# Convert all vectors of type *:numeric* and not containing nils into an NMatrix.
|
1824
1700
|
def to_nmatrix
|
1825
|
-
|
1826
|
-
|
1827
|
-
|
1828
|
-
vector.missing_positions.empty?
|
1829
|
-
end
|
1830
|
-
|
1831
|
-
numerics_as_arrays.transpose.to_nm
|
1701
|
+
each_vector.select do |vector|
|
1702
|
+
vector.numeric? && !vector.include_values?(*Daru::MISSING_VALUES)
|
1703
|
+
end.map(&:to_a).transpose.to_nm
|
1832
1704
|
end
|
1833
1705
|
|
1834
1706
|
# Converts the DataFrame into an array of hashes where key is vector name
|
@@ -1837,13 +1709,7 @@ module Daru
|
|
1837
1709
|
# of the dataframe. Each element in the index array corresponds to its row
|
1838
1710
|
# in the array of hashes, which has the same index.
|
1839
1711
|
def to_a
|
1840
|
-
|
1841
|
-
each_row do |row|
|
1842
|
-
arry[0] << row.to_h
|
1843
|
-
end
|
1844
|
-
arry[1] = @index.to_a
|
1845
|
-
|
1846
|
-
arry
|
1712
|
+
[each_row.map(&:to_h), @index.to_a]
|
1847
1713
|
end
|
1848
1714
|
|
1849
1715
|
# Convert to json. If no_index is false then the index will NOT be included
|
@@ -1859,54 +1725,19 @@ module Daru
|
|
1859
1725
|
# Converts DataFrame to a hash (explicit) with keys as vector names and values as
|
1860
1726
|
# the corresponding vectors.
|
1861
1727
|
def to_h
|
1862
|
-
|
1863
|
-
|
1864
|
-
|
1865
|
-
end
|
1866
|
-
|
1867
|
-
hsh
|
1728
|
+
@vectors
|
1729
|
+
.each_with_index
|
1730
|
+
.map { |vec_name, idx| [vec_name, @data[idx]] }.to_h
|
1868
1731
|
end
|
1869
1732
|
|
1870
1733
|
# Convert to html for IRuby.
|
1871
1734
|
def to_html threshold=30
|
1872
|
-
|
1873
|
-
|
1874
|
-
|
1875
|
-
|
1876
|
-
|
1877
|
-
|
1878
|
-
html +='<tr><th></th>'
|
1879
|
-
@vectors.each { |vector| html += '<th>' + vector.to_s + '</th>' }
|
1880
|
-
html += '</tr>'
|
1881
|
-
|
1882
|
-
@index.each_with_index do |index, num|
|
1883
|
-
html += '<tr>'
|
1884
|
-
html += '<td>' + index.to_s + '</td>'
|
1885
|
-
|
1886
|
-
row[index].each do |element|
|
1887
|
-
html += '<td>' + element.to_s + '</td>'
|
1888
|
-
end
|
1889
|
-
|
1890
|
-
html += '</tr>'
|
1891
|
-
next if num <= threshold
|
1892
|
-
|
1893
|
-
html += '<tr>'
|
1894
|
-
(@vectors.size + 1).times { html += '<td>...</td>' }
|
1895
|
-
html += '</tr>'
|
1896
|
-
|
1897
|
-
last_index = @index.to_a.last
|
1898
|
-
last_row = row[last_index]
|
1899
|
-
html += '<tr>'
|
1900
|
-
html += '<td>' + last_index.to_s + '</td>'
|
1901
|
-
(0..(ncols - 1)).to_a.each do |i|
|
1902
|
-
html += '<td>' + last_row[i].to_s + '</td>'
|
1903
|
-
end
|
1904
|
-
html += '</tr>'
|
1905
|
-
break
|
1906
|
-
end
|
1907
|
-
html += '</table>'
|
1908
|
-
|
1909
|
-
html
|
1735
|
+
path = if index.is_a?(MultiIndex)
|
1736
|
+
File.expand_path('../iruby/templates/dataframe_mi.html.erb', __FILE__)
|
1737
|
+
else
|
1738
|
+
File.expand_path('../iruby/templates/dataframe.html.erb', __FILE__)
|
1739
|
+
end
|
1740
|
+
ERB.new(File.read(path).strip).result(binding)
|
1910
1741
|
end
|
1911
1742
|
|
1912
1743
|
def to_s
|
@@ -1925,8 +1756,11 @@ module Daru
|
|
1925
1756
|
# Rename the DataFrame.
|
1926
1757
|
def rename new_name
|
1927
1758
|
@name = new_name
|
1759
|
+
self
|
1928
1760
|
end
|
1929
1761
|
|
1762
|
+
alias_method :name=, :rename
|
1763
|
+
|
1930
1764
|
# Write this DataFrame to a CSV file.
|
1931
1765
|
#
|
1932
1766
|
# == Arguements
|
@@ -2003,46 +1837,28 @@ module Daru
|
|
2003
1837
|
|
2004
1838
|
# Transpose a DataFrame, tranposing elements and row, column indexing.
|
2005
1839
|
def transpose
|
2006
|
-
|
2007
|
-
|
2008
|
-
|
2009
|
-
|
2010
|
-
|
2011
|
-
|
1840
|
+
Daru::DataFrame.new(
|
1841
|
+
each_vector.map(&:to_a).transpose,
|
1842
|
+
index: @vectors,
|
1843
|
+
order: @index,
|
1844
|
+
dtype: @dtype,
|
1845
|
+
name: @name
|
1846
|
+
)
|
2012
1847
|
end
|
2013
1848
|
|
2014
1849
|
# Pretty print in a nice table format for the command line (irb/pry/iruby)
|
2015
1850
|
def inspect spacing=10, threshold=15
|
2016
|
-
|
2017
|
-
|
2018
|
-
|
2019
|
-
|
2020
|
-
|
2021
|
-
|
2022
|
-
|
2023
|
-
|
2024
|
-
|
2025
|
-
|
2026
|
-
|
2027
|
-
content += "\n#<" + self.class.to_s + ':' + object_id.to_s + ' @name = ' +
|
2028
|
-
name.to_s + ' @size = ' + @size.to_s + '>'
|
2029
|
-
content += formatter % ['', *@vectors.map(&:to_s)]
|
2030
|
-
row_num = 1
|
2031
|
-
|
2032
|
-
each_row_with_index do |row, index|
|
2033
|
-
content += formatter % [index.to_s, *row.to_h.values.map { |e| (e || 'nil').to_s }]
|
2034
|
-
row_num += 1
|
2035
|
-
next if row_num <= threshold
|
2036
|
-
|
2037
|
-
dots = []
|
2038
|
-
|
2039
|
-
(@vectors.size + 1).times { dots << '...' }
|
2040
|
-
content += formatter % dots
|
2041
|
-
break
|
2042
|
-
end
|
2043
|
-
content += "\n"
|
2044
|
-
|
2045
|
-
content
|
1851
|
+
row_headers = index.is_a?(MultiIndex) ? index.sparse_tuples : index.to_a
|
1852
|
+
name_part = @name ? ": #{@name} " : ''
|
1853
|
+
|
1854
|
+
"#<#{self.class}#{name_part}(#{nrows}x#{ncols})>\n" +
|
1855
|
+
Formatters::Table.format(
|
1856
|
+
each_row.lazy,
|
1857
|
+
row_headers: row_headers,
|
1858
|
+
headers: vectors,
|
1859
|
+
threshold: threshold,
|
1860
|
+
spacing: spacing
|
1861
|
+
)
|
2046
1862
|
end
|
2047
1863
|
|
2048
1864
|
# Query a DataFrame by passing a Daru::Core::Query::BoolArray object.
|
@@ -2058,218 +1874,202 @@ module Daru
|
|
2058
1874
|
@vectors.to_a.all? { |v| self[v] == other[v] }
|
2059
1875
|
end
|
2060
1876
|
|
1877
|
+
# Converts the specified non category type vectors to category type vectors
|
1878
|
+
# @param [Array] *names names of non category type vectors to be converted
|
1879
|
+
# @return [Daru::DataFrame] data frame in which specified vectors have been
|
1880
|
+
# converted to category type
|
1881
|
+
# @example
|
1882
|
+
# df = Daru::DataFrame.new({
|
1883
|
+
# a: [1, 2, 3],
|
1884
|
+
# b: ['a', 'a', 'b']
|
1885
|
+
# })
|
1886
|
+
# df.to_category :b
|
1887
|
+
# df[:b].type
|
1888
|
+
# # => :category
|
1889
|
+
def to_category *names
|
1890
|
+
names.each { |n| self[n] = self[n].to_category }
|
1891
|
+
self
|
1892
|
+
end
|
1893
|
+
|
2061
1894
|
def method_missing(name, *args, &block)
|
2062
1895
|
if name =~ /(.+)\=/
|
2063
|
-
insert_or_modify_vector name[/(.+)\=/].delete('=').to_sym, args[0]
|
1896
|
+
insert_or_modify_vector [name[/(.+)\=/].delete('=').to_sym], args[0]
|
2064
1897
|
elsif has_vector? name
|
2065
1898
|
self[name]
|
2066
1899
|
else
|
2067
|
-
super
|
1900
|
+
super
|
2068
1901
|
end
|
2069
1902
|
end
|
2070
1903
|
|
2071
|
-
|
1904
|
+
def respond_to_missing?(name, include_private=false)
|
1905
|
+
name.to_s.end_with?('=') || has_vector?(name) || super
|
1906
|
+
end
|
2072
1907
|
|
2073
|
-
def
|
2074
|
-
|
2075
|
-
|
2076
|
-
else
|
2077
|
-
Daru::Index.new(index)
|
1908
|
+
def interact_code vector_names, full
|
1909
|
+
dfs = vector_names.zip(full).map do |vec_name, f|
|
1910
|
+
self[vec_name].contrast_code(full: f).each.to_a
|
2078
1911
|
end
|
1912
|
+
|
1913
|
+
all_vectors = recursive_product(dfs)
|
1914
|
+
Daru::DataFrame.new all_vectors,
|
1915
|
+
order: all_vectors.map(&:name)
|
2079
1916
|
end
|
2080
1917
|
|
2081
|
-
|
2082
|
-
|
2083
|
-
|
2084
|
-
|
2085
|
-
|
2086
|
-
|
2087
|
-
|
2088
|
-
|
2089
|
-
|
2090
|
-
|
2091
|
-
|
2092
|
-
|
1918
|
+
# Split the dataframe into many dataframes based on a category vector.
# @param [object] cat_name name of the category vector to split the dataframe by
# @return [Array] array of dataframes split by category, with the category
#   vector used to split not included
# @raise [ArgumentError] if the vector named cat_name is not a category vector
# @example
#   df = Daru::DataFrame.new({
#     a: [1, 2, 3],
#     b: ['a', 'a', 'b']
#   })
#   df.to_category :b
#   df.split_by_category :b
#   # => [#<Daru::DataFrame: a (2x1)>
#   #       a
#   #   0   1
#   #   1   2,
#   # #<Daru::DataFrame: b (1x1)>
#   #       a
#   #   2   3]
def split_by_category cat_name
  cat_dv = self[cat_name]

  # The exception class must be spelled ArgumentError; a misspelled constant
  # would surface as NameError instead of the intended error.
  raise ArgumentError, "#{cat_name} is not a category vector" unless
    cat_dv.category?

  # One dataframe per category: filter the rows, name the result after the
  # category and drop the splitting vector.
  cat_dv.categories.map do |cat|
    where(cat_dv.eq cat)
      .rename(cat)
      .delete_vector cat_name
  end
end
|
1947
|
+
|
1948
|
+
private
|
2094
1949
|
|
2095
|
-
|
1950
|
+
# Replaces every category vector among +names+ with a plain vector built
# from its to_ints values and collects the originals.
# Returns an array of [name, original_vector] pairs for the vectors that
# were replaced (the shape restore_categorical_vectors iterates over).
def convert_categorical_vectors names
  names.each_with_object([]) do |vec_name, replaced|
    original = self[vec_name]
    next unless original.category?

    self[vec_name] = Daru::Vector.new(original.to_ints)
    replaced << [vec_name, original]
  end
end
|
2097
1958
|
|
2098
|
-
def
|
2099
|
-
|
2100
|
-
|
2101
|
-
value = if opts[:ascending][i]
|
2102
|
-
@data[@vectors[v]].data[r1]
|
2103
|
-
else
|
2104
|
-
@data[@vectors[v]].data[r2]
|
2105
|
-
end
|
2106
|
-
|
2107
|
-
if opts[:by][v] && !opts[:handle_nils][i]
|
2108
|
-
# Block given and nils handled manually
|
2109
|
-
value = opts[:by][v].call value
|
2110
|
-
|
2111
|
-
elsif opts[:by][v] && opts[:handle_nils][i]
|
2112
|
-
# Block given and nils handled automatically
|
2113
|
-
value = opts[:by][v].call value rescue nil
|
2114
|
-
blocks[v].call value
|
1959
|
+
# Writes each [name, vector] pair in +old+ back into the dataframe.
def restore_categorical_vectors old
  old.each do |name, vector|
    self[name] = vector
  end
end
|
2115
1962
|
|
2116
|
-
|
2117
|
-
|
2118
|
-
|
2119
|
-
|
1963
|
+
# Combines several lists of vectors into one list holding the element-wise
# product of every combination, taking one vector from each list.
#
# Each product vector is renamed "<left name>:<right name>", so a
# three-list product yields names such as "a:b:c".
#
# The argument is no longer modified (it used to be consumed with #shift),
# and an empty list returns [] instead of recursing without end.
#
# @param dfs [Array<Array>] lists of vectors
# @return [Array] the only list when one is given, otherwise the products
def recursive_product dfs
  return [] if dfs.empty?

  left, *rest = dfs
  return left if rest.empty?

  right = recursive_product rest
  left.product(right).map do |dv1, dv2|
    (dv1*dv2).rename "#{dv1.name}:#{dv2.name}"
  end
end
|
2122
1973
|
|
2123
|
-
def
|
2124
|
-
if
|
2125
|
-
|
2126
|
-
|
2127
|
-
|
1974
|
+
# Returns +val+ unchanged when it is a Daru::Vector; raises TypeError
# naming the offending class otherwise.
def should_be_vector! val
  unless val.is_a?(Daru::Vector)
    raise TypeError, "Every iteration must return Daru::Vector not #{val.class}"
  end

  val
end
|
1978
|
+
|
1979
|
+
# Forwards to "<method>_vector" (axis :vector or :column) or
# "<method>_row" (axis :row), passing the arguments and block along.
# Raises ArgumentError for any other axis.
def dispatch_to_axis(axis, method, *args, &block)
  case axis
  when :vector, :column
    send("#{method}_vector", *args, &block)
  when :row
    send("#{method}_row", *args, &block)
  else
    raise ArgumentError, "Unknown axis #{axis}"
  end
end
|
2132
1988
|
|
2133
|
-
def
|
2134
|
-
if
|
2135
|
-
|
2136
|
-
|
2137
|
-
|
1989
|
+
# Plural variant of dispatch_to_axis: forwards to "<method>_vectors"
# (axis :vector or :column) or "<method>_rows" (axis :row), passing the
# arguments and block along. Raises ArgumentError for any other axis.
def dispatch_to_axis_pl(axis, method, *args, &block)
  case axis
  when :vector, :column
    send("#{method}_vectors", *args, &block)
  when :row
    send("#{method}_rows", *args, &block)
  else
    raise ArgumentError, "Unknown axis #{axis}"
  end
end
|
2142
1998
|
|
2143
|
-
|
2144
|
-
|
2145
|
-
|
2146
|
-
|
2147
|
-
|
1999
|
+
# Axis markers recognised as a trailing argument by extract_axis.
AXES = [:row, :vector].freeze

# If the last element of +names+ is one of AXES, removes it from +names+
# (mutating the array) and returns it; otherwise returns +default+ and
# leaves +names+ alone.
def extract_axis names, default=:vector
  AXES.include?(names.last) ? names.pop : default
end
|
2150
2008
|
|
2151
2009
|
# Entry point for vector lookup. A Range as first name yields a copy of
# the dataframe restricted to the vectors the range selects; otherwise
# the lookup is delegated according to whether the vector index is a
# MultiIndex.
def access_vector *names
  first_name = names.first
  return dup(@vectors[first_name]) if first_name.is_a?(Range)
  return access_vector_multi_index(*names) if @vectors.is_a?(MultiIndex)

  access_vector_single_index(*names)
end
|
2153
2018
|
|
2154
|
-
|
2155
|
-
|
2156
|
-
pos = @vectors[names]
|
2019
|
+
# Vector lookup when the vector index is a MultiIndex. An Integer result
# from the index lookup identifies one vector, which is returned directly.
# Otherwise the matching vectors are gathered into a new dataframe; when
# fewer names than index levels were supplied, the leading levels already
# fixed by +names+ are dropped from the resulting order.
def access_vector_multi_index *names
  matched = @vectors[names]
  return @data[matched] if matched.is_a?(Integer)

  selected = matched.map { |tuple| @data[@vectors[tuple]] }
  order = names.size < @vectors.width ? matched.drop_left_level(names.size) : matched

  Daru::DataFrame.new(selected, index: @index, order: order)
end
|
2175
2030
|
|
2176
|
-
|
2177
|
-
|
2031
|
+
# Vector lookup for a plain (single-level) vector index. With fewer than
# two names, a Numeric lookup result returns that one vector; any other
# lookup result replaces +names+. The named vectors are then collected
# into a new dataframe sharing this dataframe's index and name.
def access_vector_single_index *names
  if names.count < 2
    located = @vectors[names.first]
    return @data[located] if located.is_a?(Numeric)

    names = located
  end

  pairs = names.map { |name| [name, @data[@vectors[name]]] }
  new_vectors = pairs.to_h

  order =
    if names.is_a?(Array)
      Daru::Index.new(names)
    else
      names
    end

  Daru::DataFrame.new(new_vectors, order: order,
                                   index: @index, name: @name)
end
|
2198
2046
|
|
2199
|
-
|
2047
|
+
# Row lookup. When the index resolves +indexes+ to a single Numeric
# position, the row is returned as a vector indexed by the vector names
# and named after the first requested index; otherwise a dataframe made
# of the matching slice of every vector is returned.
def access_row *indexes
  positions = @index.pos(*indexes)

  if positions.is_a? Numeric
    row_values = populate_row_for(positions)
    Daru::Vector.new row_values, index: @vectors, name: indexes.first
  else
    new_rows = @data.map { |vec| vec[*indexes] }
    Daru::DataFrame.new new_rows,
                        index: @index.subset(*indexes),
                        order: @vectors
  end
end
|
2227
2061
|
|
2228
2062
|
# Collects the element at +pos+ from every vector in @data.
def populate_row_for pos
  @data.map do |vector|
    vector[pos]
  end
end
|
2233
2065
|
|
2234
2066
|
def insert_or_modify_vector name, vector
|
2235
2067
|
name = name[0] unless @vectors.is_a?(MultiIndex)
|
2236
|
-
vec = nil
|
2237
2068
|
|
2238
2069
|
if @index.empty?
|
2239
|
-
|
2240
|
-
vector
|
2241
|
-
else
|
2242
|
-
Daru::Vector.new(vector.to_a, name: set_name(name))
|
2243
|
-
end
|
2244
|
-
|
2245
|
-
@index = vec.index
|
2246
|
-
assign_or_add_vector name, vec
|
2247
|
-
set_size
|
2248
|
-
|
2249
|
-
@data.map! do |v|
|
2250
|
-
if v.empty?
|
2251
|
-
Daru::Vector.new([nil]*@size, name: set_name(name), metadata: v.metadata, index: @index)
|
2252
|
-
else
|
2253
|
-
v
|
2254
|
-
end
|
2255
|
-
end
|
2070
|
+
insert_vector_in_empty name, vector
|
2256
2071
|
else
|
2257
|
-
|
2258
|
-
if vector.index == @index # so that index-by-index assignment is avoided when possible.
|
2259
|
-
vec = vector.dup
|
2260
|
-
else
|
2261
|
-
vec = Daru::Vector.new [], name: set_name(name), metadata: vector.metadata.dup, index: @index
|
2262
|
-
@index.each do |idx|
|
2263
|
-
vec[idx] = vector.index.include?(idx) ? vector[idx] : nil
|
2264
|
-
end
|
2265
|
-
end
|
2266
|
-
else
|
2267
|
-
raise SizeError,
|
2268
|
-
"Specified vector of length #{vector.size} cannot be inserted in DataFrame of size #{@size}" if
|
2269
|
-
@size != vector.size
|
2270
|
-
|
2271
|
-
vec = Daru::Vector.new(vector, name: set_name(name), index: @index)
|
2272
|
-
end
|
2072
|
+
vec = prepare_vector_for_insert name, vector
|
2273
2073
|
|
2274
2074
|
assign_or_add_vector name, vec
|
2275
2075
|
end
|
@@ -2283,54 +2083,82 @@ module Daru
|
|
2283
2083
|
pos = name
|
2284
2084
|
end
|
2285
2085
|
|
2286
|
-
|
2287
|
-
|
2086
|
+
case
|
2087
|
+
when pos.is_a?(Daru::Index)
|
2088
|
+
assign_multiple_vectors pos, v
|
2089
|
+
when pos == name &&
|
2090
|
+
(@vectors.include?(name) || (pos.is_a?(Integer) && pos < @data.size))
|
2091
|
+
|
2288
2092
|
@data[pos] = v
|
2289
|
-
elsif pos.is_a?(Daru::Index)
|
2290
|
-
pos.each do |p|
|
2291
|
-
@data[@vectors[p]] = v
|
2292
|
-
end
|
2293
2093
|
else
|
2294
|
-
|
2295
|
-
@data[@vectors[name]] = v
|
2094
|
+
assign_or_add_vector_rough name, v
|
2296
2095
|
end
|
2297
2096
|
end
|
2298
2097
|
|
2299
|
-
def
|
2300
|
-
|
2301
|
-
|
2302
|
-
|
2303
|
-
|
2304
|
-
vec =
|
2305
|
-
if vector.is_a?(Daru::Vector)
|
2306
|
-
vector
|
2307
|
-
else
|
2308
|
-
Daru::Vector.new(vector, name: set_name(name), index: @vectors)
|
2309
|
-
end
|
2098
|
+
# Stores +v+ in the data slot of every vector name yielded by +pos+.
def assign_multiple_vectors pos, v
  pos.each { |vec_name| @data[@vectors[vec_name]] = v }
end
|
2310
2103
|
|
2311
|
-
|
2312
|
-
|
2313
|
-
|
2314
|
-
|
2315
|
-
|
2316
|
-
|
2317
|
-
|
2318
|
-
|
2104
|
+
# Stores +v+ under +name+, first adding +name+ to the vector index when
# it is not there yet.
def assign_or_add_vector_rough name, v
  known = @vectors.include?(name)
  @vectors = @vectors | [name] unless known

  slot = @vectors[name]
  @data[slot] = v
end
|
2108
|
+
|
2109
|
+
# Inserts a vector into a dataframe whose index is still empty: the new
# vector's index becomes the dataframe index, the size is refreshed and
# every vector that is still empty is reindexed onto the new index.
def insert_vector_in_empty name, vector
  coerced = Vector.coerce(vector.to_a, name: coerce_name(name))

  @index = coerced.index
  assign_or_add_vector name, coerced
  set_size

  @data.map! do |existing|
    existing.empty? ? existing.reindex(@index) : existing
  end
end
|
2118
|
+
|
2119
|
+
# Produces the vector that will actually be stored for +name+.
#
# * Non-vector input must have exactly @size elements and is wrapped in
#   a vector on the dataframe index.
# * A Daru::Vector with the dataframe's own index is simply duplicated.
# * A Daru::Vector with a different index is aligned index by index;
#   positions it does not cover become nil.
def prepare_vector_for_insert name, vector
  unless vector.is_a?(Daru::Vector)
    # FIXME: No spec checks this case... And SizeError is not a thing - zverok, 2016-05-08
    if @size != vector.size
      raise SizeError,
            "Specified vector of length #{vector.size} cannot be inserted in DataFrame of size #{@size}"
    end

    return Daru::Vector.new(vector, name: coerce_name(name), index: @index)
  end

  # so that index-by-index assignment is avoided when possible.
  return vector.dup if vector.index == @index

  aligned = Daru::Vector.new([], name: coerce_name(name), index: @index)
  @index.each do |idx|
    aligned[idx] = vector.index.include?(idx) ? vector[idx] : nil
  end
  aligned
end
|
2138
|
+
|
2139
|
+
# Writes one row: the incoming values are coerced to a vector, checked
# against the number of vectors, and each column receives its value via
# its (non-public) #set. The dataframe index and size are then refreshed
# from the first column.
def insert_or_modify_row indexes, vector
  row = coerce_vector vector

  unless row.size == @vectors.size
    raise SizeError, 'Vector length should match row length'
  end

  @data.each_with_index { |column, pos| column.send(:set, indexes, row.at(pos)) }
  @index = @data[0].index

  set_size
end
|
2325
2152
|
|
2326
2153
|
# Resets @data to one empty vector per vector name, each on the
# dataframe index.
def create_empty_vectors
  @data = @vectors.map { |name| Daru::Vector.new([], name: coerce_name(name), index: @index) }
end
|
2331
2158
|
|
2332
2159
|
def validate_labels
|
2333
|
-
raise IndexError, "Expected equal number of vector names (#{@vectors.size})
|
2160
|
+
raise IndexError, "Expected equal number of vector names (#{@vectors.size}) " \
|
2161
|
+
"for number of vectors (#{@data.size})." if
|
2334
2162
|
@vectors && @vectors.size != @data.size
|
2335
2163
|
|
2336
2164
|
raise IndexError, 'Expected number of indexes same as number of rows' if
|
@@ -2348,12 +2176,6 @@ module Daru
|
|
2348
2176
|
validate_vector_sizes
|
2349
2177
|
end
|
2350
2178
|
|
2351
|
-
def all_daru_vectors_in_source? source
|
2352
|
-
source.values.all? do |vector|
|
2353
|
-
vector.is_a?(Daru::Vector)
|
2354
|
-
end
|
2355
|
-
end
|
2356
|
-
|
2357
2179
|
# Caches the number of entries in the index as @size.
def set_size
  index_length = @index.size
  @size = index_length
end
|
@@ -2382,32 +2204,301 @@ module Daru
|
|
2382
2204
|
# True when every vector in the hash +source+ has an index equal to that
# of the first vector.
def all_vectors_have_equal_indexes? source
  vectors = source.values
  reference = vectors[0].index

  vectors.all? do |vector|
    reference == vector.index
  end
end
|
2209
|
+
|
2210
|
+
# Array names are joined into one string; any other name is returned
# as is.
def coerce_name potential_name
  return potential_name unless potential_name.is_a?(Array)

  potential_name.join
end
|
2213
|
+
|
2214
|
+
# Builds the dataframe from an array source. All elements must be of one
# class; the class of the first element (Array, Vector or Hash) selects
# the specialised initializer.
def initialize_from_array source, vectors, index, opts
  element_classes = source.map(&:class).uniq
  unless element_classes.size == 1
    raise ArgumentError, 'All objects in data source should be same class'
  end

  sample = source.first
  if sample.is_a?(Array)
    initialize_from_array_of_arrays source, vectors, index, opts
  elsif sample.is_a?(Vector)
    initialize_from_array_of_vectors source, vectors, index, opts
  elsif sample.is_a?(Hash)
    initialize_from_array_of_hashes source, vectors, index, opts
  else
    raise ArgumentError, "Can't create DataFrame from #{source}"
  end
end
|
2389
2229
|
|
2390
|
-
def
|
2391
|
-
|
2230
|
+
# Builds the dataframe from an array of arrays, one inner array per
# vector. The number of inner arrays must match the number of vector
# names; without an explicit index, the length of the first inner array
# is handed to Index.coerce.
def initialize_from_array_of_arrays source, vectors, index, _opts
  if source.size != vectors.size
    raise ArgumentError, "Number of vectors (#{vectors.size}) should " \
                         "equal order size (#{source.size})"
  end

  @index = Index.coerce(index || source[0].size)
  @vectors = Index.coerce(vectors)

  @data = @vectors.each_with_index.map { |_name, position|
    Daru::Vector.new(source[position], index: @index, name: vectors[position])
  }
end
|
2393
2241
|
|
2394
|
-
def
|
2395
|
-
|
2242
|
+
# Builds the dataframe from an array of vectors by pairing each vector
# name with the source element at the same position and re-running
# #initialize on the resulting hash. Cloning stays on unless
# opts[:clone] is exactly false.
def initialize_from_array_of_vectors source, vectors, index, opts
  should_clone = opts[:clone] != false

  named_sources = {}
  vectors.each_with_index { |name, idx| named_sources[name] = source[idx] }

  initialize(named_sources, index: index, order: vectors, name: @name, clone: should_clone)
end
|
2249
|
+
|
2250
|
+
# Builds the dataframe from an array of hashes, one hash per row.
#
# Vector names are the keys of the first hash, preceded by any names in
# +vectors+ (duplicates removed). Each value is looked up under the
# vector name and, when that yields nil, under the name converted to a
# string.
#
# A stored +false+ is kept as false: only a nil lookup falls back to the
# string key (a plain `||` would have replaced false with the string-key
# lookup, normally nil).
#
# @param source [Array<Hash>] rows
# @param vectors [Array, nil] vector names to place first, if any
# @param index [Object, nil] row index; defaults to source.size
def initialize_from_array_of_hashes source, vectors, index, _opts
  names =
    if vectors.nil?
      source[0].keys
    else
      (vectors + source[0].keys).uniq
    end
  @vectors = Daru::Index.new(names)
  @index = Daru::Index.new(index || source.size)

  @data = @vectors.map do |name|
    v = source.map do |h|
      value = h[name]
      value.nil? ? h[name.to_s] : value
    end
    Daru::Vector.new(v, name: coerce_name(name), index: @index)
  end
end
|
2265
|
+
|
2266
|
+
# Builds the dataframe from a hash source: sets up the vector index,
# then picks the initializer depending on whether every hash value is a
# Vector.
def initialize_from_hash source, vectors, index, opts
  create_vectors_index_with vectors, source

  if ArrayHelper.array_of?(source.values, Vector)
    return initialize_from_hash_with_vectors(source, index, opts)
  end

  initialize_from_hash_with_arrays source, index, opts
end
|
2275
|
+
|
2276
|
+
# Builds the dataframe from a hash of vectors. The vectors are cloned
# unless opts[:clone] is exactly false; cloning is forced when there is
# neither an explicit index nor an index shared by all vectors.
def initialize_from_hash_with_vectors source, index, opts
  same_index = all_vectors_have_equal_indexes?(source)
  should_clone = (opts[:clone] != false) || !(index || same_index)

  @index = deduce_index index, source, same_index

  if should_clone
    @data = clone_vectors source, same_index
  else
    @data.concat source.values
  end
end
|
2290
|
+
|
2291
|
+
# Chooses the dataframe index: an explicit +index+ is coerced; failing
# that, a shared vector index is duplicated; failing that, the sorted
# union of all vector index entries is used.
def deduce_index index, source, vectors_have_same_index
  return Index.coerce(index) unless index.nil?
  return source.values[0].index.dup if vectors_have_same_index

  # sort only if missing indexes detected
  all_indexes = source.values.map { |v| v.index.to_a }
  Daru::Index.new all_indexes.flatten.uniq.sort
end
|
2397
2304
|
|
2398
|
-
def
|
2399
|
-
|
2400
|
-
if
|
2401
|
-
|
2402
|
-
|
2403
|
-
|
2305
|
+
# Returns one new vector per vector name. When all source vectors share
# an index they are simply duplicated, avoiding index matching;
# otherwise each is rebuilt on the dataframe index, with nil where the
# source vector has no entry.
def clone_vectors source, vectors_have_same_index
  return @vectors.map { |name| source[name].dup } if vectors_have_same_index

  @vectors.map do |name|
    aligned = Daru::Vector.new([], name: name, index: @index)
    @index.each do |idx|
      aligned[idx] = source[name].index.include?(idx) ? source[name][idx] : nil
    end
    aligned
  end
end
|
2320
|
+
|
2321
|
+
# Builds the dataframe from a hash of plain arrays: each array is copied
# into a vector on a common index (the given one, or one coerced from
# the length of the first array).
def initialize_from_hash_with_arrays source, index, _opts
  default_size = source.values[0].size unless index
  @index = Index.coerce(index || default_size)

  @vectors.each { |name|
    values = source[name].dup
    @data << Daru::Vector.new(values, name: coerce_name(name), index: @index)
  }
end
|
2328
|
+
|
2329
|
+
# Create an array to be used for comparison of two rows in sorting.
# For each sort vector the value is read at row r1 when that vector is
# ascending and at r2 otherwise; a +by+ callable, if present, transforms
# it (any StandardError it raises turns the value into nil). Nil handling
# is applied when requested or when no callable was given.
def sort_build_row vector_locs, by_blocks, ascending, handle_nils, r1, r2 # rubocop:disable Metrics/ParameterLists
  settings = vector_locs.zip(by_blocks, ascending, handle_nils)

  settings.map do |vector_loc, by, asc, handle_nil|
    row = asc ? r1 : r2
    value = @data[vector_loc].data[row]

    if by
      value =
        begin
          by.call(value)
        rescue StandardError
          nil
        end
    end

    sort_handle_nils value, asc, handle_nil || !by
  end
end
|
2341
|
+
|
2342
|
+
# Wraps +value+ for nil-aware comparison. Without nil handling the value
# is returned as is; with it, the value is paired with a rank so that
# nils compare first in ascending order (rank 0) and get rank 1 in
# descending order.
def sort_handle_nils value, asc, handle_nil
  return value unless handle_nil

  nil_rank, present_rank = asc ? [0, 1] : [1, 0]
  [value.nil? ? nil_rank : present_rank, value]
end
|
2352
|
+
|
2353
|
+
# Normalises the boolean sort option opts[symbol] into an array of
# +size+ entries: true/false is repeated, nil becomes +default+ repeated,
# an Array is returned as is after a length check. Anything else raises
# ArgumentError.
def sort_coerce_boolean opts, symbol, default, size
  val = opts[symbol]

  return Array.new(size, default) if val.nil?
  return Array.new(size, val) if val == true || val == false

  unless val.is_a?(Array)
    raise ArgumentError, "Can't coerce #{symbol} from #{val.class} to boolean option"
  end

  if size != val.size
    raise ArgumentError, "Specify same number of vector names and #{symbol}"
  end

  val
end
|
2368
|
+
|
2369
|
+
# Returns a two-argument lambda comparing two row positions for sorting
# by +vector_order+. Options :ascending and :handle_nils are expanded to
# one flag per vector; opts[:by] may map vector names to callables.
def sort_prepare_block vector_order, opts
  count = vector_order.size
  ascending = sort_coerce_boolean opts, :ascending, true, count
  handle_nils = sort_coerce_boolean opts, :handle_nils, false, count

  by_option = opts[:by] || {}
  by_blocks = vector_order.map { |v| by_option[v] }
  vector_locs = vector_order.map { |v| @vectors[v] }

  lambda { |index1, index2|
    # Build left and right array to compare two rows
    left = sort_build_row(vector_locs, by_blocks, ascending, handle_nils, index1, index2)
    right = sort_build_row(vector_locs, by_blocks, ascending, handle_nils, index2, index1)

    # Resolve conflict by Index if all attributes are same
    (left << index1) <=> (right << index2)
  }
end
|
2387
|
+
|
2388
|
+
# Formats one verification failure as
# "<i+1> [<row[id]>]: <description>", followed by " (field=value, ...)"
# when the test lists any fields. +test+ is an array whose first two
# elements are the description and the field list.
def verify_error_message row, test, id, i
  description, fields, = test

  values = ''
  unless fields.empty?
    details = fields.collect { |k| "#{k}=#{row[k]}" }
    values = " (#{details.join(', ')})"
  end

  "#{i+1} [#{row[id]}]: #{description}#{values}"
end
|
2398
|
+
|
2399
|
+
# Resolves opts[:values] into a list of vector names: an Array is used
# as given, a single value is wrapped, and when nothing is specified
# the numeric vectors not named in +index+ or +vectors+ are used.
def prepare_pivot_values index, vectors, opts
  requested = opts[:values]

  return requested if requested.is_a?(Array) # multiple values specified.
  return [requested] unless requested.nil?   # single value specified.

  # values not specified at all.
  (@vectors.to_a - (index | vectors)) & numeric_vector_names
end
|
2409
|
+
|
2410
|
+
# Builds the nested hash behind a pivot table:
#   { group_name => { [value_name, *vector_values] => aggregate } }
# Raw values are first gathered per group and per combination of the
# +vectors+ values found in each row, then reduced through
# setup_pivot_aggregates.
def make_pivot_hash grouped, vectors, values, aggregate_function
  super_hash = {}
  grouped.groups.each { |group_name, _| super_hash[group_name] = {} }

  values.each do |value|
    grouped.groups.each do |group_name, row_numbers|
      sub_hash = super_hash[group_name]

      row_numbers.each do |num|
        key = [value, *vectors.map { |v| self[v][num] }]
        (sub_hash[key] ||= []) << self[value][num]
      end
    end
  end

  setup_pivot_aggregates super_hash, aggregate_function
  super_hash
end
|
2427
|
+
|
2428
|
+
# Replaces, in place, every collected list of values in the nested pivot
# hash with the result of calling +aggregate_function+ on a vector built
# from that list.
def setup_pivot_aggregates super_hash, aggregate_function
  super_hash.each_value do |sub_hash|
    sub_hash.each_key do |group_name|
      collected = sub_hash[group_name]
      sub_hash[group_name] = Daru::Vector.new(collected).send(aggregate_function)
    end
  end
end
|
2435
|
+
|
2436
|
+
# Turns the nested pivot hash into a dataframe: outer keys become the
# row MultiIndex, the distinct inner keys become the vector MultiIndex,
# and every aggregate is written into its cell.
def pivot_dataframe super_hash
  row_tuples = super_hash.keys
  column_tuples = super_hash.values.flat_map(&:keys).uniq

  df_index = Daru::MultiIndex.from_tuples row_tuples
  df_vectors = Daru::MultiIndex.from_tuples column_tuples

  pivoted_dataframe = Daru::DataFrame.new({}, index: df_index, order: df_vectors)
  super_hash.each do |row_index, sub_h|
    sub_h.each { |vector_index, val| pivoted_dataframe[vector_index][row_index] = val }
  end

  pivoted_dataframe
end
|
2448
|
+
|
2449
|
+
# Finds the variable names and numbers encoded in the vector names.
#
# +pattern+ is a template in which '%v' stands for a variable name and
# '%n' for a number, e.g. '%v_%n'. Every vector name matching the whole
# template contributes one (variable, number) pair.
#
# Changes from the previous behaviour:
# * the match is anchored to the whole name, so multi-digit numbers are
#   captured in full ('a_10' gives 10; the unanchored lazy group stopped
#   after the first digit);
# * literal parts of the pattern are regexp-escaped;
# * when no vector matches, two empty lists are returned instead of
#   failing on nil.
#
# @param pattern [String] template containing '%v' and '%n'
# @return [Array(Array, Array<Integer>)] unique variable names (in order
#   of appearance) and sorted unique numbers
def one_to_many_components pattern
  template = Regexp.escape(pattern)
                   .gsub('%v') { '(.+?)' }
                   .gsub('%n') { '(\d+?)' }
  re = Regexp.new("\\A#{template}\\z")

  matches =
    @vectors
    .map { |v| v.scan(re) }
    .reject(&:empty?).flatten(1)
  return [[], []] if matches.empty?

  vars, numbers = matches.transpose

  [vars.uniq, numbers.map(&:to_i).sort.uniq]
end
|
2459
|
+
|
2460
|
+
# For one source row and one number, maps every variable to the value
# stored under the vector name obtained by filling '%v' and '%n' in
# +pattern+.
def one_to_many_row row, number, vars, pattern
  vars.each_with_object({}) do |var, values|
    column = pattern.sub('%v', var).sub('%n', number.to_s)
    values[var] = row[column]
  end
end
|
2467
|
+
|
2468
|
+
# Raises IndexError when one of the positions is not a valid position,
# i.e. is greater than or equal to +size+.
#
# Positions may be given as separate arguments or as an array; nested
# arrays are flattened first. (The previous `is_a? Integer` guard could
# never fire because a splat parameter is always an Array, so an array
# passed as one argument failed with NoMethodError.)
#
# @param positions [Array<Integer>] positions to check
# @param size [Integer] number of valid positions
# @return [Array<Integer>] the flattened positions
# @raise [IndexError] for the first position that is >= size
def validate_positions *positions, size
  positions = positions.flatten

  positions.each do |pos|
    raise IndexError, "#{pos} is not a valid position." if pos >= size
  end
end
|
2475
|
+
|
2476
|
+
# Accepts a hash, an enumerable or a vector and turns it into a vector
# that can be added as a row: vectors and hashes are aligned to the
# dataframe's vector names via #reindex, anything else is wrapped as is.
def coerce_vector vector
  if vector.is_a?(Daru::Vector)
    vector.reindex @vectors
  elsif vector.is_a?(Hash)
    Daru::Vector.new(vector).reindex @vectors
  else
    Daru::Vector.new vector
  end
end
|
2487
|
+
|
2488
|
+
# Coerces ranges, integers and arrays of positions into a usable form.
#
# * several positions     -> returned as an array
# * one Integer           -> returned as is
# * one Range             -> expanded against 0...size
# * any other single item -> ArgumentError
#
# The error message spelling was corrected ("Unknown", previously
# "Unkown").
#
# @param positions [Array] positions as passed by the caller
# @param size [Integer] number of valid positions
# @return [Integer, Array<Integer>, nil] nil when a range starts beyond size
# @raise [ArgumentError] for a single position that is neither Integer nor Range
def coerce_positions *positions, size
  return positions unless positions.size == 1

  position = positions.first
  case position
  when Integer
    position
  when Range
    size.times.to_a[position]
  else
    raise ArgumentError, 'Unknown position type.'
  end
end
|
2412
2503
|
end
|
2413
2504
|
end
|