daru 0.1.3.1 → 0.1.4
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.gitignore +2 -0
- data/.rspec +2 -1
- data/.rspec_formatter.rb +33 -0
- data/.rubocop.yml +26 -2
- data/History.md +38 -0
- data/README.md +22 -13
- data/Rakefile +50 -2
- data/benchmarks/csv_reading.rb +22 -0
- data/daru.gemspec +9 -2
- data/lib/daru.rb +36 -4
- data/lib/daru/accessors/array_wrapper.rb +6 -1
- data/lib/daru/accessors/dataframe_by_row.rb +10 -2
- data/lib/daru/accessors/gsl_wrapper.rb +1 -3
- data/lib/daru/accessors/nmatrix_wrapper.rb +9 -0
- data/lib/daru/category.rb +935 -0
- data/lib/daru/core/group_by.rb +29 -38
- data/lib/daru/core/merge.rb +186 -145
- data/lib/daru/core/query.rb +22 -11
- data/lib/daru/dataframe.rb +976 -885
- data/lib/daru/date_time/index.rb +166 -166
- data/lib/daru/date_time/offsets.rb +66 -77
- data/lib/daru/formatters/table.rb +54 -0
- data/lib/daru/helpers/array.rb +40 -0
- data/lib/daru/index.rb +476 -73
- data/lib/daru/io/io.rb +66 -45
- data/lib/daru/io/sql_data_source.rb +33 -62
- data/lib/daru/iruby/helpers.rb +38 -0
- data/lib/daru/iruby/templates/dataframe.html.erb +52 -0
- data/lib/daru/iruby/templates/dataframe_mi.html.erb +58 -0
- data/lib/daru/iruby/templates/multi_index.html.erb +12 -0
- data/lib/daru/iruby/templates/vector.html.erb +27 -0
- data/lib/daru/iruby/templates/vector_mi.html.erb +36 -0
- data/lib/daru/maths/arithmetic/dataframe.rb +16 -18
- data/lib/daru/maths/arithmetic/vector.rb +4 -6
- data/lib/daru/maths/statistics/dataframe.rb +8 -15
- data/lib/daru/maths/statistics/vector.rb +120 -98
- data/lib/daru/monkeys.rb +12 -40
- data/lib/daru/plotting/gruff.rb +3 -0
- data/lib/daru/plotting/gruff/category.rb +49 -0
- data/lib/daru/plotting/gruff/dataframe.rb +91 -0
- data/lib/daru/plotting/gruff/vector.rb +57 -0
- data/lib/daru/plotting/nyaplot.rb +3 -0
- data/lib/daru/plotting/nyaplot/category.rb +34 -0
- data/lib/daru/plotting/nyaplot/dataframe.rb +187 -0
- data/lib/daru/plotting/nyaplot/vector.rb +46 -0
- data/lib/daru/vector.rb +694 -421
- data/lib/daru/version.rb +1 -1
- data/profile/_base.rb +23 -0
- data/profile/df_to_a.rb +10 -0
- data/profile/filter.rb +13 -0
- data/profile/joining.rb +13 -0
- data/profile/sorting.rb +12 -0
- data/profile/vector_each_with_index.rb +9 -0
- data/spec/accessors/wrappers_spec.rb +2 -4
- data/spec/categorical_spec.rb +1734 -0
- data/spec/core/group_by_spec.rb +52 -2
- data/spec/core/merge_spec.rb +63 -2
- data/spec/core/query_spec.rb +236 -80
- data/spec/dataframe_spec.rb +1373 -79
- data/spec/date_time/data_spec.rb +3 -5
- data/spec/date_time/index_spec.rb +154 -17
- data/spec/date_time/offsets_spec.rb +3 -4
- data/spec/fixtures/empties.dat +2 -0
- data/spec/fixtures/strings.dat +2 -0
- data/spec/formatters/table_formatter_spec.rb +99 -0
- data/spec/helpers_spec.rb +8 -0
- data/spec/index/categorical_index_spec.rb +168 -0
- data/spec/index/index_spec.rb +283 -0
- data/spec/index/multi_index_spec.rb +570 -0
- data/spec/io/io_spec.rb +31 -4
- data/spec/io/sql_data_source_spec.rb +0 -1
- data/spec/iruby/dataframe_spec.rb +172 -0
- data/spec/iruby/helpers_spec.rb +49 -0
- data/spec/iruby/multi_index_spec.rb +37 -0
- data/spec/iruby/vector_spec.rb +107 -0
- data/spec/math/arithmetic/dataframe_spec.rb +71 -13
- data/spec/math/arithmetic/vector_spec.rb +8 -10
- data/spec/math/statistics/dataframe_spec.rb +3 -5
- data/spec/math/statistics/vector_spec.rb +45 -55
- data/spec/monkeys_spec.rb +32 -9
- data/spec/plotting/dataframe_spec.rb +386 -0
- data/spec/plotting/vector_spec.rb +230 -0
- data/spec/shared/vector_display_spec.rb +215 -0
- data/spec/spec_helper.rb +23 -0
- data/spec/vector_spec.rb +905 -138
- metadata +143 -11
- data/.rubocop_todo.yml +0 -44
- data/lib/daru/plotting/dataframe.rb +0 -104
- data/lib/daru/plotting/vector.rb +0 -38
- data/spec/daru_spec.rb +0 -58
- data/spec/index_spec.rb +0 -375
data/lib/daru/core/query.rb
CHANGED
@@ -33,7 +33,7 @@ module Daru
|
|
33
33
|
end
|
34
34
|
|
35
35
|
def inspect
|
36
|
-
"
|
36
|
+
"#<#{self.class}:#{object_id} bool_arry=#{@barry}>"
|
37
37
|
end
|
38
38
|
end
|
39
39
|
|
@@ -56,17 +56,28 @@ module Daru
|
|
56
56
|
)
|
57
57
|
end
|
58
58
|
|
59
|
-
def vector_where
|
60
|
-
new_data =
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
59
|
+
def vector_where dv, bool_array
|
60
|
+
new_data, new_index = fetch_new_data_and_index dv, bool_array
|
61
|
+
|
62
|
+
resultant_dv = Daru::Vector.new new_data,
|
63
|
+
index: dv.index.class.new(new_index),
|
64
|
+
dtype: dv.dtype,
|
65
|
+
type: dv.type,
|
66
|
+
name: dv.name
|
67
|
+
|
68
|
+
# Preserve categories order for category vector
|
69
|
+
resultant_dv.categories = dv.categories if dv.category?
|
70
|
+
resultant_dv
|
71
|
+
end
|
72
|
+
|
73
|
+
private
|
68
74
|
|
69
|
-
|
75
|
+
def fetch_new_data_and_index dv, bool_array
|
76
|
+
barry = bool_array.to_a
|
77
|
+
positions = dv.size.times.select { |i| barry[i] }
|
78
|
+
new_data = dv.to_a.values_at(*positions)
|
79
|
+
new_index = dv.index.to_a.values_at(*positions)
|
80
|
+
[new_data, new_index]
|
70
81
|
end
|
71
82
|
end
|
72
83
|
end
|
data/lib/daru/dataframe.rb
CHANGED
@@ -1,14 +1,17 @@
|
|
1
1
|
require 'daru/accessors/dataframe_by_row.rb'
|
2
2
|
require 'daru/maths/arithmetic/dataframe.rb'
|
3
3
|
require 'daru/maths/statistics/dataframe.rb'
|
4
|
-
require 'daru/plotting/
|
4
|
+
require 'daru/plotting/gruff.rb'
|
5
|
+
require 'daru/plotting/nyaplot.rb'
|
5
6
|
require 'daru/io/io.rb'
|
6
7
|
|
7
8
|
module Daru
|
8
|
-
class DataFrame
|
9
|
+
class DataFrame # rubocop:disable Metrics/ClassLength
|
9
10
|
include Daru::Maths::Arithmetic::DataFrame
|
10
11
|
include Daru::Maths::Statistics::DataFrame
|
11
|
-
|
12
|
+
# TODO: Remove this line, but it's causing errors due to an unknown reason
|
13
|
+
include Daru::Plotting::DataFrame::NyaplotLibrary if Daru.has_nyaplot?
|
14
|
+
extend Gem::Deprecate
|
12
15
|
|
13
16
|
class << self
|
14
17
|
# Load data from a CSV file. Specify an optional block to grab the CSV
|
@@ -112,29 +115,17 @@ module Daru
|
|
112
115
|
# Create DataFrame by specifying rows as an Array of Arrays or Array of
|
113
116
|
# Daru::Vector objects.
|
114
117
|
def rows source, opts={}
|
115
|
-
first = source.first
|
116
|
-
|
117
118
|
raise SizeError, 'All vectors must have same length' \
|
118
|
-
unless source.all? { |v| v.size == first.size }
|
119
|
-
|
120
|
-
index = []
|
121
|
-
opts[:order] ||=
|
122
|
-
case first
|
123
|
-
when Daru::Vector # assume that all are Vectors
|
124
|
-
index = source.map(&:name)
|
125
|
-
first.index.to_a
|
126
|
-
when Array
|
127
|
-
Array.new(first.size, &:to_s)
|
128
|
-
end
|
119
|
+
unless source.all? { |v| v.size == source.first.size }
|
129
120
|
|
130
|
-
|
131
|
-
|
132
|
-
|
133
|
-
|
134
|
-
|
135
|
-
|
136
|
-
|
137
|
-
|
121
|
+
opts[:order] ||= guess_order(source)
|
122
|
+
|
123
|
+
if ArrayHelper.array_of?(source, Array)
|
124
|
+
DataFrame.new(source.transpose, opts)
|
125
|
+
elsif ArrayHelper.array_of?(source, Vector)
|
126
|
+
from_vector_rows(source, opts)
|
127
|
+
else
|
128
|
+
raise ArgumentError, "Can't create DataFrame from #{source}"
|
138
129
|
end
|
139
130
|
end
|
140
131
|
|
@@ -161,36 +152,47 @@ module Daru
|
|
161
152
|
raise 'Three vectors should be equal size' if
|
162
153
|
rows.size != columns.size || rows.size!=values.size
|
163
154
|
|
164
|
-
|
165
|
-
|
155
|
+
data = Hash.new { |h, col|
|
156
|
+
h[col] = rows.factors.map { |r| [r, nil] }.to_h
|
157
|
+
}
|
158
|
+
columns.zip(rows, values).each { |c, r, v| data[c][r] = v }
|
166
159
|
|
167
|
-
|
168
|
-
|
169
|
-
|
170
|
-
|
171
|
-
|
160
|
+
# FIXME: in fact, WITHOUT this line you'll obtain more "right"
|
161
|
+
# data: with vectors having "rows" as an index...
|
162
|
+
data = data.map { |c, r| [c, r.values] }.to_h
|
163
|
+
data[:_id] = rows.factors
|
164
|
+
|
165
|
+
DataFrame.new(data)
|
166
|
+
end
|
167
|
+
|
168
|
+
private
|
172
169
|
|
173
|
-
|
174
|
-
|
170
|
+
def guess_order source
|
171
|
+
case source.first
|
172
|
+
when Vector # assume that all are Vectors
|
173
|
+
source.first.index.to_a
|
174
|
+
when Array
|
175
|
+
Array.new(source.first.size, &:to_s)
|
175
176
|
end
|
176
|
-
|
177
|
+
end
|
177
178
|
|
178
|
-
|
179
|
-
|
180
|
-
|
181
|
-
|
182
|
-
n_row[i+1] = h_rows[row][cols_values[i]]
|
183
|
-
end
|
179
|
+
def from_vector_rows source, opts
|
180
|
+
index = source.map(&:name)
|
181
|
+
.each_with_index.map { |n, i| n || i }
|
182
|
+
index = ArrayHelper.recode_repeated(index)
|
184
183
|
|
185
|
-
|
184
|
+
DataFrame.new({}, opts).tap do |df|
|
185
|
+
source.each_with_index do |row, idx|
|
186
|
+
df[index[idx] || idx, :row] = row
|
187
|
+
end
|
186
188
|
end
|
187
|
-
df.update
|
188
|
-
df
|
189
189
|
end
|
190
190
|
end
|
191
191
|
|
192
192
|
# The vectors (columns) index of the DataFrame
|
193
193
|
attr_reader :vectors
|
194
|
+
# TOREMOVE
|
195
|
+
attr_reader :data
|
194
196
|
|
195
197
|
# The index of the rows of the DataFrame
|
196
198
|
attr_reader :index
|
@@ -237,135 +239,181 @@ module Daru
|
|
237
239
|
# # b 7 2
|
238
240
|
# # c 8 3
|
239
241
|
# # d 9 4
|
240
|
-
def initialize source, opts={}
|
241
|
-
vectors = opts[:order]
|
242
|
-
|
243
|
-
|
244
|
-
|
245
|
-
|
246
|
-
|
247
|
-
|
248
|
-
|
249
|
-
if source.empty?
|
250
|
-
@vectors = try_create_index vectors
|
251
|
-
@index = try_create_index index
|
242
|
+
def initialize source, opts={} # rubocop:disable Metrics/MethodLength
|
243
|
+
vectors, index = opts[:order], opts[:index] # FIXME: just keyword arges after Ruby 2.1
|
244
|
+
@data = []
|
245
|
+
@name = opts[:name]
|
246
|
+
|
247
|
+
case source
|
248
|
+
when ->(s) { s.empty? }
|
249
|
+
@vectors = Index.coerce vectors
|
250
|
+
@index = Index.coerce index
|
252
251
|
create_empty_vectors
|
253
|
-
|
254
|
-
|
255
|
-
|
256
|
-
|
257
|
-
raise ArgumentError, "Number of vectors (#{vectors.size}) should \
|
258
|
-
equal order size (#{source.size})" if source.size != vectors.size
|
259
|
-
|
260
|
-
@index = try_create_index(index || source[0].size)
|
261
|
-
@vectors = try_create_index(vectors)
|
262
|
-
|
263
|
-
@vectors.each_with_index do |_vec,idx|
|
264
|
-
@data << Daru::Vector.new(source[idx], index: @index)
|
265
|
-
end
|
266
|
-
elsif source.all? { |s| s.is_a?(Daru::Vector) }
|
267
|
-
hsh = {}
|
268
|
-
vectors.each_with_index do |name, idx|
|
269
|
-
hsh[name] = source[idx]
|
270
|
-
end
|
271
|
-
initialize(hsh, index: index, order: vectors, name: @name, clone: clone)
|
272
|
-
else # array of hashes
|
273
|
-
@vectors =
|
274
|
-
if vectors.nil?
|
275
|
-
Daru::Index.new source[0].keys
|
276
|
-
else
|
277
|
-
Daru::Index.new((vectors + (source[0].keys - vectors)).uniq)
|
278
|
-
end
|
279
|
-
@index = Daru::Index.new(index || source.size)
|
280
|
-
|
281
|
-
@vectors.each do |name|
|
282
|
-
v = []
|
283
|
-
source.each do |h|
|
284
|
-
v << (h[name] || h[name.to_s])
|
285
|
-
end
|
286
|
-
|
287
|
-
@data << Daru::Vector.new(v, name: set_name(name), index: @index)
|
288
|
-
end
|
289
|
-
end
|
290
|
-
when Hash
|
291
|
-
create_vectors_index_with vectors, source
|
292
|
-
if all_daru_vectors_in_source? source
|
293
|
-
vectors_have_same_index = all_vectors_have_equal_indexes?(source)
|
294
|
-
if !index.nil?
|
295
|
-
@index = try_create_index index
|
296
|
-
elsif vectors_have_same_index
|
297
|
-
@index = source.values[0].index.dup
|
298
|
-
else
|
299
|
-
all_indexes = []
|
300
|
-
source.each_value do |vector|
|
301
|
-
all_indexes << vector.index.to_a
|
302
|
-
end
|
303
|
-
# sort only if missing indexes detected
|
304
|
-
all_indexes.flatten!.uniq!.sort!
|
305
|
-
|
306
|
-
@index = Daru::Index.new all_indexes
|
307
|
-
clone = true
|
308
|
-
end
|
309
|
-
|
310
|
-
if clone
|
311
|
-
@vectors.each do |vector|
|
312
|
-
# avoids matching indexes of vectors if all the supplied vectors
|
313
|
-
# have the same index.
|
314
|
-
if vectors_have_same_index
|
315
|
-
v = source[vector].dup
|
316
|
-
else
|
317
|
-
v = Daru::Vector.new([], name: vector, metadata: source[vector].metadata.dup, index: @index)
|
318
|
-
|
319
|
-
@index.each do |idx|
|
320
|
-
v[idx] = source[vector].index.include?(idx) ? source[vector][idx] : nil
|
321
|
-
end
|
322
|
-
end
|
323
|
-
@data << v
|
324
|
-
end
|
325
|
-
else
|
326
|
-
@data.concat source.values
|
327
|
-
end
|
328
|
-
else
|
329
|
-
@index = try_create_index(index || source.values[0].size)
|
330
|
-
|
331
|
-
@vectors.each do |name|
|
332
|
-
meta_opt = source[name].respond_to?(:metadata) ? {metadata: source[name].metadata.dup} : {}
|
333
|
-
@data << Daru::Vector.new(source[name].dup, name: set_name(name), **meta_opt, index: @index)
|
334
|
-
end
|
335
|
-
end
|
336
|
-
end
|
252
|
+
when Array
|
253
|
+
initialize_from_array source, vectors, index, opts
|
254
|
+
when Hash
|
255
|
+
initialize_from_hash source, vectors, index, opts
|
337
256
|
end
|
338
257
|
|
339
258
|
set_size
|
340
259
|
validate
|
341
260
|
update
|
261
|
+
self.plotting_library = Daru.plotting_library
|
342
262
|
end
|
343
263
|
|
344
|
-
def
|
345
|
-
|
346
|
-
|
264
|
+
def plotting_library= lib
|
265
|
+
case lib
|
266
|
+
when :gruff, :nyaplot
|
267
|
+
@plotting_library = lib
|
268
|
+
extend Module.const_get(
|
269
|
+
"Daru::Plotting::DataFrame::#{lib.to_s.capitalize}Library"
|
270
|
+
) if Daru.send("has_#{lib}?".to_sym)
|
271
|
+
else
|
272
|
+
raise ArguementError, "Plotting library #{lib} not supported. "\
|
273
|
+
'Supported libraries are :nyaplot and :gruff'
|
274
|
+
end
|
347
275
|
end
|
348
276
|
|
349
277
|
# Access row or vector. Specify name of row/vector followed by axis(:row, :vector).
|
350
278
|
# Defaults to *:vector*. Use of this method is not recommended for accessing
|
351
279
|
# rows. Use df.row[:a] for accessing row with index ':a'.
|
352
280
|
def [](*names)
|
353
|
-
|
354
|
-
|
355
|
-
|
281
|
+
axis = extract_axis(names, :vector)
|
282
|
+
dispatch_to_axis axis, :access, *names
|
283
|
+
end
|
284
|
+
|
285
|
+
# Retrieve rows by positions
|
286
|
+
# @param [Array<Integer>] *positions positions of rows to retrieve
|
287
|
+
# @return [Daru::Vector, Daru::DataFrame] vector for single position and dataframe for multiple positions
|
288
|
+
# @example
|
289
|
+
# df = Daru::DataFrame.new({
|
290
|
+
# a: [1, 2, 3],
|
291
|
+
# b: ['a', 'b', 'c']
|
292
|
+
# })
|
293
|
+
# df.row_at 1, 2
|
294
|
+
# # => #<Daru::DataFrame(2x2)>
|
295
|
+
# # a b
|
296
|
+
# # 1 2 b
|
297
|
+
# # 2 3 c
|
298
|
+
def row_at *positions
|
299
|
+
original_positions = positions
|
300
|
+
positions = coerce_positions(*positions, nrows)
|
301
|
+
validate_positions(*positions, nrows)
|
302
|
+
|
303
|
+
if positions.is_a? Integer
|
304
|
+
return Daru::Vector.new @data.map { |vec| vec.at(*positions) },
|
305
|
+
index: @vectors
|
356
306
|
else
|
357
|
-
|
307
|
+
new_rows = @data.map { |vec| vec.at(*original_positions) }
|
308
|
+
return Daru::DataFrame.new new_rows,
|
309
|
+
index: @index.at(*original_positions),
|
310
|
+
order: @vectors
|
358
311
|
end
|
312
|
+
end
|
359
313
|
|
360
|
-
|
361
|
-
|
362
|
-
|
363
|
-
|
314
|
+
# Set rows by positions
|
315
|
+
# @param [Array<Integer>] positions positions of rows to set
|
316
|
+
# @param [Array, Daru::Vector] vector vector to be assigned
|
317
|
+
# @example
|
318
|
+
# df = Daru::DataFrame.new({
|
319
|
+
# a: [1, 2, 3],
|
320
|
+
# b: ['a', 'b', 'c']
|
321
|
+
# })
|
322
|
+
# df.set_row_at [0, 1], ['x', 'x']
|
323
|
+
# df
|
324
|
+
# #=> #<Daru::DataFrame(3x2)>
|
325
|
+
# # a b
|
326
|
+
# # 0 x x
|
327
|
+
# # 1 x x
|
328
|
+
# # 2 3 c
|
329
|
+
def set_row_at positions, vector
|
330
|
+
validate_positions(*positions, nrows)
|
331
|
+
vector =
|
332
|
+
if vector.is_a? Daru::Vector
|
333
|
+
vector.reindex @vectors
|
334
|
+
else
|
335
|
+
Daru::Vector.new vector
|
336
|
+
end
|
337
|
+
|
338
|
+
raise SizeError, 'Vector length should match row length' if
|
339
|
+
vector.size != @vectors.size
|
340
|
+
|
341
|
+
@data.each_with_index do |vec, pos|
|
342
|
+
vec.set_at(positions, vector.at(pos))
|
343
|
+
end
|
344
|
+
@index = @data[0].index
|
345
|
+
set_size
|
346
|
+
end
|
347
|
+
|
348
|
+
# Retrieve vectors by positions
|
349
|
+
# @param [Array<Integer>] *positions positions of vectors to retrieve
|
350
|
+
# @return [Daru::Vector, Daru::DataFrame] vector for single position and dataframe for multiple positions
|
351
|
+
# @example
|
352
|
+
# df = Daru::DataFrame.new({
|
353
|
+
# a: [1, 2, 3],
|
354
|
+
# b: ['a', 'b', 'c']
|
355
|
+
# })
|
356
|
+
# df.at 0
|
357
|
+
# # => #<Daru::Vector(3)>
|
358
|
+
# # a
|
359
|
+
# # 0 1
|
360
|
+
# # 1 2
|
361
|
+
# # 2 3
|
362
|
+
def at *positions
|
363
|
+
if AXES.include? positions.last
|
364
|
+
axis = positions.pop
|
365
|
+
return row_at(*positions) if axis == :row
|
366
|
+
end
|
367
|
+
|
368
|
+
original_positions = positions
|
369
|
+
positions = coerce_positions(*positions, ncols)
|
370
|
+
validate_positions(*positions, ncols)
|
371
|
+
|
372
|
+
if positions.is_a? Integer
|
373
|
+
@data[positions].dup
|
364
374
|
else
|
365
|
-
|
375
|
+
Daru::DataFrame.new positions.map { |pos| @data[pos].dup },
|
376
|
+
index: @index,
|
377
|
+
order: @vectors.at(*original_positions),
|
378
|
+
name: @name
|
366
379
|
end
|
367
380
|
end
|
368
381
|
|
382
|
+
# Set vectors by positions
|
383
|
+
# @param [Array<Integer>] positions positions of vectors to set
|
384
|
+
# @param [Array, Daru::Vector] vector vector to be assigned
|
385
|
+
# @example
|
386
|
+
# df = Daru::DataFrame.new({
|
387
|
+
# a: [1, 2, 3],
|
388
|
+
# b: ['a', 'b', 'c']
|
389
|
+
# })
|
390
|
+
# df.set_at [0], ['x', 'y', 'z']
|
391
|
+
# df
|
392
|
+
# #=> #<Daru::DataFrame(3x2)>
|
393
|
+
# # a b
|
394
|
+
# # 0 x a
|
395
|
+
# # 1 y b
|
396
|
+
# # 2 z c
|
397
|
+
def set_at positions, vector
|
398
|
+
if positions.last == :row
|
399
|
+
positions.pop
|
400
|
+
return set_row_at(positions, vector)
|
401
|
+
end
|
402
|
+
|
403
|
+
validate_positions(*positions, ncols)
|
404
|
+
vector =
|
405
|
+
if vector.is_a? Daru::Vector
|
406
|
+
vector.reindex @index
|
407
|
+
else
|
408
|
+
Daru::Vector.new vector
|
409
|
+
end
|
410
|
+
|
411
|
+
raise SizeError, 'Vector length should match index length' if
|
412
|
+
vector.size != @index.size
|
413
|
+
|
414
|
+
positions.each { |pos| @data[pos] = vector }
|
415
|
+
end
|
416
|
+
|
369
417
|
# Insert a new row/vector of the specified name or modify a previous row.
|
370
418
|
# Instead of using this method directly, use df.row[:a] = [1,2,3] to set/create
|
371
419
|
# a row ':a' to [1,2,3], or df.vector[:vec] = [1,2,3] for vectors.
|
@@ -374,25 +422,11 @@ module Daru
|
|
374
422
|
# of the vector will be matched against the row/vector indexes of the DataFrame
|
375
423
|
# before an insertion is performed. Unmatched indexes will be set to nil.
|
376
424
|
def []=(*args)
|
377
|
-
|
378
|
-
args
|
379
|
-
args
|
425
|
+
vector = args.pop
|
426
|
+
axis = extract_axis(args)
|
427
|
+
names = args
|
380
428
|
|
381
|
-
|
382
|
-
vector = args[-1]
|
383
|
-
|
384
|
-
if axis == :vector
|
385
|
-
insert_or_modify_vector name, vector
|
386
|
-
elsif axis == :row
|
387
|
-
insert_or_modify_row name, vector
|
388
|
-
else
|
389
|
-
raise IndexError, "Expected axis to be row or vector, not #{axis}."
|
390
|
-
end
|
391
|
-
end
|
392
|
-
|
393
|
-
# Access a vector by name.
|
394
|
-
def column name
|
395
|
-
vector[name]
|
429
|
+
dispatch_to_axis axis, :insert_or_modify, names, vector
|
396
430
|
end
|
397
431
|
|
398
432
|
def add_row row, index=nil
|
@@ -421,10 +455,7 @@ module Daru
|
|
421
455
|
def dup vectors_to_dup=nil
|
422
456
|
vectors_to_dup = @vectors.to_a unless vectors_to_dup
|
423
457
|
|
424
|
-
src = []
|
425
|
-
vectors_to_dup.each do |vec|
|
426
|
-
src << @data[@vectors[vec]].dup
|
427
|
-
end
|
458
|
+
src = vectors_to_dup.map { |vec| @data[@vectors[vec]].dup }
|
428
459
|
new_order = Daru::Index.new(vectors_to_dup)
|
429
460
|
|
430
461
|
Daru::DataFrame.new src, order: new_order, index: @index.dup, name: @name, clone: true
|
@@ -443,20 +474,18 @@ module Daru
|
|
443
474
|
# +vectors_to_clone+ - Names of vectors to clone. Optional. Will return
|
444
475
|
# a view of the whole data frame otherwise.
|
445
476
|
def clone *vectors_to_clone
|
446
|
-
vectors_to_clone.flatten!
|
477
|
+
vectors_to_clone.flatten! if ArrayHelper.array_of?(vectors_to_clone, Array)
|
447
478
|
vectors_to_clone = @vectors.to_a if vectors_to_clone.empty?
|
448
479
|
|
449
|
-
h = vectors_to_clone.
|
450
|
-
|
451
|
-
end
|
452
|
-
Daru::DataFrame.new(h, clone: false)
|
480
|
+
h = vectors_to_clone.map { |vec| [vec, self[vec]] }.to_h
|
481
|
+
Daru::DataFrame.new(h, clone: false, order: vectors_to_clone, name: @name)
|
453
482
|
end
|
454
483
|
|
455
484
|
# Returns a 'shallow' copy of DataFrame if missing data is not present,
|
456
485
|
# or a full copy of only valid data if missing data is present.
|
457
486
|
def clone_only_valid
|
458
|
-
if
|
459
|
-
|
487
|
+
if include_values?(*Daru::MISSING_VALUES)
|
488
|
+
reject_values(*Daru::MISSING_VALUES)
|
460
489
|
else
|
461
490
|
clone
|
462
491
|
end
|
@@ -465,19 +494,76 @@ module Daru
|
|
465
494
|
# Creates a new duplicate dataframe containing only rows
|
466
495
|
# without a single missing value.
|
467
496
|
def dup_only_valid vecs=nil
|
468
|
-
rows_with_nil = @data.
|
469
|
-
|
470
|
-
|
497
|
+
rows_with_nil = @data.map { |vec| vec.indexes(*Daru::MISSING_VALUES) }
|
498
|
+
.inject(&:concat)
|
499
|
+
.uniq
|
471
500
|
|
472
501
|
row_indexes = @index.to_a
|
473
502
|
(vecs.nil? ? self : dup(vecs)).row[*(row_indexes - rows_with_nil)]
|
474
503
|
end
|
504
|
+
deprecate :dup_only_valid, :reject_values, 2016, 10
|
505
|
+
|
506
|
+
# Returns a dataframe in which rows with any of the mentioned values
|
507
|
+
# are ignored.
|
508
|
+
# @param [Array] *values values to reject to form the new dataframe
|
509
|
+
# @return [Daru::DataFrame] Data Frame with only rows which don't
|
510
|
+
# contain the mentioned values
|
511
|
+
# @example
|
512
|
+
# df = Daru::DataFrame.new({
|
513
|
+
# a: [1, 2, 3, nil, Float::NAN, nil, 1, 7],
|
514
|
+
# b: [:a, :b, nil, Float::NAN, nil, 3, 5, 8],
|
515
|
+
# c: ['a', Float::NAN, 3, 4, 3, 5, nil, 7]
|
516
|
+
# }, index: 11..18)
|
517
|
+
# df.reject_values nil, Float::NAN
|
518
|
+
# # => #<Daru::DataFrame(2x3)>
|
519
|
+
# # a b c
|
520
|
+
# # 11 1 a a
|
521
|
+
# # 18 7 8 7
|
522
|
+
def reject_values(*values)
|
523
|
+
positions =
|
524
|
+
size.times.to_a - @data.flat_map { |vec| vec.positions(*values) }
|
525
|
+
# Handle the case when positions size is 1 and #row_at wouldn't return a df
|
526
|
+
if positions.size == 1
|
527
|
+
pos = positions.first
|
528
|
+
row_at(pos..pos)
|
529
|
+
else
|
530
|
+
row_at(*positions)
|
531
|
+
end
|
532
|
+
end
|
533
|
+
|
534
|
+
# Replace specified values with given value
|
535
|
+
# @param [Array] old_values values to replace with new value
|
536
|
+
# @param [object] new_value new value to replace with
|
537
|
+
# @return [Daru::DataFrame] Data Frame itself with old values replaced
|
538
|
+
# with new value
|
539
|
+
# @example
|
540
|
+
# df = Daru::DataFrame.new({
|
541
|
+
# a: [1, 2, 3, nil, Float::NAN, nil, 1, 7],
|
542
|
+
# b: [:a, :b, nil, Float::NAN, nil, 3, 5, 8],
|
543
|
+
# c: ['a', Float::NAN, 3, 4, 3, 5, nil, 7]
|
544
|
+
# }, index: 11..18)
|
545
|
+
# df
|
546
|
+
# # => #<Daru::DataFrame(8x3)>
|
547
|
+
# # a b c
|
548
|
+
# # 11 1 a a
|
549
|
+
# # 12 2 b NaN
|
550
|
+
# # 13 3 NaN 3
|
551
|
+
# # 14 NaN NaN 4
|
552
|
+
# # 15 NaN NaN 3
|
553
|
+
# # 16 NaN 3 5
|
554
|
+
# # 17 1 5 NaN
|
555
|
+
# # 18 7 8 7
|
556
|
+
def replace_values old_values, new_value
|
557
|
+
@data.each { |vec| vec.replace_values old_values, new_value }
|
558
|
+
self
|
559
|
+
end
|
475
560
|
|
476
561
|
# Iterate over each index of the DataFrame.
|
477
562
|
def each_index &block
|
478
563
|
return to_enum(:each_index) unless block_given?
|
479
564
|
|
480
565
|
@index.each(&block)
|
566
|
+
|
481
567
|
self
|
482
568
|
end
|
483
569
|
|
@@ -509,8 +595,8 @@ module Daru
|
|
509
595
|
def each_row
|
510
596
|
return to_enum(:each_row) unless block_given?
|
511
597
|
|
512
|
-
@index.
|
513
|
-
yield
|
598
|
+
@index.size.times do |pos|
|
599
|
+
yield row_at(pos)
|
514
600
|
end
|
515
601
|
|
516
602
|
self
|
@@ -540,13 +626,7 @@ module Daru
|
|
540
626
|
# * +axis+ - The axis to iterate over. Can be :vector (or :column)
|
541
627
|
# or :row. Default to :vector.
|
542
628
|
def each axis=:vector, &block
|
543
|
-
|
544
|
-
each_vector(&block)
|
545
|
-
elsif axis == :row
|
546
|
-
each_row(&block)
|
547
|
-
else
|
548
|
-
raise ArgumentError, "Unknown axis #{axis}"
|
549
|
-
end
|
629
|
+
dispatch_to_axis axis, :each, &block
|
550
630
|
end
|
551
631
|
|
552
632
|
# Iterate over a row or vector and return results in a Daru::Vector.
|
@@ -565,13 +645,7 @@ module Daru
|
|
565
645
|
# * +axis+ - The axis to iterate over. Can be :vector (or :column)
|
566
646
|
# or :row. Default to :vector.
|
567
647
|
def collect axis=:vector, &block
|
568
|
-
|
569
|
-
collect_vectors(&block)
|
570
|
-
elsif axis == :row
|
571
|
-
collect_rows(&block)
|
572
|
-
else
|
573
|
-
raise ArgumentError, "Unknown axis #{axis}"
|
574
|
-
end
|
648
|
+
dispatch_to_axis_pl axis, :collect, &block
|
575
649
|
end
|
576
650
|
|
577
651
|
# Map over each vector or row of the data frame according to
|
@@ -591,13 +665,7 @@ module Daru
|
|
591
665
|
# * +axis+ - The axis to map over. Can be :vector (or :column) or :row.
|
592
666
|
# Default to :vector.
|
593
667
|
def map axis=:vector, &block
|
594
|
-
|
595
|
-
map_vectors(&block)
|
596
|
-
elsif axis == :row
|
597
|
-
map_rows(&block)
|
598
|
-
else
|
599
|
-
raise ArgumentError, "Unknown axis #{axis}"
|
600
|
-
end
|
668
|
+
dispatch_to_axis_pl axis, :map, &block
|
601
669
|
end
|
602
670
|
|
603
671
|
# Destructive map. Modifies the DataFrame. Each run of the block
|
@@ -634,11 +702,7 @@ module Daru
|
|
634
702
|
# * +axis+ - The axis to map over. Can be :vector (or :column) or :row.
|
635
703
|
# Default to :vector.
|
636
704
|
def recode axis=:vector, &block
|
637
|
-
|
638
|
-
recode_vectors(&block)
|
639
|
-
elsif axis == :row
|
640
|
-
recode_rows(&block)
|
641
|
-
end
|
705
|
+
dispatch_to_axis_pl axis, :recode, &block
|
642
706
|
end
|
643
707
|
|
644
708
|
# Retain vectors or rows if the block returns a truthy value.
|
@@ -670,50 +734,34 @@ module Daru
|
|
670
734
|
# row[:a] + row[:d] < 100
|
671
735
|
# end
|
672
736
|
def filter axis=:vector, &block
|
673
|
-
|
674
|
-
filter_vectors(&block)
|
675
|
-
elsif axis == :row
|
676
|
-
filter_rows(&block)
|
677
|
-
end
|
737
|
+
dispatch_to_axis_pl axis, :filter, &block
|
678
738
|
end
|
679
739
|
|
680
740
|
def recode_vectors
|
681
741
|
block_given? or return to_enum(:recode_vectors)
|
682
742
|
|
683
|
-
|
684
|
-
|
685
|
-
|
686
|
-
|
687
|
-
raise TypeError, "Every iteration must return Daru::Vector not #{ret.class}"
|
688
|
-
df[*i] = ret
|
743
|
+
dup.tap do |df|
|
744
|
+
df.each_vector_with_index do |v, i|
|
745
|
+
df[*i] = should_be_vector!(yield(v))
|
746
|
+
end
|
689
747
|
end
|
690
|
-
|
691
|
-
df
|
692
748
|
end
|
693
749
|
|
694
750
|
def recode_rows
|
695
751
|
block_given? or return to_enum(:recode_rows)
|
696
752
|
|
697
|
-
|
698
|
-
|
699
|
-
|
700
|
-
|
701
|
-
df.row[i] = ret
|
753
|
+
dup.tap do |df|
|
754
|
+
df.each_row_with_index do |r, i|
|
755
|
+
df.row[i] = should_be_vector!(yield(r))
|
756
|
+
end
|
702
757
|
end
|
703
|
-
|
704
|
-
df
|
705
758
|
end
|
706
759
|
|
707
760
|
# Map each vector and return an Array.
|
708
|
-
def map_vectors
|
761
|
+
def map_vectors &block
|
709
762
|
return to_enum(:map_vectors) unless block_given?
|
710
763
|
|
711
|
-
|
712
|
-
@data.each do |vec|
|
713
|
-
arry << yield(vec)
|
714
|
-
end
|
715
|
-
|
716
|
-
arry
|
764
|
+
@data.map(&block)
|
717
765
|
end
|
718
766
|
|
719
767
|
# Destructive form of #map_vectors
|
@@ -721,56 +769,37 @@ module Daru
|
|
721
769
|
return to_enum(:map_vectors!) unless block_given?
|
722
770
|
|
723
771
|
vectors.dup.each do |n|
|
724
|
-
|
725
|
-
v.is_a?(Daru::Vector) or raise TypeError, "Must return a Daru::Vector not #{v.class}"
|
726
|
-
self[n] = v
|
772
|
+
self[n] = should_be_vector!(yield(self[n]))
|
727
773
|
end
|
728
774
|
|
729
775
|
self
|
730
776
|
end
|
731
777
|
|
732
778
|
# Map vectors along with the index.
|
733
|
-
def map_vectors_with_index
|
779
|
+
def map_vectors_with_index &block
|
734
780
|
return to_enum(:map_vectors_with_index) unless block_given?
|
735
781
|
|
736
|
-
|
737
|
-
each_vector_with_index do |vector, name|
|
738
|
-
dt << yield(vector, name)
|
739
|
-
end
|
740
|
-
|
741
|
-
dt
|
782
|
+
each_vector_with_index.map(&block)
|
742
783
|
end
|
743
784
|
|
744
785
|
# Map each row
|
745
|
-
def map_rows
|
786
|
+
def map_rows &block
|
746
787
|
return to_enum(:map_rows) unless block_given?
|
747
788
|
|
748
|
-
|
749
|
-
each_row do |row|
|
750
|
-
dt << yield(row)
|
751
|
-
end
|
752
|
-
|
753
|
-
dt
|
789
|
+
each_row.map(&block)
|
754
790
|
end
|
755
791
|
|
756
|
-
def map_rows_with_index
|
792
|
+
def map_rows_with_index &block
|
757
793
|
return to_enum(:map_rows_with_index) unless block_given?
|
758
794
|
|
759
|
-
|
760
|
-
each_row_with_index do |row, index|
|
761
|
-
dt << yield(row, index)
|
762
|
-
end
|
763
|
-
|
764
|
-
dt
|
795
|
+
each_row_with_index.map(&block)
|
765
796
|
end
|
766
797
|
|
767
798
|
def map_rows!
|
768
799
|
return to_enum(:map_rows!) unless block_given?
|
769
800
|
|
770
801
|
index.dup.each do |i|
|
771
|
-
|
772
|
-
r.is_a?(Daru::Vector) or raise TypeError, "Returned object must be Daru::Vector not #{r.class}"
|
773
|
-
row[i] = r
|
802
|
+
row[i] = should_be_vector!(yield(row[i]))
|
774
803
|
end
|
775
804
|
|
776
805
|
self
|
@@ -778,55 +807,38 @@ module Daru
|
|
778
807
|
|
779
808
|
# Retrieves a Daru::Vector, based on the result of calculation
|
780
809
|
# performed on each row.
|
781
|
-
def collect_rows
|
810
|
+
def collect_rows &block
|
782
811
|
return to_enum(:collect_rows) unless block_given?
|
783
812
|
|
784
|
-
|
785
|
-
each_row do |row|
|
786
|
-
data.push yield(row)
|
787
|
-
end
|
788
|
-
|
789
|
-
Daru::Vector.new(data, index: @index)
|
813
|
+
Daru::Vector.new(each_row.map(&block), index: @index)
|
790
814
|
end
|
791
815
|
|
792
|
-
def collect_row_with_index
|
816
|
+
def collect_row_with_index &block
|
793
817
|
return to_enum(:collect_row_with_index) unless block_given?
|
794
818
|
|
795
|
-
|
796
|
-
each_row_with_index do |row, i|
|
797
|
-
data.push yield(row, i)
|
798
|
-
end
|
799
|
-
|
800
|
-
Daru::Vector.new(data, index: @index)
|
819
|
+
Daru::Vector.new(each_row_with_index.map(&block), index: @index)
|
801
820
|
end
|
802
821
|
|
803
822
|
# Retrieves a Daru::Vector, based on the result of calculation
|
804
823
|
# performed on each vector.
|
805
|
-
def collect_vectors
|
824
|
+
def collect_vectors &block
|
806
825
|
return to_enum(:collect_vectors) unless block_given?
|
807
826
|
|
808
|
-
|
809
|
-
each_vector do |vec|
|
810
|
-
data.push yield(vec)
|
811
|
-
end
|
812
|
-
|
813
|
-
Daru::Vector.new(data, index: @vectors)
|
827
|
+
Daru::Vector.new(each_vector.map(&block), index: @vectors)
|
814
828
|
end
|
815
829
|
|
816
|
-
def collect_vector_with_index
|
830
|
+
def collect_vector_with_index &block
|
817
831
|
return to_enum(:collect_vector_with_index) unless block_given?
|
818
832
|
|
819
|
-
|
820
|
-
each_vector_with_index do |vec, i|
|
821
|
-
data.push yield(vec, i)
|
822
|
-
end
|
823
|
-
|
824
|
-
Daru::Vector.new(data, index: @vectors)
|
833
|
+
Daru::Vector.new(each_vector_with_index.map(&block), index: @vectors)
|
825
834
|
end
|
826
835
|
|
827
836
|
# Generate a matrix, based on vector names of the DataFrame.
|
828
837
|
#
|
829
838
|
# @return {::Matrix}
|
839
|
+
# :nocov:
|
840
|
+
# FIXME: Not even trying to cover this: I can't work out how it is expected
|
841
|
+
# to work.... -- zverok
|
830
842
|
def collect_matrix
|
831
843
|
return to_enum(:collect_matrix) unless block_given?
|
832
844
|
|
@@ -839,6 +851,7 @@ module Daru
|
|
839
851
|
|
840
852
|
Matrix.rows(rows)
|
841
853
|
end
|
854
|
+
# :nocov:
|
842
855
|
|
843
856
|
# Delete a vector
|
844
857
|
def delete_vector vector
|
@@ -876,43 +889,29 @@ module Daru
|
|
876
889
|
# @return {Daru::DataFrame}
|
877
890
|
def bootstrap(n=nil)
|
878
891
|
n ||= nrows
|
879
|
-
|
880
|
-
|
881
|
-
|
892
|
+
Daru::DataFrame.new({}, order: @vectors).tap do |df_boot|
|
893
|
+
n.times do
|
894
|
+
df_boot.add_row(row[rand(n)])
|
895
|
+
end
|
896
|
+
df_boot.update
|
882
897
|
end
|
883
|
-
ds_boot.update
|
884
|
-
ds_boot
|
885
898
|
end
|
886
899
|
|
887
900
|
def keep_row_if
|
888
|
-
|
889
|
-
|
890
|
-
|
891
|
-
keep_row = yield access_row(index)
|
892
|
-
|
893
|
-
deletion << index unless keep_row
|
894
|
-
end
|
895
|
-
deletion.each { |idx|
|
896
|
-
delete_row idx
|
897
|
-
}
|
901
|
+
@index
|
902
|
+
.reject { |idx| yield access_row(idx) }
|
903
|
+
.each { |idx| delete_row idx }
|
898
904
|
end
|
899
905
|
|
900
906
|
def keep_vector_if
|
901
907
|
@vectors.each do |vector|
|
902
|
-
|
903
|
-
|
904
|
-
delete_vector vector unless keep_vector
|
908
|
+
delete_vector(vector) unless yield(@data[@vectors[vector]], vector)
|
905
909
|
end
|
906
910
|
end
|
907
911
|
|
908
912
|
# Creates a new vector with the data of a given field for which the block returns true
|
909
|
-
def filter_vector vec
|
910
|
-
|
911
|
-
each_row do |row|
|
912
|
-
d.push(row[vec]) if yield row
|
913
|
-
end
|
914
|
-
|
915
|
-
Daru::Vector.new(d, metadata: self[vec].metadata.dup)
|
913
|
+
def filter_vector vec, &block
|
914
|
+
Daru::Vector.new each_row.select(&block).map { |row| row[vec] }
|
916
915
|
end
|
917
916
|
|
918
917
|
# Iterates over each row and retains it in a new DataFrame if the block returns
|
@@ -930,38 +929,24 @@ module Daru
|
|
930
929
|
def filter_vectors &block
|
931
930
|
return to_enum(:filter_vectors) unless block_given?
|
932
931
|
|
933
|
-
df
|
934
|
-
df.keep_vector_if(&block)
|
935
|
-
|
936
|
-
df
|
932
|
+
dup.tap { |df| df.keep_vector_if(&block) }
|
937
933
|
end
|
938
934
|
|
939
935
|
# Test each row with one or more tests. Each test is a Proc with the form
|
940
936
|
# *Proc.new {|row| row[:age] > 0}*
|
941
937
|
#
|
942
938
|
# The function returns an array with all errors.
|
939
|
+
#
|
940
|
+
# FIXME: description here is too sparse. As far as I can get,
|
941
|
+
# it should tell something about that each test is [descr, fields, block],
|
942
|
+
# and that first value may be column name to output. - zverok, 2016-05-18
|
943
943
|
def verify(*tests)
|
944
|
-
|
945
|
-
|
946
|
-
|
947
|
-
|
948
|
-
|
949
|
-
end
|
950
|
-
|
951
|
-
vr = []
|
952
|
-
i = 0
|
953
|
-
each(:row) do |row|
|
954
|
-
i += 1
|
955
|
-
tests.each do |test|
|
956
|
-
next if test[2].call(row)
|
957
|
-
values = ''
|
958
|
-
unless test[1].empty?
|
959
|
-
values = ' (' + test[1].collect { |k| "#{k}=#{row[k]}" }.join(', ') + ')'
|
960
|
-
end
|
961
|
-
vr.push("#{i} [#{row[id]}]: #{test[0]}#{values}")
|
962
|
-
end
|
963
|
-
end
|
964
|
-
vr
|
944
|
+
id = tests.first.is_a?(Symbol) ? tests.shift : @vectors.first
|
945
|
+
|
946
|
+
each_row_with_index.map do |row, i|
|
947
|
+
tests.reject { |*_, block| block.call(row) }
|
948
|
+
.map { |test| verify_error_message row, test, id, i }
|
949
|
+
end.flatten
|
965
950
|
end
|
966
951
|
|
967
952
|
# DSL for yielding each row and returning a Daru::Vector based on the
|
@@ -984,10 +969,7 @@ module Daru
|
|
984
969
|
# # 5 666
|
985
970
|
# # 6 777
|
986
971
|
def vector_by_calculation &block
|
987
|
-
a =
|
988
|
-
each_row do |r|
|
989
|
-
a.push r.instance_eval(&block)
|
990
|
-
end
|
972
|
+
a = each_row.map { |r| r.instance_eval(&block) }
|
991
973
|
|
992
974
|
Daru::Vector.new a, index: @index
|
993
975
|
end
|
@@ -1016,10 +998,8 @@ module Daru
|
|
1016
998
|
# * +missing_values+ - An Array of the values that should be
|
1017
999
|
# treated as 'missing'. The default missing value is *nil*.
|
1018
1000
|
def missing_values_rows missing_values=[nil]
|
1019
|
-
number_of_missing =
|
1020
|
-
|
1021
|
-
row.missing_values = missing_values
|
1022
|
-
number_of_missing << row.missing_positions.size
|
1001
|
+
number_of_missing = each_row.map do |row|
|
1002
|
+
row.indexes(*missing_values).size
|
1023
1003
|
end
|
1024
1004
|
|
1025
1005
|
Daru::Vector.new number_of_missing, index: @index, name: "#{@name}_missing_rows"
|
@@ -1029,67 +1009,77 @@ module Daru
|
|
1029
1009
|
alias :vector_missing_values :missing_values_rows
|
1030
1010
|
|
1031
1011
|
def has_missing_data?
|
1032
|
-
!!@data.any?(
|
1012
|
+
!!@data.any? { |vec| vec.include_values?(*Daru::MISSING_VALUES) }
|
1033
1013
|
end
|
1034
|
-
|
1035
1014
|
alias :flawed? :has_missing_data?
|
1015
|
+
deprecate :has_missing_data?, :include_values?, 2016, 10
|
1016
|
+
deprecate :flawed?, :include_values?, 2016, 10
|
1017
|
+
|
1018
|
+
# Check if any of given values occur in the data frame
|
1019
|
+
# @param [Array] *values values to check for
|
1020
|
+
# @return [true, false] true if any of the given values occur in the
|
1021
|
+
# dataframe, false otherwise
|
1022
|
+
# @example
|
1023
|
+
# df = Daru::DataFrame.new({
|
1024
|
+
# a: [1, 2, 3, nil, Float::NAN, nil, 1, 7],
|
1025
|
+
# b: [:a, :b, nil, Float::NAN, nil, 3, 5, 8],
|
1026
|
+
# c: ['a', Float::NAN, 3, 4, 3, 5, nil, 7]
|
1027
|
+
# }, index: 11..18)
|
1028
|
+
# df.include_values? nil
|
1029
|
+
# # => true
|
1030
|
+
def include_values?(*values)
|
1031
|
+
@data.any? { |vec| vec.include_values?(*values) }
|
1032
|
+
end
|
1036
1033
|
|
1037
1034
|
# Return a nested hash using vector names as keys and an array constructed of
|
1038
1035
|
# hashes with other values. If block provided, is used to provide the
|
1039
1036
|
# values, with parameters +row+ of dataset, +current+ last hash on
|
1040
1037
|
# hierarchy and +name+ of the key to include
|
1041
|
-
def nest *tree_keys, &
|
1038
|
+
def nest *tree_keys, &_block
|
1042
1039
|
tree_keys = tree_keys[0] if tree_keys[0].is_a? Array
|
1043
|
-
out = {}
|
1044
1040
|
|
1045
|
-
each_row do |row|
|
1046
|
-
current = out
|
1041
|
+
each_row.each_with_object({}) do |row, current|
|
1047
1042
|
# Create tree
|
1048
|
-
|
1049
|
-
|
1050
|
-
|
1051
|
-
|
1052
|
-
|
1053
|
-
|
1054
|
-
|
1043
|
+
*keys, last = tree_keys
|
1044
|
+
current = keys.inject(current) { |c, f| c[row[f]] ||= {} }
|
1045
|
+
name = row[last]
|
1046
|
+
|
1047
|
+
if block_given?
|
1048
|
+
current[name] = yield(row, current, name)
|
1049
|
+
else
|
1055
1050
|
current[name] ||= []
|
1056
1051
|
current[name].push(row.to_h.delete_if { |key,_value| tree_keys.include? key })
|
1057
|
-
else
|
1058
|
-
current[name] = yield(row, current, name)
|
1059
1052
|
end
|
1060
1053
|
end
|
1061
|
-
|
1062
|
-
out
|
1063
1054
|
end
|
1064
1055
|
|
1065
1056
|
def vector_count_characters vecs=nil
|
1066
1057
|
vecs ||= @vectors.to_a
|
1067
1058
|
|
1068
1059
|
collect_rows do |row|
|
1069
|
-
vecs.
|
1070
|
-
memo + (row[vec].nil? ? 0 : row[vec].to_s.size)
|
1071
|
-
end
|
1060
|
+
vecs.map { |v| row[v].to_s.size }.inject(:+)
|
1072
1061
|
end
|
1073
1062
|
end
|
1074
1063
|
|
1075
1064
|
def add_vectors_by_split(name,join='-',sep=Daru::SPLIT_TOKEN)
|
1076
|
-
|
1077
|
-
|
1065
|
+
self[name]
|
1066
|
+
.split_by_separator(sep)
|
1067
|
+
.each { |k,v| self["#{name}#{join}#{k}".to_sym] = v }
|
1078
1068
|
end
|
1079
1069
|
|
1080
1070
|
# Return the number of rows and columns of the DataFrame in an Array.
|
1081
1071
|
def shape
|
1082
|
-
[
|
1072
|
+
[nrows, ncols]
|
1083
1073
|
end
|
1084
1074
|
|
1085
1075
|
# The number of rows
|
1086
1076
|
def nrows
|
1087
|
-
|
1077
|
+
@index.size
|
1088
1078
|
end
|
1089
1079
|
|
1090
1080
|
# The number of vectors
|
1091
1081
|
def ncols
|
1092
|
-
|
1082
|
+
@vectors.size
|
1093
1083
|
end
|
1094
1084
|
|
1095
1085
|
# Check if a vector is present
|
@@ -1132,10 +1122,7 @@ module Daru
|
|
1132
1122
|
if axis == :vector || axis == :column
|
1133
1123
|
@data.all?(&block)
|
1134
1124
|
elsif axis == :row
|
1135
|
-
each_row
|
1136
|
-
return false unless yield(row)
|
1137
|
-
end
|
1138
|
-
return true
|
1125
|
+
each_row.all?(&block)
|
1139
1126
|
else
|
1140
1127
|
raise ArgumentError, "Unidentified axis #{axis}"
|
1141
1128
|
end
|
@@ -1145,7 +1132,7 @@ module Daru
|
|
1145
1132
|
#
|
1146
1133
|
# @param [Fixnum] quantity (10) The number of elements to display from the top.
|
1147
1134
|
def head quantity=10
|
1148
|
-
|
1135
|
+
row.at 0..(quantity-1)
|
1149
1136
|
end
|
1150
1137
|
|
1151
1138
|
alias :first :head
|
@@ -1154,22 +1141,19 @@ module Daru
|
|
1154
1141
|
#
|
1155
1142
|
# @param [Fixnum] quantity (10) The number of elements to display from the bottom.
|
1156
1143
|
def tail quantity=10
|
1157
|
-
|
1144
|
+
start = [-quantity, -size].max
|
1145
|
+
row.at start..-1
|
1158
1146
|
end
|
1159
1147
|
|
1160
1148
|
alias :last :tail
|
1161
1149
|
|
1162
1150
|
# Returns a vector with sum of all vectors specified in the argument.
|
1163
|
-
#
|
1151
|
+
# If the vecs parameter is omitted, sums all numeric vectors.
|
1164
1152
|
def vector_sum vecs=nil
|
1165
1153
|
vecs ||= numeric_vectors
|
1166
1154
|
sum = Daru::Vector.new [0]*@size, index: @index, name: @name, dtype: @dtype
|
1167
1155
|
|
1168
|
-
vecs.
|
1169
|
-
sum += self[n]
|
1170
|
-
end
|
1171
|
-
|
1172
|
-
sum
|
1156
|
+
vecs.inject(sum) { |memo, n| memo + self[n] }
|
1173
1157
|
end
|
1174
1158
|
|
1175
1159
|
# Calculate mean of the rows of the dataframe.
|
@@ -1179,13 +1163,13 @@ module Daru
|
|
1179
1163
|
# * +max_missing+ - The maximum number of elements in the row that can be
|
1180
1164
|
# missing for the mean calculation to happen. Defaults to 0.
|
1181
1165
|
def vector_mean max_missing=0
|
1166
|
+
# FIXME: in vector_sum we preserve created vector dtype, but
|
1167
|
+
# here we are not. Is this by design or ...? - zverok, 2016-05-18
|
1182
1168
|
mean_vec = Daru::Vector.new [0]*@size, index: @index, name: "mean_#{@name}"
|
1183
1169
|
|
1184
|
-
each_row_with_index do |row, i|
|
1185
|
-
|
1170
|
+
each_row_with_index.each_with_object(mean_vec) do |(row, i), memo|
|
1171
|
+
memo[i] = row.indexes(*Daru::MISSING_VALUES).size > max_missing ? nil : row.mean
|
1186
1172
|
end
|
1187
|
-
|
1188
|
-
mean_vec
|
1189
1173
|
end
|
1190
1174
|
|
1191
1175
|
# Group elements by vector to perform operations on them. Returns a
|
@@ -1214,6 +1198,8 @@ module Daru
|
|
1214
1198
|
# # ["foo", "two", 3]=>[2, 4]}
|
1215
1199
|
def group_by *vectors
|
1216
1200
|
vectors.flatten!
|
1201
|
+
# FIXME: wouldn't it be better to do vectors - @vectors here and
|
1202
|
+
# raise one error with all non-existent vector names?.. - zverok, 2016-05-18
|
1217
1203
|
vectors.each { |v|
|
1218
1204
|
raise(ArgumentError, "Vector #{v} does not exist") unless has_vector?(v)
|
1219
1205
|
}
|
@@ -1226,28 +1212,22 @@ module Daru
|
|
1226
1212
|
"subclasses, not #{new_index.class}" unless new_vectors.is_a?(Daru::Index)
|
1227
1213
|
|
1228
1214
|
cl = Daru::DataFrame.new({}, order: new_vectors, index: @index, name: @name)
|
1229
|
-
new_vectors.
|
1230
|
-
|
1215
|
+
new_vectors.each_with_object(cl) do |vec, memo|
|
1216
|
+
memo[vec] = @vectors.include?(vec) ? self[vec] : [nil]*nrows
|
1231
1217
|
end
|
1218
|
+
end
|
1232
1219
|
|
1233
|
-
|
1220
|
+
def get_vector_anyways(v)
|
1221
|
+
@vectors.include?(v) ? self[v].to_a : [nil] * size
|
1234
1222
|
end
|
1235
1223
|
|
1236
1224
|
# Concatenate another DataFrame along corresponding columns.
|
1237
1225
|
# If columns do not exist in both dataframes, they are filled with nils
|
1238
1226
|
def concat other_df
|
1239
|
-
vectors = @vectors.to_a
|
1240
|
-
data = []
|
1227
|
+
vectors = (@vectors.to_a + other_df.vectors.to_a).uniq
|
1241
1228
|
|
1242
|
-
vectors.
|
1243
|
-
|
1244
|
-
data << self[v].dup.to_a.concat(other_vec)
|
1245
|
-
end
|
1246
|
-
|
1247
|
-
other_df.vectors.each do |v|
|
1248
|
-
next if vectors.include?(v)
|
1249
|
-
vectors << v
|
1250
|
-
data << ([nil] * size).concat(other_df[v].to_a)
|
1229
|
+
data = vectors.map do |v|
|
1230
|
+
get_vector_anyways(v).dup.concat(other_df.get_vector_anyways(v))
|
1251
1231
|
end
|
1252
1232
|
|
1253
1233
|
Daru::DataFrame.new(data, order: vectors)
|
@@ -1291,11 +1271,9 @@ module Daru
|
|
1291
1271
|
"subclasses, not #{new_index.class}" unless new_index.is_a?(Daru::Index)
|
1292
1272
|
|
1293
1273
|
cl = Daru::DataFrame.new({}, order: @vectors, index: new_index, name: @name)
|
1294
|
-
new_index.
|
1295
|
-
|
1274
|
+
new_index.each_with_object(cl) do |idx, memo|
|
1275
|
+
memo.row[idx] = @index.include?(idx) ? row[idx] : [nil]*ncols
|
1296
1276
|
end
|
1297
|
-
|
1298
|
-
cl
|
1299
1277
|
end
|
1300
1278
|
|
1301
1279
|
# Reassign index with a new index of type Daru::Index or any of its subclasses.
|
@@ -1310,8 +1288,8 @@ module Daru
|
|
1310
1288
|
# df.index.to_a #=> ['a','b','c','d']
|
1311
1289
|
# df.row['a'].to_a #=> [1,11]
|
1312
1290
|
def index= idx
|
1313
|
-
@
|
1314
|
-
@index =
|
1291
|
+
@index = Index.coerce idx
|
1292
|
+
@data.each { |vec| vec.index = @index }
|
1315
1293
|
|
1316
1294
|
self
|
1317
1295
|
end
|
@@ -1361,21 +1339,14 @@ module Daru
|
|
1361
1339
|
# Return the indexes of all the numeric vectors. Will include vectors with nils
|
1362
1340
|
# along with numbers.
|
1363
1341
|
def numeric_vectors
|
1364
|
-
|
1365
|
-
|
1366
|
-
|
1367
|
-
|
1368
|
-
end
|
1369
|
-
numerics
|
1342
|
+
# FIXME: Why _with_index ?..
|
1343
|
+
each_vector_with_index
|
1344
|
+
.select { |vec, _i| vec.numeric? }
|
1345
|
+
.map(&:last)
|
1370
1346
|
end
|
1371
1347
|
|
1372
1348
|
def numeric_vector_names
|
1373
|
-
|
1374
|
-
|
1375
|
-
@vectors.each do |v|
|
1376
|
-
numerics << v if self[v].type == :numeric
|
1377
|
-
end
|
1378
|
-
numerics
|
1349
|
+
@vectors.select { |v| self[v].numeric? }
|
1379
1350
|
end
|
1380
1351
|
|
1381
1352
|
# Return a DataFrame of only the numerical Vectors. If clone: false
|
@@ -1383,12 +1354,9 @@ module Daru
|
|
1383
1354
|
# returned. Defaults to clone: true.
|
1384
1355
|
def only_numerics opts={}
|
1385
1356
|
cln = opts[:clone] == false ? false : true
|
1386
|
-
|
1387
|
-
arry = nv.each_with_object([]) do |v, arr|
|
1388
|
-
arr << self[v]
|
1389
|
-
end
|
1357
|
+
arry = numeric_vectors.map { |v| self[v] }
|
1390
1358
|
|
1391
|
-
order = Index.new(
|
1359
|
+
order = Index.new(numeric_vectors)
|
1392
1360
|
Daru::DataFrame.new(arry, clone: cln, order: order, index: @index)
|
1393
1361
|
end
|
1394
1362
|
|
@@ -1492,39 +1460,24 @@ module Daru
|
|
1492
1460
|
|
1493
1461
|
def sort! vector_order, opts={}
|
1494
1462
|
raise ArgumentError, 'Required atleast one vector name' if vector_order.empty?
|
1495
|
-
opts = {
|
1496
|
-
ascending: true,
|
1497
|
-
handle_nils: false,
|
1498
|
-
by: {}
|
1499
|
-
}.merge(opts)
|
1500
1463
|
|
1501
|
-
|
1502
|
-
|
1503
|
-
|
1464
|
+
# To enable sorting with categorical data,
|
1465
|
+
# map categories to integers preserving their order
|
1466
|
+
old = convert_categorical_vectors vector_order
|
1467
|
+
block = sort_prepare_block vector_order, opts
|
1504
1468
|
|
1505
|
-
|
1506
|
-
|
1507
|
-
left = build_array_from_blocks vector_order, opts, blocks, r1, r2
|
1508
|
-
right = build_array_from_blocks vector_order, opts, blocks, r2, r1
|
1469
|
+
order = @index.size.times.sort(&block)
|
1470
|
+
new_index = @index.reorder order
|
1509
1471
|
|
1510
|
-
|
1511
|
-
|
1512
|
-
right << r2
|
1513
|
-
left <=> right
|
1514
|
-
end
|
1472
|
+
# To reverse the mapping of categorical data to integers
|
1473
|
+
restore_categorical_vectors old
|
1515
1474
|
|
1516
|
-
|
1517
|
-
|
1518
|
-
old_index = @index.to_a
|
1519
|
-
self.index = Daru::Index.new(idx.map { |i| old_index[i] })
|
1520
|
-
|
1521
|
-
vectors.each do |v|
|
1522
|
-
@data[@vectors[v]] = Daru::Vector.new(
|
1523
|
-
idx.map { |i| @data[@vectors[v]].data[i] },
|
1524
|
-
name: self[v].name, metadata: self[v].metadata.dup, index: index
|
1525
|
-
)
|
1475
|
+
@data.each do |vector|
|
1476
|
+
vector.reorder! order
|
1526
1477
|
end
|
1527
1478
|
|
1479
|
+
self.index = new_index
|
1480
|
+
|
1528
1481
|
self
|
1529
1482
|
end
|
1530
1483
|
|
@@ -1568,90 +1521,41 @@ module Daru
|
|
1568
1521
|
# # [:bar] 18 26
|
1569
1522
|
# # [:foo] 10 12
|
1570
1523
|
def pivot_table opts={}
|
1571
|
-
raise ArgumentError,
|
1572
|
-
'Specify grouping index' if !opts[:index] || opts[:index].empty?
|
1573
|
-
|
1574
|
-
index = opts[:index]
|
1575
|
-
vectors = opts[:vectors] || []
|
1576
|
-
aggregate_function = opts[:agg] || :mean
|
1577
|
-
values =
|
1578
|
-
if opts[:values].is_a?(Symbol)
|
1579
|
-
[opts[:values]]
|
1580
|
-
elsif opts[:values].is_a?(Array)
|
1581
|
-
opts[:values]
|
1582
|
-
else # nil
|
1583
|
-
(@vectors.to_a - (index | vectors)) & numeric_vector_names
|
1584
|
-
end
|
1524
|
+
raise ArgumentError, 'Specify grouping index' if opts[:index].to_a.empty?
|
1585
1525
|
|
1526
|
+
index = opts[:index]
|
1527
|
+
vectors = opts[:vectors] || []
|
1528
|
+
aggregate_function = opts[:agg] || :mean
|
1529
|
+
values = prepare_pivot_values index, vectors, opts
|
1586
1530
|
raise IndexError, 'No numeric vectors to aggregate' if values.empty?
|
1587
1531
|
|
1588
1532
|
grouped = group_by(index)
|
1533
|
+
return grouped.send(aggregate_function) if vectors.empty?
|
1589
1534
|
|
1590
|
-
|
1591
|
-
grouped.send(aggregate_function)
|
1592
|
-
else
|
1593
|
-
super_hash = {}
|
1594
|
-
values.each do |value|
|
1595
|
-
grouped.groups.each do |group_name, row_numbers|
|
1596
|
-
super_hash[group_name] ||= {}
|
1535
|
+
super_hash = make_pivot_hash grouped, vectors, values, aggregate_function
|
1597
1536
|
|
1598
|
-
|
1599
|
-
arry = []
|
1600
|
-
arry << value
|
1601
|
-
vectors.each { |v| arry << self[v][num] }
|
1602
|
-
sub_hash = super_hash[group_name]
|
1603
|
-
sub_hash[arry] ||= []
|
1604
|
-
|
1605
|
-
sub_hash[arry] << self[value][num]
|
1606
|
-
end
|
1607
|
-
end
|
1608
|
-
end
|
1609
|
-
|
1610
|
-
super_hash.each_value do |sub_hash|
|
1611
|
-
sub_hash.each do |group_name, aggregates|
|
1612
|
-
sub_hash[group_name] = Daru::Vector.new(aggregates).send(aggregate_function)
|
1613
|
-
end
|
1614
|
-
end
|
1615
|
-
|
1616
|
-
df_index = Daru::MultiIndex.from_tuples super_hash.keys
|
1617
|
-
|
1618
|
-
vector_indexes = []
|
1619
|
-
super_hash.each_value do |sub_hash|
|
1620
|
-
vector_indexes.concat sub_hash.keys
|
1621
|
-
end
|
1622
|
-
|
1623
|
-
df_vectors = Daru::MultiIndex.from_tuples vector_indexes.uniq
|
1624
|
-
pivoted_dataframe = Daru::DataFrame.new({}, index: df_index, order: df_vectors)
|
1625
|
-
|
1626
|
-
super_hash.each do |row_index, sub_h|
|
1627
|
-
sub_h.each do |vector_index, val|
|
1628
|
-
# pivoted_dataframe[symbolize(vector_index)][symbolize(row_index)] = val
|
1629
|
-
pivoted_dataframe[vector_index][row_index] = val
|
1630
|
-
end
|
1631
|
-
end
|
1632
|
-
return pivoted_dataframe
|
1633
|
-
end
|
1537
|
+
pivot_dataframe super_hash
|
1634
1538
|
end
|
1635
1539
|
|
1636
1540
|
# Merge vectors from two DataFrames. In case of name collision,
|
1637
1541
|
# the vectors names are changed to x_1, x_2 ....
|
1638
1542
|
#
|
1639
1543
|
# @return {Daru::DataFrame}
|
1640
|
-
def merge other_df
|
1641
|
-
raise
|
1544
|
+
def merge other_df # rubocop:disable Metrics/AbcSize
|
1545
|
+
raise ArgumentError,
|
1546
|
+
"Number of rows must be equal in this: #{nrows} and other: #{other_df.nrows}" \
|
1547
|
+
unless nrows == other_df.nrows
|
1642
1548
|
|
1643
1549
|
new_fields = (@vectors.to_a + other_df.vectors.to_a)
|
1644
|
-
|
1645
|
-
.map(&:to_sym)
|
1646
|
-
df_new = DataFrame.new({}, order: new_fields)
|
1550
|
+
new_fields = ArrayHelper.recode_repeated(new_fields)
|
1647
1551
|
|
1648
|
-
(
|
1649
|
-
|
1650
|
-
|
1651
|
-
|
1552
|
+
DataFrame.new({}, order: new_fields).tap do |df_new|
|
1553
|
+
(0...nrows).each do |i|
|
1554
|
+
df_new.add_row row[i].to_a + other_df.row[i].to_a
|
1555
|
+
end
|
1652
1556
|
|
1653
|
-
|
1654
|
-
|
1557
|
+
df_new.update
|
1558
|
+
end
|
1655
1559
|
end
|
1656
1560
|
|
1657
1561
|
# Join 2 DataFrames with SQL style joins. Currently supports inner, left
|
@@ -1701,7 +1605,11 @@ module Daru
|
|
1701
1605
|
# ['2','fred','green',15,'orange',30,'white',20],
|
1702
1606
|
# ['3','alfred',nil,nil,nil,nil,nil,nil]
|
1703
1607
|
# ]
|
1704
|
-
# ds=Daru::DataFrame.rows(cases, order:
|
1608
|
+
# ds=Daru::DataFrame.rows(cases, order:
|
1609
|
+
# [:id, :name,
|
1610
|
+
# :car_color1, :car_value1,
|
1611
|
+
# :car_color2, :car_value2,
|
1612
|
+
# :car_color3, :car_value3])
|
1705
1613
|
# ds.one_to_many([:id],'car_%v%n').to_matrix
|
1706
1614
|
# #=> Matrix[
|
1707
1615
|
# # ["red", "1", 10],
|
@@ -1711,62 +1619,29 @@ module Daru
|
|
1711
1619
|
# # ["white", "2", 20]
|
1712
1620
|
# # ]
|
1713
1621
|
def one_to_many(parent_fields, pattern)
|
1714
|
-
|
1715
|
-
ds_vars = parent_fields.dup
|
1716
|
-
vars = []
|
1717
|
-
max_n = 0
|
1718
|
-
h = parent_fields.each_with_object({}) { |v, a|
|
1719
|
-
a[v] = Daru::Vector.new([])
|
1720
|
-
}
|
1721
|
-
# Adding _row_id
|
1722
|
-
h['_col_id'] = Daru::Vector.new([])
|
1723
|
-
ds_vars.push('_col_id')
|
1724
|
-
|
1725
|
-
@vectors.each do |f|
|
1726
|
-
next unless f =~ re
|
1727
|
-
unless vars.include? $1
|
1728
|
-
vars.push($1)
|
1729
|
-
h[$1] = Daru::Vector.new([])
|
1730
|
-
end
|
1622
|
+
vars, numbers = one_to_many_components(pattern)
|
1731
1623
|
|
1732
|
-
|
1733
|
-
|
1734
|
-
|
1735
|
-
|
1736
|
-
|
1737
|
-
|
1738
|
-
parent_fields.each do |f|
|
1739
|
-
row_out[f] = row[f]
|
1740
|
-
end
|
1741
|
-
|
1742
|
-
max_n.times do |n1|
|
1743
|
-
n = n1+1
|
1744
|
-
any_data = false
|
1745
|
-
vars.each do |v|
|
1746
|
-
data = row[pattern.gsub('%v',v.to_s).gsub('%n',n.to_s)]
|
1747
|
-
row_out[v] = data
|
1748
|
-
any_data = true unless data.nil?
|
1749
|
-
end
|
1624
|
+
DataFrame.new([], order: [*parent_fields, '_col_id', *vars]).tap do |ds|
|
1625
|
+
each_row do |row|
|
1626
|
+
verbatim = parent_fields.map { |f| [f, row[f]] }.to_h
|
1627
|
+
numbers.each do |n|
|
1628
|
+
generated = one_to_many_row row, n, vars, pattern
|
1629
|
+
next if generated.values.all?(&:nil?)
|
1750
1630
|
|
1751
|
-
|
1752
|
-
row_out['_col_id'] = n
|
1753
|
-
ds.add_row(row_out)
|
1631
|
+
ds.add_row(verbatim.merge(generated).merge('_col_id' => n))
|
1754
1632
|
end
|
1755
1633
|
end
|
1634
|
+
ds.update
|
1756
1635
|
end
|
1757
|
-
ds.update
|
1758
|
-
ds
|
1759
1636
|
end
|
1760
1637
|
|
1761
|
-
def add_vectors_by_split_recode(
|
1762
|
-
|
1763
|
-
|
1764
|
-
|
1765
|
-
|
1766
|
-
|
1767
|
-
|
1768
|
-
i += 1
|
1769
|
-
}
|
1638
|
+
def add_vectors_by_split_recode(nm, join='-', sep=Daru::SPLIT_TOKEN)
|
1639
|
+
self[nm]
|
1640
|
+
.split_by_separator(sep)
|
1641
|
+
.each_with_index do |(k, v), i|
|
1642
|
+
v.rename "#{nm}:#{k}"
|
1643
|
+
self["#{nm}#{join}#{i + 1}".to_sym] = v
|
1644
|
+
end
|
1770
1645
|
end
|
1771
1646
|
|
1772
1647
|
# Create an SQL statement based on a given Dataset
|
@@ -1795,40 +1670,37 @@ module Daru
|
|
1795
1670
|
sql + fields.join(",\n ")+") CHARACTER SET=#{charset};"
|
1796
1671
|
end
|
1797
1672
|
|
1673
|
+
# Returns the dataframe. This can be convenient when the user does not
|
1674
|
+
# know whether the object is a vector or a dataframe.
|
1675
|
+
# @return [self] the dataframe
|
1676
|
+
def to_df
|
1677
|
+
self
|
1678
|
+
end
|
1679
|
+
|
1798
1680
|
# Convert all numeric vectors to GSL::Matrix
|
1799
1681
|
def to_gsl
|
1800
|
-
numerics_as_arrays = []
|
1801
|
-
numeric_vectors.each do |n|
|
1802
|
-
numerics_as_arrays << self[n].to_a
|
1803
|
-
end
|
1682
|
+
numerics_as_arrays = numeric_vectors.map { |n| self[n].to_a }
|
1804
1683
|
|
1805
1684
|
GSL::Matrix.alloc(*numerics_as_arrays.transpose)
|
1806
1685
|
end
|
1807
1686
|
|
1808
1687
|
# Convert all vectors of type *:numeric* into a Matrix.
|
1809
1688
|
def to_matrix
|
1810
|
-
|
1811
|
-
each_vector do |vector|
|
1812
|
-
numerics_as_arrays << vector.to_a if vector.type == :numeric
|
1813
|
-
end
|
1814
|
-
|
1815
|
-
Matrix.columns numerics_as_arrays
|
1689
|
+
Matrix.columns each_vector.select(&:numeric?).map(&:to_a)
|
1816
1690
|
end
|
1817
1691
|
|
1818
1692
|
# Return a Nyaplot::DataFrame from the data of this DataFrame.
|
1693
|
+
# :nocov:
|
1819
1694
|
def to_nyaplotdf
|
1820
1695
|
Nyaplot::DataFrame.new(to_a[0])
|
1821
1696
|
end
|
1697
|
+
# :nocov:
|
1822
1698
|
|
1823
1699
|
# Convert all vectors of type *:numeric* and not containing nils into an NMatrix.
|
1824
1700
|
def to_nmatrix
|
1825
|
-
|
1826
|
-
|
1827
|
-
|
1828
|
-
vector.missing_positions.empty?
|
1829
|
-
end
|
1830
|
-
|
1831
|
-
numerics_as_arrays.transpose.to_nm
|
1701
|
+
each_vector.select do |vector|
|
1702
|
+
vector.numeric? && !vector.include_values?(*Daru::MISSING_VALUES)
|
1703
|
+
end.map(&:to_a).transpose.to_nm
|
1832
1704
|
end
|
1833
1705
|
|
1834
1706
|
# Converts the DataFrame into an array of hashes where key is vector name
|
@@ -1837,13 +1709,7 @@ module Daru
|
|
1837
1709
|
# of the dataframe. Each element in the index array corresponds to its row
|
1838
1710
|
# in the array of hashes, which has the same index.
|
1839
1711
|
def to_a
|
1840
|
-
|
1841
|
-
each_row do |row|
|
1842
|
-
arry[0] << row.to_h
|
1843
|
-
end
|
1844
|
-
arry[1] = @index.to_a
|
1845
|
-
|
1846
|
-
arry
|
1712
|
+
[each_row.map(&:to_h), @index.to_a]
|
1847
1713
|
end
|
1848
1714
|
|
1849
1715
|
# Convert to json. If no_index is false then the index will NOT be included
|
@@ -1859,54 +1725,19 @@ module Daru
|
|
1859
1725
|
# Converts DataFrame to a hash (explicit) with keys as vector names and values as
|
1860
1726
|
# the corresponding vectors.
|
1861
1727
|
def to_h
|
1862
|
-
|
1863
|
-
|
1864
|
-
|
1865
|
-
end
|
1866
|
-
|
1867
|
-
hsh
|
1728
|
+
@vectors
|
1729
|
+
.each_with_index
|
1730
|
+
.map { |vec_name, idx| [vec_name, @data[idx]] }.to_h
|
1868
1731
|
end
|
1869
1732
|
|
1870
1733
|
# Convert to html for IRuby.
|
1871
1734
|
def to_html threshold=30
|
1872
|
-
|
1873
|
-
|
1874
|
-
|
1875
|
-
|
1876
|
-
|
1877
|
-
|
1878
|
-
html +='<tr><th></th>'
|
1879
|
-
@vectors.each { |vector| html += '<th>' + vector.to_s + '</th>' }
|
1880
|
-
html += '</tr>'
|
1881
|
-
|
1882
|
-
@index.each_with_index do |index, num|
|
1883
|
-
html += '<tr>'
|
1884
|
-
html += '<td>' + index.to_s + '</td>'
|
1885
|
-
|
1886
|
-
row[index].each do |element|
|
1887
|
-
html += '<td>' + element.to_s + '</td>'
|
1888
|
-
end
|
1889
|
-
|
1890
|
-
html += '</tr>'
|
1891
|
-
next if num <= threshold
|
1892
|
-
|
1893
|
-
html += '<tr>'
|
1894
|
-
(@vectors.size + 1).times { html += '<td>...</td>' }
|
1895
|
-
html += '</tr>'
|
1896
|
-
|
1897
|
-
last_index = @index.to_a.last
|
1898
|
-
last_row = row[last_index]
|
1899
|
-
html += '<tr>'
|
1900
|
-
html += '<td>' + last_index.to_s + '</td>'
|
1901
|
-
(0..(ncols - 1)).to_a.each do |i|
|
1902
|
-
html += '<td>' + last_row[i].to_s + '</td>'
|
1903
|
-
end
|
1904
|
-
html += '</tr>'
|
1905
|
-
break
|
1906
|
-
end
|
1907
|
-
html += '</table>'
|
1908
|
-
|
1909
|
-
html
|
1735
|
+
path = if index.is_a?(MultiIndex)
|
1736
|
+
File.expand_path('../iruby/templates/dataframe_mi.html.erb', __FILE__)
|
1737
|
+
else
|
1738
|
+
File.expand_path('../iruby/templates/dataframe.html.erb', __FILE__)
|
1739
|
+
end
|
1740
|
+
ERB.new(File.read(path).strip).result(binding)
|
1910
1741
|
end
|
1911
1742
|
|
1912
1743
|
def to_s
|
@@ -1925,8 +1756,11 @@ module Daru
|
|
1925
1756
|
# Rename the DataFrame.
|
1926
1757
|
def rename new_name
|
1927
1758
|
@name = new_name
|
1759
|
+
self
|
1928
1760
|
end
|
1929
1761
|
|
1762
|
+
alias_method :name=, :rename
|
1763
|
+
|
1930
1764
|
# Write this DataFrame to a CSV file.
|
1931
1765
|
#
|
1932
1766
|
# == Arguments
|
@@ -2003,46 +1837,28 @@ module Daru
|
|
2003
1837
|
|
2004
1838
|
# Transpose a DataFrame, transposing elements and row, column indexing.
|
2005
1839
|
def transpose
|
2006
|
-
|
2007
|
-
|
2008
|
-
|
2009
|
-
|
2010
|
-
|
2011
|
-
|
1840
|
+
Daru::DataFrame.new(
|
1841
|
+
each_vector.map(&:to_a).transpose,
|
1842
|
+
index: @vectors,
|
1843
|
+
order: @index,
|
1844
|
+
dtype: @dtype,
|
1845
|
+
name: @name
|
1846
|
+
)
|
2012
1847
|
end
|
2013
1848
|
|
2014
1849
|
# Pretty print in a nice table format for the command line (irb/pry/iruby)
|
2015
1850
|
def inspect spacing=10, threshold=15
|
2016
|
-
|
2017
|
-
|
2018
|
-
|
2019
|
-
|
2020
|
-
|
2021
|
-
|
2022
|
-
|
2023
|
-
|
2024
|
-
|
2025
|
-
|
2026
|
-
|
2027
|
-
content += "\n#<" + self.class.to_s + ':' + object_id.to_s + ' @name = ' +
|
2028
|
-
name.to_s + ' @size = ' + @size.to_s + '>'
|
2029
|
-
content += formatter % ['', *@vectors.map(&:to_s)]
|
2030
|
-
row_num = 1
|
2031
|
-
|
2032
|
-
each_row_with_index do |row, index|
|
2033
|
-
content += formatter % [index.to_s, *row.to_h.values.map { |e| (e || 'nil').to_s }]
|
2034
|
-
row_num += 1
|
2035
|
-
next if row_num <= threshold
|
2036
|
-
|
2037
|
-
dots = []
|
2038
|
-
|
2039
|
-
(@vectors.size + 1).times { dots << '...' }
|
2040
|
-
content += formatter % dots
|
2041
|
-
break
|
2042
|
-
end
|
2043
|
-
content += "\n"
|
2044
|
-
|
2045
|
-
content
|
1851
|
+
row_headers = index.is_a?(MultiIndex) ? index.sparse_tuples : index.to_a
|
1852
|
+
name_part = @name ? ": #{@name} " : ''
|
1853
|
+
|
1854
|
+
"#<#{self.class}#{name_part}(#{nrows}x#{ncols})>\n" +
|
1855
|
+
Formatters::Table.format(
|
1856
|
+
each_row.lazy,
|
1857
|
+
row_headers: row_headers,
|
1858
|
+
headers: vectors,
|
1859
|
+
threshold: threshold,
|
1860
|
+
spacing: spacing
|
1861
|
+
)
|
2046
1862
|
end
|
2047
1863
|
|
2048
1864
|
# Query a DataFrame by passing a Daru::Core::Query::BoolArray object.
|
@@ -2058,218 +1874,202 @@ module Daru
|
|
2058
1874
|
@vectors.to_a.all? { |v| self[v] == other[v] }
|
2059
1875
|
end
|
2060
1876
|
|
1877
|
+
# Converts the specified non category type vectors to category type vectors
|
1878
|
+
# @param [Array] *names names of non category type vectors to be converted
|
1879
|
+
# @return [Daru::DataFrame] data frame in which specified vectors have been
|
1880
|
+
# converted to category type
|
1881
|
+
# @example
|
1882
|
+
# df = Daru::DataFrame.new({
|
1883
|
+
# a: [1, 2, 3],
|
1884
|
+
# b: ['a', 'a', 'b']
|
1885
|
+
# })
|
1886
|
+
# df.to_category :b
|
1887
|
+
# df[:b].type
|
1888
|
+
# # => :category
|
1889
|
+
def to_category *names
|
1890
|
+
names.each { |n| self[n] = self[n].to_category }
|
1891
|
+
self
|
1892
|
+
end
|
1893
|
+
|
2061
1894
|
def method_missing(name, *args, &block)
|
2062
1895
|
if name =~ /(.+)\=/
|
2063
|
-
insert_or_modify_vector name[/(.+)\=/].delete('=').to_sym, args[0]
|
1896
|
+
insert_or_modify_vector [name[/(.+)\=/].delete('=').to_sym], args[0]
|
2064
1897
|
elsif has_vector? name
|
2065
1898
|
self[name]
|
2066
1899
|
else
|
2067
|
-
super
|
1900
|
+
super
|
2068
1901
|
end
|
2069
1902
|
end
|
2070
1903
|
|
2071
|
-
|
1904
|
+
def respond_to_missing?(name, include_private=false)
|
1905
|
+
name.to_s.end_with?('=') || has_vector?(name) || super
|
1906
|
+
end
|
2072
1907
|
|
2073
|
-
def
|
2074
|
-
|
2075
|
-
|
2076
|
-
else
|
2077
|
-
Daru::Index.new(index)
|
1908
|
+
def interact_code vector_names, full
|
1909
|
+
dfs = vector_names.zip(full).map do |vec_name, f|
|
1910
|
+
self[vec_name].contrast_code(full: f).each.to_a
|
2078
1911
|
end
|
1912
|
+
|
1913
|
+
all_vectors = recursive_product(dfs)
|
1914
|
+
Daru::DataFrame.new all_vectors,
|
1915
|
+
order: all_vectors.map(&:name)
|
2079
1916
|
end
|
2080
1917
|
|
2081
|
-
|
2082
|
-
|
2083
|
-
|
2084
|
-
|
2085
|
-
|
2086
|
-
|
2087
|
-
|
2088
|
-
|
2089
|
-
|
2090
|
-
|
2091
|
-
|
2092
|
-
|
1918
|
+
# Split the dataframe into many dataframes based on a category vector.
# @param [object] cat_name name of the category vector to split the dataframe by
# @return [Array] array of dataframes split by category; the category vector
#   used to split is not included
# @raise [ArgumentError] if the vector named cat_name is not a category vector
# @example
#   df = Daru::DataFrame.new({
#     a: [1, 2, 3],
#     b: ['a', 'a', 'b']
#   })
#   df.to_category :b
#   df.split_by_category :b
#   # => [#<Daru::DataFrame: a (2x1)>
#   #       a
#   #   0   1
#   #   1   2,
#   # #<Daru::DataFrame: b (1x1)>
#   #       a
#   #   2   3]
def split_by_category cat_name
  cat_dv = self[cat_name]
  unless cat_dv.category?
    raise ArgumentError, "#{cat_name} is not a category vector"
  end

  # One dataframe per category: filter the rows, name the result after the
  # category, then drop the splitting vector itself.
  cat_dv.categories.map do |cat|
    where(cat_dv.eq cat)
      .rename(cat)
      .delete_vector cat_name
  end
end
|
1947
|
+
|
1948
|
+
private
|
2094
1949
|
|
2095
|
-
|
1950
|
+
# Replaces every category vector among `names` with a Daru::Vector built
# from its #to_ints, and returns [name, original_vector] pairs for the
# vectors that were replaced. Non-category vectors are left alone.
def convert_categorical_vectors names
  names.each_with_object([]) do |vec_name, replaced|
    next unless self[vec_name].category?

    replaced << [vec_name, self[vec_name]]
    self[vec_name] = Daru::Vector.new(self[vec_name].to_ints)
  end
end
|
2097
1958
|
|
2098
|
-
def
|
2099
|
-
|
2100
|
-
|
2101
|
-
value = if opts[:ascending][i]
|
2102
|
-
@data[@vectors[v]].data[r1]
|
2103
|
-
else
|
2104
|
-
@data[@vectors[v]].data[r2]
|
2105
|
-
end
|
2106
|
-
|
2107
|
-
if opts[:by][v] && !opts[:handle_nils][i]
|
2108
|
-
# Block given and nils handled manually
|
2109
|
-
value = opts[:by][v].call value
|
2110
|
-
|
2111
|
-
elsif opts[:by][v] && opts[:handle_nils][i]
|
2112
|
-
# Block given and nils handled automatically
|
2113
|
-
value = opts[:by][v].call value rescue nil
|
2114
|
-
blocks[v].call value
|
1959
|
+
# Writes each [name, vector] pair in `old` back into the dataframe.
def restore_categorical_vectors old
  old.each do |vec_name, original|
    self[vec_name] = original
  end
end
|
2115
1962
|
|
2116
|
-
|
2117
|
-
|
2118
|
-
|
2119
|
-
|
1963
|
+
# Combines several lists of vectors into one list holding the product of
# every combination that takes one vector from each list. Each product is
# renamed to the contributing names joined with ':'.
#
# The argument is no longer modified (the previous implementation shifted
# elements off the caller's array), and an empty list now returns nil
# instead of recursing without end.
#
# @param dfs [Array<Array>] lists of vectors responding to #*, #name and #rename
# @return [Array, nil] the combined vectors; the single list itself when
#   only one is given
def recursive_product dfs
  left, *rest = dfs
  return left if rest.empty?

  right = recursive_product rest
  left.product(right).map do |dv1, dv2|
    (dv1*dv2).rename "#{dv1.name}:#{dv2.name}"
  end
end
|
2122
1973
|
|
2123
|
-
def
|
2124
|
-
if
|
2125
|
-
|
2126
|
-
|
2127
|
-
|
1974
|
+
# Returns val unchanged when it is a Daru::Vector; raises TypeError otherwise.
def should_be_vector! val
  unless val.is_a?(Daru::Vector)
    raise TypeError, "Every iteration must return Daru::Vector not #{val.class}"
  end

  val
end
|
1978
|
+
|
1979
|
+
# Forwards args and block to "<method>_vector" (axis :vector or :column)
# or "<method>_row" (axis :row). Any other axis raises ArgumentError.
def dispatch_to_axis(axis, method, *args, &block)
  case axis
  when :vector, :column
    send("#{method}_vector", *args, &block)
  when :row
    send("#{method}_row", *args, &block)
  else
    raise ArgumentError, "Unknown axis #{axis}"
  end
end
|
2132
1988
|
|
2133
|
-
def
|
2134
|
-
if
|
2135
|
-
|
2136
|
-
|
2137
|
-
|
1989
|
+
# Plural counterpart of dispatch_to_axis: forwards to "<method>_vectors"
# (axis :vector or :column) or "<method>_rows" (axis :row). Any other
# axis raises ArgumentError.
def dispatch_to_axis_pl(axis, method, *args, &block)
  case axis
  when :vector, :column
    send("#{method}_vectors", *args, &block)
  when :row
    send("#{method}_rows", *args, &block)
  else
    raise ArgumentError, "Unknown axis #{axis}"
  end
end
|
2142
1998
|
|
2143
|
-
|
2144
|
-
|
2145
|
-
|
2146
|
-
|
2147
|
-
|
1999
|
+
# Axis names that extract_axis recognises as a trailing argument.
AXES = [:row, :vector].freeze

# If the last element of `names` is one of AXES, removes it from `names`
# (mutating the array) and returns it; otherwise returns `default` and
# leaves `names` alone.
def extract_axis names, default=:vector
  return default unless AXES.include?(names.last)

  names.pop
end
|
2150
2008
|
|
2151
2009
|
# Entry point for vector access. A Range as first argument is resolved
# through @vectors and handed to #dup; otherwise the call is routed to the
# multi-index or single-index accessor depending on the type of @vectors.
def access_vector *names
  first_name = names.first
  return dup(@vectors[first_name]) if first_name.is_a?(Range)
  return access_vector_multi_index(*names) if @vectors.is_a?(MultiIndex)

  access_vector_single_index(*names)
end
|
2153
2018
|
|
2154
|
-
|
2155
|
-
|
2156
|
-
pos = @vectors[names]
|
2019
|
+
# Vector access when @vectors is a MultiIndex. An Integer lookup result
# returns the single stored vector; otherwise a new DataFrame is built
# from the matching vectors. When fewer names were given than @vectors
# has levels, the matched levels are dropped from the new vector order.
def access_vector_multi_index *names
  located = @vectors[names]
  return @data[located] if located.is_a?(Integer)

  columns = located.map { |tuple| @data[@vectors[tuple]] }
  partial_key = names.size < @vectors.width
  order = partial_key ? located.drop_left_level(names.size) : located

  Daru::DataFrame.new(columns, index: @index, order: order)
end
|
2175
2030
|
|
2176
|
-
|
2177
|
-
|
2031
|
+
# Vector access when @vectors is a plain index. With a single name whose
# lookup yields a Numeric position, the stored vector is returned directly;
# otherwise a new DataFrame holding the selected vectors is built.
def access_vector_single_index *names
  if names.count < 2
    located = @vectors[names.first]
    return @data[located] if located.is_a?(Numeric)

    # The lookup produced something to iterate over; use it as the name list.
    names = located
  end

  columns = names.map { |name| [name, @data[@vectors[name]]] }.to_h
  order = names
  order = Daru::Index.new(names) if names.is_a?(Array)

  Daru::DataFrame.new(columns, order: order,
                               index: @index, name: @name)
end
|
2198
2046
|
|
2199
|
-
|
2047
|
+
# Row access by index label(s). When @index.pos resolves to a single
# Numeric position the row is returned as a Daru::Vector named after the
# first label; otherwise a DataFrame of the selected rows is returned.
def access_row *indexes
  positions = @index.pos(*indexes)

  unless positions.is_a? Numeric
    row_subsets = @data.map { |vec| vec[*indexes] }
    return Daru::DataFrame.new row_subsets,
                               index: @index.subset(*indexes),
                               order: @vectors
  end

  Daru::Vector.new populate_row_for(positions),
                   index: @vectors,
                   name: indexes.first
end
|
2227
2061
|
|
2228
2062
|
# Collects element `pos` of every vector in @data into one array.
def populate_row_for pos
  @data.each_with_object([]) { |column, row| row << column[pos] }
end
|
2233
2065
|
|
2234
2066
|
def insert_or_modify_vector name, vector
|
2235
2067
|
name = name[0] unless @vectors.is_a?(MultiIndex)
|
2236
|
-
vec = nil
|
2237
2068
|
|
2238
2069
|
if @index.empty?
|
2239
|
-
|
2240
|
-
vector
|
2241
|
-
else
|
2242
|
-
Daru::Vector.new(vector.to_a, name: set_name(name))
|
2243
|
-
end
|
2244
|
-
|
2245
|
-
@index = vec.index
|
2246
|
-
assign_or_add_vector name, vec
|
2247
|
-
set_size
|
2248
|
-
|
2249
|
-
@data.map! do |v|
|
2250
|
-
if v.empty?
|
2251
|
-
Daru::Vector.new([nil]*@size, name: set_name(name), metadata: v.metadata, index: @index)
|
2252
|
-
else
|
2253
|
-
v
|
2254
|
-
end
|
2255
|
-
end
|
2070
|
+
insert_vector_in_empty name, vector
|
2256
2071
|
else
|
2257
|
-
|
2258
|
-
if vector.index == @index # so that index-by-index assignment is avoided when possible.
|
2259
|
-
vec = vector.dup
|
2260
|
-
else
|
2261
|
-
vec = Daru::Vector.new [], name: set_name(name), metadata: vector.metadata.dup, index: @index
|
2262
|
-
@index.each do |idx|
|
2263
|
-
vec[idx] = vector.index.include?(idx) ? vector[idx] : nil
|
2264
|
-
end
|
2265
|
-
end
|
2266
|
-
else
|
2267
|
-
raise SizeError,
|
2268
|
-
"Specified vector of length #{vector.size} cannot be inserted in DataFrame of size #{@size}" if
|
2269
|
-
@size != vector.size
|
2270
|
-
|
2271
|
-
vec = Daru::Vector.new(vector, name: set_name(name), index: @index)
|
2272
|
-
end
|
2072
|
+
vec = prepare_vector_for_insert name, vector
|
2273
2073
|
|
2274
2074
|
assign_or_add_vector name, vec
|
2275
2075
|
end
|
@@ -2283,54 +2083,82 @@ module Daru
|
|
2283
2083
|
pos = name
|
2284
2084
|
end
|
2285
2085
|
|
2286
|
-
|
2287
|
-
|
2086
|
+
case
|
2087
|
+
when pos.is_a?(Daru::Index)
|
2088
|
+
assign_multiple_vectors pos, v
|
2089
|
+
when pos == name &&
|
2090
|
+
(@vectors.include?(name) || (pos.is_a?(Integer) && pos < @data.size))
|
2091
|
+
|
2288
2092
|
@data[pos] = v
|
2289
|
-
elsif pos.is_a?(Daru::Index)
|
2290
|
-
pos.each do |p|
|
2291
|
-
@data[@vectors[p]] = v
|
2292
|
-
end
|
2293
2093
|
else
|
2294
|
-
|
2295
|
-
@data[@vectors[name]] = v
|
2094
|
+
assign_or_add_vector_rough name, v
|
2296
2095
|
end
|
2297
2096
|
end
|
2298
2097
|
|
2299
|
-
def
|
2300
|
-
|
2301
|
-
|
2302
|
-
|
2303
|
-
|
2304
|
-
vec =
|
2305
|
-
if vector.is_a?(Daru::Vector)
|
2306
|
-
vector
|
2307
|
-
else
|
2308
|
-
Daru::Vector.new(vector, name: set_name(name), index: @vectors)
|
2309
|
-
end
|
2098
|
+
# Stores the same vector `v` in the @data slot of every label in `pos`.
def assign_multiple_vectors pos, v
  pos.each { |label| @data[@vectors[label]] = v }
end
|
2310
2103
|
|
2311
|
-
|
2312
|
-
|
2313
|
-
|
2314
|
-
|
2315
|
-
|
2316
|
-
|
2317
|
-
|
2318
|
-
|
2104
|
+
# Stores `v` under `name`, first extending @vectors with `name` when it is
# not already present.
def assign_or_add_vector_rough name, v
  @vectors = @vectors | [name] unless @vectors.include?(name)

  slot = @vectors[name]
  @data[slot] = v
end
|
2108
|
+
|
2109
|
+
# Inserts the first vector into a dataframe whose index is still empty:
# the coerced vector's index becomes the dataframe index, the vector is
# stored, the size is refreshed, and every still-empty vector in @data is
# reindexed to the new index.
def insert_vector_in_empty name, vector
  coerced = Vector.coerce(vector.to_a, name: coerce_name(name))
  @index = coerced.index

  assign_or_add_vector name, coerced
  set_size

  @data.map! do |existing|
    existing.empty? ? existing.reindex(@index) : existing
  end
end
|
2118
|
+
|
2119
|
+
# Produces the Daru::Vector that will be stored for `vector`.
# * A Daru::Vector whose index equals @index is duplicated as-is.
# * Any other Daru::Vector is copied label by label onto @index, with nil
#   for labels it does not contain.
# * Anything else must have exactly @size elements and is wrapped in a
#   Daru::Vector on @index.
def prepare_vector_for_insert name, vector
  unless vector.is_a?(Daru::Vector)
    # FIXME: No spec checks this case... And SizeError is not a thing - zverok, 2016-05-08
    if @size != vector.size
      raise SizeError,
        "Specified vector of length #{vector.size} cannot be inserted in DataFrame of size #{@size}"
    end

    return Daru::Vector.new(vector, name: coerce_name(name), index: @index)
  end

  # so that index-by-index assignment is avoided when possible.
  return vector.dup if vector.index == @index

  aligned = Daru::Vector.new([], name: coerce_name(name), index: @index)
  @index.each do |idx|
    aligned[idx] = vector.index.include?(idx) ? vector[idx] : nil
  end
  aligned
end
|
2138
|
+
|
2139
|
+
# Writes a row at `indexes`: the input is coerced with coerce_vector,
# its length is checked against the number of vectors, each stored
# vector receives its element through the private #set, and @index and
# the cached size are refreshed from the first stored vector.
def insert_or_modify_row indexes, vector
  vector = coerce_vector vector

  if vector.size != @vectors.size
    raise SizeError, 'Vector length should match row length'
  end

  @data.each_with_index { |column, pos| column.send(:set, indexes, vector.at(pos)) }
  @index = @data[0].index

  set_size
end
|
2325
2152
|
|
2326
2153
|
# Fills @data with one empty Daru::Vector per entry of @vectors, each
# created on @index and named via coerce_name.
def create_empty_vectors
  @data = @vectors.map { |name| Daru::Vector.new([], name: coerce_name(name), index: @index) }
end
|
2331
2158
|
|
2332
2159
|
def validate_labels
|
2333
|
-
raise IndexError, "Expected equal number of vector names (#{@vectors.size})
|
2160
|
+
raise IndexError, "Expected equal number of vector names (#{@vectors.size}) " \
|
2161
|
+
"for number of vectors (#{@data.size})." if
|
2334
2162
|
@vectors && @vectors.size != @data.size
|
2335
2163
|
|
2336
2164
|
raise IndexError, 'Expected number of indexes same as number of rows' if
|
@@ -2348,12 +2176,6 @@ module Daru
|
|
2348
2176
|
validate_vector_sizes
|
2349
2177
|
end
|
2350
2178
|
|
2351
|
-
def all_daru_vectors_in_source? source
|
2352
|
-
source.values.all? do |vector|
|
2353
|
-
vector.is_a?(Daru::Vector)
|
2354
|
-
end
|
2355
|
-
end
|
2356
|
-
|
2357
2179
|
# Caches the number of entries in @index as @size.
def set_size
  @size = @index.size
end
|
@@ -2382,32 +2204,301 @@ module Daru
|
|
2382
2204
|
# True when every value of `source` has an index equal to the index of
# the first value.
def all_vectors_have_equal_indexes? source
  vectors = source.values
  reference = vectors[0].index

  vectors.all? { |vector| reference == vector.index }
end
|
2209
|
+
|
2210
|
+
# Turns an Array name into a single string by joining its elements;
# any other name is returned unchanged.
def coerce_name potential_name
  return potential_name unless potential_name.is_a?(Array)

  potential_name.join
end
|
2213
|
+
|
2214
|
+
# Builds the dataframe from an Array source. All elements must be of one
# class; the first element decides which specialised initializer runs
# (Array, Vector or Hash). Any other element type raises ArgumentError.
def initialize_from_array source, vectors, index, opts
  unless source.map(&:class).uniq.size == 1
    raise ArgumentError, 'All objects in data source should be same class'
  end

  handler =
    case source.first
    when Array  then :initialize_from_array_of_arrays
    when Vector then :initialize_from_array_of_vectors
    when Hash   then :initialize_from_array_of_hashes
    else raise ArgumentError, "Can't create DataFrame from #{source}"
    end

  send(handler, source, vectors, index, opts)
end
|
2389
2229
|
|
2390
|
-
def
|
2391
|
-
|
2230
|
+
# Builds the dataframe from an Array of Arrays, one inner array per vector.
#
# @param source [Array<Array>] the data, one array per vector
# @param vectors the vector names (order); must have as many entries as
#   `source` has arrays
# @param index the index to use; when nil the size of the first array is
#   passed to Index.coerce instead
# @raise [ArgumentError] when the number of arrays and of names differ
def initialize_from_array_of_arrays source, vectors, index, _opts
  if source.size != vectors.size
    # `source` holds the vectors and `vectors` holds the order, so report
    # each count under its own label. Adjacent literals keep source
    # indentation out of the message.
    raise ArgumentError,
      "Number of vectors (#{source.size}) should " \
      "equal order size (#{vectors.size})"
  end

  @index = Index.coerce(index || source[0].size)
  @vectors = Index.coerce(vectors)

  @data = @vectors.each_with_index.map do |_vec, idx|
    Daru::Vector.new(source[idx], index: @index, name: vectors[idx])
  end
end
|
2393
2241
|
|
2394
|
-
def
|
2395
|
-
|
2242
|
+
# Builds the dataframe from an Array of vectors by pairing each name in
# `vectors` with the vector at the same position in `source` and
# delegating to #initialize. Cloning is on unless opts[:clone] is false.
def initialize_from_array_of_vectors source, vectors, index, opts
  clone = opts[:clone] != false

  named = {}
  vectors.each_with_index { |name, idx| named[name] = source[idx] }

  initialize(named, index: index, order: vectors, name: @name, clone: clone)
end
|
2249
|
+
|
2250
|
+
# Builds the dataframe from an Array of Hashes, one hash per row.
#
# Vector names are the keys of the first hash, preceded by any names given
# in `vectors`. Each value is looked up under the name itself and, when
# that yields nil, under the name's string form.
#
# @param source [Array<Hash>] the rows
# @param vectors [Array, nil] names to place first in the vector order
# @param index the index to use; defaults to the number of rows
def initialize_from_array_of_hashes source, vectors, index, _opts
  names =
    if vectors.nil?
      source[0].keys
    else
      (vectors + source[0].keys).uniq
    end
  @vectors = Daru::Index.new(names)
  @index = Daru::Index.new(index || source.size)

  @data = @vectors.map do |name|
    column = source.map do |row|
      value = row[name]
      # Fall back to the string key only when nothing was found. A stored
      # `false` is a real value; `||` used to replace it with the fallback.
      value.nil? ? row[name.to_s] : value
    end
    Daru::Vector.new(column, name: coerce_name(name), index: @index)
  end
end
|
2265
|
+
|
2266
|
+
# Builds the dataframe from a Hash source. The vectors index is set up
# first; then the hash is handled as vectors when
# ArrayHelper.array_of?(values, Vector) holds, and as arrays otherwise.
def initialize_from_hash source, vectors, index, opts
  create_vectors_index_with vectors, source

  builder =
    if ArrayHelper.array_of?(source.values, Vector)
      :initialize_from_hash_with_vectors
    else
      :initialize_from_hash_with_arrays
    end

  send(builder, source, index, opts)
end
|
2275
|
+
|
2276
|
+
# Builds the dataframe from a Hash of vectors. The vectors are cloned
# unless opts[:clone] is false; cloning is forced when no index was given
# and the vectors do not all share one index. Without cloning, the source
# vectors themselves are appended to @data.
def initialize_from_hash_with_vectors source, index, opts
  same_index = all_vectors_have_equal_indexes?(source)

  clone = opts[:clone] != false || !(index || same_index)

  @index = deduce_index index, source, same_index

  if clone
    @data = clone_vectors source, same_index
  else
    @data.concat source.values
  end
end
|
2290
|
+
|
2291
|
+
# Chooses the dataframe index: the given index (coerced) when present,
# a copy of the first vector's index when all vectors share one, and
# otherwise the sorted union of all index labels.
def deduce_index index, source, vectors_have_same_index
  return Index.coerce(index) unless index.nil?
  return source.values[0].index.dup if vectors_have_same_index

  # sort only if missing indexes detected
  labels = source.values.map { |v| v.index.to_a }
  Daru::Index.new labels.flatten.uniq.sort
end
|
2397
2304
|
|
2398
|
-
def
|
2399
|
-
|
2400
|
-
if
|
2401
|
-
|
2402
|
-
|
2403
|
-
|
2305
|
+
# Returns one copied vector per entry of @vectors. When all source
# vectors share an index each one is simply duplicated, which avoids
# matching indexes; otherwise each is rebuilt on @index label by label,
# with nil for labels the source vector lacks.
def clone_vectors source, vectors_have_same_index
  @vectors.map do |vector|
    next source[vector].dup if vectors_have_same_index

    aligned = Daru::Vector.new([], name: vector, index: @index)
    @index.each do |idx|
      aligned[idx] = source[vector].index.include?(idx) ? source[vector][idx] : nil
    end
    aligned
  end
end
|
2320
|
+
|
2321
|
+
# Builds the dataframe from a Hash of arrays: sets @index (from the given
# index or the size of the first value) and appends one Daru::Vector per
# entry of @vectors, each wrapping a copy of the corresponding array.
def initialize_from_hash_with_arrays source, index, _opts
  @index = Index.coerce(index || source.values[0].size)

  @vectors.each { |name| @data << Daru::Vector.new(source[name].dup, name: coerce_name(name), index: @index) }
end
|
2328
|
+
|
2329
|
+
# Builds the comparison array for one row when sorting. For every sort
# vector the value is taken from row r1 when that vector sorts ascending
# and from r2 otherwise, passed through its `by` block if one exists (any
# StandardError from the block yields nil), and then wrapped by
# sort_handle_nils. Nil handling is always on for vectors without a block.
def sort_build_row vector_locs, by_blocks, ascending, handle_nils, r1, r2 # rubocop:disable Metrics/ParameterLists
  settings = vector_locs.zip(by_blocks, ascending, handle_nils)

  settings.map do |vector_loc, by, asc, handle_nil|
    row = asc ? r1 : r2
    value = @data[vector_loc].data[row]

    if by
      value =
        begin
          by.call(value)
        rescue StandardError
          nil
        end
    end

    sort_handle_nils value, asc, handle_nil || !by
  end
end
|
2341
|
+
|
2342
|
+
# Wraps a sort value so nils compare consistently. Without nil handling
# the value is returned as-is. With it, the value is returned as
# [rank, value], where the rank is 0 for nil / 1 otherwise when ascending
# and 1 for nil / 0 otherwise when descending.
def sort_handle_nils value, asc, handle_nil
  return value unless handle_nil

  nil_rank, present_rank = asc ? [0, 1] : [1, 0]
  [value.nil? ? nil_rank : present_rank, value]
end
|
2352
|
+
|
2353
|
+
# Normalises the boolean sort option opts[symbol] into an Array with one
# entry per sort vector: true/false is repeated `size` times, nil is
# replaced by `default` repeated `size` times, and an Array is returned
# as-is after checking that it has `size` entries.
# Raises ArgumentError for a wrongly sized Array or any other type.
def sort_coerce_boolean opts, symbol, default, size
  val = opts[symbol]

  return Array.new(size, default) if val.nil?
  return Array.new(size, val) if [true, false].include?(val)

  unless val.is_a?(Array)
    raise ArgumentError, "Can't coerce #{symbol} from #{val.class} to boolean option"
  end

  if size != val.size
    raise ArgumentError, "Specify same number of vector names and #{symbol}"
  end

  val
end
|
2368
|
+
|
2369
|
+
# Returns a lambda that compares two row positions for sorting by the
# vectors in vector_order, honouring the :ascending, :handle_nils and :by
# options. The row position itself is appended to each comparison array.
def sort_prepare_block vector_order, opts
  ascending   = sort_coerce_boolean opts, :ascending, true, vector_order.size
  handle_nils = sort_coerce_boolean opts, :handle_nils, false, vector_order.size

  by_blocks   = vector_order.map { |v| (opts[:by] || {})[v] }
  vector_locs = vector_order.map { |v| @vectors[v] }

  lambda do |index1, index2|
    # Build left and right array to compare two rows
    left  = sort_build_row(vector_locs, by_blocks, ascending, handle_nils, index1, index2)
    right = sort_build_row(vector_locs, by_blocks, ascending, handle_nils, index2, index1)

    # Resolve conflict by Index if all attributes are same
    (left << index1) <=> (right << index2)
  end
end
|
2387
|
+
|
2388
|
+
# Formats one verification failure as
# "<i+1> [<row[id]>]: <description>", followed by " (field=value, ...)"
# when the test lists any fields. `test` is read as [description, fields, ...].
def verify_error_message row, test, id, i
  description, fields, = test

  values = ''
  unless fields.empty?
    pairs = fields.collect { |k| "#{k}=#{row[k]}" }
    values = ' (' + pairs.join(', ') + ')'
  end

  "#{i+1} [#{row[id]}]: #{description}#{values}"
end
|
2398
|
+
|
2399
|
+
# Resolves the list of value vectors for a pivot. With no :values option
# it is every numeric vector not used as pivot index or pivot vector; an
# Array option is used as given; any other option is wrapped in an Array.
def prepare_pivot_values index, vectors, opts
  requested = opts[:values]

  # values not specified at all.
  return (@vectors.to_a - (index | vectors)) & numeric_vector_names if requested.nil?
  # multiple values specified.
  return requested if requested.is_a?(Array)

  # single value specified.
  [requested]
end
|
2409
|
+
|
2410
|
+
# Builds the nested hash behind a pivot table: one sub-hash per group,
# keyed by [value_name, *pivot_vector_values] and collecting the matching
# cell values. The collected arrays are then reduced in place by
# setup_pivot_aggregates, and the outer hash is returned.
def make_pivot_hash grouped, vectors, values, aggregate_function
  super_hash = grouped.groups.map { |group_name, _| [group_name, {}] }.to_h

  values.each do |value|
    grouped.groups.each do |group_name, row_numbers|
      sub_hash = super_hash[group_name]

      row_numbers.each do |num|
        key = [value, *vectors.map { |v| self[v][num] }]
        (sub_hash[key] ||= []) << self[value][num]
      end
    end
  end

  setup_pivot_aggregates super_hash, aggregate_function
  super_hash
end
|
2427
|
+
|
2428
|
+
# Replaces, in place, every collected array in the nested pivot hash with
# the result of sending aggregate_function to a Daru::Vector built from it.
def setup_pivot_aggregates super_hash, aggregate_function
  super_hash.each_value do |sub_hash|
    sub_hash.each do |key, collected|
      aggregated = Daru::Vector.new(collected).send(aggregate_function)
      sub_hash[key] = aggregated
    end
  end
end
|
2435
|
+
|
2436
|
+
# Turns the nested pivot hash into a DataFrame: outer keys become a
# MultiIndex of rows, the distinct inner keys become a MultiIndex of
# vectors, and every aggregated value is written into its cell.
def pivot_dataframe super_hash
  df_index   = Daru::MultiIndex.from_tuples super_hash.keys
  df_vectors = Daru::MultiIndex.from_tuples super_hash.values.flat_map(&:keys).uniq

  pivoted = Daru::DataFrame.new({}, index: df_index, order: df_vectors)
  super_hash.each do |row_index, sub_h|
    sub_h.each { |vector_index, val| pivoted[vector_index][row_index] = val }
  end
  pivoted
end
|
2448
|
+
|
2449
|
+
# Extracts the variable names and numbers encoded in the vector names.
#
# `pattern` is turned into a regexp by replacing '%v' with a lazy capture
# of any characters and '%n' with a lazy capture of digits; every vector
# name is scanned with it.
#
# @param pattern [String] naming pattern containing %v and %n
# @return [Array(Array, Array)] unique variable names and the sorted
#   unique numbers; two empty arrays when no vector name matches
#   (previously this raised NoMethodError on nil)
def one_to_many_components pattern
  # NOTE(review): '(\d+?)' is lazy, so when %n ends the pattern only the
  # first digit of a multi-digit number appears to be captured - confirm.
  re = Regexp.new pattern.gsub('%v','(.+?)').gsub('%n','(\\d+?)')

  vars, numbers =
    @vectors
    .map { |v| v.scan(re) }
    .reject(&:empty?).flatten(1).transpose

  # transpose of an empty match list is [], leaving both locals nil.
  [Array(vars).uniq, Array(numbers).map(&:to_i).sort.uniq]
end
|
2459
|
+
|
2460
|
+
# Builds a hash from each variable name to the value `row` holds under the
# column name obtained by substituting the variable for the first '%v' and
# the number for the first '%n' in `pattern`.
def one_to_many_row row, number, vars, pattern
  vars.each_with_object({}) do |var, result|
    column = pattern.sub('%v', var).sub('%n', number.to_s)
    result[var] = row[column]
  end
end
|
2467
|
+
|
2468
|
+
# Raises IndexError when one of the positions is not a valid position,
# i.e. lies outside the range -size...size (negative positions count
# from the end).
# @param positions [Array<Integer>] positions to check
# @param size [Integer] number of elements being addressed (last argument)
# @return [Array<Integer>] the checked positions
# @raise [IndexError] on the first position that is out of range
def validate_positions *positions, size
  # `positions` comes from a splat and is therefore always an Array, so no
  # wrapping of a bare Integer is needed.
  positions.each do |pos|
    raise IndexError, "#{pos} is not a valid position." if pos >= size || pos < -size
  end
end
|
2475
|
+
|
2476
|
+
# Accepts a hash, an enumerable or a vector and aligns it so it can be
# added as a row: vectors and hashes are reindexed to @vectors, anything
# else is simply wrapped in a Daru::Vector.
def coerce_vector vector
  return vector.reindex(@vectors) if vector.is_a?(Daru::Vector)
  return Daru::Vector.new(vector).reindex(@vectors) if vector.is_a?(Hash)

  Daru::Vector.new vector
end
|
2487
|
+
|
2488
|
+
# Coerce ranges, integers and arrays of positions in appropriate ways.
# A single Integer is returned as-is, a single Range is expanded into the
# positions it selects out of 0...size, and several positions are returned
# as an Array.
# @param positions [Array] the positions; the last argument is `size`
# @param size [Integer] number of elements being addressed
# @raise [ArgumentError] when a single position is neither Integer nor Range
def coerce_positions *positions, size
  return positions unless positions.size == 1

  position = positions.first
  case position
  when Integer
    position
  when Range
    size.times.to_a[position]
  else
    raise ArgumentError, 'Unknown position type.'
  end
end
|
2412
2503
|
end
|
2413
2504
|
end
|