daru_lite 0.1 → 0.1.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.github/ISSUE_TEMPLATE/bug_report.md +38 -0
- data/.github/ISSUE_TEMPLATE/feature_request.md +20 -0
- data/.github/workflows/ci.yml +20 -0
- data/.rubocop_todo.yml +35 -33
- data/README.md +19 -115
- data/daru_lite.gemspec +1 -0
- data/lib/daru_lite/data_frame/aggregatable.rb +165 -0
- data/lib/daru_lite/data_frame/calculatable.rb +140 -0
- data/lib/daru_lite/data_frame/convertible.rb +107 -0
- data/lib/daru_lite/data_frame/duplicatable.rb +64 -0
- data/lib/daru_lite/data_frame/fetchable.rb +301 -0
- data/lib/daru_lite/data_frame/filterable.rb +144 -0
- data/lib/daru_lite/data_frame/i_o_able.rb +179 -0
- data/lib/daru_lite/data_frame/indexable.rb +168 -0
- data/lib/daru_lite/data_frame/iterable.rb +339 -0
- data/lib/daru_lite/data_frame/joinable.rb +152 -0
- data/lib/daru_lite/data_frame/missable.rb +75 -0
- data/lib/daru_lite/data_frame/pivotable.rb +108 -0
- data/lib/daru_lite/data_frame/queryable.rb +67 -0
- data/lib/daru_lite/data_frame/setable.rb +109 -0
- data/lib/daru_lite/data_frame/sortable.rb +241 -0
- data/lib/daru_lite/dataframe.rb +142 -2355
- data/lib/daru_lite/index/index.rb +13 -0
- data/lib/daru_lite/maths/statistics/vector.rb +1 -1
- data/lib/daru_lite/vector/aggregatable.rb +9 -0
- data/lib/daru_lite/vector/calculatable.rb +78 -0
- data/lib/daru_lite/vector/convertible.rb +77 -0
- data/lib/daru_lite/vector/duplicatable.rb +17 -0
- data/lib/daru_lite/vector/fetchable.rb +175 -0
- data/lib/daru_lite/vector/filterable.rb +128 -0
- data/lib/daru_lite/vector/indexable.rb +77 -0
- data/lib/daru_lite/vector/iterable.rb +95 -0
- data/lib/daru_lite/vector/joinable.rb +17 -0
- data/lib/daru_lite/vector/missable.rb +124 -0
- data/lib/daru_lite/vector/queryable.rb +45 -0
- data/lib/daru_lite/vector/setable.rb +47 -0
- data/lib/daru_lite/vector/sortable.rb +113 -0
- data/lib/daru_lite/vector.rb +36 -932
- data/lib/daru_lite/version.rb +1 -1
- data/spec/data_frame/aggregatable_example.rb +65 -0
- data/spec/data_frame/buildable_example.rb +109 -0
- data/spec/data_frame/calculatable_example.rb +135 -0
- data/spec/data_frame/convertible_example.rb +180 -0
- data/spec/data_frame/duplicatable_example.rb +111 -0
- data/spec/data_frame/fetchable_example.rb +476 -0
- data/spec/data_frame/filterable_example.rb +250 -0
- data/spec/data_frame/indexable_example.rb +221 -0
- data/spec/data_frame/iterable_example.rb +465 -0
- data/spec/data_frame/joinable_example.rb +106 -0
- data/spec/data_frame/missable_example.rb +47 -0
- data/spec/data_frame/pivotable_example.rb +297 -0
- data/spec/data_frame/queryable_example.rb +92 -0
- data/spec/data_frame/setable_example.rb +482 -0
- data/spec/data_frame/sortable_example.rb +350 -0
- data/spec/dataframe_spec.rb +181 -3243
- data/spec/index/index_spec.rb +8 -0
- data/spec/vector/aggregatable_example.rb +27 -0
- data/spec/vector/calculatable_example.rb +82 -0
- data/spec/vector/convertible_example.rb +126 -0
- data/spec/vector/duplicatable_example.rb +48 -0
- data/spec/vector/fetchable_example.rb +463 -0
- data/spec/vector/filterable_example.rb +165 -0
- data/spec/vector/indexable_example.rb +201 -0
- data/spec/vector/iterable_example.rb +111 -0
- data/spec/vector/joinable_example.rb +25 -0
- data/spec/vector/missable_example.rb +88 -0
- data/spec/vector/queryable_example.rb +91 -0
- data/spec/vector/setable_example.rb +300 -0
- data/spec/vector/sortable_example.rb +242 -0
- data/spec/vector_spec.rb +111 -1805
- metadata +102 -3
- data/.github/ISSUE_TEMPLATE.md +0 -18
data/lib/daru_lite/dataframe.rb
CHANGED
@@ -1,10 +1,40 @@
|
|
1
1
|
require 'daru_lite/accessors/dataframe_by_row'
|
2
|
+
require 'daru_lite/data_frame/aggregatable'
|
3
|
+
require 'daru_lite/data_frame/calculatable'
|
4
|
+
require 'daru_lite/data_frame/convertible'
|
5
|
+
require 'daru_lite/data_frame/duplicatable'
|
6
|
+
require 'daru_lite/data_frame/fetchable'
|
7
|
+
require 'daru_lite/data_frame/filterable'
|
8
|
+
require 'daru_lite/data_frame/indexable'
|
9
|
+
require 'daru_lite/data_frame/i_o_able'
|
10
|
+
require 'daru_lite/data_frame/iterable'
|
11
|
+
require 'daru_lite/data_frame/joinable'
|
12
|
+
require 'daru_lite/data_frame/missable'
|
13
|
+
require 'daru_lite/data_frame/pivotable'
|
14
|
+
require 'daru_lite/data_frame/setable'
|
15
|
+
require 'daru_lite/data_frame/sortable'
|
16
|
+
require 'daru_lite/data_frame/queryable'
|
2
17
|
require 'daru_lite/maths/arithmetic/dataframe'
|
3
18
|
require 'daru_lite/maths/statistics/dataframe'
|
4
19
|
require 'daru_lite/io/io'
|
5
20
|
|
6
21
|
module DaruLite
|
7
22
|
class DataFrame # rubocop:disable Metrics/ClassLength
|
23
|
+
include DaruLite::DataFrame::Aggregatable
|
24
|
+
include DaruLite::DataFrame::Calculatable
|
25
|
+
include DaruLite::DataFrame::Convertible
|
26
|
+
include DaruLite::DataFrame::Duplicatable
|
27
|
+
include DaruLite::DataFrame::Fetchable
|
28
|
+
include DaruLite::DataFrame::Filterable
|
29
|
+
include DaruLite::DataFrame::Indexable
|
30
|
+
include DaruLite::DataFrame::Iterable
|
31
|
+
include DaruLite::DataFrame::IOAble
|
32
|
+
include DaruLite::DataFrame::Joinable
|
33
|
+
include DaruLite::DataFrame::Missable
|
34
|
+
include DaruLite::DataFrame::Pivotable
|
35
|
+
include DaruLite::DataFrame::Setable
|
36
|
+
include DaruLite::DataFrame::Sortable
|
37
|
+
include DaruLite::DataFrame::Queryable
|
8
38
|
include DaruLite::Maths::Arithmetic::DataFrame
|
9
39
|
include DaruLite::Maths::Statistics::DataFrame
|
10
40
|
|
@@ -13,109 +43,6 @@ module DaruLite
|
|
13
43
|
extend Gem::Deprecate
|
14
44
|
|
15
45
|
class << self
|
16
|
-
# Load data from a CSV file. Specify an optional block to grab the CSV
|
17
|
-
# object and pre-condition it (for example use the `convert` or
|
18
|
-
# `header_convert` methods).
|
19
|
-
#
|
20
|
-
# == Arguments
|
21
|
-
#
|
22
|
-
# * path - Local path / Remote URL of the file to load specified as a String.
|
23
|
-
#
|
24
|
-
# == Options
|
25
|
-
#
|
26
|
-
# Accepts the same options as the DaruLite::DataFrame constructor and CSV.open()
|
27
|
-
# and uses those to eventually construct the resulting DataFrame.
|
28
|
-
#
|
29
|
-
# == Verbose Description
|
30
|
-
#
|
31
|
-
# You can specify all the options to the `.from_csv` function that you
|
32
|
-
# do to the Ruby `CSV.read()` function, since this is what is used internally.
|
33
|
-
#
|
34
|
-
# For example, if the columns in your CSV file are separated by something
|
35
|
-
# other that commas, you can use the `:col_sep` option. If you want to
|
36
|
-
# convert numeric values to numbers and not keep them as strings, you can
|
37
|
-
# use the `:converters` option and set it to `:numeric`.
|
38
|
-
#
|
39
|
-
# The `.from_csv` function uses the following defaults for reading CSV files
|
40
|
-
# (that are passed into the `CSV.read()` function):
|
41
|
-
#
|
42
|
-
# {
|
43
|
-
# :col_sep => ',',
|
44
|
-
# :converters => :numeric
|
45
|
-
# }
|
46
|
-
def from_csv(path, opts = {}, &block)
|
47
|
-
DaruLite::IO.from_csv path, opts, &block
|
48
|
-
end
|
49
|
-
|
50
|
-
# Read data from an Excel file into a DataFrame.
|
51
|
-
#
|
52
|
-
# == Arguments
|
53
|
-
#
|
54
|
-
# * path - Path of the file to be read.
|
55
|
-
#
|
56
|
-
# == Options
|
57
|
-
#
|
58
|
-
# *:worksheet_id - ID of the worksheet that is to be read.
|
59
|
-
def from_excel(path, opts = {}, &block)
|
60
|
-
DaruLite::IO.from_excel path, opts, &block
|
61
|
-
end
|
62
|
-
|
63
|
-
# Read a database query and returns a Dataset
|
64
|
-
#
|
65
|
-
# @param dbh [DBI::DatabaseHandle, String] A DBI connection OR Path to a SQlite3 database.
|
66
|
-
# @param query [String] The query to be executed
|
67
|
-
#
|
68
|
-
# @return A dataframe containing the data resulting from the query
|
69
|
-
#
|
70
|
-
# USE:
|
71
|
-
#
|
72
|
-
# dbh = DBI.connect("DBI:Mysql:database:localhost", "user", "password")
|
73
|
-
# DaruLite::DataFrame.from_sql(dbh, "SELECT * FROM test")
|
74
|
-
#
|
75
|
-
# #Alternatively
|
76
|
-
#
|
77
|
-
# require 'dbi'
|
78
|
-
# DaruLite::DataFrame.from_sql("path/to/sqlite.db", "SELECT * FROM test")
|
79
|
-
def from_sql(dbh, query)
|
80
|
-
DaruLite::IO.from_sql dbh, query
|
81
|
-
end
|
82
|
-
|
83
|
-
# Read a dataframe from AR::Relation
|
84
|
-
#
|
85
|
-
# @param relation [ActiveRecord::Relation] An AR::Relation object from which data is loaded
|
86
|
-
# @param fields [Array] Field names to be loaded (optional)
|
87
|
-
#
|
88
|
-
# @return A dataframe containing the data loaded from the relation
|
89
|
-
#
|
90
|
-
# USE:
|
91
|
-
#
|
92
|
-
# # When Post model is defined as:
|
93
|
-
# class Post < ActiveRecord::Base
|
94
|
-
# scope :active, -> { where.not(published_at: nil) }
|
95
|
-
# end
|
96
|
-
#
|
97
|
-
# # You can load active posts into a dataframe by:
|
98
|
-
# DaruLite::DataFrame.from_activerecord(Post.active, :title, :published_at)
|
99
|
-
def from_activerecord(relation, *fields)
|
100
|
-
DaruLite::IO.from_activerecord relation, *fields
|
101
|
-
end
|
102
|
-
|
103
|
-
# Read the database from a plaintext file. For this method to work,
|
104
|
-
# the data should be present in a plain text file in columns. See
|
105
|
-
# spec/fixtures/bank2.dat for an example.
|
106
|
-
#
|
107
|
-
# == Arguments
|
108
|
-
#
|
109
|
-
# * path - Path of the file to be read.
|
110
|
-
# * fields - Vector names of the resulting database.
|
111
|
-
#
|
112
|
-
# == Usage
|
113
|
-
#
|
114
|
-
# df = DaruLite::DataFrame.from_plaintext 'spec/fixtures/bank2.dat', [:v1,:v2,:v3,:v4,:v5,:v6]
|
115
|
-
def from_plaintext(path, fields)
|
116
|
-
DaruLite::IO.from_plaintext path, fields
|
117
|
-
end
|
118
|
-
|
119
46
|
# Create DataFrame by specifying rows as an Array of Arrays or Array of
|
120
47
|
# DaruLite::Vector objects.
|
121
48
|
def rows(source, opts = {})
|
@@ -316,179 +243,6 @@ module DaruLite
|
|
316
243
|
update
|
317
244
|
end
|
318
245
|
|
319
|
-
# Access row or vector. Specify name of row/vector followed by axis(:row, :vector).
|
320
|
-
# Defaults to *:vector*. Use of this method is not recommended for accessing
|
321
|
-
# rows. Use df.row[:a] for accessing row with index ':a'.
|
322
|
-
def [](*names)
|
323
|
-
axis = extract_axis(names, :vector)
|
324
|
-
dispatch_to_axis axis, :access, *names
|
325
|
-
end
|
326
|
-
|
327
|
-
# Retrive rows by positions
|
328
|
-
# @param [Array<Integer>] positions of rows to retrive
|
329
|
-
# @return [DaruLite::Vector, DaruLite::DataFrame] vector for single position and dataframe for multiple positions
|
330
|
-
# @example
|
331
|
-
# df = DaruLite::DataFrame.new({
|
332
|
-
# a: [1, 2, 3],
|
333
|
-
# b: ['a', 'b', 'c']
|
334
|
-
# })
|
335
|
-
# df.row_at 1, 2
|
336
|
-
# # => #<DaruLite::DataFrame(2x2)>
|
337
|
-
# # a b
|
338
|
-
# # 1 2 b
|
339
|
-
# # 2 3 c
|
340
|
-
def row_at(*positions)
|
341
|
-
original_positions = positions
|
342
|
-
positions = coerce_positions(*positions, nrows)
|
343
|
-
validate_positions(*positions, nrows)
|
344
|
-
|
345
|
-
if positions.is_a? Integer
|
346
|
-
row = get_rows_for([positions])
|
347
|
-
DaruLite::Vector.new row, index: @vectors
|
348
|
-
else
|
349
|
-
new_rows = get_rows_for(original_positions)
|
350
|
-
DaruLite::DataFrame.new new_rows, index: @index.at(*original_positions), order: @vectors
|
351
|
-
end
|
352
|
-
end
|
353
|
-
|
354
|
-
# Set rows by positions
|
355
|
-
# @param [Array<Integer>] positions positions of rows to set
|
356
|
-
# @param [Array, DaruLite::Vector] vector vector to be assigned
|
357
|
-
# @example
|
358
|
-
# df = DaruLite::DataFrame.new({
|
359
|
-
# a: [1, 2, 3],
|
360
|
-
# b: ['a', 'b', 'c']
|
361
|
-
# })
|
362
|
-
# df.set_row_at [0, 1], ['x', 'x']
|
363
|
-
# df
|
364
|
-
# #=> #<DaruLite::DataFrame(3x2)>
|
365
|
-
# # a b
|
366
|
-
# # 0 x x
|
367
|
-
# # 1 x x
|
368
|
-
# # 2 3 c
|
369
|
-
def set_row_at(positions, vector)
|
370
|
-
validate_positions(*positions, nrows)
|
371
|
-
vector =
|
372
|
-
if vector.is_a? DaruLite::Vector
|
373
|
-
vector.reindex @vectors
|
374
|
-
else
|
375
|
-
DaruLite::Vector.new vector
|
376
|
-
end
|
377
|
-
|
378
|
-
raise SizeError, 'Vector length should match row length' if
|
379
|
-
vector.size != @vectors.size
|
380
|
-
|
381
|
-
@data.each_with_index do |vec, pos|
|
382
|
-
vec.set_at(positions, vector.at(pos))
|
383
|
-
end
|
384
|
-
@index = @data[0].index
|
385
|
-
set_size
|
386
|
-
end
|
387
|
-
|
388
|
-
# Retrive vectors by positions
|
389
|
-
# @param [Array<Integer>] positions of vectors to retrive
|
390
|
-
# @return [DaruLite::Vector, DaruLite::DataFrame] vector for single position and dataframe for multiple positions
|
391
|
-
# @example
|
392
|
-
# df = DaruLite::DataFrame.new({
|
393
|
-
# a: [1, 2, 3],
|
394
|
-
# b: ['a', 'b', 'c']
|
395
|
-
# })
|
396
|
-
# df.at 0
|
397
|
-
# # => #<DaruLite::Vector(3)>
|
398
|
-
# # a
|
399
|
-
# # 0 1
|
400
|
-
# # 1 2
|
401
|
-
# # 2 3
|
402
|
-
def at(*positions)
|
403
|
-
if AXES.include? positions.last
|
404
|
-
axis = positions.pop
|
405
|
-
return row_at(*positions) if axis == :row
|
406
|
-
end
|
407
|
-
|
408
|
-
original_positions = positions
|
409
|
-
positions = coerce_positions(*positions, ncols)
|
410
|
-
validate_positions(*positions, ncols)
|
411
|
-
|
412
|
-
if positions.is_a? Integer
|
413
|
-
@data[positions].dup
|
414
|
-
else
|
415
|
-
DaruLite::DataFrame.new positions.map { |pos| @data[pos].dup },
|
416
|
-
index: @index,
|
417
|
-
order: @vectors.at(*original_positions),
|
418
|
-
name: @name
|
419
|
-
end
|
420
|
-
end
|
421
|
-
|
422
|
-
# Set vectors by positions
|
423
|
-
# @param [Array<Integer>] positions positions of vectors to set
|
424
|
-
# @param [Array, DaruLite::Vector] vector vector to be assigned
|
425
|
-
# @example
|
426
|
-
# df = DaruLite::DataFrame.new({
|
427
|
-
# a: [1, 2, 3],
|
428
|
-
# b: ['a', 'b', 'c']
|
429
|
-
# })
|
430
|
-
# df.set_at [0], ['x', 'y', 'z']
|
431
|
-
# df
|
432
|
-
# #=> #<DaruLite::DataFrame(3x2)>
|
433
|
-
# # a b
|
434
|
-
# # 0 x a
|
435
|
-
# # 1 y b
|
436
|
-
# # 2 z c
|
437
|
-
def set_at(positions, vector)
|
438
|
-
if positions.last == :row
|
439
|
-
positions.pop
|
440
|
-
return set_row_at(positions, vector)
|
441
|
-
end
|
442
|
-
|
443
|
-
validate_positions(*positions, ncols)
|
444
|
-
vector =
|
445
|
-
if vector.is_a? DaruLite::Vector
|
446
|
-
vector.reindex @index
|
447
|
-
else
|
448
|
-
DaruLite::Vector.new vector
|
449
|
-
end
|
450
|
-
|
451
|
-
raise SizeError, 'Vector length should match index length' if
|
452
|
-
vector.size != @index.size
|
453
|
-
|
454
|
-
positions.each { |pos| @data[pos] = vector }
|
455
|
-
end
|
456
|
-
|
457
|
-
# Insert a new row/vector of the specified name or modify a previous row.
|
458
|
-
# Instead of using this method directly, use df.row[:a] = [1,2,3] to set/create
|
459
|
-
# a row ':a' to [1,2,3], or df.vector[:vec] = [1,2,3] for vectors.
|
460
|
-
#
|
461
|
-
# In case a DaruLite::Vector is specified after the equality the sign, the indexes
|
462
|
-
# of the vector will be matched against the row/vector indexes of the DataFrame
|
463
|
-
# before an insertion is performed. Unmatched indexes will be set to nil.
|
464
|
-
def []=(*args)
|
465
|
-
vector = args.pop
|
466
|
-
axis = extract_axis(args)
|
467
|
-
names = args
|
468
|
-
|
469
|
-
dispatch_to_axis axis, :insert_or_modify, names, vector
|
470
|
-
end
|
471
|
-
|
472
|
-
def add_row(row, index = nil)
|
473
|
-
self.row[*(index || @size)] = row
|
474
|
-
end
|
475
|
-
|
476
|
-
def add_vector(n, vector)
|
477
|
-
self[n] = vector
|
478
|
-
end
|
479
|
-
|
480
|
-
def insert_vector(n, name, source)
|
481
|
-
raise ArgumentError unless source.is_a? Array
|
482
|
-
|
483
|
-
vector = DaruLite::Vector.new(source, index: @index, name: @name)
|
484
|
-
@data << vector
|
485
|
-
@vectors = @vectors.add name
|
486
|
-
ordr = @vectors.dup.to_a
|
487
|
-
elmnt = ordr.pop
|
488
|
-
ordr.insert n, elmnt
|
489
|
-
self.order = ordr
|
490
|
-
end
|
491
|
-
|
492
246
|
# Access a row or set/create a row. Refer #[] and #[]= docs for details.
|
493
247
|
#
|
494
248
|
# == Usage
|
@@ -498,1697 +252,177 @@ module DaruLite
|
|
498
252
|
DaruLite::Accessors::DataFrameByRow.new(self)
|
499
253
|
end
|
500
254
|
|
501
|
-
#
|
502
|
-
|
503
|
-
|
504
|
-
def get_sub_dataframe(keys, by_position: true)
|
505
|
-
return DaruLite::DataFrame.new({}) if keys == []
|
506
|
-
|
507
|
-
keys = @index.pos(*keys) unless by_position
|
508
|
-
|
509
|
-
sub_df = row_at(*keys)
|
510
|
-
sub_df = sub_df.to_df.transpose if sub_df.is_a?(DaruLite::Vector)
|
511
|
-
|
512
|
-
sub_df
|
513
|
-
end
|
514
|
-
|
515
|
-
# Duplicate the DataFrame entirely.
|
516
|
-
#
|
517
|
-
# == Arguments
|
518
|
-
#
|
519
|
-
# * +vectors_to_dup+ - An Array specifying the names of Vectors to
|
520
|
-
# be duplicated. Will duplicate the entire DataFrame if not specified.
|
521
|
-
def dup(vectors_to_dup = nil)
|
522
|
-
vectors_to_dup ||= @vectors.to_a
|
523
|
-
|
524
|
-
src = vectors_to_dup.map { |vec| @data[@vectors.pos(vec)].dup }
|
525
|
-
new_order = DaruLite::Index.new(vectors_to_dup)
|
526
|
-
|
527
|
-
DaruLite::DataFrame.new src, order: new_order, index: @index.dup, name: @name, clone: true
|
528
|
-
end
|
529
|
-
|
530
|
-
# Only clone the structure of the DataFrame.
|
531
|
-
def clone_structure
|
532
|
-
DaruLite::DataFrame.new([], order: @vectors.dup, index: @index.dup, name: @name)
|
533
|
-
end
|
534
|
-
|
535
|
-
# Returns a 'view' of the DataFrame, i.e the object ID's of vectors are
|
536
|
-
# preserved.
|
537
|
-
#
|
538
|
-
# == Arguments
|
539
|
-
#
|
540
|
-
# +vectors_to_clone+ - Names of vectors to clone. Optional. Will return
|
541
|
-
# a view of the whole data frame otherwise.
|
542
|
-
def clone(*vectors_to_clone)
|
543
|
-
vectors_to_clone.flatten! if ArrayHelper.array_of?(vectors_to_clone, Array)
|
544
|
-
vectors_to_clone = @vectors.to_a if vectors_to_clone.empty?
|
545
|
-
|
546
|
-
h = vectors_to_clone.map { |vec| [vec, self[vec]] }.to_h
|
547
|
-
DaruLite::DataFrame.new(h, clone: false, order: vectors_to_clone, name: @name)
|
548
|
-
end
|
549
|
-
|
550
|
-
# Returns a 'shallow' copy of DataFrame if missing data is not present,
|
551
|
-
# or a full copy of only valid data if missing data is present.
|
552
|
-
def clone_only_valid
|
553
|
-
if include_values?(*DaruLite::MISSING_VALUES)
|
554
|
-
reject_values(*DaruLite::MISSING_VALUES)
|
555
|
-
else
|
556
|
-
clone
|
557
|
-
end
|
558
|
-
end
|
559
|
-
|
560
|
-
# Creates a new duplicate dataframe containing only rows
|
561
|
-
# without a single missing value.
|
562
|
-
def dup_only_valid(vecs = nil)
|
563
|
-
rows_with_nil = @data.map { |vec| vec.indexes(*DaruLite::MISSING_VALUES) }
|
564
|
-
.inject(&:concat)
|
565
|
-
.uniq
|
566
|
-
|
567
|
-
row_indexes = @index.to_a
|
568
|
-
(vecs.nil? ? self : dup(vecs)).row[*(row_indexes - rows_with_nil)]
|
569
|
-
end
|
570
|
-
deprecate :dup_only_valid, :reject_values, 2016, 10
|
571
|
-
|
572
|
-
# Returns a dataframe in which rows with any of the mentioned values
|
573
|
-
# are ignored.
|
574
|
-
# @param [Array] values to reject to form the new dataframe
|
575
|
-
# @return [DaruLite::DataFrame] Data Frame with only rows which doesn't
|
576
|
-
# contain the mentioned values
|
577
|
-
# @example
|
578
|
-
# df = DaruLite::DataFrame.new({
|
579
|
-
# a: [1, 2, 3, nil, Float::NAN, nil, 1, 7],
|
580
|
-
# b: [:a, :b, nil, Float::NAN, nil, 3, 5, 8],
|
581
|
-
# c: ['a', Float::NAN, 3, 4, 3, 5, nil, 7]
|
582
|
-
# }, index: 11..18)
|
583
|
-
# df.reject_values nil, Float::NAN
|
584
|
-
# # => #<DaruLite::DataFrame(2x3)>
|
585
|
-
# # a b c
|
586
|
-
# # 11 1 a a
|
587
|
-
# # 18 7 8 7
|
588
|
-
def reject_values(*values)
|
589
|
-
positions =
|
590
|
-
size.times.to_a - @data.flat_map { |vec| vec.positions(*values) }
|
591
|
-
# Handle the case when positions size is 1 and #row_at wouldn't return a df
|
592
|
-
if positions.size == 1
|
593
|
-
pos = positions.first
|
594
|
-
row_at(pos..pos)
|
595
|
-
else
|
596
|
-
row_at(*positions)
|
597
|
-
end
|
598
|
-
end
|
599
|
-
|
600
|
-
# Replace specified values with given value
|
601
|
-
# @param [Array] old_values values to replace with new value
|
602
|
-
# @param [object] new_value new value to replace with
|
603
|
-
# @return [DaruLite::DataFrame] Data Frame itself with old values replace
|
604
|
-
# with new value
|
605
|
-
# @example
|
606
|
-
# df = DaruLite::DataFrame.new({
|
607
|
-
# a: [1, 2, 3, nil, Float::NAN, nil, 1, 7],
|
608
|
-
# b: [:a, :b, nil, Float::NAN, nil, 3, 5, 8],
|
609
|
-
# c: ['a', Float::NAN, 3, 4, 3, 5, nil, 7]
|
610
|
-
# }, index: 11..18)
|
611
|
-
# df.replace_values nil, Float::NAN
|
612
|
-
# # => #<DaruLite::DataFrame(8x3)>
|
613
|
-
# # a b c
|
614
|
-
# # 11 1 a a
|
615
|
-
# # 12 2 b NaN
|
616
|
-
# # 13 3 NaN 3
|
617
|
-
# # 14 NaN NaN 4
|
618
|
-
# # 15 NaN NaN 3
|
619
|
-
# # 16 NaN 3 5
|
620
|
-
# # 17 1 5 NaN
|
621
|
-
# # 18 7 8 7
|
622
|
-
def replace_values(old_values, new_value)
|
623
|
-
@data.each { |vec| vec.replace_values old_values, new_value }
|
624
|
-
self
|
625
|
-
end
|
626
|
-
|
627
|
-
# Rolling fillna
|
628
|
-
# replace all Float::NAN and NIL values with the preceeding or following value
|
629
|
-
#
|
630
|
-
# @param direction [Symbol] (:forward, :backward) whether replacement value is preceeding or following
|
631
|
-
#
|
632
|
-
# @example
|
633
|
-
# df = DaruLite::DataFrame.new({
|
634
|
-
# a: [1, 2, 3, nil, Float::NAN, nil, 1, 7],
|
635
|
-
# b: [:a, :b, nil, Float::NAN, nil, 3, 5, nil],
|
636
|
-
# c: ['a', Float::NAN, 3, 4, 3, 5, nil, 7]
|
637
|
-
# })
|
638
|
-
#
|
639
|
-
# => #<DaruLite::DataFrame(8x3)>
|
640
|
-
# a b c
|
641
|
-
# 0 1 a a
|
642
|
-
# 1 2 b NaN
|
643
|
-
# 2 3 nil 3
|
644
|
-
# 3 nil NaN 4
|
645
|
-
# 4 NaN nil 3
|
646
|
-
# 5 nil 3 5
|
647
|
-
# 6 1 5 nil
|
648
|
-
# 7 7 nil 7
|
649
|
-
#
|
650
|
-
# 2.3.3 :068 > df.rolling_fillna(:forward)
|
651
|
-
# => #<DaruLite::DataFrame(8x3)>
|
652
|
-
# a b c
|
653
|
-
# 0 1 a a
|
654
|
-
# 1 2 b a
|
655
|
-
# 2 3 b 3
|
656
|
-
# 3 3 b 4
|
657
|
-
# 4 3 b 3
|
658
|
-
# 5 3 3 5
|
659
|
-
# 6 1 5 5
|
660
|
-
# 7 7 5 7
|
661
|
-
#
|
662
|
-
def rolling_fillna!(direction = :forward)
|
663
|
-
@data.each { |vec| vec.rolling_fillna!(direction) }
|
664
|
-
self
|
665
|
-
end
|
666
|
-
|
667
|
-
def rolling_fillna(direction = :forward)
|
668
|
-
dup.rolling_fillna!(direction)
|
669
|
-
end
|
670
|
-
|
671
|
-
# Return unique rows by vector specified or all vectors
|
672
|
-
#
|
673
|
-
# @param vtrs [String][Symbol] vector names(s) that should be considered
|
674
|
-
#
|
675
|
-
# @example
|
676
|
-
#
|
677
|
-
# => #<DaruLite::DataFrame(6x2)>
|
678
|
-
# a b
|
679
|
-
# 0 1 a
|
680
|
-
# 1 2 b
|
681
|
-
# 2 3 c
|
682
|
-
# 3 4 d
|
683
|
-
# 2 3 c
|
684
|
-
# 3 4 f
|
685
|
-
#
|
686
|
-
# 2.3.3 :> df.unique
|
687
|
-
# => #<DaruLite::DataFrame(5x2)>
|
688
|
-
# a b
|
689
|
-
# 0 1 a
|
690
|
-
# 1 2 b
|
691
|
-
# 2 3 c
|
692
|
-
# 3 4 d
|
693
|
-
# 3 4 f
|
694
|
-
#
|
695
|
-
# 2.3.3 :> df.unique(:a)
|
696
|
-
# => #<DaruLite::DataFrame(5x2)>
|
697
|
-
# a b
|
698
|
-
# 0 1 a
|
699
|
-
# 1 2 b
|
700
|
-
# 2 3 c
|
701
|
-
# 3 4 d
|
702
|
-
#
|
703
|
-
def uniq(*vtrs)
|
704
|
-
vecs = vtrs.empty? ? vectors.to_a : Array(vtrs)
|
705
|
-
grouped = group_by(vecs)
|
706
|
-
indexes = grouped.groups.values.map { |v| v[0] }.sort
|
707
|
-
row[*indexes]
|
708
|
-
end
|
709
|
-
|
710
|
-
# Iterate over each index of the DataFrame.
|
711
|
-
def each_index(&block)
|
712
|
-
return to_enum(:each_index) unless block
|
713
|
-
|
714
|
-
@index.each(&block)
|
715
|
-
|
716
|
-
self
|
717
|
-
end
|
718
|
-
|
719
|
-
# Iterate over each vector
|
720
|
-
def each_vector(&block)
|
721
|
-
return to_enum(:each_vector) unless block
|
255
|
+
# Delete a vector
|
256
|
+
def delete_vector(vector)
|
257
|
+
raise IndexError, "Vector #{vector} does not exist." unless @vectors.include?(vector)
|
722
258
|
|
723
|
-
@data.
|
259
|
+
@data.delete_at @vectors[vector]
|
260
|
+
@vectors = DaruLite::Index.new @vectors.to_a - [vector]
|
724
261
|
|
725
262
|
self
|
726
263
|
end
|
727
264
|
|
728
|
-
|
729
|
-
|
730
|
-
|
731
|
-
def each_vector_with_index
|
732
|
-
return to_enum(:each_vector_with_index) unless block_given?
|
733
|
-
|
734
|
-
@vectors.each do |vector|
|
735
|
-
yield @data[@vectors[vector]], vector
|
736
|
-
end
|
265
|
+
# Deletes a list of vectors
|
266
|
+
def delete_vectors(*vectors)
|
267
|
+
Array(vectors).each { |vec| delete_vector vec }
|
737
268
|
|
738
269
|
self
|
739
270
|
end
|
740
271
|
|
741
|
-
|
742
|
-
|
743
|
-
|
744
|
-
def each_row
|
745
|
-
return to_enum(:each_row) unless block_given?
|
746
|
-
|
747
|
-
@index.size.times do |pos|
|
748
|
-
yield row_at(pos)
|
749
|
-
end
|
750
|
-
|
751
|
-
self
|
752
|
-
end
|
272
|
+
# Delete a row
|
273
|
+
def delete_row(index)
|
274
|
+
idx = named_index_for index
|
753
275
|
|
754
|
-
|
755
|
-
return to_enum(:each_row_with_index) unless block_given?
|
276
|
+
raise IndexError, "Index #{index} does not exist." unless @index.include? idx
|
756
277
|
|
757
|
-
@index.
|
758
|
-
|
278
|
+
@index = DaruLite::Index.new(@index.to_a - [idx])
|
279
|
+
each_vector do |vector|
|
280
|
+
vector.delete_at idx
|
759
281
|
end
|
760
282
|
|
761
|
-
|
762
|
-
end
|
763
|
-
|
764
|
-
# Iterate over each row or vector of the DataFrame. Specify axis
|
765
|
-
# by passing :vector or :row as the argument. Default to :vector.
|
766
|
-
#
|
767
|
-
# == Description
|
768
|
-
#
|
769
|
-
# `#each` works exactly like Array#each. The default mode for `each`
|
770
|
-
# is to iterate over the columns of the DataFrame. To iterate over
|
771
|
-
# rows you must pass the axis, i.e `:row` as an argument.
|
772
|
-
#
|
773
|
-
# == Arguments
|
774
|
-
#
|
775
|
-
# * +axis+ - The axis to iterate over. Can be :vector (or :column)
|
776
|
-
# or :row. Default to :vector.
|
777
|
-
def each(axis = :vector, &block)
|
778
|
-
dispatch_to_axis axis, :each, &block
|
779
|
-
end
|
780
|
-
|
781
|
-
# Iterate over a row or vector and return results in a DaruLite::Vector.
|
782
|
-
# Specify axis with :vector or :row. Default to :vector.
|
783
|
-
#
|
784
|
-
# == Description
|
785
|
-
#
|
786
|
-
# The #collect iterator works similar to #map, the only difference
|
787
|
-
# being that it returns a DaruLite::Vector comprising of the results of
|
788
|
-
# each block run. The resultant Vector has the same index as that
|
789
|
-
# of the axis over which collect has iterated. It also accepts the
|
790
|
-
# optional axis argument.
|
791
|
-
#
|
792
|
-
# == Arguments
|
793
|
-
#
|
794
|
-
# * +axis+ - The axis to iterate over. Can be :vector (or :column)
|
795
|
-
# or :row. Default to :vector.
|
796
|
-
def collect(axis = :vector, &block)
|
797
|
-
dispatch_to_axis_pl axis, :collect, &block
|
283
|
+
set_size
|
798
284
|
end
|
799
285
|
|
800
|
-
#
|
801
|
-
#
|
802
|
-
#
|
803
|
-
|
804
|
-
|
805
|
-
# == Description
|
806
|
-
#
|
807
|
-
# The #map iterator works like Array#map. The value returned by
|
808
|
-
# each run of the block is added to an Array and the Array is
|
809
|
-
# returned. This method also accepts an axis argument, like #each.
|
810
|
-
# The default is :vector.
|
811
|
-
#
|
812
|
-
# == Arguments
|
813
|
-
#
|
814
|
-
# * +axis+ - The axis to map over. Can be :vector (or :column) or :row.
|
815
|
-
# Default to :vector.
|
816
|
-
def map(axis = :vector, &block)
|
817
|
-
dispatch_to_axis_pl axis, :map, &block
|
818
|
-
end
|
286
|
+
# Delete a row based on its position
|
287
|
+
# More robust than #delete_row when working with a CategoricalIndex or when the
|
288
|
+
# Index includes integers
|
289
|
+
def delete_at_position(position)
|
290
|
+
raise IndexError, "Position #{position} does not exist." unless position < size
|
819
291
|
|
820
|
-
|
821
|
-
|
822
|
-
# as the argument. Default to :vector.
|
823
|
-
#
|
824
|
-
# == Arguments
|
825
|
-
#
|
826
|
-
# * +axis+ - The axis to map over. Can be :vector (or :column) or :row.
|
827
|
-
# Default to :vector.
|
828
|
-
def map!(axis = :vector, &block)
|
829
|
-
if %i[vector column].include?(axis)
|
830
|
-
map_vectors!(&block)
|
831
|
-
elsif axis == :row
|
832
|
-
map_rows!(&block)
|
833
|
-
end
|
834
|
-
end
|
292
|
+
each_vector { |vector| vector.delete_at_position(position) }
|
293
|
+
@index = @index.delete_at(position)
|
835
294
|
|
836
|
-
|
837
|
-
# block must return a DaruLite::Vector object. You can specify the axis
|
838
|
-
# to map over. Default to :vector.
|
839
|
-
#
|
840
|
-
# == Description
|
841
|
-
#
|
842
|
-
# Recode works similarly to #map, but an important difference between
|
843
|
-
# the two is that recode returns a modified DaruLite::DataFrame instead
|
844
|
-
# of an Array. For this reason, #recode expects that every run of the
|
845
|
-
# block to return a DaruLite::Vector.
|
846
|
-
#
|
847
|
-
# Just like map and each, recode also accepts an optional _axis_ argument.
|
848
|
-
#
|
849
|
-
# == Arguments
|
850
|
-
#
|
851
|
-
# * +axis+ - The axis to map over. Can be :vector (or :column) or :row.
|
852
|
-
# Default to :vector.
|
853
|
-
def recode(axis = :vector, &block)
|
854
|
-
dispatch_to_axis_pl axis, :recode, &block
|
295
|
+
set_size
|
855
296
|
end
|
856
297
|
|
857
|
-
#
|
858
|
-
#
|
859
|
-
# == Description
|
860
|
-
#
|
861
|
-
# For filtering out certain rows/vectors based on their values,
|
862
|
-
# use the #filter method. By default it iterates over vectors and
|
863
|
-
# keeps those vectors for which the block returns true. It accepts
|
864
|
-
# an optional axis argument which lets you specify whether you want
|
865
|
-
# to iterate over vectors or rows.
|
866
|
-
#
|
867
|
-
# == Arguments
|
868
|
-
#
|
869
|
-
# * +axis+ - The axis to map over. Can be :vector (or :column) or :row.
|
870
|
-
# Default to :vector.
|
871
|
-
#
|
872
|
-
# == Usage
|
873
|
-
#
|
874
|
-
# # Filter vectors
|
875
|
-
#
|
876
|
-
# df.filter do |vector|
|
877
|
-
# vector.type == :numeric and vector.median < 50
|
878
|
-
# end
|
879
|
-
#
|
880
|
-
# # Filter rows
|
298
|
+
# Creates a DataFrame with the random data, of n size.
|
299
|
+
# If n not given, uses original number of rows.
|
881
300
|
#
|
882
|
-
#
|
883
|
-
|
884
|
-
|
885
|
-
|
886
|
-
|
887
|
-
|
888
|
-
|
889
|
-
def recode_vectors
|
890
|
-
block_given? or return to_enum(:recode_vectors)
|
891
|
-
|
892
|
-
dup.tap do |df|
|
893
|
-
df.each_vector_with_index do |v, i|
|
894
|
-
df[*i] = should_be_vector!(yield(v))
|
895
|
-
end
|
896
|
-
end
|
897
|
-
end
|
898
|
-
|
899
|
-
def recode_rows
|
900
|
-
block_given? or return to_enum(:recode_rows)
|
901
|
-
|
902
|
-
dup.tap do |df|
|
903
|
-
df.each_row_with_index do |r, i|
|
904
|
-
df.row[i] = should_be_vector!(yield(r))
|
301
|
+
# @return {DaruLite::DataFrame}
|
302
|
+
def bootstrap(n = nil)
|
303
|
+
n ||= nrows
|
304
|
+
DaruLite::DataFrame.new({}, order: @vectors).tap do |df_boot|
|
305
|
+
n.times do
|
306
|
+
df_boot.add_row(row[rand(n)])
|
905
307
|
end
|
308
|
+
df_boot.update
|
906
309
|
end
|
907
310
|
end
|
908
311
|
|
909
|
-
# Map each vector and return an Array.
|
910
|
-
def map_vectors(&block)
|
911
|
-
return to_enum(:map_vectors) unless block
|
912
|
-
|
913
|
-
@data.map(&block)
|
914
|
-
end
|
915
|
-
|
916
|
-
# Destructive form of #map_vectors
|
917
|
-
def map_vectors!
|
918
|
-
return to_enum(:map_vectors!) unless block_given?
|
919
|
-
|
920
|
-
vectors.dup.each do |n|
|
921
|
-
self[n] = should_be_vector!(yield(self[n]))
|
922
|
-
end
|
923
|
-
|
924
|
-
self
|
925
|
-
end
|
926
|
-
|
927
|
-
# Map vectors alongwith the index.
|
928
|
-
def map_vectors_with_index(&block)
|
929
|
-
return to_enum(:map_vectors_with_index) unless block
|
930
|
-
|
931
|
-
each_vector_with_index.map(&block)
|
932
|
-
end
|
933
|
-
|
934
|
-
# Map each row
|
935
|
-
def map_rows(&block)
|
936
|
-
return to_enum(:map_rows) unless block
|
937
|
-
|
938
|
-
each_row.map(&block)
|
939
|
-
end
|
940
|
-
|
941
|
-
def map_rows_with_index(&block)
|
942
|
-
return to_enum(:map_rows_with_index) unless block
|
943
|
-
|
944
|
-
each_row_with_index.map(&block)
|
945
|
-
end
|
946
|
-
|
947
|
-
def map_rows!
|
948
|
-
return to_enum(:map_rows!) unless block_given?
|
949
|
-
|
950
|
-
index.dup.each do |i|
|
951
|
-
row[i] = should_be_vector!(yield(row[i]))
|
952
|
-
end
|
953
|
-
|
954
|
-
self
|
955
|
-
end
|
956
|
-
|
957
|
-
def apply_method(method, keys: nil, by_position: true)
|
958
|
-
df = keys ? get_sub_dataframe(keys, by_position: by_position) : self
|
959
|
-
|
960
|
-
case method
|
961
|
-
when Symbol then df.send(method)
|
962
|
-
when Proc then method.call(df)
|
963
|
-
when Array then method.map(&:to_proc).map { |proc| proc.call(df) } # works with Array of both Symbol and/or Proc
|
964
|
-
else raise
|
965
|
-
end
|
966
|
-
end
|
967
|
-
alias apply_method_on_sub_df apply_method
|
968
|
-
|
969
|
-
# Retrieves a DaruLite::Vector, based on the result of calculation
|
970
|
-
# performed on each row.
|
971
|
-
def collect_rows(&block)
|
972
|
-
return to_enum(:collect_rows) unless block
|
973
|
-
|
974
|
-
DaruLite::Vector.new(each_row.map(&block), index: @index)
|
975
|
-
end
|
976
|
-
|
977
|
-
def collect_row_with_index(&block)
|
978
|
-
return to_enum(:collect_row_with_index) unless block
|
979
|
-
|
980
|
-
DaruLite::Vector.new(each_row_with_index.map(&block), index: @index)
|
981
|
-
end
|
982
|
-
|
983
|
-
# Retrieves a DaruLite::Vector, based on the result of calculation
|
984
|
-
# performed on each vector.
|
985
|
-
def collect_vectors(&block)
|
986
|
-
return to_enum(:collect_vectors) unless block
|
987
|
-
|
988
|
-
DaruLite::Vector.new(each_vector.map(&block), index: @vectors)
|
989
|
-
end
|
990
|
-
|
991
|
-
def collect_vector_with_index(&block)
|
992
|
-
return to_enum(:collect_vector_with_index) unless block
|
993
|
-
|
994
|
-
DaruLite::Vector.new(each_vector_with_index.map(&block), index: @vectors)
|
995
|
-
end
|
996
|
-
|
997
|
-
# Generate a matrix, based on vector names of the DataFrame.
|
998
|
-
#
|
999
|
-
# @return {::Matrix}
|
1000
|
-
# :nocov:
|
1001
|
-
# FIXME: Even not trying to cover this: I can't get, how it is expected
|
1002
|
-
# to work.... -- zverok
|
1003
|
-
def collect_matrix
|
1004
|
-
return to_enum(:collect_matrix) unless block_given?
|
1005
|
-
|
1006
|
-
vecs = vectors.to_a
|
1007
|
-
rows = vecs.collect do |row|
|
1008
|
-
vecs.collect do |col|
|
1009
|
-
yield row, col
|
1010
|
-
end
|
1011
|
-
end
|
1012
|
-
|
1013
|
-
Matrix.rows(rows)
|
1014
|
-
end
|
1015
|
-
# :nocov:
|
1016
|
-
|
1017
|
-
# Delete a vector
|
1018
|
-
def delete_vector(vector)
|
1019
|
-
raise IndexError, "Vector #{vector} does not exist." unless @vectors.include?(vector)
|
1020
|
-
|
1021
|
-
@data.delete_at @vectors[vector]
|
1022
|
-
@vectors = DaruLite::Index.new @vectors.to_a - [vector]
|
1023
|
-
|
1024
|
-
self
|
1025
|
-
end
|
1026
|
-
|
1027
|
-
# Deletes a list of vectors
|
1028
|
-
def delete_vectors(*vectors)
|
1029
|
-
Array(vectors).each { |vec| delete_vector vec }
|
1030
|
-
|
1031
|
-
self
|
1032
|
-
end
|
1033
|
-
|
1034
|
-
# Delete a row
|
1035
|
-
def delete_row(index)
|
1036
|
-
idx = named_index_for index
|
1037
|
-
|
1038
|
-
raise IndexError, "Index #{index} does not exist." unless @index.include? idx
|
1039
|
-
|
1040
|
-
@index = DaruLite::Index.new(@index.to_a - [idx])
|
1041
|
-
each_vector do |vector|
|
1042
|
-
vector.delete_at idx
|
1043
|
-
end
|
1044
|
-
|
1045
|
-
set_size
|
1046
|
-
end
|
1047
|
-
|
1048
|
-
# Creates a DataFrame with the random data, of n size.
|
1049
|
-
# If n is not given, uses the original number of rows.
|
1050
|
-
#
|
1051
|
-
# @return {DaruLite::DataFrame}
|
1052
|
-
def bootstrap(n = nil)
|
1053
|
-
n ||= nrows
|
1054
|
-
DaruLite::DataFrame.new({}, order: @vectors).tap do |df_boot|
|
1055
|
-
n.times do
|
1056
|
-
df_boot.add_row(row[rand(n)])
|
1057
|
-
end
|
1058
|
-
df_boot.update
|
1059
|
-
end
|
1060
|
-
end
|
1061
|
-
|
1062
|
-
def keep_row_if
|
1063
|
-
@index
|
1064
|
-
.reject { |idx| yield access_row(idx) }
|
1065
|
-
.each { |idx| delete_row idx }
|
1066
|
-
end
|
1067
|
-
|
1068
|
-
def keep_vector_if
|
1069
|
-
@vectors.each do |vector|
|
1070
|
-
delete_vector(vector) unless yield(@data[@vectors[vector]], vector)
|
1071
|
-
end
|
1072
|
-
end
|
1073
|
-
|
1074
|
-
# Creates a new vector with the data of a given field, taken from the rows for which the block returns true
|
1075
|
-
def filter_vector(vec, &block)
|
1076
|
-
DaruLite::Vector.new(each_row.select(&block).map { |row| row[vec] })
|
1077
|
-
end
|
1078
|
-
|
1079
|
-
# Iterates over each row and retains it in a new DataFrame if the block returns
|
1080
|
-
# true for that row.
|
1081
|
-
def filter_rows
|
1082
|
-
return to_enum(:filter_rows) unless block_given?
|
1083
|
-
|
1084
|
-
keep_rows = @index.map { |index| yield access_row(index) }
|
1085
|
-
|
1086
|
-
where keep_rows
|
1087
|
-
end
|
1088
|
-
|
1089
|
-
# Iterates over each vector and retains it in a new DataFrame if the block returns
|
1090
|
-
# true for that vector.
|
1091
|
-
def filter_vectors(&block)
|
1092
|
-
return to_enum(:filter_vectors) unless block
|
1093
|
-
|
1094
|
-
dup.tap { |df| df.keep_vector_if(&block) }
|
1095
|
-
end
|
1096
|
-
|
1097
|
-
# Test each row with one or more tests.
|
1098
|
-
# @param tests [Proc] Each test is a Proc with the form
|
1099
|
-
# *Proc.new {|row| row[:age] > 0}*
|
1100
|
-
# The function returns an array with all errors.
|
1101
|
-
#
|
1102
|
-
# FIXME: description here is too sparse. As far as I can get,
|
1103
|
-
# it should tell something about that each test is [descr, fields, block],
|
1104
|
-
# and that first value may be column name to output. - zverok, 2016-05-18
|
1105
|
-
def verify(*tests)
|
1106
|
-
id = tests.first.is_a?(Symbol) ? tests.shift : @vectors.first
|
1107
|
-
|
1108
|
-
each_row_with_index.map do |row, i|
|
1109
|
-
tests.reject { |*_, block| block.call(row) }
|
1110
|
-
.map { |test| verify_error_message row, test, id, i }
|
1111
|
-
end.flatten
|
1112
|
-
end
|
1113
|
-
|
1114
|
-
# DSL for yielding each row and returning a DaruLite::Vector based on the
|
1115
|
-
# value each run of the block returns.
|
1116
|
-
#
|
1117
|
-
# == Usage
|
1118
|
-
#
|
1119
|
-
# a1 = DaruLite::Vector.new([1, 2, 3, 4, 5, 6, 7])
|
1120
|
-
# a2 = DaruLite::Vector.new([10, 20, 30, 40, 50, 60, 70])
|
1121
|
-
# a3 = DaruLite::Vector.new([100, 200, 300, 400, 500, 600, 700])
|
1122
|
-
# ds = DaruLite::DataFrame.new({ :a => a1, :b => a2, :c => a3 })
|
1123
|
-
# total = ds.vector_by_calculation { a + b + c }
|
1124
|
-
# # <DaruLite::Vector:82314050 @name = nil @size = 7 >
|
1125
|
-
# # nil
|
1126
|
-
# # 0 111
|
1127
|
-
# # 1 222
|
1128
|
-
# # 2 333
|
1129
|
-
# # 3 444
|
1130
|
-
# # 4 555
|
1131
|
-
# # 5 666
|
1132
|
-
# # 6 777
|
1133
|
-
def vector_by_calculation(&block)
|
1134
|
-
a = each_row.map { |r| r.instance_eval(&block) }
|
1135
|
-
|
1136
|
-
DaruLite::Vector.new a, index: @index
|
1137
|
-
end
|
1138
|
-
|
1139
|
-
# Reorder the vectors in a dataframe
|
1140
|
-
# @param [Array] order_array new order of the vectors
|
1141
|
-
# @example
|
1142
|
-
# df = DaruLite::DataFrame({
|
1143
|
-
# a: [1, 2, 3],
|
1144
|
-
# b: [4, 5, 6]
|
1145
|
-
# }, order: [:a, :b])
|
1146
|
-
# df.order = [:b, :a]
|
1147
|
-
# df
|
1148
|
-
# # => #<DaruLite::DataFrame(3x2)>
|
1149
|
-
# # b a
|
1150
|
-
# # 0 4 1
|
1151
|
-
# # 1 5 2
|
1152
|
-
# # 2 6 3
|
1153
|
-
def order=(order_array)
|
1154
|
-
raise ArgumentError, 'Invalid order' unless
|
1155
|
-
order_array.sort == vectors.to_a.sort
|
1156
|
-
|
1157
|
-
initialize(to_h, order: order_array)
|
1158
|
-
end
|
1159
|
-
|
1160
|
-
# Return the dataframe with rotated vector positions; the vector at position count is now
|
1161
|
-
# the first vector of the dataframe.
|
1162
|
-
# If there is only one vector in the dataframe, the dataframe is returned without any change.
|
1163
|
-
# @param count => Integer, the vector at position count will be the first vector of the dataframe.
|
1164
|
-
# @example
|
1165
|
-
# df = DaruLite::DataFrame({
|
1166
|
-
# a: [1, 2, 3],
|
1167
|
-
# b: [4, 5, 6],
|
1168
|
-
# total: [5, 7, 9],
|
1169
|
-
# })
|
1170
|
-
# df.rotate_vectors(-1)
|
1171
|
-
# df
|
1172
|
-
# # => #<DaruLite::DataFrame(3x3)>
|
1173
|
-
# # total b a
|
1174
|
-
# # 0 5 4 1
|
1175
|
-
# # 1 7 5 2
|
1176
|
-
# # 2 9 6 3
|
1177
|
-
def rotate_vectors(count = -1)
|
1178
|
-
return self unless vectors.many?
|
1179
|
-
|
1180
|
-
self.order = vectors.to_a.rotate(count)
|
1181
|
-
self
|
1182
|
-
end
|
1183
|
-
|
1184
|
-
# Returns a vector, based on a string with a calculation based
|
1185
|
-
# on vector.
|
1186
|
-
#
|
1187
|
-
# The calculation will be eval'ed, so you can put any variable
|
1188
|
-
# or expression valid on ruby.
|
1189
|
-
#
|
1190
|
-
# For example:
|
1191
|
-
# a = DaruLite::Vector.new [1,2]
|
1192
|
-
# b = DaruLite::Vector.new [3,4]
|
1193
|
-
# ds = DaruLite::DataFrame.new({:a => a,:b => b})
|
1194
|
-
# ds.compute("a+b")
|
1195
|
-
# => Vector [4,6]
|
1196
|
-
def compute(text, &block)
|
1197
|
-
return instance_eval(&block) if block
|
1198
|
-
|
1199
|
-
instance_eval(text)
|
1200
|
-
end
|
1201
|
-
|
1202
|
-
# Return a vector with the number of missing values in each row.
|
1203
|
-
#
|
1204
|
-
# == Arguments
|
1205
|
-
#
|
1206
|
-
# * +missing_values+ - An Array of the values that should be
|
1207
|
-
# treated as 'missing'. The default missing value is *nil*.
|
1208
|
-
def missing_values_rows(missing_values = [nil])
|
1209
|
-
number_of_missing = each_row.map do |row|
|
1210
|
-
row.indexes(*missing_values).size
|
1211
|
-
end
|
1212
|
-
|
1213
|
-
DaruLite::Vector.new number_of_missing, index: @index, name: "#{@name}_missing_rows"
|
1214
|
-
end
|
1215
|
-
|
1216
|
-
# TODO: remove next version
|
1217
|
-
alias vector_missing_values missing_values_rows
|
1218
|
-
|
1219
|
-
def has_missing_data?
|
1220
|
-
@data.any? { |vec| vec.include_values?(*DaruLite::MISSING_VALUES) }
|
1221
|
-
end
|
1222
|
-
alias flawed? has_missing_data?
|
1223
|
-
deprecate :has_missing_data?, :include_values?, 2016, 10
|
1224
|
-
deprecate :flawed?, :include_values?, 2016, 10
|
1225
|
-
|
1226
|
-
# Check if any of given values occur in the data frame
|
1227
|
-
# @param [Array] values to check for
|
1228
|
-
# @return [true, false] true if any of the given values occur in the
|
1229
|
-
# dataframe, false otherwise
|
1230
|
-
# @example
|
1231
|
-
# df = DaruLite::DataFrame.new({
|
1232
|
-
# a: [1, 2, 3, nil, Float::NAN, nil, 1, 7],
|
1233
|
-
# b: [:a, :b, nil, Float::NAN, nil, 3, 5, 8],
|
1234
|
-
# c: ['a', Float::NAN, 3, 4, 3, 5, nil, 7]
|
1235
|
-
# }, index: 11..18)
|
1236
|
-
# df.include_values? nil
|
1237
|
-
# # => true
|
1238
|
-
def include_values?(*values)
|
1239
|
-
@data.any? { |vec| vec.include_values?(*values) }
|
1240
|
-
end
|
1241
|
-
|
1242
312
|
# Return a nested hash using vector names as keys and an array constructed of
|
1243
313
|
# hashes with other values. If block provided, is used to provide the
|
1244
314
|
# values, with parameters +row+ of dataset, +current+ last hash on
|
1245
315
|
# hierarchy and +name+ of the key to include
|
1246
|
-
def nest(*tree_keys, &block)
|
1247
|
-
tree_keys = tree_keys[0] if tree_keys[0].is_a? Array
|
1248
|
-
|
1249
|
-
each_row.with_object({}) do |row, current|
|
1250
|
-
# Create tree
|
1251
|
-
*keys, last = tree_keys
|
1252
|
-
current = keys.inject(current) { |c, f| c[row[f]] ||= {} }
|
1253
|
-
name = row[last]
|
1254
|
-
|
1255
|
-
if block
|
1256
|
-
current[name] = yield(row, current, name)
|
1257
|
-
else
|
1258
|
-
current[name] ||= []
|
1259
|
-
current[name].push(row.to_h.delete_if { |key, _value| tree_keys.include? key })
|
1260
|
-
end
|
1261
|
-
end
|
1262
|
-
end
|
1263
|
-
|
1264
|
-
def vector_count_characters(vecs = nil)
|
1265
|
-
vecs ||= @vectors.to_a
|
1266
|
-
|
1267
|
-
collect_rows do |row|
|
1268
|
-
vecs.sum { |v| row[v].to_s.size }
|
1269
|
-
end
|
1270
|
-
end
|
1271
|
-
|
1272
|
-
def add_vectors_by_split(name, join = '-', sep = DaruLite::SPLIT_TOKEN)
|
1273
|
-
self[name]
|
1274
|
-
.split_by_separator(sep)
|
1275
|
-
.each { |k, v| self[:"#{name}#{join}#{k}"] = v }
|
1276
|
-
end
|
1277
|
-
|
1278
|
-
# Return the number of rows and columns of the DataFrame in an Array.
|
1279
|
-
def shape
|
1280
|
-
[nrows, ncols]
|
1281
|
-
end
|
1282
|
-
|
1283
|
-
# The number of rows
|
1284
|
-
def nrows
|
1285
|
-
@index.size
|
1286
|
-
end
|
1287
|
-
|
1288
|
-
# The number of vectors
|
1289
|
-
def ncols
|
1290
|
-
@vectors.size
|
1291
|
-
end
|
1292
|
-
|
1293
|
-
# Check if a vector is present
|
1294
|
-
def has_vector?(vector)
|
1295
|
-
@vectors.include? vector
|
1296
|
-
end
|
1297
|
-
|
1298
|
-
# Works like Array#any?.
|
1299
|
-
#
|
1300
|
-
# @param [Symbol] axis (:vector) The axis to iterate over. Can be :vector or
|
1301
|
-
# :row. A DaruLite::Vector object is yielded in the block.
|
1302
|
-
# @example Using any?
|
1303
|
-
# df = DaruLite::DataFrame.new({a: [1,2,3,4,5], b: ['a', 'b', 'c', 'd', 'e']})
|
1304
|
-
# df.any?(:row) do |row|
|
1305
|
-
# row[:a] < 3 and row[:b] == 'b'
|
1306
|
-
# end #=> true
|
1307
|
-
def any?(axis = :vector, &block)
|
1308
|
-
if %i[vector column].include?(axis)
|
1309
|
-
@data.any?(&block)
|
1310
|
-
elsif axis == :row
|
1311
|
-
each_row do |row|
|
1312
|
-
return true if yield(row)
|
1313
|
-
end
|
1314
|
-
false
|
1315
|
-
else
|
1316
|
-
raise ArgumentError, "Unidentified axis #{axis}"
|
1317
|
-
end
|
1318
|
-
end
|
1319
|
-
|
1320
|
-
# Works like Array#all?
|
1321
|
-
#
|
1322
|
-
# @param [Symbol] axis (:vector) The axis to iterate over. Can be :vector or
|
1323
|
-
# :row. A DaruLite::Vector object is yielded in the block.
|
1324
|
-
# @example Using all?
|
1325
|
-
# df = DaruLite::DataFrame.new({a: [1,2,3,4,5], b: ['a', 'b', 'c', 'd', 'e']})
|
1326
|
-
# df.all?(:row) do |row|
|
1327
|
-
# row[:a] < 10
|
1328
|
-
# end #=> true
|
1329
|
-
def all?(axis = :vector, &block)
|
1330
|
-
if %i[vector column].include?(axis)
|
1331
|
-
@data.all?(&block)
|
1332
|
-
elsif axis == :row
|
1333
|
-
each_row.all?(&block)
|
1334
|
-
else
|
1335
|
-
raise ArgumentError, "Unidentified axis #{axis}"
|
1336
|
-
end
|
1337
|
-
end
|
1338
|
-
|
1339
|
-
# The first ten elements of the DataFrame
|
1340
|
-
#
|
1341
|
-
# @param [Fixnum] quantity (10) The number of elements to display from the top.
|
1342
|
-
def head(quantity = 10)
|
1343
|
-
row.at 0..(quantity - 1)
|
1344
|
-
end
|
1345
|
-
|
1346
|
-
alias first head
|
1347
|
-
|
1348
|
-
# The last ten elements of the DataFrame
|
1349
|
-
#
|
1350
|
-
# @param [Fixnum] quantity (10) The number of elements to display from the bottom.
|
1351
|
-
def tail(quantity = 10)
|
1352
|
-
start = [-quantity, -size].max
|
1353
|
-
row.at start..-1
|
1354
|
-
end
|
1355
|
-
|
1356
|
-
alias last tail
|
1357
|
-
|
1358
|
-
# Sum all numeric/specified vectors in the DataFrame.
|
1359
|
-
#
|
1360
|
-
# Returns a new vector containing the sum of all numeric
|
1361
|
-
# or specified vectors of the DataFrame. By default, if the vector
|
1362
|
-
# contains a nil, the sum is nil.
|
1363
|
-
# With :skipnil argument set to true, nil values are assumed to be
|
1364
|
-
# 0 (zero) and the sum vector is returned.
|
1365
|
-
#
|
1366
|
-
# @param args [Array] List of vectors to sum. Default is nil in which case
|
1367
|
-
# all numeric vectors are summed.
|
1368
|
-
#
|
1369
|
-
# @option opts [Boolean] :skipnil Consider nils as 0. Default is false.
|
1370
|
-
#
|
1371
|
-
# @return Vector with sum of all vectors specified in the argument.
|
1372
|
-
# If the vecs parameter is empty, sums all numeric vectors.
|
1373
|
-
#
|
1374
|
-
# @example
|
1375
|
-
# df = DaruLite::DataFrame.new({
|
1376
|
-
# a: [1, 2, nil],
|
1377
|
-
# b: [2, 1, 3],
|
1378
|
-
# c: [1, 1, 1]
|
1379
|
-
# })
|
1380
|
-
# => #<DaruLite::DataFrame(3x3)>
|
1381
|
-
# a b c
|
1382
|
-
# 0 1 2 1
|
1383
|
-
# 1 2 1 1
|
1384
|
-
# 2 nil 3 1
|
1385
|
-
# df.vector_sum [:a, :c]
|
1386
|
-
# => #<DaruLite::Vector(3)>
|
1387
|
-
# 0 2
|
1388
|
-
# 1 3
|
1389
|
-
# 2 nil
|
1390
|
-
# df.vector_sum
|
1391
|
-
# => #<DaruLite::Vector(3)>
|
1392
|
-
# 0 4
|
1393
|
-
# 1 4
|
1394
|
-
# 2 nil
|
1395
|
-
# df.vector_sum skipnil: true
|
1396
|
-
# => #<DaruLite::Vector(3)>
|
1397
|
-
# c
|
1398
|
-
# 0 4
|
1399
|
-
# 1 4
|
1400
|
-
# 2 4
|
1401
|
-
#
|
1402
|
-
def vector_sum(*args)
|
1403
|
-
defaults = { vecs: nil, skipnil: false }
|
1404
|
-
options = args.last.is_a?(::Hash) ? args.pop : {}
|
1405
|
-
options = defaults.merge(options)
|
1406
|
-
vecs = args[0] || options[:vecs]
|
1407
|
-
skipnil = args[1] || options[:skipnil]
|
1408
|
-
|
1409
|
-
vecs ||= numeric_vectors
|
1410
|
-
sum = DaruLite::Vector.new [0] * @size, index: @index, name: @name, dtype: @dtype
|
1411
|
-
vecs.inject(sum) { |memo, n| self[n].add(memo, skipnil: skipnil) }
|
1412
|
-
end
|
1413
|
-
|
1414
|
-
# Calculate mean of the rows of the dataframe.
|
1415
|
-
#
|
1416
|
-
# == Arguments
|
1417
|
-
#
|
1418
|
-
# * +max_missing+ - The maximum number of elements in the row that can be
|
1419
|
-
# missing for the mean calculation to happen. Defaults to 0.
|
1420
|
-
def vector_mean(max_missing = 0)
|
1421
|
-
# FIXME: in vector_sum we preserve created vector dtype, but
|
1422
|
-
# here we are not. Is this by design or ...? - zverok, 2016-05-18
|
1423
|
-
mean_vec = DaruLite::Vector.new [0] * @size, index: @index, name: "mean_#{@name}"
|
1424
|
-
|
1425
|
-
each_row_with_index.with_object(mean_vec) do |(row, i), memo|
|
1426
|
-
memo[i] = row.indexes(*DaruLite::MISSING_VALUES).size > max_missing ? nil : row.mean
|
1427
|
-
end
|
1428
|
-
end
|
1429
|
-
|
1430
|
-
# Group elements by vector to perform operations on them. Returns a
|
1431
|
-
# DaruLite::Core::GroupBy object. See the DaruLite::Core::GroupBy docs for a detailed
|
1432
|
-
# list of possible operations.
|
1433
|
-
#
|
1434
|
-
# == Arguments
|
1435
|
-
#
|
1436
|
-
# * vectors - An Array containing names of vectors to group by.
|
1437
|
-
#
|
1438
|
-
# == Usage
|
1439
|
-
#
|
1440
|
-
# df = DaruLite::DataFrame.new({
|
1441
|
-
# a: %w{foo bar foo bar foo bar foo foo},
|
1442
|
-
# b: %w{one one two three two two one three},
|
1443
|
-
# c: [1 ,2 ,3 ,1 ,3 ,6 ,3 ,8],
|
1444
|
-
# d: [11 ,22 ,33 ,44 ,55 ,66 ,77 ,88]
|
1445
|
-
# })
|
1446
|
-
# df.group_by([:a,:b,:c]).groups
|
1447
|
-
# #=> {["bar", "one", 2]=>[1],
|
1448
|
-
# # ["bar", "three", 1]=>[3],
|
1449
|
-
# # ["bar", "two", 6]=>[5],
|
1450
|
-
# # ["foo", "one", 1]=>[0],
|
1451
|
-
# # ["foo", "one", 3]=>[6],
|
1452
|
-
# # ["foo", "three", 8]=>[7],
|
1453
|
-
# # ["foo", "two", 3]=>[2, 4]}
|
1454
|
-
def group_by(*vectors)
|
1455
|
-
vectors.flatten!
|
1456
|
-
missing = vectors - @vectors.to_a
|
1457
|
-
raise(ArgumentError, "Vector(s) missing: #{missing.join(', ')}") unless missing.empty?
|
1458
|
-
|
1459
|
-
vectors = [@vectors.first] if vectors.empty?
|
1460
|
-
|
1461
|
-
DaruLite::Core::GroupBy.new(self, vectors)
|
1462
|
-
end
|
1463
|
-
|
1464
|
-
def reindex_vectors(new_vectors)
|
1465
|
-
unless new_vectors.is_a?(DaruLite::Index)
|
1466
|
-
raise ArgumentError, 'Must pass the new index of type Index or its ' \
|
1467
|
-
"subclasses, not #{new_vectors.class}"
|
1468
|
-
end
|
1469
|
-
|
1470
|
-
cl = DaruLite::DataFrame.new({}, order: new_vectors, index: @index, name: @name)
|
1471
|
-
new_vectors.each_with_object(cl) do |vec, memo|
|
1472
|
-
memo[vec] = @vectors.include?(vec) ? self[vec] : Array.new(nrows)
|
1473
|
-
end
|
1474
|
-
end
|
1475
|
-
|
1476
|
-
def get_vector_anyways(v)
|
1477
|
-
@vectors.include?(v) ? self[v].to_a : Array.new(size)
|
1478
|
-
end
|
1479
|
-
|
1480
|
-
# Concatenate another DataFrame along corresponding columns.
|
1481
|
-
# If columns do not exist in both dataframes, they are filled with nils
|
1482
|
-
def concat(other_df)
|
1483
|
-
vectors = (@vectors.to_a + other_df.vectors.to_a).uniq
|
1484
|
-
|
1485
|
-
data = vectors.map do |v|
|
1486
|
-
get_vector_anyways(v).dup.concat(other_df.get_vector_anyways(v))
|
1487
|
-
end
|
1488
|
-
|
1489
|
-
DaruLite::DataFrame.new(data, order: vectors)
|
1490
|
-
end
|
1491
|
-
|
1492
|
-
# Concatenates another DataFrame as #concat.
|
1493
|
-
# Additionally it tries to preserve the index. If the indices contain
|
1494
|
-
# common elements, #union will overwrite the according rows in the
|
1495
|
-
# first dataframe.
|
1496
|
-
def union(other_df)
|
1497
|
-
index = (@index.to_a + other_df.index.to_a).uniq
|
1498
|
-
df = row[*(@index.to_a - other_df.index.to_a)]
|
1499
|
-
|
1500
|
-
df = df.concat(other_df)
|
1501
|
-
df.index = DaruLite::Index.new(index)
|
1502
|
-
df
|
1503
|
-
end
|
1504
|
-
|
1505
|
-
module SetSingleIndexStrategy
|
1506
|
-
def self.uniq_size(df, col)
|
1507
|
-
df[col].uniq.size
|
1508
|
-
end
|
1509
|
-
|
1510
|
-
def self.new_index(df, col)
|
1511
|
-
DaruLite::Index.new(df[col].to_a)
|
1512
|
-
end
|
1513
|
-
|
1514
|
-
def self.delete_vector(df, col)
|
1515
|
-
df.delete_vector(col)
|
1516
|
-
end
|
1517
|
-
end
|
1518
|
-
|
1519
|
-
module SetCategoricalIndexStrategy
|
1520
|
-
def self.new_index(df, col)
|
1521
|
-
DaruLite::CategoricalIndex.new(df[col].to_a)
|
1522
|
-
end
|
1523
|
-
|
1524
|
-
def self.delete_vector(df, col)
|
1525
|
-
df.delete_vector(col)
|
1526
|
-
end
|
1527
|
-
end
|
1528
|
-
|
1529
|
-
module SetMultiIndexStrategy
|
1530
|
-
def self.uniq_size(df, cols)
|
1531
|
-
df[*cols].uniq.size
|
1532
|
-
end
|
1533
|
-
|
1534
|
-
def self.new_index(df, cols)
|
1535
|
-
DaruLite::MultiIndex.from_arrays(df[*cols].map_vectors(&:to_a)).tap do |mi|
|
1536
|
-
mi.name = cols
|
1537
|
-
end
|
1538
|
-
end
|
1539
|
-
|
1540
|
-
def self.delete_vector(df, cols)
|
1541
|
-
df.delete_vectors(*cols)
|
1542
|
-
end
|
1543
|
-
end
|
1544
|
-
|
1545
|
-
# Set a particular column as the new index of the DataFrame
|
1546
|
-
def set_index(new_index_col, keep: false, categorical: false)
|
1547
|
-
if categorical
|
1548
|
-
strategy = SetCategoricalIndexStrategy
|
1549
|
-
elsif new_index_col.respond_to?(:to_a)
|
1550
|
-
strategy = SetMultiIndexStrategy
|
1551
|
-
new_index_col = new_index_col.to_a
|
1552
|
-
else
|
1553
|
-
strategy = SetSingleIndexStrategy
|
1554
|
-
end
|
1555
|
-
|
1556
|
-
unless categorical
|
1557
|
-
uniq_size = strategy.uniq_size(self, new_index_col)
|
1558
|
-
raise ArgumentError, 'All elements in new index must be unique.' if @size != uniq_size
|
1559
|
-
end
|
1560
|
-
|
1561
|
-
self.index = strategy.new_index(self, new_index_col)
|
1562
|
-
strategy.delete_vector(self, new_index_col) unless keep
|
1563
|
-
self
|
1564
|
-
end
|
1565
|
-
|
1566
|
-
# Change the index of the DataFrame and preserve the labels of the previous
|
1567
|
-
# indexing. New index can be DaruLite::Index or any of its subclasses.
|
1568
|
-
#
|
1569
|
-
# @param [DaruLite::Index] new_index The new Index for reindexing the DataFrame.
|
1570
|
-
# @example Reindexing DataFrame
|
1571
|
-
# df = DaruLite::DataFrame.new({a: [1,2,3,4], b: [11,22,33,44]},
|
1572
|
-
# index: ['a','b','c','d'])
|
1573
|
-
# #=>
|
1574
|
-
# ##<DaruLite::DataFrame:83278130 @name = b19277b8-c548-41da-ad9a-2ad8c060e273 @size = 4>
|
1575
|
-
# # a b
|
1576
|
-
# # a 1 11
|
1577
|
-
# # b 2 22
|
1578
|
-
# # c 3 33
|
1579
|
-
# # d 4 44
|
1580
|
-
# df.reindex DaruLite::Index.new(['b', 0, 'a', 'g'])
|
1581
|
-
# #=>
|
1582
|
-
# ##<DaruLite::DataFrame:83177070 @name = b19277b8-c548-41da-ad9a-2ad8c060e273 @size = 4>
|
1583
|
-
# # a b
|
1584
|
-
# # b 2 22
|
1585
|
-
# # 0 nil nil
|
1586
|
-
# # a 1 11
|
1587
|
-
# # g nil nil
|
1588
|
-
def reindex(new_index)
|
1589
|
-
unless new_index.is_a?(DaruLite::Index)
|
1590
|
-
raise ArgumentError, 'Must pass the new index of type Index or its ' \
|
1591
|
-
"subclasses, not #{new_index.class}"
|
1592
|
-
end
|
1593
|
-
|
1594
|
-
cl = DaruLite::DataFrame.new({}, order: @vectors, index: new_index, name: @name)
|
1595
|
-
new_index.each_with_object(cl) do |idx, memo|
|
1596
|
-
memo.row[idx] = @index.include?(idx) ? row[idx] : Array.new(ncols)
|
1597
|
-
end
|
1598
|
-
end
|
1599
|
-
|
1600
|
-
def reset_index
|
1601
|
-
index_df = index.to_df
|
1602
|
-
names = index.name
|
1603
|
-
names = [names] unless names.instance_of?(Array)
|
1604
|
-
new_vectors = names + vectors.to_a
|
1605
|
-
self.index = index_df.index
|
1606
|
-
names.each do |name|
|
1607
|
-
self[name] = index_df[name]
|
1608
|
-
end
|
1609
|
-
self.order = new_vectors
|
1610
|
-
self
|
1611
|
-
end
|
1612
|
-
|
1613
|
-
# Reassign index with a new index of type DaruLite::Index or any of its subclasses.
|
1614
|
-
#
|
1615
|
-
# @param [DaruLite::Index] idx New index object on which the rows of the dataframe
|
1616
|
-
# are to be indexed.
|
1617
|
-
# @example Reassigning index of a DataFrame
|
1618
|
-
# df = DaruLite::DataFrame.new({a: [1,2,3,4], b: [11,22,33,44]})
|
1619
|
-
# df.index.to_a #=> [0,1,2,3]
|
1620
|
-
#
|
1621
|
-
# df.index = DaruLite::Index.new(['a','b','c','d'])
|
1622
|
-
# df.index.to_a #=> ['a','b','c','d']
|
1623
|
-
# df.row['a'].to_a #=> [1,11]
|
1624
|
-
def index=(idx)
|
1625
|
-
@index = Index.coerce idx
|
1626
|
-
@data.each { |vec| vec.index = @index }
|
1627
|
-
|
1628
|
-
self
|
1629
|
-
end
|
1630
|
-
|
1631
|
-
# Reassign vectors with a new index of type DaruLite::Index or any of its subclasses.
|
1632
|
-
#
|
1633
|
-
# @param new_index [DaruLite::Index] The new index object on which the vectors are to
|
1634
|
-
# be indexed. Must be of the same size as ncols.
|
1635
|
-
# @example Reassigning vectors of a DataFrame
|
1636
|
-
# df = DaruLite::DataFrame.new({a: [1,2,3,4], b: [:a,:b,:c,:d], c: [11,22,33,44]})
|
1637
|
-
# df.vectors.to_a #=> [:a, :b, :c]
|
1638
|
-
#
|
1639
|
-
# df.vectors = DaruLite::Index.new([:foo, :bar, :baz])
|
1640
|
-
# df.vectors.to_a #=> [:foo, :bar, :baz]
|
1641
|
-
def vectors=(new_index)
|
1642
|
-
raise ArgumentError, 'Can only reindex with Index and its subclasses' unless new_index.is_a?(DaruLite::Index)
|
1643
|
-
|
1644
|
-
if new_index.size != ncols
|
1645
|
-
raise ArgumentError, "Specified index length #{new_index.size} not equal to" \
|
1646
|
-
"dataframe size #{ncols}"
|
1647
|
-
end
|
1648
|
-
|
1649
|
-
@vectors = new_index
|
1650
|
-
@data.zip(new_index.to_a).each do |vect, name|
|
1651
|
-
vect.name = name
|
1652
|
-
end
|
1653
|
-
self
|
1654
|
-
end
|
1655
|
-
|
1656
|
-
# Renames the vectors
|
1657
|
-
#
|
1658
|
-
# == Arguments
|
1659
|
-
#
|
1660
|
-
# * name_map - A hash where the keys are the existing vector names and
|
1661
|
-
# the values are the new names. If a vector is renamed
|
1662
|
-
# to a vector name that is already in use, the existing
|
1663
|
-
# one is overwritten.
|
1664
|
-
#
|
1665
|
-
# == Usage
|
1666
|
-
#
|
1667
|
-
# df = DaruLite::DataFrame.new({ a: [1,2,3,4], b: [:a,:b,:c,:d], c: [11,22,33,44] })
|
1668
|
-
# df.rename_vectors :a => :alpha, :c => :gamma
|
1669
|
-
# df.vectors.to_a #=> [:alpha, :b, :gamma]
|
1670
|
-
def rename_vectors(name_map)
|
1671
|
-
existing_targets = name_map.reject { |k, v| k == v }.values & vectors.to_a
|
1672
|
-
delete_vectors(*existing_targets)
|
1673
|
-
|
1674
|
-
new_names = vectors.to_a.map { |v| name_map[v] || v }
|
1675
|
-
self.vectors = DaruLite::Index.new new_names
|
1676
|
-
end
|
1677
|
-
|
1678
|
-
# Renames the vectors and returns itself
|
1679
|
-
#
|
1680
|
-
# == Arguments
|
1681
|
-
#
|
1682
|
-
# * name_map - A hash where the keys are the existing vector names and
|
1683
|
-
# the values are the new names. If a vector is renamed
|
1684
|
-
# to a vector name that is already in use, the existing
|
1685
|
-
# one is overwritten.
|
1686
|
-
#
|
1687
|
-
# == Usage
|
1688
|
-
#
|
1689
|
-
# df = DaruLite::DataFrame.new({ a: [1,2,3,4], b: [:a,:b,:c,:d], c: [11,22,33,44] })
|
1690
|
-
# df.rename_vectors! :a => :alpha, :c => :gamma # df
|
1691
|
-
def rename_vectors!(name_map)
|
1692
|
-
rename_vectors(name_map)
|
1693
|
-
self
|
1694
|
-
end
|
1695
|
-
|
1696
|
-
# Converts the vectors to a DaruLite::MultiIndex.
|
1697
|
-
# The argument passed is used as the MultiIndex's top level
|
1698
|
-
def add_level_to_vectors(top_level_label)
|
1699
|
-
tuples = vectors.map { |label| [top_level_label, *label] }
|
1700
|
-
self.vectors = DaruLite::MultiIndex.from_tuples(tuples)
|
1701
|
-
end
|
1702
|
-
|
1703
|
-
# Return the indexes of all the numeric vectors. Will include vectors with nils
|
1704
|
-
# along with numbers.
|
1705
|
-
def numeric_vectors
|
1706
|
-
# FIXME: Why _with_index ?..
|
1707
|
-
each_vector_with_index
|
1708
|
-
.select { |vec, _i| vec.numeric? }
|
1709
|
-
.map(&:last)
|
1710
|
-
end
|
1711
|
-
|
1712
|
-
def numeric_vector_names
|
1713
|
-
@vectors.select { |v| self[v].numeric? }
|
1714
|
-
end
|
1715
|
-
|
1716
|
-
# Return a DataFrame of only the numerical Vectors. If clone: false
|
1717
|
-
# is specified as option, only a *view* of the Vectors will be
|
1718
|
-
# returned. Defaults to clone: true.
|
1719
|
-
def only_numerics(opts = {})
|
1720
|
-
cln = opts[:clone] != false
|
1721
|
-
arry = numeric_vectors.map { |v| self[v] }
|
1722
|
-
|
1723
|
-
order = Index.new(numeric_vectors)
|
1724
|
-
DaruLite::DataFrame.new(arry, clone: cln, order: order, index: @index)
|
1725
|
-
end
|
1726
|
-
|
1727
|
-
# Generate a summary of this DataFrame based on individual vectors in the DataFrame
|
1728
|
-
# @return [String] String containing the summary of the DataFrame
|
1729
|
-
def summary
|
1730
|
-
summary = "= #{name}"
|
1731
|
-
summary << "\n Number of rows: #{nrows}"
|
1732
|
-
@vectors.each do |v|
|
1733
|
-
summary << "\n Element:[#{v}]\n"
|
1734
|
-
summary << self[v].summary(1)
|
1735
|
-
end
|
1736
|
-
summary
|
1737
|
-
end
|
1738
|
-
|
1739
|
-
# Sorts a dataframe (ascending/descending) in the given priority sequence of
|
1740
|
-
# vectors, with or without a block.
|
1741
|
-
#
|
1742
|
-
# @param vector_order [Array] The order of vector names in which the DataFrame
|
1743
|
-
# should be sorted.
|
1744
|
-
# @param opts [Hash] opts The options to sort with.
|
1745
|
-
# @option opts [TrueClass,FalseClass,Array] :ascending (true) Sort in ascending
|
1746
|
-
# or descending order. Specify Array corresponding to *order* for multiple
|
1747
|
-
# sort orders.
|
1748
|
-
# @option opts [Hash] :by (lambda{|a| a }) Specify attributes of objects to
|
1749
|
-
# be used for sorting, for each vector name in *order* as a hash of
|
1750
|
-
# vector name and lambda expressions. In case a lambda for a vector is not
|
1751
|
-
# specified, the default will be used.
|
1752
|
-
# @option opts [TrueClass,FalseClass,Array] :handle_nils (false) Handle nils
|
1753
|
-
# automatically or not when a block is provided.
|
1754
|
-
# If set to True, nils will appear at top after sorting.
|
1755
|
-
#
|
1756
|
-
# @example Sort a dataframe with a vector sequence.
|
1757
|
-
#
|
1758
|
-
#
|
1759
|
-
# df = DaruLite::DataFrame.new({a: [1,2,1,2,3], b: [5,4,3,2,1]})
|
1760
|
-
#
|
1761
|
-
# df.sort [:a, :b]
|
1762
|
-
# # =>
|
1763
|
-
# # <DaruLite::DataFrame:30604000 @name = d6a9294e-2c09-418f-b646-aa9244653444 @size = 5>
|
1764
|
-
# # a b
|
1765
|
-
# # 2 1 3
|
1766
|
-
# # 0 1 5
|
1767
|
-
# # 3 2 2
|
1768
|
-
# # 1 2 4
|
1769
|
-
# # 4 3 1
|
1770
|
-
#
|
1771
|
-
# @example Sort a dataframe without a block. Here nils will be handled automatically.
|
1772
|
-
#
|
1773
|
-
# df = DaruLite::DataFrame.new({a: [-3,nil,-1,nil,5], b: [4,3,2,1,4]})
|
1774
|
-
#
|
1775
|
-
# df.sort([:a])
|
1776
|
-
# # =>
|
1777
|
-
# # <DaruLite::DataFrame:14810920 @name = c07fb5c7-2201-458d-b679-6a1f7ebfe49f @size = 5>
|
1778
|
-
# # a b
|
1779
|
-
# # 1 nil 3
|
1780
|
-
# # 3 nil 1
|
1781
|
-
# # 0 -3 4
|
1782
|
-
# # 2 -1 2
|
1783
|
-
# # 4 5 4
|
1784
|
-
#
|
1785
|
-
# @example Sort a dataframe with a block with nils handled automatically.
|
1786
|
-
#
|
1787
|
-
# df = DaruLite::DataFrame.new({a: [nil,-1,1,nil,-1,1], b: ['aaa','aa',nil,'baaa','x',nil] })
|
1788
|
-
#
|
1789
|
-
# df.sort [:b], by: {b: lambda { |a| a.length } }
|
1790
|
-
# # NoMethodError: undefined method `length' for nil:NilClass
|
1791
|
-
# # from (pry):8:in `block in __pry__'
|
1792
|
-
#
|
1793
|
-
# df.sort [:b], by: {b: lambda { |a| a.length } }, handle_nils: true
|
1794
|
-
#
|
1795
|
-
# # =>
|
1796
|
-
# # <DaruLite::DataFrame:28469540 @name = 5f986508-556f-468b-be0c-88cc3534445c @size = 6>
|
1797
|
-
# # a b
|
1798
|
-
# # 2 1 nil
|
1799
|
-
# # 5 1 nil
|
1800
|
-
# # 4 -1 x
|
1801
|
-
# # 1 -1 aa
|
1802
|
-
# # 0 nil aaa
|
1803
|
-
# # 3 nil baaa
|
1804
|
-
#
|
1805
|
-
# @example Sort a dataframe with a block with nils handled manually.
|
1806
|
-
#
|
1807
|
-
# df = DaruLite::DataFrame.new({a: [nil,-1,1,nil,-1,1], b: ['aaa','aa',nil,'baaa','x',nil] })
|
1808
|
-
#
|
1809
|
-
# # To print nils at the bottom one can use lambda { |a| (a.nil?)?[1]:[0,a.length] }
|
1810
|
-
# df.sort [:b], by: {b: lambda { |a| (a.nil?)?[1]:[0,a.length] } }, handle_nils: true
|
1811
|
-
#
|
1812
|
-
# # =>
|
1813
|
-
# #<DaruLite::DataFrame:22214180 @name = cd7703c7-1dca-4560-840b-5ea51a852ef9 @size = 6>
|
1814
|
-
# # a b
|
1815
|
-
# # 4 -1 x
|
1816
|
-
# # 1 -1 aa
|
1817
|
-
# # 0 nil aaa
|
1818
|
-
# # 3 nil baaa
|
1819
|
-
# # 2 1 nil
|
1820
|
-
# # 5 1 nil
|
1821
|
-
|
1822
|
-
def sort!(vector_order, opts = {})
|
1823
|
-
raise ArgumentError, 'Required atleast one vector name' if vector_order.empty?
|
1824
|
-
|
1825
|
-
# To enable sorting with categorical data,
|
1826
|
-
# map categories to integers preserving their order
|
1827
|
-
old = convert_categorical_vectors vector_order
|
1828
|
-
block = sort_prepare_block vector_order, opts
|
1829
|
-
|
1830
|
-
order = @index.size.times.sort(&block)
|
1831
|
-
new_index = @index.reorder order
|
1832
|
-
|
1833
|
-
# To reverse map mapping of categorical data to integers
|
1834
|
-
restore_categorical_vectors old
|
1835
|
-
|
1836
|
-
@data.each do |vector|
|
1837
|
-
vector.reorder! order
|
1838
|
-
end
|
1839
|
-
|
1840
|
-
self.index = new_index
|
1841
|
-
|
1842
|
-
self
|
1843
|
-
end
|
1844
|
-
|
1845
|
-
# Non-destructive version of #sort!
|
1846
|
-
def sort(vector_order, opts = {})
|
1847
|
-
dup.sort! vector_order, opts
|
1848
|
-
end
|
1849
|
-
|
1850
|
-
# Pivots a data frame on specified vectors and applies an aggregate function
|
1851
|
-
# to quickly generate a summary.
|
1852
|
-
#
|
1853
|
-
# == Options
|
1854
|
-
#
|
1855
|
-
# +:index+ - Keys to group by on the pivot table row index. Pass vector names
|
1856
|
-
# contained in an Array.
|
1857
|
-
#
|
1858
|
-
# +:vectors+ - Keys to group by on the pivot table column index. Pass vector
|
1859
|
-
# names contained in an Array.
|
1860
|
-
#
|
1861
|
-
# +:agg+ - Function to aggregate the grouped values. Default to *:mean*. Can
|
1862
|
-
# use any of the statistics functions applicable on Vectors that can be found in
|
1863
|
-
# the DaruLite::Statistics::Vector module.
|
1864
|
-
#
|
1865
|
-
# +:values+ - Columns to aggregate. Will consider all numeric columns not
|
1866
|
-
# specified in *:index* or *:vectors*. Optional.
|
1867
|
-
#
|
1868
|
-
# == Usage
|
1869
|
-
#
|
1870
|
-
# df = DaruLite::DataFrame.new({
|
1871
|
-
# a: ['foo' , 'foo', 'foo', 'foo', 'foo', 'bar', 'bar', 'bar', 'bar'],
|
1872
|
-
# b: ['one' , 'one', 'one', 'two', 'two', 'one', 'one', 'two', 'two'],
|
1873
|
-
# c: ['small','large','large','small','small','large','small','large','small'],
|
1874
|
-
# d: [1,2,2,3,3,4,5,6,7],
|
1875
|
-
# e: [2,4,4,6,6,8,10,12,14]
|
1876
|
-
# })
|
1877
|
-
# df.pivot_table(index: [:a], vectors: [:b], agg: :sum, values: :e)
|
1878
|
-
#
|
1879
|
-
# #=>
|
1880
|
-
# # #<DaruLite::DataFrame:88342020 @name = 08cdaf4e-b154-4186-9084-e76dd191b2c9 @size = 2>
|
1881
|
-
# # [:e, :one] [:e, :two]
|
1882
|
-
# # [:bar] 18 26
|
1883
|
-
# # [:foo] 10 12
|
1884
|
-
def pivot_table(opts = {})
|
1885
|
-
raise ArgumentError, 'Specify grouping index' if Array(opts[:index]).empty?
|
1886
|
-
|
1887
|
-
index = opts[:index]
|
1888
|
-
vectors = opts[:vectors] || []
|
1889
|
-
aggregate_function = opts[:agg] || :mean
|
1890
|
-
values = prepare_pivot_values index, vectors, opts
|
1891
|
-
raise IndexError, 'No numeric vectors to aggregate' if values.empty?
|
1892
|
-
|
1893
|
-
grouped = group_by(index)
|
1894
|
-
return grouped.send(aggregate_function) if vectors.empty?
|
1895
|
-
|
1896
|
-
super_hash = make_pivot_hash grouped, vectors, values, aggregate_function
|
1897
|
-
|
1898
|
-
pivot_dataframe super_hash
|
1899
|
-
end
|
1900
|
-
|
1901
|
-
# Merge vectors from two DataFrames. In case of name collision,
|
1902
|
-
# the vector names are changed to x_1, x_2 ....
|
1903
|
-
#
|
1904
|
-
# @return [DaruLite::DataFrame]
|
1905
|
-
def merge(other_df)
|
1906
|
-
unless nrows == other_df.nrows
|
1907
|
-
raise ArgumentError,
|
1908
|
-
"Number of rows must be equal in this: #{nrows} and other: #{other_df.nrows}"
|
1909
|
-
end
|
1910
|
-
|
1911
|
-
new_fields = (@vectors.to_a + other_df.vectors.to_a)
|
1912
|
-
new_fields = ArrayHelper.recode_repeated(new_fields)
|
1913
|
-
DataFrame.new({}, order: new_fields).tap do |df_new|
|
1914
|
-
(0...nrows).each do |i|
|
1915
|
-
df_new.add_row row[i].to_a + other_df.row[i].to_a
|
1916
|
-
end
|
1917
|
-
df_new.index = @index if @index == other_df.index
|
1918
|
-
df_new.update
|
1919
|
-
end
|
1920
|
-
end
|
1921
|
-
|
1922
|
-
# Join 2 DataFrames with SQL style joins. Currently supports inner, left
|
1923
|
-
# outer, right outer and full outer joins.
|
1924
|
-
#
|
1925
|
-
# @param [DaruLite::DataFrame] other_df Another DataFrame on which the join is
|
1926
|
-
# to be performed.
|
1927
|
-
# @param [Hash] opts Options Hash
|
1928
|
-
# @option :how [Symbol] Can be one of :inner, :left, :right or :outer.
|
1929
|
-
# @option :on [Array] The columns on which the join is to be performed.
|
1930
|
-
# Column names specified here must be common to both DataFrames.
|
1931
|
-
# @option :indicator [Symbol] The name of a vector to add to the resultant
|
1932
|
-
# dataframe that indicates whether the record was in the left (:left_only),
|
1933
|
-
# right (:right_only), or both (:both) joining dataframes.
|
1934
|
-
# @return [DaruLite::DataFrame]
|
1935
|
-
# @example Inner Join
|
1936
|
-
# left = DaruLite::DataFrame.new({
|
1937
|
-
# :id => [1,2,3,4],
|
1938
|
-
# :name => ['Pirate', 'Monkey', 'Ninja', 'Spaghetti']
|
1939
|
-
# })
|
1940
|
-
# right = DaruLite::DataFrame.new({
|
1941
|
-
# :id => [1,2,3,4],
|
1942
|
-
# :name => ['Rutabaga', 'Pirate', 'Darth Vader', 'Ninja']
|
1943
|
-
# })
|
1944
|
-
# left.join(right, how: :inner, on: [:name])
|
1945
|
-
# #=>
|
1946
|
-
# ##<DaruLite::DataFrame:82416700 @name = 74c0811b-76c6-4c42-ac93-e6458e82afb0 @size = 2>
|
1947
|
-
# # id_1 name id_2
|
1948
|
-
# # 0 1 Pirate 2
|
1949
|
-
# # 1 3 Ninja 4
|
1950
|
-
def join(other_df, opts = {})
|
1951
|
-
DaruLite::Core::Merge.join(self, other_df, opts)
|
1952
|
-
end
|
1953
|
-
|
1954
|
-
# Creates a new dataset for one to many relations
|
1955
|
-
# on a dataset, based on pattern of field names.
|
1956
|
-
#
|
1957
|
-
# for example, you have a survey for number of children
|
1958
|
-
# with this structure:
|
1959
|
-
# id, name, child_name_1, child_age_1, child_name_2, child_age_2
|
1960
|
-
# with
|
1961
|
-
# ds.one_to_many([:id], "child_%v_%n")
|
1962
|
-
# the field of first parameters will be copied verbatim
|
1963
|
-
# to new dataset, and fields which responds to second
|
1964
|
-
# pattern will be added one case for each different %n.
|
1965
|
-
#
|
1966
|
-
# @example
|
1967
|
-
# cases=[
|
1968
|
-
# ['1','george','red',10,'blue',20,nil,nil],
|
1969
|
-
# ['2','fred','green',15,'orange',30,'white',20],
|
1970
|
-
# ['3','alfred',nil,nil,nil,nil,nil,nil]
|
1971
|
-
# ]
|
1972
|
-
# ds=DaruLite::DataFrame.rows(cases, order:
|
1973
|
-
# [:id, :name,
|
1974
|
-
# :car_color1, :car_value1,
|
1975
|
-
# :car_color2, :car_value2,
|
1976
|
-
# :car_color3, :car_value3])
|
1977
|
-
# ds.one_to_many([:id],'car_%v%n').to_matrix
|
1978
|
-
# #=> Matrix[
|
1979
|
-
# # ["red", "1", 10],
|
1980
|
-
# # ["blue", "1", 20],
|
1981
|
-
# # ["green", "2", 15],
|
1982
|
-
# # ["orange", "2", 30],
|
1983
|
-
# # ["white", "2", 20]
|
1984
|
-
# # ]
|
1985
|
-
def one_to_many(parent_fields, pattern)
|
1986
|
-
vars, numbers = one_to_many_components(pattern)
|
1987
|
-
|
1988
|
-
DataFrame.new([], order: [*parent_fields, '_col_id', *vars]).tap do |ds|
|
1989
|
-
each_row do |row|
|
1990
|
-
verbatim = parent_fields.map { |f| [f, row[f]] }.to_h
|
1991
|
-
numbers.each do |n|
|
1992
|
-
generated = one_to_many_row row, n, vars, pattern
|
1993
|
-
next if generated.values.all?(&:nil?)
|
1994
|
-
|
1995
|
-
ds.add_row(verbatim.merge(generated).merge('_col_id' => n))
|
1996
|
-
end
|
1997
|
-
end
|
1998
|
-
ds.update
|
1999
|
-
end
|
2000
|
-
end
|
2001
|
-
|
2002
|
-
def add_vectors_by_split_recode(nm, join = '-', sep = DaruLite::SPLIT_TOKEN)
|
2003
|
-
self[nm]
|
2004
|
-
.split_by_separator(sep)
|
2005
|
-
.each_with_index do |(k, v), i|
|
2006
|
-
v.rename "#{nm}:#{k}"
|
2007
|
-
self[:"#{nm}#{join}#{i + 1}"] = v
|
2008
|
-
end
|
2009
|
-
end
|
2010
|
-
|
2011
|
-
# Create a SQL CREATE TABLE statement based on a given Dataset
|
2012
|
-
#
|
2013
|
-
# == Arguments
|
2014
|
-
#
|
2015
|
-
# * table - String specifying name of the table that will be created in SQL.
|
2016
|
-
# * charset - Character set. Default is "UTF8".
|
2017
|
-
#
|
2018
|
-
# @example
|
2019
|
-
#
|
2020
|
-
# ds = DaruLite::DataFrame.new({
|
2021
|
-
# :id => DaruLite::Vector.new([1,2,3,4,5]),
|
2022
|
-
# :name => DaruLite::Vector.new(%w{Alex Peter Susan Mary John})
|
2023
|
-
# })
|
2024
|
-
# ds.create_sql('names')
|
2025
|
-
# #=>"CREATE TABLE names (id INTEGER,\n name VARCHAR (255)) CHARACTER SET=UTF8;"
|
2026
|
-
#
|
2027
|
-
def create_sql(table, charset = 'UTF8')
|
2028
|
-
sql = "CREATE TABLE #{table} ("
|
2029
|
-
fields = vectors.to_a.collect do |f|
|
2030
|
-
v = self[f]
|
2031
|
-
"#{f} #{v.db_type}"
|
2032
|
-
end
|
2033
|
-
|
2034
|
-
sql + fields.join(",\n ") + ") CHARACTER SET=#{charset};"
|
2035
|
-
end
|
2036
|
-
|
2037
|
-
# Returns the dataframe. This can be convenient when the user does not
|
2038
|
-
# know whether the object is a vector or a dataframe.
|
2039
|
-
# @return [self] the dataframe
|
2040
|
-
def to_df
|
2041
|
-
self
|
2042
|
-
end
|
2043
|
-
|
2044
|
-
# Convert all vectors of type *:numeric* into a Matrix.
|
2045
|
-
def to_matrix
|
2046
|
-
Matrix.columns each_vector.select(&:numeric?).map(&:to_a)
|
2047
|
-
end
|
2048
|
-
|
2049
|
-
# Converts the DataFrame into an array of hashes where key is vector name
|
2050
|
-
# and value is the corresponding element. The 0th index of the array contains
|
2051
|
-
# the array of hashes while the 1st index contains the indexes of each row
|
2052
|
-
# of the dataframe. Each element in the index array corresponds to its row
|
2053
|
-
# in the array of hashes, which has the same index.
|
2054
|
-
def to_a
|
2055
|
-
[each_row.map(&:to_h), @index.to_a]
|
2056
|
-
end
|
2057
|
-
|
2058
|
-
# Convert to json. If no_index is true (the default) then the index will NOT be included
|
2059
|
-
# in the JSON thus created.
|
2060
|
-
def to_json(no_index = true)
|
2061
|
-
if no_index
|
2062
|
-
to_a[0].to_json
|
2063
|
-
else
|
2064
|
-
to_a.to_json
|
2065
|
-
end
|
2066
|
-
end
|
2067
|
-
|
2068
|
-
# Converts DataFrame to a hash (explicit) with keys as vector names and values as
|
2069
|
-
# the corresponding vectors.
|
2070
|
-
def to_h
|
2071
|
-
@vectors
|
2072
|
-
.each_with_index
|
2073
|
-
.map { |vec_name, idx| [vec_name, @data[idx]] }.to_h
|
2074
|
-
end
|
316
|
+
def nest(*tree_keys, &block)
|
317
|
+
tree_keys = tree_keys[0] if tree_keys[0].is_a? Array
|
2075
318
|
|
2076
|
-
|
2077
|
-
|
2078
|
-
|
2079
|
-
|
2080
|
-
|
2081
|
-
File.expand_path('iruby/templates/dataframe_mi.html.erb', __dir__)
|
2082
|
-
else
|
2083
|
-
File.expand_path('iruby/templates/dataframe.html.erb', __dir__)
|
2084
|
-
end
|
2085
|
-
ERB.new(File.read(path).strip).result(binding)
|
2086
|
-
end
|
319
|
+
each_row.with_object({}) do |row, current|
|
320
|
+
# Create tree
|
321
|
+
*keys, last = tree_keys
|
322
|
+
current = keys.inject(current) { |c, f| c[row[f]] ||= {} }
|
323
|
+
name = row[last]
|
2087
324
|
|
2088
|
-
|
2089
|
-
|
2090
|
-
if index.is_a?(MultiIndex)
|
2091
|
-
File.expand_path('iruby/templates/dataframe_mi_thead.html.erb', __dir__)
|
325
|
+
if block
|
326
|
+
current[name] = yield(row, current, name)
|
2092
327
|
else
|
2093
|
-
|
328
|
+
current[name] ||= []
|
329
|
+
current[name].push(row.to_h.delete_if { |key, _value| tree_keys.include? key })
|
2094
330
|
end
|
2095
|
-
|
331
|
+
end
|
2096
332
|
end
|
2097
333
|
|
2098
|
-
def
|
2099
|
-
|
2100
|
-
|
2101
|
-
|
2102
|
-
File.expand_path('iruby/templates/dataframe_mi_tbody.html.erb', __dir__)
|
2103
|
-
else
|
2104
|
-
File.expand_path('iruby/templates/dataframe_tbody.html.erb', __dir__)
|
2105
|
-
end
|
2106
|
-
ERB.new(File.read(table_tbody_path).strip).result(binding)
|
334
|
+
def add_vectors_by_split(name, join = '-', sep = DaruLite::SPLIT_TOKEN)
|
335
|
+
self[name]
|
336
|
+
.split_by_separator(sep)
|
337
|
+
.each { |k, v| self[:"#{name}#{join}#{k}"] = v }
|
2107
338
|
end
|
2108
339
|
|
2109
|
-
|
2110
|
-
|
340
|
+
# Return the number of rows and columns of the DataFrame in an Array.
|
341
|
+
def shape
|
342
|
+
[nrows, ncols]
|
2111
343
|
end
|
2112
344
|
|
2113
|
-
#
|
2114
|
-
|
2115
|
-
|
2116
|
-
# assignment/deletion of elements is done. Updating data this way is called
|
2117
|
-
# lazy loading. To set or unset lazy loading, see the .lazy_update= method.
|
2118
|
-
def update
|
2119
|
-
@data.each(&:update) if DaruLite.lazy_update
|
345
|
+
# The number of rows
|
346
|
+
def nrows
|
347
|
+
@index.size
|
2120
348
|
end
|
2121
349
|
|
2122
|
-
#
|
2123
|
-
def
|
2124
|
-
@
|
2125
|
-
self
|
350
|
+
# The number of vectors
|
351
|
+
def ncols
|
352
|
+
@vectors.size
|
2126
353
|
end
|
2127
354
|
|
2128
|
-
|
2129
|
-
|
2130
|
-
# Write this DataFrame to a CSV file.
|
355
|
+
# Renames the vectors
|
2131
356
|
#
|
2132
357
|
# == Arguments
|
2133
358
|
#
|
2134
|
-
# *
|
359
|
+
# * name_map - A hash where the keys are the existing vector names and
|
360
|
+
# the values are the new names. If a vector is renamed
|
361
|
+
# to a vector name that is already in use, the existing
|
362
|
+
# one is overwritten.
|
2135
363
|
#
|
2136
|
-
# ==
|
364
|
+
# == Usage
|
2137
365
|
#
|
2138
|
-
#
|
2139
|
-
#
|
2140
|
-
#
|
2141
|
-
|
2142
|
-
|
2143
|
-
|
2144
|
-
end
|
366
|
+
# df = DaruLite::DataFrame.new({ a: [1,2,3,4], b: [:a,:b,:c,:d], c: [11,22,33,44] })
|
367
|
+
# df.rename_vectors :a => :alpha, :c => :gamma
|
368
|
+
# df.vectors.to_a #=> [:alpha, :b, :gamma]
|
369
|
+
def rename_vectors(name_map)
|
370
|
+
existing_targets = name_map.reject { |k, v| k == v }.values & vectors.to_a
|
371
|
+
delete_vectors(*existing_targets)
|
2145
372
|
|
2146
|
-
|
2147
|
-
|
2148
|
-
# == Arguments
|
2149
|
-
#
|
2150
|
-
# * filename - The path of the file where the DataFrame should be written.
|
2151
|
-
def write_excel(filename, opts = {})
|
2152
|
-
DaruLite::IO.dataframe_write_excel self, filename, opts
|
373
|
+
new_names = vectors.to_a.map { |v| name_map[v] || v }
|
374
|
+
self.vectors = DaruLite::Index.new new_names
|
2153
375
|
end
|
2154
376
|
|
2155
|
-
#
|
377
|
+
# Renames the vectors and returns itself
|
2156
378
|
#
|
2157
379
|
# == Arguments
|
2158
380
|
#
|
2159
|
-
# *
|
2160
|
-
#
|
381
|
+
# * name_map - A hash where the keys are the existing vector names and
|
382
|
+
# the values are the new names. If a vector is renamed
|
383
|
+
# to a vector name that is already in use, the existing
|
384
|
+
# one is overwritten.
|
2161
385
|
#
|
2162
386
|
# == Usage
|
2163
387
|
#
|
2164
|
-
#
|
2165
|
-
#
|
2166
|
-
|
2167
|
-
|
2168
|
-
|
388
|
+
# df = DaruLite::DataFrame.new({ a: [1,2,3,4], b: [:a,:b,:c,:d], c: [11,22,33,44] })
|
389
|
+
# df.rename_vectors! :a => :alpha, :c => :gamma # df
|
390
|
+
def rename_vectors!(name_map)
|
391
|
+
rename_vectors(name_map)
|
392
|
+
self
|
2169
393
|
end
|
2170
394
|
|
2171
|
-
#
|
2172
|
-
|
2173
|
-
|
395
|
+
# Converts the vectors to a DaruLite::MultiIndex.
|
396
|
+
# The argument passed is used as the MultiIndex's top level
|
397
|
+
def add_level_to_vectors(top_level_label)
|
398
|
+
tuples = vectors.map { |label| [top_level_label, *label] }
|
399
|
+
self.vectors = DaruLite::MultiIndex.from_tuples(tuples)
|
2174
400
|
end
|
2175
401
|
|
2176
|
-
def
|
2177
|
-
|
2178
|
-
|
2179
|
-
|
2180
|
-
|
2181
|
-
|
2182
|
-
|
402
|
+
def add_vectors_by_split_recode(nm, join = '-', sep = DaruLite::SPLIT_TOKEN)
|
403
|
+
self[nm]
|
404
|
+
.split_by_separator(sep)
|
405
|
+
.each_with_index do |(k, v), i|
|
406
|
+
v.rename "#{nm}:#{k}"
|
407
|
+
self[:"#{nm}#{join}#{i + 1}"] = v
|
408
|
+
end
|
409
|
+
end
|
410
|
+
|
411
|
+
# Method for updating the metadata (i.e. missing value positions) of the
|
412
|
+
# vectors after assignment/deletion etc. are complete. This is provided so that
|
413
|
+
# time is not wasted in creating the metadata for the vector each time
|
414
|
+
# assignment/deletion of elements is done. Updating data this way is called
|
415
|
+
# lazy loading. To set or unset lazy loading, see the .lazy_update= method.
|
416
|
+
def update
|
417
|
+
@data.each(&:update) if DaruLite.lazy_update
|
2183
418
|
end
|
2184
419
|
|
2185
|
-
|
2186
|
-
|
2187
|
-
|
2188
|
-
|
2189
|
-
order: h[:order],
|
2190
|
-
name: h[:name])
|
420
|
+
# Rename the DataFrame.
|
421
|
+
def rename(new_name)
|
422
|
+
@name = new_name
|
423
|
+
self
|
2191
424
|
end
|
425
|
+
alias name= rename
|
2192
426
|
|
2193
427
|
# Transpose a DataFrame, transposing elements and row, column indexing.
|
2194
428
|
def transpose
|
@@ -2204,7 +438,10 @@ module DaruLite
|
|
2204
438
|
# Pretty print in a nice table format for the command line (irb/pry/iruby)
|
2205
439
|
def inspect(spacing = DaruLite.spacing, threshold = DaruLite.max_rows)
|
2206
440
|
name_part = @name ? ": #{@name} " : ''
|
2207
|
-
spacing = [
|
441
|
+
spacing = [
|
442
|
+
headers.to_a.map { |header| header.try(:length) || header.to_s.length }.max,
|
443
|
+
spacing
|
444
|
+
].max
|
2208
445
|
|
2209
446
|
"#<#{self.class}#{name_part}(#{nrows}x#{ncols})>#{$INPUT_RECORD_SEPARATOR}" +
|
2210
447
|
Formatters::Table.format(
|
@@ -2216,11 +453,6 @@ module DaruLite
|
|
2216
453
|
)
|
2217
454
|
end
|
2218
455
|
|
2219
|
-
# Query a DataFrame by passing a DaruLite::Core::Query::BoolArray object.
|
2220
|
-
def where(bool_array)
|
2221
|
-
DaruLite::Core::Query.df_where self, bool_array
|
2222
|
-
end
|
2223
|
-
|
2224
456
|
def ==(other)
|
2225
457
|
self.class == other.class &&
|
2226
458
|
@size == other.size &&
|
@@ -2274,144 +506,6 @@ module DaruLite
|
|
2274
506
|
order: all_vectors.map(&:name)
|
2275
507
|
end
|
2276
508
|
|
2277
|
-
# Split the dataframe into many dataframes based on category vector
|
2278
|
-
# @param [object] cat_name name of category vector to split the dataframe
|
2279
|
-
# @return [Array] array of dataframes split by category with category vector
|
2280
|
-
# used to split not included
|
2281
|
-
# @example
|
2282
|
-
# df = DaruLite::DataFrame.new({
|
2283
|
-
# a: [1, 2, 3],
|
2284
|
-
# b: ['a', 'a', 'b']
|
2285
|
-
# })
|
2286
|
-
# df.to_category :b
|
2287
|
-
# df.split_by_category :b
|
2288
|
-
# # => [#<DaruLite::DataFrame: a (2x1)>
|
2289
|
-
# # a
|
2290
|
-
# # 0 1
|
2291
|
-
# # 1 2,
|
2292
|
-
# # #<DaruLite::DataFrame: b (1x1)>
|
2293
|
-
# # a
|
2294
|
-
# # 2 3]
|
2295
|
-
def split_by_category(cat_name)
|
2296
|
-
cat_dv = self[cat_name]
|
2297
|
-
raise ArgumentError, "#{cat_name} is not a category vector" unless
|
2298
|
-
cat_dv.category?
|
2299
|
-
|
2300
|
-
cat_dv.categories.map do |cat|
|
2301
|
-
where(cat_dv.eq cat)
|
2302
|
-
.rename(cat)
|
2303
|
-
.delete_vector cat_name
|
2304
|
-
end
|
2305
|
-
end
|
2306
|
-
|
2307
|
-
# @param indexes [Array] index(s) at which row tuples are retrieved
|
2308
|
-
# @return [Array] returns array of row tuples at given index(s)
|
2309
|
-
# @example Using DaruLite::Index
|
2310
|
-
# df = DaruLite::DataFrame.new({
|
2311
|
-
# a: [1, 2, 3],
|
2312
|
-
# b: ['a', 'a', 'b']
|
2313
|
-
# })
|
2314
|
-
#
|
2315
|
-
# df.access_row_tuples_by_indexs(1,2)
|
2316
|
-
# # => [[2, "a"], [3, "b"]]
|
2317
|
-
#
|
2318
|
-
# df.index = DaruLite::Index.new([:one,:two,:three])
|
2319
|
-
# df.access_row_tuples_by_indexs(:one,:three)
|
2320
|
-
# # => [[1, "a"], [3, "b"]]
|
2321
|
-
#
|
2322
|
-
# @example Using DaruLite::MultiIndex
|
2323
|
-
# mi_idx = DaruLite::MultiIndex.from_tuples [
|
2324
|
-
# [:a,:one,:bar],
|
2325
|
-
# [:a,:one,:baz],
|
2326
|
-
# [:b,:two,:bar],
|
2327
|
-
# [:a,:two,:baz],
|
2328
|
-
# ]
|
2329
|
-
# df_mi = DaruLite::DataFrame.new({
|
2330
|
-
# a: 1..4,
|
2331
|
-
# b: 'a'..'d'
|
2332
|
-
# }, index: mi_idx )
|
2333
|
-
#
|
2334
|
-
# df_mi.access_row_tuples_by_indexs(:b, :two, :bar)
|
2335
|
-
# # => [[3, "c"]]
|
2336
|
-
# df_mi.access_row_tuples_by_indexs(:a)
|
2337
|
-
# # => [[1, "a"], [2, "b"], [4, "d"]]
|
2338
|
-
def access_row_tuples_by_indexs(*indexes)
|
2339
|
-
return get_sub_dataframe(indexes, by_position: false).map_rows(&:to_a) if
|
2340
|
-
@index.is_a?(DaruLite::MultiIndex)
|
2341
|
-
|
2342
|
-
positions = @index.pos(*indexes)
|
2343
|
-
if positions.is_a? Numeric
|
2344
|
-
row = get_rows_for([positions])
|
2345
|
-
row.first.is_a?(Array) ? row : [row]
|
2346
|
-
else
|
2347
|
-
new_rows = get_rows_for(indexes, by_position: false)
|
2348
|
-
indexes.map { |index| new_rows.map { |r| r[index] } }
|
2349
|
-
end
|
2350
|
-
end
|
2351
|
-
|
2352
|
-
# Function to use for aggregating the data.
|
2353
|
-
#
|
2354
|
-
# @param options [Hash] options for column, you want in resultant dataframe
|
2355
|
-
#
|
2356
|
-
# @return [DaruLite::DataFrame]
|
2357
|
-
#
|
2358
|
-
# @example
|
2359
|
-
# df = DaruLite::DataFrame.new(
|
2360
|
-
# {col: [:a, :b, :c, :d, :e], num: [52,12,07,17,01]})
|
2361
|
-
# => #<DaruLite::DataFrame(5x2)>
|
2362
|
-
# col num
|
2363
|
-
# 0 a 52
|
2364
|
-
# 1 b 12
|
2365
|
-
# 2 c 7
|
2366
|
-
# 3 d 17
|
2367
|
-
# 4 e 1
|
2368
|
-
#
|
2369
|
-
# df.aggregate(num_100_times: ->(df) { (df.num*100).first })
|
2370
|
-
# => #<DaruLite::DataFrame(5x1)>
|
2371
|
-
# num_100_ti
|
2372
|
-
# 0 5200
|
2373
|
-
# 1 1200
|
2374
|
-
# 2 700
|
2375
|
-
# 3 1700
|
2376
|
-
# 4 100
|
2377
|
-
#
|
2378
|
-
# When we have duplicate index :
|
2379
|
-
#
|
2380
|
-
# idx = DaruLite::CategoricalIndex.new [:a, :b, :a, :a, :c]
|
2381
|
-
# df = DaruLite::DataFrame.new({num: [52,12,07,17,01]}, index: idx)
|
2382
|
-
# => #<DaruLite::DataFrame(5x1)>
|
2383
|
-
# num
|
2384
|
-
# a 52
|
2385
|
-
# b 12
|
2386
|
-
# a 7
|
2387
|
-
# a 17
|
2388
|
-
# c 1
|
2389
|
-
#
|
2390
|
-
# df.aggregate(num: :mean)
|
2391
|
-
# => #<DaruLite::DataFrame(3x1)>
|
2392
|
-
# num
|
2393
|
-
# a 25.3333333
|
2394
|
-
# b 12
|
2395
|
-
# c 1
|
2396
|
-
#
|
2397
|
-
# Note: `GroupBy` class `aggregate` method uses this `aggregate` method
|
2398
|
-
# internally.
|
2399
|
-
def aggregate(options = {}, multi_index_level = -1)
|
2400
|
-
if block_given?
|
2401
|
-
positions_tuples, new_index = yield(@index) # NOTE: use of yield is private for now
|
2402
|
-
else
|
2403
|
-
positions_tuples, new_index = group_index_for_aggregation(@index, multi_index_level)
|
2404
|
-
end
|
2405
|
-
|
2406
|
-
colmn_value = aggregate_by_positions_tuples(options, positions_tuples)
|
2407
|
-
|
2408
|
-
DaruLite::DataFrame.new(colmn_value, index: new_index, order: options.keys)
|
2409
|
-
end
|
2410
|
-
|
2411
|
-
def group_by_and_aggregate(*group_by_keys, **aggregation_map)
|
2412
|
-
group_by(*group_by_keys).aggregate(aggregation_map)
|
2413
|
-
end
|
2414
|
-
|
2415
509
|
private
|
2416
510
|
|
2417
511
|
def headers
|
@@ -2422,20 +516,6 @@ module DaruLite
|
|
2422
516
|
index.is_a?(MultiIndex) ? index.sparse_tuples : index.to_a
|
2423
517
|
end
|
2424
518
|
|
2425
|
-
def convert_categorical_vectors(names)
|
2426
|
-
names.filter_map do |n|
|
2427
|
-
next unless self[n].category?
|
2428
|
-
|
2429
|
-
old = [n, self[n]]
|
2430
|
-
self[n] = DaruLite::Vector.new(self[n].to_ints)
|
2431
|
-
old
|
2432
|
-
end
|
2433
|
-
end
|
2434
|
-
|
2435
|
-
def restore_categorical_vectors(old)
|
2436
|
-
old.each { |name, vector| self[name] = vector }
|
2437
|
-
end
|
2438
|
-
|
2439
519
|
def recursive_product(dfs)
|
2440
520
|
return dfs.first if dfs.size == 1
|
2441
521
|
|
@@ -2447,12 +527,6 @@ module DaruLite
|
|
2447
527
|
end
|
2448
528
|
end
|
2449
529
|
|
2450
|
-
def should_be_vector!(val)
|
2451
|
-
return val if val.is_a?(DaruLite::Vector)
|
2452
|
-
|
2453
|
-
raise TypeError, "Every iteration must return DaruLite::Vector not #{val.class}"
|
2454
|
-
end
|
2455
|
-
|
2456
530
|
def dispatch_to_axis(axis, method, *args, &block)
|
2457
531
|
if %i[vector column].include?(axis)
|
2458
532
|
send(:"#{method}_vector", *args, &block)
|
@@ -2483,76 +557,6 @@ module DaruLite
|
|
2483
557
|
end
|
2484
558
|
end
|
2485
559
|
|
2486
|
-
def access_vector(*names)
|
2487
|
-
if names.first.is_a?(Range)
|
2488
|
-
dup(@vectors.subset(names.first))
|
2489
|
-
elsif @vectors.is_a?(MultiIndex)
|
2490
|
-
access_vector_multi_index(*names)
|
2491
|
-
else
|
2492
|
-
access_vector_single_index(*names)
|
2493
|
-
end
|
2494
|
-
end
|
2495
|
-
|
2496
|
-
def access_vector_multi_index(*names)
|
2497
|
-
pos = @vectors[names]
|
2498
|
-
|
2499
|
-
return @data[pos] if pos.is_a?(Integer)
|
2500
|
-
|
2501
|
-
new_vectors = pos.map { |tuple| @data[@vectors[tuple]] }
|
2502
|
-
|
2503
|
-
pos = pos.drop_left_level(names.size) if names.size < @vectors.width
|
2504
|
-
|
2505
|
-
DaruLite::DataFrame.new(new_vectors, index: @index, order: pos)
|
2506
|
-
end
|
2507
|
-
|
2508
|
-
def access_vector_single_index(*names)
|
2509
|
-
if names.count < 2
|
2510
|
-
begin
|
2511
|
-
pos = @vectors.is_a?(DaruLite::DateTimeIndex) ? @vectors[names.first] : @vectors.pos(names.first)
|
2512
|
-
rescue IndexError
|
2513
|
-
raise IndexError, "Specified vector #{names.first} does not exist"
|
2514
|
-
end
|
2515
|
-
return @data[pos] if pos.is_a?(Numeric)
|
2516
|
-
|
2517
|
-
names = pos
|
2518
|
-
end
|
2519
|
-
|
2520
|
-
new_vectors = names.map { |name| [name, @data[@vectors.pos(name)]] }.to_h
|
2521
|
-
|
2522
|
-
order = names.is_a?(Array) ? DaruLite::Index.new(names) : names
|
2523
|
-
DaruLite::DataFrame.new(new_vectors, order: order, index: @index, name: @name)
|
2524
|
-
end
|
2525
|
-
|
2526
|
-
def access_row(*indexes)
|
2527
|
-
positions = @index.pos(*indexes)
|
2528
|
-
|
2529
|
-
if positions.is_a? Numeric
|
2530
|
-
row = get_rows_for([positions])
|
2531
|
-
DaruLite::Vector.new row, index: @vectors, name: indexes.first
|
2532
|
-
else
|
2533
|
-
new_rows = get_rows_for(indexes, by_position: false)
|
2534
|
-
DaruLite::DataFrame.new new_rows, index: @index.subset(*indexes), order: @vectors
|
2535
|
-
end
|
2536
|
-
end
|
2537
|
-
|
2538
|
-
# @param keys [Array] can be an array of positions (if by_position is true) or indexes (if by_position is false)
|
2539
|
-
# because of coercion by DaruLite::Vector#at and DaruLite::Vector#[], can return either an Array of
|
2540
|
-
# values (representing a row) or an array of Vectors (that can be seen as rows)
|
2541
|
-
def get_rows_for(keys, by_position: true)
|
2542
|
-
raise unless keys.is_a?(Array)
|
2543
|
-
|
2544
|
-
if by_position
|
2545
|
-
pos = keys
|
2546
|
-
@data.map { |vector| vector.at(*pos) }
|
2547
|
-
else
|
2548
|
-
# TODO: for now (2018-07-27), it is different than using
|
2549
|
-
# get_rows_for(@index.pos(*keys))
|
2550
|
-
# because DaruLite::Vector#at and DaruLite::Vector#[] don't handle DaruLite::MultiIndex the same way
|
2551
|
-
indexes = keys
|
2552
|
-
@data.map { |vec| vec[*indexes] }
|
2553
|
-
end
|
2554
|
-
end
|
2555
|
-
|
2556
560
|
def insert_or_modify_vector(name, vector)
|
2557
561
|
name = name[0] unless @vectors.is_a?(MultiIndex)
|
2558
562
|
|
@@ -2835,146 +839,6 @@ module DaruLite
|
|
2835
839
|
end
|
2836
840
|
end
|
2837
841
|
|
2838
|
-
def sort_build_row(vector_locs, by_blocks, ascending, handle_nils, r1, r2) # rubocop:disable Metrics/ParameterLists
|
2839
|
-
# Create an array to be used for comparison of two rows in sorting
|
2840
|
-
vector_locs
|
2841
|
-
.zip(by_blocks, ascending, handle_nils)
|
2842
|
-
.map do |vector_loc, by, asc, handle_nil|
|
2843
|
-
value = @data[vector_loc].data[asc ? r1 : r2]
|
2844
|
-
|
2845
|
-
if by
|
2846
|
-
value = begin
|
2847
|
-
by.call(value)
|
2848
|
-
rescue StandardError
|
2849
|
-
nil
|
2850
|
-
end
|
2851
|
-
end
|
2852
|
-
|
2853
|
-
sort_handle_nils value, asc, handle_nil || !by
|
2854
|
-
end
|
2855
|
-
end
|
2856
|
-
|
2857
|
-
def sort_handle_nils(value, asc, handle_nil)
|
2858
|
-
if !handle_nil
|
2859
|
-
value
|
2860
|
-
elsif asc
|
2861
|
-
[value.nil? ? 0 : 1, value]
|
2862
|
-
else
|
2863
|
-
[value.nil? ? 1 : 0, value]
|
2864
|
-
end
|
2865
|
-
end
|
2866
|
-
|
2867
|
-
def sort_coerce_boolean(opts, symbol, default, size)
|
2868
|
-
val = opts[symbol]
|
2869
|
-
case val
|
2870
|
-
when true, false
|
2871
|
-
Array.new(size, val)
|
2872
|
-
when nil
|
2873
|
-
Array.new(size, default)
|
2874
|
-
when Array
|
2875
|
-
raise ArgumentError, "Specify same number of vector names and #{symbol}" if
|
2876
|
-
size != val.size
|
2877
|
-
|
2878
|
-
val
|
2879
|
-
else
|
2880
|
-
raise ArgumentError, "Can't coerce #{symbol} from #{val.class} to boolean option"
|
2881
|
-
end
|
2882
|
-
end
|
2883
|
-
|
2884
|
-
def sort_prepare_block(vector_order, opts)
|
2885
|
-
ascending = sort_coerce_boolean opts, :ascending, true, vector_order.size
|
2886
|
-
handle_nils = sort_coerce_boolean opts, :handle_nils, false, vector_order.size
|
2887
|
-
|
2888
|
-
by_blocks = vector_order.map { |v| (opts[:by] || {})[v] }
|
2889
|
-
vector_locs = vector_order.map { |v| @vectors[v] }
|
2890
|
-
|
2891
|
-
lambda do |index1, index2|
|
2892
|
-
# Build left and right array to compare two rows
|
2893
|
-
left = sort_build_row vector_locs, by_blocks, ascending, handle_nils, index1, index2
|
2894
|
-
right = sort_build_row vector_locs, by_blocks, ascending, handle_nils, index2, index1
|
2895
|
-
|
2896
|
-
# Resolve conflict by Index if all attributes are same
|
2897
|
-
left << index1
|
2898
|
-
right << index2
|
2899
|
-
left <=> right
|
2900
|
-
end
|
2901
|
-
end
|
2902
|
-
|
2903
|
-
def verify_error_message(row, test, id, i)
|
2904
|
-
description, fields, = test
|
2905
|
-
values = fields.empty? ? '' : " (#{fields.collect { |k| "#{k}=#{row[k]}" }.join(', ')})"
|
2906
|
-
"#{i + 1} [#{row[id]}]: #{description}#{values}"
|
2907
|
-
end
|
2908
|
-
|
2909
|
-
def prepare_pivot_values(index, vectors, opts)
|
2910
|
-
case opts[:values]
|
2911
|
-
when nil # values not specified at all.
|
2912
|
-
(@vectors.to_a - (index | vectors)) & numeric_vector_names
|
2913
|
-
when Array # multiple values specified.
|
2914
|
-
opts[:values]
|
2915
|
-
else # single value specified.
|
2916
|
-
[opts[:values]]
|
2917
|
-
end
|
2918
|
-
end
|
2919
|
-
|
2920
|
-
def make_pivot_hash(grouped, vectors, values, aggregate_function)
|
2921
|
-
grouped.groups.transform_values { |_| {} }.tap do |super_hash|
|
2922
|
-
values.each do |value|
|
2923
|
-
grouped.groups.each do |group_name, row_numbers|
|
2924
|
-
row_numbers.each do |num|
|
2925
|
-
arry = [value, *vectors.map { |v| self[v][num] }]
|
2926
|
-
sub_hash = super_hash[group_name]
|
2927
|
-
sub_hash[arry] ||= []
|
2928
|
-
|
2929
|
-
sub_hash[arry] << self[value][num]
|
2930
|
-
end
|
2931
|
-
end
|
2932
|
-
end
|
2933
|
-
|
2934
|
-
setup_pivot_aggregates super_hash, aggregate_function
|
2935
|
-
end
|
2936
|
-
end
|
2937
|
-
|
2938
|
-
def setup_pivot_aggregates(super_hash, aggregate_function)
|
2939
|
-
super_hash.each_value do |sub_hash|
|
2940
|
-
sub_hash.each do |group_name, aggregates|
|
2941
|
-
sub_hash[group_name] = DaruLite::Vector.new(aggregates).send(aggregate_function)
|
2942
|
-
end
|
2943
|
-
end
|
2944
|
-
end
|
2945
|
-
|
2946
|
-
def pivot_dataframe(super_hash)
|
2947
|
-
df_index = DaruLite::MultiIndex.from_tuples super_hash.keys
|
2948
|
-
df_vectors = DaruLite::MultiIndex.from_tuples super_hash.values.flat_map(&:keys).uniq
|
2949
|
-
|
2950
|
-
DaruLite::DataFrame.new({}, index: df_index, order: df_vectors).tap do |pivoted_dataframe|
|
2951
|
-
super_hash.each do |row_index, sub_h|
|
2952
|
-
sub_h.each do |vector_index, val|
|
2953
|
-
pivoted_dataframe[vector_index][row_index] = val
|
2954
|
-
end
|
2955
|
-
end
|
2956
|
-
end
|
2957
|
-
end
|
2958
|
-
|
2959
|
-
def one_to_many_components(pattern)
|
2960
|
-
re = Regexp.new pattern.gsub('%v', '(.+?)').gsub('%n', '(\\d+?)')
|
2961
|
-
|
2962
|
-
vars, numbers =
|
2963
|
-
@vectors
|
2964
|
-
.map { |v| v.scan(re) }
|
2965
|
-
.reject(&:empty?).flatten(1).transpose
|
2966
|
-
|
2967
|
-
[vars.uniq, numbers.map(&:to_i).sort.uniq]
|
2968
|
-
end
|
2969
|
-
|
2970
|
-
def one_to_many_row(row, number, vars, pattern)
|
2971
|
-
vars
|
2972
|
-
.to_h do |v|
|
2973
|
-
name = pattern.sub('%v', v).sub('%n', number.to_s)
|
2974
|
-
[v, row[name]]
|
2975
|
-
end
|
2976
|
-
end
|
2977
|
-
|
2978
842
|
# Raises IndexError when one of the positions is not a valid position
|
2979
843
|
def validate_positions(*positions, size)
|
2980
844
|
positions.each do |pos|
|
@@ -2999,82 +863,5 @@ module DaruLite
|
|
2999
863
|
DaruLite::Vector.new(source[idx], index: @index, name: vectors[idx])
|
3000
864
|
end
|
3001
865
|
end
|
3002
|
-
|
3003
|
-
# Runs the requested aggregations over each group of row positions.
#
# @param options [Hash] key => aggregation (normalized first through
#   cast_aggregation_options)
# @param positions_tuples [Array<Array>] one collection of positions per group
# @return [Array<Array>] one inner array per aggregation, holding one result
#   per positions tuple (for the sub-dataframe path this assumes
#   apply_method_on_sub_df returns one value per method - TODO confirm)
def aggregate_by_positions_tuples(options, positions_tuples)
  vectors_only, casted_options = cast_aggregation_options(options)

  unless vectors_only
    aggregators = casted_options.values

    # NOTE: because we aggregate over rows, every method is applied to the
    # same sub-df in one call, so the (expensive) sub-df is not re-fetched
    # for each method.
    per_group = positions_tuples.map do |positions|
      apply_method_on_sub_df(aggregators, keys: positions)
    end
    return per_group.transpose
  end

  casted_options.map do |vector_name, aggregator|
    vector = self[vector_name]
    positions_tuples.map do |positions|
      vector.apply_method_on_sub_vector(aggregator, keys: positions)
    end
  end
end
|
3025
|
-
|
3026
|
-
# Converts operations over sub-vectors into operations over sub-dfs when that
# improves performance.
#
# The conversion is not applied unconditionally, because aggregating over a
# single vector (or a few vectors) is faster than aggregating over (sub-)dfs.
# It only happens when at least one key of +options+ is not one of this
# dataframe's vectors; in that case each vector-keyed aggregation is wrapped
# in a lambda that takes a sub-df and applies the aggregation to the vector
# of that name. The caller's hash is never modified.
#
# @param options [Hash] key => aggregation (anything responding to #to_proc)
# @return [Array(Boolean, Hash)] +true+ and the untouched +options+ when
#   every key is a vector; otherwise +false+ and a converted copy
def cast_aggregation_options(options)
  vector_keys, other_keys = options.keys.partition { |key| @vectors.include?(key) }
  return [true, options] unless other_keys.any?

  casted = options.clone
  vector_keys.each do |key|
    on_vector = casted[key].to_proc
    casted[key] = ->(sub_df) { on_vector.call(sub_df[key]) }
  end

  [false, casted]
end
|
3047
|
-
|
3048
|
-
# Groups the positions of +index+ by key, in preparation for an aggregation.
#
# @param index [DaruLite::MultiIndex, DaruLite::Index, DaruLite::CategoricalIndex]
#   index to group
# @param multi_index_level [Integer] level forwarded to
#   DaruLite::Core::GroupBy.get_positions_group_for_aggregation; only used
#   for a MultiIndex
# @return [Array(Array<Array>, Object)] the positions of each group, and the
#   index of the groups (an Array of distinct keys for a plain or categorical
#   index; the coerced MultiIndex of group keys otherwise)
# @raise [RuntimeError] when +index+ is none of the supported index types
def group_index_for_aggregation(index, multi_index_level = -1)
  case index
  when DaruLite::MultiIndex
    groups_by_pos = DaruLite::Core::GroupBy.get_positions_group_for_aggregation(index, multi_index_level)

    new_index = DaruLite::MultiIndex.from_tuples(groups_by_pos.keys).coerce_index
    pos_tuples = groups_by_pos.values
  when DaruLite::Index, DaruLite::CategoricalIndex
    new_index = Array(index).uniq
    # #pos may return a single position or several; always normalize to an Array.
    pos_tuples = new_index.map { |key| [*index.pos(key)] }
  else
    # An explicit message is required: a bare `raise` carries no information
    # and would re-raise any exception currently being handled instead.
    raise "Unsupported index type for aggregation: #{index.class}"
  end

  [pos_tuples, new_index]
end
|
3063
|
-
|
3064
|
-
# Coerces the given positions (an integer, a range, or several positions)
# into the appropriate form.
#
# @param positions [Array] the positions; the last argument of the call is
#   always taken as +size+
# @param size [Integer] number of available positions, used to expand a Range
# @return [Integer, Array] a lone Integer unchanged; a lone Range expanded to
#   the positions it selects among 0...size; several positions as given
# @raise [ArgumentError] when a lone position is neither an Integer nor a Range
def coerce_positions(*positions, size)
  return positions unless positions.size == 1

  position = positions.first
  case position
  when Integer then position
  when Range then size.times.to_a[position]
  else raise ArgumentError, 'Unknown position type.'
  end
end
|
3079
866
|
end
|
3080
867
|
end
|