daru_lite 0.1.1 → 0.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.rubocop_todo.yml +35 -33
- data/lib/daru_lite/data_frame/aggregatable.rb +165 -0
- data/lib/daru_lite/data_frame/calculatable.rb +140 -0
- data/lib/daru_lite/data_frame/convertible.rb +107 -0
- data/lib/daru_lite/data_frame/duplicatable.rb +64 -0
- data/lib/daru_lite/data_frame/fetchable.rb +301 -0
- data/lib/daru_lite/data_frame/filterable.rb +144 -0
- data/lib/daru_lite/data_frame/i_o_able.rb +179 -0
- data/lib/daru_lite/data_frame/indexable.rb +168 -0
- data/lib/daru_lite/data_frame/iterable.rb +339 -0
- data/lib/daru_lite/data_frame/joinable.rb +152 -0
- data/lib/daru_lite/data_frame/missable.rb +75 -0
- data/lib/daru_lite/data_frame/pivotable.rb +108 -0
- data/lib/daru_lite/data_frame/queryable.rb +67 -0
- data/lib/daru_lite/data_frame/setable.rb +109 -0
- data/lib/daru_lite/data_frame/sortable.rb +241 -0
- data/lib/daru_lite/dataframe.rb +138 -2353
- data/lib/daru_lite/index/index.rb +13 -0
- data/lib/daru_lite/maths/statistics/vector.rb +1 -1
- data/lib/daru_lite/vector/aggregatable.rb +9 -0
- data/lib/daru_lite/vector/calculatable.rb +78 -0
- data/lib/daru_lite/vector/convertible.rb +77 -0
- data/lib/daru_lite/vector/duplicatable.rb +17 -0
- data/lib/daru_lite/vector/fetchable.rb +175 -0
- data/lib/daru_lite/vector/filterable.rb +128 -0
- data/lib/daru_lite/vector/indexable.rb +77 -0
- data/lib/daru_lite/vector/iterable.rb +95 -0
- data/lib/daru_lite/vector/joinable.rb +17 -0
- data/lib/daru_lite/vector/missable.rb +124 -0
- data/lib/daru_lite/vector/queryable.rb +45 -0
- data/lib/daru_lite/vector/setable.rb +47 -0
- data/lib/daru_lite/vector/sortable.rb +113 -0
- data/lib/daru_lite/vector.rb +36 -932
- data/lib/daru_lite/version.rb +1 -1
- data/spec/data_frame/aggregatable_example.rb +65 -0
- data/spec/data_frame/buildable_example.rb +109 -0
- data/spec/data_frame/calculatable_example.rb +135 -0
- data/spec/data_frame/convertible_example.rb +180 -0
- data/spec/data_frame/duplicatable_example.rb +111 -0
- data/spec/data_frame/fetchable_example.rb +476 -0
- data/spec/data_frame/filterable_example.rb +250 -0
- data/spec/data_frame/indexable_example.rb +221 -0
- data/spec/data_frame/iterable_example.rb +465 -0
- data/spec/data_frame/joinable_example.rb +106 -0
- data/spec/data_frame/missable_example.rb +47 -0
- data/spec/data_frame/pivotable_example.rb +297 -0
- data/spec/data_frame/queryable_example.rb +92 -0
- data/spec/data_frame/setable_example.rb +482 -0
- data/spec/data_frame/sortable_example.rb +350 -0
- data/spec/dataframe_spec.rb +181 -3289
- data/spec/index/index_spec.rb +8 -0
- data/spec/vector/aggregatable_example.rb +27 -0
- data/spec/vector/calculatable_example.rb +82 -0
- data/spec/vector/convertible_example.rb +126 -0
- data/spec/vector/duplicatable_example.rb +48 -0
- data/spec/vector/fetchable_example.rb +463 -0
- data/spec/vector/filterable_example.rb +165 -0
- data/spec/vector/indexable_example.rb +201 -0
- data/spec/vector/iterable_example.rb +111 -0
- data/spec/vector/joinable_example.rb +25 -0
- data/spec/vector/missable_example.rb +88 -0
- data/spec/vector/queryable_example.rb +91 -0
- data/spec/vector/setable_example.rb +300 -0
- data/spec/vector/sortable_example.rb +242 -0
- data/spec/vector_spec.rb +111 -1805
- metadata +86 -2
data/lib/daru_lite/dataframe.rb
CHANGED
@@ -1,10 +1,40 @@
|
|
1
1
|
require 'daru_lite/accessors/dataframe_by_row'
|
2
|
+
require 'daru_lite/data_frame/aggregatable'
|
3
|
+
require 'daru_lite/data_frame/calculatable'
|
4
|
+
require 'daru_lite/data_frame/convertible'
|
5
|
+
require 'daru_lite/data_frame/duplicatable'
|
6
|
+
require 'daru_lite/data_frame/fetchable'
|
7
|
+
require 'daru_lite/data_frame/filterable'
|
8
|
+
require 'daru_lite/data_frame/indexable'
|
9
|
+
require 'daru_lite/data_frame/i_o_able'
|
10
|
+
require 'daru_lite/data_frame/iterable'
|
11
|
+
require 'daru_lite/data_frame/joinable'
|
12
|
+
require 'daru_lite/data_frame/missable'
|
13
|
+
require 'daru_lite/data_frame/pivotable'
|
14
|
+
require 'daru_lite/data_frame/setable'
|
15
|
+
require 'daru_lite/data_frame/sortable'
|
16
|
+
require 'daru_lite/data_frame/queryable'
|
2
17
|
require 'daru_lite/maths/arithmetic/dataframe'
|
3
18
|
require 'daru_lite/maths/statistics/dataframe'
|
4
19
|
require 'daru_lite/io/io'
|
5
20
|
|
6
21
|
module DaruLite
|
7
22
|
class DataFrame # rubocop:disable Metrics/ClassLength
|
23
|
+
include DaruLite::DataFrame::Aggregatable
|
24
|
+
include DaruLite::DataFrame::Calculatable
|
25
|
+
include DaruLite::DataFrame::Convertible
|
26
|
+
include DaruLite::DataFrame::Duplicatable
|
27
|
+
include DaruLite::DataFrame::Fetchable
|
28
|
+
include DaruLite::DataFrame::Filterable
|
29
|
+
include DaruLite::DataFrame::Indexable
|
30
|
+
include DaruLite::DataFrame::Iterable
|
31
|
+
include DaruLite::DataFrame::IOAble
|
32
|
+
include DaruLite::DataFrame::Joinable
|
33
|
+
include DaruLite::DataFrame::Missable
|
34
|
+
include DaruLite::DataFrame::Pivotable
|
35
|
+
include DaruLite::DataFrame::Setable
|
36
|
+
include DaruLite::DataFrame::Sortable
|
37
|
+
include DaruLite::DataFrame::Queryable
|
8
38
|
include DaruLite::Maths::Arithmetic::DataFrame
|
9
39
|
include DaruLite::Maths::Statistics::DataFrame
|
10
40
|
|
@@ -13,109 +43,6 @@ module DaruLite
|
|
13
43
|
extend Gem::Deprecate
|
14
44
|
|
15
45
|
class << self
|
16
|
-
# Load data from a CSV file. Specify an optional block to grab the CSV
|
17
|
-
# object and pre-condition it (for example use the `convert` or
|
18
|
-
# `header_convert` methods).
|
19
|
-
#
|
20
|
-
# == Arguments
|
21
|
-
#
|
22
|
-
# * path - Local path / Remote URL of the file to load specified as a String.
|
23
|
-
#
|
24
|
-
# == Options
|
25
|
-
#
|
26
|
-
# Accepts the same options as the DaruLite::DataFrame constructor and CSV.open()
|
27
|
-
# and uses those to eventually construct the resulting DataFrame.
|
28
|
-
#
|
29
|
-
# == Verbose Description
|
30
|
-
#
|
31
|
-
# You can specify all the options to the `.from_csv` function that you
|
32
|
-
# do to the Ruby `CSV.read()` function, since this is what is used internally.
|
33
|
-
#
|
34
|
-
# For example, if the columns in your CSV file are separated by something
|
35
|
-
# other than commas, you can use the `:col_sep` option. If you want to
|
36
|
-
# convert numeric values to numbers and not keep them as strings, you can
|
37
|
-
# use the `:converters` option and set it to `:numeric`.
|
38
|
-
#
|
39
|
-
# The `.from_csv` function uses the following defaults for reading CSV files
|
40
|
-
# (that are passed into the `CSV.read()` function):
|
41
|
-
#
|
42
|
-
# {
|
43
|
-
# :col_sep => ',',
|
44
|
-
# :converters => :numeric
|
45
|
-
# }
|
46
|
-
def from_csv(path, opts = {}, &block)
|
47
|
-
DaruLite::IO.from_csv path, opts, &block
|
48
|
-
end
|
49
|
-
|
50
|
-
# Read data from an Excel file into a DataFrame.
|
51
|
-
#
|
52
|
-
# == Arguments
|
53
|
-
#
|
54
|
-
# * path - Path of the file to be read.
|
55
|
-
#
|
56
|
-
# == Options
|
57
|
-
#
|
58
|
-
# *:worksheet_id - ID of the worksheet that is to be read.
|
59
|
-
def from_excel(path, opts = {}, &block)
|
60
|
-
DaruLite::IO.from_excel path, opts, &block
|
61
|
-
end
|
62
|
-
|
63
|
-
# Execute a database query and return the result as a DataFrame
|
64
|
-
#
|
65
|
-
# @param dbh [DBI::DatabaseHandle, String] A DBI connection or the path to a SQLite3 database.
|
66
|
-
# @param query [String] The query to be executed
|
67
|
-
#
|
68
|
-
# @return A dataframe containing the data resulting from the query
|
69
|
-
#
|
70
|
-
# USE:
|
71
|
-
#
|
72
|
-
# dbh = DBI.connect("DBI:Mysql:database:localhost", "user", "password")
|
73
|
-
# DaruLite::DataFrame.from_sql(dbh, "SELECT * FROM test")
|
74
|
-
#
|
75
|
-
# #Alternatively
|
76
|
-
#
|
77
|
-
# require 'dbi'
|
78
|
-
# DaruLite::DataFrame.from_sql("path/to/sqlite.db", "SELECT * FROM test")
|
79
|
-
def from_sql(dbh, query)
|
80
|
-
DaruLite::IO.from_sql dbh, query
|
81
|
-
end
|
82
|
-
|
83
|
-
# Read a dataframe from AR::Relation
|
84
|
-
#
|
85
|
-
# @param relation [ActiveRecord::Relation] An AR::Relation object from which data is loaded
|
86
|
-
# @param fields [Array] Field names to be loaded (optional)
|
87
|
-
#
|
88
|
-
# @return A dataframe containing the data loaded from the relation
|
89
|
-
#
|
90
|
-
# USE:
|
91
|
-
#
|
92
|
-
# # When Post model is defined as:
|
93
|
-
# class Post < ActiveRecord::Base
|
94
|
-
# scope :active, -> { where.not(published_at: nil) }
|
95
|
-
# end
|
96
|
-
#
|
97
|
-
# # You can load active posts into a dataframe by:
|
98
|
-
# DaruLite::DataFrame.from_activerecord(Post.active, :title, :published_at)
|
99
|
-
def from_activerecord(relation, *fields)
|
100
|
-
DaruLite::IO.from_activerecord relation, *fields
|
101
|
-
end
|
102
|
-
|
103
|
-
# Read the database from a plaintext file. For this method to work,
|
104
|
-
# the data should be present in a plain text file in columns. See
|
105
|
-
# spec/fixtures/bank2.dat for an example.
|
106
|
-
#
|
107
|
-
# == Arguments
|
108
|
-
#
|
109
|
-
# * path - Path of the file to be read.
|
110
|
-
# * fields - Vector names of the resulting database.
|
111
|
-
#
|
112
|
-
# == Usage
|
113
|
-
#
|
114
|
-
# df = DaruLite::DataFrame.from_plaintext 'spec/fixtures/bank2.dat', [:v1,:v2,:v3,:v4,:v5,:v6]
|
115
|
-
def from_plaintext(path, fields)
|
116
|
-
DaruLite::IO.from_plaintext path, fields
|
117
|
-
end
|
118
|
-
|
119
46
|
# Create DataFrame by specifying rows as an Array of Arrays or Array of
|
120
47
|
# DaruLite::Vector objects.
|
121
48
|
def rows(source, opts = {})
|
@@ -316,179 +243,6 @@ module DaruLite
|
|
316
243
|
update
|
317
244
|
end
|
318
245
|
|
319
|
-
# Access row or vector. Specify name of row/vector followed by axis(:row, :vector).
|
320
|
-
# Defaults to *:vector*. Use of this method is not recommended for accessing
|
321
|
-
# rows. Use df.row[:a] for accessing row with index ':a'.
|
322
|
-
def [](*names)
|
323
|
-
axis = extract_axis(names, :vector)
|
324
|
-
dispatch_to_axis axis, :access, *names
|
325
|
-
end
|
326
|
-
|
327
|
-
# Retrieve rows by positions
|
328
|
-
# @param [Array<Integer>] positions of rows to retrieve
|
329
|
-
# @return [DaruLite::Vector, DaruLite::DataFrame] vector for single position and dataframe for multiple positions
|
330
|
-
# @example
|
331
|
-
# df = DaruLite::DataFrame.new({
|
332
|
-
# a: [1, 2, 3],
|
333
|
-
# b: ['a', 'b', 'c']
|
334
|
-
# })
|
335
|
-
# df.row_at 1, 2
|
336
|
-
# # => #<DaruLite::DataFrame(2x2)>
|
337
|
-
# # a b
|
338
|
-
# # 1 2 b
|
339
|
-
# # 2 3 c
|
340
|
-
def row_at(*positions)
|
341
|
-
original_positions = positions
|
342
|
-
positions = coerce_positions(*positions, nrows)
|
343
|
-
validate_positions(*positions, nrows)
|
344
|
-
|
345
|
-
if positions.is_a? Integer
|
346
|
-
row = get_rows_for([positions])
|
347
|
-
DaruLite::Vector.new row, index: @vectors
|
348
|
-
else
|
349
|
-
new_rows = get_rows_for(original_positions)
|
350
|
-
DaruLite::DataFrame.new new_rows, index: @index.at(*original_positions), order: @vectors
|
351
|
-
end
|
352
|
-
end
|
353
|
-
|
354
|
-
# Set rows by positions
|
355
|
-
# @param [Array<Integer>] positions positions of rows to set
|
356
|
-
# @param [Array, DaruLite::Vector] vector vector to be assigned
|
357
|
-
# @example
|
358
|
-
# df = DaruLite::DataFrame.new({
|
359
|
-
# a: [1, 2, 3],
|
360
|
-
# b: ['a', 'b', 'c']
|
361
|
-
# })
|
362
|
-
# df.set_row_at [0, 1], ['x', 'x']
|
363
|
-
# df
|
364
|
-
# #=> #<DaruLite::DataFrame(3x2)>
|
365
|
-
# # a b
|
366
|
-
# # 0 x x
|
367
|
-
# # 1 x x
|
368
|
-
# # 2 3 c
|
369
|
-
def set_row_at(positions, vector)
|
370
|
-
validate_positions(*positions, nrows)
|
371
|
-
vector =
|
372
|
-
if vector.is_a? DaruLite::Vector
|
373
|
-
vector.reindex @vectors
|
374
|
-
else
|
375
|
-
DaruLite::Vector.new vector
|
376
|
-
end
|
377
|
-
|
378
|
-
raise SizeError, 'Vector length should match row length' if
|
379
|
-
vector.size != @vectors.size
|
380
|
-
|
381
|
-
@data.each_with_index do |vec, pos|
|
382
|
-
vec.set_at(positions, vector.at(pos))
|
383
|
-
end
|
384
|
-
@index = @data[0].index
|
385
|
-
set_size
|
386
|
-
end
|
387
|
-
|
388
|
-
# Retrieve vectors by positions
|
389
|
-
# @param [Array<Integer>] positions of vectors to retrieve
|
390
|
-
# @return [DaruLite::Vector, DaruLite::DataFrame] vector for single position and dataframe for multiple positions
|
391
|
-
# @example
|
392
|
-
# df = DaruLite::DataFrame.new({
|
393
|
-
# a: [1, 2, 3],
|
394
|
-
# b: ['a', 'b', 'c']
|
395
|
-
# })
|
396
|
-
# df.at 0
|
397
|
-
# # => #<DaruLite::Vector(3)>
|
398
|
-
# # a
|
399
|
-
# # 0 1
|
400
|
-
# # 1 2
|
401
|
-
# # 2 3
|
402
|
-
def at(*positions)
|
403
|
-
if AXES.include? positions.last
|
404
|
-
axis = positions.pop
|
405
|
-
return row_at(*positions) if axis == :row
|
406
|
-
end
|
407
|
-
|
408
|
-
original_positions = positions
|
409
|
-
positions = coerce_positions(*positions, ncols)
|
410
|
-
validate_positions(*positions, ncols)
|
411
|
-
|
412
|
-
if positions.is_a? Integer
|
413
|
-
@data[positions].dup
|
414
|
-
else
|
415
|
-
DaruLite::DataFrame.new positions.map { |pos| @data[pos].dup },
|
416
|
-
index: @index,
|
417
|
-
order: @vectors.at(*original_positions),
|
418
|
-
name: @name
|
419
|
-
end
|
420
|
-
end
|
421
|
-
|
422
|
-
# Set vectors by positions
|
423
|
-
# @param [Array<Integer>] positions positions of vectors to set
|
424
|
-
# @param [Array, DaruLite::Vector] vector vector to be assigned
|
425
|
-
# @example
|
426
|
-
# df = DaruLite::DataFrame.new({
|
427
|
-
# a: [1, 2, 3],
|
428
|
-
# b: ['a', 'b', 'c']
|
429
|
-
# })
|
430
|
-
# df.set_at [0], ['x', 'y', 'z']
|
431
|
-
# df
|
432
|
-
# #=> #<DaruLite::DataFrame(3x2)>
|
433
|
-
# # a b
|
434
|
-
# # 0 x a
|
435
|
-
# # 1 y b
|
436
|
-
# # 2 z c
|
437
|
-
def set_at(positions, vector)
|
438
|
-
if positions.last == :row
|
439
|
-
positions.pop
|
440
|
-
return set_row_at(positions, vector)
|
441
|
-
end
|
442
|
-
|
443
|
-
validate_positions(*positions, ncols)
|
444
|
-
vector =
|
445
|
-
if vector.is_a? DaruLite::Vector
|
446
|
-
vector.reindex @index
|
447
|
-
else
|
448
|
-
DaruLite::Vector.new vector
|
449
|
-
end
|
450
|
-
|
451
|
-
raise SizeError, 'Vector length should match index length' if
|
452
|
-
vector.size != @index.size
|
453
|
-
|
454
|
-
positions.each { |pos| @data[pos] = vector }
|
455
|
-
end
|
456
|
-
|
457
|
-
# Insert a new row/vector of the specified name or modify a previous row.
|
458
|
-
# Instead of using this method directly, use df.row[:a] = [1,2,3] to set/create
|
459
|
-
# a row ':a' to [1,2,3], or df.vector[:vec] = [1,2,3] for vectors.
|
460
|
-
#
|
461
|
-
# In case a DaruLite::Vector is specified after the equality sign, the indexes
|
462
|
-
# of the vector will be matched against the row/vector indexes of the DataFrame
|
463
|
-
# before an insertion is performed. Unmatched indexes will be set to nil.
|
464
|
-
def []=(*args)
|
465
|
-
vector = args.pop
|
466
|
-
axis = extract_axis(args)
|
467
|
-
names = args
|
468
|
-
|
469
|
-
dispatch_to_axis axis, :insert_or_modify, names, vector
|
470
|
-
end
|
471
|
-
|
472
|
-
def add_row(row, index = nil)
|
473
|
-
self.row[*(index || @size)] = row
|
474
|
-
end
|
475
|
-
|
476
|
-
def add_vector(n, vector)
|
477
|
-
self[n] = vector
|
478
|
-
end
|
479
|
-
|
480
|
-
def insert_vector(n, name, source)
|
481
|
-
raise ArgumentError unless source.is_a? Array
|
482
|
-
|
483
|
-
vector = DaruLite::Vector.new(source, index: @index, name: @name)
|
484
|
-
@data << vector
|
485
|
-
@vectors = @vectors.add name
|
486
|
-
ordr = @vectors.dup.to_a
|
487
|
-
elmnt = ordr.pop
|
488
|
-
ordr.insert n, elmnt
|
489
|
-
self.order = ordr
|
490
|
-
end
|
491
|
-
|
492
246
|
# Access a row or set/create a row. Refer #[] and #[]= docs for details.
|
493
247
|
#
|
494
248
|
# == Usage
|
@@ -498,1696 +252,177 @@ module DaruLite
|
|
498
252
|
DaruLite::Accessors::DataFrameByRow.new(self)
|
499
253
|
end
|
500
254
|
|
501
|
-
#
|
502
|
-
|
503
|
-
|
504
|
-
def get_sub_dataframe(keys, by_position: true)
|
505
|
-
return DaruLite::DataFrame.new({}) if keys == []
|
506
|
-
|
507
|
-
keys = @index.pos(*keys) unless by_position
|
508
|
-
|
509
|
-
sub_df = row_at(*keys)
|
510
|
-
sub_df = sub_df.to_df.transpose if sub_df.is_a?(DaruLite::Vector)
|
511
|
-
|
512
|
-
sub_df
|
513
|
-
end
|
514
|
-
|
515
|
-
# Duplicate the DataFrame entirely.
|
516
|
-
#
|
517
|
-
# == Arguments
|
518
|
-
#
|
519
|
-
# * +vectors_to_dup+ - An Array specifying the names of Vectors to
|
520
|
-
# be duplicated. Will duplicate the entire DataFrame if not specified.
|
521
|
-
def dup(vectors_to_dup = nil)
|
522
|
-
vectors_to_dup ||= @vectors.to_a
|
523
|
-
|
524
|
-
src = vectors_to_dup.map { |vec| @data[@vectors.pos(vec)].dup }
|
525
|
-
new_order = DaruLite::Index.new(vectors_to_dup)
|
526
|
-
|
527
|
-
DaruLite::DataFrame.new src, order: new_order, index: @index.dup, name: @name, clone: true
|
528
|
-
end
|
529
|
-
|
530
|
-
# Only clone the structure of the DataFrame.
|
531
|
-
def clone_structure
|
532
|
-
DaruLite::DataFrame.new([], order: @vectors.dup, index: @index.dup, name: @name)
|
533
|
-
end
|
534
|
-
|
535
|
-
# Returns a 'view' of the DataFrame, i.e the object ID's of vectors are
|
536
|
-
# preserved.
|
537
|
-
#
|
538
|
-
# == Arguments
|
539
|
-
#
|
540
|
-
# +vectors_to_clone+ - Names of vectors to clone. Optional. Will return
|
541
|
-
# a view of the whole data frame otherwise.
|
542
|
-
def clone(*vectors_to_clone)
|
543
|
-
vectors_to_clone.flatten! if ArrayHelper.array_of?(vectors_to_clone, Array)
|
544
|
-
vectors_to_clone = @vectors.to_a if vectors_to_clone.empty?
|
545
|
-
|
546
|
-
h = vectors_to_clone.map { |vec| [vec, self[vec]] }.to_h
|
547
|
-
DaruLite::DataFrame.new(h, clone: false, order: vectors_to_clone, name: @name)
|
548
|
-
end
|
549
|
-
|
550
|
-
# Returns a 'shallow' copy of DataFrame if missing data is not present,
|
551
|
-
# or a full copy of only valid data if missing data is present.
|
552
|
-
def clone_only_valid
|
553
|
-
if include_values?(*DaruLite::MISSING_VALUES)
|
554
|
-
reject_values(*DaruLite::MISSING_VALUES)
|
555
|
-
else
|
556
|
-
clone
|
557
|
-
end
|
558
|
-
end
|
559
|
-
|
560
|
-
# Creates a new duplicate dataframe containing only rows
|
561
|
-
# without a single missing value.
|
562
|
-
def dup_only_valid(vecs = nil)
|
563
|
-
rows_with_nil = @data.map { |vec| vec.indexes(*DaruLite::MISSING_VALUES) }
|
564
|
-
.inject(&:concat)
|
565
|
-
.uniq
|
566
|
-
|
567
|
-
row_indexes = @index.to_a
|
568
|
-
(vecs.nil? ? self : dup(vecs)).row[*(row_indexes - rows_with_nil)]
|
569
|
-
end
|
570
|
-
deprecate :dup_only_valid, :reject_values, 2016, 10
|
571
|
-
|
572
|
-
# Returns a dataframe in which rows with any of the mentioned values
|
573
|
-
# are ignored.
|
574
|
-
# @param [Array] values to reject to form the new dataframe
|
575
|
-
# @return [DaruLite::DataFrame] Data Frame with only rows which don't
|
576
|
-
# contain the mentioned values
|
577
|
-
# @example
|
578
|
-
# df = DaruLite::DataFrame.new({
|
579
|
-
# a: [1, 2, 3, nil, Float::NAN, nil, 1, 7],
|
580
|
-
# b: [:a, :b, nil, Float::NAN, nil, 3, 5, 8],
|
581
|
-
# c: ['a', Float::NAN, 3, 4, 3, 5, nil, 7]
|
582
|
-
# }, index: 11..18)
|
583
|
-
# df.reject_values nil, Float::NAN
|
584
|
-
# # => #<DaruLite::DataFrame(2x3)>
|
585
|
-
# # a b c
|
586
|
-
# # 11 1 a a
|
587
|
-
# # 18 7 8 7
|
588
|
-
def reject_values(*values)
|
589
|
-
positions =
|
590
|
-
size.times.to_a - @data.flat_map { |vec| vec.positions(*values) }
|
591
|
-
# Handle the case when positions size is 1 and #row_at wouldn't return a df
|
592
|
-
if positions.size == 1
|
593
|
-
pos = positions.first
|
594
|
-
row_at(pos..pos)
|
595
|
-
else
|
596
|
-
row_at(*positions)
|
597
|
-
end
|
598
|
-
end
|
599
|
-
|
600
|
-
# Replace specified values with given value
|
601
|
-
# @param [Array] old_values values to replace with new value
|
602
|
-
# @param [object] new_value new value to replace with
|
603
|
-
# @return [DaruLite::DataFrame] Data Frame itself with old values replaced
|
604
|
-
# with new value
|
605
|
-
# @example
|
606
|
-
# df = DaruLite::DataFrame.new({
|
607
|
-
# a: [1, 2, 3, nil, Float::NAN, nil, 1, 7],
|
608
|
-
# b: [:a, :b, nil, Float::NAN, nil, 3, 5, 8],
|
609
|
-
# c: ['a', Float::NAN, 3, 4, 3, 5, nil, 7]
|
610
|
-
# }, index: 11..18)
|
611
|
-
# df.replace_values nil, Float::NAN
|
612
|
-
# # => #<DaruLite::DataFrame(8x3)>
|
613
|
-
# # a b c
|
614
|
-
# # 11 1 a a
|
615
|
-
# # 12 2 b NaN
|
616
|
-
# # 13 3 NaN 3
|
617
|
-
# # 14 NaN NaN 4
|
618
|
-
# # 15 NaN NaN 3
|
619
|
-
# # 16 NaN 3 5
|
620
|
-
# # 17 1 5 NaN
|
621
|
-
# # 18 7 8 7
|
622
|
-
def replace_values(old_values, new_value)
|
623
|
-
@data.each { |vec| vec.replace_values old_values, new_value }
|
624
|
-
self
|
625
|
-
end
|
626
|
-
|
627
|
-
# Rolling fillna
|
628
|
-
# replace all Float::NAN and NIL values with the preceding or following value
|
629
|
-
#
|
630
|
-
# @param direction [Symbol] (:forward, :backward) whether replacement value is preceding or following
|
631
|
-
#
|
632
|
-
# @example
|
633
|
-
# df = DaruLite::DataFrame.new({
|
634
|
-
# a: [1, 2, 3, nil, Float::NAN, nil, 1, 7],
|
635
|
-
# b: [:a, :b, nil, Float::NAN, nil, 3, 5, nil],
|
636
|
-
# c: ['a', Float::NAN, 3, 4, 3, 5, nil, 7]
|
637
|
-
# })
|
638
|
-
#
|
639
|
-
# => #<DaruLite::DataFrame(8x3)>
|
640
|
-
# a b c
|
641
|
-
# 0 1 a a
|
642
|
-
# 1 2 b NaN
|
643
|
-
# 2 3 nil 3
|
644
|
-
# 3 nil NaN 4
|
645
|
-
# 4 NaN nil 3
|
646
|
-
# 5 nil 3 5
|
647
|
-
# 6 1 5 nil
|
648
|
-
# 7 7 nil 7
|
649
|
-
#
|
650
|
-
# 2.3.3 :068 > df.rolling_fillna(:forward)
|
651
|
-
# => #<DaruLite::DataFrame(8x3)>
|
652
|
-
# a b c
|
653
|
-
# 0 1 a a
|
654
|
-
# 1 2 b a
|
655
|
-
# 2 3 b 3
|
656
|
-
# 3 3 b 4
|
657
|
-
# 4 3 b 3
|
658
|
-
# 5 3 3 5
|
659
|
-
# 6 1 5 5
|
660
|
-
# 7 7 5 7
|
661
|
-
#
|
662
|
-
def rolling_fillna!(direction = :forward)
|
663
|
-
@data.each { |vec| vec.rolling_fillna!(direction) }
|
664
|
-
self
|
665
|
-
end
|
666
|
-
|
667
|
-
def rolling_fillna(direction = :forward)
|
668
|
-
dup.rolling_fillna!(direction)
|
669
|
-
end
|
670
|
-
|
671
|
-
# Return unique rows by vector specified or all vectors
|
672
|
-
#
|
673
|
-
# @param vtrs [String][Symbol] vector name(s) that should be considered
|
674
|
-
#
|
675
|
-
# @example
|
676
|
-
#
|
677
|
-
# => #<DaruLite::DataFrame(6x2)>
|
678
|
-
# a b
|
679
|
-
# 0 1 a
|
680
|
-
# 1 2 b
|
681
|
-
# 2 3 c
|
682
|
-
# 3 4 d
|
683
|
-
# 2 3 c
|
684
|
-
# 3 4 f
|
685
|
-
#
|
686
|
-
# 2.3.3 :> df.uniq
|
687
|
-
# => #<DaruLite::DataFrame(5x2)>
|
688
|
-
# a b
|
689
|
-
# 0 1 a
|
690
|
-
# 1 2 b
|
691
|
-
# 2 3 c
|
692
|
-
# 3 4 d
|
693
|
-
# 3 4 f
|
694
|
-
#
|
695
|
-
# 2.3.3 :> df.uniq(:a)
|
696
|
-
# => #<DaruLite::DataFrame(4x2)>
|
697
|
-
# a b
|
698
|
-
# 0 1 a
|
699
|
-
# 1 2 b
|
700
|
-
# 2 3 c
|
701
|
-
# 3 4 d
|
702
|
-
#
|
703
|
-
def uniq(*vtrs)
|
704
|
-
vecs = vtrs.empty? ? vectors.to_a : Array(vtrs)
|
705
|
-
grouped = group_by(vecs)
|
706
|
-
indexes = grouped.groups.values.map { |v| v[0] }.sort
|
707
|
-
row[*indexes]
|
708
|
-
end
|
709
|
-
|
710
|
-
# Iterate over each index of the DataFrame.
|
711
|
-
def each_index(&block)
|
712
|
-
return to_enum(:each_index) unless block
|
713
|
-
|
714
|
-
@index.each(&block)
|
715
|
-
|
716
|
-
self
|
717
|
-
end
|
718
|
-
|
719
|
-
# Iterate over each vector
|
720
|
-
def each_vector(&block)
|
721
|
-
return to_enum(:each_vector) unless block
|
255
|
+
# Delete a vector
|
256
|
+
def delete_vector(vector)
|
257
|
+
raise IndexError, "Vector #{vector} does not exist." unless @vectors.include?(vector)
|
722
258
|
|
723
|
-
@data.
|
259
|
+
@data.delete_at @vectors[vector]
|
260
|
+
@vectors = DaruLite::Index.new @vectors.to_a - [vector]
|
724
261
|
|
725
262
|
self
|
726
263
|
end
|
727
264
|
|
728
|
-
|
729
|
-
|
730
|
-
|
731
|
-
def each_vector_with_index
|
732
|
-
return to_enum(:each_vector_with_index) unless block_given?
|
733
|
-
|
734
|
-
@vectors.each do |vector|
|
735
|
-
yield @data[@vectors[vector]], vector
|
736
|
-
end
|
265
|
+
# Deletes a list of vectors
|
266
|
+
def delete_vectors(*vectors)
|
267
|
+
Array(vectors).each { |vec| delete_vector vec }
|
737
268
|
|
738
269
|
self
|
739
270
|
end
|
740
271
|
|
741
|
-
|
742
|
-
|
743
|
-
|
744
|
-
def each_row
|
745
|
-
return to_enum(:each_row) unless block_given?
|
746
|
-
|
747
|
-
@index.size.times do |pos|
|
748
|
-
yield row_at(pos)
|
749
|
-
end
|
750
|
-
|
751
|
-
self
|
752
|
-
end
|
272
|
+
# Delete a row
|
273
|
+
def delete_row(index)
|
274
|
+
idx = named_index_for index
|
753
275
|
|
754
|
-
|
755
|
-
return to_enum(:each_row_with_index) unless block_given?
|
276
|
+
raise IndexError, "Index #{index} does not exist." unless @index.include? idx
|
756
277
|
|
757
|
-
@index.
|
758
|
-
|
278
|
+
@index = DaruLite::Index.new(@index.to_a - [idx])
|
279
|
+
each_vector do |vector|
|
280
|
+
vector.delete_at idx
|
759
281
|
end
|
760
282
|
|
761
|
-
|
762
|
-
end
|
763
|
-
|
764
|
-
# Iterate over each row or vector of the DataFrame. Specify axis
|
765
|
-
# by passing :vector or :row as the argument. Default to :vector.
|
766
|
-
#
|
767
|
-
# == Description
|
768
|
-
#
|
769
|
-
# `#each` works exactly like Array#each. The default mode for `each`
|
770
|
-
# is to iterate over the columns of the DataFrame. To iterate over
|
771
|
-
# rows you must pass the axis, i.e `:row` as an argument.
|
772
|
-
#
|
773
|
-
# == Arguments
|
774
|
-
#
|
775
|
-
# * +axis+ - The axis to iterate over. Can be :vector (or :column)
|
776
|
-
# or :row. Default to :vector.
|
777
|
-
def each(axis = :vector, &block)
|
778
|
-
dispatch_to_axis axis, :each, &block
|
779
|
-
end
|
780
|
-
|
781
|
-
# Iterate over a row or vector and return results in a DaruLite::Vector.
|
782
|
-
# Specify axis with :vector or :row. Default to :vector.
|
783
|
-
#
|
784
|
-
# == Description
|
785
|
-
#
|
786
|
-
# The #collect iterator works similar to #map, the only difference
|
787
|
-
# being that it returns a DaruLite::Vector comprising of the results of
|
788
|
-
# each block run. The resultant Vector has the same index as that
|
789
|
-
# of the axis over which collect has iterated. It also accepts the
|
790
|
-
# optional axis argument.
|
791
|
-
#
|
792
|
-
# == Arguments
|
793
|
-
#
|
794
|
-
# * +axis+ - The axis to iterate over. Can be :vector (or :column)
|
795
|
-
# or :row. Default to :vector.
|
796
|
-
def collect(axis = :vector, &block)
|
797
|
-
dispatch_to_axis_pl axis, :collect, &block
|
283
|
+
set_size
|
798
284
|
end
|
799
285
|
|
800
|
-
#
|
801
|
-
#
|
802
|
-
#
|
803
|
-
|
804
|
-
|
805
|
-
# == Description
|
806
|
-
#
|
807
|
-
# The #map iterator works like Array#map. The value returned by
|
808
|
-
# each run of the block is added to an Array and the Array is
|
809
|
-
# returned. This method also accepts an axis argument, like #each.
|
810
|
-
# The default is :vector.
|
811
|
-
#
|
812
|
-
# == Arguments
|
813
|
-
#
|
814
|
-
# * +axis+ - The axis to map over. Can be :vector (or :column) or :row.
|
815
|
-
# Default to :vector.
|
816
|
-
def map(axis = :vector, &block)
|
817
|
-
dispatch_to_axis_pl axis, :map, &block
|
818
|
-
end
|
286
|
+
# Delete a row based on its position
|
287
|
+
# More robust than #delete_row when working with a CategoricalIndex or when the
|
288
|
+
# Index includes integers
|
289
|
+
def delete_at_position(position)
|
290
|
+
raise IndexError, "Position #{position} does not exist." unless position < size
|
819
291
|
|
820
|
-
|
821
|
-
|
822
|
-
# as the argument. Default to :vector.
|
823
|
-
#
|
824
|
-
# == Arguments
|
825
|
-
#
|
826
|
-
# * +axis+ - The axis to map over. Can be :vector (or :column) or :row.
|
827
|
-
# Default to :vector.
|
828
|
-
def map!(axis = :vector, &block)
|
829
|
-
if %i[vector column].include?(axis)
|
830
|
-
map_vectors!(&block)
|
831
|
-
elsif axis == :row
|
832
|
-
map_rows!(&block)
|
833
|
-
end
|
834
|
-
end
|
292
|
+
each_vector { |vector| vector.delete_at_position(position) }
|
293
|
+
@index = @index.delete_at(position)
|
835
294
|
|
836
|
-
|
837
|
-
# block must return a DaruLite::Vector object. You can specify the axis
|
838
|
-
# to map over. Default to :vector.
|
839
|
-
#
|
840
|
-
# == Description
|
841
|
-
#
|
842
|
-
# Recode works similarly to #map, but an important difference between
|
843
|
-
# the two is that recode returns a modified DaruLite::DataFrame instead
|
844
|
-
# of an Array. For this reason, #recode expects that every run of the
|
845
|
-
# block to return a DaruLite::Vector.
|
846
|
-
#
|
847
|
-
# Just like map and each, recode also accepts an optional _axis_ argument.
|
848
|
-
#
|
849
|
-
# == Arguments
|
850
|
-
#
|
851
|
-
# * +axis+ - The axis to map over. Can be :vector (or :column) or :row.
|
852
|
-
# Default to :vector.
|
853
|
-
def recode(axis = :vector, &block)
|
854
|
-
dispatch_to_axis_pl axis, :recode, &block
|
295
|
+
set_size
|
855
296
|
end
|
856
297
|
|
857
|
-
#
|
858
|
-
#
|
859
|
-
# == Description
|
860
|
-
#
|
861
|
-
# For filtering out certain rows/vectors based on their values,
|
862
|
-
# use the #filter method. By default it iterates over vectors and
|
863
|
-
# keeps those vectors for which the block returns true. It accepts
|
864
|
-
# an optional axis argument which lets you specify whether you want
|
865
|
-
# to iterate over vectors or rows.
|
866
|
-
#
|
867
|
-
# == Arguments
|
868
|
-
#
|
869
|
-
# * +axis+ - The axis to map over. Can be :vector (or :column) or :row.
|
870
|
-
# Default to :vector.
|
871
|
-
#
|
872
|
-
# == Usage
|
873
|
-
#
|
874
|
-
# # Filter vectors
|
875
|
-
#
|
876
|
-
# df.filter do |vector|
|
877
|
-
# vector.type == :numeric and vector.median < 50
|
878
|
-
# end
|
879
|
-
#
|
880
|
-
# # Filter rows
|
298
|
+
# Creates a DataFrame with the random data, of n size.
|
299
|
+
# If n not given, uses original number of rows.
|
881
300
|
#
|
882
|
-
#
|
883
|
-
|
884
|
-
|
885
|
-
|
886
|
-
|
887
|
-
|
888
|
-
|
889
|
-
def recode_vectors
|
890
|
-
block_given? or return to_enum(:recode_vectors)
|
891
|
-
|
892
|
-
dup.tap do |df|
|
893
|
-
df.each_vector_with_index do |v, i|
|
894
|
-
df[*i] = should_be_vector!(yield(v))
|
895
|
-
end
|
896
|
-
end
|
897
|
-
end
|
898
|
-
|
899
|
-
def recode_rows
|
900
|
-
block_given? or return to_enum(:recode_rows)
|
901
|
-
|
902
|
-
dup.tap do |df|
|
903
|
-
df.each_row_with_index do |r, i|
|
904
|
-
df.row[i] = should_be_vector!(yield(r))
|
301
|
+
# @return {DaruLite::DataFrame}
|
302
|
+
def bootstrap(n = nil)
|
303
|
+
n ||= nrows
|
304
|
+
DaruLite::DataFrame.new({}, order: @vectors).tap do |df_boot|
|
305
|
+
n.times do
|
306
|
+
df_boot.add_row(row[rand(n)])
|
905
307
|
end
|
308
|
+
df_boot.update
|
906
309
|
end
|
907
310
|
end
|
908
311
|
|
909
|
-
# Map each vector and return an Array.
|
910
|
-
def map_vectors(&block)
|
911
|
-
return to_enum(:map_vectors) unless block
|
912
|
-
|
913
|
-
@data.map(&block)
|
914
|
-
end
|
915
|
-
|
916
|
-
# Destructive form of #map_vectors
|
917
|
-
def map_vectors!
|
918
|
-
return to_enum(:map_vectors!) unless block_given?
|
919
|
-
|
920
|
-
vectors.dup.each do |n|
|
921
|
-
self[n] = should_be_vector!(yield(self[n]))
|
922
|
-
end
|
923
|
-
|
924
|
-
self
|
925
|
-
end
|
926
|
-
|
927
|
-
# Map vectors along with the index.
|
928
|
-
def map_vectors_with_index(&block)
|
929
|
-
return to_enum(:map_vectors_with_index) unless block
|
930
|
-
|
931
|
-
each_vector_with_index.map(&block)
|
932
|
-
end
|
933
|
-
|
934
|
-
# Map each row
|
935
|
-
def map_rows(&block)
|
936
|
-
return to_enum(:map_rows) unless block
|
937
|
-
|
938
|
-
each_row.map(&block)
|
939
|
-
end
|
940
|
-
|
941
|
-
def map_rows_with_index(&block)
|
942
|
-
return to_enum(:map_rows_with_index) unless block
|
943
|
-
|
944
|
-
each_row_with_index.map(&block)
|
945
|
-
end
|
946
|
-
|
947
|
-
def map_rows!
|
948
|
-
return to_enum(:map_rows!) unless block_given?
|
949
|
-
|
950
|
-
index.dup.each do |i|
|
951
|
-
row[i] = should_be_vector!(yield(row[i]))
|
952
|
-
end
|
953
|
-
|
954
|
-
self
|
955
|
-
end
|
956
|
-
|
957
|
-
def apply_method(method, keys: nil, by_position: true)
|
958
|
-
df = keys ? get_sub_dataframe(keys, by_position: by_position) : self
|
959
|
-
|
960
|
-
case method
|
961
|
-
when Symbol then df.send(method)
|
962
|
-
when Proc then method.call(df)
|
963
|
-
when Array then method.map(&:to_proc).map { |proc| proc.call(df) } # works with Array of both Symbol and/or Proc
|
964
|
-
else raise
|
965
|
-
end
|
966
|
-
end
|
967
|
-
alias apply_method_on_sub_df apply_method
|
968
|
-
|
969
|
-
# Retrieves a DaruLite::Vector, based on the result of calculation
|
970
|
-
# performed on each row.
|
971
|
-
def collect_rows(&block)
|
972
|
-
return to_enum(:collect_rows) unless block
|
973
|
-
|
974
|
-
DaruLite::Vector.new(each_row.map(&block), index: @index)
|
975
|
-
end
|
976
|
-
|
977
|
-
def collect_row_with_index(&block)
|
978
|
-
return to_enum(:collect_row_with_index) unless block
|
979
|
-
|
980
|
-
DaruLite::Vector.new(each_row_with_index.map(&block), index: @index)
|
981
|
-
end
|
982
|
-
|
983
|
-
# Retrives a DaruLite::Vector, based on the result of calculation
|
984
|
-
# performed on each vector.
|
985
|
-
def collect_vectors(&block)
|
986
|
-
return to_enum(:collect_vectors) unless block
|
987
|
-
|
988
|
-
DaruLite::Vector.new(each_vector.map(&block), index: @vectors)
|
989
|
-
end
|
990
|
-
|
991
|
-
def collect_vector_with_index(&block)
|
992
|
-
return to_enum(:collect_vector_with_index) unless block
|
993
|
-
|
994
|
-
DaruLite::Vector.new(each_vector_with_index.map(&block), index: @vectors)
|
995
|
-
end
|
996
|
-
|
997
|
-
# Generate a matrix, based on vector names of the DataFrame.
|
998
|
-
#
|
999
|
-
# @return {::Matrix}
|
1000
|
-
# :nocov:
|
1001
|
-
# FIXME: Even not trying to cover this: I can't get, how it is expected
|
1002
|
-
# to work.... -- zverok
|
1003
|
-
def collect_matrix
|
1004
|
-
return to_enum(:collect_matrix) unless block_given?
|
1005
|
-
|
1006
|
-
vecs = vectors.to_a
|
1007
|
-
rows = vecs.collect do |row|
|
1008
|
-
vecs.collect do |col|
|
1009
|
-
yield row, col
|
1010
|
-
end
|
1011
|
-
end
|
1012
|
-
|
1013
|
-
Matrix.rows(rows)
|
1014
|
-
end
|
1015
|
-
# :nocov:
|
1016
|
-
|
1017
|
-
# Delete a vector
|
1018
|
-
def delete_vector(vector)
|
1019
|
-
raise IndexError, "Vector #{vector} does not exist." unless @vectors.include?(vector)
|
1020
|
-
|
1021
|
-
@data.delete_at @vectors[vector]
|
1022
|
-
@vectors = DaruLite::Index.new @vectors.to_a - [vector]
|
1023
|
-
|
1024
|
-
self
|
1025
|
-
end
|
1026
|
-
|
1027
|
-
# Deletes a list of vectors
|
1028
|
-
def delete_vectors(*vectors)
|
1029
|
-
Array(vectors).each { |vec| delete_vector vec }
|
1030
|
-
|
1031
|
-
self
|
1032
|
-
end
|
1033
|
-
|
1034
|
-
# Delete a row
|
1035
|
-
def delete_row(index)
|
1036
|
-
idx = named_index_for index
|
1037
|
-
|
1038
|
-
raise IndexError, "Index #{index} does not exist." unless @index.include? idx
|
1039
|
-
|
1040
|
-
@index = DaruLite::Index.new(@index.to_a - [idx])
|
1041
|
-
each_vector do |vector|
|
1042
|
-
vector.delete_at idx
|
1043
|
-
end
|
1044
|
-
|
1045
|
-
set_size
|
1046
|
-
end
|
1047
|
-
|
1048
|
-
# Creates a DataFrame with the random data, of n size.
|
1049
|
-
# If n not given, uses original number of rows.
|
1050
|
-
#
|
1051
|
-
# @return {DaruLite::DataFrame}
|
1052
|
-
def bootstrap(n = nil)
|
1053
|
-
n ||= nrows
|
1054
|
-
DaruLite::DataFrame.new({}, order: @vectors).tap do |df_boot|
|
1055
|
-
n.times do
|
1056
|
-
df_boot.add_row(row[rand(n)])
|
1057
|
-
end
|
1058
|
-
df_boot.update
|
1059
|
-
end
|
1060
|
-
end
|
1061
|
-
|
1062
|
-
def keep_row_if
|
1063
|
-
@index
|
1064
|
-
.reject { |idx| yield access_row(idx) }
|
1065
|
-
.each { |idx| delete_row idx }
|
1066
|
-
end
|
1067
|
-
|
1068
|
-
def keep_vector_if
|
1069
|
-
@vectors.each do |vector|
|
1070
|
-
delete_vector(vector) unless yield(@data[@vectors[vector]], vector)
|
1071
|
-
end
|
1072
|
-
end
|
1073
|
-
|
1074
|
-
# creates a new vector with the data of a given field which the block returns true
|
1075
|
-
def filter_vector(vec, &block)
|
1076
|
-
DaruLite::Vector.new(each_row.select(&block).map { |row| row[vec] })
|
1077
|
-
end
|
1078
|
-
|
1079
|
-
# Iterates over each row and retains it in a new DataFrame if the block returns
|
1080
|
-
# true for that row.
|
1081
|
-
def filter_rows
|
1082
|
-
return to_enum(:filter_rows) unless block_given?
|
1083
|
-
|
1084
|
-
keep_rows = @index.map { |index| yield access_row(index) }
|
1085
|
-
|
1086
|
-
where keep_rows
|
1087
|
-
end
|
1088
|
-
|
1089
|
-
# Iterates over each vector and retains it in a new DataFrame if the block returns
|
1090
|
-
# true for that vector.
|
1091
|
-
def filter_vectors(&block)
|
1092
|
-
return to_enum(:filter_vectors) unless block
|
1093
|
-
|
1094
|
-
dup.tap { |df| df.keep_vector_if(&block) }
|
1095
|
-
end
|
1096
|
-
|
1097
|
-
# Test each row with one or more tests.
|
1098
|
-
# @param tests [Proc] Each test is a Proc with the form
|
1099
|
-
# *Proc.new {|row| row[:age] > 0}*
|
1100
|
-
# The function returns an array with all errors.
|
1101
|
-
#
|
1102
|
-
# FIXME: description here is too sparse. As far as I can get,
|
1103
|
-
# it should tell something about that each test is [descr, fields, block],
|
1104
|
-
# and that first value may be column name to output. - zverok, 2016-05-18
|
1105
|
-
def verify(*tests)
|
1106
|
-
id = tests.first.is_a?(Symbol) ? tests.shift : @vectors.first
|
1107
|
-
|
1108
|
-
each_row_with_index.map do |row, i|
|
1109
|
-
tests.reject { |*_, block| block.call(row) }
|
1110
|
-
.map { |test| verify_error_message row, test, id, i }
|
1111
|
-
end.flatten
|
1112
|
-
end
|
1113
|
-
|
1114
|
-
# DSL for yielding each row and returning a DaruLite::Vector based on the
|
1115
|
-
# value each run of the block returns.
|
1116
|
-
#
|
1117
|
-
# == Usage
|
1118
|
-
#
|
1119
|
-
# a1 = DaruLite::Vector.new([1, 2, 3, 4, 5, 6, 7])
|
1120
|
-
# a2 = DaruLite::Vector.new([10, 20, 30, 40, 50, 60, 70])
|
1121
|
-
# a3 = DaruLite::Vector.new([100, 200, 300, 400, 500, 600, 700])
|
1122
|
-
# ds = DaruLite::DataFrame.new({ :a => a1, :b => a2, :c => a3 })
|
1123
|
-
# total = ds.vector_by_calculation { a + b + c }
|
1124
|
-
# # <DaruLite::Vector:82314050 @name = nil @size = 7 >
|
1125
|
-
# # nil
|
1126
|
-
# # 0 111
|
1127
|
-
# # 1 222
|
1128
|
-
# # 2 333
|
1129
|
-
# # 3 444
|
1130
|
-
# # 4 555
|
1131
|
-
# # 5 666
|
1132
|
-
# # 6 777
|
1133
|
-
def vector_by_calculation(&block)
|
1134
|
-
a = each_row.map { |r| r.instance_eval(&block) }
|
1135
|
-
|
1136
|
-
DaruLite::Vector.new a, index: @index
|
1137
|
-
end
|
1138
|
-
|
1139
|
-
# Reorder the vectors in a dataframe
|
1140
|
-
# @param [Array] order_array new order of the vectors
|
1141
|
-
# @example
|
1142
|
-
# df = DaruLite::DataFrame({
|
1143
|
-
# a: [1, 2, 3],
|
1144
|
-
# b: [4, 5, 6]
|
1145
|
-
# }, order: [:a, :b])
|
1146
|
-
# df.order = [:b, :a]
|
1147
|
-
# df
|
1148
|
-
# # => #<DaruLite::DataFrame(3x2)>
|
1149
|
-
# # b a
|
1150
|
-
# # 0 4 1
|
1151
|
-
# # 1 5 2
|
1152
|
-
# # 2 6 3
|
1153
|
-
def order=(order_array)
|
1154
|
-
raise ArgumentError, 'Invalid order' unless vectors.to_a.tally == order_array.tally
|
1155
|
-
|
1156
|
-
initialize(to_h, order: order_array)
|
1157
|
-
end
|
1158
|
-
|
1159
|
-
# Return the dataframe with rotate vectors positions, the vector at position count is now
|
1160
|
-
# the first vector of the dataframe.
|
1161
|
-
# If only one vector in the dataframe, the dataframe is return without any change.
|
1162
|
-
# @param count => Integer, the vector at position count will be the first vector of the dataframe.
|
1163
|
-
# @example
|
1164
|
-
# df = DaruLite::DataFrame({
|
1165
|
-
# a: [1, 2, 3],
|
1166
|
-
# b: [4, 5, 6],
|
1167
|
-
# total: [5, 7, 9],
|
1168
|
-
# })
|
1169
|
-
# df.rotate_vectors(-1)
|
1170
|
-
# df
|
1171
|
-
# # => #<DaruLite::DataFrame(3x3)>
|
1172
|
-
# # total b a
|
1173
|
-
# # 0 5 4 1
|
1174
|
-
# # 1 7 5 2
|
1175
|
-
# # 2 9 6 3
|
1176
|
-
def rotate_vectors(count = -1)
|
1177
|
-
return self unless vectors.many?
|
1178
|
-
|
1179
|
-
self.order = vectors.to_a.rotate(count)
|
1180
|
-
self
|
1181
|
-
end
|
1182
|
-
|
1183
|
-
# Returns a vector, based on a string with a calculation based
|
1184
|
-
# on vector.
|
1185
|
-
#
|
1186
|
-
# The calculation will be eval'ed, so you can put any variable
|
1187
|
-
# or expression valid on ruby.
|
1188
|
-
#
|
1189
|
-
# For example:
|
1190
|
-
# a = DaruLite::Vector.new [1,2]
|
1191
|
-
# b = DaruLite::Vector.new [3,4]
|
1192
|
-
# ds = DaruLite::DataFrame.new({:a => a,:b => b})
|
1193
|
-
# ds.compute("a+b")
|
1194
|
-
# => Vector [4,6]
|
1195
|
-
def compute(text, &block)
|
1196
|
-
return instance_eval(&block) if block
|
1197
|
-
|
1198
|
-
instance_eval(text)
|
1199
|
-
end
|
1200
|
-
|
1201
|
-
# Return a vector with the number of missing values in each row.
|
1202
|
-
#
|
1203
|
-
# == Arguments
|
1204
|
-
#
|
1205
|
-
# * +missing_values+ - An Array of the values that should be
|
1206
|
-
# treated as 'missing'. The default missing value is *nil*.
|
1207
|
-
def missing_values_rows(missing_values = [nil])
|
1208
|
-
number_of_missing = each_row.map do |row|
|
1209
|
-
row.indexes(*missing_values).size
|
1210
|
-
end
|
1211
|
-
|
1212
|
-
DaruLite::Vector.new number_of_missing, index: @index, name: "#{@name}_missing_rows"
|
1213
|
-
end
|
1214
|
-
|
1215
|
-
# TODO: remove next version
|
1216
|
-
alias vector_missing_values missing_values_rows
|
1217
|
-
|
1218
|
-
def has_missing_data?
|
1219
|
-
@data.any? { |vec| vec.include_values?(*DaruLite::MISSING_VALUES) }
|
1220
|
-
end
|
1221
|
-
alias flawed? has_missing_data?
|
1222
|
-
deprecate :has_missing_data?, :include_values?, 2016, 10
|
1223
|
-
deprecate :flawed?, :include_values?, 2016, 10
|
1224
|
-
|
1225
|
-
# Check if any of given values occur in the data frame
|
1226
|
-
# @param [Array] values to check for
|
1227
|
-
# @return [true, false] true if any of the given values occur in the
|
1228
|
-
# dataframe, false otherwise
|
1229
|
-
# @example
|
1230
|
-
# df = DaruLite::DataFrame.new({
|
1231
|
-
# a: [1, 2, 3, nil, Float::NAN, nil, 1, 7],
|
1232
|
-
# b: [:a, :b, nil, Float::NAN, nil, 3, 5, 8],
|
1233
|
-
# c: ['a', Float::NAN, 3, 4, 3, 5, nil, 7]
|
1234
|
-
# }, index: 11..18)
|
1235
|
-
# df.include_values? nil
|
1236
|
-
# # => true
|
1237
|
-
def include_values?(*values)
|
1238
|
-
@data.any? { |vec| vec.include_values?(*values) }
|
1239
|
-
end
|
1240
|
-
|
1241
312
|
# Return a nested hash using vector names as keys and an array constructed of
|
1242
313
|
# hashes with other values. If block provided, is used to provide the
|
1243
314
|
# values, with parameters +row+ of dataset, +current+ last hash on
|
1244
315
|
# hierarchy and +name+ of the key to include
|
1245
|
-
def nest(*tree_keys, &block)
|
1246
|
-
tree_keys = tree_keys[0] if tree_keys[0].is_a? Array
|
1247
|
-
|
1248
|
-
each_row.with_object({}) do |row, current|
|
1249
|
-
# Create tree
|
1250
|
-
*keys, last = tree_keys
|
1251
|
-
current = keys.inject(current) { |c, f| c[row[f]] ||= {} }
|
1252
|
-
name = row[last]
|
1253
|
-
|
1254
|
-
if block
|
1255
|
-
current[name] = yield(row, current, name)
|
1256
|
-
else
|
1257
|
-
current[name] ||= []
|
1258
|
-
current[name].push(row.to_h.delete_if { |key, _value| tree_keys.include? key })
|
1259
|
-
end
|
1260
|
-
end
|
1261
|
-
end
|
1262
|
-
|
1263
|
-
def vector_count_characters(vecs = nil)
|
1264
|
-
vecs ||= @vectors.to_a
|
1265
|
-
|
1266
|
-
collect_rows do |row|
|
1267
|
-
vecs.sum { |v| row[v].to_s.size }
|
1268
|
-
end
|
1269
|
-
end
|
1270
|
-
|
1271
|
-
def add_vectors_by_split(name, join = '-', sep = DaruLite::SPLIT_TOKEN)
|
1272
|
-
self[name]
|
1273
|
-
.split_by_separator(sep)
|
1274
|
-
.each { |k, v| self[:"#{name}#{join}#{k}"] = v }
|
1275
|
-
end
|
1276
|
-
|
1277
|
-
# Return the number of rows and columns of the DataFrame in an Array.
|
1278
|
-
def shape
|
1279
|
-
[nrows, ncols]
|
1280
|
-
end
|
1281
|
-
|
1282
|
-
# The number of rows
|
1283
|
-
def nrows
|
1284
|
-
@index.size
|
1285
|
-
end
|
1286
|
-
|
1287
|
-
# The number of vectors
|
1288
|
-
def ncols
|
1289
|
-
@vectors.size
|
1290
|
-
end
|
1291
|
-
|
1292
|
-
# Check if a vector is present
|
1293
|
-
def has_vector?(vector)
|
1294
|
-
@vectors.include? vector
|
1295
|
-
end
|
1296
|
-
|
1297
|
-
# Works like Array#any?.
|
1298
|
-
#
|
1299
|
-
# @param [Symbol] axis (:vector) The axis to iterate over. Can be :vector or
|
1300
|
-
# :row. A DaruLite::Vector object is yielded in the block.
|
1301
|
-
# @example Using any?
|
1302
|
-
# df = DaruLite::DataFrame.new({a: [1,2,3,4,5], b: ['a', 'b', 'c', 'd', 'e']})
|
1303
|
-
# df.any?(:row) do |row|
|
1304
|
-
# row[:a] < 3 and row[:b] == 'b'
|
1305
|
-
# end #=> true
|
1306
|
-
def any?(axis = :vector, &block)
|
1307
|
-
if %i[vector column].include?(axis)
|
1308
|
-
@data.any?(&block)
|
1309
|
-
elsif axis == :row
|
1310
|
-
each_row do |row|
|
1311
|
-
return true if yield(row)
|
1312
|
-
end
|
1313
|
-
false
|
1314
|
-
else
|
1315
|
-
raise ArgumentError, "Unidentified axis #{axis}"
|
1316
|
-
end
|
1317
|
-
end
|
1318
|
-
|
1319
|
-
# Works like Array#all?
|
1320
|
-
#
|
1321
|
-
# @param [Symbol] axis (:vector) The axis to iterate over. Can be :vector or
|
1322
|
-
# :row. A DaruLite::Vector object is yielded in the block.
|
1323
|
-
# @example Using all?
|
1324
|
-
# df = DaruLite::DataFrame.new({a: [1,2,3,4,5], b: ['a', 'b', 'c', 'd', 'e']})
|
1325
|
-
# df.all?(:row) do |row|
|
1326
|
-
# row[:a] < 10
|
1327
|
-
# end #=> true
|
1328
|
-
def all?(axis = :vector, &block)
|
1329
|
-
if %i[vector column].include?(axis)
|
1330
|
-
@data.all?(&block)
|
1331
|
-
elsif axis == :row
|
1332
|
-
each_row.all?(&block)
|
1333
|
-
else
|
1334
|
-
raise ArgumentError, "Unidentified axis #{axis}"
|
1335
|
-
end
|
1336
|
-
end
|
1337
|
-
|
1338
|
-
# The first ten elements of the DataFrame
|
1339
|
-
#
|
1340
|
-
# @param [Fixnum] quantity (10) The number of elements to display from the top.
|
1341
|
-
def head(quantity = 10)
|
1342
|
-
row.at 0..(quantity - 1)
|
1343
|
-
end
|
1344
|
-
|
1345
|
-
alias first head
|
1346
|
-
|
1347
|
-
# The last ten elements of the DataFrame
|
1348
|
-
#
|
1349
|
-
# @param [Fixnum] quantity (10) The number of elements to display from the bottom.
|
1350
|
-
def tail(quantity = 10)
|
1351
|
-
start = [-quantity, -size].max
|
1352
|
-
row.at start..-1
|
1353
|
-
end
|
1354
|
-
|
1355
|
-
alias last tail
|
1356
|
-
|
1357
|
-
# Sum all numeric/specified vectors in the DataFrame.
|
1358
|
-
#
|
1359
|
-
# Returns a new vector that's a containing a sum of all numeric
|
1360
|
-
# or specified vectors of the DataFrame. By default, if the vector
|
1361
|
-
# contains a nil, the sum is nil.
|
1362
|
-
# With :skipnil argument set to true, nil values are assumed to be
|
1363
|
-
# 0 (zero) and the sum vector is returned.
|
1364
|
-
#
|
1365
|
-
# @param args [Array] List of vectors to sum. Default is nil in which case
|
1366
|
-
# all numeric vectors are summed.
|
1367
|
-
#
|
1368
|
-
# @option opts [Boolean] :skipnil Consider nils as 0. Default is false.
|
1369
|
-
#
|
1370
|
-
# @return Vector with sum of all vectors specified in the argument.
|
1371
|
-
# If vecs parameter is empty, sum all numeric vector.
|
1372
|
-
#
|
1373
|
-
# @example
|
1374
|
-
# df = DaruLite::DataFrame.new({
|
1375
|
-
# a: [1, 2, nil],
|
1376
|
-
# b: [2, 1, 3],
|
1377
|
-
# c: [1, 1, 1]
|
1378
|
-
# })
|
1379
|
-
# => #<DaruLite::DataFrame(3x3)>
|
1380
|
-
# a b c
|
1381
|
-
# 0 1 2 1
|
1382
|
-
# 1 2 1 1
|
1383
|
-
# 2 nil 3 1
|
1384
|
-
# df.vector_sum [:a, :c]
|
1385
|
-
# => #<DaruLite::Vector(3)>
|
1386
|
-
# 0 2
|
1387
|
-
# 1 3
|
1388
|
-
# 2 nil
|
1389
|
-
# df.vector_sum
|
1390
|
-
# => #<DaruLite::Vector(3)>
|
1391
|
-
# 0 4
|
1392
|
-
# 1 4
|
1393
|
-
# 2 nil
|
1394
|
-
# df.vector_sum skipnil: true
|
1395
|
-
# => #<DaruLite::Vector(3)>
|
1396
|
-
# c
|
1397
|
-
# 0 4
|
1398
|
-
# 1 4
|
1399
|
-
# 2 4
|
1400
|
-
#
|
1401
|
-
def vector_sum(*args)
|
1402
|
-
defaults = { vecs: nil, skipnil: false }
|
1403
|
-
options = args.last.is_a?(::Hash) ? args.pop : {}
|
1404
|
-
options = defaults.merge(options)
|
1405
|
-
vecs = args[0] || options[:vecs]
|
1406
|
-
skipnil = args[1] || options[:skipnil]
|
1407
|
-
|
1408
|
-
vecs ||= numeric_vectors
|
1409
|
-
sum = DaruLite::Vector.new [0] * @size, index: @index, name: @name, dtype: @dtype
|
1410
|
-
vecs.inject(sum) { |memo, n| self[n].add(memo, skipnil: skipnil) }
|
1411
|
-
end
|
1412
|
-
|
1413
|
-
# Calculate mean of the rows of the dataframe.
|
1414
|
-
#
|
1415
|
-
# == Arguments
|
1416
|
-
#
|
1417
|
-
# * +max_missing+ - The maximum number of elements in the row that can be
|
1418
|
-
# zero for the mean calculation to happen. Default to 0.
|
1419
|
-
def vector_mean(max_missing = 0)
|
1420
|
-
# FIXME: in vector_sum we preserve created vector dtype, but
|
1421
|
-
# here we are not. Is this by design or ...? - zverok, 2016-05-18
|
1422
|
-
mean_vec = DaruLite::Vector.new [0] * @size, index: @index, name: "mean_#{@name}"
|
1423
|
-
|
1424
|
-
each_row_with_index.with_object(mean_vec) do |(row, i), memo|
|
1425
|
-
memo[i] = row.indexes(*DaruLite::MISSING_VALUES).size > max_missing ? nil : row.mean
|
1426
|
-
end
|
1427
|
-
end
|
1428
|
-
|
1429
|
-
# Group elements by vector to perform operations on them. Returns a
|
1430
|
-
# DaruLite::Core::GroupBy object.See the DaruLite::Core::GroupBy docs for a detailed
|
1431
|
-
# list of possible operations.
|
1432
|
-
#
|
1433
|
-
# == Arguments
|
1434
|
-
#
|
1435
|
-
# * vectors - An Array contatining names of vectors to group by.
|
1436
|
-
#
|
1437
|
-
# == Usage
|
1438
|
-
#
|
1439
|
-
# df = DaruLite::DataFrame.new({
|
1440
|
-
# a: %w{foo bar foo bar foo bar foo foo},
|
1441
|
-
# b: %w{one one two three two two one three},
|
1442
|
-
# c: [1 ,2 ,3 ,1 ,3 ,6 ,3 ,8],
|
1443
|
-
# d: [11 ,22 ,33 ,44 ,55 ,66 ,77 ,88]
|
1444
|
-
# })
|
1445
|
-
# df.group_by([:a,:b,:c]).groups
|
1446
|
-
# #=> {["bar", "one", 2]=>[1],
|
1447
|
-
# # ["bar", "three", 1]=>[3],
|
1448
|
-
# # ["bar", "two", 6]=>[5],
|
1449
|
-
# # ["foo", "one", 1]=>[0],
|
1450
|
-
# # ["foo", "one", 3]=>[6],
|
1451
|
-
# # ["foo", "three", 8]=>[7],
|
1452
|
-
# # ["foo", "two", 3]=>[2, 4]}
|
1453
|
-
def group_by(*vectors)
|
1454
|
-
vectors.flatten!
|
1455
|
-
missing = vectors - @vectors.to_a
|
1456
|
-
raise(ArgumentError, "Vector(s) missing: #{missing.join(', ')}") unless missing.empty?
|
1457
|
-
|
1458
|
-
vectors = [@vectors.first] if vectors.empty?
|
1459
|
-
|
1460
|
-
DaruLite::Core::GroupBy.new(self, vectors)
|
1461
|
-
end
|
1462
|
-
|
1463
|
-
def reindex_vectors(new_vectors)
|
1464
|
-
unless new_vectors.is_a?(DaruLite::Index)
|
1465
|
-
raise ArgumentError, 'Must pass the new index of type Index or its ' \
|
1466
|
-
"subclasses, not #{new_vectors.class}"
|
1467
|
-
end
|
1468
|
-
|
1469
|
-
cl = DaruLite::DataFrame.new({}, order: new_vectors, index: @index, name: @name)
|
1470
|
-
new_vectors.each_with_object(cl) do |vec, memo|
|
1471
|
-
memo[vec] = @vectors.include?(vec) ? self[vec] : Array.new(nrows)
|
1472
|
-
end
|
1473
|
-
end
|
1474
|
-
|
1475
|
-
def get_vector_anyways(v)
|
1476
|
-
@vectors.include?(v) ? self[v].to_a : Array.new(size)
|
1477
|
-
end
|
1478
|
-
|
1479
|
-
# Concatenate another DataFrame along corresponding columns.
|
1480
|
-
# If columns do not exist in both dataframes, they are filled with nils
|
1481
|
-
def concat(other_df)
|
1482
|
-
vectors = (@vectors.to_a + other_df.vectors.to_a).uniq
|
1483
|
-
|
1484
|
-
data = vectors.map do |v|
|
1485
|
-
get_vector_anyways(v).dup.concat(other_df.get_vector_anyways(v))
|
1486
|
-
end
|
1487
|
-
|
1488
|
-
DaruLite::DataFrame.new(data, order: vectors)
|
1489
|
-
end
|
1490
|
-
|
1491
|
-
# Concatenates another DataFrame as #concat.
|
1492
|
-
# Additionally it tries to preserve the index. If the indices contain
|
1493
|
-
# common elements, #union will overwrite the according rows in the
|
1494
|
-
# first dataframe.
|
1495
|
-
def union(other_df)
|
1496
|
-
index = (@index.to_a + other_df.index.to_a).uniq
|
1497
|
-
df = row[*(@index.to_a - other_df.index.to_a)]
|
1498
|
-
|
1499
|
-
df = df.concat(other_df)
|
1500
|
-
df.index = DaruLite::Index.new(index)
|
1501
|
-
df
|
1502
|
-
end
|
1503
|
-
|
1504
|
-
module SetSingleIndexStrategy
|
1505
|
-
def self.uniq_size(df, col)
|
1506
|
-
df[col].uniq.size
|
1507
|
-
end
|
1508
|
-
|
1509
|
-
def self.new_index(df, col)
|
1510
|
-
DaruLite::Index.new(df[col].to_a)
|
1511
|
-
end
|
1512
|
-
|
1513
|
-
def self.delete_vector(df, col)
|
1514
|
-
df.delete_vector(col)
|
1515
|
-
end
|
1516
|
-
end
|
1517
|
-
|
1518
|
-
module SetCategoricalIndexStrategy
|
1519
|
-
def self.new_index(df, col)
|
1520
|
-
DaruLite::CategoricalIndex.new(df[col].to_a)
|
1521
|
-
end
|
1522
|
-
|
1523
|
-
def self.delete_vector(df, col)
|
1524
|
-
df.delete_vector(col)
|
1525
|
-
end
|
1526
|
-
end
|
1527
|
-
|
1528
|
-
module SetMultiIndexStrategy
|
1529
|
-
def self.uniq_size(df, cols)
|
1530
|
-
df[*cols].uniq.size
|
1531
|
-
end
|
1532
|
-
|
1533
|
-
def self.new_index(df, cols)
|
1534
|
-
DaruLite::MultiIndex.from_arrays(df[*cols].map_vectors(&:to_a)).tap do |mi|
|
1535
|
-
mi.name = cols
|
1536
|
-
end
|
1537
|
-
end
|
1538
|
-
|
1539
|
-
def self.delete_vector(df, cols)
|
1540
|
-
df.delete_vectors(*cols)
|
1541
|
-
end
|
1542
|
-
end
|
1543
|
-
|
1544
|
-
# Set a particular column as the new DF
|
1545
|
-
def set_index(new_index_col, keep: false, categorical: false)
|
1546
|
-
if categorical
|
1547
|
-
strategy = SetCategoricalIndexStrategy
|
1548
|
-
elsif new_index_col.respond_to?(:to_a)
|
1549
|
-
strategy = SetMultiIndexStrategy
|
1550
|
-
new_index_col = new_index_col.to_a
|
1551
|
-
else
|
1552
|
-
strategy = SetSingleIndexStrategy
|
1553
|
-
end
|
1554
|
-
|
1555
|
-
unless categorical
|
1556
|
-
uniq_size = strategy.uniq_size(self, new_index_col)
|
1557
|
-
raise ArgumentError, 'All elements in new index must be unique.' if @size != uniq_size
|
1558
|
-
end
|
1559
|
-
|
1560
|
-
self.index = strategy.new_index(self, new_index_col)
|
1561
|
-
strategy.delete_vector(self, new_index_col) unless keep
|
1562
|
-
self
|
1563
|
-
end
|
1564
|
-
|
1565
|
-
# Change the index of the DataFrame and preserve the labels of the previous
|
1566
|
-
# indexing. New index can be DaruLite::Index or any of its subclasses.
|
1567
|
-
#
|
1568
|
-
# @param [DaruLite::Index] new_index The new Index for reindexing the DataFrame.
|
1569
|
-
# @example Reindexing DataFrame
|
1570
|
-
# df = DaruLite::DataFrame.new({a: [1,2,3,4], b: [11,22,33,44]},
|
1571
|
-
# index: ['a','b','c','d'])
|
1572
|
-
# #=>
|
1573
|
-
# ##<DaruLite::DataFrame:83278130 @name = b19277b8-c548-41da-ad9a-2ad8c060e273 @size = 4>
|
1574
|
-
# # a b
|
1575
|
-
# # a 1 11
|
1576
|
-
# # b 2 22
|
1577
|
-
# # c 3 33
|
1578
|
-
# # d 4 44
|
1579
|
-
# df.reindex DaruLite::Index.new(['b', 0, 'a', 'g'])
|
1580
|
-
# #=>
|
1581
|
-
# ##<DaruLite::DataFrame:83177070 @name = b19277b8-c548-41da-ad9a-2ad8c060e273 @size = 4>
|
1582
|
-
# # a b
|
1583
|
-
# # b 2 22
|
1584
|
-
# # 0 nil nil
|
1585
|
-
# # a 1 11
|
1586
|
-
# # g nil nil
|
1587
|
-
def reindex(new_index)
|
1588
|
-
unless new_index.is_a?(DaruLite::Index)
|
1589
|
-
raise ArgumentError, 'Must pass the new index of type Index or its ' \
|
1590
|
-
"subclasses, not #{new_index.class}"
|
1591
|
-
end
|
1592
|
-
|
1593
|
-
cl = DaruLite::DataFrame.new({}, order: @vectors, index: new_index, name: @name)
|
1594
|
-
new_index.each_with_object(cl) do |idx, memo|
|
1595
|
-
memo.row[idx] = @index.include?(idx) ? row[idx] : Array.new(ncols)
|
1596
|
-
end
|
1597
|
-
end
|
1598
|
-
|
1599
|
-
def reset_index
|
1600
|
-
index_df = index.to_df
|
1601
|
-
names = index.name
|
1602
|
-
names = [names] unless names.instance_of?(Array)
|
1603
|
-
new_vectors = names + vectors.to_a
|
1604
|
-
self.index = index_df.index
|
1605
|
-
names.each do |name|
|
1606
|
-
self[name] = index_df[name]
|
1607
|
-
end
|
1608
|
-
self.order = new_vectors
|
1609
|
-
self
|
1610
|
-
end
|
1611
|
-
|
1612
|
-
# Reassign index with a new index of type DaruLite::Index or any of its subclasses.
|
1613
|
-
#
|
1614
|
-
# @param [DaruLite::Index] idx New index object on which the rows of the dataframe
|
1615
|
-
# are to be indexed.
|
1616
|
-
# @example Reassigining index of a DataFrame
|
1617
|
-
# df = DaruLite::DataFrame.new({a: [1,2,3,4], b: [11,22,33,44]})
|
1618
|
-
# df.index.to_a #=> [0,1,2,3]
|
1619
|
-
#
|
1620
|
-
# df.index = DaruLite::Index.new(['a','b','c','d'])
|
1621
|
-
# df.index.to_a #=> ['a','b','c','d']
|
1622
|
-
# df.row['a'].to_a #=> [1,11]
|
1623
|
-
def index=(idx)
|
1624
|
-
@index = Index.coerce idx
|
1625
|
-
@data.each { |vec| vec.index = @index }
|
1626
|
-
|
1627
|
-
self
|
1628
|
-
end
|
1629
|
-
|
1630
|
-
# Reassign vectors with a new index of type DaruLite::Index or any of its subclasses.
|
1631
|
-
#
|
1632
|
-
# @param new_index [DaruLite::Index] idx The new index object on which the vectors are to
|
1633
|
-
# be indexed. Must of the same size as ncols.
|
1634
|
-
# @example Reassigning vectors of a DataFrame
|
1635
|
-
# df = DaruLite::DataFrame.new({a: [1,2,3,4], b: [:a,:b,:c,:d], c: [11,22,33,44]})
|
1636
|
-
# df.vectors.to_a #=> [:a, :b, :c]
|
1637
|
-
#
|
1638
|
-
# df.vectors = DaruLite::Index.new([:foo, :bar, :baz])
|
1639
|
-
# df.vectors.to_a #=> [:foo, :bar, :baz]
|
1640
|
-
def vectors=(new_index)
|
1641
|
-
raise ArgumentError, 'Can only reindex with Index and its subclasses' unless new_index.is_a?(DaruLite::Index)
|
1642
|
-
|
1643
|
-
if new_index.size != ncols
|
1644
|
-
raise ArgumentError, "Specified index length #{new_index.size} not equal to" \
|
1645
|
-
"dataframe size #{ncols}"
|
1646
|
-
end
|
1647
|
-
|
1648
|
-
@vectors = new_index
|
1649
|
-
@data.zip(new_index.to_a).each do |vect, name|
|
1650
|
-
vect.name = name
|
1651
|
-
end
|
1652
|
-
self
|
1653
|
-
end
|
1654
|
-
|
1655
|
-
# Renames the vectors
|
1656
|
-
#
|
1657
|
-
# == Arguments
|
1658
|
-
#
|
1659
|
-
# * name_map - A hash where the keys are the exising vector names and
|
1660
|
-
# the values are the new names. If a vector is renamed
|
1661
|
-
# to a vector name that is already in use, the existing
|
1662
|
-
# one is overwritten.
|
1663
|
-
#
|
1664
|
-
# == Usage
|
1665
|
-
#
|
1666
|
-
# df = DaruLite::DataFrame.new({ a: [1,2,3,4], b: [:a,:b,:c,:d], c: [11,22,33,44] })
|
1667
|
-
# df.rename_vectors :a => :alpha, :c => :gamma
|
1668
|
-
# df.vectors.to_a #=> [:alpha, :b, :gamma]
|
1669
|
-
def rename_vectors(name_map)
|
1670
|
-
existing_targets = name_map.reject { |k, v| k == v }.values & vectors.to_a
|
1671
|
-
delete_vectors(*existing_targets)
|
1672
|
-
|
1673
|
-
new_names = vectors.to_a.map { |v| name_map[v] || v }
|
1674
|
-
self.vectors = DaruLite::Index.new new_names
|
1675
|
-
end
|
1676
|
-
|
1677
|
-
# Renames the vectors and returns itself
|
1678
|
-
#
|
1679
|
-
# == Arguments
|
1680
|
-
#
|
1681
|
-
# * name_map - A hash where the keys are the exising vector names and
|
1682
|
-
# the values are the new names. If a vector is renamed
|
1683
|
-
# to a vector name that is already in use, the existing
|
1684
|
-
# one is overwritten.
|
1685
|
-
#
|
1686
|
-
# == Usage
|
1687
|
-
#
|
1688
|
-
# df = DaruLite::DataFrame.new({ a: [1,2,3,4], b: [:a,:b,:c,:d], c: [11,22,33,44] })
|
1689
|
-
# df.rename_vectors! :a => :alpha, :c => :gamma # df
|
1690
|
-
def rename_vectors!(name_map)
|
1691
|
-
rename_vectors(name_map)
|
1692
|
-
self
|
1693
|
-
end
|
1694
|
-
|
1695
|
-
# Converts the vectors to a DaruLite::MultiIndex.
|
1696
|
-
# The argument passed is used as the MultiIndex's top level
|
1697
|
-
def add_level_to_vectors(top_level_label)
|
1698
|
-
tuples = vectors.map { |label| [top_level_label, *label] }
|
1699
|
-
self.vectors = DaruLite::MultiIndex.from_tuples(tuples)
|
1700
|
-
end
|
1701
|
-
|
1702
|
-
# Return the indexes of all the numeric vectors. Will include vectors with nils
|
1703
|
-
# alongwith numbers.
|
1704
|
-
def numeric_vectors
|
1705
|
-
# FIXME: Why _with_index ?..
|
1706
|
-
each_vector_with_index
|
1707
|
-
.select { |vec, _i| vec.numeric? }
|
1708
|
-
.map(&:last)
|
1709
|
-
end
|
1710
|
-
|
1711
|
-
def numeric_vector_names
|
1712
|
-
@vectors.select { |v| self[v].numeric? }
|
1713
|
-
end
|
1714
|
-
|
1715
|
-
# Return a DataFrame of only the numerical Vectors. If clone: false
|
1716
|
-
# is specified as option, only a *view* of the Vectors will be
|
1717
|
-
# returned. Defaults to clone: true.
|
1718
|
-
def only_numerics(opts = {})
|
1719
|
-
cln = opts[:clone] != false
|
1720
|
-
arry = numeric_vectors.map { |v| self[v] }
|
1721
|
-
|
1722
|
-
order = Index.new(numeric_vectors)
|
1723
|
-
DaruLite::DataFrame.new(arry, clone: cln, order: order, index: @index)
|
1724
|
-
end
|
1725
|
-
|
1726
|
-
# Generate a summary of this DataFrame based on individual vectors in the DataFrame
|
1727
|
-
# @return [String] String containing the summary of the DataFrame
|
1728
|
-
def summary
|
1729
|
-
summary = "= #{name}"
|
1730
|
-
summary << "\n Number of rows: #{nrows}"
|
1731
|
-
@vectors.each do |v|
|
1732
|
-
summary << "\n Element:[#{v}]\n"
|
1733
|
-
summary << self[v].summary(1)
|
1734
|
-
end
|
1735
|
-
summary
|
1736
|
-
end
|
1737
|
-
|
1738
|
-
# Sorts a dataframe (ascending/descending) in the given pripority sequence of
|
1739
|
-
# vectors, with or without a block.
|
1740
|
-
#
|
1741
|
-
# @param vector_order [Array] The order of vector names in which the DataFrame
|
1742
|
-
# should be sorted.
|
1743
|
-
# @param opts [Hash] opts The options to sort with.
|
1744
|
-
# @option opts [TrueClass,FalseClass,Array] :ascending (true) Sort in ascending
|
1745
|
-
# or descending order. Specify Array corresponding to *order* for multiple
|
1746
|
-
# sort orders.
|
1747
|
-
# @option opts [Hash] :by (lambda{|a| a }) Specify attributes of objects to
|
1748
|
-
# to be used for sorting, for each vector name in *order* as a hash of
|
1749
|
-
# vector name and lambda expressions. In case a lambda for a vector is not
|
1750
|
-
# specified, the default will be used.
|
1751
|
-
# @option opts [TrueClass,FalseClass,Array] :handle_nils (false) Handle nils
|
1752
|
-
# automatically or not when a block is provided.
|
1753
|
-
# If set to True, nils will appear at top after sorting.
|
1754
|
-
#
|
1755
|
-
# @example Sort a dataframe with a vector sequence.
|
1756
|
-
#
|
1757
|
-
#
|
1758
|
-
# df = DaruLite::DataFrame.new({a: [1,2,1,2,3], b: [5,4,3,2,1]})
|
1759
|
-
#
|
1760
|
-
# df.sort [:a, :b]
|
1761
|
-
# # =>
|
1762
|
-
# # <DaruLite::DataFrame:30604000 @name = d6a9294e-2c09-418f-b646-aa9244653444 @size = 5>
|
1763
|
-
# # a b
|
1764
|
-
# # 2 1 3
|
1765
|
-
# # 0 1 5
|
1766
|
-
# # 3 2 2
|
1767
|
-
# # 1 2 4
|
1768
|
-
# # 4 3 1
|
1769
|
-
#
|
1770
|
-
# @example Sort a dataframe without a block. Here nils will be handled automatically.
|
1771
|
-
#
|
1772
|
-
# df = DaruLite::DataFrame.new({a: [-3,nil,-1,nil,5], b: [4,3,2,1,4]})
|
1773
|
-
#
|
1774
|
-
# df.sort([:a])
|
1775
|
-
# # =>
|
1776
|
-
# # <DaruLite::DataFrame:14810920 @name = c07fb5c7-2201-458d-b679-6a1f7ebfe49f @size = 5>
|
1777
|
-
# # a b
|
1778
|
-
# # 1 nil 3
|
1779
|
-
# # 3 nil 1
|
1780
|
-
# # 0 -3 4
|
1781
|
-
# # 2 -1 2
|
1782
|
-
# # 4 5 4
|
1783
|
-
#
|
1784
|
-
# @example Sort a dataframe with a block with nils handled automatically.
|
1785
|
-
#
|
1786
|
-
# df = DaruLite::DataFrame.new({a: [nil,-1,1,nil,-1,1], b: ['aaa','aa',nil,'baaa','x',nil] })
|
1787
|
-
#
|
1788
|
-
# df.sort [:b], by: {b: lambda { |a| a.length } }
|
1789
|
-
# # NoMethodError: undefined method `length' for nil:NilClass
|
1790
|
-
# # from (pry):8:in `block in __pry__'
|
1791
|
-
#
|
1792
|
-
# df.sort [:b], by: {b: lambda { |a| a.length } }, handle_nils: true
|
1793
|
-
#
|
1794
|
-
# # =>
|
1795
|
-
# # <DaruLite::DataFrame:28469540 @name = 5f986508-556f-468b-be0c-88cc3534445c @size = 6>
|
1796
|
-
# # a b
|
1797
|
-
# # 2 1 nil
|
1798
|
-
# # 5 1 nil
|
1799
|
-
# # 4 -1 x
|
1800
|
-
# # 1 -1 aa
|
1801
|
-
# # 0 nil aaa
|
1802
|
-
# # 3 nil baaa
|
1803
|
-
#
|
1804
|
-
# @example Sort a dataframe with a block with nils handled manually.
|
1805
|
-
#
|
1806
|
-
# df = DaruLite::DataFrame.new({a: [nil,-1,1,nil,-1,1], b: ['aaa','aa',nil,'baaa','x',nil] })
|
1807
|
-
#
|
1808
|
-
# # To print nils at the bottom one can use lambda { |a| (a.nil?)[1]:[0,a.length] }
|
1809
|
-
# df.sort [:b], by: {b: lambda { |a| (a.nil?)?[1]:[0,a.length] } }, handle_nils: true
|
1810
|
-
#
|
1811
|
-
# # =>
|
1812
|
-
# #<DaruLite::DataFrame:22214180 @name = cd7703c7-1dca-4560-840b-5ea51a852ef9 @size = 6>
|
1813
|
-
# # a b
|
1814
|
-
# # 4 -1 x
|
1815
|
-
# # 1 -1 aa
|
1816
|
-
# # 0 nil aaa
|
1817
|
-
# # 3 nil baaa
|
1818
|
-
# # 2 1 nil
|
1819
|
-
# # 5 1 nil
|
1820
|
-
|
1821
|
-
def sort!(vector_order, opts = {})
|
1822
|
-
raise ArgumentError, 'Required atleast one vector name' if vector_order.empty?
|
1823
|
-
|
1824
|
-
# To enable sorting with categorical data,
|
1825
|
-
# map categories to integers preserving their order
|
1826
|
-
old = convert_categorical_vectors vector_order
|
1827
|
-
block = sort_prepare_block vector_order, opts
|
1828
|
-
|
1829
|
-
order = @index.size.times.sort(&block)
|
1830
|
-
new_index = @index.reorder order
|
1831
|
-
|
1832
|
-
# To reverse map mapping of categorical data to integers
|
1833
|
-
restore_categorical_vectors old
|
1834
|
-
|
1835
|
-
@data.each do |vector|
|
1836
|
-
vector.reorder! order
|
1837
|
-
end
|
1838
|
-
|
1839
|
-
self.index = new_index
|
1840
|
-
|
1841
|
-
self
|
1842
|
-
end
|
1843
|
-
|
1844
|
-
# Non-destructive version of #sort!
|
1845
|
-
def sort(vector_order, opts = {})
|
1846
|
-
dup.sort! vector_order, opts
|
1847
|
-
end
|
1848
|
-
|
1849
|
-
# Pivots a data frame on specified vectors and applies an aggregate function
|
1850
|
-
# to quickly generate a summary.
|
1851
|
-
#
|
1852
|
-
# == Options
|
1853
|
-
#
|
1854
|
-
# +:index+ - Keys to group by on the pivot table row index. Pass vector names
|
1855
|
-
# contained in an Array.
|
1856
|
-
#
|
1857
|
-
# +:vectors+ - Keys to group by on the pivot table column index. Pass vector
|
1858
|
-
# names contained in an Array.
|
1859
|
-
#
|
1860
|
-
# +:agg+ - Function to aggregate the grouped values. Default to *:mean*. Can
|
1861
|
-
# use any of the statistics functions applicable on Vectors that can be found in
|
1862
|
-
# the DaruLite::Statistics::Vector module.
|
1863
|
-
#
|
1864
|
-
# +:values+ - Columns to aggregate. Will consider all numeric columns not
|
1865
|
-
# specified in *:index* or *:vectors*. Optional.
|
1866
|
-
#
|
1867
|
-
# == Usage
|
1868
|
-
#
|
1869
|
-
# df = DaruLite::DataFrame.new({
|
1870
|
-
# a: ['foo' , 'foo', 'foo', 'foo', 'foo', 'bar', 'bar', 'bar', 'bar'],
|
1871
|
-
# b: ['one' , 'one', 'one', 'two', 'two', 'one', 'one', 'two', 'two'],
|
1872
|
-
# c: ['small','large','large','small','small','large','small','large','small'],
|
1873
|
-
# d: [1,2,2,3,3,4,5,6,7],
|
1874
|
-
# e: [2,4,4,6,6,8,10,12,14]
|
1875
|
-
# })
|
1876
|
-
# df.pivot_table(index: [:a], vectors: [:b], agg: :sum, values: :e)
|
1877
|
-
#
|
1878
|
-
# #=>
|
1879
|
-
# # #<DaruLite::DataFrame:88342020 @name = 08cdaf4e-b154-4186-9084-e76dd191b2c9 @size = 2>
|
1880
|
-
# # [:e, :one] [:e, :two]
|
1881
|
-
# # [:bar] 18 26
|
1882
|
-
# # [:foo] 10 12
|
1883
|
-
def pivot_table(opts = {})
|
1884
|
-
raise ArgumentError, 'Specify grouping index' if Array(opts[:index]).empty?
|
1885
|
-
|
1886
|
-
index = opts[:index]
|
1887
|
-
vectors = opts[:vectors] || []
|
1888
|
-
aggregate_function = opts[:agg] || :mean
|
1889
|
-
values = prepare_pivot_values index, vectors, opts
|
1890
|
-
raise IndexError, 'No numeric vectors to aggregate' if values.empty?
|
1891
|
-
|
1892
|
-
grouped = group_by(index)
|
1893
|
-
return grouped.send(aggregate_function) if vectors.empty?
|
1894
|
-
|
1895
|
-
super_hash = make_pivot_hash grouped, vectors, values, aggregate_function
|
1896
|
-
|
1897
|
-
pivot_dataframe super_hash
|
1898
|
-
end
|
1899
|
-
|
1900
|
-
# Merge vectors from two DataFrames. In case of name collision,
|
1901
|
-
# the vectors names are changed to x_1, x_2 ....
|
1902
|
-
#
|
1903
|
-
# @return {DaruLite::DataFrame}
|
1904
|
-
def merge(other_df)
|
1905
|
-
unless nrows == other_df.nrows
|
1906
|
-
raise ArgumentError,
|
1907
|
-
"Number of rows must be equal in this: #{nrows} and other: #{other_df.nrows}"
|
1908
|
-
end
|
1909
|
-
|
1910
|
-
new_fields = (@vectors.to_a + other_df.vectors.to_a)
|
1911
|
-
new_fields = ArrayHelper.recode_repeated(new_fields)
|
1912
|
-
DataFrame.new({}, order: new_fields).tap do |df_new|
|
1913
|
-
(0...nrows).each do |i|
|
1914
|
-
df_new.add_row row[i].to_a + other_df.row[i].to_a
|
1915
|
-
end
|
1916
|
-
df_new.index = @index if @index == other_df.index
|
1917
|
-
df_new.update
|
1918
|
-
end
|
1919
|
-
end
|
1920
|
-
|
1921
|
-
# Join 2 DataFrames with SQL style joins. Currently supports inner, left
|
1922
|
-
# outer, right outer and full outer joins.
|
1923
|
-
#
|
1924
|
-
# @param [DaruLite::DataFrame] other_df Another DataFrame on which the join is
|
1925
|
-
# to be performed.
|
1926
|
-
# @param [Hash] opts Options Hash
|
1927
|
-
# @option :how [Symbol] Can be one of :inner, :left, :right or :outer.
|
1928
|
-
# @option :on [Array] The columns on which the join is to be performed.
|
1929
|
-
# Column names specified here must be common to both DataFrames.
|
1930
|
-
# @option :indicator [Symbol] The name of a vector to add to the resultant
|
1931
|
-
# dataframe that indicates whether the record was in the left (:left_only),
|
1932
|
-
# right (:right_only), or both (:both) joining dataframes.
|
1933
|
-
# @return [DaruLite::DataFrame]
|
1934
|
-
# @example Inner Join
|
1935
|
-
# left = DaruLite::DataFrame.new({
|
1936
|
-
# :id => [1,2,3,4],
|
1937
|
-
# :name => ['Pirate', 'Monkey', 'Ninja', 'Spaghetti']
|
1938
|
-
# })
|
1939
|
-
# right = DaruLite::DataFrame.new({
|
1940
|
-
# :id => [1,2,3,4],
|
1941
|
-
# :name => ['Rutabaga', 'Pirate', 'Darth Vader', 'Ninja']
|
1942
|
-
# })
|
1943
|
-
# left.join(right, how: :inner, on: [:name])
|
1944
|
-
# #=>
|
1945
|
-
# ##<DaruLite::DataFrame:82416700 @name = 74c0811b-76c6-4c42-ac93-e6458e82afb0 @size = 2>
|
1946
|
-
# # id_1 name id_2
|
1947
|
-
# # 0 1 Pirate 2
|
1948
|
-
# # 1 3 Ninja 4
|
1949
|
-
def join(other_df, opts = {})
|
1950
|
-
DaruLite::Core::Merge.join(self, other_df, opts)
|
1951
|
-
end
|
1952
|
-
|
1953
|
-
# Creates a new dataset for one to many relations
|
1954
|
-
# on a dataset, based on pattern of field names.
|
1955
|
-
#
|
1956
|
-
# for example, you have a survey for number of children
|
1957
|
-
# with this structure:
|
1958
|
-
# id, name, child_name_1, child_age_1, child_name_2, child_age_2
|
1959
|
-
# with
|
1960
|
-
# ds.one_to_many([:id], "child_%v_%n")
|
1961
|
-
# the field of first parameters will be copied verbatim
|
1962
|
-
# to new dataset, and fields which respond to second
|
1963
|
-
# pattern will be added one case for each different %n.
|
1964
|
-
#
|
1965
|
-
# @example
|
1966
|
-
# cases=[
|
1967
|
-
# ['1','george','red',10,'blue',20,nil,nil],
|
1968
|
-
# ['2','fred','green',15,'orange',30,'white',20],
|
1969
|
-
# ['3','alfred',nil,nil,nil,nil,nil,nil]
|
1970
|
-
# ]
|
1971
|
-
# ds=DaruLite::DataFrame.rows(cases, order:
|
1972
|
-
# [:id, :name,
|
1973
|
-
# :car_color1, :car_value1,
|
1974
|
-
# :car_color2, :car_value2,
|
1975
|
-
# :car_color3, :car_value3])
|
1976
|
-
# ds.one_to_many([:id],'car_%v%n').to_matrix
|
1977
|
-
# #=> Matrix[
|
1978
|
-
# # ["red", "1", 10],
|
1979
|
-
# # ["blue", "1", 20],
|
1980
|
-
# # ["green", "2", 15],
|
1981
|
-
# # ["orange", "2", 30],
|
1982
|
-
# # ["white", "2", 20]
|
1983
|
-
# # ]
|
1984
|
-
def one_to_many(parent_fields, pattern)
|
1985
|
-
vars, numbers = one_to_many_components(pattern)
|
1986
|
-
|
1987
|
-
DataFrame.new([], order: [*parent_fields, '_col_id', *vars]).tap do |ds|
|
1988
|
-
each_row do |row|
|
1989
|
-
verbatim = parent_fields.map { |f| [f, row[f]] }.to_h
|
1990
|
-
numbers.each do |n|
|
1991
|
-
generated = one_to_many_row row, n, vars, pattern
|
1992
|
-
next if generated.values.all?(&:nil?)
|
1993
|
-
|
1994
|
-
ds.add_row(verbatim.merge(generated).merge('_col_id' => n))
|
1995
|
-
end
|
1996
|
-
end
|
1997
|
-
ds.update
|
1998
|
-
end
|
1999
|
-
end
|
2000
|
-
|
2001
|
-
def add_vectors_by_split_recode(nm, join = '-', sep = DaruLite::SPLIT_TOKEN)
|
2002
|
-
self[nm]
|
2003
|
-
.split_by_separator(sep)
|
2004
|
-
.each_with_index do |(k, v), i|
|
2005
|
-
v.rename "#{nm}:#{k}"
|
2006
|
-
self[:"#{nm}#{join}#{i + 1}"] = v
|
2007
|
-
end
|
2008
|
-
end
|
2009
|
-
|
2010
|
-
# Create a SQL CREATE TABLE statement, based on a given Dataset
|
2011
|
-
#
|
2012
|
-
# == Arguments
|
2013
|
-
#
|
2014
|
-
# * table - String specifying name of the table that will be created in SQL.
|
2015
|
-
# * charset - Character set. Default is "UTF8".
|
2016
|
-
#
|
2017
|
-
# @example
|
2018
|
-
#
|
2019
|
-
# ds = DaruLite::DataFrame.new({
|
2020
|
-
# :id => DaruLite::Vector.new([1,2,3,4,5]),
|
2021
|
-
# :name => DaruLite::Vector.new(%w{Alex Peter Susan Mary John})
|
2022
|
-
# })
|
2023
|
-
# ds.create_sql('names')
|
2024
|
-
# #=>"CREATE TABLE names (id INTEGER,\n name VARCHAR (255)) CHARACTER SET=UTF8;"
|
2025
|
-
#
|
2026
|
-
def create_sql(table, charset = 'UTF8')
|
2027
|
-
sql = "CREATE TABLE #{table} ("
|
2028
|
-
fields = vectors.to_a.collect do |f|
|
2029
|
-
v = self[f]
|
2030
|
-
"#{f} #{v.db_type}"
|
2031
|
-
end
|
2032
|
-
|
2033
|
-
sql + fields.join(",\n ") + ") CHARACTER SET=#{charset};"
|
2034
|
-
end
|
2035
|
-
|
2036
|
-
# Returns the dataframe. This can be convenient when the user does not
|
2037
|
-
# know whether the object is a vector or a dataframe.
|
2038
|
-
# @return [self] the dataframe
|
2039
|
-
def to_df
|
2040
|
-
self
|
2041
|
-
end
|
2042
|
-
|
2043
|
-
# Convert all vectors of type *:numeric* into a Matrix.
|
2044
|
-
def to_matrix
|
2045
|
-
Matrix.columns each_vector.select(&:numeric?).map(&:to_a)
|
2046
|
-
end
|
2047
|
-
|
2048
|
-
# Converts the DataFrame into an array of hashes where key is vector name
|
2049
|
-
# and value is the corresponding element. The 0th index of the array contains
|
2050
|
-
# the array of hashes while the 1st index contains the indexes of each row
|
2051
|
-
# of the dataframe. Each element in the index array corresponds to its row
|
2052
|
-
# in the array of hashes, which has the same index.
|
2053
|
-
def to_a
|
2054
|
-
[each_row.map(&:to_h), @index.to_a]
|
2055
|
-
end
|
2056
|
-
|
2057
|
-
# Convert to json. If no_index is true (the default) then the index will NOT be included
|
2058
|
-
# in the JSON thus created.
|
2059
|
-
def to_json(no_index = true)
|
2060
|
-
if no_index
|
2061
|
-
to_a[0].to_json
|
2062
|
-
else
|
2063
|
-
to_a.to_json
|
2064
|
-
end
|
2065
|
-
end
|
2066
|
-
|
2067
|
-
# Converts DataFrame to a hash (explicit) with keys as vector names and values as
|
2068
|
-
# the corresponding vectors.
|
2069
|
-
def to_h
|
2070
|
-
@vectors
|
2071
|
-
.each_with_index
|
2072
|
-
.map { |vec_name, idx| [vec_name, @data[idx]] }.to_h
|
2073
|
-
end
|
316
|
+
def nest(*tree_keys, &block)
|
317
|
+
tree_keys = tree_keys[0] if tree_keys[0].is_a? Array
|
2074
318
|
|
2075
|
-
|
2076
|
-
|
2077
|
-
|
2078
|
-
|
2079
|
-
|
2080
|
-
File.expand_path('iruby/templates/dataframe_mi.html.erb', __dir__)
|
2081
|
-
else
|
2082
|
-
File.expand_path('iruby/templates/dataframe.html.erb', __dir__)
|
2083
|
-
end
|
2084
|
-
ERB.new(File.read(path).strip).result(binding)
|
2085
|
-
end
|
319
|
+
each_row.with_object({}) do |row, current|
|
320
|
+
# Create tree
|
321
|
+
*keys, last = tree_keys
|
322
|
+
current = keys.inject(current) { |c, f| c[row[f]] ||= {} }
|
323
|
+
name = row[last]
|
2086
324
|
|
2087
|
-
|
2088
|
-
|
2089
|
-
if index.is_a?(MultiIndex)
|
2090
|
-
File.expand_path('iruby/templates/dataframe_mi_thead.html.erb', __dir__)
|
325
|
+
if block
|
326
|
+
current[name] = yield(row, current, name)
|
2091
327
|
else
|
2092
|
-
|
328
|
+
current[name] ||= []
|
329
|
+
current[name].push(row.to_h.delete_if { |key, _value| tree_keys.include? key })
|
2093
330
|
end
|
2094
|
-
|
331
|
+
end
|
2095
332
|
end
|
2096
333
|
|
2097
|
-
def
|
2098
|
-
|
2099
|
-
|
2100
|
-
|
2101
|
-
File.expand_path('iruby/templates/dataframe_mi_tbody.html.erb', __dir__)
|
2102
|
-
else
|
2103
|
-
File.expand_path('iruby/templates/dataframe_tbody.html.erb', __dir__)
|
2104
|
-
end
|
2105
|
-
ERB.new(File.read(table_tbody_path).strip).result(binding)
|
334
|
+
def add_vectors_by_split(name, join = '-', sep = DaruLite::SPLIT_TOKEN)
|
335
|
+
self[name]
|
336
|
+
.split_by_separator(sep)
|
337
|
+
.each { |k, v| self[:"#{name}#{join}#{k}"] = v }
|
2106
338
|
end
|
2107
339
|
|
2108
|
-
|
2109
|
-
|
340
|
+
# Return the number of rows and columns of the DataFrame in an Array.
|
341
|
+
def shape
|
342
|
+
[nrows, ncols]
|
2110
343
|
end
|
2111
344
|
|
2112
|
-
#
|
2113
|
-
|
2114
|
-
|
2115
|
-
# assignment/deletion of elements is done. Updating data this way is called
|
2116
|
-
# lazy loading. To set or unset lazy loading, see the .lazy_update= method.
|
2117
|
-
def update
|
2118
|
-
@data.each(&:update) if DaruLite.lazy_update
|
345
|
+
# The number of rows
|
346
|
+
def nrows
|
347
|
+
@index.size
|
2119
348
|
end
|
2120
349
|
|
2121
|
-
#
|
2122
|
-
def
|
2123
|
-
@
|
2124
|
-
self
|
350
|
+
# The number of vectors
|
351
|
+
def ncols
|
352
|
+
@vectors.size
|
2125
353
|
end
|
2126
354
|
|
2127
|
-
|
2128
|
-
|
2129
|
-
# Write this DataFrame to a CSV file.
|
355
|
+
# Renames the vectors
|
2130
356
|
#
|
2131
357
|
# == Arguments
|
2132
358
|
#
|
2133
|
-
# *
|
359
|
+
# * name_map - A hash where the keys are the existing vector names and
|
360
|
+
# the values are the new names. If a vector is renamed
|
361
|
+
# to a vector name that is already in use, the existing
|
362
|
+
# one is overwritten.
|
2134
363
|
#
|
2135
|
-
# ==
|
364
|
+
# == Usage
|
2136
365
|
#
|
2137
|
-
#
|
2138
|
-
#
|
2139
|
-
#
|
2140
|
-
|
2141
|
-
|
2142
|
-
|
2143
|
-
end
|
366
|
+
# df = DaruLite::DataFrame.new({ a: [1,2,3,4], b: [:a,:b,:c,:d], c: [11,22,33,44] })
|
367
|
+
# df.rename_vectors :a => :alpha, :c => :gamma
|
368
|
+
# df.vectors.to_a #=> [:alpha, :b, :gamma]
|
369
|
+
def rename_vectors(name_map)
|
370
|
+
existing_targets = name_map.reject { |k, v| k == v }.values & vectors.to_a
|
371
|
+
delete_vectors(*existing_targets)
|
2144
372
|
|
2145
|
-
|
2146
|
-
|
2147
|
-
# == Arguments
|
2148
|
-
#
|
2149
|
-
# * filename - The path of the file where the DataFrame should be written.
|
2150
|
-
def write_excel(filename, opts = {})
|
2151
|
-
DaruLite::IO.dataframe_write_excel self, filename, opts
|
373
|
+
new_names = vectors.to_a.map { |v| name_map[v] || v }
|
374
|
+
self.vectors = DaruLite::Index.new new_names
|
2152
375
|
end
|
2153
376
|
|
2154
|
-
#
|
377
|
+
# Renames the vectors and returns itself
|
2155
378
|
#
|
2156
379
|
# == Arguments
|
2157
380
|
#
|
2158
|
-
# *
|
2159
|
-
#
|
381
|
+
# * name_map - A hash where the keys are the existing vector names and
|
382
|
+
# the values are the new names. If a vector is renamed
|
383
|
+
# to a vector name that is already in use, the existing
|
384
|
+
# one is overwritten.
|
2160
385
|
#
|
2161
386
|
# == Usage
|
2162
387
|
#
|
2163
|
-
#
|
2164
|
-
#
|
2165
|
-
|
2166
|
-
|
2167
|
-
|
388
|
+
# df = DaruLite::DataFrame.new({ a: [1,2,3,4], b: [:a,:b,:c,:d], c: [11,22,33,44] })
|
389
|
+
# df.rename_vectors! :a => :alpha, :c => :gamma # df
|
390
|
+
def rename_vectors!(name_map)
|
391
|
+
rename_vectors(name_map)
|
392
|
+
self
|
2168
393
|
end
|
2169
394
|
|
2170
|
-
#
|
2171
|
-
|
2172
|
-
|
395
|
+
# Converts the vectors to a DaruLite::MultiIndex.
|
396
|
+
# The argument passed is used as the MultiIndex's top level
|
397
|
+
def add_level_to_vectors(top_level_label)
|
398
|
+
tuples = vectors.map { |label| [top_level_label, *label] }
|
399
|
+
self.vectors = DaruLite::MultiIndex.from_tuples(tuples)
|
2173
400
|
end
|
2174
401
|
|
2175
|
-
def
|
2176
|
-
|
2177
|
-
|
2178
|
-
|
2179
|
-
|
2180
|
-
|
2181
|
-
|
402
|
+
def add_vectors_by_split_recode(nm, join = '-', sep = DaruLite::SPLIT_TOKEN)
|
403
|
+
self[nm]
|
404
|
+
.split_by_separator(sep)
|
405
|
+
.each_with_index do |(k, v), i|
|
406
|
+
v.rename "#{nm}:#{k}"
|
407
|
+
self[:"#{nm}#{join}#{i + 1}"] = v
|
408
|
+
end
|
409
|
+
end
|
410
|
+
|
411
|
+
# Method for updating the metadata (i.e. missing value positions) of the vectors
|
412
|
+
# after assignment/deletion etc. are complete. This is provided so that
|
413
|
+
# time is not wasted in creating the metadata for the vector each time
|
414
|
+
# assignment/deletion of elements is done. Updating data this way is called
|
415
|
+
# lazy loading. To set or unset lazy loading, see the .lazy_update= method.
|
416
|
+
def update
|
417
|
+
@data.each(&:update) if DaruLite.lazy_update
|
2182
418
|
end
|
2183
419
|
|
2184
|
-
|
2185
|
-
|
2186
|
-
|
2187
|
-
|
2188
|
-
order: h[:order],
|
2189
|
-
name: h[:name])
|
420
|
+
# Rename the DataFrame.
|
421
|
+
def rename(new_name)
|
422
|
+
@name = new_name
|
423
|
+
self
|
2190
424
|
end
|
425
|
+
alias name= rename
|
2191
426
|
|
2192
427
|
# Transpose a DataFrame, transposing elements and row, column indexing.
|
2193
428
|
def transpose
|
@@ -2218,11 +453,6 @@ module DaruLite
|
|
2218
453
|
)
|
2219
454
|
end
|
2220
455
|
|
2221
|
-
# Query a DataFrame by passing a DaruLite::Core::Query::BoolArray object.
|
2222
|
-
def where(bool_array)
|
2223
|
-
DaruLite::Core::Query.df_where self, bool_array
|
2224
|
-
end
|
2225
|
-
|
2226
456
|
def ==(other)
|
2227
457
|
self.class == other.class &&
|
2228
458
|
@size == other.size &&
|
@@ -2276,144 +506,6 @@ module DaruLite
|
|
2276
506
|
order: all_vectors.map(&:name)
|
2277
507
|
end
|
2278
508
|
|
2279
|
-
# Split the dataframe into many dataframes based on category vector
|
2280
|
-
# @param [object] cat_name name of category vector to split the dataframe
|
2281
|
-
# @return [Array] array of dataframes split by category with category vector
|
2282
|
-
# used to split not included
|
2283
|
-
# @example
|
2284
|
-
# df = DaruLite::DataFrame.new({
|
2285
|
-
# a: [1, 2, 3],
|
2286
|
-
# b: ['a', 'a', 'b']
|
2287
|
-
# })
|
2288
|
-
# df.to_category :b
|
2289
|
-
# df.split_by_category :b
|
2290
|
-
# # => [#<DaruLite::DataFrame: a (2x1)>
|
2291
|
-
# # a
|
2292
|
-
# # 0 1
|
2293
|
-
# # 1 2,
|
2294
|
-
# # #<DaruLite::DataFrame: b (1x1)>
|
2295
|
-
# # a
|
2296
|
-
# # 2 3]
|
2297
|
-
def split_by_category(cat_name)
|
2298
|
-
cat_dv = self[cat_name]
|
2299
|
-
raise ArgumentError, "#{cat_name} is not a category vector" unless
|
2300
|
-
cat_dv.category?
|
2301
|
-
|
2302
|
-
cat_dv.categories.map do |cat|
|
2303
|
-
where(cat_dv.eq cat)
|
2304
|
-
.rename(cat)
|
2305
|
-
.delete_vector cat_name
|
2306
|
-
end
|
2307
|
-
end
|
2308
|
-
|
2309
|
-
# @param indexes [Array] index(s) at which row tuples are retrieved
|
2310
|
-
# @return [Array] returns array of row tuples at given index(s)
|
2311
|
-
# @example Using DaruLite::Index
|
2312
|
-
# df = DaruLite::DataFrame.new({
|
2313
|
-
# a: [1, 2, 3],
|
2314
|
-
# b: ['a', 'a', 'b']
|
2315
|
-
# })
|
2316
|
-
#
|
2317
|
-
# df.access_row_tuples_by_indexs(1,2)
|
2318
|
-
# # => [[2, "a"], [3, "b"]]
|
2319
|
-
#
|
2320
|
-
# df.index = DaruLite::Index.new([:one,:two,:three])
|
2321
|
-
# df.access_row_tuples_by_indexs(:one,:three)
|
2322
|
-
# # => [[1, "a"], [3, "b"]]
|
2323
|
-
#
|
2324
|
-
# @example Using DaruLite::MultiIndex
|
2325
|
-
# mi_idx = DaruLite::MultiIndex.from_tuples [
|
2326
|
-
# [:a,:one,:bar],
|
2327
|
-
# [:a,:one,:baz],
|
2328
|
-
# [:b,:two,:bar],
|
2329
|
-
# [:a,:two,:baz],
|
2330
|
-
# ]
|
2331
|
-
# df_mi = DaruLite::DataFrame.new({
|
2332
|
-
# a: 1..4,
|
2333
|
-
# b: 'a'..'d'
|
2334
|
-
# }, index: mi_idx )
|
2335
|
-
#
|
2336
|
-
# df_mi.access_row_tuples_by_indexs(:b, :two, :bar)
|
2337
|
-
# # => [[3, "c"]]
|
2338
|
-
# df_mi.access_row_tuples_by_indexs(:a)
|
2339
|
-
# # => [[1, "a"], [2, "b"], [4, "d"]]
|
2340
|
-
def access_row_tuples_by_indexs(*indexes)
|
2341
|
-
return get_sub_dataframe(indexes, by_position: false).map_rows(&:to_a) if
|
2342
|
-
@index.is_a?(DaruLite::MultiIndex)
|
2343
|
-
|
2344
|
-
positions = @index.pos(*indexes)
|
2345
|
-
if positions.is_a? Numeric
|
2346
|
-
row = get_rows_for([positions])
|
2347
|
-
row.first.is_a?(Array) ? row : [row]
|
2348
|
-
else
|
2349
|
-
new_rows = get_rows_for(indexes, by_position: false)
|
2350
|
-
indexes.map { |index| new_rows.map { |r| r[index] } }
|
2351
|
-
end
|
2352
|
-
end
|
2353
|
-
|
2354
|
-
# Function to use for aggregating the data.
|
2355
|
-
#
|
2356
|
-
# @param options [Hash] options for column, you want in resultant dataframe
|
2357
|
-
#
|
2358
|
-
# @return [DaruLite::DataFrame]
|
2359
|
-
#
|
2360
|
-
# @example
|
2361
|
-
# df = DaruLite::DataFrame.new(
|
2362
|
-
# {col: [:a, :b, :c, :d, :e], num: [52,12,07,17,01]})
|
2363
|
-
# => #<DaruLite::DataFrame(5x2)>
|
2364
|
-
# col num
|
2365
|
-
# 0 a 52
|
2366
|
-
# 1 b 12
|
2367
|
-
# 2 c 7
|
2368
|
-
# 3 d 17
|
2369
|
-
# 4 e 1
|
2370
|
-
#
|
2371
|
-
# df.aggregate(num_100_times: ->(df) { (df.num*100).first })
|
2372
|
-
# => #<DaruLite::DataFrame(5x1)>
|
2373
|
-
# num_100_ti
|
2374
|
-
# 0 5200
|
2375
|
-
# 1 1200
|
2376
|
-
# 2 700
|
2377
|
-
# 3 1700
|
2378
|
-
# 4 100
|
2379
|
-
#
|
2380
|
-
# When we have duplicate index :
|
2381
|
-
#
|
2382
|
-
# idx = DaruLite::CategoricalIndex.new [:a, :b, :a, :a, :c]
|
2383
|
-
# df = DaruLite::DataFrame.new({num: [52,12,07,17,01]}, index: idx)
|
2384
|
-
# => #<DaruLite::DataFrame(5x1)>
|
2385
|
-
# num
|
2386
|
-
# a 52
|
2387
|
-
# b 12
|
2388
|
-
# a 7
|
2389
|
-
# a 17
|
2390
|
-
# c 1
|
2391
|
-
#
|
2392
|
-
# df.aggregate(num: :mean)
|
2393
|
-
# => #<DaruLite::DataFrame(3x1)>
|
2394
|
-
# num
|
2395
|
-
# a 25.3333333
|
2396
|
-
# b 12
|
2397
|
-
# c 1
|
2398
|
-
#
|
2399
|
-
# Note: `GroupBy` class `aggregate` method uses this `aggregate` method
|
2400
|
-
# internally.
|
2401
|
-
def aggregate(options = {}, multi_index_level = -1)
|
2402
|
-
if block_given?
|
2403
|
-
positions_tuples, new_index = yield(@index) # NOTE: use of yield is private for now
|
2404
|
-
else
|
2405
|
-
positions_tuples, new_index = group_index_for_aggregation(@index, multi_index_level)
|
2406
|
-
end
|
2407
|
-
|
2408
|
-
colmn_value = aggregate_by_positions_tuples(options, positions_tuples)
|
2409
|
-
|
2410
|
-
DaruLite::DataFrame.new(colmn_value, index: new_index, order: options.keys)
|
2411
|
-
end
|
2412
|
-
|
2413
|
-
def group_by_and_aggregate(*group_by_keys, **aggregation_map)
|
2414
|
-
group_by(*group_by_keys).aggregate(aggregation_map)
|
2415
|
-
end
|
2416
|
-
|
2417
509
|
private
|
2418
510
|
|
2419
511
|
def headers
|
@@ -2424,20 +516,6 @@ module DaruLite
|
|
2424
516
|
index.is_a?(MultiIndex) ? index.sparse_tuples : index.to_a
|
2425
517
|
end
|
2426
518
|
|
2427
|
-
def convert_categorical_vectors(names)
|
2428
|
-
names.filter_map do |n|
|
2429
|
-
next unless self[n].category?
|
2430
|
-
|
2431
|
-
old = [n, self[n]]
|
2432
|
-
self[n] = DaruLite::Vector.new(self[n].to_ints)
|
2433
|
-
old
|
2434
|
-
end
|
2435
|
-
end
|
2436
|
-
|
2437
|
-
def restore_categorical_vectors(old)
|
2438
|
-
old.each { |name, vector| self[name] = vector }
|
2439
|
-
end
|
2440
|
-
|
2441
519
|
def recursive_product(dfs)
|
2442
520
|
return dfs.first if dfs.size == 1
|
2443
521
|
|
@@ -2449,12 +527,6 @@ module DaruLite
|
|
2449
527
|
end
|
2450
528
|
end
|
2451
529
|
|
2452
|
-
def should_be_vector!(val)
|
2453
|
-
return val if val.is_a?(DaruLite::Vector)
|
2454
|
-
|
2455
|
-
raise TypeError, "Every iteration must return DaruLite::Vector not #{val.class}"
|
2456
|
-
end
|
2457
|
-
|
2458
530
|
def dispatch_to_axis(axis, method, *args, &block)
|
2459
531
|
if %i[vector column].include?(axis)
|
2460
532
|
send(:"#{method}_vector", *args, &block)
|
@@ -2485,76 +557,6 @@ module DaruLite
|
|
2485
557
|
end
|
2486
558
|
end
|
2487
559
|
|
2488
|
-
def access_vector(*names)
|
2489
|
-
if names.first.is_a?(Range)
|
2490
|
-
dup(@vectors.subset(names.first))
|
2491
|
-
elsif @vectors.is_a?(MultiIndex)
|
2492
|
-
access_vector_multi_index(*names)
|
2493
|
-
else
|
2494
|
-
access_vector_single_index(*names)
|
2495
|
-
end
|
2496
|
-
end
|
2497
|
-
|
2498
|
-
def access_vector_multi_index(*names)
|
2499
|
-
pos = @vectors[names]
|
2500
|
-
|
2501
|
-
return @data[pos] if pos.is_a?(Integer)
|
2502
|
-
|
2503
|
-
new_vectors = pos.map { |tuple| @data[@vectors[tuple]] }
|
2504
|
-
|
2505
|
-
pos = pos.drop_left_level(names.size) if names.size < @vectors.width
|
2506
|
-
|
2507
|
-
DaruLite::DataFrame.new(new_vectors, index: @index, order: pos)
|
2508
|
-
end
|
2509
|
-
|
2510
|
-
def access_vector_single_index(*names)
|
2511
|
-
if names.count < 2
|
2512
|
-
begin
|
2513
|
-
pos = @vectors.is_a?(DaruLite::DateTimeIndex) ? @vectors[names.first] : @vectors.pos(names.first)
|
2514
|
-
rescue IndexError
|
2515
|
-
raise IndexError, "Specified vector #{names.first} does not exist"
|
2516
|
-
end
|
2517
|
-
return @data[pos] if pos.is_a?(Numeric)
|
2518
|
-
|
2519
|
-
names = pos
|
2520
|
-
end
|
2521
|
-
|
2522
|
-
new_vectors = names.map { |name| [name, @data[@vectors.pos(name)]] }.to_h
|
2523
|
-
|
2524
|
-
order = names.is_a?(Array) ? DaruLite::Index.new(names) : names
|
2525
|
-
DaruLite::DataFrame.new(new_vectors, order: order, index: @index, name: @name)
|
2526
|
-
end
|
2527
|
-
|
2528
|
-
def access_row(*indexes)
|
2529
|
-
positions = @index.pos(*indexes)
|
2530
|
-
|
2531
|
-
if positions.is_a? Numeric
|
2532
|
-
row = get_rows_for([positions])
|
2533
|
-
DaruLite::Vector.new row, index: @vectors, name: indexes.first
|
2534
|
-
else
|
2535
|
-
new_rows = get_rows_for(indexes, by_position: false)
|
2536
|
-
DaruLite::DataFrame.new new_rows, index: @index.subset(*indexes), order: @vectors
|
2537
|
-
end
|
2538
|
-
end
|
2539
|
-
|
2540
|
-
# @param keys [Array] can be an array of positions (if by_position is true) or indexes (if by_position is false)
|
2541
|
-
# because of coercion by DaruLite::Vector#at and DaruLite::Vector#[], can return either an Array of
|
2542
|
-
# values (representing a row) or an array of Vectors (that can be seen as rows)
|
2543
|
-
def get_rows_for(keys, by_position: true)
|
2544
|
-
raise unless keys.is_a?(Array)
|
2545
|
-
|
2546
|
-
if by_position
|
2547
|
-
pos = keys
|
2548
|
-
@data.map { |vector| vector.at(*pos) }
|
2549
|
-
else
|
2550
|
-
# TODO: for now (2018-07-27), it is different than using
|
2551
|
-
# get_rows_for(@index.pos(*keys))
|
2552
|
-
# because DaruLite::Vector#at and DaruLite::Vector#[] don't handle DaruLite::MultiIndex the same way
|
2553
|
-
indexes = keys
|
2554
|
-
@data.map { |vec| vec[*indexes] }
|
2555
|
-
end
|
2556
|
-
end
|
2557
|
-
|
2558
560
|
def insert_or_modify_vector(name, vector)
|
2559
561
|
name = name[0] unless @vectors.is_a?(MultiIndex)
|
2560
562
|
|
@@ -2837,146 +839,6 @@ module DaruLite
|
|
2837
839
|
end
|
2838
840
|
end
|
2839
841
|
|
2840
|
-
def sort_build_row(vector_locs, by_blocks, ascending, handle_nils, r1, r2) # rubocop:disable Metrics/ParameterLists
|
2841
|
-
# Create an array to be used for comparison of two rows in sorting
|
2842
|
-
vector_locs
|
2843
|
-
.zip(by_blocks, ascending, handle_nils)
|
2844
|
-
.map do |vector_loc, by, asc, handle_nil|
|
2845
|
-
value = @data[vector_loc].data[asc ? r1 : r2]
|
2846
|
-
|
2847
|
-
if by
|
2848
|
-
value = begin
|
2849
|
-
by.call(value)
|
2850
|
-
rescue StandardError
|
2851
|
-
nil
|
2852
|
-
end
|
2853
|
-
end
|
2854
|
-
|
2855
|
-
sort_handle_nils value, asc, handle_nil || !by
|
2856
|
-
end
|
2857
|
-
end
|
2858
|
-
|
2859
|
-
def sort_handle_nils(value, asc, handle_nil)
|
2860
|
-
if !handle_nil
|
2861
|
-
value
|
2862
|
-
elsif asc
|
2863
|
-
[value.nil? ? 0 : 1, value]
|
2864
|
-
else
|
2865
|
-
[value.nil? ? 1 : 0, value]
|
2866
|
-
end
|
2867
|
-
end
|
2868
|
-
|
2869
|
-
def sort_coerce_boolean(opts, symbol, default, size)
|
2870
|
-
val = opts[symbol]
|
2871
|
-
case val
|
2872
|
-
when true, false
|
2873
|
-
Array.new(size, val)
|
2874
|
-
when nil
|
2875
|
-
Array.new(size, default)
|
2876
|
-
when Array
|
2877
|
-
raise ArgumentError, "Specify same number of vector names and #{symbol}" if
|
2878
|
-
size != val.size
|
2879
|
-
|
2880
|
-
val
|
2881
|
-
else
|
2882
|
-
raise ArgumentError, "Can't coerce #{symbol} from #{val.class} to boolean option"
|
2883
|
-
end
|
2884
|
-
end
|
2885
|
-
|
2886
|
-
def sort_prepare_block(vector_order, opts)
|
2887
|
-
ascending = sort_coerce_boolean opts, :ascending, true, vector_order.size
|
2888
|
-
handle_nils = sort_coerce_boolean opts, :handle_nils, false, vector_order.size
|
2889
|
-
|
2890
|
-
by_blocks = vector_order.map { |v| (opts[:by] || {})[v] }
|
2891
|
-
vector_locs = vector_order.map { |v| @vectors[v] }
|
2892
|
-
|
2893
|
-
lambda do |index1, index2|
|
2894
|
-
# Build left and right array to compare two rows
|
2895
|
-
left = sort_build_row vector_locs, by_blocks, ascending, handle_nils, index1, index2
|
2896
|
-
right = sort_build_row vector_locs, by_blocks, ascending, handle_nils, index2, index1
|
2897
|
-
|
2898
|
-
# Resolve conflict by Index if all attributes are same
|
2899
|
-
left << index1
|
2900
|
-
right << index2
|
2901
|
-
left <=> right
|
2902
|
-
end
|
2903
|
-
end
|
2904
|
-
|
2905
|
-
def verify_error_message(row, test, id, i)
|
2906
|
-
description, fields, = test
|
2907
|
-
values = fields.empty? ? '' : " (#{fields.collect { |k| "#{k}=#{row[k]}" }.join(', ')})"
|
2908
|
-
"#{i + 1} [#{row[id]}]: #{description}#{values}"
|
2909
|
-
end
|
2910
|
-
|
2911
|
-
def prepare_pivot_values(index, vectors, opts)
|
2912
|
-
case opts[:values]
|
2913
|
-
when nil # values not specified at all.
|
2914
|
-
(@vectors.to_a - (index | vectors)) & numeric_vector_names
|
2915
|
-
when Array # multiple values specified.
|
2916
|
-
opts[:values]
|
2917
|
-
else # single value specified.
|
2918
|
-
[opts[:values]]
|
2919
|
-
end
|
2920
|
-
end
|
2921
|
-
|
2922
|
-
def make_pivot_hash(grouped, vectors, values, aggregate_function)
|
2923
|
-
grouped.groups.transform_values { |_| {} }.tap do |super_hash|
|
2924
|
-
values.each do |value|
|
2925
|
-
grouped.groups.each do |group_name, row_numbers|
|
2926
|
-
row_numbers.each do |num|
|
2927
|
-
arry = [value, *vectors.map { |v| self[v][num] }]
|
2928
|
-
sub_hash = super_hash[group_name]
|
2929
|
-
sub_hash[arry] ||= []
|
2930
|
-
|
2931
|
-
sub_hash[arry] << self[value][num]
|
2932
|
-
end
|
2933
|
-
end
|
2934
|
-
end
|
2935
|
-
|
2936
|
-
setup_pivot_aggregates super_hash, aggregate_function
|
2937
|
-
end
|
2938
|
-
end
|
2939
|
-
|
2940
|
-
def setup_pivot_aggregates(super_hash, aggregate_function)
|
2941
|
-
super_hash.each_value do |sub_hash|
|
2942
|
-
sub_hash.each do |group_name, aggregates|
|
2943
|
-
sub_hash[group_name] = DaruLite::Vector.new(aggregates).send(aggregate_function)
|
2944
|
-
end
|
2945
|
-
end
|
2946
|
-
end
|
2947
|
-
|
2948
|
-
def pivot_dataframe(super_hash)
|
2949
|
-
df_index = DaruLite::MultiIndex.from_tuples super_hash.keys
|
2950
|
-
df_vectors = DaruLite::MultiIndex.from_tuples super_hash.values.flat_map(&:keys).uniq
|
2951
|
-
|
2952
|
-
DaruLite::DataFrame.new({}, index: df_index, order: df_vectors).tap do |pivoted_dataframe|
|
2953
|
-
super_hash.each do |row_index, sub_h|
|
2954
|
-
sub_h.each do |vector_index, val|
|
2955
|
-
pivoted_dataframe[vector_index][row_index] = val
|
2956
|
-
end
|
2957
|
-
end
|
2958
|
-
end
|
2959
|
-
end
|
2960
|
-
|
2961
|
-
def one_to_many_components(pattern)
|
2962
|
-
re = Regexp.new pattern.gsub('%v', '(.+?)').gsub('%n', '(\\d+?)')
|
2963
|
-
|
2964
|
-
vars, numbers =
|
2965
|
-
@vectors
|
2966
|
-
.map { |v| v.scan(re) }
|
2967
|
-
.reject(&:empty?).flatten(1).transpose
|
2968
|
-
|
2969
|
-
[vars.uniq, numbers.map(&:to_i).sort.uniq]
|
2970
|
-
end
|
2971
|
-
|
2972
|
-
def one_to_many_row(row, number, vars, pattern)
|
2973
|
-
vars
|
2974
|
-
.to_h do |v|
|
2975
|
-
name = pattern.sub('%v', v).sub('%n', number.to_s)
|
2976
|
-
[v, row[name]]
|
2977
|
-
end
|
2978
|
-
end
|
2979
|
-
|
2980
842
|
# Raises IndexError when one of the positions is not a valid position
|
2981
843
|
def validate_positions(*positions, size)
|
2982
844
|
positions.each do |pos|
|
@@ -3001,82 +863,5 @@ module DaruLite
|
|
3001
863
|
DaruLite::Vector.new(source[idx], index: @index, name: vectors[idx])
|
3002
864
|
end
|
3003
865
|
end
|
3004
|
-
|
3005
|
-
# Runs the aggregations described by +options+ over each group of row
# positions in +positions_tuples+.
#
# +options+ is first passed through cast_aggregation_options, which says
# whether every aggregation targets a single vector.
#
# Returns an array with one entry per aggregation, each holding one result
# per positions group.
def aggregate_by_positions_tuples(options, positions_tuples)
  over_vectors_only, casted_options = cast_aggregation_options(options)

  unless over_vectors_only
    aggregators = casted_options.values

    # NOTE: because we aggregate over rows, we don't have to re-get sub-dfs for each method (which is expensive)
    per_group = positions_tuples.map do |positions|
      apply_method_on_sub_df(aggregators, keys: positions)
    end

    return per_group.transpose
  end

  casted_options.map do |vect_name, aggregator|
    vect = self[vect_name]

    positions_tuples.map do |positions|
      vect.apply_method_on_sub_vector(aggregator, keys: positions)
    end
  end
end
|
3027
|
-
|
3028
|
-
# convert operations over sub-vectors to operations over sub-dfs when it improves perf
|
3029
|
-
# note: we don't always "cast" because aggregation over a single vector / a few vector is faster
|
3030
|
-
# than aggregation over (sub-)dfs
|
3031
|
-
# Decides whether the aggregations in +options+ can all be run per vector.
#
# Keys found in @vectors are per-vector aggregations; any other key makes
# the whole set run over sub-dataframes. In that case each per-vector
# entry is wrapped in a lambda that takes the sub-dataframe and applies
# the original aggregation to sub_df[name].
#
# Returns [over_vectors, options]: +true+ with the untouched +options+
# when every key is a vector, otherwise +false+ with an adapted copy.
# The caller's hash is never modified.
def cast_aggregation_options(options)
  vects, non_vects = options.keys.partition { |k| @vectors.include?(k) }

  return [true, options] unless non_vects.any?

  # dup rather than clone: clone preserves the frozen state, so a frozen
  # options hash would raise FrozenError on the assignments below.
  casted = options.dup

  vects.each do |name|
    proc_on_vect = casted[name].to_proc
    casted[name] = ->(sub_df) { proc_on_vect.call(sub_df[name]) }
  end

  [false, casted]
end
|
3049
|
-
|
3050
|
-
# Groups the positions of +index+ by identical key, for aggregation.
#
# For a MultiIndex the grouping is delegated to
# DaruLite::Core::GroupBy.get_positions_group_for_aggregation with
# +multi_index_level+. For a plain or categorical index, positions are
# grouped by each distinct key.
#
# Returns [pos_tuples, new_index]: the arrays of positions per group and
# the index (or array of keys) labelling those groups.
# Raises RuntimeError when +index+ is of an unsupported type.
def group_index_for_aggregation(index, multi_index_level = -1)
  case index
  when DaruLite::MultiIndex
    groups_by_pos = DaruLite::Core::GroupBy.get_positions_group_for_aggregation(index, multi_index_level)

    new_index = DaruLite::MultiIndex.from_tuples(groups_by_pos.keys).coerce_index
    pos_tuples = groups_by_pos.values
  when DaruLite::Index, DaruLite::CategoricalIndex
    new_index = Array(index).uniq
    # pos may return a single position or several; splat normalises to an array.
    pos_tuples = new_index.map { |idx| [*index.pos(idx)] }
  else
    # An explicit message replaces the bare `raise`, which reported only
    # "unhandled exception" (or re-raised whatever error was in flight).
    raise "Cannot group index of type #{index.class} for aggregation"
  end

  [pos_tuples, new_index]
end
|
3065
|
-
|
3066
|
-
# coerce ranges, integers and array in appropriate ways
|
3067
|
-
# Normalises the +positions+ arguments against a collection of +size+ items.
#
# * several positions     -> returned as the array of arguments
# * a single Integer      -> returned unchanged
# * a single Range        -> the slice of 0...size selected by that range
# * any other single item -> ArgumentError
def coerce_positions(*positions, size)
  return positions unless positions.size == 1

  position = positions.first
  case position
  when Integer then position
  when Range
    all_positions = size.times.to_a
    all_positions[position]
  else raise ArgumentError, 'Unknown position type.'
  end
end
|
3081
866
|
end
|
3082
867
|
end
|