daru_lite 0.1.1 → 0.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.rubocop_todo.yml +35 -33
- data/lib/daru_lite/data_frame/aggregatable.rb +165 -0
- data/lib/daru_lite/data_frame/calculatable.rb +140 -0
- data/lib/daru_lite/data_frame/convertible.rb +107 -0
- data/lib/daru_lite/data_frame/duplicatable.rb +64 -0
- data/lib/daru_lite/data_frame/fetchable.rb +301 -0
- data/lib/daru_lite/data_frame/filterable.rb +144 -0
- data/lib/daru_lite/data_frame/i_o_able.rb +179 -0
- data/lib/daru_lite/data_frame/indexable.rb +168 -0
- data/lib/daru_lite/data_frame/iterable.rb +339 -0
- data/lib/daru_lite/data_frame/joinable.rb +152 -0
- data/lib/daru_lite/data_frame/missable.rb +75 -0
- data/lib/daru_lite/data_frame/pivotable.rb +108 -0
- data/lib/daru_lite/data_frame/queryable.rb +67 -0
- data/lib/daru_lite/data_frame/setable.rb +109 -0
- data/lib/daru_lite/data_frame/sortable.rb +241 -0
- data/lib/daru_lite/dataframe.rb +138 -2353
- data/lib/daru_lite/index/index.rb +13 -0
- data/lib/daru_lite/maths/statistics/vector.rb +1 -1
- data/lib/daru_lite/vector/aggregatable.rb +9 -0
- data/lib/daru_lite/vector/calculatable.rb +78 -0
- data/lib/daru_lite/vector/convertible.rb +77 -0
- data/lib/daru_lite/vector/duplicatable.rb +17 -0
- data/lib/daru_lite/vector/fetchable.rb +175 -0
- data/lib/daru_lite/vector/filterable.rb +128 -0
- data/lib/daru_lite/vector/indexable.rb +77 -0
- data/lib/daru_lite/vector/iterable.rb +95 -0
- data/lib/daru_lite/vector/joinable.rb +17 -0
- data/lib/daru_lite/vector/missable.rb +124 -0
- data/lib/daru_lite/vector/queryable.rb +45 -0
- data/lib/daru_lite/vector/setable.rb +47 -0
- data/lib/daru_lite/vector/sortable.rb +113 -0
- data/lib/daru_lite/vector.rb +36 -932
- data/lib/daru_lite/version.rb +1 -1
- data/spec/data_frame/aggregatable_example.rb +65 -0
- data/spec/data_frame/buildable_example.rb +109 -0
- data/spec/data_frame/calculatable_example.rb +135 -0
- data/spec/data_frame/convertible_example.rb +180 -0
- data/spec/data_frame/duplicatable_example.rb +111 -0
- data/spec/data_frame/fetchable_example.rb +476 -0
- data/spec/data_frame/filterable_example.rb +250 -0
- data/spec/data_frame/indexable_example.rb +221 -0
- data/spec/data_frame/iterable_example.rb +465 -0
- data/spec/data_frame/joinable_example.rb +106 -0
- data/spec/data_frame/missable_example.rb +47 -0
- data/spec/data_frame/pivotable_example.rb +297 -0
- data/spec/data_frame/queryable_example.rb +92 -0
- data/spec/data_frame/setable_example.rb +482 -0
- data/spec/data_frame/sortable_example.rb +350 -0
- data/spec/dataframe_spec.rb +181 -3289
- data/spec/index/index_spec.rb +8 -0
- data/spec/vector/aggregatable_example.rb +27 -0
- data/spec/vector/calculatable_example.rb +82 -0
- data/spec/vector/convertible_example.rb +126 -0
- data/spec/vector/duplicatable_example.rb +48 -0
- data/spec/vector/fetchable_example.rb +463 -0
- data/spec/vector/filterable_example.rb +165 -0
- data/spec/vector/indexable_example.rb +201 -0
- data/spec/vector/iterable_example.rb +111 -0
- data/spec/vector/joinable_example.rb +25 -0
- data/spec/vector/missable_example.rb +88 -0
- data/spec/vector/queryable_example.rb +91 -0
- data/spec/vector/setable_example.rb +300 -0
- data/spec/vector/sortable_example.rb +242 -0
- data/spec/vector_spec.rb +111 -1805
- metadata +86 -2
@@ -0,0 +1,75 @@
|
|
1
|
+
module DaruLite
|
2
|
+
class DataFrame
|
3
|
+
module Missable
|
4
|
+
extend Gem::Deprecate
|
5
|
+
|
6
|
+
# Rolling fillna
|
7
|
+
# Replaces all Float::NAN and nil values with the preceding or following value
|
8
|
+
#
|
9
|
+
# @param direction [Symbol] (:forward, :backward) whether replacement value is preceding or following
|
10
|
+
#
|
11
|
+
# @example
|
12
|
+
# df = DaruLite::DataFrame.new({
|
13
|
+
# a: [1, 2, 3, nil, Float::NAN, nil, 1, 7],
|
14
|
+
# b: [:a, :b, nil, Float::NAN, nil, 3, 5, nil],
|
15
|
+
# c: ['a', Float::NAN, 3, 4, 3, 5, nil, 7]
|
16
|
+
# })
|
17
|
+
#
|
18
|
+
# => #<DaruLite::DataFrame(8x3)>
|
19
|
+
# a b c
|
20
|
+
# 0 1 a a
|
21
|
+
# 1 2 b NaN
|
22
|
+
# 2 3 nil 3
|
23
|
+
# 3 nil NaN 4
|
24
|
+
# 4 NaN nil 3
|
25
|
+
# 5 nil 3 5
|
26
|
+
# 6 1 5 nil
|
27
|
+
# 7 7 nil 7
|
28
|
+
#
|
29
|
+
# 2.3.3 :068 > df.rolling_fillna(:forward)
|
30
|
+
# => #<DaruLite::DataFrame(8x3)>
|
31
|
+
# a b c
|
32
|
+
# 0 1 a a
|
33
|
+
# 1 2 b a
|
34
|
+
# 2 3 b 3
|
35
|
+
# 3 3 b 4
|
36
|
+
# 4 3 b 3
|
37
|
+
# 5 3 3 5
|
38
|
+
# 6 1 5 5
|
39
|
+
# 7 7 5 7
|
40
|
+
#
|
41
|
+
def rolling_fillna!(direction = :forward)
|
42
|
+
@data.each { |vec| vec.rolling_fillna!(direction) }
|
43
|
+
self
|
44
|
+
end
|
45
|
+
|
46
|
+
def rolling_fillna(direction = :forward)
|
47
|
+
dup.rolling_fillna!(direction)
|
48
|
+
end
|
49
|
+
|
50
|
+
# Return a vector with the number of missing values in each row.
|
51
|
+
#
|
52
|
+
# == Arguments
|
53
|
+
#
|
54
|
+
# * +missing_values+ - An Array of the values that should be
|
55
|
+
# treated as 'missing'. The default missing value is *nil*.
|
56
|
+
def missing_values_rows(missing_values = [nil])
|
57
|
+
number_of_missing = each_row.map do |row|
|
58
|
+
row.indexes(*missing_values).size
|
59
|
+
end
|
60
|
+
|
61
|
+
DaruLite::Vector.new number_of_missing, index: @index, name: "#{@name}_missing_rows"
|
62
|
+
end
|
63
|
+
|
64
|
+
# TODO: remove next version
|
65
|
+
alias vector_missing_values missing_values_rows
|
66
|
+
|
67
|
+
def has_missing_data?
|
68
|
+
@data.any? { |vec| vec.include_values?(*DaruLite::MISSING_VALUES) }
|
69
|
+
end
|
70
|
+
alias flawed? has_missing_data?
|
71
|
+
deprecate :has_missing_data?, :include_values?, 2016, 10
|
72
|
+
deprecate :flawed?, :include_values?, 2016, 10
|
73
|
+
end
|
74
|
+
end
|
75
|
+
end
|
@@ -0,0 +1,108 @@
|
|
1
|
+
module DaruLite
|
2
|
+
class DataFrame
|
3
|
+
module Pivotable
|
4
|
+
# Pivots a data frame on specified vectors and applies an aggregate function
|
5
|
+
# to quickly generate a summary.
|
6
|
+
#
|
7
|
+
# == Options
|
8
|
+
#
|
9
|
+
# +:index+ - Keys to group by on the pivot table row index. Pass vector names
|
10
|
+
# contained in an Array.
|
11
|
+
#
|
12
|
+
# +:vectors+ - Keys to group by on the pivot table column index. Pass vector
|
13
|
+
# names contained in an Array.
|
14
|
+
#
|
15
|
+
# +:agg+ - Function to aggregate the grouped values. Default to *:mean*. Can
|
16
|
+
# use any of the statistics functions applicable on Vectors that can be found in
|
17
|
+
# the DaruLite::Statistics::Vector module.
|
18
|
+
#
|
19
|
+
# +:values+ - Columns to aggregate. Will consider all numeric columns not
|
20
|
+
# specified in *:index* or *:vectors*. Optional.
|
21
|
+
#
|
22
|
+
# == Usage
|
23
|
+
#
|
24
|
+
# df = DaruLite::DataFrame.new({
|
25
|
+
# a: ['foo' , 'foo', 'foo', 'foo', 'foo', 'bar', 'bar', 'bar', 'bar'],
|
26
|
+
# b: ['one' , 'one', 'one', 'two', 'two', 'one', 'one', 'two', 'two'],
|
27
|
+
# c: ['small','large','large','small','small','large','small','large','small'],
|
28
|
+
# d: [1,2,2,3,3,4,5,6,7],
|
29
|
+
# e: [2,4,4,6,6,8,10,12,14]
|
30
|
+
# })
|
31
|
+
# df.pivot_table(index: [:a], vectors: [:b], agg: :sum, values: :e)
|
32
|
+
#
|
33
|
+
# #=>
|
34
|
+
# # #<DaruLite::DataFrame:88342020 @name = 08cdaf4e-b154-4186-9084-e76dd191b2c9 @size = 2>
|
35
|
+
# # [:e, :one] [:e, :two]
|
36
|
+
# # [:bar] 18 26
|
37
|
+
# # [:foo] 10 12
|
38
|
+
def pivot_table(opts = {})
|
39
|
+
raise ArgumentError, 'Specify grouping index' if Array(opts[:index]).empty?
|
40
|
+
|
41
|
+
index = opts[:index]
|
42
|
+
vectors = opts[:vectors] || []
|
43
|
+
aggregate_function = opts[:agg] || :mean
|
44
|
+
values = prepare_pivot_values index, vectors, opts
|
45
|
+
raise IndexError, 'No numeric vectors to aggregate' if values.empty?
|
46
|
+
|
47
|
+
grouped = group_by(index)
|
48
|
+
return grouped.send(aggregate_function) if vectors.empty?
|
49
|
+
|
50
|
+
super_hash = make_pivot_hash grouped, vectors, values, aggregate_function
|
51
|
+
|
52
|
+
pivot_dataframe super_hash
|
53
|
+
end
|
54
|
+
|
55
|
+
private
|
56
|
+
|
57
|
+
def prepare_pivot_values(index, vectors, opts)
|
58
|
+
case opts[:values]
|
59
|
+
when nil # values not specified at all.
|
60
|
+
(@vectors.to_a - (index | vectors)) & numeric_vector_names
|
61
|
+
when Array # multiple values specified.
|
62
|
+
opts[:values]
|
63
|
+
else # single value specified.
|
64
|
+
[opts[:values]]
|
65
|
+
end
|
66
|
+
end
|
67
|
+
|
68
|
+
def make_pivot_hash(grouped, vectors, values, aggregate_function)
|
69
|
+
grouped.groups.transform_values { |_| {} }.tap do |super_hash|
|
70
|
+
values.each do |value|
|
71
|
+
grouped.groups.each do |group_name, row_numbers|
|
72
|
+
row_numbers.each do |num|
|
73
|
+
arry = [value, *vectors.map { |v| self[v][num] }]
|
74
|
+
sub_hash = super_hash[group_name]
|
75
|
+
sub_hash[arry] ||= []
|
76
|
+
|
77
|
+
sub_hash[arry] << self[value][num]
|
78
|
+
end
|
79
|
+
end
|
80
|
+
end
|
81
|
+
|
82
|
+
setup_pivot_aggregates super_hash, aggregate_function
|
83
|
+
end
|
84
|
+
end
|
85
|
+
|
86
|
+
def setup_pivot_aggregates(super_hash, aggregate_function)
|
87
|
+
super_hash.each_value do |sub_hash|
|
88
|
+
sub_hash.each do |group_name, aggregates|
|
89
|
+
sub_hash[group_name] = DaruLite::Vector.new(aggregates).send(aggregate_function)
|
90
|
+
end
|
91
|
+
end
|
92
|
+
end
|
93
|
+
|
94
|
+
def pivot_dataframe(super_hash)
|
95
|
+
df_index = DaruLite::MultiIndex.from_tuples super_hash.keys
|
96
|
+
df_vectors = DaruLite::MultiIndex.from_tuples super_hash.values.flat_map(&:keys).uniq
|
97
|
+
|
98
|
+
DaruLite::DataFrame.new({}, index: df_index, order: df_vectors).tap do |pivoted_dataframe|
|
99
|
+
super_hash.each do |row_index, sub_h|
|
100
|
+
sub_h.each do |vector_index, val|
|
101
|
+
pivoted_dataframe[vector_index][row_index] = val
|
102
|
+
end
|
103
|
+
end
|
104
|
+
end
|
105
|
+
end
|
106
|
+
end
|
107
|
+
end
|
108
|
+
end
|
@@ -0,0 +1,67 @@
|
|
1
|
+
module DaruLite
|
2
|
+
class DataFrame
|
3
|
+
module Queryable
|
4
|
+
# Check if a vector is present
|
5
|
+
def has_vector?(vector)
|
6
|
+
@vectors.include? vector
|
7
|
+
end
|
8
|
+
|
9
|
+
# Check if any of given values occur in the data frame
|
10
|
+
# @param [Array] values to check for
|
11
|
+
# @return [true, false] true if any of the given values occur in the
|
12
|
+
# dataframe, false otherwise
|
13
|
+
# @example
|
14
|
+
# df = DaruLite::DataFrame.new({
|
15
|
+
# a: [1, 2, 3, nil, Float::NAN, nil, 1, 7],
|
16
|
+
# b: [:a, :b, nil, Float::NAN, nil, 3, 5, 8],
|
17
|
+
# c: ['a', Float::NAN, 3, 4, 3, 5, nil, 7]
|
18
|
+
# }, index: 11..18)
|
19
|
+
# df.include_values? nil
|
20
|
+
# # => true
|
21
|
+
def include_values?(*values)
|
22
|
+
@data.any? { |vec| vec.include_values?(*values) }
|
23
|
+
end
|
24
|
+
|
25
|
+
# Works like Array#any?.
|
26
|
+
#
|
27
|
+
# @param [Symbol] axis (:vector) The axis to iterate over. Can be :vector or
|
28
|
+
# :row. A DaruLite::Vector object is yielded in the block.
|
29
|
+
# @example Using any?
|
30
|
+
# df = DaruLite::DataFrame.new({a: [1,2,3,4,5], b: ['a', 'b', 'c', 'd', 'e']})
|
31
|
+
# df.any?(:row) do |row|
|
32
|
+
# row[:a] < 3 and row[:b] == 'b'
|
33
|
+
# end #=> true
|
34
|
+
def any?(axis = :vector, &block)
  # Returns true as soon as the block is truthy for one yielded element:
  # each vector for :vector/:column, each row for :row.
  # Raises ArgumentError for any other axis.
  if %i[vector column].include?(axis)
    @data.any?(&block)
  elsif axis == :row
    # Delegate to the row enumerator exactly like #all? does, so both
    # predicates share one code path and a missing block is treated the
    # same way on the row axis as on the vector axis.
    each_row.any?(&block)
  else
    raise ArgumentError, "Unidentified axis #{axis}"
  end
end
|
46
|
+
|
47
|
+
# Works like Array#all?
|
48
|
+
#
|
49
|
+
# @param [Symbol] axis (:vector) The axis to iterate over. Can be :vector or
|
50
|
+
# :row. A DaruLite::Vector object is yielded in the block.
|
51
|
+
# @example Using all?
|
52
|
+
# df = DaruLite::DataFrame.new({a: [1,2,3,4,5], b: ['a', 'b', 'c', 'd', 'e']})
|
53
|
+
# df.all?(:row) do |row|
|
54
|
+
# row[:a] < 10
|
55
|
+
# end #=> true
|
56
|
+
def all?(axis = :vector, &block)
|
57
|
+
if %i[vector column].include?(axis)
|
58
|
+
@data.all?(&block)
|
59
|
+
elsif axis == :row
|
60
|
+
each_row.all?(&block)
|
61
|
+
else
|
62
|
+
raise ArgumentError, "Unidentified axis #{axis}"
|
63
|
+
end
|
64
|
+
end
|
65
|
+
end
|
66
|
+
end
|
67
|
+
end
|
@@ -0,0 +1,109 @@
|
|
1
|
+
module DaruLite
|
2
|
+
class DataFrame
|
3
|
+
module Setable
|
4
|
+
# Set rows by positions
|
5
|
+
# @param [Array<Integer>] positions positions of rows to set
|
6
|
+
# @param [Array, DaruLite::Vector] vector vector to be assigned
|
7
|
+
# @example
|
8
|
+
# df = DaruLite::DataFrame.new({
|
9
|
+
# a: [1, 2, 3],
|
10
|
+
# b: ['a', 'b', 'c']
|
11
|
+
# })
|
12
|
+
# df.set_row_at [0, 1], ['x', 'x']
|
13
|
+
# df
|
14
|
+
# #=> #<DaruLite::DataFrame(3x2)>
|
15
|
+
# # a b
|
16
|
+
# # 0 x x
|
17
|
+
# # 1 x x
|
18
|
+
# # 2 3 c
|
19
|
+
def set_row_at(positions, vector)
|
20
|
+
validate_positions(*positions, nrows)
|
21
|
+
vector =
|
22
|
+
if vector.is_a? DaruLite::Vector
|
23
|
+
vector.reindex @vectors
|
24
|
+
else
|
25
|
+
DaruLite::Vector.new vector
|
26
|
+
end
|
27
|
+
|
28
|
+
raise SizeError, 'Vector length should match row length' if
|
29
|
+
vector.size != @vectors.size
|
30
|
+
|
31
|
+
@data.each_with_index do |vec, pos|
|
32
|
+
vec.set_at(positions, vector.at(pos))
|
33
|
+
end
|
34
|
+
@index = @data[0].index
|
35
|
+
set_size
|
36
|
+
end
|
37
|
+
|
38
|
+
# Set vectors by positions
|
39
|
+
# @param [Array<Integer>] positions positions of vectors to set
|
40
|
+
# @param [Array, DaruLite::Vector] vector vector to be assigned
|
41
|
+
# @example
|
42
|
+
# df = DaruLite::DataFrame.new({
|
43
|
+
# a: [1, 2, 3],
|
44
|
+
# b: ['a', 'b', 'c']
|
45
|
+
# })
|
46
|
+
# df.set_at [0], ['x', 'y', 'z']
|
47
|
+
# df
|
48
|
+
# #=> #<DaruLite::DataFrame(3x2)>
|
49
|
+
# # a b
|
50
|
+
# # 0 x a
|
51
|
+
# # 1 y b
|
52
|
+
# # 2 z c
|
53
|
+
def set_at(positions, vector)
  # A trailing :row marker redirects the assignment to rows. Slice the marker
  # off instead of popping it so the caller's positions array is left
  # untouched (and frozen arrays are accepted).
  return set_row_at(positions[0...-1], vector) if positions.last == :row

  validate_positions(*positions, ncols)

  # Align an incoming DaruLite::Vector with this frame's index; wrap anything
  # else in a new vector.
  vector =
    if vector.is_a? DaruLite::Vector
      vector.reindex @index
    else
      DaruLite::Vector.new vector
    end

  raise SizeError, 'Vector length should match index length' if
    vector.size != @index.size

  positions.each { |pos| @data[pos] = vector }
end
|
72
|
+
|
73
|
+
# Insert a new row/vector of the specified name or modify a previous row.
|
74
|
+
# Instead of using this method directly, use df.row[:a] = [1,2,3] to set/create
|
75
|
+
# a row ':a' to [1,2,3], or df.vector[:vec] = [1,2,3] for vectors.
|
76
|
+
#
|
77
|
+
# In case a DaruLite::Vector is specified after the equals sign, the indexes
|
78
|
+
# of the vector will be matched against the row/vector indexes of the DataFrame
|
79
|
+
# before an insertion is performed. Unmatched indexes will be set to nil.
|
80
|
+
def []=(*args)
|
81
|
+
vector = args.pop
|
82
|
+
axis = extract_axis(args)
|
83
|
+
names = args
|
84
|
+
|
85
|
+
dispatch_to_axis axis, :insert_or_modify, names, vector
|
86
|
+
end
|
87
|
+
|
88
|
+
def add_row(row, index = nil)
|
89
|
+
self.row[*(index || @size)] = row
|
90
|
+
end
|
91
|
+
|
92
|
+
def add_vector(n, vector)
|
93
|
+
self[n] = vector
|
94
|
+
end
|
95
|
+
|
96
|
+
def insert_vector(n, name, source)
  # Inserts a new vector called +name+, built from the Array +source+, at
  # position +n+ in the vector order.
  # Raises ArgumentError when +source+ is not an Array.
  raise ArgumentError, 'source must be an Array' unless source.is_a? Array

  # Label the new vector with the name it is inserted under; previously it
  # was given @name instead of the +name+ argument.
  vector = DaruLite::Vector.new(source, index: @index, name: name)
  @data << vector
  @vectors = @vectors.add name

  # The new name was appended last; move it to position n and re-order.
  ordr = @vectors.dup.to_a
  elmnt = ordr.pop
  ordr.insert n, elmnt
  self.order = ordr
end
|
107
|
+
end
|
108
|
+
end
|
109
|
+
end
|
@@ -0,0 +1,241 @@
|
|
1
|
+
module DaruLite
|
2
|
+
class DataFrame
|
3
|
+
module Sortable
|
4
|
+
# Reorder the vectors in a dataframe
|
5
|
+
# @param [Array] order_array new order of the vectors
|
6
|
+
# @example
|
7
|
+
# df = DaruLite::DataFrame.new({
|
8
|
+
# a: [1, 2, 3],
|
9
|
+
# b: [4, 5, 6]
|
10
|
+
# }, order: [:a, :b])
|
11
|
+
# df.order = [:b, :a]
|
12
|
+
# df
|
13
|
+
# # => #<DaruLite::DataFrame(3x2)>
|
14
|
+
# # b a
|
15
|
+
# # 0 4 1
|
16
|
+
# # 1 5 2
|
17
|
+
# # 2 6 3
|
18
|
+
def order=(order_array)
|
19
|
+
raise ArgumentError, 'Invalid order' unless order_array.tally == vectors.to_a.tally
|
20
|
+
|
21
|
+
initialize(to_h, order: order_array)
|
22
|
+
end
|
23
|
+
|
24
|
+
# Returns the dataframe with its vectors rotated, so that the vector at position count is now
|
25
|
+
# the first vector of the dataframe.
|
26
|
+
# If the dataframe has only one vector, it is returned without any change.
|
27
|
+
# @param count [Integer] the vector at position count will be the first vector of the dataframe.
|
28
|
+
# @example
|
29
|
+
# df = DaruLite::DataFrame.new({
|
30
|
+
# a: [1, 2, 3],
|
31
|
+
# b: [4, 5, 6],
|
32
|
+
# total: [5, 7, 9],
|
33
|
+
# })
|
34
|
+
# df.rotate_vectors(-1)
|
35
|
+
# df
|
36
|
+
# # => #<DaruLite::DataFrame(3x3)>
|
37
|
+
# # total b a
|
38
|
+
# # 0 5 4 1
|
39
|
+
# # 1 7 5 2
|
40
|
+
# # 2 9 6 3
|
41
|
+
def rotate_vectors(count = -1)
|
42
|
+
return self unless vectors.many?
|
43
|
+
|
44
|
+
self.order = vectors.to_a.rotate(count)
|
45
|
+
self
|
46
|
+
end
|
47
|
+
|
48
|
+
# Sorts a dataframe (ascending/descending) in the given priority sequence of
|
49
|
+
# vectors, with or without a block.
|
50
|
+
#
|
51
|
+
# @param vector_order [Array] The order of vector names in which the DataFrame
|
52
|
+
# should be sorted.
|
53
|
+
# @param opts [Hash] opts The options to sort with.
|
54
|
+
# @option opts [TrueClass,FalseClass,Array] :ascending (true) Sort in ascending
|
55
|
+
# or descending order. Specify Array corresponding to *order* for multiple
|
56
|
+
# sort orders.
|
57
|
+
# @option opts [Hash] :by (lambda{|a| a }) Specify attributes of objects to
|
58
|
+
# be used for sorting, for each vector name in *order* as a hash of
|
59
|
+
# vector name and lambda expressions. In case a lambda for a vector is not
|
60
|
+
# specified, the default will be used.
|
61
|
+
# @option opts [TrueClass,FalseClass,Array] :handle_nils (false) Handle nils
|
62
|
+
# automatically or not when a block is provided.
|
63
|
+
# If set to True, nils will appear at top after sorting.
|
64
|
+
#
|
65
|
+
# @example Sort a dataframe with a vector sequence.
|
66
|
+
#
|
67
|
+
#
|
68
|
+
# df = DaruLite::DataFrame.new({a: [1,2,1,2,3], b: [5,4,3,2,1]})
|
69
|
+
#
|
70
|
+
# df.sort [:a, :b]
|
71
|
+
# # =>
|
72
|
+
# # <DaruLite::DataFrame:30604000 @name = d6a9294e-2c09-418f-b646-aa9244653444 @size = 5>
|
73
|
+
# # a b
|
74
|
+
# # 2 1 3
|
75
|
+
# # 0 1 5
|
76
|
+
# # 3 2 2
|
77
|
+
# # 1 2 4
|
78
|
+
# # 4 3 1
|
79
|
+
#
|
80
|
+
# @example Sort a dataframe without a block. Here nils will be handled automatically.
|
81
|
+
#
|
82
|
+
# df = DaruLite::DataFrame.new({a: [-3,nil,-1,nil,5], b: [4,3,2,1,4]})
|
83
|
+
#
|
84
|
+
# df.sort([:a])
|
85
|
+
# # =>
|
86
|
+
# # <DaruLite::DataFrame:14810920 @name = c07fb5c7-2201-458d-b679-6a1f7ebfe49f @size = 5>
|
87
|
+
# # a b
|
88
|
+
# # 1 nil 3
|
89
|
+
# # 3 nil 1
|
90
|
+
# # 0 -3 4
|
91
|
+
# # 2 -1 2
|
92
|
+
# # 4 5 4
|
93
|
+
#
|
94
|
+
# @example Sort a dataframe with a block with nils handled automatically.
|
95
|
+
#
|
96
|
+
# df = DaruLite::DataFrame.new({a: [nil,-1,1,nil,-1,1], b: ['aaa','aa',nil,'baaa','x',nil] })
|
97
|
+
#
|
98
|
+
# df.sort [:b], by: {b: lambda { |a| a.length } }
|
99
|
+
# # NoMethodError: undefined method `length' for nil:NilClass
|
100
|
+
# # from (pry):8:in `block in __pry__'
|
101
|
+
#
|
102
|
+
# df.sort [:b], by: {b: lambda { |a| a.length } }, handle_nils: true
|
103
|
+
#
|
104
|
+
# # =>
|
105
|
+
# # <DaruLite::DataFrame:28469540 @name = 5f986508-556f-468b-be0c-88cc3534445c @size = 6>
|
106
|
+
# # a b
|
107
|
+
# # 2 1 nil
|
108
|
+
# # 5 1 nil
|
109
|
+
# # 4 -1 x
|
110
|
+
# # 1 -1 aa
|
111
|
+
# # 0 nil aaa
|
112
|
+
# # 3 nil baaa
|
113
|
+
#
|
114
|
+
# @example Sort a dataframe with a block with nils handled manually.
|
115
|
+
#
|
116
|
+
# df = DaruLite::DataFrame.new({a: [nil,-1,1,nil,-1,1], b: ['aaa','aa',nil,'baaa','x',nil] })
|
117
|
+
#
|
118
|
+
# # To print nils at the bottom one can use lambda { |a| (a.nil?)?[1]:[0,a.length] }
|
119
|
+
# df.sort [:b], by: {b: lambda { |a| (a.nil?)?[1]:[0,a.length] } }, handle_nils: true
|
120
|
+
#
|
121
|
+
# # =>
|
122
|
+
# #<DaruLite::DataFrame:22214180 @name = cd7703c7-1dca-4560-840b-5ea51a852ef9 @size = 6>
|
123
|
+
# # a b
|
124
|
+
# # 4 -1 x
|
125
|
+
# # 1 -1 aa
|
126
|
+
# # 0 nil aaa
|
127
|
+
# # 3 nil baaa
|
128
|
+
# # 2 1 nil
|
129
|
+
# # 5 1 nil
|
130
|
+
|
131
|
+
def sort!(vector_order, opts = {})
  # Sorts the receiver in place by the given vector names and returns self.
  # Raises ArgumentError when vector_order is empty.
  raise ArgumentError, 'Required atleast one vector name' if vector_order.empty?

  # To enable sorting with categorical data, map categories to integers
  # preserving their order. The originals are kept so they can be put back.
  old = convert_categorical_vectors vector_order

  begin
    block = sort_prepare_block vector_order, opts
    order = @index.size.times.sort(&block)
    new_index = @index.reorder order
  ensure
    # Always reverse the categorical-to-integer mapping, even when option
    # validation or a comparison raises; otherwise the receiver would be
    # left holding the temporary integer vectors.
    restore_categorical_vectors old
  end

  @data.each { |vector| vector.reorder! order }
  self.index = new_index

  self
end
|
153
|
+
|
154
|
+
# Non-destructive version of #sort!
|
155
|
+
def sort(vector_order, opts = {})
|
156
|
+
dup.sort! vector_order, opts
|
157
|
+
end
|
158
|
+
|
159
|
+
private
|
160
|
+
|
161
|
+
def convert_categorical_vectors(names)
|
162
|
+
names.filter_map do |n|
|
163
|
+
next unless self[n].category?
|
164
|
+
|
165
|
+
old = [n, self[n]]
|
166
|
+
self[n] = DaruLite::Vector.new(self[n].to_ints)
|
167
|
+
old
|
168
|
+
end
|
169
|
+
end
|
170
|
+
|
171
|
+
def restore_categorical_vectors(old)
|
172
|
+
old.each { |name, vector| self[name] = vector }
|
173
|
+
end
|
174
|
+
|
175
|
+
def sort_build_row(vector_locs, by_blocks, ascending, handle_nils, r1, r2) # rubocop:disable Metrics/ParameterLists
|
176
|
+
# Create an array to be used for comparison of two rows in sorting
|
177
|
+
vector_locs
|
178
|
+
.zip(by_blocks, ascending, handle_nils)
|
179
|
+
.map do |vector_loc, by, asc, handle_nil|
|
180
|
+
value = @data[vector_loc].data[asc ? r1 : r2]
|
181
|
+
|
182
|
+
if by
|
183
|
+
value = begin
|
184
|
+
by.call(value)
|
185
|
+
rescue StandardError
|
186
|
+
nil
|
187
|
+
end
|
188
|
+
end
|
189
|
+
|
190
|
+
sort_handle_nils value, asc, handle_nil || !by
|
191
|
+
end
|
192
|
+
end
|
193
|
+
|
194
|
+
def sort_handle_nils(value, asc, handle_nil)
|
195
|
+
if !handle_nil
|
196
|
+
value
|
197
|
+
elsif asc
|
198
|
+
[value.nil? ? 0 : 1, value]
|
199
|
+
else
|
200
|
+
[value.nil? ? 1 : 0, value]
|
201
|
+
end
|
202
|
+
end
|
203
|
+
|
204
|
+
def sort_coerce_boolean(opts, symbol, default, size)
|
205
|
+
val = opts[symbol]
|
206
|
+
case val
|
207
|
+
when true, false
|
208
|
+
Array.new(size, val)
|
209
|
+
when nil
|
210
|
+
Array.new(size, default)
|
211
|
+
when Array
|
212
|
+
raise ArgumentError, "Specify same number of vector names and #{symbol}" if
|
213
|
+
size != val.size
|
214
|
+
|
215
|
+
val
|
216
|
+
else
|
217
|
+
raise ArgumentError, "Can't coerce #{symbol} from #{val.class} to boolean option"
|
218
|
+
end
|
219
|
+
end
|
220
|
+
|
221
|
+
def sort_prepare_block(vector_order, opts)
|
222
|
+
ascending = sort_coerce_boolean opts, :ascending, true, vector_order.size
|
223
|
+
handle_nils = sort_coerce_boolean opts, :handle_nils, false, vector_order.size
|
224
|
+
|
225
|
+
by_blocks = vector_order.map { |v| (opts[:by] || {})[v] }
|
226
|
+
vector_locs = vector_order.map { |v| @vectors[v] }
|
227
|
+
|
228
|
+
lambda do |index1, index2|
|
229
|
+
# Build left and right array to compare two rows
|
230
|
+
left = sort_build_row vector_locs, by_blocks, ascending, handle_nils, index1, index2
|
231
|
+
right = sort_build_row vector_locs, by_blocks, ascending, handle_nils, index2, index1
|
232
|
+
|
233
|
+
# Resolve conflict by Index if all attributes are same
|
234
|
+
left << index1
|
235
|
+
right << index2
|
236
|
+
left <=> right
|
237
|
+
end
|
238
|
+
end
|
239
|
+
end
|
240
|
+
end
|
241
|
+
end
|