daru_lite 0.1.1 → 0.1.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.rubocop_todo.yml +35 -33
- data/lib/daru_lite/data_frame/aggregatable.rb +165 -0
- data/lib/daru_lite/data_frame/calculatable.rb +140 -0
- data/lib/daru_lite/data_frame/convertible.rb +107 -0
- data/lib/daru_lite/data_frame/duplicatable.rb +64 -0
- data/lib/daru_lite/data_frame/fetchable.rb +301 -0
- data/lib/daru_lite/data_frame/filterable.rb +144 -0
- data/lib/daru_lite/data_frame/i_o_able.rb +179 -0
- data/lib/daru_lite/data_frame/indexable.rb +168 -0
- data/lib/daru_lite/data_frame/iterable.rb +339 -0
- data/lib/daru_lite/data_frame/joinable.rb +152 -0
- data/lib/daru_lite/data_frame/missable.rb +75 -0
- data/lib/daru_lite/data_frame/pivotable.rb +108 -0
- data/lib/daru_lite/data_frame/queryable.rb +67 -0
- data/lib/daru_lite/data_frame/setable.rb +109 -0
- data/lib/daru_lite/data_frame/sortable.rb +241 -0
- data/lib/daru_lite/dataframe.rb +138 -2353
- data/lib/daru_lite/index/index.rb +13 -0
- data/lib/daru_lite/maths/statistics/vector.rb +1 -1
- data/lib/daru_lite/vector/aggregatable.rb +9 -0
- data/lib/daru_lite/vector/calculatable.rb +78 -0
- data/lib/daru_lite/vector/convertible.rb +77 -0
- data/lib/daru_lite/vector/duplicatable.rb +17 -0
- data/lib/daru_lite/vector/fetchable.rb +175 -0
- data/lib/daru_lite/vector/filterable.rb +128 -0
- data/lib/daru_lite/vector/indexable.rb +77 -0
- data/lib/daru_lite/vector/iterable.rb +95 -0
- data/lib/daru_lite/vector/joinable.rb +17 -0
- data/lib/daru_lite/vector/missable.rb +124 -0
- data/lib/daru_lite/vector/queryable.rb +45 -0
- data/lib/daru_lite/vector/setable.rb +47 -0
- data/lib/daru_lite/vector/sortable.rb +113 -0
- data/lib/daru_lite/vector.rb +36 -932
- data/lib/daru_lite/version.rb +1 -1
- data/spec/data_frame/aggregatable_example.rb +65 -0
- data/spec/data_frame/buildable_example.rb +109 -0
- data/spec/data_frame/calculatable_example.rb +135 -0
- data/spec/data_frame/convertible_example.rb +180 -0
- data/spec/data_frame/duplicatable_example.rb +111 -0
- data/spec/data_frame/fetchable_example.rb +476 -0
- data/spec/data_frame/filterable_example.rb +250 -0
- data/spec/data_frame/indexable_example.rb +221 -0
- data/spec/data_frame/iterable_example.rb +465 -0
- data/spec/data_frame/joinable_example.rb +106 -0
- data/spec/data_frame/missable_example.rb +47 -0
- data/spec/data_frame/pivotable_example.rb +297 -0
- data/spec/data_frame/queryable_example.rb +92 -0
- data/spec/data_frame/setable_example.rb +482 -0
- data/spec/data_frame/sortable_example.rb +350 -0
- data/spec/dataframe_spec.rb +181 -3289
- data/spec/index/index_spec.rb +8 -0
- data/spec/vector/aggregatable_example.rb +27 -0
- data/spec/vector/calculatable_example.rb +82 -0
- data/spec/vector/convertible_example.rb +126 -0
- data/spec/vector/duplicatable_example.rb +48 -0
- data/spec/vector/fetchable_example.rb +463 -0
- data/spec/vector/filterable_example.rb +165 -0
- data/spec/vector/indexable_example.rb +201 -0
- data/spec/vector/iterable_example.rb +111 -0
- data/spec/vector/joinable_example.rb +25 -0
- data/spec/vector/missable_example.rb +88 -0
- data/spec/vector/queryable_example.rb +91 -0
- data/spec/vector/setable_example.rb +300 -0
- data/spec/vector/sortable_example.rb +242 -0
- data/spec/vector_spec.rb +111 -1805
- metadata +86 -2
@@ -0,0 +1,75 @@
|
|
1
|
+
module DaruLite
|
2
|
+
class DataFrame
|
3
|
+
module Missable
|
4
|
+
extend Gem::Deprecate
|
5
|
+
|
6
|
+
# Rolling fillna
|
7
|
+
# Replace all Float::NAN and nil values with the preceding or following value.
|
8
|
+
#
|
9
|
+
# @param direction [Symbol] (:forward, :backward) whether replacement value is preceding or following
|
10
|
+
#
|
11
|
+
# @example
|
12
|
+
# df = DaruLite::DataFrame.new({
|
13
|
+
# a: [1, 2, 3, nil, Float::NAN, nil, 1, 7],
|
14
|
+
# b: [:a, :b, nil, Float::NAN, nil, 3, 5, nil],
|
15
|
+
# c: ['a', Float::NAN, 3, 4, 3, 5, nil, 7]
|
16
|
+
# })
|
17
|
+
#
|
18
|
+
# => #<DaruLite::DataFrame(8x3)>
|
19
|
+
# a b c
|
20
|
+
# 0 1 a a
|
21
|
+
# 1 2 b NaN
|
22
|
+
# 2 3 nil 3
|
23
|
+
# 3 nil NaN 4
|
24
|
+
# 4 NaN nil 3
|
25
|
+
# 5 nil 3 5
|
26
|
+
# 6 1 5 nil
|
27
|
+
# 7 7 nil 7
|
28
|
+
#
|
29
|
+
# 2.3.3 :068 > df.rolling_fillna(:forward)
|
30
|
+
# => #<DaruLite::DataFrame(8x3)>
|
31
|
+
# a b c
|
32
|
+
# 0 1 a a
|
33
|
+
# 1 2 b a
|
34
|
+
# 2 3 b 3
|
35
|
+
# 3 3 b 4
|
36
|
+
# 4 3 b 3
|
37
|
+
# 5 3 3 5
|
38
|
+
# 6 1 5 5
|
39
|
+
# 7 7 5 7
|
40
|
+
#
|
41
|
+
def rolling_fillna!(direction = :forward)
  # Each column performs its own in-place rolling replacement.
  @data.each do |column|
    column.rolling_fillna!(direction)
  end

  self
end
|
45
|
+
|
46
|
+
# Non-destructive counterpart of #rolling_fillna!: the fill is applied to a
# duplicate of the receiver and that duplicate's result is returned.
def rolling_fillna(direction = :forward)
  copy = dup
  copy.rolling_fillna!(direction)
end
|
49
|
+
|
50
|
+
# Return a vector with the number of missing values in each row.
|
51
|
+
#
|
52
|
+
# == Arguments
|
53
|
+
#
|
54
|
+
# * +missing_values+ - An Array of the values that should be
|
55
|
+
# treated as 'missing'. The default missing value is *nil*.
|
56
|
+
def missing_values_rows(missing_values = [nil])
  # One count per row: how many positions in that row hold a "missing" value.
  counts = each_row.map { |row| row.indexes(*missing_values).size }

  DaruLite::Vector.new(counts, index: @index, name: "#{@name}_missing_rows")
end
|
63
|
+
|
64
|
+
# TODO: remove next version
|
65
|
+
alias vector_missing_values missing_values_rows
|
66
|
+
|
67
|
+
# True when at least one column contains one of DaruLite::MISSING_VALUES.
def has_missing_data?
  missing = DaruLite::MISSING_VALUES
  @data.any? { |column| column.include_values?(*missing) }
end
|
70
|
+
alias flawed? has_missing_data?
|
71
|
+
deprecate :has_missing_data?, :include_values?, 2016, 10
|
72
|
+
deprecate :flawed?, :include_values?, 2016, 10
|
73
|
+
end
|
74
|
+
end
|
75
|
+
end
|
@@ -0,0 +1,108 @@
|
|
1
|
+
module DaruLite
|
2
|
+
class DataFrame
|
3
|
+
module Pivotable
|
4
|
+
# Pivots a data frame on specified vectors and applies an aggregate function
|
5
|
+
# to quickly generate a summary.
|
6
|
+
#
|
7
|
+
# == Options
|
8
|
+
#
|
9
|
+
# +:index+ - Keys to group by on the pivot table row index. Pass vector names
|
10
|
+
# contained in an Array.
|
11
|
+
#
|
12
|
+
# +:vectors+ - Keys to group by on the pivot table column index. Pass vector
|
13
|
+
# names contained in an Array.
|
14
|
+
#
|
15
|
+
# +:agg+ - Function to aggregate the grouped values. Default to *:mean*. Can
|
16
|
+
# use any of the statistics functions applicable on Vectors that can be found in
|
17
|
+
# the DaruLite::Statistics::Vector module.
|
18
|
+
#
|
19
|
+
# +:values+ - Columns to aggregate. Will consider all numeric columns not
|
20
|
+
# specified in *:index* or *:vectors*. Optional.
|
21
|
+
#
|
22
|
+
# == Usage
|
23
|
+
#
|
24
|
+
# df = DaruLite::DataFrame.new({
|
25
|
+
# a: ['foo' , 'foo', 'foo', 'foo', 'foo', 'bar', 'bar', 'bar', 'bar'],
|
26
|
+
# b: ['one' , 'one', 'one', 'two', 'two', 'one', 'one', 'two', 'two'],
|
27
|
+
# c: ['small','large','large','small','small','large','small','large','small'],
|
28
|
+
# d: [1,2,2,3,3,4,5,6,7],
|
29
|
+
# e: [2,4,4,6,6,8,10,12,14]
|
30
|
+
# })
|
31
|
+
# df.pivot_table(index: [:a], vectors: [:b], agg: :sum, values: :e)
|
32
|
+
#
|
33
|
+
# #=>
|
34
|
+
# # #<DaruLite::DataFrame:88342020 @name = 08cdaf4e-b154-4186-9084-e76dd191b2c9 @size = 2>
|
35
|
+
# # [:e, :one] [:e, :two]
|
36
|
+
# # [:bar] 18 26
|
37
|
+
# # [:foo] 10 12
|
38
|
+
def pivot_table(opts = {})
  # Normalise once so a single name (index: :a, vectors: :b) behaves exactly
  # like a one-element Array. Previously the guard accepted a scalar index
  # but the raw scalar was passed on, which broke the default value
  # selection (`index | vectors`) and `vectors.map`.
  index = Array(opts[:index])
  raise ArgumentError, 'Specify grouping index' if index.empty?

  vectors = Array(opts[:vectors])
  aggregate_function = opts[:agg] || :mean
  values = prepare_pivot_values index, vectors, opts
  raise IndexError, 'No numeric vectors to aggregate' if values.empty?

  grouped = group_by(index)
  # Without column keys the pivot is just the aggregated group-by.
  return grouped.send(aggregate_function) if vectors.empty?

  super_hash = make_pivot_hash grouped, vectors, values, aggregate_function

  pivot_dataframe super_hash
end
|
54
|
+
|
55
|
+
private
|
56
|
+
|
57
|
+
# Resolves which columns get aggregated: an explicit :values option wins
# (wrapped in an Array when a single name is given); otherwise every numeric
# vector that is not already used as a row or column key.
def prepare_pivot_values(index, vectors, opts)
  requested = opts[:values]

  if requested.nil?
    (@vectors.to_a - (index | vectors)) & numeric_vector_names
  elsif requested.is_a?(Array)
    requested
  else
    [requested]
  end
end
|
67
|
+
|
68
|
+
# Builds { group_name => { [value, *column_keys] => aggregate } }.
# Raw cell values are first collected per key and then reduced in place by
# setup_pivot_aggregates.
def make_pivot_hash(grouped, vectors, values, aggregate_function)
  pivot = grouped.groups.transform_values { |_| {} }

  values.each do |value|
    grouped.groups.each do |group_name, row_numbers|
      bucket = pivot[group_name]

      row_numbers.each do |num|
        # Column key: the aggregated vector name followed by this row's
        # entries in each of the pivot "vectors".
        key = [value, *vectors.map { |v| self[v][num] }]
        bucket[key] ||= []
        bucket[key] << self[value][num]
      end
    end
  end

  setup_pivot_aggregates pivot, aggregate_function
  pivot
end
|
85
|
+
|
86
|
+
# Collapses every collected Array of cell values into a single aggregate by
# wrapping it in a DaruLite::Vector and sending it +aggregate_function+.
# The nested hashes are updated in place.
def setup_pivot_aggregates(super_hash, aggregate_function)
  super_hash.each_value do |sub_hash|
    sub_hash.transform_values! do |aggregates|
      DaruLite::Vector.new(aggregates).send(aggregate_function)
    end
  end
end
|
93
|
+
|
94
|
+
# Turns the nested pivot hash into a DataFrame: outer keys become the row
# MultiIndex, the union of inner keys becomes the column MultiIndex, and each
# aggregate is written into its [column][row] cell.
def pivot_dataframe(super_hash)
  row_index = DaruLite::MultiIndex.from_tuples(super_hash.keys)
  column_index = DaruLite::MultiIndex.from_tuples(super_hash.values.flat_map(&:keys).uniq)

  pivoted = DaruLite::DataFrame.new({}, index: row_index, order: column_index)
  super_hash.each do |row_key, cells|
    cells.each do |column_key, aggregate|
      pivoted[column_key][row_key] = aggregate
    end
  end

  pivoted
end
|
106
|
+
end
|
107
|
+
end
|
108
|
+
end
|
@@ -0,0 +1,67 @@
|
|
1
|
+
module DaruLite
|
2
|
+
class DataFrame
|
3
|
+
module Queryable
|
4
|
+
# Check if a vector is present
|
5
|
+
def has_vector?(vector)
  # Membership is decided entirely by the vectors index.
  @vectors.include?(vector)
end
|
8
|
+
|
9
|
+
# Check if any of given values occur in the data frame
|
10
|
+
# @param [Array] values to check for
|
11
|
+
# @return [true, false] true if any of the given values occur in the
|
12
|
+
# dataframe, false otherwise
|
13
|
+
# @example
|
14
|
+
# df = DaruLite::DataFrame.new({
|
15
|
+
# a: [1, 2, 3, nil, Float::NAN, nil, 1, 7],
|
16
|
+
# b: [:a, :b, nil, Float::NAN, nil, 3, 5, 8],
|
17
|
+
# c: ['a', Float::NAN, 3, 4, 3, 5, nil, 7]
|
18
|
+
# }, index: 11..18)
|
19
|
+
# df.include_values? nil
|
20
|
+
# # => true
|
21
|
+
def include_values?(*values)
  # Stops at the first column that reports a hit.
  @data.any? do |column|
    column.include_values?(*values)
  end
end
|
24
|
+
|
25
|
+
# Works like Array#any?.
|
26
|
+
#
|
27
|
+
# @param [Symbol] axis (:vector) The axis to iterate over. Can be :vector or
|
28
|
+
# :row. A DaruLite::Vector object is yielded in the block.
|
29
|
+
# @example Using any?
|
30
|
+
# df = DaruLite::DataFrame.new({a: [1,2,3,4,5], b: ['a', 'b', 'c', 'd', 'e']})
|
31
|
+
# df.any?(:row) do |row|
|
32
|
+
# row[:a] < 3 and row[:b] == 'b'
|
33
|
+
# end #=> true
|
34
|
+
def any?(axis = :vector, &block)
  case axis
  when :vector, :column
    @data.any?(&block)
  when :row
    # Delegate to Enumerable#any? exactly like #all? does. The previous
    # hand-rolled loop used a bare `yield`, which raised LocalJumpError when
    # no block was given even though the vector axis accepts that call.
    each_row.any?(&block)
  else
    raise ArgumentError, "Unidentified axis #{axis}"
  end
end
|
46
|
+
|
47
|
+
# Works like Array#all?
|
48
|
+
#
|
49
|
+
# @param [Symbol] axis (:vector) The axis to iterate over. Can be :vector or
|
50
|
+
# :row. A DaruLite::Vector object is yielded in the block.
|
51
|
+
# @example Using all?
|
52
|
+
# df = DaruLite::DataFrame.new({a: [1,2,3,4,5], b: ['a', 'b', 'c', 'd', 'e']})
|
53
|
+
# df.all?(:row) do |row|
|
54
|
+
# row[:a] < 10
|
55
|
+
# end #=> true
|
56
|
+
def all?(axis = :vector, &block)
  # :column is accepted as a synonym for :vector.
  return @data.all?(&block) if axis == :vector || axis == :column
  return each_row.all?(&block) if axis == :row

  raise ArgumentError, "Unidentified axis #{axis}"
end
|
65
|
+
end
|
66
|
+
end
|
67
|
+
end
|
@@ -0,0 +1,109 @@
|
|
1
|
+
module DaruLite
|
2
|
+
class DataFrame
|
3
|
+
module Setable
|
4
|
+
# Set rows by positions
|
5
|
+
# @param [Array<Integer>] positions positions of rows to set
|
6
|
+
# @param [Array, DaruLite::Vector] vector vector to be assigned
|
7
|
+
# @example
|
8
|
+
# df = DaruLite::DataFrame.new({
|
9
|
+
# a: [1, 2, 3],
|
10
|
+
# b: ['a', 'b', 'c']
|
11
|
+
# })
|
12
|
+
# df.set_row_at [0, 1], ['x', 'x']
|
13
|
+
# df
|
14
|
+
# #=> #<DaruLite::DataFrame(3x2)>
|
15
|
+
# # a b
|
16
|
+
# # 0 x x
|
17
|
+
# # 1 x x
|
18
|
+
# # 2 3 c
|
19
|
+
def set_row_at(positions, vector)
  validate_positions(*positions, nrows)

  # A DaruLite::Vector is aligned to the frame's vectors; anything else is
  # wrapped as-is.
  row = vector.is_a?(DaruLite::Vector) ? vector.reindex(@vectors) : DaruLite::Vector.new(vector)
  raise SizeError, 'Vector length should match row length' unless row.size == @vectors.size

  # Element k of the row goes into column k at every requested position.
  @data.each_with_index do |column, column_pos|
    column.set_at(positions, row.at(column_pos))
  end

  @index = @data[0].index
  set_size
end
|
37
|
+
|
38
|
+
# Set vectors by positions
|
39
|
+
# @param [Array<Integer>] positions positions of vectors to set
|
40
|
+
# @param [Array, DaruLite::Vector] vector vector to be assigned
|
41
|
+
# @example
|
42
|
+
# df = DaruLite::DataFrame.new({
|
43
|
+
# a: [1, 2, 3],
|
44
|
+
# b: ['a', 'b', 'c']
|
45
|
+
# })
|
46
|
+
# df.set_at [0], ['x', 'y', 'z']
|
47
|
+
# df
|
48
|
+
# #=> #<DaruLite::DataFrame(3x2)>
|
49
|
+
# # a b
|
50
|
+
# # 0 x a
|
51
|
+
# # 1 y b
|
52
|
+
# # 2 z c
|
53
|
+
def set_at(positions, vector)
  # A trailing :row marker redirects to row assignment. The marker is sliced
  # off rather than popped so the caller's array is left untouched (the old
  # `positions.pop` mutated it and raised on a frozen array).
  return set_row_at(positions[0...-1], vector) if positions.last == :row

  validate_positions(*positions, ncols)

  # A DaruLite::Vector is aligned to the frame's index; anything else is
  # wrapped as-is.
  column = vector.is_a?(DaruLite::Vector) ? vector.reindex(@index) : DaruLite::Vector.new(vector)
  raise SizeError, 'Vector length should match index length' if column.size != @index.size

  positions.each { |pos| @data[pos] = column }
end
|
72
|
+
|
73
|
+
# Insert a new row/vector of the specified name or modify a previous row.
|
74
|
+
# Instead of using this method directly, use df.row[:a] = [1,2,3] to set/create
|
75
|
+
# a row ':a' to [1,2,3], or df.vector[:vec] = [1,2,3] for vectors.
|
76
|
+
#
|
77
|
+
# In case a DaruLite::Vector is specified after the equals sign, the indexes
|
78
|
+
# of the vector will be matched against the row/vector indexes of the DataFrame
|
79
|
+
# before an insertion is performed. Unmatched indexes will be set to nil.
|
80
|
+
def []=(*args)
  # The last argument is always the value being assigned; what precedes it
  # is the list of names, which is handed to extract_axis to determine the
  # axis before dispatching.
  *names, vector = args
  axis = extract_axis(names)

  dispatch_to_axis axis, :insert_or_modify, names, vector
end
|
87
|
+
|
88
|
+
# Assigns +row+ through the row accessor under +index+, or under the
# current @size when no index is given.
def add_row(row, index = nil)
  label = index || @size
  self.row[*label] = row
end
|
91
|
+
|
92
|
+
# Named wrapper around `self[n] = vector`; returns the assigned vector.
def add_vector(n, vector)
  self[n] = vector
  vector
end
|
95
|
+
|
96
|
+
# Inserts +source+ as a new vector called +name+ at column position +n+.
#
# @param n [Integer] target position (negative values count from the end,
#   as with Array#insert)
# @param name [Object] name of the new vector
# @param source [Array] values of the new vector
# @raise [ArgumentError] if +source+ is not an Array, or +n+ lies beyond
#   the end of the current vectors
def insert_vector(n, name, source)
  raise ArgumentError, 'source must be an Array' unless source.is_a? Array

  # Work out the final column order before touching any state, so a bad
  # position is rejected while the frame is still consistent. Array#insert
  # pads with nil when n is past the end, which shows up as extra elements.
  current = @vectors.to_a
  new_order = current.dup.insert(n, name)
  raise ArgumentError, 'Invalid order' unless new_order.size == current.size + 1

  # The vector carries its own column name (it used to be given the frame's).
  @data << DaruLite::Vector.new(source, index: @index, name: name)
  @vectors = @vectors.add name
  self.order = new_order
end
|
107
|
+
end
|
108
|
+
end
|
109
|
+
end
|
@@ -0,0 +1,241 @@
|
|
1
|
+
module DaruLite
|
2
|
+
class DataFrame
|
3
|
+
module Sortable
|
4
|
+
# Reorder the vectors in a dataframe
|
5
|
+
# @param [Array] order_array new order of the vectors
|
6
|
+
# @example
|
7
|
+
# df = DaruLite::DataFrame.new({
|
8
|
+
# a: [1, 2, 3],
|
9
|
+
# b: [4, 5, 6]
|
10
|
+
# }, order: [:a, :b])
|
11
|
+
# df.order = [:b, :a]
|
12
|
+
# df
|
13
|
+
# # => #<DaruLite::DataFrame(3x2)>
|
14
|
+
# # b a
|
15
|
+
# # 0 4 1
|
16
|
+
# # 1 5 2
|
17
|
+
# # 2 6 3
|
18
|
+
def order=(order_array)
  # The new order must be a permutation of the current vector names
  # (same names, same multiplicities).
  permutation = order_array.tally == vectors.to_a.tally
  raise ArgumentError, 'Invalid order' unless permutation

  # Rebuild the frame from its own data in the requested order.
  initialize(to_h, order: order_array)
end
|
23
|
+
|
24
|
+
# Return the dataframe with its vectors rotated so that the vector at position count becomes
|
25
|
+
# the first vector of the dataframe.
|
26
|
+
# If the dataframe has only one vector, it is returned without any change.
|
27
|
+
# @param count => Integer, the vector at position count will be the first vector of the dataframe.
|
28
|
+
# @example
|
29
|
+
# df = DaruLite::DataFrame.new({
|
30
|
+
# a: [1, 2, 3],
|
31
|
+
# b: [4, 5, 6],
|
32
|
+
# total: [5, 7, 9],
|
33
|
+
# })
|
34
|
+
# df.rotate_vectors(-1)
|
35
|
+
# df
|
36
|
+
# # => #<DaruLite::DataFrame(3x3)>
|
37
|
+
# # total a b
|
38
|
+
# # 0 5 1 4
|
39
|
+
# # 1 7 2 5
|
40
|
+
# # 2 9 3 6
|
41
|
+
def rotate_vectors(count = -1)
  # With zero or one vector there is nothing to rotate.
  self.order = vectors.to_a.rotate(count) if vectors.many?

  self
end
|
47
|
+
|
48
|
+
# Sorts a dataframe (ascending/descending) in the given priority sequence of
|
49
|
+
# vectors, with or without a block.
|
50
|
+
#
|
51
|
+
# @param vector_order [Array] The order of vector names in which the DataFrame
|
52
|
+
# should be sorted.
|
53
|
+
# @param opts [Hash] The options to sort with.
|
54
|
+
# @option opts [TrueClass,FalseClass,Array] :ascending (true) Sort in ascending
|
55
|
+
# or descending order. Specify Array corresponding to *order* for multiple
|
56
|
+
# sort orders.
|
57
|
+
# @option opts [Hash] :by (lambda{|a| a }) Specify attributes of objects to
|
58
|
+
# be used for sorting, for each vector name in *order* as a hash of
|
59
|
+
# vector name and lambda expressions. In case a lambda for a vector is not
|
60
|
+
# specified, the default will be used.
|
61
|
+
# @option opts [TrueClass,FalseClass,Array] :handle_nils (false) Handle nils
|
62
|
+
# automatically or not when a block is provided.
|
63
|
+
# If set to True, nils will appear at top after sorting.
|
64
|
+
#
|
65
|
+
# @example Sort a dataframe with a vector sequence.
|
66
|
+
#
|
67
|
+
#
|
68
|
+
# df = DaruLite::DataFrame.new({a: [1,2,1,2,3], b: [5,4,3,2,1]})
|
69
|
+
#
|
70
|
+
# df.sort [:a, :b]
|
71
|
+
# # =>
|
72
|
+
# # <DaruLite::DataFrame:30604000 @name = d6a9294e-2c09-418f-b646-aa9244653444 @size = 5>
|
73
|
+
# # a b
|
74
|
+
# # 2 1 3
|
75
|
+
# # 0 1 5
|
76
|
+
# # 3 2 2
|
77
|
+
# # 1 2 4
|
78
|
+
# # 4 3 1
|
79
|
+
#
|
80
|
+
# @example Sort a dataframe without a block. Here nils will be handled automatically.
|
81
|
+
#
|
82
|
+
# df = DaruLite::DataFrame.new({a: [-3,nil,-1,nil,5], b: [4,3,2,1,4]})
|
83
|
+
#
|
84
|
+
# df.sort([:a])
|
85
|
+
# # =>
|
86
|
+
# # <DaruLite::DataFrame:14810920 @name = c07fb5c7-2201-458d-b679-6a1f7ebfe49f @size = 5>
|
87
|
+
# # a b
|
88
|
+
# # 1 nil 3
|
89
|
+
# # 3 nil 1
|
90
|
+
# # 0 -3 4
|
91
|
+
# # 2 -1 2
|
92
|
+
# # 4 5 4
|
93
|
+
#
|
94
|
+
# @example Sort a dataframe with a block with nils handled automatically.
|
95
|
+
#
|
96
|
+
# df = DaruLite::DataFrame.new({a: [nil,-1,1,nil,-1,1], b: ['aaa','aa',nil,'baaa','x',nil] })
|
97
|
+
#
|
98
|
+
# df.sort [:b], by: {b: lambda { |a| a.length } }
|
99
|
+
# # NoMethodError: undefined method `length' for nil:NilClass
|
100
|
+
# # from (pry):8:in `block in __pry__'
|
101
|
+
#
|
102
|
+
# df.sort [:b], by: {b: lambda { |a| a.length } }, handle_nils: true
|
103
|
+
#
|
104
|
+
# # =>
|
105
|
+
# # <DaruLite::DataFrame:28469540 @name = 5f986508-556f-468b-be0c-88cc3534445c @size = 6>
|
106
|
+
# # a b
|
107
|
+
# # 2 1 nil
|
108
|
+
# # 5 1 nil
|
109
|
+
# # 4 -1 x
|
110
|
+
# # 1 -1 aa
|
111
|
+
# # 0 nil aaa
|
112
|
+
# # 3 nil baaa
|
113
|
+
#
|
114
|
+
# @example Sort a dataframe with a block with nils handled manually.
|
115
|
+
#
|
116
|
+
# df = DaruLite::DataFrame.new({a: [nil,-1,1,nil,-1,1], b: ['aaa','aa',nil,'baaa','x',nil] })
|
117
|
+
#
|
118
|
+
# # To print nils at the bottom one can use lambda { |a| (a.nil?)?[1]:[0,a.length] }
|
119
|
+
# df.sort [:b], by: {b: lambda { |a| (a.nil?)?[1]:[0,a.length] } }, handle_nils: true
|
120
|
+
#
|
121
|
+
# # =>
|
122
|
+
# #<DaruLite::DataFrame:22214180 @name = cd7703c7-1dca-4560-840b-5ea51a852ef9 @size = 6>
|
123
|
+
# # a b
|
124
|
+
# # 4 -1 x
|
125
|
+
# # 1 -1 aa
|
126
|
+
# # 0 nil aaa
|
127
|
+
# # 3 nil baaa
|
128
|
+
# # 2 1 nil
|
129
|
+
# # 5 1 nil
|
130
|
+
|
131
|
+
def sort!(vector_order, opts = {})
  raise ArgumentError, 'Required atleast one vector name' if vector_order.empty?

  # To enable sorting with categorical data, categories are temporarily
  # mapped to integers that preserve their order.
  old = convert_categorical_vectors vector_order

  begin
    block = sort_prepare_block vector_order, opts

    # Sort row positions, not the rows themselves; the resulting permutation
    # is then applied to the index and to every vector.
    order = @index.size.times.sort(&block)
    new_index = @index.reorder order
  ensure
    # Always undo the categorical -> integer mapping, including when the
    # options are rejected or the comparison raises. Otherwise a failed
    # sort would leave the frame holding the integer stand-ins.
    restore_categorical_vectors old
  end

  @data.each do |vector|
    vector.reorder! order
  end

  self.index = new_index

  self
end
|
153
|
+
|
154
|
+
# Non-destructive version of #sort!
|
155
|
+
def sort(vector_order, opts = {})
  # Sort a duplicate so the receiver keeps its current row order.
  copy = dup
  copy.sort!(vector_order, opts)
end
|
158
|
+
|
159
|
+
private
|
160
|
+
|
161
|
+
# Temporarily replaces each categorical vector named in +names+ with a
# vector of its integer codes (as returned by #to_ints) so that rows can be
# compared during sorting.
#
# @param names [Array] vector names taking part in the sort
# @return [Array<Array>] [name, original_vector] pairs for every vector that
#   was replaced, suitable for restore_categorical_vectors
def convert_categorical_vectors(names)
  names.filter_map do |n|
    original = self[n]
    next unless original.category?

    # Build the stand-in on the frame's own index. With the default
    # 0..n-1 index, vector assignment aligns by index, so a frame with any
    # other index would not match the stand-in's entries.
    self[n] = DaruLite::Vector.new(original.to_ints, index: @index)
    [n, original]
  end
end
|
170
|
+
|
171
|
+
# Re-assigns every [name, vector] pair in +old+ back into the frame.
def restore_categorical_vectors(old)
  old.each do |pair|
    name, vector = pair
    self[name] = vector
  end
end
|
174
|
+
|
175
|
+
# Builds the comparison key for one side of a row-vs-row comparison: one
# entry per sort vector. For a descending vector the value is read from the
# opposite row (r2) so that a plain ascending comparison of the two keys
# yields descending order for that vector.
def sort_build_row(vector_locs, by_blocks, ascending, handle_nils, r1, r2) # rubocop:disable Metrics/ParameterLists
  vector_locs.each_with_index.map do |vector_loc, i|
    by = by_blocks[i]
    asc = ascending[i]
    handle_nil = handle_nils[i]

    row = asc ? r1 : r2
    value = @data[vector_loc].data[row]

    if by
      # A failing :by lambda (typically on nil) contributes nil instead of
      # aborting the sort.
      value =
        begin
          by.call(value)
        rescue StandardError
          nil
        end
    end

    # Without a :by lambda nils are always handled automatically.
    sort_handle_nils value, asc, handle_nil || !by
  end
end
|
193
|
+
|
194
|
+
# Wraps +value+ in a [rank, value] pair so the rank is compared before the
# value itself: nil gets rank 0 when ascending and rank 1 when descending.
# When nil handling is off the value is returned unchanged.
def sort_handle_nils(value, asc, handle_nil)
  return value unless handle_nil

  nil_rank, present_rank = asc ? [0, 1] : [1, 0]
  [value.nil? ? nil_rank : present_rank, value]
end
|
203
|
+
|
204
|
+
# Expands a boolean sort option into one flag per sort vector.
# true/false is repeated +size+ times, nil falls back to +default+, and an
# Array is used as given provided it has exactly +size+ entries.
def sort_coerce_boolean(opts, symbol, default, size)
  val = opts[symbol]

  return Array.new(size, default) if val.nil?
  return Array.new(size, val) if [true, false].include?(val)

  unless val.is_a?(Array)
    raise ArgumentError, "Can't coerce #{symbol} from #{val.class} to boolean option"
  end

  raise ArgumentError, "Specify same number of vector names and #{symbol}" if size != val.size

  val
end
|
220
|
+
|
221
|
+
# Returns the two-argument comparator used by #sort!. It receives two row
# positions and compares the keys built by sort_build_row.
def sort_prepare_block(vector_order, opts)
  count = vector_order.size
  ascending = sort_coerce_boolean(opts, :ascending, true, count)
  handle_nils = sort_coerce_boolean(opts, :handle_nils, false, count)

  extractors = opts[:by] || {}
  by_blocks = vector_order.map { |name| extractors[name] }
  vector_locs = vector_order.map { |name| @vectors[name] }

  lambda do |index1, index2|
    # Each key is the per-vector values followed by the row position itself,
    # so rows that tie on every attribute are ordered by position.
    left = sort_build_row(vector_locs, by_blocks, ascending, handle_nils, index1, index2) << index1
    right = sort_build_row(vector_locs, by_blocks, ascending, handle_nils, index2, index1) << index2

    left <=> right
  end
end
|
239
|
+
end
|
240
|
+
end
|
241
|
+
end
|