red_amber 0.2.2 → 0.2.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.rubocop.yml +12 -0
- data/CHANGELOG.md +114 -31
- data/Gemfile +4 -2
- data/README.md +41 -25
- data/benchmark/basic.yml +79 -0
- data/benchmark/combine.yml +63 -0
- data/benchmark/drop_nil.yml +15 -3
- data/benchmark/group.yml +33 -0
- data/benchmark/reshape.yml +27 -0
- data/benchmark/{csv_load_penguins.yml → rover/csv_load_penguins.yml} +3 -3
- data/benchmark/rover/flights.yml +23 -0
- data/benchmark/rover/penguins.yml +23 -0
- data/benchmark/rover/planes.yml +23 -0
- data/benchmark/rover/weather.yml +23 -0
- data/doc/DataFrame.md +332 -53
- data/doc/Vector.md +3 -0
- data/doc/image/dataframe/join.png +0 -0
- data/doc/image/dataframe/set_and_bind.png +0 -0
- data/doc/image/dataframe_model.png +0 -0
- data/lib/red_amber/data_frame.rb +6 -5
- data/lib/red_amber/data_frame_combinable.rb +283 -0
- data/lib/red_amber/data_frame_displayable.rb +2 -0
- data/lib/red_amber/data_frame_selectable.rb +9 -9
- data/lib/red_amber/data_frame_variable_operation.rb +4 -4
- data/lib/red_amber/group.rb +99 -18
- data/lib/red_amber/helper.rb +1 -13
- data/lib/red_amber/vector.rb +7 -0
- data/lib/red_amber/vector_functions.rb +0 -8
- data/lib/red_amber/vector_updatable.rb +60 -65
- data/lib/red_amber/version.rb +1 -1
- data/lib/red_amber.rb +1 -0
- data/red_amber.gemspec +1 -1
- metadata +21 -10
data/lib/red_amber/data_frame.rb
CHANGED
@@ -5,6 +5,7 @@ module RedAmber
|
|
5
5
|
# Variable @table holds an Arrow::Table object.
|
6
6
|
class DataFrame
|
7
7
|
# mix-in
|
8
|
+
include DataFrameCombinable
|
8
9
|
include DataFrameDisplayable
|
9
10
|
include DataFrameIndexable
|
10
11
|
include DataFrameLoadSave
|
@@ -47,8 +48,6 @@ module RedAmber
|
|
47
48
|
@table = table
|
48
49
|
in [Arrow::Table => table]
|
49
50
|
@table = table
|
50
|
-
in [DataFrame => dataframe]
|
51
|
-
@table = dataframe.table
|
52
51
|
in [rover_or_hash]
|
53
52
|
begin
|
54
53
|
# Accepts Rover::DataFrame or Hash
|
@@ -77,8 +76,9 @@ module RedAmber
|
|
77
76
|
def size
|
78
77
|
@table.n_rows
|
79
78
|
end
|
80
|
-
alias_method :
|
79
|
+
alias_method :n_records, :size
|
81
80
|
alias_method :n_obs, :size
|
81
|
+
alias_method :n_rows, :size
|
82
82
|
|
83
83
|
# Returns the number of columns.
|
84
84
|
#
|
@@ -86,8 +86,9 @@ module RedAmber
|
|
86
86
|
def n_keys
|
87
87
|
@table.n_columns
|
88
88
|
end
|
89
|
-
alias_method :
|
89
|
+
alias_method :n_variables, :n_keys
|
90
90
|
alias_method :n_vars, :n_keys
|
91
|
+
alias_method :n_cols, :n_keys
|
91
92
|
|
92
93
|
# Returns the numbers of rows and columns.
|
93
94
|
#
|
@@ -174,7 +175,7 @@ module RedAmber
|
|
174
175
|
# - indices(1) #=> [1, 2, 3, 4, 5]
|
175
176
|
# - indices('a') #=> ['a', 'b', 'c', 'd', 'e']
|
176
177
|
def indices(start = 0)
|
177
|
-
(start..).take(size)
|
178
|
+
Vector.new((start..).take(size))
|
178
179
|
end
|
179
180
|
alias_method :indexes, :indices
|
180
181
|
|
@@ -0,0 +1,283 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module RedAmber
  # mix-in for the class DataFrame
  #
  # Adds methods that combine self with other DataFrames / Arrow::Tables:
  # row and column binding, mutating joins, filtering joins and set operations.
  module DataFrameCombinable
    # Concatenate other dataframe onto the bottom.
    #
    # @param other [DataFrame, Arrow::Table, Array<DataFrame, Arrow::Table>]
    #   DataFrame/Table to concatenate onto the bottom of self.
    # @return [DataFrame]
    #   Concatenated dataframe. Self is returned when no argument, nil or [] is given.
    # @raise [DataFrameArgumentError]
    #   If an element is neither an Arrow::Table nor a DataFrame.
    def concatenate(*other)
      # Accept both concatenate(a, b) and concatenate([a, b]).
      case other
      in [] | [nil] | [[]]
        return self
      in [Array => array]
        # Nop
      else
        array = other
      end

      table_array = array.map do |e|
        case e
        when Arrow::Table
          e
        when DataFrame
          e.table
        else
          raise DataFrameArgumentError, "#{e} is not a Table or a DataFrame"
        end
      end

      DataFrame.new(table.concatenate(table_array))
    end

    alias_method :concat, :concatenate
    alias_method :bind_rows, :concatenate

    # Merge the columns of other DataFrame or Table into self.
    # - Self and other must have same size.
    # - Self and other do not share the same key.
    #   - If they share any keys, raise Error.
    # @param other [DataFrame, Arrow::Table, Array<DataFrame, Arrow::Table>]
    #   DataFrame/Table to merge.
    # @return [DataFrame]
    #   Merged dataframe. Self is returned when no argument, nil or [] is given.
    # @raise [DataFrameArgumentError]
    #   If an element is neither an Arrow::Table nor a DataFrame,
    #   if its size differs from self, or if it shares keys with self.
    def merge(*other)
      # Accept both merge(a, b) and merge([a, b]).
      case other
      in [] | [nil] | [[]]
        return self
      in [Array => array]
        # Nop
      else
        array = other
      end

      hash = array.each_with_object({}) do |e, h|
        df =
          case e
          when Arrow::Table
            DataFrame.new(e)
          when DataFrame
            e
          else
            raise DataFrameArgumentError, "#{e} is not a Table or a DataFrame"
          end

        raise DataFrameArgumentError, "#{e} do not have same size as self" if size != df.size

        # Keep the intersection itself (not a boolean) so the message names the offending keys.
        shared_keys = keys.intersection(df.keys)
        raise DataFrameArgumentError, "There are some shared keys: #{shared_keys}" unless shared_keys.empty?

        h.merge!(df.to_h)
      end

      assign(hash)
    end

    alias_method :bind_cols, :merge

    # Mutating joins

    # Join data, leaving only the matching records.
    #
    # @param other [DataFrame, Arrow::Table] DataFrame/Table to be joined with self.
    # @param join_keys [String, Symbol, ::Array<String, Symbol>] Keys to match.
    # @param suffix [String] Suffix passed through to #join.
    # @return [DataFrame] Joined dataframe.
    #
    def inner_join(other, join_keys = nil, suffix: '.1')
      join(other, join_keys, type: :inner, suffix: suffix)
    end

    # Join data, leaving all records.
    #
    # @param other [DataFrame, Arrow::Table] DataFrame/Table to be joined with self.
    # @param join_keys [String, Symbol, ::Array<String, Symbol>] Keys to match.
    # @param suffix [String] Suffix passed through to #join.
    # @return [DataFrame] Joined dataframe.
    #
    def full_join(other, join_keys = nil, suffix: '.1')
      join(other, join_keys, type: :full_outer, suffix: suffix)
    end

    alias_method :outer_join, :full_join

    # Join matching values to self from other.
    #
    # @param other [DataFrame, Arrow::Table] DataFrame/Table to be joined with self.
    # @param join_keys [String, Symbol, ::Array<String, Symbol>] Keys to match.
    # @param suffix [String] Suffix passed through to #join.
    # @return [DataFrame] Joined dataframe.
    #
    def left_join(other, join_keys = nil, suffix: '.1')
      join(other, join_keys, type: :left_outer, suffix: suffix)
    end

    # Join matching values from self to other.
    #
    # @param other [DataFrame, Arrow::Table] DataFrame/Table to be joined with self.
    # @param join_keys [String, Symbol, ::Array<String, Symbol>] Keys to match.
    # @param suffix [String] Suffix passed through to #join.
    # @return [DataFrame] Joined dataframe.
    #
    def right_join(other, join_keys = nil, suffix: '.1')
      join(other, join_keys, type: :right_outer, suffix: suffix)
    end

    # Filtering joins

    # Return records of self that have a match in other.
    #
    # @param other [DataFrame, Arrow::Table] DataFrame/Table to be joined with self.
    # @param join_keys [String, Symbol, ::Array<String, Symbol>] Keys to match.
    # @param suffix [String] Suffix passed through to #join.
    # @return [DataFrame] Joined dataframe.
    #
    def semi_join(other, join_keys = nil, suffix: '.1')
      join(other, join_keys, type: :left_semi, suffix: suffix)
    end

    # Return records of self that do not have a match in other.
    #
    # @param other [DataFrame, Arrow::Table] DataFrame/Table to be joined with self.
    # @param join_keys [String, Symbol, ::Array<String, Symbol>] Keys to match.
    # @param suffix [String] Suffix passed through to #join.
    # @return [DataFrame] Joined dataframe.
    #
    def anti_join(other, join_keys = nil, suffix: '.1')
      join(other, join_keys, type: :left_anti, suffix: suffix)
    end

    # Set operations

    # Check if set operation with self and other is possible.
    # It is possible when both have the same keys in the same order.
    #
    # @param other [DataFrame, Arrow::Table] DataFrame/Table to be checked with self.
    # @return [Boolean] true if set operation is possible.
    #
    def set_operable?(other) # rubocop:disable Naming/AccessorMethodName
      other = DataFrame.new(other) if other.is_a?(Arrow::Table)
      keys == other.keys
    end

    # Select records appearing in both self and other.
    #
    # @param other [DataFrame, Arrow::Table] DataFrame/Table to be joined with self.
    # @return [DataFrame] Joined dataframe.
    # @raise [DataFrameArgumentError] If keys of self and other differ.
    #
    def intersect(other)
      other = DataFrame.new(other) if other.is_a?(Arrow::Table)
      raise DataFrameArgumentError, 'keys are not same with self and other' unless keys == other.keys

      join(other, keys, type: :inner)
    end

    # Select records appearing in self or other.
    #
    # @param other [DataFrame, Arrow::Table] DataFrame/Table to be joined with self.
    # @return [DataFrame] Joined dataframe.
    # @raise [DataFrameArgumentError] If keys of self and other differ.
    #
    def union(other)
      other = DataFrame.new(other) if other.is_a?(Arrow::Table)
      raise DataFrameArgumentError, 'keys are not same with self and other' unless keys == other.keys

      join(other, keys, type: :full_outer)
    end

    # Select records appearing in self but not in other.
    #
    # @param other [DataFrame, Arrow::Table] DataFrame/Table to be joined with self.
    # @return [DataFrame] Joined dataframe.
    # @raise [DataFrameArgumentError] If keys of self and other differ.
    #
    def difference(other)
      other = DataFrame.new(other) if other.is_a?(Arrow::Table)
      raise DataFrameArgumentError, 'keys are not same with self and other' unless keys == other.keys

      join(other, keys, type: :left_anti)
    end

    alias_method :setdiff, :difference

    # Undocumented. It is preferable to call specific methods.

    # Join other dataframe
    #
    # @param other [DataFrame, Arrow::Table] DataFrame/Table to be joined with self.
    # @param join_keys [String, Symbol, ::Array<String, Symbol>] Keys to match.
    #   When omitted, all keys common to self and other (natural keys) are used.
    # @param type [Symbol] Kind of join, see below.
    # @param suffix [String] Appended to a common key of other that is not a join key.
    #   It is advanced with String#succ while the new name collides with a key of self.
    # @param left_outputs passed to Arrow::Table#join as is.
    # @param right_outputs passed to Arrow::Table#join as is.
    # @return [DataFrame] Joined dataframe.
    #   Self is returned when join_keys is explicitly empty.
    # @raise [DataFrameArgumentError]
    #   If other is not a DataFrame/Arrow::Table, if self and other have no common keys,
    #   or if the suffix can not be advanced.
    #
    # :type is one of
    #   :left_semi, :right_semi, :left_anti, :right_anti, :inner, :left_outer, :right_outer, :full_outer.
    def join(other, join_keys = nil, type: :inner, suffix: '.1', left_outputs: nil, right_outputs: nil)
      case other
      when DataFrame
        # Nop
      when Arrow::Table
        other = DataFrame.new(other)
      else
        raise DataFrameArgumentError, 'other must be a DataFrame or an Arrow::Table'
      end

      # Support natural keys (implicit common keys)
      natural_keys = keys.intersection(other.keys)
      if natural_keys.empty?
        # join_keys is nil for a natural join; do not interpolate it into the message then.
        message = join_keys ? "#{join_keys} are not common keys" : 'self and other have no common keys'
        raise DataFrameArgumentError, message
      end

      join_keys =
        if join_keys
          Array(join_keys).map(&:to_sym)
        else
          natural_keys
        end
      return self if join_keys.empty?

      # Support partial join_keys (common key other than join_key will be renamed with suffix)
      remainder_keys = natural_keys - join_keys
      unless remainder_keys.empty?
        renamer = remainder_keys.each_with_object({}) do |key, hash|
          new_key = nil
          loop do
            new_key = "#{key}#{suffix}".to_sym
            break unless keys.include?(new_key)

            s = suffix.succ
            # String#succ returns the receiver unchanged for '' ; bail out instead of looping forever.
            raise DataFrameArgumentError, "suffix #{suffix} is invalid" if s == suffix

            suffix = s
          end
          hash[key] = new_key
        end
        other = other.rename(renamer)
      end

      # Red Arrow's #join returns duplicated join_keys from self and other as of v9.0.0 .
      # Temporarily merge key vectors here to workaround.
      table_output =
        table.join(other.table, join_keys, type: type, left_outputs: left_outputs, right_outputs: right_outputs)

      # Filtering joins need no merging of key columns.
      case type
      when :left_semi, :left_anti, :right_semi, :right_anti
        return DataFrame.new(table_output)
      end

      # NOTE(review): the index arithmetic below assumes table_output holds all columns
      # of self followed by all columns of other (default outputs) - confirm against Red Arrow.
      left_indexes = [*0...n_keys]
      right_indexes = (other.keys - join_keys).map { |key| other.keys.index(key) + n_keys }
      selected_indexes = left_indexes.concat(right_indexes)

      merged_columns = join_keys.map do |key|
        # The join key may sit at different positions in self and other,
        # so the right-hand column must be located by its index in other.
        left_index = keys.index(key)
        right_index = n_keys + other.keys.index(key)
        merge_column(table_output[left_index], table_output[right_index], type)
      end

      DataFrame.new(table_output[selected_indexes])
        .assign(*join_keys) { merged_columns }
    end

    private

    # Merge the duplicated join key columns coming from self (column1) and other (column2).
    #
    # @param column1 [#to_a] key column from self.
    # @param column2 [#to_a] key column from other.
    # @param type [Symbol, String] join type.
    # @return [Array]
    #   - :full_outer : value of column1, or value of column2 where column1 is nil.
    #   - right joins : values of column2.
    #   - otherwise (:inner or :left-*) : values of column1.
    def merge_column(column1, column2, type)
      a1 = column1.to_a
      a2 = column2.to_a
      if type == :full_outer
        # Fall back on nil only; `x || y` would discard a legitimate `false` key.
        a1.zip(a2).map { |x, y| x.nil? ? y : x }
      elsif type.start_with?('right')
        a2
      else # :inner or :left-*
        a1
      end
    end
  end
end
|
@@ -174,6 +174,8 @@ module RedAmber
|
|
174
174
|
end
|
175
175
|
|
176
176
|
def format_table(width: 80, head: 5, tail: 3, n_digit: 2)
|
177
|
+
return " #{keys.join(' ')}\n (Empty Vectors)\n" if size.zero?
|
178
|
+
|
177
179
|
original = self
|
178
180
|
indices = size > head + tail ? [*0..head, *(size - tail)...size] : [*0...size]
|
179
181
|
df = slice(indices).assign do
|
@@ -17,7 +17,7 @@ module RedAmber
|
|
17
17
|
raise DataFrameArgumentError, "Size is not match in booleans: #{args}"
|
18
18
|
end
|
19
19
|
return take_by_array(vector) if vector.numeric?
|
20
|
-
return select_vars_by_keys(vector.to_a.map(&:to_sym)) if vector.string? || vector.
|
20
|
+
return select_vars_by_keys(vector.to_a.map(&:to_sym)) if vector.string? || vector.dictionary?
|
21
21
|
|
22
22
|
raise DataFrameArgumentError, "Invalid argument: #{args}"
|
23
23
|
end
|
@@ -118,10 +118,10 @@ module RedAmber
|
|
118
118
|
end
|
119
119
|
|
120
120
|
normalized_indices = normalized_indices.floor.to_a.map(&:to_i) # round to integer array
|
121
|
-
return remove_all_values if normalized_indices == indices
|
121
|
+
return remove_all_values if normalized_indices == indices.to_a
|
122
122
|
return self if normalized_indices.empty?
|
123
123
|
|
124
|
-
index_array = indices - normalized_indices
|
124
|
+
index_array = indices.to_a - normalized_indices
|
125
125
|
|
126
126
|
datum = Arrow::Function.find(:take).execute([table, index_array])
|
127
127
|
return DataFrame.new(datum.value)
|
@@ -168,14 +168,14 @@ module RedAmber
|
|
168
168
|
|
169
169
|
# Undocumented
|
170
170
|
# TODO: support for option {boundscheck: true}
|
171
|
-
def take(*
|
172
|
-
|
173
|
-
return remove_all_values if
|
171
|
+
def take(*arg_indices)
|
172
|
+
arg_indices.flatten!
|
173
|
+
return remove_all_values if arg_indices.empty?
|
174
174
|
|
175
|
-
|
176
|
-
|
175
|
+
arg_indices = arg_indices[0] if arg_indices.one? && !arg_indices[0].is_a?(Numeric)
|
176
|
+
arg_indices = Vector.new(arg_indices) unless arg_indices.is_a?(Vector)
|
177
177
|
|
178
|
-
take_by_array(
|
178
|
+
take_by_array(arg_indices)
|
179
179
|
end
|
180
180
|
|
181
181
|
# Undocumented
|
@@ -23,12 +23,12 @@ module RedAmber
|
|
23
23
|
elsif vec.numeric?
|
24
24
|
key_vector.take(*vec).to_a
|
25
25
|
elsif vec.string? || vec.dictionary?
|
26
|
-
|
26
|
+
vec.to_a
|
27
27
|
else
|
28
28
|
raise DataFrameArgumentError, "Invalid argument #{args}"
|
29
29
|
end
|
30
30
|
|
31
|
-
# DataFrame#[] creates a Vector
|
31
|
+
# DataFrame#[] creates a Vector if single key is specified.
|
32
32
|
# DataFrame#pick creates a DataFrame with single key.
|
33
33
|
DataFrame.new(@table[ary])
|
34
34
|
end
|
@@ -52,14 +52,14 @@ module RedAmber
|
|
52
52
|
elsif vec.numeric?
|
53
53
|
keys - key_vector.take(*vec).each.map(&:to_sym) # Array
|
54
54
|
elsif vec.string? || vec.dictionary?
|
55
|
-
keys -
|
55
|
+
keys - vec.to_a.map { _1&.to_sym } # Array
|
56
56
|
else
|
57
57
|
raise DataFrameArgumentError, "Invalid argument #{args}"
|
58
58
|
end
|
59
59
|
|
60
60
|
return DataFrame.new if ary.empty?
|
61
61
|
|
62
|
-
# DataFrame#[] creates a Vector
|
62
|
+
# DataFrame#[] creates a Vector if single key is specified.
|
63
63
|
# DataFrame#drop creates a DataFrame with single key.
|
64
64
|
DataFrame.new(@table[ary])
|
65
65
|
end
|
data/lib/red_amber/group.rb
CHANGED
@@ -3,35 +3,84 @@
|
|
3
3
|
module RedAmber
|
4
4
|
# group class
|
5
5
|
class Group
|
6
|
+
include Enumerable # This feature is experimental
|
7
|
+
|
6
8
|
# Creates a new Group object.
|
7
9
|
#
|
8
10
|
# @param dataframe [DataFrame] dataframe to be grouped.
|
9
11
|
# @param group_keys [Array<>] keys for grouping.
|
10
12
|
def initialize(dataframe, *group_keys)
|
11
13
|
@dataframe = dataframe
|
12
|
-
@table = @dataframe.table
|
13
14
|
@group_keys = group_keys.flatten
|
14
15
|
|
15
|
-
raise GroupArgumentError, 'group_keys
|
16
|
+
raise GroupArgumentError, 'group_keys are empty.' if @group_keys.empty?
|
16
17
|
|
17
18
|
d = @group_keys - @dataframe.keys
|
18
19
|
raise GroupArgumentError, "#{d} is not a key of\n #{@dataframe}." unless d.empty?
|
19
20
|
|
20
|
-
@
|
21
|
+
@filters = @group_counts = @base_table = nil
|
22
|
+
@group = @dataframe.table.group(*@group_keys)
|
21
23
|
end
|
22
24
|
|
25
|
+
attr_reader :dataframe, :group_keys
|
26
|
+
|
23
27
|
functions = %i[count sum product mean min max stddev variance]
|
24
28
|
functions.each do |function|
|
25
29
|
define_method(function) do |*summary_keys|
|
26
|
-
|
30
|
+
summary_keys = Array(summary_keys).flatten
|
31
|
+
d = summary_keys - @dataframe.keys
|
32
|
+
raise GroupArgumentError, "#{d} is not a key of\n #{@dataframe}." unless summary_keys.empty? || d.empty?
|
33
|
+
|
34
|
+
table = @group.aggregate(*build_aggregation_keys("hash_#{function}", summary_keys))
|
35
|
+
df = DataFrame.new(table)
|
36
|
+
df.pick(@group_keys, df.keys - @group_keys)
|
27
37
|
end
|
28
38
|
end
|
29
39
|
|
30
|
-
|
31
|
-
|
32
|
-
|
40
|
+
alias_method :__count, :count
|
41
|
+
private :__count
|
42
|
+
|
43
|
+
def count(*summary_keys)
|
44
|
+
df = __count(summary_keys)
|
45
|
+
# if counts are the same (and do not include NaN or nil), aggregate count columns.
|
46
|
+
if df.pick(@group_keys.size..).to_h.values.uniq.size == 1
|
47
|
+
df.pick(0..@group_keys.size).rename { [keys[-1], :count] }
|
48
|
+
else
|
49
|
+
df
|
33
50
|
end
|
34
|
-
|
51
|
+
end
|
52
|
+
|
53
|
+
def filters
|
54
|
+
@filters ||= begin
|
55
|
+
first, *others = @group_keys.map do |key|
|
56
|
+
vector = @dataframe[key]
|
57
|
+
vector.uniq.each.map { |u| u.nil? ? vector.is_nil : vector == u }
|
58
|
+
end
|
59
|
+
|
60
|
+
if others.empty?
|
61
|
+
first.select(&:any?)
|
62
|
+
else
|
63
|
+
first.product(*others).map { |a| a.reduce(&:&) }.select(&:any?)
|
64
|
+
end
|
65
|
+
end
|
66
|
+
end
|
67
|
+
|
68
|
+
def each
|
69
|
+
filters
|
70
|
+
return enum_for(:each) unless block_given?
|
71
|
+
|
72
|
+
@filters.each do |filter|
|
73
|
+
yield @dataframe[filter]
|
74
|
+
end
|
75
|
+
@filters.size
|
76
|
+
end
|
77
|
+
|
78
|
+
def group_count
|
79
|
+
DataFrame.new(add_columns_to_table(base_table, [:group_count], [group_counts]))
|
80
|
+
end
|
81
|
+
|
82
|
+
def inspect
|
83
|
+
"#<#{self.class} : #{format('0x%016x', object_id)}>\n#{group_count}"
|
35
84
|
end
|
36
85
|
|
37
86
|
def summarize(&block)
|
@@ -48,18 +97,50 @@ module RedAmber
|
|
48
97
|
|
49
98
|
private
|
50
99
|
|
51
|
-
def
|
52
|
-
|
53
|
-
|
54
|
-
|
100
|
+
def build_aggregation_keys(function_name, summary_keys)
|
101
|
+
if summary_keys.empty?
|
102
|
+
[function_name]
|
103
|
+
else
|
104
|
+
summary_keys.map { |key| "#{function_name}(#{key})" }
|
105
|
+
end
|
106
|
+
end
|
55
107
|
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
108
|
+
# @group_counts.sum == @dataframe.size
|
109
|
+
def group_counts
|
110
|
+
@group_counts ||= filters.map(&:sum)
|
111
|
+
end
|
112
|
+
|
113
|
+
def base_table
|
114
|
+
@base_table ||= begin
|
115
|
+
indexes = filters.map { |filter| filter.index(true) }
|
116
|
+
@dataframe.table[@group_keys].take(indexes)
|
117
|
+
end
|
118
|
+
end
|
119
|
+
|
120
|
+
def add_columns_to_table(table, keys, data_arrays)
|
121
|
+
fields = table.schema.fields
|
122
|
+
arrays = table.columns.map(&:data)
|
123
|
+
|
124
|
+
keys.zip(data_arrays).each do |key, array|
|
125
|
+
data = Arrow::ChunkedArray.new([array])
|
126
|
+
fields << Arrow::Field.new(key, data.value_data_type)
|
127
|
+
arrays << data
|
128
|
+
end
|
129
|
+
|
130
|
+
Arrow::Table.new(Arrow::Schema.new(fields), arrays)
|
131
|
+
end
|
132
|
+
|
133
|
+
# Call Vector aggregating function and return an array of arrays:
|
134
|
+
# [keys, data_arrays]
|
135
|
+
# (Experimental feature)
|
136
|
+
def call_aggregating_function(func, summary_keys, _options)
|
137
|
+
summary_keys.each.with_object([[], []]) do |key, (keys, arrays)|
|
138
|
+
vector = @dataframe[key]
|
139
|
+
arrays << filters.map { |filter| vector.filter(filter).send(func) }
|
140
|
+
keys << "#{func}(#{key})".to_sym
|
141
|
+
rescue Arrow::Error::NotImplemented
|
142
|
+
# next
|
61
143
|
end
|
62
|
-
df
|
63
144
|
end
|
64
145
|
end
|
65
146
|
end
|
data/lib/red_amber/helper.rb
CHANGED
@@ -9,22 +9,10 @@ module RedAmber
|
|
9
9
|
num > 1 ? 's' : ''
|
10
10
|
end
|
11
11
|
|
12
|
-
def out_of_range?(indeces)
|
13
|
-
indeces.max >= size || indeces.min < -size
|
14
|
-
end
|
15
|
-
|
16
|
-
def integers?(enum)
|
17
|
-
enum.all?(Integer)
|
18
|
-
end
|
19
|
-
|
20
12
|
def booleans?(enum)
|
21
13
|
enum.all? { |e| e.is_a?(TrueClass) || e.is_a?(FalseClass) || e.is_a?(NilClass) }
|
22
14
|
end
|
23
15
|
|
24
|
-
def create_dataframe_from_vector(key, vector)
|
25
|
-
DataFrame.new(key => vector.data)
|
26
|
-
end
|
27
|
-
|
28
16
|
def parse_to_vector(args, vsize: size)
|
29
17
|
a = args.reduce([]) do |accum, elem|
|
30
18
|
accum.concat(normalize_element(elem, vsize: vsize))
|
@@ -51,7 +39,7 @@ module RedAmber
|
|
51
39
|
elsif bg.nil? && en.nil?
|
52
40
|
Array(0...vsize)
|
53
41
|
else
|
54
|
-
Array
|
42
|
+
Array(elem)
|
55
43
|
end
|
56
44
|
when Enumerator
|
57
45
|
elem.to_a
|
data/lib/red_amber/vector.rb
CHANGED
@@ -24,6 +24,8 @@ module RedAmber
|
|
24
24
|
a
|
25
25
|
in [Arrow::ChunkedArray => ca]
|
26
26
|
ca
|
27
|
+
in [arrow_array_like] if arrow_array_like.respond_to?(:to_arrow_array)
|
28
|
+
arrow_array_like.to_arrow_array
|
27
29
|
in [Range => r]
|
28
30
|
Arrow::Array.new(Array(r))
|
29
31
|
else
|
@@ -37,6 +39,11 @@ module RedAmber
|
|
37
39
|
end
|
38
40
|
|
39
41
|
attr_reader :data
|
42
|
+
|
43
|
+
def to_arrow_array
|
44
|
+
@data
|
45
|
+
end
|
46
|
+
|
40
47
|
attr_accessor :key
|
41
48
|
|
42
49
|
def to_s
|
@@ -187,12 +187,6 @@ module RedAmber
|
|
187
187
|
alias_method :ne, :not_equal
|
188
188
|
|
189
189
|
def coerce(other)
|
190
|
-
case other
|
191
|
-
when Vector, Array, Arrow::Array
|
192
|
-
raise VectorArgumentError, "Size unmatch: #{size} != #{other.length}" unless size == other.length
|
193
|
-
|
194
|
-
[Vector.new(Array(other)), self]
|
195
|
-
end
|
196
190
|
[Vector.new(Array(other) * size), self]
|
197
191
|
end
|
198
192
|
|
@@ -271,8 +265,6 @@ module RedAmber
|
|
271
265
|
find(function).execute([data, other.data], options)
|
272
266
|
when Arrow::Array, Arrow::ChunkedArray, Arrow::Scalar, Array, Numeric, String, TrueClass, FalseClass
|
273
267
|
find(function).execute([data, other], options)
|
274
|
-
else
|
275
|
-
raise VectorArgumentError, "Operand is not supported: #{other.class}"
|
276
268
|
end
|
277
269
|
end
|
278
270
|
|