red_amber 0.2.2 → 0.2.3
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.rubocop.yml +12 -0
- data/CHANGELOG.md +114 -31
- data/Gemfile +4 -2
- data/README.md +41 -25
- data/benchmark/basic.yml +79 -0
- data/benchmark/combine.yml +63 -0
- data/benchmark/drop_nil.yml +15 -3
- data/benchmark/group.yml +33 -0
- data/benchmark/reshape.yml +27 -0
- data/benchmark/{csv_load_penguins.yml → rover/csv_load_penguins.yml} +3 -3
- data/benchmark/rover/flights.yml +23 -0
- data/benchmark/rover/penguins.yml +23 -0
- data/benchmark/rover/planes.yml +23 -0
- data/benchmark/rover/weather.yml +23 -0
- data/doc/DataFrame.md +332 -53
- data/doc/Vector.md +3 -0
- data/doc/image/dataframe/join.png +0 -0
- data/doc/image/dataframe/set_and_bind.png +0 -0
- data/doc/image/dataframe_model.png +0 -0
- data/lib/red_amber/data_frame.rb +6 -5
- data/lib/red_amber/data_frame_combinable.rb +283 -0
- data/lib/red_amber/data_frame_displayable.rb +2 -0
- data/lib/red_amber/data_frame_selectable.rb +9 -9
- data/lib/red_amber/data_frame_variable_operation.rb +4 -4
- data/lib/red_amber/group.rb +99 -18
- data/lib/red_amber/helper.rb +1 -13
- data/lib/red_amber/vector.rb +7 -0
- data/lib/red_amber/vector_functions.rb +0 -8
- data/lib/red_amber/vector_updatable.rb +60 -65
- data/lib/red_amber/version.rb +1 -1
- data/lib/red_amber.rb +1 -0
- data/red_amber.gemspec +1 -1
- metadata +21 -10
data/lib/red_amber/data_frame.rb
CHANGED
@@ -5,6 +5,7 @@ module RedAmber
|
|
5
5
|
# Variable @table holds an Arrow::Table object.
|
6
6
|
class DataFrame
|
7
7
|
# mix-in
|
8
|
+
include DataFrameCombinable
|
8
9
|
include DataFrameDisplayable
|
9
10
|
include DataFrameIndexable
|
10
11
|
include DataFrameLoadSave
|
@@ -47,8 +48,6 @@ module RedAmber
|
|
47
48
|
@table = table
|
48
49
|
in [Arrow::Table => table]
|
49
50
|
@table = table
|
50
|
-
in [DataFrame => dataframe]
|
51
|
-
@table = dataframe.table
|
52
51
|
in [rover_or_hash]
|
53
52
|
begin
|
54
53
|
# Accepts Rover::DataFrame or Hash
|
@@ -77,8 +76,9 @@ module RedAmber
|
|
77
76
|
def size
|
78
77
|
@table.n_rows
|
79
78
|
end
|
80
|
-
alias_method :
|
79
|
+
alias_method :n_records, :size
|
81
80
|
alias_method :n_obs, :size
|
81
|
+
alias_method :n_rows, :size
|
82
82
|
|
83
83
|
# Returns the number of columns.
|
84
84
|
#
|
@@ -86,8 +86,9 @@ module RedAmber
|
|
86
86
|
def n_keys
|
87
87
|
@table.n_columns
|
88
88
|
end
|
89
|
-
alias_method :
|
89
|
+
alias_method :n_variables, :n_keys
|
90
90
|
alias_method :n_vars, :n_keys
|
91
|
+
alias_method :n_cols, :n_keys
|
91
92
|
|
92
93
|
# Returns the numbers of rows and columns.
|
93
94
|
#
|
@@ -174,7 +175,7 @@ module RedAmber
|
|
174
175
|
# - indices(1) #=> [1, 2, 3, 4, 5]
|
175
176
|
# - indices('a') #=> ['a', 'b', 'c', 'd', 'e']
|
176
177
|
def indices(start = 0)
|
177
|
-
(start..).take(size)
|
178
|
+
Vector.new((start..).take(size))
|
178
179
|
end
|
179
180
|
alias_method :indexes, :indices
|
180
181
|
|
@@ -0,0 +1,283 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module RedAmber
  # Mix-in for the class DataFrame.
  # Provides row/column binding, mutating joins, filtering joins and set operations.
  module DataFrameCombinable
    # Concatenate other dataframe onto the bottom.
    #
    # @param other [DataFrame, Arrow::Table, Array<DataFrame, Arrow::Table>]
    #   DataFrame/Table to concatenate onto the bottom of self.
    # @return [DataFrame]
    #   Concatenated dataframe. Self is returned when nothing, nil or an empty Array is given.
    # @raise [DataFrameArgumentError]
    #   If an element is neither an Arrow::Table nor a DataFrame.
    def concatenate(*other)
      case other
      in [] | [nil] | [[]]
        return self
      in [Array => array]
        # Nop
      else
        array = other
      end

      table_array = array.map do |e|
        case e
        when Arrow::Table
          e
        when DataFrame
          e.table
        else
          raise DataFrameArgumentError, "#{e} is not a Table or a DataFrame"
        end
      end

      DataFrame.new(table.concatenate(table_array))
    end

    alias_method :concat, :concatenate
    alias_method :bind_rows, :concatenate

    # Merge other DataFrame or Table from other.
    # - Self and other must have same size.
    # - Self and other do not share the same key.
    #   - If they share any keys, raise Error.
    # @param other [DataFrame, Arrow::Table, Array<DataFrame, Arrow::Table>]
    #   DataFrame/Table to concatenate.
    # @return [DataFrame]
    #   Merged dataframe. Self is returned when nothing, nil or an empty Array is given.
    # @raise [DataFrameArgumentError]
    #   If an element is not a Table/DataFrame, has a different size, or shares keys with self.
    def merge(*other)
      case other
      in [] | [nil] | [[]]
        return self
      in [Array => array]
        # Nop
      else
        array = other
      end

      hash = array.each_with_object({}) do |e, h|
        df =
          case e
          when Arrow::Table
            DataFrame.new(e)
          when DataFrame
            e
          else
            raise DataFrameArgumentError, "#{e} is not a Table or a DataFrame"
          end

        raise DataFrameArgumentError, "#{e} do not have same size as self" if size != df.size

        # Keep the shared keys themselves (not a boolean) so the message can list them.
        shared_keys = keys.intersection(df.keys)
        raise DataFrameArgumentError, "There are some shared keys: #{shared_keys}" unless shared_keys.empty?

        h.merge!(df.to_h)
      end

      assign(hash)
    end

    alias_method :bind_cols, :merge

    # Mutating joins

    # Join data, leaving only the matching records.
    #
    # @param other [DataFrame, Arrow::Table] DataFrame/Table to be joined with self.
    # @param join_keys [String, Symbol, ::Array<String, Symbol>] Keys to match.
    # @param suffix [String] Suffix appended to common keys of other that are not join keys.
    # @return [DataFrame] Joined dataframe.
    #
    def inner_join(other, join_keys = nil, suffix: '.1')
      join(other, join_keys, type: :inner, suffix: suffix)
    end

    # Join data, leaving all records.
    #
    # @param other [DataFrame, Arrow::Table] DataFrame/Table to be joined with self.
    # @param join_keys [String, Symbol, ::Array<String, Symbol>] Keys to match.
    # @param suffix [String] Suffix appended to common keys of other that are not join keys.
    # @return [DataFrame] Joined dataframe.
    #
    def full_join(other, join_keys = nil, suffix: '.1')
      join(other, join_keys, type: :full_outer, suffix: suffix)
    end

    alias_method :outer_join, :full_join

    # Join matching values to self from other.
    #
    # @param other [DataFrame, Arrow::Table] DataFrame/Table to be joined with self.
    # @param join_keys [String, Symbol, ::Array<String, Symbol>] Keys to match.
    # @param suffix [String] Suffix appended to common keys of other that are not join keys.
    # @return [DataFrame] Joined dataframe.
    #
    def left_join(other, join_keys = nil, suffix: '.1')
      join(other, join_keys, type: :left_outer, suffix: suffix)
    end

    # Join matching values from self to other.
    #
    # @param other [DataFrame, Arrow::Table] DataFrame/Table to be joined with self.
    # @param join_keys [String, Symbol, ::Array<String, Symbol>] Keys to match.
    # @param suffix [String] Suffix appended to common keys of other that are not join keys.
    # @return [DataFrame] Joined dataframe.
    #
    def right_join(other, join_keys = nil, suffix: '.1')
      join(other, join_keys, type: :right_outer, suffix: suffix)
    end

    # Filtering joins

    # Return records of self that have a match in other.
    #
    # @param other [DataFrame, Arrow::Table] DataFrame/Table to be joined with self.
    # @param join_keys [String, Symbol, ::Array<String, Symbol>] Keys to match.
    # @param suffix [String] Suffix appended to common keys of other that are not join keys.
    # @return [DataFrame] Joined dataframe.
    #
    def semi_join(other, join_keys = nil, suffix: '.1')
      join(other, join_keys, type: :left_semi, suffix: suffix)
    end

    # Return records of self that do not have a match in other.
    #
    # @param other [DataFrame, Arrow::Table] DataFrame/Table to be joined with self.
    # @param join_keys [String, Symbol, ::Array<String, Symbol>] Keys to match.
    # @param suffix [String] Suffix appended to common keys of other that are not join keys.
    # @return [DataFrame] Joined dataframe.
    #
    def anti_join(other, join_keys = nil, suffix: '.1')
      join(other, join_keys, type: :left_anti, suffix: suffix)
    end

    # Set operations

    # Check if set operation with self and other is possible.
    #
    # @param other [DataFrame, Arrow::Table] DataFrame/Table to be checked with self.
    # @return [Boolean] true if set operation is possible (both have the same keys in the same order).
    #
    def set_operable?(other) # rubocop:disable Naming/AccessorMethodName
      other = DataFrame.new(other) if other.is_a?(Arrow::Table)
      keys == other.keys
    end

    # Select records appearing in both self and other.
    #
    # @param other [DataFrame, Arrow::Table] DataFrame/Table to be joined with self.
    # @return [DataFrame] Joined dataframe.
    # @raise [DataFrameArgumentError] If keys of self and other differ.
    #
    def intersect(other)
      other = DataFrame.new(other) if other.is_a?(Arrow::Table)
      raise DataFrameArgumentError, 'keys are not same with self and other' unless keys == other.keys

      join(other, keys, type: :inner)
    end

    # Select records appearing in self or other.
    #
    # @param other [DataFrame, Arrow::Table] DataFrame/Table to be joined with self.
    # @return [DataFrame] Joined dataframe.
    # @raise [DataFrameArgumentError] If keys of self and other differ.
    #
    def union(other)
      other = DataFrame.new(other) if other.is_a?(Arrow::Table)
      raise DataFrameArgumentError, 'keys are not same with self and other' unless keys == other.keys

      join(other, keys, type: :full_outer)
    end

    # Select records appearing in self but not in other.
    #
    # @param other [DataFrame, Arrow::Table] DataFrame/Table to be joined with self.
    # @return [DataFrame] Joined dataframe.
    # @raise [DataFrameArgumentError] If keys of self and other differ.
    #
    def difference(other)
      other = DataFrame.new(other) if other.is_a?(Arrow::Table)
      raise DataFrameArgumentError, 'keys are not same with self and other' unless keys == other.keys

      join(other, keys, type: :left_anti)
    end

    alias_method :setdiff, :difference

    # Undocumented. It is preferable to call specific methods.

    # Join other dataframe
    #
    # @param other [DataFrame, Arrow::Table] DataFrame/Table to be joined with self.
    # @param join_keys [String, Symbol, ::Array<String, Symbol>] Keys to match.
    #   When nil, the keys common to self and other (natural keys) are used.
    # @param type [Symbol] One of
    #   :left_semi, :right_semi, :left_anti, :right_anti, :inner, :left_outer, :right_outer, :full_outer.
    # @param suffix [String] Suffix appended to common keys of other that are not join keys.
    # @param left_outputs passed through to the underlying table join.
    # @param right_outputs passed through to the underlying table join.
    # @return [DataFrame] Joined dataframe.
    # @raise [DataFrameArgumentError]
    #   If other is not a DataFrame/Table, there is no common key, or suffix can not be incremented.
    #
    def join(other, join_keys = nil, type: :inner, suffix: '.1', left_outputs: nil, right_outputs: nil)
      case other
      when DataFrame
        # Nop
      when Arrow::Table
        other = DataFrame.new(other)
      else
        raise DataFrameArgumentError, 'other must be a DataFrame or an Arrow::Table'
      end

      # Support natural keys (implicit common keys)
      natural_keys = keys.intersection(other.keys)
      raise DataFrameArgumentError, "#{join_keys} are not common keys" if natural_keys.empty?

      join_keys =
        if join_keys
          Array(join_keys).map(&:to_sym)
        else
          natural_keys
        end
      return self if join_keys.empty?

      # Support partial join_keys (common key other than join_key will be renamed with suffix)
      remainder_keys = natural_keys - join_keys
      unless remainder_keys.empty?
        renamer = remainder_keys.each_with_object({}) do |key, hash|
          new_key = nil
          loop do
            new_key = "#{key}#{suffix}".to_sym
            break unless keys.include?(new_key)

            # The suffixed key collides with a key of self; try the next suffix.
            s = suffix.succ
            raise DataFrameArgumentError, "suffix #{suffix} is invalid" if s == suffix

            suffix = s
          end
          hash[key] = new_key
        end
        other = other.rename(renamer)
      end

      # Red Arrow's #join returns duplicated join_keys from self and other as of v9.0.0 .
      # Temporarily merge key vectors here to workaround.
      table_output =
        table.join(other.table, join_keys, type: type, left_outputs: left_outputs, right_outputs: right_outputs)

      # Filtering joins are returned as they are.
      return DataFrame.new(table_output) if %i[left_semi left_anti right_semi right_anti].include?(type)

      # Columns of self come first, followed by the columns of other in other's own order.
      left_indexes = [*0...n_keys]
      right_indexes = (other.keys - join_keys).map { |key| other.keys.index(key) + n_keys }
      selected_indexes = left_indexes.concat(right_indexes)

      merged_columns = join_keys.map do |key|
        left_index = keys.index(key)
        # The position of the key in other may differ from its position in self.
        right_index = n_keys + other.keys.index(key)
        merge_column(table_output[left_index], table_output[right_index], type)
      end
      DataFrame.new(table_output[selected_indexes])
        .assign(*join_keys) { merged_columns }
    end

    private

    # Merge the join key column coming from self (column1) and from other (column2).
    #
    # @param column1 [#to_a] key column from self.
    # @param column2 [#to_a] key column from other.
    # @param type [Symbol] join type.
    # @return [Array] merged key values.
    def merge_column(column1, column2, type)
      a1 = column1.to_a
      a2 = column2.to_a
      if type == :full_outer
        # Only nil means "missing"; a false key value must be preserved.
        a1.zip(a2).map { |x, y| x.nil? ? y : x }
      elsif type.start_with?('right')
        a2
      else # :inner or :left-*
        a1
      end
    end
  end
end
|
@@ -174,6 +174,8 @@ module RedAmber
|
|
174
174
|
end
|
175
175
|
|
176
176
|
def format_table(width: 80, head: 5, tail: 3, n_digit: 2)
|
177
|
+
return " #{keys.join(' ')}\n (Empty Vectors)\n" if size.zero?
|
178
|
+
|
177
179
|
original = self
|
178
180
|
indices = size > head + tail ? [*0..head, *(size - tail)...size] : [*0...size]
|
179
181
|
df = slice(indices).assign do
|
@@ -17,7 +17,7 @@ module RedAmber
|
|
17
17
|
raise DataFrameArgumentError, "Size is not match in booleans: #{args}"
|
18
18
|
end
|
19
19
|
return take_by_array(vector) if vector.numeric?
|
20
|
-
return select_vars_by_keys(vector.to_a.map(&:to_sym)) if vector.string? || vector.
|
20
|
+
return select_vars_by_keys(vector.to_a.map(&:to_sym)) if vector.string? || vector.dictionary?
|
21
21
|
|
22
22
|
raise DataFrameArgumentError, "Invalid argument: #{args}"
|
23
23
|
end
|
@@ -118,10 +118,10 @@ module RedAmber
|
|
118
118
|
end
|
119
119
|
|
120
120
|
normalized_indices = normalized_indices.floor.to_a.map(&:to_i) # round to integer array
|
121
|
-
return remove_all_values if normalized_indices == indices
|
121
|
+
return remove_all_values if normalized_indices == indices.to_a
|
122
122
|
return self if normalized_indices.empty?
|
123
123
|
|
124
|
-
index_array = indices - normalized_indices
|
124
|
+
index_array = indices.to_a - normalized_indices
|
125
125
|
|
126
126
|
datum = Arrow::Function.find(:take).execute([table, index_array])
|
127
127
|
return DataFrame.new(datum.value)
|
@@ -168,14 +168,14 @@ module RedAmber
|
|
168
168
|
|
169
169
|
# Undocumented
|
170
170
|
# TODO: support for option {boundscheck: true}
|
171
|
-
def take(*
|
172
|
-
|
173
|
-
return remove_all_values if
|
171
|
+
def take(*arg_indices)
|
172
|
+
arg_indices.flatten!
|
173
|
+
return remove_all_values if arg_indices.empty?
|
174
174
|
|
175
|
-
|
176
|
-
|
175
|
+
arg_indices = arg_indices[0] if arg_indices.one? && !arg_indices[0].is_a?(Numeric)
|
176
|
+
arg_indices = Vector.new(arg_indices) unless arg_indices.is_a?(Vector)
|
177
177
|
|
178
|
-
take_by_array(
|
178
|
+
take_by_array(arg_indices)
|
179
179
|
end
|
180
180
|
|
181
181
|
# Undocumented
|
@@ -23,12 +23,12 @@ module RedAmber
|
|
23
23
|
elsif vec.numeric?
|
24
24
|
key_vector.take(*vec).to_a
|
25
25
|
elsif vec.string? || vec.dictionary?
|
26
|
-
|
26
|
+
vec.to_a
|
27
27
|
else
|
28
28
|
raise DataFrameArgumentError, "Invalid argument #{args}"
|
29
29
|
end
|
30
30
|
|
31
|
-
# DataFrame#[] creates a Vector
|
31
|
+
# DataFrame#[] creates a Vector if single key is specified.
|
32
32
|
# DataFrame#pick creates a DataFrame with single key.
|
33
33
|
DataFrame.new(@table[ary])
|
34
34
|
end
|
@@ -52,14 +52,14 @@ module RedAmber
|
|
52
52
|
elsif vec.numeric?
|
53
53
|
keys - key_vector.take(*vec).each.map(&:to_sym) # Array
|
54
54
|
elsif vec.string? || vec.dictionary?
|
55
|
-
keys -
|
55
|
+
keys - vec.to_a.map { _1&.to_sym } # Array
|
56
56
|
else
|
57
57
|
raise DataFrameArgumentError, "Invalid argument #{args}"
|
58
58
|
end
|
59
59
|
|
60
60
|
return DataFrame.new if ary.empty?
|
61
61
|
|
62
|
-
# DataFrame#[] creates a Vector
|
62
|
+
# DataFrame#[] creates a Vector if single key is specified.
|
63
63
|
# DataFrame#drop creates a DataFrame with single key.
|
64
64
|
DataFrame.new(@table[ary])
|
65
65
|
end
|
data/lib/red_amber/group.rb
CHANGED
@@ -3,35 +3,84 @@
|
|
3
3
|
module RedAmber
|
4
4
|
# group class
|
5
5
|
class Group
|
6
|
+
include Enumerable # This feature is experimental
|
7
|
+
|
6
8
|
# Creates a new Group object.
|
7
9
|
#
|
8
10
|
# @param dataframe [DataFrame] dataframe to be grouped.
|
9
11
|
# @param group_keys [Array<>] keys for grouping.
|
10
12
|
def initialize(dataframe, *group_keys)
|
11
13
|
@dataframe = dataframe
|
12
|
-
@table = @dataframe.table
|
13
14
|
@group_keys = group_keys.flatten
|
14
15
|
|
15
|
-
raise GroupArgumentError, 'group_keys
|
16
|
+
raise GroupArgumentError, 'group_keys are empty.' if @group_keys.empty?
|
16
17
|
|
17
18
|
d = @group_keys - @dataframe.keys
|
18
19
|
raise GroupArgumentError, "#{d} is not a key of\n #{@dataframe}." unless d.empty?
|
19
20
|
|
20
|
-
@
|
21
|
+
@filters = @group_counts = @base_table = nil
|
22
|
+
@group = @dataframe.table.group(*@group_keys)
|
21
23
|
end
|
22
24
|
|
25
|
+
attr_reader :dataframe, :group_keys
|
26
|
+
|
23
27
|
functions = %i[count sum product mean min max stddev variance]
|
24
28
|
functions.each do |function|
|
25
29
|
define_method(function) do |*summary_keys|
|
26
|
-
|
30
|
+
summary_keys = Array(summary_keys).flatten
|
31
|
+
d = summary_keys - @dataframe.keys
|
32
|
+
raise GroupArgumentError, "#{d} is not a key of\n #{@dataframe}." unless summary_keys.empty? || d.empty?
|
33
|
+
|
34
|
+
table = @group.aggregate(*build_aggregation_keys("hash_#{function}", summary_keys))
|
35
|
+
df = DataFrame.new(table)
|
36
|
+
df.pick(@group_keys, df.keys - @group_keys)
|
27
37
|
end
|
28
38
|
end
|
29
39
|
|
30
|
-
|
31
|
-
|
32
|
-
|
40
|
+
alias_method :__count, :count
|
41
|
+
private :__count
|
42
|
+
|
43
|
+
def count(*summary_keys)
|
44
|
+
df = __count(summary_keys)
|
45
|
+
# if counts are the same (and do not include NaN or nil), aggregate count columns.
|
46
|
+
if df.pick(@group_keys.size..).to_h.values.uniq.size == 1
|
47
|
+
df.pick(0..@group_keys.size).rename { [keys[-1], :count] }
|
48
|
+
else
|
49
|
+
df
|
33
50
|
end
|
34
|
-
|
51
|
+
end
|
52
|
+
|
53
|
+
def filters
|
54
|
+
@filters ||= begin
|
55
|
+
first, *others = @group_keys.map do |key|
|
56
|
+
vector = @dataframe[key]
|
57
|
+
vector.uniq.each.map { |u| u.nil? ? vector.is_nil : vector == u }
|
58
|
+
end
|
59
|
+
|
60
|
+
if others.empty?
|
61
|
+
first.select(&:any?)
|
62
|
+
else
|
63
|
+
first.product(*others).map { |a| a.reduce(&:&) }.select(&:any?)
|
64
|
+
end
|
65
|
+
end
|
66
|
+
end
|
67
|
+
|
68
|
+
def each
|
69
|
+
filters
|
70
|
+
return enum_for(:each) unless block_given?
|
71
|
+
|
72
|
+
@filters.each do |filter|
|
73
|
+
yield @dataframe[filter]
|
74
|
+
end
|
75
|
+
@filters.size
|
76
|
+
end
|
77
|
+
|
78
|
+
def group_count
|
79
|
+
DataFrame.new(add_columns_to_table(base_table, [:group_count], [group_counts]))
|
80
|
+
end
|
81
|
+
|
82
|
+
def inspect
|
83
|
+
"#<#{self.class} : #{format('0x%016x', object_id)}>\n#{group_count}"
|
35
84
|
end
|
36
85
|
|
37
86
|
def summarize(&block)
|
@@ -48,18 +97,50 @@ module RedAmber
|
|
48
97
|
|
49
98
|
private
|
50
99
|
|
51
|
-
def
|
52
|
-
|
53
|
-
|
54
|
-
|
100
|
+
def build_aggregation_keys(function_name, summary_keys)
|
101
|
+
if summary_keys.empty?
|
102
|
+
[function_name]
|
103
|
+
else
|
104
|
+
summary_keys.map { |key| "#{function_name}(#{key})" }
|
105
|
+
end
|
106
|
+
end
|
55
107
|
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
108
|
+
# @group_counts.sum == @dataframe.size
|
109
|
+
def group_counts
|
110
|
+
@group_counts ||= filters.map(&:sum)
|
111
|
+
end
|
112
|
+
|
113
|
+
def base_table
|
114
|
+
@base_table ||= begin
|
115
|
+
indexes = filters.map { |filter| filter.index(true) }
|
116
|
+
@dataframe.table[@group_keys].take(indexes)
|
117
|
+
end
|
118
|
+
end
|
119
|
+
|
120
|
+
def add_columns_to_table(table, keys, data_arrays)
|
121
|
+
fields = table.schema.fields
|
122
|
+
arrays = table.columns.map(&:data)
|
123
|
+
|
124
|
+
keys.zip(data_arrays).each do |key, array|
|
125
|
+
data = Arrow::ChunkedArray.new([array])
|
126
|
+
fields << Arrow::Field.new(key, data.value_data_type)
|
127
|
+
arrays << data
|
128
|
+
end
|
129
|
+
|
130
|
+
Arrow::Table.new(Arrow::Schema.new(fields), arrays)
|
131
|
+
end
|
132
|
+
|
133
|
+
# Call Vector aggregating function and return an array of arrays:
|
134
|
+
# [keys, data_arrays]
|
135
|
+
# (Experimental feature)
|
136
|
+
def call_aggregating_function(func, summary_keys, _options)
|
137
|
+
summary_keys.each.with_object([[], []]) do |key, (keys, arrays)|
|
138
|
+
vector = @dataframe[key]
|
139
|
+
arrays << filters.map { |filter| vector.filter(filter).send(func) }
|
140
|
+
keys << "#{func}(#{key})".to_sym
|
141
|
+
rescue Arrow::Error::NotImplemented
|
142
|
+
# next
|
61
143
|
end
|
62
|
-
df
|
63
144
|
end
|
64
145
|
end
|
65
146
|
end
|
data/lib/red_amber/helper.rb
CHANGED
@@ -9,22 +9,10 @@ module RedAmber
|
|
9
9
|
num > 1 ? 's' : ''
|
10
10
|
end
|
11
11
|
|
12
|
-
def out_of_range?(indeces)
|
13
|
-
indeces.max >= size || indeces.min < -size
|
14
|
-
end
|
15
|
-
|
16
|
-
def integers?(enum)
|
17
|
-
enum.all?(Integer)
|
18
|
-
end
|
19
|
-
|
20
12
|
def booleans?(enum)
|
21
13
|
enum.all? { |e| e.is_a?(TrueClass) || e.is_a?(FalseClass) || e.is_a?(NilClass) }
|
22
14
|
end
|
23
15
|
|
24
|
-
def create_dataframe_from_vector(key, vector)
|
25
|
-
DataFrame.new(key => vector.data)
|
26
|
-
end
|
27
|
-
|
28
16
|
def parse_to_vector(args, vsize: size)
|
29
17
|
a = args.reduce([]) do |accum, elem|
|
30
18
|
accum.concat(normalize_element(elem, vsize: vsize))
|
@@ -51,7 +39,7 @@ module RedAmber
|
|
51
39
|
elsif bg.nil? && en.nil?
|
52
40
|
Array(0...vsize)
|
53
41
|
else
|
54
|
-
Array
|
42
|
+
Array(elem)
|
55
43
|
end
|
56
44
|
when Enumerator
|
57
45
|
elem.to_a
|
data/lib/red_amber/vector.rb
CHANGED
@@ -24,6 +24,8 @@ module RedAmber
|
|
24
24
|
a
|
25
25
|
in [Arrow::ChunkedArray => ca]
|
26
26
|
ca
|
27
|
+
in [arrow_array_like] if arrow_array_like.respond_to?(:to_arrow_array)
|
28
|
+
arrow_array_like.to_arrow_array
|
27
29
|
in [Range => r]
|
28
30
|
Arrow::Array.new(Array(r))
|
29
31
|
else
|
@@ -37,6 +39,11 @@ module RedAmber
|
|
37
39
|
end
|
38
40
|
|
39
41
|
attr_reader :data
|
42
|
+
|
43
|
+
def to_arrow_array
|
44
|
+
@data
|
45
|
+
end
|
46
|
+
|
40
47
|
attr_accessor :key
|
41
48
|
|
42
49
|
def to_s
|
@@ -187,12 +187,6 @@ module RedAmber
|
|
187
187
|
alias_method :ne, :not_equal
|
188
188
|
|
189
189
|
def coerce(other)
|
190
|
-
case other
|
191
|
-
when Vector, Array, Arrow::Array
|
192
|
-
raise VectorArgumentError, "Size unmatch: #{size} != #{other.length}" unless size == other.length
|
193
|
-
|
194
|
-
[Vector.new(Array(other)), self]
|
195
|
-
end
|
196
190
|
[Vector.new(Array(other) * size), self]
|
197
191
|
end
|
198
192
|
|
@@ -271,8 +265,6 @@ module RedAmber
|
|
271
265
|
find(function).execute([data, other.data], options)
|
272
266
|
when Arrow::Array, Arrow::ChunkedArray, Arrow::Scalar, Array, Numeric, String, TrueClass, FalseClass
|
273
267
|
find(function).execute([data, other], options)
|
274
|
-
else
|
275
|
-
raise VectorArgumentError, "Operand is not supported: #{other.class}"
|
276
268
|
end
|
277
269
|
end
|
278
270
|
|