red_amber 0.3.0 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.rubocop.yml +39 -20
- data/.yardopts +2 -0
- data/CHANGELOG.md +113 -0
- data/Gemfile +1 -1
- data/LICENSE +1 -1
- data/README.md +25 -26
- data/benchmark/basic.yml +2 -2
- data/benchmark/combine.yml +2 -2
- data/benchmark/dataframe.yml +2 -2
- data/benchmark/group.yml +2 -2
- data/benchmark/reshape.yml +2 -2
- data/benchmark/vector.yml +3 -0
- data/doc/DataFrame.md +32 -12
- data/doc/DataFrame_Comparison.md +65 -0
- data/doc/SubFrames.md +11 -0
- data/doc/Vector.md +207 -1
- data/doc/yard-templates/default/fulldoc/html/css/common.css +6 -0
- data/lib/red_amber/data_frame.rb +429 -75
- data/lib/red_amber/data_frame_combinable.rb +516 -66
- data/lib/red_amber/data_frame_displayable.rb +244 -14
- data/lib/red_amber/data_frame_indexable.rb +121 -18
- data/lib/red_amber/data_frame_loadsave.rb +78 -10
- data/lib/red_amber/data_frame_reshaping.rb +184 -14
- data/lib/red_amber/data_frame_selectable.rb +622 -66
- data/lib/red_amber/data_frame_variable_operation.rb +446 -34
- data/lib/red_amber/group.rb +187 -22
- data/lib/red_amber/helper.rb +70 -10
- data/lib/red_amber/refinements.rb +12 -5
- data/lib/red_amber/subframes.rb +1066 -0
- data/lib/red_amber/vector.rb +385 -11
- data/lib/red_amber/vector_aggregation.rb +312 -0
- data/lib/red_amber/vector_binary_element_wise.rb +387 -0
- data/lib/red_amber/vector_selectable.rb +217 -12
- data/lib/red_amber/vector_unary_element_wise.rb +436 -0
- data/lib/red_amber/vector_updatable.rb +278 -34
- data/lib/red_amber/version.rb +2 -1
- data/lib/red_amber.rb +13 -1
- data/red_amber.gemspec +2 -2
- metadata +13 -8
- data/doc/image/dataframe/reshaping_DataFrames.png +0 -0
- data/lib/red_amber/vector_functions.rb +0 -242
data/lib/red_amber/group.rb
CHANGED
@@ -1,16 +1,72 @@
|
|
1
1
|
# frozen_string_literal: true
|
2
2
|
|
3
3
|
module RedAmber
|
4
|
-
#
|
4
|
+
# Group class
|
5
5
|
class Group
|
6
6
|
include Enumerable # This feature is experimental
|
7
7
|
|
8
8
|
using RefineArrowTable
|
9
9
|
|
10
|
+
# Source DataFrame.
|
11
|
+
#
|
12
|
+
# @return [DataFrame]
|
13
|
+
# source DataFrame.
|
14
|
+
#
|
15
|
+
attr_reader :dataframe
|
16
|
+
|
17
|
+
# Keys for grouping by value.
|
18
|
+
#
|
19
|
+
# @return [Array]
|
20
|
+
# group keys.
|
21
|
+
#
|
22
|
+
attr_reader :group_keys
|
23
|
+
|
24
|
+
class << self
|
25
|
+
private
|
26
|
+
|
27
|
+
# @!macro [attach] define_group_aggregation
|
28
|
+
# @!method $1(*summary_keys)
|
29
|
+
# Group aggregation function `$1`.
|
30
|
+
# @param summary_keys [Array<Symbol, String>]
|
31
|
+
# summary keys.
|
32
|
+
# @return [DataFrame]
|
33
|
+
# aggregated DataFrame
|
34
|
+
#
|
35
|
+
def define_group_aggregation(function)
|
36
|
+
define_method(function) do |*summary_keys|
|
37
|
+
summary_keys = Array(summary_keys).flatten
|
38
|
+
d = summary_keys - @dataframe.keys
|
39
|
+
unless summary_keys.empty? || d.empty?
|
40
|
+
raise GroupArgumentError, "#{d} is not a key of\n #{@dataframe}."
|
41
|
+
end
|
42
|
+
|
43
|
+
table = @group.aggregate(*build_aggregation_keys("hash_#{function}",
|
44
|
+
summary_keys))
|
45
|
+
g = @group_keys.map(&:to_s)
|
46
|
+
DataFrame.new(table[g + (table.keys - g)])
|
47
|
+
end
|
48
|
+
end
|
49
|
+
end
|
50
|
+
|
10
51
|
# Creates a new Group object.
|
11
52
|
#
|
12
|
-
# @param dataframe [DataFrame]
|
13
|
-
#
|
53
|
+
# @param dataframe [DataFrame]
|
54
|
+
# dataframe to be grouped.
|
55
|
+
# @param group_keys [Array<Symbol, String>]
|
56
|
+
# keys for grouping.
|
57
|
+
# @return [Group]
|
58
|
+
# Group object.
|
59
|
+
# @example
|
60
|
+
# Group.new(penguins, :species)
|
61
|
+
#
|
62
|
+
# # =>
|
63
|
+
# #<RedAmber::Group : 0x000000000000f410>
|
64
|
+
# species group_count
|
65
|
+
# <string> <uint8>
|
66
|
+
# 0 Adelie 152
|
67
|
+
# 1 Chinstrap 68
|
68
|
+
# 2 Gentoo 124
|
69
|
+
#
|
14
70
|
def initialize(dataframe, *group_keys)
|
15
71
|
@dataframe = dataframe
|
16
72
|
@group_keys = group_keys.flatten
|
@@ -23,24 +79,7 @@ module RedAmber
|
|
23
79
|
@group = @dataframe.table.group(*@group_keys)
|
24
80
|
end
|
25
81
|
|
26
|
-
|
27
|
-
|
28
|
-
functions = %i[count sum product mean min max stddev variance]
|
29
|
-
functions.each do |function|
|
30
|
-
define_method(function) do |*summary_keys|
|
31
|
-
summary_keys = Array(summary_keys).flatten
|
32
|
-
d = summary_keys - @dataframe.keys
|
33
|
-
unless summary_keys.empty? || d.empty?
|
34
|
-
raise GroupArgumentError, "#{d} is not a key of\n #{@dataframe}."
|
35
|
-
end
|
36
|
-
|
37
|
-
table = @group.aggregate(*build_aggregation_keys("hash_#{function}",
|
38
|
-
summary_keys))
|
39
|
-
g = @group_keys.map(&:to_s)
|
40
|
-
DataFrame.new(table[g + (table.keys - g)])
|
41
|
-
end
|
42
|
-
end
|
43
|
-
|
82
|
+
define_group_aggregation(:count)
|
44
83
|
alias_method :__count, :count
|
45
84
|
private :__count
|
46
85
|
|
@@ -54,6 +93,26 @@ module RedAmber
|
|
54
93
|
end
|
55
94
|
end
|
56
95
|
|
96
|
+
define_group_aggregation(:sum)
|
97
|
+
|
98
|
+
define_group_aggregation(:product)
|
99
|
+
|
100
|
+
define_group_aggregation(:mean)
|
101
|
+
|
102
|
+
define_group_aggregation(:min)
|
103
|
+
|
104
|
+
define_group_aggregation(:max)
|
105
|
+
|
106
|
+
define_group_aggregation(:stddev)
|
107
|
+
|
108
|
+
define_group_aggregation(:variance)
|
109
|
+
|
110
|
+
# Returns an Array of boolean filters to select each record in the Group.
|
111
|
+
#
|
112
|
+
# @api private
|
113
|
+
# @return [Array]
|
114
|
+
# an Array of boolean filter Vectors.
|
115
|
+
#
|
57
116
|
def filters
|
58
117
|
@filters ||= begin
|
59
118
|
first, *others = @group_keys.map do |key|
|
@@ -69,6 +128,25 @@ module RedAmber
|
|
69
128
|
end
|
70
129
|
end
|
71
130
|
|
131
|
+
# Iterates over each record group as a DataFrame or returns an Enumerator.
|
132
|
+
#
|
133
|
+
# @api private
|
134
|
+
# @overload each
|
135
|
+
# Returns a new Enumerator if no block given.
|
136
|
+
#
|
137
|
+
# @return [Enumerator]
|
138
|
+
# Enumerator of each group as a DataFrame.
|
139
|
+
#
|
140
|
+
# @overload each
|
141
|
+
# When a block given, passes each record group as a DataFrame to the block.
|
142
|
+
#
|
143
|
+
# @yieldparam df [DataFrame]
|
144
|
+
# passes each record group as a DataFrame by a block parameter.
|
145
|
+
# @yieldreturn [Object]
|
146
|
+
# evaluated result value from the block.
|
147
|
+
# @return [Integer]
|
148
|
+
# group size.
|
149
|
+
#
|
72
150
|
def each
|
73
151
|
filters
|
74
152
|
return enum_for(:each) unless block_given?
|
@@ -79,14 +157,98 @@ module RedAmber
|
|
79
157
|
@filters.size
|
80
158
|
end
|
81
159
|
|
160
|
+
# Returns each record group size as a DataFrame.
|
161
|
+
#
|
162
|
+
# @return [DataFrame]
|
163
|
+
# DataFrame consists of:
|
164
|
+
# - Group key columns.
|
165
|
+
# - Result columns by group aggregation.
|
166
|
+
# @example
|
167
|
+
# penguins.group(:species).group_count
|
168
|
+
#
|
169
|
+
# # =>
|
170
|
+
# #<RedAmber::DataFrame : 3 x 2 Vectors, 0x0000000000003a70>
|
171
|
+
# species group_count
|
172
|
+
# <string> <uint8>
|
173
|
+
# 0 Adelie 152
|
174
|
+
# 1 Chinstrap 68
|
175
|
+
# 2 Gentoo 124
|
176
|
+
#
|
82
177
|
def group_count
|
83
178
|
DataFrame.create(add_columns_to_table(base_table, [:group_count], [group_counts]))
|
84
179
|
end
|
85
180
|
|
181
|
+
# String representation of self.
|
182
|
+
#
|
183
|
+
# @return [String]
|
184
|
+
# show information of self as a String.
|
185
|
+
# @example
|
186
|
+
# puts penguins.group(:species).inspect
|
187
|
+
#
|
188
|
+
# # =>
|
189
|
+
# #<RedAmber::Group : 0x0000000000003a98>
|
190
|
+
# species group_count
|
191
|
+
# <string> <uint8>
|
192
|
+
# 0 Adelie 152
|
193
|
+
# 1 Chinstrap 68
|
194
|
+
# 2 Gentoo 124
|
195
|
+
#
|
86
196
|
def inspect
|
87
197
|
"#<#{self.class} : #{format('0x%016x', object_id)}>\n#{group_count}"
|
88
198
|
end
|
89
199
|
|
200
|
+
# Summarize Group by aggregation functions from the block.
|
201
|
+
#
|
202
|
+
# @yieldparam group [Group]
|
203
|
+
# passes group object self.
|
204
|
+
# @yieldreturn [DataFrame, Array<DataFrame>]
|
205
|
+
# an aggregated DataFrame or an array of aggregated DataFrames.
|
206
|
+
# @return [DataFrame]
|
207
|
+
# summarized DataFrame.
|
208
|
+
# @example Single function and single variable
|
209
|
+
# group = penguins.group(:species)
|
210
|
+
# group
|
211
|
+
#
|
212
|
+
# # =>
|
213
|
+
# #<RedAmber::Group : 0x000000000000c314>
|
214
|
+
# species group_count
|
215
|
+
# <string> <uint8>
|
216
|
+
# 0 Adelie 152
|
217
|
+
# 1 Chinstrap 68
|
218
|
+
# 2 Gentoo 124
|
219
|
+
#
|
220
|
+
# group.summarize { mean(:bill_length_mm) }
|
221
|
+
#
|
222
|
+
# # =>
|
223
|
+
# #<RedAmber::DataFrame : 3 x 2 Vectors, 0x000000000000c364>
|
224
|
+
# species mean(bill_length_mm)
|
225
|
+
# <string> <double>
|
226
|
+
# 0 Adelie 38.79
|
227
|
+
# 1 Chinstrap 48.83
|
228
|
+
# 2 Gentoo 47.5
|
229
|
+
#
|
230
|
+
# @example Single function only
|
231
|
+
# group.summarize { mean }
|
232
|
+
#
|
233
|
+
# # =>
|
234
|
+
# #<RedAmber::DataFrame : 3 x 6 Vectors, 0x000000000000c350>
|
235
|
+
# species mean(bill_length_mm) mean(bill_depth_mm) ... mean(year)
|
236
|
+
# <string> <double> <double> ... <double>
|
237
|
+
# 0 Adelie 38.79 18.35 ... 2008.01
|
238
|
+
# 1 Chinstrap 48.83 18.42 ... 2007.97
|
239
|
+
# 2 Gentoo 47.5 14.98 ... 2008.08
|
240
|
+
#
|
241
|
+
# @example Multiple functions
|
242
|
+
# group.summarize { [min(:bill_length_mm), max(:bill_length_mm)] }
|
243
|
+
#
|
244
|
+
# # =>
|
245
|
+
# #<RedAmber::DataFrame : 3 x 3 Vectors, 0x000000000000c378>
|
246
|
+
# species min(bill_length_mm) max(bill_length_mm)
|
247
|
+
# <string> <double> <double>
|
248
|
+
# 0 Adelie 32.1 46.0
|
249
|
+
# 1 Chinstrap 40.9 58.0
|
250
|
+
# 2 Gentoo 40.9 59.6
|
251
|
+
#
|
90
252
|
def summarize(&block)
|
91
253
|
agg = instance_eval(&block)
|
92
254
|
case agg
|
@@ -99,7 +261,10 @@ module RedAmber
|
|
99
261
|
end
|
100
262
|
end
|
101
263
|
|
102
|
-
#
|
264
|
+
# Aggregating summary.
|
265
|
+
#
|
266
|
+
# @api private
|
267
|
+
#
|
103
268
|
def agg_sum(*summary_keys)
|
104
269
|
call_aggregating_function(:sum, summary_keys, _options = nil)
|
105
270
|
end
|
data/lib/red_amber/helper.rb
CHANGED
@@ -1,28 +1,33 @@
|
|
1
1
|
# frozen_string_literal: true
|
2
2
|
|
3
3
|
module RedAmber
|
4
|
-
#
|
4
|
+
# Mix-in for the class DataFrame
|
5
5
|
module Helper
|
6
6
|
private
|
7
7
|
|
8
8
|
# If num is larger than 1 return 's' to be plural.
|
9
9
|
#
|
10
|
-
# @param num [Numeric]
|
11
|
-
#
|
10
|
+
# @param num [Numeric]
|
11
|
+
# some number.
|
12
|
+
# @return ['s', '']
|
13
|
+
# return 's' if num is larger than 1.
|
12
14
|
# Otherwise return ''.
|
15
|
+
#
|
13
16
|
def pl(num)
|
14
17
|
num > 1 ? 's' : ''
|
15
18
|
end
|
16
19
|
|
17
|
-
# Parse the argments in an Array
|
18
|
-
# and returns a parsed Array.
|
20
|
+
# Parses the arguments in an Array and returns a parsed Array.
|
19
21
|
#
|
20
22
|
# @param args
|
21
23
|
# [<Integer, Symbol, true, false, nil, Array, Range, Enumerator, String, Float>]
|
22
24
|
# arguments.
|
23
|
-
# @param array_size [Integer]
|
24
|
-
#
|
25
|
+
# @param array_size [Integer]
|
26
|
+
# size of target Array to use in an endless Range.
|
27
|
+
# @return [<Integer, Symbol, true, false, nil>]
|
28
|
+
# parsed flat Array.
|
25
29
|
# @note This method is recursively called to parse.
|
30
|
+
#
|
26
31
|
def parse_args(args, array_size)
|
27
32
|
args.flat_map do |elem|
|
28
33
|
case elem
|
@@ -46,9 +51,13 @@ module RedAmber
|
|
46
51
|
|
47
52
|
# Parse a Range to an Array
|
48
53
|
#
|
49
|
-
# @param range [Range]
|
50
|
-
#
|
51
|
-
# @
|
54
|
+
# @param range [Range]
|
55
|
+
# range to parse.
|
56
|
+
# @param array_size [Integer]
|
57
|
+
# size of target Array to use in an endless Range.
|
58
|
+
# @return [Array<Integer, Symbol, String>]
|
59
|
+
# parsed Array.
|
60
|
+
#
|
52
61
|
def parse_range(range, array_size)
|
53
62
|
bg = range.begin
|
54
63
|
en = range.end
|
@@ -70,4 +79,55 @@ module RedAmber
|
|
70
79
|
end
|
71
80
|
end
|
72
81
|
end
|
82
|
+
|
83
|
+
# rubocop:disable Layout/LineLength
|
84
|
+
|
85
|
+
# Helper for Arrow Functions
|
86
|
+
module ArrowFunction
|
87
|
+
module_function
|
88
|
+
|
89
|
+
# Find Arrow's compute function.
|
90
|
+
#
|
91
|
+
# {https://arrow.apache.org/docs/cpp/compute.html}
|
92
|
+
# @param function_name [Symbol]
|
93
|
+
# function name.
|
94
|
+
# @return [Arrow::Function]
|
95
|
+
# arrow compute function object.
|
96
|
+
# @example
|
97
|
+
# RedAmber::ArrowFunction.find(:array_sort_indices)
|
98
|
+
#
|
99
|
+
# # =>
|
100
|
+
# #<Arrow::Function:0x7fa8838a0d80 ptr=0x7fa87e9b7320 array_sort_indices(array, {order=Ascending, null_placement=AtEnd}): Return the indices that would sort an array>
|
101
|
+
#
|
102
|
+
def find(function_name)
|
103
|
+
Arrow::Function.find(function_name)
|
104
|
+
end
|
105
|
+
|
106
|
+
# Show document of Arrow's compute function.
|
107
|
+
#
|
108
|
+
# @param function_name [Symbol]
|
109
|
+
# function name.
|
110
|
+
# @return [String]
|
111
|
+
# document of compute function object.
|
112
|
+
# @example
|
113
|
+
# puts RedAmber::ArrowFunction.arrow_doc(:array_sort_indices)
|
114
|
+
#
|
115
|
+
# # =>
|
116
|
+
# array_sort_indices(array, {order=Ascending, null_placement=AtEnd}): Return the indices that would sort an array
|
117
|
+
# ------------------
|
118
|
+
# This function computes an array of indices that define a stable sort
|
119
|
+
# of the input array. By default, Null values are considered greater
|
120
|
+
# than any other value and are therefore sorted at the end of the array.
|
121
|
+
# For floating-point types, NaNs are considered greater than any
|
122
|
+
# other non-null value, but smaller than null values.
|
123
|
+
#
|
124
|
+
# The handling of nulls and NaNs can be changed in ArraySortOptions.
|
125
|
+
#
|
126
|
+
def arrow_doc(function_name)
|
127
|
+
f = find(function_name)
|
128
|
+
"#{f}\n#{'-' * function_name.size}\n#{f.doc.description}"
|
129
|
+
end
|
130
|
+
end
|
131
|
+
|
132
|
+
# rubocop:enable Layout/LineLength
|
73
133
|
end
|
@@ -1,5 +1,6 @@
|
|
1
1
|
# frozen_string_literal: true
|
2
2
|
|
3
|
+
# Namespace of RedAmber
|
3
4
|
module RedAmber
|
4
5
|
# Add additional capabilities to Hash
|
5
6
|
module RefineHash
|
@@ -154,23 +155,27 @@ module RedAmber
|
|
154
155
|
# Add additional capabilities to Array
|
155
156
|
module RefineArray
|
156
157
|
refine Array do
|
157
|
-
def
|
158
|
+
def integer?
|
158
159
|
all? { |e| e.is_a?(Integer) } # rubocop:disable Performance/RedundantEqualityComparisonBlock
|
159
160
|
end
|
160
161
|
|
161
|
-
def
|
162
|
+
def numeric?
|
163
|
+
all? { |e| e.is_a?(Numeric) } # rubocop:disable Performance/RedundantEqualityComparisonBlock
|
164
|
+
end
|
165
|
+
|
166
|
+
def boolean?
|
162
167
|
all? { |e| e.is_a?(TrueClass) || e.is_a?(FalseClass) || e.is_a?(NilClass) }
|
163
168
|
end
|
164
169
|
|
165
|
-
def
|
170
|
+
def symbol?
|
166
171
|
all? { |e| e.is_a?(Symbol) } # rubocop:disable Performance/RedundantEqualityComparisonBlock
|
167
172
|
end
|
168
173
|
|
169
|
-
def
|
174
|
+
def string?
|
170
175
|
all? { |e| e.is_a?(String) } # rubocop:disable Performance/RedundantEqualityComparisonBlock
|
171
176
|
end
|
172
177
|
|
173
|
-
def
|
178
|
+
def symbol_or_string?
|
174
179
|
all? { |e| e.is_a?(Symbol) || e.is_a?(String) }
|
175
180
|
end
|
176
181
|
|
@@ -196,4 +201,6 @@ module RedAmber
|
|
196
201
|
end
|
197
202
|
end
|
198
203
|
end
|
204
|
+
|
205
|
+
private_constant :RefineArray, :RefineArrayLike, :RefineArrowTable, :RefineHash
|
199
206
|
end
|