red_amber 0.3.0 → 0.4.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.rubocop.yml +56 -22
- data/.yardopts +2 -0
- data/CHANGELOG.md +178 -0
- data/Gemfile +1 -1
- data/LICENSE +1 -1
- data/README.md +29 -30
- data/benchmark/basic.yml +7 -7
- data/benchmark/combine.yml +3 -3
- data/benchmark/dataframe.yml +15 -9
- data/benchmark/group.yml +6 -6
- data/benchmark/reshape.yml +6 -6
- data/benchmark/vector.yml +6 -3
- data/doc/DataFrame.md +32 -12
- data/doc/DataFrame_Comparison.md +65 -0
- data/doc/SubFrames.md +11 -0
- data/doc/Vector.md +207 -1
- data/doc/yard-templates/default/fulldoc/html/css/common.css +6 -0
- data/lib/red_amber/data_frame.rb +454 -85
- data/lib/red_amber/data_frame_combinable.rb +609 -115
- data/lib/red_amber/data_frame_displayable.rb +313 -34
- data/lib/red_amber/data_frame_indexable.rb +122 -19
- data/lib/red_amber/data_frame_loadsave.rb +78 -10
- data/lib/red_amber/data_frame_reshaping.rb +184 -14
- data/lib/red_amber/data_frame_selectable.rb +623 -70
- data/lib/red_amber/data_frame_variable_operation.rb +452 -35
- data/lib/red_amber/group.rb +186 -22
- data/lib/red_amber/helper.rb +74 -14
- data/lib/red_amber/refinements.rb +26 -6
- data/lib/red_amber/subframes.rb +1101 -0
- data/lib/red_amber/vector.rb +362 -11
- data/lib/red_amber/vector_aggregation.rb +312 -0
- data/lib/red_amber/vector_binary_element_wise.rb +506 -0
- data/lib/red_amber/vector_selectable.rb +265 -23
- data/lib/red_amber/vector_unary_element_wise.rb +529 -0
- data/lib/red_amber/vector_updatable.rb +278 -34
- data/lib/red_amber/version.rb +2 -1
- data/lib/red_amber.rb +13 -1
- data/red_amber.gemspec +2 -2
- metadata +13 -8
- data/doc/image/dataframe/reshaping_DataFrames.png +0 -0
- data/lib/red_amber/vector_functions.rb +0 -242
@@ -0,0 +1,312 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
# Available functions in Arrow are shown by `Arrow::Function.all.map(&:name)`
|
4
|
+
# reference: https://arrow.apache.org/docs/cpp/compute.html
|
5
|
+
|
6
|
+
module RedAmber
|
7
|
+
# Representing a series of data.
|
8
|
+
class Vector
|
9
|
+
class << self
|
10
|
+
private
|
11
|
+
|
12
|
+
# @!macro [attach] define_unary_aggregation
#   [Unary aggregation function] Returns a scalar.
#
def define_unary_aggregation(function)
  # Generate an instance method named after `function`. The generated method
  # hands its keyword options to `exec_func_unary` as a Hash and unwraps the
  # resulting datum with `get_scalar`.
  define_method(function) do |**opts|
    get_scalar(exec_func_unary(function, opts))
  end
end
|
21
|
+
end
|
22
|
+
|
23
|
+
# Not implemented in red-arrow yet:
|
24
|
+
# Arrow::IndexOptions, Arrow::ModeOptions, Arrow::TDigestOptions
|
25
|
+
|
26
|
+
# @!macro scalar_aggregate_options
|
27
|
+
# @param skip_nulls [true, false]
|
28
|
+
# If true, nil values are ignored.
|
29
|
+
# Otherwise, if any value is nil, emit nil.
|
30
|
+
# @param min_count [Integer]
|
31
|
+
# if less than this many non-nil values are observed, emit nil.
|
32
|
+
# If skip_nulls is false, this option is not respected.
|
33
|
+
|
34
|
+
# @!macro count_options
|
35
|
+
# @param mode [:only_valid, :only_null, :all]
|
36
|
+
# control count aggregate kernel behavior.
|
37
|
+
# - only_valid: count only non-nil values.
|
38
|
+
# - only_null: count only nil.
|
39
|
+
# - all: count both.
|
40
|
+
|
41
|
+
# @!macro variance_options
|
42
|
+
# @param ddof [0, 1]
|
43
|
+
# Control Delta Degrees of Freedom (ddof) of Variance and Stddev kernel.
|
44
|
+
# The divisor used in calculations is N - ddof, where N is the number
|
45
|
+
# of elements. By default, ddof is zero, and population variance or stddev
|
46
|
+
# is returned.
|
47
|
+
# @macro scalar_aggregate_options
|
48
|
+
|
49
|
+
# Test whether all elements in self are evaluated to true.
|
50
|
+
#
|
51
|
+
# @!method all(skip_nulls: true, min_count: 1)
|
52
|
+
# @macro scalar_aggregate_options
|
53
|
+
# @return [true, false]
|
54
|
+
# `all` result of self.
|
55
|
+
# @example Default.
|
56
|
+
# Vector.new(true, true, nil).all # => true
|
57
|
+
#
|
58
|
+
# @example Do not skip nils.
|
59
|
+
# Vector.new(true, true, nil).all(skip_nulls: false) # => false
|
60
|
+
#
|
61
|
+
define_unary_aggregation :all
|
62
|
+
alias_method :all?, :all
|
63
|
+
|
64
|
+
# Test whether any elements in self are evaluated to true.
|
65
|
+
#
|
66
|
+
# @!method any(skip_nulls: true, min_count: 1)
|
67
|
+
# @macro scalar_aggregate_options
|
68
|
+
# @return [true, false]
|
69
|
+
# `any` result of self.
|
70
|
+
# @example Default.
|
71
|
+
# Vector.new(true, false, nil).any # => true
|
72
|
+
#
|
73
|
+
define_unary_aggregation :any
|
74
|
+
alias_method :any?, :any
|
75
|
+
|
76
|
+
# Approximate median of a numeric Vector with T-Digest algorithm.
|
77
|
+
#
|
78
|
+
# @!method approximate_median(skip_nulls: true, min_count: 1)
|
79
|
+
# @macro scalar_aggregate_options
|
80
|
+
# @return [Float]
|
81
|
+
# median of self.
|
82
|
+
# A nil is returned if there is no valid data point.
|
83
|
+
#
|
84
|
+
define_unary_aggregation :approximate_median
|
85
|
+
alias_method :median, :approximate_median
|
86
|
+
|
87
|
+
# Count the number of nil / non-nil values.
|
88
|
+
#
|
89
|
+
# @!method count(mode: :only_valid)
|
90
|
+
# @macro count_options
|
91
|
+
# @return [Integer] count of self.
|
92
|
+
# @example Count only non-nil (default)
|
93
|
+
# Vector.new(1.0, -2.0, Float::NAN, nil).count # => 3
|
94
|
+
#
|
95
|
+
# @example Count nil only.
|
96
|
+
# Vector.new(1.0, -2.0, Float::NAN, nil).count(mode: :only_null) # => 1
|
97
|
+
#
|
98
|
+
# @example Count both non-nil and nil.
|
99
|
+
# Vector.new(1.0, -2.0, Float::NAN, nil).count(mode: :all) # => 4
|
100
|
+
#
|
101
|
+
define_unary_aggregation :count
|
102
|
+
|
103
|
+
# Count the number of unique values.
|
104
|
+
#
|
105
|
+
# @!method count_distinct(mode: :only_valid)
|
106
|
+
# @macro count_options
|
107
|
+
# @return [Integer]
|
108
|
+
# unique count of self.
|
109
|
+
# @example
|
110
|
+
# vector = Vector.new(1, 1.0, nil, nil, Float::NAN, Float::NAN)
|
111
|
+
# vector
|
112
|
+
#
|
113
|
+
# # =>
|
114
|
+
# #<RedAmber::Vector(:double, size=6):0x000000000000d390>
|
115
|
+
# [1.0, 1.0, nil, nil, NaN, NaN]
|
116
|
+
#
|
117
|
+
# # Float::NANs are counted as 1.
|
118
|
+
# vector.count_uniq # => 2
|
119
|
+
#
|
120
|
+
# # nils are counted as 1.
|
121
|
+
# vector.count_uniq(mode: :only_null) # => 1
|
122
|
+
#
|
123
|
+
# vector.count_uniq(mode: :all) # => 3
|
124
|
+
#
|
125
|
+
define_unary_aggregation :count_distinct
|
126
|
+
alias_method :count_uniq, :count_distinct
|
127
|
+
|
128
|
+
# Compute maximum value of self.
|
129
|
+
#
|
130
|
+
# @!method max(skip_nulls: true, min_count: 1)
|
131
|
+
# @macro scalar_aggregate_options
|
132
|
+
# @return [Numeric]
|
133
|
+
# maximum value of self.
|
134
|
+
#
|
135
|
+
define_unary_aggregation :max
|
136
|
+
|
137
|
+
# Compute mean value of self.
|
138
|
+
#
|
139
|
+
# @!method mean(skip_nulls: true, min_count: 1)
|
140
|
+
# @macro scalar_aggregate_options
|
141
|
+
# @return [Numeric]
|
142
|
+
# mean of self.
|
143
|
+
#
|
144
|
+
define_unary_aggregation :mean
|
145
|
+
|
146
|
+
# Compute minimum value of self.
|
147
|
+
#
|
148
|
+
# @!method min(skip_nulls: true, min_count: 1)
|
149
|
+
# @macro scalar_aggregate_options
|
150
|
+
# @return [Numeric]
|
151
|
+
# minimum of self.
|
152
|
+
#
|
153
|
+
define_unary_aggregation :min
|
154
|
+
|
155
|
+
# Compute the min and max value of self.
|
156
|
+
#
|
157
|
+
# @!method min_max(skip_nulls: true, min_count: 1)
|
158
|
+
# @macro scalar_aggregate_options
|
159
|
+
# @return [Array<min, max>]
|
160
|
+
# min and max of self in an Array.
|
161
|
+
#
|
162
|
+
define_unary_aggregation :min_max
|
163
|
+
|
164
|
+
# Compute product value of self.
|
165
|
+
#
|
166
|
+
# @note Self must be a numeric Vector.
|
167
|
+
# @!method product(skip_nulls: true, min_count: 1)
|
168
|
+
# @macro scalar_aggregate_options
|
169
|
+
# @return [Numeric]
|
170
|
+
# product of self.
|
171
|
+
#
|
172
|
+
define_unary_aggregation :product
|
173
|
+
|
174
|
+
# Calculate standard deviation of self.
|
175
|
+
#
|
176
|
+
# @note Self must be a numeric Vector.
|
177
|
+
# @!method stddev(ddof: 0, skip_nulls: true, min_count: 1)
|
178
|
+
# @macro variance_options
|
179
|
+
# @return [Float]
|
180
|
+
# standard deviation of self. Biased (ddof=0) by default.
|
181
|
+
#
|
182
|
+
define_unary_aggregation :stddev
|
183
|
+
|
184
|
+
# Calculate unbiased standard deviation of self.
#
# @note Self must be a numeric Vector.
# @!method sd(ddof: 1, skip_nulls: true, min_count: 1)
# @macro variance_options
# @return [Float]
#   standard deviation of self. Unbiased (ddof=1) by default.
#
def sd(ddof: 1, **options)
  # Delegate to #stddev, changing only the default ddof from 0 to 1.
  # Remaining keyword options (e.g. skip_nulls, min_count) are forwarded
  # untouched so the documented signature is actually accepted.
  stddev(ddof: ddof, **options)
end
|
195
|
+
alias_method :std, :sd
|
196
|
+
|
197
|
+
# Compute sum of self.
|
198
|
+
#
|
199
|
+
# @note Self must be a numeric Vector.
|
200
|
+
# @!method sum(skip_nulls: true, min_count: 1)
|
201
|
+
# @macro scalar_aggregate_options
|
202
|
+
# @return [Numeric]
|
203
|
+
# sum of self.
|
204
|
+
#
|
205
|
+
define_unary_aggregation :sum
|
206
|
+
|
207
|
+
# Calculate variance of self.
|
208
|
+
#
|
209
|
+
# @note Self must be a numeric Vector.
|
210
|
+
# @!method variance(ddof: 0, skip_nulls: true, min_count: 1)
|
211
|
+
# @macro variance_options
|
212
|
+
#
|
213
|
+
# @return [Float]
|
214
|
+
# variance of self; biased (ddof=0) unless ddof: 1 is given.
|
215
|
+
#
|
216
|
+
# @return [Float]
|
217
|
+
# variance of self. Biased (ddof=0) by default.
|
218
|
+
#
|
219
|
+
define_unary_aggregation :variance
|
220
|
+
|
221
|
+
# Calculate unbiased variance of self.
#
# @note Self must be a numeric Vector.
# @!method unbiased_variance(ddof: 1, skip_nulls: true, min_count: 1)
# @macro variance_options
# @return [Float]
#   variance of self. Unbiased (ddof=1) by default.
#
def unbiased_variance(ddof: 1, **options)
  # Delegate to #variance, changing only the default ddof from 0 to 1.
  # Remaining keyword options (e.g. skip_nulls, min_count) are forwarded
  # untouched so the documented signature is actually accepted.
  variance(ddof: ddof, **options)
end
|
232
|
+
alias_method :var, :unbiased_variance
|
233
|
+
|
234
|
+
# @!macro quantile_interpolation
|
235
|
+
# @param interpolation [Symbol]
|
236
|
+
# specifies interpolation method to use,
|
237
|
+
# when the quantile lies between the data i and j.
|
238
|
+
# - Default value is :linear, which returns i + (j - i) * fraction.
|
239
|
+
# - lower: returns i.
|
240
|
+
# - higher: returns j.
|
241
|
+
# - nearest: returns i or j, whichever is closer.
|
242
|
+
# - midpoint: returns (i + j) / 2.
|
243
|
+
|
244
|
+
# Returns a quantile value.
#   - The 0.5 quantile (median) is returned by default.
#   - Otherwise the quantile for the specified probability (prob) is returned.
#   - If the quantile lies between two data points, an interpolated value is
#     returned based on the selected interpolation method.
#   - Nils and NaNs are ignored.
#   - Nil is returned if there is no valid data point.
#
# @param prob [Float]
#   probability.
# @macro quantile_interpolation
# @macro scalar_aggregate_options
# @return [Float]
#   quantile of self.
# @raise [VectorArgumentError]
#   if prob is not within 0..1.
# @example
#   penguins[:bill_depth_mm].quantile
#
#   # =>
#   17.3 # default is prob = 0.5
#
def quantile(prob = 0.5, interpolation: :linear, skip_nulls: true, min_count: 0)
  # Reject anything outside the closed interval [0, 1] before calling the function.
  raise VectorArgumentError, "Invalid: probability #{prob} must be between 0 and 1" unless (0..1).cover?(prob)

  function_options = {
    q: prob,
    interpolation: interpolation,
    skip_nulls: skip_nulls,
    min_count: min_count,
  }
  result = find(:quantile).execute([data], **function_options)
  # Only one probability was requested, so take the first element of the result.
  result.value.to_a.first
end
|
277
|
+
|
278
|
+
# Return quantiles in a DataFrame.
#
# @param probs [Array]
#   Array of probabilities. Default probabilities are 0.0, 0.25, 0.5, 0.75, 1.0.
# @macro quantile_interpolation
# @macro scalar_aggregate_options
# @return [DataFrame]
#   quantiles of self.
# @raise [VectorArgumentError]
#   if probs is empty or contains a value that is not within 0..1.
# @example
#   penguins[:bill_depth_mm].quantiles([0.05, 0.95])
#
#   # =>
#   #<RedAmber::DataFrame : 2 x 2 Vectors, 0x000000000000fb2c>
#        probs quantiles
#     <double>  <double>
#   0     0.05      13.9
#   1     0.95      20.0
#
def quantiles(probs = [0.0, 0.25, 0.5, 0.75, 1.0],
              interpolation: :linear, skip_nulls: true, min_count: 0)
  # Every probability must lie in [0, 1], and at least one must be given.
  if probs.empty? || !probs.all? { |q| (0..1).cover?(q) }
    raise VectorArgumentError, "Invalid probabilities #{probs}"
  end

  # Compute one quantile per requested probability, sharing the same options.
  values = probs.map do |q|
    quantile(q,
             interpolation: interpolation, skip_nulls: skip_nulls,
             min_count: min_count)
  end

  DataFrame.new(probs: probs, quantiles: values)
end
|
311
|
+
end
|
312
|
+
end
|