red_amber 0.3.0 → 0.4.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.rubocop.yml +56 -22
- data/.yardopts +2 -0
- data/CHANGELOG.md +178 -0
- data/Gemfile +1 -1
- data/LICENSE +1 -1
- data/README.md +29 -30
- data/benchmark/basic.yml +7 -7
- data/benchmark/combine.yml +3 -3
- data/benchmark/dataframe.yml +15 -9
- data/benchmark/group.yml +6 -6
- data/benchmark/reshape.yml +6 -6
- data/benchmark/vector.yml +6 -3
- data/doc/DataFrame.md +32 -12
- data/doc/DataFrame_Comparison.md +65 -0
- data/doc/SubFrames.md +11 -0
- data/doc/Vector.md +207 -1
- data/doc/yard-templates/default/fulldoc/html/css/common.css +6 -0
- data/lib/red_amber/data_frame.rb +454 -85
- data/lib/red_amber/data_frame_combinable.rb +609 -115
- data/lib/red_amber/data_frame_displayable.rb +313 -34
- data/lib/red_amber/data_frame_indexable.rb +122 -19
- data/lib/red_amber/data_frame_loadsave.rb +78 -10
- data/lib/red_amber/data_frame_reshaping.rb +184 -14
- data/lib/red_amber/data_frame_selectable.rb +623 -70
- data/lib/red_amber/data_frame_variable_operation.rb +452 -35
- data/lib/red_amber/group.rb +186 -22
- data/lib/red_amber/helper.rb +74 -14
- data/lib/red_amber/refinements.rb +26 -6
- data/lib/red_amber/subframes.rb +1101 -0
- data/lib/red_amber/vector.rb +362 -11
- data/lib/red_amber/vector_aggregation.rb +312 -0
- data/lib/red_amber/vector_binary_element_wise.rb +506 -0
- data/lib/red_amber/vector_selectable.rb +265 -23
- data/lib/red_amber/vector_unary_element_wise.rb +529 -0
- data/lib/red_amber/vector_updatable.rb +278 -34
- data/lib/red_amber/version.rb +2 -1
- data/lib/red_amber.rb +13 -1
- data/red_amber.gemspec +2 -2
- metadata +13 -8
- data/doc/image/dataframe/reshaping_DataFrames.png +0 -0
- data/lib/red_amber/vector_functions.rb +0 -242
@@ -0,0 +1,312 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
# Available functions in Arrow are shown by `Arrow::Function.all.map(&:name)`
|
4
|
+
# reference: https://arrow.apache.org/docs/cpp/compute.html
|
5
|
+
|
6
|
+
module RedAmber
|
7
|
+
# Representing a series of data.
|
8
|
+
class Vector
|
9
|
+
class << self
|
10
|
+
private
|
11
|
+
|
12
|
+
# @!macro [attach] define_unary_aggregation
|
13
|
+
# [Unary aggregation function] Returns a scalar.
|
14
|
+
#
|
15
|
+
def define_unary_aggregation(function)
  # The generated instance method hands the function name and the caller's
  # keyword options to `exec_func_unary`, then returns whatever `get_scalar`
  # extracts from that result (both helpers live outside this file).
  define_method(function) do |**options|
    get_scalar(exec_func_unary(function, options))
  end
end
|
21
|
+
end
|
22
|
+
|
23
|
+
# Not implemented in red-arrow yet:
|
24
|
+
# Arrow::IndexOptions, Arrow::ModeOptions, Arrow::TDigestOptions
|
25
|
+
|
26
|
+
# @!macro scalar_aggregate_options
|
27
|
+
# @param skip_nulls [true, false]
|
28
|
+
# If true, nil values are ignored.
|
29
|
+
# Otherwise, if any value is nil, emit nil.
|
30
|
+
# @param min_count [Integer]
|
31
|
+
# If fewer than this many non-nil values are observed, emit nil.
|
32
|
+
# If skip_nulls is false, this option is not respected.
|
33
|
+
|
34
|
+
# @!macro count_options
|
35
|
+
# @param mode [:only_valid, :only_null, :all]
|
36
|
+
# control count aggregate kernel behavior.
|
37
|
+
# - only_valid: count only non-nil values.
|
38
|
+
# - only_null: count only nil.
|
39
|
+
# - all: count both.
|
40
|
+
|
41
|
+
# @!macro variance_options
|
42
|
+
# @param ddof [0, 1]
|
43
|
+
# Control Delta Degrees of Freedom (ddof) of Variance and Stddev kernel.
|
44
|
+
# The divisor used in calculations is N - ddof, where N is the number
|
45
|
+
# of elements. By default, ddof is zero, and population variance or stddev
|
46
|
+
# is returned.
|
47
|
+
# @macro scalar_aggregate_options
|
48
|
+
|
49
|
+
# Test whether all elements in self are evaluated to true.
|
50
|
+
#
|
51
|
+
# @!method all(skip_nulls: true, min_count: 1)
|
52
|
+
# @macro scalar_aggregate_options
|
53
|
+
# @return [true, false]
|
54
|
+
# `all` result of self.
|
55
|
+
# @example Default.
|
56
|
+
# Vector.new(true, true, nil).all # => true
|
57
|
+
#
|
58
|
+
# @example Do not skip nils.
|
59
|
+
# Vector.new(true, true, nil).all(skip_nulls: false) # => false
|
60
|
+
#
|
61
|
+
define_unary_aggregation :all
|
62
|
+
alias_method :all?, :all
|
63
|
+
|
64
|
+
# Test whether any elements in self are evaluated to true.
|
65
|
+
#
|
66
|
+
# @!method any(skip_nulls: true, min_count: 1)
|
67
|
+
# @macro scalar_aggregate_options
|
68
|
+
# @return [true, false]
|
69
|
+
# `any` result of self.
|
70
|
+
# @example Default.
|
71
|
+
# Vector.new(true, false, nil).any # => true
|
72
|
+
#
|
73
|
+
define_unary_aggregation :any
|
74
|
+
alias_method :any?, :any
|
75
|
+
|
76
|
+
# Approximate median of a numeric Vector with T-Digest algorithm.
|
77
|
+
#
|
78
|
+
# @!method approximate_median(skip_nulls: true, min_count: 1)
|
79
|
+
# @macro scalar_aggregate_options
|
80
|
+
# @return [Float]
|
81
|
+
# median of self.
|
82
|
+
# A nil is returned if there is no valid data point.
|
83
|
+
#
|
84
|
+
define_unary_aggregation :approximate_median
|
85
|
+
alias_method :median, :approximate_median
|
86
|
+
|
87
|
+
# Count the number of nil / non-nil values.
|
88
|
+
#
|
89
|
+
# @!method count(mode: :only_valid)
|
90
|
+
# @macro count_options
|
91
|
+
# @return [Integer] count of self.
|
92
|
+
# @example Count only non-nil (default)
|
93
|
+
# Vector.new(1.0, -2.0, Float::NAN, nil).count # => 3
|
94
|
+
#
|
95
|
+
# @example Count nil only.
|
96
|
+
# Vector.new(1.0, -2.0, Float::NAN, nil).count(mode: :only_null) # => 1
|
97
|
+
#
|
98
|
+
# @example Count both non-nil and nil.
|
99
|
+
# Vector.new(1.0, -2.0, Float::NAN, nil).count(mode: :all) # => 4
|
100
|
+
#
|
101
|
+
define_unary_aggregation :count
|
102
|
+
|
103
|
+
# Count the number of unique values.
|
104
|
+
#
|
105
|
+
# @!method count_distinct(mode: :only_valid)
|
106
|
+
# @macro count_options
|
107
|
+
# @return [Integer]
|
108
|
+
# unique count of self.
|
109
|
+
# @example
|
110
|
+
# vector = Vector.new(1, 1.0, nil, nil, Float::NAN, Float::NAN)
|
111
|
+
# vector
|
112
|
+
#
|
113
|
+
# # =>
|
114
|
+
# #<RedAmber::Vector(:double, size=6):0x000000000000d390>
|
115
|
+
# [1.0, 1.0, nil, nil, NaN, NaN]
|
116
|
+
#
|
117
|
+
# # Float::NANs are counted as 1.
|
118
|
+
# vector.count_uniq # => 2
|
119
|
+
#
|
120
|
+
# # nils are counted as 1.
|
121
|
+
# vector.count_uniq(mode: :only_null) # => 1
|
122
|
+
#
|
123
|
+
# vector.count_uniq(mode: :all) # => 3
|
124
|
+
#
|
125
|
+
define_unary_aggregation :count_distinct
|
126
|
+
alias_method :count_uniq, :count_distinct
|
127
|
+
|
128
|
+
# Compute maximum value of self.
|
129
|
+
#
|
130
|
+
# @!method max(skip_nulls: true, min_count: 1)
|
131
|
+
# @macro scalar_aggregate_options
|
132
|
+
# @return [Numeric]
|
133
|
+
# maximum value of self.
|
134
|
+
#
|
135
|
+
define_unary_aggregation :max
|
136
|
+
|
137
|
+
# Compute mean value of self.
|
138
|
+
#
|
139
|
+
# @!method mean(skip_nulls: true, min_count: 1)
|
140
|
+
# @macro scalar_aggregate_options
|
141
|
+
# @return [Numeric]
|
142
|
+
# mean of self.
|
143
|
+
#
|
144
|
+
define_unary_aggregation :mean
|
145
|
+
|
146
|
+
# Compute minimum value of self.
|
147
|
+
#
|
148
|
+
# @!method min(skip_nulls: true, min_count: 1)
|
149
|
+
# @macro scalar_aggregate_options
|
150
|
+
# @return [Numeric]
|
151
|
+
# minimum of self.
|
152
|
+
#
|
153
|
+
define_unary_aggregation :min
|
154
|
+
|
155
|
+
# Compute the min and max value of self.
|
156
|
+
#
|
157
|
+
# @!method min_max(skip_nulls: true, min_count: 1)
|
158
|
+
# @macro scalar_aggregate_options
|
159
|
+
# @return [Array<min, max>]
|
160
|
+
# min and max of self in an Array.
|
161
|
+
#
|
162
|
+
define_unary_aggregation :min_max
|
163
|
+
|
164
|
+
# Compute product value of self.
|
165
|
+
#
|
166
|
+
# @note Self must be a numeric Vector.
|
167
|
+
# @!method product(skip_nulls: true, min_count: 1)
|
168
|
+
# @macro scalar_aggregate_options
|
169
|
+
# @return [Numeric]
|
170
|
+
# product of self.
|
171
|
+
#
|
172
|
+
define_unary_aggregation :product
|
173
|
+
|
174
|
+
# Calculate standard deviation of self.
|
175
|
+
#
|
176
|
+
# @note Self must be a numeric Vector.
|
177
|
+
# @!method stddev(ddof: 0, skip_nulls: true, min_count: 1)
|
178
|
+
# @macro variance_options
|
179
|
+
# @return [Float]
|
180
|
+
# standard deviation of self. Biased (ddof=0) by default.
|
181
|
+
#
|
182
|
+
define_unary_aggregation :stddev
|
183
|
+
|
184
|
+
# Calculate unbiased standard deviation of self.
#
# Shorthand for `stddev` with `ddof` defaulting to 1. Any other keyword
# option is forwarded to `stddev` unchanged.
#
# @note Self must be a numeric Vector.
# @!method sd(ddof: 1, skip_nulls: true, min_count: 1)
# @macro variance_options
# @return [Float]
#   standard deviation of self. Unbiased (ddof=1) by default.
#
def sd(ddof: 1, **options)
  # Accept the documented keyword options instead of rejecting them with
  # an ArgumentError; only `ddof` gets a default that differs from `stddev`.
  stddev(ddof: ddof, **options)
end
|
195
|
+
alias_method :std, :sd
|
196
|
+
|
197
|
+
# Compute sum of self.
|
198
|
+
#
|
199
|
+
# @note Self must be a numeric Vector.
|
200
|
+
# @!method sum(skip_nulls: true, min_count: 1)
|
201
|
+
# @macro scalar_aggregate_options
|
202
|
+
# @return [Numeric]
|
203
|
+
# sum of self.
|
204
|
+
#
|
205
|
+
define_unary_aggregation :sum
|
206
|
+
|
207
|
+
# Calculate variance of self.
|
208
|
+
#
|
209
|
+
# @note Self must be a numeric Vector.
|
210
|
+
# @!method variance(ddof: 0, skip_nulls: true, min_count: 1)
|
211
|
+
# @macro variance_options
|
212
|
+
#
|
213
|
+
# @return [Float]
|
214
|
+
# variance of self. Pass `ddof: 1` to get the unbiased variance.
|
215
|
+
#
|
216
|
+
# @return [Float]
|
217
|
+
# variance of self. Biased (ddof=0) by default.
|
218
|
+
#
|
219
|
+
define_unary_aggregation :variance
|
220
|
+
|
221
|
+
# Calculate unbiased variance of self.
#
# Shorthand for `variance` with `ddof` defaulting to 1. Any other keyword
# option is forwarded to `variance` unchanged.
#
# @note Self must be a numeric Vector.
# @!method unbiased_variance(ddof: 1, skip_nulls: true, min_count: 1)
# @macro variance_options
# @return [Float]
#   variance of self. Unbiased (ddof=1) by default.
#
def unbiased_variance(ddof: 1, **options)
  # Accept the documented keyword options instead of rejecting them with
  # an ArgumentError; only `ddof` gets a default that differs from `variance`.
  variance(ddof: ddof, **options)
end
|
232
|
+
alias_method :var, :unbiased_variance
|
233
|
+
|
234
|
+
# @!macro quantile_interpolation
|
235
|
+
# @param interpolation [Symbol]
|
236
|
+
# specifies interpolation method to use,
|
237
|
+
# when the quantile lies between the data i and j.
|
238
|
+
# - Default value is :linear, which returns i + (j - i) * fraction.
|
239
|
+
# - lower: returns i.
|
240
|
+
# - higher: returns j.
|
241
|
+
# - nearest: returns i or j, whichever is closer.
|
242
|
+
# - midpoint: returns (i + j) / 2.
|
243
|
+
|
244
|
+
# Returns a quantile value.
# - With no argument, the 0.5 quantile (median) is returned.
# - Otherwise the quantile for the specified probability (prob) is returned.
# - If the quantile lies between two data points, an interpolated value is
#   returned based on the selected interpolation method.
# - Nils and NaNs are ignored.
# - Nil is returned if there is no valid data point.
#
# @param prob [Float]
#   probability.
# @macro quantile_interpolation
# @macro scalar_aggregate_options
# @return [Float]
#   quantile of self.
# @raise [VectorArgumentError]
#   if prob is not within the range 0..1.
# @example
#   penguins[:bill_depth_mm].quantile
#
#   # =>
#   17.3 # default is prob = 0.5
#
def quantile(prob = 0.5, interpolation: :linear, skip_nulls: true, min_count: 0)
  in_range = (0..1).cover?(prob)
  unless in_range
    raise VectorArgumentError,
          "Invalid: probability #{prob} must be between 0 and 1"
  end

  options = {
    q: prob,
    interpolation: interpolation,
    skip_nulls: skip_nulls,
    min_count: min_count,
  }
  result = find(:quantile).execute([data], **options)

  # Only one probability was requested, so only the first element is returned.
  result.value.to_a.first
end
|
277
|
+
|
278
|
+
# Return quantiles in a DataFrame
#
# @param probs [Array]
#   Array of probabilities. Default probabilities are 0.0, 0.25, 0.5, 0.75, 1.0.
# @macro quantile_interpolation
# @macro scalar_aggregate_options
# @return [DataFrame]
#   quantiles of self.
# @raise [VectorArgumentError]
#   if probs is empty or contains a value outside of the range 0..1.
# @example
#   penguins[:bill_depth_mm].quantiles([0.05, 0.95])
#
#   # =>
#   #<RedAmber::DataFrame : 2 x 2 Vectors, 0x000000000000fb2c>
#        probs quantiles
#     <double>  <double>
#   0     0.05      13.9
#   1     0.95      20.0
#
def quantiles(probs = [0.0, 0.25, 0.5, 0.75, 1.0],
              interpolation: :linear, skip_nulls: true, min_count: 0)
  # `[].all?` is true, so the empty case has to be rejected explicitly.
  if probs.empty? || !probs.all? { |q| (0..1).cover?(q) }
    raise VectorArgumentError, "Invalid probabilities #{probs}"
  end

  # One `quantile` call per probability, all sharing the same options.
  values = probs.map do |q|
    quantile(q,
             interpolation: interpolation, skip_nulls: skip_nulls,
             min_count: min_count)
  end

  DataFrame.new(probs: probs, quantiles: values)
end
|
311
|
+
end
|
312
|
+
end
|