red_amber 0.3.0 → 0.4.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (42) hide show
  1. checksums.yaml +4 -4
  2. data/.rubocop.yml +56 -22
  3. data/.yardopts +2 -0
  4. data/CHANGELOG.md +178 -0
  5. data/Gemfile +1 -1
  6. data/LICENSE +1 -1
  7. data/README.md +29 -30
  8. data/benchmark/basic.yml +7 -7
  9. data/benchmark/combine.yml +3 -3
  10. data/benchmark/dataframe.yml +15 -9
  11. data/benchmark/group.yml +6 -6
  12. data/benchmark/reshape.yml +6 -6
  13. data/benchmark/vector.yml +6 -3
  14. data/doc/DataFrame.md +32 -12
  15. data/doc/DataFrame_Comparison.md +65 -0
  16. data/doc/SubFrames.md +11 -0
  17. data/doc/Vector.md +207 -1
  18. data/doc/yard-templates/default/fulldoc/html/css/common.css +6 -0
  19. data/lib/red_amber/data_frame.rb +454 -85
  20. data/lib/red_amber/data_frame_combinable.rb +609 -115
  21. data/lib/red_amber/data_frame_displayable.rb +313 -34
  22. data/lib/red_amber/data_frame_indexable.rb +122 -19
  23. data/lib/red_amber/data_frame_loadsave.rb +78 -10
  24. data/lib/red_amber/data_frame_reshaping.rb +184 -14
  25. data/lib/red_amber/data_frame_selectable.rb +623 -70
  26. data/lib/red_amber/data_frame_variable_operation.rb +452 -35
  27. data/lib/red_amber/group.rb +186 -22
  28. data/lib/red_amber/helper.rb +74 -14
  29. data/lib/red_amber/refinements.rb +26 -6
  30. data/lib/red_amber/subframes.rb +1101 -0
  31. data/lib/red_amber/vector.rb +362 -11
  32. data/lib/red_amber/vector_aggregation.rb +312 -0
  33. data/lib/red_amber/vector_binary_element_wise.rb +506 -0
  34. data/lib/red_amber/vector_selectable.rb +265 -23
  35. data/lib/red_amber/vector_unary_element_wise.rb +529 -0
  36. data/lib/red_amber/vector_updatable.rb +278 -34
  37. data/lib/red_amber/version.rb +2 -1
  38. data/lib/red_amber.rb +13 -1
  39. data/red_amber.gemspec +2 -2
  40. metadata +13 -8
  41. data/doc/image/dataframe/reshaping_DataFrames.png +0 -0
  42. data/lib/red_amber/vector_functions.rb +0 -242
@@ -0,0 +1,312 @@
1
+ # frozen_string_literal: true
2
+
3
+ # Available functions in Arrow are shown by `Arrow::Function.all.map(&:name)`
4
+ # reference: https://arrow.apache.org/docs/cpp/compute.html
5
+
6
+ module RedAmber
7
+ # Representing a series of data.
8
+ class Vector
9
+ class << self
10
+ private
11
+
12
+ # @!macro [attach] define_unary_aggregation
13
+ # [Unary aggregation function] Returns a scalar.
14
+ #
15
+ def define_unary_aggregation(function) # function: name of the instance method to generate
16
+ define_method(function) do |**options| # the generated method takes arbitrary keyword options
17
+ datum = exec_func_unary(function, options) # keywords are handed over as one Hash, under the same name
18
+ get_scalar(datum) # NOTE(review): presumably unwraps the datum into a plain Ruby value - confirm in get_scalar
19
+ end
20
+ end
21
+ end
22
+
23
+ # Not implemented in red-arrow yet:
24
+ # Arrow::IndexOptions, Arrow::ModeOptions, Arrow::TDigestOptions
25
+
26
+ # @!macro scalar_aggregate_options
27
+ # @param skip_nulls [true, false]
28
+ # If true, nil values are ignored.
29
+ # Otherwise, if any value is nil, emit nil.
30
+ # @param min_count [Integer]
31
+ # if less than this many non-nil values are observed, emit nil.
32
+ # If skip_nulls is false, this option is not respected.
33
+
34
+ # @!macro count_options
35
+ # @param mode [:only_valid, :only_null, :all]
36
+ # control count aggregate kernel behavior.
37
+ # - only_valid: count only non-nil values.
38
+ # - only_null: count only nil.
39
+ # - all: count both.
40
+
41
+ # @!macro variance_options
42
+ # @param ddof [0, 1]
43
+ # Control Delta Degrees of Freedom (ddof) of Variance and Stddev kernel.
44
+ # The divisor used in calculations is N - ddof, where N is the number
45
+ # of elements. By default, ddof is zero, and population variance or stddev
46
+ # is returned.
47
+ # @macro scalar_aggregate_options
48
+
49
+ # Test whether all elements in self are evaluated to true.
50
+ #
51
+ # @!method all(skip_nulls: true, min_count: 1)
52
+ # @macro scalar_aggregate_options
53
+ # @return [true, false]
54
+ # `all` result of self.
55
+ # @example Default.
56
+ # Vector.new(true, true, nil).all # => true
57
+ #
58
+ # @example Do not skip nils.
59
+ # Vector.new(true, true, nil).all(skip_nulls: false) # => false
60
+ #
61
+ define_unary_aggregation :all
62
+ alias_method :all?, :all
63
+
64
+ # Test whether any elements in self are evaluated to true.
65
+ #
66
+ # @!method any(skip_nulls: true, min_count: 1)
67
+ # @macro scalar_aggregate_options
68
+ # @return [true, false]
69
+ # `any` result of self.
70
+ # @example Default.
71
+ # Vector.new(true, false, nil).any # => true
72
+ #
73
+ define_unary_aggregation :any
74
+ alias_method :any?, :any
75
+
76
+ # Approximate median of a numeric Vector with T-Digest algorithm.
77
+ #
78
+ # @!method approximate_median(skip_nulls: true, min_count: 1)
79
+ # @macro scalar_aggregate_options
80
+ # @return [Float]
81
+ # median of self.
82
+ # A nil is returned if there is no valid data point.
83
+ #
84
+ define_unary_aggregation :approximate_median
85
+ alias_method :median, :approximate_median
86
+
87
+ # Count the number of nil / non-nil values.
88
+ #
89
+ # @!method count(mode: :only_valid)
90
+ # @macro count_options
91
+ # @return [Integer] count of self.
92
+ # @example Count only non-nil (default)
93
+ # Vector.new(1.0, -2.0, Float::NAN, nil).count # => 3
94
+ #
95
+ # @example Count nil only.
96
+ # Vector.new(1.0, -2.0, Float::NAN, nil).count(mode: :only_null) # => 1
97
+ #
98
+ # @example Count both non-nil and nil.
99
+ # Vector.new(1.0, -2.0, Float::NAN, nil).count(mode: :all) # => 4
100
+ #
101
+ define_unary_aggregation :count
102
+
103
+ # Count the number of unique values.
104
+ #
105
+ # @!method count_distinct(mode: :only_valid)
106
+ # @macro count_options
107
+ # @return [Integer]
108
+ # unique count of self.
109
+ # @example
110
+ # vector = Vector.new(1, 1.0, nil, nil, Float::NAN, Float::NAN)
111
+ # vector
112
+ #
113
+ # # =>
114
+ # #<RedAmber::Vector(:double, size=6):0x000000000000d390>
115
+ # [1.0, 1.0, nil, nil, NaN, NaN]
116
+ #
117
+ # # Float::NANs are counted as 1.
118
+ # vector.count_uniq # => 2
119
+ #
120
+ # # nils are counted as 1.
121
+ # vector.count_uniq(mode: :only_null) # => 1
122
+ #
123
+ # vector.count_uniq(mode: :all) # => 3
124
+ #
125
+ define_unary_aggregation :count_distinct
126
+ alias_method :count_uniq, :count_distinct
127
+
128
+ # Compute maximum value of self.
129
+ #
130
+ # @!method max(skip_nulls: true, min_count: 1)
131
+ # @macro scalar_aggregate_options
132
+ # @return [Numeric]
133
+ # maximum value of self.
134
+ #
135
+ define_unary_aggregation :max
136
+
137
+ # Compute mean value of self.
138
+ #
139
+ # @!method mean(skip_nulls: true, min_count: 1)
140
+ # @macro scalar_aggregate_options
141
+ # @return [Numeric]
142
+ # mean of self.
143
+ #
144
+ define_unary_aggregation :mean
145
+
146
+ # Compute minimum value of self.
147
+ #
148
+ # @!method min(skip_nulls: true, min_count: 1)
149
+ # @macro scalar_aggregate_options
150
+ # @return [Numeric]
151
+ # minimum of self.
152
+ #
153
+ define_unary_aggregation :min
154
+
155
+ # Compute the min and max value of self.
156
+ #
157
+ # @!method min_max(skip_nulls: true, min_count: 1)
158
+ # @macro scalar_aggregate_options
159
+ # @return [Array<min, max>]
160
+ # min and max of self in an Array.
161
+ #
162
+ define_unary_aggregation :min_max
163
+
164
+ # Compute product value of self.
165
+ #
166
+ # @note Self must be a numeric Vector.
167
+ # @!method product(skip_nulls: true, min_count: 1)
168
+ # @macro scalar_aggregate_options
169
+ # @return [Numeric]
170
+ # product of self.
171
+ #
172
+ define_unary_aggregation :product
173
+
174
+ # Calculate standard deviation of self.
175
+ #
176
+ # @note Self must be a numeric Vector.
177
+ # @!method stddev(ddof: 0, skip_nulls: true, min_count: 1)
178
+ # @macro variance_options
179
+ # @return [Float]
180
+ # standard deviation of self. Biased (ddof=0) by default.
181
+ #
182
+ define_unary_aggregation :stddev
183
+
184
+ # Calculate unbiased standard deviation of self.
185
+ #
186
+ # @note Self must be a numeric Vector.
187
+ # @!method sd(ddof: 1, skip_nulls: true, min_count: 1)
188
+ # @macro variance_options
189
+ # @return [Float]
190
+ # standard deviation of self. Unbiased (ddof=1) by default.
191
+ #
192
+ def sd
193
+ stddev(ddof: 1)
194
+ end
195
+ alias_method :std, :sd
196
+
197
+ # Compute sum of self.
198
+ #
199
+ # @note Self must be a numeric Vector.
200
+ # @!method sum(skip_nulls: true, min_count: 1)
201
+ # @macro scalar_aggregate_options
202
+ # @return [Numeric]
203
+ # sum of self.
204
+ #
205
+ define_unary_aggregation :sum
206
+
207
+ # Calculate variance of self.
208
+ #
209
+ # @note Self must be a numeric Vector.
210
+ # @!method variance(ddof: 0, skip_nulls: true, min_count: 1)
211
+ # @macro variance_options
212
+ #
213
+ # @return [Float]
214
+ # unbiased variance of self when `ddof: 1` is given.
215
+ #
216
+ # @return [Float]
217
+ # variance of self. Biased (ddof=0) by default.
218
+ #
219
+ define_unary_aggregation :variance
220
+
221
+ # Calculate unbiased variance of self.
222
+ #
223
+ # @note self must be a numeric Vector.
224
+ # @!method unbiased_variance(ddof: 1, skip_nulls: true, min_count: 1)
225
+ # @macro variance_options
226
+ # @return [Float]
227
+ # variance of self. Unbiased (ddof=1) by default.
228
+ #
229
+ def unbiased_variance
230
+ variance(ddof: 1)
231
+ end
232
+ alias_method :var, :unbiased_variance
233
+
234
+ # @!macro quantile_interpolation
235
+ # @param interpolation [Symbol]
236
+ # specifies interpolation method to use,
237
+ # when the quantile lies between the data i and j.
238
+ # - Default value is :linear, which returns i + (j - i) * fraction.
239
+ # - lower: returns i.
240
+ # - higher: returns j.
241
+ # - nearest: returns i or j, whichever is closer.
242
+ # - midpoint: returns (i + j) / 2.
243
+
244
+ # Returns a quantile value.
245
+ # - 0.5 quantile (median) is returned by default.
246
+ # - Or return quantile for specified probability (prob).
247
+ # - If quantile lies between two data points, interpolated value is
248
+ # returned based on selected interpolation method.
249
+ # - Nils and NaNs are ignored.
250
+ # - Nil is returned if there is no valid data point.
251
+ #
252
+ # @param prob [Float]
253
+ # probability.
254
+ # @macro quantile_interpolation
255
+ # @macro scalar_aggregate_options
256
+ # @return [Float]
257
+ # quantile of self.
258
+ # @example
259
+ # penguins[:bill_depth_mm].quantile
260
+ #
261
+ # # =>
262
+ # 17.3 # default is prob = 0.5
263
+ #
264
+ def quantile(prob = 0.5, interpolation: :linear, skip_nulls: true, min_count: 0) # prob defaults to 0.5, i.e. the median
265
+ unless (0..1).cover? prob # reject a probability outside the closed range 0..1 before calling the function
266
+ raise VectorArgumentError,
267
+ "Invalid: probability #{prob} must be between 0 and 1"
268
+ end
269
+
270
+ datum = find(:quantile).execute([data], # NOTE(review): find presumably looks up the compute function by name - confirm
271
+ q: prob, # the function's option is named q, not prob
272
+ interpolation: interpolation,
273
+ skip_nulls: skip_nulls,
274
+ min_count: min_count)
275
+ datum.value.to_a.first # only one probability was requested, so return the first element of the result
276
+ end
277
+
278
+ # Return quantiles in a DataFrame
279
+ #
280
+ # @param probs [Array]
281
+ # Array of probabilities. Default probabilities are 0.0, 0.25, 0.5, 0.75, 1.0.
282
+ # @macro quantile_interpolation
283
+ # @macro scalar_aggregate_options
284
+ # @return [DataFrame]
285
+ # quantiles of self.
286
+ # @example
287
+ # penguins[:bill_depth_mm].quantiles([0.05, 0.95])
288
+ #
289
+ # # =>
290
+ # #<RedAmber::DataFrame : 2 x 2 Vectors, 0x000000000000fb2c>
291
+ # probs quantiles
292
+ # <double> <double>
293
+ # 0 0.05 13.9
294
+ # 1 0.95 20.0
295
+ #
296
+ def quantiles(probs = [0.0, 0.25, 0.5, 0.75, 1.0],
297
+ interpolation: :linear, skip_nulls: true, min_count: 0)
298
+ if probs.empty? || !probs.all? { |q| (0..1).cover?(q) }
299
+ raise VectorArgumentError, "Invarid probavilities #{probs}"
300
+ end
301
+
302
+ DataFrame.new(
303
+ probs: probs,
304
+ quantiles: probs.map do |q|
305
+ quantile(q,
306
+ interpolation: interpolation, skip_nulls: skip_nulls,
307
+ min_count: min_count)
308
+ end
309
+ )
310
+ end
311
+ end
312
+ end