red_amber 0.3.0 → 0.4.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (42) hide show
  1. checksums.yaml +4 -4
  2. data/.rubocop.yml +56 -22
  3. data/.yardopts +2 -0
  4. data/CHANGELOG.md +178 -0
  5. data/Gemfile +1 -1
  6. data/LICENSE +1 -1
  7. data/README.md +29 -30
  8. data/benchmark/basic.yml +7 -7
  9. data/benchmark/combine.yml +3 -3
  10. data/benchmark/dataframe.yml +15 -9
  11. data/benchmark/group.yml +6 -6
  12. data/benchmark/reshape.yml +6 -6
  13. data/benchmark/vector.yml +6 -3
  14. data/doc/DataFrame.md +32 -12
  15. data/doc/DataFrame_Comparison.md +65 -0
  16. data/doc/SubFrames.md +11 -0
  17. data/doc/Vector.md +207 -1
  18. data/doc/yard-templates/default/fulldoc/html/css/common.css +6 -0
  19. data/lib/red_amber/data_frame.rb +454 -85
  20. data/lib/red_amber/data_frame_combinable.rb +609 -115
  21. data/lib/red_amber/data_frame_displayable.rb +313 -34
  22. data/lib/red_amber/data_frame_indexable.rb +122 -19
  23. data/lib/red_amber/data_frame_loadsave.rb +78 -10
  24. data/lib/red_amber/data_frame_reshaping.rb +184 -14
  25. data/lib/red_amber/data_frame_selectable.rb +623 -70
  26. data/lib/red_amber/data_frame_variable_operation.rb +452 -35
  27. data/lib/red_amber/group.rb +186 -22
  28. data/lib/red_amber/helper.rb +74 -14
  29. data/lib/red_amber/refinements.rb +26 -6
  30. data/lib/red_amber/subframes.rb +1101 -0
  31. data/lib/red_amber/vector.rb +362 -11
  32. data/lib/red_amber/vector_aggregation.rb +312 -0
  33. data/lib/red_amber/vector_binary_element_wise.rb +506 -0
  34. data/lib/red_amber/vector_selectable.rb +265 -23
  35. data/lib/red_amber/vector_unary_element_wise.rb +529 -0
  36. data/lib/red_amber/vector_updatable.rb +278 -34
  37. data/lib/red_amber/version.rb +2 -1
  38. data/lib/red_amber.rb +13 -1
  39. data/red_amber.gemspec +2 -2
  40. metadata +13 -8
  41. data/doc/image/dataframe/reshaping_DataFrames.png +0 -0
  42. data/lib/red_amber/vector_functions.rb +0 -242
@@ -0,0 +1,312 @@
1
+ # frozen_string_literal: true
2
+
3
+ # Available functions in Arrow are shown by `Arrow::Function.all.map(&:name)`
4
+ # reference: https://arrow.apache.org/docs/cpp/compute.html
5
+
6
+ module RedAmber
7
+ # Representing a series of data.
8
+ class Vector
9
+ class << self
10
+ private
11
+
12
# @!macro [attach] define_unary_aggregation
#   [Unary aggregation function] Returns a scalar.
#
# Defines an instance method named +function+. The generated method
# collects its keyword arguments into a Hash, hands them to
# +exec_func_unary+ together with the function name, and converts the
# result with +get_scalar+.
#
# @param function [Symbol]
#   name of the method to define (also passed to +exec_func_unary+).
def define_unary_aggregation(function)
  define_method(function) do |**options|
    get_scalar(exec_func_unary(function, options))
  end
end
21
+ end
22
+
23
+ # Not implemented in red-arrow yet:
24
+ # Arrow::IndexOptions, Arrow::ModeOptions, Arrow::TDigestOptions
25
+
26
+ # @!macro scalar_aggregate_options
27
+ # @param skip_nulls [true, false]
28
+ # If true, nil values are ignored.
29
+ # Otherwise, if any value is nil, emit nil.
30
+ # @param min_count [Integer]
31
+ # if less than this many non-nil values are observed, emit nil.
32
+ # If skip_nulls is false, this option is not respected.
33
+
34
+ # @!macro count_options
35
+ # @param mode [:only_valid, :only_null, :all]
36
+ # control count aggregate kernel behavior.
37
+ # - only_valid: count only non-nil values.
38
+ # - only_null: count only nil.
39
+ # - all: count both.
40
+
41
+ # @!macro variance_options
42
+ # @param ddof [0, 1]
43
+ # Control Delta Degrees of Freedom (ddof) of Variance and Stddev kernel.
44
+ # The divisor used in calculations is N - ddof, where N is the number
45
+ # of elements. By default, ddof is zero, and population variance or stddev
46
+ # is returned.
47
+ # @macro scalar_aggregate_options
48
+
49
+ # Test whether all elements in self are evaluated to true.
50
+ #
51
+ # @!method all(skip_nulls: true, min_count: 1)
52
+ # @macro scalar_aggregate_options
53
+ # @return [true, false]
54
+ # `all` result of self.
55
+ # @example Default.
56
+ # Vector.new(true, true, nil).all # => true
57
+ #
58
+ # @example Do not skip nils.
59
+ # Vector.new(true, true, nil).all(skip_nulls: false) # => false
60
+ #
61
+ define_unary_aggregation :all
62
+ alias_method :all?, :all
63
+
64
+ # Test whether any elements in self are evaluated to true.
65
+ #
66
+ # @!method any(skip_nulls: true, min_count: 1)
67
+ # @macro scalar_aggregate_options
68
+ # @return [true, false]
69
+ # `any` result of self.
70
+ # @example Default.
71
+ # Vector.new(true, false, nil).any # => true
72
+ #
73
+ define_unary_aggregation :any
74
+ alias_method :any?, :any
75
+
76
+ # Approximate median of a numeric Vector with T-Digest algorithm.
77
+ #
78
+ # @!method approximate_median(skip_nulls: true, min_count: 1)
79
+ # @macro scalar_aggregate_options
80
+ # @return [Float]
81
+ # median of self.
82
+ # A nil is returned if there is no valid data point.
83
+ #
84
+ define_unary_aggregation :approximate_median
85
+ alias_method :median, :approximate_median
86
+
87
+ # Count the number of nil / non-nil values.
88
+ #
89
+ # @!method count(mode: :only_valid)
90
+ # @macro count_options
91
+ # @return [Integer] count of self.
92
+ # @example Count only non-nil (default)
93
+ # Vector.new(1.0, -2.0, Float::NAN, nil).count # => 3
94
+ #
95
+ # @example Count nil only.
96
+ # Vector.new(1.0, -2.0, Float::NAN, nil).count(mode: :only_null) # => 1
97
+ #
98
+ # @example Count both non-nil and nil.
99
+ # Vector.new(1.0, -2.0, Float::NAN, nil).count(mode: :all) # => 4
100
+ #
101
+ define_unary_aggregation :count
102
+
103
+ # Count the number of unique values.
104
+ #
105
+ # @!method count_distinct(mode: :only_valid)
106
+ # @macro count_options
107
+ # @return [Integer]
108
+ # unique count of self.
109
+ # @example
110
+ # vector = Vector.new(1, 1.0, nil, nil, Float::NAN, Float::NAN)
111
+ # vector
112
+ #
113
+ # # =>
114
+ # #<RedAmber::Vector(:double, size=6):0x000000000000d390>
115
+ # [1.0, 1.0, nil, nil, NaN, NaN]
116
+ #
117
+ # # Float::NANs are counted as 1.
118
+ # vector.count_uniq # => 2
119
+ #
120
+ # # nils are counted as 1.
121
+ # vector.count_uniq(mode: :only_null) # => 1
122
+ #
123
+ # vector.count_uniq(mode: :all) # => 3
124
+ #
125
+ define_unary_aggregation :count_distinct
126
+ alias_method :count_uniq, :count_distinct
127
+
128
+ # Compute maximum value of self.
129
+ #
130
+ # @!method max(skip_nulls: true, min_count: 1)
131
+ # @macro scalar_aggregate_options
132
+ # @return [Numeric]
133
+ # maximum value of self.
134
+ #
135
+ define_unary_aggregation :max
136
+
137
+ # Compute mean value of self.
138
+ #
139
+ # @!method mean(skip_nulls: true, min_count: 1)
140
+ # @macro scalar_aggregate_options
141
+ # @return [Numeric]
142
+ # mean of self.
143
+ #
144
+ define_unary_aggregation :mean
145
+
146
+ # Compute minimum value of self.
147
+ #
148
+ # @!method min(skip_nulls: true, min_count: 1)
149
+ # @macro scalar_aggregate_options
150
+ # @return [Numeric]
151
+ # minimum of self.
152
+ #
153
+ define_unary_aggregation :min
154
+
155
+ # Compute the min and max value of self.
156
+ #
157
+ # @!method min_max(skip_nulls: true, min_count: 1)
158
+ # @macro scalar_aggregate_options
159
+ # @return [Array<min, max>]
160
+ # min and max of self in an Array.
161
+ #
162
+ define_unary_aggregation :min_max
163
+
164
+ # Compute product value of self.
165
+ #
166
+ # @note Self must be a numeric Vector.
167
+ # @!method product(skip_nulls: true, min_count: 1)
168
+ # @macro scalar_aggregate_options
169
+ # @return [Numeric]
170
+ # product of self.
171
+ #
172
+ define_unary_aggregation :product
173
+
174
+ # Calculate standard deviation of self.
175
+ #
176
+ # @note Self must be a numeric Vector.
177
+ # @!method stddev(ddof: 0, skip_nulls: true, min_count: 1)
178
+ # @macro variance_options
179
+ # @return [Float]
180
+ # standard deviation of self. Biased (ddof=0) by default.
181
+ #
182
+ define_unary_aggregation :stddev
183
+
184
# Calculate unbiased standard deviation of self.
#
# Delegates to +stddev+ with +ddof: 1+ as the default. Any keyword
# options given are forwarded, so the documented keywords are accepted
# and an explicit +ddof:+ overrides the default.
#
# @note Self must be a numeric Vector.
# @!method sd(ddof: 1, skip_nulls: true, min_count: 1)
# @macro variance_options
# @return [Float]
#   standard deviation of self. Unbiased (ddof=1) by default.
#
def sd(**options)
  # Caller-supplied options come last so they win over the ddof default.
  stddev(ddof: 1, **options)
end
195
+ alias_method :std, :sd
196
+
197
+ # Compute sum of self.
198
+ #
199
+ # @note Self must be a numeric Vector.
200
+ # @!method sum(skip_nulls: true, min_count: 1)
201
+ # @macro scalar_aggregate_options
202
+ # @return [Numeric]
203
+ # sum of self.
204
+ #
205
+ define_unary_aggregation :sum
206
+
207
+ # Calculate variance of self.
208
+ #
209
+ # @note Self must be a numeric Vector.
210
+ # @!method variance(ddof: 0, skip_nulls: true, min_count: 1)
211
+ # @macro variance_options
212
+ #
213
+ # @return [Float]
217
+ # variance of self. Biased (ddof=0) by default.
218
+ #
219
+ define_unary_aggregation :variance
220
+
221
# Calculate unbiased variance of self.
#
# Delegates to +variance+ with +ddof: 1+ as the default. Any keyword
# options given are forwarded, so the documented keywords are accepted
# and an explicit +ddof:+ overrides the default.
#
# @note Self must be a numeric Vector.
# @!method unbiased_variance(ddof: 1, skip_nulls: true, min_count: 1)
# @macro variance_options
# @return [Float]
#   variance of self. Unbiased (ddof=1) by default.
#
def unbiased_variance(**options)
  # Caller-supplied options come last so they win over the ddof default.
  variance(ddof: 1, **options)
end
232
+ alias_method :var, :unbiased_variance
233
+
234
+ # @!macro quantile_interpolation
235
+ # @param interpolation [Symbol]
236
+ # specifies interpolation method to use,
237
+ # when the quantile lies between the data i and j.
238
+ # - Default value is :linear, which returns i + (j - i) * fraction.
239
+ # - lower: returns i.
240
+ # - higher: returns j.
241
+ # - nearest: returns i or j, whichever is closer.
242
+ # - midpoint: returns (i + j) / 2.
243
+
244
# Returns a quantile value.
# - The 0.5 quantile (median) is returned by default.
# - Otherwise the quantile for the specified probability (prob) is returned.
# - If the quantile lies between two data points, an interpolated value is
#   returned based on the selected interpolation method.
# - Nils and NaNs are ignored.
# - Nil is returned if there is no valid data point.
#
# @param prob [Float]
#   probability. Must be covered by the range 0..1.
# @macro quantile_interpolation
# @macro scalar_aggregate_options
# @return [Float]
#   quantile of self.
# @raise [VectorArgumentError]
#   if prob is not covered by 0..1.
# @example
#   penguins[:bill_depth_mm].quantile
#
#   # =>
#   17.3 # default is prob = 0.5
#
def quantile(prob = 0.5, interpolation: :linear, skip_nulls: true, min_count: 0)
  in_range = (0..1).cover?(prob)
  unless in_range
    raise VectorArgumentError,
          "Invalid: probability #{prob} must be between 0 and 1"
  end

  quantile_function = find(:quantile)
  result = quantile_function.execute(
    [data],
    q: prob,
    interpolation: interpolation,
    skip_nulls: skip_nulls,
    min_count: min_count
  )
  # Only one probability is requested, so only the first element is returned.
  result.value.to_a.first
end
277
+
278
# Return quantiles in a DataFrame.
#
# Each probability is evaluated with +quantile+, and the results are
# collected into a DataFrame with the columns +probs+ and +quantiles+.
#
# @param probs [Array]
#   Array of probabilities. Default probabilities are 0.0, 0.25, 0.5, 0.75, 1.0 .
# @macro quantile_interpolation
# @macro scalar_aggregate_options
# @return [DataFrame]
#   quantiles of self.
# @raise [VectorArgumentError]
#   if probs is empty or any probability is not covered by 0..1.
# @example
#   penguins[:bill_depth_mm].quantiles([0.05, 0.95])
#
#   # =>
#   #<RedAmber::DataFrame : 2 x 2 Vectors, 0x000000000000fb2c>
#        probs quantiles
#     <double>  <double>
#   0     0.05      13.9
#   1     0.95      20.0
#
def quantiles(probs = [0.0, 0.25, 0.5, 0.75, 1.0],
              interpolation: :linear, skip_nulls: true, min_count: 0)
  # Reject an empty list as well as any out-of-range probability up front.
  if probs.empty? || probs.any? { |q| !(0..1).cover?(q) }
    raise VectorArgumentError, "Invalid probabilities #{probs}"
  end

  values = probs.map do |q|
    quantile(q,
             interpolation: interpolation,
             skip_nulls: skip_nulls,
             min_count: min_count)
  end

  DataFrame.new(probs: probs, quantiles: values)
end
311
+ end
312
+ end