red_amber 0.2.3 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (42) hide show
  1. checksums.yaml +4 -4
  2. data/.rubocop.yml +133 -51
  3. data/.yardopts +2 -0
  4. data/CHANGELOG.md +203 -1
  5. data/Gemfile +2 -1
  6. data/LICENSE +1 -1
  7. data/README.md +61 -45
  8. data/benchmark/basic.yml +11 -4
  9. data/benchmark/combine.yml +3 -4
  10. data/benchmark/dataframe.yml +62 -0
  11. data/benchmark/group.yml +7 -1
  12. data/benchmark/reshape.yml +6 -2
  13. data/benchmark/vector.yml +63 -0
  14. data/doc/DataFrame.md +35 -12
  15. data/doc/DataFrame_Comparison.md +65 -0
  16. data/doc/SubFrames.md +11 -0
  17. data/doc/Vector.md +295 -1
  18. data/doc/yard-templates/default/fulldoc/html/css/common.css +6 -0
  19. data/lib/red_amber/data_frame.rb +537 -68
  20. data/lib/red_amber/data_frame_combinable.rb +776 -123
  21. data/lib/red_amber/data_frame_displayable.rb +248 -18
  22. data/lib/red_amber/data_frame_indexable.rb +122 -19
  23. data/lib/red_amber/data_frame_loadsave.rb +81 -10
  24. data/lib/red_amber/data_frame_reshaping.rb +216 -21
  25. data/lib/red_amber/data_frame_selectable.rb +781 -120
  26. data/lib/red_amber/data_frame_variable_operation.rb +561 -85
  27. data/lib/red_amber/group.rb +195 -21
  28. data/lib/red_amber/helper.rb +114 -32
  29. data/lib/red_amber/refinements.rb +206 -0
  30. data/lib/red_amber/subframes.rb +1066 -0
  31. data/lib/red_amber/vector.rb +435 -58
  32. data/lib/red_amber/vector_aggregation.rb +312 -0
  33. data/lib/red_amber/vector_binary_element_wise.rb +387 -0
  34. data/lib/red_amber/vector_selectable.rb +321 -69
  35. data/lib/red_amber/vector_unary_element_wise.rb +436 -0
  36. data/lib/red_amber/vector_updatable.rb +397 -24
  37. data/lib/red_amber/version.rb +2 -1
  38. data/lib/red_amber.rb +15 -1
  39. data/red_amber.gemspec +4 -3
  40. metadata +19 -11
  41. data/doc/image/dataframe/reshaping_DataFrames.png +0 -0
  42. data/lib/red_amber/vector_functions.rb +0 -294
@@ -1,294 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- # Available functions in Arrow are shown by `Arrow::Function.all.map(&:name)`
4
- # reference: https://arrow.apache.org/docs/cpp/compute.html
5
-
6
- # Not implemented in Red Arrow 8.0.0
7
- # divmod, # '%',
8
- # true_unless_null
9
-
10
- module RedAmber
11
- # mix-ins for class Vector
12
- module VectorFunctions
13
- # [Unary aggregations]: vector.func => scalar
14
- unary_aggregations =
15
- %i[all any approximate_median count count_distinct max mean min min_max product stddev sum variance]
16
- unary_aggregations.each do |function|
17
- define_method(function) do |**options|
18
- datum = exec_func_unary(function, options)
19
- get_scalar(datum)
20
- end
21
- end
22
- alias_method :median, :approximate_median
23
- alias_method :count_uniq, :count_distinct
24
- alias_method :all?, :all
25
- alias_method :any?, :any
26
-
27
- def unbiased_variance
28
- variance(ddof: 1)
29
- end
30
- alias_method :var, :unbiased_variance
31
-
32
- def sd
33
- stddev(ddof: 1)
34
- end
35
- alias_method :std, :sd
36
-
37
- # Return quantile
38
- # 0.5 quantile (median) is returned by default.
39
- # Or return quantile for specified probability (prob).
40
- # If quantile lies between two data points, interpolated value is
41
- # returned based on selected interpolation method.
42
- # Nils and NaNs are ignored.
43
- # Nil is returned if there are no valid data points.
44
- #
45
- # @param prob [Float] probability.
46
- # @param interpolation [Symbol] specifies interpolation method to use,
47
- # when the quantile lies between the data i and j.
48
- # - Default value is :linear, which returns i + (j - i) * fraction.
49
- # - :lower returns i.
50
- # - :higher returns j.
51
- # - :nearest returns i or j, whichever is closer.
52
- # - :midpoint returns (i + j) / 2.
53
- # @param skip_nils [Boolean] whether to ignore nil.
54
- # @param min_count [Integer] min count.
55
- # @return [Float] quantile.
56
- def quantile(prob = 0.5, interpolation: :linear, skip_nils: true, min_count: 0)
57
- raise VectorArgumentError, "Invalid: probability #{prob} must be between 0 and 1" unless (0..1).cover? prob
58
-
59
- datum = find(:quantile).execute([data],
60
- q: prob,
61
- interpolation: interpolation,
62
- skip_nulls: skip_nils,
63
- min_count: min_count)
64
- datum.value.to_a.first
65
- end
66
-
67
- # Return quantiles in a DataFrame
68
- #
69
- def quantiles(probs = [1.0, 0.75, 0.5, 0.25, 0.0], interpolation: :linear, skip_nils: true, min_count: 0)
70
- if probs.empty? || !probs.all? { |q| (0..1).cover?(q) }
71
- raise VectorArgumentError, "Invarid probavilities #{probs}"
72
- end
73
-
74
- DataFrame.new(
75
- probs: probs,
76
- quantiles: probs.map do |q|
77
- quantile(q, interpolation: interpolation, skip_nils: skip_nils, min_count: min_count)
78
- end
79
- )
80
- end
81
-
82
- # [Unary element-wise]: vector.func => vector
83
- unary_element_wise =
84
- %i[abs acos asin array_sort_indices atan bit_wise_not ceil cos fill_null_backward \
85
- fill_null_forward floor is_finite is_inf is_nan is_null is_valid ln log10 log1p log2 \
86
- round round_to_multiple sign sin tan trunc unique]
87
- unary_element_wise.each do |function|
88
- define_method(function) do |**options|
89
- datum = exec_func_unary(function, options)
90
- Vector.new(datum.value)
91
- end
92
- end
93
- alias_method :is_nil, :is_null
94
-
95
- def is_na
96
- numeric? ? (is_nil | is_nan) : is_nil
97
- end
98
-
99
- alias_method :fill_nil_backward, :fill_null_backward
100
- alias_method :fill_nil_forward, :fill_null_forward
101
-
102
- alias_method :sort_indexes, :array_sort_indices
103
- alias_method :sort_indices, :array_sort_indices
104
- alias_method :sort_index, :array_sort_indices
105
-
106
- alias_method :uniq, :unique
107
-
108
- # [Unary element-wise with operator]: vector.func => vector, op vector
109
- unary_element_wise_op = {
110
- invert: '!',
111
- negate: '-@',
112
- }
113
- unary_element_wise_op.each do |function, operator|
114
- define_method(function) do |**options|
115
- datum = exec_func_unary(function, options)
116
- Vector.new(datum.value)
117
- end
118
-
119
- define_method(operator) do |**options|
120
- datum = exec_func_unary(function, options)
121
- Vector.new(datum.value)
122
- end
123
- end
124
- alias_method :not, :invert
125
-
126
- # [Binary element-wise]: vector.func(other) => vector
127
- binary_element_wise =
128
- %i[atan2 and_not and_not_kleene bit_wise_and bit_wise_or bit_wise_xor logb]
129
- binary_element_wise.each do |function|
130
- define_method(function) do |other, **options|
131
- datum = exec_func_binary(function, other, options)
132
- Vector.new(datum.value)
133
- end
134
- end
135
-
136
- # [Logical binary element-wise]: vector.func(other) => vector
137
- logical_binary_element_wise = {
138
- '&': :and_kleene,
139
- and_kleene: :and_kleene,
140
- and_org: :and,
141
- '|': :or_kleene,
142
- or_kleene: :or_kleene,
143
- or_org: :or,
144
- }
145
- logical_binary_element_wise.each do |method, function|
146
- define_method(method) do |other, **options|
147
- datum = exec_func_binary(function, other, options)
148
- Vector.new(datum.value)
149
- end
150
- end
151
-
152
- # [Binary element-wise with operator]: vector.func(other) => vector
153
- binary_element_wise_op = {
154
- add: '+',
155
- divide: '/',
156
- multiply: '*',
157
- power: '**',
158
- subtract: '-',
159
-
160
- xor: '^',
161
- shift_left: '<<',
162
- shift_right: '>>',
163
-
164
- equal: '==',
165
- greater: '>',
166
- greater_equal: '>=',
167
- less: '<',
168
- less_equal: '<=',
169
- not_equal: '!=',
170
- }
171
- binary_element_wise_op.each do |function, operator|
172
- define_method(function) do |other, **options|
173
- datum = exec_func_binary(function, other, options)
174
- Vector.new(datum.value)
175
- end
176
-
177
- define_method(operator) do |other, **options|
178
- datum = exec_func_binary(function, other, options)
179
- Vector.new(datum.value)
180
- end
181
- end
182
- alias_method :eq, :equal
183
- alias_method :ge, :greater_equal
184
- alias_method :gt, :greater
185
- alias_method :le, :less_equal
186
- alias_method :lt, :less
187
- alias_method :ne, :not_equal
188
-
189
- def coerce(other)
190
- [Vector.new(Array(other) * size), self]
191
- end
192
-
193
- # < Not implemented yet > ---
194
-
195
- # option(s) required
196
- # - index
197
-
198
- # Returns other than value
199
- # - mode
200
- # - tdigest
201
-
202
- # Functions with numerical range check (unary)
203
- # - abs_checked acos_checked asin_checked cos_checked ln_checked
204
- # log10_checked log1p_checked log2_checked sin_checked tan_checked
205
-
206
- # Functions with numerical range check (binary)
207
- # - add_checked divide_checked logb_checked multiply_checked power_checked subtract_checked
208
- # shift_left_checked shift_right_checked
209
-
210
- # (array functions)
211
- # dictionary_encode,
212
- # partition_nth_indices,
213
- # quarter, quarters_between,
214
-
215
- # (strings)
216
- # ascii_capitalize, ascii_center, ascii_is_alnum, ascii_is_alpha, ascii_is_decimal,
217
- # ascii_is_lower, ascii_is_printable, ascii_is_space, ascii_is_title, ascii_is_upper,
218
- # ascii_lower, ascii_lpad, ascii_ltrim, ascii_ltrim_whitespace, ascii_reverse,
219
- # ascii_rpad, ascii_rtrim, ascii_rtrim_whitespace, ascii_split_whitespace,
220
- # ascii_swapcase, ascii_title, ascii_trim, ascii_trim_whitespace, ascii_upper,
221
- # binary_join, binary_join_element_wise, binary_length, binary_repeat,
222
- # binary_replace_slice, binary_reverse, count_substring, count_substring_regex,
223
- # ends_with, extract_regex, find_substring, find_substring_regex,
224
- # match_like, match_substring, match_substring_regex, replace_substring,
225
- # replace_substring_regex, split_pattern, split_pattern_regex, starts_with,
226
- # string_is_ascii, utf8_capitalize, utf8_center, utf8_is_alnum, utf8_is_alpha,
227
- # utf8_is_decimal, utf8_is_digit, utf8_is_lower, utf8_is_numeric, utf8_is_printable,
228
- # utf8_is_space, utf8_is_title, utf8_is_upper, utf8_length, utf8_lower, utf8_lpad,
229
- # utf8_ltrim, utf8_ltrim_whitespace, utf8_normalize, utf8_replace_slice, utf8_reverse,
230
- # utf8_rpad, utf8_rtrim, utf8_rtrim_whitespace, utf8_slice_codeunits, utf8_split_whitespace,
231
- # utf8_swapcase, utf8_title, utf8_trim, utf8_trim_whitespace, utf8_upper
232
-
233
- # (temporal)
234
- # assume_timezone, ceil_temporal, day, day_of_week, day_of_year, day_time_interval_between,
235
- # days_between, floor_temporal, hour, hours_between, iso_calendar, iso_week, iso_year,
236
- # microsecond, microseconds_between, millisecond, milliseconds_between, minute,
237
- # minutes_between, month, month_day_nano_interval_between, month_interval_between,
238
- # nanosecond, nanoseconds_between, round_temporal, second, seconds_between, strftime,
239
- # strptime, subsecond, us_week, week, weeks_between, year, year_month_day, years_between
240
-
241
- # (conditional)
242
- # case_when, cast,
243
-
244
- # (indices)
245
- # choose, index_in, index_in_meta_binary, indices_nonzero
246
-
247
- # (others)
248
- # coalesce,
249
- # is_in_meta_binary,
250
- # list_element, list_flatten, list_parent_indices, list_value_length, make_struct,
251
- # max_element_wise, min_element_wise, random, select_k_unstable,
252
- # struct_field,
253
-
254
- private # =======
255
-
256
- def exec_func_unary(function, options)
257
- options = nil if options.empty?
258
- find(function).execute([data], options)
259
- end
260
-
261
- def exec_func_binary(function, other, options)
262
- options = nil if options.empty?
263
- case other
264
- when Vector
265
- find(function).execute([data, other.data], options)
266
- when Arrow::Array, Arrow::ChunkedArray, Arrow::Scalar, Array, Numeric, String, TrueClass, FalseClass
267
- find(function).execute([data, other], options)
268
- end
269
- end
270
-
271
- def get_scalar(datum)
272
- output = datum.value
273
- case output
274
- when Arrow::StringScalar then output.to_s
275
- when Arrow::StructScalar
276
- output.value.map { |s| s.is_a?(Arrow::StringScalar) ? s.to_s : s.value }
277
- else
278
- output.value
279
- end
280
- end
281
-
282
- module_function # ======
283
-
284
- def find(function_name)
285
- Arrow::Function.find(function_name)
286
- end
287
-
288
- # temporary API until RedAmber document prepared.
289
- def arrow_doc(function_name)
290
- f = find(function_name)
291
- "#{f}\n#{'-' * function_name.size}\n#{f.doc.description}"
292
- end
293
- end
294
- end