red_amber 0.2.3 → 0.4.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (42) hide show
  1. checksums.yaml +4 -4
  2. data/.rubocop.yml +133 -51
  3. data/.yardopts +2 -0
  4. data/CHANGELOG.md +203 -1
  5. data/Gemfile +2 -1
  6. data/LICENSE +1 -1
  7. data/README.md +61 -45
  8. data/benchmark/basic.yml +11 -4
  9. data/benchmark/combine.yml +3 -4
  10. data/benchmark/dataframe.yml +62 -0
  11. data/benchmark/group.yml +7 -1
  12. data/benchmark/reshape.yml +6 -2
  13. data/benchmark/vector.yml +63 -0
  14. data/doc/DataFrame.md +35 -12
  15. data/doc/DataFrame_Comparison.md +65 -0
  16. data/doc/SubFrames.md +11 -0
  17. data/doc/Vector.md +295 -1
  18. data/doc/yard-templates/default/fulldoc/html/css/common.css +6 -0
  19. data/lib/red_amber/data_frame.rb +537 -68
  20. data/lib/red_amber/data_frame_combinable.rb +776 -123
  21. data/lib/red_amber/data_frame_displayable.rb +248 -18
  22. data/lib/red_amber/data_frame_indexable.rb +122 -19
  23. data/lib/red_amber/data_frame_loadsave.rb +81 -10
  24. data/lib/red_amber/data_frame_reshaping.rb +216 -21
  25. data/lib/red_amber/data_frame_selectable.rb +781 -120
  26. data/lib/red_amber/data_frame_variable_operation.rb +561 -85
  27. data/lib/red_amber/group.rb +195 -21
  28. data/lib/red_amber/helper.rb +114 -32
  29. data/lib/red_amber/refinements.rb +206 -0
  30. data/lib/red_amber/subframes.rb +1066 -0
  31. data/lib/red_amber/vector.rb +435 -58
  32. data/lib/red_amber/vector_aggregation.rb +312 -0
  33. data/lib/red_amber/vector_binary_element_wise.rb +387 -0
  34. data/lib/red_amber/vector_selectable.rb +321 -69
  35. data/lib/red_amber/vector_unary_element_wise.rb +436 -0
  36. data/lib/red_amber/vector_updatable.rb +397 -24
  37. data/lib/red_amber/version.rb +2 -1
  38. data/lib/red_amber.rb +15 -1
  39. data/red_amber.gemspec +4 -3
  40. metadata +19 -11
  41. data/doc/image/dataframe/reshaping_DataFrames.png +0 -0
  42. data/lib/red_amber/vector_functions.rb +0 -294
@@ -1,294 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- # Available functions in Arrow are shown by `Arrow::Function.all.map(&:name)`
4
- # reference: https://arrow.apache.org/docs/cpp/compute.html
5
-
6
- # Not implemented in Red Arrow 8.0.0
7
- # divmod, # '%',
8
- # true_unless_null
9
-
10
- module RedAmber
11
- # mix-ins for class Vector
12
- module VectorFunctions
13
- # [Unary aggregations]: vector.func => scalar
14
- unary_aggregations =
15
- %i[all any approximate_median count count_distinct max mean min min_max product stddev sum variance]
16
- unary_aggregations.each do |function|
17
- define_method(function) do |**options|
18
- datum = exec_func_unary(function, options)
19
- get_scalar(datum)
20
- end
21
- end
22
- alias_method :median, :approximate_median
23
- alias_method :count_uniq, :count_distinct
24
- alias_method :all?, :all
25
- alias_method :any?, :any
26
-
27
- def unbiased_variance
28
- variance(ddof: 1)
29
- end
30
- alias_method :var, :unbiased_variance
31
-
32
- def sd
33
- stddev(ddof: 1)
34
- end
35
- alias_method :std, :sd
36
-
37
- # Return quantile
38
- # 0.5 quantile (median) is returned by default.
39
- # Or return quantile for specified probability (prob).
40
- # If quantile lies between two data points, interpolated value is
41
- # returned based on selected interpolation method.
42
- # Nils and NaNs are ignored.
43
- # Nil is returned if there is no valid data point.
44
- #
45
- # @param prob [Float] probability.
46
- # @param interpolation [Symbol] specifies interpolation method to use,
47
- # when the quantile lies between the data i and j.
48
- # - Default value is :linear, which returns i + (j - i) * fraction.
49
- # - :lower returns i.
50
- # - :higher returns j.
51
- # - :nearest returns i or j, whichever is closer.
52
- # - :midpoint returns (i + j) / 2.
53
- # @param skip_nils [Boolean] whether to ignore nil.
54
- # @param min_count [Integer] min count.
55
- # @return [Float] quantile.
56
- def quantile(prob = 0.5, interpolation: :linear, skip_nils: true, min_count: 0)
57
- raise VectorArgumentError, "Invalid: probability #{prob} must be between 0 and 1" unless (0..1).cover? prob
58
-
59
- datum = find(:quantile).execute([data],
60
- q: prob,
61
- interpolation: interpolation,
62
- skip_nulls: skip_nils,
63
- min_count: min_count)
64
- datum.value.to_a.first
65
- end
66
-
67
- # Return quantiles in a DataFrame
68
- #
69
- def quantiles(probs = [1.0, 0.75, 0.5, 0.25, 0.0], interpolation: :linear, skip_nils: true, min_count: 0)
70
- if probs.empty? || !probs.all? { |q| (0..1).cover?(q) }
71
- raise VectorArgumentError, "Invarid probavilities #{probs}"
72
- end
73
-
74
- DataFrame.new(
75
- probs: probs,
76
- quantiles: probs.map do |q|
77
- quantile(q, interpolation: interpolation, skip_nils: skip_nils, min_count: min_count)
78
- end
79
- )
80
- end
81
-
82
- # [Unary element-wise]: vector.func => vector
83
- unary_element_wise =
84
- %i[abs acos asin array_sort_indices atan bit_wise_not ceil cos fill_null_backward \
85
- fill_null_forward floor is_finite is_inf is_nan is_null is_valid ln log10 log1p log2 \
86
- round round_to_multiple sign sin tan trunc unique]
87
- unary_element_wise.each do |function|
88
- define_method(function) do |**options|
89
- datum = exec_func_unary(function, options)
90
- Vector.new(datum.value)
91
- end
92
- end
93
- alias_method :is_nil, :is_null
94
-
95
- def is_na
96
- numeric? ? (is_nil | is_nan) : is_nil
97
- end
98
-
99
- alias_method :fill_nil_backward, :fill_null_backward
100
- alias_method :fill_nil_forward, :fill_null_forward
101
-
102
- alias_method :sort_indexes, :array_sort_indices
103
- alias_method :sort_indices, :array_sort_indices
104
- alias_method :sort_index, :array_sort_indices
105
-
106
- alias_method :uniq, :unique
107
-
108
- # [Unary element-wise with operator]: vector.func => vector, op vector
109
- unary_element_wise_op = {
110
- invert: '!',
111
- negate: '-@',
112
- }
113
- unary_element_wise_op.each do |function, operator|
114
- define_method(function) do |**options|
115
- datum = exec_func_unary(function, options)
116
- Vector.new(datum.value)
117
- end
118
-
119
- define_method(operator) do |**options|
120
- datum = exec_func_unary(function, options)
121
- Vector.new(datum.value)
122
- end
123
- end
124
- alias_method :not, :invert
125
-
126
- # [Binary element-wise]: vector.func(other) => vector
127
- binary_element_wise =
128
- %i[atan2 and_not and_not_kleene bit_wise_and bit_wise_or bit_wise_xor logb]
129
- binary_element_wise.each do |function|
130
- define_method(function) do |other, **options|
131
- datum = exec_func_binary(function, other, options)
132
- Vector.new(datum.value)
133
- end
134
- end
135
-
136
- # [Logical binary element-wise]: vector.func(other) => vector
137
- logical_binary_element_wise = {
138
- '&': :and_kleene,
139
- and_kleene: :and_kleene,
140
- and_org: :and,
141
- '|': :or_kleene,
142
- or_kleene: :or_kleene,
143
- or_org: :or,
144
- }
145
- logical_binary_element_wise.each do |method, function|
146
- define_method(method) do |other, **options|
147
- datum = exec_func_binary(function, other, options)
148
- Vector.new(datum.value)
149
- end
150
- end
151
-
152
- # [Binary element-wise with operator]: vector.func(other) => vector
153
- binary_element_wise_op = {
154
- add: '+',
155
- divide: '/',
156
- multiply: '*',
157
- power: '**',
158
- subtract: '-',
159
-
160
- xor: '^',
161
- shift_left: '<<',
162
- shift_right: '>>',
163
-
164
- equal: '==',
165
- greater: '>',
166
- greater_equal: '>=',
167
- less: '<',
168
- less_equal: '<=',
169
- not_equal: '!=',
170
- }
171
- binary_element_wise_op.each do |function, operator|
172
- define_method(function) do |other, **options|
173
- datum = exec_func_binary(function, other, options)
174
- Vector.new(datum.value)
175
- end
176
-
177
- define_method(operator) do |other, **options|
178
- datum = exec_func_binary(function, other, options)
179
- Vector.new(datum.value)
180
- end
181
- end
182
- alias_method :eq, :equal
183
- alias_method :ge, :greater_equal
184
- alias_method :gt, :greater
185
- alias_method :le, :less_equal
186
- alias_method :lt, :less
187
- alias_method :ne, :not_equal
188
-
189
- def coerce(other)
190
- [Vector.new(Array(other) * size), self]
191
- end
192
-
193
- # < Not implemented yet > ---
194
-
195
- # option(s) required
196
- # - index
197
-
198
- # Returns other than value
199
- # - mode
200
- # - tdigest
201
-
202
- # Functions with numerical range check (unary)
203
- # - abs_checked acos_checked asin_checked cos_checked ln_checked
204
- # log10_checked log1p_checked log2_checked sin_checked tan_checked
205
-
206
- # Functions with numerical range check (binary)
207
- # - add_checked divide_checked logb_checked multiply_checked power_checked subtract_checked
208
- # shift_left_checked shift_right_checked
209
-
210
- # (array functions)
211
- # dictionary_encode,
212
- # partition_nth_indices,
213
- # quarter, quarters_between,
214
-
215
- # (strings)
216
- # ascii_capitalize, ascii_center, ascii_is_alnum, ascii_is_alpha, ascii_is_decimal,
217
- # ascii_is_lower, ascii_is_printable, ascii_is_space, ascii_is_title, ascii_is_upper,
218
- # ascii_lower, ascii_lpad, ascii_ltrim, ascii_ltrim_whitespace, ascii_reverse,
219
- # ascii_rpad, ascii_rtrim, ascii_rtrim_whitespace, ascii_split_whitespace,
220
- # ascii_swapcase, ascii_title, ascii_trim, ascii_trim_whitespace, ascii_upper,
221
- # binary_join, binary_join_element_wise, binary_length, binary_repeat,
222
- # binary_replace_slice, binary_reverse, count_substring, count_substring_regex,
223
- # ends_with, extract_regex, find_substring, find_substring_regex,
224
- # match_like, match_substring, match_substring_regex, replace_substring,
225
- # replace_substring_regex, split_pattern, split_pattern_regex, starts_with,
226
- # string_is_ascii, utf8_capitalize, utf8_center, utf8_is_alnum, utf8_is_alpha,
227
- # utf8_is_decimal, utf8_is_digit, utf8_is_lower, utf8_is_numeric, utf8_is_printable,
228
- # utf8_is_space, utf8_is_title, utf8_is_upper, utf8_length, utf8_lower, utf8_lpad,
229
- # utf8_ltrim, utf8_ltrim_whitespace, utf8_normalize, utf8_replace_slice, utf8_reverse,
230
- # utf8_rpad, utf8_rtrim, utf8_rtrim_whitespace, utf8_slice_codeunits, utf8_split_whitespace,
231
- # utf8_swapcase, utf8_title, utf8_trim, utf8_trim_whitespace, utf8_upper
232
-
233
- # (temporal)
234
- # assume_timezone, ceil_temporal, day, day_of_week, day_of_year, day_time_interval_between,
235
- # days_between, floor_temporal, hour, hours_between, iso_calendar, iso_week, iso_year,
236
- # microsecond, microseconds_between, millisecond, milliseconds_between, minute,
237
- # minutes_between, month, month_day_nano_interval_between, month_interval_between,
238
- # nanosecond, nanoseconds_between, round_temporal, second, seconds_between, strftime,
239
- # strptime, subsecond, us_week, week, weeks_between, year, year_month_day, years_between
240
-
241
- # (conditional)
242
- # case_when, cast,
243
-
244
- # (indices)
245
- # choose, index_in, index_in_meta_binary, indices_nonzero
246
-
247
- # (others)
248
- # coalesce,
249
- # is_in_meta_binary,
250
- # list_element, list_flatten, list_parent_indices, list_value_length, make_struct,
251
- # max_element_wise, min_element_wise, random, select_k_unstable,
252
- # struct_field,
253
-
254
- private # =======
255
-
256
- def exec_func_unary(function, options)
257
- options = nil if options.empty?
258
- find(function).execute([data], options)
259
- end
260
-
261
- def exec_func_binary(function, other, options)
262
- options = nil if options.empty?
263
- case other
264
- when Vector
265
- find(function).execute([data, other.data], options)
266
- when Arrow::Array, Arrow::ChunkedArray, Arrow::Scalar, Array, Numeric, String, TrueClass, FalseClass
267
- find(function).execute([data, other], options)
268
- end
269
- end
270
-
271
- def get_scalar(datum)
272
- output = datum.value
273
- case output
274
- when Arrow::StringScalar then output.to_s
275
- when Arrow::StructScalar
276
- output.value.map { |s| s.is_a?(Arrow::StringScalar) ? s.to_s : s.value }
277
- else
278
- output.value
279
- end
280
- end
281
-
282
- module_function # ======
283
-
284
- def find(function_name)
285
- Arrow::Function.find(function_name)
286
- end
287
-
288
- # temporary API until RedAmber document prepared.
289
- def arrow_doc(function_name)
290
- f = find(function_name)
291
- "#{f}\n#{'-' * function_name.size}\n#{f.doc.description}"
292
- end
293
- end
294
- end