red_amber 0.2.2 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (41) hide show
  1. checksums.yaml +4 -4
  2. data/.rubocop.yml +114 -39
  3. data/CHANGELOG.md +203 -31
  4. data/Gemfile +5 -2
  5. data/README.md +62 -29
  6. data/benchmark/basic.yml +86 -0
  7. data/benchmark/combine.yml +62 -0
  8. data/benchmark/dataframe.yml +62 -0
  9. data/benchmark/drop_nil.yml +15 -3
  10. data/benchmark/group.yml +39 -0
  11. data/benchmark/reshape.yml +31 -0
  12. data/benchmark/{csv_load_penguins.yml → rover/csv_load_penguins.yml} +3 -3
  13. data/benchmark/rover/flights.yml +23 -0
  14. data/benchmark/rover/penguins.yml +23 -0
  15. data/benchmark/rover/planes.yml +23 -0
  16. data/benchmark/rover/weather.yml +23 -0
  17. data/benchmark/vector.yml +60 -0
  18. data/doc/DataFrame.md +335 -53
  19. data/doc/Vector.md +91 -0
  20. data/doc/image/dataframe/join.png +0 -0
  21. data/doc/image/dataframe/set_and_bind.png +0 -0
  22. data/doc/image/dataframe_model.png +0 -0
  23. data/lib/red_amber/data_frame.rb +167 -51
  24. data/lib/red_amber/data_frame_combinable.rb +486 -0
  25. data/lib/red_amber/data_frame_displayable.rb +6 -4
  26. data/lib/red_amber/data_frame_indexable.rb +2 -2
  27. data/lib/red_amber/data_frame_loadsave.rb +4 -1
  28. data/lib/red_amber/data_frame_reshaping.rb +35 -10
  29. data/lib/red_amber/data_frame_selectable.rb +221 -116
  30. data/lib/red_amber/data_frame_variable_operation.rb +146 -82
  31. data/lib/red_amber/group.rb +108 -18
  32. data/lib/red_amber/helper.rb +53 -43
  33. data/lib/red_amber/refinements.rb +199 -0
  34. data/lib/red_amber/vector.rb +56 -46
  35. data/lib/red_amber/vector_functions.rb +23 -83
  36. data/lib/red_amber/vector_selectable.rb +116 -69
  37. data/lib/red_amber/vector_updatable.rb +189 -65
  38. data/lib/red_amber/version.rb +1 -1
  39. data/lib/red_amber.rb +3 -0
  40. data/red_amber.gemspec +4 -3
  41. metadata +24 -10
@@ -0,0 +1,199 @@
1
+ # frozen_string_literal: true
2
+
3
+ module RedAmber
4
# Refinement that lets a Hash convert itself into an Arrow::Table.
# It only takes effect in scopes that call `using RefineHash`.
module RefineHash
  refine Hash do
    # Build an Arrow::Table by handing this Hash to Arrow::Table.new.
    #
    # @return [Arrow::Table] table constructed from self.
    def to_arrow
      Arrow::Table.new(self)
    end
  end
end
13
+
14
# Refinements for Array-like classes.
#
# Array, Range, Arrow::Array and Arrow::ChunkedArray all gain
# `to_arrow_array`; the two Arrow classes additionally gain the same set of
# data-type predicates and `primitive_invert`.
module RefineArrayLike
  refine Array do
    # @return [Arrow::Array] Arrow::Array built from self.
    def to_arrow_array
      Arrow::Array.new(self)
    end
  end

  refine Range do
    # @return [Arrow::Array] Arrow::Array built from the expanded range.
    def to_arrow_array
      Arrow::Array.new(to_a)
    end
  end

  # The methods common to Arrow::Array and Arrow::ChunkedArray are written
  # out twice on purpose: sharing them through a module would need
  # Refinement#include, which is deprecated and will be removed in Ruby 3.2.
  refine Arrow::Array do
    # Already an Arrow array, so no conversion is needed.
    #
    # @return [Arrow::Array] self.
    def to_arrow_array
      self
    end

    # @return [Class] class of this array's value data type.
    def type_class
      value_data_type.class
    end

    # @return [Boolean] true when the value data type is exactly
    #   Arrow::BooleanDataType.
    def boolean?
      value_data_type.instance_of?(Arrow::BooleanDataType)
    end

    # Truthy only when the data type class is a strict subclass of
    # Arrow::NumericDataType (Module#< may also return false or nil).
    def numeric?
      value_data_type.class < Arrow::NumericDataType
    end

    # Truthy only when the data type class is a strict subclass of
    # Arrow::FloatingPointDataType.
    def float?
      value_data_type.class < Arrow::FloatingPointDataType
    end

    # Truthy only when the data type class is a strict subclass of
    # Arrow::IntegerDataType.
    def integer?
      value_data_type.class < Arrow::IntegerDataType
    end

    # @return [Boolean] true when the value data type is exactly one of the
    #   UInt8/UInt16/UInt32/UInt64 data types.
    def unsigned_integer?
      [
        Arrow::UInt8DataType,
        Arrow::UInt16DataType,
        Arrow::UInt32DataType,
        Arrow::UInt64DataType,
      ].any? { |unsigned_type| value_data_type.instance_of?(unsigned_type) }
    end

    # @return [Boolean] true when the value data type is exactly
    #   Arrow::StringDataType.
    def string?
      value_data_type.instance_of?(Arrow::StringDataType)
    end

    # @return [Boolean] true when the value data type is exactly
    #   Arrow::DictionaryDataType.
    def dictionary?
      value_data_type.instance_of?(Arrow::DictionaryDataType)
    end

    # Truthy only when the data type class is a strict subclass of
    # Arrow::TemporalDataType.
    def temporal?
      value_data_type.class < Arrow::TemporalDataType
    end

    # @return [Boolean] true when self is an Arrow::ListArray.
    def list?
      is_a?(Arrow::ListArray)
    end

    # Invert the array after neutralising nulls.
    #
    # Runs the `is_null` compute function, feeds that mask to `if_else`
    # together with `false` and self, then applies `invert` to the result.
    # NOTE(review): presumably this makes null slots come out as true --
    # confirm against the Arrow `if_else` argument order.
    #
    # @return [Object] `value` of the datum returned by `invert`.
    def primitive_invert
      null_mask = Arrow::Function.find(:is_null).execute([self])
      without_nulls =
        Arrow::Function.find(:if_else).execute([null_mask, false, self])
      Arrow::Function.find(:invert).execute([without_nulls]).value
    end
  end

  refine Arrow::ChunkedArray do
    # Already an Arrow array-like, so no conversion is needed.
    #
    # @return [Arrow::ChunkedArray] self.
    def to_arrow_array
      self
    end

    # @return [Class] class of this chunked array's value data type.
    def type_class
      value_data_type.class
    end

    # @return [Boolean] true when the value data type is exactly
    #   Arrow::BooleanDataType.
    def boolean?
      value_data_type.instance_of?(Arrow::BooleanDataType)
    end

    # Truthy only when the data type class is a strict subclass of
    # Arrow::NumericDataType (Module#< may also return false or nil).
    def numeric?
      value_data_type.class < Arrow::NumericDataType
    end

    # Truthy only when the data type class is a strict subclass of
    # Arrow::FloatingPointDataType.
    def float?
      value_data_type.class < Arrow::FloatingPointDataType
    end

    # Truthy only when the data type class is a strict subclass of
    # Arrow::IntegerDataType.
    def integer?
      value_data_type.class < Arrow::IntegerDataType
    end

    # @return [Boolean] true when the value data type is exactly one of the
    #   UInt8/UInt16/UInt32/UInt64 data types.
    def unsigned_integer?
      [
        Arrow::UInt8DataType,
        Arrow::UInt16DataType,
        Arrow::UInt32DataType,
        Arrow::UInt64DataType,
      ].any? { |unsigned_type| value_data_type.instance_of?(unsigned_type) }
    end

    # @return [Boolean] true when the value data type is exactly
    #   Arrow::StringDataType.
    def string?
      value_data_type.instance_of?(Arrow::StringDataType)
    end

    # @return [Boolean] true when the value data type is exactly
    #   Arrow::DictionaryDataType.
    def dictionary?
      value_data_type.instance_of?(Arrow::DictionaryDataType)
    end

    # Truthy only when the data type class is a strict subclass of
    # Arrow::TemporalDataType.
    def temporal?
      value_data_type.class < Arrow::TemporalDataType
    end

    # Unlike the Arrow::Array version, this checks the nick of the value
    # type rather than the class of self.
    #
    # @return [Boolean] true when `value_type.nick` equals 'list'.
    def list?
      value_type.nick == 'list'
    end

    # Invert the chunked array after neutralising nulls.
    #
    # Runs the `is_null` compute function, feeds that mask to `if_else`
    # together with `false` and self, then applies `invert` to the result.
    # NOTE(review): presumably this makes null slots come out as true --
    # confirm against the Arrow `if_else` argument order.
    #
    # @return [Object] `value` of the datum returned by `invert`.
    def primitive_invert
      null_mask = Arrow::Function.find(:is_null).execute([self])
      without_nulls =
        Arrow::Function.find(:if_else).execute([null_mask, false, self])
      Arrow::Function.find(:invert).execute([without_nulls]).value
    end
  end
end
140
+
141
# Refinement giving Arrow::Table two Hash-like helpers for column names.
module RefineArrowTable
  refine Arrow::Table do
    # @return [Array] the `name` of every entry in `columns`, in order.
    def keys
      columns.map(&:name)
    end

    # Test whether a column with the given name exists.
    #
    # @param key [Object] candidate name; each column name is compared to it
    #   with `==`, so the argument must be the same kind of object that
    #   `name` returns.
    # @return [Boolean] true when any column name equals `key`.
    def key?(key)
      columns.any? { |column| column.name == key }
    end
  end
end
153
+
154
# Refinement adding element-type predicates and mask/index based selection
# helpers to Array. It only takes effect in scopes that call
# `using RefineArray`.
module RefineArray
  refine Array do
    # @return [Boolean] true when every element is an Integer
    #   (also true for an empty Array).
    def integers?
      all? { |e| e.is_a?(Integer) } # rubocop:disable Performance/RedundantEqualityComparisonBlock
    end

    # nil counts as a boolean here, so arrays with missing values pass.
    #
    # @return [Boolean] true when every element is true, false or nil.
    def booleans?
      all? { |e| e.is_a?(TrueClass) || e.is_a?(FalseClass) || e.is_a?(NilClass) }
    end

    # @return [Boolean] true when every element is a Symbol.
    def symbols?
      all? { |e| e.is_a?(Symbol) } # rubocop:disable Performance/RedundantEqualityComparisonBlock
    end

    # @return [Boolean] true when every element is a String.
    def strings?
      all? { |e| e.is_a?(String) } # rubocop:disable Performance/RedundantEqualityComparisonBlock
    end

    # @return [Boolean] true when every element is a Symbol or a String.
    def symbols_or_strings?
      all? { |e| e.is_a?(Symbol) || e.is_a?(String) }
    end

    # Convert booleans to indices.
    #
    # @return [Array<Integer>] positions whose element is truthy, ascending.
    def booleans_to_indices
      each_index.select { |i| self[i] }
    end

    # Select elements by booleans.
    #
    # @param booleans [#[]] mask read by position; a truthy entry keeps the
    #   element at the same position, a missing entry counts as falsy.
    # @return [Array] elements of self whose mask entry is truthy.
    def select_by_booleans(booleans)
      select.with_index { |_, i| booleans[i] }
    end

    # Reject elements by booleans.
    #
    # @param booleans [#[]] mask read by position; a truthy entry drops the
    #   element at the same position.
    # @return [Array] elements of self whose mask entry is falsy.
    def reject_by_booleans(booleans)
      reject.with_index { |_, i| booleans[i] }
    end

    # Reject elements by indices.
    # Notice: the order of indices is not considered; the result keeps the
    # order of self.
    #
    # @param indices [#include?] positions to drop. Negative positions count
    #   from the end, which is why `i - size` is checked as well.
    # @return [Array] elements of self whose position is not in indices.
    def reject_by_indices(indices)
      reject.with_index { |_, i| indices.include?(i) || indices.include?(i - size) }
    end
  end
end
199
+ end
@@ -10,33 +10,38 @@ module RedAmber
10
10
  include VectorSelectable
11
11
  include Helper
12
12
 
13
+ using RefineArrayLike
14
+
15
+ # Quicker constructor of Vector.
16
+ #
17
+ def self.create(arrow_array)
18
+ instance = allocate
19
+ instance.instance_variable_set(:@data, arrow_array)
20
+ instance
21
+ end
22
+
23
+ # Create a Vector.
24
+ #
25
+ # @note default is headless Vector and '@key == nil'
13
26
  def initialize(*array)
14
- @key = nil # default is 'headless' Vector
15
- if array.empty? || array.first.nil?
16
- Vector.new([])
17
- else
18
- array.flatten!
19
- @data =
20
- case array
21
- in [Vector => v]
22
- v.data
23
- in [Arrow::Array => a]
24
- a
25
- in [Arrow::ChunkedArray => ca]
26
- ca
27
- in [Range => r]
28
- Arrow::Array.new(Array(r))
29
- else
30
- begin
31
- Arrow::Array.new(Array(array))
32
- rescue Error
33
- raise VectorArgumentError, "Invalid argument: #{array}"
34
- end
35
- end
36
- end
27
+ @data =
28
+ case array
29
+ in [Vector => v]
30
+ v.data
31
+ in [Range => r]
32
+ Arrow::Array.new(Array(r))
33
+ in [Arrow::Array | Arrow::ChunkedArray]
34
+ array[0]
35
+ in [arrow_array_like] if arrow_array_like.respond_to?(:to_arrow_array)
36
+ arrow_array_like.to_arrow_array
37
+ else
38
+ Arrow::Array.new(array.flatten)
39
+ end
37
40
  end
38
41
 
39
42
  attr_reader :data
43
+ alias_method :to_arrow_array, :data
44
+
40
45
  attr_accessor :key
41
46
 
42
47
  def to_s
@@ -45,45 +50,46 @@ module RedAmber
45
50
 
46
51
  def inspect(limit: 80)
47
52
  if ENV.fetch('RED_AMBER_OUTPUT_MODE', 'Table').casecmp('MINIMUM').zero?
48
- # Better performance than `.upcase == 'MINIMUM'``
53
+ # Better performance than `.upcase == 'MINIMUM'`
49
54
  "#{self.class}(:#{type}, size=#{size})"
50
55
  else
51
56
  sio = StringIO.new << '['
52
- to_a.each_with_object(sio).with_index do |(e, s), i|
53
- next_str = "#{s.size > 1 ? ', ' : ''}#{e.inspect}"
54
- if (s.size + next_str.size) < limit
55
- s << next_str
57
+ each.with_index do |e, i|
58
+ next_str = "#{sio.size > 1 ? ', ' : ''}#{e.inspect}"
59
+ if (sio.size + next_str.size) < limit
60
+ sio << next_str
56
61
  else
57
- s << ', ... ' if i < size
62
+ sio << ', ... ' if i < size
58
63
  break
59
64
  end
60
65
  end
61
66
  sio << ']'
62
67
 
63
- format "#<#{self.class}(:#{type}, size=#{size}):0x%016x>\n%s\n", object_id, sio.string
68
+ format "#<#{self.class}(:#{type}, size=#{size}):0x%016x>\n%s\n",
69
+ object_id, sio.string
64
70
  end
65
71
  end
66
72
 
67
- def values
73
+ def to_ary
68
74
  @data.values
69
75
  end
70
- alias_method :to_a, :values
71
- alias_method :entries, :values
76
+
77
+ alias_method :to_a, :to_ary
78
+ alias_method :values, :to_ary
79
+ alias_method :entries, :to_ary
72
80
 
73
81
  def indices
74
82
  (0...size).to_a
75
83
  end
84
+
76
85
  alias_method :indexes, :indices
77
86
  alias_method :indeces, :indices
78
87
 
79
- def to_ary
80
- values
81
- end
82
-
83
88
  def size
84
89
  # only defined :length in Arrow?
85
90
  @data.length
86
91
  end
92
+
87
93
  alias_method :length, :size
88
94
  alias_method :n_rows, :size
89
95
  alias_method :nrow, :size
@@ -93,39 +99,43 @@ module RedAmber
93
99
  end
94
100
 
95
101
  def type
96
- @data.value_type.nick.to_sym
102
+ list? ? :list : @data.value_type.nick.to_sym
97
103
  end
98
104
 
99
105
  def boolean?
100
- type_class == Arrow::BooleanDataType
106
+ @data.boolean?
101
107
  end
102
108
 
103
109
  def numeric?
104
- type_class < Arrow::NumericDataType
110
+ @data.numeric?
105
111
  end
106
112
 
107
113
  def float?
108
- type_class < Arrow::FloatingPointDataType
114
+ @data.float?
109
115
  end
110
116
 
111
117
  def integer?
112
- type_class < Arrow::IntegerDataType
118
+ @data.integer?
113
119
  end
114
120
 
115
121
  def string?
116
- type_class == Arrow::StringDataType
122
+ @data.string?
117
123
  end
118
124
 
119
125
  def dictionary?
120
- type_class == Arrow::DictionaryDataType
126
+ @data.dictionary?
121
127
  end
122
128
 
123
129
  def temporal?
124
- type_class < Arrow::TemporalDataType
130
+ @data.temporal?
131
+ end
132
+
133
+ def list?
134
+ @data.list?
125
135
  end
126
136
 
127
137
  def type_class
128
- @data.value_data_type.class
138
+ @data.type_class
129
139
  end
130
140
 
131
141
  def each
@@ -12,7 +12,8 @@ module RedAmber
12
12
  module VectorFunctions
13
13
  # [Unary aggregations]: vector.func => scalar
14
14
  unary_aggregations =
15
- %i[all any approximate_median count count_distinct max mean min min_max product stddev sum variance]
15
+ %i[all any approximate_median count count_distinct max mean min min_max
16
+ product stddev sum variance]
16
17
  unary_aggregations.each do |function|
17
18
  define_method(function) do |**options|
18
19
  datum = exec_func_unary(function, options)
@@ -54,7 +55,10 @@ module RedAmber
54
55
  # @param min_count [Integer] min count.
55
56
  # @return [Float] quantile.
56
57
  def quantile(prob = 0.5, interpolation: :linear, skip_nils: true, min_count: 0)
57
- raise VectorArgumentError, "Invalid: probability #{prob} must be between 0 and 1" unless (0..1).cover? prob
58
+ unless (0..1).cover? prob
59
+ raise VectorArgumentError,
60
+ "Invalid: probability #{prob} must be between 0 and 1"
61
+ end
58
62
 
59
63
  datum = find(:quantile).execute([data],
60
64
  q: prob,
@@ -66,7 +70,8 @@ module RedAmber
66
70
 
67
71
  # Return quantiles in a DataFrame
68
72
  #
69
- def quantiles(probs = [1.0, 0.75, 0.5, 0.25, 0.0], interpolation: :linear, skip_nils: true, min_count: 0)
73
+ def quantiles(probs = [1.0, 0.75, 0.5, 0.25, 0.0],
74
+ interpolation: :linear, skip_nils: true, min_count: 0)
70
75
  if probs.empty? || !probs.all? { |q| (0..1).cover?(q) }
71
76
  raise VectorArgumentError, "Invarid probavilities #{probs}"
72
77
  end
@@ -74,20 +79,23 @@ module RedAmber
74
79
  DataFrame.new(
75
80
  probs: probs,
76
81
  quantiles: probs.map do |q|
77
- quantile(q, interpolation: interpolation, skip_nils: skip_nils, min_count: min_count)
82
+ quantile(q,
83
+ interpolation: interpolation, skip_nils: skip_nils,
84
+ min_count: min_count)
78
85
  end
79
86
  )
80
87
  end
81
88
 
82
89
  # [Unary element-wise]: vector.func => vector
83
90
  unary_element_wise =
84
- %i[abs acos asin array_sort_indices atan bit_wise_not ceil cos fill_null_backward \
85
- fill_null_forward floor is_finite is_inf is_nan is_null is_valid ln log10 log1p log2 \
91
+ %i[abs acos asin array_sort_indices atan bit_wise_not ceil cos
92
+ fill_null_backward fill_null_forward floor
93
+ is_finite is_inf is_nan is_null is_valid ln log10 log1p log2
86
94
  round round_to_multiple sign sin tan trunc unique]
87
95
  unary_element_wise.each do |function|
88
96
  define_method(function) do |**options|
89
97
  datum = exec_func_unary(function, options)
90
- Vector.new(datum.value)
98
+ Vector.create(datum.value)
91
99
  end
92
100
  end
93
101
  alias_method :is_nil, :is_null
@@ -113,12 +121,12 @@ module RedAmber
113
121
  unary_element_wise_op.each do |function, operator|
114
122
  define_method(function) do |**options|
115
123
  datum = exec_func_unary(function, options)
116
- Vector.new(datum.value)
124
+ Vector.create(datum.value)
117
125
  end
118
126
 
119
127
  define_method(operator) do |**options|
120
128
  datum = exec_func_unary(function, options)
121
- Vector.new(datum.value)
129
+ Vector.create(datum.value)
122
130
  end
123
131
  end
124
132
  alias_method :not, :invert
@@ -129,7 +137,7 @@ module RedAmber
129
137
  binary_element_wise.each do |function|
130
138
  define_method(function) do |other, **options|
131
139
  datum = exec_func_binary(function, other, options)
132
- Vector.new(datum.value)
140
+ Vector.create(datum.value)
133
141
  end
134
142
  end
135
143
 
@@ -145,7 +153,7 @@ module RedAmber
145
153
  logical_binary_element_wise.each do |method, function|
146
154
  define_method(method) do |other, **options|
147
155
  datum = exec_func_binary(function, other, options)
148
- Vector.new(datum.value)
156
+ Vector.create(datum.value)
149
157
  end
150
158
  end
151
159
 
@@ -171,12 +179,12 @@ module RedAmber
171
179
  binary_element_wise_op.each do |function, operator|
172
180
  define_method(function) do |other, **options|
173
181
  datum = exec_func_binary(function, other, options)
174
- Vector.new(datum.value)
182
+ Vector.create(datum.value)
175
183
  end
176
184
 
177
185
  define_method(operator) do |other, **options|
178
186
  datum = exec_func_binary(function, other, options)
179
- Vector.new(datum.value)
187
+ Vector.create(datum.value)
180
188
  end
181
189
  end
182
190
  alias_method :eq, :equal
@@ -187,76 +195,9 @@ module RedAmber
187
195
  alias_method :ne, :not_equal
188
196
 
189
197
  def coerce(other)
190
- case other
191
- when Vector, Array, Arrow::Array
192
- raise VectorArgumentError, "Size unmatch: #{size} != #{other.length}" unless size == other.length
193
-
194
- [Vector.new(Array(other)), self]
195
- end
196
198
  [Vector.new(Array(other) * size), self]
197
199
  end
198
200
 
199
- # < Not implimented yet > ---
200
-
201
- # option(s) required
202
- # - index
203
-
204
- # Returns other than value
205
- # - mode
206
- # - tdigest
207
-
208
- # Functions with numerical range check (unary)
209
- # - abs_checked acos_checked asin_checked cos_checked ln_checked
210
- # log10_checked log1p_checked log2_checked sin_checked tan_checked
211
-
212
- # Functions with numerical range check (binary)
213
- # - add_checked divide_checked logb_checked multiply_checked power_checked subtract_checked
214
- # shift_left_checked shift_right_checked
215
-
216
- # (array functions)
217
- # dictionary_encode,
218
- # partition_nth_indices,
219
- # quarter, quarters_between,
220
-
221
- # (strings)
222
- # ascii_capitalize, ascii_center, ascii_is_alnum, ascii_is_alpha, ascii_is_decimal,
223
- # ascii_is_lower, ascii_is_printable, ascii_is_space, ascii_is_title, ascii_is_upper,
224
- # ascii_lower, ascii_lpad, ascii_ltrim, ascii_ltrim_whitespace, ascii_reverse,
225
- # ascii_rpad, ascii_rtrim, ascii_rtrim_whitespace, ascii_split_whitespace,
226
- # ascii_swapcase, ascii_title, ascii_trim, ascii_trim_whitespace, ascii_upper,
227
- # binary_join, binary_join_element_wise, binary_length, binary_repeat,
228
- # binary_replace_slice, binary_reverse, count_substring, count_substring_regex,
229
- # ends_with, extract_regex, find_substring, find_substring_regex,
230
- # match_like, match_substring, match_substring_regex, replace_substring,
231
- # replace_substring_regex, split_pattern, split_pattern_regex, starts_with,
232
- # string_is_ascii, utf8_capitalize, utf8_center, utf8_is_alnum, utf8_is_alpha,
233
- # utf8_is_decimal, utf8_is_digit, utf8_is_lower, utf8_is_numeric, utf8_is_printable,
234
- # utf8_is_space, utf8_is_title, utf8_is_upper, utf8_length, utf8_lower, utf8_lpad,
235
- # utf8_ltrim, utf8_ltrim_whitespace, utf8_normalize, utf8_replace_slice, utf8_reverse,
236
- # utf8_rpad, utf8_rtrim, utf8_rtrim_whitespace, utf8_slice_codeunits, utf8_split_whitespace,
237
- # utf8_swapcase, utf8_title, utf8_trim, utf8_trim_whitespace, utf8_upper
238
-
239
- # (temporal)
240
- # assume_timezone, ceil_temporal, day, day_of_week, day_of_year, day_time_interval_between,
241
- # days_between, floor_temporal, hour, hours_between, iso_calendar, iso_week, iso_year,
242
- # microsecond, microseconds_between, millisecond, milliseconds_between, minute,
243
- # minutes_between, month, month_day_nano_interval_between, month_interval_between,
244
- # nanosecond, nanoseconds_between, round_temporal, second, seconds_between, strftime,
245
- # strptime, subsecond, us_week, week, weeks_between, year, year_month_day, years_between
246
-
247
- # (onditional)
248
- # case_when, cast,
249
-
250
- # (indices)
251
- # choose, index_in, index_in_meta_binary, indices_nonzero
252
-
253
- # (others)
254
- # coalesce,
255
- # is_in_meta_binary,
256
- # list_element, list_flatten, list_parent_indices, list_value_length, make_struct,
257
- # max_element_wise, min_element_wise, random, select_k_unstable,
258
- # struct_field,
259
-
260
201
  private # =======
261
202
 
262
203
  def exec_func_unary(function, options)
@@ -269,10 +210,9 @@ module RedAmber
269
210
  case other
270
211
  when Vector
271
212
  find(function).execute([data, other.data], options)
272
- when Arrow::Array, Arrow::ChunkedArray, Arrow::Scalar, Array, Numeric, String, TrueClass, FalseClass
213
+ when Arrow::Array, Arrow::ChunkedArray, Arrow::Scalar,
214
+ Array, Numeric, String, TrueClass, FalseClass
273
215
  find(function).execute([data, other], options)
274
- else
275
- raise VectorArgumentError, "Operand is not supported: #{other.class}"
276
216
  end
277
217
  end
278
218