red_amber 0.2.2 → 0.3.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (41) hide show
  1. checksums.yaml +4 -4
  2. data/.rubocop.yml +114 -39
  3. data/CHANGELOG.md +203 -31
  4. data/Gemfile +5 -2
  5. data/README.md +62 -29
  6. data/benchmark/basic.yml +86 -0
  7. data/benchmark/combine.yml +62 -0
  8. data/benchmark/dataframe.yml +62 -0
  9. data/benchmark/drop_nil.yml +15 -3
  10. data/benchmark/group.yml +39 -0
  11. data/benchmark/reshape.yml +31 -0
  12. data/benchmark/{csv_load_penguins.yml → rover/csv_load_penguins.yml} +3 -3
  13. data/benchmark/rover/flights.yml +23 -0
  14. data/benchmark/rover/penguins.yml +23 -0
  15. data/benchmark/rover/planes.yml +23 -0
  16. data/benchmark/rover/weather.yml +23 -0
  17. data/benchmark/vector.yml +60 -0
  18. data/doc/DataFrame.md +335 -53
  19. data/doc/Vector.md +91 -0
  20. data/doc/image/dataframe/join.png +0 -0
  21. data/doc/image/dataframe/set_and_bind.png +0 -0
  22. data/doc/image/dataframe_model.png +0 -0
  23. data/lib/red_amber/data_frame.rb +167 -51
  24. data/lib/red_amber/data_frame_combinable.rb +486 -0
  25. data/lib/red_amber/data_frame_displayable.rb +6 -4
  26. data/lib/red_amber/data_frame_indexable.rb +2 -2
  27. data/lib/red_amber/data_frame_loadsave.rb +4 -1
  28. data/lib/red_amber/data_frame_reshaping.rb +35 -10
  29. data/lib/red_amber/data_frame_selectable.rb +221 -116
  30. data/lib/red_amber/data_frame_variable_operation.rb +146 -82
  31. data/lib/red_amber/group.rb +108 -18
  32. data/lib/red_amber/helper.rb +53 -43
  33. data/lib/red_amber/refinements.rb +199 -0
  34. data/lib/red_amber/vector.rb +56 -46
  35. data/lib/red_amber/vector_functions.rb +23 -83
  36. data/lib/red_amber/vector_selectable.rb +116 -69
  37. data/lib/red_amber/vector_updatable.rb +189 -65
  38. data/lib/red_amber/version.rb +1 -1
  39. data/lib/red_amber.rb +3 -0
  40. data/red_amber.gemspec +4 -3
  41. metadata +24 -10
@@ -0,0 +1,199 @@
1
+ # frozen_string_literal: true
2
+
3
+ module RedAmber
4
+ # Add additional capabilities to Hash
5
+ module RefineHash
6
+ refine Hash do
7
+ # Convert self to an Arrow::Table
8
+ def to_arrow
9
+ Arrow::Table.new(self)
10
+ end
11
+ end
12
+ end
13
+
14
+ # Add additional capabilities to Array-like classes
15
+ module RefineArrayLike
16
+ refine Array do
17
+ def to_arrow_array
18
+ Arrow::Array.new(self)
19
+ end
20
+ end
21
+
22
+ refine Range do
23
+ def to_arrow_array
24
+ Arrow::Array.new(Array(self))
25
+ end
26
+ end
27
+
28
+ # common methods for Arrow::Array and Arrow::ChunkedArray
29
+ # Refinement#include is deprecated and will be removed in Ruby 3.2
30
+ refine Arrow::Array do
31
+ def to_arrow_array
32
+ self
33
+ end
34
+
35
+ def type_class
36
+ value_data_type.class
37
+ end
38
+
39
+ def boolean?
40
+ value_data_type.instance_of?(Arrow::BooleanDataType)
41
+ end
42
+
43
+ def numeric?
44
+ value_data_type.class < Arrow::NumericDataType
45
+ end
46
+
47
+ def float?
48
+ value_data_type.class < Arrow::FloatingPointDataType
49
+ end
50
+
51
+ def integer?
52
+ value_data_type.class < Arrow::IntegerDataType
53
+ end
54
+
55
+ def list?
56
+ is_a? Arrow::ListArray
57
+ end
58
+
59
+ def unsigned_integer?
60
+ value_data_type.instance_of?(Arrow::UInt8DataType) ||
61
+ value_data_type.instance_of?(Arrow::UInt16DataType) ||
62
+ value_data_type.instance_of?(Arrow::UInt32DataType) ||
63
+ value_data_type.instance_of?(Arrow::UInt64DataType)
64
+ end
65
+
66
+ def string?
67
+ value_data_type.instance_of?(Arrow::StringDataType)
68
+ end
69
+
70
+ def dictionary?
71
+ value_data_type.instance_of?(Arrow::DictionaryDataType)
72
+ end
73
+
74
+ def temporal?
75
+ value_data_type.class < Arrow::TemporalDataType
76
+ end
77
+
78
+ def primitive_invert
79
+ n = Arrow::Function.find(:is_null).execute([self])
80
+ i = Arrow::Function.find(:if_else).execute([n, false, self])
81
+ Arrow::Function.find(:invert).execute([i]).value
82
+ end
83
+ end
84
+
85
+ refine Arrow::ChunkedArray do
86
+ def to_arrow_array
87
+ self
88
+ end
89
+
90
+ def type_class
91
+ value_data_type.class
92
+ end
93
+
94
+ def boolean?
95
+ value_data_type.instance_of?(Arrow::BooleanDataType)
96
+ end
97
+
98
+ def numeric?
99
+ value_data_type.class < Arrow::NumericDataType
100
+ end
101
+
102
+ def float?
103
+ value_data_type.class < Arrow::FloatingPointDataType
104
+ end
105
+
106
+ def integer?
107
+ value_data_type.class < Arrow::IntegerDataType
108
+ end
109
+
110
+ def unsigned_integer?
111
+ value_data_type.instance_of?(Arrow::UInt8DataType) ||
112
+ value_data_type.instance_of?(Arrow::UInt16DataType) ||
113
+ value_data_type.instance_of?(Arrow::UInt32DataType) ||
114
+ value_data_type.instance_of?(Arrow::UInt64DataType)
115
+ end
116
+
117
+ def string?
118
+ value_data_type.instance_of?(Arrow::StringDataType)
119
+ end
120
+
121
+ def dictionary?
122
+ value_data_type.instance_of?(Arrow::DictionaryDataType)
123
+ end
124
+
125
+ def temporal?
126
+ value_data_type.class < Arrow::TemporalDataType
127
+ end
128
+
129
+ def list?
130
+ value_type.nick == 'list'
131
+ end
132
+
133
+ def primitive_invert
134
+ n = Arrow::Function.find(:is_null).execute([self])
135
+ i = Arrow::Function.find(:if_else).execute([n, false, self])
136
+ Arrow::Function.find(:invert).execute([i]).value
137
+ end
138
+ end
139
+ end
140
+
141
+ # Add additional capabilities to Arrow::Table
142
+ module RefineArrowTable
143
+ refine Arrow::Table do
144
+ def keys
145
+ columns.map(&:name)
146
+ end
147
+
148
+ def key?(key)
149
+ keys.include?(key)
150
+ end
151
+ end
152
+ end
153
+
154
+ # Add additional capabilities to Array
155
+ module RefineArray
156
+ refine Array do
157
+ def integers?
158
+ all? { |e| e.is_a?(Integer) } # rubocop:disable Performance/RedundantEqualityComparisonBlock
159
+ end
160
+
161
+ def booleans?
162
+ all? { |e| e.is_a?(TrueClass) || e.is_a?(FalseClass) || e.is_a?(NilClass) }
163
+ end
164
+
165
+ def symbols?
166
+ all? { |e| e.is_a?(Symbol) } # rubocop:disable Performance/RedundantEqualityComparisonBlock
167
+ end
168
+
169
+ def strings?
170
+ all? { |e| e.is_a?(String) } # rubocop:disable Performance/RedundantEqualityComparisonBlock
171
+ end
172
+
173
+ def symbols_or_strings?
174
+ all? { |e| e.is_a?(Symbol) || e.is_a?(String) }
175
+ end
176
+
177
+ # convert booleans to indices
178
+ def booleans_to_indices
179
+ (0...size).select.with_index { |_, i| self[i] }
180
+ end
181
+
182
+ # select elements by booleans
183
+ def select_by_booleans(booleans)
184
+ select.with_index { |_, i| booleans[i] }
185
+ end
186
+
187
+ # reject elements by booleans
188
+ def reject_by_booleans(booleans)
189
+ reject.with_index { |_, i| booleans[i] }
190
+ end
191
+
192
+ # reject elements by indices
193
+ # notice: order by indices is not considered.
194
+ def reject_by_indices(indices)
195
+ reject.with_index { |_, i| indices.include?(i) || indices.include?(i - size) }
196
+ end
197
+ end
198
+ end
199
+ end
@@ -10,33 +10,38 @@ module RedAmber
10
10
  include VectorSelectable
11
11
  include Helper
12
12
 
13
+ using RefineArrayLike
14
+
15
+ # Quicker constructor of Vector.
16
+ #
17
+ def self.create(arrow_array)
18
+ instance = allocate
19
+ instance.instance_variable_set(:@data, arrow_array)
20
+ instance
21
+ end
22
+
23
+ # Create a Vector.
24
+ #
25
+ # @note default is headless Vector and '@key == nil'
13
26
  def initialize(*array)
14
- @key = nil # default is 'headless' Vector
15
- if array.empty? || array.first.nil?
16
- Vector.new([])
17
- else
18
- array.flatten!
19
- @data =
20
- case array
21
- in [Vector => v]
22
- v.data
23
- in [Arrow::Array => a]
24
- a
25
- in [Arrow::ChunkedArray => ca]
26
- ca
27
- in [Range => r]
28
- Arrow::Array.new(Array(r))
29
- else
30
- begin
31
- Arrow::Array.new(Array(array))
32
- rescue Error
33
- raise VectorArgumentError, "Invalid argument: #{array}"
34
- end
35
- end
36
- end
27
+ @data =
28
+ case array
29
+ in [Vector => v]
30
+ v.data
31
+ in [Range => r]
32
+ Arrow::Array.new(Array(r))
33
+ in [Arrow::Array | Arrow::ChunkedArray]
34
+ array[0]
35
+ in [arrow_array_like] if arrow_array_like.respond_to?(:to_arrow_array)
36
+ arrow_array_like.to_arrow_array
37
+ else
38
+ Arrow::Array.new(array.flatten)
39
+ end
37
40
  end
38
41
 
39
42
  attr_reader :data
43
+ alias_method :to_arrow_array, :data
44
+
40
45
  attr_accessor :key
41
46
 
42
47
  def to_s
@@ -45,45 +50,46 @@ module RedAmber
45
50
 
46
51
  def inspect(limit: 80)
47
52
  if ENV.fetch('RED_AMBER_OUTPUT_MODE', 'Table').casecmp('MINIMUM').zero?
48
- # Better performance than `.upcase == 'MINIMUM'``
53
+ # Better performance than `.upcase == 'MINIMUM'`
49
54
  "#{self.class}(:#{type}, size=#{size})"
50
55
  else
51
56
  sio = StringIO.new << '['
52
- to_a.each_with_object(sio).with_index do |(e, s), i|
53
- next_str = "#{s.size > 1 ? ', ' : ''}#{e.inspect}"
54
- if (s.size + next_str.size) < limit
55
- s << next_str
57
+ each.with_index do |e, i|
58
+ next_str = "#{sio.size > 1 ? ', ' : ''}#{e.inspect}"
59
+ if (sio.size + next_str.size) < limit
60
+ sio << next_str
56
61
  else
57
- s << ', ... ' if i < size
62
+ sio << ', ... ' if i < size
58
63
  break
59
64
  end
60
65
  end
61
66
  sio << ']'
62
67
 
63
- format "#<#{self.class}(:#{type}, size=#{size}):0x%016x>\n%s\n", object_id, sio.string
68
+ format "#<#{self.class}(:#{type}, size=#{size}):0x%016x>\n%s\n",
69
+ object_id, sio.string
64
70
  end
65
71
  end
66
72
 
67
- def values
73
+ def to_ary
68
74
  @data.values
69
75
  end
70
- alias_method :to_a, :values
71
- alias_method :entries, :values
76
+
77
+ alias_method :to_a, :to_ary
78
+ alias_method :values, :to_ary
79
+ alias_method :entries, :to_ary
72
80
 
73
81
  def indices
74
82
  (0...size).to_a
75
83
  end
84
+
76
85
  alias_method :indexes, :indices
77
86
  alias_method :indeces, :indices
78
87
 
79
- def to_ary
80
- values
81
- end
82
-
83
88
  def size
84
89
  # only defined :length in Arrow?
85
90
  @data.length
86
91
  end
92
+
87
93
  alias_method :length, :size
88
94
  alias_method :n_rows, :size
89
95
  alias_method :nrow, :size
@@ -93,39 +99,43 @@ module RedAmber
93
99
  end
94
100
 
95
101
  def type
96
- @data.value_type.nick.to_sym
102
+ list? ? :list : @data.value_type.nick.to_sym
97
103
  end
98
104
 
99
105
  def boolean?
100
- type_class == Arrow::BooleanDataType
106
+ @data.boolean?
101
107
  end
102
108
 
103
109
  def numeric?
104
- type_class < Arrow::NumericDataType
110
+ @data.numeric?
105
111
  end
106
112
 
107
113
  def float?
108
- type_class < Arrow::FloatingPointDataType
114
+ @data.float?
109
115
  end
110
116
 
111
117
  def integer?
112
- type_class < Arrow::IntegerDataType
118
+ @data.integer?
113
119
  end
114
120
 
115
121
  def string?
116
- type_class == Arrow::StringDataType
122
+ @data.string?
117
123
  end
118
124
 
119
125
  def dictionary?
120
- type_class == Arrow::DictionaryDataType
126
+ @data.dictionary?
121
127
  end
122
128
 
123
129
  def temporal?
124
- type_class < Arrow::TemporalDataType
130
+ @data.temporal?
131
+ end
132
+
133
+ def list?
134
+ @data.list?
125
135
  end
126
136
 
127
137
  def type_class
128
- @data.value_data_type.class
138
+ @data.type_class
129
139
  end
130
140
 
131
141
  def each
@@ -12,7 +12,8 @@ module RedAmber
12
12
  module VectorFunctions
13
13
  # [Unary aggregations]: vector.func => scalar
14
14
  unary_aggregations =
15
- %i[all any approximate_median count count_distinct max mean min min_max product stddev sum variance]
15
+ %i[all any approximate_median count count_distinct max mean min min_max
16
+ product stddev sum variance]
16
17
  unary_aggregations.each do |function|
17
18
  define_method(function) do |**options|
18
19
  datum = exec_func_unary(function, options)
@@ -54,7 +55,10 @@ module RedAmber
54
55
  # @param min_count [Integer] min count.
55
56
  # @return [Float] quantile.
56
57
  def quantile(prob = 0.5, interpolation: :linear, skip_nils: true, min_count: 0)
57
- raise VectorArgumentError, "Invalid: probability #{prob} must be between 0 and 1" unless (0..1).cover? prob
58
+ unless (0..1).cover? prob
59
+ raise VectorArgumentError,
60
+ "Invalid: probability #{prob} must be between 0 and 1"
61
+ end
58
62
 
59
63
  datum = find(:quantile).execute([data],
60
64
  q: prob,
@@ -66,7 +70,8 @@ module RedAmber
66
70
 
67
71
  # Return quantiles in a DataFrame
68
72
  #
69
- def quantiles(probs = [1.0, 0.75, 0.5, 0.25, 0.0], interpolation: :linear, skip_nils: true, min_count: 0)
73
+ def quantiles(probs = [1.0, 0.75, 0.5, 0.25, 0.0],
74
+ interpolation: :linear, skip_nils: true, min_count: 0)
70
75
  if probs.empty? || !probs.all? { |q| (0..1).cover?(q) }
71
76
  raise VectorArgumentError, "Invarid probavilities #{probs}"
72
77
  end
@@ -74,20 +79,23 @@ module RedAmber
74
79
  DataFrame.new(
75
80
  probs: probs,
76
81
  quantiles: probs.map do |q|
77
- quantile(q, interpolation: interpolation, skip_nils: skip_nils, min_count: min_count)
82
+ quantile(q,
83
+ interpolation: interpolation, skip_nils: skip_nils,
84
+ min_count: min_count)
78
85
  end
79
86
  )
80
87
  end
81
88
 
82
89
  # [Unary element-wise]: vector.func => vector
83
90
  unary_element_wise =
84
- %i[abs acos asin array_sort_indices atan bit_wise_not ceil cos fill_null_backward \
85
- fill_null_forward floor is_finite is_inf is_nan is_null is_valid ln log10 log1p log2 \
91
+ %i[abs acos asin array_sort_indices atan bit_wise_not ceil cos
92
+ fill_null_backward fill_null_forward floor
93
+ is_finite is_inf is_nan is_null is_valid ln log10 log1p log2
86
94
  round round_to_multiple sign sin tan trunc unique]
87
95
  unary_element_wise.each do |function|
88
96
  define_method(function) do |**options|
89
97
  datum = exec_func_unary(function, options)
90
- Vector.new(datum.value)
98
+ Vector.create(datum.value)
91
99
  end
92
100
  end
93
101
  alias_method :is_nil, :is_null
@@ -113,12 +121,12 @@ module RedAmber
113
121
  unary_element_wise_op.each do |function, operator|
114
122
  define_method(function) do |**options|
115
123
  datum = exec_func_unary(function, options)
116
- Vector.new(datum.value)
124
+ Vector.create(datum.value)
117
125
  end
118
126
 
119
127
  define_method(operator) do |**options|
120
128
  datum = exec_func_unary(function, options)
121
- Vector.new(datum.value)
129
+ Vector.create(datum.value)
122
130
  end
123
131
  end
124
132
  alias_method :not, :invert
@@ -129,7 +137,7 @@ module RedAmber
129
137
  binary_element_wise.each do |function|
130
138
  define_method(function) do |other, **options|
131
139
  datum = exec_func_binary(function, other, options)
132
- Vector.new(datum.value)
140
+ Vector.create(datum.value)
133
141
  end
134
142
  end
135
143
 
@@ -145,7 +153,7 @@ module RedAmber
145
153
  logical_binary_element_wise.each do |method, function|
146
154
  define_method(method) do |other, **options|
147
155
  datum = exec_func_binary(function, other, options)
148
- Vector.new(datum.value)
156
+ Vector.create(datum.value)
149
157
  end
150
158
  end
151
159
 
@@ -171,12 +179,12 @@ module RedAmber
171
179
  binary_element_wise_op.each do |function, operator|
172
180
  define_method(function) do |other, **options|
173
181
  datum = exec_func_binary(function, other, options)
174
- Vector.new(datum.value)
182
+ Vector.create(datum.value)
175
183
  end
176
184
 
177
185
  define_method(operator) do |other, **options|
178
186
  datum = exec_func_binary(function, other, options)
179
- Vector.new(datum.value)
187
+ Vector.create(datum.value)
180
188
  end
181
189
  end
182
190
  alias_method :eq, :equal
@@ -187,76 +195,9 @@ module RedAmber
187
195
  alias_method :ne, :not_equal
188
196
 
189
197
  def coerce(other)
190
- case other
191
- when Vector, Array, Arrow::Array
192
- raise VectorArgumentError, "Size unmatch: #{size} != #{other.length}" unless size == other.length
193
-
194
- [Vector.new(Array(other)), self]
195
- end
196
198
  [Vector.new(Array(other) * size), self]
197
199
  end
198
200
 
199
- # < Not implimented yet > ---
200
-
201
- # option(s) required
202
- # - index
203
-
204
- # Returns other than value
205
- # - mode
206
- # - tdigest
207
-
208
- # Functions with numerical range check (unary)
209
- # - abs_checked acos_checked asin_checked cos_checked ln_checked
210
- # log10_checked log1p_checked log2_checked sin_checked tan_checked
211
-
212
- # Functions with numerical range check (binary)
213
- # - add_checked divide_checked logb_checked multiply_checked power_checked subtract_checked
214
- # shift_left_checked shift_right_checked
215
-
216
- # (array functions)
217
- # dictionary_encode,
218
- # partition_nth_indices,
219
- # quarter, quarters_between,
220
-
221
- # (strings)
222
- # ascii_capitalize, ascii_center, ascii_is_alnum, ascii_is_alpha, ascii_is_decimal,
223
- # ascii_is_lower, ascii_is_printable, ascii_is_space, ascii_is_title, ascii_is_upper,
224
- # ascii_lower, ascii_lpad, ascii_ltrim, ascii_ltrim_whitespace, ascii_reverse,
225
- # ascii_rpad, ascii_rtrim, ascii_rtrim_whitespace, ascii_split_whitespace,
226
- # ascii_swapcase, ascii_title, ascii_trim, ascii_trim_whitespace, ascii_upper,
227
- # binary_join, binary_join_element_wise, binary_length, binary_repeat,
228
- # binary_replace_slice, binary_reverse, count_substring, count_substring_regex,
229
- # ends_with, extract_regex, find_substring, find_substring_regex,
230
- # match_like, match_substring, match_substring_regex, replace_substring,
231
- # replace_substring_regex, split_pattern, split_pattern_regex, starts_with,
232
- # string_is_ascii, utf8_capitalize, utf8_center, utf8_is_alnum, utf8_is_alpha,
233
- # utf8_is_decimal, utf8_is_digit, utf8_is_lower, utf8_is_numeric, utf8_is_printable,
234
- # utf8_is_space, utf8_is_title, utf8_is_upper, utf8_length, utf8_lower, utf8_lpad,
235
- # utf8_ltrim, utf8_ltrim_whitespace, utf8_normalize, utf8_replace_slice, utf8_reverse,
236
- # utf8_rpad, utf8_rtrim, utf8_rtrim_whitespace, utf8_slice_codeunits, utf8_split_whitespace,
237
- # utf8_swapcase, utf8_title, utf8_trim, utf8_trim_whitespace, utf8_upper
238
-
239
- # (temporal)
240
- # assume_timezone, ceil_temporal, day, day_of_week, day_of_year, day_time_interval_between,
241
- # days_between, floor_temporal, hour, hours_between, iso_calendar, iso_week, iso_year,
242
- # microsecond, microseconds_between, millisecond, milliseconds_between, minute,
243
- # minutes_between, month, month_day_nano_interval_between, month_interval_between,
244
- # nanosecond, nanoseconds_between, round_temporal, second, seconds_between, strftime,
245
- # strptime, subsecond, us_week, week, weeks_between, year, year_month_day, years_between
246
-
247
- # (onditional)
248
- # case_when, cast,
249
-
250
- # (indices)
251
- # choose, index_in, index_in_meta_binary, indices_nonzero
252
-
253
- # (others)
254
- # coalesce,
255
- # is_in_meta_binary,
256
- # list_element, list_flatten, list_parent_indices, list_value_length, make_struct,
257
- # max_element_wise, min_element_wise, random, select_k_unstable,
258
- # struct_field,
259
-
260
201
  private # =======
261
202
 
262
203
  def exec_func_unary(function, options)
@@ -269,10 +210,9 @@ module RedAmber
269
210
  case other
270
211
  when Vector
271
212
  find(function).execute([data, other.data], options)
272
- when Arrow::Array, Arrow::ChunkedArray, Arrow::Scalar, Array, Numeric, String, TrueClass, FalseClass
213
+ when Arrow::Array, Arrow::ChunkedArray, Arrow::Scalar,
214
+ Array, Numeric, String, TrueClass, FalseClass
273
215
  find(function).execute([data, other], options)
274
- else
275
- raise VectorArgumentError, "Operand is not supported: #{other.class}"
276
216
  end
277
217
  end
278
218