red_amber 0.4.1 → 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -10,6 +10,45 @@ module RedAmber
10
10
  using RefineArray
11
11
  using RefineArrayLike
12
12
 
13
+ # Entity to select sub-dataframes
14
+ class Selectors
15
+ attr_reader :selectors, :size, :sizes
16
+
17
+ def initialize(selectors)
18
+ @selectors = selectors
19
+ @size = selectors.size
20
+ @sizes = []
21
+ end
22
+
23
+ # Generic iterator method
24
+ def each
25
+ @selectors.each
26
+ end
27
+ end
28
+
29
+ # Boolean selectors of sub-dataframes
30
+ class Filters < Selectors
31
+ # Return sizes of filter
32
+ # @return [Array<Integer>]
33
+ # sizes of each sub dataframe.
34
+ # Counts true for each filter.
35
+ def sizes
36
+ @sizes = @selectors.map { |s| s.to_a.count { _1 } } # rubocop:disable Performance/Size
37
+ end
38
+ end
39
+
40
+ # Index selectors of sub-dataframes
41
+ class Indices < Selectors
42
+ # Return sizes of selector indices.
43
+ # @return [Array<Integer>]
44
+ # sizes of each sub dataframe.
45
+ def sizes
46
+ @sizes = @selectors.map(&:size)
47
+ end
48
+ end
49
+
50
+ private_constant :Selectors, :Filters, :Indices
51
+
13
52
  class << self
14
53
  # Create SubFrames from a Group.
15
54
  #
@@ -61,7 +100,7 @@ module RedAmber
61
100
  # @since 0.4.0
62
101
  #
63
102
  def by_group(group)
64
- SubFrames.new(group.dataframe, group.filters)
103
+ SubFrames.by_filters(group.dataframe, group.filters)
65
104
  end
66
105
 
67
106
  # Create a new SubFrames object from a DataFrame and an array of indices.
@@ -79,13 +118,8 @@ module RedAmber
79
118
  def by_indices(dataframe, subset_indices)
80
119
  instance = allocate
81
120
  instance.instance_variable_set(:@baseframe, dataframe)
82
- enum =
83
- Enumerator.new(subset_indices.size) do |y|
84
- subset_indices.each do |i|
85
- y.yield DataFrame.new_dataframe_with_schema(dataframe, dataframe.take(i))
86
- end
87
- end
88
- instance.instance_variable_set(:@enum, enum)
121
+ instance.instance_variable_set(:@selectors, Indices.new(subset_indices))
122
+ instance.instance_variable_set(:@frames, [])
89
123
  instance
90
124
  end
91
125
 
@@ -105,13 +139,8 @@ module RedAmber
105
139
  def by_filters(dataframe, subset_filters)
106
140
  instance = allocate
107
141
  instance.instance_variable_set(:@baseframe, dataframe)
108
- enum =
109
- Enumerator.new(subset_filters.size) do |y|
110
- subset_filters.each do |i|
111
- y.yield DataFrame.new_dataframe_with_schema(dataframe, dataframe.filter(i))
112
- end
113
- end
114
- instance.instance_variable_set(:@enum, enum)
142
+ instance.instance_variable_set(:@selectors, Filters.new(subset_filters))
143
+ instance.instance_variable_set(:@frames, [])
115
144
  instance
116
145
  end
117
146
 
@@ -130,18 +159,13 @@ module RedAmber
130
159
  case Array(dataframes)
131
160
  when [] || [nil]
132
161
  instance.instance_variable_set(:@baseframe, DataFrame.new)
162
+ instance.instance_variable_set(:@selectors, [])
133
163
  instance.instance_variable_set(:@frames, [])
134
- enum = [].each
135
164
  else
136
- enum =
137
- Enumerator.new(dataframes.size) do |y|
138
- dataframes.each do |i|
139
- y.yield i
140
- end
141
- end
142
- instance.instance_variable_set(:@baseframe, enum.lazy)
165
+ instance.instance_variable_set(:@baseframe, nil)
166
+ instance.instance_variable_set(:@selectors, nil)
167
+ instance.instance_variable_set(:@frames, dataframes)
143
168
  end
144
- instance.instance_variable_set(:@enum, enum)
145
169
  instance
146
170
  end
147
171
 
@@ -261,40 +285,34 @@ module RedAmber
261
285
  #
262
286
  # @since 0.4.0
263
287
  #
264
- def initialize(dataframe, subset_specifier = nil, &block)
288
+ def initialize(dataframe, selectors = nil, &block)
265
289
  unless dataframe.is_a?(DataFrame)
266
290
  raise SubFramesArgumentError, "not a DataFrame: #{dataframe}"
267
291
  end
268
292
 
269
293
  if block
270
- unless subset_specifier.nil?
294
+ unless selectors.nil?
271
295
  raise SubFramesArgumentError, 'Must not specify both arguments and block.'
272
296
  end
273
297
 
274
- subset_specifier = yield(dataframe)
298
+ selectors = yield(dataframe)
275
299
  end
276
300
 
277
- if dataframe.empty? || subset_specifier.nil? || subset_specifier.empty?
301
+ if dataframe.empty? || selectors.nil? || selectors.size.zero? # rubocop:disable Style/ZeroLengthPredicate
278
302
  @baseframe = DataFrame.new
279
- @frames = []
280
- @enum = @frames.each
303
+ @selectors = Selectors.new([])
281
304
  else
282
- @baseframe = nil
283
- @enum =
284
- Enumerator.new(subset_specifier.size) do |yielder|
285
- subset_specifier.map do |i|
286
- df =
287
- if i.numeric?
288
- dataframe.take(i)
289
- elsif i.boolean?
290
- dataframe.filter(i)
291
- else
292
- raise SubFramesArgumentError, "illegal type: #{i}"
293
- end
294
- yielder.yield DataFrame.new_dataframe_with_schema(dataframe, df)
295
- end
305
+ @baseframe = dataframe
306
+ @selectors =
307
+ if selectors.first.boolean?
308
+ Filters.new(selectors)
309
+ elsif selectors.first.numeric?
310
+ Indices.new(selectors)
311
+ else
312
+ raise SubFramesArgumentError, "illegal type: #{selectors}"
296
313
  end
297
314
  end
315
+ @frames = []
298
316
  end
299
317
 
300
318
  # Return concatenated SubFrames as a DataFrame.
@@ -305,11 +323,7 @@ module RedAmber
305
323
  # @since 0.4.0
306
324
  #
307
325
  def baseframe
308
- if @baseframe.nil? || @baseframe.is_a?(Enumerator)
309
- @baseframe = reduce(&:concatenate)
310
- else
311
- @baseframe
312
- end
326
+ @baseframe ||= reduce(&:concatenate)
313
327
  end
314
328
  alias_method :concatenate, :baseframe
315
329
  alias_method :concat, :baseframe
@@ -384,7 +398,19 @@ module RedAmber
384
398
  def each(&block)
385
399
  return enum_for(__method__) { size } unless block
386
400
 
387
- frames.each(&block)
401
+ if @selectors
402
+ @selectors.each.with_index do |selector, i|
403
+ if i < @frames.size
404
+ yield @frames[i]
405
+ else
406
+ frame = get_subframe(selector)
407
+ @frames << frame
408
+ yield frame
409
+ end
410
+ end
411
+ else
412
+ @frames.each(&block)
413
+ end
388
414
  nil
389
415
  end
390
416
 
@@ -916,6 +942,26 @@ module RedAmber
916
942
  #
917
943
  define_subframable_method :filter_map
918
944
 
945
+ # Return 0...num sub-dataframes in self.
946
+ #
947
+ # @param num [Integer, Float]
948
+ # number of sub-dataframes to pick up. `num` must be positive or zero.
949
+ # @return [SubFrames]
950
+ # A new SubFrames.
951
+ # If num == 0, it returns empty SubFrames.
952
+ # If num >= size, it returns self.
953
+ # @since 0.4.2
954
+ #
955
+ def take(num)
956
+ if num.zero?
957
+ SubFrames.new(DataFrame.new, [])
958
+ elsif num >= size
959
+ self
960
+ else
961
+ SubFrames.by_dataframes(frames(num))
962
+ end
963
+ end
964
+
919
965
  # Number of subsets.
920
966
  #
921
967
  # @return [Integer]
@@ -923,7 +969,12 @@ module RedAmber
923
969
  # @since 0.4.0
924
970
  #
925
971
  def size
926
- @size ||= @enum.size
972
+ @size ||=
973
+ if @selectors
974
+ @selectors.size
975
+ else
976
+ @frames.size
977
+ end
927
978
  end
928
979
 
929
980
  # Size list of subsets.
@@ -933,7 +984,12 @@ module RedAmber
933
984
  # @since 0.4.0
934
985
  #
935
986
  def sizes
936
- @sizes ||= @enum.map(&:size)
987
+ @sizes ||=
988
+ if @selectors
989
+ @selectors.sizes
990
+ else
991
+ @frames.map(&:size)
992
+ end
937
993
  end
938
994
 
939
995
  # Indices at the top of each sub DataFrames.
@@ -945,10 +1001,17 @@ module RedAmber
945
1001
  # @since 0.4.0
946
1002
  #
947
1003
  def offset_indices
948
- sum = 0
949
- sizes.map do |size|
950
- sum += size
951
- sum - size
1004
+ case @selectors
1005
+ when Filters
1006
+ @selectors.selectors.map do |selector|
1007
+ selector.each.with_index.find { |x, _| x }[1]
1008
+ end
1009
+ else # Indices, nil
1010
+ sum = 0
1011
+ sizes.map do |size|
1012
+ sum += size
1013
+ sum - size
1014
+ end
952
1015
  end
953
1016
  end
954
1017
 
@@ -965,11 +1028,11 @@ module RedAmber
965
1028
  # Test if self has only one subset and it is comprehensive.
966
1029
  #
967
1030
  # @return [true, false]
968
- # true if only member of self is equal to universal DataFrame.
1031
+ # true if the only member of self is equal to universal DataFrame.
969
1032
  # @since 0.4.0
970
1033
  #
971
1034
  def universal?
972
- size == 1 && @enum.first == baseframe
1035
+ size == 1 && first == @baseframe
973
1036
  end
974
1037
 
975
1038
  # Return string representation of self.
@@ -1012,7 +1075,7 @@ module RedAmber
1012
1075
  #
1013
1076
  # @since 0.4.0
1014
1077
  #
1015
- def to_s(limit: 16)
1078
+ def to_s(limit: 5)
1016
1079
  _to_s(limit: limit)
1017
1080
  end
1018
1081
 
@@ -1064,10 +1127,10 @@ module RedAmber
1064
1127
  #
1065
1128
  # @since 0.4.0
1066
1129
  #
1067
- def inspect(limit: 16)
1130
+ def inspect(limit: 5)
1068
1131
  shape =
1069
- if @baseframe.is_a?(Enumerator)
1070
- "Enumerator::Lazy:size=#{@baseframe.size}"
1132
+ if @baseframe.nil?
1133
+ '(Not prepared)'
1071
1134
  else
1072
1135
  baseframe.shape_str(with_id: true)
1073
1136
  end
@@ -1079,14 +1142,51 @@ module RedAmber
1079
1142
  "---\n#{_to_s(limit: limit, with_id: true)}"
1080
1143
  end
1081
1144
 
1145
+ # Return an Array of sub DataFrames
1146
+ #
1147
+ # @overload frames
1148
+ # Returns all sub dataframes.
1149
+ #
1150
+ # @return [Array<DataFrame>]
1151
+ # sub DataFrames.
1152
+ #
1153
+ # @overload frames(n_frames)
1154
+ # Returns partial sub dataframes.
1155
+ #
1156
+ # @param n_frames [Integer]
1157
+ # num of dataframes to retrieve.
1158
+ # @return [Array<DataFrame>]
1159
+ # sub DataFrames.
1160
+ #
1161
+ # @since 0.4.2
1162
+ #
1163
+ def frames(n_frames = nil)
1164
+ n_frames = size if n_frames.nil?
1165
+
1166
+ if @frames.size < n_frames
1167
+ @frames = each.take(n_frames)
1168
+ else
1169
+ @frames.take(n_frames)
1170
+ end
1171
+ end
1172
+
1082
1173
  private
1083
1174
 
1084
- def frames
1085
- @frames ||= @enum.to_a
1175
+ # Get sub dataframe specified by 'selector'
1176
+ def get_subframe(selector)
1177
+ df =
1178
+ case @selectors
1179
+ when Filters
1180
+ @baseframe.filter(selector)
1181
+ when Indices
1182
+ @baseframe.take(selector)
1183
+ end
1184
+ DataFrame.new_dataframe_with_schema(@baseframe, df)
1086
1185
  end
1087
1186
 
1088
- def _to_s(limit: 16, with_id: false)
1089
- a = take(limit).map do |df|
1187
+ # Subcontractor of to_s
1188
+ def _to_s(limit: 5, with_id: false)
1189
+ a = each.take(limit).map do |df|
1090
1190
  if with_id
1091
1191
  "#<#{df.shape_str(with_id: with_id)}>\n" \
1092
1192
  "#{df.to_s(head: 2, tail: 2)}"
@@ -10,21 +10,54 @@ module RedAmber
10
10
  include ArrowFunction
11
11
  include VectorUpdatable
12
12
  include VectorSelectable
13
+ include VectorStringFunction
13
14
 
14
15
  using RefineArrayLike
15
16
 
16
- # Quicker constructor of Vector.
17
+ # Entity of Vector.
17
18
  #
18
- # @param arrow_array [Arrow::Array]
19
- # Arrow::Array object to have in the Vector.
20
- # @return [Vector]
21
- # created Vector.
22
- # @note This method doesn't check argment type.
19
+ # @return [Arrow::Array]
20
+ #
21
+ attr_reader :data
22
+ alias_method :to_arrow_array, :data
23
+
24
+ # Associated key name when self is in a DataFrame.
25
+ #
26
+ # Default Vector is 'head-less' (key-less).
27
+ # @return [Symbol]
23
28
  #
24
- def self.create(arrow_array)
25
- instance = allocate
26
- instance.instance_variable_set(:@data, arrow_array)
27
- instance
29
+ attr_accessor :key
30
+
31
+ class << self
32
+ # Create a Vector (calling `.new`).
33
+ #
34
+ # @param (see #initialize)
35
+ # @return (see #initialize)
36
+ # @example Create an empty Vector.
37
+ # Vector[]
38
+ # # =>
39
+ # #<RedAmber::Vector(:string, size=0):0x000000000000e2cc>
40
+ # []
41
+ #
42
+ # @since 0.5.0
43
+ #
44
+ def [](...)
45
+ new(...)
46
+ end
47
+
48
+ # Quicker constructor of Vector.
49
+ #
50
+ # @param arrow_array [Arrow::Array]
51
+ # Arrow::Array object to have in the Vector.
52
+ # @return [Vector]
53
+ # created Vector.
54
+ # @note This method doesn't check argument type.
55
+ #
56
+ def create(arrow_array)
57
+ instance = allocate
58
+ instance.instance_variable_set(:@data, arrow_array)
59
+ instance
60
+ end
28
61
  end
29
62
 
30
63
  # Create a Vector.
@@ -51,20 +84,6 @@ module RedAmber
51
84
  end
52
85
  end
53
86
 
54
- # Entity of Vector.
55
- #
56
- # @return [Arrow::Array]
57
- #
58
- attr_reader :data
59
- alias_method :to_arrow_array, :data
60
-
61
- # Associated key name when self is in a DataFrame.
62
- #
63
- # Default Vector is 'head-less' (key-less).
64
- # @return [Symbol]
65
- #
66
- attr_accessor :key
67
-
68
87
  # Return other as a Vector which is same data type as self.
69
88
  #
70
89
  # @param other [Vector, Array, Arrow::Array, Arrow::ChunkedArray]
@@ -161,6 +161,22 @@ module RedAmber
161
161
  #
162
162
  define_unary_aggregation :min_max
163
163
 
164
+ # Compute the 1 most common values and their respective
165
+ # occurrence counts.
166
+ #
167
+ # @note Self must be a numeric or a boolean Vector.
168
+ # @note ModeOptions are not supported in 0.5.0.
169
+ # Only one mode value is returned.
170
+ # @api private
171
+ # @return [Hash{'mode'=>mode, 'count'=>count}]
172
+ # mode and count of self in a Hash.
173
+ # @since 0.5.0
174
+ #
175
+ def mode
176
+ datum = find(:mode).execute([data])
177
+ datum.value.to_a.first
178
+ end
179
+
164
180
  # Compute product value of self.
165
181
  #
166
182
  # @note Self must be a numeric Vector.
@@ -241,6 +257,16 @@ module RedAmber
241
257
  # - nearest: returns i or j, whichever is closer.
242
258
  # - midpoint: returns (i + j) / 2.
243
259
 
260
+ # Get a non-nil element in self.
261
+ #
262
+ # @return [Object, nil]
263
+ # first non-nil value detected. If all elements are nil, return nil.
264
+ # @since 0.5.0
265
+ #
266
+ def one
267
+ each.find { !_1.nil? }
268
+ end
269
+
244
270
  # Returns a quantile value.
245
271
  # - 0.5 quantile (median) is returned by default.
246
272
  # - Or return quantile for specified probability (prob).
@@ -236,7 +236,6 @@ module RedAmber
236
236
  # division of self by other.
237
237
  #
238
238
  define_binary_element_wise :divide
239
- alias_method :div, :divide
240
239
  alias_method :'/', :divide
241
240
 
242
241
  # Divide the arguments element-wise.
@@ -248,21 +247,21 @@ module RedAmber
248
247
 
249
248
  # Returns element-wise modulo.
250
249
  #
251
- # This is equivalent to `self-other*(self/other).floor`.
252
- # @param other [Vector, numeric]
253
- # other numeric Vector or numeric scalar.
250
+ # This is equivalent to `self-divisor*(self/divisor).floor`.
251
+ # @note Same behavior as Ruby.
252
+ # @param divisor [Vector, numeric]
253
+ # divisor numeric Vector or numeric scalar.
254
254
  # @return [Vector]
255
- # modulo of dividing self by other.
255
+ # modulo of dividing self by divisor.
256
256
  #
257
- def modulo(other)
258
- other = other.data if other.is_a?(Vector)
259
- d = find(:divide).execute([data, other])
257
+ def modulo(divisor)
258
+ divisor = divisor.data if divisor.is_a?(Vector)
259
+ d = find(:divide).execute([data, divisor])
260
260
  d = find(:floor).execute([d]) if d.value.is_a?(Arrow::DoubleArray)
261
- m = find(:multiply).execute([d, other])
261
+ m = find(:multiply).execute([d, divisor])
262
262
  datum = find(:subtract).execute([data, m])
263
263
  Vector.create(datum.value)
264
264
  end
265
- alias_method :mod, :modulo
266
265
  alias_method :'%', :modulo
267
266
 
268
267
  # Returns element-wise modulo.
@@ -270,11 +269,11 @@ module RedAmber
270
269
  # This function is an overflow-checking variant of #modulo.
271
270
  # @return (see #modulo)
272
271
  #
273
- def modulo_checked(other)
274
- other = other.data if other.is_a?(Vector)
275
- d = find(:divide_checked).execute([data, other])
272
+ def modulo_checked(divisor)
273
+ divisor = divisor.data if divisor.is_a?(Vector)
274
+ d = find(:divide_checked).execute([data, divisor])
276
275
  d = find(:floor).execute([d]) if d.value.is_a?(Arrow::DoubleArray)
277
- m = find(:multiply_checked).execute([d, other])
276
+ m = find(:multiply_checked).execute([d, divisor])
278
277
  datum = find(:subtract_checked).execute([data, m])
279
278
  Vector.create(datum.value)
280
279
  end
@@ -323,27 +322,57 @@ module RedAmber
323
322
 
324
323
  # Returns element-wise quotient by double Vector.
325
324
  #
326
- # @param other [Vector, numeric]
327
- # other numeric Vector or numeric scalar.
325
+ # @param divisor [Vector, numeric]
326
+ # divisor numeric Vector or numeric scalar.
328
327
  # @return [Vector]
329
- # quotient of dividing self by other.
328
+ # quotient of dividing self by divisor.
330
329
  #
331
- def quotient(other)
332
- other = other.data if other.is_a?(Vector)
333
- datum = find(:divide).execute([Arrow::DoubleArray.new(data), other])
330
+ def fdiv(divisor)
331
+ divisor = divisor.data if divisor.is_a?(Vector)
332
+ datum = find(:divide).execute([Arrow::DoubleArray.new(data), divisor])
334
333
  Vector.create(datum.value)
335
334
  end
336
- alias_method :quo, :quotient
337
- alias_method :fdiv, :quotient
338
335
 
339
336
  # Returns element-wise quotient by double Vector.
340
337
  #
341
338
  # This function is an overflow-checking variant of #fdiv.
342
339
  # @return (see #fdiv)
343
340
  #
344
- def quotient_checked(other)
345
- other = other.data if other.is_a?(Vector)
346
- datum = find(:divide_checked).execute([Arrow::DoubleArray.new(data), other])
341
+ def fdiv_checked(divisor)
342
+ divisor = divisor.data if divisor.is_a?(Vector)
343
+ datum = find(:divide_checked).execute([Arrow::DoubleArray.new(data), divisor])
344
+ Vector.create(datum.value)
345
+ end
346
+
347
+ # Returns element-wise remainder.
348
+ #
349
+ # This is equivalent to `self-divisor*(self/divisor).trunc`.
350
+ # @note Same behavior as Ruby's remainder.
351
+ # @param divisor [Vector, numeric]
352
+ # divisor numeric Vector or numeric scalar.
353
+ # @return [Vector]
354
+ # remainder of dividing self by divisor.
355
+ #
356
+ def remainder(divisor)
357
+ divisor = divisor.data if divisor.is_a?(Vector)
358
+ d = find(:divide).execute([data, divisor])
359
+ d = find(:trunc).execute([d]) if d.value.is_a?(Arrow::DoubleArray)
360
+ m = find(:multiply).execute([d, divisor])
361
+ datum = find(:subtract).execute([data, m])
362
+ Vector.create(datum.value)
363
+ end
364
+
365
+ # Returns element-wise remainder.
366
+ #
367
+ # This function is an overflow-checking variant of #remainder.
368
+ # @return (see #remainder)
369
+ #
370
+ def remainder_checked(divisor)
371
+ divisor = divisor.data if divisor.is_a?(Vector)
372
+ d = find(:divide_checked).execute([data, divisor])
373
+ d = find(:trunc).execute([d]) if d.value.is_a?(Arrow::DoubleArray)
374
+ m = find(:multiply_checked).execute([d, divisor])
375
+ datum = find(:subtract_checked).execute([data, m])
347
376
  Vector.create(datum.value)
348
377
  end
349
378