red_amber 0.4.1 → 0.5.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -10,6 +10,45 @@ module RedAmber
10
10
  using RefineArray
11
11
  using RefineArrayLike
12
12
 
13
+ # Entity to select sub-dataframes
14
+ class Selectors
15
+ attr_reader :selectors, :size, :sizes
16
+
17
+ def initialize(selectors)
18
+ @selectors = selectors
19
+ @size = selectors.size
20
+ @sizes = []
21
+ end
22
+
23
+ # Generic iterator method
24
+ def each
25
+ @selectors.each
26
+ end
27
+ end
28
+
29
+ # Boolean selectors of sub-dataframes
30
+ class Filters < Selectors
31
+ # Return sizes of filter
32
+ # @return [Array<Integer>]
33
+ # sizes of each sub dataframes.
34
+ # Counts true for each filter.
35
+ def sizes
36
+ @sizes = @selectors.map { |s| s.to_a.count { _1 } } # rubocop:disable Performance/Size
37
+ end
38
+ end
39
+
40
+ # Index selectors of sub-dataframes
41
+ class Indices < Selectors
42
+ # Return sizes of selector indices.
43
+ # @return [Array<Integer>]
44
+ # sizes of each sub dataframes.
45
+ def sizes
46
+ @sizes = @selectors.map(&:size)
47
+ end
48
+ end
49
+
50
+ private_constant :Selectors, :Filters, :Indices
51
+
13
52
  class << self
14
53
  # Create SubFrames from a Group.
15
54
  #
@@ -61,7 +100,7 @@ module RedAmber
61
100
  # @since 0.4.0
62
101
  #
63
102
  def by_group(group)
64
- SubFrames.new(group.dataframe, group.filters)
103
+ SubFrames.by_filters(group.dataframe, group.filters)
65
104
  end
66
105
 
67
106
  # Create a new SubFrames object from a DataFrame and an array of indices.
@@ -79,13 +118,8 @@ module RedAmber
79
118
  def by_indices(dataframe, subset_indices)
80
119
  instance = allocate
81
120
  instance.instance_variable_set(:@baseframe, dataframe)
82
- enum =
83
- Enumerator.new(subset_indices.size) do |y|
84
- subset_indices.each do |i|
85
- y.yield DataFrame.new_dataframe_with_schema(dataframe, dataframe.take(i))
86
- end
87
- end
88
- instance.instance_variable_set(:@enum, enum)
121
+ instance.instance_variable_set(:@selectors, Indices.new(subset_indices))
122
+ instance.instance_variable_set(:@frames, [])
89
123
  instance
90
124
  end
91
125
 
@@ -105,13 +139,8 @@ module RedAmber
105
139
  def by_filters(dataframe, subset_filters)
106
140
  instance = allocate
107
141
  instance.instance_variable_set(:@baseframe, dataframe)
108
- enum =
109
- Enumerator.new(subset_filters.size) do |y|
110
- subset_filters.each do |i|
111
- y.yield DataFrame.new_dataframe_with_schema(dataframe, dataframe.filter(i))
112
- end
113
- end
114
- instance.instance_variable_set(:@enum, enum)
142
+ instance.instance_variable_set(:@selectors, Filters.new(subset_filters))
143
+ instance.instance_variable_set(:@frames, [])
115
144
  instance
116
145
  end
117
146
 
@@ -130,18 +159,13 @@ module RedAmber
130
159
  case Array(dataframes)
131
160
  when [] || [nil]
132
161
  instance.instance_variable_set(:@baseframe, DataFrame.new)
162
+ instance.instance_variable_set(:@selectors, [])
133
163
  instance.instance_variable_set(:@frames, [])
134
- enum = [].each
135
164
  else
136
- enum =
137
- Enumerator.new(dataframes.size) do |y|
138
- dataframes.each do |i|
139
- y.yield i
140
- end
141
- end
142
- instance.instance_variable_set(:@baseframe, enum.lazy)
165
+ instance.instance_variable_set(:@baseframe, nil)
166
+ instance.instance_variable_set(:@selectors, nil)
167
+ instance.instance_variable_set(:@frames, dataframes)
143
168
  end
144
- instance.instance_variable_set(:@enum, enum)
145
169
  instance
146
170
  end
147
171
 
@@ -261,40 +285,34 @@ module RedAmber
261
285
  #
262
286
  # @since 0.4.0
263
287
  #
264
- def initialize(dataframe, subset_specifier = nil, &block)
288
+ def initialize(dataframe, selectors = nil, &block)
265
289
  unless dataframe.is_a?(DataFrame)
266
290
  raise SubFramesArgumentError, "not a DataFrame: #{dataframe}"
267
291
  end
268
292
 
269
293
  if block
270
- unless subset_specifier.nil?
294
+ unless selectors.nil?
271
295
  raise SubFramesArgumentError, 'Must not specify both arguments and block.'
272
296
  end
273
297
 
274
- subset_specifier = yield(dataframe)
298
+ selectors = yield(dataframe)
275
299
  end
276
300
 
277
- if dataframe.empty? || subset_specifier.nil? || subset_specifier.empty?
301
+ if dataframe.empty? || selectors.nil? || selectors.size.zero? # rubocop:disable Style/ZeroLengthPredicate
278
302
  @baseframe = DataFrame.new
279
- @frames = []
280
- @enum = @frames.each
303
+ @selectors = Selectors.new([])
281
304
  else
282
- @baseframe = nil
283
- @enum =
284
- Enumerator.new(subset_specifier.size) do |yielder|
285
- subset_specifier.map do |i|
286
- df =
287
- if i.numeric?
288
- dataframe.take(i)
289
- elsif i.boolean?
290
- dataframe.filter(i)
291
- else
292
- raise SubFramesArgumentError, "illegal type: #{i}"
293
- end
294
- yielder.yield DataFrame.new_dataframe_with_schema(dataframe, df)
295
- end
305
+ @baseframe = dataframe
306
+ @selectors =
307
+ if selectors.first.boolean?
308
+ Filters.new(selectors)
309
+ elsif selectors.first.numeric?
310
+ Indices.new(selectors)
311
+ else
312
+ raise SubFramesArgumentError, "illegal type: #{selectors}"
296
313
  end
297
314
  end
315
+ @frames = []
298
316
  end
299
317
 
300
318
  # Return concatenated SubFrames as a DataFrame.
@@ -305,11 +323,7 @@ module RedAmber
305
323
  # @since 0.4.0
306
324
  #
307
325
  def baseframe
308
- if @baseframe.nil? || @baseframe.is_a?(Enumerator)
309
- @baseframe = reduce(&:concatenate)
310
- else
311
- @baseframe
312
- end
326
+ @baseframe ||= reduce(&:concatenate)
313
327
  end
314
328
  alias_method :concatenate, :baseframe
315
329
  alias_method :concat, :baseframe
@@ -384,7 +398,19 @@ module RedAmber
384
398
  def each(&block)
385
399
  return enum_for(__method__) { size } unless block
386
400
 
387
- frames.each(&block)
401
+ if @selectors
402
+ @selectors.each.with_index do |selector, i|
403
+ if i < @frames.size
404
+ yield @frames[i]
405
+ else
406
+ frame = get_subframe(selector)
407
+ @frames << frame
408
+ yield frame
409
+ end
410
+ end
411
+ else
412
+ @frames.each(&block)
413
+ end
388
414
  nil
389
415
  end
390
416
 
@@ -916,6 +942,26 @@ module RedAmber
916
942
  #
917
943
  define_subframable_method :filter_map
918
944
 
945
+ # Return 0...num sub-dataframes in self.
946
+ #
947
+ # @param num [Integer, Float]
948
+ # num of sub-dataframes to pick up. `num` must be positive or zero.
949
+ # @return [SubFrames]
950
+ # A new SubFrames.
951
+ # If num == 0, it returns an empty SubFrames.
952
+ # If num >= size, it returns self.
953
+ # @since 0.4.2
954
+ #
955
+ def take(num)
956
+ if num.zero?
957
+ SubFrames.new(DataFrame.new, [])
958
+ elsif num >= size
959
+ self
960
+ else
961
+ SubFrames.by_dataframes(frames(num))
962
+ end
963
+ end
964
+
919
965
  # Number of subsets.
920
966
  #
921
967
  # @return [Integer]
@@ -923,7 +969,12 @@ module RedAmber
923
969
  # @since 0.4.0
924
970
  #
925
971
  def size
926
- @size ||= @enum.size
972
+ @size ||=
973
+ if @selectors
974
+ @selectors.size
975
+ else
976
+ @frames.size
977
+ end
927
978
  end
928
979
 
929
980
  # Size list of subsets.
@@ -933,7 +984,12 @@ module RedAmber
933
984
  # @since 0.4.0
934
985
  #
935
986
  def sizes
936
- @sizes ||= @enum.map(&:size)
987
+ @sizes ||=
988
+ if @selectors
989
+ @selectors.sizes
990
+ else
991
+ @frames.map(&:size)
992
+ end
937
993
  end
938
994
 
939
995
  # Indices at the top of each sub DataFrames.
@@ -945,10 +1001,17 @@ module RedAmber
945
1001
  # @since 0.4.0
946
1002
  #
947
1003
  def offset_indices
948
- sum = 0
949
- sizes.map do |size|
950
- sum += size
951
- sum - size
1004
+ case @selectors
1005
+ when Filters
1006
+ @selectors.selectors.map do |selector|
1007
+ selector.each.with_index.find { |x, _| x }[1]
1008
+ end
1009
+ else # Indices, nil
1010
+ sum = 0
1011
+ sizes.map do |size|
1012
+ sum += size
1013
+ sum - size
1014
+ end
952
1015
  end
953
1016
  end
954
1017
 
@@ -965,11 +1028,11 @@ module RedAmber
965
1028
  # Test if self has only one subset and it is comprehensive.
966
1029
  #
967
1030
  # @return [true, false]
968
- # true if only member of self is equal to universal DataFrame.
1031
+ # true if the only member of self is equal to universal DataFrame.
969
1032
  # @since 0.4.0
970
1033
  #
971
1034
  def universal?
972
- size == 1 && @enum.first == baseframe
1035
+ size == 1 && first == @baseframe
973
1036
  end
974
1037
 
975
1038
  # Return string representation of self.
@@ -1012,7 +1075,7 @@ module RedAmber
1012
1075
  #
1013
1076
  # @since 0.4.0
1014
1077
  #
1015
- def to_s(limit: 16)
1078
+ def to_s(limit: 5)
1016
1079
  _to_s(limit: limit)
1017
1080
  end
1018
1081
 
@@ -1064,10 +1127,10 @@ module RedAmber
1064
1127
  #
1065
1128
  # @since 0.4.0
1066
1129
  #
1067
- def inspect(limit: 16)
1130
+ def inspect(limit: 5)
1068
1131
  shape =
1069
- if @baseframe.is_a?(Enumerator)
1070
- "Enumerator::Lazy:size=#{@baseframe.size}"
1132
+ if @baseframe.nil?
1133
+ '(Not prepared)'
1071
1134
  else
1072
1135
  baseframe.shape_str(with_id: true)
1073
1136
  end
@@ -1079,14 +1142,51 @@ module RedAmber
1079
1142
  "---\n#{_to_s(limit: limit, with_id: true)}"
1080
1143
  end
1081
1144
 
1145
+ # Return an Array of sub DataFrames
1146
+ #
1147
+ # @overload frames
1148
+ # Returns all sub dataframes.
1149
+ #
1150
+ # @return [Array<DataFrame>]
1151
+ # sub DataFrames.
1152
+ #
1153
+ # @overload frames(n_frames)
1154
+ # Returns partial sub dataframes.
1155
+ #
1156
+ # @param n_frames [Integer]
1157
+ # num of dataframes to retrieve.
1158
+ # @return [Array<DataFrame>]
1159
+ # sub DataFrames.
1160
+ #
1161
+ # @since 0.4.2
1162
+ #
1163
+ def frames(n_frames = nil)
1164
+ n_frames = size if n_frames.nil?
1165
+
1166
+ if @frames.size < n_frames
1167
+ @frames = each.take(n_frames)
1168
+ else
1169
+ @frames.take(n_frames)
1170
+ end
1171
+ end
1172
+
1082
1173
  private
1083
1174
 
1084
- def frames
1085
- @frames ||= @enum.to_a
1175
+ # Get sub dataframe specified by 'selector'
1176
+ def get_subframe(selector)
1177
+ df =
1178
+ case @selectors
1179
+ when Filters
1180
+ @baseframe.filter(selector)
1181
+ when Indices
1182
+ @baseframe.take(selector)
1183
+ end
1184
+ DataFrame.new_dataframe_with_schema(@baseframe, df)
1086
1185
  end
1087
1186
 
1088
- def _to_s(limit: 16, with_id: false)
1089
- a = take(limit).map do |df|
1187
+ # Subcontractor of to_s
1188
+ def _to_s(limit: 5, with_id: false)
1189
+ a = each.take(limit).map do |df|
1090
1190
  if with_id
1091
1191
  "#<#{df.shape_str(with_id: with_id)}>\n" \
1092
1192
  "#{df.to_s(head: 2, tail: 2)}"
@@ -10,21 +10,54 @@ module RedAmber
10
10
  include ArrowFunction
11
11
  include VectorUpdatable
12
12
  include VectorSelectable
13
+ include VectorStringFunction
13
14
 
14
15
  using RefineArrayLike
15
16
 
16
- # Quicker constructor of Vector.
17
+ # Entity of Vector.
17
18
  #
18
- # @param arrow_array [Arrow::Array]
19
- # Arrow::Array object to have in the Vector.
20
- # @return [Vector]
21
- # created Vector.
22
- # @note This method doesn't check argment type.
19
+ # @return [Arrow::Array]
20
+ #
21
+ attr_reader :data
22
+ alias_method :to_arrow_array, :data
23
+
24
+ # Associated key name when self is in a DataFrame.
25
+ #
26
+ # Default Vector is 'head-less' (key-less).
27
+ # @return [Symbol]
23
28
  #
24
- def self.create(arrow_array)
25
- instance = allocate
26
- instance.instance_variable_set(:@data, arrow_array)
27
- instance
29
+ attr_accessor :key
30
+
31
+ class << self
32
+ # Create a Vector (calling `.new`).
33
+ #
34
+ # @param (see #initialize)
35
+ # @return (see #initialize)
36
+ # @example Create an empty Vector.
37
+ # Vector[]
38
+ # # =>
39
+ # #<RedAmber::Vector(:string, size=0):0x000000000000e2cc>
40
+ # []
41
+ #
42
+ # @since 0.5.0
43
+ #
44
+ def [](...)
45
+ new(...)
46
+ end
47
+
48
+ # Quicker constructor of Vector.
49
+ #
50
+ # @param arrow_array [Arrow::Array]
51
+ # Arrow::Array object to have in the Vector.
52
+ # @return [Vector]
53
+ # created Vector.
54
+ # @note This method doesn't check argument type.
55
+ #
56
+ def create(arrow_array)
57
+ instance = allocate
58
+ instance.instance_variable_set(:@data, arrow_array)
59
+ instance
60
+ end
28
61
  end
29
62
 
30
63
  # Create a Vector.
@@ -51,20 +84,6 @@ module RedAmber
51
84
  end
52
85
  end
53
86
 
54
- # Entity of Vector.
55
- #
56
- # @return [Arrow::Array]
57
- #
58
- attr_reader :data
59
- alias_method :to_arrow_array, :data
60
-
61
- # Associated key name when self is in a DataFrame.
62
- #
63
- # Default Vector is 'head-less' (key-less).
64
- # @return [Symbol]
65
- #
66
- attr_accessor :key
67
-
68
87
  # Return other as a Vector which is same data type as self.
69
88
  #
70
89
  # @param other [Vector, Array, Arrow::Array, Arrow::ChunkedArray]
@@ -161,6 +161,22 @@ module RedAmber
161
161
  #
162
162
  define_unary_aggregation :min_max
163
163
 
164
+ # Compute the 1 most common values and their respective
165
+ # occurrence counts.
166
+ #
167
+ # @note Self must be a numeric or a boolean Vector.
168
+ # @note ModeOptions are not supported in 0.5.0.
169
+ # Only one mode value is returned.
170
+ # @api private
171
+ # @return [Hash{'mode'=>mode, 'count'=>count}]
172
+ # mode and count of self in a Hash.
173
+ # @since 0.5.0
174
+ #
175
+ def mode
176
+ datum = find(:mode).execute([data])
177
+ datum.value.to_a.first
178
+ end
179
+
164
180
  # Compute product value of self.
165
181
  #
166
182
  # @note Self must be a numeric Vector.
@@ -241,6 +257,16 @@ module RedAmber
241
257
  # - nearest: returns i or j, whichever is closer.
242
258
  # - midpoint: returns (i + j) / 2.
243
259
 
260
+ # Get a non-nil element in self.
261
+ #
262
+ # @return [Object, nil]
263
+ # first non-nil value detected. If all elements are nil, return nil.
264
+ # @since 0.5.0
265
+ #
266
+ def one
267
+ each.find { !_1.nil? }
268
+ end
269
+
244
270
  # Returns a quantile value.
245
271
  # - 0.5 quantile (median) is returned by default.
246
272
  # - Or return quantile for specified probability (prob).
@@ -236,7 +236,6 @@ module RedAmber
236
236
  # division of self by other.
237
237
  #
238
238
  define_binary_element_wise :divide
239
- alias_method :div, :divide
240
239
  alias_method :'/', :divide
241
240
 
242
241
  # Divide the arguments element-wise.
@@ -248,21 +247,21 @@ module RedAmber
248
247
 
249
248
  # Returns element-wise modulo.
250
249
  #
251
- # This is equivalent to `self-other*(self/other).floor`.
252
- # @param other [Vector, numeric]
253
- # other numeric Vector or numeric scalar.
250
+ # This is equivalent to `self-divisor*(self/divisor).floor`.
251
+ # @note Same behavior as Ruby.
252
+ # @param divisor [Vector, numeric]
253
+ # divisor numeric Vector or numeric scalar.
254
254
  # @return [Vector]
255
- # modulo of dividing self by other.
255
+ # modulo of dividing self by divisor.
256
256
  #
257
- def modulo(other)
258
- other = other.data if other.is_a?(Vector)
259
- d = find(:divide).execute([data, other])
257
+ def modulo(divisor)
258
+ divisor = divisor.data if divisor.is_a?(Vector)
259
+ d = find(:divide).execute([data, divisor])
260
260
  d = find(:floor).execute([d]) if d.value.is_a?(Arrow::DoubleArray)
261
- m = find(:multiply).execute([d, other])
261
+ m = find(:multiply).execute([d, divisor])
262
262
  datum = find(:subtract).execute([data, m])
263
263
  Vector.create(datum.value)
264
264
  end
265
- alias_method :mod, :modulo
266
265
  alias_method :'%', :modulo
267
266
 
268
267
  # Returns element-wise modulo.
@@ -270,11 +269,11 @@ module RedAmber
270
269
  # This function is an overflow-checking variant of #modulo.
271
270
  # @return (see #modulo)
272
271
  #
273
- def modulo_checked(other)
274
- other = other.data if other.is_a?(Vector)
275
- d = find(:divide_checked).execute([data, other])
272
+ def modulo_checked(divisor)
273
+ divisor = divisor.data if divisor.is_a?(Vector)
274
+ d = find(:divide_checked).execute([data, divisor])
276
275
  d = find(:floor).execute([d]) if d.value.is_a?(Arrow::DoubleArray)
277
- m = find(:multiply_checked).execute([d, other])
276
+ m = find(:multiply_checked).execute([d, divisor])
278
277
  datum = find(:subtract_checked).execute([data, m])
279
278
  Vector.create(datum.value)
280
279
  end
@@ -323,27 +322,57 @@ module RedAmber
323
322
 
324
323
  # Returns element-wise quotient by double Vector.
325
324
  #
326
- # @param other [Vector, numeric]
327
- # other numeric Vector or numeric scalar.
325
+ # @param divisor [Vector, numeric]
326
+ # divisor numeric Vector or numeric scalar.
328
327
  # @return [Vector]
329
- # quotient of dividing self by other.
328
+ # quotient of dividing self by divisor.
330
329
  #
331
- def quotient(other)
332
- other = other.data if other.is_a?(Vector)
333
- datum = find(:divide).execute([Arrow::DoubleArray.new(data), other])
330
+ def fdiv(divisor)
331
+ divisor = divisor.data if divisor.is_a?(Vector)
332
+ datum = find(:divide).execute([Arrow::DoubleArray.new(data), divisor])
334
333
  Vector.create(datum.value)
335
334
  end
336
- alias_method :quo, :quotient
337
- alias_method :fdiv, :quotient
338
335
 
339
336
  # Returns element-wise quotient by double Vector.
340
337
  #
341
338
  # This function is a overflow-checking variant of #quotient.
342
339
  # @return (see #quotient)
343
340
  #
344
- def quotient_checked(other)
345
- other = other.data if other.is_a?(Vector)
346
- datum = find(:divide_checked).execute([Arrow::DoubleArray.new(data), other])
341
+ def fdiv_checked(divisor)
342
+ divisor = divisor.data if divisor.is_a?(Vector)
343
+ datum = find(:divide_checked).execute([Arrow::DoubleArray.new(data), divisor])
344
+ Vector.create(datum.value)
345
+ end
346
+
347
+ # Returns element-wise remainder.
348
+ #
349
+ # This is equivalent to `self-divisor*(self/divisor).trunc`.
350
+ # @note Same behavior as Ruby's remainder.
351
+ # @param divisor [Vector, numeric]
352
+ # divisor numeric Vector or numeric scalar.
353
+ # @return [Vector]
354
+ # remainder of dividing self by divisor.
355
+ #
356
+ def remainder(divisor)
357
+ divisor = divisor.data if divisor.is_a?(Vector)
358
+ d = find(:divide).execute([data, divisor])
359
+ d = find(:trunc).execute([d]) if d.value.is_a?(Arrow::DoubleArray)
360
+ m = find(:multiply).execute([d, divisor])
361
+ datum = find(:subtract).execute([data, m])
362
+ Vector.create(datum.value)
363
+ end
364
+
365
+ # Returns element-wise remainder.
366
+ #
367
+ # This function is an overflow-checking variant of #remainder.
368
+ # @return (see #remainder)
369
+ #
370
+ def remainder_checked(divisor)
371
+ divisor = divisor.data if divisor.is_a?(Vector)
372
+ d = find(:divide_checked).execute([data, divisor])
373
+ d = find(:trunc).execute([d]) if d.value.is_a?(Arrow::DoubleArray)
374
+ m = find(:multiply_checked).execute([d, divisor])
375
+ datum = find(:subtract_checked).execute([data, m])
347
376
  Vector.create(datum.value)
348
377
  end
349
378