red_amber 0.3.0 → 0.4.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (42) hide show
  1. checksums.yaml +4 -4
  2. data/.rubocop.yml +56 -22
  3. data/.yardopts +2 -0
  4. data/CHANGELOG.md +178 -0
  5. data/Gemfile +1 -1
  6. data/LICENSE +1 -1
  7. data/README.md +29 -30
  8. data/benchmark/basic.yml +7 -7
  9. data/benchmark/combine.yml +3 -3
  10. data/benchmark/dataframe.yml +15 -9
  11. data/benchmark/group.yml +6 -6
  12. data/benchmark/reshape.yml +6 -6
  13. data/benchmark/vector.yml +6 -3
  14. data/doc/DataFrame.md +32 -12
  15. data/doc/DataFrame_Comparison.md +65 -0
  16. data/doc/SubFrames.md +11 -0
  17. data/doc/Vector.md +207 -1
  18. data/doc/yard-templates/default/fulldoc/html/css/common.css +6 -0
  19. data/lib/red_amber/data_frame.rb +454 -85
  20. data/lib/red_amber/data_frame_combinable.rb +609 -115
  21. data/lib/red_amber/data_frame_displayable.rb +313 -34
  22. data/lib/red_amber/data_frame_indexable.rb +122 -19
  23. data/lib/red_amber/data_frame_loadsave.rb +78 -10
  24. data/lib/red_amber/data_frame_reshaping.rb +184 -14
  25. data/lib/red_amber/data_frame_selectable.rb +623 -70
  26. data/lib/red_amber/data_frame_variable_operation.rb +452 -35
  27. data/lib/red_amber/group.rb +186 -22
  28. data/lib/red_amber/helper.rb +74 -14
  29. data/lib/red_amber/refinements.rb +26 -6
  30. data/lib/red_amber/subframes.rb +1101 -0
  31. data/lib/red_amber/vector.rb +362 -11
  32. data/lib/red_amber/vector_aggregation.rb +312 -0
  33. data/lib/red_amber/vector_binary_element_wise.rb +506 -0
  34. data/lib/red_amber/vector_selectable.rb +265 -23
  35. data/lib/red_amber/vector_unary_element_wise.rb +529 -0
  36. data/lib/red_amber/vector_updatable.rb +278 -34
  37. data/lib/red_amber/version.rb +2 -1
  38. data/lib/red_amber.rb +13 -1
  39. data/red_amber.gemspec +2 -2
  40. metadata +13 -8
  41. data/doc/image/dataframe/reshaping_DataFrames.png +0 -0
  42. data/lib/red_amber/vector_functions.rb +0 -242
@@ -4,7 +4,7 @@
4
4
  # reference: https://arrow.apache.org/docs/cpp/compute.html
5
5
 
6
6
  module RedAmber
7
- # mix-in for class Vector
7
+ # Mix-in for class Vector
8
8
  # Functions to select some data.
9
9
  module VectorSelectable
10
10
  using RefineArray
@@ -12,11 +12,14 @@ module RedAmber
12
12
 
13
13
  # Select elements in the self by indices.
14
14
  #
15
- # @param indices [Array<Numeric>, Vector] indices.
16
- # @yield [Array<Numeric>, Vector] indices.
17
- # @return [Vector] Vector by selected elements.
15
+ # @param indices [Array<Numeric>, Vector]
16
+ # an array-like of indices.
17
+ # @yieldreturn [Array<Numeric>, Vector]
18
+ # an array-like of indices from the block.
19
+ # @return [Vector]
20
+ # vector by selected elements.
18
21
  #
19
- # TODO: support for the option `boundscheck: true`
22
+ # TODO: support for the option `boundscheck: true`
20
23
  def take(*indices, &block)
21
24
  if block
22
25
  unless indices.empty?
@@ -47,11 +50,14 @@ module RedAmber
47
50
 
48
51
  # Select elements in the self by booleans.
49
52
  #
50
- # @param booleans [Array<true, false, nil>, Vector] booleans.
51
- # @yield [Array<true, false, nil>, Vector] booleans.
52
- # @return [Vector] Vector by selected elements.
53
+ # @param booleans [Array<true, false, nil>, Vector]
54
+ # an array-like of booleans.
55
+ # @yieldreturn [Array<true, false, nil>, Vector]
56
+ # an array-like of booleans from the block.
57
+ # @return [Vector]
58
+ # vector by selected elements.
53
59
  #
54
- # TODO: support for the option `null_selection_behavior: :drop`
60
+ # TODO: support for the option `null_selection_behavior: :drop`
55
61
  def filter(*booleans, &block)
56
62
  if block
57
63
  unless booleans.empty?
@@ -87,9 +93,12 @@ module RedAmber
87
93
 
88
94
  # Select elements in the self by indices or booleans.
89
95
  #
90
- # @param args [Array<Numeric, true, false, nil>, Vector] specifier.
91
- # @yield [Array<Numeric, true, false, nil>, Vector] specifier.
92
- # @return [scalar, Array] returns scalar or array.
96
+ # @param args [Array<Numeric, true, false, nil>, Vector]
97
+ # specifier. Indices or booleans.
98
+ # @yieldparam [Array<Numeric, true, false, nil>, Vector]
99
+ # specifier. Indices or booleans.
100
+ # @return [scalar, Array]
101
+ # returns scalar or array.
93
102
  #
94
103
  def [](*args)
95
104
  array =
@@ -119,31 +128,264 @@ module RedAmber
119
128
  raise VectorArgumentError, "Invalid argument: #{args}"
120
129
  end
121
130
 
122
- # @param values [Array, Arrow::Array, Vector]
131
+ # Check if elements of self are in the other values.
132
+ #
133
+ # @param values [Vector, Array, Arrow::Array, Arrow::ChunkedArray]
134
+ # values to test existence.
135
+ # @return [Vector]
136
+ # boolean Vector.
137
+ #
123
138
  def is_in(*values)
124
- self_data = chunked? ? data.pack : data
125
-
126
- array =
139
+ enum =
127
140
  case values
128
- in [Vector] | [Arrow::Array] | [Arrow::ChunkedArray]
129
- values[0].to_a
141
+ in [] | [[]] | [nil] |[[nil]]
142
+ return Vector.new([false] * size)
143
+ in [Vector | Arrow::Array | Arrow::ChunkedArray]
144
+ values[0].each
130
145
  else
131
- Array(values).flatten
146
+ parse_args(values, size, symbolize: false)
132
147
  end
133
-
134
- Vector.create(self_data.is_in(array))
148
+ enum.filter_map { self == _1 unless _1.nil? }.reduce(&:|)
135
149
  end
136
150
 
137
- # Arrow's support required
151
+ # Returns index of first matched position of element in self.
152
+ #
153
+ # @param element
154
+ # an element of self.
155
+ # @return [Integer, nil]
156
+ # position of the first matching element. If it is not found, returns nil.
157
+ #
138
158
  def index(element)
139
- to_a.index(element)
159
+ (0...size).find { |i| self[i] == element }
140
160
  end
141
161
 
162
+ # Returns first element of self.
163
+ #
164
+ # @return
165
+ # the first element.
166
+ # @since 0.4.1
167
+ #
168
+ def first
169
+ data[0]
170
+ end
171
+
172
+ # Returns last element of self.
173
+ #
174
+ # @return
175
+ # the last element.
176
+ # @since 0.4.1
177
+ #
178
+ def last
179
+ data[-1]
180
+ end
181
+
182
+ # Drops nils in self and returns a new Vector as a result.
183
+ #
184
+ # @return [Vector]
185
+ # a Vector without nils.
186
+ #
142
187
  def drop_nil
143
188
  datum = find(:drop_null).execute([data])
144
189
  Vector.create(datum.value)
145
190
  end
146
191
 
192
+ # Arrange values in Vector.
193
+ #
194
+ # @param order [Symbol]
195
+ # sort order.
196
+ # - `:+`, `:ascending` or without argument will sort in increasing order.
197
+ # - `:-` or `:descending` will sort in decreasing order.
198
+ # @return [Vector]
199
+ # sorted Vector.
200
+ # @example Sort in increasing order (default)
201
+ # Vector.new(%w[B D A E C]).sort
202
+ # # same as #sort(:+)
203
+ # # same as #sort(:ascending)
204
+ #
205
+ # # =>
206
+ # #<RedAmber::Vector(:string, size=5):0x000000000000c134>
207
+ # ["A", "B", "C", "D", "E"]
208
+ #
209
+ # @example Sort in decreasing order
210
+ # Vector.new(%w[B D A E C]).sort(:-)
211
+ # # same as #sort(:descending)
212
+ #
213
+ # # =>
214
+ # #<RedAmber::Vector(:string, size=5):0x000000000000c148>
215
+ # ["E", "D", "C", "B", "A"]
216
+ #
217
+ # @since 0.4.0
218
+ #
219
+ def sort(order = :ascending)
220
+ order =
221
+ case order.to_sym
222
+ when :+, :ascending, :increasing
223
+ :ascending
224
+ when :-, :descending, :decreasing
225
+ :descending
226
+ else
227
+ raise VectorArgumentError, "illegal order option: #{order}"
228
+ end
229
+ take(sort_indices(order: order))
230
+ end
231
+
232
+ # Returns numerical rank of self.
233
+ # - Nil values are considered greater than any value.
234
+ # - NaN values are considered greater than any value but smaller than nil values.
235
+ # - Tiebreakers are ranked in order of appearance.
236
+ # - `RankOptions` in C++ function is not implemented in C GLib yet.
237
+ # This method is currently fixed to the default behavior.
238
+ #
239
+ # @return [Vector]
240
+ # 0-based rank of self (0...size in range).
241
+ # @example Rank of float Vector
242
+ # fv = Vector.new(0.1, nil, Float::NAN, 0.2, 0.1); fv
243
+ #
244
+ # # =>
245
+ # #<RedAmber::Vector(:double, size=5):0x000000000000c65c>
246
+ # [0.1, nil, NaN, 0.2, 0.1]
247
+ #
248
+ # fv.rank
249
+ #
250
+ # # =>
251
+ # #<RedAmber::Vector(:uint64, size=5):0x0000000000003868>
252
+ # [0, 4, 3, 2, 1]
253
+ #
254
+ # @example Rank of string Vector
255
+ # sv = Vector.new("A", "B", nil, "A", "C"); sv
256
+ #
257
+ # # =>
258
+ # #<RedAmber::Vector(:string, size=5):0x0000000000003854>
259
+ # ["A", "B", nil, "A", "C"]
260
+ #
261
+ # sv.rank
262
+ #
263
+ # # =>
264
+ # #<RedAmber::Vector(:uint64, size=5):0x0000000000003868>
265
+ # [0, 2, 4, 1, 3]
266
+ #
267
+ # @since 0.4.0
268
+ #
269
+ def rank
270
+ datum =
271
+ case data
272
+ when Arrow::ChunkedArray
273
+ Arrow::Function.find(:rank).execute([data.pack])
274
+ else
275
+ Arrow::Function.find(:rank).execute([data])
276
+ end
277
+ Vector.create(datum.value) - 1
278
+ end
279
+
280
+ # Pick up elements at random.
281
+ #
282
+ # @overload sample()
283
+ # Return a randomly selected element.
284
+ # This is one of the aggregation functions.
285
+ #
286
+ # @return [scalar]
287
+ # one of the elements in self.
288
+ # @example Sample an element
289
+ # v = Vector.new('A'..'H'); v
290
+ #
291
+ # # =>
292
+ # #<RedAmber::Vector(:string, size=8):0x0000000000011b20>
293
+ # ["A", "B", "C", "D", "E", "F", "G", "H"]
294
+ #
295
+ # v.sample
296
+ #
297
+ # # =>
298
+ # "C"
299
+ #
300
+ # @overload sample(n)
301
+ # Pick up n elements at random.
302
+ #
303
+ # @param n [Integer]
304
+ # positive number of elements to pick.
305
+ # If n is smaller than or equal to size, elements are picked without repetition.
306
+ # If n is greater than `size`, elements are picked repeatedly.
307
+ # @return [Vector]
308
+ # sampled elements.
309
+ # If n == 1 (in case of `sample(1)`), it returns a Vector of size == 1
310
+ # not a scalar.
311
+ # @example Sample Vector in size 1
312
+ # v.sample(1)
313
+ #
314
+ # # =>
315
+ # #<RedAmber::Vector(:string, size=1):0x000000000001a3b0>
316
+ # ["H"]
317
+ #
318
+ # @example Sample same size of self: every element is picked in random order
319
+ # v.sample(8)
320
+ #
321
+ # # =>
322
+ # #<RedAmber::Vector(:string, size=8):0x000000000001bda0>
323
+ # ["H", "D", "B", "F", "E", "A", "G", "C"]
324
+ #
325
+ # @example Over sampling: "E" and "A" are sampled repeatedly
326
+ # v.sample(9)
327
+ #
328
+ # # =>
329
+ # #<RedAmber::Vector(:string, size=9):0x000000000001d790>
330
+ # ["E", "E", "A", "D", "H", "C", "A", "F", "H"]
331
+ #
332
+ # @overload sample(prop)
333
+ # Pick up elements by proportion `prop` at random.
334
+ #
335
+ # @param prop [Float]
336
+ # positive proportion of elements to pick.
337
+ # Absolute number of elements to pick: `prop*size` is rounded (by `half: :up`).
338
+ # If prop is smaller than or equal to 1.0, elements are picked without repetition.
339
+ # If prop is greater than 1.0, some elements are picked repeatedly.
340
+ # @return [Vector]
341
+ # sampled elements.
342
+ # If only one element is picked, it returns a Vector of size == 1
343
+ # not a scalar.
344
+ # @example Sample same size of self: every element is picked in random order
345
+ # v.sample(1.0)
346
+ #
347
+ # # =>
348
+ # #<RedAmber::Vector(:string, size=8):0x000000000001bda0>
349
+ # ["D", "H", "F", "C", "A", "B", "E", "G"]
350
+ #
351
+ # @example 2 times over sampling
352
+ # v.sample(2.0)
353
+ #
354
+ # # =>
355
+ # #<RedAmber::Vector(:string, size=16):0x00000000000233e8>
356
+ # ["H", "B", "C", "B", "C", "A", "F", "A", "E", "C", "H", "F", "F", "A", ... ]
357
+ #
358
+ # @since 0.4.0
359
+ #
360
+ def sample(n_or_prop = nil)
361
+ require 'arrow-numo-narray'
362
+
363
+ return nil if size == 0
364
+
365
+ n_sample =
366
+ case n_or_prop
367
+ in Integer
368
+ n_or_prop
369
+ in Float
370
+ (n_or_prop * size).round
371
+ in nil
372
+ return to_a.sample
373
+ else
374
+ raise VectorArgumentError, "must specify Integer or Float but #{n_or_prop}"
375
+ end
376
+ if n_or_prop < 0
377
+ raise VectorArgumentError, '#sample does not accept negative number.'
378
+ end
379
+ return Vector.new([]) if n_sample == 0
380
+
381
+ over_sample = [8 * size, n_sample].max
382
+ over_size = n_sample > size ? n_sample / size * size * 2 : size
383
+ over_vector =
384
+ Vector.create(Numo::UInt32.new(over_size).rand(over_sample).to_arrow_array)
385
+ indices = over_vector.rank.take(*0...n_sample)
386
+ take(indices - ((indices / size) * size))
387
+ end
388
+
147
389
  private
148
390
 
149
391
  # Accepts indices by numeric Vector