red_amber 0.3.0 → 0.4.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (42) hide show
  1. checksums.yaml +4 -4
  2. data/.rubocop.yml +56 -22
  3. data/.yardopts +2 -0
  4. data/CHANGELOG.md +178 -0
  5. data/Gemfile +1 -1
  6. data/LICENSE +1 -1
  7. data/README.md +29 -30
  8. data/benchmark/basic.yml +7 -7
  9. data/benchmark/combine.yml +3 -3
  10. data/benchmark/dataframe.yml +15 -9
  11. data/benchmark/group.yml +6 -6
  12. data/benchmark/reshape.yml +6 -6
  13. data/benchmark/vector.yml +6 -3
  14. data/doc/DataFrame.md +32 -12
  15. data/doc/DataFrame_Comparison.md +65 -0
  16. data/doc/SubFrames.md +11 -0
  17. data/doc/Vector.md +207 -1
  18. data/doc/yard-templates/default/fulldoc/html/css/common.css +6 -0
  19. data/lib/red_amber/data_frame.rb +454 -85
  20. data/lib/red_amber/data_frame_combinable.rb +609 -115
  21. data/lib/red_amber/data_frame_displayable.rb +313 -34
  22. data/lib/red_amber/data_frame_indexable.rb +122 -19
  23. data/lib/red_amber/data_frame_loadsave.rb +78 -10
  24. data/lib/red_amber/data_frame_reshaping.rb +184 -14
  25. data/lib/red_amber/data_frame_selectable.rb +623 -70
  26. data/lib/red_amber/data_frame_variable_operation.rb +452 -35
  27. data/lib/red_amber/group.rb +186 -22
  28. data/lib/red_amber/helper.rb +74 -14
  29. data/lib/red_amber/refinements.rb +26 -6
  30. data/lib/red_amber/subframes.rb +1101 -0
  31. data/lib/red_amber/vector.rb +362 -11
  32. data/lib/red_amber/vector_aggregation.rb +312 -0
  33. data/lib/red_amber/vector_binary_element_wise.rb +506 -0
  34. data/lib/red_amber/vector_selectable.rb +265 -23
  35. data/lib/red_amber/vector_unary_element_wise.rb +529 -0
  36. data/lib/red_amber/vector_updatable.rb +278 -34
  37. data/lib/red_amber/version.rb +2 -1
  38. data/lib/red_amber.rb +13 -1
  39. data/red_amber.gemspec +2 -2
  40. metadata +13 -8
  41. data/doc/image/dataframe/reshaping_DataFrames.png +0 -0
  42. data/lib/red_amber/vector_functions.rb +0 -242
@@ -1,17 +1,38 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module RedAmber
4
- # mix-in for the class DataFrame
4
+ # Mix-in for the class DataFrame
5
5
  module DataFrameCombinable
6
6
  # Refinements for Arrow::Table
7
7
  using RefineArrowTable
8
8
 
9
- # Concatenate other dataframe onto the bottom.
9
+ # Concatenate other dataframes or tables onto the bottom of self.
10
10
  #
11
+ # @note the `#types` must be same as `other#types`.
11
12
  # @param other [DataFrame, Arrow::Table, Array<DataFrame, Arrow::Table>]
12
- # DataFrame/Table to concatenate onto the bottom of self.
13
+ # DataFrames or Tables to concatenate.
13
14
  # @return [DataFrame]
14
- # Concatenated dataframe.
15
+ # concatenated dataframe.
16
+ # @example
17
+ # df = DataFrame.new(x: [1, 2], y: ['A', 'B'])
18
+ # other = DataFrame.new(x: [3, 4], y: ['C', 'D'])
19
+ # [df.types, other.types]
20
+ #
21
+ # # =>
22
+ # [[:uint8, :string], [:uint8, :string]]
23
+ #
24
+ # df.concatenate(other)
25
+ #
26
+ # # =>
27
+ # x y
28
+ # <uint8> <string>
29
+ # 0 1 A
30
+ # 1 2 B
31
+ # 2 3 C
32
+ # 3 4 D
33
+ #
34
+ # @since 0.2.3
35
+ #
15
36
  def concatenate(*other)
16
37
  case other
17
38
  in [] | [nil] | [[]]
@@ -39,14 +60,29 @@ module RedAmber
39
60
  alias_method :concat, :concatenate
40
61
  alias_method :bind_rows, :concatenate
41
62
 
42
- # Merge other DataFrame or Table from other.
43
- # - Self and other must have same size.
44
- # - Self and other do not share the same key.
45
- # - If they share any keys, raise Error.
63
+ # Merge other DataFrames or Tables.
64
+ #
65
+ # @note the `#size` must be same as `other#size`.
66
+ # @note self and other must not share the same key.
46
67
  # @param other [DataFrame, Arrow::Table, Array<DataFrame, Arrow::Table>]
47
- # DataFrame/Table to concatenate.
68
+ # DataFrames or Tables to merge.
69
+ # @raise [DataFrameArgumentError]
70
+ # if the sizes are not the same or self and other share the same key.
48
71
  # @return [DataFrame]
49
- # Merged dataframe.
72
+ # merged dataframe.
73
+ # @example
74
+ # df = DataFrame.new(x: [1, 2], y: [3, 4])
75
+ # other = DataFrame.new(a: ['A', 'B'], b: ['C', 'D'])
76
+ # df.merge(other)
77
+ #
78
+ # # =>
79
+ # x y a b
80
+ # <uint8> <uint8> <string> <string>
81
+ # 0 1 3 A C
82
+ # 1 2 4 B D
83
+ #
84
+ # @since 0.2.3
85
+ #
50
86
  def merge(*other)
51
87
  case other
52
88
  in [] | [nil] | [[]]
@@ -85,81 +121,225 @@ module RedAmber
85
121
 
86
122
  # Mutating joins (#inner_join, #full_join, #left_join, #right_join)
87
123
 
88
- # Join another DataFrame or Table, leaving only the matching records.
89
- # - Same as `#join` with `type: :inner`
90
- # - A kind of mutating join.
91
- #
92
124
  # @!macro join_before
93
125
  # @param other [DataFrame, Arrow::Table]
94
126
  # A DataFrame or a Table to be joined with self.
95
127
  #
128
+ # @!macro join_force_order
129
+ # @param force_order [Boolean]
130
+ # whether to force the order of the output to always be the same.
131
+ # - This option is used in `:full_outer` and `:right_outer`.
132
+ # - If this option is true (by default) it will append index to the source
133
+ # and sort after joining. It will cause some degradation in performance.
134
+ #
96
135
  # @!macro join_after
97
136
  # @param suffix [#succ]
98
137
  # a suffix to rename keys when key names conflict as a result of join.
99
138
  # `suffix` must respond to `#succ`.
100
139
  # @return [DataFrame]
101
- # Joined dataframe.
140
+ # joined dataframe.
102
141
  #
103
142
  # @!macro join_key_in_array
104
143
  # @param join_keys [String, Symbol, Array<String, Symbol>]
105
- # A key or keys to match.
144
+ # a key or keys to match.
106
145
  #
107
146
  # @!macro join_key_in_hash
108
147
  # @param join_key_pairs [Hash]
109
- # Pairs of a key name or key names to match in left and right.
148
+ # pairs of a key name or key names to match in left and right.
110
149
  # @option join_key_pairs [String, Symbol, Array<String, Symbol>] :left
111
- # Join keys in `self`.
150
+ # join keys in `self`.
112
151
  # @option join_key_pairs [String, Symbol, Array<String, Symbol>] :right
113
- # Join keys in `other`.
152
+ # join keys in `other`.
153
+ #
154
+ # @!macro join_common_example_1
155
+ # @example
156
+ # df = DataFrame.new(KEY: %w[A B C], X1: [1, 2, 3])
157
+ #
158
+ # # =>
159
+ # KEY X1
160
+ # <string> <uint8>
161
+ # 0 A 1
162
+ # 1 B 2
163
+ # 2 C 3
164
+ #
165
+ # other = DataFrame.new(KEY: %w[A B D], X2: [true, false, nil])
166
+ #
167
+ # # =>
168
+ # KEY X2
169
+ # <string> <boolean>
170
+ # 0 A true
171
+ # 1 B false
172
+ # 2 D (nil)
173
+ #
174
+ # @!macro join_common_example_2
175
+ # @example
176
+ # df2 = DataFrame.new(KEY1: %w[A B C], X1: [1, 2, 3])
177
+ #
178
+ # # =>
179
+ # KEY1 X1
180
+ # <string> <uint8>
181
+ # 0 A 1
182
+ # 1 B 2
183
+ # 2 C 3
184
+ #
185
+ # other2 = DataFrame.new(KEY2: %w[A B D], X2: [true, false, nil])
186
+ #
187
+ # # =>
188
+ # KEY2 X2
189
+ # <string> <boolean>
190
+ # 0 A true
191
+ # 1 B false
192
+ # 2 D (nil)
193
+ #
194
+ # @!macro join_common_example_3
195
+ # @example
196
+ # df3 = DataFrame.new(
197
+ # KEY1: %w[A B C],
198
+ # KEY2: [1, 2, 3]
199
+ # )
200
+ #
201
+ # # =>
202
+ # KEY1 KEY2
203
+ # <string> <uint8>
204
+ # 0 A 1
205
+ # 1 B 2
206
+ # 2 C 3
207
+ #
208
+ # other3 = DataFrame.new(
209
+ # KEY1: %w[A B D],
210
+ # KEY2: [1, 4, 5]
211
+ # )
212
+ #
213
+ # # =>
214
+ # KEY1 KEY2
215
+ # <string> <uint8>
216
+ # 0 A 1
217
+ # 1 B 4
218
+ # 2 D 5
219
+
220
+ # Join another DataFrame or Table, leaving only the matching records.
221
+ # - Same as `#join` with `type: :inner`
222
+ # - A kind of mutating join.
114
223
  #
115
- # @overload inner_join(other, suffix: '.1')
224
+ # @overload inner_join(other, suffix: '.1', force_order: true)
116
225
  # If `join_key` is not specified, common keys in self and other are used
117
226
  # (natural keys). Returns joined dataframe.
118
227
  #
119
228
  # @macro join_before
229
+ # @macro join_force_order
120
230
  # @macro join_after
231
+ # @macro join_common_example_1
232
+ # @example without key (use implicit common key)
233
+ # df.inner_join(other)
234
+ #
235
+ # # =>
236
+ # KEY X1 X2
237
+ # <string> <uint8> <boolean>
238
+ # 0 A 1 true
239
+ # 1 B 2 false
121
240
  #
122
- # @overload inner_join(other, join_keys, suffix: '.1')
241
+ # @overload inner_join(other, join_keys, suffix: '.1', force_order: true)
123
242
  #
124
243
  # @macro join_before
125
244
  # @macro join_key_in_array
245
+ # @macro join_force_order
126
246
  # @macro join_after
247
+ # @macro join_common_example_1
248
+ # @example with a key
249
+ # df.inner_join(other, :KEY)
127
250
  #
128
- # @overload inner_join(other, join_key_pairs, suffix: '.1')
251
+ # # =>
252
+ # KEY X1 X2
253
+ # <string> <uint8> <boolean>
254
+ # 0 A 1 true
255
+ # 1 B 2 false
256
+ #
257
+ # @overload inner_join(other, join_key_pairs, suffix: '.1', force_order: true)
129
258
  #
130
259
  # @macro join_before
131
260
  # @macro join_key_in_hash
261
+ # @macro join_force_order
132
262
  # @macro join_after
263
+ # @macro join_common_example_2
264
+ # @example with key pairs
265
+ # df2.inner_join(other2, { left: :KEY1, right: :KEY2 })
266
+ #
267
+ # # =>
268
+ # KEY1 X1 X2
269
+ # <string> <uint8> <boolean>
270
+ # 0 A 1 true
271
+ # 1 B 2 false
133
272
  #
134
- def inner_join(other, join_keys = nil, suffix: '.1')
135
- join(other, join_keys, type: :inner, suffix: suffix)
273
+ # @since 0.2.3
274
+ #
275
+ def inner_join(other, join_keys = nil, suffix: '.1', force_order: true)
276
+ join(other, join_keys, type: :inner, suffix: suffix, force_order: force_order)
136
277
  end
137
278
 
138
279
  # Join another DataFrame or Table, leaving all records.
139
280
  # - Same as `#join` with `type: :full_outer`
140
281
  # - A kind of mutating join.
141
282
  #
142
- # @overload full_join(other, suffix: '.1')
283
+ # @overload full_join(other, suffix: '.1', force_order: true)
143
284
  # If `join_key` is not specified, common keys in self and other are used
144
285
  # (natural keys). Returns joined dataframe.
145
286
  #
146
287
  # @macro join_before
288
+ # @macro join_force_order
147
289
  # @macro join_after
290
+ # @macro join_common_example_1
291
+ # @example without key (use implicit common key)
292
+ # df.full_join(other)
293
+ #
294
+ # # =>
295
+ # KEY X1 X2
296
+ # <string> <uint8> <boolean>
297
+ # 0 A 1 true
298
+ # 1 B 2 false
299
+ # 2 C 3 (nil)
300
+ # 3 D (nil) (nil)
148
301
  #
149
- # @overload full_join(other, join_keys, suffix: '.1')
302
+ # @overload full_join(other, join_keys, suffix: '.1', force_order: true)
150
303
  #
151
304
  # @macro join_before
152
305
  # @macro join_key_in_array
306
+ # @macro join_force_order
153
307
  # @macro join_after
308
+ # @macro join_common_example_1
309
+ # @example with a key
310
+ # df.full_join(other, :KEY)
311
+ #
312
+ # # =>
313
+ # KEY X1 X2
314
+ # <string> <uint8> <boolean>
315
+ # 0 A 1 true
316
+ # 1 B 2 false
317
+ # 2 C 3 (nil)
318
+ # 3 D (nil) (nil)
154
319
  #
155
- # @overload full_join(other, join_key_pairs, suffix: '.1')
320
+ # @overload full_join(other, join_key_pairs, suffix: '.1', force_order: true)
156
321
  #
157
322
  # @macro join_before
158
323
  # @macro join_key_in_hash
324
+ # @macro join_force_order
159
325
  # @macro join_after
160
- #
161
- def full_join(other, join_keys = nil, suffix: '.1')
162
- join(other, join_keys, type: :full_outer, suffix: suffix)
326
+ # @macro join_common_example_2
327
+ # @example with key pairs
328
+ # df2.full_join(other2, { left: :KEY1, right: :KEY2 })
329
+ #
330
+ # # =>
331
+ # KEY1 X1 X2
332
+ # <string> <uint8> <boolean>
333
+ # 0 A 1 true
334
+ # 1 B 2 false
335
+ # 2 C 3 (nil)
336
+ # 3 D (nil) (nil)
337
+ #
338
+ # @since 0.2.3
339
+ #
340
+ def full_join(other, join_keys = nil, suffix: '.1', force_order: true)
341
+ join(other, join_keys,
342
+ type: :full_outer, suffix: suffix, force_order: force_order)
163
343
  end
164
344
 
165
345
  alias_method :outer_join, :full_join
@@ -168,54 +348,130 @@ module RedAmber
168
348
  # - Same as `#join` with `type: :left_outer`
169
349
  # - A kind of mutating join.
170
350
  #
171
- # @overload left_join(other, suffix: '.1')
351
+ # @overload left_join(other, suffix: '.1', force_order: true)
172
352
  # If `join_key` is not specified, common keys in self and other are used
173
353
  # (natural keys). Returns joined dataframe.
174
354
  #
175
355
  # @macro join_before
356
+ # @macro join_force_order
176
357
  # @macro join_after
358
+ # @macro join_common_example_1
359
+ # @example without key (use implicit common key)
360
+ # df.left_join(other)
361
+ #
362
+ # # =>
363
+ # KEY X1 X2
364
+ # <string> <uint8> <boolean>
365
+ # 0 A 1 true
366
+ # 1 B 2 false
367
+ # 2 C 3 (nil)
177
368
  #
178
- # @overload left_join(other, join_keys, suffix: '.1')
369
+ # @overload left_join(other, join_keys, suffix: '.1', force_order: true)
179
370
  #
180
371
  # @macro join_before
181
372
  # @macro join_key_in_array
373
+ # @macro join_force_order
182
374
  # @macro join_after
375
+ # @macro join_common_example_1
376
+ # @example with a key
377
+ # df.left_join(other, :KEY)
378
+ #
379
+ # # =>
380
+ # KEY X1 X2
381
+ # <string> <uint8> <boolean>
382
+ # 0 A 1 true
383
+ # 1 B 2 false
384
+ # 2 C 3 (nil)
183
385
  #
184
- # @overload left_join(other, join_key_pairs, suffix: '.1')
386
+ # @overload left_join(other, join_key_pairs, suffix: '.1', force_order: true)
185
387
  #
186
388
  # @macro join_before
187
389
  # @macro join_key_in_hash
390
+ # @macro join_force_order
188
391
  # @macro join_after
392
+ # @macro join_common_example_2
393
+ # @example with key pairs
394
+ # df2.left_join(other2, { left: :KEY1, right: :KEY2 })
189
395
  #
190
- def left_join(other, join_keys = nil, suffix: '.1')
191
- join(other, join_keys, type: :left_outer, suffix: suffix)
396
+ # # =>
397
+ # KEY1 X1 X2
398
+ # <string> <uint8> <boolean>
399
+ # 0 A 1 true
400
+ # 1 B 2 false
401
+ # 2 C 3 (nil)
402
+ #
403
+ # @since 0.2.3
404
+ #
405
+ def left_join(other, join_keys = nil, suffix: '.1', force_order: true)
406
+ join(other, join_keys, type: :left_outer, suffix: suffix, force_order: force_order)
192
407
  end
193
408
 
194
409
  # Join matching values from self to other.
195
410
  # - Same as `#join` with `type: :right_outer`
196
411
  # - A kind of mutating join.
197
412
  #
198
- # @overload right_join(other, suffix: '.1')
413
+ # @overload right_join(other, suffix: '.1', force_order: true)
199
414
  # If `join_key` is not specified, common keys in self and other are used
200
415
  # (natural keys). Returns joined dataframe.
201
416
  #
202
417
  # @macro join_before
418
+ # @macro join_force_order
203
419
  # @macro join_after
420
+ # @macro join_common_example_1
421
+ # @example without key (use implicit common key)
422
+ # df.right_join(other)
423
+ #
424
+ # # =>
425
+ # KEY X1 X2
426
+ # <string> <uint8> <boolean>
427
+ # 0 A 1 true
428
+ # 1 B 2 false
429
+ # 2 D (nil) (nil)
204
430
  #
205
- # @overload right_join(other, join_keys, suffix: '.1')
431
+ # @overload right_join(other, join_keys, suffix: '.1', force_order: true)
206
432
  #
207
433
  # @macro join_before
208
434
  # @macro join_key_in_array
435
+ # @macro join_force_order
209
436
  # @macro join_after
437
+ # @macro join_common_example_1
438
+ # @example with a key
439
+ # df.right_join(other, :KEY)
210
440
  #
211
- # @overload right_join(other, join_key_pairs, suffix: '.1')
441
+ # # =>
442
+ # KEY X1 X2
443
+ # <string> <uint8> <boolean>
444
+ # 0 A 1 true
445
+ # 1 B 2 false
446
+ # 2 D (nil) (nil)
447
+ #
448
+ # @overload right_join(other, join_key_pairs, suffix: '.1', force_order: true)
212
449
  #
213
450
  # @macro join_before
214
451
  # @macro join_key_in_hash
452
+ # @macro join_force_order
215
453
  # @macro join_after
216
- #
217
- def right_join(other, join_keys = nil, suffix: '.1')
218
- join(other, join_keys, type: :right_outer, suffix: suffix)
454
+ # @macro join_common_example_2
455
+ # @example with key pairs
456
+ # df2.right_join(other2, { left: :KEY1, right: :KEY2 })
457
+ #
458
+ # # =>
459
+ # KEY1 X1 X2
460
+ # <string> <uint8> <boolean>
461
+ # 0 A 1 true
462
+ # 1 B 2 false
463
+ # 2 D (nil) (nil)
464
+ #
465
+ # @since 0.2.3
466
+ #
467
+ def right_join(other, join_keys = nil, suffix: '.1', force_order: true)
468
+ join(
469
+ other,
470
+ join_keys,
471
+ type: :right_outer,
472
+ suffix: suffix,
473
+ force_order: force_order
474
+ )
219
475
  end
220
476
 
221
477
  # Filtering joins (#semi_join, #anti_join)
@@ -224,54 +480,115 @@ module RedAmber
224
480
  # - Same as `#join` with `type: :left_semi`
225
481
  # - A kind of filtering join.
226
482
  #
227
- # @overload semi_join(other, suffix: '.1')
483
+ # @overload semi_join(other, suffix: '.1', force_order: true)
228
484
  # If `join_key` is not specified, common keys in self and other are used
229
485
  # (natural keys). Returns joined dataframe.
230
486
  #
231
487
  # @macro join_before
488
+ # @macro join_force_order
232
489
  # @macro join_after
490
+ # @macro join_common_example_1
491
+ # @example without key (use implicit common key)
492
+ # df.semi_join(other)
233
493
  #
234
- # @overload semi_join(other, join_keys, suffix: '.1')
494
+ # # =>
495
+ # KEY X1
496
+ # <string> <uint8>
497
+ # 0 A 1
498
+ # 1 B 2
499
+ #
500
+ # @overload semi_join(other, join_keys, suffix: '.1', force_order: true)
235
501
  #
236
502
  # @macro join_before
237
503
  # @macro join_key_in_array
504
+ # @macro join_force_order
238
505
  # @macro join_after
506
+ # @macro join_common_example_1
507
+ # @example with a key
508
+ # df.semi_join(other, :KEY)
509
+ #
510
+ # # =>
511
+ # KEY X1
512
+ # <string> <uint8>
513
+ # 0 A 1
514
+ # 1 B 2
239
515
  #
240
- # @overload semi_join(other, join_key_pairs, suffix: '.1')
516
+ # @overload semi_join(other, join_key_pairs, suffix: '.1', force_order: true)
241
517
  #
242
518
  # @macro join_before
243
519
  # @macro join_key_in_hash
520
+ # @macro join_force_order
244
521
  # @macro join_after
522
+ # @macro join_common_example_2
523
+ # @example with key pairs
524
+ # df2.semi_join(other2, { left: :KEY1, right: :KEY2 })
525
+ #
526
+ # # =>
527
+ # KEY1 X1
528
+ # <string> <uint8>
529
+ # 0 A 1
530
+ # 1 B 2
245
531
  #
246
- def semi_join(other, join_keys = nil, suffix: '.1')
247
- join(other, join_keys, type: :left_semi, suffix: suffix)
532
+ # @since 0.2.3
533
+ #
534
+ def semi_join(other, join_keys = nil, suffix: '.1', force_order: true)
535
+ join(other, join_keys, type: :left_semi, suffix: suffix, force_order: force_order)
248
536
  end
249
537
 
250
538
  # Return records of self that do not have a match in other.
251
539
  # - Same as `#join` with `type: :left_anti`
252
540
  # - A kind of filtering join.
253
541
  #
254
- # @overload anti_join(other, suffix: '.1')
542
+ # @overload anti_join(other, suffix: '.1', force_order: true)
255
543
  # If `join_key` is not specified, common keys in self and other are used
256
544
  # (natural keys). Returns joined dataframe.
257
545
  #
258
546
  # @macro join_before
547
+ # @macro join_force_order
259
548
  # @macro join_after
549
+ # @macro join_common_example_1
550
+ # @example without key (use implicit common key)
551
+ # df.anti_join(other)
552
+ #
553
+ # # =>
554
+ # KEY X1
555
+ # <string> <uint8>
556
+ # 0 C 3
260
557
  #
261
- # @overload anti_join(other, join_keys, suffix: '.1')
558
+ # @overload anti_join(other, join_keys, suffix: '.1', force_order: true)
262
559
  #
263
560
  # @macro join_before
264
561
  # @macro join_key_in_array
562
+ # @macro join_force_order
265
563
  # @macro join_after
564
+ # @macro join_common_example_1
565
+ # @example with a key
566
+ # df.anti_join(other, :KEY)
266
567
  #
267
- # @overload anti_join(other, join_key_pairs, suffix: '.1')
568
+ # # =>
569
+ # KEY X1
570
+ # <string> <uint8>
571
+ # 0 C 3
572
+ #
573
+ # @overload anti_join(other, join_key_pairs, suffix: '.1', force_order: true)
268
574
  #
269
575
  # @macro join_before
270
576
  # @macro join_key_in_hash
577
+ # @macro join_force_order
271
578
  # @macro join_after
579
+ # @macro join_common_example_2
580
+ # @example with key pairs
581
+ # df2.anti_join(other2, { left: :KEY1, right: :KEY2 })
582
+ #
583
+ # # =>
584
+ # KEY1 X1
585
+ # <string> <uint8>
586
+ # 0 C 3
587
+ #
588
+ # @since 0.2.3
272
589
  #
273
- def anti_join(other, join_keys = nil, suffix: '.1')
274
- join(other, join_keys, type: :left_anti, suffix: suffix)
590
+ def anti_join(other, join_keys = nil, suffix: '.1', force_order: true)
591
+ join(other, join_keys, type: :left_anti, suffix: suffix, force_order: force_order)
275
592
  end
276
593
 
277
594
  # Set operations (#intersect, #union, #difference, #set_operable?)
@@ -279,8 +596,13 @@ module RedAmber
279
596
  # Check if set operation with self and other is possible.
280
597
  #
281
598
  # @macro join_before
599
+ # @return [Boolean]
600
+ # true if set operation is possible.
601
+ # @macro join_common_example_3
602
+ # @example
603
+ # df3.set_operable?(other3) # => true
282
604
  #
283
- # @return [Boolean] true if set operation is possible.
605
+ # @since 0.2.3
284
606
  #
285
607
  def set_operable?(other) # rubocop:disable Naming/AccessorMethodName
286
608
  keys == other.keys.map(&:to_sym)
@@ -291,8 +613,18 @@ module RedAmber
291
613
  # - A kind of set operations.
292
614
  #
293
615
  # @macro join_before
616
+ # @return [DataFrame]
617
+ # joined dataframe.
618
+ # @macro join_common_example_3
619
+ # @example
620
+ # df3.intersect(other3)
621
+ #
622
+ # # =>
623
+ # KEY1 KEY2
624
+ # <string> <uint8>
625
+ # 0 A 1
294
626
  #
295
- # @return [DataFrame] Joined dataframe.
627
+ # @since 0.2.3
296
628
  #
297
629
  def intersect(other)
298
630
  unless keys == other.keys.map(&:to_sym)
@@ -307,8 +639,22 @@ module RedAmber
307
639
  # - A kind of set operations.
308
640
  #
309
641
  # @macro join_before
310
- #
311
- # @return [DataFrame] Joined dataframe.
642
+ # @return [DataFrame]
643
+ # joined dataframe.
644
+ # @macro join_common_example_3
645
+ # @example
646
+ # df3.union(other3)
647
+ #
648
+ # # =>
649
+ # KEY1 KEY2
650
+ # <string> <uint8>
651
+ # 0 A 1
652
+ # 1 B 2
653
+ # 2 C 3
654
+ # 3 B 4
655
+ # 4 D 5
656
+ #
657
+ # @since 0.2.3
312
658
  #
313
659
  def union(other)
314
660
  unless keys == other.keys.map(&:to_sym)
@@ -323,8 +669,27 @@ module RedAmber
323
669
  # - A kind of set operations.
324
670
  #
325
671
  # @macro join_before
672
+ # @return [DataFrame]
673
+ # joined dataframe.
674
+ # @macro join_common_example_3
675
+ # @example
676
+ # df3.difference(other3)
677
+ #
678
+ # # =>
679
+ # KEY1 KEY2
680
+ # <string> <uint8>
681
+ # 0 B 2
682
+ # 1 C 3
683
+ #
684
+ # other3.difference(df3)
685
+ #
686
+ # # =>
687
+ # KEY1 KEY2
688
+ # <string> <uint8>
689
+ # 0 B 4
690
+ # 1 D 5
326
691
  #
327
- # @return [DataFrame] Joined dataframe.
692
+ # @since 0.2.3
328
693
  #
329
694
  def difference(other)
330
695
  unless keys == other.keys.map(&:to_sym)
@@ -338,60 +703,167 @@ module RedAmber
338
703
 
339
704
  # Join another DataFrame or Table to self.
340
705
  #
341
- # @overload join(other, type: :inner, suffix: '.1')
706
+ # @!macro join_common_type
707
+ # @param type [:left_semi, :right_semi, :left_anti, :right_anti, :inner,
708
+ # :left_outer, :right_outer, :full_outer] type of join.
709
+ #
710
+ # @!macro join_common_example_4
711
+ # @example
712
+ # df4 = DataFrame.new(
713
+ # X1: %w[A B C],
714
+ # Y: %w[D E F]
715
+ # )
716
+ #
717
+ # # =>
718
+ # X1 Y
719
+ # <string> <string>
720
+ # 0 A D
721
+ # 1 B E
722
+ # 2 C F
723
+ #
724
+ # other4 = DataFrame.new(
725
+ # X2: %w[A B D],
726
+ # Y: %w[e E E]
727
+ # )
728
+ #
729
+ # # =>
730
+ # X2 Y
731
+ # <string> <string>
732
+ # 0 A e
733
+ # 1 B E
734
+ # 2 D E
735
+
736
+ # @note the order of joined results will be preserved by default.
737
+ # This is enabled by appending index column to sort after joining but
738
+ # it will cause some performance degradation. If you don't care about
739
+ # the order of the result, set `force_order` option to `false`.
740
+ #
741
+ # @overload join(other, type: :inner, suffix: '.1', force_order: true)
342
742
  #
343
743
  # If `join_key` is not specified, common keys in self and other are used
344
744
  # (natural keys). Returns joined dataframe.
345
745
  #
346
- # @!macro join_common_type
347
- # @param type [:left_semi, :right_semi, :left_anti, :right_anti, :inner,
348
- # left_outer, :right_outer, :full_outer] type of join.
349
- #
350
746
  # @macro join_before
351
747
  # @macro join_common_type
748
+ # @macro join_force_order
352
749
  # @macro join_after
750
+ # @macro join_common_example_1
751
+ # @example
752
+ # df.join(other)
753
+ #
754
+ # # =>
755
+ # KEY X1 X2
756
+ # <string> <uint8> <boolean>
757
+ # 0 A 1 true
758
+ # 1 B 2 false
759
+ #
760
+ # df.join(other, type: :full_outer)
353
761
  #
354
- # @overload join(other, join_keys, type: :inner, suffix: '.1')
762
+ # # =>
763
+ # KEY X1 X2
764
+ # <string> <uint8> <boolean>
765
+ # 0 A 1 true
766
+ # 1 B 2 false
767
+ # 2 C 3 (nil)
768
+ # 3 D (nil) (nil)
769
+ #
770
+ # @overload join(other, join_keys, type: :inner, suffix: '.1', force_order: true)
355
771
  #
356
772
  # @macro join_before
357
773
  # @macro join_key_in_array
358
774
  # @macro join_common_type
775
+ # @macro join_force_order
359
776
  # @macro join_after
777
+ # @macro join_common_example_3
778
+ # @example join keys in an Array
779
+ # df3.join(other3, [:KEY1, :KEY2])
780
+ #
781
+ # # =>
782
+ # KEY1 KEY2
783
+ # <string> <uint8>
784
+ # 0 A 1
360
785
  #
361
- # @overload join(other, join_key_pairs, type: :inner, suffix: '.1')
786
+ # @example partial join key and suffix
787
+ # df3.join(other3, :KEY1, suffix: '.a')
788
+ #
789
+ # # =>
790
+ # KEY1 KEY2 KEY2.a
791
+ # <string> <uint8> <uint8>
792
+ # 0 A 1 1
793
+ # 1 B 2 4
794
+ #
795
+ # @overload join(other, join_key_pairs, type: :inner, suffix: '.1', force_order: true)
362
796
  #
363
797
  # @macro join_before
364
798
  # @macro join_key_in_hash
365
799
  # @macro join_common_type
800
+ # @macro join_force_order
366
801
  # @macro join_after
367
- #
368
- def join(other, join_keys = nil, type: :inner, suffix: '.1')
369
- case other
370
- when DataFrame
371
- other = other.table
372
- when Arrow::Table
373
- # Nop
802
+ # @macro join_common_example_4
803
+ # @example without options
804
+ # df4.join(other4)
805
+ #
806
+ # # =>
807
+ # X1 Y X2
808
+ # <string> <string> <string>
809
+ # 0 B E D
810
+ # 1 B E B
811
+ #
812
+ # @example join by key pairs
813
+ # df4.join(other4, { left: [:X1, :Y], right: [:X2, :Y] })
814
+ #
815
+ # # =>
816
+ # X1 Y
817
+ # <string> <string>
818
+ # 0 B E
819
+ #
820
+ # @example join by key pairs, using renaming by suffix
821
+ # df4.join(other4, { left: :X1, right: :X2 })
822
+ #
823
+ # # =>
824
+ # X1 Y Y.1
825
+ # <string> <string> <string>
826
+ # 0 A D e
827
+ # 1 B E E
828
+ #
829
+ # @since 0.2.3
830
+ #
831
+ def join(other, join_keys = nil, type: :inner, suffix: '.1', force_order: true)
832
+ right_table =
833
+ case other
834
+ when DataFrame
835
+ other.table
836
+ when Arrow::Table
837
+ other
838
+ else
839
+ raise DataFrameArgumentError, 'other must be a DataFrame or an Arrow::Table'
840
+ end
841
+
842
+ type = type.to_sym
843
+ left_index = :__LEFT_INDEX__
844
+ right_index = :__RIGHT_INDEX__
845
+ if force_order
846
+ left_table = assign(left_index) { indices }.table
847
+ other = DataFrame.create(other) if other.is_a?(Arrow::Table)
848
+ right_table = other.assign(right_index) { indices }.table
374
849
  else
375
- raise DataFrameArgumentError, 'other must be a DataFrame or an Arrow::Table'
850
+ left_table = table
376
851
  end
377
852
 
378
- table_keys = table.keys
379
- other_keys = other.keys
380
- type = type.to_sym
853
+ table_keys = left_table.keys
854
+ other_keys = right_table.keys
381
855
 
382
856
  # natural keys (implicit common keys)
383
857
  join_keys ||= table_keys.intersection(other_keys)
384
858
 
385
859
  # This is not necessary if additional procedure is contributed to Red Arrow.
386
860
  if join_keys.is_a?(Hash)
387
- left_keys = join_keys[:left]
388
- right_keys = join_keys[:right]
861
+ left_keys = ensure_keys(join_keys[:left])
862
+ right_keys = ensure_keys(join_keys[:right])
389
863
  else
390
- left_keys = join_keys
391
- right_keys = join_keys
864
+ left_keys = ensure_keys(join_keys)
865
+ right_keys = left_keys
392
866
  end
393
- left_keys = Array(left_keys).map(&:to_s)
394
- right_keys = Array(right_keys).map(&:to_s)
395
867
 
396
868
  case type
397
869
  when :full_outer, :left_semi, :left_anti, :right_semi, :right_anti
@@ -407,43 +879,73 @@ module RedAmber
407
879
 
408
880
  # Should we rescue errors in Arrow::Table#join for usability ?
409
881
  joined_table =
410
- table.join(other, join_keys,
411
- type: type,
412
- left_outputs: left_outputs,
413
- right_outputs: right_outputs)
882
+ left_table.join(
883
+ right_table,
884
+ join_keys,
885
+ type: type,
886
+ left_outputs: left_outputs,
887
+ right_outputs: right_outputs
888
+ )
414
889
 
415
890
  case type
416
891
  when :inner, :left_outer, :left_semi, :left_anti, :right_semi, :right_anti
417
- if joined_table.keys.uniq!
418
- DataFrame.create(rename_table(joined_table, n_keys, suffix))
419
- else
420
- DataFrame.create(joined_table)
421
- end
892
+ dataframe =
893
+ if joined_table.keys.uniq!
894
+ DataFrame.create(rename_table(joined_table, n_keys, suffix))
895
+ else
896
+ DataFrame.create(joined_table)
897
+ end
898
+ sorter =
899
+ case type
900
+ when :inner, :left_outer
901
+ [left_index, right_index]
902
+ when :left_semi, :left_anti
903
+ [left_index]
904
+ when :right_semi, :right_anti
905
+ [right_index]
906
+ end
422
907
  when :full_outer
908
+ key_index_lr =
909
+ left_keys.map { left_table.keys.index(_1) }
910
+ .zip(right_keys.map { left_table.keys.size + right_table.keys.index(_1) })
423
911
  renamed_table = rename_table(joined_table, n_keys, suffix)
424
- renamed_keys = renamed_table.keys
425
912
  dropper = []
426
- DataFrame.create(renamed_table).assign do |df|
427
- left_keys.map do |left_key|
428
- i_left_key = renamed_keys.index(left_key)
429
- right_key = renamed_keys[i_left_key + table_keys.size]
430
- dropper << right_key
431
- [left_key.to_sym, merge_array(df[left_key].data, df[right_key].data)]
913
+ dataframe =
914
+ DataFrame.create(renamed_table).assign do |df|
915
+ key_index_lr.map do |l, r|
916
+ dropper << df.keys[r]
917
+ [df.keys[l], merge_array(df.vectors[l].data, df.vectors[r].data)]
918
+ end
432
919
  end
433
- end.drop(dropper)
920
+ dataframe = dataframe.drop(dropper)
921
+ sorter = [left_index, right_index]
434
922
  when :right_outer
435
- if joined_table.keys.uniq!
436
- DataFrame.create(rename_table(joined_table, left_outputs.size, suffix))
437
- else
438
- DataFrame.create(joined_table)
439
- end.pick do
440
- [right_keys, keys.map(&:to_s) - right_keys]
441
- end
923
+ dataframe =
924
+ if joined_table.keys.uniq!
925
+ DataFrame.create(rename_table(joined_table, left_outputs.size, suffix))
926
+ else
927
+ DataFrame.create(joined_table)
928
+ end
929
+ dataframe = dataframe.pick(right_keys, dataframe.keys - right_keys)
930
+ sorter = [left_index, right_index]
931
+ end
932
+
933
+ if force_order
934
+ dataframe
935
+ .sort(sorter)
936
+ .drop(sorter)
937
+ else
938
+ dataframe
442
939
  end
443
940
  end
444
941
 
445
942
  private
446
943
 
944
+ # To ensure Array of Symbols
945
+ def ensure_keys(keys)
946
+ Array(keys).map(&:to_sym)
947
+ end
948
+
447
949
  # Rename duplicate keys by suffix
448
950
  def rename_table(joined_table, n_keys, suffix)
449
951
  joined_keys = joined_table.keys
@@ -453,17 +955,9 @@ module RedAmber
453
955
  renamed_right_keys =
454
956
  other_keys.map do |key|
455
957
  if dup_keys.include?(key)
456
- new_key = nil
457
- loop do
458
- new_key = "#{key}#{suffix}"
459
- break unless joined_keys.include?(new_key)
460
-
461
- s = suffix.succ
462
- raise DataFrameArgumentError, "suffix #{suffix} is invalid" if s == suffix
463
-
464
- suffix = s
465
- end
466
- new_key
958
+ suffixed = "#{key}#{suffix}".to_sym
959
+ # Find a key from suffixed.succ
960
+ (suffixed..).find { !joined_keys.include?(_1) }
467
961
  else
468
962
  key
469
963
  end