red_amber 0.3.0 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (42) hide show
  1. checksums.yaml +4 -4
  2. data/.rubocop.yml +39 -20
  3. data/.yardopts +2 -0
  4. data/CHANGELOG.md +113 -0
  5. data/Gemfile +1 -1
  6. data/LICENSE +1 -1
  7. data/README.md +25 -26
  8. data/benchmark/basic.yml +2 -2
  9. data/benchmark/combine.yml +2 -2
  10. data/benchmark/dataframe.yml +2 -2
  11. data/benchmark/group.yml +2 -2
  12. data/benchmark/reshape.yml +2 -2
  13. data/benchmark/vector.yml +3 -0
  14. data/doc/DataFrame.md +32 -12
  15. data/doc/DataFrame_Comparison.md +65 -0
  16. data/doc/SubFrames.md +11 -0
  17. data/doc/Vector.md +207 -1
  18. data/doc/yard-templates/default/fulldoc/html/css/common.css +6 -0
  19. data/lib/red_amber/data_frame.rb +429 -75
  20. data/lib/red_amber/data_frame_combinable.rb +516 -66
  21. data/lib/red_amber/data_frame_displayable.rb +244 -14
  22. data/lib/red_amber/data_frame_indexable.rb +121 -18
  23. data/lib/red_amber/data_frame_loadsave.rb +78 -10
  24. data/lib/red_amber/data_frame_reshaping.rb +184 -14
  25. data/lib/red_amber/data_frame_selectable.rb +622 -66
  26. data/lib/red_amber/data_frame_variable_operation.rb +446 -34
  27. data/lib/red_amber/group.rb +187 -22
  28. data/lib/red_amber/helper.rb +70 -10
  29. data/lib/red_amber/refinements.rb +12 -5
  30. data/lib/red_amber/subframes.rb +1066 -0
  31. data/lib/red_amber/vector.rb +385 -11
  32. data/lib/red_amber/vector_aggregation.rb +312 -0
  33. data/lib/red_amber/vector_binary_element_wise.rb +387 -0
  34. data/lib/red_amber/vector_selectable.rb +217 -12
  35. data/lib/red_amber/vector_unary_element_wise.rb +436 -0
  36. data/lib/red_amber/vector_updatable.rb +278 -34
  37. data/lib/red_amber/version.rb +2 -1
  38. data/lib/red_amber.rb +13 -1
  39. data/red_amber.gemspec +2 -2
  40. metadata +13 -8
  41. data/doc/image/dataframe/reshaping_DataFrames.png +0 -0
  42. data/lib/red_amber/vector_functions.rb +0 -242
@@ -1,17 +1,36 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module RedAmber
4
- # mix-in for the class DataFrame
4
+ # Mix-in for the class DataFrame
5
5
  module DataFrameCombinable
6
6
  # Refinements for Arrow::Table
7
7
  using RefineArrowTable
8
8
 
9
- # Concatenate other dataframe onto the bottom.
9
+ # Concatenate other dataframes or tables onto the bottom of self.
10
10
  #
11
+ # @note the `#types` must be same as `other#types`.
11
12
  # @param other [DataFrame, Arrow::Table, Array<DataFrame, Arrow::Table>]
12
- # DataFrame/Table to concatenate onto the bottom of self.
13
+ # DataFrames or Tables to concatenate.
13
14
  # @return [DataFrame]
14
- # Concatenated dataframe.
15
+ # concatenated dataframe.
16
+ # @example
17
+ # df = DataFrame.new(x: [1, 2], y: ['A', 'B'])
18
+ # other = DataFrame.new(x: [3, 4], y: ['C', 'D'])
19
+ # [df.types, other.types]
20
+ #
21
+ # # =>
22
+ # [[:uint8, :string], [:uint8, :string]]
23
+ #
24
+ # df.concatenate(other)
25
+ #
26
+ # # =>
27
+ # x y
28
+ # <uint8> <string>
29
+ # 0 1 A
30
+ # 1 2 B
31
+ # 2 3 C
32
+ # 3 4 D
33
+ #
15
34
  def concatenate(*other)
16
35
  case other
17
36
  in [] | [nil] | [[]]
@@ -39,14 +58,27 @@ module RedAmber
39
58
  alias_method :concat, :concatenate
40
59
  alias_method :bind_rows, :concatenate
41
60
 
42
- # Merge other DataFrame or Table from other.
43
- # - Self and other must have same size.
44
- # - Self and other do not share the same key.
45
- # - If they share any keys, raise Error.
61
+ # Merge other DataFrames or Tables.
62
+ #
63
+ # @note the `#size` must be same as `other#size`.
64
+ # @note self and other must not share the same key.
46
65
  # @param other [DataFrame, Arrow::Table, Array<DataFrame, Arrow::Table>]
47
- # DataFrame/Table to concatenate.
66
+ # DataFrames or Tables to merge.
67
+ # @raise [DataFrameArgumentError]
68
+ # if the sizes are not the same or self and other share the same key.
48
69
  # @return [DataFrame]
49
- # Merged dataframe.
70
+ # merged dataframe.
71
+ # @example
72
+ # df = DataFrame.new(x: [1, 2], y: [3, 4])
73
+ # other = DataFrame.new(a: ['A', 'B'], b: ['C', 'D'])
74
+ # df.merge(other)
75
+ #
76
+ # # =>
77
+ # x y a b
78
+ # <uint8> <uint8> <string> <string>
79
+ # 0 1 3 A C
80
+ # 1 2 4 B D
81
+ #
50
82
  def merge(*other)
51
83
  case other
52
84
  in [] | [nil] | [[]]
@@ -85,32 +117,105 @@ module RedAmber
85
117
 
86
118
  # Mutating joins (#inner_join, #full_join, #left_join, #right_join)
87
119
 
88
- # Join another DataFrame or Table, leaving only the matching records.
89
- # - Same as `#join` with `type: :inner`
90
- # - A kind of mutating join.
91
- #
92
120
  # @!macro join_before
93
121
  # @param other [DataFrame, Arrow::Table]
94
122
  # A DataFrame or a Table to be joined with self.
95
123
  #
124
+ # @!macro join_dorce_order
125
+ # @param force_order [Boolean]
126
+ # whether to force the order of the output to always be the same.
127
+ # - This option is used in `:full_outer` and `:right_outer`.
128
+ # - If this option is true (by default) it will append index to the source
129
+ # and sort after joining. It will cause some degradation in performance.
130
+ #
96
131
  # @!macro join_after
97
132
  # @param suffix [#succ]
98
133
  # a suffix to rename keys when key names conflict as a result of join.
99
134
  # `suffix` must respond to `#succ`.
100
135
  # @return [DataFrame]
101
- # Joined dataframe.
136
+ # joined dataframe.
102
137
  #
103
138
  # @!macro join_key_in_array
104
139
  # @param join_keys [String, Symbol, Array<String, Symbol>]
105
- # A key or keys to match.
140
+ # a key or keys to match.
106
141
  #
107
142
  # @!macro join_key_in_hash
108
143
  # @param join_key_pairs [Hash]
109
- # Pairs of a key name or key names to match in left and right.
144
+ # pairs of a key name or key names to match in left and right.
110
145
  # @option join_key_pairs [String, Symbol, Array<String, Symbol>] :left
111
- # Join keys in `self`.
146
+ # join keys in `self`.
112
147
  # @option join_key_pairs [String, Symbol, Array<String, Symbol>] :right
113
- # Join keys in `other`.
148
+ # join keys in `other`.
149
+ #
150
+ # @!macro join_common_example_1
151
+ # @example
152
+ # df = DataFrame.new(KEY: %w[A B C], X1: [1, 2, 3])
153
+ #
154
+ # # =>
155
+ # KEY X1
156
+ # <string> <uint8>
157
+ # 0 A 1
158
+ # 1 B 2
159
+ # 2 C 3
160
+ #
161
+ # other = DataFrame.new(KEY: %w[A B D], X2: [true, false, nil])
162
+ #
163
+ # # =>
164
+ # KEY X2
165
+ # <string> <boolean>
166
+ # 0 A true
167
+ # 1 B false
168
+ # 2 D (nil)
169
+ #
170
+ # @!macro join_common_example_2
171
+ # @example
172
+ # df2 = DataFrame.new(KEY1: %w[A B C], X1: [1, 2, 3])
173
+ #
174
+ # # =>
175
+ # KEY1 X1
176
+ # <string> <uint8>
177
+ # 0 A 1
178
+ # 1 B 2
179
+ # 2 C 3
180
+ #
181
+ # other2 = DataFrame.new(KEY2: %w[A B D], X2: [true, false, nil])
182
+ #
183
+ # # =>
184
+ # KEY2 X2
185
+ # <string> <boolean>
186
+ # 0 A true
187
+ # 1 B false
188
+ # 2 D (nil)
189
+ #
190
+ # @!macro join_common_example_3
191
+ # @example
192
+ # df3 = DataFrame.new(
193
+ # KEY1: %w[A B C],
194
+ # KEY2: [1, 2, 3]
195
+ # )
196
+ #
197
+ # # =>
198
+ # KEY1 KEY2
199
+ # <string> <uint8>
200
+ # 0 A 1
201
+ # 1 B 2
202
+ # 2 C 3
203
+ #
204
+ # other3 = DataFrame.new(
205
+ # KEY1: %w[A B D],
206
+ # KEY2: [1, 4, 5]
207
+ # )
208
+ #
209
+ # # =>
210
+ # KEY1 KEY2
211
+ # <string> <uint8>
212
+ # 0 A 1
213
+ # 1 B 4
214
+ # 2 D 5
215
+
216
+ # Join another DataFrame or Table, leaving only the matching records.
217
+ # - Same as `#join` with `type: :inner`
218
+ # - A kind of mutating join.
114
219
  #
115
220
  # @overload inner_join(other, suffix: '.1')
116
221
  # If `join_key` is not specified, common keys in self and other are used
@@ -118,18 +223,45 @@ module RedAmber
118
223
  #
119
224
  # @macro join_before
120
225
  # @macro join_after
226
+ # @macro join_common_example_1
227
+ # @example without key (use implicit common key)
228
+ # df.inner_join(other)
229
+ #
230
+ # # =>
231
+ # KEY X1 X2
232
+ # <string> <uint8> <boolean>
233
+ # 0 A 1 true
234
+ # 1 B 2 false
121
235
  #
122
236
  # @overload inner_join(other, join_keys, suffix: '.1')
123
237
  #
124
238
  # @macro join_before
125
239
  # @macro join_key_in_array
126
240
  # @macro join_after
241
+ # @macro join_common_example_1
242
+ # @example with a key
243
+ # df.inner_join(other, :KEY)
244
+ #
245
+ # # =>
246
+ # KEY X1 X2
247
+ # <string> <uint8> <boolean>
248
+ # 0 A 1 true
249
+ # 1 B 2 false
127
250
  #
128
251
  # @overload inner_join(other, join_key_pairs, suffix: '.1')
129
252
  #
130
253
  # @macro join_before
131
254
  # @macro join_key_in_hash
132
255
  # @macro join_after
256
+ # @macro join_common_example_2
257
+ # @example with key pairs
258
+ # df2.inner_join(other2, { left: :KEY1, right: :KEY2 })
259
+ #
260
+ # # =>
261
+ # KEY1 X1 X2
262
+ # <string> <uint8> <boolean>
263
+ # 0 A 1 true
264
+ # 1 B 2 false
133
265
  #
134
266
  def inner_join(other, join_keys = nil, suffix: '.1')
135
267
  join(other, join_keys, type: :inner, suffix: suffix)
@@ -139,27 +271,64 @@ module RedAmber
139
271
  # - Same as `#join` with `type: :full_outer`
140
272
  # - A kind of mutating join.
141
273
  #
142
- # @overload full_join(other, suffix: '.1')
274
+ # @overload full_join(other, suffix: '.1', force_order: true)
143
275
  # If `join_key` is not specified, common keys in self and other are used
144
276
  # (natural keys). Returns joined dataframe.
145
277
  #
146
278
  # @macro join_before
279
+ # @macro join_dorce_order
147
280
  # @macro join_after
281
+ # @macro join_common_example_1
282
+ # @example without key (use implicit common key)
283
+ # df.full_join(other)
284
+ #
285
+ # # =>
286
+ # KEY X1 X2
287
+ # <string> <uint8> <boolean>
288
+ # 0 A 1 true
289
+ # 1 B 2 false
290
+ # 2 C 3 (nil)
291
+ # 3 D (nil) (nil)
148
292
  #
149
- # @overload full_join(other, join_keys, suffix: '.1')
293
+ # @overload full_join(other, join_keys, suffix: '.1', force_order: true)
150
294
  #
151
295
  # @macro join_before
152
296
  # @macro join_key_in_array
297
+ # @macro join_dorce_order
153
298
  # @macro join_after
299
+ # @macro join_common_example_1
300
+ # @example with a key
301
+ # df.full_join(other, :KEY)
154
302
  #
155
- # @overload full_join(other, join_key_pairs, suffix: '.1')
303
+ # # =>
304
+ # KEY X1 X2
305
+ # <string> <uint8> <boolean>
306
+ # 0 A 1 true
307
+ # 1 B 2 false
308
+ # 2 C 3 (nil)
309
+ # 3 D (nil) (nil)
310
+ #
311
+ # @overload full_join(other, join_key_pairs, suffix: '.1', force_order: true)
156
312
  #
157
313
  # @macro join_before
158
314
  # @macro join_key_in_hash
315
+ # @macro join_dorce_order
159
316
  # @macro join_after
160
- #
161
- def full_join(other, join_keys = nil, suffix: '.1')
162
- join(other, join_keys, type: :full_outer, suffix: suffix)
317
+ # @macro join_common_example_2
318
+ # @example with key pairs
319
+ # df2.full_join(other2, { left: :KEY1, right: :KEY2 })
320
+ #
321
+ # # =>
322
+ # KEY1 X1 X2
323
+ # <string> <uint8> <boolean>
324
+ # 0 A 1 true
325
+ # 1 B 2 false
326
+ # 2 C 3 (nil)
327
+ # 3 D (nil) (nil)
328
+ #
329
+ def full_join(other, join_keys = nil, suffix: '.1', force_order: true)
330
+ join(other, join_keys,
331
+ type: :full_outer, suffix: suffix, force_order: force_order)
163
332
  end
164
333
 
165
334
  alias_method :outer_join, :full_join
@@ -174,18 +343,48 @@ module RedAmber
174
343
  #
175
344
  # @macro join_before
176
345
  # @macro join_after
346
+ # @macro join_common_example_1
347
+ # @example without key (use implicit common key)
348
+ # df.left_join(other)
349
+ #
350
+ # # =>
351
+ # KEY X1 X2
352
+ # <string> <uint8> <boolean>
353
+ # 0 A 1 true
354
+ # 1 B 2 false
355
+ # 2 C 3 (nil)
177
356
  #
178
357
  # @overload left_join(other, join_keys, suffix: '.1')
179
358
  #
180
359
  # @macro join_before
181
360
  # @macro join_key_in_array
182
361
  # @macro join_after
362
+ # @macro join_common_example_1
363
+ # @example with a key
364
+ # df.left_join(other, :KEY)
365
+ #
366
+ # # =>
367
+ # KEY X1 X2
368
+ # <string> <uint8> <boolean>
369
+ # 0 A 1 true
370
+ # 1 B 2 false
371
+ # 2 C 3 (nil)
183
372
  #
184
373
  # @overload left_join(other, join_key_pairs, suffix: '.1')
185
374
  #
186
375
  # @macro join_before
187
376
  # @macro join_key_in_hash
188
377
  # @macro join_after
378
+ # @macro join_common_example_2
379
+ # @example with key pairs
380
+ # df2.left_join(other2, { left: :KEY1, right: :KEY2 })
381
+ #
382
+ # # =>
383
+ # KEY1 X1 X2
384
+ # <string> <uint8> <boolean>
385
+ # 0 A 1 true
386
+ # 1 B 2 false
387
+ # 2 C 3 (nil)
189
388
  #
190
389
  def left_join(other, join_keys = nil, suffix: '.1')
191
390
  join(other, join_keys, type: :left_outer, suffix: suffix)
@@ -195,27 +394,66 @@ module RedAmber
195
394
  # - Same as `#join` with `type: :right_outer`
196
395
  # - A kind of mutating join.
197
396
  #
198
- # @overload right_join(other, suffix: '.1')
397
+ # @overload right_join(other, suffix: '.1', force_order: true)
199
398
  # If `join_key` is not specified, common keys in self and other are used
200
399
  # (natural keys). Returns joined dataframe.
201
400
  #
202
401
  # @macro join_before
402
+ # @macro join_dorce_order
203
403
  # @macro join_after
404
+ # @macro join_common_example_1
405
+ # @example without key (use implicit common key)
406
+ # df.right_join(other)
407
+ #
408
+ # # =>
409
+ # KEY X1 X2
410
+ # <string> <uint8> <boolean>
411
+ # 0 A 1 true
412
+ # 1 B 2 false
413
+ # 2 D (nil) (nil)
204
414
  #
205
- # @overload right_join(other, join_keys, suffix: '.1')
415
+ # @overload right_join(other, join_keys, suffix: '.1', force_order: true)
206
416
  #
207
417
  # @macro join_before
208
418
  # @macro join_key_in_array
419
+ # @macro join_dorce_order
209
420
  # @macro join_after
421
+ # @macro join_common_example_1
422
+ # @example with a key
423
+ # df.right_join(other, :KEY)
210
424
  #
211
- # @overload right_join(other, join_key_pairs, suffix: '.1')
425
+ # # =>
426
+ # KEY X1 X2
427
+ # <string> <uint8> <boolean>
428
+ # 0 A 1 true
429
+ # 1 B 2 false
430
+ # 2 D (nil) (nil)
431
+ #
432
+ # @overload right_join(other, join_key_pairs, suffix: '.1', force_order: true)
212
433
  #
213
434
  # @macro join_before
214
435
  # @macro join_key_in_hash
436
+ # @macro join_dorce_order
215
437
  # @macro join_after
216
- #
217
- def right_join(other, join_keys = nil, suffix: '.1')
218
- join(other, join_keys, type: :right_outer, suffix: suffix)
438
+ # @macro join_common_example_2
439
+ # @example with key pairs
440
+ # df2.right_join(other2, { left: :KEY1, right: :KEY2 })
441
+ #
442
+ # # =>
443
+ # KEY1 X1 X2
444
+ # <string> <uint8> <boolean>
445
+ # 0 A 1 true
446
+ # 1 B 2 false
447
+ # 2 D (nil) (nil)
448
+ #
449
+ def right_join(other, join_keys = nil, suffix: '.1', force_order: true)
450
+ join(
451
+ other,
452
+ join_keys,
453
+ type: :right_outer,
454
+ suffix: suffix,
455
+ force_order: force_order
456
+ )
219
457
  end
220
458
 
221
459
  # Filtering joins (#semi_join, #anti_join)
@@ -230,18 +468,45 @@ module RedAmber
230
468
  #
231
469
  # @macro join_before
232
470
  # @macro join_after
471
+ # @macro join_common_example_1
472
+ # @example without key (use implicit common key)
473
+ # df.semi_join(other)
474
+ #
475
+ # # =>
476
+ # KEY X1
477
+ # <string> <uint8>
478
+ # 0 A 1
479
+ # 1 B 2
233
480
  #
234
481
  # @overload semi_join(other, join_keys, suffix: '.1')
235
482
  #
236
483
  # @macro join_before
237
484
  # @macro join_key_in_array
238
485
  # @macro join_after
486
+ # @macro join_common_example_1
487
+ # @example with a key
488
+ # df.semi_join(other, :KEY)
489
+ #
490
+ # # =>
491
+ # KEY X1
492
+ # <string> <uint8>
493
+ # 0 A 1
494
+ # 1 B 2
239
495
  #
240
496
  # @overload semi_join(other, join_key_pairs, suffix: '.1')
241
497
  #
242
498
  # @macro join_before
243
499
  # @macro join_key_in_hash
244
500
  # @macro join_after
501
+ # @macro join_common_example_2
502
+ # @example with key pairs
503
+ # df2.semi_join(other2, { left: :KEY1, right: :KEY2 })
504
+ #
505
+ # # =>
506
+ # KEY1 X1
507
+ # <string> <uint8>
508
+ # 0 A 1
509
+ # 1 B 2
245
510
  #
246
511
  def semi_join(other, join_keys = nil, suffix: '.1')
247
512
  join(other, join_keys, type: :left_semi, suffix: suffix)
@@ -257,18 +522,42 @@ module RedAmber
257
522
  #
258
523
  # @macro join_before
259
524
  # @macro join_after
525
+ # @macro join_common_example_1
526
+ # @example without key (use implicit common key)
527
+ # df.anti_join(other)
528
+ #
529
+ # # =>
530
+ # KEY X1
531
+ # <string> <uint8>
532
+ # 0 C 3
260
533
  #
261
534
  # @overload anti_join(other, join_keys, suffix: '.1')
262
535
  #
263
536
  # @macro join_before
264
537
  # @macro join_key_in_array
265
538
  # @macro join_after
539
+ # @macro join_common_example_1
540
+ # @example with a key
541
+ # df.anti_join(other, :KEY)
542
+ #
543
+ # # =>
544
+ # KEY X1
545
+ # <string> <uint8>
546
+ # 0 C 3
266
547
  #
267
548
  # @overload anti_join(other, join_key_pairs, suffix: '.1')
268
549
  #
269
550
  # @macro join_before
270
551
  # @macro join_key_in_hash
271
552
  # @macro join_after
553
+ # @macro join_common_example_2
554
+ # @example with key pairs
555
+ # df2.anti_join(other2, { left: :KEY1, right: :KEY2 })
556
+ #
557
+ # # =>
558
+ # KEY1 X1
559
+ # <string> <uint8>
560
+ # 0 C 3
272
561
  #
273
562
  def anti_join(other, join_keys = nil, suffix: '.1')
274
563
  join(other, join_keys, type: :left_anti, suffix: suffix)
@@ -279,8 +568,11 @@ module RedAmber
279
568
  # Check if set operation with self and other is possible.
280
569
  #
281
570
  # @macro join_before
282
- #
283
- # @return [Boolean] true if set operation is possible.
571
+ # @return [Boolean]
572
+ # true if set operation is possible.
573
+ # @macro join_common_example_3
574
+ # @example
575
+ # df3.set_operable?(other3) # => true
284
576
  #
285
577
  def set_operable?(other) # rubocop:disable Naming/AccessorMethodName
286
578
  keys == other.keys.map(&:to_sym)
@@ -291,8 +583,16 @@ module RedAmber
291
583
  # - A kind of set operations.
292
584
  #
293
585
  # @macro join_before
586
+ # @return [DataFrame]
587
+ # joined dataframe.
588
+ # @macro join_common_example_3
589
+ # @example
590
+ # df3.intersect(other3)
294
591
  #
295
- # @return [DataFrame] Joined dataframe.
592
+ # # =>
593
+ # KEY1 KEY2
594
+ # <string> <uint8>
595
+ # 0 A 1
296
596
  #
297
597
  def intersect(other)
298
598
  unless keys == other.keys.map(&:to_sym)
@@ -307,8 +607,20 @@ module RedAmber
307
607
  # - A kind of set operations.
308
608
  #
309
609
  # @macro join_before
310
- #
311
- # @return [DataFrame] Joined dataframe.
610
+ # @return [DataFrame]
611
+ # joined dataframe.
612
+ # @macro join_common_example_3
613
+ # @example
614
+ # df3.union(other3)
615
+ #
616
+ # # =>
617
+ # KEY1 KEY2
618
+ # <string> <uint8>
619
+ # 0 A 1
620
+ # 1 B 2
621
+ # 2 C 3
622
+ # 3 B 4
623
+ # 4 D 5
312
624
  #
313
625
  def union(other)
314
626
  unless keys == other.keys.map(&:to_sym)
@@ -323,8 +635,25 @@ module RedAmber
323
635
  # - A kind of set operations.
324
636
  #
325
637
  # @macro join_before
638
+ # @return [DataFrame]
639
+ # joined dataframe.
640
+ # @macro join_common_example_3
641
+ # @example
642
+ # df3.difference(other3)
643
+ #
644
+ # # =>
645
+ # KEY1 KEY2
646
+ # <string> <uint8>
647
+ # 0 B 2
648
+ # 1 C 3
326
649
  #
327
- # @return [DataFrame] Joined dataframe.
650
+ # other3.difference(df3)
651
+ #
652
+ # # =>
653
+ # KEY1 KEY2
654
+ # <string> <uint8>
655
+ # 0 B 4
656
+ # 1 D 5
328
657
  #
329
658
  def difference(other)
330
659
  unless keys == other.keys.map(&:to_sym)
@@ -338,46 +667,153 @@ module RedAmber
338
667
 
339
668
  # Join another DataFrame or Table to self.
340
669
  #
341
- # @overload join(other, type: :inner, suffix: '.1')
670
+ # @!macro join_common_type
671
+ # @param type [:left_semi, :right_semi, :left_anti, :right_anti, :inner,
672
+ # :left_outer, :right_outer, :full_outer] type of join.
673
+ #
674
+ # @!macro join_common_example_4
675
+ # @example
676
+ # df4 = DataFrame.new(
677
+ # X1: %w[A B C],
678
+ # Y: %w[D E F]
679
+ # )
680
+ #
681
+ # # =>
682
+ # X1 Y
683
+ # <string> <string>
684
+ # 0 A D
685
+ # 1 B E
686
+ # 2 C F
687
+ #
688
+ # other4 = DataFrame.new(
689
+ # X2: %w[A B D],
690
+ # Y: %w[e E E]
691
+ # )
692
+ #
693
+ # # =>
694
+ # X2 Y
695
+ # <string> <string>
696
+ # 0 A e
697
+ # 1 B E
698
+ # 2 D E
699
+
700
+ # @note the order of joined results will be preserved by default.
701
+ # This is enabled by appending index column to sort after joining but
702
+ # it will cause some performance degradation. If you don't care about
703
+ # the order of the result, set `force_order` option to `false`.
704
+ #
705
+ # @overload join(other, type: :inner, suffix: '.1', force_order: true)
342
706
  #
343
707
  # If `join_key` is not specified, common keys in self and other are used
344
708
  # (natural keys). Returns joined dataframe.
345
709
  #
346
- # @!macro join_common_type
347
- # @param type [:left_semi, :right_semi, :left_anti, :right_anti, :inner,
348
- # left_outer, :right_outer, :full_outer] type of join.
349
- #
350
710
  # @macro join_before
351
711
  # @macro join_common_type
712
+ # @macro join_dorce_order
352
713
  # @macro join_after
714
+ # @macro join_common_example_1
715
+ # @example
716
+ # df.join(other)
353
717
  #
354
- # @overload join(other, join_keys, type: :inner, suffix: '.1')
718
+ # # =>
719
+ # KEY X1 X2
720
+ # <string> <uint8> <boolean>
721
+ # 0 A 1 true
722
+ # 1 B 2 false
723
+ #
724
+ # df.join(other, type: :full_outer)
725
+ #
726
+ # # =>
727
+ # KEY X1 X2
728
+ # <string> <uint8> <boolean>
729
+ # 0 A 1 true
730
+ # 1 B 2 false
731
+ # 2 C 3 (nil)
732
+ # 3 D (nil) (nil)
733
+ #
734
+ # @overload join(other, join_keys, type: :inner, suffix: '.1', force_order: true)
355
735
  #
356
736
  # @macro join_before
357
737
  # @macro join_key_in_array
358
738
  # @macro join_common_type
739
+ # @macro join_dorce_order
359
740
  # @macro join_after
741
+ # @macro join_common_example_3
742
+ # @example join keys in an Array
743
+ # df3.join(other3, [:KEY1, :KEY2])
744
+ #
745
+ # # =>
746
+ # KEY1 KEY2
747
+ # <string> <uint8>
748
+ # 0 A 1
360
749
  #
361
- # @overload join(other, join_key_pairs, type: :inner, suffix: '.1')
750
+ # @example partial join key and suffix
751
+ # df3.join(other3, :KEY1, suffix: '.a')
752
+ #
753
+ # # =>
754
+ # KEY1 KEY2 KEY2.a
755
+ # <string> <uint8> <uint8>
756
+ # 0 A 1 1
757
+ # 1 B 2 4
758
+ #
759
+ # @overload join(other, join_key_pairs, type: :inner, suffix: '.1', force_order: true)
362
760
  #
363
761
  # @macro join_before
364
762
  # @macro join_key_in_hash
365
763
  # @macro join_common_type
764
+ # @macro join_dorce_order
366
765
  # @macro join_after
367
- #
368
- def join(other, join_keys = nil, type: :inner, suffix: '.1')
369
- case other
370
- when DataFrame
371
- other = other.table
372
- when Arrow::Table
373
- # Nop
766
+ # @macro join_common_example_4
767
+ # @example without options
768
+ # df4.join(other4)
769
+ #
770
+ # # =>
771
+ # X1 Y X2
772
+ # <string> <string> <string>
773
+ # 0 B E D
774
+ # 1 B E B
775
+ #
776
+ # @example join by key pairs
777
+ # df4.join(other4, { left: [:X1, :Y], right: [:X2, :Y] })
778
+ #
779
+ # # =>
780
+ # X1 Y
781
+ # <string> <string>
782
+ # 0 B E
783
+ #
784
+ # @example join by key pairs, using renaming by suffix
785
+ # df4.join(other4, { left: :X1, right: :X2 })
786
+ #
787
+ # # =>
788
+ # X1 Y Y.1
789
+ # <string> <string> <string>
790
+ # 0 A D e
791
+ # 1 B E E
792
+ #
793
+ def join(other, join_keys = nil, type: :inner, suffix: '.1', force_order: true)
794
+ right_table =
795
+ case other
796
+ when DataFrame
797
+ other.table
798
+ when Arrow::Table
799
+ other
800
+ else
801
+ raise DataFrameArgumentError, 'other must be a DataFrame or an Arrow::Table'
802
+ end
803
+
804
+ type = type.to_sym
805
+ left_index = :__LEFT_INDEX__
806
+ right_index = :__RIGHT_INDEX__
807
+ if force_order && %i[full_outer right_outer].include?(type)
808
+ left_table = assign(left_index) { indices }.table
809
+ other = DataFrame.create(other) if other.is_a?(Arrow::Table)
810
+ right_table = other.assign(right_index) { indices }.table
374
811
  else
375
- raise DataFrameArgumentError, 'other must be a DataFrame or an Arrow::Table'
812
+ left_table = table
376
813
  end
377
814
 
378
- table_keys = table.keys
379
- other_keys = other.keys
380
- type = type.to_sym
815
+ table_keys = left_table.keys
816
+ other_keys = right_table.keys
381
817
 
382
818
  # natural keys (implicit common keys)
383
819
  join_keys ||= table_keys.intersection(other_keys)
@@ -407,10 +843,13 @@ module RedAmber
407
843
 
408
844
  # Should we rescue errors in Arrow::Table#join for usability ?
409
845
  joined_table =
410
- table.join(other, join_keys,
411
- type: type,
412
- left_outputs: left_outputs,
413
- right_outputs: right_outputs)
846
+ left_table.join(
847
+ right_table,
848
+ join_keys,
849
+ type: type,
850
+ left_outputs: left_outputs,
851
+ right_outputs: right_outputs
852
+ )
414
853
 
415
854
  case type
416
855
  when :inner, :left_outer, :left_semi, :left_anti, :right_semi, :right_anti
@@ -423,20 +862,31 @@ module RedAmber
423
862
  renamed_table = rename_table(joined_table, n_keys, suffix)
424
863
  renamed_keys = renamed_table.keys
425
864
  dropper = []
426
- DataFrame.create(renamed_table).assign do |df|
865
+ dataframe = DataFrame.create(renamed_table).assign do |df|
427
866
  left_keys.map do |left_key|
428
867
  i_left_key = renamed_keys.index(left_key)
429
868
  right_key = renamed_keys[i_left_key + table_keys.size]
430
869
  dropper << right_key
431
870
  [left_key.to_sym, merge_array(df[left_key].data, df[right_key].data)]
432
871
  end
433
- end.drop(dropper)
872
+ end
873
+ dataframe = dataframe.sort(left_index, right_index) if force_order
874
+
875
+ dataframe.drop(dropper, left_index, right_index)
434
876
  when :right_outer
435
- if joined_table.keys.uniq!
436
- DataFrame.create(rename_table(joined_table, left_outputs.size, suffix))
437
- else
438
- DataFrame.create(joined_table)
439
- end.pick do
877
+ dataframe =
878
+ if joined_table.keys.uniq!
879
+ DataFrame.create(rename_table(joined_table, left_outputs.size, suffix))
880
+ else
881
+ DataFrame.create(joined_table)
882
+ end
883
+ if force_order
884
+ dataframe =
885
+ dataframe
886
+ .sort(left_index, right_index)
887
+ .drop(left_index, right_index)
888
+ end
889
+ dataframe.pick do
440
890
  [right_keys, keys.map(&:to_s) - right_keys]
441
891
  end
442
892
  end