red_amber 0.3.0 → 0.4.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (42) hide show
  1. checksums.yaml +4 -4
  2. data/.rubocop.yml +39 -20
  3. data/.yardopts +2 -0
  4. data/CHANGELOG.md +113 -0
  5. data/Gemfile +1 -1
  6. data/LICENSE +1 -1
  7. data/README.md +25 -26
  8. data/benchmark/basic.yml +2 -2
  9. data/benchmark/combine.yml +2 -2
  10. data/benchmark/dataframe.yml +2 -2
  11. data/benchmark/group.yml +2 -2
  12. data/benchmark/reshape.yml +2 -2
  13. data/benchmark/vector.yml +3 -0
  14. data/doc/DataFrame.md +32 -12
  15. data/doc/DataFrame_Comparison.md +65 -0
  16. data/doc/SubFrames.md +11 -0
  17. data/doc/Vector.md +207 -1
  18. data/doc/yard-templates/default/fulldoc/html/css/common.css +6 -0
  19. data/lib/red_amber/data_frame.rb +429 -75
  20. data/lib/red_amber/data_frame_combinable.rb +516 -66
  21. data/lib/red_amber/data_frame_displayable.rb +244 -14
  22. data/lib/red_amber/data_frame_indexable.rb +121 -18
  23. data/lib/red_amber/data_frame_loadsave.rb +78 -10
  24. data/lib/red_amber/data_frame_reshaping.rb +184 -14
  25. data/lib/red_amber/data_frame_selectable.rb +622 -66
  26. data/lib/red_amber/data_frame_variable_operation.rb +446 -34
  27. data/lib/red_amber/group.rb +187 -22
  28. data/lib/red_amber/helper.rb +70 -10
  29. data/lib/red_amber/refinements.rb +12 -5
  30. data/lib/red_amber/subframes.rb +1066 -0
  31. data/lib/red_amber/vector.rb +385 -11
  32. data/lib/red_amber/vector_aggregation.rb +312 -0
  33. data/lib/red_amber/vector_binary_element_wise.rb +387 -0
  34. data/lib/red_amber/vector_selectable.rb +217 -12
  35. data/lib/red_amber/vector_unary_element_wise.rb +436 -0
  36. data/lib/red_amber/vector_updatable.rb +278 -34
  37. data/lib/red_amber/version.rb +2 -1
  38. data/lib/red_amber.rb +13 -1
  39. data/red_amber.gemspec +2 -2
  40. metadata +13 -8
  41. data/doc/image/dataframe/reshaping_DataFrames.png +0 -0
  42. data/lib/red_amber/vector_functions.rb +0 -242
@@ -1,17 +1,36 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module RedAmber
4
- # mix-in for the class DataFrame
4
+ # Mix-in for the class DataFrame
5
5
  module DataFrameCombinable
6
6
  # Refinements for Arrow::Table
7
7
  using RefineArrowTable
8
8
 
9
- # Concatenate other dataframe onto the bottom.
9
+ # Concatenate other dataframes or tables onto the bottom of self.
10
10
  #
11
+ # @note the `#types` must be the same as `other#types`.
11
12
  # @param other [DataFrame, Arrow::Table, Array<DataFrame, Arrow::Table>]
12
- # DataFrame/Table to concatenate onto the bottom of self.
13
+ # DataFrames or Tables to concatenate.
13
14
  # @return [DataFrame]
14
- # Concatenated dataframe.
15
+ # concatenated dataframe.
16
+ # @example
17
+ # df = DataFrame.new(x: [1, 2], y: ['A', 'B'])
18
+ # other = DataFrame.new(x: [3, 4], y: ['C', 'D'])
19
+ # [df.types, other.types]
20
+ #
21
+ # # =>
22
+ # [[:uint8, :string], [:uint8, :string]]
23
+ #
24
+ # df.concatenate(other)
25
+ #
26
+ # # =>
27
+ # x y
28
+ # <uint8> <string>
29
+ # 0 1 A
30
+ # 1 2 B
31
+ # 2 3 C
32
+ # 3 4 D
33
+ #
15
34
  def concatenate(*other)
16
35
  case other
17
36
  in [] | [nil] | [[]]
@@ -39,14 +58,27 @@ module RedAmber
39
58
  alias_method :concat, :concatenate
40
59
  alias_method :bind_rows, :concatenate
41
60
 
42
- # Merge other DataFrame or Table from other.
43
- # - Self and other must have same size.
44
- # - Self and other do not share the same key.
45
- # - If they share any keys, raise Error.
61
+ # Merge other DataFrames or Tables.
62
+ #
63
+ # @note the `#size` must be the same as `other#size`.
64
+ # @note self and other must not share the same key.
46
65
  # @param other [DataFrame, Arrow::Table, Array<DataFrame, Arrow::Table>]
47
- # DataFrame/Table to concatenate.
66
+ # DataFrames or Tables to merge.
67
+ # @raise [DataFrameArgumentError]
68
+ # if the sizes are not the same or self and other share a key.
48
69
  # @return [DataFrame]
49
- # Merged dataframe.
70
+ # merged dataframe.
71
+ # @example
72
+ # df = DataFrame.new(x: [1, 2], y: [3, 4])
73
+ # other = DataFrame.new(a: ['A', 'B'], b: ['C', 'D'])
74
+ # df.merge(other)
75
+ #
76
+ # # =>
77
+ # x y a b
78
+ # <uint8> <uint8> <string> <string>
79
+ # 0 1 3 A C
80
+ # 1 2 4 B D
81
+ #
50
82
  def merge(*other)
51
83
  case other
52
84
  in [] | [nil] | [[]]
@@ -85,32 +117,105 @@ module RedAmber
85
117
 
86
118
  # Mutating joins (#inner_join, #full_join, #left_join, #right_join)
87
119
 
88
- # Join another DataFrame or Table, leaving only the matching records.
89
- # - Same as `#join` with `type: :inner`
90
- # - A kind of mutating join.
91
- #
92
120
  # @!macro join_before
93
121
  # @param other [DataFrame, Arrow::Table]
94
122
  # A DataFrame or a Table to be joined with self.
95
123
  #
124
+ # @!macro join_dorce_order
125
+ # @param force_order [Boolean]
126
+ # whether to force the order of the output to always be the same.
127
+ # - This option is used in `:full_outer` and `:right_outer`.
128
+ # - If this option is true (the default), an index is appended to the source
129
+ # and used to sort the result after joining, at some cost in performance.
130
+ #
96
131
  # @!macro join_after
97
132
  # @param suffix [#succ]
98
133
  # a suffix to rename keys when key names conflict as a result of join.
99
134
  # `suffix` must respond to `#succ`.
100
135
  # @return [DataFrame]
101
- # Joined dataframe.
136
+ # joined dataframe.
102
137
  #
103
138
  # @!macro join_key_in_array
104
139
  # @param join_keys [String, Symbol, Array<String, Symbol>]
105
- # A key or keys to match.
140
+ # a key or keys to match.
106
141
  #
107
142
  # @!macro join_key_in_hash
108
143
  # @param join_key_pairs [Hash]
109
- # Pairs of a key name or key names to match in left and right.
144
+ # pairs of a key name or key names to match in left and right.
110
145
  # @option join_key_pairs [String, Symbol, Array<String, Symbol>] :left
111
- # Join keys in `self`.
146
+ # join keys in `self`.
112
147
  # @option join_key_pairs [String, Symbol, Array<String, Symbol>] :right
113
- # Join keys in `other`.
148
+ # join keys in `other`.
149
+ #
150
+ # @!macro join_common_example_1
151
+ # @example
152
+ # df = DataFrame.new(KEY: %w[A B C], X1: [1, 2, 3])
153
+ #
154
+ # # =>
155
+ # KEY X1
156
+ # <string> <uint8>
157
+ # 0 A 1
158
+ # 1 B 2
159
+ # 2 C 3
160
+ #
161
+ # other = DataFrame.new(KEY: %w[A B D], X2: [true, false, nil])
162
+ #
163
+ # # =>
164
+ # KEY X2
165
+ # <string> <boolean>
166
+ # 0 A true
167
+ # 1 B false
168
+ # 2 D (nil)
169
+ #
170
+ # @!macro join_common_example_2
171
+ # @example
172
+ # df2 = DataFrame.new(KEY1: %w[A B C], X1: [1, 2, 3])
173
+ #
174
+ # # =>
175
+ # KEY1 X1
176
+ # <string> <uint8>
177
+ # 0 A 1
178
+ # 1 B 2
179
+ # 2 C 3
180
+ #
181
+ # other2 = DataFrame.new(KEY2: %w[A B D], X2: [true, false, nil])
182
+ #
183
+ # # =>
184
+ # KEY2 X2
185
+ # <string> <boolean>
186
+ # 0 A true
187
+ # 1 B false
188
+ # 2 D (nil)
189
+ #
190
+ # @!macro join_common_example_3
191
+ # @example
192
+ # df3 = DataFrame.new(
193
+ # KEY1: %w[A B C],
194
+ # KEY2: [1, 2, 3]
195
+ # )
196
+ #
197
+ # # =>
198
+ # KEY1 KEY2
199
+ # <string> <uint8>
200
+ # 0 A 1
201
+ # 1 B 2
202
+ # 2 C 3
203
+ #
204
+ # other3 = DataFrame.new(
205
+ # KEY1: %w[A B D],
206
+ # KEY2: [1, 4, 5]
207
+ # )
208
+ #
209
+ # # =>
210
+ # KEY1 KEY2
211
+ # <string> <uint8>
212
+ # 0 A 1
213
+ # 1 B 4
214
+ # 2 D 5
215
+
216
+ # Join another DataFrame or Table, leaving only the matching records.
217
+ # - Same as `#join` with `type: :inner`
218
+ # - A kind of mutating join.
114
219
  #
115
220
  # @overload inner_join(other, suffix: '.1')
116
221
  # If `join_key` is not specified, common keys in self and other are used
@@ -118,18 +223,45 @@ module RedAmber
118
223
  #
119
224
  # @macro join_before
120
225
  # @macro join_after
226
+ # @macro join_common_example_1
227
+ # @example without key (use implicit common key)
228
+ # df.inner_join(other)
229
+ #
230
+ # # =>
231
+ # KEY X1 X2
232
+ # <string> <uint8> <boolean>
233
+ # 0 A 1 true
234
+ # 1 B 2 false
121
235
  #
122
236
  # @overload inner_join(other, join_keys, suffix: '.1')
123
237
  #
124
238
  # @macro join_before
125
239
  # @macro join_key_in_array
126
240
  # @macro join_after
241
+ # @macro join_common_example_1
242
+ # @example with a key
243
+ # df.inner_join(other, :KEY)
244
+ #
245
+ # # =>
246
+ # KEY X1 X2
247
+ # <string> <uint8> <boolean>
248
+ # 0 A 1 true
249
+ # 1 B 2 false
127
250
  #
128
251
  # @overload inner_join(other, join_key_pairs, suffix: '.1')
129
252
  #
130
253
  # @macro join_before
131
254
  # @macro join_key_in_hash
132
255
  # @macro join_after
256
+ # @macro join_common_example_2
257
+ # @example with key pairs
258
+ # df2.inner_join(other2, { left: :KEY1, right: :KEY2 })
259
+ #
260
+ # # =>
261
+ # KEY1 X1 X2
262
+ # <string> <uint8> <boolean>
263
+ # 0 A 1 true
264
+ # 1 B 2 false
133
265
  #
134
266
  def inner_join(other, join_keys = nil, suffix: '.1')
135
267
  join(other, join_keys, type: :inner, suffix: suffix)
@@ -139,27 +271,64 @@ module RedAmber
139
271
  # - Same as `#join` with `type: :full_outer`
140
272
  # - A kind of mutating join.
141
273
  #
142
- # @overload full_join(other, suffix: '.1')
274
+ # @overload full_join(other, suffix: '.1', force_order: true)
143
275
  # If `join_key` is not specified, common keys in self and other are used
144
276
  # (natural keys). Returns joined dataframe.
145
277
  #
146
278
  # @macro join_before
279
+ # @macro join_dorce_order
147
280
  # @macro join_after
281
+ # @macro join_common_example_1
282
+ # @example without key (use implicit common key)
283
+ # df.full_join(other)
284
+ #
285
+ # # =>
286
+ # KEY X1 X2
287
+ # <string> <uint8> <boolean>
288
+ # 0 A 1 true
289
+ # 1 B 2 false
290
+ # 2 C 3 (nil)
291
+ # 3 D (nil) (nil)
148
292
  #
149
- # @overload full_join(other, join_keys, suffix: '.1')
293
+ # @overload full_join(other, join_keys, suffix: '.1', force_order: true)
150
294
  #
151
295
  # @macro join_before
152
296
  # @macro join_key_in_array
297
+ # @macro join_dorce_order
153
298
  # @macro join_after
299
+ # @macro join_common_example_1
300
+ # @example with a key
301
+ # df.full_join(other, :KEY)
154
302
  #
155
- # @overload full_join(other, join_key_pairs, suffix: '.1')
303
+ # # =>
304
+ # KEY X1 X2
305
+ # <string> <uint8> <boolean>
306
+ # 0 A 1 true
307
+ # 1 B 2 false
308
+ # 2 C 3 (nil)
309
+ # 3 D (nil) (nil)
310
+ #
311
+ # @overload full_join(other, join_key_pairs, suffix: '.1', force_order: true)
156
312
  #
157
313
  # @macro join_before
158
314
  # @macro join_key_in_hash
315
+ # @macro join_dorce_order
159
316
  # @macro join_after
160
- #
161
- def full_join(other, join_keys = nil, suffix: '.1')
162
- join(other, join_keys, type: :full_outer, suffix: suffix)
317
+ # @macro join_common_example_2
318
+ # @example with key pairs
319
+ # df2.full_join(other2, { left: :KEY1, right: :KEY2 })
320
+ #
321
+ # # =>
322
+ # KEY1 X1 X2
323
+ # <string> <uint8> <boolean>
324
+ # 0 A 1 true
325
+ # 1 B 2 false
326
+ # 2 C 3 (nil)
327
+ # 3 D (nil) (nil)
328
+ #
329
+ def full_join(other, join_keys = nil, suffix: '.1', force_order: true)
330
+ join(other, join_keys,
331
+ type: :full_outer, suffix: suffix, force_order: force_order)
163
332
  end
164
333
 
165
334
  alias_method :outer_join, :full_join
@@ -174,18 +343,48 @@ module RedAmber
174
343
  #
175
344
  # @macro join_before
176
345
  # @macro join_after
346
+ # @macro join_common_example_1
347
+ # @example without key (use implicit common key)
348
+ # df.left_join(other)
349
+ #
350
+ # # =>
351
+ # KEY X1 X2
352
+ # <string> <uint8> <boolean>
353
+ # 0 A 1 true
354
+ # 1 B 2 false
355
+ # 2 C 3 (nil)
177
356
  #
178
357
  # @overload left_join(other, join_keys, suffix: '.1')
179
358
  #
180
359
  # @macro join_before
181
360
  # @macro join_key_in_array
182
361
  # @macro join_after
362
+ # @macro join_common_example_1
363
+ # @example with a key
364
+ # df.left_join(other, :KEY)
365
+ #
366
+ # # =>
367
+ # KEY X1 X2
368
+ # <string> <uint8> <boolean>
369
+ # 0 A 1 true
370
+ # 1 B 2 false
371
+ # 2 C 3 (nil)
183
372
  #
184
373
  # @overload left_join(other, join_key_pairs, suffix: '.1')
185
374
  #
186
375
  # @macro join_before
187
376
  # @macro join_key_in_hash
188
377
  # @macro join_after
378
+ # @macro join_common_example_2
379
+ # @example with key pairs
380
+ # df2.left_join(other2, { left: :KEY1, right: :KEY2 })
381
+ #
382
+ # # =>
383
+ # KEY1 X1 X2
384
+ # <string> <uint8> <boolean>
385
+ # 0 A 1 true
386
+ # 1 B 2 false
387
+ # 2 C 3 (nil)
189
388
  #
190
389
  def left_join(other, join_keys = nil, suffix: '.1')
191
390
  join(other, join_keys, type: :left_outer, suffix: suffix)
@@ -195,27 +394,66 @@ module RedAmber
195
394
  # - Same as `#join` with `type: :right_outer`
196
395
  # - A kind of mutating join.
197
396
  #
198
- # @overload right_join(other, suffix: '.1')
397
+ # @overload right_join(other, suffix: '.1', force_order: true)
199
398
  # If `join_key` is not specified, common keys in self and other are used
200
399
  # (natural keys). Returns joined dataframe.
201
400
  #
202
401
  # @macro join_before
402
+ # @macro join_dorce_order
203
403
  # @macro join_after
404
+ # @macro join_common_example_1
405
+ # @example without key (use implicit common key)
406
+ # df.right_join(other)
407
+ #
408
+ # # =>
409
+ # KEY X1 X2
410
+ # <string> <uint8> <boolean>
411
+ # 0 A 1 true
412
+ # 1 B 2 false
413
+ # 2 D (nil) (nil)
204
414
  #
205
- # @overload right_join(other, join_keys, suffix: '.1')
415
+ # @overload right_join(other, join_keys, suffix: '.1', force_order: true)
206
416
  #
207
417
  # @macro join_before
208
418
  # @macro join_key_in_array
419
+ # @macro join_dorce_order
209
420
  # @macro join_after
421
+ # @macro join_common_example_1
422
+ # @example with a key
423
+ # df.right_join(other, :KEY)
210
424
  #
211
- # @overload right_join(other, join_key_pairs, suffix: '.1')
425
+ # # =>
426
+ # KEY X1 X2
427
+ # <string> <uint8> <boolean>
428
+ # 0 A 1 true
429
+ # 1 B 2 false
430
+ # 2 D (nil) (nil)
431
+ #
432
+ # @overload right_join(other, join_key_pairs, suffix: '.1', force_order: true)
212
433
  #
213
434
  # @macro join_before
214
435
  # @macro join_key_in_hash
436
+ # @macro join_dorce_order
215
437
  # @macro join_after
216
- #
217
- def right_join(other, join_keys = nil, suffix: '.1')
218
- join(other, join_keys, type: :right_outer, suffix: suffix)
438
+ # @macro join_common_example_2
439
+ # @example with key pairs
440
+ # df2.right_join(other2, { left: :KEY1, right: :KEY2 })
441
+ #
442
+ # # =>
443
+ # KEY1 X1 X2
444
+ # <string> <uint8> <boolean>
445
+ # 0 A 1 true
446
+ # 1 B 2 false
447
+ # 2 D (nil) (nil)
448
+ #
449
+ def right_join(other, join_keys = nil, suffix: '.1', force_order: true)
450
+ join(
451
+ other,
452
+ join_keys,
453
+ type: :right_outer,
454
+ suffix: suffix,
455
+ force_order: force_order
456
+ )
219
457
  end
220
458
 
221
459
  # Filtering joins (#semi_join, #anti_join)
@@ -230,18 +468,45 @@ module RedAmber
230
468
  #
231
469
  # @macro join_before
232
470
  # @macro join_after
471
+ # @macro join_common_example_1
472
+ # @example without key (use implicit common key)
473
+ # df.semi_join(other)
474
+ #
475
+ # # =>
476
+ # KEY X1
477
+ # <string> <uint8>
478
+ # 0 A 1
479
+ # 1 B 2
233
480
  #
234
481
  # @overload semi_join(other, join_keys, suffix: '.1')
235
482
  #
236
483
  # @macro join_before
237
484
  # @macro join_key_in_array
238
485
  # @macro join_after
486
+ # @macro join_common_example_1
487
+ # @example with a key
488
+ # df.semi_join(other, :KEY)
489
+ #
490
+ # # =>
491
+ # KEY X1
492
+ # <string> <uint8>
493
+ # 0 A 1
494
+ # 1 B 2
239
495
  #
240
496
  # @overload semi_join(other, join_key_pairs, suffix: '.1')
241
497
  #
242
498
  # @macro join_before
243
499
  # @macro join_key_in_hash
244
500
  # @macro join_after
501
+ # @macro join_common_example_2
502
+ # @example with key pairs
503
+ # df2.semi_join(other2, { left: :KEY1, right: :KEY2 })
504
+ #
505
+ # # =>
506
+ # KEY1 X1
507
+ # <string> <uint8>
508
+ # 0 A 1
509
+ # 1 B 2
245
510
  #
246
511
  def semi_join(other, join_keys = nil, suffix: '.1')
247
512
  join(other, join_keys, type: :left_semi, suffix: suffix)
@@ -257,18 +522,42 @@ module RedAmber
257
522
  #
258
523
  # @macro join_before
259
524
  # @macro join_after
525
+ # @macro join_common_example_1
526
+ # @example without key (use implicit common key)
527
+ # df.anti_join(other)
528
+ #
529
+ # # =>
530
+ # KEY X1
531
+ # <string> <uint8>
532
+ # 0 C 3
260
533
  #
261
534
  # @overload anti_join(other, join_keys, suffix: '.1')
262
535
  #
263
536
  # @macro join_before
264
537
  # @macro join_key_in_array
265
538
  # @macro join_after
539
+ # @macro join_common_example_1
540
+ # @example with a key
541
+ # df.anti_join(other, :KEY)
542
+ #
543
+ # # =>
544
+ # KEY X1
545
+ # <string> <uint8>
546
+ # 0 C 3
266
547
  #
267
548
  # @overload anti_join(other, join_key_pairs, suffix: '.1')
268
549
  #
269
550
  # @macro join_before
270
551
  # @macro join_key_in_hash
271
552
  # @macro join_after
553
+ # @macro join_common_example_2
554
+ # @example with key pairs
555
+ # df2.anti_join(other2, { left: :KEY1, right: :KEY2 })
556
+ #
557
+ # # =>
558
+ # KEY1 X1
559
+ # <string> <uint8>
560
+ # 0 C 3
272
561
  #
273
562
  def anti_join(other, join_keys = nil, suffix: '.1')
274
563
  join(other, join_keys, type: :left_anti, suffix: suffix)
@@ -279,8 +568,11 @@ module RedAmber
279
568
  # Check if set operation with self and other is possible.
280
569
  #
281
570
  # @macro join_before
282
- #
283
- # @return [Boolean] true if set operation is possible.
571
+ # @return [Boolean]
572
+ # true if set operation is possible.
573
+ # @macro join_common_example_3
574
+ # @example
575
+ # df3.set_operable?(other3) # => true
284
576
  #
285
577
  def set_operable?(other) # rubocop:disable Naming/AccessorMethodName
286
578
  keys == other.keys.map(&:to_sym)
@@ -291,8 +583,16 @@ module RedAmber
291
583
  # - A kind of set operations.
292
584
  #
293
585
  # @macro join_before
586
+ # @return [DataFrame]
587
+ # joined dataframe.
588
+ # @macro join_common_example_3
589
+ # @example
590
+ # df3.intersect(other3)
294
591
  #
295
- # @return [DataFrame] Joined dataframe.
592
+ # # =>
593
+ # KEY1 KEY2
594
+ # <string> <uint8>
595
+ # 0 A 1
296
596
  #
297
597
  def intersect(other)
298
598
  unless keys == other.keys.map(&:to_sym)
@@ -307,8 +607,20 @@ module RedAmber
307
607
  # - A kind of set operations.
308
608
  #
309
609
  # @macro join_before
310
- #
311
- # @return [DataFrame] Joined dataframe.
610
+ # @return [DataFrame]
611
+ # joined dataframe.
612
+ # @macro join_common_example_3
613
+ # @example
614
+ # df3.union(other3)
615
+ #
616
+ # # =>
617
+ # KEY1 KEY2
618
+ # <string> <uint8>
619
+ # 0 A 1
620
+ # 1 B 2
621
+ # 2 C 3
622
+ # 3 B 4
623
+ # 4 D 5
312
624
  #
313
625
  def union(other)
314
626
  unless keys == other.keys.map(&:to_sym)
@@ -323,8 +635,25 @@ module RedAmber
323
635
  # - A kind of set operations.
324
636
  #
325
637
  # @macro join_before
638
+ # @return [DataFrame]
639
+ # joined dataframe.
640
+ # @macro join_common_example_3
641
+ # @example
642
+ # df3.difference(other3)
643
+ #
644
+ # # =>
645
+ # KEY1 KEY2
646
+ # <string> <uint8>
647
+ # 0 B 2
648
+ # 1 C 3
326
649
  #
327
- # @return [DataFrame] Joined dataframe.
650
+ # other3.difference(df3)
651
+ #
652
+ # # =>
653
+ # KEY1 KEY2
654
+ # <string> <uint8>
655
+ # 0 B 4
656
+ # 1 D 5
328
657
  #
329
658
  def difference(other)
330
659
  unless keys == other.keys.map(&:to_sym)
@@ -338,46 +667,153 @@ module RedAmber
338
667
 
339
668
  # Join another DataFrame or Table to self.
340
669
  #
341
- # @overload join(other, type: :inner, suffix: '.1')
670
+ # @!macro join_common_type
671
+ # @param type [:left_semi, :right_semi, :left_anti, :right_anti, :inner,
672
+ # :left_outer, :right_outer, :full_outer] type of join.
673
+ #
674
+ # @!macro join_common_example_4
675
+ # @example
676
+ # df4 = DataFrame.new(
677
+ # X1: %w[A B C],
678
+ # Y: %w[D E F]
679
+ # )
680
+ #
681
+ # # =>
682
+ # X1 Y
683
+ # <string> <string>
684
+ # 0 A D
685
+ # 1 B E
686
+ # 2 C F
687
+ #
688
+ # other4 = DataFrame.new(
689
+ # X2: %w[A B D],
690
+ # Y: %w[e E E]
691
+ # )
692
+ #
693
+ # # =>
694
+ # X2 Y
695
+ # <string> <string>
696
+ # 0 A e
697
+ # 1 B E
698
+ # 2 D E
699
+
700
+ # @note the order of joined results will be preserved by default.
701
+ # This is enabled by appending index column to sort after joining but
702
+ # it will cause some performance degradation. If you don't care about
703
+ # the order of the result, set `force_order` option to `false`.
704
+ #
705
+ # @overload join(other, type: :inner, suffix: '.1', force_order: true)
342
706
  #
343
707
  # If `join_key` is not specified, common keys in self and other are used
344
708
  # (natural keys). Returns joined dataframe.
345
709
  #
346
- # @!macro join_common_type
347
- # @param type [:left_semi, :right_semi, :left_anti, :right_anti, :inner,
348
- # left_outer, :right_outer, :full_outer] type of join.
349
- #
350
710
  # @macro join_before
351
711
  # @macro join_common_type
712
+ # @macro join_dorce_order
352
713
  # @macro join_after
714
+ # @macro join_common_example_1
715
+ # @example
716
+ # df.join(other)
353
717
  #
354
- # @overload join(other, join_keys, type: :inner, suffix: '.1')
718
+ # # =>
719
+ # KEY X1 X2
720
+ # <string> <uint8> <boolean>
721
+ # 0 A 1 true
722
+ # 1 B 2 false
723
+ #
724
+ # df.join(other, type: :full_outer)
725
+ #
726
+ # # =>
727
+ # KEY X1 X2
728
+ # <string> <uint8> <boolean>
729
+ # 0 A 1 true
730
+ # 1 B 2 false
731
+ # 2 C 3 (nil)
732
+ # 3 D (nil) (nil)
733
+ #
734
+ # @overload join(other, join_keys, type: :inner, suffix: '.1', force_order: true)
355
735
  #
356
736
  # @macro join_before
357
737
  # @macro join_key_in_array
358
738
  # @macro join_common_type
739
+ # @macro join_dorce_order
359
740
  # @macro join_after
741
+ # @macro join_common_example_3
742
+ # @example join keys in an Array
743
+ # df3.join(other3, [:KEY1, :KEY2])
744
+ #
745
+ # # =>
746
+ # KEY1 KEY2
747
+ # <string> <uint8>
748
+ # 0 A 1
360
749
  #
361
- # @overload join(other, join_key_pairs, type: :inner, suffix: '.1')
750
+ # @example partial join key and suffix
751
+ # df3.join(other3, :KEY1, suffix: '.a')
752
+ #
753
+ # # =>
754
+ # KEY1 KEY2 KEY2.a
755
+ # <string> <uint8> <uint8>
756
+ # 0 A 1 1
757
+ # 1 B 2 4
758
+ #
759
+ # @overload join(other, join_key_pairs, type: :inner, suffix: '.1', force_order: true)
362
760
  #
363
761
  # @macro join_before
364
762
  # @macro join_key_in_hash
365
763
  # @macro join_common_type
764
+ # @macro join_dorce_order
366
765
  # @macro join_after
367
- #
368
- def join(other, join_keys = nil, type: :inner, suffix: '.1')
369
- case other
370
- when DataFrame
371
- other = other.table
372
- when Arrow::Table
373
- # Nop
766
+ # @macro join_common_example_4
767
+ # @example without options
768
+ # df4.join(other4)
769
+ #
770
+ # # =>
771
+ # X1 Y X2
772
+ # <string> <string> <string>
773
+ # 0 B E D
774
+ # 1 B E B
775
+ #
776
+ # @example join by key pairs
777
+ # df4.join(other4, { left: [:X1, :Y], right: [:X2, :Y] })
778
+ #
779
+ # # =>
780
+ # X1 Y
781
+ # <string> <string>
782
+ # 0 B E
783
+ #
784
+ # @example join by key pairs, using renaming by suffix
785
+ # df4.join(other4, { left: :X1, right: :X2 })
786
+ #
787
+ # # =>
788
+ # X1 Y Y.1
789
+ # <string> <string> <string>
790
+ # 0 A D e
791
+ # 1 B E E
792
+ #
793
+ def join(other, join_keys = nil, type: :inner, suffix: '.1', force_order: true)
794
+ right_table =
795
+ case other
796
+ when DataFrame
797
+ other.table
798
+ when Arrow::Table
799
+ other
800
+ else
801
+ raise DataFrameArgumentError, 'other must be a DataFrame or an Arrow::Table'
802
+ end
803
+
804
+ type = type.to_sym
805
+ left_index = :__LEFT_INDEX__
806
+ right_index = :__RIGHT_INDEX__
807
+ if force_order && %i[full_outer right_outer].include?(type)
808
+ left_table = assign(left_index) { indices }.table
809
+ other = DataFrame.create(other) if other.is_a?(Arrow::Table)
810
+ right_table = other.assign(right_index) { indices }.table
374
811
  else
375
- raise DataFrameArgumentError, 'other must be a DataFrame or an Arrow::Table'
812
+ left_table = table
376
813
  end
377
814
 
378
- table_keys = table.keys
379
- other_keys = other.keys
380
- type = type.to_sym
815
+ table_keys = left_table.keys
816
+ other_keys = right_table.keys
381
817
 
382
818
  # natural keys (implicit common keys)
383
819
  join_keys ||= table_keys.intersection(other_keys)
@@ -407,10 +843,13 @@ module RedAmber
407
843
 
408
844
  # Should we rescue errors in Arrow::Table#join for usability ?
409
845
  joined_table =
410
- table.join(other, join_keys,
411
- type: type,
412
- left_outputs: left_outputs,
413
- right_outputs: right_outputs)
846
+ left_table.join(
847
+ right_table,
848
+ join_keys,
849
+ type: type,
850
+ left_outputs: left_outputs,
851
+ right_outputs: right_outputs
852
+ )
414
853
 
415
854
  case type
416
855
  when :inner, :left_outer, :left_semi, :left_anti, :right_semi, :right_anti
@@ -423,20 +862,31 @@ module RedAmber
423
862
  renamed_table = rename_table(joined_table, n_keys, suffix)
424
863
  renamed_keys = renamed_table.keys
425
864
  dropper = []
426
- DataFrame.create(renamed_table).assign do |df|
865
+ dataframe = DataFrame.create(renamed_table).assign do |df|
427
866
  left_keys.map do |left_key|
428
867
  i_left_key = renamed_keys.index(left_key)
429
868
  right_key = renamed_keys[i_left_key + table_keys.size]
430
869
  dropper << right_key
431
870
  [left_key.to_sym, merge_array(df[left_key].data, df[right_key].data)]
432
871
  end
433
- end.drop(dropper)
872
+ end
873
+ dataframe = dataframe.sort(left_index, right_index) if force_order
874
+
875
+ dataframe.drop(dropper, left_index, right_index)
434
876
  when :right_outer
435
- if joined_table.keys.uniq!
436
- DataFrame.create(rename_table(joined_table, left_outputs.size, suffix))
437
- else
438
- DataFrame.create(joined_table)
439
- end.pick do
877
+ dataframe =
878
+ if joined_table.keys.uniq!
879
+ DataFrame.create(rename_table(joined_table, left_outputs.size, suffix))
880
+ else
881
+ DataFrame.create(joined_table)
882
+ end
883
+ if force_order
884
+ dataframe =
885
+ dataframe
886
+ .sort(left_index, right_index)
887
+ .drop(left_index, right_index)
888
+ end
889
+ dataframe.pick do
440
890
  [right_keys, keys.map(&:to_s) - right_keys]
441
891
  end
442
892
  end