red_amber 0.3.0 → 0.4.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (42) hide show
  1. checksums.yaml +4 -4
  2. data/.rubocop.yml +56 -22
  3. data/.yardopts +2 -0
  4. data/CHANGELOG.md +178 -0
  5. data/Gemfile +1 -1
  6. data/LICENSE +1 -1
  7. data/README.md +29 -30
  8. data/benchmark/basic.yml +7 -7
  9. data/benchmark/combine.yml +3 -3
  10. data/benchmark/dataframe.yml +15 -9
  11. data/benchmark/group.yml +6 -6
  12. data/benchmark/reshape.yml +6 -6
  13. data/benchmark/vector.yml +6 -3
  14. data/doc/DataFrame.md +32 -12
  15. data/doc/DataFrame_Comparison.md +65 -0
  16. data/doc/SubFrames.md +11 -0
  17. data/doc/Vector.md +207 -1
  18. data/doc/yard-templates/default/fulldoc/html/css/common.css +6 -0
  19. data/lib/red_amber/data_frame.rb +454 -85
  20. data/lib/red_amber/data_frame_combinable.rb +609 -115
  21. data/lib/red_amber/data_frame_displayable.rb +313 -34
  22. data/lib/red_amber/data_frame_indexable.rb +122 -19
  23. data/lib/red_amber/data_frame_loadsave.rb +78 -10
  24. data/lib/red_amber/data_frame_reshaping.rb +184 -14
  25. data/lib/red_amber/data_frame_selectable.rb +623 -70
  26. data/lib/red_amber/data_frame_variable_operation.rb +452 -35
  27. data/lib/red_amber/group.rb +186 -22
  28. data/lib/red_amber/helper.rb +74 -14
  29. data/lib/red_amber/refinements.rb +26 -6
  30. data/lib/red_amber/subframes.rb +1101 -0
  31. data/lib/red_amber/vector.rb +362 -11
  32. data/lib/red_amber/vector_aggregation.rb +312 -0
  33. data/lib/red_amber/vector_binary_element_wise.rb +506 -0
  34. data/lib/red_amber/vector_selectable.rb +265 -23
  35. data/lib/red_amber/vector_unary_element_wise.rb +529 -0
  36. data/lib/red_amber/vector_updatable.rb +278 -34
  37. data/lib/red_amber/version.rb +2 -1
  38. data/lib/red_amber.rb +13 -1
  39. data/red_amber.gemspec +2 -2
  40. metadata +13 -8
  41. data/doc/image/dataframe/reshaping_DataFrames.png +0 -0
  42. data/lib/red_amber/vector_functions.rb +0 -242
@@ -1,17 +1,38 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module RedAmber
4
- # mix-in for the class DataFrame
4
+ # Mix-in for the class DataFrame
5
5
  module DataFrameCombinable
6
6
  # Refinements for Arrow::Table
7
7
  using RefineArrowTable
8
8
 
9
- # Concatenate other dataframe onto the bottom.
9
+ # Concatenate other dataframes or tables onto the bottom of self.
10
10
  #
11
+ # @note the `#types` must be same as `other#types`.
11
12
  # @param other [DataFrame, Arrow::Table, Array<DataFrame, Arrow::Table>]
12
- # DataFrame/Table to concatenate onto the bottom of self.
13
+ # DataFrames or Tables to concatenate.
13
14
  # @return [DataFrame]
14
- # Concatenated dataframe.
15
+ # concatenated dataframe.
16
+ # @example
17
+ # df = DataFrame.new(x: [1, 2], y: ['A', 'B'])
18
+ # other = DataFrame.new(x: [3, 4], y: ['C', 'D'])
19
+ # [df.types, other.types]
20
+ #
21
+ # # =>
22
+ # [[:uint8, :string], [:uint8, :string]]
23
+ #
24
+ # df.concatenate(other)
25
+ #
26
+ # # =>
27
+ # x y
28
+ # <uint8> <string>
29
+ # 0 1 A
30
+ # 1 2 B
31
+ # 2 3 C
32
+ # 3 4 D
33
+ #
34
+ # @since 0.2.3
35
+ #
15
36
  def concatenate(*other)
16
37
  case other
17
38
  in [] | [nil] | [[]]
@@ -39,14 +60,29 @@ module RedAmber
39
60
  alias_method :concat, :concatenate
40
61
  alias_method :bind_rows, :concatenate
41
62
 
42
- # Merge other DataFrame or Table from other.
43
- # - Self and other must have same size.
44
- # - Self and other do not share the same key.
45
- # - If they share any keys, raise Error.
63
+ # Merge other DataFrames or Tables.
64
+ #
65
+ # @note the `#size` must be same as `other#size`.
66
+ # @note self and other must not share the same key.
46
67
  # @param other [DataFrame, Arrow::Table, Array<DataFrame, Arrow::Table>]
47
- # DataFrame/Table to concatenate.
68
+ # DataFrames or Tables to merge.
69
+ # @raise [DataFrameArgumentError]
70
+ # if size is not the same or self and other share the same key.
48
71
  # @return [DataFrame]
49
- # Merged dataframe.
72
+ # merged dataframe.
73
+ # @example
74
+ # df = DataFrame.new(x: [1, 2], y: [3, 4])
75
+ # other = DataFrame.new(a: ['A', 'B'], b: ['C', 'D'])
76
+ # df.merge(other)
77
+ #
78
+ # # =>
79
+ # x y a b
80
+ # <uint8> <uint8> <string> <string>
81
+ # 0 1 3 A C
82
+ # 1 2 4 B D
83
+ #
84
+ # @since 0.2.3
85
+ #
50
86
  def merge(*other)
51
87
  case other
52
88
  in [] | [nil] | [[]]
@@ -85,81 +121,225 @@ module RedAmber
85
121
 
86
122
  # Mutating joins (#inner_join, #full_join, #left_join, #right_join)
87
123
 
88
- # Join another DataFrame or Table, leaving only the matching records.
89
- # - Same as `#join` with `type: :inner`
90
- # - A kind of mutating join.
91
- #
92
124
  # @!macro join_before
93
125
  # @param other [DataFrame, Arrow::Table]
94
126
  # A DataFrame or a Table to be joined with self.
95
127
  #
128
+ # @!macro join_force_order
129
+ # @param force_order [Boolean]
130
+ # whether to force the order of the output to always be the same.
131
+ # - This option is used in `:full_outer` and `:right_outer`.
132
+ # - If this option is true (by default) it will append index to the source
133
+ # and sort after joining. It will cause some degradation in performance.
134
+ #
96
135
  # @!macro join_after
97
136
  # @param suffix [#succ]
98
137
  # a suffix to rename keys when key names conflict as a result of join.
99
138
  # `suffix` must respond to `#succ`.
100
139
  # @return [DataFrame]
101
- # Joined dataframe.
140
+ # joined dataframe.
102
141
  #
103
142
  # @!macro join_key_in_array
104
143
  # @param join_keys [String, Symbol, Array<String, Symbol>]
105
- # A key or keys to match.
144
+ # a key or keys to match.
106
145
  #
107
146
  # @!macro join_key_in_hash
108
147
  # @param join_key_pairs [Hash]
109
- # Pairs of a key name or key names to match in left and right.
148
+ # pairs of a key name or key names to match in left and right.
110
149
  # @option join_key_pairs [String, Symbol, Array<String, Symbol>] :left
111
- # Join keys in `self`.
150
+ # join keys in `self`.
112
151
  # @option join_key_pairs [String, Symbol, Array<String, Symbol>] :right
113
- # Join keys in `other`.
152
+ # join keys in `other`.
153
+ #
154
+ # @!macro join_common_example_1
155
+ # @example
156
+ # df = DataFrame.new(KEY: %w[A B C], X1: [1, 2, 3])
157
+ #
158
+ # # =>
159
+ # KEY X1
160
+ # <string> <uint8>
161
+ # 0 A 1
162
+ # 1 B 2
163
+ # 2 C 3
164
+ #
165
+ # other = DataFrame.new(KEY: %w[A B D], X2: [true, false, nil])
166
+ #
167
+ # # =>
168
+ # KEY X2
169
+ # <string> <boolean>
170
+ # 0 A true
171
+ # 1 B false
172
+ # 2 D (nil)
173
+ #
174
+ # @!macro join_common_example_2
175
+ # @example
176
+ # df2 = DataFrame.new(KEY1: %w[A B C], X1: [1, 2, 3])
177
+ #
178
+ # # =>
179
+ # KEY1 X1
180
+ # <string> <uint8>
181
+ # 0 A 1
182
+ # 1 B 2
183
+ # 2 C 3
184
+ #
185
+ # other2 = DataFrame.new(KEY2: %w[A B D], X2: [true, false, nil])
186
+ #
187
+ # # =>
188
+ # KEY2 X2
189
+ # <string> <boolean>
190
+ # 0 A true
191
+ # 1 B false
192
+ # 2 D (nil)
193
+ #
194
+ # @!macro join_common_example_3
195
+ # @example
196
+ # df3 = DataFrame.new(
197
+ # KEY1: %w[A B C],
198
+ # KEY2: [1, 2, 3]
199
+ # )
200
+ #
201
+ # # =>
202
+ # KEY1 KEY2
203
+ # <string> <uint8>
204
+ # 0 A 1
205
+ # 1 B 2
206
+ # 2 C 3
207
+ #
208
+ # other3 = DataFrame.new(
209
+ # KEY1: %w[A B D],
210
+ # KEY2: [1, 4, 5]
211
+ # )
212
+ #
213
+ # # =>
214
+ # KEY1 KEY2
215
+ # <string> <uint8>
216
+ # 0 A 1
217
+ # 1 B 4
218
+ # 2 D 5
219
+
220
+ # Join another DataFrame or Table, leaving only the matching records.
221
+ # - Same as `#join` with `type: :inner`
222
+ # - A kind of mutating join.
114
223
  #
115
- # @overload inner_join(other, suffix: '.1')
224
+ # @overload inner_join(other, suffix: '.1', force_order: true)
116
225
  # If `join_key` is not specified, common keys in self and other are used
117
226
  # (natural keys). Returns joined dataframe.
118
227
  #
119
228
  # @macro join_before
229
+ # @macro join_force_order
120
230
  # @macro join_after
231
+ # @macro join_common_example_1
232
+ # @example without key (use implicit common key)
233
+ # df.inner_join(other)
234
+ #
235
+ # # =>
236
+ # KEY X1 X2
237
+ # <string> <uint8> <boolean>
238
+ # 0 A 1 true
239
+ # 1 B 2 false
121
240
  #
122
- # @overload inner_join(other, join_keys, suffix: '.1')
241
+ # @overload inner_join(other, join_keys, suffix: '.1', force_order: true)
123
242
  #
124
243
  # @macro join_before
125
244
  # @macro join_key_in_array
245
+ # @macro join_force_order
126
246
  # @macro join_after
247
+ # @macro join_common_example_1
248
+ # @example with a key
249
+ # df.inner_join(other, :KEY)
127
250
  #
128
- # @overload inner_join(other, join_key_pairs, suffix: '.1')
251
+ # # =>
252
+ # KEY X1 X2
253
+ # <string> <uint8> <boolean>
254
+ # 0 A 1 true
255
+ # 1 B 2 false
256
+ #
257
+ # @overload inner_join(other, join_key_pairs, suffix: '.1', force_order: true)
129
258
  #
130
259
  # @macro join_before
131
260
  # @macro join_key_in_hash
261
+ # @macro join_force_order
132
262
  # @macro join_after
263
+ # @macro join_common_example_2
264
+ # @example with key pairs
265
+ # df2.inner_join(other2, { left: :KEY1, right: :KEY2 })
266
+ #
267
+ # # =>
268
+ # KEY1 X1 X2
269
+ # <string> <uint8> <boolean>
270
+ # 0 A 1 true
271
+ # 1 B 2 false
133
272
  #
134
- def inner_join(other, join_keys = nil, suffix: '.1')
135
- join(other, join_keys, type: :inner, suffix: suffix)
273
+ # @since 0.2.3
274
+ #
275
+ def inner_join(other, join_keys = nil, suffix: '.1', force_order: true)
276
+ join(other, join_keys, type: :inner, suffix: suffix, force_order: force_order)
136
277
  end
137
278
 
138
279
  # Join another DataFrame or Table, leaving all records.
139
280
  # - Same as `#join` with `type: :full_outer`
140
281
  # - A kind of mutating join.
141
282
  #
142
- # @overload full_join(other, suffix: '.1')
283
+ # @overload full_join(other, suffix: '.1', force_order: true)
143
284
  # If `join_key` is not specified, common keys in self and other are used
144
285
  # (natural keys). Returns joined dataframe.
145
286
  #
146
287
  # @macro join_before
288
+ # @macro join_force_order
147
289
  # @macro join_after
290
+ # @macro join_common_example_1
291
+ # @example without key (use implicit common key)
292
+ # df.full_join(other)
293
+ #
294
+ # # =>
295
+ # KEY X1 X2
296
+ # <string> <uint8> <boolean>
297
+ # 0 A 1 true
298
+ # 1 B 2 false
299
+ # 2 C 3 (nil)
300
+ # 3 D (nil) (nil)
148
301
  #
149
- # @overload full_join(other, join_keys, suffix: '.1')
302
+ # @overload full_join(other, join_keys, suffix: '.1', force_order: true)
150
303
  #
151
304
  # @macro join_before
152
305
  # @macro join_key_in_array
306
+ # @macro join_force_order
153
307
  # @macro join_after
308
+ # @macro join_common_example_1
309
+ # @example with a key
310
+ # df.full_join(other, :KEY)
311
+ #
312
+ # # =>
313
+ # KEY X1 X2
314
+ # <string> <uint8> <boolean>
315
+ # 0 A 1 true
316
+ # 1 B 2 false
317
+ # 2 C 3 (nil)
318
+ # 3 D (nil) (nil)
154
319
  #
155
- # @overload full_join(other, join_key_pairs, suffix: '.1')
320
+ # @overload full_join(other, join_key_pairs, suffix: '.1', force_order: true)
156
321
  #
157
322
  # @macro join_before
158
323
  # @macro join_key_in_hash
324
+ # @macro join_force_order
159
325
  # @macro join_after
160
- #
161
- def full_join(other, join_keys = nil, suffix: '.1')
162
- join(other, join_keys, type: :full_outer, suffix: suffix)
326
+ # @macro join_common_example_2
327
+ # @example with key pairs
328
+ # df2.full_join(other2, { left: :KEY1, right: :KEY2 })
329
+ #
330
+ # # =>
331
+ # KEY1 X1 X2
332
+ # <string> <uint8> <boolean>
333
+ # 0 A 1 true
334
+ # 1 B 2 false
335
+ # 2 C 3 (nil)
336
+ # 3 D (nil) (nil)
337
+ #
338
+ # @since 0.2.3
339
+ #
340
+ def full_join(other, join_keys = nil, suffix: '.1', force_order: true)
341
+ join(other, join_keys,
342
+ type: :full_outer, suffix: suffix, force_order: force_order)
163
343
  end
164
344
 
165
345
  alias_method :outer_join, :full_join
@@ -168,54 +348,130 @@ module RedAmber
168
348
  # - Same as `#join` with `type: :left_outer`
169
349
  # - A kind of mutating join.
170
350
  #
171
- # @overload left_join(other, suffix: '.1')
351
+ # @overload left_join(other, suffix: '.1', force_order: true)
172
352
  # If `join_key` is not specified, common keys in self and other are used
173
353
  # (natural keys). Returns joined dataframe.
174
354
  #
175
355
  # @macro join_before
356
+ # @macro join_force_order
176
357
  # @macro join_after
358
+ # @macro join_common_example_1
359
+ # @example without key (use implicit common key)
360
+ # df.left_join(other)
361
+ #
362
+ # # =>
363
+ # KEY X1 X2
364
+ # <string> <uint8> <boolean>
365
+ # 0 A 1 true
366
+ # 1 B 2 false
367
+ # 2 C 3 (nil)
177
368
  #
178
- # @overload left_join(other, join_keys, suffix: '.1')
369
+ # @overload left_join(other, join_keys, suffix: '.1', force_order: true)
179
370
  #
180
371
  # @macro join_before
181
372
  # @macro join_key_in_array
373
+ # @macro join_force_order
182
374
  # @macro join_after
375
+ # @macro join_common_example_1
376
+ # @example with a key
377
+ # df.left_join(other, :KEY)
378
+ #
379
+ # # =>
380
+ # KEY X1 X2
381
+ # <string> <uint8> <boolean>
382
+ # 0 A 1 true
383
+ # 1 B 2 false
384
+ # 2 C 3 (nil)
183
385
  #
184
- # @overload left_join(other, join_key_pairs, suffix: '.1')
386
+ # @overload left_join(other, join_key_pairs, suffix: '.1', force_order: true)
185
387
  #
186
388
  # @macro join_before
187
389
  # @macro join_key_in_hash
390
+ # @macro join_force_order
188
391
  # @macro join_after
392
+ # @macro join_common_example_2
393
+ # @example with key pairs
394
+ # df2.left_join(other2, { left: :KEY1, right: :KEY2 })
189
395
  #
190
- def left_join(other, join_keys = nil, suffix: '.1')
191
- join(other, join_keys, type: :left_outer, suffix: suffix)
396
+ # # =>
397
+ # KEY1 X1 X2
398
+ # <string> <uint8> <boolean>
399
+ # 0 A 1 true
400
+ # 1 B 2 false
401
+ # 2 C 3 (nil)
402
+ #
403
+ # @since 0.2.3
404
+ #
405
+ def left_join(other, join_keys = nil, suffix: '.1', force_order: true)
406
+ join(other, join_keys, type: :left_outer, suffix: suffix, force_order: force_order)
192
407
  end
193
408
 
194
409
  # Join matching values from self to other.
195
410
  # - Same as `#join` with `type: :right_outer`
196
411
  # - A kind of mutating join.
197
412
  #
198
- # @overload right_join(other, suffix: '.1')
413
+ # @overload right_join(other, suffix: '.1', force_order: true)
199
414
  # If `join_key` is not specified, common keys in self and other are used
200
415
  # (natural keys). Returns joined dataframe.
201
416
  #
202
417
  # @macro join_before
418
+ # @macro join_force_order
203
419
  # @macro join_after
420
+ # @macro join_common_example_1
421
+ # @example without key (use implicit common key)
422
+ # df.right_join(other)
423
+ #
424
+ # # =>
425
+ # KEY X1 X2
426
+ # <string> <uint8> <boolean>
427
+ # 0 A 1 true
428
+ # 1 B 2 false
429
+ # 2 D (nil) (nil)
204
430
  #
205
- # @overload right_join(other, join_keys, suffix: '.1')
431
+ # @overload right_join(other, join_keys, suffix: '.1', force_order: true)
206
432
  #
207
433
  # @macro join_before
208
434
  # @macro join_key_in_array
435
+ # @macro join_force_order
209
436
  # @macro join_after
437
+ # @macro join_common_example_1
438
+ # @example with a key
439
+ # df.right_join(other, :KEY)
210
440
  #
211
- # @overload right_join(other, join_key_pairs, suffix: '.1')
441
+ # # =>
442
+ # KEY X1 X2
443
+ # <string> <uint8> <boolean>
444
+ # 0 A 1 true
445
+ # 1 B 2 false
446
+ # 2 D (nil) (nil)
447
+ #
448
+ # @overload right_join(other, join_key_pairs, suffix: '.1', force_order: true)
212
449
  #
213
450
  # @macro join_before
214
451
  # @macro join_key_in_hash
452
+ # @macro join_force_order
215
453
  # @macro join_after
216
- #
217
- def right_join(other, join_keys = nil, suffix: '.1')
218
- join(other, join_keys, type: :right_outer, suffix: suffix)
454
+ # @macro join_common_example_2
455
+ # @example with key pairs
456
+ # df2.right_join(other2, { left: :KEY1, right: :KEY2 })
457
+ #
458
+ # # =>
459
+ # KEY1 X1 X2
460
+ # <string> <uint8> <boolean>
461
+ # 0 A 1 true
462
+ # 1 B 2 false
463
+ # 2 D (nil) (nil)
464
+ #
465
+ # @since 0.2.3
466
+ #
467
+ def right_join(other, join_keys = nil, suffix: '.1', force_order: true)
468
+ join(
469
+ other,
470
+ join_keys,
471
+ type: :right_outer,
472
+ suffix: suffix,
473
+ force_order: force_order
474
+ )
219
475
  end
220
476
 
221
477
  # Filtering joins (#semi_join, #anti_join)
@@ -224,54 +480,115 @@ module RedAmber
224
480
  # - Same as `#join` with `type: :left_semi`
225
481
  # - A kind of filtering join.
226
482
  #
227
- # @overload semi_join(other, suffix: '.1')
483
+ # @overload semi_join(other, suffix: '.1', force_order: true)
228
484
  # If `join_key` is not specified, common keys in self and other are used
229
485
  # (natural keys). Returns joined dataframe.
230
486
  #
231
487
  # @macro join_before
488
+ # @macro join_force_order
232
489
  # @macro join_after
490
+ # @macro join_common_example_1
491
+ # @example without key (use implicit common key)
492
+ # df.semi_join(other)
233
493
  #
234
- # @overload semi_join(other, join_keys, suffix: '.1')
494
+ # # =>
495
+ # KEY X1
496
+ # <string> <uint8>
497
+ # 0 A 1
498
+ # 1 B 2
499
+ #
500
+ # @overload semi_join(other, join_keys, suffix: '.1', force_order: true)
235
501
  #
236
502
  # @macro join_before
237
503
  # @macro join_key_in_array
504
+ # @macro join_force_order
238
505
  # @macro join_after
506
+ # @macro join_common_example_1
507
+ # @example with a key
508
+ # df.semi_join(other, :KEY)
509
+ #
510
+ # # =>
511
+ # KEY X1
512
+ # <string> <uint8>
513
+ # 0 A 1
514
+ # 1 B 2
239
515
  #
240
- # @overload semi_join(other, join_key_pairs, suffix: '.1')
516
+ # @overload semi_join(other, join_key_pairs, suffix: '.1', force_order: true)
241
517
  #
242
518
  # @macro join_before
243
519
  # @macro join_key_in_hash
520
+ # @macro join_force_order
244
521
  # @macro join_after
522
+ # @macro join_common_example_2
523
+ # @example with key pairs
524
+ # df2.semi_join(other2, { left: :KEY1, right: :KEY2 })
525
+ #
526
+ # # =>
527
+ # KEY1 X1
528
+ # <string> <uint8>
529
+ # 0 A 1
530
+ # 1 B 2
245
531
  #
246
- def semi_join(other, join_keys = nil, suffix: '.1')
247
- join(other, join_keys, type: :left_semi, suffix: suffix)
532
+ # @since 0.2.3
533
+ #
534
+ def semi_join(other, join_keys = nil, suffix: '.1', force_order: true)
535
+ join(other, join_keys, type: :left_semi, suffix: suffix, force_order: force_order)
248
536
  end
249
537
 
250
538
  # Return records of self that do not have a match in other.
251
539
  # - Same as `#join` with `type: :left_anti`
252
540
  # - A kind of filtering join.
253
541
  #
254
- # @overload anti_join(other, suffix: '.1')
542
+ # @overload anti_join(other, suffix: '.1', force_order: true)
255
543
  # If `join_key` is not specified, common keys in self and other are used
256
544
  # (natural keys). Returns joined dataframe.
257
545
  #
258
546
  # @macro join_before
547
+ # @macro join_force_order
259
548
  # @macro join_after
549
+ # @macro join_common_example_1
550
+ # @example without key (use implicit common key)
551
+ # df.anti_join(other)
552
+ #
553
+ # # =>
554
+ # KEY X1
555
+ # <string> <uint8>
556
+ # 0 C 3
260
557
  #
261
- # @overload anti_join(other, join_keys, suffix: '.1')
558
+ # @overload anti_join(other, join_keys, suffix: '.1', force_order: true)
262
559
  #
263
560
  # @macro join_before
264
561
  # @macro join_key_in_array
562
+ # @macro join_force_order
265
563
  # @macro join_after
564
+ # @macro join_common_example_1
565
+ # @example with a key
566
+ # df.anti_join(other, :KEY)
266
567
  #
267
- # @overload anti_join(other, join_key_pairs, suffix: '.1')
568
+ # # =>
569
+ # KEY X1
570
+ # <string> <uint8>
571
+ # 0 C 3
572
+ #
573
+ # @overload anti_join(other, join_key_pairs, suffix: '.1', force_order: true)
268
574
  #
269
575
  # @macro join_before
270
576
  # @macro join_key_in_hash
577
+ # @macro join_force_order
271
578
  # @macro join_after
579
+ # @macro join_common_example_2
580
+ # @example with key pairs
581
+ # df2.anti_join(other2, { left: :KEY1, right: :KEY2 })
582
+ #
583
+ # # =>
584
+ # KEY1 X1
585
+ # <string> <uint8>
586
+ # 0 C 3
587
+ #
588
+ # @since 0.2.3
272
589
  #
273
- def anti_join(other, join_keys = nil, suffix: '.1')
274
- join(other, join_keys, type: :left_anti, suffix: suffix)
590
+ def anti_join(other, join_keys = nil, suffix: '.1', force_order: true)
591
+ join(other, join_keys, type: :left_anti, suffix: suffix, force_order: force_order)
275
592
  end
276
593
 
277
594
  # Set operations (#intersect, #union, #difference, #set_operable?)
@@ -279,8 +596,13 @@ module RedAmber
279
596
  # Check if set operation with self and other is possible.
280
597
  #
281
598
  # @macro join_before
599
+ # @return [Boolean]
600
+ # true if set operation is possible.
601
+ # @macro join_common_example_3
602
+ # @example
603
+ # df3.set_operable?(other3) # => true
282
604
  #
283
- # @return [Boolean] true if set operation is possible.
605
+ # @since 0.2.3
284
606
  #
285
607
  def set_operable?(other) # rubocop:disable Naming/AccessorMethodName
286
608
  keys == other.keys.map(&:to_sym)
@@ -291,8 +613,18 @@ module RedAmber
291
613
  # - A kind of set operations.
292
614
  #
293
615
  # @macro join_before
616
+ # @return [DataFrame]
617
+ # joined dataframe.
618
+ # @macro join_common_example_3
619
+ # @example
620
+ # df3.intersect(other3)
621
+ #
622
+ # # =>
623
+ # KEY1 KEY2
624
+ # <string> <uint8>
625
+ # 0 A 1
294
626
  #
295
- # @return [DataFrame] Joined dataframe.
627
+ # @since 0.2.3
296
628
  #
297
629
  def intersect(other)
298
630
  unless keys == other.keys.map(&:to_sym)
@@ -307,8 +639,22 @@ module RedAmber
307
639
  # - A kind of set operations.
308
640
  #
309
641
  # @macro join_before
310
- #
311
- # @return [DataFrame] Joined dataframe.
642
+ # @return [DataFrame]
643
+ # joined dataframe.
644
+ # @macro join_common_example_3
645
+ # @example
646
+ # df3.union(other3)
647
+ #
648
+ # # =>
649
+ # KEY1 KEY2
650
+ # <string> <uint8>
651
+ # 0 A 1
652
+ # 1 B 2
653
+ # 2 C 3
654
+ # 3 B 4
655
+ # 4 D 5
656
+ #
657
+ # @since 0.2.3
312
658
  #
313
659
  def union(other)
314
660
  unless keys == other.keys.map(&:to_sym)
@@ -323,8 +669,27 @@ module RedAmber
323
669
  # - A kind of set operations.
324
670
  #
325
671
  # @macro join_before
672
+ # @return [DataFrame]
673
+ # joined dataframe.
674
+ # @macro join_common_example_3
675
+ # @example
676
+ # df3.difference(other3)
677
+ #
678
+ # # =>
679
+ # KEY1 KEY2
680
+ # <string> <uint8>
681
+ # 0 B 2
682
+ # 1 C 3
683
+ #
684
+ # other3.difference(df3)
685
+ #
686
+ # # =>
687
+ # KEY1 KEY2
688
+ # <string> <uint8>
689
+ # 0 B 4
690
+ # 1 D 5
326
691
  #
327
- # @return [DataFrame] Joined dataframe.
692
+ # @since 0.2.3
328
693
  #
329
694
  def difference(other)
330
695
  unless keys == other.keys.map(&:to_sym)
@@ -338,60 +703,167 @@ module RedAmber
338
703
 
339
704
  # Join another DataFrame or Table to self.
340
705
  #
341
- # @overload join(other, type: :inner, suffix: '.1')
706
+ # @!macro join_common_type
707
+ # @param type [:left_semi, :right_semi, :left_anti, :right_anti, :inner,
708
+ # :left_outer, :right_outer, :full_outer] type of join.
709
+ #
710
+ # @!macro join_common_example_4
711
+ # @example
712
+ # df4 = DataFrame.new(
713
+ # X1: %w[A B C],
714
+ # Y: %w[D E F]
715
+ # )
716
+ #
717
+ # # =>
718
+ # X1 Y
719
+ # <string> <string>
720
+ # 0 A D
721
+ # 1 B E
722
+ # 2 C F
723
+ #
724
+ # other4 = DataFrame.new(
725
+ # X2: %w[A B D],
726
+ # Y: %w[e E E]
727
+ # )
728
+ #
729
+ # # =>
730
+ # X2 Y
730
+ # <string> <string>
731
+ # 0 A e
732
+ # 1 B E
733
+ # 2 D E
735
+
736
+ # @note the order of joined results will be preserved by default.
737
+ # This is enabled by appending index column to sort after joining but
738
+ # it will cause some performance degradation. If you don't care about
739
+ # the order of the result, set `force_order` option to `false`.
740
+ #
741
+ # @overload join(other, type: :inner, suffix: '.1', force_order: true)
342
742
  #
343
743
  # If `join_key` is not specified, common keys in self and other are used
344
744
  # (natural keys). Returns joined dataframe.
345
745
  #
346
- # @!macro join_common_type
347
- # @param type [:left_semi, :right_semi, :left_anti, :right_anti, :inner,
348
- # left_outer, :right_outer, :full_outer] type of join.
349
- #
350
746
  # @macro join_before
351
747
  # @macro join_common_type
748
+ # @macro join_force_order
352
749
  # @macro join_after
750
+ # @macro join_common_example_1
751
+ # @example
752
+ # df.join(other)
753
+ #
754
+ # # =>
755
+ # KEY X1 X2
756
+ # <string> <uint8> <boolean>
757
+ # 0 A 1 true
758
+ # 1 B 2 false
759
+ #
760
+ # df.join(other, type: :full_outer)
353
761
  #
354
- # @overload join(other, join_keys, type: :inner, suffix: '.1')
762
+ # # =>
763
+ # KEY X1 X2
764
+ # <string> <uint8> <boolean>
765
+ # 0 A 1 true
766
+ # 1 B 2 false
767
+ # 2 C 3 (nil)
768
+ # 3 D (nil) (nil)
769
+ #
770
+ # @overload join(other, join_keys, type: :inner, suffix: '.1', force_order: true)
355
771
  #
356
772
  # @macro join_before
357
773
  # @macro join_key_in_array
358
774
  # @macro join_common_type
775
+ # @macro join_force_order
359
776
  # @macro join_after
777
+ # @macro join_common_example_3
778
+ # @example join keys in an Array
779
+ # df3.join(other3, [:KEY1, :KEY2])
780
+ #
781
+ # # =>
782
+ # KEY1 KEY2
783
+ # <string> <uint8>
784
+ # 0 A 1
360
785
  #
361
- # @overload join(other, join_key_pairs, type: :inner, suffix: '.1')
786
+ # @example partial join key and suffix
787
+ # df3.join(other3, :KEY1, suffix: '.a')
788
+ #
789
+ # # =>
790
+ # KEY1 KEY2 KEY2.a
791
+ # <string> <uint8> <uint8>
792
+ # 0 A 1 1
793
+ # 1 B 2 4
794
+ #
795
+ # @overload join(other, join_key_pairs, type: :inner, suffix: '.1', force_order: true)
362
796
  #
363
797
  # @macro join_before
364
798
  # @macro join_key_in_hash
365
799
  # @macro join_common_type
800
+ # @macro join_force_order
366
801
  # @macro join_after
367
- #
368
- def join(other, join_keys = nil, type: :inner, suffix: '.1')
369
- case other
370
- when DataFrame
371
- other = other.table
372
- when Arrow::Table
373
- # Nop
802
+ # @macro join_common_example_4
803
+ # @example without options
804
+ # df4.join(other4)
805
+ #
806
+ # # =>
807
+ # X1 Y X2
808
+ # <string> <string> <string>
809
+ # 0 B E D
810
+ # 1 B E B
811
+ #
812
+ # @example join by key pairs
813
+ # df4.join(other4, { left: [:X1, :Y], right: [:X2, :Y] })
814
+ #
815
+ # # =>
816
+ # X1 Y
817
+ # <string> <string>
818
+ # 0 B E
819
+ #
820
+ # @example join by key pairs, using renaming by suffix
821
+ # df4.join(other4, { left: :X1, right: :X2 })
822
+ #
823
+ # # =>
824
+ # X1 Y Y.1
825
+ # <string> <string> <string>
826
+ # 0 A D e
827
+ # 1 B E E
828
+ #
829
+ # @since 0.2.3
830
+ #
831
+ def join(other, join_keys = nil, type: :inner, suffix: '.1', force_order: true)
832
+ right_table =
833
+ case other
834
+ when DataFrame
835
+ other.table
836
+ when Arrow::Table
837
+ other
838
+ else
839
+ raise DataFrameArgumentError, 'other must be a DataFrame or an Arrow::Table'
840
+ end
841
+
842
+ type = type.to_sym
843
+ left_index = :__LEFT_INDEX__
844
+ right_index = :__RIGHT_INDEX__
845
+ if force_order
846
+ left_table = assign(left_index) { indices }.table
847
+ other = DataFrame.create(other) if other.is_a?(Arrow::Table)
848
+ right_table = other.assign(right_index) { indices }.table
374
849
  else
375
- raise DataFrameArgumentError, 'other must be a DataFrame or an Arrow::Table'
850
+ left_table = table
376
851
  end
377
852
 
378
- table_keys = table.keys
379
- other_keys = other.keys
380
- type = type.to_sym
853
+ table_keys = left_table.keys
854
+ other_keys = right_table.keys
381
855
 
382
856
  # natural keys (implicit common keys)
383
857
  join_keys ||= table_keys.intersection(other_keys)
384
858
 
385
859
  # This is not necessary if additional procedure is contributed to Red Arrow.
386
860
  if join_keys.is_a?(Hash)
387
- left_keys = join_keys[:left]
388
- right_keys = join_keys[:right]
861
+ left_keys = ensure_keys(join_keys[:left])
862
+ right_keys = ensure_keys(join_keys[:right])
389
863
  else
390
- left_keys = join_keys
391
- right_keys = join_keys
864
+ left_keys = ensure_keys(join_keys)
865
+ right_keys = left_keys
392
866
  end
393
- left_keys = Array(left_keys).map(&:to_s)
394
- right_keys = Array(right_keys).map(&:to_s)
395
867
 
396
868
  case type
397
869
  when :full_outer, :left_semi, :left_anti, :right_semi, :right_anti
@@ -407,43 +879,73 @@ module RedAmber
407
879
 
408
880
  # Should we rescue errors in Arrow::Table#join for usability ?
409
881
  joined_table =
410
- table.join(other, join_keys,
411
- type: type,
412
- left_outputs: left_outputs,
413
- right_outputs: right_outputs)
882
+ left_table.join(
883
+ right_table,
884
+ join_keys,
885
+ type: type,
886
+ left_outputs: left_outputs,
887
+ right_outputs: right_outputs
888
+ )
414
889
 
415
890
  case type
416
891
  when :inner, :left_outer, :left_semi, :left_anti, :right_semi, :right_anti
417
- if joined_table.keys.uniq!
418
- DataFrame.create(rename_table(joined_table, n_keys, suffix))
419
- else
420
- DataFrame.create(joined_table)
421
- end
892
+ dataframe =
893
+ if joined_table.keys.uniq!
894
+ DataFrame.create(rename_table(joined_table, n_keys, suffix))
895
+ else
896
+ DataFrame.create(joined_table)
897
+ end
898
+ sorter =
899
+ case type
900
+ when :inner, :left_outer
901
+ [left_index, right_index]
902
+ when :left_semi, :left_anti
903
+ [left_index]
904
+ when :right_semi, :right_anti
905
+ [right_index]
906
+ end
422
907
  when :full_outer
908
+ key_index_lr =
909
+ left_keys.map { left_table.keys.index(_1) }
910
+ .zip(right_keys.map { left_table.keys.size + right_table.keys.index(_1) })
423
911
  renamed_table = rename_table(joined_table, n_keys, suffix)
424
- renamed_keys = renamed_table.keys
425
912
  dropper = []
426
- DataFrame.create(renamed_table).assign do |df|
427
- left_keys.map do |left_key|
428
- i_left_key = renamed_keys.index(left_key)
429
- right_key = renamed_keys[i_left_key + table_keys.size]
430
- dropper << right_key
431
- [left_key.to_sym, merge_array(df[left_key].data, df[right_key].data)]
913
+ dataframe =
914
+ DataFrame.create(renamed_table).assign do |df|
915
+ key_index_lr.map do |l, r|
916
+ dropper << df.keys[r]
917
+ [df.keys[l], merge_array(df.vectors[l].data, df.vectors[r].data)]
918
+ end
432
919
  end
433
- end.drop(dropper)
920
+ dataframe = dataframe.drop(dropper)
921
+ sorter = [left_index, right_index]
434
922
  when :right_outer
435
- if joined_table.keys.uniq!
436
- DataFrame.create(rename_table(joined_table, left_outputs.size, suffix))
437
- else
438
- DataFrame.create(joined_table)
439
- end.pick do
440
- [right_keys, keys.map(&:to_s) - right_keys]
441
- end
923
+ dataframe =
924
+ if joined_table.keys.uniq!
925
+ DataFrame.create(rename_table(joined_table, left_outputs.size, suffix))
926
+ else
927
+ DataFrame.create(joined_table)
928
+ end
929
+ dataframe = dataframe.pick(right_keys, dataframe.keys - right_keys)
930
+ sorter = [left_index, right_index]
931
+ end
932
+
933
+ if force_order
934
+ dataframe
935
+ .sort(sorter)
936
+ .drop(sorter)
937
+ else
938
+ dataframe
442
939
  end
443
940
  end
444
941
 
445
942
  private
446
943
 
944
+ # To ensure Array of Symbols
945
+ def ensure_keys(keys)
946
+ Array(keys).map(&:to_sym)
947
+ end
948
+
447
949
  # Rename duplicate keys by suffix
448
950
  def rename_table(joined_table, n_keys, suffix)
449
951
  joined_keys = joined_table.keys
@@ -453,17 +955,9 @@ module RedAmber
453
955
  renamed_right_keys =
454
956
  other_keys.map do |key|
455
957
  if dup_keys.include?(key)
456
- new_key = nil
457
- loop do
458
- new_key = "#{key}#{suffix}"
459
- break unless joined_keys.include?(new_key)
460
-
461
- s = suffix.succ
462
- raise DataFrameArgumentError, "suffix #{suffix} is invalid" if s == suffix
463
-
464
- suffix = s
465
- end
466
- new_key
958
+ suffixed = "#{key}#{suffix}".to_sym
959
+ # Find the first unused key, starting at suffixed and advancing by #succ
960
+ (suffixed..).find { !joined_keys.include?(_1) }
467
961
  else
468
962
  key
469
963
  end