red_amber 0.2.3 → 0.4.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (42) hide show
  1. checksums.yaml +4 -4
  2. data/.rubocop.yml +133 -51
  3. data/.yardopts +2 -0
  4. data/CHANGELOG.md +203 -1
  5. data/Gemfile +2 -1
  6. data/LICENSE +1 -1
  7. data/README.md +61 -45
  8. data/benchmark/basic.yml +11 -4
  9. data/benchmark/combine.yml +3 -4
  10. data/benchmark/dataframe.yml +62 -0
  11. data/benchmark/group.yml +7 -1
  12. data/benchmark/reshape.yml +6 -2
  13. data/benchmark/vector.yml +63 -0
  14. data/doc/DataFrame.md +35 -12
  15. data/doc/DataFrame_Comparison.md +65 -0
  16. data/doc/SubFrames.md +11 -0
  17. data/doc/Vector.md +295 -1
  18. data/doc/yard-templates/default/fulldoc/html/css/common.css +6 -0
  19. data/lib/red_amber/data_frame.rb +537 -68
  20. data/lib/red_amber/data_frame_combinable.rb +776 -123
  21. data/lib/red_amber/data_frame_displayable.rb +248 -18
  22. data/lib/red_amber/data_frame_indexable.rb +122 -19
  23. data/lib/red_amber/data_frame_loadsave.rb +81 -10
  24. data/lib/red_amber/data_frame_reshaping.rb +216 -21
  25. data/lib/red_amber/data_frame_selectable.rb +781 -120
  26. data/lib/red_amber/data_frame_variable_operation.rb +561 -85
  27. data/lib/red_amber/group.rb +195 -21
  28. data/lib/red_amber/helper.rb +114 -32
  29. data/lib/red_amber/refinements.rb +206 -0
  30. data/lib/red_amber/subframes.rb +1066 -0
  31. data/lib/red_amber/vector.rb +435 -58
  32. data/lib/red_amber/vector_aggregation.rb +312 -0
  33. data/lib/red_amber/vector_binary_element_wise.rb +387 -0
  34. data/lib/red_amber/vector_selectable.rb +321 -69
  35. data/lib/red_amber/vector_unary_element_wise.rb +436 -0
  36. data/lib/red_amber/vector_updatable.rb +397 -24
  37. data/lib/red_amber/version.rb +2 -1
  38. data/lib/red_amber.rb +15 -1
  39. data/red_amber.gemspec +4 -3
  40. metadata +19 -11
  41. data/doc/image/dataframe/reshaping_DataFrames.png +0 -0
  42. data/lib/red_amber/vector_functions.rb +0 -294
@@ -1,14 +1,36 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module RedAmber
4
- # mix-in for the class DataFrame
4
+ # Mix-in for the class DataFrame
5
5
  module DataFrameCombinable
6
- # Concatenate other dataframe onto the bottom.
6
+ # Refinements for Arrow::Table
7
+ using RefineArrowTable
8
+
9
+ # Concatenate other dataframes or tables onto the bottom of self.
7
10
  #
11
+ # @note the `#types` must be same as `other#types`.
8
12
  # @param other [DataFrame, Arrow::Table, Array<DataFrame, Arrow::Table>]
9
- # DataFrame/Table to concatenate onto the bottom of self.
13
+ # DataFrames or Tables to concatenate.
10
14
  # @return [DataFrame]
11
- # Concatenated dataframe.
15
+ # concatenated dataframe.
16
+ # @example
17
+ # df = DataFrame.new(x: [1, 2], y: ['A', 'B'])
18
+ # other = DataFrame.new(x: [3, 4], y: ['C', 'D'])
19
+ # [df.types, other.types]
20
+ #
21
+ # # =>
22
+ # [[:uint8, :string], [:uint8, :string]]
23
+ #
24
+ # df.concatenate(other)
25
+ #
26
+ # # =>
27
+ # x y
28
+ # <uint8> <string>
29
+ # 0 1 A
30
+ # 1 2 B
31
+ # 2 3 C
32
+ # 3 4 D
33
+ #
12
34
  def concatenate(*other)
13
35
  case other
14
36
  in [] | [nil] | [[]]
@@ -30,20 +52,33 @@ module RedAmber
30
52
  end
31
53
  end
32
54
 
33
- DataFrame.new(table.concatenate(table_array))
55
+ DataFrame.create(table.concatenate(table_array))
34
56
  end
35
57
 
36
58
  alias_method :concat, :concatenate
37
59
  alias_method :bind_rows, :concatenate
38
60
 
39
- # Merge other DataFrame or Table from other.
40
- # - Self and other must have same size.
41
- # - Self and other do not share the same key.
42
- # - If they share any keys, raise Error.
61
+ # Merge other DataFrames or Tables.
62
+ #
63
+ # @note the `#size` must be same as `other#size`.
64
+ # @note self and other must not share the same key.
43
65
  # @param other [DataFrame, Arrow::Table, Array<DataFrame, Arrow::Table>]
44
- # DataFrame/Table to concatenate.
66
+ # DataFrames or Tables to merge.
67
+ # @raise [DataFrameArgumentError]
68
+ # if size is not same or self and other share the same key.
45
69
  # @return [DataFrame]
46
- # Merged dataframe.
70
+ # merged dataframe.
71
+ # @example
72
+ # df = DataFrame.new(x: [1, 2], y: [3, 4])
73
+ # other = DataFrame.new(a: ['A', 'B'], b: ['C', 'D'])
74
+ # df.merge(other)
75
+ #
76
+ # # =>
77
+ # x y a b
78
+ # <uint8> <uint8> <string> <string>
79
+ # 0 1 3 A C
80
+ # 1 2 4 B D
81
+ #
47
82
  def merge(*other)
48
83
  case other
49
84
  in [] | [nil] | [[]]
@@ -58,14 +93,16 @@ module RedAmber
58
93
  df =
59
94
  case e
60
95
  when Arrow::Table
61
- DataFrame.new(e)
96
+ DataFrame.create(e)
62
97
  when DataFrame
63
98
  e
64
99
  else
65
100
  raise DataFrameArgumentError, "#{e} is not a Table or a DataFrame"
66
101
  end
67
102
 
68
- raise DataFrameArgumentError, "#{e} do not have same size as self" if size != df.size
103
+ if size != df.size
104
+ raise DataFrameArgumentError, "#{e} do not have same size as self"
105
+ end
69
106
 
70
107
  k = keys.intersection(df.keys).any?
71
108
  raise DataFrameArgumentError, "There are some shared keys: #{k}" if k
@@ -78,206 +115,822 @@ module RedAmber
78
115
 
79
116
  alias_method :bind_cols, :merge
80
117
 
81
- # Mutating joins
118
+ # Mutating joins (#inner_join, #full_join, #left_join, #right_join)
82
119
 
83
- # Join data, leaving only the matching records.
120
+ # @!macro join_before
121
+ # @param other [DataFrame, Arrow::Table]
122
+ # A DataFrame or a Table to be joined with self.
123
+ #
124
+ # @!macro join_dorce_order
125
+ # @param force_order [Boolean]
126
+ # whether to force the order of the output to always be the same.
127
+ # - This option is used in `:full_outer` and `:right_outer`.
128
+ # - If this option is true (by default) it will append index to the source
129
+ # and sort after joining. It will cause some degradation in performance.
130
+ #
131
+ # @!macro join_after
132
+ # @param suffix [#succ]
133
+ # a suffix to rename keys when key names conflict as a result of join.
134
+ # `suffix` must respond to `#succ`.
135
+ # @return [DataFrame]
136
+ # joined dataframe.
137
+ #
138
+ # @!macro join_key_in_array
139
+ # @param join_keys [String, Symbol, Array<String, Symbol>]
140
+ # a key or keys to match.
141
+ #
142
+ # @!macro join_key_in_hash
143
+ # @param join_key_pairs [Hash]
144
+ # pairs of a key name or key names to match in left and right.
145
+ # @option join_key_pairs [String, Symbol, Array<String, Symbol>] :left
146
+ # join keys in `self`.
147
+ # @option join_key_pairs [String, Symbol, Array<String, Symbol>] :right
148
+ # join keys in `other`.
149
+ #
150
+ # @!macro join_common_example_1
151
+ # @example
152
+ # df = DataFrame.new(KEY: %w[A B C], X1: [1, 2, 3])
153
+ #
154
+ # # =>
155
+ # KEY X1
156
+ # <string> <uint8>
157
+ # 0 A 1
158
+ # 1 B 2
159
+ # 2 C 3
160
+ #
161
+ # other = DataFrame.new(KEY: %w[A B D], X2: [true, false, nil])
162
+ #
163
+ # # =>
164
+ # KEY X2
165
+ # <string> <boolean>
166
+ # 0 A true
167
+ # 1 B false
168
+ # 2 D (nil)
169
+ #
170
+ # @!macro join_common_example_2
171
+ # @example
172
+ # df2 = DataFrame.new(KEY1: %w[A B C], X1: [1, 2, 3])
84
173
  #
85
- # @param other [DataFrame, Arrow::Table] DataFrame/Table to be joined with self.
86
- # @param join_keys [String, Symbol, ::Array<String, Symbol>] Keys to match.
87
- # @return [DataFrame] Joined dataframe.
174
+ # # =>
175
+ # KEY1 X1
176
+ # <string> <uint8>
177
+ # 0 A 1
178
+ # 1 B 2
179
+ # 2 C 3
180
+ #
181
+ # other2 = DataFrame.new(KEY2: %w[A B D], X2: [true, false, nil])
182
+ #
183
+ # # =>
184
+ # KEY2 X2
185
+ # <string> <boolean>
186
+ # 0 A true
187
+ # 1 B false
188
+ # 2 D (nil)
189
+ #
190
+ # @!macro join_common_example_3
191
+ # @example
192
+ # df3 = DataFrame.new(
193
+ # KEY1: %w[A B C],
194
+ # KEY2: [1, 2, 3]
195
+ # )
196
+ #
197
+ # # =>
198
+ # KEY1 KEY2
199
+ # <string> <uint8>
200
+ # 0 A 1
201
+ # 1 B 2
202
+ # 2 C 3
203
+ #
204
+ # other3 = DataFrame.new(
205
+ # KEY1: %w[A B D],
206
+ # KEY2: [1, 4, 5]
207
+ # )
208
+ #
209
+ # # =>
210
+ # KEY1 KEY2
211
+ # <string> <uint8>
212
+ # 0 A 1
213
+ # 1 B 4
214
+ # 2 D 5
215
+
216
+ # Join another DataFrame or Table, leaving only the matching records.
217
+ # - Same as `#join` with `type: :inner`
218
+ # - A kind of mutating join.
219
+ #
220
+ # @overload inner_join(other, suffix: '.1')
221
+ # If `join_key` is not specified, common keys in self and other are used
222
+ # (natural keys). Returns joined dataframe.
223
+ #
224
+ # @macro join_before
225
+ # @macro join_after
226
+ # @macro join_common_example_1
227
+ # @example without key (use implicit common key)
228
+ # df.inner_join(other)
229
+ #
230
+ # # =>
231
+ # KEY X1 X2
232
+ # <string> <uint8> <boolean>
233
+ # 0 A 1 true
234
+ # 1 B 2 false
235
+ #
236
+ # @overload inner_join(other, join_keys, suffix: '.1')
237
+ #
238
+ # @macro join_before
239
+ # @macro join_key_in_array
240
+ # @macro join_after
241
+ # @macro join_common_example_1
242
+ # @example with a key
243
+ # df.inner_join(other, :KEY)
244
+ #
245
+ # # =>
246
+ # KEY X1 X2
247
+ # <string> <uint8> <boolean>
248
+ # 0 A 1 true
249
+ # 1 B 2 false
250
+ #
251
+ # @overload inner_join(other, join_key_pairs, suffix: '.1')
252
+ #
253
+ # @macro join_before
254
+ # @macro join_key_in_hash
255
+ # @macro join_after
256
+ # @macro join_common_example_2
257
+ # @example with key pairs
258
+ # df2.inner_join(other2, { left: :KEY1, right: :KEY2 })
259
+ #
260
+ # # =>
261
+ # KEY1 X1 X2
262
+ # <string> <uint8> <boolean>
263
+ # 0 A 1 true
264
+ # 1 B 2 false
88
265
  #
89
266
  def inner_join(other, join_keys = nil, suffix: '.1')
90
267
  join(other, join_keys, type: :inner, suffix: suffix)
91
268
  end
92
269
 
93
- # Join data, leaving all records.
270
+ # Join another DataFrame or Table, leaving all records.
271
+ # - Same as `#join` with `type: :full_outer`
272
+ # - A kind of mutating join.
273
+ #
274
+ # @overload full_join(other, suffix: '.1', force_order: true)
275
+ # If `join_key` is not specified, common keys in self and other are used
276
+ # (natural keys). Returns joined dataframe.
277
+ #
278
+ # @macro join_before
279
+ # @macro join_dorce_order
280
+ # @macro join_after
281
+ # @macro join_common_example_1
282
+ # @example without key (use implicit common key)
283
+ # df.full_join(other)
284
+ #
285
+ # # =>
286
+ # KEY X1 X2
287
+ # <string> <uint8> <boolean>
288
+ # 0 A 1 true
289
+ # 1 B 2 false
290
+ # 2 C 3 (nil)
291
+ # 3 D (nil) (nil)
94
292
  #
95
- # @param other [DataFrame, Arrow::Table] DataFrame/Table to be joined with self.
96
- # @param join_keys [String, Symbol, ::Array<String, Symbol>] Keys to match.
97
- # @return [DataFrame] Joined dataframe.
293
+ # @overload full_join(other, join_keys, suffix: '.1', force_order: true)
98
294
  #
99
- def full_join(other, join_keys = nil, suffix: '.1')
100
- join(other, join_keys, type: :full_outer, suffix: suffix)
295
+ # @macro join_before
296
+ # @macro join_key_in_array
297
+ # @macro join_dorce_order
298
+ # @macro join_after
299
+ # @macro join_common_example_1
300
+ # @example with a key
301
+ # df.full_join(other, :KEY)
302
+ #
303
+ # # =>
304
+ # KEY X1 X2
305
+ # <string> <uint8> <boolean>
306
+ # 0 A 1 true
307
+ # 1 B 2 false
308
+ # 2 C 3 (nil)
309
+ # 3 D (nil) (nil)
310
+ #
311
+ # @overload full_join(other, join_key_pairs, suffix: '.1', force_order: true)
312
+ #
313
+ # @macro join_before
314
+ # @macro join_key_in_hash
315
+ # @macro join_dorce_order
316
+ # @macro join_after
317
+ # @macro join_common_example_2
318
+ # @example with key pairs
319
+ # df2.full_join(other2, { left: :KEY1, right: :KEY2 })
320
+ #
321
+ # # =>
322
+ # KEY1 X1 X2
323
+ # <string> <uint8> <boolean>
324
+ # 0 A 1 true
325
+ # 1 B 2 false
326
+ # 2 C 3 (nil)
327
+ # 3 D (nil) (nil)
328
+ #
329
+ def full_join(other, join_keys = nil, suffix: '.1', force_order: true)
330
+ join(other, join_keys,
331
+ type: :full_outer, suffix: suffix, force_order: force_order)
101
332
  end
102
333
 
103
334
  alias_method :outer_join, :full_join
104
335
 
105
336
  # Join matching values to self from other.
337
+ # - Same as `#join` with `type: :left_outer`
338
+ # - A kind of mutating join.
339
+ #
340
+ # @overload left_join(other, suffix: '.1')
341
+ # If `join_key` is not specified, common keys in self and other are used
342
+ # (natural keys). Returns joined dataframe.
343
+ #
344
+ # @macro join_before
345
+ # @macro join_after
346
+ # @macro join_common_example_1
347
+ # @example without key (use implicit common key)
348
+ # df.left_join(other)
349
+ #
350
+ # # =>
351
+ # KEY X1 X2
352
+ # <string> <uint8> <boolean>
353
+ # 0 A 1 true
354
+ # 1 B 2 false
355
+ # 2 C 3 (nil)
356
+ #
357
+ # @overload left_join(other, join_keys, suffix: '.1')
106
358
  #
107
- # @param other [DataFrame, Arrow::Table] DataFrame/Table to be joined with self.
108
- # @param join_keys [String, Symbol, ::Array<String, Symbol>] Keys to match.
109
- # @return [DataFrame] Joined dataframe.
359
+ # @macro join_before
360
+ # @macro join_key_in_array
361
+ # @macro join_after
362
+ # @macro join_common_example_1
363
+ # @example with a key
364
+ # df.left_join(other, :KEY)
365
+ #
366
+ # # =>
367
+ # KEY X1 X2
368
+ # <string> <uint8> <boolean>
369
+ # 0 A 1 true
370
+ # 1 B 2 false
371
+ # 2 C 3 (nil)
372
+ #
373
+ # @overload left_join(other, join_key_pairs, suffix: '.1')
374
+ #
375
+ # @macro join_before
376
+ # @macro join_key_in_hash
377
+ # @macro join_after
378
+ # @macro join_common_example_2
379
+ # @example with key pairs
380
+ # df2.left_join(other2, { left: :KEY1, right: :KEY2 })
381
+ #
382
+ # # =>
383
+ # KEY1 X1 X2
384
+ # <string> <uint8> <boolean>
385
+ # 0 A 1 true
386
+ # 1 B 2 false
387
+ # 2 C 3 (nil)
110
388
  #
111
389
  def left_join(other, join_keys = nil, suffix: '.1')
112
390
  join(other, join_keys, type: :left_outer, suffix: suffix)
113
391
  end
114
392
 
115
393
  # Join matching values from self to other.
394
+ # - Same as `#join` with `type: :right_outer`
395
+ # - A kind of mutating join.
396
+ #
397
+ # @overload right_join(other, suffix: '.1', force_order: true)
398
+ # If `join_key` is not specified, common keys in self and other are used
399
+ # (natural keys). Returns joined dataframe.
400
+ #
401
+ # @macro join_before
402
+ # @macro join_dorce_order
403
+ # @macro join_after
404
+ # @macro join_common_example_1
405
+ # @example without key (use implicit common key)
406
+ # df.right_join(other)
407
+ #
408
+ # # =>
409
+ # KEY X1 X2
410
+ # <string> <uint8> <boolean>
411
+ # 0 A 1 true
412
+ # 1 B 2 false
413
+ # 2 D (nil) (nil)
414
+ #
415
+ # @overload right_join(other, join_keys, suffix: '.1', force_order: true)
416
+ #
417
+ # @macro join_before
418
+ # @macro join_key_in_array
419
+ # @macro join_dorce_order
420
+ # @macro join_after
421
+ # @macro join_common_example_1
422
+ # @example with a key
423
+ # df.right_join(other, :KEY)
116
424
  #
117
- # @param other [DataFrame, Arrow::Table] DataFrame/Table to be joined with self.
118
- # @param join_keys [String, Symbol, ::Array<String, Symbol>] Keys to match.
119
- # @return [DataFrame] Joined dataframe.
425
+ # # =>
426
+ # KEY X1 X2
427
+ # <string> <uint8> <boolean>
428
+ # 0 A 1 true
429
+ # 1 B 2 false
430
+ # 2 D (nil) (nil)
120
431
  #
121
- def right_join(other, join_keys = nil, suffix: '.1')
122
- join(other, join_keys, type: :right_outer, suffix: suffix)
432
+ # @overload right_join(other, join_key_pairs, suffix: '.1', force_order: true)
433
+ #
434
+ # @macro join_before
435
+ # @macro join_key_in_hash
436
+ # @macro join_dorce_order
437
+ # @macro join_after
438
+ # @macro join_common_example_2
439
+ # @example with key pairs
440
+ # df2.right_join(other2, { left: :KEY1, right: :KEY2 })
441
+ #
442
+ # # =>
443
+ # KEY1 X1 X2
444
+ # <string> <uint8> <boolean>
445
+ # 0 A 1 true
446
+ # 1 B 2 false
447
+ # 2 D (nil) (nil)
448
+ #
449
+ def right_join(other, join_keys = nil, suffix: '.1', force_order: true)
450
+ join(
451
+ other,
452
+ join_keys,
453
+ type: :right_outer,
454
+ suffix: suffix,
455
+ force_order: force_order
456
+ )
123
457
  end
124
458
 
125
- # Filtering joins
459
+ # Filtering joins (#semi_join, #anti_join)
126
460
 
127
461
  # Return records of self that have a match in other.
462
+ # - Same as `#join` with `type: :left_semi`
463
+ # - A kind of filtering join.
464
+ #
465
+ # @overload semi_join(other, suffix: '.1')
466
+ # If `join_key` is not specified, common keys in self and other are used
467
+ # (natural keys). Returns joined dataframe.
468
+ #
469
+ # @macro join_before
470
+ # @macro join_after
471
+ # @macro join_common_example_1
472
+ # @example without key (use implicit common key)
473
+ # df.semi_join(other)
474
+ #
475
+ # # =>
476
+ # KEY X1
477
+ # <string> <uint8>
478
+ # 0 A 1
479
+ # 1 B 2
480
+ #
481
+ # @overload semi_join(other, join_keys, suffix: '.1')
482
+ #
483
+ # @macro join_before
484
+ # @macro join_key_in_array
485
+ # @macro join_after
486
+ # @macro join_common_example_1
487
+ # @example with a key
488
+ # df.semi_join(other, :KEY)
489
+ #
490
+ # # =>
491
+ # KEY X1
492
+ # <string> <uint8>
493
+ # 0 A 1
494
+ # 1 B 2
128
495
  #
129
- # @param other [DataFrame, Arrow::Table] DataFrame/Table to be joined with self.
130
- # @param join_keys [String, Symbol, ::Array<String, Symbol>] Keys to match.
131
- # @return [DataFrame] Joined dataframe.
496
+ # @overload semi_join(other, join_key_pairs, suffix: '.1')
497
+ #
498
+ # @macro join_before
499
+ # @macro join_key_in_hash
500
+ # @macro join_after
501
+ # @macro join_common_example_2
502
+ # @example with key pairs
503
+ # df2.semi_join(other2, { left: :KEY1, right: :KEY2 })
504
+ #
505
+ # # =>
506
+ # KEY1 X1
507
+ # <string> <uint8>
508
+ # 0 A 1
509
+ # 1 B 2
132
510
  #
133
511
  def semi_join(other, join_keys = nil, suffix: '.1')
134
512
  join(other, join_keys, type: :left_semi, suffix: suffix)
135
513
  end
136
514
 
137
515
  # Return records of self that do not have a match in other.
516
+ # - Same as `#join` with `type: :left_anti`
517
+ # - A kind of filtering join.
518
+ #
519
+ # @overload anti_join(other, suffix: '.1')
520
+ # If `join_key` is not specified, common keys in self and other are used
521
+ # (natural keys). Returns joined dataframe.
522
+ #
523
+ # @macro join_before
524
+ # @macro join_after
525
+ # @macro join_common_example_1
526
+ # @example without key (use implicit common key)
527
+ # df.anti_join(other)
528
+ #
529
+ # # =>
530
+ # KEY X1
531
+ # <string> <uint8>
532
+ # 0 C 3
533
+ #
534
+ # @overload anti_join(other, join_keys, suffix: '.1')
535
+ #
536
+ # @macro join_before
537
+ # @macro join_key_in_array
538
+ # @macro join_after
539
+ # @macro join_common_example_1
540
+ # @example with a key
541
+ # df.anti_join(other, :KEY)
542
+ #
543
+ # # =>
544
+ # KEY X1
545
+ # <string> <uint8>
546
+ # 0 C 3
138
547
  #
139
- # @param other [DataFrame, Arrow::Table] DataFrame/Table to be joined with self.
140
- # @param join_keys [String, Symbol, ::Array<String, Symbol>] Keys to match.
141
- # @return [DataFrame] Joined dataframe.
548
+ # @overload anti_join(other, join_key_pairs, suffix: '.1')
549
+ #
550
+ # @macro join_before
551
+ # @macro join_key_in_hash
552
+ # @macro join_after
553
+ # @macro join_common_example_2
554
+ # @example with key pairs
555
+ # df2.anti_join(other2, { left: :KEY1, right: :KEY2 })
556
+ #
557
+ # # =>
558
+ # KEY1 X1
559
+ # <string> <uint8>
560
+ # 0 C 3
142
561
  #
143
562
  def anti_join(other, join_keys = nil, suffix: '.1')
144
563
  join(other, join_keys, type: :left_anti, suffix: suffix)
145
564
  end
146
565
 
147
- # Set operations
566
+ # Set operations (#intersect, #union, #difference, #set_operable?)
148
567
 
149
568
  # Check if set operation with self and other is possible.
150
569
  #
151
- # @param other [DataFrame, Arrow::Table] DataFrame/Table to be checked with self.
152
- # @return [Boolean] true if set operation is possible.
570
+ # @macro join_before
571
+ # @return [Boolean]
572
+ # true if set operation is possible.
573
+ # @macro join_common_example_3
574
+ # @example
575
+ # df3.set_operable?(other3) # => true
153
576
  #
154
577
  def set_operable?(other) # rubocop:disable Naming/AccessorMethodName
155
- other = DataFrame.new(other) if other.is_a?(Arrow::Table)
156
- keys == other.keys
578
+ keys == other.keys.map(&:to_sym)
157
579
  end
158
580
 
159
581
  # Select records appearing in both self and other.
582
+ # - Same as `#join` with `type: :inner` when keys in self are same with other.
583
+ # - A kind of set operations.
160
584
  #
161
- # @param other [DataFrame, Arrow::Table] DataFrame/Table to be joined with self.
162
- # @return [DataFrame] Joined dataframe.
585
+ # @macro join_before
586
+ # @return [DataFrame]
587
+ # joined dataframe.
588
+ # @macro join_common_example_3
589
+ # @example
590
+ # df3.intersect(other3)
591
+ #
592
+ # # =>
593
+ # KEY1 KEY2
594
+ # <string> <uint8>
595
+ # 0 A 1
163
596
  #
164
597
  def intersect(other)
165
- other = DataFrame.new(other) if other.is_a?(Arrow::Table)
166
- raise DataFrameArgumentError, 'keys are not same with self and other' unless keys == other.keys
598
+ unless keys == other.keys.map(&:to_sym)
599
+ raise DataFrameArgumentError, 'keys are not same with self and other'
600
+ end
167
601
 
168
602
  join(other, keys, type: :inner)
169
603
  end
170
604
 
171
605
  # Select records appearing in self or other.
606
+ # - Same as `#join` with `type: :full_outer` when keys in self are same with other.
607
+ # - A kind of set operations.
172
608
  #
173
- # @param other [DataFrame, Arrow::Table] DataFrame/Table to be joined with self.
174
- # @return [DataFrame] Joined dataframe.
609
+ # @macro join_before
610
+ # @return [DataFrame]
611
+ # joined dataframe.
612
+ # @macro join_common_example_3
613
+ # @example
614
+ # df3.union(other3)
615
+ #
616
+ # # =>
617
+ # KEY1 KEY2
618
+ # <string> <uint8>
619
+ # 0 A 1
620
+ # 1 B 2
621
+ # 2 C 3
622
+ # 3 B 4
623
+ # 4 D 5
175
624
  #
176
625
  def union(other)
177
- other = DataFrame.new(other) if other.is_a?(Arrow::Table)
178
- raise DataFrameArgumentError, 'keys are not same with self and other' unless keys == other.keys
626
+ unless keys == other.keys.map(&:to_sym)
627
+ raise DataFrameArgumentError, 'keys are not same with self and other'
628
+ end
179
629
 
180
630
  join(other, keys, type: :full_outer)
181
631
  end
182
632
 
183
633
  # Select records appearing in self but not in other.
634
+ # - Same as `#join` with `type: :left_anti` when keys in self are same with other.
635
+ # - A kind of set operations.
636
+ #
637
+ # @macro join_before
638
+ # @return [DataFrame]
639
+ # joined dataframe.
640
+ # @macro join_common_example_3
641
+ # @example
642
+ # df3.difference(other3)
643
+ #
644
+ # # =>
645
+ # KEY1 KEY2
646
+ # <string> <uint8>
647
+ # 0 B 2
648
+ # 1 C 3
649
+ #
650
+ # other3.difference(df3)
184
651
  #
185
- # @param other [DataFrame, Arrow::Table] DataFrame/Table to be joined with self.
186
- # @return [DataFrame] Joined dataframe.
652
+ # # =>
653
+ # KEY1 KEY2
654
+ # <string> <uint8>
655
+ # 0 B 4
656
+ # 1 D 5
187
657
  #
188
658
  def difference(other)
189
- other = DataFrame.new(other) if other.is_a?(Arrow::Table)
190
- raise DataFrameArgumentError, 'keys are not same with self and other' unless keys == other.keys
659
+ unless keys == other.keys.map(&:to_sym)
660
+ raise DataFrameArgumentError, 'keys are not same with self and other'
661
+ end
191
662
 
192
663
  join(other, keys, type: :left_anti)
193
664
  end
194
665
 
195
666
  alias_method :setdiff, :difference
196
667
 
197
- # Undocumented. It is preferable to call specific methods.
198
-
199
- # Join other dataframe
668
+ # Join another DataFrame or Table to self.
200
669
  #
201
- # @param other [DataFrame, Arrow::Table] DataFrame/Table to be joined with self.
202
- # @param join_keys [String, Symbol, ::Array<String, Symbol>] Keys to match.
203
- # @return [DataFrame] Joined dataframe.
670
+ # @!macro join_common_type
671
+ # @param type [:left_semi, :right_semi, :left_anti, :right_anti, :inner,
672
+ # :left_outer, :right_outer, :full_outer] type of join.
204
673
  #
205
- # :type is one of
206
- # :left_semi, :right_semi, :left_anti, :right_anti inner, :left_outer, :right_outer, :full_outer.
207
- def join(other, join_keys = nil, type: :inner, suffix: '.1', left_outputs: nil, right_outputs: nil)
208
- case other
209
- when DataFrame
210
- # Nop
211
- when Arrow::Table
212
- other = DataFrame.new(other)
674
+ # @!macro join_common_example_4
675
+ # @example
676
+ # df4 = DataFrame.new(
677
+ # X1: %w[A B C],
678
+ # Y: %w[D E F]
679
+ # )
680
+ #
681
+ # # =>
682
+ # X1 Y
683
+ # <string> <string>
684
+ # 0 A D
685
+ # 1 B E
686
+ # 2 C F
687
+ #
688
+ # other4 = DataFrame.new(
689
+ # X2: %w[A B D],
690
+ # Y: %w[e E E]
691
+ # )
692
+ #
693
+ # # =>
694
+ # X2 Y
695
+ # <string> <string>
696
+ # 0 A e
697
+ # 1 B E
698
+ # 2 D E
699
+
700
+ # @note the order of joined results will be preserved by default.
701
+ # This is enabled by appending index column to sort after joining but
702
+ # it will cause some performance degradation. If you don't care about
703
+ # the order of the result, set `force_order` option to `false`.
704
+ #
705
+ # @overload join(other, type: :inner, suffix: '.1', force_order: true)
706
+ #
707
+ # If `join_key` is not specified, common keys in self and other are used
708
+ # (natural keys). Returns joined dataframe.
709
+ #
710
+ # @macro join_before
711
+ # @macro join_common_type
712
+ # @macro join_dorce_order
713
+ # @macro join_after
714
+ # @macro join_common_example_1
715
+ # @example
716
+ # df.join(other)
717
+ #
718
+ # # =>
719
+ # KEY X1 X2
720
+ # <string> <uint8> <boolean>
721
+ # 0 A 1 true
722
+ # 1 B 2 false
723
+ #
724
+ # df.join(other, type: :full_outer)
725
+ #
726
+ # # =>
727
+ # KEY X1 X2
728
+ # <string> <uint8> <boolean>
729
+ # 0 A 1 true
730
+ # 1 B 2 false
731
+ # 2 C 3 (nil)
732
+ # 3 D (nil) (nil)
733
+ #
734
+ # @overload join(other, join_keys, type: :inner, suffix: '.1', force_order: true)
735
+ #
736
+ # @macro join_before
737
+ # @macro join_key_in_array
738
+ # @macro join_common_type
739
+ # @macro join_dorce_order
740
+ # @macro join_after
741
+ # @macro join_common_example_3
742
+ # @example join keys in an Array
743
+ # df3.join(other3, [:KEY1, :KEY2])
744
+ #
745
+ # # =>
746
+ # KEY1 KEY2
747
+ # <string> <uint8>
748
+ # 0 A 1
749
+ #
750
+ # @example partial join key and suffix
751
+ # df3.join(other3, :KEY1, suffix: '.a')
752
+ #
753
+ # # =>
754
+ # KEY1 KEY2 KEY2.a
755
+ # <string> <uint8> <uint8>
756
+ # 0 A 1 1
757
+ # 1 B 2 4
758
+ #
759
+ # @overload join(other, join_key_pairs, type: :inner, suffix: '.1', force_order: true)
760
+ #
761
+ # @macro join_before
762
+ # @macro join_key_in_hash
763
+ # @macro join_common_type
764
+ # @macro join_dorce_order
765
+ # @macro join_after
766
+ # @macro join_common_example_4
767
+ # @example without options
768
+ # df4.join(other4)
769
+ #
770
+ # # =>
771
+ # X1 Y X2
772
+ # <string> <string> <string>
773
+ # 0 B E D
774
+ # 1 B E B
775
+ #
776
+ # @example join by key pairs
777
+ # df4.join(other4, { left: [:X1, :Y], right: [:X2, :Y] })
778
+ #
779
+ # # =>
780
+ # X1 Y
781
+ # <string> <string>
782
+ # 0 B E
783
+ #
784
+ # @example join by key pairs, using renaming by suffix
785
+ # df4.join(other4, { left: :X1, right: :X2 })
786
+ #
787
+ # # =>
788
+ # X1 Y Y.1
789
+ # <string> <string> <string>
790
+ # 0 A D e
791
+ # 1 B E E
792
+ #
793
+ def join(other, join_keys = nil, type: :inner, suffix: '.1', force_order: true)
794
+ right_table =
795
+ case other
796
+ when DataFrame
797
+ other.table
798
+ when Arrow::Table
799
+ other
800
+ else
801
+ raise DataFrameArgumentError, 'other must be a DataFrame or an Arrow::Table'
802
+ end
803
+
804
+ type = type.to_sym
805
+ left_index = :__LEFT_INDEX__
806
+ right_index = :__RIGHT_INDEX__
807
+ if force_order && %i[full_outer right_outer].include?(type)
808
+ left_table = assign(left_index) { indices }.table
809
+ other = DataFrame.create(other) if other.is_a?(Arrow::Table)
810
+ right_table = other.assign(right_index) { indices }.table
213
811
  else
214
- raise DataFrameArgumentError, 'other must be a DataFrame or an Arrow::Table'
812
+ left_table = table
215
813
  end
216
814
 
217
- # Support natural keys (implicit common keys)
218
- natural_keys = keys.intersection(other.keys)
219
- raise DataFrameArgumentError, "#{join_keys} are not common keys" if natural_keys.empty?
220
-
221
- join_keys =
222
- if join_keys
223
- Array(join_keys).map(&:to_sym)
224
- else
225
- natural_keys
226
- end
227
- return self if join_keys.empty?
815
+ table_keys = left_table.keys
816
+ other_keys = right_table.keys
228
817
 
229
- # Support partial join_keys (common key other than join_key will be renamed with suffix)
230
- remainer_keys = natural_keys - join_keys
231
- unless remainer_keys.empty?
232
- renamer = remainer_keys.each_with_object({}) do |key, hash|
233
- new_key = nil
234
- loop do
235
- new_key = "#{key}#{suffix}".to_sym
236
- break unless keys.include?(new_key)
818
+ # natural keys (implicit common keys)
819
+ join_keys ||= table_keys.intersection(other_keys)
237
820
 
238
- s = suffix.succ
239
- raise DataFrameArgumentError, "suffix #{suffix} is invalid" if s == suffix
821
+ # This is not necessary if additional procedure is contributed to Red Arrow.
822
+ if join_keys.is_a?(Hash)
823
+ left_keys = join_keys[:left]
824
+ right_keys = join_keys[:right]
825
+ else
826
+ left_keys = join_keys
827
+ right_keys = join_keys
828
+ end
829
+ left_keys = Array(left_keys).map(&:to_s)
830
+ right_keys = Array(right_keys).map(&:to_s)
240
831
 
241
- suffix = s
242
- end
243
- hash[key] = new_key
244
- end
245
- other = other.rename(renamer)
832
+ case type
833
+ when :full_outer, :left_semi, :left_anti, :right_semi, :right_anti
834
+ left_outputs = nil
835
+ right_outputs = nil
836
+ when :inner, :left_outer
837
+ left_outputs = table_keys
838
+ right_outputs = other_keys - right_keys
839
+ when :right_outer
840
+ left_outputs = table_keys - left_keys
841
+ right_outputs = other_keys
246
842
  end
247
843
 
248
- # Red Arrow's #join returns duplicated join_keys from self and other as of v9.0.0 .
249
- # Temporally merge key vectors here to workaround.
250
- table_output =
251
- table.join(other.table, join_keys, type: type, left_outputs: left_outputs, right_outputs: right_outputs)
252
- left_indexes = [*0...n_keys]
253
- right_indexes = [*((other.keys - join_keys).map { |key| other.keys.index(key) + n_keys })]
844
+ # Should we rescue errors in Arrow::Table#join for usability ?
845
+ joined_table =
846
+ left_table.join(
847
+ right_table,
848
+ join_keys,
849
+ type: type,
850
+ left_outputs: left_outputs,
851
+ right_outputs: right_outputs
852
+ )
254
853
 
255
854
  case type
256
- when :left_semi, :left_anti, :right_semi, :right_anti
257
- return DataFrame.new(table_output)
258
- else
259
- selected_indexes = left_indexes.concat(right_indexes)
260
- end
261
- merged_columns = join_keys.map do |key|
262
- i = keys.index(key)
263
- merge_column(table_output[i], table_output[n_keys + i], type)
855
+ when :inner, :left_outer, :left_semi, :left_anti, :right_semi, :right_anti
856
+ if joined_table.keys.uniq!
857
+ DataFrame.create(rename_table(joined_table, n_keys, suffix))
858
+ else
859
+ DataFrame.create(joined_table)
860
+ end
861
+ when :full_outer
862
+ renamed_table = rename_table(joined_table, n_keys, suffix)
863
+ renamed_keys = renamed_table.keys
864
+ dropper = []
865
+ dataframe = DataFrame.create(renamed_table).assign do |df|
866
+ left_keys.map do |left_key|
867
+ i_left_key = renamed_keys.index(left_key)
868
+ right_key = renamed_keys[i_left_key + table_keys.size]
869
+ dropper << right_key
870
+ [left_key.to_sym, merge_array(df[left_key].data, df[right_key].data)]
871
+ end
872
+ end
873
+ dataframe = dataframe.sort(left_index, right_index) if force_order
874
+
875
+ dataframe.drop(dropper, left_index, right_index)
876
+ when :right_outer
877
+ dataframe =
878
+ if joined_table.keys.uniq!
879
+ DataFrame.create(rename_table(joined_table, left_outputs.size, suffix))
880
+ else
881
+ DataFrame.create(joined_table)
882
+ end
883
+ if force_order
884
+ dataframe =
885
+ dataframe
886
+ .sort(left_index, right_index)
887
+ .drop(left_index, right_index)
888
+ end
889
+ dataframe.pick do
890
+ [right_keys, keys.map(&:to_s) - right_keys]
891
+ end
264
892
  end
265
- DataFrame.new(table_output[selected_indexes])
266
- .assign(*join_keys) { merged_columns }
267
893
  end
268
894
 
269
895
  private
270
896
 
271
- def merge_column(column1, column2, type)
272
- a1 = column1.to_a
273
- a2 = column2.to_a
274
- if type == :full_outer
275
- a1.zip(a2).map { |x, y| x || y }
276
- elsif type.start_with?('right')
277
- a2
278
- else # :inner or :left-*
279
- a1
280
- end
897
+ # Rename duplicate keys by suffix
898
+ def rename_table(joined_table, n_keys, suffix)
899
+ joined_keys = joined_table.keys
900
+ other_keys = joined_keys[n_keys..]
901
+
902
+ dup_keys = joined_keys.tally.select { |_, v| v > 1 }.keys
903
+ renamed_right_keys =
904
+ other_keys.map do |key|
905
+ if dup_keys.include?(key)
906
+ new_key = nil
907
+ loop do
908
+ new_key = "#{key}#{suffix}"
909
+ break unless joined_keys.include?(new_key)
910
+
911
+ s = suffix.succ
912
+ raise DataFrameArgumentError, "suffix #{suffix} is invalid" if s == suffix
913
+
914
+ suffix = s
915
+ end
916
+ new_key
917
+ else
918
+ key
919
+ end
920
+ end
921
+ joined_keys[n_keys..] = renamed_right_keys
922
+
923
+ fields =
924
+ joined_keys.map.with_index do |k, i|
925
+ Arrow::Field.new(k, joined_table[i].data_type)
926
+ end
927
+ Arrow::Table.new(Arrow::Schema.new(fields), joined_table.columns)
928
+ end
929
+
930
+ # Merge two Arrow::Arrays
931
+ def merge_array(array1, array2)
932
+ t = Arrow::Function.find(:is_null).execute([array1])
933
+ Arrow::Function.find(:if_else).execute([t, array2, array1]).value
281
934
  end
282
935
  end
283
936
  end