red_amber 0.2.3 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (42) hide show
  1. checksums.yaml +4 -4
  2. data/.rubocop.yml +133 -51
  3. data/.yardopts +2 -0
  4. data/CHANGELOG.md +203 -1
  5. data/Gemfile +2 -1
  6. data/LICENSE +1 -1
  7. data/README.md +61 -45
  8. data/benchmark/basic.yml +11 -4
  9. data/benchmark/combine.yml +3 -4
  10. data/benchmark/dataframe.yml +62 -0
  11. data/benchmark/group.yml +7 -1
  12. data/benchmark/reshape.yml +6 -2
  13. data/benchmark/vector.yml +63 -0
  14. data/doc/DataFrame.md +35 -12
  15. data/doc/DataFrame_Comparison.md +65 -0
  16. data/doc/SubFrames.md +11 -0
  17. data/doc/Vector.md +295 -1
  18. data/doc/yard-templates/default/fulldoc/html/css/common.css +6 -0
  19. data/lib/red_amber/data_frame.rb +537 -68
  20. data/lib/red_amber/data_frame_combinable.rb +776 -123
  21. data/lib/red_amber/data_frame_displayable.rb +248 -18
  22. data/lib/red_amber/data_frame_indexable.rb +122 -19
  23. data/lib/red_amber/data_frame_loadsave.rb +81 -10
  24. data/lib/red_amber/data_frame_reshaping.rb +216 -21
  25. data/lib/red_amber/data_frame_selectable.rb +781 -120
  26. data/lib/red_amber/data_frame_variable_operation.rb +561 -85
  27. data/lib/red_amber/group.rb +195 -21
  28. data/lib/red_amber/helper.rb +114 -32
  29. data/lib/red_amber/refinements.rb +206 -0
  30. data/lib/red_amber/subframes.rb +1066 -0
  31. data/lib/red_amber/vector.rb +435 -58
  32. data/lib/red_amber/vector_aggregation.rb +312 -0
  33. data/lib/red_amber/vector_binary_element_wise.rb +387 -0
  34. data/lib/red_amber/vector_selectable.rb +321 -69
  35. data/lib/red_amber/vector_unary_element_wise.rb +436 -0
  36. data/lib/red_amber/vector_updatable.rb +397 -24
  37. data/lib/red_amber/version.rb +2 -1
  38. data/lib/red_amber.rb +15 -1
  39. data/red_amber.gemspec +4 -3
  40. metadata +19 -11
  41. data/doc/image/dataframe/reshaping_DataFrames.png +0 -0
  42. data/lib/red_amber/vector_functions.rb +0 -294
@@ -1,14 +1,36 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module RedAmber
4
- # mix-in for the class DataFrame
4
+ # Mix-in for the class DataFrame
5
5
  module DataFrameCombinable
6
- # Concatenate other dataframe onto the bottom.
6
+ # Refinements for Arrow::Table
7
+ using RefineArrowTable
8
+
9
+ # Concatenate other dataframes or tables onto the bottom of self.
7
10
  #
11
+ # @note the `#types` must be same as `other#types`.
8
12
  # @param other [DataFrame, Arrow::Table, Array<DataFrame, Arrow::Table>]
9
- # DataFrame/Table to concatenate onto the bottom of self.
13
+ # DataFrames or Tables to concatenate.
10
14
  # @return [DataFrame]
11
- # Concatenated dataframe.
15
+ # concatenated dataframe.
16
+ # @example
17
+ # df = DataFrame.new(x: [1, 2], y: ['A', 'B'])
18
+ # other = DataFrame.new(x: [3, 4], y: ['C', 'D'])
19
+ # [df.types, other.types]
20
+ #
21
+ # # =>
22
+ # [[:uint8, :string], [:uint8, :string]]
23
+ #
24
+ # df.concatenate(other)
25
+ #
26
+ # # =>
27
+ # x y
28
+ # <uint8> <string>
29
+ # 0 1 A
30
+ # 1 2 B
31
+ # 2 3 C
32
+ # 3 4 D
33
+ #
12
34
  def concatenate(*other)
13
35
  case other
14
36
  in [] | [nil] | [[]]
@@ -30,20 +52,33 @@ module RedAmber
30
52
  end
31
53
  end
32
54
 
33
- DataFrame.new(table.concatenate(table_array))
55
+ DataFrame.create(table.concatenate(table_array))
34
56
  end
35
57
 
36
58
  alias_method :concat, :concatenate
37
59
  alias_method :bind_rows, :concatenate
38
60
 
39
- # Merge other DataFrame or Table from other.
40
- # - Self and other must have same size.
41
- # - Self and other do not share the same key.
42
- # - If they share any keys, raise Error.
61
+ # Merge other DataFrames or Tables.
62
+ #
63
+ # @note the `#size` must be same as `other#size`.
64
+ # @note self and other must not share the same key.
43
65
  # @param other [DataFrame, Arrow::Table, Array<DataFrame, Arrow::Table>]
44
- # DataFrame/Table to concatenate.
66
+ # DataFrames or Tables to merge.
67
+ # @raise [DataFrameArgumentError]
68
+ # if the sizes are not the same or self and other share the same key.
45
69
  # @return [DataFrame]
46
- # Merged dataframe.
70
+ # merged dataframe.
71
+ # @example
72
+ # df = DataFrame.new(x: [1, 2], y: [3, 4])
73
+ # other = DataFrame.new(a: ['A', 'B'], b: ['C', 'D'])
74
+ # df.merge(other)
75
+ #
76
+ # # =>
77
+ # x y a b
78
+ # <uint8> <uint8> <string> <string>
79
+ # 0 1 3 A C
80
+ # 1 2 4 B D
81
+ #
47
82
  def merge(*other)
48
83
  case other
49
84
  in [] | [nil] | [[]]
@@ -58,14 +93,16 @@ module RedAmber
58
93
  df =
59
94
  case e
60
95
  when Arrow::Table
61
- DataFrame.new(e)
96
+ DataFrame.create(e)
62
97
  when DataFrame
63
98
  e
64
99
  else
65
100
  raise DataFrameArgumentError, "#{e} is not a Table or a DataFrame"
66
101
  end
67
102
 
68
- raise DataFrameArgumentError, "#{e} do not have same size as self" if size != df.size
103
+ if size != df.size
104
+ raise DataFrameArgumentError, "#{e} do not have same size as self"
105
+ end
69
106
 
70
107
  k = keys.intersection(df.keys).any?
71
108
  raise DataFrameArgumentError, "There are some shared keys: #{k}" if k
@@ -78,206 +115,822 @@ module RedAmber
78
115
 
79
116
  alias_method :bind_cols, :merge
80
117
 
81
- # Mutating joins
118
+ # Mutating joins (#inner_join, #full_join, #left_join, #right_join)
82
119
 
83
- # Join data, leaving only the matching records.
120
+ # @!macro join_before
121
+ # @param other [DataFrame, Arrow::Table]
122
+ # A DataFrame or a Table to be joined with self.
123
+ #
124
+ # @!macro join_dorce_order
125
+ # @param force_order [Boolean]
126
+ # whether to force the order of the output to always be the same.
127
+ # - This option is used in `:full_outer` and `:right_outer`.
128
+ # - If this option is true (by default) it will append index to the source
129
+ # and sort after joining. It will cause some degradation in performance.
130
+ #
131
+ # @!macro join_after
132
+ # @param suffix [#succ]
133
+ # a suffix to rename keys when key names conflict as a result of join.
134
+ # `suffix` must respond to `#succ`.
135
+ # @return [DataFrame]
136
+ # joined dataframe.
137
+ #
138
+ # @!macro join_key_in_array
139
+ # @param join_keys [String, Symbol, Array<String, Symbol>]
140
+ # a key or keys to match.
141
+ #
142
+ # @!macro join_key_in_hash
143
+ # @param join_key_pairs [Hash]
144
+ # pairs of a key name or key names to match in left and right.
145
+ # @option join_key_pairs [String, Symbol, Array<String, Symbol>] :left
146
+ # join keys in `self`.
147
+ # @option join_key_pairs [String, Symbol, Array<String, Symbol>] :right
148
+ # join keys in `other`.
149
+ #
150
+ # @!macro join_common_example_1
151
+ # @example
152
+ # df = DataFrame.new(KEY: %w[A B C], X1: [1, 2, 3])
153
+ #
154
+ # # =>
155
+ # KEY X1
156
+ # <string> <uint8>
157
+ # 0 A 1
158
+ # 1 B 2
159
+ # 2 C 3
160
+ #
161
+ # other = DataFrame.new(KEY: %w[A B D], X2: [true, false, nil])
162
+ #
163
+ # # =>
164
+ # KEY X2
165
+ # <string> <boolean>
166
+ # 0 A true
167
+ # 1 B false
168
+ # 2 D (nil)
169
+ #
170
+ # @!macro join_common_example_2
171
+ # @example
172
+ # df2 = DataFrame.new(KEY1: %w[A B C], X1: [1, 2, 3])
84
173
  #
85
- # @param other [DataFrame, Arrow::Table] DataFrame/Table to be joined with self.
86
- # @param join_keys [String, Symbol, ::Array<String, Symbol>] Keys to match.
87
- # @return [DataFrame] Joined dataframe.
174
+ # # =>
175
+ # KEY1 X1
176
+ # <string> <uint8>
177
+ # 0 A 1
178
+ # 1 B 2
179
+ # 2 C 3
180
+ #
181
+ # other2 = DataFrame.new(KEY2: %w[A B D], X2: [true, false, nil])
182
+ #
183
+ # # =>
184
+ # KEY2 X2
185
+ # <string> <boolean>
186
+ # 0 A true
187
+ # 1 B false
188
+ # 2 D (nil)
189
+ #
190
+ # @!macro join_common_example_3
191
+ # @example
192
+ # df3 = DataFrame.new(
193
+ # KEY1: %w[A B C],
194
+ # KEY2: [1, 2, 3]
195
+ # )
196
+ #
197
+ # # =>
198
+ # KEY1 KEY2
199
+ # <string> <uint8>
200
+ # 0 A 1
201
+ # 1 B 2
202
+ # 2 C 3
203
+ #
204
+ # other3 = DataFrame.new(
205
+ # KEY1: %w[A B D],
206
+ # KEY2: [1, 4, 5]
207
+ # )
208
+ #
209
+ # # =>
210
+ # KEY1 KEY2
211
+ # <string> <uint8>
212
+ # 0 A 1
213
+ # 1 B 4
214
+ # 2 D 5
215
+
216
+ # Join another DataFrame or Table, leaving only the matching records.
217
+ # - Same as `#join` with `type: :inner`
218
+ # - A kind of mutating join.
219
+ #
220
+ # @overload inner_join(other, suffix: '.1')
221
+ # If `join_key` is not specified, common keys in self and other are used
222
+ # (natural keys). Returns joined dataframe.
223
+ #
224
+ # @macro join_before
225
+ # @macro join_after
226
+ # @macro join_common_example_1
227
+ # @example without key (use implicit common key)
228
+ # df.inner_join(other)
229
+ #
230
+ # # =>
231
+ # KEY X1 X2
232
+ # <string> <uint8> <boolean>
233
+ # 0 A 1 true
234
+ # 1 B 2 false
235
+ #
236
+ # @overload inner_join(other, join_keys, suffix: '.1')
237
+ #
238
+ # @macro join_before
239
+ # @macro join_key_in_array
240
+ # @macro join_after
241
+ # @macro join_common_example_1
242
+ # @example with a key
243
+ # df.inner_join(other, :KEY)
244
+ #
245
+ # # =>
246
+ # KEY X1 X2
247
+ # <string> <uint8> <boolean>
248
+ # 0 A 1 true
249
+ # 1 B 2 false
250
+ #
251
+ # @overload inner_join(other, join_key_pairs, suffix: '.1')
252
+ #
253
+ # @macro join_before
254
+ # @macro join_key_in_hash
255
+ # @macro join_after
256
+ # @macro join_common_example_2
257
+ # @example with key pairs
258
+ # df2.inner_join(other2, { left: :KEY1, right: :KEY2 })
259
+ #
260
+ # # =>
261
+ # KEY1 X1 X2
262
+ # <string> <uint8> <boolean>
263
+ # 0 A 1 true
264
+ # 1 B 2 false
88
265
  #
89
266
  def inner_join(other, join_keys = nil, suffix: '.1')
90
267
  join(other, join_keys, type: :inner, suffix: suffix)
91
268
  end
92
269
 
93
- # Join data, leaving all records.
270
+ # Join another DataFrame or Table, leaving all records.
271
+ # - Same as `#join` with `type: :full_outer`
272
+ # - A kind of mutating join.
273
+ #
274
+ # @overload full_join(other, suffix: '.1', force_order: true)
275
+ # If `join_key` is not specified, common keys in self and other are used
276
+ # (natural keys). Returns joined dataframe.
277
+ #
278
+ # @macro join_before
279
+ # @macro join_dorce_order
280
+ # @macro join_after
281
+ # @macro join_common_example_1
282
+ # @example without key (use implicit common key)
283
+ # df.full_join(other)
284
+ #
285
+ # # =>
286
+ # KEY X1 X2
287
+ # <string> <uint8> <boolean>
288
+ # 0 A 1 true
289
+ # 1 B 2 false
290
+ # 2 C 3 (nil)
291
+ # 3 D (nil) (nil)
94
292
  #
95
- # @param other [DataFrame, Arrow::Table] DataFrame/Table to be joined with self.
96
- # @param join_keys [String, Symbol, ::Array<String, Symbol>] Keys to match.
97
- # @return [DataFrame] Joined dataframe.
293
+ # @overload full_join(other, join_keys, suffix: '.1', force_order: true)
98
294
  #
99
- def full_join(other, join_keys = nil, suffix: '.1')
100
- join(other, join_keys, type: :full_outer, suffix: suffix)
295
+ # @macro join_before
296
+ # @macro join_key_in_array
297
+ # @macro join_dorce_order
298
+ # @macro join_after
299
+ # @macro join_common_example_1
300
+ # @example with a key
301
+ # df.full_join(other, :KEY)
302
+ #
303
+ # # =>
304
+ # KEY X1 X2
305
+ # <string> <uint8> <boolean>
306
+ # 0 A 1 true
307
+ # 1 B 2 false
308
+ # 2 C 3 (nil)
309
+ # 3 D (nil) (nil)
310
+ #
311
+ # @overload full_join(other, join_key_pairs, suffix: '.1', force_order: true)
312
+ #
313
+ # @macro join_before
314
+ # @macro join_key_in_hash
315
+ # @macro join_dorce_order
316
+ # @macro join_after
317
+ # @macro join_common_example_2
318
+ # @example with key pairs
319
+ # df2.full_join(other2, { left: :KEY1, right: :KEY2 })
320
+ #
321
+ # # =>
322
+ # KEY1 X1 X2
323
+ # <string> <uint8> <boolean>
324
+ # 0 A 1 true
325
+ # 1 B 2 false
326
+ # 2 C 3 (nil)
327
+ # 3 D (nil) (nil)
328
+ #
329
+ def full_join(other, join_keys = nil, suffix: '.1', force_order: true)
330
+ join(other, join_keys,
331
+ type: :full_outer, suffix: suffix, force_order: force_order)
101
332
  end
102
333
 
103
334
  alias_method :outer_join, :full_join
104
335
 
105
336
  # Join matching values to self from other.
337
+ # - Same as `#join` with `type: :left_outer`
338
+ # - A kind of mutating join.
339
+ #
340
+ # @overload left_join(other, suffix: '.1')
341
+ # If `join_key` is not specified, common keys in self and other are used
342
+ # (natural keys). Returns joined dataframe.
343
+ #
344
+ # @macro join_before
345
+ # @macro join_after
346
+ # @macro join_common_example_1
347
+ # @example without key (use implicit common key)
348
+ # df.left_join(other)
349
+ #
350
+ # # =>
351
+ # KEY X1 X2
352
+ # <string> <uint8> <boolean>
353
+ # 0 A 1 true
354
+ # 1 B 2 false
355
+ # 2 C 3 (nil)
356
+ #
357
+ # @overload left_join(other, join_keys, suffix: '.1')
106
358
  #
107
- # @param other [DataFrame, Arrow::Table] DataFrame/Table to be joined with self.
108
- # @param join_keys [String, Symbol, ::Array<String, Symbol>] Keys to match.
109
- # @return [DataFrame] Joined dataframe.
359
+ # @macro join_before
360
+ # @macro join_key_in_array
361
+ # @macro join_after
362
+ # @macro join_common_example_1
363
+ # @example with a key
364
+ # df.left_join(other, :KEY)
365
+ #
366
+ # # =>
367
+ # KEY X1 X2
368
+ # <string> <uint8> <boolean>
369
+ # 0 A 1 true
370
+ # 1 B 2 false
371
+ # 2 C 3 (nil)
372
+ #
373
+ # @overload left_join(other, join_key_pairs, suffix: '.1')
374
+ #
375
+ # @macro join_before
376
+ # @macro join_key_in_hash
377
+ # @macro join_after
378
+ # @macro join_common_example_2
379
+ # @example with key pairs
380
+ # df2.left_join(other2, { left: :KEY1, right: :KEY2 })
381
+ #
382
+ # # =>
383
+ # KEY1 X1 X2
384
+ # <string> <uint8> <boolean>
385
+ # 0 A 1 true
386
+ # 1 B 2 false
387
+ # 2 C 3 (nil)
110
388
  #
111
389
  def left_join(other, join_keys = nil, suffix: '.1')
112
390
  join(other, join_keys, type: :left_outer, suffix: suffix)
113
391
  end
114
392
 
115
393
  # Join matching values from self to other.
394
+ # - Same as `#join` with `type: :right_outer`
395
+ # - A kind of mutating join.
396
+ #
397
+ # @overload right_join(other, suffix: '.1', force_order: true)
398
+ # If `join_key` is not specified, common keys in self and other are used
399
+ # (natural keys). Returns joined dataframe.
400
+ #
401
+ # @macro join_before
402
+ # @macro join_dorce_order
403
+ # @macro join_after
404
+ # @macro join_common_example_1
405
+ # @example without key (use implicit common key)
406
+ # df.right_join(other)
407
+ #
408
+ # # =>
409
+ # KEY X1 X2
410
+ # <string> <uint8> <boolean>
411
+ # 0 A 1 true
412
+ # 1 B 2 false
413
+ # 2 D (nil) (nil)
414
+ #
415
+ # @overload right_join(other, join_keys, suffix: '.1', force_order: true)
416
+ #
417
+ # @macro join_before
418
+ # @macro join_key_in_array
419
+ # @macro join_dorce_order
420
+ # @macro join_after
421
+ # @macro join_common_example_1
422
+ # @example with a key
423
+ # df.right_join(other, :KEY)
116
424
  #
117
- # @param other [DataFrame, Arrow::Table] DataFrame/Table to be joined with self.
118
- # @param join_keys [String, Symbol, ::Array<String, Symbol>] Keys to match.
119
- # @return [DataFrame] Joined dataframe.
425
+ # # =>
426
+ # KEY X1 X2
427
+ # <string> <uint8> <boolean>
428
+ # 0 A 1 true
429
+ # 1 B 2 false
430
+ # 2 D (nil) (nil)
120
431
  #
121
- def right_join(other, join_keys = nil, suffix: '.1')
122
- join(other, join_keys, type: :right_outer, suffix: suffix)
432
+ # @overload right_join(other, join_key_pairs, suffix: '.1', force_order: true)
433
+ #
434
+ # @macro join_before
435
+ # @macro join_key_in_hash
436
+ # @macro join_dorce_order
437
+ # @macro join_after
438
+ # @macro join_common_example_2
439
+ # @example with key pairs
440
+ # df2.right_join(other2, { left: :KEY1, right: :KEY2 })
441
+ #
442
+ # # =>
443
+ # KEY1 X1 X2
444
+ # <string> <uint8> <boolean>
445
+ # 0 A 1 true
446
+ # 1 B 2 false
447
+ # 2 D (nil) (nil)
448
+ #
449
+ def right_join(other, join_keys = nil, suffix: '.1', force_order: true)
450
+ join(
451
+ other,
452
+ join_keys,
453
+ type: :right_outer,
454
+ suffix: suffix,
455
+ force_order: force_order
456
+ )
123
457
  end
124
458
 
125
- # Filtering joins
459
+ # Filtering joins (#semi_join, #anti_join)
126
460
 
127
461
  # Return records of self that have a match in other.
462
+ # - Same as `#join` with `type: :left_semi`
463
+ # - A kind of filtering join.
464
+ #
465
+ # @overload semi_join(other, suffix: '.1')
466
+ # If `join_key` is not specified, common keys in self and other are used
467
+ # (natural keys). Returns joined dataframe.
468
+ #
469
+ # @macro join_before
470
+ # @macro join_after
471
+ # @macro join_common_example_1
472
+ # @example without key (use implicit common key)
473
+ # df.semi_join(other)
474
+ #
475
+ # # =>
476
+ # KEY X1
477
+ # <string> <uint8>
478
+ # 0 A 1
479
+ # 1 B 2
480
+ #
481
+ # @overload semi_join(other, join_keys, suffix: '.1')
482
+ #
483
+ # @macro join_before
484
+ # @macro join_key_in_array
485
+ # @macro join_after
486
+ # @macro join_common_example_1
487
+ # @example with a key
488
+ # df.semi_join(other, :KEY)
489
+ #
490
+ # # =>
491
+ # KEY X1
492
+ # <string> <uint8>
493
+ # 0 A 1
494
+ # 1 B 2
128
495
  #
129
- # @param other [DataFrame, Arrow::Table] DataFrame/Table to be joined with self.
130
- # @param join_keys [String, Symbol, ::Array<String, Symbol>] Keys to match.
131
- # @return [DataFrame] Joined dataframe.
496
+ # @overload semi_join(other, join_key_pairs, suffix: '.1')
497
+ #
498
+ # @macro join_before
499
+ # @macro join_key_in_hash
500
+ # @macro join_after
501
+ # @macro join_common_example_2
502
+ # @example with key pairs
503
+ # df2.semi_join(other2, { left: :KEY1, right: :KEY2 })
504
+ #
505
+ # # =>
506
+ # KEY1 X1
507
+ # <string> <uint8>
508
+ # 0 A 1
509
+ # 1 B 2
132
510
  #
133
511
  def semi_join(other, join_keys = nil, suffix: '.1')
134
512
  join(other, join_keys, type: :left_semi, suffix: suffix)
135
513
  end
136
514
 
137
515
  # Return records of self that do not have a match in other.
516
+ # - Same as `#join` with `type: :left_anti`
517
+ # - A kind of filtering join.
518
+ #
519
+ # @overload anti_join(other, suffix: '.1')
520
+ # If `join_key` is not specified, common keys in self and other are used
521
+ # (natural keys). Returns joined dataframe.
522
+ #
523
+ # @macro join_before
524
+ # @macro join_after
525
+ # @macro join_common_example_1
526
+ # @example without key (use implicit common key)
527
+ # df.anti_join(other)
528
+ #
529
+ # # =>
530
+ # KEY X1
531
+ # <string> <uint8>
532
+ # 0 C 3
533
+ #
534
+ # @overload anti_join(other, join_keys, suffix: '.1')
535
+ #
536
+ # @macro join_before
537
+ # @macro join_key_in_array
538
+ # @macro join_after
539
+ # @macro join_common_example_1
540
+ # @example with a key
541
+ # df.anti_join(other, :KEY)
542
+ #
543
+ # # =>
544
+ # KEY X1
545
+ # <string> <uint8>
546
+ # 0 C 3
138
547
  #
139
- # @param other [DataFrame, Arrow::Table] DataFrame/Table to be joined with self.
140
- # @param join_keys [String, Symbol, ::Array<String, Symbol>] Keys to match.
141
- # @return [DataFrame] Joined dataframe.
548
+ # @overload anti_join(other, join_key_pairs, suffix: '.1')
549
+ #
550
+ # @macro join_before
551
+ # @macro join_key_in_hash
552
+ # @macro join_after
553
+ # @macro join_common_example_2
554
+ # @example with key pairs
555
+ # df2.anti_join(other2, { left: :KEY1, right: :KEY2 })
556
+ #
557
+ # # =>
558
+ # KEY1 X1
559
+ # <string> <uint8>
560
+ # 0 C 3
142
561
  #
143
562
  def anti_join(other, join_keys = nil, suffix: '.1')
144
563
  join(other, join_keys, type: :left_anti, suffix: suffix)
145
564
  end
146
565
 
147
- # Set operations
566
+ # Set operations (#intersect, #union, #difference, #set_operable?)
148
567
 
149
568
  # Check if set operation with self and other is possible.
150
569
  #
151
- # @param other [DataFrame, Arrow::Table] DataFrame/Table to be checked with self.
152
- # @return [Boolean] true if set operation is possible.
570
+ # @macro join_before
571
+ # @return [Boolean]
572
+ # true if set operation is possible.
573
+ # @macro join_common_example_3
574
+ # @example
575
+ # df3.set_operable?(other3) # => true
153
576
  #
154
577
  def set_operable?(other) # rubocop:disable Naming/AccessorMethodName
155
- other = DataFrame.new(other) if other.is_a?(Arrow::Table)
156
- keys == other.keys
578
+ keys == other.keys.map(&:to_sym)
157
579
  end
158
580
 
159
581
  # Select records appearing in both self and other.
582
+ # - Same as `#join` with `type: :inner` when keys in self are same with other.
583
+ # - A kind of set operations.
160
584
  #
161
- # @param other [DataFrame, Arrow::Table] DataFrame/Table to be joined with self.
162
- # @return [DataFrame] Joined dataframe.
585
+ # @macro join_before
586
+ # @return [DataFrame]
587
+ # joined dataframe.
588
+ # @macro join_common_example_3
589
+ # @example
590
+ # df3.intersect(other3)
591
+ #
592
+ # # =>
593
+ # KEY1 KEY2
594
+ # <string> <uint8>
595
+ # 0 A 1
163
596
  #
164
597
  def intersect(other)
165
- other = DataFrame.new(other) if other.is_a?(Arrow::Table)
166
- raise DataFrameArgumentError, 'keys are not same with self and other' unless keys == other.keys
598
+ unless keys == other.keys.map(&:to_sym)
599
+ raise DataFrameArgumentError, 'keys are not same with self and other'
600
+ end
167
601
 
168
602
  join(other, keys, type: :inner)
169
603
  end
170
604
 
171
605
  # Select records appearing in self or other.
606
+ # - Same as `#join` with `type: :full_outer` when keys in self are same with other.
607
+ # - A kind of set operations.
172
608
  #
173
- # @param other [DataFrame, Arrow::Table] DataFrame/Table to be joined with self.
174
- # @return [DataFrame] Joined dataframe.
609
+ # @macro join_before
610
+ # @return [DataFrame]
611
+ # joined dataframe.
612
+ # @macro join_common_example_3
613
+ # @example
614
+ # df3.union(other3)
615
+ #
616
+ # # =>
617
+ # KEY1 KEY2
618
+ # <string> <uint8>
619
+ # 0 A 1
620
+ # 1 B 2
621
+ # 2 C 3
622
+ # 3 B 4
623
+ # 4 D 5
175
624
  #
176
625
  def union(other)
177
- other = DataFrame.new(other) if other.is_a?(Arrow::Table)
178
- raise DataFrameArgumentError, 'keys are not same with self and other' unless keys == other.keys
626
+ unless keys == other.keys.map(&:to_sym)
627
+ raise DataFrameArgumentError, 'keys are not same with self and other'
628
+ end
179
629
 
180
630
  join(other, keys, type: :full_outer)
181
631
  end
182
632
 
183
633
  # Select records appearing in self but not in other.
634
+ # - Same as `#join` with `type: :left_anti` when keys in self are same with other.
635
+ # - A kind of set operations.
636
+ #
637
+ # @macro join_before
638
+ # @return [DataFrame]
639
+ # joined dataframe.
640
+ # @macro join_common_example_3
641
+ # @example
642
+ # df3.difference(other3)
643
+ #
644
+ # # =>
645
+ # KEY1 KEY2
646
+ # <string> <uint8>
647
+ # 0 B 2
648
+ # 1 C 3
649
+ #
650
+ # other3.difference(df3)
184
651
  #
185
- # @param other [DataFrame, Arrow::Table] DataFrame/Table to be joined with self.
186
- # @return [DataFrame] Joined dataframe.
652
+ # # =>
653
+ # KEY1 KEY2
654
+ # <string> <uint8>
655
+ # 0 B 4
656
+ # 1 D 5
187
657
  #
188
658
  def difference(other)
189
- other = DataFrame.new(other) if other.is_a?(Arrow::Table)
190
- raise DataFrameArgumentError, 'keys are not same with self and other' unless keys == other.keys
659
+ unless keys == other.keys.map(&:to_sym)
660
+ raise DataFrameArgumentError, 'keys are not same with self and other'
661
+ end
191
662
 
192
663
  join(other, keys, type: :left_anti)
193
664
  end
194
665
 
195
666
  alias_method :setdiff, :difference
196
667
 
197
- # Undocumented. It is preferable to call specific methods.
198
-
199
- # Join other dataframe
668
+ # Join another DataFrame or Table to self.
200
669
  #
201
- # @param other [DataFrame, Arrow::Table] DataFrame/Table to be joined with self.
202
- # @param join_keys [String, Symbol, ::Array<String, Symbol>] Keys to match.
203
- # @return [DataFrame] Joined dataframe.
670
+ # @!macro join_common_type
671
+ # @param type [:left_semi, :right_semi, :left_anti, :right_anti, :inner,
672
+ # :left_outer, :right_outer, :full_outer] type of join.
204
673
  #
205
- # :type is one of
206
- # :left_semi, :right_semi, :left_anti, :right_anti inner, :left_outer, :right_outer, :full_outer.
207
- def join(other, join_keys = nil, type: :inner, suffix: '.1', left_outputs: nil, right_outputs: nil)
208
- case other
209
- when DataFrame
210
- # Nop
211
- when Arrow::Table
212
- other = DataFrame.new(other)
674
+ # @!macro join_common_example_4
675
+ # @example
676
+ # df4 = DataFrame.new(
677
+ # X1: %w[A B C],
678
+ # Y: %w[D E F]
679
+ # )
680
+ #
681
+ # # =>
682
+ # X1 Y
683
+ # <string> <string>
684
+ # 0 A D
685
+ # 1 B E
686
+ # 2 C F
687
+ #
688
+ # other4 = DataFrame.new(
689
+ # X2: %w[A B D],
690
+ # Y: %w[e E E]
691
+ # )
692
+ #
693
+ # # =>
694
+ # X2 Y
695
+ # <string> <string>
696
+ # 0 A e
697
+ # 1 B E
698
+ # 2 D E
699
+
700
+ # @note the order of joined results will be preserved by default.
701
+ # This is enabled by appending index column to sort after joining but
702
+ # it will cause some performance degradation. If you don't care about
703
+ # the order of the result, set `force_order` option to `false`.
704
+ #
705
+ # @overload join(other, type: :inner, suffix: '.1', force_order: true)
706
+ #
707
+ # If `join_key` is not specified, common keys in self and other are used
708
+ # (natural keys). Returns joined dataframe.
709
+ #
710
+ # @macro join_before
711
+ # @macro join_common_type
712
+ # @macro join_dorce_order
713
+ # @macro join_after
714
+ # @macro join_common_example_1
715
+ # @example
716
+ # df.join(other)
717
+ #
718
+ # # =>
719
+ # KEY X1 X2
720
+ # <string> <uint8> <boolean>
721
+ # 0 A 1 true
722
+ # 1 B 2 false
723
+ #
724
+ # df.join(other, type: :full_outer)
725
+ #
726
+ # # =>
727
+ # KEY X1 X2
728
+ # <string> <uint8> <boolean>
729
+ # 0 A 1 true
730
+ # 1 B 2 false
731
+ # 2 C 3 (nil)
732
+ # 3 D (nil) (nil)
733
+ #
734
+ # @overload join(other, join_keys, type: :inner, suffix: '.1', force_order: true)
735
+ #
736
+ # @macro join_before
737
+ # @macro join_key_in_array
738
+ # @macro join_common_type
739
+ # @macro join_dorce_order
740
+ # @macro join_after
741
+ # @macro join_common_example_3
742
+ # @example join keys in an Array
743
+ # df3.join(other3, [:KEY1, :KEY2])
744
+ #
745
+ # # =>
746
+ # KEY1 KEY2
747
+ # <string> <uint8>
748
+ # 0 A 1
749
+ #
750
+ # @example partial join key and suffix
751
+ # df3.join(other3, :KEY1, suffix: '.a')
752
+ #
753
+ # # =>
754
+ # KEY1 KEY2 KEY2.a
755
+ # <string> <uint8> <uint8>
756
+ # 0 A 1 1
757
+ # 1 B 2 4
758
+ #
759
+ # @overload join(other, join_key_pairs, type: :inner, suffix: '.1', force_order: true)
760
+ #
761
+ # @macro join_before
762
+ # @macro join_key_in_hash
763
+ # @macro join_common_type
764
+ # @macro join_dorce_order
765
+ # @macro join_after
766
+ # @macro join_common_example_4
767
+ # @example without options
768
+ # df4.join(other4)
769
+ #
770
+ # # =>
771
+ # X1 Y X2
772
+ # <string> <string> <string>
773
+ # 0 B E D
774
+ # 1 B E B
775
+ #
776
+ # @example join by key pairs
777
+ # df4.join(other4, { left: [:X1, :Y], right: [:X2, :Y] })
778
+ #
779
+ # # =>
780
+ # X1 Y
781
+ # <string> <string>
782
+ # 0 B E
783
+ #
784
+ # @example join by key pairs, using renaming by suffix
785
+ # df4.join(other4, { left: :X1, right: :X2 })
786
+ #
787
+ # # =>
788
+ # X1 Y Y.1
789
+ # <string> <string> <string>
790
+ # 0 A D e
791
+ # 1 B E E
792
+ #
793
+ def join(other, join_keys = nil, type: :inner, suffix: '.1', force_order: true)
794
+ right_table =
795
+ case other
796
+ when DataFrame
797
+ other.table
798
+ when Arrow::Table
799
+ other
800
+ else
801
+ raise DataFrameArgumentError, 'other must be a DataFrame or an Arrow::Table'
802
+ end
803
+
804
+ type = type.to_sym
805
+ left_index = :__LEFT_INDEX__
806
+ right_index = :__RIGHT_INDEX__
807
+ if force_order && %i[full_outer right_outer].include?(type)
808
+ left_table = assign(left_index) { indices }.table
809
+ other = DataFrame.create(other) if other.is_a?(Arrow::Table)
810
+ right_table = other.assign(right_index) { indices }.table
213
811
  else
214
- raise DataFrameArgumentError, 'other must be a DataFrame or an Arrow::Table'
812
+ left_table = table
215
813
  end
216
814
 
217
- # Support natural keys (implicit common keys)
218
- natural_keys = keys.intersection(other.keys)
219
- raise DataFrameArgumentError, "#{join_keys} are not common keys" if natural_keys.empty?
220
-
221
- join_keys =
222
- if join_keys
223
- Array(join_keys).map(&:to_sym)
224
- else
225
- natural_keys
226
- end
227
- return self if join_keys.empty?
815
+ table_keys = left_table.keys
816
+ other_keys = right_table.keys
228
817
 
229
- # Support partial join_keys (common key other than join_key will be renamed with suffix)
230
- remainer_keys = natural_keys - join_keys
231
- unless remainer_keys.empty?
232
- renamer = remainer_keys.each_with_object({}) do |key, hash|
233
- new_key = nil
234
- loop do
235
- new_key = "#{key}#{suffix}".to_sym
236
- break unless keys.include?(new_key)
818
+ # natural keys (implicit common keys)
819
+ join_keys ||= table_keys.intersection(other_keys)
237
820
 
238
- s = suffix.succ
239
- raise DataFrameArgumentError, "suffix #{suffix} is invalid" if s == suffix
821
+ # This is not necessary if additional procedure is contributed to Red Arrow.
822
+ if join_keys.is_a?(Hash)
823
+ left_keys = join_keys[:left]
824
+ right_keys = join_keys[:right]
825
+ else
826
+ left_keys = join_keys
827
+ right_keys = join_keys
828
+ end
829
+ left_keys = Array(left_keys).map(&:to_s)
830
+ right_keys = Array(right_keys).map(&:to_s)
240
831
 
241
- suffix = s
242
- end
243
- hash[key] = new_key
244
- end
245
- other = other.rename(renamer)
832
+ case type
833
+ when :full_outer, :left_semi, :left_anti, :right_semi, :right_anti
834
+ left_outputs = nil
835
+ right_outputs = nil
836
+ when :inner, :left_outer
837
+ left_outputs = table_keys
838
+ right_outputs = other_keys - right_keys
839
+ when :right_outer
840
+ left_outputs = table_keys - left_keys
841
+ right_outputs = other_keys
246
842
  end
247
843
 
248
- # Red Arrow's #join returns duplicated join_keys from self and other as of v9.0.0 .
249
- # Temporally merge key vectors here to workaround.
250
- table_output =
251
- table.join(other.table, join_keys, type: type, left_outputs: left_outputs, right_outputs: right_outputs)
252
- left_indexes = [*0...n_keys]
253
- right_indexes = [*((other.keys - join_keys).map { |key| other.keys.index(key) + n_keys })]
844
+ # Should we rescue errors in Arrow::Table#join for usability ?
845
+ joined_table =
846
+ left_table.join(
847
+ right_table,
848
+ join_keys,
849
+ type: type,
850
+ left_outputs: left_outputs,
851
+ right_outputs: right_outputs
852
+ )
254
853
 
255
854
  case type
256
- when :left_semi, :left_anti, :right_semi, :right_anti
257
- return DataFrame.new(table_output)
258
- else
259
- selected_indexes = left_indexes.concat(right_indexes)
260
- end
261
- merged_columns = join_keys.map do |key|
262
- i = keys.index(key)
263
- merge_column(table_output[i], table_output[n_keys + i], type)
855
+ when :inner, :left_outer, :left_semi, :left_anti, :right_semi, :right_anti
856
+ if joined_table.keys.uniq!
857
+ DataFrame.create(rename_table(joined_table, n_keys, suffix))
858
+ else
859
+ DataFrame.create(joined_table)
860
+ end
861
+ when :full_outer
862
+ renamed_table = rename_table(joined_table, n_keys, suffix)
863
+ renamed_keys = renamed_table.keys
864
+ dropper = []
865
+ dataframe = DataFrame.create(renamed_table).assign do |df|
866
+ left_keys.map do |left_key|
867
+ i_left_key = renamed_keys.index(left_key)
868
+ right_key = renamed_keys[i_left_key + table_keys.size]
869
+ dropper << right_key
870
+ [left_key.to_sym, merge_array(df[left_key].data, df[right_key].data)]
871
+ end
872
+ end
873
+ dataframe = dataframe.sort(left_index, right_index) if force_order
874
+
875
+ dataframe.drop(dropper, left_index, right_index)
876
+ when :right_outer
877
+ dataframe =
878
+ if joined_table.keys.uniq!
879
+ DataFrame.create(rename_table(joined_table, left_outputs.size, suffix))
880
+ else
881
+ DataFrame.create(joined_table)
882
+ end
883
+ if force_order
884
+ dataframe =
885
+ dataframe
886
+ .sort(left_index, right_index)
887
+ .drop(left_index, right_index)
888
+ end
889
+ dataframe.pick do
890
+ [right_keys, keys.map(&:to_s) - right_keys]
891
+ end
264
892
  end
265
- DataFrame.new(table_output[selected_indexes])
266
- .assign(*join_keys) { merged_columns }
267
893
  end
268
894
 
269
895
  private
270
896
 
271
- def merge_column(column1, column2, type)
272
- a1 = column1.to_a
273
- a2 = column2.to_a
274
- if type == :full_outer
275
- a1.zip(a2).map { |x, y| x || y }
276
- elsif type.start_with?('right')
277
- a2
278
- else # :inner or :left-*
279
- a1
280
- end
897
+ # Rename duplicate keys by suffix
898
+ def rename_table(joined_table, n_keys, suffix)
899
+ joined_keys = joined_table.keys
900
+ other_keys = joined_keys[n_keys..]
901
+
902
+ dup_keys = joined_keys.tally.select { |_, v| v > 1 }.keys
903
+ renamed_right_keys =
904
+ other_keys.map do |key|
905
+ if dup_keys.include?(key)
906
+ new_key = nil
907
+ loop do
908
+ new_key = "#{key}#{suffix}"
909
+ break unless joined_keys.include?(new_key)
910
+
911
+ s = suffix.succ
912
+ raise DataFrameArgumentError, "suffix #{suffix} is invalid" if s == suffix
913
+
914
+ suffix = s
915
+ end
916
+ new_key
917
+ else
918
+ key
919
+ end
920
+ end
921
+ joined_keys[n_keys..] = renamed_right_keys
922
+
923
+ fields =
924
+ joined_keys.map.with_index do |k, i|
925
+ Arrow::Field.new(k, joined_table[i].data_type)
926
+ end
927
+ Arrow::Table.new(Arrow::Schema.new(fields), joined_table.columns)
928
+ end
929
+
930
+ # Merge two Arrow::Arrays
931
+ def merge_array(array1, array2)
932
+ t = Arrow::Function.find(:is_null).execute([array1])
933
+ Arrow::Function.find(:if_else).execute([t, array2, array1]).value
281
934
  end
282
935
  end
283
936
  end