red_amber 0.3.0 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.rubocop.yml +39 -20
- data/.yardopts +2 -0
- data/CHANGELOG.md +113 -0
- data/Gemfile +1 -1
- data/LICENSE +1 -1
- data/README.md +25 -26
- data/benchmark/basic.yml +2 -2
- data/benchmark/combine.yml +2 -2
- data/benchmark/dataframe.yml +2 -2
- data/benchmark/group.yml +2 -2
- data/benchmark/reshape.yml +2 -2
- data/benchmark/vector.yml +3 -0
- data/doc/DataFrame.md +32 -12
- data/doc/DataFrame_Comparison.md +65 -0
- data/doc/SubFrames.md +11 -0
- data/doc/Vector.md +207 -1
- data/doc/yard-templates/default/fulldoc/html/css/common.css +6 -0
- data/lib/red_amber/data_frame.rb +429 -75
- data/lib/red_amber/data_frame_combinable.rb +516 -66
- data/lib/red_amber/data_frame_displayable.rb +244 -14
- data/lib/red_amber/data_frame_indexable.rb +121 -18
- data/lib/red_amber/data_frame_loadsave.rb +78 -10
- data/lib/red_amber/data_frame_reshaping.rb +184 -14
- data/lib/red_amber/data_frame_selectable.rb +622 -66
- data/lib/red_amber/data_frame_variable_operation.rb +446 -34
- data/lib/red_amber/group.rb +187 -22
- data/lib/red_amber/helper.rb +70 -10
- data/lib/red_amber/refinements.rb +12 -5
- data/lib/red_amber/subframes.rb +1066 -0
- data/lib/red_amber/vector.rb +385 -11
- data/lib/red_amber/vector_aggregation.rb +312 -0
- data/lib/red_amber/vector_binary_element_wise.rb +387 -0
- data/lib/red_amber/vector_selectable.rb +217 -12
- data/lib/red_amber/vector_unary_element_wise.rb +436 -0
- data/lib/red_amber/vector_updatable.rb +278 -34
- data/lib/red_amber/version.rb +2 -1
- data/lib/red_amber.rb +13 -1
- data/red_amber.gemspec +2 -2
- metadata +13 -8
- data/doc/image/dataframe/reshaping_DataFrames.png +0 -0
- data/lib/red_amber/vector_functions.rb +0 -242
@@ -1,17 +1,36 @@
|
|
1
1
|
# frozen_string_literal: true
|
2
2
|
|
3
3
|
module RedAmber
|
4
|
-
#
|
4
|
+
# Mix-in for the class DataFrame
|
5
5
|
module DataFrameCombinable
|
6
6
|
# Refinements for Arrow::Table
|
7
7
|
using RefineArrowTable
|
8
8
|
|
9
|
-
# Concatenate other
|
9
|
+
# Concatenate other dataframes or tables onto the bottom of self.
|
10
10
|
#
|
11
|
+
# @note the `#types` must be the same as `other#types`.
|
11
12
|
# @param other [DataFrame, Arrow::Table, Array<DataFrame, Arrow::Table>]
|
12
|
-
#
|
13
|
+
# DataFrames or Tables to concatenate.
|
13
14
|
# @return [DataFrame]
|
14
|
-
#
|
15
|
+
# concatenated dataframe.
|
16
|
+
# @example
|
17
|
+
# df = DataFrame.new(x: [1, 2], y: ['A', 'B'])
|
18
|
+
# other = DataFrame.new(x: [3, 4], y: ['C', 'D'])
|
19
|
+
# [df.types, other.types]
|
20
|
+
#
|
21
|
+
# # =>
|
22
|
+
# [[:uint8, :string], [:uint8, :string]]
|
23
|
+
#
|
24
|
+
# df.concatenate(other)
|
25
|
+
#
|
26
|
+
# # =>
|
27
|
+
# x y
|
28
|
+
# <uint8> <string>
|
29
|
+
# 0 1 A
|
30
|
+
# 1 2 B
|
31
|
+
# 2 3 C
|
32
|
+
# 3 4 D
|
33
|
+
#
|
15
34
|
def concatenate(*other)
|
16
35
|
case other
|
17
36
|
in [] | [nil] | [[]]
|
@@ -39,14 +58,27 @@ module RedAmber
|
|
39
58
|
alias_method :concat, :concatenate
|
40
59
|
alias_method :bind_rows, :concatenate
|
41
60
|
|
42
|
-
# Merge other
|
43
|
-
#
|
44
|
-
#
|
45
|
-
#
|
61
|
+
# Merge other DataFrames or Tables.
|
62
|
+
#
|
63
|
+
# @note the `#size` must be the same as `other#size`.
|
64
|
+
# @note self and other must not share the same key.
|
46
65
|
# @param other [DataFrame, Arrow::Table, Array<DataFrame, Arrow::Table>]
|
47
|
-
#
|
66
|
+
# DataFrames or Tables to merge.
|
67
|
+
# @raise [DataFrameArgumentError]
|
68
|
+
# if size is not the same or self and other share the same key.
|
48
69
|
# @return [DataFrame]
|
49
|
-
#
|
70
|
+
# merged dataframe.
|
71
|
+
# @example
|
72
|
+
# df = DataFrame.new(x: [1, 2], y: [3, 4])
|
73
|
+
# other = DataFrame.new(a: ['A', 'B'], b: ['C', 'D'])
|
74
|
+
# df.merge(other)
|
75
|
+
#
|
76
|
+
# # =>
|
77
|
+
# x y a b
|
78
|
+
# <uint8> <uint8> <string> <string>
|
79
|
+
# 0 1 3 A C
|
80
|
+
# 1 2 4 B D
|
81
|
+
#
|
50
82
|
def merge(*other)
|
51
83
|
case other
|
52
84
|
in [] | [nil] | [[]]
|
@@ -85,32 +117,105 @@ module RedAmber
|
|
85
117
|
|
86
118
|
# Mutating joins (#inner_join, #full_join, #left_join, #right_join)
|
87
119
|
|
88
|
-
# Join another DataFrame or Table, leaving only the matching records.
|
89
|
-
# - Same as `#join` with `type: :inner`
|
90
|
-
# - A kind of mutating join.
|
91
|
-
#
|
92
120
|
# @!macro join_before
|
93
121
|
# @param other [DataFrame, Arrow::Table]
|
94
122
|
# A DataFrame or a Table to be joined with self.
|
95
123
|
#
|
124
|
+
# @!macro join_dorce_order
|
125
|
+
# @param force_order [Boolean]
|
126
|
+
# whether to force the order of the output to always be the same.
|
127
|
+
# - This option is used in `:full_outer` and `:right_outer`.
|
128
|
+
# - If this option is true (by default) it will append index to the source
|
129
|
+
# and sort after joining. It will cause some degradation in performance.
|
130
|
+
#
|
96
131
|
# @!macro join_after
|
97
132
|
# @param suffix [#succ]
|
98
133
|
# a suffix to rename keys when key names conflict as a result of join.
|
99
134
|
# `suffix` must respond to `#succ`.
|
100
135
|
# @return [DataFrame]
|
101
|
-
#
|
136
|
+
# joined dataframe.
|
102
137
|
#
|
103
138
|
# @!macro join_key_in_array
|
104
139
|
# @param join_keys [String, Symbol, Array<String, Symbol>]
|
105
|
-
#
|
140
|
+
# a key or keys to match.
|
106
141
|
#
|
107
142
|
# @!macro join_key_in_hash
|
108
143
|
# @param join_key_pairs [Hash]
|
109
|
-
#
|
144
|
+
# pairs of a key name or key names to match in left and right.
|
110
145
|
# @option join_key_pairs [String, Symbol, Array<String, Symbol>] :left
|
111
|
-
#
|
146
|
+
# join keys in `self`.
|
112
147
|
# @option join_key_pairs [String, Symbol, Array<String, Symbol>] :right
|
113
|
-
#
|
148
|
+
# join keys in `other`.
|
149
|
+
#
|
150
|
+
# @!macro join_common_example_1
|
151
|
+
# @example
|
152
|
+
# df = DataFrame.new(KEY: %w[A B C], X1: [1, 2, 3])
|
153
|
+
#
|
154
|
+
# # =>
|
155
|
+
# KEY X1
|
156
|
+
# <string> <uint8>
|
157
|
+
# 0 A 1
|
158
|
+
# 1 B 2
|
159
|
+
# 2 C 3
|
160
|
+
#
|
161
|
+
# other = DataFrame.new(KEY: %w[A B D], X2: [true, false, nil])
|
162
|
+
#
|
163
|
+
# # =>
|
164
|
+
# KEY X2
|
165
|
+
# <string> <boolean>
|
166
|
+
# 0 A true
|
167
|
+
# 1 B false
|
168
|
+
# 2 D (nil)
|
169
|
+
#
|
170
|
+
# @!macro join_common_example_2
|
171
|
+
# @example
|
172
|
+
# df2 = DataFrame.new(KEY1: %w[A B C], X1: [1, 2, 3])
|
173
|
+
#
|
174
|
+
# # =>
|
175
|
+
# KEY1 X1
|
176
|
+
# <string> <uint8>
|
177
|
+
# 0 A 1
|
178
|
+
# 1 B 2
|
179
|
+
# 2 C 3
|
180
|
+
#
|
181
|
+
# other2 = DataFrame.new(KEY2: %w[A B D], X2: [true, false, nil])
|
182
|
+
#
|
183
|
+
# # =>
|
184
|
+
# KEY2 X2
|
185
|
+
# <string> <boolean>
|
186
|
+
# 0 A true
|
187
|
+
# 1 B false
|
188
|
+
# 2 D (nil)
|
189
|
+
#
|
190
|
+
# @!macro join_common_example_3
|
191
|
+
# @example
|
192
|
+
# df3 = DataFrame.new(
|
193
|
+
# KEY1: %w[A B C],
|
194
|
+
# KEY2: [1, 2, 3]
|
195
|
+
# )
|
196
|
+
#
|
197
|
+
# # =>
|
198
|
+
# KEY1 KEY2
|
199
|
+
# <string> <uint8>
|
200
|
+
# 0 A 1
|
201
|
+
# 1 B 2
|
202
|
+
# 2 C 3
|
203
|
+
#
|
204
|
+
# other3 = DataFrame.new(
|
205
|
+
# KEY1: %w[A B D],
|
206
|
+
# KEY2: [1, 4, 5]
|
207
|
+
# )
|
208
|
+
#
|
209
|
+
# # =>
|
210
|
+
# KEY1 KEY2
|
211
|
+
# <string> <uint8>
|
212
|
+
# 0 A 1
|
213
|
+
# 1 B 4
|
214
|
+
# 2 D 5
|
215
|
+
|
216
|
+
# Join another DataFrame or Table, leaving only the matching records.
|
217
|
+
# - Same as `#join` with `type: :inner`
|
218
|
+
# - A kind of mutating join.
|
114
219
|
#
|
115
220
|
# @overload inner_join(other, suffix: '.1')
|
116
221
|
# If `join_key` is not specified, common keys in self and other are used
|
@@ -118,18 +223,45 @@ module RedAmber
|
|
118
223
|
#
|
119
224
|
# @macro join_before
|
120
225
|
# @macro join_after
|
226
|
+
# @macro join_common_example_1
|
227
|
+
# @example without key (use implicit common key)
|
228
|
+
# df.inner_join(other)
|
229
|
+
#
|
230
|
+
# # =>
|
231
|
+
# KEY X1 X2
|
232
|
+
# <string> <uint8> <boolean>
|
233
|
+
# 0 A 1 true
|
234
|
+
# 1 B 2 false
|
121
235
|
#
|
122
236
|
# @overload inner_join(other, join_keys, suffix: '.1')
|
123
237
|
#
|
124
238
|
# @macro join_before
|
125
239
|
# @macro join_key_in_array
|
126
240
|
# @macro join_after
|
241
|
+
# @macro join_common_example_1
|
242
|
+
# @example with a key
|
243
|
+
# df.inner_join(other, :KEY)
|
244
|
+
#
|
245
|
+
# # =>
|
246
|
+
# KEY X1 X2
|
247
|
+
# <string> <uint8> <boolean>
|
248
|
+
# 0 A 1 true
|
249
|
+
# 1 B 2 false
|
127
250
|
#
|
128
251
|
# @overload inner_join(other, join_key_pairs, suffix: '.1')
|
129
252
|
#
|
130
253
|
# @macro join_before
|
131
254
|
# @macro join_key_in_hash
|
132
255
|
# @macro join_after
|
256
|
+
# @macro join_common_example_2
|
257
|
+
# @example with key pairs
|
258
|
+
# df2.inner_join(other2, { left: :KEY1, right: :KEY2 })
|
259
|
+
#
|
260
|
+
# # =>
|
261
|
+
# KEY1 X1 X2
|
262
|
+
# <string> <uint8> <boolean>
|
263
|
+
# 0 A 1 true
|
264
|
+
# 1 B 2 false
|
133
265
|
#
|
134
266
|
def inner_join(other, join_keys = nil, suffix: '.1')
|
135
267
|
join(other, join_keys, type: :inner, suffix: suffix)
|
@@ -139,27 +271,64 @@ module RedAmber
|
|
139
271
|
# - Same as `#join` with `type: :full_outer`
|
140
272
|
# - A kind of mutating join.
|
141
273
|
#
|
142
|
-
# @overload full_join(other, suffix: '.1')
|
274
|
+
# @overload full_join(other, suffix: '.1', force_order: true)
|
143
275
|
# If `join_key` is not specified, common keys in self and other are used
|
144
276
|
# (natural keys). Returns joined dataframe.
|
145
277
|
#
|
146
278
|
# @macro join_before
|
279
|
+
# @macro join_dorce_order
|
147
280
|
# @macro join_after
|
281
|
+
# @macro join_common_example_1
|
282
|
+
# @example without key (use implicit common key)
|
283
|
+
# df.full_join(other)
|
284
|
+
#
|
285
|
+
# # =>
|
286
|
+
# KEY X1 X2
|
287
|
+
# <string> <uint8> <boolean>
|
288
|
+
# 0 A 1 true
|
289
|
+
# 1 B 2 false
|
290
|
+
# 2 C 3 (nil)
|
291
|
+
# 3 D (nil) (nil)
|
148
292
|
#
|
149
|
-
# @overload full_join(other, join_keys, suffix: '.1')
|
293
|
+
# @overload full_join(other, join_keys, suffix: '.1', force_order: true)
|
150
294
|
#
|
151
295
|
# @macro join_before
|
152
296
|
# @macro join_key_in_array
|
297
|
+
# @macro join_dorce_order
|
153
298
|
# @macro join_after
|
299
|
+
# @macro join_common_example_1
|
300
|
+
# @example with a key
|
301
|
+
# df.full_join(other, :KEY)
|
154
302
|
#
|
155
|
-
#
|
303
|
+
# # =>
|
304
|
+
# KEY X1 X2
|
305
|
+
# <string> <uint8> <boolean>
|
306
|
+
# 0 A 1 true
|
307
|
+
# 1 B 2 false
|
308
|
+
# 2 C 3 (nil)
|
309
|
+
# 3 D (nil) (nil)
|
310
|
+
#
|
311
|
+
# @overload full_join(other, join_key_pairs, suffix: '.1', force_order: true)
|
156
312
|
#
|
157
313
|
# @macro join_before
|
158
314
|
# @macro join_key_in_hash
|
315
|
+
# @macro join_dorce_order
|
159
316
|
# @macro join_after
|
160
|
-
#
|
161
|
-
|
162
|
-
|
317
|
+
# @macro join_common_example_2
|
318
|
+
# @example with key pairs
|
319
|
+
# df2.full_join(other2, { left: :KEY1, right: :KEY2 })
|
320
|
+
#
|
321
|
+
# # =>
|
322
|
+
# KEY1 X1 X2
|
323
|
+
# <string> <uint8> <boolean>
|
324
|
+
# 0 A 1 true
|
325
|
+
# 1 B 2 false
|
326
|
+
# 2 C 3 (nil)
|
327
|
+
# 3 D (nil) (nil)
|
328
|
+
#
|
329
|
+
def full_join(other, join_keys = nil, suffix: '.1', force_order: true)
|
330
|
+
join(other, join_keys,
|
331
|
+
type: :full_outer, suffix: suffix, force_order: force_order)
|
163
332
|
end
|
164
333
|
|
165
334
|
alias_method :outer_join, :full_join
|
@@ -174,18 +343,48 @@ module RedAmber
|
|
174
343
|
#
|
175
344
|
# @macro join_before
|
176
345
|
# @macro join_after
|
346
|
+
# @macro join_common_example_1
|
347
|
+
# @example without key (use implicit common key)
|
348
|
+
# df.left_join(other)
|
349
|
+
#
|
350
|
+
# # =>
|
351
|
+
# KEY X1 X2
|
352
|
+
# <string> <uint8> <boolean>
|
353
|
+
# 0 A 1 true
|
354
|
+
# 1 B 2 false
|
355
|
+
# 2 C 3 (nil)
|
177
356
|
#
|
178
357
|
# @overload left_join(other, join_keys, suffix: '.1')
|
179
358
|
#
|
180
359
|
# @macro join_before
|
181
360
|
# @macro join_key_in_array
|
182
361
|
# @macro join_after
|
362
|
+
# @macro join_common_example_1
|
363
|
+
# @example with a key
|
364
|
+
# df.left_join(other, :KEY)
|
365
|
+
#
|
366
|
+
# # =>
|
367
|
+
# KEY X1 X2
|
368
|
+
# <string> <uint8> <boolean>
|
369
|
+
# 0 A 1 true
|
370
|
+
# 1 B 2 false
|
371
|
+
# 2 C 3 (nil)
|
183
372
|
#
|
184
373
|
# @overload left_join(other, join_key_pairs, suffix: '.1')
|
185
374
|
#
|
186
375
|
# @macro join_before
|
187
376
|
# @macro join_key_in_hash
|
188
377
|
# @macro join_after
|
378
|
+
# @macro join_common_example_2
|
379
|
+
# @example with key pairs
|
380
|
+
# df2.left_join(other2, { left: :KEY1, right: :KEY2 })
|
381
|
+
#
|
382
|
+
# # =>
|
383
|
+
# KEY1 X1 X2
|
384
|
+
# <string> <uint8> <boolean>
|
385
|
+
# 0 A 1 true
|
386
|
+
# 1 B 2 false
|
387
|
+
# 2 C 3 (nil)
|
189
388
|
#
|
190
389
|
def left_join(other, join_keys = nil, suffix: '.1')
|
191
390
|
join(other, join_keys, type: :left_outer, suffix: suffix)
|
@@ -195,27 +394,66 @@ module RedAmber
|
|
195
394
|
# - Same as `#join` with `type: :right_outer`
|
196
395
|
# - A kind of mutating join.
|
197
396
|
#
|
198
|
-
# @overload right_join(other, suffix: '.1')
|
397
|
+
# @overload right_join(other, suffix: '.1', force_order: true)
|
199
398
|
# If `join_key` is not specified, common keys in self and other are used
|
200
399
|
# (natural keys). Returns joined dataframe.
|
201
400
|
#
|
202
401
|
# @macro join_before
|
402
|
+
# @macro join_dorce_order
|
203
403
|
# @macro join_after
|
404
|
+
# @macro join_common_example_1
|
405
|
+
# @example without key (use implicit common key)
|
406
|
+
# df.right_join(other)
|
407
|
+
#
|
408
|
+
# # =>
|
409
|
+
# KEY X1 X2
|
410
|
+
# <string> <uint8> <boolean>
|
411
|
+
# 0 A 1 true
|
412
|
+
# 1 B 2 false
|
413
|
+
# 2 D (nil) (nil)
|
204
414
|
#
|
205
|
-
# @overload right_join(other, join_keys, suffix: '.1')
|
415
|
+
# @overload right_join(other, join_keys, suffix: '.1', force_order: true)
|
206
416
|
#
|
207
417
|
# @macro join_before
|
208
418
|
# @macro join_key_in_array
|
419
|
+
# @macro join_dorce_order
|
209
420
|
# @macro join_after
|
421
|
+
# @macro join_common_example_1
|
422
|
+
# @example with a key
|
423
|
+
# df.right_join(other, :KEY)
|
210
424
|
#
|
211
|
-
#
|
425
|
+
# # =>
|
426
|
+
# KEY X1 X2
|
427
|
+
# <string> <uint8> <boolean>
|
428
|
+
# 0 A 1 true
|
429
|
+
# 1 B 2 false
|
430
|
+
# 2 D (nil) (nil)
|
431
|
+
#
|
432
|
+
# @overload right_join(other, join_key_pairs, suffix: '.1', force_order: true)
|
212
433
|
#
|
213
434
|
# @macro join_before
|
214
435
|
# @macro join_key_in_hash
|
436
|
+
# @macro join_dorce_order
|
215
437
|
# @macro join_after
|
216
|
-
#
|
217
|
-
|
218
|
-
|
438
|
+
# @macro join_common_example_2
|
439
|
+
# @example with key pairs
|
440
|
+
# df2.right_join(other2, { left: :KEY1, right: :KEY2 })
|
441
|
+
#
|
442
|
+
# # =>
|
443
|
+
# KEY1 X1 X2
|
444
|
+
# <string> <uint8> <boolean>
|
445
|
+
# 0 A 1 true
|
446
|
+
# 1 B 2 false
|
447
|
+
# 2 D (nil) (nil)
|
448
|
+
#
|
449
|
+
def right_join(other, join_keys = nil, suffix: '.1', force_order: true)
|
450
|
+
join(
|
451
|
+
other,
|
452
|
+
join_keys,
|
453
|
+
type: :right_outer,
|
454
|
+
suffix: suffix,
|
455
|
+
force_order: force_order
|
456
|
+
)
|
219
457
|
end
|
220
458
|
|
221
459
|
# Filtering joins (#semi_join, #anti_join)
|
@@ -230,18 +468,45 @@ module RedAmber
|
|
230
468
|
#
|
231
469
|
# @macro join_before
|
232
470
|
# @macro join_after
|
471
|
+
# @macro join_common_example_1
|
472
|
+
# @example without key (use implicit common key)
|
473
|
+
# df.semi_join(other)
|
474
|
+
#
|
475
|
+
# # =>
|
476
|
+
# KEY X1
|
477
|
+
# <string> <uint8>
|
478
|
+
# 0 A 1
|
479
|
+
# 1 B 2
|
233
480
|
#
|
234
481
|
# @overload semi_join(other, join_keys, suffix: '.1')
|
235
482
|
#
|
236
483
|
# @macro join_before
|
237
484
|
# @macro join_key_in_array
|
238
485
|
# @macro join_after
|
486
|
+
# @macro join_common_example_1
|
487
|
+
# @example with a key
|
488
|
+
# df.semi_join(other, :KEY)
|
489
|
+
#
|
490
|
+
# # =>
|
491
|
+
# KEY X1
|
492
|
+
# <string> <uint8>
|
493
|
+
# 0 A 1
|
494
|
+
# 1 B 2
|
239
495
|
#
|
240
496
|
# @overload semi_join(other, join_key_pairs, suffix: '.1')
|
241
497
|
#
|
242
498
|
# @macro join_before
|
243
499
|
# @macro join_key_in_hash
|
244
500
|
# @macro join_after
|
501
|
+
# @macro join_common_example_2
|
502
|
+
# @example with key pairs
|
503
|
+
# df2.semi_join(other2, { left: :KEY1, right: :KEY2 })
|
504
|
+
#
|
505
|
+
# # =>
|
506
|
+
# KEY1 X1
|
507
|
+
# <string> <uint8>
|
508
|
+
# 0 A 1
|
509
|
+
# 1 B 2
|
245
510
|
#
|
246
511
|
def semi_join(other, join_keys = nil, suffix: '.1')
|
247
512
|
join(other, join_keys, type: :left_semi, suffix: suffix)
|
@@ -257,18 +522,42 @@ module RedAmber
|
|
257
522
|
#
|
258
523
|
# @macro join_before
|
259
524
|
# @macro join_after
|
525
|
+
# @macro join_common_example_1
|
526
|
+
# @example without key (use implicit common key)
|
527
|
+
# df.anti_join(other)
|
528
|
+
#
|
529
|
+
# # =>
|
530
|
+
# KEY X1
|
531
|
+
# <string> <uint8>
|
532
|
+
# 0 C 3
|
260
533
|
#
|
261
534
|
# @overload anti_join(other, join_keys, suffix: '.1')
|
262
535
|
#
|
263
536
|
# @macro join_before
|
264
537
|
# @macro join_key_in_array
|
265
538
|
# @macro join_after
|
539
|
+
# @macro join_common_example_1
|
540
|
+
# @example with a key
|
541
|
+
# df.anti_join(other, :KEY)
|
542
|
+
#
|
543
|
+
# # =>
|
544
|
+
# KEY X1
|
545
|
+
# <string> <uint8>
|
546
|
+
# 0 C 3
|
266
547
|
#
|
267
548
|
# @overload anti_join(other, join_key_pairs, suffix: '.1')
|
268
549
|
#
|
269
550
|
# @macro join_before
|
270
551
|
# @macro join_key_in_hash
|
271
552
|
# @macro join_after
|
553
|
+
# @macro join_common_example_2
|
554
|
+
# @example with key pairs
|
555
|
+
# df2.anti_join(other2, { left: :KEY1, right: :KEY2 })
|
556
|
+
#
|
557
|
+
# # =>
|
558
|
+
# KEY1 X1
|
559
|
+
# <string> <uint8>
|
560
|
+
# 0 C 3
|
272
561
|
#
|
273
562
|
def anti_join(other, join_keys = nil, suffix: '.1')
|
274
563
|
join(other, join_keys, type: :left_anti, suffix: suffix)
|
@@ -279,8 +568,11 @@ module RedAmber
|
|
279
568
|
# Check if set operation with self and other is possible.
|
280
569
|
#
|
281
570
|
# @macro join_before
|
282
|
-
#
|
283
|
-
#
|
571
|
+
# @return [Boolean]
|
572
|
+
# true if set operation is possible.
|
573
|
+
# @macro join_common_example_3
|
574
|
+
# @example
|
575
|
+
# df3.set_operable?(other3) # => true
|
284
576
|
#
|
285
577
|
def set_operable?(other) # rubocop:disable Naming/AccessorMethodName
|
286
578
|
keys == other.keys.map(&:to_sym)
|
@@ -291,8 +583,16 @@ module RedAmber
|
|
291
583
|
# - A kind of set operations.
|
292
584
|
#
|
293
585
|
# @macro join_before
|
586
|
+
# @return [DataFrame]
|
587
|
+
# joined dataframe.
|
588
|
+
# @macro join_common_example_3
|
589
|
+
# @example
|
590
|
+
# df3.intersect(other3)
|
294
591
|
#
|
295
|
-
#
|
592
|
+
# # =>
|
593
|
+
# KEY1 KEY2
|
594
|
+
# <string> <uint8>
|
595
|
+
# 0 A 1
|
296
596
|
#
|
297
597
|
def intersect(other)
|
298
598
|
unless keys == other.keys.map(&:to_sym)
|
@@ -307,8 +607,20 @@ module RedAmber
|
|
307
607
|
# - A kind of set operations.
|
308
608
|
#
|
309
609
|
# @macro join_before
|
310
|
-
#
|
311
|
-
#
|
610
|
+
# @return [DataFrame]
|
611
|
+
# joined dataframe.
|
612
|
+
# @macro join_common_example_3
|
613
|
+
# @example
|
614
|
+
# df3.union(other3)
|
615
|
+
#
|
616
|
+
# # =>
|
617
|
+
# KEY1 KEY2
|
618
|
+
# <string> <uint8>
|
619
|
+
# 0 A 1
|
620
|
+
# 1 B 2
|
621
|
+
# 2 C 3
|
622
|
+
# 3 B 4
|
623
|
+
# 4 D 5
|
312
624
|
#
|
313
625
|
def union(other)
|
314
626
|
unless keys == other.keys.map(&:to_sym)
|
@@ -323,8 +635,25 @@ module RedAmber
|
|
323
635
|
# - A kind of set operations.
|
324
636
|
#
|
325
637
|
# @macro join_before
|
638
|
+
# @return [DataFrame]
|
639
|
+
# joined dataframe.
|
640
|
+
# @macro join_common_example_3
|
641
|
+
# @example
|
642
|
+
# df3.difference(other3)
|
643
|
+
#
|
644
|
+
# # =>
|
645
|
+
# KEY1 KEY2
|
646
|
+
# <string> <uint8>
|
647
|
+
# 0 B 2
|
648
|
+
# 1 C 3
|
326
649
|
#
|
327
|
-
#
|
650
|
+
# other3.difference(df3)
|
651
|
+
#
|
652
|
+
# # =>
|
653
|
+
# KEY1 KEY2
|
654
|
+
# <string> <uint8>
|
655
|
+
# 0 B 4
|
656
|
+
# 1 D 5
|
328
657
|
#
|
329
658
|
def difference(other)
|
330
659
|
unless keys == other.keys.map(&:to_sym)
|
@@ -338,46 +667,153 @@ module RedAmber
|
|
338
667
|
|
339
668
|
# Join another DataFrame or Table to self.
|
340
669
|
#
|
341
|
-
#
|
670
|
+
# @!macro join_common_type
|
671
|
+
# @param type [:left_semi, :right_semi, :left_anti, :right_anti, :inner,
|
672
|
+
# :left_outer, :right_outer, :full_outer] type of join.
|
673
|
+
#
|
674
|
+
# @!macro join_common_example_4
|
675
|
+
# @example
|
676
|
+
# df4 = DataFrame.new(
|
677
|
+
# X1: %w[A B C],
|
678
|
+
# Y: %w[D E F]
|
679
|
+
# )
|
680
|
+
#
|
681
|
+
# # =>
|
682
|
+
# X1 Y
|
683
|
+
# <string> <string>
|
684
|
+
# 0 A D
|
685
|
+
# 1 B E
|
686
|
+
# 2 C F
|
687
|
+
#
|
688
|
+
# other4 = DataFrame.new(
|
689
|
+
# X2: %w[A B D],
|
690
|
+
# Y: %w[e E E]
|
691
|
+
# )
|
692
|
+
#
|
693
|
+
# # =>
|
694
|
+
# X2 Y
|
695
|
+
# <string> <string>
|
696
|
+
# 0 A e
|
697
|
+
# 1 B E
|
698
|
+
# 2 D E
|
699
|
+
|
700
|
+
# @note the order of joined results will be preserved by default.
|
701
|
+
# This is enabled by appending index column to sort after joining but
|
702
|
+
# it will cause some performance degradation. If you don't care about
|
703
|
+
# the order of the result, set `force_order` option to `false`.
|
704
|
+
#
|
705
|
+
# @overload join(other, type: :inner, suffix: '.1', force_order: true)
|
342
706
|
#
|
343
707
|
# If `join_key` is not specified, common keys in self and other are used
|
344
708
|
# (natural keys). Returns joined dataframe.
|
345
709
|
#
|
346
|
-
# @!macro join_common_type
|
347
|
-
# @param type [:left_semi, :right_semi, :left_anti, :right_anti, :inner,
|
348
|
-
# left_outer, :right_outer, :full_outer] type of join.
|
349
|
-
#
|
350
710
|
# @macro join_before
|
351
711
|
# @macro join_common_type
|
712
|
+
# @macro join_dorce_order
|
352
713
|
# @macro join_after
|
714
|
+
# @macro join_common_example_1
|
715
|
+
# @example
|
716
|
+
# df.join(other)
|
353
717
|
#
|
354
|
-
#
|
718
|
+
# # =>
|
719
|
+
# KEY X1 X2
|
720
|
+
# <string> <uint8> <boolean>
|
721
|
+
# 0 A 1 true
|
722
|
+
# 1 B 2 false
|
723
|
+
#
|
724
|
+
# df.join(other, type: :full_outer)
|
725
|
+
#
|
726
|
+
# # =>
|
727
|
+
# KEY X1 X2
|
728
|
+
# <string> <uint8> <boolean>
|
729
|
+
# 0 A 1 true
|
730
|
+
# 1 B 2 false
|
731
|
+
# 2 C 3 (nil)
|
732
|
+
# 3 D (nil) (nil)
|
733
|
+
#
|
734
|
+
# @overload join(other, join_keys, type: :inner, suffix: '.1', force_order: true)
|
355
735
|
#
|
356
736
|
# @macro join_before
|
357
737
|
# @macro join_key_in_array
|
358
738
|
# @macro join_common_type
|
739
|
+
# @macro join_dorce_order
|
359
740
|
# @macro join_after
|
741
|
+
# @macro join_common_example_3
|
742
|
+
# @example join keys in an Array
|
743
|
+
# df3.join(other3, [:KEY1, :KEY2])
|
744
|
+
#
|
745
|
+
# # =>
|
746
|
+
# KEY1 KEY2
|
747
|
+
# <string> <uint8>
|
748
|
+
# 0 A 1
|
360
749
|
#
|
361
|
-
#
|
750
|
+
# @example partial join key and suffix
|
751
|
+
# df3.join(other3, :KEY1, suffix: '.a')
|
752
|
+
#
|
753
|
+
# # =>
|
754
|
+
# KEY1 KEY2 KEY2.a
|
755
|
+
# <string> <uint8> <uint8>
|
756
|
+
# 0 A 1 1
|
757
|
+
# 1 B 2 4
|
758
|
+
#
|
759
|
+
# @overload join(other, join_key_pairs, type: :inner, suffix: '.1', force_order: true)
|
362
760
|
#
|
363
761
|
# @macro join_before
|
364
762
|
# @macro join_key_in_hash
|
365
763
|
# @macro join_common_type
|
764
|
+
# @macro join_dorce_order
|
366
765
|
# @macro join_after
|
367
|
-
#
|
368
|
-
|
369
|
-
|
370
|
-
|
371
|
-
|
372
|
-
|
373
|
-
|
766
|
+
# @macro join_common_example_4
|
767
|
+
# @example without options
|
768
|
+
# df4.join(other4)
|
769
|
+
#
|
770
|
+
# # =>
|
771
|
+
# X1 Y X2
|
772
|
+
# <string> <string> <string>
|
773
|
+
# 0 B E D
|
774
|
+
# 1 B E B
|
775
|
+
#
|
776
|
+
# @example join by key pairs
|
777
|
+
# df4.join(other4, { left: [:X1, :Y], right: [:X2, :Y] })
|
778
|
+
#
|
779
|
+
# # =>
|
780
|
+
# X1 Y
|
781
|
+
# <string> <string>
|
782
|
+
# 0 B E
|
783
|
+
#
|
784
|
+
# @example join by key pairs, using renaming by suffix
|
785
|
+
# df4.join(other4, { left: :X1, right: :X2 })
|
786
|
+
#
|
787
|
+
# # =>
|
788
|
+
# X1 Y Y.1
|
789
|
+
# <string> <string> <string>
|
790
|
+
# 0 A D e
|
791
|
+
# 1 B E E
|
792
|
+
#
|
793
|
+
def join(other, join_keys = nil, type: :inner, suffix: '.1', force_order: true)
|
794
|
+
right_table =
|
795
|
+
case other
|
796
|
+
when DataFrame
|
797
|
+
other.table
|
798
|
+
when Arrow::Table
|
799
|
+
other
|
800
|
+
else
|
801
|
+
raise DataFrameArgumentError, 'other must be a DataFrame or an Arrow::Table'
|
802
|
+
end
|
803
|
+
|
804
|
+
type = type.to_sym
|
805
|
+
left_index = :__LEFT_INDEX__
|
806
|
+
right_index = :__RIGHT_INDEX__
|
807
|
+
if force_order && %i[full_outer right_outer].include?(type)
|
808
|
+
left_table = assign(left_index) { indices }.table
|
809
|
+
other = DataFrame.create(other) if other.is_a?(Arrow::Table)
|
810
|
+
right_table = other.assign(right_index) { indices }.table
|
374
811
|
else
|
375
|
-
|
812
|
+
left_table = table
|
376
813
|
end
|
377
814
|
|
378
|
-
table_keys =
|
379
|
-
other_keys =
|
380
|
-
type = type.to_sym
|
815
|
+
table_keys = left_table.keys
|
816
|
+
other_keys = right_table.keys
|
381
817
|
|
382
818
|
# natural keys (implicit common keys)
|
383
819
|
join_keys ||= table_keys.intersection(other_keys)
|
@@ -407,10 +843,13 @@ module RedAmber
|
|
407
843
|
|
408
844
|
# Should we rescue errors in Arrow::Table#join for usability ?
|
409
845
|
joined_table =
|
410
|
-
|
411
|
-
|
412
|
-
|
413
|
-
|
846
|
+
left_table.join(
|
847
|
+
right_table,
|
848
|
+
join_keys,
|
849
|
+
type: type,
|
850
|
+
left_outputs: left_outputs,
|
851
|
+
right_outputs: right_outputs
|
852
|
+
)
|
414
853
|
|
415
854
|
case type
|
416
855
|
when :inner, :left_outer, :left_semi, :left_anti, :right_semi, :right_anti
|
@@ -423,20 +862,31 @@ module RedAmber
|
|
423
862
|
renamed_table = rename_table(joined_table, n_keys, suffix)
|
424
863
|
renamed_keys = renamed_table.keys
|
425
864
|
dropper = []
|
426
|
-
DataFrame.create(renamed_table).assign do |df|
|
865
|
+
dataframe = DataFrame.create(renamed_table).assign do |df|
|
427
866
|
left_keys.map do |left_key|
|
428
867
|
i_left_key = renamed_keys.index(left_key)
|
429
868
|
right_key = renamed_keys[i_left_key + table_keys.size]
|
430
869
|
dropper << right_key
|
431
870
|
[left_key.to_sym, merge_array(df[left_key].data, df[right_key].data)]
|
432
871
|
end
|
433
|
-
end
|
872
|
+
end
|
873
|
+
dataframe = dataframe.sort(left_index, right_index) if force_order
|
874
|
+
|
875
|
+
dataframe.drop(dropper, left_index, right_index)
|
434
876
|
when :right_outer
|
435
|
-
|
436
|
-
|
437
|
-
|
438
|
-
|
439
|
-
|
877
|
+
dataframe =
|
878
|
+
if joined_table.keys.uniq!
|
879
|
+
DataFrame.create(rename_table(joined_table, left_outputs.size, suffix))
|
880
|
+
else
|
881
|
+
DataFrame.create(joined_table)
|
882
|
+
end
|
883
|
+
if force_order
|
884
|
+
dataframe =
|
885
|
+
dataframe
|
886
|
+
.sort(left_index, right_index)
|
887
|
+
.drop(left_index, right_index)
|
888
|
+
end
|
889
|
+
dataframe.pick do
|
440
890
|
[right_keys, keys.map(&:to_s) - right_keys]
|
441
891
|
end
|
442
892
|
end
|