red_amber 0.3.0 → 0.4.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.rubocop.yml +56 -22
- data/.yardopts +2 -0
- data/CHANGELOG.md +178 -0
- data/Gemfile +1 -1
- data/LICENSE +1 -1
- data/README.md +29 -30
- data/benchmark/basic.yml +7 -7
- data/benchmark/combine.yml +3 -3
- data/benchmark/dataframe.yml +15 -9
- data/benchmark/group.yml +6 -6
- data/benchmark/reshape.yml +6 -6
- data/benchmark/vector.yml +6 -3
- data/doc/DataFrame.md +32 -12
- data/doc/DataFrame_Comparison.md +65 -0
- data/doc/SubFrames.md +11 -0
- data/doc/Vector.md +207 -1
- data/doc/yard-templates/default/fulldoc/html/css/common.css +6 -0
- data/lib/red_amber/data_frame.rb +454 -85
- data/lib/red_amber/data_frame_combinable.rb +609 -115
- data/lib/red_amber/data_frame_displayable.rb +313 -34
- data/lib/red_amber/data_frame_indexable.rb +122 -19
- data/lib/red_amber/data_frame_loadsave.rb +78 -10
- data/lib/red_amber/data_frame_reshaping.rb +184 -14
- data/lib/red_amber/data_frame_selectable.rb +623 -70
- data/lib/red_amber/data_frame_variable_operation.rb +452 -35
- data/lib/red_amber/group.rb +186 -22
- data/lib/red_amber/helper.rb +74 -14
- data/lib/red_amber/refinements.rb +26 -6
- data/lib/red_amber/subframes.rb +1101 -0
- data/lib/red_amber/vector.rb +362 -11
- data/lib/red_amber/vector_aggregation.rb +312 -0
- data/lib/red_amber/vector_binary_element_wise.rb +506 -0
- data/lib/red_amber/vector_selectable.rb +265 -23
- data/lib/red_amber/vector_unary_element_wise.rb +529 -0
- data/lib/red_amber/vector_updatable.rb +278 -34
- data/lib/red_amber/version.rb +2 -1
- data/lib/red_amber.rb +13 -1
- data/red_amber.gemspec +2 -2
- metadata +13 -8
- data/doc/image/dataframe/reshaping_DataFrames.png +0 -0
- data/lib/red_amber/vector_functions.rb +0 -242
@@ -1,17 +1,38 @@
|
|
1
1
|
# frozen_string_literal: true
|
2
2
|
|
3
3
|
module RedAmber
|
4
|
-
#
|
4
|
+
# Mix-in for the class DataFrame
|
5
5
|
module DataFrameCombinable
|
6
6
|
# Refinements for Arrow::Table
|
7
7
|
using RefineArrowTable
|
8
8
|
|
9
|
-
# Concatenate other
|
9
|
+
# Concatenate other dataframes or tables onto the bottom of self.
|
10
10
|
#
|
11
|
+
# @note the `#types` must be same as `other#types`.
|
11
12
|
# @param other [DataFrame, Arrow::Table, Array<DataFrame, Arrow::Table>]
|
12
|
-
#
|
13
|
+
# DataFrames or Tables to concatenate.
|
13
14
|
# @return [DataFrame]
|
14
|
-
#
|
15
|
+
# concatenated dataframe.
|
16
|
+
# @example
|
17
|
+
# df = DataFrame.new(x: [1, 2], y: ['A', 'B'])
|
18
|
+
# other = DataFrame.new(x: [3, 4], y: ['C', 'D'])
|
19
|
+
# [df.types, other.types]
|
20
|
+
#
|
21
|
+
# # =>
|
22
|
+
# [[:uint8, :string], [:uint8, :string]]
|
23
|
+
#
|
24
|
+
# df.concatenate(other)
|
25
|
+
#
|
26
|
+
# # =>
|
27
|
+
# x y
|
28
|
+
# <uint8> <string>
|
29
|
+
# 0 1 A
|
30
|
+
# 1 2 B
|
31
|
+
# 2 3 C
|
32
|
+
# 3 4 D
|
33
|
+
#
|
34
|
+
# @since 0.2.3
|
35
|
+
#
|
15
36
|
def concatenate(*other)
|
16
37
|
case other
|
17
38
|
in [] | [nil] | [[]]
|
@@ -39,14 +60,29 @@ module RedAmber
|
|
39
60
|
alias_method :concat, :concatenate
|
40
61
|
alias_method :bind_rows, :concatenate
|
41
62
|
|
42
|
-
# Merge other
|
43
|
-
#
|
44
|
-
#
|
45
|
-
#
|
63
|
+
# Merge other DataFrames or Tables.
|
64
|
+
#
|
65
|
+
# @note the `#size` must be same as `other#size`.
|
66
|
+
# @note self and other must not share the same key.
|
46
67
|
# @param other [DataFrame, Arrow::Table, Array<DataFrame, Arrow::Table>]
|
47
|
-
#
|
68
|
+
# DataFrames or Tables to merge.
|
69
|
+
# @raise [DataFrameArgumentError]
|
70
|
+
# if the sizes are not the same, or if self and other share a key.
|
48
71
|
# @return [DataFrame]
|
49
|
-
#
|
72
|
+
# merged dataframe.
|
73
|
+
# @example
|
74
|
+
# df = DataFrame.new(x: [1, 2], y: [3, 4])
|
75
|
+
# other = DataFrame.new(a: ['A', 'B'], b: ['C', 'D'])
|
76
|
+
# df.merge(other)
|
77
|
+
#
|
78
|
+
# # =>
|
79
|
+
# x y a b
|
80
|
+
# <uint8> <uint8> <string> <string>
|
81
|
+
# 0 1 3 A C
|
82
|
+
# 1 2 4 B D
|
83
|
+
#
|
84
|
+
# @since 0.2.3
|
85
|
+
#
|
50
86
|
def merge(*other)
|
51
87
|
case other
|
52
88
|
in [] | [nil] | [[]]
|
@@ -85,81 +121,225 @@ module RedAmber
|
|
85
121
|
|
86
122
|
# Mutating joins (#inner_join, #full_join, #left_join, #right_join)
|
87
123
|
|
88
|
-
# Join another DataFrame or Table, leaving only the matching records.
|
89
|
-
# - Same as `#join` with `type: :inner`
|
90
|
-
# - A kind of mutating join.
|
91
|
-
#
|
92
124
|
# @!macro join_before
|
93
125
|
# @param other [DataFrame, Arrow::Table]
|
94
126
|
# A DataFrame or a Table to be joined with self.
|
95
127
|
#
|
128
|
+
# @!macro join_force_order
|
129
|
+
# @param force_order [Boolean]
|
130
|
+
# whether to force the order of the output to always be the same.
|
131
|
+
# - This option is used in `:full_outer` and `:right_outer`.
|
132
|
+
# - If this option is true (by default) it will append index to the source
|
133
|
+
# and sort after joining. It will cause some degradation in performance.
|
134
|
+
#
|
96
135
|
# @!macro join_after
|
97
136
|
# @param suffix [#succ]
|
98
137
|
# a suffix to rename keys when key names conflict as a result of join.
|
99
138
|
# `suffix` must respond to `#succ`.
|
100
139
|
# @return [DataFrame]
|
101
|
-
#
|
140
|
+
# joined dataframe.
|
102
141
|
#
|
103
142
|
# @!macro join_key_in_array
|
104
143
|
# @param join_keys [String, Symbol, Array<String, Symbol>]
|
105
|
-
#
|
144
|
+
# a key or keys to match.
|
106
145
|
#
|
107
146
|
# @!macro join_key_in_hash
|
108
147
|
# @param join_key_pairs [Hash]
|
109
|
-
#
|
148
|
+
# pairs of a key name or key names to match in left and right.
|
110
149
|
# @option join_key_pairs [String, Symbol, Array<String, Symbol>] :left
|
111
|
-
#
|
150
|
+
# join keys in `self`.
|
112
151
|
# @option join_key_pairs [String, Symbol, Array<String, Symbol>] :right
|
113
|
-
#
|
152
|
+
# join keys in `other`.
|
153
|
+
#
|
154
|
+
# @!macro join_common_example_1
|
155
|
+
# @example
|
156
|
+
# df = DataFrame.new(KEY: %w[A B C], X1: [1, 2, 3])
|
157
|
+
#
|
158
|
+
# # =>
|
159
|
+
# KEY X1
|
160
|
+
# <string> <uint8>
|
161
|
+
# 0 A 1
|
162
|
+
# 1 B 2
|
163
|
+
# 2 C 3
|
164
|
+
#
|
165
|
+
# other = DataFrame.new(KEY: %w[A B D], X2: [true, false, nil])
|
166
|
+
#
|
167
|
+
# # =>
|
168
|
+
# KEY X2
|
169
|
+
# <string> <boolean>
|
170
|
+
# 0 A true
|
171
|
+
# 1 B false
|
172
|
+
# 2 D (nil)
|
173
|
+
#
|
174
|
+
# @!macro join_common_example_2
|
175
|
+
# @example
|
176
|
+
# df2 = DataFrame.new(KEY1: %w[A B C], X1: [1, 2, 3])
|
177
|
+
#
|
178
|
+
# # =>
|
179
|
+
# KEY1 X1
|
180
|
+
# <string> <uint8>
|
181
|
+
# 0 A 1
|
182
|
+
# 1 B 2
|
183
|
+
# 2 C 3
|
184
|
+
#
|
185
|
+
# other2 = DataFrame.new(KEY2: %w[A B D], X2: [true, false, nil])
|
186
|
+
#
|
187
|
+
# # =>
|
188
|
+
# KEY2 X2
|
189
|
+
# <string> <boolean>
|
190
|
+
# 0 A true
|
191
|
+
# 1 B false
|
192
|
+
# 2 D (nil)
|
193
|
+
#
|
194
|
+
# @!macro join_common_example_3
|
195
|
+
# @example
|
196
|
+
# df3 = DataFrame.new(
|
197
|
+
# KEY1: %w[A B C],
|
198
|
+
# KEY2: [1, 2, 3]
|
199
|
+
# )
|
200
|
+
#
|
201
|
+
# # =>
|
202
|
+
# KEY1 KEY2
|
203
|
+
# <string> <uint8>
|
204
|
+
# 0 A 1
|
205
|
+
# 1 B 2
|
206
|
+
# 2 C 3
|
207
|
+
#
|
208
|
+
# other3 = DataFrame.new(
|
209
|
+
# KEY1: %w[A B D],
|
210
|
+
# KEY2: [1, 4, 5]
|
211
|
+
# )
|
212
|
+
#
|
213
|
+
# # =>
|
214
|
+
# KEY1 KEY2
|
215
|
+
# <string> <uint8>
|
216
|
+
# 0 A 1
|
217
|
+
# 1 B 4
|
218
|
+
# 2 D 5
|
219
|
+
|
220
|
+
# Join another DataFrame or Table, leaving only the matching records.
|
221
|
+
# - Same as `#join` with `type: :inner`
|
222
|
+
# - A kind of mutating join.
|
114
223
|
#
|
115
|
-
# @overload inner_join(other, suffix: '.1')
|
224
|
+
# @overload inner_join(other, suffix: '.1', force_order: true)
|
116
225
|
# If `join_key` is not specified, common keys in self and other are used
|
117
226
|
# (natural keys). Returns joined dataframe.
|
118
227
|
#
|
119
228
|
# @macro join_before
|
229
|
+
# @macro join_force_order
|
120
230
|
# @macro join_after
|
231
|
+
# @macro join_common_example_1
|
232
|
+
# @example without key (use implicit common key)
|
233
|
+
# df.inner_join(other)
|
234
|
+
#
|
235
|
+
# # =>
|
236
|
+
# KEY X1 X2
|
237
|
+
# <string> <uint8> <boolean>
|
238
|
+
# 0 A 1 true
|
239
|
+
# 1 B 2 false
|
121
240
|
#
|
122
|
-
# @overload inner_join(other, join_keys, suffix: '.1')
|
241
|
+
# @overload inner_join(other, join_keys, suffix: '.1', force_order: true)
|
123
242
|
#
|
124
243
|
# @macro join_before
|
125
244
|
# @macro join_key_in_array
|
245
|
+
# @macro join_force_order
|
126
246
|
# @macro join_after
|
247
|
+
# @macro join_common_example_1
|
248
|
+
# @example with a key
|
249
|
+
# df.inner_join(other, :KEY)
|
127
250
|
#
|
128
|
-
#
|
251
|
+
# # =>
|
252
|
+
# KEY X1 X2
|
253
|
+
# <string> <uint8> <boolean>
|
254
|
+
# 0 A 1 true
|
255
|
+
# 1 B 2 false
|
256
|
+
#
|
257
|
+
# @overload inner_join(other, join_key_pairs, suffix: '.1', force_order: true)
|
129
258
|
#
|
130
259
|
# @macro join_before
|
131
260
|
# @macro join_key_in_hash
|
261
|
+
# @macro join_force_order
|
132
262
|
# @macro join_after
|
263
|
+
# @macro join_common_example_2
|
264
|
+
# @example with key pairs
|
265
|
+
# df2.inner_join(other2, { left: :KEY1, right: :KEY2 })
|
266
|
+
#
|
267
|
+
# # =>
|
268
|
+
# KEY1 X1 X2
|
269
|
+
# <string> <uint8> <boolean>
|
270
|
+
# 0 A 1 true
|
271
|
+
# 1 B 2 false
|
133
272
|
#
|
134
|
-
|
135
|
-
|
273
|
+
# @since 0.2.3
|
274
|
+
#
|
275
|
+
def inner_join(other, join_keys = nil, suffix: '.1', force_order: true)
|
276
|
+
join(other, join_keys, type: :inner, suffix: suffix, force_order: force_order)
|
136
277
|
end
|
137
278
|
|
138
279
|
# Join another DataFrame or Table, leaving all records.
|
139
280
|
# - Same as `#join` with `type: :full_outer`
|
140
281
|
# - A kind of mutating join.
|
141
282
|
#
|
142
|
-
# @overload full_join(other, suffix: '.1')
|
283
|
+
# @overload full_join(other, suffix: '.1', force_order: true)
|
143
284
|
# If `join_key` is not specified, common keys in self and other are used
|
144
285
|
# (natural keys). Returns joined dataframe.
|
145
286
|
#
|
146
287
|
# @macro join_before
|
288
|
+
# @macro join_force_order
|
147
289
|
# @macro join_after
|
290
|
+
# @macro join_common_example_1
|
291
|
+
# @example without key (use implicit common key)
|
292
|
+
# df.full_join(other)
|
293
|
+
#
|
294
|
+
# # =>
|
295
|
+
# KEY X1 X2
|
296
|
+
# <string> <uint8> <boolean>
|
297
|
+
# 0 A 1 true
|
298
|
+
# 1 B 2 false
|
299
|
+
# 2 C 3 (nil)
|
300
|
+
# 3 D (nil) (nil)
|
148
301
|
#
|
149
|
-
# @overload full_join(other, join_keys, suffix: '.1')
|
302
|
+
# @overload full_join(other, join_keys, suffix: '.1', force_order: true)
|
150
303
|
#
|
151
304
|
# @macro join_before
|
152
305
|
# @macro join_key_in_array
|
306
|
+
# @macro join_force_order
|
153
307
|
# @macro join_after
|
308
|
+
# @macro join_common_example_1
|
309
|
+
# @example with a key
|
310
|
+
# df.full_join(other, :KEY)
|
311
|
+
#
|
312
|
+
# # =>
|
313
|
+
# KEY X1 X2
|
314
|
+
# <string> <uint8> <boolean>
|
315
|
+
# 0 A 1 true
|
316
|
+
# 1 B 2 false
|
317
|
+
# 2 C 3 (nil)
|
318
|
+
# 3 D (nil) (nil)
|
154
319
|
#
|
155
|
-
# @overload full_join(other, join_key_pairs, suffix: '.1')
|
320
|
+
# @overload full_join(other, join_key_pairs, suffix: '.1', force_order: true)
|
156
321
|
#
|
157
322
|
# @macro join_before
|
158
323
|
# @macro join_key_in_hash
|
324
|
+
# @macro join_force_order
|
159
325
|
# @macro join_after
|
160
|
-
#
|
161
|
-
|
162
|
-
|
326
|
+
# @macro join_common_example_2
|
327
|
+
# @example with key pairs
|
328
|
+
# df2.full_join(other2, { left: :KEY1, right: :KEY2 })
|
329
|
+
#
|
330
|
+
# # =>
|
331
|
+
# KEY1 X1 X2
|
332
|
+
# <string> <uint8> <boolean>
|
333
|
+
# 0 A 1 true
|
334
|
+
# 1 B 2 false
|
335
|
+
# 2 C 3 (nil)
|
336
|
+
# 3 D (nil) (nil)
|
337
|
+
#
|
338
|
+
# @since 0.2.3
|
339
|
+
#
|
340
|
+
def full_join(other, join_keys = nil, suffix: '.1', force_order: true)
|
341
|
+
join(other, join_keys,
|
342
|
+
type: :full_outer, suffix: suffix, force_order: force_order)
|
163
343
|
end
|
164
344
|
|
165
345
|
alias_method :outer_join, :full_join
|
@@ -168,54 +348,130 @@ module RedAmber
|
|
168
348
|
# - Same as `#join` with `type: :left_outer`
|
169
349
|
# - A kind of mutating join.
|
170
350
|
#
|
171
|
-
# @overload left_join(other, suffix: '.1')
|
351
|
+
# @overload left_join(other, suffix: '.1', force_order: true)
|
172
352
|
# If `join_key` is not specified, common keys in self and other are used
|
173
353
|
# (natural keys). Returns joined dataframe.
|
174
354
|
#
|
175
355
|
# @macro join_before
|
356
|
+
# @macro join_force_order
|
176
357
|
# @macro join_after
|
358
|
+
# @macro join_common_example_1
|
359
|
+
# @example without key (use implicit common key)
|
360
|
+
# df.left_join(other)
|
361
|
+
#
|
362
|
+
# # =>
|
363
|
+
# KEY X1 X2
|
364
|
+
# <string> <uint8> <boolean>
|
365
|
+
# 0 A 1 true
|
366
|
+
# 1 B 2 false
|
367
|
+
# 2 C 3 (nil)
|
177
368
|
#
|
178
|
-
# @overload left_join(other, join_keys, suffix: '.1')
|
369
|
+
# @overload left_join(other, join_keys, suffix: '.1', force_order: true)
|
179
370
|
#
|
180
371
|
# @macro join_before
|
181
372
|
# @macro join_key_in_array
|
373
|
+
# @macro join_force_order
|
182
374
|
# @macro join_after
|
375
|
+
# @macro join_common_example_1
|
376
|
+
# @example with a key
|
377
|
+
# df.left_join(other, :KEY)
|
378
|
+
#
|
379
|
+
# # =>
|
380
|
+
# KEY X1 X2
|
381
|
+
# <string> <uint8> <boolean>
|
382
|
+
# 0 A 1 true
|
383
|
+
# 1 B 2 false
|
384
|
+
# 2 C 3 (nil)
|
183
385
|
#
|
184
|
-
# @overload left_join(other, join_key_pairs, suffix: '.1')
|
386
|
+
# @overload left_join(other, join_key_pairs, suffix: '.1', force_order: true)
|
185
387
|
#
|
186
388
|
# @macro join_before
|
187
389
|
# @macro join_key_in_hash
|
390
|
+
# @macro join_force_order
|
188
391
|
# @macro join_after
|
392
|
+
# @macro join_common_example_2
|
393
|
+
# @example with key pairs
|
394
|
+
# df2.left_join(other2, { left: :KEY1, right: :KEY2 })
|
189
395
|
#
|
190
|
-
|
191
|
-
|
396
|
+
# # =>
|
397
|
+
# KEY1 X1 X2
|
398
|
+
# <string> <uint8> <boolean>
|
399
|
+
# 0 A 1 true
|
400
|
+
# 1 B 2 false
|
401
|
+
# 2 C 3 (nil)
|
402
|
+
#
|
403
|
+
# @since 0.2.3
|
404
|
+
#
|
405
|
+
def left_join(other, join_keys = nil, suffix: '.1', force_order: true)
|
406
|
+
join(other, join_keys, type: :left_outer, suffix: suffix, force_order: force_order)
|
192
407
|
end
|
193
408
|
|
194
409
|
# Join matching values from self to other.
|
195
410
|
# - Same as `#join` with `type: :right_outer`
|
196
411
|
# - A kind of mutating join.
|
197
412
|
#
|
198
|
-
# @overload right_join(other, suffix: '.1')
|
413
|
+
# @overload right_join(other, suffix: '.1', force_order: true)
|
199
414
|
# If `join_key` is not specified, common keys in self and other are used
|
200
415
|
# (natural keys). Returns joined dataframe.
|
201
416
|
#
|
202
417
|
# @macro join_before
|
418
|
+
# @macro join_force_order
|
203
419
|
# @macro join_after
|
420
|
+
# @macro join_common_example_1
|
421
|
+
# @example without key (use implicit common key)
|
422
|
+
# df.right_join(other)
|
423
|
+
#
|
424
|
+
# # =>
|
425
|
+
# KEY X1 X2
|
426
|
+
# <string> <uint8> <boolean>
|
427
|
+
# 0 A 1 true
|
428
|
+
# 1 B 2 false
|
429
|
+
# 2 D (nil) (nil)
|
204
430
|
#
|
205
|
-
# @overload right_join(other, join_keys, suffix: '.1')
|
431
|
+
# @overload right_join(other, join_keys, suffix: '.1', force_order: true)
|
206
432
|
#
|
207
433
|
# @macro join_before
|
208
434
|
# @macro join_key_in_array
|
435
|
+
# @macro join_force_order
|
209
436
|
# @macro join_after
|
437
|
+
# @macro join_common_example_1
|
438
|
+
# @example with a key
|
439
|
+
# df.right_join(other, :KEY)
|
210
440
|
#
|
211
|
-
#
|
441
|
+
# # =>
|
442
|
+
# KEY X1 X2
|
443
|
+
# <string> <uint8> <boolean>
|
444
|
+
# 0 A 1 true
|
445
|
+
# 1 B 2 false
|
446
|
+
# 2 D (nil) (nil)
|
447
|
+
#
|
448
|
+
# @overload right_join(other, join_key_pairs, suffix: '.1', force_order: true)
|
212
449
|
#
|
213
450
|
# @macro join_before
|
214
451
|
# @macro join_key_in_hash
|
452
|
+
# @macro join_force_order
|
215
453
|
# @macro join_after
|
216
|
-
#
|
217
|
-
|
218
|
-
|
454
|
+
# @macro join_common_example_2
|
455
|
+
# @example with key pairs
|
456
|
+
# df2.right_join(other2, { left: :KEY1, right: :KEY2 })
|
457
|
+
#
|
458
|
+
# # =>
|
459
|
+
# KEY1 X1 X2
|
460
|
+
# <string> <uint8> <boolean>
|
461
|
+
# 0 A 1 true
|
462
|
+
# 1 B 2 false
|
463
|
+
# 2 D (nil) (nil)
|
464
|
+
#
|
465
|
+
# @since 0.2.3
|
466
|
+
#
|
467
|
+
def right_join(other, join_keys = nil, suffix: '.1', force_order: true)
|
468
|
+
join(
|
469
|
+
other,
|
470
|
+
join_keys,
|
471
|
+
type: :right_outer,
|
472
|
+
suffix: suffix,
|
473
|
+
force_order: force_order
|
474
|
+
)
|
219
475
|
end
|
220
476
|
|
221
477
|
# Filtering joins (#semi_join, #anti_join)
|
@@ -224,54 +480,115 @@ module RedAmber
|
|
224
480
|
# - Same as `#join` with `type: :left_semi`
|
225
481
|
# - A kind of filtering join.
|
226
482
|
#
|
227
|
-
# @overload semi_join(other, suffix: '.1')
|
483
|
+
# @overload semi_join(other, suffix: '.1', force_order: true)
|
228
484
|
# If `join_key` is not specified, common keys in self and other are used
|
229
485
|
# (natural keys). Returns joined dataframe.
|
230
486
|
#
|
231
487
|
# @macro join_before
|
488
|
+
# @macro join_force_order
|
232
489
|
# @macro join_after
|
490
|
+
# @macro join_common_example_1
|
491
|
+
# @example without key (use implicit common key)
|
492
|
+
# df.semi_join(other)
|
233
493
|
#
|
234
|
-
#
|
494
|
+
# # =>
|
495
|
+
# KEY X1
|
496
|
+
# <string> <uint8>
|
497
|
+
# 0 A 1
|
498
|
+
# 1 B 2
|
499
|
+
#
|
500
|
+
# @overload semi_join(other, join_keys, suffix: '.1', force_order: true)
|
235
501
|
#
|
236
502
|
# @macro join_before
|
237
503
|
# @macro join_key_in_array
|
504
|
+
# @macro join_force_order
|
238
505
|
# @macro join_after
|
506
|
+
# @macro join_common_example_1
|
507
|
+
# @example with a key
|
508
|
+
# df.semi_join(other, :KEY)
|
509
|
+
#
|
510
|
+
# # =>
|
511
|
+
# KEY X1
|
512
|
+
# <string> <uint8>
|
513
|
+
# 0 A 1
|
514
|
+
# 1 B 2
|
239
515
|
#
|
240
|
-
# @overload semi_join(other, join_key_pairs, suffix: '.1')
|
516
|
+
# @overload semi_join(other, join_key_pairs, suffix: '.1', force_order: true)
|
241
517
|
#
|
242
518
|
# @macro join_before
|
243
519
|
# @macro join_key_in_hash
|
520
|
+
# @macro join_force_order
|
244
521
|
# @macro join_after
|
522
|
+
# @macro join_common_example_2
|
523
|
+
# @example with key pairs
|
524
|
+
# df2.semi_join(other2, { left: :KEY1, right: :KEY2 })
|
525
|
+
#
|
526
|
+
# # =>
|
527
|
+
# KEY1 X1
|
528
|
+
# <string> <uint8>
|
529
|
+
# 0 A 1
|
530
|
+
# 1 B 2
|
245
531
|
#
|
246
|
-
|
247
|
-
|
532
|
+
# @since 0.2.3
|
533
|
+
#
|
534
|
+
def semi_join(other, join_keys = nil, suffix: '.1', force_order: true)
|
535
|
+
join(other, join_keys, type: :left_semi, suffix: suffix, force_order: force_order)
|
248
536
|
end
|
249
537
|
|
250
538
|
# Return records of self that do not have a match in other.
|
251
539
|
# - Same as `#join` with `type: :left_anti`
|
252
540
|
# - A kind of filtering join.
|
253
541
|
#
|
254
|
-
# @overload anti_join(other, suffix: '.1')
|
542
|
+
# @overload anti_join(other, suffix: '.1', force_order: true)
|
255
543
|
# If `join_key` is not specified, common keys in self and other are used
|
256
544
|
# (natural keys). Returns joined dataframe.
|
257
545
|
#
|
258
546
|
# @macro join_before
|
547
|
+
# @macro join_force_order
|
259
548
|
# @macro join_after
|
549
|
+
# @macro join_common_example_1
|
550
|
+
# @example without key (use implicit common key)
|
551
|
+
# df.anti_join(other)
|
552
|
+
#
|
553
|
+
# # =>
|
554
|
+
# KEY X1
|
555
|
+
# <string> <uint8>
|
556
|
+
# 0 C 3
|
260
557
|
#
|
261
|
-
# @overload anti_join(other, join_keys, suffix: '.1')
|
558
|
+
# @overload anti_join(other, join_keys, suffix: '.1', force_order: true)
|
262
559
|
#
|
263
560
|
# @macro join_before
|
264
561
|
# @macro join_key_in_array
|
562
|
+
# @macro join_force_order
|
265
563
|
# @macro join_after
|
564
|
+
# @macro join_common_example_1
|
565
|
+
# @example with a key
|
566
|
+
# df.anti_join(other, :KEY)
|
266
567
|
#
|
267
|
-
#
|
568
|
+
# # =>
|
569
|
+
# KEY X1
|
570
|
+
# <string> <uint8>
|
571
|
+
# 0 C 3
|
572
|
+
#
|
573
|
+
# @overload anti_join(other, join_key_pairs, suffix: '.1', force_order: true)
|
268
574
|
#
|
269
575
|
# @macro join_before
|
270
576
|
# @macro join_key_in_hash
|
577
|
+
# @macro join_force_order
|
271
578
|
# @macro join_after
|
579
|
+
# @macro join_common_example_2
|
580
|
+
# @example with key pairs
|
581
|
+
# df2.anti_join(other2, { left: :KEY1, right: :KEY2 })
|
582
|
+
#
|
583
|
+
# # =>
|
584
|
+
# KEY1 X1
|
585
|
+
# <string> <uint8>
|
586
|
+
# 0 C 3
|
587
|
+
#
|
588
|
+
# @since 0.2.3
|
272
589
|
#
|
273
|
-
def anti_join(other, join_keys = nil, suffix: '.1')
|
274
|
-
join(other, join_keys, type: :left_anti, suffix: suffix)
|
590
|
+
def anti_join(other, join_keys = nil, suffix: '.1', force_order: true)
|
591
|
+
join(other, join_keys, type: :left_anti, suffix: suffix, force_order: force_order)
|
275
592
|
end
|
276
593
|
|
277
594
|
# Set operations (#intersect, #union, #difference, #set_operable?)
|
@@ -279,8 +596,13 @@ module RedAmber
|
|
279
596
|
# Check if set operation with self and other is possible.
|
280
597
|
#
|
281
598
|
# @macro join_before
|
599
|
+
# @return [Boolean]
|
600
|
+
# true if set operation is possible.
|
601
|
+
# @macro join_common_example_3
|
602
|
+
# @example
|
603
|
+
# df3.set_operable?(other3) # => true
|
282
604
|
#
|
283
|
-
# @
|
605
|
+
# @since 0.2.3
|
284
606
|
#
|
285
607
|
def set_operable?(other) # rubocop:disable Naming/AccessorMethodName
|
286
608
|
keys == other.keys.map(&:to_sym)
|
@@ -291,8 +613,18 @@ module RedAmber
|
|
291
613
|
# - A kind of set operations.
|
292
614
|
#
|
293
615
|
# @macro join_before
|
616
|
+
# @return [DataFrame]
|
617
|
+
# joined dataframe.
|
618
|
+
# @macro join_common_example_3
|
619
|
+
# @example
|
620
|
+
# df3.intersect(other3)
|
621
|
+
#
|
622
|
+
# # =>
|
623
|
+
# KEY1 KEY2
|
624
|
+
# <string> <uint8>
|
625
|
+
# 0 A 1
|
294
626
|
#
|
295
|
-
# @
|
627
|
+
# @since 0.2.3
|
296
628
|
#
|
297
629
|
def intersect(other)
|
298
630
|
unless keys == other.keys.map(&:to_sym)
|
@@ -307,8 +639,22 @@ module RedAmber
|
|
307
639
|
# - A kind of set operations.
|
308
640
|
#
|
309
641
|
# @macro join_before
|
310
|
-
#
|
311
|
-
#
|
642
|
+
# @return [DataFrame]
|
643
|
+
# joined dataframe.
|
644
|
+
# @macro join_common_example_3
|
645
|
+
# @example
|
646
|
+
# df3.union(other3)
|
647
|
+
#
|
648
|
+
# # =>
|
649
|
+
# KEY1 KEY2
|
650
|
+
# <string> <uint8>
|
651
|
+
# 0 A 1
|
652
|
+
# 1 B 2
|
653
|
+
# 2 C 3
|
654
|
+
# 3 B 4
|
655
|
+
# 4 D 5
|
656
|
+
#
|
657
|
+
# @since 0.2.3
|
312
658
|
#
|
313
659
|
def union(other)
|
314
660
|
unless keys == other.keys.map(&:to_sym)
|
@@ -323,8 +669,27 @@ module RedAmber
|
|
323
669
|
# - A kind of set operations.
|
324
670
|
#
|
325
671
|
# @macro join_before
|
672
|
+
# @return [DataFrame]
|
673
|
+
# joined dataframe.
|
674
|
+
# @macro join_common_example_3
|
675
|
+
# @example
|
676
|
+
# df3.difference(other3)
|
677
|
+
#
|
678
|
+
# # =>
|
679
|
+
# KEY1 KEY2
|
680
|
+
# <string> <uint8>
|
681
|
+
# 0 B 2
|
682
|
+
# 1 C 3
|
683
|
+
#
|
684
|
+
# other3.difference(df3)
|
685
|
+
#
|
686
|
+
# # =>
|
687
|
+
# KEY1 KEY2
|
688
|
+
# <string> <uint8>
|
689
|
+
# 0 B 4
|
690
|
+
# 1 D 5
|
326
691
|
#
|
327
|
-
# @
|
692
|
+
# @since 0.2.3
|
328
693
|
#
|
329
694
|
def difference(other)
|
330
695
|
unless keys == other.keys.map(&:to_sym)
|
@@ -338,60 +703,167 @@ module RedAmber
|
|
338
703
|
|
339
704
|
# Join another DataFrame or Table to self.
|
340
705
|
#
|
341
|
-
#
|
706
|
+
# @!macro join_common_type
|
707
|
+
# @param type [:left_semi, :right_semi, :left_anti, :right_anti, :inner,
|
708
|
+
# :left_outer, :right_outer, :full_outer] type of join.
|
709
|
+
#
|
710
|
+
# @!macro join_common_example_4
|
711
|
+
# @example
|
712
|
+
# df4 = DataFrame.new(
|
713
|
+
# X1: %w[A B C],
|
714
|
+
# Y: %w[D E F]
|
715
|
+
# )
|
716
|
+
#
|
717
|
+
# # =>
|
718
|
+
# X1 Y
|
719
|
+
# <string> <string>
|
720
|
+
# 0 A D
|
721
|
+
# 1 B E
|
722
|
+
# 2 C F
|
723
|
+
#
|
724
|
+
# other4 = DataFrame.new(
|
725
|
+
# X2: %w[A B D],
|
726
|
+
# Y: %w[e E E]
|
727
|
+
# )
|
728
|
+
#
|
729
|
+
# # =>
|
730
|
+
# X2 Y
|
731
|
+
# <string> <string>
|
732
|
+
# 0 A e
|
733
|
+
# 1 B E
|
734
|
+
# 2 D E
|
735
|
+
|
736
|
+
# @note the order of joined results will be preserved by default.
|
737
|
+
# This is enabled by appending index column to sort after joining but
|
738
|
+
# it will cause some performance degradation. If you don't care about
|
739
|
+
# the order of the result, set `force_order` option to `false`.
|
740
|
+
#
|
741
|
+
# @overload join(other, type: :inner, suffix: '.1', force_order: true)
|
342
742
|
#
|
343
743
|
# If `join_key` is not specified, common keys in self and other are used
|
344
744
|
# (natural keys). Returns joined dataframe.
|
345
745
|
#
|
346
|
-
# @!macro join_common_type
|
347
|
-
# @param type [:left_semi, :right_semi, :left_anti, :right_anti, :inner,
|
348
|
-
# left_outer, :right_outer, :full_outer] type of join.
|
349
|
-
#
|
350
746
|
# @macro join_before
|
351
747
|
# @macro join_common_type
|
748
|
+
# @macro join_force_order
|
352
749
|
# @macro join_after
|
750
|
+
# @macro join_common_example_1
|
751
|
+
# @example
|
752
|
+
# df.join(other)
|
753
|
+
#
|
754
|
+
# # =>
|
755
|
+
# KEY X1 X2
|
756
|
+
# <string> <uint8> <boolean>
|
757
|
+
# 0 A 1 true
|
758
|
+
# 1 B 2 false
|
759
|
+
#
|
760
|
+
# df.join(other, type: :full_outer)
|
353
761
|
#
|
354
|
-
#
|
762
|
+
# # =>
|
763
|
+
# KEY X1 X2
|
764
|
+
# <string> <uint8> <boolean>
|
765
|
+
# 0 A 1 true
|
766
|
+
# 1 B 2 false
|
767
|
+
# 2 C 3 (nil)
|
768
|
+
# 3 D (nil) (nil)
|
769
|
+
#
|
770
|
+
# @overload join(other, join_keys, type: :inner, suffix: '.1', force_order: true)
|
355
771
|
#
|
356
772
|
# @macro join_before
|
357
773
|
# @macro join_key_in_array
|
358
774
|
# @macro join_common_type
|
775
|
+
# @macro join_force_order
|
359
776
|
# @macro join_after
|
777
|
+
# @macro join_common_example_3
|
778
|
+
# @example join keys in an Array
|
779
|
+
# df3.join(other3, [:KEY1, :KEY2])
|
780
|
+
#
|
781
|
+
# # =>
|
782
|
+
# KEY1 KEY2
|
783
|
+
# <string> <uint8>
|
784
|
+
# 0 A 1
|
360
785
|
#
|
361
|
-
#
|
786
|
+
# @example partial join key and suffix
|
787
|
+
# df3.join(other3, :KEY1, suffix: '.a')
|
788
|
+
#
|
789
|
+
# # =>
|
790
|
+
# KEY1 KEY2 KEY2.a
|
791
|
+
# <string> <uint8> <uint8>
|
792
|
+
# 0 A 1 1
|
793
|
+
# 1 B 2 4
|
794
|
+
#
|
795
|
+
# @overload join(other, join_key_pairs, type: :inner, suffix: '.1', force_order: true)
|
362
796
|
#
|
363
797
|
# @macro join_before
|
364
798
|
# @macro join_key_in_hash
|
365
799
|
# @macro join_common_type
|
800
|
+
# @macro join_force_order
|
366
801
|
# @macro join_after
|
367
|
-
#
|
368
|
-
|
369
|
-
|
370
|
-
|
371
|
-
|
372
|
-
|
373
|
-
|
802
|
+
# @macro join_common_example_4
|
803
|
+
# @example without options
|
804
|
+
# df4.join(other4)
|
805
|
+
#
|
806
|
+
# # =>
|
807
|
+
# X1 Y X2
|
808
|
+
# <string> <string> <string>
|
809
|
+
# 0 B E D
|
810
|
+
# 1 B E B
|
811
|
+
#
|
812
|
+
# @example join by key pairs
|
813
|
+
# df4.join(other4, { left: [:X1, :Y], right: [:X2, :Y] })
|
814
|
+
#
|
815
|
+
# # =>
|
816
|
+
# X1 Y
|
817
|
+
# <string> <string>
|
818
|
+
# 0 B E
|
819
|
+
#
|
820
|
+
# @example join by key pairs, using renaming by suffix
|
821
|
+
# df4.join(other4, { left: :X1, right: :X2 })
|
822
|
+
#
|
823
|
+
# # =>
|
824
|
+
# X1 Y Y.1
|
825
|
+
# <string> <string> <string>
|
826
|
+
# 0 A D e
|
827
|
+
# 1 B E E
|
828
|
+
#
|
829
|
+
# @since 0.2.3
|
830
|
+
#
|
831
|
+
def join(other, join_keys = nil, type: :inner, suffix: '.1', force_order: true)
|
832
|
+
right_table =
|
833
|
+
case other
|
834
|
+
when DataFrame
|
835
|
+
other.table
|
836
|
+
when Arrow::Table
|
837
|
+
other
|
838
|
+
else
|
839
|
+
raise DataFrameArgumentError, 'other must be a DataFrame or an Arrow::Table'
|
840
|
+
end
|
841
|
+
|
842
|
+
type = type.to_sym
|
843
|
+
left_index = :__LEFT_INDEX__
|
844
|
+
right_index = :__RIGHT_INDEX__
|
845
|
+
if force_order
|
846
|
+
left_table = assign(left_index) { indices }.table
|
847
|
+
other = DataFrame.create(other) if other.is_a?(Arrow::Table)
|
848
|
+
right_table = other.assign(right_index) { indices }.table
|
374
849
|
else
|
375
|
-
|
850
|
+
left_table = table
|
376
851
|
end
|
377
852
|
|
378
|
-
table_keys =
|
379
|
-
other_keys =
|
380
|
-
type = type.to_sym
|
853
|
+
table_keys = left_table.keys
|
854
|
+
other_keys = right_table.keys
|
381
855
|
|
382
856
|
# natural keys (implicit common keys)
|
383
857
|
join_keys ||= table_keys.intersection(other_keys)
|
384
858
|
|
385
859
|
# This is not necessary if additional procedure is contributed to Red Arrow.
|
386
860
|
if join_keys.is_a?(Hash)
|
387
|
-
left_keys = join_keys[:left]
|
388
|
-
right_keys = join_keys[:right]
|
861
|
+
left_keys = ensure_keys(join_keys[:left])
|
862
|
+
right_keys = ensure_keys(join_keys[:right])
|
389
863
|
else
|
390
|
-
left_keys = join_keys
|
391
|
-
right_keys =
|
864
|
+
left_keys = ensure_keys(join_keys)
|
865
|
+
right_keys = left_keys
|
392
866
|
end
|
393
|
-
left_keys = Array(left_keys).map(&:to_s)
|
394
|
-
right_keys = Array(right_keys).map(&:to_s)
|
395
867
|
|
396
868
|
case type
|
397
869
|
when :full_outer, :left_semi, :left_anti, :right_semi, :right_anti
|
@@ -407,43 +879,73 @@ module RedAmber
|
|
407
879
|
|
408
880
|
# Should we rescue errors in Arrow::Table#join for usability ?
|
409
881
|
joined_table =
|
410
|
-
|
411
|
-
|
412
|
-
|
413
|
-
|
882
|
+
left_table.join(
|
883
|
+
right_table,
|
884
|
+
join_keys,
|
885
|
+
type: type,
|
886
|
+
left_outputs: left_outputs,
|
887
|
+
right_outputs: right_outputs
|
888
|
+
)
|
414
889
|
|
415
890
|
case type
|
416
891
|
when :inner, :left_outer, :left_semi, :left_anti, :right_semi, :right_anti
|
417
|
-
|
418
|
-
|
419
|
-
|
420
|
-
|
421
|
-
|
892
|
+
dataframe =
|
893
|
+
if joined_table.keys.uniq!
|
894
|
+
DataFrame.create(rename_table(joined_table, n_keys, suffix))
|
895
|
+
else
|
896
|
+
DataFrame.create(joined_table)
|
897
|
+
end
|
898
|
+
sorter =
|
899
|
+
case type
|
900
|
+
when :inner, :left_outer
|
901
|
+
[left_index, right_index]
|
902
|
+
when :left_semi, :left_anti
|
903
|
+
[left_index]
|
904
|
+
when :right_semi, :right_anti
|
905
|
+
[right_index]
|
906
|
+
end
|
422
907
|
when :full_outer
|
908
|
+
key_index_lr =
|
909
|
+
left_keys.map { left_table.keys.index(_1) }
|
910
|
+
.zip(right_keys.map { left_table.keys.size + right_table.keys.index(_1) })
|
423
911
|
renamed_table = rename_table(joined_table, n_keys, suffix)
|
424
|
-
renamed_keys = renamed_table.keys
|
425
912
|
dropper = []
|
426
|
-
|
427
|
-
|
428
|
-
|
429
|
-
|
430
|
-
|
431
|
-
|
913
|
+
dataframe =
|
914
|
+
DataFrame.create(renamed_table).assign do |df|
|
915
|
+
key_index_lr.map do |l, r|
|
916
|
+
dropper << df.keys[r]
|
917
|
+
[df.keys[l], merge_array(df.vectors[l].data, df.vectors[r].data)]
|
918
|
+
end
|
432
919
|
end
|
433
|
-
|
920
|
+
dataframe = dataframe.drop(dropper)
|
921
|
+
sorter = [left_index, right_index]
|
434
922
|
when :right_outer
|
435
|
-
|
436
|
-
|
437
|
-
|
438
|
-
|
439
|
-
|
440
|
-
|
441
|
-
|
923
|
+
dataframe =
|
924
|
+
if joined_table.keys.uniq!
|
925
|
+
DataFrame.create(rename_table(joined_table, left_outputs.size, suffix))
|
926
|
+
else
|
927
|
+
DataFrame.create(joined_table)
|
928
|
+
end
|
929
|
+
dataframe = dataframe.pick(right_keys, dataframe.keys - right_keys)
|
930
|
+
sorter = [left_index, right_index]
|
931
|
+
end
|
932
|
+
|
933
|
+
if force_order
|
934
|
+
dataframe
|
935
|
+
.sort(sorter)
|
936
|
+
.drop(sorter)
|
937
|
+
else
|
938
|
+
dataframe
|
442
939
|
end
|
443
940
|
end
|
444
941
|
|
445
942
|
private
|
446
943
|
|
944
|
+
# To ensure Array of Symbols
|
945
|
+
def ensure_keys(keys)
|
946
|
+
Array(keys).map(&:to_sym)
|
947
|
+
end
|
948
|
+
|
447
949
|
# Rename duplicate keys by suffix
|
448
950
|
def rename_table(joined_table, n_keys, suffix)
|
449
951
|
joined_keys = joined_table.keys
|
@@ -453,17 +955,9 @@ module RedAmber
|
|
453
955
|
renamed_right_keys =
|
454
956
|
other_keys.map do |key|
|
455
957
|
if dup_keys.include?(key)
|
456
|
-
|
457
|
-
|
458
|
-
|
459
|
-
break unless joined_keys.include?(new_key)
|
460
|
-
|
461
|
-
s = suffix.succ
|
462
|
-
raise DataFrameArgumentError, "suffix #{suffix} is invalid" if s == suffix
|
463
|
-
|
464
|
-
suffix = s
|
465
|
-
end
|
466
|
-
new_key
|
958
|
+
suffixed = "#{key}#{suffix}".to_sym
|
959
|
+
# Find the first unused key, starting at suffixed and advancing with #succ
|
960
|
+
(suffixed..).find { !joined_keys.include?(_1) }
|
467
961
|
else
|
468
962
|
key
|
469
963
|
end
|