red_amber 0.2.3 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.rubocop.yml +133 -51
- data/.yardopts +2 -0
- data/CHANGELOG.md +203 -1
- data/Gemfile +2 -1
- data/LICENSE +1 -1
- data/README.md +61 -45
- data/benchmark/basic.yml +11 -4
- data/benchmark/combine.yml +3 -4
- data/benchmark/dataframe.yml +62 -0
- data/benchmark/group.yml +7 -1
- data/benchmark/reshape.yml +6 -2
- data/benchmark/vector.yml +63 -0
- data/doc/DataFrame.md +35 -12
- data/doc/DataFrame_Comparison.md +65 -0
- data/doc/SubFrames.md +11 -0
- data/doc/Vector.md +295 -1
- data/doc/yard-templates/default/fulldoc/html/css/common.css +6 -0
- data/lib/red_amber/data_frame.rb +537 -68
- data/lib/red_amber/data_frame_combinable.rb +776 -123
- data/lib/red_amber/data_frame_displayable.rb +248 -18
- data/lib/red_amber/data_frame_indexable.rb +122 -19
- data/lib/red_amber/data_frame_loadsave.rb +81 -10
- data/lib/red_amber/data_frame_reshaping.rb +216 -21
- data/lib/red_amber/data_frame_selectable.rb +781 -120
- data/lib/red_amber/data_frame_variable_operation.rb +561 -85
- data/lib/red_amber/group.rb +195 -21
- data/lib/red_amber/helper.rb +114 -32
- data/lib/red_amber/refinements.rb +206 -0
- data/lib/red_amber/subframes.rb +1066 -0
- data/lib/red_amber/vector.rb +435 -58
- data/lib/red_amber/vector_aggregation.rb +312 -0
- data/lib/red_amber/vector_binary_element_wise.rb +387 -0
- data/lib/red_amber/vector_selectable.rb +321 -69
- data/lib/red_amber/vector_unary_element_wise.rb +436 -0
- data/lib/red_amber/vector_updatable.rb +397 -24
- data/lib/red_amber/version.rb +2 -1
- data/lib/red_amber.rb +15 -1
- data/red_amber.gemspec +4 -3
- metadata +19 -11
- data/doc/image/dataframe/reshaping_DataFrames.png +0 -0
- data/lib/red_amber/vector_functions.rb +0 -294
@@ -1,14 +1,36 @@
|
|
1
1
|
# frozen_string_literal: true
|
2
2
|
|
3
3
|
module RedAmber
|
4
|
-
#
|
4
|
+
# Mix-in for the class DataFrame
|
5
5
|
module DataFrameCombinable
|
6
|
-
#
|
6
|
+
# Refinements for Arrow::Table
|
7
|
+
using RefineArrowTable
|
8
|
+
|
9
|
+
# Concatenate other dataframes or tables onto the bottom of self.
|
7
10
|
#
|
11
|
+
# @note the `#types` must be same as `other#types`.
|
8
12
|
# @param other [DataFrame, Arrow::Table, Array<DataFrame, Arrow::Table>]
|
9
|
-
#
|
13
|
+
# DataFrames or Tables to concatenate.
|
10
14
|
# @return [DataFrame]
|
11
|
-
#
|
15
|
+
# concatenated dataframe.
|
16
|
+
# @example
|
17
|
+
# df = DataFrame.new(x: [1, 2], y: ['A', 'B'])
|
18
|
+
# other = DataFrame.new(x: [3, 4], y: ['C', 'D'])
|
19
|
+
# [df.types, other.types]
|
20
|
+
#
|
21
|
+
# # =>
|
22
|
+
# [[:uint8, :string], [:uint8, :string]]
|
23
|
+
#
|
24
|
+
# df.concatenate(other)
|
25
|
+
#
|
26
|
+
# # =>
|
27
|
+
# x y
|
28
|
+
# <uint8> <string>
|
29
|
+
# 0 1 A
|
30
|
+
# 1 2 B
|
31
|
+
# 2 3 C
|
32
|
+
# 3 4 D
|
33
|
+
#
|
12
34
|
def concatenate(*other)
|
13
35
|
case other
|
14
36
|
in [] | [nil] | [[]]
|
@@ -30,20 +52,33 @@ module RedAmber
|
|
30
52
|
end
|
31
53
|
end
|
32
54
|
|
33
|
-
DataFrame.
|
55
|
+
DataFrame.create(table.concatenate(table_array))
|
34
56
|
end
|
35
57
|
|
36
58
|
alias_method :concat, :concatenate
|
37
59
|
alias_method :bind_rows, :concatenate
|
38
60
|
|
39
|
-
# Merge other
|
40
|
-
#
|
41
|
-
#
|
42
|
-
#
|
61
|
+
# Merge other DataFrames or Tables.
|
62
|
+
#
|
63
|
+
# @note the `#size` must be same as `other#size`.
|
64
|
+
# @note self and other must not share the same key.
|
43
65
|
# @param other [DataFrame, Arrow::Table, Array<DataFrame, Arrow::Table>]
|
44
|
-
#
|
66
|
+
# DataFrames or Tables to merge.
|
67
|
+
# @raise [DataFrameArgumentError]
|
68
|
+
# if the sizes are not the same or self and other share the same key.
|
45
69
|
# @return [DataFrame]
|
46
|
-
#
|
70
|
+
# merged dataframe.
|
71
|
+
# @example
|
72
|
+
# df = DataFrame.new(x: [1, 2], y: [3, 4])
|
73
|
+
# other = DataFrame.new(a: ['A', 'B'], b: ['C', 'D'])
|
74
|
+
# df.merge(other)
|
75
|
+
#
|
76
|
+
# # =>
|
77
|
+
# x y a b
|
78
|
+
# <uint8> <uint8> <string> <string>
|
79
|
+
# 0 1 3 A C
|
80
|
+
# 1 2 4 B D
|
81
|
+
#
|
47
82
|
def merge(*other)
|
48
83
|
case other
|
49
84
|
in [] | [nil] | [[]]
|
@@ -58,14 +93,16 @@ module RedAmber
|
|
58
93
|
df =
|
59
94
|
case e
|
60
95
|
when Arrow::Table
|
61
|
-
DataFrame.
|
96
|
+
DataFrame.create(e)
|
62
97
|
when DataFrame
|
63
98
|
e
|
64
99
|
else
|
65
100
|
raise DataFrameArgumentError, "#{e} is not a Table or a DataFrame"
|
66
101
|
end
|
67
102
|
|
68
|
-
|
103
|
+
if size != df.size
|
104
|
+
raise DataFrameArgumentError, "#{e} do not have same size as self"
|
105
|
+
end
|
69
106
|
|
70
107
|
k = keys.intersection(df.keys).any?
|
71
108
|
raise DataFrameArgumentError, "There are some shared keys: #{k}" if k
|
@@ -78,206 +115,822 @@ module RedAmber
|
|
78
115
|
|
79
116
|
alias_method :bind_cols, :merge
|
80
117
|
|
81
|
-
# Mutating joins
|
118
|
+
# Mutating joins (#inner_join, #full_join, #left_join, #right_join)
|
82
119
|
|
83
|
-
#
|
120
|
+
# @!macro join_before
|
121
|
+
# @param other [DataFrame, Arrow::Table]
|
122
|
+
# A DataFrame or a Table to be joined with self.
|
123
|
+
#
|
124
|
+
# @!macro join_dorce_order
|
125
|
+
# @param force_order [Boolean]
|
126
|
+
# whether to force the order of the output to always be the same.
|
127
|
+
# - This option is used in `:full_outer` and `:right_outer`.
|
128
|
+
# - If this option is true (by default) it will append index to the source
|
129
|
+
# and sort after joining. It will cause some degradation in performance.
|
130
|
+
#
|
131
|
+
# @!macro join_after
|
132
|
+
# @param suffix [#succ]
|
133
|
+
# a suffix to rename keys when key names conflict as a result of join.
|
134
|
+
# `suffix` must respond to `#succ`.
|
135
|
+
# @return [DataFrame]
|
136
|
+
# joined dataframe.
|
137
|
+
#
|
138
|
+
# @!macro join_key_in_array
|
139
|
+
# @param join_keys [String, Symbol, Array<String, Symbol>]
|
140
|
+
# a key or keys to match.
|
141
|
+
#
|
142
|
+
# @!macro join_key_in_hash
|
143
|
+
# @param join_key_pairs [Hash]
|
144
|
+
# pairs of a key name or key names to match in left and right.
|
145
|
+
# @option join_key_pairs [String, Symbol, Array<String, Symbol>] :left
|
146
|
+
# join keys in `self`.
|
147
|
+
# @option join_key_pairs [String, Symbol, Array<String, Symbol>] :right
|
148
|
+
# join keys in `other`.
|
149
|
+
#
|
150
|
+
# @!macro join_common_example_1
|
151
|
+
# @example
|
152
|
+
# df = DataFrame.new(KEY: %w[A B C], X1: [1, 2, 3])
|
153
|
+
#
|
154
|
+
# # =>
|
155
|
+
# KEY X1
|
156
|
+
# <string> <uint8>
|
157
|
+
# 0 A 1
|
158
|
+
# 1 B 2
|
159
|
+
# 2 C 3
|
160
|
+
#
|
161
|
+
# other = DataFrame.new(KEY: %w[A B D], X2: [true, false, nil])
|
162
|
+
#
|
163
|
+
# # =>
|
164
|
+
# KEY X2
|
165
|
+
# <string> <boolean>
|
166
|
+
# 0 A true
|
167
|
+
# 1 B false
|
168
|
+
# 2 D (nil)
|
169
|
+
#
|
170
|
+
# @!macro join_common_example_2
|
171
|
+
# @example
|
172
|
+
# df2 = DataFrame.new(KEY1: %w[A B C], X1: [1, 2, 3])
|
84
173
|
#
|
85
|
-
#
|
86
|
-
#
|
87
|
-
#
|
174
|
+
# # =>
|
175
|
+
# KEY1 X1
|
176
|
+
# <string> <uint8>
|
177
|
+
# 0 A 1
|
178
|
+
# 1 B 2
|
179
|
+
# 2 C 3
|
180
|
+
#
|
181
|
+
# other2 = DataFrame.new(KEY2: %w[A B D], X2: [true, false, nil])
|
182
|
+
#
|
183
|
+
# # =>
|
184
|
+
# KEY2 X2
|
185
|
+
# <string> <boolean>
|
186
|
+
# 0 A true
|
187
|
+
# 1 B false
|
188
|
+
# 2 D (nil)
|
189
|
+
#
|
190
|
+
# @!macro join_common_example_3
|
191
|
+
# @example
|
192
|
+
# df3 = DataFrame.new(
|
193
|
+
# KEY1: %w[A B C],
|
194
|
+
# KEY2: [1, 2, 3]
|
195
|
+
# )
|
196
|
+
#
|
197
|
+
# # =>
|
198
|
+
# KEY1 KEY2
|
199
|
+
# <string> <uint8>
|
200
|
+
# 0 A 1
|
201
|
+
# 1 B 2
|
202
|
+
# 2 C 3
|
203
|
+
#
|
204
|
+
# other3 = DataFrame.new(
|
205
|
+
# KEY1: %w[A B D],
|
206
|
+
# KEY2: [1, 4, 5]
|
207
|
+
# )
|
208
|
+
#
|
209
|
+
# # =>
|
210
|
+
# KEY1 KEY2
|
211
|
+
# <string> <uint8>
|
212
|
+
# 0 A 1
|
213
|
+
# 1 B 4
|
214
|
+
# 2 D 5
|
215
|
+
|
216
|
+
# Join another DataFrame or Table, leaving only the matching records.
|
217
|
+
# - Same as `#join` with `type: :inner`
|
218
|
+
# - A kind of mutating join.
|
219
|
+
#
|
220
|
+
# @overload inner_join(other, suffix: '.1')
|
221
|
+
# If `join_keys` is not specified, common keys in self and other are used
|
222
|
+
# (natural keys). Returns joined dataframe.
|
223
|
+
#
|
224
|
+
# @macro join_before
|
225
|
+
# @macro join_after
|
226
|
+
# @macro join_common_example_1
|
227
|
+
# @example without key (use implicit common key)
|
228
|
+
# df.inner_join(other)
|
229
|
+
#
|
230
|
+
# # =>
|
231
|
+
# KEY X1 X2
|
232
|
+
# <string> <uint8> <boolean>
|
233
|
+
# 0 A 1 true
|
234
|
+
# 1 B 2 false
|
235
|
+
#
|
236
|
+
# @overload inner_join(other, join_keys, suffix: '.1')
|
237
|
+
#
|
238
|
+
# @macro join_before
|
239
|
+
# @macro join_key_in_array
|
240
|
+
# @macro join_after
|
241
|
+
# @macro join_common_example_1
|
242
|
+
# @example with a key
|
243
|
+
# df.inner_join(other, :KEY)
|
244
|
+
#
|
245
|
+
# # =>
|
246
|
+
# KEY X1 X2
|
247
|
+
# <string> <uint8> <boolean>
|
248
|
+
# 0 A 1 true
|
249
|
+
# 1 B 2 false
|
250
|
+
#
|
251
|
+
# @overload inner_join(other, join_key_pairs, suffix: '.1')
|
252
|
+
#
|
253
|
+
# @macro join_before
|
254
|
+
# @macro join_key_in_hash
|
255
|
+
# @macro join_after
|
256
|
+
# @macro join_common_example_2
|
257
|
+
# @example with key pairs
|
258
|
+
# df2.inner_join(other2, { left: :KEY1, right: :KEY2 })
|
259
|
+
#
|
260
|
+
# # =>
|
261
|
+
# KEY1 X1 X2
|
262
|
+
# <string> <uint8> <boolean>
|
263
|
+
# 0 A 1 true
|
264
|
+
# 1 B 2 false
|
88
265
|
#
|
89
266
|
def inner_join(other, join_keys = nil, suffix: '.1')
|
90
267
|
join(other, join_keys, type: :inner, suffix: suffix)
|
91
268
|
end
|
92
269
|
|
93
|
-
# Join
|
270
|
+
# Join another DataFrame or Table, leaving all records.
|
271
|
+
# - Same as `#join` with `type: :full_outer`
|
272
|
+
# - A kind of mutating join.
|
273
|
+
#
|
274
|
+
# @overload full_join(other, suffix: '.1', force_order: true)
|
275
|
+
# If `join_keys` is not specified, common keys in self and other are used
|
276
|
+
# (natural keys). Returns joined dataframe.
|
277
|
+
#
|
278
|
+
# @macro join_before
|
279
|
+
# @macro join_dorce_order
|
280
|
+
# @macro join_after
|
281
|
+
# @macro join_common_example_1
|
282
|
+
# @example without key (use implicit common key)
|
283
|
+
# df.full_join(other)
|
284
|
+
#
|
285
|
+
# # =>
|
286
|
+
# KEY X1 X2
|
287
|
+
# <string> <uint8> <boolean>
|
288
|
+
# 0 A 1 true
|
289
|
+
# 1 B 2 false
|
290
|
+
# 2 C 3 (nil)
|
291
|
+
# 3 D (nil) (nil)
|
94
292
|
#
|
95
|
-
# @
|
96
|
-
# @param join_keys [String, Symbol, ::Array<String, Symbol>] Keys to match.
|
97
|
-
# @return [DataFrame] Joined dataframe.
|
293
|
+
# @overload full_join(other, join_keys, suffix: '.1', force_order: true)
|
98
294
|
#
|
99
|
-
|
100
|
-
|
295
|
+
# @macro join_before
|
296
|
+
# @macro join_key_in_array
|
297
|
+
# @macro join_dorce_order
|
298
|
+
# @macro join_after
|
299
|
+
# @macro join_common_example_1
|
300
|
+
# @example with a key
|
301
|
+
# df.full_join(other, :KEY)
|
302
|
+
#
|
303
|
+
# # =>
|
304
|
+
# KEY X1 X2
|
305
|
+
# <string> <uint8> <boolean>
|
306
|
+
# 0 A 1 true
|
307
|
+
# 1 B 2 false
|
308
|
+
# 2 C 3 (nil)
|
309
|
+
# 3 D (nil) (nil)
|
310
|
+
#
|
311
|
+
# @overload full_join(other, join_key_pairs, suffix: '.1', force_order: true)
|
312
|
+
#
|
313
|
+
# @macro join_before
|
314
|
+
# @macro join_key_in_hash
|
315
|
+
# @macro join_dorce_order
|
316
|
+
# @macro join_after
|
317
|
+
# @macro join_common_example_2
|
318
|
+
# @example with key pairs
|
319
|
+
# df2.full_join(other2, { left: :KEY1, right: :KEY2 })
|
320
|
+
#
|
321
|
+
# # =>
|
322
|
+
# KEY1 X1 X2
|
323
|
+
# <string> <uint8> <boolean>
|
324
|
+
# 0 A 1 true
|
325
|
+
# 1 B 2 false
|
326
|
+
# 2 C 3 (nil)
|
327
|
+
# 3 D (nil) (nil)
|
328
|
+
#
|
329
|
+
def full_join(other, join_keys = nil, suffix: '.1', force_order: true)
|
330
|
+
join(other, join_keys,
|
331
|
+
type: :full_outer, suffix: suffix, force_order: force_order)
|
101
332
|
end
|
102
333
|
|
103
334
|
alias_method :outer_join, :full_join
|
104
335
|
|
105
336
|
# Join matching values to self from other.
|
337
|
+
# - Same as `#join` with `type: :left_outer`
|
338
|
+
# - A kind of mutating join.
|
339
|
+
#
|
340
|
+
# @overload left_join(other, suffix: '.1')
|
341
|
+
# If `join_keys` is not specified, common keys in self and other are used
|
342
|
+
# (natural keys). Returns joined dataframe.
|
343
|
+
#
|
344
|
+
# @macro join_before
|
345
|
+
# @macro join_after
|
346
|
+
# @macro join_common_example_1
|
347
|
+
# @example without key (use implicit common key)
|
348
|
+
# df.left_join(other)
|
349
|
+
#
|
350
|
+
# # =>
|
351
|
+
# KEY X1 X2
|
352
|
+
# <string> <uint8> <boolean>
|
353
|
+
# 0 A 1 true
|
354
|
+
# 1 B 2 false
|
355
|
+
# 2 C 3 (nil)
|
356
|
+
#
|
357
|
+
# @overload left_join(other, join_keys, suffix: '.1')
|
106
358
|
#
|
107
|
-
#
|
108
|
-
#
|
109
|
-
#
|
359
|
+
# @macro join_before
|
360
|
+
# @macro join_key_in_array
|
361
|
+
# @macro join_after
|
362
|
+
# @macro join_common_example_1
|
363
|
+
# @example with a key
|
364
|
+
# df.left_join(other, :KEY)
|
365
|
+
#
|
366
|
+
# # =>
|
367
|
+
# KEY X1 X2
|
368
|
+
# <string> <uint8> <boolean>
|
369
|
+
# 0 A 1 true
|
370
|
+
# 1 B 2 false
|
371
|
+
# 2 C 3 (nil)
|
372
|
+
#
|
373
|
+
# @overload left_join(other, join_key_pairs, suffix: '.1')
|
374
|
+
#
|
375
|
+
# @macro join_before
|
376
|
+
# @macro join_key_in_hash
|
377
|
+
# @macro join_after
|
378
|
+
# @macro join_common_example_2
|
379
|
+
# @example with key pairs
|
380
|
+
# df2.left_join(other2, { left: :KEY1, right: :KEY2 })
|
381
|
+
#
|
382
|
+
# # =>
|
383
|
+
# KEY1 X1 X2
|
384
|
+
# <string> <uint8> <boolean>
|
385
|
+
# 0 A 1 true
|
386
|
+
# 1 B 2 false
|
387
|
+
# 2 C 3 (nil)
|
110
388
|
#
|
111
389
|
def left_join(other, join_keys = nil, suffix: '.1')
|
112
390
|
join(other, join_keys, type: :left_outer, suffix: suffix)
|
113
391
|
end
|
114
392
|
|
115
393
|
# Join matching values from self to other.
|
394
|
+
# - Same as `#join` with `type: :right_outer`
|
395
|
+
# - A kind of mutating join.
|
396
|
+
#
|
397
|
+
# @overload right_join(other, suffix: '.1', force_order: true)
|
398
|
+
# If `join_keys` is not specified, common keys in self and other are used
|
399
|
+
# (natural keys). Returns joined dataframe.
|
400
|
+
#
|
401
|
+
# @macro join_before
|
402
|
+
# @macro join_dorce_order
|
403
|
+
# @macro join_after
|
404
|
+
# @macro join_common_example_1
|
405
|
+
# @example without key (use implicit common key)
|
406
|
+
# df.right_join(other)
|
407
|
+
#
|
408
|
+
# # =>
|
409
|
+
# KEY X1 X2
|
410
|
+
# <string> <uint8> <boolean>
|
411
|
+
# 0 A 1 true
|
412
|
+
# 1 B 2 false
|
413
|
+
# 2 D (nil) (nil)
|
414
|
+
#
|
415
|
+
# @overload right_join(other, join_keys, suffix: '.1', force_order: true)
|
416
|
+
#
|
417
|
+
# @macro join_before
|
418
|
+
# @macro join_key_in_array
|
419
|
+
# @macro join_dorce_order
|
420
|
+
# @macro join_after
|
421
|
+
# @macro join_common_example_1
|
422
|
+
# @example with a key
|
423
|
+
# df.right_join(other, :KEY)
|
116
424
|
#
|
117
|
-
#
|
118
|
-
#
|
119
|
-
#
|
425
|
+
# # =>
|
426
|
+
# KEY X1 X2
|
427
|
+
# <string> <uint8> <boolean>
|
428
|
+
# 0 A 1 true
|
429
|
+
# 1 B 2 false
|
430
|
+
# 2 D (nil) (nil)
|
120
431
|
#
|
121
|
-
|
122
|
-
|
432
|
+
# @overload right_join(other, join_key_pairs, suffix: '.1', force_order: true)
|
433
|
+
#
|
434
|
+
# @macro join_before
|
435
|
+
# @macro join_key_in_hash
|
436
|
+
# @macro join_dorce_order
|
437
|
+
# @macro join_after
|
438
|
+
# @macro join_common_example_2
|
439
|
+
# @example with key pairs
|
440
|
+
# df2.right_join(other2, { left: :KEY1, right: :KEY2 })
|
441
|
+
#
|
442
|
+
# # =>
|
443
|
+
# KEY1 X1 X2
|
444
|
+
# <string> <uint8> <boolean>
|
445
|
+
# 0 A 1 true
|
446
|
+
# 1 B 2 false
|
447
|
+
# 2 D (nil) (nil)
|
448
|
+
#
|
449
|
+
def right_join(other, join_keys = nil, suffix: '.1', force_order: true)
|
450
|
+
join(
|
451
|
+
other,
|
452
|
+
join_keys,
|
453
|
+
type: :right_outer,
|
454
|
+
suffix: suffix,
|
455
|
+
force_order: force_order
|
456
|
+
)
|
123
457
|
end
|
124
458
|
|
125
|
-
# Filtering joins
|
459
|
+
# Filtering joins (#semi_join, #anti_join)
|
126
460
|
|
127
461
|
# Return records of self that have a match in other.
|
462
|
+
# - Same as `#join` with `type: :left_semi`
|
463
|
+
# - A kind of filtering join.
|
464
|
+
#
|
465
|
+
# @overload semi_join(other, suffix: '.1')
|
466
|
+
# If `join_keys` is not specified, common keys in self and other are used
|
467
|
+
# (natural keys). Returns joined dataframe.
|
468
|
+
#
|
469
|
+
# @macro join_before
|
470
|
+
# @macro join_after
|
471
|
+
# @macro join_common_example_1
|
472
|
+
# @example without key (use implicit common key)
|
473
|
+
# df.semi_join(other)
|
474
|
+
#
|
475
|
+
# # =>
|
476
|
+
# KEY X1
|
477
|
+
# <string> <uint8>
|
478
|
+
# 0 A 1
|
479
|
+
# 1 B 2
|
480
|
+
#
|
481
|
+
# @overload semi_join(other, join_keys, suffix: '.1')
|
482
|
+
#
|
483
|
+
# @macro join_before
|
484
|
+
# @macro join_key_in_array
|
485
|
+
# @macro join_after
|
486
|
+
# @macro join_common_example_1
|
487
|
+
# @example with a key
|
488
|
+
# df.semi_join(other, :KEY)
|
489
|
+
#
|
490
|
+
# # =>
|
491
|
+
# KEY X1
|
492
|
+
# <string> <uint8>
|
493
|
+
# 0 A 1
|
494
|
+
# 1 B 2
|
128
495
|
#
|
129
|
-
# @
|
130
|
-
#
|
131
|
-
#
|
496
|
+
# @overload semi_join(other, join_key_pairs, suffix: '.1')
|
497
|
+
#
|
498
|
+
# @macro join_before
|
499
|
+
# @macro join_key_in_hash
|
500
|
+
# @macro join_after
|
501
|
+
# @macro join_common_example_2
|
502
|
+
# @example with key pairs
|
503
|
+
# df2.semi_join(other2, { left: :KEY1, right: :KEY2 })
|
504
|
+
#
|
505
|
+
# # =>
|
506
|
+
# KEY1 X1
|
507
|
+
# <string> <uint8>
|
508
|
+
# 0 A 1
|
509
|
+
# 1 B 2
|
132
510
|
#
|
133
511
|
def semi_join(other, join_keys = nil, suffix: '.1')
|
134
512
|
join(other, join_keys, type: :left_semi, suffix: suffix)
|
135
513
|
end
|
136
514
|
|
137
515
|
# Return records of self that do not have a match in other.
|
516
|
+
# - Same as `#join` with `type: :left_anti`
|
517
|
+
# - A kind of filtering join.
|
518
|
+
#
|
519
|
+
# @overload anti_join(other, suffix: '.1')
|
520
|
+
# If `join_keys` is not specified, common keys in self and other are used
|
521
|
+
# (natural keys). Returns joined dataframe.
|
522
|
+
#
|
523
|
+
# @macro join_before
|
524
|
+
# @macro join_after
|
525
|
+
# @macro join_common_example_1
|
526
|
+
# @example without key (use implicit common key)
|
527
|
+
# df.anti_join(other)
|
528
|
+
#
|
529
|
+
# # =>
|
530
|
+
# KEY X1
|
531
|
+
# <string> <uint8>
|
532
|
+
# 0 C 3
|
533
|
+
#
|
534
|
+
# @overload anti_join(other, join_keys, suffix: '.1')
|
535
|
+
#
|
536
|
+
# @macro join_before
|
537
|
+
# @macro join_key_in_array
|
538
|
+
# @macro join_after
|
539
|
+
# @macro join_common_example_1
|
540
|
+
# @example with a key
|
541
|
+
# df.anti_join(other, :KEY)
|
542
|
+
#
|
543
|
+
# # =>
|
544
|
+
# KEY X1
|
545
|
+
# <string> <uint8>
|
546
|
+
# 0 C 3
|
138
547
|
#
|
139
|
-
# @
|
140
|
-
#
|
141
|
-
#
|
548
|
+
# @overload anti_join(other, join_key_pairs, suffix: '.1')
|
549
|
+
#
|
550
|
+
# @macro join_before
|
551
|
+
# @macro join_key_in_hash
|
552
|
+
# @macro join_after
|
553
|
+
# @macro join_common_example_2
|
554
|
+
# @example with key pairs
|
555
|
+
# df2.anti_join(other2, { left: :KEY1, right: :KEY2 })
|
556
|
+
#
|
557
|
+
# # =>
|
558
|
+
# KEY1 X1
|
559
|
+
# <string> <uint8>
|
560
|
+
# 0 C 3
|
142
561
|
#
|
143
562
|
def anti_join(other, join_keys = nil, suffix: '.1')
|
144
563
|
join(other, join_keys, type: :left_anti, suffix: suffix)
|
145
564
|
end
|
146
565
|
|
147
|
-
# Set operations
|
566
|
+
# Set operations (#intersect, #union, #difference, #set_operable?)
|
148
567
|
|
149
568
|
# Check if set operation with self and other is possible.
|
150
569
|
#
|
151
|
-
# @
|
152
|
-
# @return [Boolean]
|
570
|
+
# @macro join_before
|
571
|
+
# @return [Boolean]
|
572
|
+
# true if set operation is possible.
|
573
|
+
# @macro join_common_example_3
|
574
|
+
# @example
|
575
|
+
# df3.set_operable?(other3) # => true
|
153
576
|
#
|
154
577
|
def set_operable?(other) # rubocop:disable Naming/AccessorMethodName
|
155
|
-
|
156
|
-
keys == other.keys
|
578
|
+
keys == other.keys.map(&:to_sym)
|
157
579
|
end
|
158
580
|
|
159
581
|
# Select records appearing in both self and other.
|
582
|
+
# - Same as `#join` with `type: :inner` when keys in self are same with other.
|
583
|
+
# - A kind of set operations.
|
160
584
|
#
|
161
|
-
# @
|
162
|
-
# @return [DataFrame]
|
585
|
+
# @macro join_before
|
586
|
+
# @return [DataFrame]
|
587
|
+
# joined dataframe.
|
588
|
+
# @macro join_common_example_3
|
589
|
+
# @example
|
590
|
+
# df3.intersect(other3)
|
591
|
+
#
|
592
|
+
# # =>
|
593
|
+
# KEY1 KEY2
|
594
|
+
# <string> <uint8>
|
595
|
+
# 0 A 1
|
163
596
|
#
|
164
597
|
def intersect(other)
|
165
|
-
|
166
|
-
|
598
|
+
unless keys == other.keys.map(&:to_sym)
|
599
|
+
raise DataFrameArgumentError, 'keys are not same with self and other'
|
600
|
+
end
|
167
601
|
|
168
602
|
join(other, keys, type: :inner)
|
169
603
|
end
|
170
604
|
|
171
605
|
# Select records appearing in self or other.
|
606
|
+
# - Same as `#join` with `type: :full_outer` when keys in self are same with other.
|
607
|
+
# - A kind of set operations.
|
172
608
|
#
|
173
|
-
# @
|
174
|
-
# @return [DataFrame]
|
609
|
+
# @macro join_before
|
610
|
+
# @return [DataFrame]
|
611
|
+
# joined dataframe.
|
612
|
+
# @macro join_common_example_3
|
613
|
+
# @example
|
614
|
+
# df3.union(other3)
|
615
|
+
#
|
616
|
+
# # =>
|
617
|
+
# KEY1 KEY2
|
618
|
+
# <string> <uint8>
|
619
|
+
# 0 A 1
|
620
|
+
# 1 B 2
|
621
|
+
# 2 C 3
|
622
|
+
# 3 B 4
|
623
|
+
# 4 D 5
|
175
624
|
#
|
176
625
|
def union(other)
|
177
|
-
|
178
|
-
|
626
|
+
unless keys == other.keys.map(&:to_sym)
|
627
|
+
raise DataFrameArgumentError, 'keys are not same with self and other'
|
628
|
+
end
|
179
629
|
|
180
630
|
join(other, keys, type: :full_outer)
|
181
631
|
end
|
182
632
|
|
183
633
|
# Select records appearing in self but not in other.
|
634
|
+
# - Same as `#join` with `type: :left_anti` when keys in self are same with other.
|
635
|
+
# - A kind of set operations.
|
636
|
+
#
|
637
|
+
# @macro join_before
|
638
|
+
# @return [DataFrame]
|
639
|
+
# joined dataframe.
|
640
|
+
# @macro join_common_example_3
|
641
|
+
# @example
|
642
|
+
# df3.difference(other3)
|
643
|
+
#
|
644
|
+
# # =>
|
645
|
+
# KEY1 KEY2
|
646
|
+
# <string> <uint8>
|
647
|
+
# 0 B 2
|
648
|
+
# 1 C 3
|
649
|
+
#
|
650
|
+
# other3.difference(df3)
|
184
651
|
#
|
185
|
-
#
|
186
|
-
#
|
652
|
+
# # =>
|
653
|
+
# KEY1 KEY2
|
654
|
+
# <string> <uint8>
|
655
|
+
# 0 B 4
|
656
|
+
# 1 D 5
|
187
657
|
#
|
188
658
|
def difference(other)
|
189
|
-
|
190
|
-
|
659
|
+
unless keys == other.keys.map(&:to_sym)
|
660
|
+
raise DataFrameArgumentError, 'keys are not same with self and other'
|
661
|
+
end
|
191
662
|
|
192
663
|
join(other, keys, type: :left_anti)
|
193
664
|
end
|
194
665
|
|
195
666
|
alias_method :setdiff, :difference
|
196
667
|
|
197
|
-
#
|
198
|
-
|
199
|
-
# Join other dataframe
|
668
|
+
# Join another DataFrame or Table to self.
|
200
669
|
#
|
201
|
-
#
|
202
|
-
#
|
203
|
-
#
|
670
|
+
# @!macro join_common_type
|
671
|
+
# @param type [:left_semi, :right_semi, :left_anti, :right_anti, :inner,
|
672
|
+
# :left_outer, :right_outer, :full_outer] type of join.
|
204
673
|
#
|
205
|
-
#
|
206
|
-
#
|
207
|
-
|
208
|
-
|
209
|
-
|
210
|
-
|
211
|
-
|
212
|
-
|
674
|
+
# @!macro join_common_example_4
|
675
|
+
# @example
|
676
|
+
# df4 = DataFrame.new(
|
677
|
+
# X1: %w[A B C],
|
678
|
+
# Y: %w[D E F]
|
679
|
+
# )
|
680
|
+
#
|
681
|
+
# # =>
|
682
|
+
# X1 Y
|
683
|
+
# <string> <string>
|
684
|
+
# 0 A D
|
685
|
+
# 1 B E
|
686
|
+
# 2 C F
|
687
|
+
#
|
688
|
+
# other4 = DataFrame.new(
|
689
|
+
# X2: %w[A B D],
|
690
|
+
# Y: %w[e E E]
|
691
|
+
# )
|
692
|
+
#
|
693
|
+
# # =>
|
694
|
+
# X2 Y
|
695
|
+
# <string> <string>
|
696
|
+
# 0 A e
|
697
|
+
# 1 B E
|
698
|
+
# 2 D E
|
699
|
+
|
700
|
+
# @note the order of joined results will be preserved by default.
|
701
|
+
# This is enabled by appending index column to sort after joining but
|
702
|
+
# it will cause some performance degradation. If you don't care about
|
703
|
+
# the order of the result, set `force_order` option to `false`.
|
704
|
+
#
|
705
|
+
# @overload join(other, type: :inner, suffix: '.1', force_order: true)
|
706
|
+
#
|
707
|
+
# If `join_keys` is not specified, common keys in self and other are used
|
708
|
+
# (natural keys). Returns joined dataframe.
|
709
|
+
#
|
710
|
+
# @macro join_before
|
711
|
+
# @macro join_common_type
|
712
|
+
# @macro join_dorce_order
|
713
|
+
# @macro join_after
|
714
|
+
# @macro join_common_example_1
|
715
|
+
# @example
|
716
|
+
# df.join(other)
|
717
|
+
#
|
718
|
+
# # =>
|
719
|
+
# KEY X1 X2
|
720
|
+
# <string> <uint8> <boolean>
|
721
|
+
# 0 A 1 true
|
722
|
+
# 1 B 2 false
|
723
|
+
#
|
724
|
+
# df.join(other, type: :full_outer)
|
725
|
+
#
|
726
|
+
# # =>
|
727
|
+
# KEY X1 X2
|
728
|
+
# <string> <uint8> <boolean>
|
729
|
+
# 0 A 1 true
|
730
|
+
# 1 B 2 false
|
731
|
+
# 2 C 3 (nil)
|
732
|
+
# 3 D (nil) (nil)
|
733
|
+
#
|
734
|
+
# @overload join(other, join_keys, type: :inner, suffix: '.1', force_order: true)
|
735
|
+
#
|
736
|
+
# @macro join_before
|
737
|
+
# @macro join_key_in_array
|
738
|
+
# @macro join_common_type
|
739
|
+
# @macro join_dorce_order
|
740
|
+
# @macro join_after
|
741
|
+
# @macro join_common_example_3
|
742
|
+
# @example join keys in an Array
|
743
|
+
# df3.join(other3, [:KEY1, :KEY2])
|
744
|
+
#
|
745
|
+
# # =>
|
746
|
+
# KEY1 KEY2
|
747
|
+
# <string> <uint8>
|
748
|
+
# 0 A 1
|
749
|
+
#
|
750
|
+
# @example partial join key and suffix
|
751
|
+
# df3.join(other3, :KEY1, suffix: '.a')
|
752
|
+
#
|
753
|
+
# # =>
|
754
|
+
# KEY1 KEY2 KEY2.a
|
755
|
+
# <string> <uint8> <uint8>
|
756
|
+
# 0 A 1 1
|
757
|
+
# 1 B 2 4
|
758
|
+
#
|
759
|
+
# @overload join(other, join_key_pairs, type: :inner, suffix: '.1', force_order: true)
|
760
|
+
#
|
761
|
+
# @macro join_before
|
762
|
+
# @macro join_key_in_hash
|
763
|
+
# @macro join_common_type
|
764
|
+
# @macro join_dorce_order
|
765
|
+
# @macro join_after
|
766
|
+
# @macro join_common_example_4
|
767
|
+
# @example without options
|
768
|
+
# df4.join(other4)
|
769
|
+
#
|
770
|
+
# # =>
|
771
|
+
# X1 Y X2
|
772
|
+
# <string> <string> <string>
|
773
|
+
# 0 B E D
|
774
|
+
# 1 B E B
|
775
|
+
#
|
776
|
+
# @example join by key pairs
|
777
|
+
# df4.join(other4, { left: [:X1, :Y], right: [:X2, :Y] })
|
778
|
+
#
|
779
|
+
# # =>
|
780
|
+
# X1 Y
|
781
|
+
# <string> <string>
|
782
|
+
# 0 B E
|
783
|
+
#
|
784
|
+
# @example join by key pairs, using renaming by suffix
|
785
|
+
# df4.join(other4, { left: :X1, right: :X2 })
|
786
|
+
#
|
787
|
+
# # =>
|
788
|
+
# X1 Y Y.1
|
789
|
+
# <string> <string> <string>
|
790
|
+
# 0 A D e
|
791
|
+
# 1 B E E
|
792
|
+
#
|
793
|
+
def join(other, join_keys = nil, type: :inner, suffix: '.1', force_order: true)
|
794
|
+
right_table =
|
795
|
+
case other
|
796
|
+
when DataFrame
|
797
|
+
other.table
|
798
|
+
when Arrow::Table
|
799
|
+
other
|
800
|
+
else
|
801
|
+
raise DataFrameArgumentError, 'other must be a DataFrame or an Arrow::Table'
|
802
|
+
end
|
803
|
+
|
804
|
+
type = type.to_sym
|
805
|
+
left_index = :__LEFT_INDEX__
|
806
|
+
right_index = :__RIGHT_INDEX__
|
807
|
+
if force_order && %i[full_outer right_outer].include?(type)
|
808
|
+
left_table = assign(left_index) { indices }.table
|
809
|
+
other = DataFrame.create(other) if other.is_a?(Arrow::Table)
|
810
|
+
right_table = other.assign(right_index) { indices }.table
|
213
811
|
else
|
214
|
-
|
812
|
+
left_table = table
|
215
813
|
end
|
216
814
|
|
217
|
-
|
218
|
-
|
219
|
-
raise DataFrameArgumentError, "#{join_keys} are not common keys" if natural_keys.empty?
|
220
|
-
|
221
|
-
join_keys =
|
222
|
-
if join_keys
|
223
|
-
Array(join_keys).map(&:to_sym)
|
224
|
-
else
|
225
|
-
natural_keys
|
226
|
-
end
|
227
|
-
return self if join_keys.empty?
|
815
|
+
table_keys = left_table.keys
|
816
|
+
other_keys = right_table.keys
|
228
817
|
|
229
|
-
#
|
230
|
-
|
231
|
-
unless remainer_keys.empty?
|
232
|
-
renamer = remainer_keys.each_with_object({}) do |key, hash|
|
233
|
-
new_key = nil
|
234
|
-
loop do
|
235
|
-
new_key = "#{key}#{suffix}".to_sym
|
236
|
-
break unless keys.include?(new_key)
|
818
|
+
# natural keys (implicit common keys)
|
819
|
+
join_keys ||= table_keys.intersection(other_keys)
|
237
820
|
|
238
|
-
|
239
|
-
|
821
|
+
# This is not necessary if additional procedure is contributed to Red Arrow.
|
822
|
+
if join_keys.is_a?(Hash)
|
823
|
+
left_keys = join_keys[:left]
|
824
|
+
right_keys = join_keys[:right]
|
825
|
+
else
|
826
|
+
left_keys = join_keys
|
827
|
+
right_keys = join_keys
|
828
|
+
end
|
829
|
+
left_keys = Array(left_keys).map(&:to_s)
|
830
|
+
right_keys = Array(right_keys).map(&:to_s)
|
240
831
|
|
241
|
-
|
242
|
-
|
243
|
-
|
244
|
-
|
245
|
-
|
832
|
+
case type
|
833
|
+
when :full_outer, :left_semi, :left_anti, :right_semi, :right_anti
|
834
|
+
left_outputs = nil
|
835
|
+
right_outputs = nil
|
836
|
+
when :inner, :left_outer
|
837
|
+
left_outputs = table_keys
|
838
|
+
right_outputs = other_keys - right_keys
|
839
|
+
when :right_outer
|
840
|
+
left_outputs = table_keys - left_keys
|
841
|
+
right_outputs = other_keys
|
246
842
|
end
|
247
843
|
|
248
|
-
#
|
249
|
-
|
250
|
-
|
251
|
-
|
252
|
-
|
253
|
-
|
844
|
+
# Should we rescue errors in Arrow::Table#join for usability ?
|
845
|
+
joined_table =
|
846
|
+
left_table.join(
|
847
|
+
right_table,
|
848
|
+
join_keys,
|
849
|
+
type: type,
|
850
|
+
left_outputs: left_outputs,
|
851
|
+
right_outputs: right_outputs
|
852
|
+
)
|
254
853
|
|
255
854
|
case type
|
256
|
-
when :left_semi, :left_anti, :right_semi, :right_anti
|
257
|
-
|
258
|
-
|
259
|
-
|
260
|
-
|
261
|
-
|
262
|
-
|
263
|
-
|
855
|
+
when :inner, :left_outer, :left_semi, :left_anti, :right_semi, :right_anti
|
856
|
+
if joined_table.keys.uniq!
|
857
|
+
DataFrame.create(rename_table(joined_table, n_keys, suffix))
|
858
|
+
else
|
859
|
+
DataFrame.create(joined_table)
|
860
|
+
end
|
861
|
+
when :full_outer
|
862
|
+
renamed_table = rename_table(joined_table, n_keys, suffix)
|
863
|
+
renamed_keys = renamed_table.keys
|
864
|
+
dropper = []
|
865
|
+
dataframe = DataFrame.create(renamed_table).assign do |df|
|
866
|
+
left_keys.map do |left_key|
|
867
|
+
i_left_key = renamed_keys.index(left_key)
|
868
|
+
right_key = renamed_keys[i_left_key + table_keys.size]
|
869
|
+
dropper << right_key
|
870
|
+
[left_key.to_sym, merge_array(df[left_key].data, df[right_key].data)]
|
871
|
+
end
|
872
|
+
end
|
873
|
+
dataframe = dataframe.sort(left_index, right_index) if force_order
|
874
|
+
|
875
|
+
dataframe.drop(dropper, left_index, right_index)
|
876
|
+
when :right_outer
|
877
|
+
dataframe =
|
878
|
+
if joined_table.keys.uniq!
|
879
|
+
DataFrame.create(rename_table(joined_table, left_outputs.size, suffix))
|
880
|
+
else
|
881
|
+
DataFrame.create(joined_table)
|
882
|
+
end
|
883
|
+
if force_order
|
884
|
+
dataframe =
|
885
|
+
dataframe
|
886
|
+
.sort(left_index, right_index)
|
887
|
+
.drop(left_index, right_index)
|
888
|
+
end
|
889
|
+
dataframe.pick do
|
890
|
+
[right_keys, keys.map(&:to_s) - right_keys]
|
891
|
+
end
|
264
892
|
end
|
265
|
-
DataFrame.new(table_output[selected_indexes])
|
266
|
-
.assign(*join_keys) { merged_columns }
|
267
893
|
end
|
268
894
|
|
269
895
|
private
|
270
896
|
|
271
|
-
|
272
|
-
|
273
|
-
|
274
|
-
|
275
|
-
|
276
|
-
|
277
|
-
|
278
|
-
|
279
|
-
|
280
|
-
|
897
|
+
# Rename duplicate keys by suffix
|
898
|
+
def rename_table(joined_table, n_keys, suffix)
|
899
|
+
joined_keys = joined_table.keys
|
900
|
+
other_keys = joined_keys[n_keys..]
|
901
|
+
|
902
|
+
dup_keys = joined_keys.tally.select { |_, v| v > 1 }.keys
|
903
|
+
renamed_right_keys =
|
904
|
+
other_keys.map do |key|
|
905
|
+
if dup_keys.include?(key)
|
906
|
+
new_key = nil
|
907
|
+
loop do
|
908
|
+
new_key = "#{key}#{suffix}"
|
909
|
+
break unless joined_keys.include?(new_key)
|
910
|
+
|
911
|
+
s = suffix.succ
|
912
|
+
raise DataFrameArgumentError, "suffix #{suffix} is invalid" if s == suffix
|
913
|
+
|
914
|
+
suffix = s
|
915
|
+
end
|
916
|
+
new_key
|
917
|
+
else
|
918
|
+
key
|
919
|
+
end
|
920
|
+
end
|
921
|
+
joined_keys[n_keys..] = renamed_right_keys
|
922
|
+
|
923
|
+
fields =
|
924
|
+
joined_keys.map.with_index do |k, i|
|
925
|
+
Arrow::Field.new(k, joined_table[i].data_type)
|
926
|
+
end
|
927
|
+
Arrow::Table.new(Arrow::Schema.new(fields), joined_table.columns)
|
928
|
+
end
|
929
|
+
|
930
|
+
# Merge two Arrow::Arrays
|
931
|
+
def merge_array(array1, array2)
|
932
|
+
t = Arrow::Function.find(:is_null).execute([array1])
|
933
|
+
Arrow::Function.find(:if_else).execute([t, array2, array1]).value
|
281
934
|
end
|
282
935
|
end
|
283
936
|
end
|