red_amber 0.2.1 → 0.2.3
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.rubocop.yml +15 -0
- data/CHANGELOG.md +170 -20
- data/Gemfile +4 -2
- data/README.md +121 -302
- data/benchmark/basic.yml +79 -0
- data/benchmark/combine.yml +63 -0
- data/benchmark/drop_nil.yml +15 -3
- data/benchmark/group.yml +33 -0
- data/benchmark/reshape.yml +27 -0
- data/benchmark/{csv_load_penguins.yml → rover/csv_load_penguins.yml} +3 -3
- data/benchmark/rover/flights.yml +23 -0
- data/benchmark/rover/penguins.yml +23 -0
- data/benchmark/rover/planes.yml +23 -0
- data/benchmark/rover/weather.yml +23 -0
- data/doc/DataFrame.md +611 -318
- data/doc/Vector.md +31 -36
- data/doc/image/basic_verbs.png +0 -0
- data/doc/image/dataframe/assign.png +0 -0
- data/doc/image/dataframe/assign_operation.png +0 -0
- data/doc/image/dataframe/drop.png +0 -0
- data/doc/image/dataframe/join.png +0 -0
- data/doc/image/dataframe/pick.png +0 -0
- data/doc/image/dataframe/pick_operation.png +0 -0
- data/doc/image/dataframe/remove.png +0 -0
- data/doc/image/dataframe/rename.png +0 -0
- data/doc/image/dataframe/rename_operation.png +0 -0
- data/doc/image/dataframe/reshaping_DataFrames.png +0 -0
- data/doc/image/dataframe/set_and_bind.png +0 -0
- data/doc/image/dataframe/slice.png +0 -0
- data/doc/image/dataframe/slice_operation.png +0 -0
- data/doc/image/dataframe_model.png +0 -0
- data/doc/image/group_operation.png +0 -0
- data/doc/image/replace-if_then.png +0 -0
- data/doc/image/reshaping_dataframe.png +0 -0
- data/doc/image/screenshot.png +0 -0
- data/doc/image/vector/binary_element_wise.png +0 -0
- data/doc/image/vector/unary_aggregation.png +0 -0
- data/doc/image/vector/unary_aggregation_w_option.png +0 -0
- data/doc/image/vector/unary_element_wise.png +0 -0
- data/lib/red_amber/data_frame.rb +16 -42
- data/lib/red_amber/data_frame_combinable.rb +283 -0
- data/lib/red_amber/data_frame_displayable.rb +58 -3
- data/lib/red_amber/data_frame_loadsave.rb +36 -0
- data/lib/red_amber/data_frame_reshaping.rb +8 -6
- data/lib/red_amber/data_frame_selectable.rb +9 -9
- data/lib/red_amber/data_frame_variable_operation.rb +27 -21
- data/lib/red_amber/group.rb +100 -17
- data/lib/red_amber/helper.rb +20 -30
- data/lib/red_amber/vector.rb +56 -30
- data/lib/red_amber/vector_functions.rb +0 -8
- data/lib/red_amber/vector_selectable.rb +9 -1
- data/lib/red_amber/vector_updatable.rb +61 -63
- data/lib/red_amber/version.rb +1 -1
- data/lib/red_amber.rb +2 -0
- data/red_amber.gemspec +1 -1
- metadata +32 -11
- data/doc/examples_of_red_amber.ipynb +0 -8979
data/doc/Vector.md
CHANGED
@@ -7,7 +7,7 @@ Class `RedAmber::Vector` represents a series of data in the DataFrame.
|
|
7
7
|
### Create from a column in a DataFrame
|
8
8
|
|
9
9
|
```ruby
|
10
|
-
df =
|
10
|
+
df = DataFrame.new(x: [1, 2, 3])
|
11
11
|
df[:x]
|
12
12
|
# =>
|
13
13
|
#<RedAmber::Vector(:uint8, size=3):0x000000000000f4ec>
|
@@ -17,13 +17,16 @@ Class `RedAmber::Vector` represents a series of data in the DataFrame.
|
|
17
17
|
### New from an Array
|
18
18
|
|
19
19
|
```ruby
|
20
|
-
vector =
|
20
|
+
vector = Vector.new([1, 2, 3])
|
21
21
|
# or
|
22
|
-
vector =
|
22
|
+
vector = Vector.new(1, 2, 3)
|
23
23
|
# or
|
24
|
-
vector =
|
24
|
+
vector = Vector.new(1..3)
|
25
25
|
# or
|
26
|
-
vector =
|
26
|
+
vector = Vector.new(Arrow::Array.new([1, 2, 3]))
|
27
|
+
# or
|
28
|
+
require 'arrow-numo-narray'
|
29
|
+
vector = Vector.new(Numo::Int8[1, 2, 3])
|
27
30
|
|
28
31
|
# =>
|
29
32
|
#<RedAmber::Vector(:uint8, size=3):0x000000000000f514>
|
@@ -61,7 +64,7 @@ Class `RedAmber::Vector` represents a series of data in the DataFrame.
|
|
61
64
|
|
62
65
|
### `type_class`
|
63
66
|
|
64
|
-
### `each`
|
67
|
+
### `each`, `map`, `collect`
|
65
68
|
|
66
69
|
If block is not given, returns Enumerator.
|
67
70
|
|
@@ -78,7 +81,7 @@ Class `RedAmber::Vector` represents a series of data in the DataFrame.
|
|
78
81
|
- `limit` sets size limit to display a long array.
|
79
82
|
|
80
83
|
```ruby
|
81
|
-
vector =
|
84
|
+
vector = Vector.new((1..50).to_a)
|
82
85
|
# =>
|
83
86
|
#<RedAmber::Vector(:uint8, size=50):0x000000000000f528>
|
84
87
|
[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, ... ]
|
@@ -95,8 +98,8 @@ Class `RedAmber::Vector` represents a series of data in the DataFrame.
|
|
95
98
|
- Negative indices are also accepted, as with Ruby's built-in Array.
|
96
99
|
|
97
100
|
```ruby
|
98
|
-
array =
|
99
|
-
indices =
|
101
|
+
array = Vector.new(%w[A B C D E])
|
102
|
+
indices = Vector.new([0.1, -0.5, -5.1])
|
100
103
|
array.take(indices)
|
101
104
|
# or
|
102
105
|
array[indices]
|
@@ -106,7 +109,7 @@ array[indices]
|
|
106
109
|
["A", "E", "A"]
|
107
110
|
```
|
108
111
|
|
109
|
-
### `filter(booleans)`, `[](booleans)`
|
112
|
+
### `filter(booleans)`, `select(booleans)`, `[](booleans)`
|
110
113
|
|
111
114
|
- Acceptable class for booleans:
|
112
115
|
- An array of true, false, or nil
|
@@ -114,7 +117,7 @@ array[indices]
|
|
114
117
|
- Arrow::BooleanArray
|
115
118
|
|
116
119
|
```ruby
|
117
|
-
array =
|
120
|
+
array = Vector.new(%w[A B C D E])
|
118
121
|
booleans = [true, false, nil, false, true]
|
119
122
|
array.filter(booleans)
|
120
123
|
# or
|
@@ -124,6 +127,7 @@ array[booleans]
|
|
124
127
|
#<RedAmber::Vector(:string, size=2):0x000000000000f21c>
|
125
128
|
["A", "E"]
|
126
129
|
```
|
130
|
+
`filter` and `select` also accept a block.
|
127
131
|
|
128
132
|
## Functions
|
129
133
|
|
@@ -158,7 +162,7 @@ Options can be used as follows.
|
|
158
162
|
See the [document of C++ function](https://arrow.apache.org/docs/cpp/compute.html) for detail.
|
159
163
|
|
160
164
|
```ruby
|
161
|
-
double =
|
165
|
+
double = Vector.new([1, 0/0.0, -1/0.0, 1/0.0, nil, ""])
|
162
166
|
#=>
|
163
167
|
#<RedAmber::Vector(:double, size=6):0x000000000000f910>
|
164
168
|
[1.0, NaN, -Infinity, Infinity, nil, 0.0]
|
@@ -168,7 +172,7 @@ double.count(mode: :only_valid) #=> 5, default
|
|
168
172
|
double.count(mode: :only_null) #=> 1
|
169
173
|
double.count(mode: :all) #=> 6
|
170
174
|
|
171
|
-
boolean =
|
175
|
+
boolean = Vector.new([true, true, nil])
|
172
176
|
#=>
|
173
177
|
#<RedAmber::Vector(:boolean, size=3):0x000000000000f924>
|
174
178
|
[true, true, nil]
|
@@ -215,7 +219,7 @@ Examples of options for `#round`;
|
|
215
219
|
- `round_mode` Specify rounding mode.
|
216
220
|
|
217
221
|
```ruby
|
218
|
-
double =
|
222
|
+
double = Vector.new([15.15, 2.5, 3.5, -4.5, -5.5])
|
219
223
|
# => [15.15, 2.5, 3.5, -4.5, -5.5]
|
220
224
|
double.round
|
221
225
|
# => [15.0, 2.0, 4.0, -4.0, -6.0]
|
@@ -293,7 +297,7 @@ double.round(n_digits: -1)
|
|
293
297
|
array = [0.0/0, Float::NAN]
|
294
298
|
array.tally #=> {NaN=>1, NaN=>1}
|
295
299
|
|
296
|
-
vector =
|
300
|
+
vector = Vector.new(array)
|
297
301
|
vector.tally #=> {NaN=>2}
|
298
302
|
vector.value_counts #=> {NaN=>2}
|
299
303
|
```
|
@@ -310,7 +314,7 @@ double.round(n_digits: -1)
|
|
310
314
|
## Coerce
|
311
315
|
|
312
316
|
```ruby
|
313
|
-
vector =
|
317
|
+
vector = Vector.new(1,2,3)
|
314
318
|
# =>
|
315
319
|
#<RedAmber::Vector(:uint8, size=3):0x00000000000decc4>
|
316
320
|
[1, 2, 3]
|
@@ -340,12 +344,13 @@ vector * -1
|
|
340
344
|
- Accepts Scalar, Range of Integer, Vector, Array, Arrow::Array as a specifier
|
341
345
|
- Accepts Scalar, Vector, Array and Arrow::Array as a replacer.
|
342
346
|
- Boolean specifiers specify the position of replacer in true.
|
347
|
+
- If `booleans.any` is false, no replacement happens and self is returned.
|
343
348
|
- Index specifiers specify the position of replacer in indices.
|
344
349
|
- replacer specifies the values to be replaced.
|
345
350
|
- The number of true in booleans must be equal to the length of replacer
|
346
351
|
|
347
352
|
```ruby
|
348
|
-
vector =
|
353
|
+
vector = Vector.new([1, 2, 3])
|
349
354
|
booleans = [true, false, true]
|
350
355
|
replacer = [4, 5]
|
351
356
|
vector.replace(booleans, replacer)
|
@@ -379,7 +384,7 @@ vector.replace(booleans, replacer)
|
|
379
384
|
```ruby
|
380
385
|
booleans = [true, false, nil]
|
381
386
|
replacer = -1
|
382
|
-
|
387
|
+
vector.replace(booleans, replacer)
|
383
388
|
=>
|
384
389
|
#<RedAmber::Vector(:int8, size=3):0x00000000000304d0>
|
385
390
|
[-1, 2, nil]
|
@@ -390,17 +395,7 @@ vec.replace(booleans, replacer)
|
|
390
395
|
```ruby
|
391
396
|
booleans = [true, false, true]
|
392
397
|
replacer = [nil]
|
393
|
-
|
394
|
-
=>
|
395
|
-
#<RedAmber::Vector(:int8, size=3):0x00000000000304d0>
|
396
|
-
[nil, 2, nil]
|
397
|
-
```
|
398
|
-
|
399
|
-
- If no replacer specified, it is same as to specify nil.
|
400
|
-
|
401
|
-
```ruby
|
402
|
-
booleans = [true, false, true]
|
403
|
-
vec.replace(booleans)
|
398
|
+
vector.replace(booleans, replacer)
|
404
399
|
=>
|
405
400
|
#<RedAmber::Vector(:int8, size=3):0x00000000000304d0>
|
406
401
|
[nil, 2, nil]
|
@@ -409,7 +404,7 @@ vec.replace(booleans)
|
|
409
404
|
- An example to replace 'NA' to nil.
|
410
405
|
|
411
406
|
```ruby
|
412
|
-
vector =
|
407
|
+
vector = Vector.new(['A', 'B', 'NA'])
|
413
408
|
vector.replace(vector == 'NA', nil)
|
414
409
|
# =>
|
415
410
|
#<RedAmber::Vector(:string, size=3):0x000000000000f8ac>
|
@@ -421,7 +416,7 @@ vector.replace(vector == 'NA', nil)
|
|
421
416
|
Specified indices are used 'as sorted'. Position in indices and replacer may not have correspondence.
|
422
417
|
|
423
418
|
```ruby
|
424
|
-
vector =
|
419
|
+
vector = Vector.new([1, 2, 3])
|
425
420
|
indices = [2, 1]
|
426
421
|
replacer = [4, 5]
|
427
422
|
vector.replace(indices, replacer)
|
@@ -437,7 +432,7 @@ Propagate the last valid observation forward (or backward).
|
|
437
432
|
Or preserve nil if all previous values are nil or at the end.
|
438
433
|
|
439
434
|
```ruby
|
440
|
-
integer =
|
435
|
+
integer = Vector.new([0, 1, nil, 3, nil])
|
441
436
|
integer.fill_nil_forward
|
442
437
|
# =>
|
443
438
|
#<RedAmber::Vector(:uint8, size=5):0x000000000000f960>
|
@@ -459,7 +454,7 @@ Choose values based on self. Self must be a boolean Vector.
|
|
459
454
|
This example will normalize negative indices to positive ones.
|
460
455
|
|
461
456
|
```ruby
|
462
|
-
indices =
|
457
|
+
indices = Vector.new([1, -1, 3, -4])
|
463
458
|
array_size = 10
|
464
459
|
normalized_indices = (indices < 0).if_else(indices + array_size, indices)
|
465
460
|
|
@@ -474,7 +469,7 @@ For each element in self, return true if it is found in given `values`, false ot
|
|
474
469
|
By default, nulls are matched against the value set. (This will be changed in SetLookupOptions: not implemented.)
|
475
470
|
|
476
471
|
```ruby
|
477
|
-
vector =
|
472
|
+
vector = Vector.new %W[A B C D]
|
478
473
|
values = ['A', 'C', 'X']
|
479
474
|
vector.is_in(values)
|
480
475
|
|
@@ -486,7 +481,7 @@ vector.is_in(values)
|
|
486
481
|
`values` are cast to the same class as the Vector.
|
487
482
|
|
488
483
|
```ruby
|
489
|
-
vector =
|
484
|
+
vector = Vector.new([1, 2, 255])
|
490
485
|
vector.is_in(1, -1)
|
491
486
|
|
492
487
|
# =>
|
@@ -499,7 +494,7 @@ vector.is_in(1, -1)
|
|
499
494
|
Shift vector's values by specified `amount`. Shifted space is filled by value `fill`.
|
500
495
|
|
501
496
|
```ruby
|
502
|
-
vector =
|
497
|
+
vector = Vector.new([1, 2, 3, 4, 5])
|
503
498
|
vector.shift
|
504
499
|
|
505
500
|
# =>
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
data/lib/red_amber/data_frame.rb
CHANGED
@@ -5,8 +5,10 @@ module RedAmber
|
|
5
5
|
# Variable @table holds an Arrow::Table object.
|
6
6
|
class DataFrame
|
7
7
|
# mix-in
|
8
|
+
include DataFrameCombinable
|
8
9
|
include DataFrameDisplayable
|
9
10
|
include DataFrameIndexable
|
11
|
+
include DataFrameLoadSave
|
10
12
|
include DataFrameReshaping
|
11
13
|
include DataFrameSelectable
|
12
14
|
include DataFrameVariableOperation
|
@@ -37,10 +39,15 @@ module RedAmber
|
|
37
39
|
# DataFrame.new, DataFrame.new([]), DataFrame.new({}), DataFrame.new(nil)
|
38
40
|
# returns empty DataFrame
|
39
41
|
@table = Arrow::Table.new({}, [])
|
42
|
+
in [->(x) { x.respond_to?(:to_arrow) } => arrowable]
|
43
|
+
table = arrowable.to_arrow
|
44
|
+
unless table.is_a?(Arrow::Table)
|
45
|
+
raise DataFrameTypeError,
|
46
|
+
"to_arrow must return an Arrow::Table but #{table.class}: #{arrowable}"
|
47
|
+
end
|
48
|
+
@table = table
|
40
49
|
in [Arrow::Table => table]
|
41
50
|
@table = table
|
42
|
-
in [DataFrame => dataframe]
|
43
|
-
@table = dataframe.table
|
44
51
|
in [rover_or_hash]
|
45
52
|
begin
|
46
53
|
# Accepts Rover::DataFrame or Hash
|
@@ -52,10 +59,9 @@ module RedAmber
|
|
52
59
|
@table = Arrow::Table.new(*args)
|
53
60
|
end
|
54
61
|
name_unnamed_keys
|
55
|
-
end
|
56
62
|
|
57
|
-
|
58
|
-
|
63
|
+
duplicated_keys = keys.tally.select { |_k, v| v > 1 }.keys
|
64
|
+
raise DataFrameArgumentError, "duplicate keys: #{duplicated_keys}" unless duplicated_keys.empty?
|
59
65
|
end
|
60
66
|
|
61
67
|
attr_reader :table
|
@@ -64,18 +70,15 @@ module RedAmber
|
|
64
70
|
@table
|
65
71
|
end
|
66
72
|
|
67
|
-
def save(output, options = {})
|
68
|
-
@table.save(output, options)
|
69
|
-
end
|
70
|
-
|
71
73
|
# Returns the number of rows.
|
72
74
|
#
|
73
75
|
# @return [Integer] Number of rows.
|
74
76
|
def size
|
75
77
|
@table.n_rows
|
76
78
|
end
|
77
|
-
alias_method :
|
79
|
+
alias_method :n_records, :size
|
78
80
|
alias_method :n_obs, :size
|
81
|
+
alias_method :n_rows, :size
|
79
82
|
|
80
83
|
# Returns the number of columns.
|
81
84
|
#
|
@@ -83,8 +86,9 @@ module RedAmber
|
|
83
86
|
def n_keys
|
84
87
|
@table.n_columns
|
85
88
|
end
|
86
|
-
alias_method :
|
89
|
+
alias_method :n_variables, :n_keys
|
87
90
|
alias_method :n_vars, :n_keys
|
91
|
+
alias_method :n_cols, :n_keys
|
88
92
|
|
89
93
|
# Returns the numbers of rows and columns.
|
90
94
|
#
|
@@ -171,7 +175,7 @@ module RedAmber
|
|
171
175
|
# - indices(1) #=> [1, 2, 3, 4, 5]
|
172
176
|
# - indices('a') #=> ['a', 'b', 'c', 'd', 'e']
|
173
177
|
def indices(start = 0)
|
174
|
-
(start..).take(size)
|
178
|
+
Vector.new((start..).take(size))
|
175
179
|
end
|
176
180
|
alias_method :indexes, :indices
|
177
181
|
|
@@ -215,17 +219,6 @@ module RedAmber
|
|
215
219
|
Rover::DataFrame.new(to_h)
|
216
220
|
end
|
217
221
|
|
218
|
-
def to_iruby
|
219
|
-
require 'iruby'
|
220
|
-
return ['text/plain', '(empty DataFrame)'] if empty?
|
221
|
-
|
222
|
-
if ENV.fetch('RED_AMBER_OUTPUT_MODE', 'Table') == 'TDR'
|
223
|
-
size <= 5 ? ['text/plain', tdr_str(tally: 0)] : ['text/plain', tdr_str]
|
224
|
-
else
|
225
|
-
['text/html', html_table]
|
226
|
-
end
|
227
|
-
end
|
228
|
-
|
229
222
|
def group(*group_keys, &block)
|
230
223
|
g = Group.new(self, group_keys)
|
231
224
|
g = g.summarize(&block) if block
|
@@ -260,25 +253,6 @@ module RedAmber
|
|
260
253
|
ary[%i[variables keys vectors].index(var)]
|
261
254
|
end
|
262
255
|
|
263
|
-
def html_table
|
264
|
-
reduced = size > 8 ? self[0..4, -4..-1] : self
|
265
|
-
|
266
|
-
converted = reduced.assign do
|
267
|
-
vectors.select.with_object({}) do |vector, assigner|
|
268
|
-
if vector.has_nil?
|
269
|
-
assigner[vector.key] = vector.to_a.map do |e|
|
270
|
-
e = e.nil? ? '<i>(nil)</i>' : e.to_s # nil
|
271
|
-
e = '""' if e.empty? # empty string
|
272
|
-
e.sub(/(\s+)/, '"\1"') # blank spaces
|
273
|
-
end
|
274
|
-
end
|
275
|
-
end
|
276
|
-
end
|
277
|
-
|
278
|
-
html = IRuby::HTML.table(converted.to_h, maxrows: 8, maxcols: 15)
|
279
|
-
"#{self.class} <#{size} x #{n_keys} vector#{pl(n_keys)}> #{html}"
|
280
|
-
end
|
281
|
-
|
282
256
|
def name_unnamed_keys
|
283
257
|
return unless @table[:'']
|
284
258
|
|
@@ -0,0 +1,283 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module RedAmber
|
4
|
+
# mix-in for the class DataFrame
|
5
|
+
module DataFrameCombinable
|
6
|
+
# Concatenate other dataframe onto the bottom.
|
7
|
+
#
|
8
|
+
# @param other [DataFrame, Arrow::Table, Array<DataFrame, Arrow::Table>]
|
9
|
+
# DataFrame/Table to concatenate onto the bottom of self.
|
10
|
+
# @return [DataFrame]
|
11
|
+
# Concatenated dataframe.
|
12
|
+
def concatenate(*other)
|
13
|
+
case other
|
14
|
+
in [] | [nil] | [[]]
|
15
|
+
return self
|
16
|
+
in [Array => array]
|
17
|
+
# Nop
|
18
|
+
else
|
19
|
+
array = other
|
20
|
+
end
|
21
|
+
|
22
|
+
table_array = array.map do |e|
|
23
|
+
case e
|
24
|
+
when Arrow::Table
|
25
|
+
e
|
26
|
+
when DataFrame
|
27
|
+
e.table
|
28
|
+
else
|
29
|
+
raise DataFrameArgumentError, "#{e} is not a Table or a DataFrame"
|
30
|
+
end
|
31
|
+
end
|
32
|
+
|
33
|
+
DataFrame.new(table.concatenate(table_array))
|
34
|
+
end
|
35
|
+
|
36
|
+
alias_method :concat, :concatenate
|
37
|
+
alias_method :bind_rows, :concatenate
|
38
|
+
|
39
|
+
# Merge the columns of other DataFrame(s) or Table(s) into self.
|
40
|
+
# - Self and other must have same size.
|
41
|
+
# - Self and other must not share any keys.
|
42
|
+
# - If they share any keys, raise Error.
|
43
|
+
# @param other [DataFrame, Arrow::Table, Array<DataFrame, Arrow::Table>]
|
44
|
+
# DataFrame/Table to merge.
|
45
|
+
# @return [DataFrame]
|
46
|
+
# Merged dataframe.
|
47
|
+
def merge(*other)
|
48
|
+
case other
|
49
|
+
in [] | [nil] | [[]]
|
50
|
+
return self
|
51
|
+
in [Array => array]
|
52
|
+
# Nop
|
53
|
+
else
|
54
|
+
array = other
|
55
|
+
end
|
56
|
+
|
57
|
+
hash = array.each_with_object({}) do |e, h|
|
58
|
+
df =
|
59
|
+
case e
|
60
|
+
when Arrow::Table
|
61
|
+
DataFrame.new(e)
|
62
|
+
when DataFrame
|
63
|
+
e
|
64
|
+
else
|
65
|
+
raise DataFrameArgumentError, "#{e} is not a Table or a DataFrame"
|
66
|
+
end
|
67
|
+
|
68
|
+
raise DataFrameArgumentError, "#{e} do not have same size as self" if size != df.size
|
69
|
+
|
70
|
+
k = keys.intersection(df.keys).any?
|
71
|
+
raise DataFrameArgumentError, "There are some shared keys: #{k}" if k
|
72
|
+
|
73
|
+
h.merge!(df.to_h)
|
74
|
+
end
|
75
|
+
|
76
|
+
assign(hash)
|
77
|
+
end
|
78
|
+
|
79
|
+
alias_method :bind_cols, :merge
|
80
|
+
|
81
|
+
# Mutating joins
|
82
|
+
|
83
|
+
# Join data, leaving only the matching records.
|
84
|
+
#
|
85
|
+
# @param other [DataFrame, Arrow::Table] DataFrame/Table to be joined with self.
|
86
|
+
# @param join_keys [String, Symbol, ::Array<String, Symbol>] Keys to match.
|
87
|
+
# @return [DataFrame] Joined dataframe.
|
88
|
+
#
|
89
|
+
def inner_join(other, join_keys = nil, suffix: '.1')
|
90
|
+
join(other, join_keys, type: :inner, suffix: suffix)
|
91
|
+
end
|
92
|
+
|
93
|
+
# Join data, leaving all records.
|
94
|
+
#
|
95
|
+
# @param other [DataFrame, Arrow::Table] DataFrame/Table to be joined with self.
|
96
|
+
# @param join_keys [String, Symbol, ::Array<String, Symbol>] Keys to match.
|
97
|
+
# @return [DataFrame] Joined dataframe.
|
98
|
+
#
|
99
|
+
def full_join(other, join_keys = nil, suffix: '.1')
|
100
|
+
join(other, join_keys, type: :full_outer, suffix: suffix)
|
101
|
+
end
|
102
|
+
|
103
|
+
alias_method :outer_join, :full_join
|
104
|
+
|
105
|
+
# Join matching values to self from other.
|
106
|
+
#
|
107
|
+
# @param other [DataFrame, Arrow::Table] DataFrame/Table to be joined with self.
|
108
|
+
# @param join_keys [String, Symbol, ::Array<String, Symbol>] Keys to match.
|
109
|
+
# @return [DataFrame] Joined dataframe.
|
110
|
+
#
|
111
|
+
def left_join(other, join_keys = nil, suffix: '.1')
|
112
|
+
join(other, join_keys, type: :left_outer, suffix: suffix)
|
113
|
+
end
|
114
|
+
|
115
|
+
# Join matching values from self to other.
|
116
|
+
#
|
117
|
+
# @param other [DataFrame, Arrow::Table] DataFrame/Table to be joined with self.
|
118
|
+
# @param join_keys [String, Symbol, ::Array<String, Symbol>] Keys to match.
|
119
|
+
# @return [DataFrame] Joined dataframe.
|
120
|
+
#
|
121
|
+
def right_join(other, join_keys = nil, suffix: '.1')
|
122
|
+
join(other, join_keys, type: :right_outer, suffix: suffix)
|
123
|
+
end
|
124
|
+
|
125
|
+
# Filtering joins
|
126
|
+
|
127
|
+
# Return records of self that have a match in other.
|
128
|
+
#
|
129
|
+
# @param other [DataFrame, Arrow::Table] DataFrame/Table to be joined with self.
|
130
|
+
# @param join_keys [String, Symbol, ::Array<String, Symbol>] Keys to match.
|
131
|
+
# @return [DataFrame] Joined dataframe.
|
132
|
+
#
|
133
|
+
def semi_join(other, join_keys = nil, suffix: '.1')
|
134
|
+
join(other, join_keys, type: :left_semi, suffix: suffix)
|
135
|
+
end
|
136
|
+
|
137
|
+
# Return records of self that do not have a match in other.
|
138
|
+
#
|
139
|
+
# @param other [DataFrame, Arrow::Table] DataFrame/Table to be joined with self.
|
140
|
+
# @param join_keys [String, Symbol, ::Array<String, Symbol>] Keys to match.
|
141
|
+
# @return [DataFrame] Joined dataframe.
|
142
|
+
#
|
143
|
+
def anti_join(other, join_keys = nil, suffix: '.1')
|
144
|
+
join(other, join_keys, type: :left_anti, suffix: suffix)
|
145
|
+
end
|
146
|
+
|
147
|
+
# Set operations
|
148
|
+
|
149
|
+
# Check if set operation with self and other is possible.
|
150
|
+
#
|
151
|
+
# @param other [DataFrame, Arrow::Table] DataFrame/Table to be checked with self.
|
152
|
+
# @return [Boolean] true if set operation is possible.
|
153
|
+
#
|
154
|
+
def set_operable?(other) # rubocop:disable Naming/AccessorMethodName
|
155
|
+
other = DataFrame.new(other) if other.is_a?(Arrow::Table)
|
156
|
+
keys == other.keys
|
157
|
+
end
|
158
|
+
|
159
|
+
# Select records appearing in both self and other.
|
160
|
+
#
|
161
|
+
# @param other [DataFrame, Arrow::Table] DataFrame/Table to be joined with self.
|
162
|
+
# @return [DataFrame] Joined dataframe.
|
163
|
+
#
|
164
|
+
def intersect(other)
|
165
|
+
other = DataFrame.new(other) if other.is_a?(Arrow::Table)
|
166
|
+
raise DataFrameArgumentError, 'keys are not same with self and other' unless keys == other.keys
|
167
|
+
|
168
|
+
join(other, keys, type: :inner)
|
169
|
+
end
|
170
|
+
|
171
|
+
# Select records appearing in self or other.
|
172
|
+
#
|
173
|
+
# @param other [DataFrame, Arrow::Table] DataFrame/Table to be joined with self.
|
174
|
+
# @return [DataFrame] Joined dataframe.
|
175
|
+
#
|
176
|
+
def union(other)
|
177
|
+
other = DataFrame.new(other) if other.is_a?(Arrow::Table)
|
178
|
+
raise DataFrameArgumentError, 'keys are not same with self and other' unless keys == other.keys
|
179
|
+
|
180
|
+
join(other, keys, type: :full_outer)
|
181
|
+
end
|
182
|
+
|
183
|
+
# Select records appearing in self but not in other.
|
184
|
+
#
|
185
|
+
# @param other [DataFrame, Arrow::Table] DataFrame/Table to be joined with self.
|
186
|
+
# @return [DataFrame] Joined dataframe.
|
187
|
+
#
|
188
|
+
def difference(other)
|
189
|
+
other = DataFrame.new(other) if other.is_a?(Arrow::Table)
|
190
|
+
raise DataFrameArgumentError, 'keys are not same with self and other' unless keys == other.keys
|
191
|
+
|
192
|
+
join(other, keys, type: :left_anti)
|
193
|
+
end
|
194
|
+
|
195
|
+
alias_method :setdiff, :difference
|
196
|
+
|
197
|
+
# Undocumented. It is preferable to call specific methods.
|
198
|
+
|
199
|
+
# Join other dataframe
|
200
|
+
#
|
201
|
+
# @param other [DataFrame, Arrow::Table] DataFrame/Table to be joined with self.
|
202
|
+
# @param join_keys [String, Symbol, ::Array<String, Symbol>] Keys to match.
|
203
|
+
# @return [DataFrame] Joined dataframe.
|
204
|
+
#
|
205
|
+
# :type is one of
|
206
|
+
# :left_semi, :right_semi, :left_anti, :right_anti, :inner, :left_outer, :right_outer, :full_outer.
|
207
|
+
def join(other, join_keys = nil, type: :inner, suffix: '.1', left_outputs: nil, right_outputs: nil)
|
208
|
+
case other
|
209
|
+
when DataFrame
|
210
|
+
# Nop
|
211
|
+
when Arrow::Table
|
212
|
+
other = DataFrame.new(other)
|
213
|
+
else
|
214
|
+
raise DataFrameArgumentError, 'other must be a DataFrame or an Arrow::Table'
|
215
|
+
end
|
216
|
+
|
217
|
+
# Support natural keys (implicit common keys)
|
218
|
+
natural_keys = keys.intersection(other.keys)
|
219
|
+
raise DataFrameArgumentError, "#{join_keys} are not common keys" if natural_keys.empty?
|
220
|
+
|
221
|
+
join_keys =
|
222
|
+
if join_keys
|
223
|
+
Array(join_keys).map(&:to_sym)
|
224
|
+
else
|
225
|
+
natural_keys
|
226
|
+
end
|
227
|
+
return self if join_keys.empty?
|
228
|
+
|
229
|
+
# Support partial join_keys (common key other than join_key will be renamed with suffix)
|
230
|
+
remainer_keys = natural_keys - join_keys
|
231
|
+
unless remainer_keys.empty?
|
232
|
+
renamer = remainer_keys.each_with_object({}) do |key, hash|
|
233
|
+
new_key = nil
|
234
|
+
loop do
|
235
|
+
new_key = "#{key}#{suffix}".to_sym
|
236
|
+
break unless keys.include?(new_key)
|
237
|
+
|
238
|
+
s = suffix.succ
|
239
|
+
raise DataFrameArgumentError, "suffix #{suffix} is invalid" if s == suffix
|
240
|
+
|
241
|
+
suffix = s
|
242
|
+
end
|
243
|
+
hash[key] = new_key
|
244
|
+
end
|
245
|
+
other = other.rename(renamer)
|
246
|
+
end
|
247
|
+
|
248
|
+
# Red Arrow's #join returns duplicated join_keys from self and other as of v9.0.0.
|
249
|
+
# Temporarily merge key vectors here as a workaround.
|
250
|
+
table_output =
|
251
|
+
table.join(other.table, join_keys, type: type, left_outputs: left_outputs, right_outputs: right_outputs)
|
252
|
+
left_indexes = [*0...n_keys]
|
253
|
+
right_indexes = [*((other.keys - join_keys).map { |key| other.keys.index(key) + n_keys })]
|
254
|
+
|
255
|
+
case type
|
256
|
+
when :left_semi, :left_anti, :right_semi, :right_anti
|
257
|
+
return DataFrame.new(table_output)
|
258
|
+
else
|
259
|
+
selected_indexes = left_indexes.concat(right_indexes)
|
260
|
+
end
|
261
|
+
merged_columns = join_keys.map do |key|
|
262
|
+
i = keys.index(key)
|
263
|
+
merge_column(table_output[i], table_output[n_keys + i], type)
|
264
|
+
end
|
265
|
+
DataFrame.new(table_output[selected_indexes])
|
266
|
+
.assign(*join_keys) { merged_columns }
|
267
|
+
end
|
268
|
+
|
269
|
+
private
|
270
|
+
|
271
|
+
def merge_column(column1, column2, type)
|
272
|
+
a1 = column1.to_a
|
273
|
+
a2 = column2.to_a
|
274
|
+
if type == :full_outer
|
275
|
+
a1.zip(a2).map { |x, y| x || y }
|
276
|
+
elsif type.start_with?('right')
|
277
|
+
a2
|
278
|
+
else # :inner or :left-*
|
279
|
+
a1
|
280
|
+
end
|
281
|
+
end
|
282
|
+
end
|
283
|
+
end
|