red_amber 0.2.1 → 0.2.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.rubocop.yml +15 -0
- data/CHANGELOG.md +170 -20
- data/Gemfile +4 -2
- data/README.md +121 -302
- data/benchmark/basic.yml +79 -0
- data/benchmark/combine.yml +63 -0
- data/benchmark/drop_nil.yml +15 -3
- data/benchmark/group.yml +33 -0
- data/benchmark/reshape.yml +27 -0
- data/benchmark/{csv_load_penguins.yml → rover/csv_load_penguins.yml} +3 -3
- data/benchmark/rover/flights.yml +23 -0
- data/benchmark/rover/penguins.yml +23 -0
- data/benchmark/rover/planes.yml +23 -0
- data/benchmark/rover/weather.yml +23 -0
- data/doc/DataFrame.md +611 -318
- data/doc/Vector.md +31 -36
- data/doc/image/basic_verbs.png +0 -0
- data/doc/image/dataframe/assign.png +0 -0
- data/doc/image/dataframe/assign_operation.png +0 -0
- data/doc/image/dataframe/drop.png +0 -0
- data/doc/image/dataframe/join.png +0 -0
- data/doc/image/dataframe/pick.png +0 -0
- data/doc/image/dataframe/pick_operation.png +0 -0
- data/doc/image/dataframe/remove.png +0 -0
- data/doc/image/dataframe/rename.png +0 -0
- data/doc/image/dataframe/rename_operation.png +0 -0
- data/doc/image/dataframe/reshaping_DataFrames.png +0 -0
- data/doc/image/dataframe/set_and_bind.png +0 -0
- data/doc/image/dataframe/slice.png +0 -0
- data/doc/image/dataframe/slice_operation.png +0 -0
- data/doc/image/dataframe_model.png +0 -0
- data/doc/image/group_operation.png +0 -0
- data/doc/image/replace-if_then.png +0 -0
- data/doc/image/reshaping_dataframe.png +0 -0
- data/doc/image/screenshot.png +0 -0
- data/doc/image/vector/binary_element_wise.png +0 -0
- data/doc/image/vector/unary_aggregation.png +0 -0
- data/doc/image/vector/unary_aggregation_w_option.png +0 -0
- data/doc/image/vector/unary_element_wise.png +0 -0
- data/lib/red_amber/data_frame.rb +16 -42
- data/lib/red_amber/data_frame_combinable.rb +283 -0
- data/lib/red_amber/data_frame_displayable.rb +58 -3
- data/lib/red_amber/data_frame_loadsave.rb +36 -0
- data/lib/red_amber/data_frame_reshaping.rb +8 -6
- data/lib/red_amber/data_frame_selectable.rb +9 -9
- data/lib/red_amber/data_frame_variable_operation.rb +27 -21
- data/lib/red_amber/group.rb +100 -17
- data/lib/red_amber/helper.rb +20 -30
- data/lib/red_amber/vector.rb +56 -30
- data/lib/red_amber/vector_functions.rb +0 -8
- data/lib/red_amber/vector_selectable.rb +9 -1
- data/lib/red_amber/vector_updatable.rb +61 -63
- data/lib/red_amber/version.rb +1 -1
- data/lib/red_amber.rb +2 -0
- data/red_amber.gemspec +1 -1
- metadata +32 -11
- data/doc/examples_of_red_amber.ipynb +0 -8979
data/doc/Vector.md
CHANGED
@@ -7,7 +7,7 @@ Class `RedAmber::Vector` represents a series of data in the DataFrame.
|
|
7
7
|
### Create from a column in a DataFrame
|
8
8
|
|
9
9
|
```ruby
|
10
|
-
df =
|
10
|
+
df = DataFrame.new(x: [1, 2, 3])
|
11
11
|
df[:x]
|
12
12
|
# =>
|
13
13
|
#<RedAmber::Vector(:uint8, size=3):0x000000000000f4ec>
|
@@ -17,13 +17,16 @@ Class `RedAmber::Vector` represents a series of data in the DataFrame.
|
|
17
17
|
### New from an Array
|
18
18
|
|
19
19
|
```ruby
|
20
|
-
vector =
|
20
|
+
vector = Vector.new([1, 2, 3])
|
21
21
|
# or
|
22
|
-
vector =
|
22
|
+
vector = Vector.new(1, 2, 3)
|
23
23
|
# or
|
24
|
-
vector =
|
24
|
+
vector = Vector.new(1..3)
|
25
25
|
# or
|
26
|
-
vector =
|
26
|
+
vector = Vector.new(Arrow::Array.new([1, 2, 3]))
|
27
|
+
# or
|
28
|
+
require 'arrow-numo-narray'
|
29
|
+
vector = Vector.new(Numo::Int8[1, 2, 3])
|
27
30
|
|
28
31
|
# =>
|
29
32
|
#<RedAmber::Vector(:uint8, size=3):0x000000000000f514>
|
@@ -61,7 +64,7 @@ Class `RedAmber::Vector` represents a series of data in the DataFrame.
|
|
61
64
|
|
62
65
|
### `type_class`
|
63
66
|
|
64
|
-
### `each`
|
67
|
+
### `each`, `map`, `collect`
|
65
68
|
|
66
69
|
If block is not given, returns Enumerator.
|
67
70
|
|
@@ -78,7 +81,7 @@ Class `RedAmber::Vector` represents a series of data in the DataFrame.
|
|
78
81
|
- `limit` sets size limit to display a long array.
|
79
82
|
|
80
83
|
```ruby
|
81
|
-
vector =
|
84
|
+
vector = Vector.new((1..50).to_a)
|
82
85
|
# =>
|
83
86
|
#<RedAmber::Vector(:uint8, size=50):0x000000000000f528>
|
84
87
|
[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, ... ]
|
@@ -95,8 +98,8 @@ Class `RedAmber::Vector` represents a series of data in the DataFrame.
|
|
95
98
|
- A negative index is also accepted, as with Ruby's built-in Array.
|
96
99
|
|
97
100
|
```ruby
|
98
|
-
array =
|
99
|
-
indices =
|
101
|
+
array = Vector.new(%w[A B C D E])
|
102
|
+
indices = Vector.new([0.1, -0.5, -5.1])
|
100
103
|
array.take(indices)
|
101
104
|
# or
|
102
105
|
array[indices]
|
@@ -106,7 +109,7 @@ array[indices]
|
|
106
109
|
["A", "E", "A"]
|
107
110
|
```
|
108
111
|
|
109
|
-
### `filter(booleans)`, `[](booleans)`
|
112
|
+
### `filter(booleans)`, `select(booleans)`, `[](booleans)`
|
110
113
|
|
111
114
|
- Acceptable class for booleans:
|
112
115
|
- An array of true, false, or nil
|
@@ -114,7 +117,7 @@ array[indices]
|
|
114
117
|
- Arrow::BooleanArray
|
115
118
|
|
116
119
|
```ruby
|
117
|
-
array =
|
120
|
+
array = Vector.new(%w[A B C D E])
|
118
121
|
booleans = [true, false, nil, false, true]
|
119
122
|
array.filter(booleans)
|
120
123
|
# or
|
@@ -124,6 +127,7 @@ array[booleans]
|
|
124
127
|
#<RedAmber::Vector(:string, size=2):0x000000000000f21c>
|
125
128
|
["A", "E"]
|
126
129
|
```
|
130
|
+
`filter` and `select` also accept a block.
|
127
131
|
|
128
132
|
## Functions
|
129
133
|
|
@@ -158,7 +162,7 @@ Options can be used as follows.
|
|
158
162
|
See the [document of C++ function](https://arrow.apache.org/docs/cpp/compute.html) for detail.
|
159
163
|
|
160
164
|
```ruby
|
161
|
-
double =
|
165
|
+
double = Vector.new([1, 0/0.0, -1/0.0, 1/0.0, nil, ""])
|
162
166
|
#=>
|
163
167
|
#<RedAmber::Vector(:double, size=6):0x000000000000f910>
|
164
168
|
[1.0, NaN, -Infinity, Infinity, nil, 0.0]
|
@@ -168,7 +172,7 @@ double.count(mode: :only_valid) #=> 5, default
|
|
168
172
|
double.count(mode: :only_null) #=> 1
|
169
173
|
double.count(mode: :all) #=> 6
|
170
174
|
|
171
|
-
boolean =
|
175
|
+
boolean = Vector.new([true, true, nil])
|
172
176
|
#=>
|
173
177
|
#<RedAmber::Vector(:boolean, size=3):0x000000000000f924>
|
174
178
|
[true, true, nil]
|
@@ -215,7 +219,7 @@ Examples of options for `#round`;
|
|
215
219
|
- `round_mode` Specify rounding mode.
|
216
220
|
|
217
221
|
```ruby
|
218
|
-
double =
|
222
|
+
double = Vector.new([15.15, 2.5, 3.5, -4.5, -5.5])
|
219
223
|
# => [15.15, 2.5, 3.5, -4.5, -5.5]
|
220
224
|
double.round
|
221
225
|
# => [15.0, 2.0, 4.0, -4.0, -6.0]
|
@@ -293,7 +297,7 @@ double.round(n_digits: -1)
|
|
293
297
|
array = [0.0/0, Float::NAN]
|
294
298
|
array.tally #=> {NaN=>1, NaN=>1}
|
295
299
|
|
296
|
-
vector =
|
300
|
+
vector = Vector.new(array)
|
297
301
|
vector.tally #=> {NaN=>2}
|
298
302
|
vector.value_counts #=> {NaN=>2}
|
299
303
|
```
|
@@ -310,7 +314,7 @@ double.round(n_digits: -1)
|
|
310
314
|
## Coerce
|
311
315
|
|
312
316
|
```ruby
|
313
|
-
vector =
|
317
|
+
vector = Vector.new(1,2,3)
|
314
318
|
# =>
|
315
319
|
#<RedAmber::Vector(:uint8, size=3):0x00000000000decc4>
|
316
320
|
[1, 2, 3]
|
@@ -340,12 +344,13 @@ vector * -1
|
|
340
344
|
- Accepts Scalar, Range of Integer, Vector, Array, Arrow::Array as a specifier
|
341
345
|
- Accepts Scalar, Vector, Array and Arrow::Array as a replacer.
|
342
346
|
- Boolean specifiers specify the position of replacer in true.
|
347
|
+
- If booleans.any is false, no replacement happens and self is returned.
|
343
348
|
- Index specifiers specify the position of replacer in indices.
|
344
349
|
- replacer specifies the values to be replaced.
|
345
350
|
- The number of true in booleans must be equal to the length of replacer
|
346
351
|
|
347
352
|
```ruby
|
348
|
-
vector =
|
353
|
+
vector = Vector.new([1, 2, 3])
|
349
354
|
booleans = [true, false, true]
|
350
355
|
replacer = [4, 5]
|
351
356
|
vector.replace(booleans, replacer)
|
@@ -379,7 +384,7 @@ vector.replace(booleans, replacer)
|
|
379
384
|
```ruby
|
380
385
|
booleans = [true, false, nil]
|
381
386
|
replacer = -1
|
382
|
-
|
387
|
+
vector.replace(booleans, replacer)
|
383
388
|
=>
|
384
389
|
#<RedAmber::Vector(:int8, size=3):0x00000000000304d0>
|
385
390
|
[-1, 2, nil]
|
@@ -390,17 +395,7 @@ vec.replace(booleans, replacer)
|
|
390
395
|
```ruby
|
391
396
|
booleans = [true, false, true]
|
392
397
|
replacer = [nil]
|
393
|
-
|
394
|
-
=>
|
395
|
-
#<RedAmber::Vector(:int8, size=3):0x00000000000304d0>
|
396
|
-
[nil, 2, nil]
|
397
|
-
```
|
398
|
-
|
399
|
-
- If no replacer specified, it is same as to specify nil.
|
400
|
-
|
401
|
-
```ruby
|
402
|
-
booleans = [true, false, true]
|
403
|
-
vec.replace(booleans)
|
398
|
+
vector.replace(booleans, replacer)
|
404
399
|
=>
|
405
400
|
#<RedAmber::Vector(:int8, size=3):0x00000000000304d0>
|
406
401
|
[nil, 2, nil]
|
@@ -409,7 +404,7 @@ vec.replace(booleans)
|
|
409
404
|
- An example to replace 'NA' to nil.
|
410
405
|
|
411
406
|
```ruby
|
412
|
-
vector =
|
407
|
+
vector = Vector.new(['A', 'B', 'NA'])
|
413
408
|
vector.replace(vector == 'NA', nil)
|
414
409
|
# =>
|
415
410
|
#<RedAmber::Vector(:string, size=3):0x000000000000f8ac>
|
@@ -421,7 +416,7 @@ vector.replace(vector == 'NA', nil)
|
|
421
416
|
Specified indices are used 'as sorted'. Position in indices and replacer may not have correspondence.
|
422
417
|
|
423
418
|
```ruby
|
424
|
-
vector =
|
419
|
+
vector = Vector.new([1, 2, 3])
|
425
420
|
indices = [2, 1]
|
426
421
|
replacer = [4, 5]
|
427
422
|
vector.replace(indices, replacer)
|
@@ -437,7 +432,7 @@ Propagate the last valid observation forward (or backward).
|
|
437
432
|
Or preserve nil if all previous values are nil or at the end.
|
438
433
|
|
439
434
|
```ruby
|
440
|
-
integer =
|
435
|
+
integer = Vector.new([0, 1, nil, 3, nil])
|
441
436
|
integer.fill_nil_forward
|
442
437
|
# =>
|
443
438
|
#<RedAmber::Vector(:uint8, size=5):0x000000000000f960>
|
@@ -459,7 +454,7 @@ Choose values based on self. Self must be a boolean Vector.
|
|
459
454
|
This example will normalize negative indices to positive ones.
|
460
455
|
|
461
456
|
```ruby
|
462
|
-
indices =
|
457
|
+
indices = Vector.new([1, -1, 3, -4])
|
463
458
|
array_size = 10
|
464
459
|
normalized_indices = (indices < 0).if_else(indices + array_size, indices)
|
465
460
|
|
@@ -474,7 +469,7 @@ For each element in self, return true if it is found in given `values`, false ot
|
|
474
469
|
By default, nulls are matched against the value set. (This will be changed in SetLookupOptions: not implemented.)
|
475
470
|
|
476
471
|
```ruby
|
477
|
-
vector =
|
472
|
+
vector = Vector.new %W[A B C D]
|
478
473
|
values = ['A', 'C', 'X']
|
479
474
|
vector.is_in(values)
|
480
475
|
|
@@ -486,7 +481,7 @@ vector.is_in(values)
|
|
486
481
|
`values` are cast to the same class as the Vector.
|
487
482
|
|
488
483
|
```ruby
|
489
|
-
vector =
|
484
|
+
vector = Vector.new([1, 2, 255])
|
490
485
|
vector.is_in(1, -1)
|
491
486
|
|
492
487
|
# =>
|
@@ -499,7 +494,7 @@ vector.is_in(1, -1)
|
|
499
494
|
Shift vector's values by specified `amount`. Shifted space is filled by value `fill`.
|
500
495
|
|
501
496
|
```ruby
|
502
|
-
vector =
|
497
|
+
vector = Vector.new([1, 2, 3, 4, 5])
|
503
498
|
vector.shift
|
504
499
|
|
505
500
|
# =>
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
data/lib/red_amber/data_frame.rb
CHANGED
@@ -5,8 +5,10 @@ module RedAmber
|
|
5
5
|
# Variable @table holds an Arrow::Table object.
|
6
6
|
class DataFrame
|
7
7
|
# mix-in
|
8
|
+
include DataFrameCombinable
|
8
9
|
include DataFrameDisplayable
|
9
10
|
include DataFrameIndexable
|
11
|
+
include DataFrameLoadSave
|
10
12
|
include DataFrameReshaping
|
11
13
|
include DataFrameSelectable
|
12
14
|
include DataFrameVariableOperation
|
@@ -37,10 +39,15 @@ module RedAmber
|
|
37
39
|
# DataFrame.new, DataFrame.new([]), DataFrame.new({}), DataFrame.new(nil)
|
38
40
|
# returns empty DataFrame
|
39
41
|
@table = Arrow::Table.new({}, [])
|
42
|
+
in [->(x) { x.respond_to?(:to_arrow) } => arrowable]
|
43
|
+
table = arrowable.to_arrow
|
44
|
+
unless table.is_a?(Arrow::Table)
|
45
|
+
raise DataFrameTypeError,
|
46
|
+
"to_arrow must return an Arrow::Table but #{table.class}: #{arrowable}"
|
47
|
+
end
|
48
|
+
@table = table
|
40
49
|
in [Arrow::Table => table]
|
41
50
|
@table = table
|
42
|
-
in [DataFrame => dataframe]
|
43
|
-
@table = dataframe.table
|
44
51
|
in [rover_or_hash]
|
45
52
|
begin
|
46
53
|
# Accepts Rover::DataFrame or Hash
|
@@ -52,10 +59,9 @@ module RedAmber
|
|
52
59
|
@table = Arrow::Table.new(*args)
|
53
60
|
end
|
54
61
|
name_unnamed_keys
|
55
|
-
end
|
56
62
|
|
57
|
-
|
58
|
-
|
63
|
+
duplicated_keys = keys.tally.select { |_k, v| v > 1 }.keys
|
64
|
+
raise DataFrameArgumentError, "duplicate keys: #{duplicated_keys}" unless duplicated_keys.empty?
|
59
65
|
end
|
60
66
|
|
61
67
|
attr_reader :table
|
@@ -64,18 +70,15 @@ module RedAmber
|
|
64
70
|
@table
|
65
71
|
end
|
66
72
|
|
67
|
-
def save(output, options = {})
|
68
|
-
@table.save(output, options)
|
69
|
-
end
|
70
|
-
|
71
73
|
# Returns the number of rows.
|
72
74
|
#
|
73
75
|
# @return [Integer] Number of rows.
|
74
76
|
def size
|
75
77
|
@table.n_rows
|
76
78
|
end
|
77
|
-
alias_method :
|
79
|
+
alias_method :n_records, :size
|
78
80
|
alias_method :n_obs, :size
|
81
|
+
alias_method :n_rows, :size
|
79
82
|
|
80
83
|
# Returns the number of columns.
|
81
84
|
#
|
@@ -83,8 +86,9 @@ module RedAmber
|
|
83
86
|
def n_keys
|
84
87
|
@table.n_columns
|
85
88
|
end
|
86
|
-
alias_method :
|
89
|
+
alias_method :n_variables, :n_keys
|
87
90
|
alias_method :n_vars, :n_keys
|
91
|
+
alias_method :n_cols, :n_keys
|
88
92
|
|
89
93
|
# Returns the numbers of rows and columns.
|
90
94
|
#
|
@@ -171,7 +175,7 @@ module RedAmber
|
|
171
175
|
# - indices(1) #=> [1, 2, 3, 4, 5]
|
172
176
|
# - indices('a') #=> ['a', 'b', 'c', 'd', 'e']
|
173
177
|
def indices(start = 0)
|
174
|
-
(start..).take(size)
|
178
|
+
Vector.new((start..).take(size))
|
175
179
|
end
|
176
180
|
alias_method :indexes, :indices
|
177
181
|
|
@@ -215,17 +219,6 @@ module RedAmber
|
|
215
219
|
Rover::DataFrame.new(to_h)
|
216
220
|
end
|
217
221
|
|
218
|
-
def to_iruby
|
219
|
-
require 'iruby'
|
220
|
-
return ['text/plain', '(empty DataFrame)'] if empty?
|
221
|
-
|
222
|
-
if ENV.fetch('RED_AMBER_OUTPUT_MODE', 'Table') == 'TDR'
|
223
|
-
size <= 5 ? ['text/plain', tdr_str(tally: 0)] : ['text/plain', tdr_str]
|
224
|
-
else
|
225
|
-
['text/html', html_table]
|
226
|
-
end
|
227
|
-
end
|
228
|
-
|
229
222
|
def group(*group_keys, &block)
|
230
223
|
g = Group.new(self, group_keys)
|
231
224
|
g = g.summarize(&block) if block
|
@@ -260,25 +253,6 @@ module RedAmber
|
|
260
253
|
ary[%i[variables keys vectors].index(var)]
|
261
254
|
end
|
262
255
|
|
263
|
-
def html_table
|
264
|
-
reduced = size > 8 ? self[0..4, -4..-1] : self
|
265
|
-
|
266
|
-
converted = reduced.assign do
|
267
|
-
vectors.select.with_object({}) do |vector, assigner|
|
268
|
-
if vector.has_nil?
|
269
|
-
assigner[vector.key] = vector.to_a.map do |e|
|
270
|
-
e = e.nil? ? '<i>(nil)</i>' : e.to_s # nil
|
271
|
-
e = '""' if e.empty? # empty string
|
272
|
-
e.sub(/(\s+)/, '"\1"') # blank spaces
|
273
|
-
end
|
274
|
-
end
|
275
|
-
end
|
276
|
-
end
|
277
|
-
|
278
|
-
html = IRuby::HTML.table(converted.to_h, maxrows: 8, maxcols: 15)
|
279
|
-
"#{self.class} <#{size} x #{n_keys} vector#{pl(n_keys)}> #{html}"
|
280
|
-
end
|
281
|
-
|
282
256
|
def name_unnamed_keys
|
283
257
|
return unless @table[:'']
|
284
258
|
|
@@ -0,0 +1,283 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module RedAmber
|
4
|
+
# mix-in for the class DataFrame
|
5
|
+
module DataFrameCombinable
|
6
|
+
# Concatenate other dataframe onto the bottom.
|
7
|
+
#
|
8
|
+
# @param other [DataFrame, Arrow::Table, Array<DataFrame, Arrow::Table>]
|
9
|
+
# DataFrame/Table to concatenate onto the bottom of self.
|
10
|
+
# @return [DataFrame]
|
11
|
+
# Concatenated dataframe.
|
12
|
+
def concatenate(*other)
|
13
|
+
case other
|
14
|
+
in [] | [nil] | [[]]
|
15
|
+
return self
|
16
|
+
in [Array => array]
|
17
|
+
# Nop
|
18
|
+
else
|
19
|
+
array = other
|
20
|
+
end
|
21
|
+
|
22
|
+
table_array = array.map do |e|
|
23
|
+
case e
|
24
|
+
when Arrow::Table
|
25
|
+
e
|
26
|
+
when DataFrame
|
27
|
+
e.table
|
28
|
+
else
|
29
|
+
raise DataFrameArgumentError, "#{e} is not a Table or a DataFrame"
|
30
|
+
end
|
31
|
+
end
|
32
|
+
|
33
|
+
DataFrame.new(table.concatenate(table_array))
|
34
|
+
end
|
35
|
+
|
36
|
+
alias_method :concat, :concatenate
|
37
|
+
alias_method :bind_rows, :concatenate
|
38
|
+
|
39
|
+
# Merge other DataFrame(s) or Table(s) into self as additional columns.
|
40
|
+
# - Self and other must have same size.
|
41
|
+
# - Self and other must not share any keys.
|
42
|
+
# - If they share any keys, raise Error.
|
43
|
+
# @param other [DataFrame, Arrow::Table, Array<DataFrame, Arrow::Table>]
|
44
|
+
# DataFrame/Table to merge.
|
45
|
+
# @return [DataFrame]
|
46
|
+
# Merged dataframe.
|
47
|
+
def merge(*other)
|
48
|
+
case other
|
49
|
+
in [] | [nil] | [[]]
|
50
|
+
return self
|
51
|
+
in [Array => array]
|
52
|
+
# Nop
|
53
|
+
else
|
54
|
+
array = other
|
55
|
+
end
|
56
|
+
|
57
|
+
hash = array.each_with_object({}) do |e, h|
|
58
|
+
df =
|
59
|
+
case e
|
60
|
+
when Arrow::Table
|
61
|
+
DataFrame.new(e)
|
62
|
+
when DataFrame
|
63
|
+
e
|
64
|
+
else
|
65
|
+
raise DataFrameArgumentError, "#{e} is not a Table or a DataFrame"
|
66
|
+
end
|
67
|
+
|
68
|
+
raise DataFrameArgumentError, "#{e} do not have same size as self" if size != df.size
|
69
|
+
|
70
|
+
k = keys.intersection(df.keys).any?
|
71
|
+
raise DataFrameArgumentError, "There are some shared keys: #{k}" if k
|
72
|
+
|
73
|
+
h.merge!(df.to_h)
|
74
|
+
end
|
75
|
+
|
76
|
+
assign(hash)
|
77
|
+
end
|
78
|
+
|
79
|
+
alias_method :bind_cols, :merge
|
80
|
+
|
81
|
+
# Mutating joins
|
82
|
+
|
83
|
+
# Join data, leaving only the matching records.
|
84
|
+
#
|
85
|
+
# @param other [DataFrame, Arrow::Table] DataFrame/Table to be joined with self.
|
86
|
+
# @param join_keys [String, Symbol, ::Array<String, Symbol>] Keys to match.
|
87
|
+
# @return [DataFrame] Joined dataframe.
|
88
|
+
#
|
89
|
+
def inner_join(other, join_keys = nil, suffix: '.1')
|
90
|
+
join(other, join_keys, type: :inner, suffix: suffix)
|
91
|
+
end
|
92
|
+
|
93
|
+
# Join data, leaving all records.
|
94
|
+
#
|
95
|
+
# @param other [DataFrame, Arrow::Table] DataFrame/Table to be joined with self.
|
96
|
+
# @param join_keys [String, Symbol, ::Array<String, Symbol>] Keys to match.
|
97
|
+
# @return [DataFrame] Joined dataframe.
|
98
|
+
#
|
99
|
+
def full_join(other, join_keys = nil, suffix: '.1')
|
100
|
+
join(other, join_keys, type: :full_outer, suffix: suffix)
|
101
|
+
end
|
102
|
+
|
103
|
+
alias_method :outer_join, :full_join
|
104
|
+
|
105
|
+
# Join matching values to self from other.
|
106
|
+
#
|
107
|
+
# @param other [DataFrame, Arrow::Table] DataFrame/Table to be joined with self.
|
108
|
+
# @param join_keys [String, Symbol, ::Array<String, Symbol>] Keys to match.
|
109
|
+
# @return [DataFrame] Joined dataframe.
|
110
|
+
#
|
111
|
+
def left_join(other, join_keys = nil, suffix: '.1')
|
112
|
+
join(other, join_keys, type: :left_outer, suffix: suffix)
|
113
|
+
end
|
114
|
+
|
115
|
+
# Join matching values from self to other.
|
116
|
+
#
|
117
|
+
# @param other [DataFrame, Arrow::Table] DataFrame/Table to be joined with self.
|
118
|
+
# @param join_keys [String, Symbol, ::Array<String, Symbol>] Keys to match.
|
119
|
+
# @return [DataFrame] Joined dataframe.
|
120
|
+
#
|
121
|
+
def right_join(other, join_keys = nil, suffix: '.1')
|
122
|
+
join(other, join_keys, type: :right_outer, suffix: suffix)
|
123
|
+
end
|
124
|
+
|
125
|
+
# Filtering joins
|
126
|
+
|
127
|
+
# Return records of self that have a match in other.
|
128
|
+
#
|
129
|
+
# @param other [DataFrame, Arrow::Table] DataFrame/Table to be joined with self.
|
130
|
+
# @param join_keys [String, Symbol, ::Array<String, Symbol>] Keys to match.
|
131
|
+
# @return [DataFrame] Joined dataframe.
|
132
|
+
#
|
133
|
+
def semi_join(other, join_keys = nil, suffix: '.1')
|
134
|
+
join(other, join_keys, type: :left_semi, suffix: suffix)
|
135
|
+
end
|
136
|
+
|
137
|
+
# Return records of self that do not have a match in other.
|
138
|
+
#
|
139
|
+
# @param other [DataFrame, Arrow::Table] DataFrame/Table to be joined with self.
|
140
|
+
# @param join_keys [String, Symbol, ::Array<String, Symbol>] Keys to match.
|
141
|
+
# @return [DataFrame] Joined dataframe.
|
142
|
+
#
|
143
|
+
def anti_join(other, join_keys = nil, suffix: '.1')
|
144
|
+
join(other, join_keys, type: :left_anti, suffix: suffix)
|
145
|
+
end
|
146
|
+
|
147
|
+
# Set operations
|
148
|
+
|
149
|
+
# Check if set operation with self and other is possible.
|
150
|
+
#
|
151
|
+
# @param other [DataFrame, Arrow::Table] DataFrame/Table to be checked with self.
|
152
|
+
# @return [Boolean] true if set operation is possible.
|
153
|
+
#
|
154
|
+
def set_operable?(other) # rubocop:disable Naming/AccessorMethodName
|
155
|
+
other = DataFrame.new(other) if other.is_a?(Arrow::Table)
|
156
|
+
keys == other.keys
|
157
|
+
end
|
158
|
+
|
159
|
+
# Select records appearing in both self and other.
|
160
|
+
#
|
161
|
+
# @param other [DataFrame, Arrow::Table] DataFrame/Table to be joined with self.
|
162
|
+
# @return [DataFrame] Joined dataframe.
|
163
|
+
#
|
164
|
+
def intersect(other)
|
165
|
+
other = DataFrame.new(other) if other.is_a?(Arrow::Table)
|
166
|
+
raise DataFrameArgumentError, 'keys are not same with self and other' unless keys == other.keys
|
167
|
+
|
168
|
+
join(other, keys, type: :inner)
|
169
|
+
end
|
170
|
+
|
171
|
+
# Select records appearing in self or other.
|
172
|
+
#
|
173
|
+
# @param other [DataFrame, Arrow::Table] DataFrame/Table to be joined with self.
|
174
|
+
# @return [DataFrame] Joined dataframe.
|
175
|
+
#
|
176
|
+
def union(other)
|
177
|
+
other = DataFrame.new(other) if other.is_a?(Arrow::Table)
|
178
|
+
raise DataFrameArgumentError, 'keys are not same with self and other' unless keys == other.keys
|
179
|
+
|
180
|
+
join(other, keys, type: :full_outer)
|
181
|
+
end
|
182
|
+
|
183
|
+
# Select records appearing in self but not in other.
|
184
|
+
#
|
185
|
+
# @param other [DataFrame, Arrow::Table] DataFrame/Table to be joined with self.
|
186
|
+
# @return [DataFrame] Joined dataframe.
|
187
|
+
#
|
188
|
+
def difference(other)
|
189
|
+
other = DataFrame.new(other) if other.is_a?(Arrow::Table)
|
190
|
+
raise DataFrameArgumentError, 'keys are not same with self and other' unless keys == other.keys
|
191
|
+
|
192
|
+
join(other, keys, type: :left_anti)
|
193
|
+
end
|
194
|
+
|
195
|
+
alias_method :setdiff, :difference
|
196
|
+
|
197
|
+
# Undocumented. It is preferable to call specific methods.
|
198
|
+
|
199
|
+
# Join other dataframe
|
200
|
+
#
|
201
|
+
# @param other [DataFrame, Arrow::Table] DataFrame/Table to be joined with self.
|
202
|
+
# @param join_keys [String, Symbol, ::Array<String, Symbol>] Keys to match.
|
203
|
+
# @return [DataFrame] Joined dataframe.
|
204
|
+
#
|
205
|
+
# :type is one of
|
206
|
+
# :left_semi, :right_semi, :left_anti, :right_anti, :inner, :left_outer, :right_outer, :full_outer.
|
207
|
+
def join(other, join_keys = nil, type: :inner, suffix: '.1', left_outputs: nil, right_outputs: nil)
|
208
|
+
case other
|
209
|
+
when DataFrame
|
210
|
+
# Nop
|
211
|
+
when Arrow::Table
|
212
|
+
other = DataFrame.new(other)
|
213
|
+
else
|
214
|
+
raise DataFrameArgumentError, 'other must be a DataFrame or an Arrow::Table'
|
215
|
+
end
|
216
|
+
|
217
|
+
# Support natural keys (implicit common keys)
|
218
|
+
natural_keys = keys.intersection(other.keys)
|
219
|
+
raise DataFrameArgumentError, "#{join_keys} are not common keys" if natural_keys.empty?
|
220
|
+
|
221
|
+
join_keys =
|
222
|
+
if join_keys
|
223
|
+
Array(join_keys).map(&:to_sym)
|
224
|
+
else
|
225
|
+
natural_keys
|
226
|
+
end
|
227
|
+
return self if join_keys.empty?
|
228
|
+
|
229
|
+
# Support partial join_keys (common key other than join_key will be renamed with suffix)
|
230
|
+
remainer_keys = natural_keys - join_keys
|
231
|
+
unless remainer_keys.empty?
|
232
|
+
renamer = remainer_keys.each_with_object({}) do |key, hash|
|
233
|
+
new_key = nil
|
234
|
+
loop do
|
235
|
+
new_key = "#{key}#{suffix}".to_sym
|
236
|
+
break unless keys.include?(new_key)
|
237
|
+
|
238
|
+
s = suffix.succ
|
239
|
+
raise DataFrameArgumentError, "suffix #{suffix} is invalid" if s == suffix
|
240
|
+
|
241
|
+
suffix = s
|
242
|
+
end
|
243
|
+
hash[key] = new_key
|
244
|
+
end
|
245
|
+
other = other.rename(renamer)
|
246
|
+
end
|
247
|
+
|
248
|
+
# Red Arrow's #join returns duplicated join_keys from self and other as of v9.0.0 .
|
249
|
+
# Temporarily merge key vectors here as a workaround.
|
250
|
+
table_output =
|
251
|
+
table.join(other.table, join_keys, type: type, left_outputs: left_outputs, right_outputs: right_outputs)
|
252
|
+
left_indexes = [*0...n_keys]
|
253
|
+
right_indexes = [*((other.keys - join_keys).map { |key| other.keys.index(key) + n_keys })]
|
254
|
+
|
255
|
+
case type
|
256
|
+
when :left_semi, :left_anti, :right_semi, :right_anti
|
257
|
+
return DataFrame.new(table_output)
|
258
|
+
else
|
259
|
+
selected_indexes = left_indexes.concat(right_indexes)
|
260
|
+
end
|
261
|
+
merged_columns = join_keys.map do |key|
|
262
|
+
i = keys.index(key)
|
263
|
+
merge_column(table_output[i], table_output[n_keys + i], type)
|
264
|
+
end
|
265
|
+
DataFrame.new(table_output[selected_indexes])
|
266
|
+
.assign(*join_keys) { merged_columns }
|
267
|
+
end
|
268
|
+
|
269
|
+
private
|
270
|
+
|
271
|
+
def merge_column(column1, column2, type)
|
272
|
+
a1 = column1.to_a
|
273
|
+
a2 = column2.to_a
|
274
|
+
if type == :full_outer
|
275
|
+
a1.zip(a2).map { |x, y| x || y }
|
276
|
+
elsif type.start_with?('right')
|
277
|
+
a2
|
278
|
+
else # :inner or :left-*
|
279
|
+
a1
|
280
|
+
end
|
281
|
+
end
|
282
|
+
end
|
283
|
+
end
|