red_amber 0.4.2 → 0.5.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.devcontainer/Dockerfile +75 -0
- data/.devcontainer/devcontainer.json +38 -0
- data/.devcontainer/onCreateCommand.sh +22 -0
- data/.rubocop.yml +11 -5
- data/CHANGELOG.md +141 -17
- data/Gemfile +5 -6
- data/README.ja.md +271 -0
- data/README.md +52 -31
- data/Rakefile +55 -0
- data/benchmark/group.yml +12 -5
- data/doc/Dev_Containers.ja.md +290 -0
- data/doc/Dev_Containers.md +292 -0
- data/doc/qmd/examples_of_red_amber.qmd +4596 -0
- data/doc/qmd/red-amber.qmd +90 -0
- data/docker/Dockerfile +2 -2
- data/docker/Gemfile +8 -3
- data/docker/docker-compose.yml +1 -1
- data/docker/readme.md +5 -5
- data/lib/red_amber/data_frame.rb +78 -4
- data/lib/red_amber/data_frame_combinable.rb +147 -119
- data/lib/red_amber/data_frame_displayable.rb +7 -6
- data/lib/red_amber/data_frame_loadsave.rb +1 -1
- data/lib/red_amber/data_frame_selectable.rb +51 -2
- data/lib/red_amber/data_frame_variable_operation.rb +6 -6
- data/lib/red_amber/group.rb +476 -127
- data/lib/red_amber/helper.rb +26 -0
- data/lib/red_amber/subframes.rb +18 -11
- data/lib/red_amber/vector.rb +45 -25
- data/lib/red_amber/vector_aggregation.rb +26 -0
- data/lib/red_amber/vector_selectable.rb +124 -40
- data/lib/red_amber/vector_string_function.rb +279 -0
- data/lib/red_amber/vector_unary_element_wise.rb +4 -0
- data/lib/red_amber/vector_updatable.rb +28 -0
- data/lib/red_amber/version.rb +1 -1
- data/lib/red_amber.rb +2 -1
- data/red_amber.gemspec +3 -3
- metadata +19 -14
- data/docker/Gemfile.lock +0 -80
- data/docker/example +0 -74
- data/docker/notebook/examples_of_red_amber.ipynb +0 -8562
- data/docker/notebook/red-amber.ipynb +0 -188
@@ -0,0 +1,90 @@
|
|
1
|
+
---
|
2
|
+
title: RedAmber Examples
|
3
|
+
date: 2023-08-06
|
4
|
+
author: heronshoes
|
5
|
+
jupyter: ruby
|
6
|
+
format:
|
7
|
+
pdf:
|
8
|
+
toc: true
|
9
|
+
---
|
10
|
+
|
11
|
+
This notebook walks through the [README of RedAmber](https://github.com/heronshoes/red_amber#readme).
|
12
|
+
|
13
|
+
## `RedAmber::DataFrame`
|
14
|
+
|
15
|
+
```{ruby}
|
16
|
+
#| tags: []
|
17
|
+
require 'red_amber'
|
18
|
+
include RedAmber
|
19
|
+
require 'datasets-arrow'
|
20
|
+
|
21
|
+
{RedAmber: VERSION, Datasets: Datasets::VERSION}
|
22
|
+
```
|
23
|
+
|
24
|
+
## Example: diamonds dataset
|
25
|
+
|
26
|
+
The first time Datasets::Diamonds is loaded, it will take some time to download.
|
27
|
+
|
28
|
+
```{ruby}
|
29
|
+
#| tags: []
|
30
|
+
dataset = Datasets::Diamonds.new
|
31
|
+
diamonds = DataFrame.new(dataset)
|
32
|
+
```
|
33
|
+
|
34
|
+
```{ruby}
|
35
|
+
#| tags: []
|
36
|
+
df = diamonds
|
37
|
+
.slice { carat > 1 } # or use #filter instead of #slice
|
38
|
+
.group(:cut)
|
39
|
+
.mean(:price) # `pick` prior to `group` is not required if `:price` is specified here.
|
40
|
+
.sort('-mean(price)')
|
41
|
+
```
|
42
|
+
|
43
|
+
```{ruby}
|
44
|
+
#| tags: []
|
45
|
+
usdjpy = 110.0 # when the yen was stronger
|
46
|
+
|
47
|
+
df.rename('mean(price)': :mean_price_USD)
|
48
|
+
.assign(:mean_price_JPY) { mean_price_USD * usdjpy }
|
49
|
+
```
|
50
|
+
|
51
|
+
## Example: starwars dataset
|
52
|
+
|
53
|
+
```{ruby}
|
54
|
+
#| tags: []
|
55
|
+
uri = URI('https://vincentarelbundock.github.io/Rdatasets/csv/dplyr/starwars.csv')
|
56
|
+
|
57
|
+
starwars = DataFrame.load(uri)
|
58
|
+
```
|
59
|
+
|
60
|
+
```{ruby}
|
61
|
+
#| tags: []
|
62
|
+
starwars
|
63
|
+
.drop(0) # delete unnecessary index column
|
64
|
+
.remove { species == "NA" } # delete unnecessary rows
|
65
|
+
.group(:species) { [count(:species), mean(:height, :mass)] }
|
66
|
+
.slice { count > 1 } # or use #filter instead of slice
|
67
|
+
```
|
68
|
+
|
69
|
+
## `RedAmber::Vector`
|
70
|
+
|
71
|
+
```{ruby}
|
72
|
+
#| tags: []
|
73
|
+
penguins = DataFrame.new(Datasets::Penguins.new)
|
74
|
+
```
|
75
|
+
|
76
|
+
```{ruby}
|
77
|
+
#| tags: []
|
78
|
+
penguins[:bill_length_mm]
|
79
|
+
```
|
80
|
+
|
81
|
+
```{ruby}
|
82
|
+
#| tags: []
|
83
|
+
penguins[:bill_length_mm] < 40
|
84
|
+
```
|
85
|
+
|
86
|
+
```{ruby}
|
87
|
+
#| tags: []
|
88
|
+
penguins[:bill_length_mm].mean
|
89
|
+
```
|
90
|
+
|
data/docker/Dockerfile
CHANGED
data/docker/Gemfile
CHANGED
@@ -5,11 +5,11 @@ source 'https://rubygems.org'
|
|
5
5
|
gem 'irb'
|
6
6
|
|
7
7
|
gem 'numo-narray'
|
8
|
-
gem 'red-arrow', '~>
|
8
|
+
gem 'red-arrow', '~> 12.0.0'
|
9
9
|
gem 'red-arrow-numo-narray'
|
10
|
-
gem 'red-parquet', '~>
|
10
|
+
gem 'red-parquet', '~> 12.0.0'
|
11
11
|
|
12
|
-
gem 'red_amber'
|
12
|
+
gem 'red_amber'
|
13
13
|
gem 'red-amber-view'
|
14
14
|
gem 'rover-df'
|
15
15
|
|
@@ -18,4 +18,9 @@ gem 'red-datasets-arrow'
|
|
18
18
|
|
19
19
|
gem 'benchmark_driver'
|
20
20
|
gem 'benchmark-ips'
|
21
|
+
|
22
|
+
gem 'charty'
|
21
23
|
gem 'faker'
|
24
|
+
gem 'matplotlib'
|
25
|
+
gem 'pycall'
|
26
|
+
gem 'unicode_plot'
|
data/docker/docker-compose.yml
CHANGED
data/docker/readme.md
CHANGED
@@ -6,12 +6,12 @@ This is a docker image containing RedAmber created from
|
|
6
6
|
## Contents
|
7
7
|
|
8
8
|
- From jupyter/minimal-notebook:
|
9
|
-
- Based on 2023-
|
9
|
+
- Based on 2023-05-15 (513d0cb8a67c)
|
10
10
|
- x86-64
|
11
11
|
- Ubuntu-22.04
|
12
|
-
- python-3.10.
|
13
|
-
- lab-3.6.
|
14
|
-
- notebook-6.5.
|
12
|
+
- python-3.10.11
|
13
|
+
- lab-3.6.3
|
14
|
+
- notebook-6.5.4
|
15
15
|
- System ruby-dev:
|
16
16
|
- Ruby 3.0.2
|
17
17
|
- Arrow 11.0.0 for Ubuntu:
|
@@ -22,7 +22,7 @@ This is a docker image containing RedAmber created from
|
|
22
22
|
- Locally installed iruby:
|
23
23
|
- Using Ruby 3.0.2
|
24
24
|
- Locally installed bundler and Gemfile:
|
25
|
-
- RedAmber 0.
|
25
|
+
- RedAmber 0.5.0
|
26
26
|
- Others (see Gemfile)
|
27
27
|
|
28
28
|
## Install
|
data/lib/red_amber/data_frame.rb
CHANGED
@@ -422,12 +422,12 @@ module RedAmber
|
|
422
422
|
# Create SubFrames by value grouping.
|
423
423
|
#
|
424
424
|
# [Experimental feature] this method may be removed or be changed in the future.
|
425
|
-
# @param keys [Symbol, String
|
425
|
+
# @param keys [List<Symbol, String>, Array<Symbol, String>]
|
426
426
|
# grouping keys.
|
427
427
|
# @return [SubFrames]
|
428
428
|
# a created SubFrames grouped by column values on `keys`.
|
429
429
|
# @example
|
430
|
-
# df.sub_by_value(
|
430
|
+
# df.sub_by_value(:y)
|
431
431
|
#
|
432
432
|
# # =>
|
433
433
|
# #<RedAmber::SubFrames : 0x000000000000fc08>
|
@@ -454,10 +454,11 @@ module RedAmber
|
|
454
454
|
#
|
455
455
|
# @since 0.4.0
|
456
456
|
#
|
457
|
-
def sub_by_value(keys
|
458
|
-
SubFrames.new(self, group(keys).filters)
|
457
|
+
def sub_by_value(*keys)
|
458
|
+
SubFrames.new(self, group(keys.flatten).filters)
|
459
459
|
end
|
460
460
|
alias_method :subframes_by_value, :sub_by_value
|
461
|
+
alias_method :sub_group, :sub_by_value
|
461
462
|
|
462
463
|
# Create SubFrames by Windowing with `from`, `size` and `step`.
|
463
464
|
#
|
@@ -697,6 +698,79 @@ module RedAmber
|
|
697
698
|
end
|
698
699
|
end
|
699
700
|
|
701
|
+
# Returns a Vector such that all elements have value `scalar`
|
702
|
+
# and have same size as self.
|
703
|
+
#
|
704
|
+
# @overload propagate(scalar)
|
705
|
+
# Specifies scalar as an argument.
|
706
|
+
#
|
707
|
+
# @param scalar [scalar]
|
708
|
+
# a value to propagate in Vector.
|
709
|
+
# @return [Vector]
|
710
|
+
# created Vector.
|
711
|
+
# @example propagate a value
|
712
|
+
# df
|
713
|
+
# # =>
|
714
|
+
# #<RedAmber::DataFrame : 6 x 3 Vectors, 0x00000000000849a4>
|
715
|
+
# x y z
|
716
|
+
# <uint8> <string> <boolean>
|
717
|
+
# 0 1 A false
|
718
|
+
# 1 2 A true
|
719
|
+
# 2 3 B false
|
720
|
+
# 3 4 B (nil)
|
721
|
+
# 4 5 B true
|
722
|
+
# 5 6 C false
|
723
|
+
#
|
724
|
+
# df.assign(:sum_x) { propagate(x.sum) }
|
725
|
+
# # =>
|
726
|
+
# #<RedAmber::DataFrame : 6 x 4 Vectors, 0x000000000007bd04>
|
727
|
+
# x y z sum_x
|
728
|
+
# <uint8> <string> <boolean> <uint8>
|
729
|
+
# 0 1 A false 21
|
730
|
+
# 1 2 A true 21
|
731
|
+
# 2 3 B false 21
|
732
|
+
# 3 4 B (nil) 21
|
733
|
+
# 4 5 B true 21
|
734
|
+
# 5 6 C false 21
|
735
|
+
#
|
736
|
+
# # Using `Vector#propagate` as below gives the same result as above.
|
737
|
+
# df.assign(:sum_x) { x.propagate(:sum) }
|
738
|
+
#
|
739
|
+
# # It is also the same as creating a column from an Array.
|
740
|
+
# df.assign(:sum_x) { [x.sum] * size }
|
741
|
+
#
|
742
|
+
# @overload propagate
|
743
|
+
#
|
744
|
+
# @yieldparam self [DataFrame]
|
745
|
+
# gives self to the block.
|
746
|
+
# @yieldreturn [scalar]
|
747
|
+
# a value to propagate in Vector
|
748
|
+
# @return [Vector]
|
749
|
+
# created Vector.
|
750
|
+
# @example propagate the value from the block
|
751
|
+
# df.assign(:range) { propagate { x.max - x.min } }
|
752
|
+
# # =>
|
753
|
+
# #<RedAmber::DataFrame : 6 x 4 Vectors, 0x00000000000e603c>
|
754
|
+
# x y z range
|
755
|
+
# <uint8> <string> <boolean> <uint8>
|
756
|
+
# 0 1 A false 5
|
757
|
+
# 1 2 A true 5
|
758
|
+
# 2 3 B false 5
|
759
|
+
# 3 4 B (nil) 5
|
760
|
+
# 4 5 B true 5
|
761
|
+
# 5 6 C false 5
|
762
|
+
#
|
763
|
+
# @since 0.5.0
|
764
|
+
#
|
765
|
+
def propagate(scalar = nil, &block)
|
766
|
+
if block
|
767
|
+
raise VectorArgumentError, "can't specify both function and block" if scalar
|
768
|
+
|
769
|
+
scalar = instance_eval(&block)
|
770
|
+
end
|
771
|
+
Vector.new([scalar] * size)
|
772
|
+
end
|
773
|
+
|
700
774
|
# Catch variable (column) key as method name.
|
701
775
|
def method_missing(name, *args, &block)
|
702
776
|
return variables[name] if args.empty? && key?(name)
|
@@ -221,6 +221,11 @@ module RedAmber
|
|
221
221
|
# - Same as `#join` with `type: :inner`
|
222
222
|
# - A kind of mutating join.
|
223
223
|
#
|
224
|
+
# @note the order of joined results will be preserved by default.
|
225
|
+
# This is enabled by appending index column to sort after joining but
|
226
|
+
# it will cause some performance degradation. If you don't care about
|
227
|
+
# the order of the result, set `force_order` option to `false`.
|
228
|
+
#
|
224
229
|
# @overload inner_join(other, suffix: '.1', force_order: true)
|
225
230
|
# If `join_key` is not specified, common keys in self and other are used
|
226
231
|
# (natural keys). Returns joined dataframe.
|
@@ -280,6 +285,11 @@ module RedAmber
|
|
280
285
|
# - Same as `#join` with `type: :full_outer`
|
281
286
|
# - A kind of mutating join.
|
282
287
|
#
|
288
|
+
# @note the order of joined results will be preserved by default.
|
289
|
+
# This is enabled by appending index column to sort after joining but
|
290
|
+
# it will cause some performance degradation. If you don't care about
|
291
|
+
# the order of the result, set `force_order` option to `false`.
|
292
|
+
#
|
283
293
|
# @overload full_join(other, suffix: '.1', force_order: true)
|
284
294
|
# If `join_key` is not specified, common keys in self and other are used
|
285
295
|
# (natural keys). Returns joined dataframe.
|
@@ -348,6 +358,11 @@ module RedAmber
|
|
348
358
|
# - Same as `#join` with `type: :left_outer`
|
349
359
|
# - A kind of mutating join.
|
350
360
|
#
|
361
|
+
# @note the order of joined results will be preserved by default.
|
362
|
+
# This is enabled by appending index column to sort after joining but
|
363
|
+
# it will cause some performance degradation. If you don't care about
|
364
|
+
# the order of the result, set `force_order` option to `false`.
|
365
|
+
#
|
351
366
|
# @overload left_join(other, suffix: '.1', force_order: true)
|
352
367
|
# If `join_key` is not specified, common keys in self and other are used
|
353
368
|
# (natural keys). Returns joined dataframe.
|
@@ -410,6 +425,11 @@ module RedAmber
|
|
410
425
|
# - Same as `#join` with `type: :right_outer`
|
411
426
|
# - A kind of mutating join.
|
412
427
|
#
|
428
|
+
# @note the order of joined results will be preserved by default.
|
429
|
+
# This is enabled by appending index column to sort after joining but
|
430
|
+
# it will cause some performance degradation. If you don't care about
|
431
|
+
# the order of the result, set `force_order` option to `false`.
|
432
|
+
#
|
413
433
|
# @overload right_join(other, suffix: '.1', force_order: true)
|
414
434
|
# If `join_key` is not specified, common keys in self and other are used
|
415
435
|
# (natural keys). Returns joined dataframe.
|
@@ -422,11 +442,11 @@ module RedAmber
|
|
422
442
|
# df.right_join(other)
|
423
443
|
#
|
424
444
|
# # =>
|
425
|
-
#
|
426
|
-
# <
|
427
|
-
# 0 A
|
428
|
-
# 1 B
|
429
|
-
# 2
|
445
|
+
# X1 KEY X2
|
446
|
+
# <uint8> <string> <boolean>
|
447
|
+
# 0 1 A true
|
448
|
+
# 1 2 B false
|
449
|
+
# 2 (nil) D (nil)
|
430
450
|
#
|
431
451
|
# @overload right_join(other, join_keys, suffix: '.1', force_order: true)
|
432
452
|
#
|
@@ -439,11 +459,11 @@ module RedAmber
|
|
439
459
|
# df.right_join(other, :KEY)
|
440
460
|
#
|
441
461
|
# # =>
|
442
|
-
#
|
443
|
-
# <
|
444
|
-
# 0 A
|
445
|
-
# 1 B
|
446
|
-
# 2
|
462
|
+
# X1 KEY X2
|
463
|
+
# <uint8> <string> <boolean>
|
464
|
+
# 0 1 A true
|
465
|
+
# 1 2 B false
|
466
|
+
# 2 (nil) D (nil)
|
447
467
|
#
|
448
468
|
# @overload right_join(other, join_key_pairs, suffix: '.1', force_order: true)
|
449
469
|
#
|
@@ -456,11 +476,11 @@ module RedAmber
|
|
456
476
|
# df2.right_join(other2, { left: :KEY1, right: :KEY2 })
|
457
477
|
#
|
458
478
|
# # =>
|
459
|
-
#
|
460
|
-
# <
|
461
|
-
# 0 A
|
462
|
-
# 1 B
|
463
|
-
# 2
|
479
|
+
# X1 KEY2 X2
|
480
|
+
# <uint8> <string> <boolean>
|
481
|
+
# 0 1 A true
|
482
|
+
# 1 2 B false
|
483
|
+
# 2 (nil) D (nil)
|
464
484
|
#
|
465
485
|
# @since 0.2.3
|
466
486
|
#
|
@@ -480,6 +500,11 @@ module RedAmber
|
|
480
500
|
# - Same as `#join` with `type: :left_semi`
|
481
501
|
# - A kind of filtering join.
|
482
502
|
#
|
503
|
+
# @note the order of joined results will be preserved by default.
|
504
|
+
# This is enabled by appending index column to sort after joining but
|
505
|
+
# it will cause some performance degradation. If you don't care about
|
506
|
+
# the order of the result, set `force_order` option to `false`.
|
507
|
+
#
|
483
508
|
# @overload semi_join(other, suffix: '.1', force_order: true)
|
484
509
|
# If `join_key` is not specified, common keys in self and other are used
|
485
510
|
# (natural keys). Returns joined dataframe.
|
@@ -539,6 +564,11 @@ module RedAmber
|
|
539
564
|
# - Same as `#join` with `type: :left_anti`
|
540
565
|
# - A kind of filtering join.
|
541
566
|
#
|
567
|
+
# @note the order of joined results will be preserved by default.
|
568
|
+
# This is enabled by appending index column to sort after joining but
|
569
|
+
# it will cause some performance degradation. If you don't care about
|
570
|
+
# the order of the result, set `force_order` option to `false`.
|
571
|
+
#
|
542
572
|
# @overload anti_join(other, suffix: '.1', force_order: true)
|
543
573
|
# If `join_key` is not specified, common keys in self and other are used
|
544
574
|
# (natural keys). Returns joined dataframe.
|
@@ -661,7 +691,7 @@ module RedAmber
|
|
661
691
|
raise DataFrameArgumentError, 'keys are not same with self and other'
|
662
692
|
end
|
663
693
|
|
664
|
-
join(other, keys, type: :full_outer)
|
694
|
+
join(other, keys, type: :full_outer, force_order: true)
|
665
695
|
end
|
666
696
|
|
667
697
|
# Select records appearing in self but not in other.
|
@@ -733,12 +763,12 @@ module RedAmber
|
|
733
763
|
# 1 B E
|
734
764
|
# 2 C F
|
735
765
|
|
736
|
-
# @note the order of joined results
|
737
|
-
#
|
738
|
-
#
|
739
|
-
#
|
766
|
+
# @note the order of joined results may not be preserved by default.
|
767
|
+
# If you prefer to preserve the order of the result, set `force_order` option
|
768
|
+
# to `true`. This is enabled by appending index column to sort after joining
|
769
|
+
# so it will cause some performance degradation.
|
740
770
|
#
|
741
|
-
# @overload join(other, type: :inner, suffix: '.1', force_order:
|
771
|
+
# @overload join(other, type: :inner, suffix: '.1', force_order: false)
|
742
772
|
#
|
743
773
|
# If `join_key` is not specified, common keys in self and other are used
|
744
774
|
# (natural keys). Returns joined dataframe.
|
@@ -767,7 +797,7 @@ module RedAmber
|
|
767
797
|
# 2 C 3 (nil)
|
768
798
|
# 3 D (nil) (nil)
|
769
799
|
#
|
770
|
-
# @overload join(other, join_keys, type: :inner, suffix: '.1', force_order:
|
800
|
+
# @overload join(other, join_keys, type: :inner, suffix: '.1', force_order: false)
|
771
801
|
#
|
772
802
|
# @macro join_before
|
773
803
|
# @macro join_key_in_array
|
@@ -792,7 +822,8 @@ module RedAmber
|
|
792
822
|
# 0 A 1 1
|
793
823
|
# 1 B 2 4
|
794
824
|
#
|
795
|
-
# @overload join(
|
825
|
+
# @overload join(
|
826
|
+
# other, join_key_pairs, type: :inner, suffix: '.1', force_order: false)
|
796
827
|
#
|
797
828
|
# @macro join_before
|
798
829
|
# @macro join_key_in_hash
|
@@ -828,7 +859,8 @@ module RedAmber
|
|
828
859
|
#
|
829
860
|
# @since 0.2.3
|
830
861
|
#
|
831
|
-
def join(other, join_keys = nil, type: :inner, suffix: '.1', force_order:
|
862
|
+
def join(other, join_keys = nil, type: :inner, suffix: '.1', force_order: false)
|
863
|
+
left_table = table
|
832
864
|
right_table =
|
833
865
|
case other
|
834
866
|
when DataFrame
|
@@ -839,24 +871,26 @@ module RedAmber
|
|
839
871
|
raise DataFrameArgumentError, 'other must be a DataFrame or an Arrow::Table'
|
840
872
|
end
|
841
873
|
|
842
|
-
type = type.to_sym
|
843
|
-
left_index = :__LEFT_INDEX__
|
844
|
-
right_index = :__RIGHT_INDEX__
|
845
874
|
if force_order
|
875
|
+
left_index = :__LEFT_INDEX__
|
876
|
+
right_index = :__RIGHT_INDEX__
|
846
877
|
left_table = assign(left_index) { indices }.table
|
847
878
|
other = DataFrame.create(other) if other.is_a?(Arrow::Table)
|
848
879
|
right_table = other.assign(right_index) { indices }.table
|
849
|
-
else
|
850
|
-
left_table = table
|
851
880
|
end
|
852
881
|
|
853
|
-
|
854
|
-
|
855
|
-
|
882
|
+
left_table_keys = ensure_keys(left_table.keys)
|
883
|
+
right_table_keys = ensure_keys(right_table.keys)
|
856
884
|
# natural keys (implicit common keys)
|
857
|
-
join_keys ||=
|
885
|
+
join_keys ||= left_table_keys.intersection(right_table_keys)
|
886
|
+
|
887
|
+
type = Arrow::JoinType.try_convert(type) || type
|
888
|
+
type_nick = type.nick
|
889
|
+
|
890
|
+
plan = Arrow::ExecutePlan.new
|
891
|
+
left_node = plan.build_source_node(left_table)
|
892
|
+
right_node = plan.build_source_node(right_table)
|
858
893
|
|
859
|
-
# This is not necessary if additional procedure is contributed to Red Arrow.
|
860
894
|
if join_keys.is_a?(Hash)
|
861
895
|
left_keys = ensure_keys(join_keys[:left])
|
862
896
|
right_keys = ensure_keys(join_keys[:right])
|
@@ -865,116 +899,110 @@ module RedAmber
|
|
865
899
|
right_keys = left_keys
|
866
900
|
end
|
867
901
|
|
868
|
-
|
869
|
-
|
870
|
-
|
871
|
-
|
872
|
-
|
873
|
-
|
874
|
-
|
875
|
-
|
876
|
-
|
877
|
-
|
902
|
+
context =
|
903
|
+
[type_nick, left_table_keys, right_table_keys, left_keys, right_keys, suffix]
|
904
|
+
|
905
|
+
hash_join_node_options = Arrow::HashJoinNodeOptions.new(type, left_keys, right_keys)
|
906
|
+
case type_nick
|
907
|
+
when 'inner', 'left-outer'
|
908
|
+
hash_join_node_options.left_outputs = left_table_keys
|
909
|
+
hash_join_node_options.right_outputs = right_table_keys - right_keys
|
910
|
+
when 'right-outer'
|
911
|
+
hash_join_node_options.left_outputs = left_table_keys - left_keys
|
912
|
+
hash_join_node_options.right_outputs = right_table_keys
|
878
913
|
end
|
879
914
|
|
880
|
-
|
881
|
-
|
882
|
-
|
883
|
-
|
884
|
-
|
885
|
-
|
886
|
-
|
887
|
-
|
888
|
-
)
|
889
|
-
|
890
|
-
case type
|
891
|
-
when :inner, :left_outer, :left_semi, :left_anti, :right_semi, :right_anti
|
892
|
-
dataframe =
|
893
|
-
if joined_table.keys.uniq!
|
894
|
-
DataFrame.create(rename_table(joined_table, n_keys, suffix))
|
895
|
-
else
|
896
|
-
DataFrame.create(joined_table)
|
897
|
-
end
|
915
|
+
hash_join_node =
|
916
|
+
plan.build_hash_join_node(left_node, right_node, hash_join_node_options)
|
917
|
+
merge_node = merge_keys(plan, hash_join_node, context)
|
918
|
+
rename_node = rename_keys(plan, merge_node, context)
|
919
|
+
joined_table = sink_and_start_plan(plan, rename_node)
|
920
|
+
|
921
|
+
df = DataFrame.create(joined_table)
|
922
|
+
if force_order
|
898
923
|
sorter =
|
899
|
-
case
|
900
|
-
when
|
901
|
-
[left_index, right_index]
|
902
|
-
when :left_semi, :left_anti
|
903
|
-
[left_index]
|
904
|
-
when :right_semi, :right_anti
|
924
|
+
case type_nick
|
925
|
+
when 'right-semi', 'right-anti'
|
905
926
|
[right_index]
|
906
|
-
|
907
|
-
|
908
|
-
key_index_lr =
|
909
|
-
left_keys.map { left_table.keys.index(_1) }
|
910
|
-
.zip(right_keys.map { left_table.keys.size + right_table.keys.index(_1) })
|
911
|
-
renamed_table = rename_table(joined_table, n_keys, suffix)
|
912
|
-
dropper = []
|
913
|
-
dataframe =
|
914
|
-
DataFrame.create(renamed_table).assign do |df|
|
915
|
-
key_index_lr.map do |l, r|
|
916
|
-
dropper << df.keys[r]
|
917
|
-
[df.keys[l], merge_array(df.vectors[l].data, df.vectors[r].data)]
|
918
|
-
end
|
919
|
-
end
|
920
|
-
dataframe = dataframe.drop(dropper)
|
921
|
-
sorter = [left_index, right_index]
|
922
|
-
when :right_outer
|
923
|
-
dataframe =
|
924
|
-
if joined_table.keys.uniq!
|
925
|
-
DataFrame.create(rename_table(joined_table, left_outputs.size, suffix))
|
927
|
+
when 'left-semi', 'left-anti'
|
928
|
+
[left_index]
|
926
929
|
else
|
927
|
-
|
930
|
+
[left_index, right_index]
|
928
931
|
end
|
929
|
-
|
930
|
-
sorter = [left_index, right_index]
|
931
|
-
end
|
932
|
-
|
933
|
-
if force_order
|
934
|
-
dataframe
|
935
|
-
.sort(sorter)
|
932
|
+
df.sort(sorter)
|
936
933
|
.drop(sorter)
|
937
934
|
else
|
938
|
-
|
935
|
+
df
|
939
936
|
end
|
940
937
|
end
|
941
938
|
|
942
939
|
private
|
943
940
|
|
944
|
-
# To ensure Array of
|
941
|
+
# To ensure Array of Strings
|
945
942
|
def ensure_keys(keys)
|
946
|
-
Array(keys).map(&:
|
943
|
+
Array(keys).map(&:to_s)
|
944
|
+
end
|
945
|
+
|
946
|
+
# Merge the key columns, keeping them under the left key names and removing the right ones.
|
947
|
+
def merge_keys(plan, input_node, context)
|
948
|
+
type_nick, left_table_keys, right_table_keys, left_keys, right_keys, * = context
|
949
|
+
return input_node unless type_nick == 'full-outer'
|
950
|
+
|
951
|
+
left_indices = left_keys.map { left_table_keys.index(_1) }
|
952
|
+
right_offset = left_table_keys.size
|
953
|
+
right_indices = right_keys.map { right_table_keys.index(_1) + right_offset }
|
954
|
+
expressions = []
|
955
|
+
names = []
|
956
|
+
left_table_keys.each_with_index do |key, index|
|
957
|
+
names << key
|
958
|
+
expressions <<
|
959
|
+
if (i = left_indices.index(index))
|
960
|
+
left_field = Arrow::FieldExpression.new("[#{left_indices[i]}]")
|
961
|
+
right_field = Arrow::FieldExpression.new("[#{right_indices[i]}]")
|
962
|
+
is_left_null = Arrow::CallExpression.new('is_null', [left_field])
|
963
|
+
Arrow::CallExpression.new('if_else', [is_left_null, right_field, left_field])
|
964
|
+
else
|
965
|
+
Arrow::FieldExpression.new("[#{index}]")
|
966
|
+
end
|
967
|
+
end
|
968
|
+
right_table_keys.each.with_index(right_offset) do |key, index|
|
969
|
+
unless right_indices.include?(index)
|
970
|
+
names << key
|
971
|
+
expressions << Arrow::FieldExpression.new("[#{index}]")
|
972
|
+
end
|
973
|
+
end
|
974
|
+
project_node_options = Arrow::ProjectNodeOptions.new(expressions, names)
|
975
|
+
plan.build_project_node(input_node, project_node_options)
|
947
976
|
end
|
948
977
|
|
949
|
-
|
950
|
-
|
951
|
-
|
952
|
-
|
978
|
+
def rename_keys(plan, input_node, context)
|
979
|
+
type_nick, left_table_keys, right_table_keys, *, suffix = context
|
980
|
+
names = input_node.output_schema.fields.map(&:name)
|
981
|
+
return input_node unless names.dup.uniq!
|
953
982
|
|
954
|
-
|
983
|
+
pos_rights =
|
984
|
+
if type_nick.start_with?('right')
|
985
|
+
names.size - right_table_keys.size
|
986
|
+
else
|
987
|
+
left_table_keys.size
|
988
|
+
end
|
989
|
+
rights = names[pos_rights..]
|
990
|
+
dup_keys = names.tally.select { |_, v| v > 1 }.keys
|
955
991
|
renamed_right_keys =
|
956
|
-
|
992
|
+
rights.map do |key|
|
957
993
|
if dup_keys.include?(key)
|
958
|
-
suffixed = "#{key}#{suffix}".
|
994
|
+
suffixed = "#{key}#{suffix}".to_s
|
959
995
|
# Find a key from suffixed.succ
|
960
|
-
(suffixed..).find { !
|
996
|
+
(suffixed..).find { !names.include?(_1) }
|
961
997
|
else
|
962
998
|
key
|
963
999
|
end
|
964
1000
|
end
|
965
|
-
|
966
|
-
|
967
|
-
fields =
|
968
|
-
joined_keys.map.with_index do |k, i|
|
969
|
-
Arrow::Field.new(k, joined_table[i].data_type)
|
970
|
-
end
|
971
|
-
Arrow::Table.new(Arrow::Schema.new(fields), joined_table.columns)
|
972
|
-
end
|
1001
|
+
names[pos_rights..] = renamed_right_keys
|
973
1002
|
|
974
|
-
|
975
|
-
|
976
|
-
|
977
|
-
Arrow::Function.find(:if_else).execute([t, array2, array1]).value
|
1003
|
+
expressions = names.map.with_index { |_, i| Arrow::FieldExpression.new("[#{i}]") }
|
1004
|
+
project_node_options = Arrow::ProjectNodeOptions.new(expressions, names)
|
1005
|
+
plan.build_project_node(input_node, project_node_options)
|
978
1006
|
end
|
979
1007
|
end
|
980
1008
|
end
|