red_amber 0.4.2 → 0.5.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.rubocop.yml +8 -2
- data/CHANGELOG.md +57 -0
- data/Gemfile +5 -6
- data/README.ja.md +252 -0
- data/README.md +19 -14
- data/benchmark/group.yml +12 -5
- data/docker/Gemfile +8 -3
- data/docker/Gemfile.lock +54 -16
- data/docker/example +29 -17
- data/lib/red_amber/data_frame.rb +78 -4
- data/lib/red_amber/data_frame_combinable.rb +147 -119
- data/lib/red_amber/data_frame_displayable.rb +6 -5
- data/lib/red_amber/data_frame_selectable.rb +49 -0
- data/lib/red_amber/group.rb +191 -90
- data/lib/red_amber/helper.rb +26 -0
- data/lib/red_amber/subframes.rb +12 -5
- data/lib/red_amber/vector.rb +43 -24
- data/lib/red_amber/vector_aggregation.rb +26 -0
- data/lib/red_amber/vector_selectable.rb +74 -23
- data/lib/red_amber/vector_string_function.rb +211 -0
- data/lib/red_amber/vector_unary_element_wise.rb +4 -0
- data/lib/red_amber/vector_updatable.rb +28 -0
- data/lib/red_amber/version.rb +1 -1
- data/lib/red_amber.rb +2 -1
- data/red_amber.gemspec +1 -1
- metadata +7 -5
data/docker/example
CHANGED
@@ -1,46 +1,46 @@
|
|
1
1
|
#!/usr/bin/env ruby
|
2
2
|
# frozen_string_literal: true
|
3
3
|
|
4
|
-
|
4
|
+
print "starting.\r"
|
5
5
|
|
6
|
-
require 'bundler/setup'
|
6
|
+
Dir.chdir(__dir__) { require 'bundler/setup' }
|
7
7
|
|
8
|
-
|
8
|
+
print "starting..\r"
|
9
9
|
require 'red_amber'
|
10
10
|
include RedAmber
|
11
11
|
|
12
|
-
|
12
|
+
print "starting...\r"
|
13
13
|
require 'datasets-arrow'
|
14
14
|
|
15
|
-
|
15
|
+
print "reading penguins...\r"
|
16
16
|
penguins = DataFrame.new(Datasets::Penguins.new)
|
17
17
|
|
18
|
-
|
18
|
+
print "reading diamonds...\r"
|
19
19
|
diamonds = DataFrame.new(Datasets::Diamonds.new)
|
20
20
|
|
21
|
-
|
21
|
+
print "reading starwars...\r"
|
22
22
|
starwars = DataFrame.new(Datasets::Rdataset.new('dplyr', 'starwars'))
|
23
23
|
|
24
|
-
|
24
|
+
print "reading openintro/simpsons_paradox_covid...\r"
|
25
25
|
ds = Datasets::Rdataset.new('openintro', 'simpsons_paradox_covid')
|
26
26
|
simpsons_paradox_covid = DataFrame.new(ds.to_arrow)
|
27
27
|
|
28
|
-
|
28
|
+
print "reading mtcars... \r"
|
29
29
|
mtcars = DataFrame.new(Datasets::Rdatasets.new('datasets', 'mtcars'))
|
30
30
|
|
31
|
-
|
31
|
+
print "reading iris... \r"
|
32
32
|
iris = DataFrame.new(Datasets::Iris.new)
|
33
33
|
|
34
|
-
|
34
|
+
print "reading band_members...\r"
|
35
35
|
band_members = DataFrame.new(Datasets::Rdatasets.new('dplyr', 'band_members'))
|
36
36
|
|
37
|
-
|
37
|
+
print "reading band_instruments...\r"
|
38
38
|
band_instruments = DataFrame.new(Datasets::Rdatasets.new('dplyr', 'band_instruments'))
|
39
39
|
|
40
|
-
|
40
|
+
print "reading band_instruments2...\r"
|
41
41
|
band_instruments2 = DataFrame.new(Datasets::Rdatasets.new('dplyr', 'band_instruments2'))
|
42
42
|
|
43
|
-
|
43
|
+
print "reading import_cars... \r"
|
44
44
|
import_cars = DataFrame.load(Arrow::Buffer.new(<<~TSV), format: :tsv)
|
45
45
|
Year Audi BMW BMW_MINI Mercedes-Benz VW
|
46
46
|
2017 28336 52527 25427 68221 49040
|
@@ -50,7 +50,7 @@ import_cars = DataFrame.load(Arrow::Buffer.new(<<~TSV), format: :tsv)
|
|
50
50
|
2021 22535 35905 18211 51722 35215
|
51
51
|
TSV
|
52
52
|
|
53
|
-
|
53
|
+
print "reading comecome... \r"
|
54
54
|
comecome = DataFrame.load(Arrow::Buffer.new(<<~CSV), format: :csv)
|
55
55
|
name,age
|
56
56
|
Yasuko,68
|
@@ -58,7 +58,19 @@ comecome = DataFrame.load(Arrow::Buffer.new(<<~CSV), format: :csv)
|
|
58
58
|
Hinata,28
|
59
59
|
CSV
|
60
60
|
|
61
|
-
|
61
|
+
print "reading rubykaigi... \r"
|
62
|
+
rubykaigi = DataFrame.load(Arrow::Buffer.new(<<~CSV), format: :csv)
|
63
|
+
year,venue,prefecture,city,venue_en
|
64
|
+
2015,ベルサール汐留,東京都,中央区,"Bellesalle Shiodome"
|
65
|
+
2016,京都国際会議場,京都府,京都市左京区,"Kyoto International Conference Center"
|
66
|
+
2017,広島国際会議場,広島県,広島市中区,"International Conference Center Hiroshima"
|
67
|
+
2018,仙台国際センター,宮城県,仙台市青葉区,"Sendai International Center"
|
68
|
+
2019,福岡国際会議場,福岡県,福岡市博多区,"Fukuoka International Congress Center"
|
69
|
+
2022,三重県総合文化センター,三重県,津市,"Mie Center for the Arts"
|
70
|
+
2023,松本市民芸術館,長野県,松本市,"Matsumoto Performing Arts Centre"
|
71
|
+
CSV
|
72
|
+
|
73
|
+
print "reading general dataframe and subframes...\r"
|
62
74
|
dataframe = DataFrame.new(
|
63
75
|
x: [*1..6],
|
64
76
|
y: %w[A A B B B C],
|
@@ -70,5 +82,5 @@ subframes = SubFrames.new(dataframe, [[0, 1], [2, 3, 4], [5]])
|
|
70
82
|
# This environment will offer these pre-loaded datasets:
|
71
83
|
# penguins, diamonds, iris, starwars, simpsons_paradox_covid,
|
72
84
|
# mtcars, band_members, band_instruments, band_instruments2
|
73
|
-
#
|
85
|
+
# import_cars, comecome, rubykaigi, dataframe, subframes
|
74
86
|
binding.irb
|
data/lib/red_amber/data_frame.rb
CHANGED
@@ -422,12 +422,12 @@ module RedAmber
|
|
422
422
|
# Create SubFrames by value grouping.
|
423
423
|
#
|
424
424
|
# [Experimental feature] this method may be removed or be changed in the future.
|
425
|
-
# @param keys [Symbol, String
|
425
|
+
# @param keys [List<Symbol, String>, Array<Symbol, String>]
|
426
426
|
# grouping keys.
|
427
427
|
# @return [SubFrames]
|
428
428
|
# a created SubFrames grouped by column values on `keys`.
|
429
429
|
# @example
|
430
|
-
# df.sub_by_value(
|
430
|
+
# df.sub_by_value(:y)
|
431
431
|
#
|
432
432
|
# # =>
|
433
433
|
# #<RedAmber::SubFrames : 0x000000000000fc08>
|
@@ -454,10 +454,11 @@ module RedAmber
|
|
454
454
|
#
|
455
455
|
# @since 0.4.0
|
456
456
|
#
|
457
|
-
def sub_by_value(keys
|
458
|
-
SubFrames.new(self, group(keys).filters)
|
457
|
+
def sub_by_value(*keys)
|
458
|
+
SubFrames.new(self, group(keys.flatten).filters)
|
459
459
|
end
|
460
460
|
alias_method :subframes_by_value, :sub_by_value
|
461
|
+
alias_method :sub_group, :sub_by_value
|
461
462
|
|
462
463
|
# Create SubFrames by Windowing with `from`, `size` and `step`.
|
463
464
|
#
|
@@ -697,6 +698,79 @@ module RedAmber
|
|
697
698
|
end
|
698
699
|
end
|
699
700
|
|
701
|
+
# Returns a Vector such that all elements have value `scalar`
|
702
|
+
# and have same size as self.
|
703
|
+
#
|
704
|
+
# @overload propagate(scalar)
|
705
|
+
# Specifies scalar as an argument.
|
706
|
+
#
|
707
|
+
# @param scalar [scalar]
|
708
|
+
# a value to propagate in Vector.
|
709
|
+
# @return [Vector]
|
710
|
+
# created Vector.
|
711
|
+
# @example propagate a value
|
712
|
+
# df
|
713
|
+
# # =>
|
714
|
+
# #<RedAmber::DataFrame : 6 x 3 Vectors, 0x00000000000849a4>
|
715
|
+
# x y z
|
716
|
+
# <uint8> <string> <boolean>
|
717
|
+
# 0 1 A false
|
718
|
+
# 1 2 A true
|
719
|
+
# 2 3 B false
|
720
|
+
# 3 4 B (nil)
|
721
|
+
# 4 5 B true
|
722
|
+
# 5 6 C false
|
723
|
+
#
|
724
|
+
# df.assign(:sum_x) { propagate(x.sum) }
|
725
|
+
# # =>
|
726
|
+
# #<RedAmber::DataFrame : 6 x 4 Vectors, 0x000000000007bd04>
|
727
|
+
# x y z sum_x
|
728
|
+
# <uint8> <string> <boolean> <uint8>
|
729
|
+
# 0 1 A false 21
|
730
|
+
# 1 2 A true 21
|
731
|
+
# 2 3 B false 21
|
732
|
+
# 3 4 B (nil) 21
|
733
|
+
# 4 5 B true 21
|
734
|
+
# 5 6 C false 21
|
735
|
+
#
|
736
|
+
# # Using `Vector#propagate` like below has same result as above.
|
737
|
+
# df.assign(:sum_x) { x.propagate(:sum) }
|
738
|
+
#
|
739
|
+
# # Also it is same as creating column from an Array.
|
740
|
+
# df.assign(:sum_x) { [x.sum] * size }
|
741
|
+
#
|
742
|
+
# @overload propagate
|
743
|
+
#
|
744
|
+
# @yieldparam self [DataFrame]
|
745
|
+
# gives self to the block.
|
746
|
+
# @yieldreturn [scalar]
|
747
|
+
# a value to propagate in Vector
|
748
|
+
# @return [Vector]
|
749
|
+
# created Vector.
|
750
|
+
# @example propagate the value from the block
|
751
|
+
# df.assign(:range) { propagate { x.max - x.min } }
|
752
|
+
# # =>
|
753
|
+
# #<RedAmber::DataFrame : 6 x 4 Vectors, 0x00000000000e603c>
|
754
|
+
# x y z range
|
755
|
+
# <uint8> <string> <boolean> <uint8>
|
756
|
+
# 0 1 A false 5
|
757
|
+
# 1 2 A true 5
|
758
|
+
# 2 3 B false 5
|
759
|
+
# 3 4 B (nil) 5
|
760
|
+
# 4 5 B true 5
|
761
|
+
# 5 6 C false 5
|
762
|
+
#
|
763
|
+
# @since 0.5.0
|
764
|
+
#
|
765
|
+
def propagate(scalar = nil, &block)
|
766
|
+
if block
|
767
|
+
raise VectorArgumentError, "can't specify both function and block" if scalar
|
768
|
+
|
769
|
+
scalar = instance_eval(&block)
|
770
|
+
end
|
771
|
+
Vector.new([scalar] * size)
|
772
|
+
end
|
773
|
+
|
700
774
|
# Catch variable (column) key as method name.
|
701
775
|
def method_missing(name, *args, &block)
|
702
776
|
return variables[name] if args.empty? && key?(name)
|
@@ -221,6 +221,11 @@ module RedAmber
|
|
221
221
|
# - Same as `#join` with `type: :inner`
|
222
222
|
# - A kind of mutating join.
|
223
223
|
#
|
224
|
+
# @note the order of joined results will be preserved by default.
|
225
|
+
# This is enabled by appending index column to sort after joining but
|
226
|
+
# it will cause some performance degradation. If you don't care about
|
227
|
+
# the order of the result, set `force_order` option to `false`.
|
228
|
+
#
|
224
229
|
# @overload inner_join(other, suffix: '.1', force_order: true)
|
225
230
|
# If `join_key` is not specified, common keys in self and other are used
|
226
231
|
# (natural keys). Returns joined dataframe.
|
@@ -280,6 +285,11 @@ module RedAmber
|
|
280
285
|
# - Same as `#join` with `type: :full_outer`
|
281
286
|
# - A kind of mutating join.
|
282
287
|
#
|
288
|
+
# @note the order of joined results will be preserved by default.
|
289
|
+
# This is enabled by appending index column to sort after joining but
|
290
|
+
# it will cause some performance degradation. If you don't care about
|
291
|
+
# the order of the result, set `force_order` option to `false`.
|
292
|
+
#
|
283
293
|
# @overload full_join(other, suffix: '.1', force_order: true)
|
284
294
|
# If `join_key` is not specified, common keys in self and other are used
|
285
295
|
# (natural keys). Returns joined dataframe.
|
@@ -348,6 +358,11 @@ module RedAmber
|
|
348
358
|
# - Same as `#join` with `type: :left_outer`
|
349
359
|
# - A kind of mutating join.
|
350
360
|
#
|
361
|
+
# @note the order of joined results will be preserved by default.
|
362
|
+
# This is enabled by appending index column to sort after joining but
|
363
|
+
# it will cause some performance degradation. If you don't care about
|
364
|
+
# the order of the result, set `force_order` option to `false`.
|
365
|
+
#
|
351
366
|
# @overload left_join(other, suffix: '.1', force_order: true)
|
352
367
|
# If `join_key` is not specified, common keys in self and other are used
|
353
368
|
# (natural keys). Returns joined dataframe.
|
@@ -410,6 +425,11 @@ module RedAmber
|
|
410
425
|
# - Same as `#join` with `type: :right_outer`
|
411
426
|
# - A kind of mutating join.
|
412
427
|
#
|
428
|
+
# @note the order of joined results will be preserved by default.
|
429
|
+
# This is enabled by appending index column to sort after joining but
|
430
|
+
# it will cause some performance degradation. If you don't care about
|
431
|
+
# the order of the result, set `force_order` option to `false`.
|
432
|
+
#
|
413
433
|
# @overload right_join(other, suffix: '.1', force_order: true)
|
414
434
|
# If `join_key` is not specified, common keys in self and other are used
|
415
435
|
# (natural keys). Returns joined dataframe.
|
@@ -422,11 +442,11 @@ module RedAmber
|
|
422
442
|
# df.right_join(other)
|
423
443
|
#
|
424
444
|
# # =>
|
425
|
-
#
|
426
|
-
# <
|
427
|
-
# 0 A
|
428
|
-
# 1 B
|
429
|
-
# 2
|
445
|
+
# X1 KEY X2
|
446
|
+
# <uint8> <string> <boolean>
|
447
|
+
# 0 1 A true
|
448
|
+
# 1 2 B false
|
449
|
+
# 2 (nil) D (nil)
|
430
450
|
#
|
431
451
|
# @overload right_join(other, join_keys, suffix: '.1', force_order: true)
|
432
452
|
#
|
@@ -439,11 +459,11 @@ module RedAmber
|
|
439
459
|
# df.right_join(other, :KEY)
|
440
460
|
#
|
441
461
|
# # =>
|
442
|
-
#
|
443
|
-
# <
|
444
|
-
# 0 A
|
445
|
-
# 1 B
|
446
|
-
# 2
|
462
|
+
# X1 KEY X2
|
463
|
+
# <uint8> <string> <boolean>
|
464
|
+
# 0 1 A true
|
465
|
+
# 1 2 B false
|
466
|
+
# 2 (nil) D (nil)
|
447
467
|
#
|
448
468
|
# @overload right_join(other, join_key_pairs, suffix: '.1', force_order: true)
|
449
469
|
#
|
@@ -456,11 +476,11 @@ module RedAmber
|
|
456
476
|
# df2.right_join(other2, { left: :KEY1, right: :KEY2 })
|
457
477
|
#
|
458
478
|
# # =>
|
459
|
-
#
|
460
|
-
# <
|
461
|
-
# 0 A
|
462
|
-
# 1 B
|
463
|
-
# 2
|
479
|
+
# X1 KEY2 X2
|
480
|
+
# <uint8> <string> <boolean>
|
481
|
+
# 0 1 A true
|
482
|
+
# 1 2 B false
|
483
|
+
# 2 (nil) D (nil)
|
464
484
|
#
|
465
485
|
# @since 0.2.3
|
466
486
|
#
|
@@ -480,6 +500,11 @@ module RedAmber
|
|
480
500
|
# - Same as `#join` with `type: :left_semi`
|
481
501
|
# - A kind of filtering join.
|
482
502
|
#
|
503
|
+
# @note the order of joined results will be preserved by default.
|
504
|
+
# This is enabled by appending index column to sort after joining but
|
505
|
+
# it will cause some performance degradation. If you don't care about
|
506
|
+
# the order of the result, set `force_order` option to `false`.
|
507
|
+
#
|
483
508
|
# @overload semi_join(other, suffix: '.1', force_order: true)
|
484
509
|
# If `join_key` is not specified, common keys in self and other are used
|
485
510
|
# (natural keys). Returns joined dataframe.
|
@@ -539,6 +564,11 @@ module RedAmber
|
|
539
564
|
# - Same as `#join` with `type: :left_anti`
|
540
565
|
# - A kind of filtering join.
|
541
566
|
#
|
567
|
+
# @note the order of joined results will be preserved by default.
|
568
|
+
# This is enabled by appending index column to sort after joining but
|
569
|
+
# it will cause some performance degradation. If you don't care about
|
570
|
+
# the order of the result, set `force_order` option to `false`.
|
571
|
+
#
|
542
572
|
# @overload anti_join(other, suffix: '.1', force_order: true)
|
543
573
|
# If `join_key` is not specified, common keys in self and other are used
|
544
574
|
# (natural keys). Returns joined dataframe.
|
@@ -661,7 +691,7 @@ module RedAmber
|
|
661
691
|
raise DataFrameArgumentError, 'keys are not same with self and other'
|
662
692
|
end
|
663
693
|
|
664
|
-
join(other, keys, type: :full_outer)
|
694
|
+
join(other, keys, type: :full_outer, force_order: true)
|
665
695
|
end
|
666
696
|
|
667
697
|
# Select records appearing in self but not in other.
|
@@ -733,12 +763,12 @@ module RedAmber
|
|
733
763
|
# 1 B E
|
734
764
|
# 2 C F
|
735
765
|
|
736
|
-
# @note the order of joined results
|
737
|
-
#
|
738
|
-
#
|
739
|
-
#
|
766
|
+
# @note the order of joined results may not be preserved by default.
|
767
|
+
# If you prefer to preserve the order of the result, set `force_order` option
|
768
|
+
# to `true`. This is enabled by appending index column to sort after joining
|
769
|
+
# so it will cause some performance degradation.
|
740
770
|
#
|
741
|
-
# @overload join(other, type: :inner, suffix: '.1', force_order:
|
771
|
+
# @overload join(other, type: :inner, suffix: '.1', force_order: false)
|
742
772
|
#
|
743
773
|
# If `join_key` is not specified, common keys in self and other are used
|
744
774
|
# (natural keys). Returns joined dataframe.
|
@@ -767,7 +797,7 @@ module RedAmber
|
|
767
797
|
# 2 C 3 (nil)
|
768
798
|
# 3 D (nil) (nil)
|
769
799
|
#
|
770
|
-
# @overload join(other, join_keys, type: :inner, suffix: '.1', force_order:
|
800
|
+
# @overload join(other, join_keys, type: :inner, suffix: '.1', force_order: false)
|
771
801
|
#
|
772
802
|
# @macro join_before
|
773
803
|
# @macro join_key_in_array
|
@@ -792,7 +822,8 @@ module RedAmber
|
|
792
822
|
# 0 A 1 1
|
793
823
|
# 1 B 2 4
|
794
824
|
#
|
795
|
-
# @overload join(
|
825
|
+
# @overload join(
|
826
|
+
# other, join_key_pairs, type: :inner, suffix: '.1', force_order: false)
|
796
827
|
#
|
797
828
|
# @macro join_before
|
798
829
|
# @macro join_key_in_hash
|
@@ -828,7 +859,8 @@ module RedAmber
|
|
828
859
|
#
|
829
860
|
# @since 0.2.3
|
830
861
|
#
|
831
|
-
def join(other, join_keys = nil, type: :inner, suffix: '.1', force_order:
|
862
|
+
def join(other, join_keys = nil, type: :inner, suffix: '.1', force_order: false)
|
863
|
+
left_table = table
|
832
864
|
right_table =
|
833
865
|
case other
|
834
866
|
when DataFrame
|
@@ -839,24 +871,26 @@ module RedAmber
|
|
839
871
|
raise DataFrameArgumentError, 'other must be a DataFrame or an Arrow::Table'
|
840
872
|
end
|
841
873
|
|
842
|
-
type = type.to_sym
|
843
|
-
left_index = :__LEFT_INDEX__
|
844
|
-
right_index = :__RIGHT_INDEX__
|
845
874
|
if force_order
|
875
|
+
left_index = :__LEFT_INDEX__
|
876
|
+
right_index = :__RIGHT_INDEX__
|
846
877
|
left_table = assign(left_index) { indices }.table
|
847
878
|
other = DataFrame.create(other) if other.is_a?(Arrow::Table)
|
848
879
|
right_table = other.assign(right_index) { indices }.table
|
849
|
-
else
|
850
|
-
left_table = table
|
851
880
|
end
|
852
881
|
|
853
|
-
|
854
|
-
|
855
|
-
|
882
|
+
left_table_keys = ensure_keys(left_table.keys)
|
883
|
+
right_table_keys = ensure_keys(right_table.keys)
|
856
884
|
# natural keys (implicit common keys)
|
857
|
-
join_keys ||=
|
885
|
+
join_keys ||= left_table_keys.intersection(right_table_keys)
|
886
|
+
|
887
|
+
type = Arrow::JoinType.try_convert(type) || type
|
888
|
+
type_nick = type.nick
|
889
|
+
|
890
|
+
plan = Arrow::ExecutePlan.new
|
891
|
+
left_node = plan.build_source_node(left_table)
|
892
|
+
right_node = plan.build_source_node(right_table)
|
858
893
|
|
859
|
-
# This is not necessary if additional procedure is contributed to Red Arrow.
|
860
894
|
if join_keys.is_a?(Hash)
|
861
895
|
left_keys = ensure_keys(join_keys[:left])
|
862
896
|
right_keys = ensure_keys(join_keys[:right])
|
@@ -865,116 +899,110 @@ module RedAmber
|
|
865
899
|
right_keys = left_keys
|
866
900
|
end
|
867
901
|
|
868
|
-
|
869
|
-
|
870
|
-
|
871
|
-
|
872
|
-
|
873
|
-
|
874
|
-
|
875
|
-
|
876
|
-
|
877
|
-
|
902
|
+
context =
|
903
|
+
[type_nick, left_table_keys, right_table_keys, left_keys, right_keys, suffix]
|
904
|
+
|
905
|
+
hash_join_node_options = Arrow::HashJoinNodeOptions.new(type, left_keys, right_keys)
|
906
|
+
case type_nick
|
907
|
+
when 'inner', 'left-outer'
|
908
|
+
hash_join_node_options.left_outputs = left_table_keys
|
909
|
+
hash_join_node_options.right_outputs = right_table_keys - right_keys
|
910
|
+
when 'right-outer'
|
911
|
+
hash_join_node_options.left_outputs = left_table_keys - left_keys
|
912
|
+
hash_join_node_options.right_outputs = right_table_keys
|
878
913
|
end
|
879
914
|
|
880
|
-
|
881
|
-
|
882
|
-
|
883
|
-
|
884
|
-
|
885
|
-
|
886
|
-
|
887
|
-
|
888
|
-
)
|
889
|
-
|
890
|
-
case type
|
891
|
-
when :inner, :left_outer, :left_semi, :left_anti, :right_semi, :right_anti
|
892
|
-
dataframe =
|
893
|
-
if joined_table.keys.uniq!
|
894
|
-
DataFrame.create(rename_table(joined_table, n_keys, suffix))
|
895
|
-
else
|
896
|
-
DataFrame.create(joined_table)
|
897
|
-
end
|
915
|
+
hash_join_node =
|
916
|
+
plan.build_hash_join_node(left_node, right_node, hash_join_node_options)
|
917
|
+
merge_node = merge_keys(plan, hash_join_node, context)
|
918
|
+
rename_node = rename_keys(plan, merge_node, context)
|
919
|
+
joined_table = sink_and_start_plan(plan, rename_node)
|
920
|
+
|
921
|
+
df = DataFrame.create(joined_table)
|
922
|
+
if force_order
|
898
923
|
sorter =
|
899
|
-
case
|
900
|
-
when
|
901
|
-
[left_index, right_index]
|
902
|
-
when :left_semi, :left_anti
|
903
|
-
[left_index]
|
904
|
-
when :right_semi, :right_anti
|
924
|
+
case type_nick
|
925
|
+
when 'right-semi', 'right-anti'
|
905
926
|
[right_index]
|
906
|
-
|
907
|
-
|
908
|
-
key_index_lr =
|
909
|
-
left_keys.map { left_table.keys.index(_1) }
|
910
|
-
.zip(right_keys.map { left_table.keys.size + right_table.keys.index(_1) })
|
911
|
-
renamed_table = rename_table(joined_table, n_keys, suffix)
|
912
|
-
dropper = []
|
913
|
-
dataframe =
|
914
|
-
DataFrame.create(renamed_table).assign do |df|
|
915
|
-
key_index_lr.map do |l, r|
|
916
|
-
dropper << df.keys[r]
|
917
|
-
[df.keys[l], merge_array(df.vectors[l].data, df.vectors[r].data)]
|
918
|
-
end
|
919
|
-
end
|
920
|
-
dataframe = dataframe.drop(dropper)
|
921
|
-
sorter = [left_index, right_index]
|
922
|
-
when :right_outer
|
923
|
-
dataframe =
|
924
|
-
if joined_table.keys.uniq!
|
925
|
-
DataFrame.create(rename_table(joined_table, left_outputs.size, suffix))
|
927
|
+
when 'left-semi', 'left-anti'
|
928
|
+
[left_index]
|
926
929
|
else
|
927
|
-
|
930
|
+
[left_index, right_index]
|
928
931
|
end
|
929
|
-
|
930
|
-
sorter = [left_index, right_index]
|
931
|
-
end
|
932
|
-
|
933
|
-
if force_order
|
934
|
-
dataframe
|
935
|
-
.sort(sorter)
|
932
|
+
df.sort(sorter)
|
936
933
|
.drop(sorter)
|
937
934
|
else
|
938
|
-
|
935
|
+
df
|
939
936
|
end
|
940
937
|
end
|
941
938
|
|
942
939
|
private
|
943
940
|
|
944
|
-
# To ensure Array of
|
941
|
+
# To ensure Array of Strings
|
945
942
|
def ensure_keys(keys)
|
946
|
-
Array(keys).map(&:
|
943
|
+
Array(keys).map(&:to_s)
|
944
|
+
end
|
945
|
+
|
946
|
+
# Merge key columns and preserve as left and remove right.
|
947
|
+
def merge_keys(plan, input_node, context)
|
948
|
+
type_nick, left_table_keys, right_table_keys, left_keys, right_keys, * = context
|
949
|
+
return input_node unless type_nick == 'full-outer'
|
950
|
+
|
951
|
+
left_indices = left_keys.map { left_table_keys.index(_1) }
|
952
|
+
right_offset = left_table_keys.size
|
953
|
+
right_indices = right_keys.map { right_table_keys.index(_1) + right_offset }
|
954
|
+
expressions = []
|
955
|
+
names = []
|
956
|
+
left_table_keys.each_with_index do |key, index|
|
957
|
+
names << key
|
958
|
+
expressions <<
|
959
|
+
if (i = left_indices.index(index))
|
960
|
+
left_field = Arrow::FieldExpression.new("[#{left_indices[i]}]")
|
961
|
+
right_field = Arrow::FieldExpression.new("[#{right_indices[i]}]")
|
962
|
+
is_left_null = Arrow::CallExpression.new('is_null', [left_field])
|
963
|
+
Arrow::CallExpression.new('if_else', [is_left_null, right_field, left_field])
|
964
|
+
else
|
965
|
+
Arrow::FieldExpression.new("[#{index}]")
|
966
|
+
end
|
967
|
+
end
|
968
|
+
right_table_keys.each.with_index(right_offset) do |key, index|
|
969
|
+
unless right_indices.include?(index)
|
970
|
+
names << key
|
971
|
+
expressions << Arrow::FieldExpression.new("[#{index}]")
|
972
|
+
end
|
973
|
+
end
|
974
|
+
project_node_options = Arrow::ProjectNodeOptions.new(expressions, names)
|
975
|
+
plan.build_project_node(input_node, project_node_options)
|
947
976
|
end
|
948
977
|
|
949
|
-
|
950
|
-
|
951
|
-
|
952
|
-
|
978
|
+
def rename_keys(plan, input_node, context)
|
979
|
+
type_nick, left_table_keys, right_table_keys, *, suffix = context
|
980
|
+
names = input_node.output_schema.fields.map(&:name)
|
981
|
+
return input_node unless names.dup.uniq!
|
953
982
|
|
954
|
-
|
983
|
+
pos_rights =
|
984
|
+
if type_nick.start_with?('right')
|
985
|
+
names.size - right_table_keys.size
|
986
|
+
else
|
987
|
+
left_table_keys.size
|
988
|
+
end
|
989
|
+
rights = names[pos_rights..]
|
990
|
+
dup_keys = names.tally.select { |_, v| v > 1 }.keys
|
955
991
|
renamed_right_keys =
|
956
|
-
|
992
|
+
rights.map do |key|
|
957
993
|
if dup_keys.include?(key)
|
958
|
-
suffixed = "#{key}#{suffix}".
|
994
|
+
suffixed = "#{key}#{suffix}".to_s
|
959
995
|
# Find a key from suffixed.succ
|
960
|
-
(suffixed..).find { !
|
996
|
+
(suffixed..).find { !names.include?(_1) }
|
961
997
|
else
|
962
998
|
key
|
963
999
|
end
|
964
1000
|
end
|
965
|
-
|
966
|
-
|
967
|
-
fields =
|
968
|
-
joined_keys.map.with_index do |k, i|
|
969
|
-
Arrow::Field.new(k, joined_table[i].data_type)
|
970
|
-
end
|
971
|
-
Arrow::Table.new(Arrow::Schema.new(fields), joined_table.columns)
|
972
|
-
end
|
1001
|
+
names[pos_rights..] = renamed_right_keys
|
973
1002
|
|
974
|
-
|
975
|
-
|
976
|
-
|
977
|
-
Arrow::Function.find(:if_else).execute([t, array2, array1]).value
|
1003
|
+
expressions = names.map.with_index { |_, i| Arrow::FieldExpression.new("[#{i}]") }
|
1004
|
+
project_node_options = Arrow::ProjectNodeOptions.new(expressions, names)
|
1005
|
+
plan.build_project_node(input_node, project_node_options)
|
978
1006
|
end
|
979
1007
|
end
|
980
1008
|
end
|
@@ -269,12 +269,13 @@ module RedAmber
|
|
269
269
|
end
|
270
270
|
alias_method :glimpse, :tdr
|
271
271
|
|
272
|
-
# Shortcut for `tdr(:all)
|
272
|
+
# Shortcut for `tdr(:all)`.
|
273
273
|
#
|
274
|
+
# @param (see #tdr)
|
274
275
|
# @return (see #tdr)
|
275
276
|
#
|
276
|
-
def tdra
|
277
|
-
puts tdr_str(:all)
|
277
|
+
def tdra(tally: 5, elements: 5)
|
278
|
+
puts tdr_str(:all, tally: tally, elements: elements)
|
278
279
|
end
|
279
280
|
|
280
281
|
# rubocop:enable Layout/LineLength
|
@@ -504,9 +505,9 @@ module RedAmber
|
|
504
505
|
row.zip(formats).map do |elem, format|
|
505
506
|
non_ascii_diff = elem.ascii_only? ? 0 : elem.width - elem.size
|
506
507
|
if format.negative?
|
507
|
-
elem.ljust(-format
|
508
|
+
elem.ljust(-format - non_ascii_diff)
|
508
509
|
else
|
509
|
-
elem.rjust(format
|
510
|
+
elem.rjust(format - non_ascii_diff)
|
510
511
|
end
|
511
512
|
end
|
512
513
|
str.puts a.join(' ').rstrip
|