red-arrow 6.0.1 → 7.0.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/Gemfile +10 -0
- data/ext/arrow/extconf.rb +1 -1
- data/lib/arrow/datum.rb +2 -0
- data/lib/arrow/function.rb +52 -0
- data/lib/arrow/loader.rb +14 -0
- data/lib/arrow/s3-global-options.rb +38 -0
- data/lib/arrow/sort-key.rb +61 -55
- data/lib/arrow/sort-options.rb +8 -8
- data/lib/arrow/table-loader.rb +99 -62
- data/lib/arrow/table-saver.rb +7 -2
- data/lib/arrow/table.rb +78 -0
- data/lib/arrow/version.rb +1 -1
- data/red-arrow.gemspec +1 -10
- data/test/helper.rb +2 -0
- data/test/test-function.rb +48 -14
- data/test/test-table.rb +186 -3
- metadata +84 -194
data/lib/arrow/table.rb
CHANGED
@@ -448,6 +448,84 @@ module Arrow
|
|
448
448
|
self.class.new(schema, packed_arrays)
|
449
449
|
end
|
450
450
|
|
451
|
+
# @overload join(right, key, type: :inner, left_outputs: nil, right_outputs: nil)
|
452
|
+
# @!macro join_common_before
|
453
|
+
# @param right [Arrow::Table] The right table.
|
454
|
+
#
|
455
|
+
# Join columns with `right` on join key columns.
|
456
|
+
#
|
457
|
+
# @!macro join_common_after
|
458
|
+
# @param type [Arrow::JoinType] How to join.
|
459
|
+
# @param left_outputs [::Array<String, Symbol>] Output columns in
|
460
|
+
# `self`.
|
461
|
+
#
|
462
|
+
# If both of `left_outputs` and `right_outputs` aren't
|
463
|
+
# specified, all columns in `self` and `right` are
|
464
|
+
# outputted.
|
465
|
+
# @param right_outputs [::Array<String, Symbol>] Output columns in
|
466
|
+
# `right`.
|
467
|
+
#
|
468
|
+
# If both of `left_outputs` and `right_outputs` aren't
|
469
|
+
# specified, all columns in `self` and `right` are
|
470
|
+
# outputted.
|
471
|
+
# @return [Arrow::Table]
|
472
|
+
# The joined `Arrow::Table`.
|
473
|
+
#
|
474
|
+
# @macro join_common_before
|
475
|
+
# @param key [String, Symbol] A join key.
|
476
|
+
# @macro join_common_after
|
477
|
+
#
|
478
|
+
# @overload join(right, keys, type: :inner, left_outputs: nil, right_outputs: nil)
|
479
|
+
#
|
480
|
+
# @macro join_common_before
|
481
|
+
# @param keys [::Array<String, Symbol>] Join keys.
|
482
|
+
# @macro join_common_after
|
483
|
+
#
|
484
|
+
# @overload join(right, keys, type: :inner, left_outputs: nil, right_outputs: nil)
|
485
|
+
#
|
486
|
+
# @macro join_common_before
|
487
|
+
# @param keys [Hash] Specify join keys in `self` and `right` separately.
|
488
|
+
# @option keys [String, Symbol, ::Array<String, Symbol>] :left
|
489
|
+
# Join keys in `self`.
|
490
|
+
# @option keys [String, Symbol, ::Array<String, Symbol>] :right
|
491
|
+
# Join keys in `right`.
|
492
|
+
# @macro join_common_after
|
493
|
+
#
|
494
|
+
# @since 7.0.0
|
495
|
+
def join(right, keys, type: :inner, left_outputs: nil, right_outputs: nil)
|
496
|
+
plan = ExecutePlan.new
|
497
|
+
left_node = plan.build_source_node(self)
|
498
|
+
right_node = plan.build_source_node(right)
|
499
|
+
if keys.is_a?(Hash)
|
500
|
+
left_keys = keys[:left]
|
501
|
+
right_keys = keys[:right]
|
502
|
+
else
|
503
|
+
left_keys = keys
|
504
|
+
right_keys = keys
|
505
|
+
end
|
506
|
+
left_keys = Array(left_keys)
|
507
|
+
right_keys = Array(right_keys)
|
508
|
+
hash_join_node_options = HashJoinNodeOptions.new(type,
|
509
|
+
left_keys,
|
510
|
+
right_keys)
|
511
|
+
unless left_outputs.nil?
|
512
|
+
hash_join_node_options.left_outputs = left_outputs
|
513
|
+
end
|
514
|
+
unless right_outputs.nil?
|
515
|
+
hash_join_node_options.right_outputs = right_outputs
|
516
|
+
end
|
517
|
+
hash_join_node = plan.build_hash_join_node(left_node,
|
518
|
+
right_node,
|
519
|
+
hash_join_node_options)
|
520
|
+
sink_node_options = SinkNodeOptions.new
|
521
|
+
plan.build_sink_node(hash_join_node, sink_node_options)
|
522
|
+
plan.validate
|
523
|
+
plan.start
|
524
|
+
plan.wait
|
525
|
+
reader = sink_node_options.get_reader(hash_join_node.output_schema)
|
526
|
+
reader.read_all
|
527
|
+
end
|
528
|
+
|
451
529
|
alias_method :to_s_raw, :to_s
|
452
530
|
def to_s(options={})
|
453
531
|
format = options[:format]
|
data/lib/arrow/version.rb
CHANGED
data/red-arrow.gemspec
CHANGED
@@ -48,19 +48,10 @@ Gem::Specification.new do |spec|
|
|
48
48
|
|
49
49
|
spec.add_runtime_dependency("bigdecimal", ">= 2.0.3")
|
50
50
|
spec.add_runtime_dependency("extpp", ">= 0.0.7")
|
51
|
-
spec.add_runtime_dependency("gio2", ">= 3.
|
51
|
+
spec.add_runtime_dependency("gio2", ">= 3.5.0")
|
52
52
|
spec.add_runtime_dependency("native-package-installer")
|
53
53
|
spec.add_runtime_dependency("pkg-config")
|
54
54
|
|
55
|
-
spec.add_development_dependency("benchmark-driver")
|
56
|
-
spec.add_development_dependency("bundler")
|
57
|
-
spec.add_development_dependency("faker")
|
58
|
-
spec.add_development_dependency("fiddle", ">= 1.0.9")
|
59
|
-
spec.add_development_dependency("rake")
|
60
|
-
spec.add_development_dependency("redcarpet")
|
61
|
-
spec.add_development_dependency("test-unit")
|
62
|
-
spec.add_development_dependency("yard")
|
63
|
-
|
64
55
|
required_msys2_package_version = version_components[0, 3].join(".")
|
65
56
|
spec.metadata["msys2_mingw_dependencies"] =
|
66
57
|
"arrow>=#{required_msys2_package_version}"
|
data/test/helper.rb
CHANGED
data/test/test-function.rb
CHANGED
@@ -53,6 +53,14 @@ class FunctionTest < Test::Unit::TestCase
|
|
53
53
|
or_function.execute(args).value.to_a)
|
54
54
|
end
|
55
55
|
|
56
|
+
test("Arrow::Column") do
|
57
|
+
or_function = Arrow::Function.find("or")
|
58
|
+
table = Arrow::Table.new(a: [true, false, false],
|
59
|
+
b: [true, false, true])
|
60
|
+
assert_equal([true, false, true],
|
61
|
+
or_function.execute([table.a, table.b]).value.to_a)
|
62
|
+
end
|
63
|
+
|
56
64
|
test("Arrow::Scalar") do
|
57
65
|
add_function = Arrow::Function.find("add")
|
58
66
|
args = [
|
@@ -116,12 +124,13 @@ class FunctionTest < Test::Unit::TestCase
|
|
116
124
|
cast_function = Arrow::Function.find("cast")
|
117
125
|
date = Date.new(2021, 6, 12)
|
118
126
|
args = [date]
|
119
|
-
options =
|
120
|
-
|
127
|
+
options = {
|
128
|
+
to_data_type: Arrow::TimestampDataType.new(:second),
|
129
|
+
}
|
121
130
|
time = Time.utc(date.year,
|
122
131
|
date.month,
|
123
132
|
date.day)
|
124
|
-
assert_equal(Arrow::TimestampScalar.new(options
|
133
|
+
assert_equal(Arrow::TimestampScalar.new(options[:to_data_type],
|
125
134
|
time.to_i),
|
126
135
|
cast_function.execute(args, options).value)
|
127
136
|
end
|
@@ -132,9 +141,10 @@ class FunctionTest < Test::Unit::TestCase
|
|
132
141
|
# 00:10:00
|
133
142
|
60 * 10)
|
134
143
|
args = [arrow_time]
|
135
|
-
options =
|
136
|
-
|
137
|
-
|
144
|
+
options = {
|
145
|
+
to_data_type: Arrow::Time64DataType.new(:micro),
|
146
|
+
}
|
147
|
+
assert_equal(Arrow::Time64Scalar.new(options[:to_data_type],
|
138
148
|
# 00:10:00.000000
|
139
149
|
60 * 10 * 1000 * 1000),
|
140
150
|
cast_function.execute(args, options).value)
|
@@ -146,10 +156,11 @@ class FunctionTest < Test::Unit::TestCase
|
|
146
156
|
# 00:10:00.000000
|
147
157
|
60 * 10 * 1000 * 1000)
|
148
158
|
args = [arrow_time]
|
149
|
-
options =
|
150
|
-
|
151
|
-
|
152
|
-
|
159
|
+
options = {
|
160
|
+
to_data_type: Arrow::Time32DataType.new(:second),
|
161
|
+
allow_time_truncate: true,
|
162
|
+
}
|
163
|
+
assert_equal(Arrow::Time32Scalar.new(options[:to_data_type],
|
153
164
|
# 00:10:00
|
154
165
|
60 * 10),
|
155
166
|
cast_function.execute(args, options).value)
|
@@ -159,18 +170,41 @@ class FunctionTest < Test::Unit::TestCase
|
|
159
170
|
cast_function = Arrow::Function.find("cast")
|
160
171
|
time = Time.utc(2021, 6, 12, 1, 2, 3, 1)
|
161
172
|
args = [time]
|
162
|
-
options =
|
163
|
-
|
164
|
-
|
173
|
+
options = {
|
174
|
+
to_data_type: Arrow::TimestampDataType.new(:second),
|
175
|
+
allow_time_truncate: true,
|
176
|
+
}
|
165
177
|
time = Time.utc(time.year,
|
166
178
|
time.month,
|
167
179
|
time.day,
|
168
180
|
time.hour,
|
169
181
|
time.min,
|
170
182
|
time.sec)
|
171
|
-
assert_equal(Arrow::TimestampScalar.new(options
|
183
|
+
assert_equal(Arrow::TimestampScalar.new(options[:to_data_type],
|
172
184
|
time.to_i),
|
173
185
|
cast_function.execute(args, options).value)
|
174
186
|
end
|
187
|
+
|
188
|
+
test("SetLookupOptions") do
|
189
|
+
is_in_function = Arrow::Function.find("is_in")
|
190
|
+
args = [
|
191
|
+
Arrow::Int16Array.new([1, 0, 1, 2]),
|
192
|
+
]
|
193
|
+
options = {
|
194
|
+
value_set: Arrow::Int16Array.new([2, 0]),
|
195
|
+
}
|
196
|
+
assert_equal(Arrow::BooleanArray.new([false, true, false, true]),
|
197
|
+
is_in_function.execute(args, options).value)
|
198
|
+
end
|
199
|
+
end
|
200
|
+
|
201
|
+
def test_call
|
202
|
+
or_function = Arrow::Function.find("or")
|
203
|
+
args = [
|
204
|
+
Arrow::BooleanArray.new([true, false, false]),
|
205
|
+
Arrow::BooleanArray.new([true, false, true]),
|
206
|
+
]
|
207
|
+
assert_equal([true, false, true],
|
208
|
+
or_function.call(args).value.to_a)
|
175
209
|
end
|
176
210
|
end
|
data/test/test-table.rb
CHANGED
@@ -573,14 +573,20 @@ class TableTest < Test::Unit::TestCase
|
|
573
573
|
assert_equal(@table, Arrow::Table.load(output, format: :batch))
|
574
574
|
end
|
575
575
|
|
576
|
+
def test_arrows
|
577
|
+
output = create_output(".arrows")
|
578
|
+
@table.save(output, format: :arrows)
|
579
|
+
assert_equal(@table, Arrow::Table.load(output, format: :arrows))
|
580
|
+
end
|
581
|
+
|
576
582
|
def test_arrow_streaming
|
577
|
-
output = create_output(".
|
583
|
+
output = create_output(".arrows")
|
578
584
|
@table.save(output, format: :arrow_streaming)
|
579
585
|
assert_equal(@table, Arrow::Table.load(output, format: :arrow_streaming))
|
580
586
|
end
|
581
587
|
|
582
588
|
def test_stream
|
583
|
-
output = create_output(".
|
589
|
+
output = create_output(".arrows")
|
584
590
|
@table.save(output, format: :stream)
|
585
591
|
assert_equal(@table, Arrow::Table.load(output, format: :stream))
|
586
592
|
end
|
@@ -626,6 +632,24 @@ class TableTest < Test::Unit::TestCase
|
|
626
632
|
end
|
627
633
|
|
628
634
|
sub_test_case("save: auto detect") do
|
635
|
+
test("arrow") do
|
636
|
+
output = create_output(".arrow")
|
637
|
+
@table.save(output)
|
638
|
+
assert_equal(@table,
|
639
|
+
Arrow::Table.load(output,
|
640
|
+
format: :arrow,
|
641
|
+
schema: @table.schema))
|
642
|
+
end
|
643
|
+
|
644
|
+
test("arrows") do
|
645
|
+
output = create_output(".arrows")
|
646
|
+
@table.save(output)
|
647
|
+
assert_equal(@table,
|
648
|
+
Arrow::Table.load(output,
|
649
|
+
format: :arrows,
|
650
|
+
schema: @table.schema))
|
651
|
+
end
|
652
|
+
|
629
653
|
test("csv") do
|
630
654
|
output = create_output(".csv")
|
631
655
|
@table.save(output)
|
@@ -664,7 +688,13 @@ class TableTest < Test::Unit::TestCase
|
|
664
688
|
|
665
689
|
test("arrow: streaming") do
|
666
690
|
output = create_output(".arrow")
|
667
|
-
@table.save(output, format: :
|
691
|
+
@table.save(output, format: :arrows)
|
692
|
+
assert_equal(@table, Arrow::Table.load(output))
|
693
|
+
end
|
694
|
+
|
695
|
+
test("arrows") do
|
696
|
+
output = create_output(".arrows")
|
697
|
+
@table.save(output, format: :arrows)
|
668
698
|
assert_equal(@table, Arrow::Table.load(output))
|
669
699
|
end
|
670
700
|
|
@@ -728,6 +758,47 @@ chris\t-1
|
|
728
758
|
end
|
729
759
|
end
|
730
760
|
end
|
761
|
+
|
762
|
+
sub_test_case("URI") do
|
763
|
+
def start_web_server(path, data, content_type)
|
764
|
+
http_server = WEBrick::HTTPServer.new(:Port => 0)
|
765
|
+
http_server.mount_proc(path) do |request, response|
|
766
|
+
response.body = data
|
767
|
+
response.content_type = content_type
|
768
|
+
end
|
769
|
+
http_server_thread = Thread.new do
|
770
|
+
http_server.start
|
771
|
+
end
|
772
|
+
begin
|
773
|
+
Timeout.timeout(1) do
|
774
|
+
yield(http_server[:Port])
|
775
|
+
end
|
776
|
+
ensure
|
777
|
+
http_server.shutdown
|
778
|
+
http_server_thread.join
|
779
|
+
end
|
780
|
+
end
|
781
|
+
|
782
|
+
data("Arrow File",
|
783
|
+
["arrow", "application/vnd.apache.arrow.file"])
|
784
|
+
data("Arrow Stream",
|
785
|
+
["arrows", "application/vnd.apache.arrow.stream"])
|
786
|
+
data("CSV",
|
787
|
+
["csv", "text/csv"])
|
788
|
+
def test_http(data)
|
789
|
+
extension, content_type = data
|
790
|
+
output = Arrow::ResizableBuffer.new(1024)
|
791
|
+
@table.save(output, format: extension.to_sym)
|
792
|
+
path = "/data.#{extension}"
|
793
|
+
start_web_server(path,
|
794
|
+
output.data.to_s,
|
795
|
+
content_type) do |port|
|
796
|
+
input = URI("http://127.0.0.1:#{port}#{path}")
|
797
|
+
loaded_table = Arrow::Table.load(input)
|
798
|
+
assert_equal(@table.to_s, loaded_table.to_s)
|
799
|
+
end
|
800
|
+
end
|
801
|
+
end
|
731
802
|
end
|
732
803
|
|
733
804
|
test("#pack") do
|
@@ -922,4 +993,116 @@ visible: false
|
|
922
993
|
TABLE
|
923
994
|
end
|
924
995
|
end
|
996
|
+
|
997
|
+
sub_test_case("#join") do
|
998
|
+
test("keys: String") do
|
999
|
+
table1 = Arrow::Table.new(key: [1, 2, 3],
|
1000
|
+
number: [10, 20, 30])
|
1001
|
+
table2 = Arrow::Table.new(key: [3, 1],
|
1002
|
+
string: ["three", "one"])
|
1003
|
+
assert_equal(Arrow::Table.new([
|
1004
|
+
["key", [1, 3]],
|
1005
|
+
["number", [10, 30]],
|
1006
|
+
["key", [1, 3]],
|
1007
|
+
["string", ["one", "three"]],
|
1008
|
+
]),
|
1009
|
+
table1.join(table2, "key"))
|
1010
|
+
end
|
1011
|
+
|
1012
|
+
test("keys: Symbol") do
|
1013
|
+
table1 = Arrow::Table.new(key: [1, 2, 3],
|
1014
|
+
number: [10, 20, 30])
|
1015
|
+
table2 = Arrow::Table.new(key: [3, 1],
|
1016
|
+
string: ["three", "one"])
|
1017
|
+
assert_equal(Arrow::Table.new([
|
1018
|
+
["key", [1, 3]],
|
1019
|
+
["number", [10, 30]],
|
1020
|
+
["key", [1, 3]],
|
1021
|
+
["string", ["one", "three"]],
|
1022
|
+
]),
|
1023
|
+
table1.join(table2, :key))
|
1024
|
+
end
|
1025
|
+
|
1026
|
+
test("keys: [String, Symbol]") do
|
1027
|
+
table1 = Arrow::Table.new(key1: [1, 1, 2, 2],
|
1028
|
+
key2: [10, 100, 20, 200],
|
1029
|
+
number: [1010, 1100, 2020, 2200])
|
1030
|
+
table2 = Arrow::Table.new(key1: [1, 2, 2],
|
1031
|
+
key2: [100, 20, 50],
|
1032
|
+
string: ["1-100", "2-20", "2-50"])
|
1033
|
+
assert_equal(Arrow::Table.new([
|
1034
|
+
["key1", [1, 2]],
|
1035
|
+
["key2", [100, 20]],
|
1036
|
+
["number", [1100, 2020]],
|
1037
|
+
["key1", [1, 2]],
|
1038
|
+
["key2", [100, 20]],
|
1039
|
+
["string", ["1-100", "2-20"]],
|
1040
|
+
]),
|
1041
|
+
table1.join(table2, ["key1", :key2]))
|
1042
|
+
end
|
1043
|
+
|
1044
|
+
test("keys: {left: String, right: Symbol}") do
|
1045
|
+
table1 = Arrow::Table.new(left_key: [1, 2, 3],
|
1046
|
+
number: [10, 20, 30])
|
1047
|
+
table2 = Arrow::Table.new(right_key: [3, 1],
|
1048
|
+
string: ["three", "one"])
|
1049
|
+
assert_equal(Arrow::Table.new([
|
1050
|
+
["left_key", [1, 3]],
|
1051
|
+
["number", [10, 30]],
|
1052
|
+
["right_key", [1, 3]],
|
1053
|
+
["string", ["one", "three"]],
|
1054
|
+
]),
|
1055
|
+
table1.join(table2, {left: "left_key", right: :right_key}))
|
1056
|
+
end
|
1057
|
+
|
1058
|
+
test("keys: {left: [String, Symbol], right: [Symbol, String]}") do
|
1059
|
+
table1 = Arrow::Table.new(left_key1: [1, 1, 2, 2],
|
1060
|
+
left_key2: [10, 100, 20, 200],
|
1061
|
+
number: [1010, 1100, 2020, 2200])
|
1062
|
+
table2 = Arrow::Table.new(right_key1: [1, 2, 2],
|
1063
|
+
right_key2: [100, 20, 50],
|
1064
|
+
string: ["1-100", "2-20", "2-50"])
|
1065
|
+
assert_equal(Arrow::Table.new([
|
1066
|
+
["left_key1", [1, 2]],
|
1067
|
+
["left_key2", [100, 20]],
|
1068
|
+
["number", [1100, 2020]],
|
1069
|
+
["right_key1", [1, 2]],
|
1070
|
+
["right_key2", [100, 20]],
|
1071
|
+
["string", ["1-100", "2-20"]],
|
1072
|
+
]),
|
1073
|
+
table1.join(table2,
|
1074
|
+
{
|
1075
|
+
left: ["left_key1", :left_key2],
|
1076
|
+
right: [:right_key1, "right_key2"],
|
1077
|
+
}))
|
1078
|
+
end
|
1079
|
+
|
1080
|
+
test("type:") do
|
1081
|
+
table1 = Arrow::Table.new(key: [1, 2, 3],
|
1082
|
+
number: [10, 20, 30])
|
1083
|
+
table2 = Arrow::Table.new(key: [3, 1],
|
1084
|
+
string: ["three", "one"])
|
1085
|
+
assert_equal(Arrow::Table.new([
|
1086
|
+
["key", [1, 3, 2]],
|
1087
|
+
["number", [10, 30, 20]],
|
1088
|
+
["key", [1, 3, nil]],
|
1089
|
+
["string", ["one", "three", nil]],
|
1090
|
+
]),
|
1091
|
+
table1.join(table2, "key", type: :left_outer))
|
1092
|
+
end
|
1093
|
+
|
1094
|
+
test("left_outputs: & right_outputs:") do
|
1095
|
+
table1 = Arrow::Table.new(key: [1, 2, 3],
|
1096
|
+
number: [10, 20, 30])
|
1097
|
+
table2 = Arrow::Table.new(key: [3, 1],
|
1098
|
+
string: ["three", "one"])
|
1099
|
+
assert_equal(Arrow::Table.new(key: [1, 3],
|
1100
|
+
number: [10, 30],
|
1101
|
+
string: ["one", "three"]),
|
1102
|
+
table1.join(table2,
|
1103
|
+
"key",
|
1104
|
+
left_outputs: ["key", "number"],
|
1105
|
+
right_outputs: ["string"]))
|
1106
|
+
end
|
1107
|
+
end
|
925
1108
|
end
|