red-arrow 6.0.1 → 7.0.0
This diff shows the changes between two publicly released versions of this package, as they appear in one of the supported public registries. It is provided for informational purposes only.
- checksums.yaml +4 -4
- data/Gemfile +10 -0
- data/ext/arrow/extconf.rb +1 -1
- data/lib/arrow/datum.rb +2 -0
- data/lib/arrow/function.rb +52 -0
- data/lib/arrow/loader.rb +14 -0
- data/lib/arrow/s3-global-options.rb +38 -0
- data/lib/arrow/sort-key.rb +61 -55
- data/lib/arrow/sort-options.rb +8 -8
- data/lib/arrow/table-loader.rb +99 -62
- data/lib/arrow/table-saver.rb +7 -2
- data/lib/arrow/table.rb +78 -0
- data/lib/arrow/version.rb +1 -1
- data/red-arrow.gemspec +1 -10
- data/test/helper.rb +2 -0
- data/test/test-function.rb +48 -14
- data/test/test-table.rb +186 -3
- metadata +84 -194
data/lib/arrow/table.rb
CHANGED
@@ -448,6 +448,84 @@ module Arrow
|
|
448
448
|
self.class.new(schema, packed_arrays)
|
449
449
|
end
|
450
450
|
|
451
|
+
# @overload join(right, key, type: :inner, left_outputs: nil, right_outputs: nil)
|
452
|
+
# @!macro join_common_before
|
453
|
+
# @param right [Arrow::Table] The right table.
|
454
|
+
#
|
455
|
+
# Join columns with `right` on join key columns.
|
456
|
+
#
|
457
|
+
# @!macro join_common_after
|
458
|
+
# @param type [Arrow::JoinType] How to join.
|
459
|
+
# @param left_outputs [::Array<String, Symbol>] Output columns in
|
460
|
+
# `self`.
|
461
|
+
#
|
462
|
+
# If neither `left_outputs` nor `right_outputs` is
|
463
|
+
# specified, all columns in `self` and `right` are
|
464
|
+
# outputted.
|
465
|
+
# @param right_outputs [::Array<String, Symbol>] Output columns in
|
466
|
+
# `right`.
|
467
|
+
#
|
468
|
+
# If neither `left_outputs` nor `right_outputs` is
|
469
|
+
# specified, all columns in `self` and `right` are
|
470
|
+
# outputted.
|
471
|
+
# @return [Arrow::Table]
|
472
|
+
# The joined `Arrow::Table`.
|
473
|
+
#
|
474
|
+
# @macro join_common_before
|
475
|
+
# @param key [String, Symbol] A join key.
|
476
|
+
# @macro join_common_after
|
477
|
+
#
|
478
|
+
# @overload join(right, keys, type: :inner, left_outputs: nil, right_outputs: nil)
|
479
|
+
#
|
480
|
+
# @macro join_common_before
|
481
|
+
# @param keys [::Array<String, Symbol>] Join keys.
|
482
|
+
# @macro join_common_after
|
483
|
+
#
|
484
|
+
# @overload join(right, keys, type: :inner, left_outputs: nil, right_outputs: nil)
|
485
|
+
#
|
486
|
+
# @macro join_common_before
|
487
|
+
# @param keys [Hash] Specify join keys in `self` and `right` separately.
|
488
|
+
# @option keys [String, Symbol, ::Array<String, Symbol>] :left
|
489
|
+
# Join keys in `self`.
|
490
|
+
# @option keys [String, Symbol, ::Array<String, Symbol>] :right
|
491
|
+
# Join keys in `right`.
|
492
|
+
# @macro join_common_after
|
493
|
+
#
|
494
|
+
# @since 7.0.0
|
495
|
+
def join(right, keys, type: :inner, left_outputs: nil, right_outputs: nil)
|
496
|
+
plan = ExecutePlan.new
|
497
|
+
left_node = plan.build_source_node(self)
|
498
|
+
right_node = plan.build_source_node(right)
|
499
|
+
if keys.is_a?(Hash)
|
500
|
+
left_keys = keys[:left]
|
501
|
+
right_keys = keys[:right]
|
502
|
+
else
|
503
|
+
left_keys = keys
|
504
|
+
right_keys = keys
|
505
|
+
end
|
506
|
+
left_keys = Array(left_keys)
|
507
|
+
right_keys = Array(right_keys)
|
508
|
+
hash_join_node_options = HashJoinNodeOptions.new(type,
|
509
|
+
left_keys,
|
510
|
+
right_keys)
|
511
|
+
unless left_outputs.nil?
|
512
|
+
hash_join_node_options.left_outputs = left_outputs
|
513
|
+
end
|
514
|
+
unless right_outputs.nil?
|
515
|
+
hash_join_node_options.right_outputs = right_outputs
|
516
|
+
end
|
517
|
+
hash_join_node = plan.build_hash_join_node(left_node,
|
518
|
+
right_node,
|
519
|
+
hash_join_node_options)
|
520
|
+
sink_node_options = SinkNodeOptions.new
|
521
|
+
plan.build_sink_node(hash_join_node, sink_node_options)
|
522
|
+
plan.validate
|
523
|
+
plan.start
|
524
|
+
plan.wait
|
525
|
+
reader = sink_node_options.get_reader(hash_join_node.output_schema)
|
526
|
+
reader.read_all
|
527
|
+
end
|
528
|
+
|
451
529
|
alias_method :to_s_raw, :to_s
|
452
530
|
def to_s(options={})
|
453
531
|
format = options[:format]
|
data/lib/arrow/version.rb
CHANGED
data/red-arrow.gemspec
CHANGED
@@ -48,19 +48,10 @@ Gem::Specification.new do |spec|
|
|
48
48
|
|
49
49
|
spec.add_runtime_dependency("bigdecimal", ">= 2.0.3")
|
50
50
|
spec.add_runtime_dependency("extpp", ">= 0.0.7")
|
51
|
-
spec.add_runtime_dependency("gio2", ">= 3.
|
51
|
+
spec.add_runtime_dependency("gio2", ">= 3.5.0")
|
52
52
|
spec.add_runtime_dependency("native-package-installer")
|
53
53
|
spec.add_runtime_dependency("pkg-config")
|
54
54
|
|
55
|
-
spec.add_development_dependency("benchmark-driver")
|
56
|
-
spec.add_development_dependency("bundler")
|
57
|
-
spec.add_development_dependency("faker")
|
58
|
-
spec.add_development_dependency("fiddle", ">= 1.0.9")
|
59
|
-
spec.add_development_dependency("rake")
|
60
|
-
spec.add_development_dependency("redcarpet")
|
61
|
-
spec.add_development_dependency("test-unit")
|
62
|
-
spec.add_development_dependency("yard")
|
63
|
-
|
64
55
|
required_msys2_package_version = version_components[0, 3].join(".")
|
65
56
|
spec.metadata["msys2_mingw_dependencies"] =
|
66
57
|
"arrow>=#{required_msys2_package_version}"
|
data/test/helper.rb
CHANGED
data/test/test-function.rb
CHANGED
@@ -53,6 +53,14 @@ class FunctionTest < Test::Unit::TestCase
|
|
53
53
|
or_function.execute(args).value.to_a)
|
54
54
|
end
|
55
55
|
|
56
|
+
test("Arrow::Column") do
|
57
|
+
or_function = Arrow::Function.find("or")
|
58
|
+
table = Arrow::Table.new(a: [true, false, false],
|
59
|
+
b: [true, false, true])
|
60
|
+
assert_equal([true, false, true],
|
61
|
+
or_function.execute([table.a, table.b]).value.to_a)
|
62
|
+
end
|
63
|
+
|
56
64
|
test("Arrow::Scalar") do
|
57
65
|
add_function = Arrow::Function.find("add")
|
58
66
|
args = [
|
@@ -116,12 +124,13 @@ class FunctionTest < Test::Unit::TestCase
|
|
116
124
|
cast_function = Arrow::Function.find("cast")
|
117
125
|
date = Date.new(2021, 6, 12)
|
118
126
|
args = [date]
|
119
|
-
options =
|
120
|
-
|
127
|
+
options = {
|
128
|
+
to_data_type: Arrow::TimestampDataType.new(:second),
|
129
|
+
}
|
121
130
|
time = Time.utc(date.year,
|
122
131
|
date.month,
|
123
132
|
date.day)
|
124
|
-
assert_equal(Arrow::TimestampScalar.new(options
|
133
|
+
assert_equal(Arrow::TimestampScalar.new(options[:to_data_type],
|
125
134
|
time.to_i),
|
126
135
|
cast_function.execute(args, options).value)
|
127
136
|
end
|
@@ -132,9 +141,10 @@ class FunctionTest < Test::Unit::TestCase
|
|
132
141
|
# 00:10:00
|
133
142
|
60 * 10)
|
134
143
|
args = [arrow_time]
|
135
|
-
options =
|
136
|
-
|
137
|
-
|
144
|
+
options = {
|
145
|
+
to_data_type: Arrow::Time64DataType.new(:micro),
|
146
|
+
}
|
147
|
+
assert_equal(Arrow::Time64Scalar.new(options[:to_data_type],
|
138
148
|
# 00:10:00.000000
|
139
149
|
60 * 10 * 1000 * 1000),
|
140
150
|
cast_function.execute(args, options).value)
|
@@ -146,10 +156,11 @@ class FunctionTest < Test::Unit::TestCase
|
|
146
156
|
# 00:10:00.000000
|
147
157
|
60 * 10 * 1000 * 1000)
|
148
158
|
args = [arrow_time]
|
149
|
-
options =
|
150
|
-
|
151
|
-
|
152
|
-
|
159
|
+
options = {
|
160
|
+
to_data_type: Arrow::Time32DataType.new(:second),
|
161
|
+
allow_time_truncate: true,
|
162
|
+
}
|
163
|
+
assert_equal(Arrow::Time32Scalar.new(options[:to_data_type],
|
153
164
|
# 00:10:00
|
154
165
|
60 * 10),
|
155
166
|
cast_function.execute(args, options).value)
|
@@ -159,18 +170,41 @@ class FunctionTest < Test::Unit::TestCase
|
|
159
170
|
cast_function = Arrow::Function.find("cast")
|
160
171
|
time = Time.utc(2021, 6, 12, 1, 2, 3, 1)
|
161
172
|
args = [time]
|
162
|
-
options =
|
163
|
-
|
164
|
-
|
173
|
+
options = {
|
174
|
+
to_data_type: Arrow::TimestampDataType.new(:second),
|
175
|
+
allow_time_truncate: true,
|
176
|
+
}
|
165
177
|
time = Time.utc(time.year,
|
166
178
|
time.month,
|
167
179
|
time.day,
|
168
180
|
time.hour,
|
169
181
|
time.min,
|
170
182
|
time.sec)
|
171
|
-
assert_equal(Arrow::TimestampScalar.new(options
|
183
|
+
assert_equal(Arrow::TimestampScalar.new(options[:to_data_type],
|
172
184
|
time.to_i),
|
173
185
|
cast_function.execute(args, options).value)
|
174
186
|
end
|
187
|
+
|
188
|
+
test("SetLookupOptions") do
|
189
|
+
is_in_function = Arrow::Function.find("is_in")
|
190
|
+
args = [
|
191
|
+
Arrow::Int16Array.new([1, 0, 1, 2]),
|
192
|
+
]
|
193
|
+
options = {
|
194
|
+
value_set: Arrow::Int16Array.new([2, 0]),
|
195
|
+
}
|
196
|
+
assert_equal(Arrow::BooleanArray.new([false, true, false, true]),
|
197
|
+
is_in_function.execute(args, options).value)
|
198
|
+
end
|
199
|
+
end
|
200
|
+
|
201
|
+
def test_call
|
202
|
+
or_function = Arrow::Function.find("or")
|
203
|
+
args = [
|
204
|
+
Arrow::BooleanArray.new([true, false, false]),
|
205
|
+
Arrow::BooleanArray.new([true, false, true]),
|
206
|
+
]
|
207
|
+
assert_equal([true, false, true],
|
208
|
+
or_function.call(args).value.to_a)
|
175
209
|
end
|
176
210
|
end
|
data/test/test-table.rb
CHANGED
@@ -573,14 +573,20 @@ class TableTest < Test::Unit::TestCase
|
|
573
573
|
assert_equal(@table, Arrow::Table.load(output, format: :batch))
|
574
574
|
end
|
575
575
|
|
576
|
+
def test_arrows
|
577
|
+
output = create_output(".arrows")
|
578
|
+
@table.save(output, format: :arrows)
|
579
|
+
assert_equal(@table, Arrow::Table.load(output, format: :arrows))
|
580
|
+
end
|
581
|
+
|
576
582
|
def test_arrow_streaming
|
577
|
-
output = create_output(".
|
583
|
+
output = create_output(".arrows")
|
578
584
|
@table.save(output, format: :arrow_streaming)
|
579
585
|
assert_equal(@table, Arrow::Table.load(output, format: :arrow_streaming))
|
580
586
|
end
|
581
587
|
|
582
588
|
def test_stream
|
583
|
-
output = create_output(".
|
589
|
+
output = create_output(".arrows")
|
584
590
|
@table.save(output, format: :stream)
|
585
591
|
assert_equal(@table, Arrow::Table.load(output, format: :stream))
|
586
592
|
end
|
@@ -626,6 +632,24 @@ class TableTest < Test::Unit::TestCase
|
|
626
632
|
end
|
627
633
|
|
628
634
|
sub_test_case("save: auto detect") do
|
635
|
+
test("arrow") do
|
636
|
+
output = create_output(".arrow")
|
637
|
+
@table.save(output)
|
638
|
+
assert_equal(@table,
|
639
|
+
Arrow::Table.load(output,
|
640
|
+
format: :arrow,
|
641
|
+
schema: @table.schema))
|
642
|
+
end
|
643
|
+
|
644
|
+
test("arrows") do
|
645
|
+
output = create_output(".arrows")
|
646
|
+
@table.save(output)
|
647
|
+
assert_equal(@table,
|
648
|
+
Arrow::Table.load(output,
|
649
|
+
format: :arrows,
|
650
|
+
schema: @table.schema))
|
651
|
+
end
|
652
|
+
|
629
653
|
test("csv") do
|
630
654
|
output = create_output(".csv")
|
631
655
|
@table.save(output)
|
@@ -664,7 +688,13 @@ class TableTest < Test::Unit::TestCase
|
|
664
688
|
|
665
689
|
test("arrow: streaming") do
|
666
690
|
output = create_output(".arrow")
|
667
|
-
@table.save(output, format: :
|
691
|
+
@table.save(output, format: :arrows)
|
692
|
+
assert_equal(@table, Arrow::Table.load(output))
|
693
|
+
end
|
694
|
+
|
695
|
+
test("arrows") do
|
696
|
+
output = create_output(".arrows")
|
697
|
+
@table.save(output, format: :arrows)
|
668
698
|
assert_equal(@table, Arrow::Table.load(output))
|
669
699
|
end
|
670
700
|
|
@@ -728,6 +758,47 @@ chris\t-1
|
|
728
758
|
end
|
729
759
|
end
|
730
760
|
end
|
761
|
+
|
762
|
+
sub_test_case("URI") do
|
763
|
+
def start_web_server(path, data, content_type)
|
764
|
+
http_server = WEBrick::HTTPServer.new(:Port => 0)
|
765
|
+
http_server.mount_proc(path) do |request, response|
|
766
|
+
response.body = data
|
767
|
+
response.content_type = content_type
|
768
|
+
end
|
769
|
+
http_server_thread = Thread.new do
|
770
|
+
http_server.start
|
771
|
+
end
|
772
|
+
begin
|
773
|
+
Timeout.timeout(1) do
|
774
|
+
yield(http_server[:Port])
|
775
|
+
end
|
776
|
+
ensure
|
777
|
+
http_server.shutdown
|
778
|
+
http_server_thread.join
|
779
|
+
end
|
780
|
+
end
|
781
|
+
|
782
|
+
data("Arrow File",
|
783
|
+
["arrow", "application/vnd.apache.arrow.file"])
|
784
|
+
data("Arrow Stream",
|
785
|
+
["arrows", "application/vnd.apache.arrow.stream"])
|
786
|
+
data("CSV",
|
787
|
+
["csv", "text/csv"])
|
788
|
+
def test_http(data)
|
789
|
+
extension, content_type = data
|
790
|
+
output = Arrow::ResizableBuffer.new(1024)
|
791
|
+
@table.save(output, format: extension.to_sym)
|
792
|
+
path = "/data.#{extension}"
|
793
|
+
start_web_server(path,
|
794
|
+
output.data.to_s,
|
795
|
+
content_type) do |port|
|
796
|
+
input = URI("http://127.0.0.1:#{port}#{path}")
|
797
|
+
loaded_table = Arrow::Table.load(input)
|
798
|
+
assert_equal(@table.to_s, loaded_table.to_s)
|
799
|
+
end
|
800
|
+
end
|
801
|
+
end
|
731
802
|
end
|
732
803
|
|
733
804
|
test("#pack") do
|
@@ -922,4 +993,116 @@ visible: false
|
|
922
993
|
TABLE
|
923
994
|
end
|
924
995
|
end
|
996
|
+
|
997
|
+
sub_test_case("#join") do
|
998
|
+
test("keys: String") do
|
999
|
+
table1 = Arrow::Table.new(key: [1, 2, 3],
|
1000
|
+
number: [10, 20, 30])
|
1001
|
+
table2 = Arrow::Table.new(key: [3, 1],
|
1002
|
+
string: ["three", "one"])
|
1003
|
+
assert_equal(Arrow::Table.new([
|
1004
|
+
["key", [1, 3]],
|
1005
|
+
["number", [10, 30]],
|
1006
|
+
["key", [1, 3]],
|
1007
|
+
["string", ["one", "three"]],
|
1008
|
+
]),
|
1009
|
+
table1.join(table2, "key"))
|
1010
|
+
end
|
1011
|
+
|
1012
|
+
test("keys: Symbol") do
|
1013
|
+
table1 = Arrow::Table.new(key: [1, 2, 3],
|
1014
|
+
number: [10, 20, 30])
|
1015
|
+
table2 = Arrow::Table.new(key: [3, 1],
|
1016
|
+
string: ["three", "one"])
|
1017
|
+
assert_equal(Arrow::Table.new([
|
1018
|
+
["key", [1, 3]],
|
1019
|
+
["number", [10, 30]],
|
1020
|
+
["key", [1, 3]],
|
1021
|
+
["string", ["one", "three"]],
|
1022
|
+
]),
|
1023
|
+
table1.join(table2, :key))
|
1024
|
+
end
|
1025
|
+
|
1026
|
+
test("keys: [String, Symbol]") do
|
1027
|
+
table1 = Arrow::Table.new(key1: [1, 1, 2, 2],
|
1028
|
+
key2: [10, 100, 20, 200],
|
1029
|
+
number: [1010, 1100, 2020, 2200])
|
1030
|
+
table2 = Arrow::Table.new(key1: [1, 2, 2],
|
1031
|
+
key2: [100, 20, 50],
|
1032
|
+
string: ["1-100", "2-20", "2-50"])
|
1033
|
+
assert_equal(Arrow::Table.new([
|
1034
|
+
["key1", [1, 2]],
|
1035
|
+
["key2", [100, 20]],
|
1036
|
+
["number", [1100, 2020]],
|
1037
|
+
["key1", [1, 2]],
|
1038
|
+
["key2", [100, 20]],
|
1039
|
+
["string", ["1-100", "2-20"]],
|
1040
|
+
]),
|
1041
|
+
table1.join(table2, ["key1", :key2]))
|
1042
|
+
end
|
1043
|
+
|
1044
|
+
test("keys: {left: String, right: Symbol}") do
|
1045
|
+
table1 = Arrow::Table.new(left_key: [1, 2, 3],
|
1046
|
+
number: [10, 20, 30])
|
1047
|
+
table2 = Arrow::Table.new(right_key: [3, 1],
|
1048
|
+
string: ["three", "one"])
|
1049
|
+
assert_equal(Arrow::Table.new([
|
1050
|
+
["left_key", [1, 3]],
|
1051
|
+
["number", [10, 30]],
|
1052
|
+
["right_key", [1, 3]],
|
1053
|
+
["string", ["one", "three"]],
|
1054
|
+
]),
|
1055
|
+
table1.join(table2, {left: "left_key", right: :right_key}))
|
1056
|
+
end
|
1057
|
+
|
1058
|
+
test("keys: {left: [String, Symbol], right: [Symbol, String]}") do
|
1059
|
+
table1 = Arrow::Table.new(left_key1: [1, 1, 2, 2],
|
1060
|
+
left_key2: [10, 100, 20, 200],
|
1061
|
+
number: [1010, 1100, 2020, 2200])
|
1062
|
+
table2 = Arrow::Table.new(right_key1: [1, 2, 2],
|
1063
|
+
right_key2: [100, 20, 50],
|
1064
|
+
string: ["1-100", "2-20", "2-50"])
|
1065
|
+
assert_equal(Arrow::Table.new([
|
1066
|
+
["left_key1", [1, 2]],
|
1067
|
+
["left_key2", [100, 20]],
|
1068
|
+
["number", [1100, 2020]],
|
1069
|
+
["right_key1", [1, 2]],
|
1070
|
+
["right_key2", [100, 20]],
|
1071
|
+
["string", ["1-100", "2-20"]],
|
1072
|
+
]),
|
1073
|
+
table1.join(table2,
|
1074
|
+
{
|
1075
|
+
left: ["left_key1", :left_key2],
|
1076
|
+
right: [:right_key1, "right_key2"],
|
1077
|
+
}))
|
1078
|
+
end
|
1079
|
+
|
1080
|
+
test("type:") do
|
1081
|
+
table1 = Arrow::Table.new(key: [1, 2, 3],
|
1082
|
+
number: [10, 20, 30])
|
1083
|
+
table2 = Arrow::Table.new(key: [3, 1],
|
1084
|
+
string: ["three", "one"])
|
1085
|
+
assert_equal(Arrow::Table.new([
|
1086
|
+
["key", [1, 3, 2]],
|
1087
|
+
["number", [10, 30, 20]],
|
1088
|
+
["key", [1, 3, nil]],
|
1089
|
+
["string", ["one", "three", nil]],
|
1090
|
+
]),
|
1091
|
+
table1.join(table2, "key", type: :left_outer))
|
1092
|
+
end
|
1093
|
+
|
1094
|
+
test("left_outputs: & right_outputs:") do
|
1095
|
+
table1 = Arrow::Table.new(key: [1, 2, 3],
|
1096
|
+
number: [10, 20, 30])
|
1097
|
+
table2 = Arrow::Table.new(key: [3, 1],
|
1098
|
+
string: ["three", "one"])
|
1099
|
+
assert_equal(Arrow::Table.new(key: [1, 3],
|
1100
|
+
number: [10, 30],
|
1101
|
+
string: ["one", "three"]),
|
1102
|
+
table1.join(table2,
|
1103
|
+
"key",
|
1104
|
+
left_outputs: ["key", "number"],
|
1105
|
+
right_outputs: ["string"]))
|
1106
|
+
end
|
1107
|
+
end
|
925
1108
|
end
|