red-arrow 6.0.1 → 7.0.0

Sign up to get free protection for your applications and to get access to all the features.
data/lib/arrow/table.rb CHANGED
@@ -448,6 +448,84 @@ module Arrow
448
448
  self.class.new(schema, packed_arrays)
449
449
  end
450
450
 
451
+ # @overload join(right, key, type: :inner, left_outputs: nil, right_outputs: nil)
452
+ # @!macro join_common_before
453
+ # @param right [Arrow::Table] The right table.
454
+ #
455
+ # Join columns with `right` on join key columns.
456
+ #
457
+ # @!macro join_common_after
458
+ # @param type [Arrow::JoinType] How to join.
459
+ # @param left_outputs [::Array<String, Symbol>] Output columns in
460
+ # `self`.
461
+ #
462
+ # If neither `left_outputs` nor `right_outputs` is
463
+ # specified, all columns in `self` and `right` are
464
+ # output.
465
+ # @param right_outputs [::Array<String, Symbol>] Output columns in
466
+ # `right`.
467
+ #
468
+ # If neither `left_outputs` nor `right_outputs` is
469
+ # specified, all columns in `self` and `right` are
470
+ # output.
471
+ # @return [Arrow::Table]
472
+ # The joined `Arrow::Table`.
473
+ #
474
+ # @macro join_common_before
475
+ # @param key [String, Symbol] A join key.
476
+ # @macro join_common_after
477
+ #
478
+ # @overload join(right, keys, type: :inner, left_outputs: nil, right_outputs: nil)
479
+ #
480
+ # @macro join_common_before
481
+ # @param keys [::Array<String, Symbol>] Join keys.
482
+ # @macro join_common_after
483
+ #
484
+ # @overload join(right, keys, type: :inner, left_outputs: nil, right_outputs: nil)
485
+ #
486
+ # @macro join_common_before
487
+ # @param keys [Hash] Specify join keys in `self` and `right` separately.
488
+ # @option keys [String, Symbol, ::Array<String, Symbol>] :left
489
+ # Join keys in `self`.
490
+ # @option keys [String, Symbol, ::Array<String, Symbol>] :right
491
+ # Join keys in `right`.
492
+ # @macro join_common_after
493
+ #
494
+ # @since 7.0.0
495
+ def join(right, keys, type: :inner, left_outputs: nil, right_outputs: nil)
496
+ plan = ExecutePlan.new # Plan shape: two source nodes -> hash join node -> sink node.
497
+ left_node = plan.build_source_node(self)
498
+ right_node = plan.build_source_node(right)
499
+ if keys.is_a?(Hash) # A Hash supplies each side's keys separately via :left and :right.
500
+ left_keys = keys[:left]
501
+ right_keys = keys[:right]
502
+ else # Anything else is used as the key(s) for both tables.
503
+ left_keys = keys
504
+ right_keys = keys
505
+ end
506
+ left_keys = Array(left_keys) # Accept either a single key or a list of keys.
507
+ right_keys = Array(right_keys)
508
+ hash_join_node_options = HashJoinNodeOptions.new(type,
509
+ left_keys,
510
+ right_keys)
511
+ unless left_outputs.nil? # Only assigned when the caller gave a value; nil leaves the options untouched.
512
+ hash_join_node_options.left_outputs = left_outputs
513
+ end
514
+ unless right_outputs.nil? # Same rule as left_outputs.
515
+ hash_join_node_options.right_outputs = right_outputs
516
+ end
517
+ hash_join_node = plan.build_hash_join_node(left_node,
518
+ right_node,
519
+ hash_join_node_options)
520
+ sink_node_options = SinkNodeOptions.new
521
+ plan.build_sink_node(hash_join_node, sink_node_options)
522
+ plan.validate
523
+ plan.start
524
+ plan.wait # NOTE(review): presumably blocks until the plan has finished - confirm against ExecutePlan.
525
+ reader = sink_node_options.get_reader(hash_join_node.output_schema) # The reader is built with the join node's output schema.
526
+ reader.read_all # Last expression: everything read from the sink is the method's return value.
527
+ end
528
+
451
529
  alias_method :to_s_raw, :to_s
452
530
  def to_s(options={})
453
531
  format = options[:format]
data/lib/arrow/version.rb CHANGED
@@ -16,7 +16,7 @@
16
16
  # under the License.
17
17
 
18
18
  module Arrow
19
- VERSION = "6.0.1"
19
+ VERSION = "7.0.0"
20
20
 
21
21
  module Version
22
22
  numbers, TAG = VERSION.split("-")
data/red-arrow.gemspec CHANGED
@@ -48,19 +48,10 @@ Gem::Specification.new do |spec|
48
48
 
49
49
  spec.add_runtime_dependency("bigdecimal", ">= 2.0.3")
50
50
  spec.add_runtime_dependency("extpp", ">= 0.0.7")
51
- spec.add_runtime_dependency("gio2", ">= 3.4.9")
51
+ spec.add_runtime_dependency("gio2", ">= 3.5.0")
52
52
  spec.add_runtime_dependency("native-package-installer")
53
53
  spec.add_runtime_dependency("pkg-config")
54
54
 
55
- spec.add_development_dependency("benchmark-driver")
56
- spec.add_development_dependency("bundler")
57
- spec.add_development_dependency("faker")
58
- spec.add_development_dependency("fiddle", ">= 1.0.9")
59
- spec.add_development_dependency("rake")
60
- spec.add_development_dependency("redcarpet")
61
- spec.add_development_dependency("test-unit")
62
- spec.add_development_dependency("yard")
63
-
64
55
  required_msys2_package_version = version_components[0, 3].join(".")
65
56
  spec.metadata["msys2_mingw_dependencies"] =
66
57
  "arrow>=#{required_msys2_package_version}"
data/test/helper.rb CHANGED
@@ -20,6 +20,8 @@ require "arrow"
20
20
  require "fiddle"
21
21
  require "pathname"
22
22
  require "tempfile"
23
+ require "timeout"
24
+ require "webrick"
23
25
  require "zlib"
24
26
 
25
27
  require "test-unit"
@@ -53,6 +53,14 @@ class FunctionTest < Test::Unit::TestCase
53
53
  or_function.execute(args).value.to_a)
54
54
  end
55
55
 
56
+ test("Arrow::Column") do # Table columns (table.a, table.b) are passed directly as function arguments.
57
+ or_function = Arrow::Function.find("or")
58
+ table = Arrow::Table.new(a: [true, false, false],
59
+ b: [true, false, true])
60
+ assert_equal([true, false, true],
61
+ or_function.execute([table.a, table.b]).value.to_a)
62
+ end
63
+
56
64
  test("Arrow::Scalar") do
57
65
  add_function = Arrow::Function.find("add")
58
66
  args = [
@@ -116,12 +124,13 @@ class FunctionTest < Test::Unit::TestCase
116
124
  cast_function = Arrow::Function.find("cast")
117
125
  date = Date.new(2021, 6, 12)
118
126
  args = [date]
119
- options = Arrow::CastOptions.new
120
- options.to_data_type = Arrow::TimestampDataType.new(:second)
127
+ options = {
128
+ to_data_type: Arrow::TimestampDataType.new(:second),
129
+ }
121
130
  time = Time.utc(date.year,
122
131
  date.month,
123
132
  date.day)
124
- assert_equal(Arrow::TimestampScalar.new(options.to_data_type,
133
+ assert_equal(Arrow::TimestampScalar.new(options[:to_data_type],
125
134
  time.to_i),
126
135
  cast_function.execute(args, options).value)
127
136
  end
@@ -132,9 +141,10 @@ class FunctionTest < Test::Unit::TestCase
132
141
  # 00:10:00
133
142
  60 * 10)
134
143
  args = [arrow_time]
135
- options = Arrow::CastOptions.new
136
- options.to_data_type = Arrow::Time64DataType.new(:micro)
137
- assert_equal(Arrow::Time64Scalar.new(options.to_data_type,
144
+ options = {
145
+ to_data_type: Arrow::Time64DataType.new(:micro),
146
+ }
147
+ assert_equal(Arrow::Time64Scalar.new(options[:to_data_type],
138
148
  # 00:10:00.000000
139
149
  60 * 10 * 1000 * 1000),
140
150
  cast_function.execute(args, options).value)
@@ -146,10 +156,11 @@ class FunctionTest < Test::Unit::TestCase
146
156
  # 00:10:00.000000
147
157
  60 * 10 * 1000 * 1000)
148
158
  args = [arrow_time]
149
- options = Arrow::CastOptions.new
150
- options.to_data_type = Arrow::Time32DataType.new(:second)
151
- options.allow_time_truncate = true
152
- assert_equal(Arrow::Time32Scalar.new(options.to_data_type,
159
+ options = {
160
+ to_data_type: Arrow::Time32DataType.new(:second),
161
+ allow_time_truncate: true,
162
+ }
163
+ assert_equal(Arrow::Time32Scalar.new(options[:to_data_type],
153
164
  # 00:10:00
154
165
  60 * 10),
155
166
  cast_function.execute(args, options).value)
@@ -159,18 +170,41 @@ class FunctionTest < Test::Unit::TestCase
159
170
  cast_function = Arrow::Function.find("cast")
160
171
  time = Time.utc(2021, 6, 12, 1, 2, 3, 1)
161
172
  args = [time]
162
- options = Arrow::CastOptions.new
163
- options.to_data_type = Arrow::TimestampDataType.new(:second)
164
- options.allow_time_truncate = true
173
+ options = {
174
+ to_data_type: Arrow::TimestampDataType.new(:second),
175
+ allow_time_truncate: true,
176
+ }
165
177
  time = Time.utc(time.year,
166
178
  time.month,
167
179
  time.day,
168
180
  time.hour,
169
181
  time.min,
170
182
  time.sec)
171
- assert_equal(Arrow::TimestampScalar.new(options.to_data_type,
183
+ assert_equal(Arrow::TimestampScalar.new(options[:to_data_type],
172
184
  time.to_i),
173
185
  cast_function.execute(args, options).value)
174
186
  end
187
+
188
+ test("SetLookupOptions") do # Checks "is_in" with its options passed as a plain Hash.
189
+ is_in_function = Arrow::Function.find("is_in")
190
+ args = [
191
+ Arrow::Int16Array.new([1, 0, 1, 2]),
192
+ ]
193
+ options = {
194
+ value_set: Arrow::Int16Array.new([2, 0]), # Values to look for: only 0 and 2 yield true below.
195
+ }
196
+ assert_equal(Arrow::BooleanArray.new([false, true, false, true]),
197
+ is_in_function.execute(args, options).value)
198
+ end
199
+ end
200
+
201
+ def test_call # Invokes the "or" function through #call and checks the element-wise result.
202
+ or_function = Arrow::Function.find("or")
203
+ args = [
204
+ Arrow::BooleanArray.new([true, false, false]),
205
+ Arrow::BooleanArray.new([true, false, true]),
206
+ ]
207
+ assert_equal([true, false, true],
208
+ or_function.call(args).value.to_a)
175
209
  end
176
210
  end
data/test/test-table.rb CHANGED
@@ -573,14 +573,20 @@ class TableTest < Test::Unit::TestCase
573
573
  assert_equal(@table, Arrow::Table.load(output, format: :batch))
574
574
  end
575
575
 
576
+ def test_arrows # Round-trips @table through save and load with an explicit format: :arrows.
577
+ output = create_output(".arrows")
578
+ @table.save(output, format: :arrows)
579
+ assert_equal(@table, Arrow::Table.load(output, format: :arrows))
580
+ end
581
+
576
582
  def test_arrow_streaming
577
- output = create_output(".arrow")
583
+ output = create_output(".arrows")
578
584
  @table.save(output, format: :arrow_streaming)
579
585
  assert_equal(@table, Arrow::Table.load(output, format: :arrow_streaming))
580
586
  end
581
587
 
582
588
  def test_stream
583
- output = create_output(".arrow")
589
+ output = create_output(".arrows")
584
590
  @table.save(output, format: :stream)
585
591
  assert_equal(@table, Arrow::Table.load(output, format: :stream))
586
592
  end
@@ -626,6 +632,24 @@ class TableTest < Test::Unit::TestCase
626
632
  end
627
633
 
628
634
  sub_test_case("save: auto detect") do
635
+ test("arrow") do
636
+ output = create_output(".arrow")
637
+ @table.save(output) # No format: is given; this sub-test case is "save: auto detect".
638
+ assert_equal(@table,
639
+ Arrow::Table.load(output,
640
+ format: :arrow, # Load names the format explicitly, so only save is exercised for detection.
641
+ schema: @table.schema))
642
+ end
643
+
644
+ test("arrows") do
645
+ output = create_output(".arrows")
646
+ @table.save(output) # No format: is given; this sub-test case is "save: auto detect".
647
+ assert_equal(@table,
648
+ Arrow::Table.load(output,
649
+ format: :arrows, # Load names the format explicitly, so only save is exercised for detection.
650
+ schema: @table.schema))
651
+ end
652
+
629
653
  test("csv") do
630
654
  output = create_output(".csv")
631
655
  @table.save(output)
@@ -664,7 +688,13 @@ class TableTest < Test::Unit::TestCase
664
688
 
665
689
  test("arrow: streaming") do
666
690
  output = create_output(".arrow")
667
- @table.save(output, format: :arrow_streaming)
691
+ @table.save(output, format: :arrows)
692
+ assert_equal(@table, Arrow::Table.load(output))
693
+ end
694
+
695
+ test("arrows") do # Saved with an explicit format, then loaded without one.
696
+ output = create_output(".arrows")
697
+ @table.save(output, format: :arrows)
668
698
  assert_equal(@table, Arrow::Table.load(output))
669
699
  end
670
700
 
@@ -728,6 +758,47 @@ chris\t-1
728
758
  end
729
759
  end
730
760
  end
761
+
762
+ sub_test_case("URI") do
763
+ def start_web_server(path, data, content_type) # Serves `data` at `path` over HTTP while the given block runs.
764
+ http_server = WEBrick::HTTPServer.new(:Port => 0) # The actual port is read back via http_server[:Port] below.
765
+ http_server.mount_proc(path) do |request, response|
766
+ response.body = data
767
+ response.content_type = content_type
768
+ end
769
+ http_server_thread = Thread.new do # The server runs in its own thread.
770
+ http_server.start
771
+ end
772
+ begin
773
+ Timeout.timeout(1) do # Bound the block to one second.
774
+ yield(http_server[:Port]) # The block receives the port the server is using.
775
+ end
776
+ ensure # Always stop the server and join its thread, even if the block raises or times out.
777
+ http_server.shutdown
778
+ http_server_thread.join
779
+ end
780
+ end
781
+
782
+ data("Arrow File",
783
+ ["arrow", "application/vnd.apache.arrow.file"])
784
+ data("Arrow Stream",
785
+ ["arrows", "application/vnd.apache.arrow.stream"])
786
+ data("CSV",
787
+ ["csv", "text/csv"])
788
+ def test_http(data) # Saves @table into a buffer, serves it over HTTP, and loads it back from the URI.
789
+ extension, content_type = data # Each data(...) entry is [file extension, content type].
790
+ output = Arrow::ResizableBuffer.new(1024)
791
+ @table.save(output, format: extension.to_sym)
792
+ path = "/data.#{extension}"
793
+ start_web_server(path,
794
+ output.data.to_s,
795
+ content_type) do |port|
796
+ input = URI("http://127.0.0.1:#{port}#{path}")
797
+ loaded_table = Arrow::Table.load(input) # No format: is passed to load here.
798
+ assert_equal(@table.to_s, loaded_table.to_s) # Compares the string renderings, not the tables themselves.
799
+ end
800
+ end
801
+ end
731
802
  end
732
803
 
733
804
  test("#pack") do
@@ -922,4 +993,116 @@ visible: false
922
993
  TABLE
923
994
  end
924
995
  end
996
+
997
+ sub_test_case("#join") do
998
+ test("keys: String") do # Default type (:inner): only keys 1 and 3 are in both tables; both "key" columns are expected.
999
+ table1 = Arrow::Table.new(key: [1, 2, 3],
1000
+ number: [10, 20, 30])
1001
+ table2 = Arrow::Table.new(key: [3, 1],
1002
+ string: ["three", "one"])
1003
+ assert_equal(Arrow::Table.new([
1004
+ ["key", [1, 3]],
1005
+ ["number", [10, 30]],
1006
+ ["key", [1, 3]],
1007
+ ["string", ["one", "three"]],
1008
+ ]),
1009
+ table1.join(table2, "key"))
1010
+ end
1011
+
1012
+ test("keys: Symbol") do # Same tables and expectation as the String case, with the key given as a Symbol.
1013
+ table1 = Arrow::Table.new(key: [1, 2, 3],
1014
+ number: [10, 20, 30])
1015
+ table2 = Arrow::Table.new(key: [3, 1],
1016
+ string: ["three", "one"])
1017
+ assert_equal(Arrow::Table.new([
1018
+ ["key", [1, 3]],
1019
+ ["number", [10, 30]],
1020
+ ["key", [1, 3]],
1021
+ ["string", ["one", "three"]],
1022
+ ]),
1023
+ table1.join(table2, :key))
1024
+ end
1025
+
1026
+ test("keys: [String, Symbol]") do # Expected rows are those where key1 and key2 both match: (1, 100) and (2, 20).
1027
+ table1 = Arrow::Table.new(key1: [1, 1, 2, 2],
1028
+ key2: [10, 100, 20, 200],
1029
+ number: [1010, 1100, 2020, 2200])
1030
+ table2 = Arrow::Table.new(key1: [1, 2, 2],
1031
+ key2: [100, 20, 50],
1032
+ string: ["1-100", "2-20", "2-50"])
1033
+ assert_equal(Arrow::Table.new([
1034
+ ["key1", [1, 2]],
1035
+ ["key2", [100, 20]],
1036
+ ["number", [1100, 2020]],
1037
+ ["key1", [1, 2]],
1038
+ ["key2", [100, 20]],
1039
+ ["string", ["1-100", "2-20"]],
1040
+ ]),
1041
+ table1.join(table2, ["key1", :key2]))
1042
+ end
1043
+
1044
+ test("keys: {left: String, right: Symbol}") do # The key column is named differently in each table, so :left and :right are given.
1045
+ table1 = Arrow::Table.new(left_key: [1, 2, 3],
1046
+ number: [10, 20, 30])
1047
+ table2 = Arrow::Table.new(right_key: [3, 1],
1048
+ string: ["three", "one"])
1049
+ assert_equal(Arrow::Table.new([
1050
+ ["left_key", [1, 3]],
1051
+ ["number", [10, 30]],
1052
+ ["right_key", [1, 3]],
1053
+ ["string", ["one", "three"]],
1054
+ ]),
1055
+ table1.join(table2, {left: "left_key", right: :right_key}))
1056
+ end
1057
+
1058
+ test("keys: {left: [String, Symbol], right: [Symbol, String]}") do # Multi-column keys with different names on each side.
1059
+ table1 = Arrow::Table.new(left_key1: [1, 1, 2, 2],
1060
+ left_key2: [10, 100, 20, 200],
1061
+ number: [1010, 1100, 2020, 2200])
1062
+ table2 = Arrow::Table.new(right_key1: [1, 2, 2],
1063
+ right_key2: [100, 20, 50],
1064
+ string: ["1-100", "2-20", "2-50"])
1065
+ assert_equal(Arrow::Table.new([
1066
+ ["left_key1", [1, 2]],
1067
+ ["left_key2", [100, 20]],
1068
+ ["number", [1100, 2020]],
1069
+ ["right_key1", [1, 2]],
1070
+ ["right_key2", [100, 20]],
1071
+ ["string", ["1-100", "2-20"]],
1072
+ ]),
1073
+ table1.join(table2,
1074
+ {
1075
+ left: ["left_key1", :left_key2], # String and Symbol names are mixed within each list.
1076
+ right: [:right_key1, "right_key2"],
1077
+ }))
1078
+ end
1079
+
1080
+ test("type:") do # With :left_outer, key 2 from table1 is kept and its right-side columns are nil.
1081
+ table1 = Arrow::Table.new(key: [1, 2, 3],
1082
+ number: [10, 20, 30])
1083
+ table2 = Arrow::Table.new(key: [3, 1],
1084
+ string: ["three", "one"])
1085
+ assert_equal(Arrow::Table.new([
1086
+ ["key", [1, 3, 2]],
1087
+ ["number", [10, 30, 20]],
1088
+ ["key", [1, 3, nil]],
1089
+ ["string", ["one", "three", nil]],
1090
+ ]),
1091
+ table1.join(table2, "key", type: :left_outer))
1092
+ end
1093
+
1094
+ test("left_outputs: & right_outputs:") do # Restricting outputs leaves a single "key" column in the expected result.
1095
+ table1 = Arrow::Table.new(key: [1, 2, 3],
1096
+ number: [10, 20, 30])
1097
+ table2 = Arrow::Table.new(key: [3, 1],
1098
+ string: ["three", "one"])
1099
+ assert_equal(Arrow::Table.new(key: [1, 3],
1100
+ number: [10, 30],
1101
+ string: ["one", "three"]),
1102
+ table1.join(table2,
1103
+ "key",
1104
+ left_outputs: ["key", "number"],
1105
+ right_outputs: ["string"])) # Only "string" is taken from the right table.
1106
+ end
1107
+ end
925
1108
  end