red-arrow 6.0.1 → 7.0.0

This diff compares the contents of two publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their respective public registries.
data/lib/arrow/table.rb CHANGED
@@ -448,6 +448,84 @@ module Arrow
448
448
  self.class.new(schema, packed_arrays)
449
449
  end
450
450
 
451
+ # @overload join(right, key, type: :inner, left_outputs: nil, right_outputs: nil)
452
+ # @!macro join_common_before
453
+ # @param right [Arrow::Table] The right table.
454
+ #
455
+ # Join columns with `right` on join key columns.
456
+ #
457
+ # @!macro join_common_after
458
+ # @param type [Arrow::JoinType] How to join.
459
+ # @param left_outputs [::Array<String, Symbol>] Output columns in
460
+ # `self`.
461
+ #
462
+ # If both of `left_outputs` and `right_outputs` aren't
463
+ # specified, all columns in `self` and `right` are
464
+ # outputted.
465
+ # @param right_outputs [::Array<String, Symbol>] Output columns in
466
+ # `right`.
467
+ #
468
+ # If both of `left_outputs` and `right_outputs` aren't
469
+ # specified, all columns in `self` and `right` are
470
+ # outputted.
471
+ # @return [Arrow::Table]
472
+ # The joined `Arrow::Table`.
473
+ #
474
+ # @macro join_common_before
475
+ # @param key [String, Symbol] A join key.
476
+ # @macro join_common_after
477
+ #
478
+ # @overload join(right, keys, type: :inner, left_outputs: nil, right_outputs: nil)
479
+ #
480
+ # @macro join_common_before
481
+ # @param keys [::Array<String, Symbol>] Join keys.
482
+ # @macro join_common_after
483
+ #
484
+ # @overload join(right, keys, type: :inner, left_outputs: nil, right_outputs: nil)
485
+ #
486
+ # @macro join_common_before
487
+ # @param keys [Hash] Specify join keys in `self` and `right` separately.
488
+ # @option keys [String, Symbol, ::Array<String, Symbol>] :left
489
+ # Join keys in `self`.
490
+ # @option keys [String, Symbol, ::Array<String, Symbol>] :right
491
+ # Join keys in `right`.
492
+ # @macro join_common_after
493
+ #
494
+ # @since 7.0.0
495
+ def join(right, keys, type: :inner, left_outputs: nil, right_outputs: nil)
496
+ plan = ExecutePlan.new
497
+ left_node = plan.build_source_node(self)
498
+ right_node = plan.build_source_node(right)
499
+ if keys.is_a?(Hash)
500
+ left_keys = keys[:left]
501
+ right_keys = keys[:right]
502
+ else
503
+ left_keys = keys
504
+ right_keys = keys
505
+ end
506
+ left_keys = Array(left_keys)
507
+ right_keys = Array(right_keys)
508
+ hash_join_node_options = HashJoinNodeOptions.new(type,
509
+ left_keys,
510
+ right_keys)
511
+ unless left_outputs.nil?
512
+ hash_join_node_options.left_outputs = left_outputs
513
+ end
514
+ unless right_outputs.nil?
515
+ hash_join_node_options.right_outputs = right_outputs
516
+ end
517
+ hash_join_node = plan.build_hash_join_node(left_node,
518
+ right_node,
519
+ hash_join_node_options)
520
+ sink_node_options = SinkNodeOptions.new
521
+ plan.build_sink_node(hash_join_node, sink_node_options)
522
+ plan.validate
523
+ plan.start
524
+ plan.wait
525
+ reader = sink_node_options.get_reader(hash_join_node.output_schema)
526
+ reader.read_all
527
+ end
528
+
451
529
  alias_method :to_s_raw, :to_s
452
530
  def to_s(options={})
453
531
  format = options[:format]
data/lib/arrow/version.rb CHANGED
@@ -16,7 +16,7 @@
16
16
  # under the License.
17
17
 
18
18
  module Arrow
19
- VERSION = "6.0.1"
19
+ VERSION = "7.0.0"
20
20
 
21
21
  module Version
22
22
  numbers, TAG = VERSION.split("-")
data/red-arrow.gemspec CHANGED
@@ -48,19 +48,10 @@ Gem::Specification.new do |spec|
48
48
 
49
49
  spec.add_runtime_dependency("bigdecimal", ">= 2.0.3")
50
50
  spec.add_runtime_dependency("extpp", ">= 0.0.7")
51
- spec.add_runtime_dependency("gio2", ">= 3.4.9")
51
+ spec.add_runtime_dependency("gio2", ">= 3.5.0")
52
52
  spec.add_runtime_dependency("native-package-installer")
53
53
  spec.add_runtime_dependency("pkg-config")
54
54
 
55
- spec.add_development_dependency("benchmark-driver")
56
- spec.add_development_dependency("bundler")
57
- spec.add_development_dependency("faker")
58
- spec.add_development_dependency("fiddle", ">= 1.0.9")
59
- spec.add_development_dependency("rake")
60
- spec.add_development_dependency("redcarpet")
61
- spec.add_development_dependency("test-unit")
62
- spec.add_development_dependency("yard")
63
-
64
55
  required_msys2_package_version = version_components[0, 3].join(".")
65
56
  spec.metadata["msys2_mingw_dependencies"] =
66
57
  "arrow>=#{required_msys2_package_version}"
data/test/helper.rb CHANGED
@@ -20,6 +20,8 @@ require "arrow"
20
20
  require "fiddle"
21
21
  require "pathname"
22
22
  require "tempfile"
23
+ require "timeout"
24
+ require "webrick"
23
25
  require "zlib"
24
26
 
25
27
  require "test-unit"
@@ -53,6 +53,14 @@ class FunctionTest < Test::Unit::TestCase
53
53
  or_function.execute(args).value.to_a)
54
54
  end
55
55
 
56
+ test("Arrow::Column") do
57
+ or_function = Arrow::Function.find("or")
58
+ table = Arrow::Table.new(a: [true, false, false],
59
+ b: [true, false, true])
60
+ assert_equal([true, false, true],
61
+ or_function.execute([table.a, table.b]).value.to_a)
62
+ end
63
+
56
64
  test("Arrow::Scalar") do
57
65
  add_function = Arrow::Function.find("add")
58
66
  args = [
@@ -116,12 +124,13 @@ class FunctionTest < Test::Unit::TestCase
116
124
  cast_function = Arrow::Function.find("cast")
117
125
  date = Date.new(2021, 6, 12)
118
126
  args = [date]
119
- options = Arrow::CastOptions.new
120
- options.to_data_type = Arrow::TimestampDataType.new(:second)
127
+ options = {
128
+ to_data_type: Arrow::TimestampDataType.new(:second),
129
+ }
121
130
  time = Time.utc(date.year,
122
131
  date.month,
123
132
  date.day)
124
- assert_equal(Arrow::TimestampScalar.new(options.to_data_type,
133
+ assert_equal(Arrow::TimestampScalar.new(options[:to_data_type],
125
134
  time.to_i),
126
135
  cast_function.execute(args, options).value)
127
136
  end
@@ -132,9 +141,10 @@ class FunctionTest < Test::Unit::TestCase
132
141
  # 00:10:00
133
142
  60 * 10)
134
143
  args = [arrow_time]
135
- options = Arrow::CastOptions.new
136
- options.to_data_type = Arrow::Time64DataType.new(:micro)
137
- assert_equal(Arrow::Time64Scalar.new(options.to_data_type,
144
+ options = {
145
+ to_data_type: Arrow::Time64DataType.new(:micro),
146
+ }
147
+ assert_equal(Arrow::Time64Scalar.new(options[:to_data_type],
138
148
  # 00:10:00.000000
139
149
  60 * 10 * 1000 * 1000),
140
150
  cast_function.execute(args, options).value)
@@ -146,10 +156,11 @@ class FunctionTest < Test::Unit::TestCase
146
156
  # 00:10:00.000000
147
157
  60 * 10 * 1000 * 1000)
148
158
  args = [arrow_time]
149
- options = Arrow::CastOptions.new
150
- options.to_data_type = Arrow::Time32DataType.new(:second)
151
- options.allow_time_truncate = true
152
- assert_equal(Arrow::Time32Scalar.new(options.to_data_type,
159
+ options = {
160
+ to_data_type: Arrow::Time32DataType.new(:second),
161
+ allow_time_truncate: true,
162
+ }
163
+ assert_equal(Arrow::Time32Scalar.new(options[:to_data_type],
153
164
  # 00:10:00
154
165
  60 * 10),
155
166
  cast_function.execute(args, options).value)
@@ -159,18 +170,41 @@ class FunctionTest < Test::Unit::TestCase
159
170
  cast_function = Arrow::Function.find("cast")
160
171
  time = Time.utc(2021, 6, 12, 1, 2, 3, 1)
161
172
  args = [time]
162
- options = Arrow::CastOptions.new
163
- options.to_data_type = Arrow::TimestampDataType.new(:second)
164
- options.allow_time_truncate = true
173
+ options = {
174
+ to_data_type: Arrow::TimestampDataType.new(:second),
175
+ allow_time_truncate: true,
176
+ }
165
177
  time = Time.utc(time.year,
166
178
  time.month,
167
179
  time.day,
168
180
  time.hour,
169
181
  time.min,
170
182
  time.sec)
171
- assert_equal(Arrow::TimestampScalar.new(options.to_data_type,
183
+ assert_equal(Arrow::TimestampScalar.new(options[:to_data_type],
172
184
  time.to_i),
173
185
  cast_function.execute(args, options).value)
174
186
  end
187
+
188
+ test("SetLookupOptions") do
189
+ is_in_function = Arrow::Function.find("is_in")
190
+ args = [
191
+ Arrow::Int16Array.new([1, 0, 1, 2]),
192
+ ]
193
+ options = {
194
+ value_set: Arrow::Int16Array.new([2, 0]),
195
+ }
196
+ assert_equal(Arrow::BooleanArray.new([false, true, false, true]),
197
+ is_in_function.execute(args, options).value)
198
+ end
199
+ end
200
+
201
+ def test_call
202
+ or_function = Arrow::Function.find("or")
203
+ args = [
204
+ Arrow::BooleanArray.new([true, false, false]),
205
+ Arrow::BooleanArray.new([true, false, true]),
206
+ ]
207
+ assert_equal([true, false, true],
208
+ or_function.call(args).value.to_a)
175
209
  end
176
210
  end
data/test/test-table.rb CHANGED
@@ -573,14 +573,20 @@ class TableTest < Test::Unit::TestCase
573
573
  assert_equal(@table, Arrow::Table.load(output, format: :batch))
574
574
  end
575
575
 
576
+ def test_arrows
577
+ output = create_output(".arrows")
578
+ @table.save(output, format: :arrows)
579
+ assert_equal(@table, Arrow::Table.load(output, format: :arrows))
580
+ end
581
+
576
582
  def test_arrow_streaming
577
- output = create_output(".arrow")
583
+ output = create_output(".arrows")
578
584
  @table.save(output, format: :arrow_streaming)
579
585
  assert_equal(@table, Arrow::Table.load(output, format: :arrow_streaming))
580
586
  end
581
587
 
582
588
  def test_stream
583
- output = create_output(".arrow")
589
+ output = create_output(".arrows")
584
590
  @table.save(output, format: :stream)
585
591
  assert_equal(@table, Arrow::Table.load(output, format: :stream))
586
592
  end
@@ -626,6 +632,24 @@ class TableTest < Test::Unit::TestCase
626
632
  end
627
633
 
628
634
  sub_test_case("save: auto detect") do
635
+ test("arrow") do
636
+ output = create_output(".arrow")
637
+ @table.save(output)
638
+ assert_equal(@table,
639
+ Arrow::Table.load(output,
640
+ format: :arrow,
641
+ schema: @table.schema))
642
+ end
643
+
644
+ test("arrows") do
645
+ output = create_output(".arrows")
646
+ @table.save(output)
647
+ assert_equal(@table,
648
+ Arrow::Table.load(output,
649
+ format: :arrows,
650
+ schema: @table.schema))
651
+ end
652
+
629
653
  test("csv") do
630
654
  output = create_output(".csv")
631
655
  @table.save(output)
@@ -664,7 +688,13 @@ class TableTest < Test::Unit::TestCase
664
688
 
665
689
  test("arrow: streaming") do
666
690
  output = create_output(".arrow")
667
- @table.save(output, format: :arrow_streaming)
691
+ @table.save(output, format: :arrows)
692
+ assert_equal(@table, Arrow::Table.load(output))
693
+ end
694
+
695
+ test("arrows") do
696
+ output = create_output(".arrows")
697
+ @table.save(output, format: :arrows)
668
698
  assert_equal(@table, Arrow::Table.load(output))
669
699
  end
670
700
 
@@ -728,6 +758,47 @@ chris\t-1
728
758
  end
729
759
  end
730
760
  end
761
+
762
+ sub_test_case("URI") do
763
+ def start_web_server(path, data, content_type)
764
+ http_server = WEBrick::HTTPServer.new(:Port => 0)
765
+ http_server.mount_proc(path) do |request, response|
766
+ response.body = data
767
+ response.content_type = content_type
768
+ end
769
+ http_server_thread = Thread.new do
770
+ http_server.start
771
+ end
772
+ begin
773
+ Timeout.timeout(1) do
774
+ yield(http_server[:Port])
775
+ end
776
+ ensure
777
+ http_server.shutdown
778
+ http_server_thread.join
779
+ end
780
+ end
781
+
782
+ data("Arrow File",
783
+ ["arrow", "application/vnd.apache.arrow.file"])
784
+ data("Arrow Stream",
785
+ ["arrows", "application/vnd.apache.arrow.stream"])
786
+ data("CSV",
787
+ ["csv", "text/csv"])
788
+ def test_http(data)
789
+ extension, content_type = data
790
+ output = Arrow::ResizableBuffer.new(1024)
791
+ @table.save(output, format: extension.to_sym)
792
+ path = "/data.#{extension}"
793
+ start_web_server(path,
794
+ output.data.to_s,
795
+ content_type) do |port|
796
+ input = URI("http://127.0.0.1:#{port}#{path}")
797
+ loaded_table = Arrow::Table.load(input)
798
+ assert_equal(@table.to_s, loaded_table.to_s)
799
+ end
800
+ end
801
+ end
731
802
  end
732
803
 
733
804
  test("#pack") do
@@ -922,4 +993,116 @@ visible: false
922
993
  TABLE
923
994
  end
924
995
  end
996
+
997
+ sub_test_case("#join") do
998
+ test("keys: String") do
999
+ table1 = Arrow::Table.new(key: [1, 2, 3],
1000
+ number: [10, 20, 30])
1001
+ table2 = Arrow::Table.new(key: [3, 1],
1002
+ string: ["three", "one"])
1003
+ assert_equal(Arrow::Table.new([
1004
+ ["key", [1, 3]],
1005
+ ["number", [10, 30]],
1006
+ ["key", [1, 3]],
1007
+ ["string", ["one", "three"]],
1008
+ ]),
1009
+ table1.join(table2, "key"))
1010
+ end
1011
+
1012
+ test("keys: Symbol") do
1013
+ table1 = Arrow::Table.new(key: [1, 2, 3],
1014
+ number: [10, 20, 30])
1015
+ table2 = Arrow::Table.new(key: [3, 1],
1016
+ string: ["three", "one"])
1017
+ assert_equal(Arrow::Table.new([
1018
+ ["key", [1, 3]],
1019
+ ["number", [10, 30]],
1020
+ ["key", [1, 3]],
1021
+ ["string", ["one", "three"]],
1022
+ ]),
1023
+ table1.join(table2, :key))
1024
+ end
1025
+
1026
+ test("keys: [String, Symbol]") do
1027
+ table1 = Arrow::Table.new(key1: [1, 1, 2, 2],
1028
+ key2: [10, 100, 20, 200],
1029
+ number: [1010, 1100, 2020, 2200])
1030
+ table2 = Arrow::Table.new(key1: [1, 2, 2],
1031
+ key2: [100, 20, 50],
1032
+ string: ["1-100", "2-20", "2-50"])
1033
+ assert_equal(Arrow::Table.new([
1034
+ ["key1", [1, 2]],
1035
+ ["key2", [100, 20]],
1036
+ ["number", [1100, 2020]],
1037
+ ["key1", [1, 2]],
1038
+ ["key2", [100, 20]],
1039
+ ["string", ["1-100", "2-20"]],
1040
+ ]),
1041
+ table1.join(table2, ["key1", :key2]))
1042
+ end
1043
+
1044
+ test("keys: {left: String, right: Symbol}") do
1045
+ table1 = Arrow::Table.new(left_key: [1, 2, 3],
1046
+ number: [10, 20, 30])
1047
+ table2 = Arrow::Table.new(right_key: [3, 1],
1048
+ string: ["three", "one"])
1049
+ assert_equal(Arrow::Table.new([
1050
+ ["left_key", [1, 3]],
1051
+ ["number", [10, 30]],
1052
+ ["right_key", [1, 3]],
1053
+ ["string", ["one", "three"]],
1054
+ ]),
1055
+ table1.join(table2, {left: "left_key", right: :right_key}))
1056
+ end
1057
+
1058
+ test("keys: {left: [String, Symbol], right: [Symbol, String]}") do
1059
+ table1 = Arrow::Table.new(left_key1: [1, 1, 2, 2],
1060
+ left_key2: [10, 100, 20, 200],
1061
+ number: [1010, 1100, 2020, 2200])
1062
+ table2 = Arrow::Table.new(right_key1: [1, 2, 2],
1063
+ right_key2: [100, 20, 50],
1064
+ string: ["1-100", "2-20", "2-50"])
1065
+ assert_equal(Arrow::Table.new([
1066
+ ["left_key1", [1, 2]],
1067
+ ["left_key2", [100, 20]],
1068
+ ["number", [1100, 2020]],
1069
+ ["right_key1", [1, 2]],
1070
+ ["right_key2", [100, 20]],
1071
+ ["string", ["1-100", "2-20"]],
1072
+ ]),
1073
+ table1.join(table2,
1074
+ {
1075
+ left: ["left_key1", :left_key2],
1076
+ right: [:right_key1, "right_key2"],
1077
+ }))
1078
+ end
1079
+
1080
+ test("type:") do
1081
+ table1 = Arrow::Table.new(key: [1, 2, 3],
1082
+ number: [10, 20, 30])
1083
+ table2 = Arrow::Table.new(key: [3, 1],
1084
+ string: ["three", "one"])
1085
+ assert_equal(Arrow::Table.new([
1086
+ ["key", [1, 3, 2]],
1087
+ ["number", [10, 30, 20]],
1088
+ ["key", [1, 3, nil]],
1089
+ ["string", ["one", "three", nil]],
1090
+ ]),
1091
+ table1.join(table2, "key", type: :left_outer))
1092
+ end
1093
+
1094
+ test("left_outputs: & right_outputs:") do
1095
+ table1 = Arrow::Table.new(key: [1, 2, 3],
1096
+ number: [10, 20, 30])
1097
+ table2 = Arrow::Table.new(key: [3, 1],
1098
+ string: ["three", "one"])
1099
+ assert_equal(Arrow::Table.new(key: [1, 3],
1100
+ number: [10, 30],
1101
+ string: ["one", "three"]),
1102
+ table1.join(table2,
1103
+ "key",
1104
+ left_outputs: ["key", "number"],
1105
+ right_outputs: ["string"]))
1106
+ end
1107
+ end
925
1108
  end