red-arrow 6.0.0 → 8.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (38)
  1. checksums.yaml +4 -4
  2. data/Gemfile +10 -0
  3. data/ext/arrow/arrow.cpp +12 -0
  4. data/ext/arrow/converters.hpp +46 -10
  5. data/ext/arrow/extconf.rb +1 -1
  6. data/ext/arrow/raw-records.cpp +3 -2
  7. data/ext/arrow/red-arrow.hpp +7 -0
  8. data/ext/arrow/values.cpp +3 -2
  9. data/lib/arrow/datum.rb +2 -0
  10. data/lib/arrow/day-time-interval-array-builder.rb +29 -0
  11. data/lib/arrow/function.rb +52 -0
  12. data/lib/arrow/loader.rb +16 -0
  13. data/lib/arrow/month-day-nano-interval-array-builder.rb +29 -0
  14. data/lib/arrow/s3-global-options.rb +38 -0
  15. data/lib/arrow/sort-key.rb +61 -55
  16. data/lib/arrow/sort-options.rb +8 -8
  17. data/lib/arrow/table-loader.rb +99 -62
  18. data/lib/arrow/table-saver.rb +7 -2
  19. data/lib/arrow/table.rb +78 -0
  20. data/lib/arrow/version.rb +1 -1
  21. data/red-arrow.gemspec +1 -10
  22. data/test/helper.rb +2 -0
  23. data/test/raw-records/test-basic-arrays.rb +30 -0
  24. data/test/raw-records/test-dense-union-array.rb +27 -0
  25. data/test/raw-records/test-list-array.rb +39 -0
  26. data/test/raw-records/test-map-array.rb +37 -0
  27. data/test/raw-records/test-sparse-union-array.rb +27 -0
  28. data/test/raw-records/test-struct-array.rb +30 -0
  29. data/test/test-function.rb +48 -14
  30. data/test/test-table.rb +204 -6
  31. data/test/values/test-basic-arrays.rb +30 -0
  32. data/test/values/test-dense-union-array.rb +27 -0
  33. data/test/values/test-dictionary-array.rb +295 -0
  34. data/test/values/test-list-array.rb +39 -0
  35. data/test/values/test-map-array.rb +33 -0
  36. data/test/values/test-sparse-union-array.rb +27 -0
  37. data/test/values/test-struct-array.rb +30 -0
  38. metadata +88 -194
@@ -53,6 +53,14 @@ class FunctionTest < Test::Unit::TestCase
53
53
  or_function.execute(args).value.to_a)
54
54
  end
55
55
 
56
+ test("Arrow::Column") do
57
+ or_function = Arrow::Function.find("or")
58
+ table = Arrow::Table.new(a: [true, false, false],
59
+ b: [true, false, true])
60
+ assert_equal([true, false, true],
61
+ or_function.execute([table.a, table.b]).value.to_a)
62
+ end
63
+
56
64
  test("Arrow::Scalar") do
57
65
  add_function = Arrow::Function.find("add")
58
66
  args = [
@@ -116,12 +124,13 @@ class FunctionTest < Test::Unit::TestCase
116
124
  cast_function = Arrow::Function.find("cast")
117
125
  date = Date.new(2021, 6, 12)
118
126
  args = [date]
119
- options = Arrow::CastOptions.new
120
- options.to_data_type = Arrow::TimestampDataType.new(:second)
127
+ options = {
128
+ to_data_type: Arrow::TimestampDataType.new(:second),
129
+ }
121
130
  time = Time.utc(date.year,
122
131
  date.month,
123
132
  date.day)
124
- assert_equal(Arrow::TimestampScalar.new(options.to_data_type,
133
+ assert_equal(Arrow::TimestampScalar.new(options[:to_data_type],
125
134
  time.to_i),
126
135
  cast_function.execute(args, options).value)
127
136
  end
@@ -132,9 +141,10 @@ class FunctionTest < Test::Unit::TestCase
132
141
  # 00:10:00
133
142
  60 * 10)
134
143
  args = [arrow_time]
135
- options = Arrow::CastOptions.new
136
- options.to_data_type = Arrow::Time64DataType.new(:micro)
137
- assert_equal(Arrow::Time64Scalar.new(options.to_data_type,
144
+ options = {
145
+ to_data_type: Arrow::Time64DataType.new(:micro),
146
+ }
147
+ assert_equal(Arrow::Time64Scalar.new(options[:to_data_type],
138
148
  # 00:10:00.000000
139
149
  60 * 10 * 1000 * 1000),
140
150
  cast_function.execute(args, options).value)
@@ -146,10 +156,11 @@ class FunctionTest < Test::Unit::TestCase
146
156
  # 00:10:00.000000
147
157
  60 * 10 * 1000 * 1000)
148
158
  args = [arrow_time]
149
- options = Arrow::CastOptions.new
150
- options.to_data_type = Arrow::Time32DataType.new(:second)
151
- options.allow_time_truncate = true
152
- assert_equal(Arrow::Time32Scalar.new(options.to_data_type,
159
+ options = {
160
+ to_data_type: Arrow::Time32DataType.new(:second),
161
+ allow_time_truncate: true,
162
+ }
163
+ assert_equal(Arrow::Time32Scalar.new(options[:to_data_type],
153
164
  # 00:10:00
154
165
  60 * 10),
155
166
  cast_function.execute(args, options).value)
@@ -159,18 +170,41 @@ class FunctionTest < Test::Unit::TestCase
159
170
  cast_function = Arrow::Function.find("cast")
160
171
  time = Time.utc(2021, 6, 12, 1, 2, 3, 1)
161
172
  args = [time]
162
- options = Arrow::CastOptions.new
163
- options.to_data_type = Arrow::TimestampDataType.new(:second)
164
- options.allow_time_truncate = true
173
+ options = {
174
+ to_data_type: Arrow::TimestampDataType.new(:second),
175
+ allow_time_truncate: true,
176
+ }
165
177
  time = Time.utc(time.year,
166
178
  time.month,
167
179
  time.day,
168
180
  time.hour,
169
181
  time.min,
170
182
  time.sec)
171
- assert_equal(Arrow::TimestampScalar.new(options.to_data_type,
183
+ assert_equal(Arrow::TimestampScalar.new(options[:to_data_type],
172
184
  time.to_i),
173
185
  cast_function.execute(args, options).value)
174
186
  end
187
+
188
+ test("SetLookupOptions") do
189
+ is_in_function = Arrow::Function.find("is_in")
190
+ args = [
191
+ Arrow::Int16Array.new([1, 0, 1, 2]),
192
+ ]
193
+ options = {
194
+ value_set: Arrow::Int16Array.new([2, 0]),
195
+ }
196
+ assert_equal(Arrow::BooleanArray.new([false, true, false, true]),
197
+ is_in_function.execute(args, options).value)
198
+ end
199
+ end
200
+
201
+ def test_call
202
+ or_function = Arrow::Function.find("or")
203
+ args = [
204
+ Arrow::BooleanArray.new([true, false, false]),
205
+ Arrow::BooleanArray.new([true, false, true]),
206
+ ]
207
+ assert_equal([true, false, true],
208
+ or_function.call(args).value.to_a)
175
209
  end
176
210
  end
data/test/test-table.rb CHANGED
@@ -186,7 +186,12 @@ class TableTest < Test::Unit::TestCase
186
186
  end
187
187
 
188
188
  test("{key: Range}: beginless include end") do
189
- assert_equal(<<-TABLE, @table.slice(count: ..8).to_s)
189
+ begin
190
+ range = eval("..8")
191
+ rescue SyntaxError
192
+ omit("beginless range isn't supported")
193
+ end
194
+ assert_equal(<<-TABLE, @table.slice(count: range).to_s)
190
195
  count visible
191
196
  0 1 true
192
197
  1 2 false
@@ -196,7 +201,12 @@ class TableTest < Test::Unit::TestCase
196
201
  end
197
202
 
198
203
  test("{key: Range}: beginless exclude end") do
199
- assert_equal(<<-TABLE, @table.slice(count: ...8).to_s)
204
+ begin
205
+ range = eval("...8")
206
+ rescue SyntaxError
207
+ omit("beginless range isn't supported")
208
+ end
209
+ assert_equal(<<-TABLE, @table.slice(count: range).to_s)
200
210
  count visible
201
211
  0 1 true
202
212
  1 2 false
@@ -205,7 +215,12 @@ class TableTest < Test::Unit::TestCase
205
215
  end
206
216
 
207
217
  test("{key: Range}: endless") do
208
- assert_equal(<<-TABLE, @table.slice(count: 16..).to_s)
218
+ begin
219
+ range = eval("16..")
220
+ rescue SyntaxError
221
+ omit("endless range isn't supported")
222
+ end
223
+ assert_equal(<<-TABLE, @table.slice(count: range).to_s)
209
224
  count visible
210
225
  0 16 true
211
226
  1 32 false
@@ -573,14 +588,20 @@ class TableTest < Test::Unit::TestCase
573
588
  assert_equal(@table, Arrow::Table.load(output, format: :batch))
574
589
  end
575
590
 
591
+ def test_arrows
592
+ output = create_output(".arrows")
593
+ @table.save(output, format: :arrows)
594
+ assert_equal(@table, Arrow::Table.load(output, format: :arrows))
595
+ end
596
+
576
597
  def test_arrow_streaming
577
- output = create_output(".arrow")
598
+ output = create_output(".arrows")
578
599
  @table.save(output, format: :arrow_streaming)
579
600
  assert_equal(@table, Arrow::Table.load(output, format: :arrow_streaming))
580
601
  end
581
602
 
582
603
  def test_stream
583
- output = create_output(".arrow")
604
+ output = create_output(".arrows")
584
605
  @table.save(output, format: :stream)
585
606
  assert_equal(@table, Arrow::Table.load(output, format: :stream))
586
607
  end
@@ -626,6 +647,24 @@ class TableTest < Test::Unit::TestCase
626
647
  end
627
648
 
628
649
  sub_test_case("save: auto detect") do
650
+ test("arrow") do
651
+ output = create_output(".arrow")
652
+ @table.save(output)
653
+ assert_equal(@table,
654
+ Arrow::Table.load(output,
655
+ format: :arrow,
656
+ schema: @table.schema))
657
+ end
658
+
659
+ test("arrows") do
660
+ output = create_output(".arrows")
661
+ @table.save(output)
662
+ assert_equal(@table,
663
+ Arrow::Table.load(output,
664
+ format: :arrows,
665
+ schema: @table.schema))
666
+ end
667
+
629
668
  test("csv") do
630
669
  output = create_output(".csv")
631
670
  @table.save(output)
@@ -664,7 +703,13 @@ class TableTest < Test::Unit::TestCase
664
703
 
665
704
  test("arrow: streaming") do
666
705
  output = create_output(".arrow")
667
- @table.save(output, format: :arrow_streaming)
706
+ @table.save(output, format: :arrows)
707
+ assert_equal(@table, Arrow::Table.load(output))
708
+ end
709
+
710
+ test("arrows") do
711
+ output = create_output(".arrows")
712
+ @table.save(output, format: :arrows)
668
713
  assert_equal(@table, Arrow::Table.load(output))
669
714
  end
670
715
 
@@ -728,6 +773,47 @@ chris\t-1
728
773
  end
729
774
  end
730
775
  end
776
+
777
+ sub_test_case("URI") do
778
+ def start_web_server(path, data, content_type)
779
+ http_server = WEBrick::HTTPServer.new(:Port => 0)
780
+ http_server.mount_proc(path) do |request, response|
781
+ response.body = data
782
+ response.content_type = content_type
783
+ end
784
+ http_server_thread = Thread.new do
785
+ http_server.start
786
+ end
787
+ begin
788
+ Timeout.timeout(1) do
789
+ yield(http_server[:Port])
790
+ end
791
+ ensure
792
+ http_server.shutdown
793
+ http_server_thread.join
794
+ end
795
+ end
796
+
797
+ data("Arrow File",
798
+ ["arrow", "application/vnd.apache.arrow.file"])
799
+ data("Arrow Stream",
800
+ ["arrows", "application/vnd.apache.arrow.stream"])
801
+ data("CSV",
802
+ ["csv", "text/csv"])
803
+ def test_http(data)
804
+ extension, content_type = data
805
+ output = Arrow::ResizableBuffer.new(1024)
806
+ @table.save(output, format: extension.to_sym)
807
+ path = "/data.#{extension}"
808
+ start_web_server(path,
809
+ output.data.to_s,
810
+ content_type) do |port|
811
+ input = URI("http://127.0.0.1:#{port}#{path}")
812
+ loaded_table = Arrow::Table.load(input)
813
+ assert_equal(@table.to_s, loaded_table.to_s)
814
+ end
815
+ end
816
+ end
731
817
  end
732
818
 
733
819
  test("#pack") do
@@ -922,4 +1008,116 @@ visible: false
922
1008
  TABLE
923
1009
  end
924
1010
  end
1011
+
1012
+ sub_test_case("#join") do
1013
+ test("keys: String") do
1014
+ table1 = Arrow::Table.new(key: [1, 2, 3],
1015
+ number: [10, 20, 30])
1016
+ table2 = Arrow::Table.new(key: [3, 1],
1017
+ string: ["three", "one"])
1018
+ assert_equal(Arrow::Table.new([
1019
+ ["key", [1, 3]],
1020
+ ["number", [10, 30]],
1021
+ ["key", [1, 3]],
1022
+ ["string", ["one", "three"]],
1023
+ ]),
1024
+ table1.join(table2, "key"))
1025
+ end
1026
+
1027
+ test("keys: Symbol") do
1028
+ table1 = Arrow::Table.new(key: [1, 2, 3],
1029
+ number: [10, 20, 30])
1030
+ table2 = Arrow::Table.new(key: [3, 1],
1031
+ string: ["three", "one"])
1032
+ assert_equal(Arrow::Table.new([
1033
+ ["key", [1, 3]],
1034
+ ["number", [10, 30]],
1035
+ ["key", [1, 3]],
1036
+ ["string", ["one", "three"]],
1037
+ ]),
1038
+ table1.join(table2, :key))
1039
+ end
1040
+
1041
+ test("keys: [String, Symbol]") do
1042
+ table1 = Arrow::Table.new(key1: [1, 1, 2, 2],
1043
+ key2: [10, 100, 20, 200],
1044
+ number: [1010, 1100, 2020, 2200])
1045
+ table2 = Arrow::Table.new(key1: [1, 2, 2],
1046
+ key2: [100, 20, 50],
1047
+ string: ["1-100", "2-20", "2-50"])
1048
+ assert_equal(Arrow::Table.new([
1049
+ ["key1", [1, 2]],
1050
+ ["key2", [100, 20]],
1051
+ ["number", [1100, 2020]],
1052
+ ["key1", [1, 2]],
1053
+ ["key2", [100, 20]],
1054
+ ["string", ["1-100", "2-20"]],
1055
+ ]),
1056
+ table1.join(table2, ["key1", :key2]))
1057
+ end
1058
+
1059
+ test("keys: {left: String, right: Symbol}") do
1060
+ table1 = Arrow::Table.new(left_key: [1, 2, 3],
1061
+ number: [10, 20, 30])
1062
+ table2 = Arrow::Table.new(right_key: [3, 1],
1063
+ string: ["three", "one"])
1064
+ assert_equal(Arrow::Table.new([
1065
+ ["left_key", [1, 3]],
1066
+ ["number", [10, 30]],
1067
+ ["right_key", [1, 3]],
1068
+ ["string", ["one", "three"]],
1069
+ ]),
1070
+ table1.join(table2, {left: "left_key", right: :right_key}))
1071
+ end
1072
+
1073
+ test("keys: {left: [String, Symbol], right: [Symbol, String]}") do
1074
+ table1 = Arrow::Table.new(left_key1: [1, 1, 2, 2],
1075
+ left_key2: [10, 100, 20, 200],
1076
+ number: [1010, 1100, 2020, 2200])
1077
+ table2 = Arrow::Table.new(right_key1: [1, 2, 2],
1078
+ right_key2: [100, 20, 50],
1079
+ string: ["1-100", "2-20", "2-50"])
1080
+ assert_equal(Arrow::Table.new([
1081
+ ["left_key1", [1, 2]],
1082
+ ["left_key2", [100, 20]],
1083
+ ["number", [1100, 2020]],
1084
+ ["right_key1", [1, 2]],
1085
+ ["right_key2", [100, 20]],
1086
+ ["string", ["1-100", "2-20"]],
1087
+ ]),
1088
+ table1.join(table2,
1089
+ {
1090
+ left: ["left_key1", :left_key2],
1091
+ right: [:right_key1, "right_key2"],
1092
+ }))
1093
+ end
1094
+
1095
+ test("type:") do
1096
+ table1 = Arrow::Table.new(key: [1, 2, 3],
1097
+ number: [10, 20, 30])
1098
+ table2 = Arrow::Table.new(key: [3, 1],
1099
+ string: ["three", "one"])
1100
+ assert_equal(Arrow::Table.new([
1101
+ ["key", [1, 3, 2]],
1102
+ ["number", [10, 30, 20]],
1103
+ ["key", [1, 3, nil]],
1104
+ ["string", ["one", "three", nil]],
1105
+ ]),
1106
+ table1.join(table2, "key", type: :left_outer))
1107
+ end
1108
+
1109
+ test("left_outputs: & right_outputs:") do
1110
+ table1 = Arrow::Table.new(key: [1, 2, 3],
1111
+ number: [10, 20, 30])
1112
+ table2 = Arrow::Table.new(key: [3, 1],
1113
+ string: ["three", "one"])
1114
+ assert_equal(Arrow::Table.new(key: [1, 3],
1115
+ number: [10, 30],
1116
+ string: ["one", "three"]),
1117
+ table1.join(table2,
1118
+ "key",
1119
+ left_outputs: ["key", "number"],
1120
+ right_outputs: ["string"]))
1121
+ end
1122
+ end
925
1123
  end
@@ -276,6 +276,36 @@ module ValuesBasicArraysTests
276
276
  target = build(Arrow::Decimal256Array.new(data_type, values))
277
277
  assert_equal(values, target.values)
278
278
  end
279
+
280
+ def test_month_interval
281
+ values = [
282
+ 1,
283
+ nil,
284
+ 12,
285
+ ]
286
+ target = build(Arrow::MonthIntervalArray.new(values))
287
+ assert_equal(values, target.values)
288
+ end
289
+
290
+ def test_day_time_interval
291
+ values = [
292
+ {day: 1, millisecond: 100},
293
+ nil,
294
+ {day: 2, millisecond: 300},
295
+ ]
296
+ target = build(Arrow::DayTimeIntervalArray.new(values))
297
+ assert_equal(values, target.values)
298
+ end
299
+
300
+ def test_month_day_nano_interval
301
+ values = [
302
+ {month: 1, day: 1, nanosecond: 100},
303
+ nil,
304
+ {month: 2, day: 3, nanosecond: 400},
305
+ ]
306
+ target = build(Arrow::MonthDayNanoIntervalArray.new(values))
307
+ assert_equal(values, target.values)
308
+ end
279
309
  end
280
310
 
281
311
  class ValuesArrayBasicArraysTest < Test::Unit::TestCase
@@ -347,6 +347,33 @@ module ValuesDenseUnionArrayTests
347
347
  assert_equal(values, target.values)
348
348
  end
349
349
 
350
+ def test_month_interval
351
+ values = [
352
+ {"0" => 1},
353
+ {"1" => nil},
354
+ ]
355
+ target = build(:month_interval, values)
356
+ assert_equal(values, target.values)
357
+ end
358
+
359
+ def test_day_time_interval
360
+ values = [
361
+ {"0" => {day: 1, millisecond: 100}},
362
+ {"1" => nil},
363
+ ]
364
+ target = build(:day_time_interval, values)
365
+ assert_equal(values, target.values)
366
+ end
367
+
368
+ def test_month_day_nano_interval
369
+ values = [
370
+ {"0" => {month: 1, day: 1, nanosecond: 100}},
371
+ {"1" => nil},
372
+ ]
373
+ target = build(:month_day_nano_interval, values)
374
+ assert_equal(values, target.values)
375
+ end
376
+
350
377
  def test_list
351
378
  values = [
352
379
  {"0" => [true, nil, false]},