red-arrow 6.0.0 → 8.0.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (38) hide show
  1. checksums.yaml +4 -4
  2. data/Gemfile +10 -0
  3. data/ext/arrow/arrow.cpp +12 -0
  4. data/ext/arrow/converters.hpp +46 -10
  5. data/ext/arrow/extconf.rb +1 -1
  6. data/ext/arrow/raw-records.cpp +3 -2
  7. data/ext/arrow/red-arrow.hpp +7 -0
  8. data/ext/arrow/values.cpp +3 -2
  9. data/lib/arrow/datum.rb +2 -0
  10. data/lib/arrow/day-time-interval-array-builder.rb +29 -0
  11. data/lib/arrow/function.rb +52 -0
  12. data/lib/arrow/loader.rb +16 -0
  13. data/lib/arrow/month-day-nano-interval-array-builder.rb +29 -0
  14. data/lib/arrow/s3-global-options.rb +38 -0
  15. data/lib/arrow/sort-key.rb +61 -55
  16. data/lib/arrow/sort-options.rb +8 -8
  17. data/lib/arrow/table-loader.rb +99 -62
  18. data/lib/arrow/table-saver.rb +7 -2
  19. data/lib/arrow/table.rb +78 -0
  20. data/lib/arrow/version.rb +1 -1
  21. data/red-arrow.gemspec +1 -10
  22. data/test/helper.rb +2 -0
  23. data/test/raw-records/test-basic-arrays.rb +30 -0
  24. data/test/raw-records/test-dense-union-array.rb +27 -0
  25. data/test/raw-records/test-list-array.rb +39 -0
  26. data/test/raw-records/test-map-array.rb +37 -0
  27. data/test/raw-records/test-sparse-union-array.rb +27 -0
  28. data/test/raw-records/test-struct-array.rb +30 -0
  29. data/test/test-function.rb +48 -14
  30. data/test/test-table.rb +204 -6
  31. data/test/values/test-basic-arrays.rb +30 -0
  32. data/test/values/test-dense-union-array.rb +27 -0
  33. data/test/values/test-dictionary-array.rb +295 -0
  34. data/test/values/test-list-array.rb +39 -0
  35. data/test/values/test-map-array.rb +33 -0
  36. data/test/values/test-sparse-union-array.rb +27 -0
  37. data/test/values/test-struct-array.rb +30 -0
  38. metadata +88 -194
@@ -53,6 +53,14 @@ class FunctionTest < Test::Unit::TestCase
53
53
  or_function.execute(args).value.to_a)
54
54
  end
55
55
 
56
+ test("Arrow::Column") do
57
+ or_function = Arrow::Function.find("or")
58
+ table = Arrow::Table.new(a: [true, false, false],
59
+ b: [true, false, true])
60
+ assert_equal([true, false, true],
61
+ or_function.execute([table.a, table.b]).value.to_a)
62
+ end
63
+
56
64
  test("Arrow::Scalar") do
57
65
  add_function = Arrow::Function.find("add")
58
66
  args = [
@@ -116,12 +124,13 @@ class FunctionTest < Test::Unit::TestCase
116
124
  cast_function = Arrow::Function.find("cast")
117
125
  date = Date.new(2021, 6, 12)
118
126
  args = [date]
119
- options = Arrow::CastOptions.new
120
- options.to_data_type = Arrow::TimestampDataType.new(:second)
127
+ options = {
128
+ to_data_type: Arrow::TimestampDataType.new(:second),
129
+ }
121
130
  time = Time.utc(date.year,
122
131
  date.month,
123
132
  date.day)
124
- assert_equal(Arrow::TimestampScalar.new(options.to_data_type,
133
+ assert_equal(Arrow::TimestampScalar.new(options[:to_data_type],
125
134
  time.to_i),
126
135
  cast_function.execute(args, options).value)
127
136
  end
@@ -132,9 +141,10 @@ class FunctionTest < Test::Unit::TestCase
132
141
  # 00:10:00
133
142
  60 * 10)
134
143
  args = [arrow_time]
135
- options = Arrow::CastOptions.new
136
- options.to_data_type = Arrow::Time64DataType.new(:micro)
137
- assert_equal(Arrow::Time64Scalar.new(options.to_data_type,
144
+ options = {
145
+ to_data_type: Arrow::Time64DataType.new(:micro),
146
+ }
147
+ assert_equal(Arrow::Time64Scalar.new(options[:to_data_type],
138
148
  # 00:10:00.000000
139
149
  60 * 10 * 1000 * 1000),
140
150
  cast_function.execute(args, options).value)
@@ -146,10 +156,11 @@ class FunctionTest < Test::Unit::TestCase
146
156
  # 00:10:00.000000
147
157
  60 * 10 * 1000 * 1000)
148
158
  args = [arrow_time]
149
- options = Arrow::CastOptions.new
150
- options.to_data_type = Arrow::Time32DataType.new(:second)
151
- options.allow_time_truncate = true
152
- assert_equal(Arrow::Time32Scalar.new(options.to_data_type,
159
+ options = {
160
+ to_data_type: Arrow::Time32DataType.new(:second),
161
+ allow_time_truncate: true,
162
+ }
163
+ assert_equal(Arrow::Time32Scalar.new(options[:to_data_type],
153
164
  # 00:10:00
154
165
  60 * 10),
155
166
  cast_function.execute(args, options).value)
@@ -159,18 +170,41 @@ class FunctionTest < Test::Unit::TestCase
159
170
  cast_function = Arrow::Function.find("cast")
160
171
  time = Time.utc(2021, 6, 12, 1, 2, 3, 1)
161
172
  args = [time]
162
- options = Arrow::CastOptions.new
163
- options.to_data_type = Arrow::TimestampDataType.new(:second)
164
- options.allow_time_truncate = true
173
+ options = {
174
+ to_data_type: Arrow::TimestampDataType.new(:second),
175
+ allow_time_truncate: true,
176
+ }
165
177
  time = Time.utc(time.year,
166
178
  time.month,
167
179
  time.day,
168
180
  time.hour,
169
181
  time.min,
170
182
  time.sec)
171
- assert_equal(Arrow::TimestampScalar.new(options.to_data_type,
183
+ assert_equal(Arrow::TimestampScalar.new(options[:to_data_type],
172
184
  time.to_i),
173
185
  cast_function.execute(args, options).value)
174
186
  end
187
+
188
+ test("SetLookupOptions") do
189
+ is_in_function = Arrow::Function.find("is_in")
190
+ args = [
191
+ Arrow::Int16Array.new([1, 0, 1, 2]),
192
+ ]
193
+ options = {
194
+ value_set: Arrow::Int16Array.new([2, 0]),
195
+ }
196
+ assert_equal(Arrow::BooleanArray.new([false, true, false, true]),
197
+ is_in_function.execute(args, options).value)
198
+ end
199
+ end
200
+
201
+ def test_call
202
+ or_function = Arrow::Function.find("or")
203
+ args = [
204
+ Arrow::BooleanArray.new([true, false, false]),
205
+ Arrow::BooleanArray.new([true, false, true]),
206
+ ]
207
+ assert_equal([true, false, true],
208
+ or_function.call(args).value.to_a)
175
209
  end
176
210
  end
data/test/test-table.rb CHANGED
@@ -186,7 +186,12 @@ class TableTest < Test::Unit::TestCase
186
186
  end
187
187
 
188
188
  test("{key: Range}: beginless include end") do
189
- assert_equal(<<-TABLE, @table.slice(count: ..8).to_s)
189
+ begin
190
+ range = eval("..8")
191
+ rescue SyntaxError
192
+ omit("beginless range isn't supported")
193
+ end
194
+ assert_equal(<<-TABLE, @table.slice(count: range).to_s)
190
195
  count visible
191
196
  0 1 true
192
197
  1 2 false
@@ -196,7 +201,12 @@ class TableTest < Test::Unit::TestCase
196
201
  end
197
202
 
198
203
  test("{key: Range}: beginless exclude end") do
199
- assert_equal(<<-TABLE, @table.slice(count: ...8).to_s)
204
+ begin
205
+ range = eval("...8")
206
+ rescue SyntaxError
207
+ omit("beginless range isn't supported")
208
+ end
209
+ assert_equal(<<-TABLE, @table.slice(count: range).to_s)
200
210
  count visible
201
211
  0 1 true
202
212
  1 2 false
@@ -205,7 +215,12 @@ class TableTest < Test::Unit::TestCase
205
215
  end
206
216
 
207
217
  test("{key: Range}: endless") do
208
- assert_equal(<<-TABLE, @table.slice(count: 16..).to_s)
218
+ begin
219
+ range = eval("16..")
220
+ rescue SyntaxError
221
+ omit("endless range isn't supported")
222
+ end
223
+ assert_equal(<<-TABLE, @table.slice(count: range).to_s)
209
224
  count visible
210
225
  0 16 true
211
226
  1 32 false
@@ -573,14 +588,20 @@ class TableTest < Test::Unit::TestCase
573
588
  assert_equal(@table, Arrow::Table.load(output, format: :batch))
574
589
  end
575
590
 
591
+ def test_arrows
592
+ output = create_output(".arrows")
593
+ @table.save(output, format: :arrows)
594
+ assert_equal(@table, Arrow::Table.load(output, format: :arrows))
595
+ end
596
+
576
597
  def test_arrow_streaming
577
- output = create_output(".arrow")
598
+ output = create_output(".arrows")
578
599
  @table.save(output, format: :arrow_streaming)
579
600
  assert_equal(@table, Arrow::Table.load(output, format: :arrow_streaming))
580
601
  end
581
602
 
582
603
  def test_stream
583
- output = create_output(".arrow")
604
+ output = create_output(".arrows")
584
605
  @table.save(output, format: :stream)
585
606
  assert_equal(@table, Arrow::Table.load(output, format: :stream))
586
607
  end
@@ -626,6 +647,24 @@ class TableTest < Test::Unit::TestCase
626
647
  end
627
648
 
628
649
  sub_test_case("save: auto detect") do
650
+ test("arrow") do
651
+ output = create_output(".arrow")
652
+ @table.save(output)
653
+ assert_equal(@table,
654
+ Arrow::Table.load(output,
655
+ format: :arrow,
656
+ schema: @table.schema))
657
+ end
658
+
659
+ test("arrows") do
660
+ output = create_output(".arrows")
661
+ @table.save(output)
662
+ assert_equal(@table,
663
+ Arrow::Table.load(output,
664
+ format: :arrows,
665
+ schema: @table.schema))
666
+ end
667
+
629
668
  test("csv") do
630
669
  output = create_output(".csv")
631
670
  @table.save(output)
@@ -664,7 +703,13 @@ class TableTest < Test::Unit::TestCase
664
703
 
665
704
  test("arrow: streaming") do
666
705
  output = create_output(".arrow")
667
- @table.save(output, format: :arrow_streaming)
706
+ @table.save(output, format: :arrows)
707
+ assert_equal(@table, Arrow::Table.load(output))
708
+ end
709
+
710
+ test("arrows") do
711
+ output = create_output(".arrows")
712
+ @table.save(output, format: :arrows)
668
713
  assert_equal(@table, Arrow::Table.load(output))
669
714
  end
670
715
 
@@ -728,6 +773,47 @@ chris\t-1
728
773
  end
729
774
  end
730
775
  end
776
+
777
+ sub_test_case("URI") do
778
+ def start_web_server(path, data, content_type)
779
+ http_server = WEBrick::HTTPServer.new(:Port => 0)
780
+ http_server.mount_proc(path) do |request, response|
781
+ response.body = data
782
+ response.content_type = content_type
783
+ end
784
+ http_server_thread = Thread.new do
785
+ http_server.start
786
+ end
787
+ begin
788
+ Timeout.timeout(1) do
789
+ yield(http_server[:Port])
790
+ end
791
+ ensure
792
+ http_server.shutdown
793
+ http_server_thread.join
794
+ end
795
+ end
796
+
797
+ data("Arrow File",
798
+ ["arrow", "application/vnd.apache.arrow.file"])
799
+ data("Arrow Stream",
800
+ ["arrows", "application/vnd.apache.arrow.stream"])
801
+ data("CSV",
802
+ ["csv", "text/csv"])
803
+ def test_http(data)
804
+ extension, content_type = data
805
+ output = Arrow::ResizableBuffer.new(1024)
806
+ @table.save(output, format: extension.to_sym)
807
+ path = "/data.#{extension}"
808
+ start_web_server(path,
809
+ output.data.to_s,
810
+ content_type) do |port|
811
+ input = URI("http://127.0.0.1:#{port}#{path}")
812
+ loaded_table = Arrow::Table.load(input)
813
+ assert_equal(@table.to_s, loaded_table.to_s)
814
+ end
815
+ end
816
+ end
731
817
  end
732
818
 
733
819
  test("#pack") do
@@ -922,4 +1008,116 @@ visible: false
922
1008
  TABLE
923
1009
  end
924
1010
  end
1011
+
1012
+ sub_test_case("#join") do
1013
+ test("keys: String") do
1014
+ table1 = Arrow::Table.new(key: [1, 2, 3],
1015
+ number: [10, 20, 30])
1016
+ table2 = Arrow::Table.new(key: [3, 1],
1017
+ string: ["three", "one"])
1018
+ assert_equal(Arrow::Table.new([
1019
+ ["key", [1, 3]],
1020
+ ["number", [10, 30]],
1021
+ ["key", [1, 3]],
1022
+ ["string", ["one", "three"]],
1023
+ ]),
1024
+ table1.join(table2, "key"))
1025
+ end
1026
+
1027
+ test("keys: Symbol") do
1028
+ table1 = Arrow::Table.new(key: [1, 2, 3],
1029
+ number: [10, 20, 30])
1030
+ table2 = Arrow::Table.new(key: [3, 1],
1031
+ string: ["three", "one"])
1032
+ assert_equal(Arrow::Table.new([
1033
+ ["key", [1, 3]],
1034
+ ["number", [10, 30]],
1035
+ ["key", [1, 3]],
1036
+ ["string", ["one", "three"]],
1037
+ ]),
1038
+ table1.join(table2, :key))
1039
+ end
1040
+
1041
+ test("keys: [String, Symbol]") do
1042
+ table1 = Arrow::Table.new(key1: [1, 1, 2, 2],
1043
+ key2: [10, 100, 20, 200],
1044
+ number: [1010, 1100, 2020, 2200])
1045
+ table2 = Arrow::Table.new(key1: [1, 2, 2],
1046
+ key2: [100, 20, 50],
1047
+ string: ["1-100", "2-20", "2-50"])
1048
+ assert_equal(Arrow::Table.new([
1049
+ ["key1", [1, 2]],
1050
+ ["key2", [100, 20]],
1051
+ ["number", [1100, 2020]],
1052
+ ["key1", [1, 2]],
1053
+ ["key2", [100, 20]],
1054
+ ["string", ["1-100", "2-20"]],
1055
+ ]),
1056
+ table1.join(table2, ["key1", :key2]))
1057
+ end
1058
+
1059
+ test("keys: {left: String, right: Symbol}") do
1060
+ table1 = Arrow::Table.new(left_key: [1, 2, 3],
1061
+ number: [10, 20, 30])
1062
+ table2 = Arrow::Table.new(right_key: [3, 1],
1063
+ string: ["three", "one"])
1064
+ assert_equal(Arrow::Table.new([
1065
+ ["left_key", [1, 3]],
1066
+ ["number", [10, 30]],
1067
+ ["right_key", [1, 3]],
1068
+ ["string", ["one", "three"]],
1069
+ ]),
1070
+ table1.join(table2, {left: "left_key", right: :right_key}))
1071
+ end
1072
+
1073
+ test("keys: {left: [String, Symbol], right: [Symbol, String]}") do
1074
+ table1 = Arrow::Table.new(left_key1: [1, 1, 2, 2],
1075
+ left_key2: [10, 100, 20, 200],
1076
+ number: [1010, 1100, 2020, 2200])
1077
+ table2 = Arrow::Table.new(right_key1: [1, 2, 2],
1078
+ right_key2: [100, 20, 50],
1079
+ string: ["1-100", "2-20", "2-50"])
1080
+ assert_equal(Arrow::Table.new([
1081
+ ["left_key1", [1, 2]],
1082
+ ["left_key2", [100, 20]],
1083
+ ["number", [1100, 2020]],
1084
+ ["right_key1", [1, 2]],
1085
+ ["right_key2", [100, 20]],
1086
+ ["string", ["1-100", "2-20"]],
1087
+ ]),
1088
+ table1.join(table2,
1089
+ {
1090
+ left: ["left_key1", :left_key2],
1091
+ right: [:right_key1, "right_key2"],
1092
+ }))
1093
+ end
1094
+
1095
+ test("type:") do
1096
+ table1 = Arrow::Table.new(key: [1, 2, 3],
1097
+ number: [10, 20, 30])
1098
+ table2 = Arrow::Table.new(key: [3, 1],
1099
+ string: ["three", "one"])
1100
+ assert_equal(Arrow::Table.new([
1101
+ ["key", [1, 3, 2]],
1102
+ ["number", [10, 30, 20]],
1103
+ ["key", [1, 3, nil]],
1104
+ ["string", ["one", "three", nil]],
1105
+ ]),
1106
+ table1.join(table2, "key", type: :left_outer))
1107
+ end
1108
+
1109
+ test("left_outputs: & right_outputs:") do
1110
+ table1 = Arrow::Table.new(key: [1, 2, 3],
1111
+ number: [10, 20, 30])
1112
+ table2 = Arrow::Table.new(key: [3, 1],
1113
+ string: ["three", "one"])
1114
+ assert_equal(Arrow::Table.new(key: [1, 3],
1115
+ number: [10, 30],
1116
+ string: ["one", "three"]),
1117
+ table1.join(table2,
1118
+ "key",
1119
+ left_outputs: ["key", "number"],
1120
+ right_outputs: ["string"]))
1121
+ end
1122
+ end
925
1123
  end
@@ -276,6 +276,36 @@ module ValuesBasicArraysTests
276
276
  target = build(Arrow::Decimal256Array.new(data_type, values))
277
277
  assert_equal(values, target.values)
278
278
  end
279
+
280
+ def test_month_interval
281
+ values = [
282
+ 1,
283
+ nil,
284
+ 12,
285
+ ]
286
+ target = build(Arrow::MonthIntervalArray.new(values))
287
+ assert_equal(values, target.values)
288
+ end
289
+
290
+ def test_day_time_interval
291
+ values = [
292
+ {day: 1, millisecond: 100},
293
+ nil,
294
+ {day: 2, millisecond: 300},
295
+ ]
296
+ target = build(Arrow::DayTimeIntervalArray.new(values))
297
+ assert_equal(values, target.values)
298
+ end
299
+
300
+ def test_month_day_nano_interval
301
+ values = [
302
+ {month: 1, day: 1, nanosecond: 100},
303
+ nil,
304
+ {month: 2, day: 3, nanosecond: 400},
305
+ ]
306
+ target = build(Arrow::MonthDayNanoIntervalArray.new(values))
307
+ assert_equal(values, target.values)
308
+ end
279
309
  end
280
310
 
281
311
  class ValuesArrayBasicArraysTest < Test::Unit::TestCase
@@ -347,6 +347,33 @@ module ValuesDenseUnionArrayTests
347
347
  assert_equal(values, target.values)
348
348
  end
349
349
 
350
+ def test_month_interval
351
+ values = [
352
+ {"0" => 1},
353
+ {"1" => nil},
354
+ ]
355
+ target = build(:month_interval, values)
356
+ assert_equal(values, target.values)
357
+ end
358
+
359
+ def test_day_time_interval
360
+ values = [
361
+ {"0" => {day: 1, millisecond: 100}},
362
+ {"1" => nil},
363
+ ]
364
+ target = build(:day_time_interval, values)
365
+ assert_equal(values, target.values)
366
+ end
367
+
368
+ def test_month_day_nano_interval
369
+ values = [
370
+ {"0" => {month: 1, day: 1, nanosecond: 100}},
371
+ {"1" => nil},
372
+ ]
373
+ target = build(:month_day_nano_interval, values)
374
+ assert_equal(values, target.values)
375
+ end
376
+
350
377
  def test_list
351
378
  values = [
352
379
  {"0" => [true, nil, false]},