red-arrow 0.15.0 → 1.0.0

This diff compares the contents of two publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their respective public registries.
Files changed (59)
  1. checksums.yaml +4 -4
  2. data/Rakefile +28 -16
  3. data/ext/arrow/converters.hpp +63 -33
  4. data/ext/arrow/raw-records.cpp +2 -1
  5. data/ext/arrow/values.cpp +2 -1
  6. data/lib/arrow/array-builder.rb +101 -52
  7. data/lib/arrow/array.rb +28 -10
  8. data/lib/arrow/{binary-array-builder.rb → buffer.rb} +7 -15
  9. data/lib/arrow/chunked-array.rb +2 -0
  10. data/lib/arrow/csv-loader.rb +15 -3
  11. data/lib/arrow/csv-read-options.rb +18 -0
  12. data/lib/arrow/data-type.rb +35 -2
  13. data/lib/arrow/decimal128-array-builder.rb +0 -2
  14. data/lib/arrow/dictionary-array.rb +24 -0
  15. data/lib/arrow/field.rb +1 -1
  16. data/lib/arrow/generic-filterable.rb +43 -0
  17. data/lib/arrow/generic-takeable.rb +38 -0
  18. data/lib/arrow/list-data-type.rb +58 -8
  19. data/lib/arrow/loader.rb +12 -1
  20. data/lib/arrow/null-array-builder.rb +1 -1
  21. data/lib/arrow/null-array.rb +24 -0
  22. data/lib/arrow/raw-table-converter.rb +47 -0
  23. data/lib/arrow/record-batch-iterator.rb +22 -0
  24. data/lib/arrow/record-batch.rb +8 -3
  25. data/lib/arrow/schema.rb +5 -2
  26. data/lib/arrow/struct-array-builder.rb +13 -7
  27. data/lib/arrow/struct-data-type.rb +0 -2
  28. data/lib/arrow/table-loader.rb +29 -6
  29. data/lib/arrow/table-saver.rb +37 -13
  30. data/lib/arrow/table.rb +20 -73
  31. data/lib/arrow/version.rb +1 -1
  32. data/red-arrow.gemspec +4 -2
  33. data/test/helper.rb +1 -0
  34. data/test/helper/omittable.rb +36 -0
  35. data/test/raw-records/test-dense-union-array.rb +1 -34
  36. data/test/raw-records/test-sparse-union-array.rb +1 -33
  37. data/test/run-test.rb +14 -3
  38. data/test/test-array-builder.rb +17 -0
  39. data/test/test-array.rb +104 -0
  40. data/test/test-buffer.rb +11 -0
  41. data/test/test-chunked-array.rb +96 -0
  42. data/test/test-csv-loader.rb +77 -2
  43. data/test/test-data-type.rb +11 -0
  44. data/test/test-dense-union-data-type.rb +2 -2
  45. data/test/test-dictionary-array.rb +41 -0
  46. data/test/test-feather.rb +21 -6
  47. data/test/test-list-data-type.rb +27 -1
  48. data/test/test-null-array.rb +23 -0
  49. data/test/test-record-batch-iterator.rb +37 -0
  50. data/test/test-record-batch.rb +14 -0
  51. data/test/test-schema.rb +16 -0
  52. data/test/test-slicer.rb +74 -30
  53. data/test/test-sparse-union-data-type.rb +2 -2
  54. data/test/test-struct-array-builder.rb +8 -4
  55. data/test/test-table.rb +153 -14
  56. data/test/test-timestamp-array.rb +19 -0
  57. data/test/values/test-dense-union-array.rb +1 -34
  58. data/test/values/test-sparse-union-array.rb +1 -33
  59. metadata +76 -63
@@ -54,11 +54,12 @@ class TableTest < Test::Unit::TestCase
54
54
  target_rows = Arrow::BooleanArray.new(target_rows_raw)
55
55
  assert_equal(<<-TABLE, @table.slice(target_rows).to_s)
56
56
  count visible
57
- 0 2 false
58
- 1 4
59
- 2 16 true
60
- 3 64
61
- 4 128
57
+ 0
58
+ 1 2 false
59
+ 2 4
60
+ 3 16 true
61
+ 4 64
62
+ 5 128
62
63
  TABLE
63
64
  end
64
65
 
@@ -66,11 +67,12 @@ class TableTest < Test::Unit::TestCase
66
67
  target_rows_raw = [nil, true, true, false, true, false, true, true]
67
68
  assert_equal(<<-TABLE, @table.slice(target_rows_raw).to_s)
68
69
  count visible
69
- 0 2 false
70
- 1 4
71
- 2 16 true
72
- 3 64
73
- 4 128
70
+ 0
71
+ 1 2 false
72
+ 2 4
73
+ 3 16 true
74
+ 4 64
75
+ 5 128
74
76
  TABLE
75
77
  end
76
78
 
@@ -436,12 +438,24 @@ class TableTest < Test::Unit::TestCase
436
438
  assert_equal(@table, Arrow::Table.load(output))
437
439
  end
438
440
 
441
+ def test_arrow_file
442
+ output = create_output(".arrow")
443
+ @table.save(output, format: :arrow_file)
444
+ assert_equal(@table, Arrow::Table.load(output, format: :arrow_file))
445
+ end
446
+
439
447
  def test_batch
440
448
  output = create_output(".arrow")
441
449
  @table.save(output, format: :batch)
442
450
  assert_equal(@table, Arrow::Table.load(output, format: :batch))
443
451
  end
444
452
 
453
+ def test_arrow_streaming
454
+ output = create_output(".arrow")
455
+ @table.save(output, format: :arrow_streaming)
456
+ assert_equal(@table, Arrow::Table.load(output, format: :arrow_streaming))
457
+ end
458
+
445
459
  def test_stream
446
460
  output = create_output(".arrow")
447
461
  @table.save(output, format: :stream)
@@ -468,6 +482,15 @@ class TableTest < Test::Unit::TestCase
468
482
  compression: :gzip,
469
483
  schema: @table.schema))
470
484
  end
485
+
486
+ def test_tsv
487
+ output = create_output(".tsv")
488
+ @table.save(output, format: :tsv)
489
+ assert_equal(@table,
490
+ Arrow::Table.load(output,
491
+ format: :tsv,
492
+ schema: @table.schema))
493
+ end
471
494
  end
472
495
 
473
496
  sub_test_case("path") do
@@ -498,18 +521,27 @@ class TableTest < Test::Unit::TestCase
498
521
  compression: :gzip,
499
522
  schema: @table.schema))
500
523
  end
524
+
525
+ test("tsv") do
526
+ output = create_output(".tsv")
527
+ @table.save(output)
528
+ assert_equal(@table,
529
+ Arrow::Table.load(output,
530
+ format: :tsv,
531
+ schema: @table.schema))
532
+ end
501
533
  end
502
534
 
503
535
  sub_test_case("load: auto detect") do
504
- test("batch") do
536
+ test("arrow: file") do
505
537
  output = create_output(".arrow")
506
- @table.save(output, format: :batch)
538
+ @table.save(output, format: :arrow_file)
507
539
  assert_equal(@table, Arrow::Table.load(output))
508
540
  end
509
541
 
510
- test("stream") do
542
+ test("arrow: streaming") do
511
543
  output = create_output(".arrow")
512
- @table.save(output, format: :stream)
544
+ @table.save(output, format: :arrow_streaming)
513
545
  assert_equal(@table, Arrow::Table.load(output))
514
546
  end
515
547
 
@@ -539,6 +571,24 @@ chris,-1
539
571
  name score
540
572
  0 alice 10
541
573
  1 bob 29
574
+ 2 chris -1
575
+ TABLE
576
+ end
577
+
578
+ test("tsv") do
579
+ file = Tempfile.new(["red-arrow", ".tsv"])
580
+ file.puts(<<-TSV)
581
+ name\tscore
582
+ alice\t10
583
+ bob\t29
584
+ chris\t-1
585
+ TSV
586
+ file.close
587
+ table = Arrow::Table.load(file.path)
588
+ assert_equal(<<-TABLE, table.to_s)
589
+ name score
590
+ 0 alice 10
591
+ 1 bob 29
542
592
  2 chris -1
543
593
  TABLE
544
594
  end
@@ -646,4 +696,93 @@ visible: false
646
696
  end
647
697
  end
648
698
  end
699
+
700
+ sub_test_case("#filter") do
701
+ def setup
702
+ super
703
+ @options = Arrow::FilterOptions.new
704
+ @options.null_selection_behavior = :emit_null
705
+ end
706
+
707
+ test("Array: boolean") do
708
+ filter = [nil, true, true, false, true, false, true, true]
709
+ assert_equal(<<-TABLE, @table.filter(filter, @options).to_s)
710
+ count visible
711
+ 0
712
+ 1 2 false
713
+ 2 4
714
+ 3 16 true
715
+ 4 64
716
+ 5 128
717
+ TABLE
718
+ end
719
+
720
+ test("Arrow::BooleanArray") do
721
+ array = [nil, true, true, false, true, false, true, true]
722
+ filter = Arrow::BooleanArray.new(array)
723
+ assert_equal(<<-TABLE, @table.filter(filter, @options).to_s)
724
+ count visible
725
+ 0
726
+ 1 2 false
727
+ 2 4
728
+ 3 16 true
729
+ 4 64
730
+ 5 128
731
+ TABLE
732
+ end
733
+
734
+ test("Arrow::ChunkedArray") do
735
+ filter_chunks = [
736
+ Arrow::BooleanArray.new([nil, true, true]),
737
+ Arrow::BooleanArray.new([false, true, false]),
738
+ Arrow::BooleanArray.new([true, true]),
739
+ ]
740
+ filter = Arrow::ChunkedArray.new(filter_chunks)
741
+ assert_equal(<<-TABLE, @table.filter(filter, @options).to_s)
742
+ count visible
743
+ 0
744
+ 1 2 false
745
+ 2 4
746
+ 3 16 true
747
+ 4 64
748
+ 5 128
749
+ TABLE
750
+ end
751
+ end
752
+
753
+ sub_test_case("#take") do
754
+ test("Arrow: boolean") do
755
+ indices = [1, 0, 2]
756
+ assert_equal(<<-TABLE, @table.take(indices).to_s)
757
+ count visible
758
+ 0 2 false
759
+ 1 1 true
760
+ 2 4
761
+ TABLE
762
+ end
763
+
764
+ test("Arrow::Array") do
765
+ indices = Arrow::Int16Array.new([1, 0, 2])
766
+ assert_equal(<<-TABLE, @table.take(indices).to_s)
767
+ count visible
768
+ 0 2 false
769
+ 1 1 true
770
+ 2 4
771
+ TABLE
772
+ end
773
+
774
+ test("Arrow::ChunkedArray") do
775
+ chunks = [
776
+ Arrow::Int16Array.new([1, 0]),
777
+ Arrow::Int16Array.new([2])
778
+ ]
779
+ indices = Arrow::ChunkedArray.new(chunks)
780
+ assert_equal(<<-TABLE, @table.take(indices).to_s)
781
+ count visible
782
+ 0 2 false
783
+ 1 1 true
784
+ 2 4
785
+ TABLE
786
+ end
787
+ end
649
788
  end
@@ -23,4 +23,23 @@ class TimestampArrayTest < Test::Unit::TestCase
23
23
  time = Time.at(sec, usec)
24
24
  assert_equal(time, array[0])
25
25
  end
26
+
27
+ sub_test_case("#is_in") do
28
+ def setup
29
+ values = [
30
+ Time.parse("2019-11-18T00:09:11"),
31
+ Time.parse("2019-11-18T00:09:12"),
32
+ Time.parse("2019-11-18T00:09:13"),
33
+ ]
34
+ @array = Arrow::TimestampArray.new(:micro, values)
35
+ end
36
+
37
+ test("Arrow: Array") do
38
+ right = [
39
+ Time.parse("2019-11-18T00:09:12"),
40
+ ]
41
+ assert_equal(Arrow::BooleanArray.new([false, true, false]),
42
+ @array.is_in(right))
43
+ end
44
+ end
26
45
  end
@@ -48,10 +48,7 @@ module ValuesDenseUnionArrayTests
48
48
  sub_record_batch.columns[0].data
49
49
  end
50
50
  values.each do |value|
51
- if value.nil?
52
- type_ids << nil
53
- offsets << 0
54
- elsif value.key?("0")
51
+ if value.key?("0")
55
52
  type_id = type_codes[0]
56
53
  type_ids << type_id
57
54
  offsets << (type_ids.count(type_id) - 1)
@@ -70,7 +67,6 @@ module ValuesDenseUnionArrayTests
70
67
  def test_null
71
68
  values = [
72
69
  {"0" => nil},
73
- nil,
74
70
  ]
75
71
  target = build(:null, values)
76
72
  assert_equal(values, target.values)
@@ -79,7 +75,6 @@ module ValuesDenseUnionArrayTests
79
75
  def test_boolean
80
76
  values = [
81
77
  {"0" => true},
82
- nil,
83
78
  {"1" => nil},
84
79
  ]
85
80
  target = build(:boolean, values)
@@ -89,7 +84,6 @@ module ValuesDenseUnionArrayTests
89
84
  def test_int8
90
85
  values = [
91
86
  {"0" => -(2 ** 7)},
92
- nil,
93
87
  {"1" => nil},
94
88
  ]
95
89
  target = build(:int8, values)
@@ -99,7 +93,6 @@ module ValuesDenseUnionArrayTests
99
93
  def test_uint8
100
94
  values = [
101
95
  {"0" => (2 ** 8) - 1},
102
- nil,
103
96
  {"1" => nil},
104
97
  ]
105
98
  target = build(:uint8, values)
@@ -109,7 +102,6 @@ module ValuesDenseUnionArrayTests
109
102
  def test_int16
110
103
  values = [
111
104
  {"0" => -(2 ** 15)},
112
- nil,
113
105
  {"1" => nil},
114
106
  ]
115
107
  target = build(:int16, values)
@@ -119,7 +111,6 @@ module ValuesDenseUnionArrayTests
119
111
  def test_uint16
120
112
  values = [
121
113
  {"0" => (2 ** 16) - 1},
122
- nil,
123
114
  {"1" => nil},
124
115
  ]
125
116
  target = build(:uint16, values)
@@ -129,7 +120,6 @@ module ValuesDenseUnionArrayTests
129
120
  def test_int32
130
121
  values = [
131
122
  {"0" => -(2 ** 31)},
132
- nil,
133
123
  {"1" => nil},
134
124
  ]
135
125
  target = build(:int32, values)
@@ -139,7 +129,6 @@ module ValuesDenseUnionArrayTests
139
129
  def test_uint32
140
130
  values = [
141
131
  {"0" => (2 ** 32) - 1},
142
- nil,
143
132
  {"1" => nil},
144
133
  ]
145
134
  target = build(:uint32, values)
@@ -149,7 +138,6 @@ module ValuesDenseUnionArrayTests
149
138
  def test_int64
150
139
  values = [
151
140
  {"0" => -(2 ** 63)},
152
- nil,
153
141
  {"1" => nil},
154
142
  ]
155
143
  target = build(:int64, values)
@@ -159,7 +147,6 @@ module ValuesDenseUnionArrayTests
159
147
  def test_uint64
160
148
  values = [
161
149
  {"0" => (2 ** 64) - 1},
162
- nil,
163
150
  {"1" => nil},
164
151
  ]
165
152
  target = build(:uint64, values)
@@ -169,7 +156,6 @@ module ValuesDenseUnionArrayTests
169
156
  def test_float
170
157
  values = [
171
158
  {"0" => -1.0},
172
- nil,
173
159
  {"1" => nil},
174
160
  ]
175
161
  target = build(:float, values)
@@ -179,7 +165,6 @@ module ValuesDenseUnionArrayTests
179
165
  def test_double
180
166
  values = [
181
167
  {"0" => -1.0},
182
- nil,
183
168
  {"1" => nil},
184
169
  ]
185
170
  target = build(:double, values)
@@ -189,7 +174,6 @@ module ValuesDenseUnionArrayTests
189
174
  def test_binary
190
175
  values = [
191
176
  {"0" => "\xff".b},
192
- nil,
193
177
  {"1" => nil},
194
178
  ]
195
179
  target = build(:binary, values)
@@ -199,7 +183,6 @@ module ValuesDenseUnionArrayTests
199
183
  def test_string
200
184
  values = [
201
185
  {"0" => "Ruby"},
202
- nil,
203
186
  {"1" => nil},
204
187
  ]
205
188
  target = build(:string, values)
@@ -209,7 +192,6 @@ module ValuesDenseUnionArrayTests
209
192
  def test_date32
210
193
  values = [
211
194
  {"0" => Date.new(1960, 1, 1)},
212
- nil,
213
195
  {"1" => nil},
214
196
  ]
215
197
  target = build(:date32, values)
@@ -219,7 +201,6 @@ module ValuesDenseUnionArrayTests
219
201
  def test_date64
220
202
  values = [
221
203
  {"0" => DateTime.new(1960, 1, 1, 2, 9, 30)},
222
- nil,
223
204
  {"1" => nil},
224
205
  ]
225
206
  target = build(:date64, values)
@@ -229,7 +210,6 @@ module ValuesDenseUnionArrayTests
229
210
  def test_timestamp_second
230
211
  values = [
231
212
  {"0" => Time.parse("1960-01-01T02:09:30Z")},
232
- nil,
233
213
  {"1" => nil},
234
214
  ]
235
215
  target = build({
@@ -243,7 +223,6 @@ module ValuesDenseUnionArrayTests
243
223
  def test_timestamp_milli
244
224
  values = [
245
225
  {"0" => Time.parse("1960-01-01T02:09:30.123Z")},
246
- nil,
247
226
  {"1" => nil},
248
227
  ]
249
228
  target = build({
@@ -257,7 +236,6 @@ module ValuesDenseUnionArrayTests
257
236
  def test_timestamp_micro
258
237
  values = [
259
238
  {"0" => Time.parse("1960-01-01T02:09:30.123456Z")},
260
- nil,
261
239
  {"1" => nil},
262
240
  ]
263
241
  target = build({
@@ -271,7 +249,6 @@ module ValuesDenseUnionArrayTests
271
249
  def test_timestamp_nano
272
250
  values = [
273
251
  {"0" => Time.parse("1960-01-01T02:09:30.123456789Z")},
274
- nil,
275
252
  {"1" => nil},
276
253
  ]
277
254
  target = build({
@@ -287,7 +264,6 @@ module ValuesDenseUnionArrayTests
287
264
  values = [
288
265
  # 00:10:00
289
266
  {"0" => Arrow::Time.new(unit, 60 * 10)},
290
- nil,
291
267
  {"1" => nil},
292
268
  ]
293
269
  target = build({
@@ -303,7 +279,6 @@ module ValuesDenseUnionArrayTests
303
279
  values = [
304
280
  # 00:10:00.123
305
281
  {"0" => Arrow::Time.new(unit, (60 * 10) * 1000 + 123)},
306
- nil,
307
282
  {"1" => nil},
308
283
  ]
309
284
  target = build({
@@ -319,7 +294,6 @@ module ValuesDenseUnionArrayTests
319
294
  values = [
320
295
  # 00:10:00.123456
321
296
  {"0" => Arrow::Time.new(unit, (60 * 10) * 1_000_000 + 123_456)},
322
- nil,
323
297
  {"1" => nil},
324
298
  ]
325
299
  target = build({
@@ -335,7 +309,6 @@ module ValuesDenseUnionArrayTests
335
309
  values = [
336
310
  # 00:10:00.123456789
337
311
  {"0" => Arrow::Time.new(unit, (60 * 10) * 1_000_000_000 + 123_456_789)},
338
- nil,
339
312
  {"1" => nil},
340
313
  ]
341
314
  target = build({
@@ -349,7 +322,6 @@ module ValuesDenseUnionArrayTests
349
322
  def test_decimal128
350
323
  values = [
351
324
  {"0" => BigDecimal("92.92")},
352
- nil,
353
325
  {"1" => nil},
354
326
  ]
355
327
  target = build({
@@ -364,7 +336,6 @@ module ValuesDenseUnionArrayTests
364
336
  def test_list
365
337
  values = [
366
338
  {"0" => [true, nil, false]},
367
- nil,
368
339
  {"1" => nil},
369
340
  ]
370
341
  target = build({
@@ -381,7 +352,6 @@ module ValuesDenseUnionArrayTests
381
352
  def test_struct
382
353
  values = [
383
354
  {"0" => {"sub_field" => true}},
384
- nil,
385
355
  {"1" => nil},
386
356
  {"0" => {"sub_field" => nil}},
387
357
  ]
@@ -402,7 +372,6 @@ module ValuesDenseUnionArrayTests
402
372
  omit("Need to add support for SparseUnionArrayBuilder")
403
373
  values = [
404
374
  {"0" => {"field1" => true}},
405
- nil,
406
375
  {"1" => nil},
407
376
  {"0" => {"field2" => nil}},
408
377
  ]
@@ -428,7 +397,6 @@ module ValuesDenseUnionArrayTests
428
397
  omit("Need to add support for DenseUnionArrayBuilder")
429
398
  values = [
430
399
  {"0" => {"field1" => true}},
431
- nil,
432
400
  {"1" => nil},
433
401
  {"0" => {"field2" => nil}},
434
402
  ]
@@ -454,7 +422,6 @@ module ValuesDenseUnionArrayTests
454
422
  omit("Need to add support for DictionaryArrayBuilder")
455
423
  values = [
456
424
  {"0" => "Ruby"},
457
- nil,
458
425
  {"1" => nil},
459
426
  {"0" => "GLib"},
460
427
  ]