red-arrow 0.15.0 → 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (59)
  1. checksums.yaml +4 -4
  2. data/Rakefile +28 -16
  3. data/ext/arrow/converters.hpp +63 -33
  4. data/ext/arrow/raw-records.cpp +2 -1
  5. data/ext/arrow/values.cpp +2 -1
  6. data/lib/arrow/array-builder.rb +101 -52
  7. data/lib/arrow/array.rb +28 -10
  8. data/lib/arrow/{binary-array-builder.rb → buffer.rb} +7 -15
  9. data/lib/arrow/chunked-array.rb +2 -0
  10. data/lib/arrow/csv-loader.rb +15 -3
  11. data/lib/arrow/csv-read-options.rb +18 -0
  12. data/lib/arrow/data-type.rb +35 -2
  13. data/lib/arrow/decimal128-array-builder.rb +0 -2
  14. data/lib/arrow/dictionary-array.rb +24 -0
  15. data/lib/arrow/field.rb +1 -1
  16. data/lib/arrow/generic-filterable.rb +43 -0
  17. data/lib/arrow/generic-takeable.rb +38 -0
  18. data/lib/arrow/list-data-type.rb +58 -8
  19. data/lib/arrow/loader.rb +12 -1
  20. data/lib/arrow/null-array-builder.rb +1 -1
  21. data/lib/arrow/null-array.rb +24 -0
  22. data/lib/arrow/raw-table-converter.rb +47 -0
  23. data/lib/arrow/record-batch-iterator.rb +22 -0
  24. data/lib/arrow/record-batch.rb +8 -3
  25. data/lib/arrow/schema.rb +5 -2
  26. data/lib/arrow/struct-array-builder.rb +13 -7
  27. data/lib/arrow/struct-data-type.rb +0 -2
  28. data/lib/arrow/table-loader.rb +29 -6
  29. data/lib/arrow/table-saver.rb +37 -13
  30. data/lib/arrow/table.rb +20 -73
  31. data/lib/arrow/version.rb +1 -1
  32. data/red-arrow.gemspec +4 -2
  33. data/test/helper.rb +1 -0
  34. data/test/helper/omittable.rb +36 -0
  35. data/test/raw-records/test-dense-union-array.rb +1 -34
  36. data/test/raw-records/test-sparse-union-array.rb +1 -33
  37. data/test/run-test.rb +14 -3
  38. data/test/test-array-builder.rb +17 -0
  39. data/test/test-array.rb +104 -0
  40. data/test/test-buffer.rb +11 -0
  41. data/test/test-chunked-array.rb +96 -0
  42. data/test/test-csv-loader.rb +77 -2
  43. data/test/test-data-type.rb +11 -0
  44. data/test/test-dense-union-data-type.rb +2 -2
  45. data/test/test-dictionary-array.rb +41 -0
  46. data/test/test-feather.rb +21 -6
  47. data/test/test-list-data-type.rb +27 -1
  48. data/test/test-null-array.rb +23 -0
  49. data/test/test-record-batch-iterator.rb +37 -0
  50. data/test/test-record-batch.rb +14 -0
  51. data/test/test-schema.rb +16 -0
  52. data/test/test-slicer.rb +74 -30
  53. data/test/test-sparse-union-data-type.rb +2 -2
  54. data/test/test-struct-array-builder.rb +8 -4
  55. data/test/test-table.rb +153 -14
  56. data/test/test-timestamp-array.rb +19 -0
  57. data/test/values/test-dense-union-array.rb +1 -34
  58. data/test/values/test-sparse-union-array.rb +1 -33
  59. metadata +76 -63
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 65ddeae926378c460b0945ff6949ecbf3ee911611cdcb95bf21f3cf3765efe6d
4
- data.tar.gz: 193bce59b05b836fb5a5d6d8b650ea9adf441bf04beda8a6083633692c796957
3
+ metadata.gz: a45e003f0a453f175b1dbfc81c1fcf092cbfe964dd43e44d7b16e2087834ef5d
4
+ data.tar.gz: 5d1dc1d87a821d1ac4c49603d1d92d723230eb1f8a7cdd540dafe3715a53c12e
5
5
  SHA512:
6
- metadata.gz: b502a0bc9f65b24d04d9f5ee1f58e7eb5db27e885d756158c249fba8be843af7f1ef12c234110cc7136d67f5e998ceb726a7ecf8688b653ba84ec207f7308e78
7
- data.tar.gz: 1d8f8798f582fdb8b4e0e566d52407b3303391c4c43deef9471dc0011c9a377f83fca308b230709ef0b332ce83892a1cd967096531b4701382d4ed40066b216c
6
+ metadata.gz: 85329e05ae20268d44a9ffff5fb3278263e5b77deda6f03fda31595b117252b7970fd17328ce2650c8650895a1813a1fb95799d3331a312bce2c276f3e5d55a4
7
+ data.tar.gz: fe90b0ff0dfb9b9126765818d54f7d4d203e6c72ab53dfb2680a48dc474fef09b8ed3f0553f3c1fe110d500d625ab2300563c3e4169403326c63c6bf27f33a5f
data/Rakefile CHANGED
@@ -30,36 +30,44 @@ spec = helper.gemspec
30
30
  release_task = Rake::Task["release"]
31
31
  release_task.prerequisites.replace(["build", "release:rubygem_push"])
32
32
 
33
- def run_extconf(extension_dir, *arguments)
34
- cd(extension_dir) do
35
- ruby("extconf.rb", *arguments)
33
+ def run_extconf(build_dir, extension_dir, *arguments)
34
+ cd(build_dir) do
35
+ ruby(File.join(extension_dir, "extconf.rb"),
36
+ *arguments)
36
37
  end
37
38
  end
38
39
 
39
40
  spec.extensions.each do |extension|
40
- extension_dir = File.dirname(extension)
41
- CLOBBER << File.join(extension_dir, "Makefile")
42
- CLOBBER << File.join(extension_dir, "mkmf.log")
41
+ extension_dir = File.join(base_dir, File.dirname(extension))
42
+ build_dir = ENV["BUILD_DIR"]
43
+ if build_dir
44
+ build_dir = File.join(build_dir, "red-arrow")
45
+ directory build_dir
46
+ else
47
+ build_dir = extension_dir
48
+ end
49
+ CLOBBER << File.join(build_dir, "Makefile")
50
+ CLOBBER << File.join(build_dir, "mkmf.log")
43
51
 
44
- makefile = File.join(extension_dir, "Makefile")
45
- file makefile do
46
- run_extconf(extension_dir)
52
+ makefile = File.join(build_dir, "Makefile")
53
+ file makefile => build_dir do
54
+ run_extconf(build_dir, extension_dir)
47
55
  end
48
56
 
49
57
  desc "Configure"
50
- task :configure do
51
- run_extconf(extension_dir)
58
+ task :configure => build_dir do
59
+ run_extconf(build_dir, extension_dir)
52
60
  end
53
61
 
54
62
  desc "Compile"
55
63
  task :compile => makefile do
56
- cd(extension_dir) do
64
+ cd(build_dir) do
57
65
  sh("make")
58
66
  end
59
67
  end
60
68
 
61
69
  task :clean do
62
- cd(extension_dir) do
70
+ cd(build_dir) do
63
71
  sh("make", "clean") if File.exist?("Makefile")
64
72
  end
65
73
  end
@@ -67,7 +75,9 @@ end
67
75
 
68
76
  desc "Run tests"
69
77
  task :test do
70
- ruby("test/run-test.rb")
78
+ cd(base_dir) do
79
+ ruby("test/run-test.rb")
80
+ end
71
81
  end
72
82
 
73
83
  task default: :test
@@ -79,8 +89,10 @@ task :benchmark do
79
89
  else
80
90
  FileList["benchmark/{,*/**/}*.yml"]
81
91
  end
82
- benchmarks.each do |benchmark|
83
- sh("benchmark-driver", benchmark)
92
+ cd(base_dir) do
93
+ benchmarks.each do |benchmark|
94
+ sh("benchmark-driver", benchmark)
95
+ end
84
96
  end
85
97
  end
86
98
 
data/ext/arrow/converters.hpp CHANGED
@@ -285,7 +285,8 @@ namespace red_arrow {
285
285
  // VISIT(Interval)
286
286
  VISIT(List)
287
287
  VISIT(Struct)
288
- VISIT(Union)
288
+ VISIT(SparseUnion)
289
+ VISIT(DenseUnion)
289
290
  VISIT(Dictionary)
290
291
  VISIT(Decimal128)
291
292
  // TODO
@@ -339,9 +340,9 @@ namespace red_arrow {
339
340
  index_ = index;
340
341
  result_ = rb_hash_new();
341
342
  const auto struct_type = array.struct_type();
342
- const auto n = struct_type->num_children();
343
+ const auto n = struct_type->num_fields();
343
344
  for (int i = 0; i < n; ++i) {
344
- const auto field_type = struct_type->child(i).get();
345
+ const auto field_type = struct_type->field(i).get();
345
346
  const auto& field_name = field_type->name();
346
347
  auto key_keep = key_;
347
348
  key_ = rb_utf8_str_new(field_name.data(), field_name.length());
@@ -388,7 +389,8 @@ namespace red_arrow {
388
389
  // VISIT(Interval)
389
390
  VISIT(List)
390
391
  VISIT(Struct)
391
- VISIT(Union)
392
+ VISIT(SparseUnion)
393
+ VISIT(DenseUnion)
392
394
  VISIT(Dictionary)
393
395
  VISIT(Decimal128)
394
396
  // TODO
@@ -432,10 +434,10 @@ namespace red_arrow {
432
434
  index_ = index;
433
435
  switch (array.mode()) {
434
436
  case arrow::UnionMode::SPARSE:
435
- convert_sparse(array);
437
+ convert_sparse(static_cast<const arrow::SparseUnionArray&>(array));
436
438
  break;
437
439
  case arrow::UnionMode::DENSE:
438
- convert_dense(array);
440
+ convert_dense(static_cast<const arrow::DenseUnionArray&>(array));
439
441
  break;
440
442
  default:
441
443
  rb_raise(rb_eArgError, "Invalid union mode");
@@ -479,7 +481,8 @@ namespace red_arrow {
479
481
  // VISIT(Interval)
480
482
  VISIT(List)
481
483
  VISIT(Struct)
482
- VISIT(Union)
484
+ VISIT(SparseUnion)
485
+ VISIT(DenseUnion)
483
486
  VISIT(Dictionary)
484
487
  VISIT(Decimal128)
485
488
  // TODO
@@ -501,48 +504,48 @@ namespace red_arrow {
501
504
  result_ = result;
502
505
  }
503
506
 
504
- uint8_t compute_child_index(const arrow::UnionArray& array,
507
+ uint8_t compute_field_index(const arrow::UnionArray& array,
505
508
  arrow::UnionType* type,
506
509
  const char* tag) {
507
- const auto type_id = array.raw_type_ids()[index_];
508
- const auto& type_codes = type->type_codes();
509
- for (uint8_t i = 0; i < type_codes.size(); ++i) {
510
- if (type_codes[i] == type_id) {
511
- return i;
510
+ const auto type_code = array.raw_type_codes()[index_];
511
+ if (type_code >= 0 && type_code <= arrow::UnionType::kMaxTypeCode) {
512
+ const auto field_id = type->child_ids()[type_code];
513
+ if (field_id >= 0) {
514
+ return field_id;
512
515
  }
513
516
  }
514
- check_status(arrow::Status::Invalid("Unknown type ID: ", type_id),
517
+ check_status(arrow::Status::Invalid("Unknown type ID: ", type_code),
515
518
  tag);
516
519
  return 0;
517
520
  }
518
521
 
519
- void convert_sparse(const arrow::UnionArray& array) {
522
+ void convert_sparse(const arrow::SparseUnionArray& array) {
520
523
  const auto type =
521
524
  std::static_pointer_cast<arrow::UnionType>(array.type()).get();
522
525
  const auto tag = "[raw-records][union-sparse-array]";
523
- const auto child_index = compute_child_index(array, type, tag);
524
- const auto child_field = type->child(child_index).get();
525
- const auto& field_name = child_field->name();
526
+ const auto index = compute_field_index(array, type, tag);
527
+ const auto field = type->field(index).get();
528
+ const auto& field_name = field->name();
526
529
  const auto field_name_keep = field_name_;
527
530
  field_name_ = rb_utf8_str_new(field_name.data(), field_name.length());
528
- const auto child_array = array.child(child_index).get();
529
- check_status(child_array->Accept(this), tag);
531
+ const auto field_array = array.field(index).get();
532
+ check_status(field_array->Accept(this), tag);
530
533
  field_name_ = field_name_keep;
531
534
  }
532
535
 
533
- void convert_dense(const arrow::UnionArray& array) {
536
+ void convert_dense(const arrow::DenseUnionArray& array) {
534
537
  const auto type =
535
538
  std::static_pointer_cast<arrow::UnionType>(array.type()).get();
536
539
  const auto tag = "[raw-records][union-dense-array]";
537
- const auto child_index = compute_child_index(array, type, tag);
538
- const auto child_field = type->child(child_index).get();
539
- const auto& field_name = child_field->name();
540
+ const auto index = compute_field_index(array, type, tag);
541
+ const auto field = type->field(index).get();
542
+ const auto& field_name = field->name();
540
543
  const auto field_name_keep = field_name_;
541
544
  field_name_ = rb_utf8_str_new(field_name.data(), field_name.length());
542
- const auto child_array = array.child(child_index);
545
+ const auto field_array = array.field(index);
543
546
  const auto index_keep = index_;
544
547
  index_ = array.value_offset(index_);
545
- check_status(child_array->Accept(this), tag);
548
+ check_status(field_array->Accept(this), tag);
546
549
  index_ = index_keep;
547
550
  field_name_ = field_name_keep;
548
551
  }
@@ -557,30 +560,57 @@ namespace red_arrow {
557
560
  public:
558
561
  explicit DictionaryArrayValueConverter(ArrayValueConverter* converter)
559
562
  : array_value_converter_(converter),
560
- index_(0),
563
+ value_index_(0),
561
564
  result_(Qnil) {
562
565
  }
563
566
 
564
567
  VALUE convert(const arrow::DictionaryArray& array,
565
568
  const int64_t index) {
566
- index_ = index;
567
- auto indices = array.indices().get();
568
- check_status(indices->Accept(this),
569
+ value_index_ = array.GetValueIndex(index);
570
+ auto dictionary = array.dictionary().get();
571
+ check_status(dictionary->Accept(this),
569
572
  "[raw-records][dictionary-array]");
570
573
  return result_;
571
574
  }
572
575
 
573
- // TODO: Convert to real value.
574
576
  #define VISIT(TYPE) \
575
577
  arrow::Status Visit(const arrow::TYPE ## Array& array) override { \
576
- result_ = convert_value(array, index_); \
578
+ result_ = convert_value(array, value_index_); \
577
579
  return arrow::Status::OK(); \
578
580
  }
579
581
 
582
+ VISIT(Null)
583
+ VISIT(Boolean)
580
584
  VISIT(Int8)
581
585
  VISIT(Int16)
582
586
  VISIT(Int32)
583
587
  VISIT(Int64)
588
+ VISIT(UInt8)
589
+ VISIT(UInt16)
590
+ VISIT(UInt32)
591
+ VISIT(UInt64)
592
+ // TODO
593
+ // VISIT(HalfFloat)
594
+ VISIT(Float)
595
+ VISIT(Double)
596
+ VISIT(Binary)
597
+ VISIT(String)
598
+ VISIT(FixedSizeBinary)
599
+ VISIT(Date32)
600
+ VISIT(Date64)
601
+ VISIT(Time32)
602
+ VISIT(Time64)
603
+ VISIT(Timestamp)
604
+ // TODO
605
+ // VISIT(Interval)
606
+ VISIT(List)
607
+ VISIT(Struct)
608
+ VISIT(SparseUnion)
609
+ VISIT(DenseUnion)
610
+ VISIT(Dictionary)
611
+ VISIT(Decimal128)
612
+ // TODO
613
+ // VISIT(Extension)
584
614
 
585
615
  #undef VISIT
586
616
 
@@ -592,7 +622,7 @@ namespace red_arrow {
592
622
  }
593
623
 
594
624
  ArrayValueConverter* array_value_converter_;
595
- int64_t index_;
625
+ int64_t value_index_;
596
626
  VALUE result_;
597
627
  };
598
628
 
data/ext/arrow/raw-records.cpp CHANGED
@@ -100,7 +100,8 @@ namespace red_arrow {
100
100
  // VISIT(Interval)
101
101
  VISIT(List)
102
102
  VISIT(Struct)
103
- VISIT(Union)
103
+ VISIT(SparseUnion)
104
+ VISIT(DenseUnion)
104
105
  VISIT(Dictionary)
105
106
  VISIT(Decimal128)
106
107
  // TODO
data/ext/arrow/values.cpp CHANGED
@@ -81,7 +81,8 @@ namespace red_arrow {
81
81
  // VISIT(Interval)
82
82
  VISIT(List)
83
83
  VISIT(Struct)
84
- VISIT(Union)
84
+ VISIT(SparseUnion)
85
+ VISIT(DenseUnion)
85
86
  VISIT(Dictionary)
86
87
  VISIT(Decimal128)
87
88
  // TODO
data/lib/arrow/array-builder.rb CHANGED
@@ -26,60 +26,13 @@ module Arrow
26
26
  return builder.build(values)
27
27
  end
28
28
 
29
- builder_class = nil
30
- builder_class_arguments = []
29
+ builder_info = nil
31
30
  values.each do |value|
32
- case value
33
- when nil
34
- # Ignore
35
- when true, false
36
- return BooleanArray.new(values)
37
- when String
38
- return StringArray.new(values)
39
- when Float
40
- return DoubleArray.new(values)
41
- when Integer
42
- if value < 0
43
- builder = IntArrayBuilder.new
44
- return builder.build(values)
45
- else
46
- builder_class = UIntArrayBuilder
47
- builder_class_arguments = []
48
- end
49
- when Time
50
- data_type = value.data_type
51
- case data_type.unit
52
- when TimeUnit::SECOND
53
- if builder.nil?
54
- builder = Time32ArrayBuilder
55
- builder_class_arguments = [data_type]
56
- end
57
- when TimeUnit::MILLI
58
- if builder != Time64ArrayBuilder
59
- builder = Time32ArrayBuilder
60
- builder_class_arguments = [data_type]
61
- end
62
- when TimeUnit::MICRO
63
- builder = Time64ArrayBuilder
64
- builder_class_arguments = [data_type]
65
- when TimeUnit::NANO
66
- builder = Time64ArrayBuilder.new(data_type)
67
- return builder.build(values)
68
- end
69
- when ::Time
70
- data_type = TimestampDataType.new(:nano)
71
- builder = TimestampArrayBuilder.new(data_type)
72
- return builder.build(values)
73
- when DateTime
74
- return Date64Array.new(values)
75
- when Date
76
- return Date32Array.new(values)
77
- else
78
- return StringArray.new(values)
79
- end
31
+ builder_info = detect_builder_info(value, builder_info)
32
+ break if builder_info and builder_info[:detected]
80
33
  end
81
- if builder_class
82
- builder = builder_class.new(*builder_class_arguments)
34
+ if builder_info
35
+ builder = builder_info[:builder]
83
36
  builder.build(values)
84
37
  else
85
38
  Arrow::StringArray.new(values)
@@ -89,6 +42,102 @@ module Arrow
89
42
  def buildable?(args)
90
43
  args.size == method(:build).arity
91
44
  end
45
+
46
+ private
47
+ def detect_builder_info(value, builder_info)
48
+ case value
49
+ when nil
50
+ builder_info
51
+ when true, false
52
+ {
53
+ builder: BooleanArrayBuilder.new,
54
+ detected: true,
55
+ }
56
+ when String
57
+ {
58
+ builder: StringArrayBuilder.new,
59
+ detected: true,
60
+ }
61
+ when Float
62
+ {
63
+ builder: DoubleArrayBuilder.new,
64
+ detected: true,
65
+ }
66
+ when Integer
67
+ if value < 0
68
+ {
69
+ builder: IntArrayBuilder.new,
70
+ detected: true,
71
+ }
72
+ else
73
+ {
74
+ builder: UIntArrayBuilder.new,
75
+ }
76
+ end
77
+ when Time
78
+ data_type = value.data_type
79
+ case data_type.unit
80
+ when TimeUnit::SECOND
81
+ builder_info || {
82
+ builder: Time32ArrayBuilder.new(data_type)
83
+ }
84
+ when TimeUnit::MILLI
85
+ if builder_info and builder_info[:builder].is_a?(Time64ArrayBuilder)
86
+ builder_info
87
+ else
88
+ {
89
+ builder: Time32ArrayBuilder.new(data_type),
90
+ }
91
+ end
92
+ when TimeUnit::MICRO
93
+ {
94
+ builder: Time64ArrayBuilder.new(data_type),
95
+ }
96
+ when TimeUnit::NANO
97
+ {
98
+ builder: Time64ArrayBuilder.new(data_type),
99
+ detected: true
100
+ }
101
+ end
102
+ when ::Time
103
+ data_type = TimestampDataType.new(:nano)
104
+ {
105
+ builder: TimestampArrayBuilder.new(data_type),
106
+ detected: true,
107
+ }
108
+ when DateTime
109
+ {
110
+ builder: Date64ArrayBuilder.new,
111
+ detected: true,
112
+ }
113
+ when Date
114
+ {
115
+ builder: Date32ArrayBuilder.new,
116
+ detected: true,
117
+ }
118
+ when ::Array
119
+ sub_builder_info = nil
120
+ value.each do |sub_value|
121
+ sub_builder_info = detect_builder_info(sub_value, sub_builder_info)
122
+ break if sub_builder_info and sub_builder_info[:detected]
123
+ end
124
+ if sub_builder_info and sub_builder_info[:detected]
125
+ sub_value_data_type = sub_builder_info[:builder].value_data_type
126
+ field = Field.new("item", sub_value_data_type)
127
+ {
128
+ builder: ListArrayBuilder.new(ListDataType.new(field)),
129
+ detected: true,
130
+ }
131
+ else
132
+ builder_info
133
+ end
134
+ else
135
+ {
136
+ builder: StringArrayBuilder.new,
137
+ detected: true,
138
+ }
139
+ end
140
+ end
92
141
  end
93
142
 
94
143
  def build(values)