red-arrow 0.15.1 → 1.0.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (59)
  1. checksums.yaml +4 -4
  2. data/Rakefile +28 -16
  3. data/ext/arrow/converters.hpp +63 -33
  4. data/ext/arrow/raw-records.cpp +2 -1
  5. data/ext/arrow/values.cpp +2 -1
  6. data/lib/arrow/array-builder.rb +101 -52
  7. data/lib/arrow/array.rb +28 -10
  8. data/lib/arrow/{binary-array-builder.rb → buffer.rb} +7 -15
  9. data/lib/arrow/chunked-array.rb +2 -0
  10. data/lib/arrow/csv-loader.rb +5 -0
  11. data/lib/arrow/csv-read-options.rb +18 -0
  12. data/lib/arrow/data-type.rb +35 -2
  13. data/lib/arrow/decimal128-array-builder.rb +0 -2
  14. data/lib/arrow/dictionary-array.rb +24 -0
  15. data/lib/arrow/field.rb +1 -1
  16. data/lib/arrow/generic-filterable.rb +43 -0
  17. data/lib/arrow/generic-takeable.rb +38 -0
  18. data/lib/arrow/list-data-type.rb +58 -8
  19. data/lib/arrow/loader.rb +12 -1
  20. data/lib/arrow/null-array-builder.rb +1 -1
  21. data/lib/arrow/null-array.rb +24 -0
  22. data/lib/arrow/raw-table-converter.rb +47 -0
  23. data/lib/arrow/record-batch-iterator.rb +22 -0
  24. data/lib/arrow/record-batch.rb +8 -3
  25. data/lib/arrow/schema.rb +5 -2
  26. data/lib/arrow/struct-array-builder.rb +13 -7
  27. data/lib/arrow/struct-data-type.rb +0 -2
  28. data/lib/arrow/table-loader.rb +29 -6
  29. data/lib/arrow/table-saver.rb +37 -13
  30. data/lib/arrow/table.rb +20 -73
  31. data/lib/arrow/version.rb +1 -1
  32. data/red-arrow.gemspec +3 -1
  33. data/test/helper.rb +1 -0
  34. data/test/helper/omittable.rb +36 -0
  35. data/test/raw-records/test-dense-union-array.rb +1 -34
  36. data/test/raw-records/test-sparse-union-array.rb +1 -33
  37. data/test/run-test.rb +14 -3
  38. data/test/test-array-builder.rb +17 -0
  39. data/test/test-array.rb +104 -0
  40. data/test/test-buffer.rb +11 -0
  41. data/test/test-chunked-array.rb +96 -0
  42. data/test/test-csv-loader.rb +2 -2
  43. data/test/test-data-type.rb +11 -0
  44. data/test/test-dense-union-data-type.rb +2 -2
  45. data/test/test-dictionary-array.rb +41 -0
  46. data/test/test-feather.rb +21 -6
  47. data/test/test-list-data-type.rb +27 -1
  48. data/test/test-null-array.rb +23 -0
  49. data/test/test-record-batch-iterator.rb +37 -0
  50. data/test/test-record-batch.rb +14 -0
  51. data/test/test-schema.rb +16 -0
  52. data/test/test-slicer.rb +74 -30
  53. data/test/test-sparse-union-data-type.rb +2 -2
  54. data/test/test-struct-array-builder.rb +8 -4
  55. data/test/test-table.rb +153 -14
  56. data/test/test-timestamp-array.rb +19 -0
  57. data/test/values/test-dense-union-array.rb +1 -34
  58. data/test/values/test-sparse-union-array.rb +1 -33
  59. metadata +22 -8
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 04ae0088ccf17ff76c09ad884e9c4ce6c9a9c637c8f724964c24a379e37d652e
4
- data.tar.gz: a3387f67b30dfd07963730ff79c694d703a08f872fdb7990509d019b460e20b9
3
+ metadata.gz: ad05c81bcbf6fb37d40a76f56fcf53a1722e27d7e490616b7acadf55d37f6ce8
4
+ data.tar.gz: 292922670f00f1c9eecca1dcc42d326e0b45b8efcaca0b341bcb3692e983dec2
5
5
  SHA512:
6
- metadata.gz: dc03abf2065a62ffeb9061f64d9e6074dd90dee464bddc4a06773635a727abc025f6b249dad19bcdfa207308ecf24e7a5f8b7e18ec127c897f63d5916a20aa31
7
- data.tar.gz: 7dc98d5e71d23595baffda2b167494677a2d7d2269b010bea23555a6442d848248068f1e87ae3fa79c9fb61957243f038438ae3fa429f4daae3ead429c9066a1
6
+ metadata.gz: 8dfd19d31f8bab080978747f63cd42207ba6b30dc2fb0b69196a467ab2ea4fc1f0277e4dd7dd55f4cca54765e0d4bcc4bdcedf2dcd2a2c3a3e8857e6be02fefa
7
+ data.tar.gz: c688b857b052cbdc21d67e5b1466d2d93207f6a3fdd29100e9ddb68837187e0ca48de2010ace8fd1e05fb6c633aed275057a6b64c36f50d5a3c6a769e2396f38
data/Rakefile CHANGED
@@ -30,36 +30,44 @@ spec = helper.gemspec
30
30
  release_task = Rake::Task["release"]
31
31
  release_task.prerequisites.replace(["build", "release:rubygem_push"])
32
32
 
33
- def run_extconf(extension_dir, *arguments)
34
- cd(extension_dir) do
35
- ruby("extconf.rb", *arguments)
33
+ def run_extconf(build_dir, extension_dir, *arguments)
34
+ cd(build_dir) do
35
+ ruby(File.join(extension_dir, "extconf.rb"),
36
+ *arguments)
36
37
  end
37
38
  end
38
39
 
39
40
  spec.extensions.each do |extension|
40
- extension_dir = File.dirname(extension)
41
- CLOBBER << File.join(extension_dir, "Makefile")
42
- CLOBBER << File.join(extension_dir, "mkmf.log")
41
+ extension_dir = File.join(base_dir, File.dirname(extension))
42
+ build_dir = ENV["BUILD_DIR"]
43
+ if build_dir
44
+ build_dir = File.join(build_dir, "red-arrow")
45
+ directory build_dir
46
+ else
47
+ build_dir = extension_dir
48
+ end
49
+ CLOBBER << File.join(build_dir, "Makefile")
50
+ CLOBBER << File.join(build_dir, "mkmf.log")
43
51
 
44
- makefile = File.join(extension_dir, "Makefile")
45
- file makefile do
46
- run_extconf(extension_dir)
52
+ makefile = File.join(build_dir, "Makefile")
53
+ file makefile => build_dir do
54
+ run_extconf(build_dir, extension_dir)
47
55
  end
48
56
 
49
57
  desc "Configure"
50
- task :configure do
51
- run_extconf(extension_dir)
58
+ task :configure => build_dir do
59
+ run_extconf(build_dir, extension_dir)
52
60
  end
53
61
 
54
62
  desc "Compile"
55
63
  task :compile => makefile do
56
- cd(extension_dir) do
64
+ cd(build_dir) do
57
65
  sh("make")
58
66
  end
59
67
  end
60
68
 
61
69
  task :clean do
62
- cd(extension_dir) do
70
+ cd(build_dir) do
63
71
  sh("make", "clean") if File.exist?("Makefile")
64
72
  end
65
73
  end
@@ -67,7 +75,9 @@ end
67
75
 
68
76
  desc "Run tests"
69
77
  task :test do
70
- ruby("test/run-test.rb")
78
+ cd(base_dir) do
79
+ ruby("test/run-test.rb")
80
+ end
71
81
  end
72
82
 
73
83
  task default: :test
@@ -79,8 +89,10 @@ task :benchmark do
79
89
  else
80
90
  FileList["benchmark/{,*/**/}*.yml"]
81
91
  end
82
- benchmarks.each do |benchmark|
83
- sh("benchmark-driver", benchmark)
92
+ cd(base_dir) do
93
+ benchmarks.each do |benchmark|
94
+ sh("benchmark-driver", benchmark)
95
+ end
84
96
  end
85
97
  end
86
98
 
data/ext/arrow/converters.hpp CHANGED
@@ -285,7 +285,8 @@ namespace red_arrow {
285
285
  // VISIT(Interval)
286
286
  VISIT(List)
287
287
  VISIT(Struct)
288
- VISIT(Union)
288
+ VISIT(SparseUnion)
289
+ VISIT(DenseUnion)
289
290
  VISIT(Dictionary)
290
291
  VISIT(Decimal128)
291
292
  // TODO
@@ -339,9 +340,9 @@ namespace red_arrow {
339
340
  index_ = index;
340
341
  result_ = rb_hash_new();
341
342
  const auto struct_type = array.struct_type();
342
- const auto n = struct_type->num_children();
343
+ const auto n = struct_type->num_fields();
343
344
  for (int i = 0; i < n; ++i) {
344
- const auto field_type = struct_type->child(i).get();
345
+ const auto field_type = struct_type->field(i).get();
345
346
  const auto& field_name = field_type->name();
346
347
  auto key_keep = key_;
347
348
  key_ = rb_utf8_str_new(field_name.data(), field_name.length());
@@ -388,7 +389,8 @@ namespace red_arrow {
388
389
  // VISIT(Interval)
389
390
  VISIT(List)
390
391
  VISIT(Struct)
391
- VISIT(Union)
392
+ VISIT(SparseUnion)
393
+ VISIT(DenseUnion)
392
394
  VISIT(Dictionary)
393
395
  VISIT(Decimal128)
394
396
  // TODO
@@ -432,10 +434,10 @@ namespace red_arrow {
432
434
  index_ = index;
433
435
  switch (array.mode()) {
434
436
  case arrow::UnionMode::SPARSE:
435
- convert_sparse(array);
437
+ convert_sparse(static_cast<const arrow::SparseUnionArray&>(array));
436
438
  break;
437
439
  case arrow::UnionMode::DENSE:
438
- convert_dense(array);
440
+ convert_dense(static_cast<const arrow::DenseUnionArray&>(array));
439
441
  break;
440
442
  default:
441
443
  rb_raise(rb_eArgError, "Invalid union mode");
@@ -479,7 +481,8 @@ namespace red_arrow {
479
481
  // VISIT(Interval)
480
482
  VISIT(List)
481
483
  VISIT(Struct)
482
- VISIT(Union)
484
+ VISIT(SparseUnion)
485
+ VISIT(DenseUnion)
483
486
  VISIT(Dictionary)
484
487
  VISIT(Decimal128)
485
488
  // TODO
@@ -501,48 +504,48 @@ namespace red_arrow {
501
504
  result_ = result;
502
505
  }
503
506
 
504
- uint8_t compute_child_index(const arrow::UnionArray& array,
507
+ uint8_t compute_field_index(const arrow::UnionArray& array,
505
508
  arrow::UnionType* type,
506
509
  const char* tag) {
507
- const auto type_id = array.raw_type_ids()[index_];
508
- const auto& type_codes = type->type_codes();
509
- for (uint8_t i = 0; i < type_codes.size(); ++i) {
510
- if (type_codes[i] == type_id) {
511
- return i;
510
+ const auto type_code = array.raw_type_codes()[index_];
511
+ if (type_code >= 0 && type_code <= arrow::UnionType::kMaxTypeCode) {
512
+ const auto field_id = type->child_ids()[type_code];
513
+ if (field_id >= 0) {
514
+ return field_id;
512
515
  }
513
516
  }
514
- check_status(arrow::Status::Invalid("Unknown type ID: ", type_id),
517
+ check_status(arrow::Status::Invalid("Unknown type ID: ", type_code),
515
518
  tag);
516
519
  return 0;
517
520
  }
518
521
 
519
- void convert_sparse(const arrow::UnionArray& array) {
522
+ void convert_sparse(const arrow::SparseUnionArray& array) {
520
523
  const auto type =
521
524
  std::static_pointer_cast<arrow::UnionType>(array.type()).get();
522
525
  const auto tag = "[raw-records][union-sparse-array]";
523
- const auto child_index = compute_child_index(array, type, tag);
524
- const auto child_field = type->child(child_index).get();
525
- const auto& field_name = child_field->name();
526
+ const auto index = compute_field_index(array, type, tag);
527
+ const auto field = type->field(index).get();
528
+ const auto& field_name = field->name();
526
529
  const auto field_name_keep = field_name_;
527
530
  field_name_ = rb_utf8_str_new(field_name.data(), field_name.length());
528
- const auto child_array = array.child(child_index).get();
529
- check_status(child_array->Accept(this), tag);
531
+ const auto field_array = array.field(index).get();
532
+ check_status(field_array->Accept(this), tag);
530
533
  field_name_ = field_name_keep;
531
534
  }
532
535
 
533
- void convert_dense(const arrow::UnionArray& array) {
536
+ void convert_dense(const arrow::DenseUnionArray& array) {
534
537
  const auto type =
535
538
  std::static_pointer_cast<arrow::UnionType>(array.type()).get();
536
539
  const auto tag = "[raw-records][union-dense-array]";
537
- const auto child_index = compute_child_index(array, type, tag);
538
- const auto child_field = type->child(child_index).get();
539
- const auto& field_name = child_field->name();
540
+ const auto index = compute_field_index(array, type, tag);
541
+ const auto field = type->field(index).get();
542
+ const auto& field_name = field->name();
540
543
  const auto field_name_keep = field_name_;
541
544
  field_name_ = rb_utf8_str_new(field_name.data(), field_name.length());
542
- const auto child_array = array.child(child_index);
545
+ const auto field_array = array.field(index);
543
546
  const auto index_keep = index_;
544
547
  index_ = array.value_offset(index_);
545
- check_status(child_array->Accept(this), tag);
548
+ check_status(field_array->Accept(this), tag);
546
549
  index_ = index_keep;
547
550
  field_name_ = field_name_keep;
548
551
  }
@@ -557,30 +560,57 @@ namespace red_arrow {
557
560
  public:
558
561
  explicit DictionaryArrayValueConverter(ArrayValueConverter* converter)
559
562
  : array_value_converter_(converter),
560
- index_(0),
563
+ value_index_(0),
561
564
  result_(Qnil) {
562
565
  }
563
566
 
564
567
  VALUE convert(const arrow::DictionaryArray& array,
565
568
  const int64_t index) {
566
- index_ = index;
567
- auto indices = array.indices().get();
568
- check_status(indices->Accept(this),
569
+ value_index_ = array.GetValueIndex(index);
570
+ auto dictionary = array.dictionary().get();
571
+ check_status(dictionary->Accept(this),
569
572
  "[raw-records][dictionary-array]");
570
573
  return result_;
571
574
  }
572
575
 
573
- // TODO: Convert to real value.
574
576
  #define VISIT(TYPE) \
575
577
  arrow::Status Visit(const arrow::TYPE ## Array& array) override { \
576
- result_ = convert_value(array, index_); \
578
+ result_ = convert_value(array, value_index_); \
577
579
  return arrow::Status::OK(); \
578
580
  }
579
581
 
582
+ VISIT(Null)
583
+ VISIT(Boolean)
580
584
  VISIT(Int8)
581
585
  VISIT(Int16)
582
586
  VISIT(Int32)
583
587
  VISIT(Int64)
588
+ VISIT(UInt8)
589
+ VISIT(UInt16)
590
+ VISIT(UInt32)
591
+ VISIT(UInt64)
592
+ // TODO
593
+ // VISIT(HalfFloat)
594
+ VISIT(Float)
595
+ VISIT(Double)
596
+ VISIT(Binary)
597
+ VISIT(String)
598
+ VISIT(FixedSizeBinary)
599
+ VISIT(Date32)
600
+ VISIT(Date64)
601
+ VISIT(Time32)
602
+ VISIT(Time64)
603
+ VISIT(Timestamp)
604
+ // TODO
605
+ // VISIT(Interval)
606
+ VISIT(List)
607
+ VISIT(Struct)
608
+ VISIT(SparseUnion)
609
+ VISIT(DenseUnion)
610
+ VISIT(Dictionary)
611
+ VISIT(Decimal128)
612
+ // TODO
613
+ // VISIT(Extension)
584
614
 
585
615
  #undef VISIT
586
616
 
@@ -592,7 +622,7 @@ namespace red_arrow {
592
622
  }
593
623
 
594
624
  ArrayValueConverter* array_value_converter_;
595
- int64_t index_;
625
+ int64_t value_index_;
596
626
  VALUE result_;
597
627
  };
598
628
 
data/ext/arrow/raw-records.cpp CHANGED
@@ -100,7 +100,8 @@ namespace red_arrow {
100
100
  // VISIT(Interval)
101
101
  VISIT(List)
102
102
  VISIT(Struct)
103
- VISIT(Union)
103
+ VISIT(SparseUnion)
104
+ VISIT(DenseUnion)
104
105
  VISIT(Dictionary)
105
106
  VISIT(Decimal128)
106
107
  // TODO
data/ext/arrow/values.cpp CHANGED
@@ -81,7 +81,8 @@ namespace red_arrow {
81
81
  // VISIT(Interval)
82
82
  VISIT(List)
83
83
  VISIT(Struct)
84
- VISIT(Union)
84
+ VISIT(SparseUnion)
85
+ VISIT(DenseUnion)
85
86
  VISIT(Dictionary)
86
87
  VISIT(Decimal128)
87
88
  // TODO
data/lib/arrow/array-builder.rb CHANGED
@@ -26,60 +26,13 @@ module Arrow
26
26
  return builder.build(values)
27
27
  end
28
28
 
29
- builder_class = nil
30
- builder_class_arguments = []
29
+ builder_info = nil
31
30
  values.each do |value|
32
- case value
33
- when nil
34
- # Ignore
35
- when true, false
36
- return BooleanArray.new(values)
37
- when String
38
- return StringArray.new(values)
39
- when Float
40
- return DoubleArray.new(values)
41
- when Integer
42
- if value < 0
43
- builder = IntArrayBuilder.new
44
- return builder.build(values)
45
- else
46
- builder_class = UIntArrayBuilder
47
- builder_class_arguments = []
48
- end
49
- when Time
50
- data_type = value.data_type
51
- case data_type.unit
52
- when TimeUnit::SECOND
53
- if builder.nil?
54
- builder = Time32ArrayBuilder
55
- builder_class_arguments = [data_type]
56
- end
57
- when TimeUnit::MILLI
58
- if builder != Time64ArrayBuilder
59
- builder = Time32ArrayBuilder
60
- builder_class_arguments = [data_type]
61
- end
62
- when TimeUnit::MICRO
63
- builder = Time64ArrayBuilder
64
- builder_class_arguments = [data_type]
65
- when TimeUnit::NANO
66
- builder = Time64ArrayBuilder.new(data_type)
67
- return builder.build(values)
68
- end
69
- when ::Time
70
- data_type = TimestampDataType.new(:nano)
71
- builder = TimestampArrayBuilder.new(data_type)
72
- return builder.build(values)
73
- when DateTime
74
- return Date64Array.new(values)
75
- when Date
76
- return Date32Array.new(values)
77
- else
78
- return StringArray.new(values)
79
- end
31
+ builder_info = detect_builder_info(value, builder_info)
32
+ break if builder_info and builder_info[:detected]
80
33
  end
81
- if builder_class
82
- builder = builder_class.new(*builder_class_arguments)
34
+ if builder_info
35
+ builder = builder_info[:builder]
83
36
  builder.build(values)
84
37
  else
85
38
  Arrow::StringArray.new(values)
@@ -89,6 +42,102 @@ module Arrow
89
42
  def buildable?(args)
90
43
  args.size == method(:build).arity
91
44
  end
45
+
46
+ private
47
+ def detect_builder_info(value, builder_info)
48
+ case value
49
+ when nil
50
+ builder_info
51
+ when true, false
52
+ {
53
+ builder: BooleanArrayBuilder.new,
54
+ detected: true,
55
+ }
56
+ when String
57
+ {
58
+ builder: StringArrayBuilder.new,
59
+ detected: true,
60
+ }
61
+ when Float
62
+ {
63
+ builder: DoubleArrayBuilder.new,
64
+ detected: true,
65
+ }
66
+ when Integer
67
+ if value < 0
68
+ {
69
+ builder: IntArrayBuilder.new,
70
+ detected: true,
71
+ }
72
+ else
73
+ {
74
+ builder: UIntArrayBuilder.new,
75
+ }
76
+ end
77
+ when Time
78
+ data_type = value.data_type
79
+ case data_type.unit
80
+ when TimeUnit::SECOND
81
+ builder_info || {
82
+ builder: Time32ArrayBuilder.new(data_type)
83
+ }
84
+ when TimeUnit::MILLI
85
+ if builder_info and builder_info[:builder].is_a?(Time64ArrayBuilder)
86
+ builder_info
87
+ else
88
+ {
89
+ builder: Time32ArrayBuilder.new(data_type),
90
+ }
91
+ end
92
+ when TimeUnit::MICRO
93
+ {
94
+ builder: Time64ArrayBuilder.new(data_type),
95
+ }
96
+ when TimeUnit::NANO
97
+ {
98
+ builder: Time64ArrayBuilder.new(data_type),
99
+ detected: true
100
+ }
101
+ end
102
+ when ::Time
103
+ data_type = TimestampDataType.new(:nano)
104
+ {
105
+ builder: TimestampArrayBuilder.new(data_type),
106
+ detected: true,
107
+ }
108
+ when DateTime
109
+ {
110
+ builder: Date64ArrayBuilder.new,
111
+ detected: true,
112
+ }
113
+ when Date
114
+ {
115
+ builder: Date32ArrayBuilder.new,
116
+ detected: true,
117
+ }
118
+ when ::Array
119
+ sub_builder_info = nil
120
+ value.each do |sub_value|
121
+ sub_builder_info = detect_builder_info(sub_value, sub_builder_info)
122
+ break if sub_builder_info and sub_builder_info[:detected]
123
+ end
124
+ if sub_builder_info and sub_builder_info[:detected]
125
+ sub_value_data_type = sub_builder_info[:builder].value_data_type
126
+ field = Field.new("item", sub_value_data_type)
127
+ {
128
+ builder: ListArrayBuilder.new(ListDataType.new(field)),
129
+ detected: true,
130
+ }
131
+ else
132
+ builder_info
133
+ end
134
+ else
135
+ {
136
+ builder: StringArrayBuilder.new,
137
+ detected: true,
138
+ }
139
+ end
140
+ end
92
141
  end
93
142
 
94
143
  def build(values)