red-arrow 0.15.0 → 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Rakefile +28 -16
- data/ext/arrow/converters.hpp +63 -33
- data/ext/arrow/raw-records.cpp +2 -1
- data/ext/arrow/values.cpp +2 -1
- data/lib/arrow/array-builder.rb +101 -52
- data/lib/arrow/array.rb +28 -10
- data/lib/arrow/{binary-array-builder.rb → buffer.rb} +7 -15
- data/lib/arrow/chunked-array.rb +2 -0
- data/lib/arrow/csv-loader.rb +15 -3
- data/lib/arrow/csv-read-options.rb +18 -0
- data/lib/arrow/data-type.rb +35 -2
- data/lib/arrow/decimal128-array-builder.rb +0 -2
- data/lib/arrow/dictionary-array.rb +24 -0
- data/lib/arrow/field.rb +1 -1
- data/lib/arrow/generic-filterable.rb +43 -0
- data/lib/arrow/generic-takeable.rb +38 -0
- data/lib/arrow/list-data-type.rb +58 -8
- data/lib/arrow/loader.rb +12 -1
- data/lib/arrow/null-array-builder.rb +1 -1
- data/lib/arrow/null-array.rb +24 -0
- data/lib/arrow/raw-table-converter.rb +47 -0
- data/lib/arrow/record-batch-iterator.rb +22 -0
- data/lib/arrow/record-batch.rb +8 -3
- data/lib/arrow/schema.rb +5 -2
- data/lib/arrow/struct-array-builder.rb +13 -7
- data/lib/arrow/struct-data-type.rb +0 -2
- data/lib/arrow/table-loader.rb +29 -6
- data/lib/arrow/table-saver.rb +37 -13
- data/lib/arrow/table.rb +20 -73
- data/lib/arrow/version.rb +1 -1
- data/red-arrow.gemspec +4 -2
- data/test/helper.rb +1 -0
- data/test/helper/omittable.rb +36 -0
- data/test/raw-records/test-dense-union-array.rb +1 -34
- data/test/raw-records/test-sparse-union-array.rb +1 -33
- data/test/run-test.rb +14 -3
- data/test/test-array-builder.rb +17 -0
- data/test/test-array.rb +104 -0
- data/test/test-buffer.rb +11 -0
- data/test/test-chunked-array.rb +96 -0
- data/test/test-csv-loader.rb +77 -2
- data/test/test-data-type.rb +11 -0
- data/test/test-dense-union-data-type.rb +2 -2
- data/test/test-dictionary-array.rb +41 -0
- data/test/test-feather.rb +21 -6
- data/test/test-list-data-type.rb +27 -1
- data/test/test-null-array.rb +23 -0
- data/test/test-record-batch-iterator.rb +37 -0
- data/test/test-record-batch.rb +14 -0
- data/test/test-schema.rb +16 -0
- data/test/test-slicer.rb +74 -30
- data/test/test-sparse-union-data-type.rb +2 -2
- data/test/test-struct-array-builder.rb +8 -4
- data/test/test-table.rb +153 -14
- data/test/test-timestamp-array.rb +19 -0
- data/test/values/test-dense-union-array.rb +1 -34
- data/test/values/test-sparse-union-array.rb +1 -33
- metadata +76 -63
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: a45e003f0a453f175b1dbfc81c1fcf092cbfe964dd43e44d7b16e2087834ef5d
|
4
|
+
data.tar.gz: 5d1dc1d87a821d1ac4c49603d1d92d723230eb1f8a7cdd540dafe3715a53c12e
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 85329e05ae20268d44a9ffff5fb3278263e5b77deda6f03fda31595b117252b7970fd17328ce2650c8650895a1813a1fb95799d3331a312bce2c276f3e5d55a4
|
7
|
+
data.tar.gz: fe90b0ff0dfb9b9126765818d54f7d4d203e6c72ab53dfb2680a48dc474fef09b8ed3f0553f3c1fe110d500d625ab2300563c3e4169403326c63c6bf27f33a5f
|
data/Rakefile
CHANGED
@@ -30,36 +30,44 @@ spec = helper.gemspec
|
|
30
30
|
release_task = Rake::Task["release"]
|
31
31
|
release_task.prerequisites.replace(["build", "release:rubygem_push"])
|
32
32
|
|
33
|
-
def run_extconf(extension_dir, *arguments)
|
34
|
-
cd(
|
35
|
-
ruby("extconf.rb",
|
33
|
+
def run_extconf(build_dir, extension_dir, *arguments)
|
34
|
+
cd(build_dir) do
|
35
|
+
ruby(File.join(extension_dir, "extconf.rb"),
|
36
|
+
*arguments)
|
36
37
|
end
|
37
38
|
end
|
38
39
|
|
39
40
|
spec.extensions.each do |extension|
|
40
|
-
extension_dir = File.dirname(extension)
|
41
|
-
|
42
|
-
|
41
|
+
extension_dir = File.join(base_dir, File.dirname(extension))
|
42
|
+
build_dir = ENV["BUILD_DIR"]
|
43
|
+
if build_dir
|
44
|
+
build_dir = File.join(build_dir, "red-arrow")
|
45
|
+
directory build_dir
|
46
|
+
else
|
47
|
+
build_dir = extension_dir
|
48
|
+
end
|
49
|
+
CLOBBER << File.join(build_dir, "Makefile")
|
50
|
+
CLOBBER << File.join(build_dir, "mkmf.log")
|
43
51
|
|
44
|
-
makefile = File.join(
|
45
|
-
file makefile do
|
46
|
-
run_extconf(extension_dir)
|
52
|
+
makefile = File.join(build_dir, "Makefile")
|
53
|
+
file makefile => build_dir do
|
54
|
+
run_extconf(build_dir, extension_dir)
|
47
55
|
end
|
48
56
|
|
49
57
|
desc "Configure"
|
50
|
-
task :configure do
|
51
|
-
run_extconf(extension_dir)
|
58
|
+
task :configure => build_dir do
|
59
|
+
run_extconf(build_dir, extension_dir)
|
52
60
|
end
|
53
61
|
|
54
62
|
desc "Compile"
|
55
63
|
task :compile => makefile do
|
56
|
-
cd(
|
64
|
+
cd(build_dir) do
|
57
65
|
sh("make")
|
58
66
|
end
|
59
67
|
end
|
60
68
|
|
61
69
|
task :clean do
|
62
|
-
cd(
|
70
|
+
cd(build_dir) do
|
63
71
|
sh("make", "clean") if File.exist?("Makefile")
|
64
72
|
end
|
65
73
|
end
|
@@ -67,7 +75,9 @@ end
|
|
67
75
|
|
68
76
|
desc "Run tests"
|
69
77
|
task :test do
|
70
|
-
|
78
|
+
cd(base_dir) do
|
79
|
+
ruby("test/run-test.rb")
|
80
|
+
end
|
71
81
|
end
|
72
82
|
|
73
83
|
task default: :test
|
@@ -79,8 +89,10 @@ task :benchmark do
|
|
79
89
|
else
|
80
90
|
FileList["benchmark/{,*/**/}*.yml"]
|
81
91
|
end
|
82
|
-
|
83
|
-
|
92
|
+
cd(base_dir) do
|
93
|
+
benchmarks.each do |benchmark|
|
94
|
+
sh("benchmark-driver", benchmark)
|
95
|
+
end
|
84
96
|
end
|
85
97
|
end
|
86
98
|
|
data/ext/arrow/converters.hpp
CHANGED
@@ -285,7 +285,8 @@ namespace red_arrow {
|
|
285
285
|
// VISIT(Interval)
|
286
286
|
VISIT(List)
|
287
287
|
VISIT(Struct)
|
288
|
-
VISIT(
|
288
|
+
VISIT(SparseUnion)
|
289
|
+
VISIT(DenseUnion)
|
289
290
|
VISIT(Dictionary)
|
290
291
|
VISIT(Decimal128)
|
291
292
|
// TODO
|
@@ -339,9 +340,9 @@ namespace red_arrow {
|
|
339
340
|
index_ = index;
|
340
341
|
result_ = rb_hash_new();
|
341
342
|
const auto struct_type = array.struct_type();
|
342
|
-
const auto n = struct_type->
|
343
|
+
const auto n = struct_type->num_fields();
|
343
344
|
for (int i = 0; i < n; ++i) {
|
344
|
-
const auto field_type = struct_type->
|
345
|
+
const auto field_type = struct_type->field(i).get();
|
345
346
|
const auto& field_name = field_type->name();
|
346
347
|
auto key_keep = key_;
|
347
348
|
key_ = rb_utf8_str_new(field_name.data(), field_name.length());
|
@@ -388,7 +389,8 @@ namespace red_arrow {
|
|
388
389
|
// VISIT(Interval)
|
389
390
|
VISIT(List)
|
390
391
|
VISIT(Struct)
|
391
|
-
VISIT(
|
392
|
+
VISIT(SparseUnion)
|
393
|
+
VISIT(DenseUnion)
|
392
394
|
VISIT(Dictionary)
|
393
395
|
VISIT(Decimal128)
|
394
396
|
// TODO
|
@@ -432,10 +434,10 @@ namespace red_arrow {
|
|
432
434
|
index_ = index;
|
433
435
|
switch (array.mode()) {
|
434
436
|
case arrow::UnionMode::SPARSE:
|
435
|
-
convert_sparse(array);
|
437
|
+
convert_sparse(static_cast<const arrow::SparseUnionArray&>(array));
|
436
438
|
break;
|
437
439
|
case arrow::UnionMode::DENSE:
|
438
|
-
convert_dense(array);
|
440
|
+
convert_dense(static_cast<const arrow::DenseUnionArray&>(array));
|
439
441
|
break;
|
440
442
|
default:
|
441
443
|
rb_raise(rb_eArgError, "Invalid union mode");
|
@@ -479,7 +481,8 @@ namespace red_arrow {
|
|
479
481
|
// VISIT(Interval)
|
480
482
|
VISIT(List)
|
481
483
|
VISIT(Struct)
|
482
|
-
VISIT(
|
484
|
+
VISIT(SparseUnion)
|
485
|
+
VISIT(DenseUnion)
|
483
486
|
VISIT(Dictionary)
|
484
487
|
VISIT(Decimal128)
|
485
488
|
// TODO
|
@@ -501,48 +504,48 @@ namespace red_arrow {
|
|
501
504
|
result_ = result;
|
502
505
|
}
|
503
506
|
|
504
|
-
uint8_t
|
507
|
+
uint8_t compute_field_index(const arrow::UnionArray& array,
|
505
508
|
arrow::UnionType* type,
|
506
509
|
const char* tag) {
|
507
|
-
const auto
|
508
|
-
|
509
|
-
|
510
|
-
if (
|
511
|
-
return
|
510
|
+
const auto type_code = array.raw_type_codes()[index_];
|
511
|
+
if (type_code >= 0 && type_code <= arrow::UnionType::kMaxTypeCode) {
|
512
|
+
const auto field_id = type->child_ids()[type_code];
|
513
|
+
if (field_id >= 0) {
|
514
|
+
return field_id;
|
512
515
|
}
|
513
516
|
}
|
514
|
-
check_status(arrow::Status::Invalid("Unknown type ID: ",
|
517
|
+
check_status(arrow::Status::Invalid("Unknown type ID: ", type_code),
|
515
518
|
tag);
|
516
519
|
return 0;
|
517
520
|
}
|
518
521
|
|
519
|
-
void convert_sparse(const arrow::
|
522
|
+
void convert_sparse(const arrow::SparseUnionArray& array) {
|
520
523
|
const auto type =
|
521
524
|
std::static_pointer_cast<arrow::UnionType>(array.type()).get();
|
522
525
|
const auto tag = "[raw-records][union-sparse-array]";
|
523
|
-
const auto
|
524
|
-
const auto
|
525
|
-
const auto& field_name =
|
526
|
+
const auto index = compute_field_index(array, type, tag);
|
527
|
+
const auto field = type->field(index).get();
|
528
|
+
const auto& field_name = field->name();
|
526
529
|
const auto field_name_keep = field_name_;
|
527
530
|
field_name_ = rb_utf8_str_new(field_name.data(), field_name.length());
|
528
|
-
const auto
|
529
|
-
check_status(
|
531
|
+
const auto field_array = array.field(index).get();
|
532
|
+
check_status(field_array->Accept(this), tag);
|
530
533
|
field_name_ = field_name_keep;
|
531
534
|
}
|
532
535
|
|
533
|
-
void convert_dense(const arrow::
|
536
|
+
void convert_dense(const arrow::DenseUnionArray& array) {
|
534
537
|
const auto type =
|
535
538
|
std::static_pointer_cast<arrow::UnionType>(array.type()).get();
|
536
539
|
const auto tag = "[raw-records][union-dense-array]";
|
537
|
-
const auto
|
538
|
-
const auto
|
539
|
-
const auto& field_name =
|
540
|
+
const auto index = compute_field_index(array, type, tag);
|
541
|
+
const auto field = type->field(index).get();
|
542
|
+
const auto& field_name = field->name();
|
540
543
|
const auto field_name_keep = field_name_;
|
541
544
|
field_name_ = rb_utf8_str_new(field_name.data(), field_name.length());
|
542
|
-
const auto
|
545
|
+
const auto field_array = array.field(index);
|
543
546
|
const auto index_keep = index_;
|
544
547
|
index_ = array.value_offset(index_);
|
545
|
-
check_status(
|
548
|
+
check_status(field_array->Accept(this), tag);
|
546
549
|
index_ = index_keep;
|
547
550
|
field_name_ = field_name_keep;
|
548
551
|
}
|
@@ -557,30 +560,57 @@ namespace red_arrow {
|
|
557
560
|
public:
|
558
561
|
explicit DictionaryArrayValueConverter(ArrayValueConverter* converter)
|
559
562
|
: array_value_converter_(converter),
|
560
|
-
|
563
|
+
value_index_(0),
|
561
564
|
result_(Qnil) {
|
562
565
|
}
|
563
566
|
|
564
567
|
VALUE convert(const arrow::DictionaryArray& array,
|
565
568
|
const int64_t index) {
|
566
|
-
|
567
|
-
auto
|
568
|
-
check_status(
|
569
|
+
value_index_ = array.GetValueIndex(index);
|
570
|
+
auto dictionary = array.dictionary().get();
|
571
|
+
check_status(dictionary->Accept(this),
|
569
572
|
"[raw-records][dictionary-array]");
|
570
573
|
return result_;
|
571
574
|
}
|
572
575
|
|
573
|
-
// TODO: Convert to real value.
|
574
576
|
#define VISIT(TYPE) \
|
575
577
|
arrow::Status Visit(const arrow::TYPE ## Array& array) override { \
|
576
|
-
result_ = convert_value(array,
|
578
|
+
result_ = convert_value(array, value_index_); \
|
577
579
|
return arrow::Status::OK(); \
|
578
580
|
}
|
579
581
|
|
582
|
+
VISIT(Null)
|
583
|
+
VISIT(Boolean)
|
580
584
|
VISIT(Int8)
|
581
585
|
VISIT(Int16)
|
582
586
|
VISIT(Int32)
|
583
587
|
VISIT(Int64)
|
588
|
+
VISIT(UInt8)
|
589
|
+
VISIT(UInt16)
|
590
|
+
VISIT(UInt32)
|
591
|
+
VISIT(UInt64)
|
592
|
+
// TODO
|
593
|
+
// VISIT(HalfFloat)
|
594
|
+
VISIT(Float)
|
595
|
+
VISIT(Double)
|
596
|
+
VISIT(Binary)
|
597
|
+
VISIT(String)
|
598
|
+
VISIT(FixedSizeBinary)
|
599
|
+
VISIT(Date32)
|
600
|
+
VISIT(Date64)
|
601
|
+
VISIT(Time32)
|
602
|
+
VISIT(Time64)
|
603
|
+
VISIT(Timestamp)
|
604
|
+
// TODO
|
605
|
+
// VISIT(Interval)
|
606
|
+
VISIT(List)
|
607
|
+
VISIT(Struct)
|
608
|
+
VISIT(SparseUnion)
|
609
|
+
VISIT(DenseUnion)
|
610
|
+
VISIT(Dictionary)
|
611
|
+
VISIT(Decimal128)
|
612
|
+
// TODO
|
613
|
+
// VISIT(Extension)
|
584
614
|
|
585
615
|
#undef VISIT
|
586
616
|
|
@@ -592,7 +622,7 @@ namespace red_arrow {
|
|
592
622
|
}
|
593
623
|
|
594
624
|
ArrayValueConverter* array_value_converter_;
|
595
|
-
int64_t
|
625
|
+
int64_t value_index_;
|
596
626
|
VALUE result_;
|
597
627
|
};
|
598
628
|
|
data/ext/arrow/raw-records.cpp
CHANGED
data/ext/arrow/values.cpp
CHANGED
data/lib/arrow/array-builder.rb
CHANGED
@@ -26,60 +26,13 @@ module Arrow
|
|
26
26
|
return builder.build(values)
|
27
27
|
end
|
28
28
|
|
29
|
-
|
30
|
-
builder_class_arguments = []
|
29
|
+
builder_info = nil
|
31
30
|
values.each do |value|
|
32
|
-
|
33
|
-
|
34
|
-
# Ignore
|
35
|
-
when true, false
|
36
|
-
return BooleanArray.new(values)
|
37
|
-
when String
|
38
|
-
return StringArray.new(values)
|
39
|
-
when Float
|
40
|
-
return DoubleArray.new(values)
|
41
|
-
when Integer
|
42
|
-
if value < 0
|
43
|
-
builder = IntArrayBuilder.new
|
44
|
-
return builder.build(values)
|
45
|
-
else
|
46
|
-
builder_class = UIntArrayBuilder
|
47
|
-
builder_class_arguments = []
|
48
|
-
end
|
49
|
-
when Time
|
50
|
-
data_type = value.data_type
|
51
|
-
case data_type.unit
|
52
|
-
when TimeUnit::SECOND
|
53
|
-
if builder.nil?
|
54
|
-
builder = Time32ArrayBuilder
|
55
|
-
builder_class_arguments = [data_type]
|
56
|
-
end
|
57
|
-
when TimeUnit::MILLI
|
58
|
-
if builder != Time64ArrayBuilder
|
59
|
-
builder = Time32ArrayBuilder
|
60
|
-
builder_class_arguments = [data_type]
|
61
|
-
end
|
62
|
-
when TimeUnit::MICRO
|
63
|
-
builder = Time64ArrayBuilder
|
64
|
-
builder_class_arguments = [data_type]
|
65
|
-
when TimeUnit::NANO
|
66
|
-
builder = Time64ArrayBuilder.new(data_type)
|
67
|
-
return builder.build(values)
|
68
|
-
end
|
69
|
-
when ::Time
|
70
|
-
data_type = TimestampDataType.new(:nano)
|
71
|
-
builder = TimestampArrayBuilder.new(data_type)
|
72
|
-
return builder.build(values)
|
73
|
-
when DateTime
|
74
|
-
return Date64Array.new(values)
|
75
|
-
when Date
|
76
|
-
return Date32Array.new(values)
|
77
|
-
else
|
78
|
-
return StringArray.new(values)
|
79
|
-
end
|
31
|
+
builder_info = detect_builder_info(value, builder_info)
|
32
|
+
break if builder_info and builder_info[:detected]
|
80
33
|
end
|
81
|
-
if
|
82
|
-
builder =
|
34
|
+
if builder_info
|
35
|
+
builder = builder_info[:builder]
|
83
36
|
builder.build(values)
|
84
37
|
else
|
85
38
|
Arrow::StringArray.new(values)
|
@@ -89,6 +42,102 @@ module Arrow
|
|
89
42
|
def buildable?(args)
|
90
43
|
args.size == method(:build).arity
|
91
44
|
end
|
45
|
+
|
46
|
+
private
|
47
|
+
def detect_builder_info(value, builder_info)
|
48
|
+
case value
|
49
|
+
when nil
|
50
|
+
builder_info
|
51
|
+
when true, false
|
52
|
+
{
|
53
|
+
builder: BooleanArrayBuilder.new,
|
54
|
+
detected: true,
|
55
|
+
}
|
56
|
+
when String
|
57
|
+
{
|
58
|
+
builder: StringArrayBuilder.new,
|
59
|
+
detected: true,
|
60
|
+
}
|
61
|
+
when Float
|
62
|
+
{
|
63
|
+
builder: DoubleArrayBuilder.new,
|
64
|
+
detected: true,
|
65
|
+
}
|
66
|
+
when Integer
|
67
|
+
if value < 0
|
68
|
+
{
|
69
|
+
builder: IntArrayBuilder.new,
|
70
|
+
detected: true,
|
71
|
+
}
|
72
|
+
else
|
73
|
+
{
|
74
|
+
builder: UIntArrayBuilder.new,
|
75
|
+
}
|
76
|
+
end
|
77
|
+
when Time
|
78
|
+
data_type = value.data_type
|
79
|
+
case data_type.unit
|
80
|
+
when TimeUnit::SECOND
|
81
|
+
builder_info || {
|
82
|
+
builder: Time32ArrayBuilder.new(data_type)
|
83
|
+
}
|
84
|
+
when TimeUnit::MILLI
|
85
|
+
if builder_info and builder_info[:builder].is_a?(Time64ArrayBuilder)
|
86
|
+
builder_info
|
87
|
+
else
|
88
|
+
{
|
89
|
+
builder: Time32ArrayBuilder.new(data_type),
|
90
|
+
}
|
91
|
+
end
|
92
|
+
when TimeUnit::MICRO
|
93
|
+
{
|
94
|
+
builder: Time64ArrayBuilder.new(data_type),
|
95
|
+
}
|
96
|
+
when TimeUnit::NANO
|
97
|
+
{
|
98
|
+
builder: Time64ArrayBuilder.new(data_type),
|
99
|
+
detected: true
|
100
|
+
}
|
101
|
+
end
|
102
|
+
when ::Time
|
103
|
+
data_type = TimestampDataType.new(:nano)
|
104
|
+
{
|
105
|
+
builder: TimestampArrayBuilder.new(data_type),
|
106
|
+
detected: true,
|
107
|
+
}
|
108
|
+
when DateTime
|
109
|
+
{
|
110
|
+
builder: Date64ArrayBuilder.new,
|
111
|
+
detected: true,
|
112
|
+
}
|
113
|
+
when Date
|
114
|
+
{
|
115
|
+
builder: Date32ArrayBuilder.new,
|
116
|
+
detected: true,
|
117
|
+
}
|
118
|
+
when ::Array
|
119
|
+
sub_builder_info = nil
|
120
|
+
value.each do |sub_value|
|
121
|
+
sub_builder_info = detect_builder_info(sub_value, sub_builder_info)
|
122
|
+
break if sub_builder_info and sub_builder_info[:detected]
|
123
|
+
end
|
124
|
+
if sub_builder_info and sub_builder_info[:detected]
|
125
|
+
sub_value_data_type = sub_builder_info[:builder].value_data_type
|
126
|
+
field = Field.new("item", sub_value_data_type)
|
127
|
+
{
|
128
|
+
builder: ListArrayBuilder.new(ListDataType.new(field)),
|
129
|
+
detected: true,
|
130
|
+
}
|
131
|
+
else
|
132
|
+
builder_info
|
133
|
+
end
|
134
|
+
else
|
135
|
+
{
|
136
|
+
builder: StringArrayBuilder.new,
|
137
|
+
detected: true,
|
138
|
+
}
|
139
|
+
end
|
140
|
+
end
|
92
141
|
end
|
93
142
|
|
94
143
|
def build(values)
|