red-arrow 0.14.1 → 0.15.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of red-arrow might be problematic. Click here for more details.
- checksums.yaml +4 -4
- data/ext/arrow/arrow.cpp +34 -0
- data/ext/arrow/converters.cpp +42 -0
- data/ext/arrow/converters.hpp +626 -0
- data/ext/arrow/raw-records.cpp +6 -625
- data/ext/arrow/red-arrow.hpp +37 -3
- data/ext/arrow/values.cpp +154 -0
- data/lib/arrow/array-builder.rb +24 -1
- data/lib/arrow/array.rb +9 -0
- data/lib/arrow/chunked-array.rb +5 -0
- data/lib/arrow/column-containable.rb +48 -0
- data/lib/arrow/column.rb +36 -10
- data/lib/arrow/csv-loader.rb +2 -2
- data/lib/arrow/data-type.rb +22 -5
- data/lib/arrow/date64-array-builder.rb +2 -2
- data/lib/arrow/date64-array.rb +1 -1
- data/lib/arrow/decimal128-array.rb +24 -0
- data/lib/arrow/field-containable.rb +3 -0
- data/lib/arrow/group.rb +10 -13
- data/lib/arrow/loader.rb +20 -1
- data/lib/arrow/record-batch.rb +6 -4
- data/lib/arrow/record-containable.rb +0 -35
- data/lib/arrow/record.rb +12 -9
- data/lib/arrow/slicer.rb +2 -2
- data/lib/arrow/struct-array-builder.rb +1 -7
- data/lib/arrow/struct-array.rb +13 -11
- data/lib/arrow/table-loader.rb +3 -9
- data/lib/arrow/table-table-formatter.rb +2 -2
- data/lib/arrow/table.rb +61 -24
- data/lib/arrow/time.rb +159 -0
- data/lib/arrow/time32-array-builder.rb +49 -0
- data/lib/arrow/time32-array.rb +28 -0
- data/lib/arrow/time64-array-builder.rb +49 -0
- data/lib/arrow/time64-array.rb +28 -0
- data/lib/arrow/timestamp-array-builder.rb +20 -1
- data/lib/arrow/timestamp-array.rb +10 -22
- data/lib/arrow/version.rb +1 -1
- data/red-arrow.gemspec +1 -1
- data/test/raw-records/test-basic-arrays.rb +16 -8
- data/test/raw-records/test-dense-union-array.rb +12 -5
- data/test/raw-records/test-list-array.rb +21 -9
- data/test/raw-records/test-sparse-union-array.rb +13 -5
- data/test/raw-records/test-struct-array.rb +11 -4
- data/test/test-column.rb +56 -31
- data/test/test-decimal128-array-builder.rb +11 -11
- data/test/test-decimal128-array.rb +4 -4
- data/test/test-slicer.rb +1 -3
- data/test/test-struct-array-builder.rb +4 -4
- data/test/test-struct-array.rb +4 -4
- data/test/test-table.rb +17 -8
- data/test/test-time.rb +288 -0
- data/test/test-time32-array.rb +81 -0
- data/test/test-time64-array.rb +81 -0
- data/test/values/test-basic-arrays.rb +284 -0
- data/test/values/test-dense-union-array.rb +487 -0
- data/test/values/test-list-array.rb +497 -0
- data/test/values/test-sparse-union-array.rb +477 -0
- data/test/values/test-struct-array.rb +452 -0
- metadata +78 -54
- data/lib/arrow/struct.rb +0 -79
- data/test/test-struct.rb +0 -81
data/ext/arrow/red-arrow.hpp
CHANGED
@@ -35,14 +35,25 @@
|
|
35
35
|
namespace red_arrow {
|
36
36
|
extern VALUE cDate;
|
37
37
|
|
38
|
+
extern VALUE cArrowTime;
|
39
|
+
|
40
|
+
extern VALUE ArrowTimeUnitSECOND;
|
41
|
+
extern VALUE ArrowTimeUnitMILLI;
|
42
|
+
extern VALUE ArrowTimeUnitMICRO;
|
43
|
+
extern VALUE ArrowTimeUnitNANO;
|
44
|
+
|
38
45
|
extern ID id_BigDecimal;
|
39
46
|
extern ID id_jd;
|
47
|
+
extern ID id_new;
|
40
48
|
extern ID id_to_datetime;
|
41
49
|
|
50
|
+
VALUE array_values(VALUE obj);
|
51
|
+
VALUE chunked_array_values(VALUE obj);
|
52
|
+
|
42
53
|
VALUE record_batch_raw_records(VALUE obj);
|
43
54
|
VALUE table_raw_records(VALUE obj);
|
44
55
|
|
45
|
-
inline VALUE time_unit_to_scale(arrow::TimeUnit::type unit) {
|
56
|
+
inline VALUE time_unit_to_scale(const arrow::TimeUnit::type unit) {
|
46
57
|
switch (unit) {
|
47
58
|
case arrow::TimeUnit::SECOND:
|
48
59
|
return INT2FIX(1);
|
@@ -54,8 +65,31 @@ namespace red_arrow {
|
|
54
65
|
// NOTE: INT2FIX works for 1e+9 because: FIXNUM_MAX >= (1<<30) - 1 > 1e+9
|
55
66
|
return INT2FIX(1000 * 1000 * 1000);
|
56
67
|
default:
|
57
|
-
|
68
|
+
rb_raise(rb_eArgError, "invalid arrow::TimeUnit: %d", unit);
|
69
|
+
return Qnil;
|
70
|
+
}
|
71
|
+
}
|
72
|
+
|
73
|
+
// Maps an arrow::TimeUnit::type value to the corresponding
// red_arrow::ArrowTimeUnit* Ruby object.
//
// Raises a Ruby ArgumentError ("invalid arrow::TimeUnit: %d") for any
// value other than SECOND, MILLI, MICRO and NANO.
inline VALUE time_unit_to_enum(const arrow::TimeUnit::type unit) {
  if (unit == arrow::TimeUnit::SECOND) {
    return red_arrow::ArrowTimeUnitSECOND;
  }
  if (unit == arrow::TimeUnit::MILLI) {
    return red_arrow::ArrowTimeUnitMILLI;
  }
  if (unit == arrow::TimeUnit::MICRO) {
    return red_arrow::ArrowTimeUnitMICRO;
  }
  if (unit == arrow::TimeUnit::NANO) {
    return red_arrow::ArrowTimeUnitNANO;
  }
  rb_raise(rb_eArgError, "invalid arrow::TimeUnit: %d", unit);
  // Not expected to be reached once rb_raise() is called; kept so that
  // every path has an explicit return value.
  return Qnil;
}
|
88
|
+
|
89
|
+
inline void check_status(const arrow::Status&& status, const char* context) {
|
90
|
+
GError* error = nullptr;
|
91
|
+
if (!garrow_error_check(&error, status, context)) {
|
92
|
+
RG_RAISE_ERROR(error);
|
58
93
|
}
|
59
|
-
return Qnil;
|
60
94
|
}
|
61
95
|
}
|
@@ -0,0 +1,154 @@
|
|
1
|
+
/*
|
2
|
+
* Licensed to the Apache Software Foundation (ASF) under one
|
3
|
+
* or more contributor license agreements. See the NOTICE file
|
4
|
+
* distributed with this work for additional information
|
5
|
+
* regarding copyright ownership. The ASF licenses this file
|
6
|
+
* to you under the Apache License, Version 2.0 (the
|
7
|
+
* "License"); you may not use this file except in compliance
|
8
|
+
* with the License. You may obtain a copy of the License at
|
9
|
+
*
|
10
|
+
* http://www.apache.org/licenses/LICENSE-2.0
|
11
|
+
*
|
12
|
+
* Unless required by applicable law or agreed to in writing,
|
13
|
+
* software distributed under the License is distributed on an
|
14
|
+
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
15
|
+
* KIND, either express or implied. See the License for the
|
16
|
+
* specific language governing permissions and limitations
|
17
|
+
* under the License.
|
18
|
+
*/
|
19
|
+
|
20
|
+
#include "converters.hpp"
|
21
|
+
|
22
|
+
namespace red_arrow {
|
23
|
+
namespace {
|
24
|
+
class ValuesBuilder : private Converter, public arrow::ArrayVisitor {
|
25
|
+
public:
|
26
|
+
explicit ValuesBuilder(VALUE values)
|
27
|
+
: Converter(),
|
28
|
+
values_(values),
|
29
|
+
row_offset_(0) {
|
30
|
+
}
|
31
|
+
|
32
|
+
void build(const arrow::Array& array, VALUE rb_array) {
|
33
|
+
rb::protect([&] {
|
34
|
+
check_status(array.Accept(this),
|
35
|
+
"[array][values]");
|
36
|
+
return Qnil;
|
37
|
+
});
|
38
|
+
}
|
39
|
+
|
40
|
+
void build(const arrow::ChunkedArray& chunked_array,
|
41
|
+
VALUE rb_chunked_array) {
|
42
|
+
rb::protect([&] {
|
43
|
+
for (const auto& array : chunked_array.chunks()) {
|
44
|
+
check_status(array->Accept(this),
|
45
|
+
"[chunked-array][values]");
|
46
|
+
row_offset_ += array->length();
|
47
|
+
}
|
48
|
+
return Qnil;
|
49
|
+
});
|
50
|
+
}
|
51
|
+
|
52
|
+
#define VISIT(TYPE) \
|
53
|
+
arrow::Status Visit(const arrow::TYPE ## Array& array) override { \
|
54
|
+
convert(array); \
|
55
|
+
return arrow::Status::OK(); \
|
56
|
+
}
|
57
|
+
|
58
|
+
VISIT(Null)
|
59
|
+
VISIT(Boolean)
|
60
|
+
VISIT(Int8)
|
61
|
+
VISIT(Int16)
|
62
|
+
VISIT(Int32)
|
63
|
+
VISIT(Int64)
|
64
|
+
VISIT(UInt8)
|
65
|
+
VISIT(UInt16)
|
66
|
+
VISIT(UInt32)
|
67
|
+
VISIT(UInt64)
|
68
|
+
// TODO
|
69
|
+
// VISIT(HalfFloat)
|
70
|
+
VISIT(Float)
|
71
|
+
VISIT(Double)
|
72
|
+
VISIT(Binary)
|
73
|
+
VISIT(String)
|
74
|
+
VISIT(FixedSizeBinary)
|
75
|
+
VISIT(Date32)
|
76
|
+
VISIT(Date64)
|
77
|
+
VISIT(Time32)
|
78
|
+
VISIT(Time64)
|
79
|
+
VISIT(Timestamp)
|
80
|
+
// TODO
|
81
|
+
// VISIT(Interval)
|
82
|
+
VISIT(List)
|
83
|
+
VISIT(Struct)
|
84
|
+
VISIT(Union)
|
85
|
+
VISIT(Dictionary)
|
86
|
+
VISIT(Decimal128)
|
87
|
+
// TODO
|
88
|
+
// VISIT(Extension)
|
89
|
+
|
90
|
+
#undef VISIT
|
91
|
+
|
92
|
+
private:
|
93
|
+
template <typename ArrayType>
|
94
|
+
void convert(const ArrayType& array) {
|
95
|
+
const auto n = array.length();
|
96
|
+
if (array.null_count() > 0) {
|
97
|
+
for (int64_t i = 0, ii = row_offset_; i < n; ++i, ++ii) {
|
98
|
+
auto value = Qnil;
|
99
|
+
if (!array.IsNull(i)) {
|
100
|
+
value = convert_value(array, i);
|
101
|
+
}
|
102
|
+
rb_ary_store(values_, ii, value);
|
103
|
+
}
|
104
|
+
} else {
|
105
|
+
for (int64_t i = 0, ii = row_offset_; i < n; ++i, ++ii) {
|
106
|
+
rb_ary_store(values_, ii, convert_value(array, i));
|
107
|
+
}
|
108
|
+
}
|
109
|
+
}
|
110
|
+
|
111
|
+
// Destination for converted values.
|
112
|
+
VALUE values_;
|
113
|
+
|
114
|
+
// The current row offset.
|
115
|
+
int64_t row_offset_;
|
116
|
+
};
|
117
|
+
}
|
118
|
+
|
119
|
+
// Returns a new Ruby Array filled by ValuesBuilder with the converted
// elements of the Arrow array wrapped by `rb_array`.
VALUE
array_values(VALUE rb_array) {
  // A raw pointer is used on purpose, as in the original code: no owning
  // C++ object lives outside the try block below.
  const auto raw_array =
    garrow_array_get_raw(GARROW_ARRAY(RVAL2GOBJ(rb_array))).get();
  VALUE rb_values = rb_ary_new_capa(raw_array->length());

  try {
    ValuesBuilder values_builder(rb_values);
    values_builder.build(*raw_array, rb_array);
  } catch (rb::State& state) {
    // NOTE(review): presumably resumes the Ruby non-local exit captured
    // by rb::protect - confirm against the rb::State definition.
    state.jump();
  }

  return rb_values;
}
|
135
|
+
|
136
|
+
// Returns a new Ruby Array filled by ValuesBuilder with the converted
// elements of every chunk of the Arrow chunked array wrapped by
// `rb_chunked_array`.
VALUE
chunked_array_values(VALUE rb_chunked_array) {
  // A raw pointer is used on purpose, as in the original code: no owning
  // C++ object lives outside the try block below.
  const auto raw_chunked_array =
    garrow_chunked_array_get_raw(
      GARROW_CHUNKED_ARRAY(RVAL2GOBJ(rb_chunked_array))).get();
  VALUE rb_values = rb_ary_new_capa(raw_chunked_array->length());

  try {
    ValuesBuilder values_builder(rb_values);
    values_builder.build(*raw_chunked_array, rb_chunked_array);
  } catch (rb::State& state) {
    // NOTE(review): presumably resumes the Ruby non-local exit captured
    // by rb::protect - confirm against the rb::State definition.
    state.jump();
  }

  return rb_values;
}
|
154
|
+
}
|
data/lib/arrow/array-builder.rb
CHANGED
@@ -27,6 +27,7 @@ module Arrow
|
|
27
27
|
end
|
28
28
|
|
29
29
|
builder_class = nil
|
30
|
+
builder_class_arguments = []
|
30
31
|
values.each do |value|
|
31
32
|
case value
|
32
33
|
when nil
|
@@ -43,8 +44,29 @@ module Arrow
|
|
43
44
|
return builder.build(values)
|
44
45
|
else
|
45
46
|
builder_class = UIntArrayBuilder
|
47
|
+
builder_class_arguments = []
|
46
48
|
end
|
47
49
|
when Time
|
50
|
+
data_type = value.data_type
|
51
|
+
case data_type.unit
|
52
|
+
when TimeUnit::SECOND
|
53
|
+
if builder.nil?
|
54
|
+
builder = Time32ArrayBuilder
|
55
|
+
builder_class_arguments = [data_type]
|
56
|
+
end
|
57
|
+
when TimeUnit::MILLI
|
58
|
+
if builder != Time64ArrayBuilder
|
59
|
+
builder = Time32ArrayBuilder
|
60
|
+
builder_class_arguments = [data_type]
|
61
|
+
end
|
62
|
+
when TimeUnit::MICRO
|
63
|
+
builder = Time64ArrayBuilder
|
64
|
+
builder_class_arguments = [data_type]
|
65
|
+
when TimeUnit::NANO
|
66
|
+
builder = Time64ArrayBuilder.new(data_type)
|
67
|
+
return builder.build(values)
|
68
|
+
end
|
69
|
+
when ::Time
|
48
70
|
data_type = TimestampDataType.new(:nano)
|
49
71
|
builder = TimestampArrayBuilder.new(data_type)
|
50
72
|
return builder.build(values)
|
@@ -57,7 +79,8 @@ module Arrow
|
|
57
79
|
end
|
58
80
|
end
|
59
81
|
if builder_class
|
60
|
-
builder_class.new
|
82
|
+
builder = builder_class.new(*builder_class_arguments)
|
83
|
+
builder.build(values)
|
61
84
|
else
|
62
85
|
Arrow::StringArray.new(values)
|
63
86
|
end
|
data/lib/arrow/array.rb
CHANGED
data/lib/arrow/chunked-array.rb
CHANGED
@@ -0,0 +1,48 @@
|
|
1
|
+
# Licensed to the Apache Software Foundation (ASF) under one
|
2
|
+
# or more contributor license agreements. See the NOTICE file
|
3
|
+
# distributed with this work for additional information
|
4
|
+
# regarding copyright ownership. The ASF licenses this file
|
5
|
+
# to you under the Apache License, Version 2.0 (the
|
6
|
+
# "License"); you may not use this file except in compliance
|
7
|
+
# with the License. You may obtain a copy of the License at
|
8
|
+
#
|
9
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
10
|
+
#
|
11
|
+
# Unless required by applicable law or agreed to in writing,
|
12
|
+
# software distributed under the License is distributed on an
|
13
|
+
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
14
|
+
# KIND, either express or implied. See the License for the
|
15
|
+
# specific language governing permissions and limitations
|
16
|
+
# under the License.
|
17
|
+
|
18
|
+
module Arrow
  # Column access helpers for objects that respond to +schema+,
  # +n_columns+ and whatever +Column.new(self, index)+ requires.
  module ColumnContainable
    # Builds one Column per schema field on first use and memoizes the
    # resulting array.
    def columns
      @columns ||= (0...schema.n_fields).map do |position|
        Column.new(self, position)
      end
    end

    # Iterates over the memoized columns.
    def each_column(&block)
      columns.each(&block)
    end

    # Looks a column up by name (String or Symbol) or by position
    # (Integer; negative values count from the end).
    #
    # Returns a new Column, or nil when the name is unknown or the
    # position is out of range.
    #
    # Raises ArgumentError for any other argument type.
    def find_column(name_or_index)
      case name_or_index
      when String, Symbol
        position = schema.get_field_index(name_or_index.to_s)
        # -1 means that the schema has no field with this name.
        position == -1 ? nil : Column.new(self, position)
      when Integer
        position = name_or_index
        position += n_columns if position < 0
        return nil unless position >= 0 && position < n_columns
        Column.new(self, position)
      else
        raise ArgumentError,
              "column name or index must be String, Symbol or Integer"
      end
    end
  end
end
|
data/lib/arrow/column.rb
CHANGED
@@ -19,32 +19,58 @@ module Arrow
|
|
19
19
|
# Enumerable view of a single column of a container. The field comes from
# +container.schema[index]+ and the data from
# +container.get_column_data(index)+; every query below is delegated to
# one of the two.
class Column
  include Enumerable

  attr_reader :container, :field, :data

  # container: object responding to +schema+ and +get_column_data+.
  # index:     position of the column inside the container.
  def initialize(container, index)
    @container = container
    @index = index
    @field = container.schema[index]
    @data = container.get_column_data(index)
  end

  # Name of the underlying field.
  def name
    field.name
  end

  # Data type of the underlying field.
  def data_type
    field.data_type
  end

  def null?(i)
    data.null?(i)
  end

  def valid?(i)
    data.valid?(i)
  end

  def [](i)
    data[i]
  end

  def each(&block)
    data.each(&block)
  end

  def reverse_each(&block)
    data.reverse_each(&block)
  end

  def n_rows
    data.n_rows
  end
  alias size n_rows
  alias length n_rows

  def n_nulls
    data.n_nulls
  end

  # Two columns are equal when the other object is an instance of this
  # column's class and both field and data compare equal.
  def ==(other)
    return false unless other.is_a?(self.class)

    field == other.field && data == other.data
  end
end
|
50
76
|
end
|
data/lib/arrow/csv-loader.rb
CHANGED
@@ -221,7 +221,7 @@ module Arrow
|
|
221
221
|
field
|
222
222
|
else
|
223
223
|
begin
|
224
|
-
Time.iso8601(encoded_field)
|
224
|
+
::Time.iso8601(encoded_field)
|
225
225
|
rescue ArgumentError
|
226
226
|
field
|
227
227
|
end
|
@@ -317,7 +317,7 @@ module Arrow
|
|
317
317
|
if current_column_type == :integer
|
318
318
|
column_types[i] = candidate_type
|
319
319
|
end
|
320
|
-
when Time
|
320
|
+
when ::Time
|
321
321
|
candidate_type = :time
|
322
322
|
when DateTime
|
323
323
|
candidate_type = :date_time
|
data/lib/arrow/data-type.rb
CHANGED
@@ -29,24 +29,33 @@ module Arrow
|
|
29
29
|
#
|
30
30
|
# @return [Arrow::DataType] The given data type itself.
|
31
31
|
#
|
32
|
-
# @overload resolve(name
|
32
|
+
# @overload resolve(name)
|
33
33
|
#
|
34
34
|
# Creates a suitable data type from type name. For example,
|
35
35
|
# you can create {Arrow::BooleanDataType} from `:boolean`.
|
36
36
|
#
|
37
37
|
# @param name [String, Symbol] The type name of the data type.
|
38
38
|
#
|
39
|
-
# @
|
40
|
-
#
|
39
|
+
# @example Create a boolean data type
|
40
|
+
# Arrow::DataType.resolve(:boolean)
|
41
|
+
#
|
42
|
+
# @overload resolve(name_with_arguments)
|
43
|
+
#
|
44
|
+
# Creates a suitable data type from type name with arguments.
|
45
|
+
#
|
46
|
+
# @param name_with_arguments [::Array<String, ...>]
|
47
|
+
# The type name of the data type as the first element.
|
48
|
+
#
|
49
|
+
# The remaining elements are additional information for the data type.
|
41
50
|
#
|
42
51
|
# For example, {Arrow::TimestampDataType} needs unit as
|
43
52
|
# additional information.
|
44
53
|
#
|
45
54
|
# @example Create a boolean data type
|
46
|
-
# Arrow::DataType.resolve(:boolean)
|
55
|
+
# Arrow::DataType.resolve([:boolean])
|
47
56
|
#
|
48
57
|
# @example Create a milliseconds unit timestamp data type
|
49
|
-
# Arrow::DataType.resolve(:timestamp, :milli)
|
58
|
+
# Arrow::DataType.resolve([:timestamp, :milli])
|
50
59
|
#
|
51
60
|
# @overload resolve(description)
|
52
61
|
#
|
@@ -135,5 +144,13 @@ module Arrow
|
|
135
144
|
Arrow.const_get(data_type_class_name)
|
136
145
|
end
|
137
146
|
end
|
147
|
+
|
148
|
+
# Builds an array from +values+ with the array builder class whose name
# is derived from this data type's class name ("...DataType" is replaced
# by "...ArrayBuilder").
#
# The builder class is first asked, through +buildable?+, whether it can
# build from the values alone; when it cannot, this data type is passed
# as the leading argument.
def build_array(values)
  builder_name = "#{self.class.name.gsub(/DataType\z/, "")}ArrayBuilder"
  builder_class = self.class.const_get(builder_name)
  build_arguments = [values]
  unless builder_class.buildable?(build_arguments)
    build_arguments.unshift(self)
  end
  builder_class.build(*build_arguments)
end
|
138
155
|
end
|
139
156
|
end
|