duckdb 0.8.2-dev2068.0 → 0.8.2-dev2133.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/binding.gyp +4 -0
- package/package.json +1 -1
- package/src/duckdb/extension/json/buffered_json_reader.cpp +2 -0
- package/src/duckdb/extension/json/include/buffered_json_reader.hpp +5 -19
- package/src/duckdb/extension/json/include/json_enums.hpp +60 -0
- package/src/duckdb/extension/json/include/json_scan.hpp +14 -10
- package/src/duckdb/extension/json/include/json_transform.hpp +3 -0
- package/src/duckdb/extension/json/json_enums.cpp +105 -0
- package/src/duckdb/extension/json/json_functions/json_transform.cpp +2 -0
- package/src/duckdb/extension/json/json_scan.cpp +44 -0
- package/src/duckdb/extension/json/serialize_json.cpp +92 -0
- package/src/duckdb/extension/parquet/include/parquet_reader.hpp +3 -0
- package/src/duckdb/extension/parquet/parquet_extension.cpp +23 -0
- package/src/duckdb/extension/parquet/parquet_reader.cpp +3 -0
- package/src/duckdb/extension/parquet/serialize_parquet.cpp +26 -0
- package/src/duckdb/src/common/arrow/appender/bool_data.cpp +44 -0
- package/src/duckdb/src/common/arrow/appender/list_data.cpp +78 -0
- package/src/duckdb/src/common/arrow/appender/map_data.cpp +86 -0
- package/src/duckdb/src/common/arrow/appender/struct_data.cpp +45 -0
- package/src/duckdb/src/common/arrow/appender/union_data.cpp +70 -0
- package/src/duckdb/src/common/arrow/arrow_appender.cpp +89 -727
- package/src/duckdb/src/common/arrow/arrow_wrapper.cpp +2 -1
- package/src/duckdb/src/common/local_file_system.cpp +17 -14
- package/src/duckdb/src/common/serializer/format_serializer.cpp +15 -0
- package/src/duckdb/src/core_functions/aggregate/holistic/approximate_quantile.cpp +26 -0
- package/src/duckdb/src/core_functions/aggregate/holistic/quantile.cpp +47 -0
- package/src/duckdb/src/core_functions/aggregate/holistic/reservoir_quantile.cpp +28 -0
- package/src/duckdb/src/core_functions/scalar/date/strftime.cpp +10 -0
- package/src/duckdb/src/core_functions/scalar/list/list_lambdas.cpp +22 -3
- package/src/duckdb/src/function/aggregate/distributive/count.cpp +0 -11
- package/src/duckdb/src/function/aggregate/sorted_aggregate_function.cpp +1 -9
- package/src/duckdb/src/function/scalar/system/aggregate_export.cpp +27 -0
- package/src/duckdb/src/function/scalar_function.cpp +2 -1
- package/src/duckdb/src/function/table/read_csv.cpp +18 -0
- package/src/duckdb/src/function/table/table_scan.cpp +35 -0
- package/src/duckdb/src/function/table/version/pragma_version.cpp +2 -2
- package/src/duckdb/src/function/table_function.cpp +4 -3
- package/src/duckdb/src/include/duckdb/common/arrow/appender/append_data.hpp +109 -0
- package/src/duckdb/src/include/duckdb/common/arrow/appender/bool_data.hpp +15 -0
- package/src/duckdb/src/include/duckdb/common/arrow/appender/enum_data.hpp +69 -0
- package/src/duckdb/src/include/duckdb/common/arrow/appender/list.hpp +8 -0
- package/src/duckdb/src/include/duckdb/common/arrow/appender/list_data.hpp +18 -0
- package/src/duckdb/src/include/duckdb/common/arrow/appender/map_data.hpp +18 -0
- package/src/duckdb/src/include/duckdb/common/arrow/appender/scalar_data.hpp +88 -0
- package/src/duckdb/src/include/duckdb/common/arrow/appender/struct_data.hpp +18 -0
- package/src/duckdb/src/include/duckdb/common/arrow/appender/union_data.hpp +21 -0
- package/src/duckdb/src/include/duckdb/common/arrow/appender/varchar_data.hpp +105 -0
- package/src/duckdb/src/include/duckdb/common/arrow/arrow_appender.hpp +5 -0
- package/src/duckdb/src/include/duckdb/common/multi_file_reader.hpp +5 -1
- package/src/duckdb/src/include/duckdb/common/multi_file_reader_options.hpp +2 -0
- package/src/duckdb/src/include/duckdb/common/serializer/format_deserializer.hpp +32 -0
- package/src/duckdb/src/include/duckdb/common/serializer/format_serializer.hpp +45 -15
- package/src/duckdb/src/include/duckdb/common/serializer/serialization_traits.hpp +10 -0
- package/src/duckdb/src/include/duckdb/execution/operator/persistent/csv_reader_options.hpp +2 -0
- package/src/duckdb/src/include/duckdb/function/aggregate_function.hpp +11 -2
- package/src/duckdb/src/include/duckdb/function/function_serialization.hpp +81 -0
- package/src/duckdb/src/include/duckdb/function/scalar/strftime_format.hpp +8 -0
- package/src/duckdb/src/include/duckdb/function/scalar_function.hpp +8 -0
- package/src/duckdb/src/include/duckdb/function/table/read_csv.hpp +7 -0
- package/src/duckdb/src/include/duckdb/function/table_function.hpp +8 -0
- package/src/duckdb/src/include/duckdb/planner/expression/bound_aggregate_expression.hpp +3 -0
- package/src/duckdb/src/include/duckdb/planner/expression/bound_function_expression.hpp +4 -0
- package/src/duckdb/src/include/duckdb/planner/expression/bound_window_expression.hpp +3 -0
- package/src/duckdb/src/include/duckdb/planner/filter/conjunction_filter.hpp +4 -0
- package/src/duckdb/src/include/duckdb/planner/filter/constant_filter.hpp +2 -0
- package/src/duckdb/src/include/duckdb/planner/filter/null_filter.hpp +4 -0
- package/src/duckdb/src/include/duckdb/planner/operator/logical_copy_to_file.hpp +2 -0
- package/src/duckdb/src/include/duckdb/planner/operator/logical_get.hpp +7 -1
- package/src/duckdb/src/include/duckdb/planner/table_filter.hpp +7 -1
- package/src/duckdb/src/main/extension/extension_helper.cpp +13 -0
- package/src/duckdb/src/parallel/executor.cpp +1 -1
- package/src/duckdb/src/planner/expression/bound_aggregate_expression.cpp +23 -0
- package/src/duckdb/src/planner/expression/bound_function_expression.cpp +22 -0
- package/src/duckdb/src/planner/expression/bound_window_expression.cpp +47 -0
- package/src/duckdb/src/planner/operator/logical_copy_to_file.cpp +8 -0
- package/src/duckdb/src/planner/operator/logical_get.cpp +69 -0
- package/src/duckdb/src/storage/serialization/serialize_expression.cpp +9 -0
- package/src/duckdb/src/storage/serialization/serialize_logical_operator.cpp +6 -0
- package/src/duckdb/src/storage/serialization/serialize_nodes.cpp +190 -0
- package/src/duckdb/src/storage/serialization/serialize_table_filter.cpp +97 -0
- package/src/duckdb/ub_src_common_arrow_appender.cpp +10 -0
- package/src/duckdb/ub_src_common_serializer.cpp +2 -0
- package/src/duckdb/ub_src_storage_serialization.cpp +2 -0
@@ -5,53 +5,19 @@
|
|
5
5
|
#include "duckdb/common/types/interval.hpp"
|
6
6
|
#include "duckdb/common/types/uuid.hpp"
|
7
7
|
#include "duckdb/function/table/arrow.hpp"
|
8
|
+
#include "duckdb/common/arrow/appender/append_data.hpp"
|
9
|
+
#include "duckdb/common/arrow/appender/list.hpp"
|
8
10
|
|
9
11
|
namespace duckdb {
|
10
12
|
|
11
|
-
//===--------------------------------------------------------------------===//
|
12
|
-
// Arrow append data
|
13
|
-
//===--------------------------------------------------------------------===//
|
14
|
-
typedef void (*initialize_t)(ArrowAppendData &result, const LogicalType &type, idx_t capacity);
|
15
|
-
typedef void (*append_vector_t)(ArrowAppendData &append_data, Vector &input, idx_t from, idx_t to, idx_t input_size);
|
16
|
-
typedef void (*finalize_t)(ArrowAppendData &append_data, const LogicalType &type, ArrowArray *result);
|
17
|
-
|
18
|
-
struct ArrowAppendData {
|
19
|
-
explicit ArrowAppendData(ArrowOptions &options_p) : options(options_p) {
|
20
|
-
}
|
21
|
-
// the buffers of the arrow vector
|
22
|
-
ArrowBuffer validity;
|
23
|
-
ArrowBuffer main_buffer;
|
24
|
-
ArrowBuffer aux_buffer;
|
25
|
-
|
26
|
-
idx_t row_count = 0;
|
27
|
-
idx_t null_count = 0;
|
28
|
-
|
29
|
-
// function pointers for construction
|
30
|
-
initialize_t initialize = nullptr;
|
31
|
-
append_vector_t append_vector = nullptr;
|
32
|
-
finalize_t finalize = nullptr;
|
33
|
-
|
34
|
-
// child data (if any)
|
35
|
-
vector<unique_ptr<ArrowAppendData>> child_data;
|
36
|
-
|
37
|
-
// the arrow array C API data, only set after Finalize
|
38
|
-
unique_ptr<ArrowArray> array;
|
39
|
-
duckdb::array<const void *, 3> buffers = {{nullptr, nullptr, nullptr}};
|
40
|
-
vector<ArrowArray *> child_pointers;
|
41
|
-
|
42
|
-
ArrowOptions options;
|
43
|
-
};
|
44
|
-
|
45
13
|
//===--------------------------------------------------------------------===//
|
46
14
|
// ArrowAppender
|
47
15
|
//===--------------------------------------------------------------------===//
|
48
|
-
static unique_ptr<ArrowAppendData> InitializeArrowChild(const LogicalType &type, idx_t capacity, ArrowOptions &options);
|
49
|
-
static ArrowArray *FinalizeArrowChild(const LogicalType &type, ArrowAppendData &append_data);
|
50
16
|
|
51
17
|
ArrowAppender::ArrowAppender(vector<LogicalType> types_p, idx_t initial_capacity, ArrowOptions options)
|
52
18
|
: types(std::move(types_p)) {
|
53
19
|
for (auto &type : types) {
|
54
|
-
auto entry =
|
20
|
+
auto entry = ArrowAppender::InitializeChild(type, initial_capacity, options);
|
55
21
|
root_data.push_back(std::move(entry));
|
56
22
|
}
|
57
23
|
}
|
@@ -59,629 +25,87 @@ ArrowAppender::ArrowAppender(vector<LogicalType> types_p, idx_t initial_capacity
|
|
59
25
|
ArrowAppender::~ArrowAppender() {
|
60
26
|
}
|
61
27
|
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
}
|
69
|
-
|
70
|
-
static void UnsetBit(uint8_t *data, idx_t current_byte, uint8_t current_bit) {
|
71
|
-
data[current_byte] &= ~((uint64_t)1 << current_bit);
|
72
|
-
}
|
73
|
-
|
74
|
-
static void NextBit(idx_t &current_byte, uint8_t &current_bit) {
|
75
|
-
current_bit++;
|
76
|
-
if (current_bit == 8) {
|
77
|
-
current_byte++;
|
78
|
-
current_bit = 0;
|
28
|
+
//! Append a data chunk to the underlying arrow array
|
29
|
+
void ArrowAppender::Append(DataChunk &input, idx_t from, idx_t to, idx_t input_size) {
|
30
|
+
D_ASSERT(types == input.GetTypes());
|
31
|
+
D_ASSERT(to >= from);
|
32
|
+
for (idx_t i = 0; i < input.ColumnCount(); i++) {
|
33
|
+
root_data[i]->append_vector(*root_data[i], input.data[i], from, to, input_size);
|
79
34
|
}
|
35
|
+
row_count += to - from;
|
80
36
|
}
|
81
37
|
|
82
|
-
|
83
|
-
|
84
|
-
buffer.resize(byte_count, 0xFF);
|
85
|
-
}
|
86
|
-
|
87
|
-
static void SetNull(ArrowAppendData &append_data, uint8_t *validity_data, idx_t current_byte, uint8_t current_bit) {
|
88
|
-
UnsetBit(validity_data, current_byte, current_bit);
|
89
|
-
append_data.null_count++;
|
90
|
-
}
|
91
|
-
|
92
|
-
static void AppendValidity(ArrowAppendData &append_data, UnifiedVectorFormat &format, idx_t from, idx_t to) {
|
93
|
-
// resize the buffer, filling the validity buffer with all valid values
|
94
|
-
idx_t size = to - from;
|
95
|
-
ResizeValidity(append_data.validity, append_data.row_count + size);
|
96
|
-
if (format.validity.AllValid()) {
|
97
|
-
// if all values are valid we don't need to do anything else
|
38
|
+
void ArrowAppender::ReleaseArray(ArrowArray *array) {
|
39
|
+
if (!array || !array->release) {
|
98
40
|
return;
|
99
41
|
}
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
uint8_t current_bit;
|
104
|
-
idx_t current_byte;
|
105
|
-
GetBitPosition(append_data.row_count, current_byte, current_bit);
|
106
|
-
for (idx_t i = from; i < to; i++) {
|
107
|
-
auto source_idx = format.sel->get_index(i);
|
108
|
-
// append the validity mask
|
109
|
-
if (!format.validity.RowIsValid(source_idx)) {
|
110
|
-
SetNull(append_data, validity_data, current_byte, current_bit);
|
111
|
-
}
|
112
|
-
NextBit(current_byte, current_bit);
|
113
|
-
}
|
42
|
+
array->release = nullptr;
|
43
|
+
auto holder = static_cast<ArrowAppendData *>(array->private_data);
|
44
|
+
delete holder;
|
114
45
|
}
|
115
46
|
|
116
47
|
//===--------------------------------------------------------------------===//
|
117
|
-
//
|
118
|
-
//===--------------------------------------------------------------------===//
|
119
|
-
struct ArrowScalarConverter {
|
120
|
-
template <class TGT, class SRC>
|
121
|
-
static TGT Operation(SRC input) {
|
122
|
-
return input;
|
123
|
-
}
|
124
|
-
|
125
|
-
static bool SkipNulls() {
|
126
|
-
return false;
|
127
|
-
}
|
128
|
-
|
129
|
-
template <class TGT>
|
130
|
-
static void SetNull(TGT &value) {
|
131
|
-
}
|
132
|
-
};
|
133
|
-
|
134
|
-
struct ArrowIntervalConverter {
|
135
|
-
template <class TGT, class SRC>
|
136
|
-
static TGT Operation(SRC input) {
|
137
|
-
ArrowInterval result;
|
138
|
-
result.months = input.months;
|
139
|
-
result.days = input.days;
|
140
|
-
result.nanoseconds = input.micros * Interval::NANOS_PER_MICRO;
|
141
|
-
return result;
|
142
|
-
}
|
143
|
-
|
144
|
-
static bool SkipNulls() {
|
145
|
-
return true;
|
146
|
-
}
|
147
|
-
|
148
|
-
template <class TGT>
|
149
|
-
static void SetNull(TGT &value) {
|
150
|
-
}
|
151
|
-
};
|
152
|
-
|
153
|
-
template <class TGT, class SRC = TGT, class OP = ArrowScalarConverter>
|
154
|
-
struct ArrowScalarBaseData {
|
155
|
-
static void Append(ArrowAppendData &append_data, Vector &input, idx_t from, idx_t to, idx_t input_size) {
|
156
|
-
D_ASSERT(to >= from);
|
157
|
-
idx_t size = to - from;
|
158
|
-
D_ASSERT(size <= input_size);
|
159
|
-
UnifiedVectorFormat format;
|
160
|
-
input.ToUnifiedFormat(input_size, format);
|
161
|
-
|
162
|
-
// append the validity mask
|
163
|
-
AppendValidity(append_data, format, from, to);
|
164
|
-
|
165
|
-
// append the main data
|
166
|
-
append_data.main_buffer.resize(append_data.main_buffer.size() + sizeof(TGT) * size);
|
167
|
-
auto data = UnifiedVectorFormat::GetData<SRC>(format);
|
168
|
-
auto result_data = append_data.main_buffer.GetData<TGT>();
|
169
|
-
|
170
|
-
for (idx_t i = from; i < to; i++) {
|
171
|
-
auto source_idx = format.sel->get_index(i);
|
172
|
-
auto result_idx = append_data.row_count + i - from;
|
173
|
-
|
174
|
-
if (OP::SkipNulls() && !format.validity.RowIsValid(source_idx)) {
|
175
|
-
OP::template SetNull<TGT>(result_data[result_idx]);
|
176
|
-
continue;
|
177
|
-
}
|
178
|
-
result_data[result_idx] = OP::template Operation<TGT, SRC>(data[source_idx]);
|
179
|
-
}
|
180
|
-
append_data.row_count += size;
|
181
|
-
}
|
182
|
-
};
|
183
|
-
|
184
|
-
template <class TGT, class SRC = TGT, class OP = ArrowScalarConverter>
|
185
|
-
struct ArrowScalarData : public ArrowScalarBaseData<TGT, SRC, OP> {
|
186
|
-
static void Initialize(ArrowAppendData &result, const LogicalType &type, idx_t capacity) {
|
187
|
-
result.main_buffer.reserve(capacity * sizeof(TGT));
|
188
|
-
}
|
189
|
-
|
190
|
-
static void Finalize(ArrowAppendData &append_data, const LogicalType &type, ArrowArray *result) {
|
191
|
-
result->n_buffers = 2;
|
192
|
-
result->buffers[1] = append_data.main_buffer.data();
|
193
|
-
}
|
194
|
-
};
|
195
|
-
|
196
|
-
//===--------------------------------------------------------------------===//
|
197
|
-
// Enums
|
198
|
-
//===--------------------------------------------------------------------===//
|
199
|
-
template <class TGT>
|
200
|
-
struct ArrowEnumData : public ArrowScalarBaseData<TGT> {
|
201
|
-
static idx_t GetLength(string_t input) {
|
202
|
-
return input.GetSize();
|
203
|
-
}
|
204
|
-
static void WriteData(data_ptr_t target, string_t input) {
|
205
|
-
memcpy(target, input.GetData(), input.GetSize());
|
206
|
-
}
|
207
|
-
static void EnumAppendVector(ArrowAppendData &append_data, const Vector &input, idx_t size) {
|
208
|
-
D_ASSERT(input.GetVectorType() == VectorType::FLAT_VECTOR);
|
209
|
-
|
210
|
-
// resize the validity mask and set up the validity buffer for iteration
|
211
|
-
ResizeValidity(append_data.validity, append_data.row_count + size);
|
212
|
-
|
213
|
-
// resize the offset buffer - the offset buffer holds the offsets into the child array
|
214
|
-
append_data.main_buffer.resize(append_data.main_buffer.size() + sizeof(uint32_t) * (size + 1));
|
215
|
-
auto data = FlatVector::GetData<string_t>(input);
|
216
|
-
auto offset_data = append_data.main_buffer.GetData<uint32_t>();
|
217
|
-
if (append_data.row_count == 0) {
|
218
|
-
// first entry
|
219
|
-
offset_data[0] = 0;
|
220
|
-
}
|
221
|
-
// now append the string data to the auxiliary buffer
|
222
|
-
// the auxiliary buffer's length depends on the string lengths, so we resize as required
|
223
|
-
auto last_offset = offset_data[append_data.row_count];
|
224
|
-
for (idx_t i = 0; i < size; i++) {
|
225
|
-
auto offset_idx = append_data.row_count + i + 1;
|
226
|
-
|
227
|
-
auto string_length = GetLength(data[i]);
|
228
|
-
|
229
|
-
// append the offset data
|
230
|
-
auto current_offset = last_offset + string_length;
|
231
|
-
offset_data[offset_idx] = current_offset;
|
232
|
-
|
233
|
-
// resize the string buffer if required, and write the string data
|
234
|
-
append_data.aux_buffer.resize(current_offset);
|
235
|
-
WriteData(append_data.aux_buffer.data() + last_offset, data[i]);
|
236
|
-
|
237
|
-
last_offset = current_offset;
|
238
|
-
}
|
239
|
-
append_data.row_count += size;
|
240
|
-
}
|
241
|
-
static void Initialize(ArrowAppendData &result, const LogicalType &type, idx_t capacity) {
|
242
|
-
result.main_buffer.reserve(capacity * sizeof(TGT));
|
243
|
-
// construct the enum child data
|
244
|
-
auto enum_data = InitializeArrowChild(LogicalType::VARCHAR, EnumType::GetSize(type), result.options);
|
245
|
-
EnumAppendVector(*enum_data, EnumType::GetValuesInsertOrder(type), EnumType::GetSize(type));
|
246
|
-
result.child_data.push_back(std::move(enum_data));
|
247
|
-
}
|
248
|
-
|
249
|
-
static void Finalize(ArrowAppendData &append_data, const LogicalType &type, ArrowArray *result) {
|
250
|
-
result->n_buffers = 2;
|
251
|
-
result->buffers[1] = append_data.main_buffer.data();
|
252
|
-
// finalize the enum child data, and assign it to the dictionary
|
253
|
-
result->dictionary = FinalizeArrowChild(LogicalType::VARCHAR, *append_data.child_data[0]);
|
254
|
-
}
|
255
|
-
};
|
256
|
-
|
257
|
-
//===--------------------------------------------------------------------===//
|
258
|
-
// Boolean
|
259
|
-
//===--------------------------------------------------------------------===//
|
260
|
-
struct ArrowBoolData {
|
261
|
-
static void Initialize(ArrowAppendData &result, const LogicalType &type, idx_t capacity) {
|
262
|
-
auto byte_count = (capacity + 7) / 8;
|
263
|
-
result.main_buffer.reserve(byte_count);
|
264
|
-
}
|
265
|
-
|
266
|
-
static void Append(ArrowAppendData &append_data, Vector &input, idx_t from, idx_t to, idx_t input_size) {
|
267
|
-
idx_t size = to - from;
|
268
|
-
UnifiedVectorFormat format;
|
269
|
-
input.ToUnifiedFormat(input_size, format);
|
270
|
-
|
271
|
-
// we initialize both the validity and the bit set to 1's
|
272
|
-
ResizeValidity(append_data.validity, append_data.row_count + size);
|
273
|
-
ResizeValidity(append_data.main_buffer, append_data.row_count + size);
|
274
|
-
auto data = UnifiedVectorFormat::GetData<bool>(format);
|
275
|
-
|
276
|
-
auto result_data = append_data.main_buffer.GetData<uint8_t>();
|
277
|
-
auto validity_data = append_data.validity.GetData<uint8_t>();
|
278
|
-
uint8_t current_bit;
|
279
|
-
idx_t current_byte;
|
280
|
-
GetBitPosition(append_data.row_count, current_byte, current_bit);
|
281
|
-
for (idx_t i = from; i < to; i++) {
|
282
|
-
auto source_idx = format.sel->get_index(i);
|
283
|
-
// append the validity mask
|
284
|
-
if (!format.validity.RowIsValid(source_idx)) {
|
285
|
-
SetNull(append_data, validity_data, current_byte, current_bit);
|
286
|
-
} else if (!data[source_idx]) {
|
287
|
-
UnsetBit(result_data, current_byte, current_bit);
|
288
|
-
}
|
289
|
-
NextBit(current_byte, current_bit);
|
290
|
-
}
|
291
|
-
append_data.row_count += size;
|
292
|
-
}
|
293
|
-
|
294
|
-
static void Finalize(ArrowAppendData &append_data, const LogicalType &type, ArrowArray *result) {
|
295
|
-
result->n_buffers = 2;
|
296
|
-
result->buffers[1] = append_data.main_buffer.data();
|
297
|
-
}
|
298
|
-
};
|
299
|
-
|
300
|
-
//===--------------------------------------------------------------------===//
|
301
|
-
// Varchar
|
302
|
-
//===--------------------------------------------------------------------===//
|
303
|
-
struct ArrowVarcharConverter {
|
304
|
-
template <class SRC>
|
305
|
-
static idx_t GetLength(SRC input) {
|
306
|
-
return input.GetSize();
|
307
|
-
}
|
308
|
-
|
309
|
-
template <class SRC>
|
310
|
-
static void WriteData(data_ptr_t target, SRC input) {
|
311
|
-
memcpy(target, input.GetData(), input.GetSize());
|
312
|
-
}
|
313
|
-
};
|
314
|
-
|
315
|
-
struct ArrowUUIDConverter {
|
316
|
-
template <class SRC>
|
317
|
-
static idx_t GetLength(SRC input) {
|
318
|
-
return UUID::STRING_SIZE;
|
319
|
-
}
|
320
|
-
|
321
|
-
template <class SRC>
|
322
|
-
static void WriteData(data_ptr_t target, SRC input) {
|
323
|
-
UUID::ToString(input, char_ptr_cast(target));
|
324
|
-
}
|
325
|
-
};
|
326
|
-
|
327
|
-
template <class SRC = string_t, class OP = ArrowVarcharConverter, class BUFTYPE = uint64_t>
|
328
|
-
struct ArrowVarcharData {
|
329
|
-
static void Initialize(ArrowAppendData &result, const LogicalType &type, idx_t capacity) {
|
330
|
-
result.main_buffer.reserve((capacity + 1) * sizeof(BUFTYPE));
|
331
|
-
|
332
|
-
result.aux_buffer.reserve(capacity);
|
333
|
-
}
|
334
|
-
|
335
|
-
static void Append(ArrowAppendData &append_data, Vector &input, idx_t from, idx_t to, idx_t input_size) {
|
336
|
-
idx_t size = to - from;
|
337
|
-
UnifiedVectorFormat format;
|
338
|
-
input.ToUnifiedFormat(input_size, format);
|
339
|
-
|
340
|
-
// resize the validity mask and set up the validity buffer for iteration
|
341
|
-
ResizeValidity(append_data.validity, append_data.row_count + size);
|
342
|
-
auto validity_data = (uint8_t *)append_data.validity.data();
|
343
|
-
|
344
|
-
// resize the offset buffer - the offset buffer holds the offsets into the child array
|
345
|
-
append_data.main_buffer.resize(append_data.main_buffer.size() + sizeof(BUFTYPE) * (size + 1));
|
346
|
-
auto data = UnifiedVectorFormat::GetData<SRC>(format);
|
347
|
-
auto offset_data = append_data.main_buffer.GetData<BUFTYPE>();
|
348
|
-
if (append_data.row_count == 0) {
|
349
|
-
// first entry
|
350
|
-
offset_data[0] = 0;
|
351
|
-
}
|
352
|
-
// now append the string data to the auxiliary buffer
|
353
|
-
// the auxiliary buffer's length depends on the string lengths, so we resize as required
|
354
|
-
auto last_offset = offset_data[append_data.row_count];
|
355
|
-
idx_t max_offset = append_data.row_count + to - from;
|
356
|
-
if (max_offset > NumericLimits<uint32_t>::Maximum() &&
|
357
|
-
append_data.options.offset_size == ArrowOffsetSize::REGULAR) {
|
358
|
-
throw InvalidInputException("Arrow Appender: The maximum total string size for regular string buffers is "
|
359
|
-
"%u but the offset of %lu exceeds this.",
|
360
|
-
NumericLimits<uint32_t>::Maximum(), max_offset);
|
361
|
-
}
|
362
|
-
for (idx_t i = from; i < to; i++) {
|
363
|
-
auto source_idx = format.sel->get_index(i);
|
364
|
-
auto offset_idx = append_data.row_count + i + 1 - from;
|
365
|
-
|
366
|
-
if (!format.validity.RowIsValid(source_idx)) {
|
367
|
-
uint8_t current_bit;
|
368
|
-
idx_t current_byte;
|
369
|
-
GetBitPosition(append_data.row_count + i - from, current_byte, current_bit);
|
370
|
-
SetNull(append_data, validity_data, current_byte, current_bit);
|
371
|
-
offset_data[offset_idx] = last_offset;
|
372
|
-
continue;
|
373
|
-
}
|
374
|
-
|
375
|
-
auto string_length = OP::GetLength(data[source_idx]);
|
376
|
-
|
377
|
-
// append the offset data
|
378
|
-
auto current_offset = last_offset + string_length;
|
379
|
-
offset_data[offset_idx] = current_offset;
|
380
|
-
|
381
|
-
// resize the string buffer if required, and write the string data
|
382
|
-
append_data.aux_buffer.resize(current_offset);
|
383
|
-
OP::WriteData(append_data.aux_buffer.data() + last_offset, data[source_idx]);
|
384
|
-
|
385
|
-
last_offset = current_offset;
|
386
|
-
}
|
387
|
-
append_data.row_count += size;
|
388
|
-
}
|
389
|
-
|
390
|
-
static void Finalize(ArrowAppendData &append_data, const LogicalType &type, ArrowArray *result) {
|
391
|
-
result->n_buffers = 3;
|
392
|
-
result->buffers[1] = append_data.main_buffer.data();
|
393
|
-
result->buffers[2] = append_data.aux_buffer.data();
|
394
|
-
}
|
395
|
-
};
|
396
|
-
|
397
|
-
//===--------------------------------------------------------------------===//
|
398
|
-
// Unions
|
399
|
-
//===--------------------------------------------------------------------===//
|
400
|
-
/**
|
401
|
-
* Based on https://arrow.apache.org/docs/format/Columnar.html#union-layout &
|
402
|
-
* https://arrow.apache.org/docs/format/CDataInterface.html
|
403
|
-
*/
|
404
|
-
struct ArrowUnionData {
|
405
|
-
static void Initialize(ArrowAppendData &result, const LogicalType &type, idx_t capacity) {
|
406
|
-
result.main_buffer.reserve(capacity * sizeof(int8_t));
|
407
|
-
|
408
|
-
for (auto &child : UnionType::CopyMemberTypes(type)) {
|
409
|
-
auto child_buffer = InitializeArrowChild(child.second, capacity, result.options);
|
410
|
-
result.child_data.push_back(std::move(child_buffer));
|
411
|
-
}
|
412
|
-
}
|
413
|
-
|
414
|
-
static void Append(ArrowAppendData &append_data, Vector &input, idx_t from, idx_t to, idx_t input_size) {
|
415
|
-
UnifiedVectorFormat format;
|
416
|
-
input.ToUnifiedFormat(input_size, format);
|
417
|
-
idx_t size = to - from;
|
418
|
-
|
419
|
-
auto &types_buffer = append_data.main_buffer;
|
420
|
-
|
421
|
-
duckdb::vector<Vector> child_vectors;
|
422
|
-
for (const auto &child : UnionType::CopyMemberTypes(input.GetType())) {
|
423
|
-
child_vectors.emplace_back(child.second);
|
424
|
-
}
|
425
|
-
|
426
|
-
for (idx_t input_idx = from; input_idx < to; input_idx++) {
|
427
|
-
const auto &val = input.GetValue(input_idx);
|
428
|
-
|
429
|
-
idx_t tag = 0;
|
430
|
-
Value resolved_value(nullptr);
|
431
|
-
if (!val.IsNull()) {
|
432
|
-
tag = UnionValue::GetTag(val);
|
433
|
-
|
434
|
-
resolved_value = UnionValue::GetValue(val);
|
435
|
-
}
|
436
|
-
|
437
|
-
for (idx_t child_idx = 0; child_idx < child_vectors.size(); child_idx++) {
|
438
|
-
child_vectors[child_idx].SetValue(input_idx, child_idx == tag ? resolved_value : Value(nullptr));
|
439
|
-
}
|
440
|
-
|
441
|
-
types_buffer.data()[input_idx] = tag;
|
442
|
-
}
|
443
|
-
|
444
|
-
for (idx_t child_idx = 0; child_idx < child_vectors.size(); child_idx++) {
|
445
|
-
auto &child_buffer = append_data.child_data[child_idx];
|
446
|
-
auto &child = child_vectors[child_idx];
|
447
|
-
child_buffer->append_vector(*child_buffer, child, from, to, size);
|
448
|
-
}
|
449
|
-
append_data.row_count += size;
|
450
|
-
}
|
451
|
-
|
452
|
-
static void Finalize(ArrowAppendData &append_data, const LogicalType &type, ArrowArray *result) {
|
453
|
-
result->n_buffers = 2;
|
454
|
-
result->buffers[1] = append_data.main_buffer.data();
|
455
|
-
|
456
|
-
auto &child_types = UnionType::CopyMemberTypes(type);
|
457
|
-
append_data.child_pointers.resize(child_types.size());
|
458
|
-
result->children = append_data.child_pointers.data();
|
459
|
-
result->n_children = child_types.size();
|
460
|
-
for (idx_t i = 0; i < child_types.size(); i++) {
|
461
|
-
auto &child_type = child_types[i].second;
|
462
|
-
append_data.child_pointers[i] = FinalizeArrowChild(child_type, *append_data.child_data[i]);
|
463
|
-
}
|
464
|
-
}
|
465
|
-
};
|
466
|
-
|
467
|
-
//===--------------------------------------------------------------------===//
|
468
|
-
// Structs
|
48
|
+
// Finalize Arrow Child
|
469
49
|
//===--------------------------------------------------------------------===//
|
470
|
-
|
471
|
-
|
472
|
-
auto &children = StructType::GetChildTypes(type);
|
473
|
-
for (auto &child : children) {
|
474
|
-
auto child_buffer = InitializeArrowChild(child.second, capacity, result.options);
|
475
|
-
result.child_data.push_back(std::move(child_buffer));
|
476
|
-
}
|
477
|
-
}
|
478
|
-
|
479
|
-
static void Append(ArrowAppendData &append_data, Vector &input, idx_t from, idx_t to, idx_t input_size) {
|
480
|
-
UnifiedVectorFormat format;
|
481
|
-
input.ToUnifiedFormat(input_size, format);
|
482
|
-
idx_t size = to - from;
|
483
|
-
AppendValidity(append_data, format, from, to);
|
484
|
-
// append the children of the struct
|
485
|
-
auto &children = StructVector::GetEntries(input);
|
486
|
-
for (idx_t child_idx = 0; child_idx < children.size(); child_idx++) {
|
487
|
-
auto &child = children[child_idx];
|
488
|
-
auto &child_data = *append_data.child_data[child_idx];
|
489
|
-
child_data.append_vector(child_data, *child, from, to, size);
|
490
|
-
}
|
491
|
-
append_data.row_count += size;
|
492
|
-
}
|
493
|
-
|
494
|
-
static void Finalize(ArrowAppendData &append_data, const LogicalType &type, ArrowArray *result) {
|
495
|
-
result->n_buffers = 1;
|
50
|
+
ArrowArray *ArrowAppender::FinalizeChild(const LogicalType &type, ArrowAppendData &append_data) {
|
51
|
+
auto result = make_uniq<ArrowArray>();
|
496
52
|
|
497
|
-
|
498
|
-
|
499
|
-
|
500
|
-
|
501
|
-
|
502
|
-
|
503
|
-
|
504
|
-
|
505
|
-
|
506
|
-
|
53
|
+
result->private_data = nullptr;
|
54
|
+
result->release = ArrowAppender::ReleaseArray;
|
55
|
+
result->n_children = 0;
|
56
|
+
result->null_count = 0;
|
57
|
+
result->offset = 0;
|
58
|
+
result->dictionary = nullptr;
|
59
|
+
result->buffers = append_data.buffers.data();
|
60
|
+
result->null_count = append_data.null_count;
|
61
|
+
result->length = append_data.row_count;
|
62
|
+
result->buffers[0] = append_data.validity.data();
|
507
63
|
|
508
|
-
|
509
|
-
|
510
|
-
//===--------------------------------------------------------------------===//
|
511
|
-
void AppendListOffsets(ArrowAppendData &append_data, UnifiedVectorFormat &format, idx_t from, idx_t to,
|
512
|
-
vector<sel_t> &child_sel) {
|
513
|
-
// resize the offset buffer - the offset buffer holds the offsets into the child array
|
514
|
-
idx_t size = to - from;
|
515
|
-
append_data.main_buffer.resize(append_data.main_buffer.size() + sizeof(uint32_t) * (size + 1));
|
516
|
-
auto data = UnifiedVectorFormat::GetData<list_entry_t>(format);
|
517
|
-
auto offset_data = append_data.main_buffer.GetData<uint32_t>();
|
518
|
-
if (append_data.row_count == 0) {
|
519
|
-
// first entry
|
520
|
-
offset_data[0] = 0;
|
64
|
+
if (append_data.finalize) {
|
65
|
+
append_data.finalize(append_data, type, result.get());
|
521
66
|
}
|
522
|
-
// set up the offsets using the list entries
|
523
|
-
auto last_offset = offset_data[append_data.row_count];
|
524
|
-
for (idx_t i = from; i < to; i++) {
|
525
|
-
auto source_idx = format.sel->get_index(i);
|
526
|
-
auto offset_idx = append_data.row_count + i + 1 - from;
|
527
|
-
|
528
|
-
if (!format.validity.RowIsValid(source_idx)) {
|
529
|
-
offset_data[offset_idx] = last_offset;
|
530
|
-
continue;
|
531
|
-
}
|
532
|
-
|
533
|
-
// append the offset data
|
534
|
-
auto list_length = data[source_idx].length;
|
535
|
-
last_offset += list_length;
|
536
|
-
offset_data[offset_idx] = last_offset;
|
537
67
|
|
538
|
-
|
539
|
-
|
540
|
-
}
|
541
|
-
}
|
68
|
+
append_data.array = std::move(result);
|
69
|
+
return append_data.array.get();
|
542
70
|
}
|
543
71
|
|
544
|
-
|
545
|
-
|
546
|
-
|
547
|
-
|
548
|
-
auto child_buffer = InitializeArrowChild(child_type, capacity, result.options);
|
549
|
-
result.child_data.push_back(std::move(child_buffer));
|
550
|
-
}
|
551
|
-
|
552
|
-
static void Append(ArrowAppendData &append_data, Vector &input, idx_t from, idx_t to, idx_t input_size) {
|
553
|
-
UnifiedVectorFormat format;
|
554
|
-
input.ToUnifiedFormat(input_size, format);
|
555
|
-
idx_t size = to - from;
|
556
|
-
vector<sel_t> child_indices;
|
557
|
-
AppendValidity(append_data, format, from, to);
|
558
|
-
AppendListOffsets(append_data, format, from, to, child_indices);
|
559
|
-
|
560
|
-
// append the child vector of the list
|
561
|
-
SelectionVector child_sel(child_indices.data());
|
562
|
-
auto &child = ListVector::GetEntry(input);
|
563
|
-
auto child_size = child_indices.size();
|
564
|
-
Vector child_copy(child.GetType());
|
565
|
-
child_copy.Slice(child, child_sel, child_size);
|
566
|
-
append_data.child_data[0]->append_vector(*append_data.child_data[0], child_copy, 0, child_size, child_size);
|
567
|
-
append_data.row_count += size;
|
568
|
-
}
|
569
|
-
|
570
|
-
static void Finalize(ArrowAppendData &append_data, const LogicalType &type, ArrowArray *result) {
|
571
|
-
result->n_buffers = 2;
|
572
|
-
result->buffers[1] = append_data.main_buffer.data();
|
573
|
-
|
574
|
-
auto &child_type = ListType::GetChildType(type);
|
575
|
-
append_data.child_pointers.resize(1);
|
576
|
-
result->children = append_data.child_pointers.data();
|
577
|
-
result->n_children = 1;
|
578
|
-
append_data.child_pointers[0] = FinalizeArrowChild(child_type, *append_data.child_data[0]);
|
579
|
-
}
|
580
|
-
};
|
581
|
-
|
582
|
-
//===--------------------------------------------------------------------===//
|
583
|
-
// Maps
|
584
|
-
//===--------------------------------------------------------------------===//
|
585
|
-
struct ArrowMapData {
|
586
|
-
static void Initialize(ArrowAppendData &result, const LogicalType &type, idx_t capacity) {
|
587
|
-
// map types are stored in a (too) clever way
|
588
|
-
// the main buffer holds the null values and the offsets
|
589
|
-
// then we have a single child, which is a struct of the map_type, and the key_type
|
590
|
-
result.main_buffer.reserve((capacity + 1) * sizeof(uint32_t));
|
591
|
-
|
592
|
-
auto &key_type = MapType::KeyType(type);
|
593
|
-
auto &value_type = MapType::ValueType(type);
|
594
|
-
auto internal_struct = make_uniq<ArrowAppendData>(result.options);
|
595
|
-
internal_struct->child_data.push_back(InitializeArrowChild(key_type, capacity, result.options));
|
596
|
-
internal_struct->child_data.push_back(InitializeArrowChild(value_type, capacity, result.options));
|
597
|
-
|
598
|
-
result.child_data.push_back(std::move(internal_struct));
|
599
|
-
}
|
600
|
-
|
601
|
-
static void Append(ArrowAppendData &append_data, Vector &input, idx_t from, idx_t to, idx_t input_size) {
|
602
|
-
UnifiedVectorFormat format;
|
603
|
-
input.ToUnifiedFormat(input_size, format);
|
604
|
-
idx_t size = to - from;
|
605
|
-
AppendValidity(append_data, format, from, to);
|
606
|
-
vector<sel_t> child_indices;
|
607
|
-
AppendListOffsets(append_data, format, from, to, child_indices);
|
608
|
-
|
609
|
-
SelectionVector child_sel(child_indices.data());
|
610
|
-
auto &key_vector = MapVector::GetKeys(input);
|
611
|
-
auto &value_vector = MapVector::GetValues(input);
|
612
|
-
auto list_size = child_indices.size();
|
613
|
-
|
614
|
-
auto &struct_data = *append_data.child_data[0];
|
615
|
-
auto &key_data = *struct_data.child_data[0];
|
616
|
-
auto &value_data = *struct_data.child_data[1];
|
617
|
-
|
618
|
-
if (size != input_size) {
|
619
|
-
// Let's avoid doing this
|
620
|
-
Vector key_vector_copy(key_vector.GetType());
|
621
|
-
key_vector_copy.Slice(key_vector, child_sel, list_size);
|
622
|
-
Vector value_vector_copy(value_vector.GetType());
|
623
|
-
value_vector_copy.Slice(value_vector, child_sel, list_size);
|
624
|
-
key_data.append_vector(key_data, key_vector_copy, 0, list_size, list_size);
|
625
|
-
value_data.append_vector(value_data, value_vector_copy, 0, list_size, list_size);
|
626
|
-
} else {
|
627
|
-
// We don't care about the vector, slice it
|
628
|
-
key_vector.Slice(child_sel, list_size);
|
629
|
-
value_vector.Slice(child_sel, list_size);
|
630
|
-
key_data.append_vector(key_data, key_vector, 0, list_size, list_size);
|
631
|
-
value_data.append_vector(value_data, value_vector, 0, list_size, list_size);
|
632
|
-
}
|
633
|
-
|
634
|
-
append_data.row_count += size;
|
635
|
-
struct_data.row_count += size;
|
636
|
-
}
|
637
|
-
|
638
|
-
static void Finalize(ArrowAppendData &append_data, const LogicalType &type, ArrowArray *result) {
|
639
|
-
// set up the main map buffer
|
640
|
-
result->n_buffers = 2;
|
641
|
-
result->buffers[1] = append_data.main_buffer.data();
|
642
|
-
|
643
|
-
// the main map buffer has a single child: a struct
|
644
|
-
append_data.child_pointers.resize(1);
|
645
|
-
result->children = append_data.child_pointers.data();
|
646
|
-
result->n_children = 1;
|
647
|
-
append_data.child_pointers[0] = FinalizeArrowChild(type, *append_data.child_data[0]);
|
648
|
-
|
649
|
-
// now that struct has two children: the key and the value type
|
650
|
-
auto &struct_data = *append_data.child_data[0];
|
651
|
-
auto &struct_result = append_data.child_pointers[0];
|
652
|
-
struct_data.child_pointers.resize(2);
|
653
|
-
struct_result->n_buffers = 1;
|
654
|
-
struct_result->n_children = 2;
|
655
|
-
struct_result->length = struct_data.child_data[0]->row_count;
|
656
|
-
struct_result->children = struct_data.child_pointers.data();
|
72
|
+
//! Returns the underlying arrow array
|
73
|
+
ArrowArray ArrowAppender::Finalize() {
|
74
|
+
D_ASSERT(root_data.size() == types.size());
|
75
|
+
auto root_holder = make_uniq<ArrowAppendData>(options);
|
657
76
|
|
658
|
-
|
77
|
+
ArrowArray result;
|
78
|
+
root_holder->child_pointers.resize(types.size());
|
79
|
+
result.children = root_holder->child_pointers.data();
|
80
|
+
result.n_children = types.size();
|
659
81
|
|
660
|
-
|
661
|
-
|
662
|
-
|
663
|
-
|
82
|
+
// Configure root array
|
83
|
+
result.length = row_count;
|
84
|
+
result.n_buffers = 1;
|
85
|
+
result.buffers = root_holder->buffers.data(); // there is no actual buffer there since we don't have NULLs
|
86
|
+
result.offset = 0;
|
87
|
+
result.null_count = 0; // needs to be 0
|
88
|
+
result.dictionary = nullptr;
|
89
|
+
root_holder->child_data = std::move(root_data);
|
664
90
|
|
665
|
-
|
666
|
-
|
667
|
-
|
668
|
-
|
91
|
+
// FIXME: this violates a property of the arrow format, if root owns all the child memory then consumers can't move
|
92
|
+
// child arrays https://arrow.apache.org/docs/format/CDataInterface.html#moving-child-arrays
|
93
|
+
for (idx_t i = 0; i < root_holder->child_data.size(); i++) {
|
94
|
+
root_holder->child_pointers[i] = ArrowAppender::FinalizeChild(types[i], *root_holder->child_data[i]);
|
669
95
|
}
|
670
|
-
};
|
671
96
|
|
672
|
-
|
673
|
-
|
674
|
-
|
675
|
-
|
676
|
-
root_data[i]->append_vector(*root_data[i], input.data[i], from, to, input_size);
|
677
|
-
}
|
678
|
-
row_count += to - from;
|
97
|
+
// Release ownership to caller
|
98
|
+
result.private_data = root_holder.release();
|
99
|
+
result.release = ArrowAppender::ReleaseArray;
|
100
|
+
return result;
|
679
101
|
}
|
102
|
+
|
680
103
|
//===--------------------------------------------------------------------===//
|
681
104
|
// Initialize Arrow Child
|
682
105
|
//===--------------------------------------------------------------------===//
|
106
|
+
|
683
107
|
template <class OP>
|
684
|
-
static void
|
108
|
+
static void InitializeAppenderForType(ArrowAppendData &append_data) {
|
685
109
|
append_data.initialize = OP::Initialize;
|
686
110
|
append_data.append_vector = OP::Append;
|
687
111
|
append_data.finalize = OP::Finalize;
|
@@ -691,17 +115,17 @@ static void InitializeFunctionPointers(ArrowAppendData &append_data, const Logic
|
|
691
115
|
// handle special logical types
|
692
116
|
switch (type.id()) {
|
693
117
|
case LogicalTypeId::BOOLEAN:
|
694
|
-
|
118
|
+
InitializeAppenderForType<ArrowBoolData>(append_data);
|
695
119
|
break;
|
696
120
|
case LogicalTypeId::TINYINT:
|
697
|
-
|
121
|
+
InitializeAppenderForType<ArrowScalarData<int8_t>>(append_data);
|
698
122
|
break;
|
699
123
|
case LogicalTypeId::SMALLINT:
|
700
|
-
|
124
|
+
InitializeAppenderForType<ArrowScalarData<int16_t>>(append_data);
|
701
125
|
break;
|
702
126
|
case LogicalTypeId::DATE:
|
703
127
|
case LogicalTypeId::INTEGER:
|
704
|
-
|
128
|
+
InitializeAppenderForType<ArrowScalarData<int32_t>>(append_data);
|
705
129
|
break;
|
706
130
|
case LogicalTypeId::TIME:
|
707
131
|
case LogicalTypeId::TIMESTAMP_SEC:
|
@@ -711,42 +135,42 @@ static void InitializeFunctionPointers(ArrowAppendData &append_data, const Logic
|
|
711
135
|
case LogicalTypeId::TIMESTAMP_TZ:
|
712
136
|
case LogicalTypeId::TIME_TZ:
|
713
137
|
case LogicalTypeId::BIGINT:
|
714
|
-
|
138
|
+
InitializeAppenderForType<ArrowScalarData<int64_t>>(append_data);
|
715
139
|
break;
|
716
140
|
case LogicalTypeId::HUGEINT:
|
717
|
-
|
141
|
+
InitializeAppenderForType<ArrowScalarData<hugeint_t>>(append_data);
|
718
142
|
break;
|
719
143
|
case LogicalTypeId::UTINYINT:
|
720
|
-
|
144
|
+
InitializeAppenderForType<ArrowScalarData<uint8_t>>(append_data);
|
721
145
|
break;
|
722
146
|
case LogicalTypeId::USMALLINT:
|
723
|
-
|
147
|
+
InitializeAppenderForType<ArrowScalarData<uint16_t>>(append_data);
|
724
148
|
break;
|
725
149
|
case LogicalTypeId::UINTEGER:
|
726
|
-
|
150
|
+
InitializeAppenderForType<ArrowScalarData<uint32_t>>(append_data);
|
727
151
|
break;
|
728
152
|
case LogicalTypeId::UBIGINT:
|
729
|
-
|
153
|
+
InitializeAppenderForType<ArrowScalarData<uint64_t>>(append_data);
|
730
154
|
break;
|
731
155
|
case LogicalTypeId::FLOAT:
|
732
|
-
|
156
|
+
InitializeAppenderForType<ArrowScalarData<float>>(append_data);
|
733
157
|
break;
|
734
158
|
case LogicalTypeId::DOUBLE:
|
735
|
-
|
159
|
+
InitializeAppenderForType<ArrowScalarData<double>>(append_data);
|
736
160
|
break;
|
737
161
|
case LogicalTypeId::DECIMAL:
|
738
162
|
switch (type.InternalType()) {
|
739
163
|
case PhysicalType::INT16:
|
740
|
-
|
164
|
+
InitializeAppenderForType<ArrowScalarData<hugeint_t, int16_t>>(append_data);
|
741
165
|
break;
|
742
166
|
case PhysicalType::INT32:
|
743
|
-
|
167
|
+
InitializeAppenderForType<ArrowScalarData<hugeint_t, int32_t>>(append_data);
|
744
168
|
break;
|
745
169
|
case PhysicalType::INT64:
|
746
|
-
|
170
|
+
InitializeAppenderForType<ArrowScalarData<hugeint_t, int64_t>>(append_data);
|
747
171
|
break;
|
748
172
|
case PhysicalType::INT128:
|
749
|
-
|
173
|
+
InitializeAppenderForType<ArrowScalarData<hugeint_t>>(append_data);
|
750
174
|
break;
|
751
175
|
default:
|
752
176
|
throw InternalException("Unsupported internal decimal type");
|
@@ -756,54 +180,55 @@ static void InitializeFunctionPointers(ArrowAppendData &append_data, const Logic
|
|
756
180
|
case LogicalTypeId::BLOB:
|
757
181
|
case LogicalTypeId::BIT:
|
758
182
|
if (append_data.options.offset_size == ArrowOffsetSize::LARGE) {
|
759
|
-
|
183
|
+
InitializeAppenderForType<ArrowVarcharData<string_t>>(append_data);
|
760
184
|
} else {
|
761
|
-
|
185
|
+
InitializeAppenderForType<ArrowVarcharData<string_t, ArrowVarcharConverter, uint32_t>>(append_data);
|
762
186
|
}
|
763
187
|
break;
|
764
188
|
case LogicalTypeId::UUID:
|
765
189
|
if (append_data.options.offset_size == ArrowOffsetSize::LARGE) {
|
766
|
-
|
190
|
+
InitializeAppenderForType<ArrowVarcharData<hugeint_t, ArrowUUIDConverter>>(append_data);
|
767
191
|
} else {
|
768
|
-
|
192
|
+
InitializeAppenderForType<ArrowVarcharData<hugeint_t, ArrowUUIDConverter, uint32_t>>(append_data);
|
769
193
|
}
|
770
194
|
break;
|
771
195
|
case LogicalTypeId::ENUM:
|
772
196
|
switch (type.InternalType()) {
|
773
197
|
case PhysicalType::UINT8:
|
774
|
-
|
198
|
+
InitializeAppenderForType<ArrowEnumData<uint8_t>>(append_data);
|
775
199
|
break;
|
776
200
|
case PhysicalType::UINT16:
|
777
|
-
|
201
|
+
InitializeAppenderForType<ArrowEnumData<uint16_t>>(append_data);
|
778
202
|
break;
|
779
203
|
case PhysicalType::UINT32:
|
780
|
-
|
204
|
+
InitializeAppenderForType<ArrowEnumData<uint32_t>>(append_data);
|
781
205
|
break;
|
782
206
|
default:
|
783
207
|
throw InternalException("Unsupported internal enum type");
|
784
208
|
}
|
785
209
|
break;
|
786
210
|
case LogicalTypeId::INTERVAL:
|
787
|
-
|
211
|
+
InitializeAppenderForType<ArrowScalarData<ArrowInterval, interval_t, ArrowIntervalConverter>>(append_data);
|
788
212
|
break;
|
789
213
|
case LogicalTypeId::UNION:
|
790
|
-
|
214
|
+
InitializeAppenderForType<ArrowUnionData>(append_data);
|
791
215
|
break;
|
792
216
|
case LogicalTypeId::STRUCT:
|
793
|
-
|
217
|
+
InitializeAppenderForType<ArrowStructData>(append_data);
|
794
218
|
break;
|
795
219
|
case LogicalTypeId::LIST:
|
796
|
-
|
220
|
+
InitializeAppenderForType<ArrowListData>(append_data);
|
797
221
|
break;
|
798
222
|
case LogicalTypeId::MAP:
|
799
|
-
|
223
|
+
InitializeAppenderForType<ArrowMapData>(append_data);
|
800
224
|
break;
|
801
225
|
default:
|
802
226
|
throw NotImplementedException("Unsupported type in DuckDB -> Arrow Conversion: %s\n", type.ToString());
|
803
227
|
}
|
804
228
|
}
|
805
229
|
|
806
|
-
unique_ptr<ArrowAppendData>
|
230
|
+
unique_ptr<ArrowAppendData> ArrowAppender::InitializeChild(const LogicalType &type, idx_t capacity,
|
231
|
+
ArrowOptions &options) {
|
807
232
|
auto result = make_uniq<ArrowAppendData>(options);
|
808
233
|
InitializeFunctionPointers(*result, type);
|
809
234
|
|
@@ -813,67 +238,4 @@ unique_ptr<ArrowAppendData> InitializeArrowChild(const LogicalType &type, idx_t
|
|
813
238
|
return result;
|
814
239
|
}
|
815
240
|
|
816
|
-
static void ReleaseDuckDBArrowAppendArray(ArrowArray *array) {
|
817
|
-
if (!array || !array->release) {
|
818
|
-
return;
|
819
|
-
}
|
820
|
-
array->release = nullptr;
|
821
|
-
auto holder = static_cast<ArrowAppendData *>(array->private_data);
|
822
|
-
delete holder;
|
823
|
-
}
|
824
|
-
|
825
|
-
//===--------------------------------------------------------------------===//
|
826
|
-
// Finalize Arrow Child
|
827
|
-
//===--------------------------------------------------------------------===//
|
828
|
-
ArrowArray *FinalizeArrowChild(const LogicalType &type, ArrowAppendData &append_data) {
|
829
|
-
auto result = make_uniq<ArrowArray>();
|
830
|
-
|
831
|
-
result->private_data = nullptr;
|
832
|
-
result->release = ReleaseDuckDBArrowAppendArray;
|
833
|
-
result->n_children = 0;
|
834
|
-
result->null_count = 0;
|
835
|
-
result->offset = 0;
|
836
|
-
result->dictionary = nullptr;
|
837
|
-
result->buffers = append_data.buffers.data();
|
838
|
-
result->null_count = append_data.null_count;
|
839
|
-
result->length = append_data.row_count;
|
840
|
-
result->buffers[0] = append_data.validity.data();
|
841
|
-
|
842
|
-
if (append_data.finalize) {
|
843
|
-
append_data.finalize(append_data, type, result.get());
|
844
|
-
}
|
845
|
-
|
846
|
-
append_data.array = std::move(result);
|
847
|
-
return append_data.array.get();
|
848
|
-
}
|
849
|
-
|
850
|
-
//! Returns the underlying arrow array
|
851
|
-
ArrowArray ArrowAppender::Finalize() {
|
852
|
-
D_ASSERT(root_data.size() == types.size());
|
853
|
-
auto root_holder = make_uniq<ArrowAppendData>(options);
|
854
|
-
|
855
|
-
ArrowArray result;
|
856
|
-
root_holder->child_pointers.resize(types.size());
|
857
|
-
result.children = root_holder->child_pointers.data();
|
858
|
-
result.n_children = types.size();
|
859
|
-
|
860
|
-
// Configure root array
|
861
|
-
result.length = row_count;
|
862
|
-
result.n_buffers = 1;
|
863
|
-
result.buffers = root_holder->buffers.data(); // there is no actual buffer there since we don't have NULLs
|
864
|
-
result.offset = 0;
|
865
|
-
result.null_count = 0; // needs to be 0
|
866
|
-
result.dictionary = nullptr;
|
867
|
-
root_holder->child_data = std::move(root_data);
|
868
|
-
|
869
|
-
for (idx_t i = 0; i < root_holder->child_data.size(); i++) {
|
870
|
-
root_holder->child_pointers[i] = FinalizeArrowChild(types[i], *root_holder->child_data[i]);
|
871
|
-
}
|
872
|
-
|
873
|
-
// Release ownership to caller
|
874
|
-
result.private_data = root_holder.release();
|
875
|
-
result.release = ReleaseDuckDBArrowAppendArray;
|
876
|
-
return result;
|
877
|
-
}
|
878
|
-
|
879
241
|
} // namespace duckdb
|