duckdb 0.8.2-dev2068.0 → 0.8.2-dev2090.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (23) hide show
  1. package/binding.gyp +1 -0
  2. package/package.json +1 -1
  3. package/src/duckdb/src/common/arrow/appender/bool_data.cpp +44 -0
  4. package/src/duckdb/src/common/arrow/appender/list_data.cpp +78 -0
  5. package/src/duckdb/src/common/arrow/appender/map_data.cpp +86 -0
  6. package/src/duckdb/src/common/arrow/appender/struct_data.cpp +45 -0
  7. package/src/duckdb/src/common/arrow/appender/union_data.cpp +70 -0
  8. package/src/duckdb/src/common/arrow/arrow_appender.cpp +89 -727
  9. package/src/duckdb/src/common/arrow/arrow_wrapper.cpp +2 -1
  10. package/src/duckdb/src/function/table/version/pragma_version.cpp +2 -2
  11. package/src/duckdb/src/include/duckdb/common/arrow/appender/append_data.hpp +109 -0
  12. package/src/duckdb/src/include/duckdb/common/arrow/appender/bool_data.hpp +15 -0
  13. package/src/duckdb/src/include/duckdb/common/arrow/appender/enum_data.hpp +69 -0
  14. package/src/duckdb/src/include/duckdb/common/arrow/appender/list.hpp +8 -0
  15. package/src/duckdb/src/include/duckdb/common/arrow/appender/list_data.hpp +18 -0
  16. package/src/duckdb/src/include/duckdb/common/arrow/appender/map_data.hpp +18 -0
  17. package/src/duckdb/src/include/duckdb/common/arrow/appender/scalar_data.hpp +88 -0
  18. package/src/duckdb/src/include/duckdb/common/arrow/appender/struct_data.hpp +18 -0
  19. package/src/duckdb/src/include/duckdb/common/arrow/appender/union_data.hpp +21 -0
  20. package/src/duckdb/src/include/duckdb/common/arrow/appender/varchar_data.hpp +105 -0
  21. package/src/duckdb/src/include/duckdb/common/arrow/arrow_appender.hpp +5 -0
  22. package/src/duckdb/src/parallel/executor.cpp +1 -1
  23. package/src/duckdb/ub_src_common_arrow_appender.cpp +10 -0
@@ -5,53 +5,19 @@
5
5
  #include "duckdb/common/types/interval.hpp"
6
6
  #include "duckdb/common/types/uuid.hpp"
7
7
  #include "duckdb/function/table/arrow.hpp"
8
+ #include "duckdb/common/arrow/appender/append_data.hpp"
9
+ #include "duckdb/common/arrow/appender/list.hpp"
8
10
 
9
11
  namespace duckdb {
10
12
 
11
- //===--------------------------------------------------------------------===//
12
- // Arrow append data
13
- //===--------------------------------------------------------------------===//
14
- typedef void (*initialize_t)(ArrowAppendData &result, const LogicalType &type, idx_t capacity);
15
- typedef void (*append_vector_t)(ArrowAppendData &append_data, Vector &input, idx_t from, idx_t to, idx_t input_size);
16
- typedef void (*finalize_t)(ArrowAppendData &append_data, const LogicalType &type, ArrowArray *result);
17
-
18
- struct ArrowAppendData {
19
- explicit ArrowAppendData(ArrowOptions &options_p) : options(options_p) {
20
- }
21
- // the buffers of the arrow vector
22
- ArrowBuffer validity;
23
- ArrowBuffer main_buffer;
24
- ArrowBuffer aux_buffer;
25
-
26
- idx_t row_count = 0;
27
- idx_t null_count = 0;
28
-
29
- // function pointers for construction
30
- initialize_t initialize = nullptr;
31
- append_vector_t append_vector = nullptr;
32
- finalize_t finalize = nullptr;
33
-
34
- // child data (if any)
35
- vector<unique_ptr<ArrowAppendData>> child_data;
36
-
37
- // the arrow array C API data, only set after Finalize
38
- unique_ptr<ArrowArray> array;
39
- duckdb::array<const void *, 3> buffers = {{nullptr, nullptr, nullptr}};
40
- vector<ArrowArray *> child_pointers;
41
-
42
- ArrowOptions options;
43
- };
44
-
45
13
  //===--------------------------------------------------------------------===//
46
14
  // ArrowAppender
47
15
  //===--------------------------------------------------------------------===//
48
- static unique_ptr<ArrowAppendData> InitializeArrowChild(const LogicalType &type, idx_t capacity, ArrowOptions &options);
49
- static ArrowArray *FinalizeArrowChild(const LogicalType &type, ArrowAppendData &append_data);
50
16
 
51
17
  ArrowAppender::ArrowAppender(vector<LogicalType> types_p, idx_t initial_capacity, ArrowOptions options)
52
18
  : types(std::move(types_p)) {
53
19
  for (auto &type : types) {
54
- auto entry = InitializeArrowChild(type, initial_capacity, options);
20
+ auto entry = ArrowAppender::InitializeChild(type, initial_capacity, options);
55
21
  root_data.push_back(std::move(entry));
56
22
  }
57
23
  }
@@ -59,629 +25,87 @@ ArrowAppender::ArrowAppender(vector<LogicalType> types_p, idx_t initial_capacity
59
25
  ArrowAppender::~ArrowAppender() {
60
26
  }
61
27
 
62
- //===--------------------------------------------------------------------===//
63
- // Append Helper Functions
64
- //===--------------------------------------------------------------------===//
65
- static void GetBitPosition(idx_t row_idx, idx_t &current_byte, uint8_t &current_bit) {
66
- current_byte = row_idx / 8;
67
- current_bit = row_idx % 8;
68
- }
69
-
70
- static void UnsetBit(uint8_t *data, idx_t current_byte, uint8_t current_bit) {
71
- data[current_byte] &= ~((uint64_t)1 << current_bit);
72
- }
73
-
74
- static void NextBit(idx_t &current_byte, uint8_t &current_bit) {
75
- current_bit++;
76
- if (current_bit == 8) {
77
- current_byte++;
78
- current_bit = 0;
28
+ //! Append a data chunk to the underlying arrow array
29
+ void ArrowAppender::Append(DataChunk &input, idx_t from, idx_t to, idx_t input_size) {
30
+ D_ASSERT(types == input.GetTypes());
31
+ D_ASSERT(to >= from);
32
+ for (idx_t i = 0; i < input.ColumnCount(); i++) {
33
+ root_data[i]->append_vector(*root_data[i], input.data[i], from, to, input_size);
79
34
  }
35
+ row_count += to - from;
80
36
  }
81
37
 
82
- static void ResizeValidity(ArrowBuffer &buffer, idx_t row_count) {
83
- auto byte_count = (row_count + 7) / 8;
84
- buffer.resize(byte_count, 0xFF);
85
- }
86
-
87
- static void SetNull(ArrowAppendData &append_data, uint8_t *validity_data, idx_t current_byte, uint8_t current_bit) {
88
- UnsetBit(validity_data, current_byte, current_bit);
89
- append_data.null_count++;
90
- }
91
-
92
- static void AppendValidity(ArrowAppendData &append_data, UnifiedVectorFormat &format, idx_t from, idx_t to) {
93
- // resize the buffer, filling the validity buffer with all valid values
94
- idx_t size = to - from;
95
- ResizeValidity(append_data.validity, append_data.row_count + size);
96
- if (format.validity.AllValid()) {
97
- // if all values are valid we don't need to do anything else
38
+ void ArrowAppender::ReleaseArray(ArrowArray *array) {
39
+ if (!array || !array->release) {
98
40
  return;
99
41
  }
100
-
101
- // otherwise we iterate through the validity mask
102
- auto validity_data = (uint8_t *)append_data.validity.data();
103
- uint8_t current_bit;
104
- idx_t current_byte;
105
- GetBitPosition(append_data.row_count, current_byte, current_bit);
106
- for (idx_t i = from; i < to; i++) {
107
- auto source_idx = format.sel->get_index(i);
108
- // append the validity mask
109
- if (!format.validity.RowIsValid(source_idx)) {
110
- SetNull(append_data, validity_data, current_byte, current_bit);
111
- }
112
- NextBit(current_byte, current_bit);
113
- }
42
+ array->release = nullptr;
43
+ auto holder = static_cast<ArrowAppendData *>(array->private_data);
44
+ delete holder;
114
45
  }
115
46
 
116
47
  //===--------------------------------------------------------------------===//
117
- // Scalar Types
118
- //===--------------------------------------------------------------------===//
119
- struct ArrowScalarConverter {
120
- template <class TGT, class SRC>
121
- static TGT Operation(SRC input) {
122
- return input;
123
- }
124
-
125
- static bool SkipNulls() {
126
- return false;
127
- }
128
-
129
- template <class TGT>
130
- static void SetNull(TGT &value) {
131
- }
132
- };
133
-
134
- struct ArrowIntervalConverter {
135
- template <class TGT, class SRC>
136
- static TGT Operation(SRC input) {
137
- ArrowInterval result;
138
- result.months = input.months;
139
- result.days = input.days;
140
- result.nanoseconds = input.micros * Interval::NANOS_PER_MICRO;
141
- return result;
142
- }
143
-
144
- static bool SkipNulls() {
145
- return true;
146
- }
147
-
148
- template <class TGT>
149
- static void SetNull(TGT &value) {
150
- }
151
- };
152
-
153
- template <class TGT, class SRC = TGT, class OP = ArrowScalarConverter>
154
- struct ArrowScalarBaseData {
155
- static void Append(ArrowAppendData &append_data, Vector &input, idx_t from, idx_t to, idx_t input_size) {
156
- D_ASSERT(to >= from);
157
- idx_t size = to - from;
158
- D_ASSERT(size <= input_size);
159
- UnifiedVectorFormat format;
160
- input.ToUnifiedFormat(input_size, format);
161
-
162
- // append the validity mask
163
- AppendValidity(append_data, format, from, to);
164
-
165
- // append the main data
166
- append_data.main_buffer.resize(append_data.main_buffer.size() + sizeof(TGT) * size);
167
- auto data = UnifiedVectorFormat::GetData<SRC>(format);
168
- auto result_data = append_data.main_buffer.GetData<TGT>();
169
-
170
- for (idx_t i = from; i < to; i++) {
171
- auto source_idx = format.sel->get_index(i);
172
- auto result_idx = append_data.row_count + i - from;
173
-
174
- if (OP::SkipNulls() && !format.validity.RowIsValid(source_idx)) {
175
- OP::template SetNull<TGT>(result_data[result_idx]);
176
- continue;
177
- }
178
- result_data[result_idx] = OP::template Operation<TGT, SRC>(data[source_idx]);
179
- }
180
- append_data.row_count += size;
181
- }
182
- };
183
-
184
- template <class TGT, class SRC = TGT, class OP = ArrowScalarConverter>
185
- struct ArrowScalarData : public ArrowScalarBaseData<TGT, SRC, OP> {
186
- static void Initialize(ArrowAppendData &result, const LogicalType &type, idx_t capacity) {
187
- result.main_buffer.reserve(capacity * sizeof(TGT));
188
- }
189
-
190
- static void Finalize(ArrowAppendData &append_data, const LogicalType &type, ArrowArray *result) {
191
- result->n_buffers = 2;
192
- result->buffers[1] = append_data.main_buffer.data();
193
- }
194
- };
195
-
196
- //===--------------------------------------------------------------------===//
197
- // Enums
198
- //===--------------------------------------------------------------------===//
199
- template <class TGT>
200
- struct ArrowEnumData : public ArrowScalarBaseData<TGT> {
201
- static idx_t GetLength(string_t input) {
202
- return input.GetSize();
203
- }
204
- static void WriteData(data_ptr_t target, string_t input) {
205
- memcpy(target, input.GetData(), input.GetSize());
206
- }
207
- static void EnumAppendVector(ArrowAppendData &append_data, const Vector &input, idx_t size) {
208
- D_ASSERT(input.GetVectorType() == VectorType::FLAT_VECTOR);
209
-
210
- // resize the validity mask and set up the validity buffer for iteration
211
- ResizeValidity(append_data.validity, append_data.row_count + size);
212
-
213
- // resize the offset buffer - the offset buffer holds the offsets into the child array
214
- append_data.main_buffer.resize(append_data.main_buffer.size() + sizeof(uint32_t) * (size + 1));
215
- auto data = FlatVector::GetData<string_t>(input);
216
- auto offset_data = append_data.main_buffer.GetData<uint32_t>();
217
- if (append_data.row_count == 0) {
218
- // first entry
219
- offset_data[0] = 0;
220
- }
221
- // now append the string data to the auxiliary buffer
222
- // the auxiliary buffer's length depends on the string lengths, so we resize as required
223
- auto last_offset = offset_data[append_data.row_count];
224
- for (idx_t i = 0; i < size; i++) {
225
- auto offset_idx = append_data.row_count + i + 1;
226
-
227
- auto string_length = GetLength(data[i]);
228
-
229
- // append the offset data
230
- auto current_offset = last_offset + string_length;
231
- offset_data[offset_idx] = current_offset;
232
-
233
- // resize the string buffer if required, and write the string data
234
- append_data.aux_buffer.resize(current_offset);
235
- WriteData(append_data.aux_buffer.data() + last_offset, data[i]);
236
-
237
- last_offset = current_offset;
238
- }
239
- append_data.row_count += size;
240
- }
241
- static void Initialize(ArrowAppendData &result, const LogicalType &type, idx_t capacity) {
242
- result.main_buffer.reserve(capacity * sizeof(TGT));
243
- // construct the enum child data
244
- auto enum_data = InitializeArrowChild(LogicalType::VARCHAR, EnumType::GetSize(type), result.options);
245
- EnumAppendVector(*enum_data, EnumType::GetValuesInsertOrder(type), EnumType::GetSize(type));
246
- result.child_data.push_back(std::move(enum_data));
247
- }
248
-
249
- static void Finalize(ArrowAppendData &append_data, const LogicalType &type, ArrowArray *result) {
250
- result->n_buffers = 2;
251
- result->buffers[1] = append_data.main_buffer.data();
252
- // finalize the enum child data, and assign it to the dictionary
253
- result->dictionary = FinalizeArrowChild(LogicalType::VARCHAR, *append_data.child_data[0]);
254
- }
255
- };
256
-
257
- //===--------------------------------------------------------------------===//
258
- // Boolean
259
- //===--------------------------------------------------------------------===//
260
- struct ArrowBoolData {
261
- static void Initialize(ArrowAppendData &result, const LogicalType &type, idx_t capacity) {
262
- auto byte_count = (capacity + 7) / 8;
263
- result.main_buffer.reserve(byte_count);
264
- }
265
-
266
- static void Append(ArrowAppendData &append_data, Vector &input, idx_t from, idx_t to, idx_t input_size) {
267
- idx_t size = to - from;
268
- UnifiedVectorFormat format;
269
- input.ToUnifiedFormat(input_size, format);
270
-
271
- // we initialize both the validity and the bit set to 1's
272
- ResizeValidity(append_data.validity, append_data.row_count + size);
273
- ResizeValidity(append_data.main_buffer, append_data.row_count + size);
274
- auto data = UnifiedVectorFormat::GetData<bool>(format);
275
-
276
- auto result_data = append_data.main_buffer.GetData<uint8_t>();
277
- auto validity_data = append_data.validity.GetData<uint8_t>();
278
- uint8_t current_bit;
279
- idx_t current_byte;
280
- GetBitPosition(append_data.row_count, current_byte, current_bit);
281
- for (idx_t i = from; i < to; i++) {
282
- auto source_idx = format.sel->get_index(i);
283
- // append the validity mask
284
- if (!format.validity.RowIsValid(source_idx)) {
285
- SetNull(append_data, validity_data, current_byte, current_bit);
286
- } else if (!data[source_idx]) {
287
- UnsetBit(result_data, current_byte, current_bit);
288
- }
289
- NextBit(current_byte, current_bit);
290
- }
291
- append_data.row_count += size;
292
- }
293
-
294
- static void Finalize(ArrowAppendData &append_data, const LogicalType &type, ArrowArray *result) {
295
- result->n_buffers = 2;
296
- result->buffers[1] = append_data.main_buffer.data();
297
- }
298
- };
299
-
300
- //===--------------------------------------------------------------------===//
301
- // Varchar
302
- //===--------------------------------------------------------------------===//
303
- struct ArrowVarcharConverter {
304
- template <class SRC>
305
- static idx_t GetLength(SRC input) {
306
- return input.GetSize();
307
- }
308
-
309
- template <class SRC>
310
- static void WriteData(data_ptr_t target, SRC input) {
311
- memcpy(target, input.GetData(), input.GetSize());
312
- }
313
- };
314
-
315
- struct ArrowUUIDConverter {
316
- template <class SRC>
317
- static idx_t GetLength(SRC input) {
318
- return UUID::STRING_SIZE;
319
- }
320
-
321
- template <class SRC>
322
- static void WriteData(data_ptr_t target, SRC input) {
323
- UUID::ToString(input, char_ptr_cast(target));
324
- }
325
- };
326
-
327
- template <class SRC = string_t, class OP = ArrowVarcharConverter, class BUFTYPE = uint64_t>
328
- struct ArrowVarcharData {
329
- static void Initialize(ArrowAppendData &result, const LogicalType &type, idx_t capacity) {
330
- result.main_buffer.reserve((capacity + 1) * sizeof(BUFTYPE));
331
-
332
- result.aux_buffer.reserve(capacity);
333
- }
334
-
335
- static void Append(ArrowAppendData &append_data, Vector &input, idx_t from, idx_t to, idx_t input_size) {
336
- idx_t size = to - from;
337
- UnifiedVectorFormat format;
338
- input.ToUnifiedFormat(input_size, format);
339
-
340
- // resize the validity mask and set up the validity buffer for iteration
341
- ResizeValidity(append_data.validity, append_data.row_count + size);
342
- auto validity_data = (uint8_t *)append_data.validity.data();
343
-
344
- // resize the offset buffer - the offset buffer holds the offsets into the child array
345
- append_data.main_buffer.resize(append_data.main_buffer.size() + sizeof(BUFTYPE) * (size + 1));
346
- auto data = UnifiedVectorFormat::GetData<SRC>(format);
347
- auto offset_data = append_data.main_buffer.GetData<BUFTYPE>();
348
- if (append_data.row_count == 0) {
349
- // first entry
350
- offset_data[0] = 0;
351
- }
352
- // now append the string data to the auxiliary buffer
353
- // the auxiliary buffer's length depends on the string lengths, so we resize as required
354
- auto last_offset = offset_data[append_data.row_count];
355
- idx_t max_offset = append_data.row_count + to - from;
356
- if (max_offset > NumericLimits<uint32_t>::Maximum() &&
357
- append_data.options.offset_size == ArrowOffsetSize::REGULAR) {
358
- throw InvalidInputException("Arrow Appender: The maximum total string size for regular string buffers is "
359
- "%u but the offset of %lu exceeds this.",
360
- NumericLimits<uint32_t>::Maximum(), max_offset);
361
- }
362
- for (idx_t i = from; i < to; i++) {
363
- auto source_idx = format.sel->get_index(i);
364
- auto offset_idx = append_data.row_count + i + 1 - from;
365
-
366
- if (!format.validity.RowIsValid(source_idx)) {
367
- uint8_t current_bit;
368
- idx_t current_byte;
369
- GetBitPosition(append_data.row_count + i - from, current_byte, current_bit);
370
- SetNull(append_data, validity_data, current_byte, current_bit);
371
- offset_data[offset_idx] = last_offset;
372
- continue;
373
- }
374
-
375
- auto string_length = OP::GetLength(data[source_idx]);
376
-
377
- // append the offset data
378
- auto current_offset = last_offset + string_length;
379
- offset_data[offset_idx] = current_offset;
380
-
381
- // resize the string buffer if required, and write the string data
382
- append_data.aux_buffer.resize(current_offset);
383
- OP::WriteData(append_data.aux_buffer.data() + last_offset, data[source_idx]);
384
-
385
- last_offset = current_offset;
386
- }
387
- append_data.row_count += size;
388
- }
389
-
390
- static void Finalize(ArrowAppendData &append_data, const LogicalType &type, ArrowArray *result) {
391
- result->n_buffers = 3;
392
- result->buffers[1] = append_data.main_buffer.data();
393
- result->buffers[2] = append_data.aux_buffer.data();
394
- }
395
- };
396
-
397
- //===--------------------------------------------------------------------===//
398
- // Unions
399
- //===--------------------------------------------------------------------===//
400
- /**
401
- * Based on https://arrow.apache.org/docs/format/Columnar.html#union-layout &
402
- * https://arrow.apache.org/docs/format/CDataInterface.html
403
- */
404
- struct ArrowUnionData {
405
- static void Initialize(ArrowAppendData &result, const LogicalType &type, idx_t capacity) {
406
- result.main_buffer.reserve(capacity * sizeof(int8_t));
407
-
408
- for (auto &child : UnionType::CopyMemberTypes(type)) {
409
- auto child_buffer = InitializeArrowChild(child.second, capacity, result.options);
410
- result.child_data.push_back(std::move(child_buffer));
411
- }
412
- }
413
-
414
- static void Append(ArrowAppendData &append_data, Vector &input, idx_t from, idx_t to, idx_t input_size) {
415
- UnifiedVectorFormat format;
416
- input.ToUnifiedFormat(input_size, format);
417
- idx_t size = to - from;
418
-
419
- auto &types_buffer = append_data.main_buffer;
420
-
421
- duckdb::vector<Vector> child_vectors;
422
- for (const auto &child : UnionType::CopyMemberTypes(input.GetType())) {
423
- child_vectors.emplace_back(child.second);
424
- }
425
-
426
- for (idx_t input_idx = from; input_idx < to; input_idx++) {
427
- const auto &val = input.GetValue(input_idx);
428
-
429
- idx_t tag = 0;
430
- Value resolved_value(nullptr);
431
- if (!val.IsNull()) {
432
- tag = UnionValue::GetTag(val);
433
-
434
- resolved_value = UnionValue::GetValue(val);
435
- }
436
-
437
- for (idx_t child_idx = 0; child_idx < child_vectors.size(); child_idx++) {
438
- child_vectors[child_idx].SetValue(input_idx, child_idx == tag ? resolved_value : Value(nullptr));
439
- }
440
-
441
- types_buffer.data()[input_idx] = tag;
442
- }
443
-
444
- for (idx_t child_idx = 0; child_idx < child_vectors.size(); child_idx++) {
445
- auto &child_buffer = append_data.child_data[child_idx];
446
- auto &child = child_vectors[child_idx];
447
- child_buffer->append_vector(*child_buffer, child, from, to, size);
448
- }
449
- append_data.row_count += size;
450
- }
451
-
452
- static void Finalize(ArrowAppendData &append_data, const LogicalType &type, ArrowArray *result) {
453
- result->n_buffers = 2;
454
- result->buffers[1] = append_data.main_buffer.data();
455
-
456
- auto &child_types = UnionType::CopyMemberTypes(type);
457
- append_data.child_pointers.resize(child_types.size());
458
- result->children = append_data.child_pointers.data();
459
- result->n_children = child_types.size();
460
- for (idx_t i = 0; i < child_types.size(); i++) {
461
- auto &child_type = child_types[i].second;
462
- append_data.child_pointers[i] = FinalizeArrowChild(child_type, *append_data.child_data[i]);
463
- }
464
- }
465
- };
466
-
467
- //===--------------------------------------------------------------------===//
468
- // Structs
48
+ // Finalize Arrow Child
469
49
  //===--------------------------------------------------------------------===//
470
- struct ArrowStructData {
471
- static void Initialize(ArrowAppendData &result, const LogicalType &type, idx_t capacity) {
472
- auto &children = StructType::GetChildTypes(type);
473
- for (auto &child : children) {
474
- auto child_buffer = InitializeArrowChild(child.second, capacity, result.options);
475
- result.child_data.push_back(std::move(child_buffer));
476
- }
477
- }
478
-
479
- static void Append(ArrowAppendData &append_data, Vector &input, idx_t from, idx_t to, idx_t input_size) {
480
- UnifiedVectorFormat format;
481
- input.ToUnifiedFormat(input_size, format);
482
- idx_t size = to - from;
483
- AppendValidity(append_data, format, from, to);
484
- // append the children of the struct
485
- auto &children = StructVector::GetEntries(input);
486
- for (idx_t child_idx = 0; child_idx < children.size(); child_idx++) {
487
- auto &child = children[child_idx];
488
- auto &child_data = *append_data.child_data[child_idx];
489
- child_data.append_vector(child_data, *child, from, to, size);
490
- }
491
- append_data.row_count += size;
492
- }
493
-
494
- static void Finalize(ArrowAppendData &append_data, const LogicalType &type, ArrowArray *result) {
495
- result->n_buffers = 1;
50
+ ArrowArray *ArrowAppender::FinalizeChild(const LogicalType &type, ArrowAppendData &append_data) {
51
+ auto result = make_uniq<ArrowArray>();
496
52
 
497
- auto &child_types = StructType::GetChildTypes(type);
498
- append_data.child_pointers.resize(child_types.size());
499
- result->children = append_data.child_pointers.data();
500
- result->n_children = child_types.size();
501
- for (idx_t i = 0; i < child_types.size(); i++) {
502
- auto &child_type = child_types[i].second;
503
- append_data.child_pointers[i] = FinalizeArrowChild(child_type, *append_data.child_data[i]);
504
- }
505
- }
506
- };
53
+ result->private_data = nullptr;
54
+ result->release = ArrowAppender::ReleaseArray;
55
+ result->n_children = 0;
56
+ result->null_count = 0;
57
+ result->offset = 0;
58
+ result->dictionary = nullptr;
59
+ result->buffers = append_data.buffers.data();
60
+ result->null_count = append_data.null_count;
61
+ result->length = append_data.row_count;
62
+ result->buffers[0] = append_data.validity.data();
507
63
 
508
- //===--------------------------------------------------------------------===//
509
- // Lists
510
- //===--------------------------------------------------------------------===//
511
- void AppendListOffsets(ArrowAppendData &append_data, UnifiedVectorFormat &format, idx_t from, idx_t to,
512
- vector<sel_t> &child_sel) {
513
- // resize the offset buffer - the offset buffer holds the offsets into the child array
514
- idx_t size = to - from;
515
- append_data.main_buffer.resize(append_data.main_buffer.size() + sizeof(uint32_t) * (size + 1));
516
- auto data = UnifiedVectorFormat::GetData<list_entry_t>(format);
517
- auto offset_data = append_data.main_buffer.GetData<uint32_t>();
518
- if (append_data.row_count == 0) {
519
- // first entry
520
- offset_data[0] = 0;
64
+ if (append_data.finalize) {
65
+ append_data.finalize(append_data, type, result.get());
521
66
  }
522
- // set up the offsets using the list entries
523
- auto last_offset = offset_data[append_data.row_count];
524
- for (idx_t i = from; i < to; i++) {
525
- auto source_idx = format.sel->get_index(i);
526
- auto offset_idx = append_data.row_count + i + 1 - from;
527
-
528
- if (!format.validity.RowIsValid(source_idx)) {
529
- offset_data[offset_idx] = last_offset;
530
- continue;
531
- }
532
-
533
- // append the offset data
534
- auto list_length = data[source_idx].length;
535
- last_offset += list_length;
536
- offset_data[offset_idx] = last_offset;
537
67
 
538
- for (idx_t k = 0; k < list_length; k++) {
539
- child_sel.push_back(data[source_idx].offset + k);
540
- }
541
- }
68
+ append_data.array = std::move(result);
69
+ return append_data.array.get();
542
70
  }
543
71
 
544
- struct ArrowListData {
545
- static void Initialize(ArrowAppendData &result, const LogicalType &type, idx_t capacity) {
546
- auto &child_type = ListType::GetChildType(type);
547
- result.main_buffer.reserve((capacity + 1) * sizeof(uint32_t));
548
- auto child_buffer = InitializeArrowChild(child_type, capacity, result.options);
549
- result.child_data.push_back(std::move(child_buffer));
550
- }
551
-
552
- static void Append(ArrowAppendData &append_data, Vector &input, idx_t from, idx_t to, idx_t input_size) {
553
- UnifiedVectorFormat format;
554
- input.ToUnifiedFormat(input_size, format);
555
- idx_t size = to - from;
556
- vector<sel_t> child_indices;
557
- AppendValidity(append_data, format, from, to);
558
- AppendListOffsets(append_data, format, from, to, child_indices);
559
-
560
- // append the child vector of the list
561
- SelectionVector child_sel(child_indices.data());
562
- auto &child = ListVector::GetEntry(input);
563
- auto child_size = child_indices.size();
564
- Vector child_copy(child.GetType());
565
- child_copy.Slice(child, child_sel, child_size);
566
- append_data.child_data[0]->append_vector(*append_data.child_data[0], child_copy, 0, child_size, child_size);
567
- append_data.row_count += size;
568
- }
569
-
570
- static void Finalize(ArrowAppendData &append_data, const LogicalType &type, ArrowArray *result) {
571
- result->n_buffers = 2;
572
- result->buffers[1] = append_data.main_buffer.data();
573
-
574
- auto &child_type = ListType::GetChildType(type);
575
- append_data.child_pointers.resize(1);
576
- result->children = append_data.child_pointers.data();
577
- result->n_children = 1;
578
- append_data.child_pointers[0] = FinalizeArrowChild(child_type, *append_data.child_data[0]);
579
- }
580
- };
581
-
582
- //===--------------------------------------------------------------------===//
583
- // Maps
584
- //===--------------------------------------------------------------------===//
585
- struct ArrowMapData {
586
- static void Initialize(ArrowAppendData &result, const LogicalType &type, idx_t capacity) {
587
- // map types are stored in a (too) clever way
588
- // the main buffer holds the null values and the offsets
589
- // then we have a single child, which is a struct of the map_type, and the key_type
590
- result.main_buffer.reserve((capacity + 1) * sizeof(uint32_t));
591
-
592
- auto &key_type = MapType::KeyType(type);
593
- auto &value_type = MapType::ValueType(type);
594
- auto internal_struct = make_uniq<ArrowAppendData>(result.options);
595
- internal_struct->child_data.push_back(InitializeArrowChild(key_type, capacity, result.options));
596
- internal_struct->child_data.push_back(InitializeArrowChild(value_type, capacity, result.options));
597
-
598
- result.child_data.push_back(std::move(internal_struct));
599
- }
600
-
601
- static void Append(ArrowAppendData &append_data, Vector &input, idx_t from, idx_t to, idx_t input_size) {
602
- UnifiedVectorFormat format;
603
- input.ToUnifiedFormat(input_size, format);
604
- idx_t size = to - from;
605
- AppendValidity(append_data, format, from, to);
606
- vector<sel_t> child_indices;
607
- AppendListOffsets(append_data, format, from, to, child_indices);
608
-
609
- SelectionVector child_sel(child_indices.data());
610
- auto &key_vector = MapVector::GetKeys(input);
611
- auto &value_vector = MapVector::GetValues(input);
612
- auto list_size = child_indices.size();
613
-
614
- auto &struct_data = *append_data.child_data[0];
615
- auto &key_data = *struct_data.child_data[0];
616
- auto &value_data = *struct_data.child_data[1];
617
-
618
- if (size != input_size) {
619
- // Let's avoid doing this
620
- Vector key_vector_copy(key_vector.GetType());
621
- key_vector_copy.Slice(key_vector, child_sel, list_size);
622
- Vector value_vector_copy(value_vector.GetType());
623
- value_vector_copy.Slice(value_vector, child_sel, list_size);
624
- key_data.append_vector(key_data, key_vector_copy, 0, list_size, list_size);
625
- value_data.append_vector(value_data, value_vector_copy, 0, list_size, list_size);
626
- } else {
627
- // We don't care about the vector, slice it
628
- key_vector.Slice(child_sel, list_size);
629
- value_vector.Slice(child_sel, list_size);
630
- key_data.append_vector(key_data, key_vector, 0, list_size, list_size);
631
- value_data.append_vector(value_data, value_vector, 0, list_size, list_size);
632
- }
633
-
634
- append_data.row_count += size;
635
- struct_data.row_count += size;
636
- }
637
-
638
- static void Finalize(ArrowAppendData &append_data, const LogicalType &type, ArrowArray *result) {
639
- // set up the main map buffer
640
- result->n_buffers = 2;
641
- result->buffers[1] = append_data.main_buffer.data();
642
-
643
- // the main map buffer has a single child: a struct
644
- append_data.child_pointers.resize(1);
645
- result->children = append_data.child_pointers.data();
646
- result->n_children = 1;
647
- append_data.child_pointers[0] = FinalizeArrowChild(type, *append_data.child_data[0]);
648
-
649
- // now that struct has two children: the key and the value type
650
- auto &struct_data = *append_data.child_data[0];
651
- auto &struct_result = append_data.child_pointers[0];
652
- struct_data.child_pointers.resize(2);
653
- struct_result->n_buffers = 1;
654
- struct_result->n_children = 2;
655
- struct_result->length = struct_data.child_data[0]->row_count;
656
- struct_result->children = struct_data.child_pointers.data();
72
+ //! Returns the underlying arrow array
73
+ ArrowArray ArrowAppender::Finalize() {
74
+ D_ASSERT(root_data.size() == types.size());
75
+ auto root_holder = make_uniq<ArrowAppendData>(options);
657
76
 
658
- D_ASSERT(struct_data.child_data[0]->row_count == struct_data.child_data[1]->row_count);
77
+ ArrowArray result;
78
+ root_holder->child_pointers.resize(types.size());
79
+ result.children = root_holder->child_pointers.data();
80
+ result.n_children = types.size();
659
81
 
660
- auto &key_type = MapType::KeyType(type);
661
- auto &value_type = MapType::ValueType(type);
662
- struct_data.child_pointers[0] = FinalizeArrowChild(key_type, *struct_data.child_data[0]);
663
- struct_data.child_pointers[1] = FinalizeArrowChild(value_type, *struct_data.child_data[1]);
82
+ // Configure root array
83
+ result.length = row_count;
84
+ result.n_buffers = 1;
85
+ result.buffers = root_holder->buffers.data(); // there is no actual buffer there since we don't have NULLs
86
+ result.offset = 0;
87
+ result.null_count = 0; // needs to be 0
88
+ result.dictionary = nullptr;
89
+ root_holder->child_data = std::move(root_data);
664
90
 
665
- // keys cannot have null values
666
- if (struct_data.child_pointers[0]->null_count > 0) {
667
- throw std::runtime_error("Arrow doesn't accept NULL keys on Maps");
668
- }
91
+ // FIXME: this violates a property of the arrow format, if root owns all the child memory then consumers can't move
92
+ // child arrays https://arrow.apache.org/docs/format/CDataInterface.html#moving-child-arrays
93
+ for (idx_t i = 0; i < root_holder->child_data.size(); i++) {
94
+ root_holder->child_pointers[i] = ArrowAppender::FinalizeChild(types[i], *root_holder->child_data[i]);
669
95
  }
670
- };
671
96
 
672
- //! Append a data chunk to the underlying arrow array
673
- void ArrowAppender::Append(DataChunk &input, idx_t from, idx_t to, idx_t input_size) {
674
- D_ASSERT(types == input.GetTypes());
675
- for (idx_t i = 0; i < input.ColumnCount(); i++) {
676
- root_data[i]->append_vector(*root_data[i], input.data[i], from, to, input_size);
677
- }
678
- row_count += to - from;
97
+ // Release ownership to caller
98
+ result.private_data = root_holder.release();
99
+ result.release = ArrowAppender::ReleaseArray;
100
+ return result;
679
101
  }
102
+
680
103
  //===--------------------------------------------------------------------===//
681
104
  // Initialize Arrow Child
682
105
  //===--------------------------------------------------------------------===//
106
+
683
107
  template <class OP>
684
- static void InitializeFunctionPointers(ArrowAppendData &append_data) {
108
+ static void InitializeAppenderForType(ArrowAppendData &append_data) {
685
109
  append_data.initialize = OP::Initialize;
686
110
  append_data.append_vector = OP::Append;
687
111
  append_data.finalize = OP::Finalize;
@@ -691,17 +115,17 @@ static void InitializeFunctionPointers(ArrowAppendData &append_data, const Logic
691
115
  // handle special logical types
692
116
  switch (type.id()) {
693
117
  case LogicalTypeId::BOOLEAN:
694
- InitializeFunctionPointers<ArrowBoolData>(append_data);
118
+ InitializeAppenderForType<ArrowBoolData>(append_data);
695
119
  break;
696
120
  case LogicalTypeId::TINYINT:
697
- InitializeFunctionPointers<ArrowScalarData<int8_t>>(append_data);
121
+ InitializeAppenderForType<ArrowScalarData<int8_t>>(append_data);
698
122
  break;
699
123
  case LogicalTypeId::SMALLINT:
700
- InitializeFunctionPointers<ArrowScalarData<int16_t>>(append_data);
124
+ InitializeAppenderForType<ArrowScalarData<int16_t>>(append_data);
701
125
  break;
702
126
  case LogicalTypeId::DATE:
703
127
  case LogicalTypeId::INTEGER:
704
- InitializeFunctionPointers<ArrowScalarData<int32_t>>(append_data);
128
+ InitializeAppenderForType<ArrowScalarData<int32_t>>(append_data);
705
129
  break;
706
130
  case LogicalTypeId::TIME:
707
131
  case LogicalTypeId::TIMESTAMP_SEC:
@@ -711,42 +135,42 @@ static void InitializeFunctionPointers(ArrowAppendData &append_data, const Logic
711
135
  case LogicalTypeId::TIMESTAMP_TZ:
712
136
  case LogicalTypeId::TIME_TZ:
713
137
  case LogicalTypeId::BIGINT:
714
- InitializeFunctionPointers<ArrowScalarData<int64_t>>(append_data);
138
+ InitializeAppenderForType<ArrowScalarData<int64_t>>(append_data);
715
139
  break;
716
140
  case LogicalTypeId::HUGEINT:
717
- InitializeFunctionPointers<ArrowScalarData<hugeint_t>>(append_data);
141
+ InitializeAppenderForType<ArrowScalarData<hugeint_t>>(append_data);
718
142
  break;
719
143
  case LogicalTypeId::UTINYINT:
720
- InitializeFunctionPointers<ArrowScalarData<uint8_t>>(append_data);
144
+ InitializeAppenderForType<ArrowScalarData<uint8_t>>(append_data);
721
145
  break;
722
146
  case LogicalTypeId::USMALLINT:
723
- InitializeFunctionPointers<ArrowScalarData<uint16_t>>(append_data);
147
+ InitializeAppenderForType<ArrowScalarData<uint16_t>>(append_data);
724
148
  break;
725
149
  case LogicalTypeId::UINTEGER:
726
- InitializeFunctionPointers<ArrowScalarData<uint32_t>>(append_data);
150
+ InitializeAppenderForType<ArrowScalarData<uint32_t>>(append_data);
727
151
  break;
728
152
  case LogicalTypeId::UBIGINT:
729
- InitializeFunctionPointers<ArrowScalarData<uint64_t>>(append_data);
153
+ InitializeAppenderForType<ArrowScalarData<uint64_t>>(append_data);
730
154
  break;
731
155
  case LogicalTypeId::FLOAT:
732
- InitializeFunctionPointers<ArrowScalarData<float>>(append_data);
156
+ InitializeAppenderForType<ArrowScalarData<float>>(append_data);
733
157
  break;
734
158
  case LogicalTypeId::DOUBLE:
735
- InitializeFunctionPointers<ArrowScalarData<double>>(append_data);
159
+ InitializeAppenderForType<ArrowScalarData<double>>(append_data);
736
160
  break;
737
161
  case LogicalTypeId::DECIMAL:
738
162
  switch (type.InternalType()) {
739
163
  case PhysicalType::INT16:
740
- InitializeFunctionPointers<ArrowScalarData<hugeint_t, int16_t>>(append_data);
164
+ InitializeAppenderForType<ArrowScalarData<hugeint_t, int16_t>>(append_data);
741
165
  break;
742
166
  case PhysicalType::INT32:
743
- InitializeFunctionPointers<ArrowScalarData<hugeint_t, int32_t>>(append_data);
167
+ InitializeAppenderForType<ArrowScalarData<hugeint_t, int32_t>>(append_data);
744
168
  break;
745
169
  case PhysicalType::INT64:
746
- InitializeFunctionPointers<ArrowScalarData<hugeint_t, int64_t>>(append_data);
170
+ InitializeAppenderForType<ArrowScalarData<hugeint_t, int64_t>>(append_data);
747
171
  break;
748
172
  case PhysicalType::INT128:
749
- InitializeFunctionPointers<ArrowScalarData<hugeint_t>>(append_data);
173
+ InitializeAppenderForType<ArrowScalarData<hugeint_t>>(append_data);
750
174
  break;
751
175
  default:
752
176
  throw InternalException("Unsupported internal decimal type");
@@ -756,54 +180,55 @@ static void InitializeFunctionPointers(ArrowAppendData &append_data, const Logic
756
180
  case LogicalTypeId::BLOB:
757
181
  case LogicalTypeId::BIT:
758
182
  if (append_data.options.offset_size == ArrowOffsetSize::LARGE) {
759
- InitializeFunctionPointers<ArrowVarcharData<string_t>>(append_data);
183
+ InitializeAppenderForType<ArrowVarcharData<string_t>>(append_data);
760
184
  } else {
761
- InitializeFunctionPointers<ArrowVarcharData<string_t, ArrowVarcharConverter, uint32_t>>(append_data);
185
+ InitializeAppenderForType<ArrowVarcharData<string_t, ArrowVarcharConverter, uint32_t>>(append_data);
762
186
  }
763
187
  break;
764
188
  case LogicalTypeId::UUID:
765
189
  if (append_data.options.offset_size == ArrowOffsetSize::LARGE) {
766
- InitializeFunctionPointers<ArrowVarcharData<hugeint_t, ArrowUUIDConverter>>(append_data);
190
+ InitializeAppenderForType<ArrowVarcharData<hugeint_t, ArrowUUIDConverter>>(append_data);
767
191
  } else {
768
- InitializeFunctionPointers<ArrowVarcharData<hugeint_t, ArrowUUIDConverter, uint32_t>>(append_data);
192
+ InitializeAppenderForType<ArrowVarcharData<hugeint_t, ArrowUUIDConverter, uint32_t>>(append_data);
769
193
  }
770
194
  break;
771
195
  case LogicalTypeId::ENUM:
772
196
  switch (type.InternalType()) {
773
197
  case PhysicalType::UINT8:
774
- InitializeFunctionPointers<ArrowEnumData<uint8_t>>(append_data);
198
+ InitializeAppenderForType<ArrowEnumData<uint8_t>>(append_data);
775
199
  break;
776
200
  case PhysicalType::UINT16:
777
- InitializeFunctionPointers<ArrowEnumData<uint16_t>>(append_data);
201
+ InitializeAppenderForType<ArrowEnumData<uint16_t>>(append_data);
778
202
  break;
779
203
  case PhysicalType::UINT32:
780
- InitializeFunctionPointers<ArrowEnumData<uint32_t>>(append_data);
204
+ InitializeAppenderForType<ArrowEnumData<uint32_t>>(append_data);
781
205
  break;
782
206
  default:
783
207
  throw InternalException("Unsupported internal enum type");
784
208
  }
785
209
  break;
786
210
  case LogicalTypeId::INTERVAL:
787
- InitializeFunctionPointers<ArrowScalarData<ArrowInterval, interval_t, ArrowIntervalConverter>>(append_data);
211
+ InitializeAppenderForType<ArrowScalarData<ArrowInterval, interval_t, ArrowIntervalConverter>>(append_data);
788
212
  break;
789
213
  case LogicalTypeId::UNION:
790
- InitializeFunctionPointers<ArrowUnionData>(append_data);
214
+ InitializeAppenderForType<ArrowUnionData>(append_data);
791
215
  break;
792
216
  case LogicalTypeId::STRUCT:
793
- InitializeFunctionPointers<ArrowStructData>(append_data);
217
+ InitializeAppenderForType<ArrowStructData>(append_data);
794
218
  break;
795
219
  case LogicalTypeId::LIST:
796
- InitializeFunctionPointers<ArrowListData>(append_data);
220
+ InitializeAppenderForType<ArrowListData>(append_data);
797
221
  break;
798
222
  case LogicalTypeId::MAP:
799
- InitializeFunctionPointers<ArrowMapData>(append_data);
223
+ InitializeAppenderForType<ArrowMapData>(append_data);
800
224
  break;
801
225
  default:
802
226
  throw NotImplementedException("Unsupported type in DuckDB -> Arrow Conversion: %s\n", type.ToString());
803
227
  }
804
228
  }
805
229
 
806
- unique_ptr<ArrowAppendData> InitializeArrowChild(const LogicalType &type, idx_t capacity, ArrowOptions &options) {
230
+ unique_ptr<ArrowAppendData> ArrowAppender::InitializeChild(const LogicalType &type, idx_t capacity,
231
+ ArrowOptions &options) {
807
232
  auto result = make_uniq<ArrowAppendData>(options);
808
233
  InitializeFunctionPointers(*result, type);
809
234
 
@@ -813,67 +238,4 @@ unique_ptr<ArrowAppendData> InitializeArrowChild(const LogicalType &type, idx_t
813
238
  return result;
814
239
  }
815
240
 
816
- static void ReleaseDuckDBArrowAppendArray(ArrowArray *array) {
817
- if (!array || !array->release) {
818
- return;
819
- }
820
- array->release = nullptr;
821
- auto holder = static_cast<ArrowAppendData *>(array->private_data);
822
- delete holder;
823
- }
824
-
825
- //===--------------------------------------------------------------------===//
826
- // Finalize Arrow Child
827
- //===--------------------------------------------------------------------===//
828
- ArrowArray *FinalizeArrowChild(const LogicalType &type, ArrowAppendData &append_data) {
829
- auto result = make_uniq<ArrowArray>();
830
-
831
- result->private_data = nullptr;
832
- result->release = ReleaseDuckDBArrowAppendArray;
833
- result->n_children = 0;
834
- result->null_count = 0;
835
- result->offset = 0;
836
- result->dictionary = nullptr;
837
- result->buffers = append_data.buffers.data();
838
- result->null_count = append_data.null_count;
839
- result->length = append_data.row_count;
840
- result->buffers[0] = append_data.validity.data();
841
-
842
- if (append_data.finalize) {
843
- append_data.finalize(append_data, type, result.get());
844
- }
845
-
846
- append_data.array = std::move(result);
847
- return append_data.array.get();
848
- }
849
-
850
- //! Returns the underlying arrow array
851
- ArrowArray ArrowAppender::Finalize() {
852
- D_ASSERT(root_data.size() == types.size());
853
- auto root_holder = make_uniq<ArrowAppendData>(options);
854
-
855
- ArrowArray result;
856
- root_holder->child_pointers.resize(types.size());
857
- result.children = root_holder->child_pointers.data();
858
- result.n_children = types.size();
859
-
860
- // Configure root array
861
- result.length = row_count;
862
- result.n_buffers = 1;
863
- result.buffers = root_holder->buffers.data(); // there is no actual buffer there since we don't have NULLs
864
- result.offset = 0;
865
- result.null_count = 0; // needs to be 0
866
- result.dictionary = nullptr;
867
- root_holder->child_data = std::move(root_data);
868
-
869
- for (idx_t i = 0; i < root_holder->child_data.size(); i++) {
870
- root_holder->child_pointers[i] = FinalizeArrowChild(types[i], *root_holder->child_data[i]);
871
- }
872
-
873
- // Release ownership to caller
874
- result.private_data = root_holder.release();
875
- result.release = ReleaseDuckDBArrowAppendArray;
876
- return result;
877
- }
878
-
879
241
  } // namespace duckdb