duckdb 0.8.2-dev2068.0 → 0.8.2-dev2090.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/binding.gyp +1 -0
- package/package.json +1 -1
- package/src/duckdb/src/common/arrow/appender/bool_data.cpp +44 -0
- package/src/duckdb/src/common/arrow/appender/list_data.cpp +78 -0
- package/src/duckdb/src/common/arrow/appender/map_data.cpp +86 -0
- package/src/duckdb/src/common/arrow/appender/struct_data.cpp +45 -0
- package/src/duckdb/src/common/arrow/appender/union_data.cpp +70 -0
- package/src/duckdb/src/common/arrow/arrow_appender.cpp +89 -727
- package/src/duckdb/src/common/arrow/arrow_wrapper.cpp +2 -1
- package/src/duckdb/src/function/table/version/pragma_version.cpp +2 -2
- package/src/duckdb/src/include/duckdb/common/arrow/appender/append_data.hpp +109 -0
- package/src/duckdb/src/include/duckdb/common/arrow/appender/bool_data.hpp +15 -0
- package/src/duckdb/src/include/duckdb/common/arrow/appender/enum_data.hpp +69 -0
- package/src/duckdb/src/include/duckdb/common/arrow/appender/list.hpp +8 -0
- package/src/duckdb/src/include/duckdb/common/arrow/appender/list_data.hpp +18 -0
- package/src/duckdb/src/include/duckdb/common/arrow/appender/map_data.hpp +18 -0
- package/src/duckdb/src/include/duckdb/common/arrow/appender/scalar_data.hpp +88 -0
- package/src/duckdb/src/include/duckdb/common/arrow/appender/struct_data.hpp +18 -0
- package/src/duckdb/src/include/duckdb/common/arrow/appender/union_data.hpp +21 -0
- package/src/duckdb/src/include/duckdb/common/arrow/appender/varchar_data.hpp +105 -0
- package/src/duckdb/src/include/duckdb/common/arrow/arrow_appender.hpp +5 -0
- package/src/duckdb/src/parallel/executor.cpp +1 -1
- package/src/duckdb/ub_src_common_arrow_appender.cpp +10 -0
package/binding.gyp
CHANGED
@@ -14,6 +14,7 @@
|
|
14
14
|
"src/duckdb/ub_src_catalog_default.cpp",
|
15
15
|
"src/duckdb/ub_src_common_adbc.cpp",
|
16
16
|
"src/duckdb/ub_src_common.cpp",
|
17
|
+
"src/duckdb/ub_src_common_arrow_appender.cpp",
|
17
18
|
"src/duckdb/ub_src_common_arrow.cpp",
|
18
19
|
"src/duckdb/ub_src_common_crypto.cpp",
|
19
20
|
"src/duckdb/ub_src_common_enums.cpp",
|
package/package.json
CHANGED
@@ -0,0 +1,44 @@
|
|
1
|
+
#include "duckdb/common/arrow/arrow_appender.hpp"
|
2
|
+
#include "duckdb/common/arrow/appender/bool_data.hpp"
|
3
|
+
|
4
|
+
namespace duckdb {
|
5
|
+
|
6
|
+
// Reserves room in the main buffer for a bit-packed boolean column holding up to "capacity" rows.
void ArrowBoolData::Initialize(ArrowAppendData &result, const LogicalType &type, idx_t capacity) {
	// booleans are stored one bit per row: round up so a trailing partial byte is reserved as well
	const idx_t packed_bytes = (capacity + 7) / 8;
	result.main_buffer.reserve(packed_bytes);
}
|
10
|
+
|
11
|
+
// Appends rows [from, to) of a boolean vector: NULL rows are flagged in the validity mask and
// valid "false" rows clear their bit in the bit-packed main buffer.
void ArrowBoolData::Append(ArrowAppendData &append_data, Vector &input, idx_t from, idx_t to, idx_t input_size) {
	const idx_t append_count = to - from;
	UnifiedVectorFormat vdata;
	input.ToUnifiedFormat(input_size, vdata);

	// both the validity mask and the value bits are grown up front and initialized to 1's
	const idx_t new_row_count = append_data.row_count + append_count;
	ResizeValidity(append_data.validity, new_row_count);
	ResizeValidity(append_data.main_buffer, new_row_count);

	// the raw pointers are fetched after the resize calls, as in the original ordering
	// (presumably resizing may move the underlying storage)
	auto source_values = UnifiedVectorFormat::GetData<bool>(vdata);
	auto value_bits = append_data.main_buffer.GetData<uint8_t>();
	auto validity_bits = append_data.validity.GetData<uint8_t>();

	// find the byte/bit position of the first row that is written by this call
	uint8_t bit_pos;
	idx_t byte_pos;
	GetBitPosition(append_data.row_count, byte_pos, bit_pos);

	for (idx_t row = from; row < to; row++) {
		const auto source_idx = vdata.sel->get_index(row);
		if (vdata.validity.RowIsValid(source_idx)) {
			// bits start out as 1, so only "false" needs an explicit write
			if (!source_values[source_idx]) {
				UnsetBit(value_bits, byte_pos, bit_pos);
			}
		} else {
			// NULL row: record it in the validity mask, the value bit is left untouched
			SetNull(append_data, validity_bits, byte_pos, bit_pos);
		}
		NextBit(byte_pos, bit_pos);
	}
	append_data.row_count += append_count;
}
|
38
|
+
|
39
|
+
// Publishes the bit-packed values as buffer slot 1 of the resulting Arrow array (two buffers in total).
void ArrowBoolData::Finalize(ArrowAppendData &append_data, const LogicalType &type, ArrowArray *result) {
	auto &array = *result;
	array.n_buffers = 2;
	array.buffers[1] = append_data.main_buffer.data();
}
|
43
|
+
|
44
|
+
} // namespace duckdb
|
@@ -0,0 +1,78 @@
|
|
1
|
+
#include "duckdb/common/arrow/arrow_appender.hpp"
#include "duckdb/common/arrow/appender/list_data.hpp"

#include <limits>
#include <stdexcept>
|
3
|
+
|
4
|
+
namespace duckdb {
|
5
|
+
|
6
|
+
//===--------------------------------------------------------------------===//
|
7
|
+
// Lists
|
8
|
+
//===--------------------------------------------------------------------===//
|
9
|
+
// Appends the list offsets for rows [from, to) to the main (offset) buffer and collects, in order, the
// child-vector positions referenced by the valid rows into "child_sel".
//
// The offsets are stored as 32-bit values. NULL rows repeat the previous offset (i.e. they are empty).
// Throws std::runtime_error when the combined offset would no longer fit in a signed 32-bit integer;
// without this check the running offset silently wraps around and yields a corrupt offset buffer.
void ArrowListData::AppendOffsets(ArrowAppendData &append_data, UnifiedVectorFormat &format, idx_t from, idx_t to,
                                  vector<sel_t> &child_sel) {
	// resize the offset buffer - the offset buffer holds the offsets into the child array
	idx_t size = to - from;
	append_data.main_buffer.resize(append_data.main_buffer.size() + sizeof(uint32_t) * (size + 1));
	auto data = UnifiedVectorFormat::GetData<list_entry_t>(format);
	auto offset_data = append_data.main_buffer.GetData<uint32_t>();
	if (append_data.row_count == 0) {
		// first entry
		offset_data[0] = 0;
	}

	// regular Arrow list arrays use signed 32-bit offsets, so this is the largest offset that may be emitted
	constexpr uint64_t MAX_LIST_OFFSET = static_cast<uint64_t>(std::numeric_limits<int32_t>::max());

	// set up the offsets using the list entries
	auto last_offset = offset_data[append_data.row_count];
	for (idx_t i = from; i < to; i++) {
		auto source_idx = format.sel->get_index(i);
		auto offset_idx = append_data.row_count + i + 1 - from;

		if (!format.validity.RowIsValid(source_idx)) {
			offset_data[offset_idx] = last_offset;
			continue;
		}

		// append the offset data
		const uint64_t list_length = data[source_idx].length;
		// written so that the comparison itself cannot overflow
		if (list_length > MAX_LIST_OFFSET || last_offset > MAX_LIST_OFFSET - list_length) {
			throw std::runtime_error(
			    "Arrow Appender: the combined list offset exceeds the maximum of regular (32-bit) list buffers");
		}
		last_offset += static_cast<uint32_t>(list_length);
		offset_data[offset_idx] = last_offset;

		for (idx_t k = 0; k < list_length; k++) {
			child_sel.push_back(data[source_idx].offset + k);
		}
	}
}
|
41
|
+
|
42
|
+
// Prepares a list appender: reserves the offset buffer and creates the appender for the list's child type.
void ArrowListData::Initialize(ArrowAppendData &result, const LogicalType &type, idx_t capacity) {
	const auto &element_type = ListType::GetChildType(type);

	// one 32-bit offset per row, plus one extra entry for the leading start offset
	const idx_t offset_bytes = (capacity + 1) * sizeof(uint32_t);
	result.main_buffer.reserve(offset_bytes);

	// the list elements are appended into a single child
	result.child_data.push_back(ArrowAppender::InitializeChild(element_type, capacity, result.options));
}
|
48
|
+
|
49
|
+
// Appends rows [from, to) of a list vector: validity and offsets go into this appender, the referenced
// list elements are sliced out of the child vector and handed to the child appender.
void ArrowListData::Append(ArrowAppendData &append_data, Vector &input, idx_t from, idx_t to, idx_t input_size) {
	UnifiedVectorFormat list_format;
	input.ToUnifiedFormat(input_size, list_format);

	// write validity + offsets, gathering the child positions that the appended rows refer to
	vector<sel_t> element_positions;
	AppendValidity(append_data, list_format, from, to);
	ArrowListData::AppendOffsets(append_data, list_format, from, to, element_positions);

	// build a slice of the child vector that contains exactly the referenced elements, in order
	SelectionVector element_sel(element_positions.data());
	auto &list_child = ListVector::GetEntry(input);
	const auto element_count = element_positions.size();
	Vector sliced_child(list_child.GetType());
	sliced_child.Slice(list_child, element_sel, element_count);

	// append the sliced elements to the child appender
	auto &child_appender = *append_data.child_data[0];
	child_appender.append_vector(child_appender, sliced_child, 0, element_count, element_count);

	append_data.row_count += to - from;
}
|
66
|
+
|
67
|
+
// Fills in the list-specific parts of the Arrow array: the offset buffer (slot 1) and the single child array.
void ArrowListData::Finalize(ArrowAppendData &append_data, const LogicalType &type, ArrowArray *result) {
	// buffer 1 holds the offsets
	result->n_buffers = 2;
	result->buffers[1] = append_data.main_buffer.data();

	// a list has exactly one child: the array with the list elements
	const auto &element_type = ListType::GetChildType(type);
	auto &pointers = append_data.child_pointers;
	pointers.resize(1);
	result->children = pointers.data();
	result->n_children = 1;
	pointers[0] = ArrowAppender::FinalizeChild(element_type, *append_data.child_data[0]);
}
|
77
|
+
|
78
|
+
} // namespace duckdb
|
@@ -0,0 +1,86 @@
|
|
1
|
+
#include "duckdb/common/arrow/arrow_appender.hpp"
|
2
|
+
#include "duckdb/common/arrow/appender/map_data.hpp"
|
3
|
+
#include "duckdb/common/arrow/appender/list_data.hpp"
|
4
|
+
|
5
|
+
namespace duckdb {
|
6
|
+
|
7
|
+
//===--------------------------------------------------------------------===//
|
8
|
+
// Maps
|
9
|
+
//===--------------------------------------------------------------------===//
|
10
|
+
// Prepares a map appender.
// The main buffer holds the offsets; the map has a single child (a struct) which in turn has two
// children: one appender for the keys and one for the values.
void ArrowMapData::Initialize(ArrowAppendData &result, const LogicalType &type, idx_t capacity) {
	// one 32-bit offset per row, plus one extra entry for the leading start offset
	const idx_t offset_bytes = (capacity + 1) * sizeof(uint32_t);
	result.main_buffer.reserve(offset_bytes);

	const auto &map_key_type = MapType::KeyType(type);
	const auto &map_value_type = MapType::ValueType(type);

	// the intermediate struct is created directly; its children are index 0 = keys, index 1 = values
	auto entries = make_uniq<ArrowAppendData>(result.options);
	auto &entry_children = entries->child_data;
	entry_children.push_back(ArrowAppender::InitializeChild(map_key_type, capacity, result.options));
	entry_children.push_back(ArrowAppender::InitializeChild(map_value_type, capacity, result.options));

	result.child_data.push_back(std::move(entries));
}
|
24
|
+
|
25
|
+
// Appends rows [from, to) of a map vector: validity and offsets go into this appender (sharing the list
// offset logic), the referenced keys and values are sliced and appended to the key/value child appenders.
void ArrowMapData::Append(ArrowAppendData &append_data, Vector &input, idx_t from, idx_t to, idx_t input_size) {
	UnifiedVectorFormat map_format;
	input.ToUnifiedFormat(input_size, map_format);
	const idx_t append_count = to - from;

	// write validity + offsets, gathering the entry positions that the appended rows refer to
	AppendValidity(append_data, map_format, from, to);
	vector<sel_t> entry_positions;
	ArrowListData::AppendOffsets(append_data, map_format, from, to, entry_positions);

	SelectionVector entry_sel(entry_positions.data());
	auto &map_keys = MapVector::GetKeys(input);
	auto &map_values = MapVector::GetValues(input);
	const auto entry_count = entry_positions.size();

	// child 0 of the map is the struct; its children are the key (0) and value (1) appenders
	auto &entries = *append_data.child_data[0];
	auto &key_appender = *entries.child_data[0];
	auto &value_appender = *entries.child_data[1];

	// slice out exactly the referenced keys and values, in order
	Vector sliced_keys(map_keys.GetType());
	sliced_keys.Slice(map_keys, entry_sel, entry_count);
	Vector sliced_values(map_values.GetType());
	sliced_values.Slice(map_values, entry_sel, entry_count);

	key_appender.append_vector(key_appender, sliced_keys, 0, entry_count, entry_count);
	value_appender.append_vector(value_appender, sliced_values, 0, entry_count, entry_count);

	append_data.row_count += append_count;
	entries.row_count += append_count;
}
|
52
|
+
|
53
|
+
// Fills in the map-specific parts of the Arrow array: the offset buffer, the single struct child and
// that struct's key and value children. Throws std::runtime_error if the finalized keys contain NULLs.
void ArrowMapData::Finalize(ArrowAppendData &append_data, const LogicalType &type, ArrowArray *result) {
	// the map itself: buffer 1 holds the offsets
	result->n_buffers = 2;
	result->buffers[1] = append_data.main_buffer.data();

	// the map has exactly one child, the struct that holds the entries
	auto &map_pointers = append_data.child_pointers;
	map_pointers.resize(1);
	result->children = map_pointers.data();
	result->n_children = 1;
	map_pointers[0] = ArrowAppender::FinalizeChild(type, *append_data.child_data[0]);

	// the struct has two children (keys and values); its length is the number of appended keys
	auto &entries = *append_data.child_data[0];
	auto *entries_array = map_pointers[0];
	auto &entry_pointers = entries.child_pointers;
	entry_pointers.resize(2);
	entries_array->n_buffers = 1;
	entries_array->n_children = 2;
	entries_array->length = entries.child_data[0]->row_count;
	entries_array->children = entry_pointers.data();

	// keys and values are always appended together
	D_ASSERT(entries.child_data[0]->row_count == entries.child_data[1]->row_count);

	const auto &map_key_type = MapType::KeyType(type);
	const auto &map_value_type = MapType::ValueType(type);
	entry_pointers[0] = ArrowAppender::FinalizeChild(map_key_type, *entries.child_data[0]);
	entry_pointers[1] = ArrowAppender::FinalizeChild(map_value_type, *entries.child_data[1]);

	// keys cannot have null values
	const bool has_null_keys = entry_pointers[0]->null_count > 0;
	if (has_null_keys) {
		throw std::runtime_error("Arrow doesn't accept NULL keys on Maps");
	}
}
|
85
|
+
|
86
|
+
} // namespace duckdb
|
@@ -0,0 +1,45 @@
|
|
1
|
+
#include "duckdb/common/arrow/arrow_appender.hpp"
|
2
|
+
#include "duckdb/common/arrow/appender/struct_data.hpp"
|
3
|
+
|
4
|
+
namespace duckdb {
|
5
|
+
|
6
|
+
//===--------------------------------------------------------------------===//
|
7
|
+
// Structs
|
8
|
+
//===--------------------------------------------------------------------===//
|
9
|
+
// Prepares a struct appender by creating one child appender per struct field, in field order.
void ArrowStructData::Initialize(ArrowAppendData &result, const LogicalType &type, idx_t capacity) {
	for (auto &field : StructType::GetChildTypes(type)) {
		// field.second is the field's type
		result.child_data.push_back(ArrowAppender::InitializeChild(field.second, capacity, result.options));
	}
}
|
16
|
+
|
17
|
+
// Appends rows [from, to) of a struct vector: the struct's own validity is recorded here and every
// field vector is forwarded to the matching child appender.
void ArrowStructData::Append(ArrowAppendData &append_data, Vector &input, idx_t from, idx_t to, idx_t input_size) {
	UnifiedVectorFormat struct_format;
	input.ToUnifiedFormat(input_size, struct_format);
	const idx_t append_count = to - from;
	AppendValidity(append_data, struct_format, from, to);

	// forward the same row range of every field to its child appender
	auto &field_vectors = StructVector::GetEntries(input);
	idx_t field_idx = 0;
	while (field_idx < field_vectors.size()) {
		auto &field_appender = *append_data.child_data[field_idx];
		field_appender.append_vector(field_appender, *field_vectors[field_idx], from, to, append_count);
		field_idx++;
	}
	append_data.row_count += append_count;
}
}
|
31
|
+
|
32
|
+
// Fills in the struct-specific parts of the Arrow array: a single buffer and one finalized child per field.
void ArrowStructData::Finalize(ArrowAppendData &append_data, const LogicalType &type, ArrowArray *result) {
	// a struct has no data buffer of its own
	result->n_buffers = 1;

	const auto &fields = StructType::GetChildTypes(type);
	const auto field_count = fields.size();
	auto &pointers = append_data.child_pointers;
	pointers.resize(field_count);
	result->children = pointers.data();
	result->n_children = field_count;

	// finalize the children in field order
	for (idx_t field_idx = 0; field_idx < field_count; field_idx++) {
		pointers[field_idx] = ArrowAppender::FinalizeChild(fields[field_idx].second, *append_data.child_data[field_idx]);
	}
}
|
44
|
+
|
45
|
+
} // namespace duckdb
|
@@ -0,0 +1,70 @@
|
|
1
|
+
#include "duckdb/common/arrow/arrow_appender.hpp"
|
2
|
+
#include "duckdb/common/arrow/appender/union_data.hpp"
|
3
|
+
|
4
|
+
namespace duckdb {
|
5
|
+
|
6
|
+
//===--------------------------------------------------------------------===//
|
7
|
+
// Unions
|
8
|
+
//===--------------------------------------------------------------------===//
|
9
|
+
// Prepares a union appender: reserves one type-id byte per row and creates one child appender per member.
void ArrowUnionData::Initialize(ArrowAppendData &result, const LogicalType &type, idx_t capacity) {
	// the main buffer stores one 8-bit type id per row
	const idx_t type_id_bytes = capacity * sizeof(int8_t);
	result.main_buffer.reserve(type_id_bytes);

	const auto member_types = UnionType::CopyMemberTypes(type);
	for (const auto &member : member_types) {
		// member.second is the member's type
		result.child_data.push_back(ArrowAppender::InitializeChild(member.second, capacity, result.options));
	}
}
|
17
|
+
|
18
|
+
// Appends rows [from, to) of a union vector.
// For every row the tag of the active member is written to the type-id buffer; the row's value is placed
// in the child vector of that member and NULL in the child vectors of all other members. A NULL union
// row gets tag 0 and NULL in every child. The child vectors are then appended to the child appenders.
//
// The type-id buffer is grown explicitly and written at positions following the rows appended by earlier
// calls. Previously the buffer was never resized and was indexed by the position in the input vector,
// so a second Append overwrote the type ids of the first one.
void ArrowUnionData::Append(ArrowAppendData &append_data, Vector &input, idx_t from, idx_t to, idx_t input_size) {
	// NOTE(review): the unified format is not read below; the call is kept from the original
	UnifiedVectorFormat format;
	input.ToUnifiedFormat(input_size, format);
	idx_t size = to - from;

	// make room for one type id per appended row, after the type ids of the rows that are already present
	auto &types_buffer = append_data.main_buffer;
	types_buffer.resize((append_data.row_count + size) * sizeof(int8_t));
	// fetch the pointer only after resizing
	auto type_ids = types_buffer.data() + append_data.row_count;

	duckdb::vector<Vector> child_vectors;
	for (const auto &child : UnionType::CopyMemberTypes(input.GetType())) {
		child_vectors.emplace_back(child.second);
	}

	for (idx_t input_idx = from; input_idx < to; input_idx++) {
		const auto val = input.GetValue(input_idx);

		idx_t tag = 0;
		Value resolved_value(nullptr);
		if (!val.IsNull()) {
			tag = UnionValue::GetTag(val);

			resolved_value = UnionValue::GetValue(val);
		}

		// only the active member holds the value, all other members are NULL for this row
		for (idx_t child_idx = 0; child_idx < child_vectors.size(); child_idx++) {
			child_vectors[child_idx].SetValue(input_idx, child_idx == tag ? resolved_value : Value(nullptr));
		}

		// the child vectors are indexed by input position, the type ids by output position
		type_ids[input_idx - from] = static_cast<uint8_t>(tag);
	}

	for (idx_t child_idx = 0; child_idx < child_vectors.size(); child_idx++) {
		auto &child_buffer = append_data.child_data[child_idx];
		auto &child = child_vectors[child_idx];
		child_buffer->append_vector(*child_buffer, child, from, to, size);
	}
	append_data.row_count += size;
}
|
55
|
+
|
56
|
+
// Fills in the union-specific parts of the Arrow array: the type-id buffer (slot 1) and one finalized
// child per union member.
void ArrowUnionData::Finalize(ArrowAppendData &append_data, const LogicalType &type, ArrowArray *result) {
	// buffer 1 holds the type ids
	result->n_buffers = 2;
	result->buffers[1] = append_data.main_buffer.data();

	const auto member_types = UnionType::CopyMemberTypes(type);
	const auto member_count = member_types.size();
	auto &pointers = append_data.child_pointers;
	pointers.resize(member_count);
	result->children = pointers.data();
	result->n_children = member_count;

	// finalize the children in member order
	for (idx_t member_idx = 0; member_idx < member_count; member_idx++) {
		pointers[member_idx] =
		    ArrowAppender::FinalizeChild(member_types[member_idx].second, *append_data.child_data[member_idx]);
	}
}
|
69
|
+
|
70
|
+
} // namespace duckdb
|