duckdb 0.8.2-dev2068.0 → 0.8.2-dev2133.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/binding.gyp +4 -0
- package/package.json +1 -1
- package/src/duckdb/extension/json/buffered_json_reader.cpp +2 -0
- package/src/duckdb/extension/json/include/buffered_json_reader.hpp +5 -19
- package/src/duckdb/extension/json/include/json_enums.hpp +60 -0
- package/src/duckdb/extension/json/include/json_scan.hpp +14 -10
- package/src/duckdb/extension/json/include/json_transform.hpp +3 -0
- package/src/duckdb/extension/json/json_enums.cpp +105 -0
- package/src/duckdb/extension/json/json_functions/json_transform.cpp +2 -0
- package/src/duckdb/extension/json/json_scan.cpp +44 -0
- package/src/duckdb/extension/json/serialize_json.cpp +92 -0
- package/src/duckdb/extension/parquet/include/parquet_reader.hpp +3 -0
- package/src/duckdb/extension/parquet/parquet_extension.cpp +23 -0
- package/src/duckdb/extension/parquet/parquet_reader.cpp +3 -0
- package/src/duckdb/extension/parquet/serialize_parquet.cpp +26 -0
- package/src/duckdb/src/common/arrow/appender/bool_data.cpp +44 -0
- package/src/duckdb/src/common/arrow/appender/list_data.cpp +78 -0
- package/src/duckdb/src/common/arrow/appender/map_data.cpp +86 -0
- package/src/duckdb/src/common/arrow/appender/struct_data.cpp +45 -0
- package/src/duckdb/src/common/arrow/appender/union_data.cpp +70 -0
- package/src/duckdb/src/common/arrow/arrow_appender.cpp +89 -727
- package/src/duckdb/src/common/arrow/arrow_wrapper.cpp +2 -1
- package/src/duckdb/src/common/local_file_system.cpp +17 -14
- package/src/duckdb/src/common/serializer/format_serializer.cpp +15 -0
- package/src/duckdb/src/core_functions/aggregate/holistic/approximate_quantile.cpp +26 -0
- package/src/duckdb/src/core_functions/aggregate/holistic/quantile.cpp +47 -0
- package/src/duckdb/src/core_functions/aggregate/holistic/reservoir_quantile.cpp +28 -0
- package/src/duckdb/src/core_functions/scalar/date/strftime.cpp +10 -0
- package/src/duckdb/src/core_functions/scalar/list/list_lambdas.cpp +22 -3
- package/src/duckdb/src/function/aggregate/distributive/count.cpp +0 -11
- package/src/duckdb/src/function/aggregate/sorted_aggregate_function.cpp +1 -9
- package/src/duckdb/src/function/scalar/system/aggregate_export.cpp +27 -0
- package/src/duckdb/src/function/scalar_function.cpp +2 -1
- package/src/duckdb/src/function/table/read_csv.cpp +18 -0
- package/src/duckdb/src/function/table/table_scan.cpp +35 -0
- package/src/duckdb/src/function/table/version/pragma_version.cpp +2 -2
- package/src/duckdb/src/function/table_function.cpp +4 -3
- package/src/duckdb/src/include/duckdb/common/arrow/appender/append_data.hpp +109 -0
- package/src/duckdb/src/include/duckdb/common/arrow/appender/bool_data.hpp +15 -0
- package/src/duckdb/src/include/duckdb/common/arrow/appender/enum_data.hpp +69 -0
- package/src/duckdb/src/include/duckdb/common/arrow/appender/list.hpp +8 -0
- package/src/duckdb/src/include/duckdb/common/arrow/appender/list_data.hpp +18 -0
- package/src/duckdb/src/include/duckdb/common/arrow/appender/map_data.hpp +18 -0
- package/src/duckdb/src/include/duckdb/common/arrow/appender/scalar_data.hpp +88 -0
- package/src/duckdb/src/include/duckdb/common/arrow/appender/struct_data.hpp +18 -0
- package/src/duckdb/src/include/duckdb/common/arrow/appender/union_data.hpp +21 -0
- package/src/duckdb/src/include/duckdb/common/arrow/appender/varchar_data.hpp +105 -0
- package/src/duckdb/src/include/duckdb/common/arrow/arrow_appender.hpp +5 -0
- package/src/duckdb/src/include/duckdb/common/multi_file_reader.hpp +5 -1
- package/src/duckdb/src/include/duckdb/common/multi_file_reader_options.hpp +2 -0
- package/src/duckdb/src/include/duckdb/common/serializer/format_deserializer.hpp +32 -0
- package/src/duckdb/src/include/duckdb/common/serializer/format_serializer.hpp +45 -15
- package/src/duckdb/src/include/duckdb/common/serializer/serialization_traits.hpp +10 -0
- package/src/duckdb/src/include/duckdb/execution/operator/persistent/csv_reader_options.hpp +2 -0
- package/src/duckdb/src/include/duckdb/function/aggregate_function.hpp +11 -2
- package/src/duckdb/src/include/duckdb/function/function_serialization.hpp +81 -0
- package/src/duckdb/src/include/duckdb/function/scalar/strftime_format.hpp +8 -0
- package/src/duckdb/src/include/duckdb/function/scalar_function.hpp +8 -0
- package/src/duckdb/src/include/duckdb/function/table/read_csv.hpp +7 -0
- package/src/duckdb/src/include/duckdb/function/table_function.hpp +8 -0
- package/src/duckdb/src/include/duckdb/planner/expression/bound_aggregate_expression.hpp +3 -0
- package/src/duckdb/src/include/duckdb/planner/expression/bound_function_expression.hpp +4 -0
- package/src/duckdb/src/include/duckdb/planner/expression/bound_window_expression.hpp +3 -0
- package/src/duckdb/src/include/duckdb/planner/filter/conjunction_filter.hpp +4 -0
- package/src/duckdb/src/include/duckdb/planner/filter/constant_filter.hpp +2 -0
- package/src/duckdb/src/include/duckdb/planner/filter/null_filter.hpp +4 -0
- package/src/duckdb/src/include/duckdb/planner/operator/logical_copy_to_file.hpp +2 -0
- package/src/duckdb/src/include/duckdb/planner/operator/logical_get.hpp +7 -1
- package/src/duckdb/src/include/duckdb/planner/table_filter.hpp +7 -1
- package/src/duckdb/src/main/extension/extension_helper.cpp +13 -0
- package/src/duckdb/src/parallel/executor.cpp +1 -1
- package/src/duckdb/src/planner/expression/bound_aggregate_expression.cpp +23 -0
- package/src/duckdb/src/planner/expression/bound_function_expression.cpp +22 -0
- package/src/duckdb/src/planner/expression/bound_window_expression.cpp +47 -0
- package/src/duckdb/src/planner/operator/logical_copy_to_file.cpp +8 -0
- package/src/duckdb/src/planner/operator/logical_get.cpp +69 -0
- package/src/duckdb/src/storage/serialization/serialize_expression.cpp +9 -0
- package/src/duckdb/src/storage/serialization/serialize_logical_operator.cpp +6 -0
- package/src/duckdb/src/storage/serialization/serialize_nodes.cpp +190 -0
- package/src/duckdb/src/storage/serialization/serialize_table_filter.cpp +97 -0
- package/src/duckdb/ub_src_common_arrow_appender.cpp +10 -0
- package/src/duckdb/ub_src_common_serializer.cpp +2 -0
- package/src/duckdb/ub_src_storage_serialization.cpp +2 -0
@@ -15,6 +15,8 @@
|
|
15
15
|
#include "duckdb/catalog/dependency_list.hpp"
|
16
16
|
#include "duckdb/function/function_set.hpp"
|
17
17
|
#include "duckdb/storage/table/scan_state.hpp"
|
18
|
+
#include "duckdb/common/serializer/format_serializer.hpp"
|
19
|
+
#include "duckdb/common/serializer/format_deserializer.hpp"
|
18
20
|
|
19
21
|
namespace duckdb {
|
20
22
|
|
@@ -447,6 +449,35 @@ static unique_ptr<FunctionData> TableScanDeserialize(PlanDeserializationState &s
|
|
447
449
|
return std::move(result);
|
448
450
|
}
|
449
451
|
|
452
|
+
static void TableScanFormatSerialize(FormatSerializer &serializer, const optional_ptr<FunctionData> bind_data_p,
|
453
|
+
const TableFunction &function) {
|
454
|
+
auto &bind_data = bind_data_p->Cast<TableScanBindData>();
|
455
|
+
serializer.WriteProperty("catalog", bind_data.table.schema.catalog.GetName());
|
456
|
+
serializer.WriteProperty("schema", bind_data.table.schema.name);
|
457
|
+
serializer.WriteProperty("table", bind_data.table.name);
|
458
|
+
serializer.WriteProperty("is_index_scan", bind_data.is_index_scan);
|
459
|
+
serializer.WriteProperty("is_create_index", bind_data.is_create_index);
|
460
|
+
serializer.WriteProperty("result_ids", bind_data.result_ids);
|
461
|
+
serializer.WriteProperty("result_ids", bind_data.result_ids);
|
462
|
+
}
|
463
|
+
|
464
|
+
static unique_ptr<FunctionData> TableScanFormatDeserialize(FormatDeserializer &deserializer, TableFunction &function) {
|
465
|
+
auto catalog = deserializer.ReadProperty<string>("catalog");
|
466
|
+
auto schema = deserializer.ReadProperty<string>("schema");
|
467
|
+
auto table = deserializer.ReadProperty<string>("table");
|
468
|
+
auto &catalog_entry =
|
469
|
+
Catalog::GetEntry<TableCatalogEntry>(deserializer.Get<ClientContext &>(), catalog, schema, table);
|
470
|
+
if (catalog_entry.type != CatalogType::TABLE_ENTRY) {
|
471
|
+
throw SerializationException("Cant find table for %s.%s", schema, table);
|
472
|
+
}
|
473
|
+
auto result = make_uniq<TableScanBindData>(catalog_entry.Cast<DuckTableEntry>());
|
474
|
+
deserializer.ReadProperty("is_index_scan", result->is_index_scan);
|
475
|
+
deserializer.ReadProperty("is_create_index", result->is_create_index);
|
476
|
+
deserializer.ReadProperty("result_ids", result->result_ids);
|
477
|
+
deserializer.ReadProperty("result_ids", result->result_ids);
|
478
|
+
return std::move(result);
|
479
|
+
}
|
480
|
+
|
450
481
|
TableFunction TableScanFunction::GetIndexScanFunction() {
|
451
482
|
TableFunction scan_function("index_scan", {}, IndexScanFunction);
|
452
483
|
scan_function.init_local = nullptr;
|
@@ -462,6 +493,8 @@ TableFunction TableScanFunction::GetIndexScanFunction() {
|
|
462
493
|
scan_function.filter_pushdown = false;
|
463
494
|
scan_function.serialize = TableScanSerialize;
|
464
495
|
scan_function.deserialize = TableScanDeserialize;
|
496
|
+
scan_function.format_serialize = TableScanFormatSerialize;
|
497
|
+
scan_function.format_deserialize = TableScanFormatDeserialize;
|
465
498
|
return scan_function;
|
466
499
|
}
|
467
500
|
|
@@ -482,6 +515,8 @@ TableFunction TableScanFunction::GetFunction() {
|
|
482
515
|
scan_function.filter_prune = true;
|
483
516
|
scan_function.serialize = TableScanSerialize;
|
484
517
|
scan_function.deserialize = TableScanDeserialize;
|
518
|
+
scan_function.format_serialize = TableScanFormatSerialize;
|
519
|
+
scan_function.format_deserialize = TableScanFormatDeserialize;
|
485
520
|
return scan_function;
|
486
521
|
}
|
487
522
|
|
@@ -1,8 +1,8 @@
|
|
1
1
|
#ifndef DUCKDB_VERSION
|
2
|
-
#define DUCKDB_VERSION "0.8.2-dev2068"
|
2
|
+
#define DUCKDB_VERSION "0.8.2-dev2133"
|
3
3
|
#endif
|
4
4
|
#ifndef DUCKDB_SOURCE_ID
|
5
|
-
#define DUCKDB_SOURCE_ID "
|
5
|
+
#define DUCKDB_SOURCE_ID "726b6d1566"
|
6
6
|
#endif
|
7
7
|
#include "duckdb/function/table/system_functions.hpp"
|
8
8
|
#include "duckdb/main/database.hpp"
|
@@ -18,8 +18,8 @@ TableFunction::TableFunction(string name, vector<LogicalType> arguments, table_f
|
|
18
18
|
init_global(init_global), init_local(init_local), function(function), in_out_function(nullptr),
|
19
19
|
in_out_function_final(nullptr), statistics(nullptr), dependency(nullptr), cardinality(nullptr),
|
20
20
|
pushdown_complex_filter(nullptr), to_string(nullptr), table_scan_progress(nullptr), get_batch_index(nullptr),
|
21
|
-
get_batch_info(nullptr), serialize(nullptr), deserialize(nullptr),
|
22
|
-
filter_pushdown(false), filter_prune(false) {
|
21
|
+
get_batch_info(nullptr), serialize(nullptr), deserialize(nullptr), format_serialize(nullptr),
|
22
|
+
format_deserialize(nullptr), projection_pushdown(false), filter_pushdown(false), filter_prune(false) {
|
23
23
|
}
|
24
24
|
|
25
25
|
TableFunction::TableFunction(const vector<LogicalType> &arguments, table_function_t function,
|
@@ -32,7 +32,8 @@ TableFunction::TableFunction()
|
|
32
32
|
init_local(nullptr), function(nullptr), in_out_function(nullptr), statistics(nullptr), dependency(nullptr),
|
33
33
|
cardinality(nullptr), pushdown_complex_filter(nullptr), to_string(nullptr), table_scan_progress(nullptr),
|
34
34
|
get_batch_index(nullptr), get_batch_info(nullptr), serialize(nullptr), deserialize(nullptr),
|
35
|
-
|
35
|
+
format_serialize(nullptr), format_deserialize(nullptr), projection_pushdown(false), filter_pushdown(false),
|
36
|
+
filter_prune(false) {
|
36
37
|
}
|
37
38
|
|
38
39
|
bool TableFunction::Equal(const TableFunction &rhs) const {
|
@@ -0,0 +1,109 @@
|
|
1
|
+
#pragma once
|
2
|
+
|
3
|
+
#include "duckdb/common/types.hpp"
|
4
|
+
#include "duckdb/common/types/vector.hpp"
|
5
|
+
#include "duckdb/common/arrow/arrow.hpp"
|
6
|
+
#include "duckdb/common/arrow/arrow_buffer.hpp"
|
7
|
+
#include "duckdb/common/arrow/arrow_options.hpp"
|
8
|
+
#include "duckdb/common/array.hpp"
|
9
|
+
|
10
|
+
namespace duckdb {
|
11
|
+
|
12
|
+
//===--------------------------------------------------------------------===//
|
13
|
+
// Arrow append data
|
14
|
+
//===--------------------------------------------------------------------===//
|
15
|
+
typedef void (*initialize_t)(ArrowAppendData &result, const LogicalType &type, idx_t capacity);
|
16
|
+
// append_data: The arrow array we're appending into
|
17
|
+
// input: The data we're appending
|
18
|
+
// from: The offset into the input we're scanning
|
19
|
+
// to: The last index of the input we're scanning
|
20
|
+
// input_size: The total size of the 'input' Vector.
|
21
|
+
typedef void (*append_vector_t)(ArrowAppendData &append_data, Vector &input, idx_t from, idx_t to, idx_t input_size);
|
22
|
+
typedef void (*finalize_t)(ArrowAppendData &append_data, const LogicalType &type, ArrowArray *result);
|
23
|
+
|
24
|
+
// This struct is used to save state for appending a column
|
25
|
+
// afterwards the ownership is passed to the arrow array, as 'private_data'
|
26
|
+
// FIXME: we should separate the append state variables from the variables required by the ArrowArray into
|
27
|
+
// ArrowAppendState
|
28
|
+
struct ArrowAppendData {
|
29
|
+
explicit ArrowAppendData(ArrowOptions &options_p) : options(options_p) {
|
30
|
+
}
|
31
|
+
// the buffers of the arrow vector
|
32
|
+
ArrowBuffer validity;
|
33
|
+
ArrowBuffer main_buffer;
|
34
|
+
ArrowBuffer aux_buffer;
|
35
|
+
|
36
|
+
idx_t row_count = 0;
|
37
|
+
idx_t null_count = 0;
|
38
|
+
|
39
|
+
// function pointers for construction
|
40
|
+
initialize_t initialize = nullptr;
|
41
|
+
append_vector_t append_vector = nullptr;
|
42
|
+
finalize_t finalize = nullptr;
|
43
|
+
|
44
|
+
// child data (if any)
|
45
|
+
vector<unique_ptr<ArrowAppendData>> child_data;
|
46
|
+
|
47
|
+
// the arrow array C API data, only set after Finalize
|
48
|
+
unique_ptr<ArrowArray> array;
|
49
|
+
duckdb::array<const void *, 3> buffers = {{nullptr, nullptr, nullptr}};
|
50
|
+
vector<ArrowArray *> child_pointers;
|
51
|
+
|
52
|
+
ArrowOptions options;
|
53
|
+
};
|
54
|
+
|
55
|
+
//===--------------------------------------------------------------------===//
|
56
|
+
// Append Helper Functions
|
57
|
+
//===--------------------------------------------------------------------===//
|
58
|
+
static void GetBitPosition(idx_t row_idx, idx_t &current_byte, uint8_t &current_bit) {
|
59
|
+
current_byte = row_idx / 8;
|
60
|
+
current_bit = row_idx % 8;
|
61
|
+
}
|
62
|
+
|
63
|
+
static void UnsetBit(uint8_t *data, idx_t current_byte, uint8_t current_bit) {
|
64
|
+
data[current_byte] &= ~((uint64_t)1 << current_bit);
|
65
|
+
}
|
66
|
+
|
67
|
+
static void NextBit(idx_t &current_byte, uint8_t &current_bit) {
|
68
|
+
current_bit++;
|
69
|
+
if (current_bit == 8) {
|
70
|
+
current_byte++;
|
71
|
+
current_bit = 0;
|
72
|
+
}
|
73
|
+
}
|
74
|
+
|
75
|
+
static void ResizeValidity(ArrowBuffer &buffer, idx_t row_count) {
|
76
|
+
auto byte_count = (row_count + 7) / 8;
|
77
|
+
buffer.resize(byte_count, 0xFF);
|
78
|
+
}
|
79
|
+
|
80
|
+
static void SetNull(ArrowAppendData &append_data, uint8_t *validity_data, idx_t current_byte, uint8_t current_bit) {
|
81
|
+
UnsetBit(validity_data, current_byte, current_bit);
|
82
|
+
append_data.null_count++;
|
83
|
+
}
|
84
|
+
|
85
|
+
static void AppendValidity(ArrowAppendData &append_data, UnifiedVectorFormat &format, idx_t from, idx_t to) {
|
86
|
+
// resize the buffer, filling the validity buffer with all valid values
|
87
|
+
idx_t size = to - from;
|
88
|
+
ResizeValidity(append_data.validity, append_data.row_count + size);
|
89
|
+
if (format.validity.AllValid()) {
|
90
|
+
// if all values are valid we don't need to do anything else
|
91
|
+
return;
|
92
|
+
}
|
93
|
+
|
94
|
+
// otherwise we iterate through the validity mask
|
95
|
+
auto validity_data = (uint8_t *)append_data.validity.data();
|
96
|
+
uint8_t current_bit;
|
97
|
+
idx_t current_byte;
|
98
|
+
GetBitPosition(append_data.row_count, current_byte, current_bit);
|
99
|
+
for (idx_t i = from; i < to; i++) {
|
100
|
+
auto source_idx = format.sel->get_index(i);
|
101
|
+
// append the validity mask
|
102
|
+
if (!format.validity.RowIsValid(source_idx)) {
|
103
|
+
SetNull(append_data, validity_data, current_byte, current_bit);
|
104
|
+
}
|
105
|
+
NextBit(current_byte, current_bit);
|
106
|
+
}
|
107
|
+
}
|
108
|
+
|
109
|
+
} // namespace duckdb
|
@@ -0,0 +1,15 @@
|
|
1
|
+
#pragma once
|
2
|
+
|
3
|
+
#include "duckdb/common/arrow/appender/append_data.hpp"
|
4
|
+
#include "duckdb/common/types/vector.hpp"
|
5
|
+
|
6
|
+
namespace duckdb {
|
7
|
+
|
8
|
+
struct ArrowBoolData {
|
9
|
+
public:
|
10
|
+
static void Initialize(ArrowAppendData &result, const LogicalType &type, idx_t capacity);
|
11
|
+
static void Append(ArrowAppendData &append_data, Vector &input, idx_t from, idx_t to, idx_t input_size);
|
12
|
+
static void Finalize(ArrowAppendData &append_data, const LogicalType &type, ArrowArray *result);
|
13
|
+
};
|
14
|
+
|
15
|
+
} // namespace duckdb
|
@@ -0,0 +1,69 @@
|
|
1
|
+
#pragma once
|
2
|
+
|
3
|
+
#include "duckdb/common/arrow/appender/append_data.hpp"
|
4
|
+
#include "duckdb/common/arrow/appender/scalar_data.hpp"
|
5
|
+
|
6
|
+
namespace duckdb {
|
7
|
+
|
8
|
+
//===--------------------------------------------------------------------===//
|
9
|
+
// Enums
|
10
|
+
//===--------------------------------------------------------------------===//
|
11
|
+
template <class TGT>
|
12
|
+
struct ArrowEnumData : public ArrowScalarBaseData<TGT> {
|
13
|
+
static idx_t GetLength(string_t input) {
|
14
|
+
return input.GetSize();
|
15
|
+
}
|
16
|
+
static void WriteData(data_ptr_t target, string_t input) {
|
17
|
+
memcpy(target, input.GetData(), input.GetSize());
|
18
|
+
}
|
19
|
+
static void EnumAppendVector(ArrowAppendData &append_data, const Vector &input, idx_t size) {
|
20
|
+
D_ASSERT(input.GetVectorType() == VectorType::FLAT_VECTOR);
|
21
|
+
|
22
|
+
// resize the validity mask and set up the validity buffer for iteration
|
23
|
+
ResizeValidity(append_data.validity, append_data.row_count + size);
|
24
|
+
|
25
|
+
// resize the offset buffer - the offset buffer holds the offsets into the child array
|
26
|
+
append_data.main_buffer.resize(append_data.main_buffer.size() + sizeof(uint32_t) * (size + 1));
|
27
|
+
auto data = FlatVector::GetData<string_t>(input);
|
28
|
+
auto offset_data = append_data.main_buffer.GetData<uint32_t>();
|
29
|
+
if (append_data.row_count == 0) {
|
30
|
+
// first entry
|
31
|
+
offset_data[0] = 0;
|
32
|
+
}
|
33
|
+
// now append the string data to the auxiliary buffer
|
34
|
+
// the auxiliary buffer's length depends on the string lengths, so we resize as required
|
35
|
+
auto last_offset = offset_data[append_data.row_count];
|
36
|
+
for (idx_t i = 0; i < size; i++) {
|
37
|
+
auto offset_idx = append_data.row_count + i + 1;
|
38
|
+
|
39
|
+
auto string_length = GetLength(data[i]);
|
40
|
+
|
41
|
+
// append the offset data
|
42
|
+
auto current_offset = last_offset + string_length;
|
43
|
+
offset_data[offset_idx] = current_offset;
|
44
|
+
|
45
|
+
// resize the string buffer if required, and write the string data
|
46
|
+
append_data.aux_buffer.resize(current_offset);
|
47
|
+
WriteData(append_data.aux_buffer.data() + last_offset, data[i]);
|
48
|
+
|
49
|
+
last_offset = current_offset;
|
50
|
+
}
|
51
|
+
append_data.row_count += size;
|
52
|
+
}
|
53
|
+
static void Initialize(ArrowAppendData &result, const LogicalType &type, idx_t capacity) {
|
54
|
+
result.main_buffer.reserve(capacity * sizeof(TGT));
|
55
|
+
// construct the enum child data
|
56
|
+
auto enum_data = ArrowAppender::InitializeChild(LogicalType::VARCHAR, EnumType::GetSize(type), result.options);
|
57
|
+
EnumAppendVector(*enum_data, EnumType::GetValuesInsertOrder(type), EnumType::GetSize(type));
|
58
|
+
result.child_data.push_back(std::move(enum_data));
|
59
|
+
}
|
60
|
+
|
61
|
+
static void Finalize(ArrowAppendData &append_data, const LogicalType &type, ArrowArray *result) {
|
62
|
+
result->n_buffers = 2;
|
63
|
+
result->buffers[1] = append_data.main_buffer.data();
|
64
|
+
// finalize the enum child data, and assign it to the dictionary
|
65
|
+
result->dictionary = ArrowAppender::FinalizeChild(LogicalType::VARCHAR, *append_data.child_data[0]);
|
66
|
+
}
|
67
|
+
};
|
68
|
+
|
69
|
+
} // namespace duckdb
|
@@ -0,0 +1,8 @@
|
|
1
|
+
#include "duckdb/common/arrow/appender/bool_data.hpp"
|
2
|
+
#include "duckdb/common/arrow/appender/enum_data.hpp"
|
3
|
+
#include "duckdb/common/arrow/appender/list_data.hpp"
|
4
|
+
#include "duckdb/common/arrow/appender/map_data.hpp"
|
5
|
+
#include "duckdb/common/arrow/appender/scalar_data.hpp"
|
6
|
+
#include "duckdb/common/arrow/appender/struct_data.hpp"
|
7
|
+
#include "duckdb/common/arrow/appender/union_data.hpp"
|
8
|
+
#include "duckdb/common/arrow/appender/varchar_data.hpp"
|
@@ -0,0 +1,18 @@
|
|
1
|
+
#pragma once
|
2
|
+
|
3
|
+
#include "duckdb/common/arrow/appender/append_data.hpp"
|
4
|
+
|
5
|
+
namespace duckdb {
|
6
|
+
|
7
|
+
struct ArrowListData {
|
8
|
+
public:
|
9
|
+
static void Initialize(ArrowAppendData &result, const LogicalType &type, idx_t capacity);
|
10
|
+
static void Append(ArrowAppendData &append_data, Vector &input, idx_t from, idx_t to, idx_t input_size);
|
11
|
+
static void Finalize(ArrowAppendData &append_data, const LogicalType &type, ArrowArray *result);
|
12
|
+
|
13
|
+
public:
|
14
|
+
static void AppendOffsets(ArrowAppendData &append_data, UnifiedVectorFormat &format, idx_t from, idx_t to,
|
15
|
+
vector<sel_t> &child_sel);
|
16
|
+
};
|
17
|
+
|
18
|
+
} // namespace duckdb
|
@@ -0,0 +1,18 @@
|
|
1
|
+
#pragma once
|
2
|
+
|
3
|
+
#include "duckdb/common/arrow/arrow_appender.hpp"
|
4
|
+
#include "duckdb/common/arrow/appender/append_data.hpp"
|
5
|
+
|
6
|
+
namespace duckdb {
|
7
|
+
|
8
|
+
//===--------------------------------------------------------------------===//
|
9
|
+
// Maps
|
10
|
+
//===--------------------------------------------------------------------===//
|
11
|
+
struct ArrowMapData {
|
12
|
+
public:
|
13
|
+
static void Initialize(ArrowAppendData &result, const LogicalType &type, idx_t capacity);
|
14
|
+
static void Append(ArrowAppendData &append_data, Vector &input, idx_t from, idx_t to, idx_t input_size);
|
15
|
+
static void Finalize(ArrowAppendData &append_data, const LogicalType &type, ArrowArray *result);
|
16
|
+
};
|
17
|
+
|
18
|
+
} // namespace duckdb
|
@@ -0,0 +1,88 @@
|
|
1
|
+
#pragma once
|
2
|
+
|
3
|
+
#include "duckdb/common/arrow/appender/append_data.hpp"
|
4
|
+
#include "duckdb/function/table/arrow.hpp"
|
5
|
+
|
6
|
+
namespace duckdb {
|
7
|
+
|
8
|
+
//===--------------------------------------------------------------------===//
|
9
|
+
// Scalar Types
|
10
|
+
//===--------------------------------------------------------------------===//
|
11
|
+
struct ArrowScalarConverter {
|
12
|
+
template <class TGT, class SRC>
|
13
|
+
static TGT Operation(SRC input) {
|
14
|
+
return input;
|
15
|
+
}
|
16
|
+
|
17
|
+
static bool SkipNulls() {
|
18
|
+
return false;
|
19
|
+
}
|
20
|
+
|
21
|
+
template <class TGT>
|
22
|
+
static void SetNull(TGT &value) {
|
23
|
+
}
|
24
|
+
};
|
25
|
+
|
26
|
+
struct ArrowIntervalConverter {
|
27
|
+
template <class TGT, class SRC>
|
28
|
+
static TGT Operation(SRC input) {
|
29
|
+
ArrowInterval result;
|
30
|
+
result.months = input.months;
|
31
|
+
result.days = input.days;
|
32
|
+
result.nanoseconds = input.micros * Interval::NANOS_PER_MICRO;
|
33
|
+
return result;
|
34
|
+
}
|
35
|
+
|
36
|
+
static bool SkipNulls() {
|
37
|
+
return true;
|
38
|
+
}
|
39
|
+
|
40
|
+
template <class TGT>
|
41
|
+
static void SetNull(TGT &value) {
|
42
|
+
}
|
43
|
+
};
|
44
|
+
|
45
|
+
template <class TGT, class SRC = TGT, class OP = ArrowScalarConverter>
|
46
|
+
struct ArrowScalarBaseData {
|
47
|
+
static void Append(ArrowAppendData &append_data, Vector &input, idx_t from, idx_t to, idx_t input_size) {
|
48
|
+
D_ASSERT(to >= from);
|
49
|
+
idx_t size = to - from;
|
50
|
+
D_ASSERT(size <= input_size);
|
51
|
+
UnifiedVectorFormat format;
|
52
|
+
input.ToUnifiedFormat(input_size, format);
|
53
|
+
|
54
|
+
// append the validity mask
|
55
|
+
AppendValidity(append_data, format, from, to);
|
56
|
+
|
57
|
+
// append the main data
|
58
|
+
append_data.main_buffer.resize(append_data.main_buffer.size() + sizeof(TGT) * size);
|
59
|
+
auto data = UnifiedVectorFormat::GetData<SRC>(format);
|
60
|
+
auto result_data = append_data.main_buffer.GetData<TGT>();
|
61
|
+
|
62
|
+
for (idx_t i = from; i < to; i++) {
|
63
|
+
auto source_idx = format.sel->get_index(i);
|
64
|
+
auto result_idx = append_data.row_count + i - from;
|
65
|
+
|
66
|
+
if (OP::SkipNulls() && !format.validity.RowIsValid(source_idx)) {
|
67
|
+
OP::template SetNull<TGT>(result_data[result_idx]);
|
68
|
+
continue;
|
69
|
+
}
|
70
|
+
result_data[result_idx] = OP::template Operation<TGT, SRC>(data[source_idx]);
|
71
|
+
}
|
72
|
+
append_data.row_count += size;
|
73
|
+
}
|
74
|
+
};
|
75
|
+
|
76
|
+
template <class TGT, class SRC = TGT, class OP = ArrowScalarConverter>
|
77
|
+
struct ArrowScalarData : public ArrowScalarBaseData<TGT, SRC, OP> {
|
78
|
+
static void Initialize(ArrowAppendData &result, const LogicalType &type, idx_t capacity) {
|
79
|
+
result.main_buffer.reserve(capacity * sizeof(TGT));
|
80
|
+
}
|
81
|
+
|
82
|
+
static void Finalize(ArrowAppendData &append_data, const LogicalType &type, ArrowArray *result) {
|
83
|
+
result->n_buffers = 2;
|
84
|
+
result->buffers[1] = append_data.main_buffer.data();
|
85
|
+
}
|
86
|
+
};
|
87
|
+
|
88
|
+
} // namespace duckdb
|
@@ -0,0 +1,18 @@
|
|
1
|
+
#pragma once
|
2
|
+
|
3
|
+
#include "duckdb/common/arrow/appender/append_data.hpp"
|
4
|
+
#include "duckdb/common/arrow/appender/scalar_data.hpp"
|
5
|
+
|
6
|
+
namespace duckdb {
|
7
|
+
|
8
|
+
//===--------------------------------------------------------------------===//
|
9
|
+
// Structs
|
10
|
+
//===--------------------------------------------------------------------===//
|
11
|
+
struct ArrowStructData {
|
12
|
+
public:
|
13
|
+
static void Initialize(ArrowAppendData &result, const LogicalType &type, idx_t capacity);
|
14
|
+
static void Append(ArrowAppendData &append_data, Vector &input, idx_t from, idx_t to, idx_t input_size);
|
15
|
+
static void Finalize(ArrowAppendData &append_data, const LogicalType &type, ArrowArray *result);
|
16
|
+
};
|
17
|
+
|
18
|
+
} // namespace duckdb
|
@@ -0,0 +1,21 @@
|
|
1
|
+
#pragma once
|
2
|
+
|
3
|
+
#include "duckdb/common/arrow/appender/append_data.hpp"
|
4
|
+
|
5
|
+
namespace duckdb {
|
6
|
+
|
7
|
+
//===--------------------------------------------------------------------===//
|
8
|
+
// Unions
|
9
|
+
//===--------------------------------------------------------------------===//
|
10
|
+
/**
|
11
|
+
* Based on https://arrow.apache.org/docs/format/Columnar.html#union-layout &
|
12
|
+
* https://arrow.apache.org/docs/format/CDataInterface.html
|
13
|
+
*/
|
14
|
+
struct ArrowUnionData {
|
15
|
+
public:
|
16
|
+
static void Initialize(ArrowAppendData &result, const LogicalType &type, idx_t capacity);
|
17
|
+
static void Append(ArrowAppendData &append_data, Vector &input, idx_t from, idx_t to, idx_t input_size);
|
18
|
+
static void Finalize(ArrowAppendData &append_data, const LogicalType &type, ArrowArray *result);
|
19
|
+
};
|
20
|
+
|
21
|
+
} // namespace duckdb
|
@@ -0,0 +1,105 @@
|
|
1
|
+
#pragma once
|
2
|
+
|
3
|
+
#include "duckdb/common/arrow/appender/append_data.hpp"
|
4
|
+
#include "duckdb/common/arrow/appender/scalar_data.hpp"
|
5
|
+
|
6
|
+
namespace duckdb {
|
7
|
+
|
8
|
+
//===--------------------------------------------------------------------===//
|
9
|
+
// Varchar
|
10
|
+
//===--------------------------------------------------------------------===//
|
11
|
+
struct ArrowVarcharConverter {
|
12
|
+
template <class SRC>
|
13
|
+
static idx_t GetLength(SRC input) {
|
14
|
+
return input.GetSize();
|
15
|
+
}
|
16
|
+
|
17
|
+
template <class SRC>
|
18
|
+
static void WriteData(data_ptr_t target, SRC input) {
|
19
|
+
memcpy(target, input.GetData(), input.GetSize());
|
20
|
+
}
|
21
|
+
};
|
22
|
+
|
23
|
+
struct ArrowUUIDConverter {
|
24
|
+
template <class SRC>
|
25
|
+
static idx_t GetLength(SRC input) {
|
26
|
+
return UUID::STRING_SIZE;
|
27
|
+
}
|
28
|
+
|
29
|
+
template <class SRC>
|
30
|
+
static void WriteData(data_ptr_t target, SRC input) {
|
31
|
+
UUID::ToString(input, char_ptr_cast(target));
|
32
|
+
}
|
33
|
+
};
|
34
|
+
|
35
|
+
template <class SRC = string_t, class OP = ArrowVarcharConverter, class BUFTYPE = uint64_t>
|
36
|
+
struct ArrowVarcharData {
|
37
|
+
static void Initialize(ArrowAppendData &result, const LogicalType &type, idx_t capacity) {
|
38
|
+
result.main_buffer.reserve((capacity + 1) * sizeof(BUFTYPE));
|
39
|
+
|
40
|
+
result.aux_buffer.reserve(capacity);
|
41
|
+
}
|
42
|
+
|
43
|
+
static void Append(ArrowAppendData &append_data, Vector &input, idx_t from, idx_t to, idx_t input_size) {
|
44
|
+
idx_t size = to - from;
|
45
|
+
UnifiedVectorFormat format;
|
46
|
+
input.ToUnifiedFormat(input_size, format);
|
47
|
+
|
48
|
+
// resize the validity mask and set up the validity buffer for iteration
|
49
|
+
ResizeValidity(append_data.validity, append_data.row_count + size);
|
50
|
+
auto validity_data = (uint8_t *)append_data.validity.data();
|
51
|
+
|
52
|
+
// resize the offset buffer - the offset buffer holds the offsets into the child array
|
53
|
+
append_data.main_buffer.resize(append_data.main_buffer.size() + sizeof(BUFTYPE) * (size + 1));
|
54
|
+
auto data = UnifiedVectorFormat::GetData<SRC>(format);
|
55
|
+
auto offset_data = append_data.main_buffer.GetData<BUFTYPE>();
|
56
|
+
if (append_data.row_count == 0) {
|
57
|
+
// first entry
|
58
|
+
offset_data[0] = 0;
|
59
|
+
}
|
60
|
+
// now append the string data to the auxiliary buffer
|
61
|
+
// the auxiliary buffer's length depends on the string lengths, so we resize as required
|
62
|
+
auto last_offset = offset_data[append_data.row_count];
|
63
|
+
idx_t max_offset = append_data.row_count + to - from;
|
64
|
+
if (max_offset > NumericLimits<uint32_t>::Maximum() &&
|
65
|
+
append_data.options.offset_size == ArrowOffsetSize::REGULAR) {
|
66
|
+
throw InvalidInputException("Arrow Appender: The maximum total string size for regular string buffers is "
|
67
|
+
"%u but the offset of %lu exceeds this.",
|
68
|
+
NumericLimits<uint32_t>::Maximum(), max_offset);
|
69
|
+
}
|
70
|
+
for (idx_t i = from; i < to; i++) {
|
71
|
+
auto source_idx = format.sel->get_index(i);
|
72
|
+
auto offset_idx = append_data.row_count + i + 1 - from;
|
73
|
+
|
74
|
+
if (!format.validity.RowIsValid(source_idx)) {
|
75
|
+
uint8_t current_bit;
|
76
|
+
idx_t current_byte;
|
77
|
+
GetBitPosition(append_data.row_count + i - from, current_byte, current_bit);
|
78
|
+
SetNull(append_data, validity_data, current_byte, current_bit);
|
79
|
+
offset_data[offset_idx] = last_offset;
|
80
|
+
continue;
|
81
|
+
}
|
82
|
+
|
83
|
+
auto string_length = OP::GetLength(data[source_idx]);
|
84
|
+
|
85
|
+
// append the offset data
|
86
|
+
auto current_offset = last_offset + string_length;
|
87
|
+
offset_data[offset_idx] = current_offset;
|
88
|
+
|
89
|
+
// resize the string buffer if required, and write the string data
|
90
|
+
append_data.aux_buffer.resize(current_offset);
|
91
|
+
OP::WriteData(append_data.aux_buffer.data() + last_offset, data[source_idx]);
|
92
|
+
|
93
|
+
last_offset = current_offset;
|
94
|
+
}
|
95
|
+
append_data.row_count += size;
|
96
|
+
}
|
97
|
+
|
98
|
+
//! Publishes the appended data into the resulting ArrowArray.
//! Three buffer slots are announced; slot 1 receives the main buffer (which the append
//! routine fills with offsets) and slot 2 receives the auxiliary buffer (which holds the
//! string bytes). Slot 0 is not touched here. The `type` argument is unused.
static void Finalize(ArrowAppendData &append_data, const LogicalType &type, ArrowArray *result) {
	auto &array = *result;
	array.n_buffers = 3;
	array.buffers[2] = append_data.aux_buffer.data();
	array.buffers[1] = append_data.main_buffer.data();
}
|
103
|
+
};
|
104
|
+
|
105
|
+
} // namespace duckdb
|
@@ -27,6 +27,11 @@ public:
|
|
27
27
|
//! Returns the underlying arrow array
|
28
28
|
DUCKDB_API ArrowArray Finalize();
|
29
29
|
|
30
|
+
public:
|
31
|
+
static void ReleaseArray(ArrowArray *array);
|
32
|
+
static ArrowArray *FinalizeChild(const LogicalType &type, ArrowAppendData &append_data);
|
33
|
+
static unique_ptr<ArrowAppendData> InitializeChild(const LogicalType &type, idx_t capacity, ArrowOptions &options);
|
34
|
+
|
30
35
|
private:
|
31
36
|
//! The types of the chunks that will be appended in
|
32
37
|
vector<LogicalType> types;
|
@@ -8,7 +8,7 @@
|
|
8
8
|
|
9
9
|
#pragma once
|
10
10
|
|
11
|
-
#include "duckdb/common/
|
11
|
+
#include "duckdb/common/common.hpp"
|
12
12
|
#include "duckdb/common/multi_file_reader_options.hpp"
|
13
13
|
#include "duckdb/common/enums/file_glob_options.hpp"
|
14
14
|
#include "duckdb/common/union_by_name.hpp"
|
@@ -32,6 +32,8 @@ struct HivePartitioningIndex {
|
|
32
32
|
|
33
33
|
DUCKDB_API void Serialize(Serializer &serializer) const;
|
34
34
|
DUCKDB_API static HivePartitioningIndex Deserialize(Deserializer &source);
|
35
|
+
DUCKDB_API void FormatSerialize(FormatSerializer &serializer) const;
|
36
|
+
DUCKDB_API static HivePartitioningIndex FormatDeserialize(FormatDeserializer &deserializer);
|
35
37
|
};
|
36
38
|
|
37
39
|
//! The bind data for the multi-file reader, obtained through MultiFileReader::BindReader
|
@@ -43,6 +45,8 @@ struct MultiFileReaderBindData {
|
|
43
45
|
|
44
46
|
DUCKDB_API void Serialize(Serializer &serializer) const;
|
45
47
|
DUCKDB_API static MultiFileReaderBindData Deserialize(Deserializer &source);
|
48
|
+
DUCKDB_API void FormatSerialize(FormatSerializer &serializer) const;
|
49
|
+
DUCKDB_API static MultiFileReaderBindData FormatDeserialize(FormatDeserializer &deserializer);
|
46
50
|
};
|
47
51
|
|
48
52
|
struct MultiFileFilterEntry {
|
@@ -28,6 +28,8 @@ struct MultiFileReaderOptions {
|
|
28
28
|
|
29
29
|
DUCKDB_API void Serialize(Serializer &serializer) const;
|
30
30
|
DUCKDB_API static MultiFileReaderOptions Deserialize(Deserializer &source);
|
31
|
+
DUCKDB_API void FormatSerialize(FormatSerializer &serializer) const;
|
32
|
+
DUCKDB_API static MultiFileReaderOptions FormatDeserialize(FormatDeserializer &source);
|
31
33
|
DUCKDB_API void AddBatchInfo(BindInfo &bind_info) const;
|
32
34
|
DUCKDB_API void AutoDetectHivePartitioning(const vector<string> &files, ClientContext &context);
|
33
35
|
DUCKDB_API static bool AutoDetectHivePartitioningInternal(const vector<string> &files);
|
@@ -127,6 +127,16 @@ public:
|
|
127
127
|
return data.Unset<T>();
|
128
128
|
}
|
129
129
|
|
130
|
+
// Manually begin an object - should be followed by EndObject
|
131
|
+
void BeginObject(const char *tag) {
|
132
|
+
SetTag(tag);
|
133
|
+
OnObjectBegin();
|
134
|
+
}
|
135
|
+
|
136
|
+
void EndObject() {
|
137
|
+
OnObjectEnd();
|
138
|
+
}
|
139
|
+
|
130
140
|
private:
|
131
141
|
// Deserialize anything implementing a FormatDeserialize method
|
132
142
|
template <typename T = void>
|
@@ -208,6 +218,28 @@ private:
|
|
208
218
|
return map;
|
209
219
|
}
|
210
220
|
|
221
|
+
template <typename T = void>
|
222
|
+
inline typename std::enable_if<is_map<T>::value, T>::type Read() {
|
223
|
+
using KEY_TYPE = typename is_map<T>::KEY_TYPE;
|
224
|
+
using VALUE_TYPE = typename is_map<T>::VALUE_TYPE;
|
225
|
+
|
226
|
+
T map;
|
227
|
+
auto size = OnMapBegin();
|
228
|
+
for (idx_t i = 0; i < size; i++) {
|
229
|
+
OnMapEntryBegin();
|
230
|
+
OnMapKeyBegin();
|
231
|
+
auto key = Read<KEY_TYPE>();
|
232
|
+
OnMapKeyEnd();
|
233
|
+
OnMapValueBegin();
|
234
|
+
auto value = Read<VALUE_TYPE>();
|
235
|
+
OnMapValueEnd();
|
236
|
+
OnMapEntryEnd();
|
237
|
+
map[std::move(key)] = std::move(value);
|
238
|
+
}
|
239
|
+
OnMapEnd();
|
240
|
+
return map;
|
241
|
+
}
|
242
|
+
|
211
243
|
// Deserialize an unordered set
|
212
244
|
template <typename T = void>
|
213
245
|
inline typename std::enable_if<is_unordered_set<T>::value, T>::type Read() {
|