duckdb 0.8.2-dev2068.0 → 0.8.2-dev2133.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (83) hide show
  1. package/binding.gyp +4 -0
  2. package/package.json +1 -1
  3. package/src/duckdb/extension/json/buffered_json_reader.cpp +2 -0
  4. package/src/duckdb/extension/json/include/buffered_json_reader.hpp +5 -19
  5. package/src/duckdb/extension/json/include/json_enums.hpp +60 -0
  6. package/src/duckdb/extension/json/include/json_scan.hpp +14 -10
  7. package/src/duckdb/extension/json/include/json_transform.hpp +3 -0
  8. package/src/duckdb/extension/json/json_enums.cpp +105 -0
  9. package/src/duckdb/extension/json/json_functions/json_transform.cpp +2 -0
  10. package/src/duckdb/extension/json/json_scan.cpp +44 -0
  11. package/src/duckdb/extension/json/serialize_json.cpp +92 -0
  12. package/src/duckdb/extension/parquet/include/parquet_reader.hpp +3 -0
  13. package/src/duckdb/extension/parquet/parquet_extension.cpp +23 -0
  14. package/src/duckdb/extension/parquet/parquet_reader.cpp +3 -0
  15. package/src/duckdb/extension/parquet/serialize_parquet.cpp +26 -0
  16. package/src/duckdb/src/common/arrow/appender/bool_data.cpp +44 -0
  17. package/src/duckdb/src/common/arrow/appender/list_data.cpp +78 -0
  18. package/src/duckdb/src/common/arrow/appender/map_data.cpp +86 -0
  19. package/src/duckdb/src/common/arrow/appender/struct_data.cpp +45 -0
  20. package/src/duckdb/src/common/arrow/appender/union_data.cpp +70 -0
  21. package/src/duckdb/src/common/arrow/arrow_appender.cpp +89 -727
  22. package/src/duckdb/src/common/arrow/arrow_wrapper.cpp +2 -1
  23. package/src/duckdb/src/common/local_file_system.cpp +17 -14
  24. package/src/duckdb/src/common/serializer/format_serializer.cpp +15 -0
  25. package/src/duckdb/src/core_functions/aggregate/holistic/approximate_quantile.cpp +26 -0
  26. package/src/duckdb/src/core_functions/aggregate/holistic/quantile.cpp +47 -0
  27. package/src/duckdb/src/core_functions/aggregate/holistic/reservoir_quantile.cpp +28 -0
  28. package/src/duckdb/src/core_functions/scalar/date/strftime.cpp +10 -0
  29. package/src/duckdb/src/core_functions/scalar/list/list_lambdas.cpp +22 -3
  30. package/src/duckdb/src/function/aggregate/distributive/count.cpp +0 -11
  31. package/src/duckdb/src/function/aggregate/sorted_aggregate_function.cpp +1 -9
  32. package/src/duckdb/src/function/scalar/system/aggregate_export.cpp +27 -0
  33. package/src/duckdb/src/function/scalar_function.cpp +2 -1
  34. package/src/duckdb/src/function/table/read_csv.cpp +18 -0
  35. package/src/duckdb/src/function/table/table_scan.cpp +35 -0
  36. package/src/duckdb/src/function/table/version/pragma_version.cpp +2 -2
  37. package/src/duckdb/src/function/table_function.cpp +4 -3
  38. package/src/duckdb/src/include/duckdb/common/arrow/appender/append_data.hpp +109 -0
  39. package/src/duckdb/src/include/duckdb/common/arrow/appender/bool_data.hpp +15 -0
  40. package/src/duckdb/src/include/duckdb/common/arrow/appender/enum_data.hpp +69 -0
  41. package/src/duckdb/src/include/duckdb/common/arrow/appender/list.hpp +8 -0
  42. package/src/duckdb/src/include/duckdb/common/arrow/appender/list_data.hpp +18 -0
  43. package/src/duckdb/src/include/duckdb/common/arrow/appender/map_data.hpp +18 -0
  44. package/src/duckdb/src/include/duckdb/common/arrow/appender/scalar_data.hpp +88 -0
  45. package/src/duckdb/src/include/duckdb/common/arrow/appender/struct_data.hpp +18 -0
  46. package/src/duckdb/src/include/duckdb/common/arrow/appender/union_data.hpp +21 -0
  47. package/src/duckdb/src/include/duckdb/common/arrow/appender/varchar_data.hpp +105 -0
  48. package/src/duckdb/src/include/duckdb/common/arrow/arrow_appender.hpp +5 -0
  49. package/src/duckdb/src/include/duckdb/common/multi_file_reader.hpp +5 -1
  50. package/src/duckdb/src/include/duckdb/common/multi_file_reader_options.hpp +2 -0
  51. package/src/duckdb/src/include/duckdb/common/serializer/format_deserializer.hpp +32 -0
  52. package/src/duckdb/src/include/duckdb/common/serializer/format_serializer.hpp +45 -15
  53. package/src/duckdb/src/include/duckdb/common/serializer/serialization_traits.hpp +10 -0
  54. package/src/duckdb/src/include/duckdb/execution/operator/persistent/csv_reader_options.hpp +2 -0
  55. package/src/duckdb/src/include/duckdb/function/aggregate_function.hpp +11 -2
  56. package/src/duckdb/src/include/duckdb/function/function_serialization.hpp +81 -0
  57. package/src/duckdb/src/include/duckdb/function/scalar/strftime_format.hpp +8 -0
  58. package/src/duckdb/src/include/duckdb/function/scalar_function.hpp +8 -0
  59. package/src/duckdb/src/include/duckdb/function/table/read_csv.hpp +7 -0
  60. package/src/duckdb/src/include/duckdb/function/table_function.hpp +8 -0
  61. package/src/duckdb/src/include/duckdb/planner/expression/bound_aggregate_expression.hpp +3 -0
  62. package/src/duckdb/src/include/duckdb/planner/expression/bound_function_expression.hpp +4 -0
  63. package/src/duckdb/src/include/duckdb/planner/expression/bound_window_expression.hpp +3 -0
  64. package/src/duckdb/src/include/duckdb/planner/filter/conjunction_filter.hpp +4 -0
  65. package/src/duckdb/src/include/duckdb/planner/filter/constant_filter.hpp +2 -0
  66. package/src/duckdb/src/include/duckdb/planner/filter/null_filter.hpp +4 -0
  67. package/src/duckdb/src/include/duckdb/planner/operator/logical_copy_to_file.hpp +2 -0
  68. package/src/duckdb/src/include/duckdb/planner/operator/logical_get.hpp +7 -1
  69. package/src/duckdb/src/include/duckdb/planner/table_filter.hpp +7 -1
  70. package/src/duckdb/src/main/extension/extension_helper.cpp +13 -0
  71. package/src/duckdb/src/parallel/executor.cpp +1 -1
  72. package/src/duckdb/src/planner/expression/bound_aggregate_expression.cpp +23 -0
  73. package/src/duckdb/src/planner/expression/bound_function_expression.cpp +22 -0
  74. package/src/duckdb/src/planner/expression/bound_window_expression.cpp +47 -0
  75. package/src/duckdb/src/planner/operator/logical_copy_to_file.cpp +8 -0
  76. package/src/duckdb/src/planner/operator/logical_get.cpp +69 -0
  77. package/src/duckdb/src/storage/serialization/serialize_expression.cpp +9 -0
  78. package/src/duckdb/src/storage/serialization/serialize_logical_operator.cpp +6 -0
  79. package/src/duckdb/src/storage/serialization/serialize_nodes.cpp +190 -0
  80. package/src/duckdb/src/storage/serialization/serialize_table_filter.cpp +97 -0
  81. package/src/duckdb/ub_src_common_arrow_appender.cpp +10 -0
  82. package/src/duckdb/ub_src_common_serializer.cpp +2 -0
  83. package/src/duckdb/ub_src_storage_serialization.cpp +2 -0
@@ -15,6 +15,8 @@
15
15
  #include "duckdb/catalog/dependency_list.hpp"
16
16
  #include "duckdb/function/function_set.hpp"
17
17
  #include "duckdb/storage/table/scan_state.hpp"
18
+ #include "duckdb/common/serializer/format_serializer.hpp"
19
+ #include "duckdb/common/serializer/format_deserializer.hpp"
18
20
 
19
21
  namespace duckdb {
20
22
 
@@ -447,6 +449,35 @@ static unique_ptr<FunctionData> TableScanDeserialize(PlanDeserializationState &s
447
449
  return std::move(result);
448
450
  }
449
451
 
452
+ static void TableScanFormatSerialize(FormatSerializer &serializer, const optional_ptr<FunctionData> bind_data_p,
453
+ const TableFunction &function) {
454
+ auto &bind_data = bind_data_p->Cast<TableScanBindData>();
455
+ serializer.WriteProperty("catalog", bind_data.table.schema.catalog.GetName());
456
+ serializer.WriteProperty("schema", bind_data.table.schema.name);
457
+ serializer.WriteProperty("table", bind_data.table.name);
458
+ serializer.WriteProperty("is_index_scan", bind_data.is_index_scan);
459
+ serializer.WriteProperty("is_create_index", bind_data.is_create_index);
460
+ serializer.WriteProperty("result_ids", bind_data.result_ids);
461
+ serializer.WriteProperty("result_ids", bind_data.result_ids);
462
+ }
463
+
464
+ static unique_ptr<FunctionData> TableScanFormatDeserialize(FormatDeserializer &deserializer, TableFunction &function) {
465
+ auto catalog = deserializer.ReadProperty<string>("catalog");
466
+ auto schema = deserializer.ReadProperty<string>("schema");
467
+ auto table = deserializer.ReadProperty<string>("table");
468
+ auto &catalog_entry =
469
+ Catalog::GetEntry<TableCatalogEntry>(deserializer.Get<ClientContext &>(), catalog, schema, table);
470
+ if (catalog_entry.type != CatalogType::TABLE_ENTRY) {
471
+ throw SerializationException("Cant find table for %s.%s", schema, table);
472
+ }
473
+ auto result = make_uniq<TableScanBindData>(catalog_entry.Cast<DuckTableEntry>());
474
+ deserializer.ReadProperty("is_index_scan", result->is_index_scan);
475
+ deserializer.ReadProperty("is_create_index", result->is_create_index);
476
+ deserializer.ReadProperty("result_ids", result->result_ids);
477
+ deserializer.ReadProperty("result_ids", result->result_ids);
478
+ return std::move(result);
479
+ }
480
+
450
481
  TableFunction TableScanFunction::GetIndexScanFunction() {
451
482
  TableFunction scan_function("index_scan", {}, IndexScanFunction);
452
483
  scan_function.init_local = nullptr;
@@ -462,6 +493,8 @@ TableFunction TableScanFunction::GetIndexScanFunction() {
462
493
  scan_function.filter_pushdown = false;
463
494
  scan_function.serialize = TableScanSerialize;
464
495
  scan_function.deserialize = TableScanDeserialize;
496
+ scan_function.format_serialize = TableScanFormatSerialize;
497
+ scan_function.format_deserialize = TableScanFormatDeserialize;
465
498
  return scan_function;
466
499
  }
467
500
 
@@ -482,6 +515,8 @@ TableFunction TableScanFunction::GetFunction() {
482
515
  scan_function.filter_prune = true;
483
516
  scan_function.serialize = TableScanSerialize;
484
517
  scan_function.deserialize = TableScanDeserialize;
518
+ scan_function.format_serialize = TableScanFormatSerialize;
519
+ scan_function.format_deserialize = TableScanFormatDeserialize;
485
520
  return scan_function;
486
521
  }
487
522
 
@@ -1,8 +1,8 @@
1
1
  #ifndef DUCKDB_VERSION
2
- #define DUCKDB_VERSION "0.8.2-dev2068"
2
+ #define DUCKDB_VERSION "0.8.2-dev2133"
3
3
  #endif
4
4
  #ifndef DUCKDB_SOURCE_ID
5
- #define DUCKDB_SOURCE_ID "83481168e0"
5
+ #define DUCKDB_SOURCE_ID "726b6d1566"
6
6
  #endif
7
7
  #include "duckdb/function/table/system_functions.hpp"
8
8
  #include "duckdb/main/database.hpp"
@@ -18,8 +18,8 @@ TableFunction::TableFunction(string name, vector<LogicalType> arguments, table_f
18
18
  init_global(init_global), init_local(init_local), function(function), in_out_function(nullptr),
19
19
  in_out_function_final(nullptr), statistics(nullptr), dependency(nullptr), cardinality(nullptr),
20
20
  pushdown_complex_filter(nullptr), to_string(nullptr), table_scan_progress(nullptr), get_batch_index(nullptr),
21
- get_batch_info(nullptr), serialize(nullptr), deserialize(nullptr), projection_pushdown(false),
22
- filter_pushdown(false), filter_prune(false) {
21
+ get_batch_info(nullptr), serialize(nullptr), deserialize(nullptr), format_serialize(nullptr),
22
+ format_deserialize(nullptr), projection_pushdown(false), filter_pushdown(false), filter_prune(false) {
23
23
  }
24
24
 
25
25
  TableFunction::TableFunction(const vector<LogicalType> &arguments, table_function_t function,
@@ -32,7 +32,8 @@ TableFunction::TableFunction()
32
32
  init_local(nullptr), function(nullptr), in_out_function(nullptr), statistics(nullptr), dependency(nullptr),
33
33
  cardinality(nullptr), pushdown_complex_filter(nullptr), to_string(nullptr), table_scan_progress(nullptr),
34
34
  get_batch_index(nullptr), get_batch_info(nullptr), serialize(nullptr), deserialize(nullptr),
35
- projection_pushdown(false), filter_pushdown(false), filter_prune(false) {
35
+ format_serialize(nullptr), format_deserialize(nullptr), projection_pushdown(false), filter_pushdown(false),
36
+ filter_prune(false) {
36
37
  }
37
38
 
38
39
  bool TableFunction::Equal(const TableFunction &rhs) const {
@@ -0,0 +1,109 @@
1
+ #pragma once
2
+
3
+ #include "duckdb/common/types.hpp"
4
+ #include "duckdb/common/types/vector.hpp"
5
+ #include "duckdb/common/arrow/arrow.hpp"
6
+ #include "duckdb/common/arrow/arrow_buffer.hpp"
7
+ #include "duckdb/common/arrow/arrow_options.hpp"
8
+ #include "duckdb/common/array.hpp"
9
+
10
+ namespace duckdb {
11
+
12
+ //===--------------------------------------------------------------------===//
13
+ // Arrow append data
14
+ //===--------------------------------------------------------------------===//
15
+ typedef void (*initialize_t)(ArrowAppendData &result, const LogicalType &type, idx_t capacity);
16
+ // append_data: The arrow array we're appending into
17
+ // input: The data we're appending
18
+ // from: The offset into the input we're scanning
19
+ // to: The last index of the input we're scanning
20
+ // input_size: The total size of the 'input' Vector.
21
+ typedef void (*append_vector_t)(ArrowAppendData &append_data, Vector &input, idx_t from, idx_t to, idx_t input_size);
22
+ typedef void (*finalize_t)(ArrowAppendData &append_data, const LogicalType &type, ArrowArray *result);
23
+
24
+ // This struct is used to save state for appending a column
25
+ // afterwards the ownership is passed to the arrow array, as 'private_data'
26
+ // FIXME: we should separate the append state variables from the variables required by the ArrowArray into
27
+ // ArrowAppendState
28
+ struct ArrowAppendData {
29
+ explicit ArrowAppendData(ArrowOptions &options_p) : options(options_p) {
30
+ }
31
+ // the buffers of the arrow vector
32
+ ArrowBuffer validity;
33
+ ArrowBuffer main_buffer;
34
+ ArrowBuffer aux_buffer;
35
+
36
+ idx_t row_count = 0;
37
+ idx_t null_count = 0;
38
+
39
+ // function pointers for construction
40
+ initialize_t initialize = nullptr;
41
+ append_vector_t append_vector = nullptr;
42
+ finalize_t finalize = nullptr;
43
+
44
+ // child data (if any)
45
+ vector<unique_ptr<ArrowAppendData>> child_data;
46
+
47
+ // the arrow array C API data, only set after Finalize
48
+ unique_ptr<ArrowArray> array;
49
+ duckdb::array<const void *, 3> buffers = {{nullptr, nullptr, nullptr}};
50
+ vector<ArrowArray *> child_pointers;
51
+
52
+ ArrowOptions options;
53
+ };
54
+
55
+ //===--------------------------------------------------------------------===//
56
+ // Append Helper Functions
57
+ //===--------------------------------------------------------------------===//
58
+ static void GetBitPosition(idx_t row_idx, idx_t &current_byte, uint8_t &current_bit) {
59
+ current_byte = row_idx / 8;
60
+ current_bit = row_idx % 8;
61
+ }
62
+
63
+ static void UnsetBit(uint8_t *data, idx_t current_byte, uint8_t current_bit) {
64
+ data[current_byte] &= ~((uint64_t)1 << current_bit);
65
+ }
66
+
67
+ static void NextBit(idx_t &current_byte, uint8_t &current_bit) {
68
+ current_bit++;
69
+ if (current_bit == 8) {
70
+ current_byte++;
71
+ current_bit = 0;
72
+ }
73
+ }
74
+
75
+ static void ResizeValidity(ArrowBuffer &buffer, idx_t row_count) {
76
+ auto byte_count = (row_count + 7) / 8;
77
+ buffer.resize(byte_count, 0xFF);
78
+ }
79
+
80
+ static void SetNull(ArrowAppendData &append_data, uint8_t *validity_data, idx_t current_byte, uint8_t current_bit) {
81
+ UnsetBit(validity_data, current_byte, current_bit);
82
+ append_data.null_count++;
83
+ }
84
+
85
+ static void AppendValidity(ArrowAppendData &append_data, UnifiedVectorFormat &format, idx_t from, idx_t to) {
86
+ // resize the buffer, filling the validity buffer with all valid values
87
+ idx_t size = to - from;
88
+ ResizeValidity(append_data.validity, append_data.row_count + size);
89
+ if (format.validity.AllValid()) {
90
+ // if all values are valid we don't need to do anything else
91
+ return;
92
+ }
93
+
94
+ // otherwise we iterate through the validity mask
95
+ auto validity_data = (uint8_t *)append_data.validity.data();
96
+ uint8_t current_bit;
97
+ idx_t current_byte;
98
+ GetBitPosition(append_data.row_count, current_byte, current_bit);
99
+ for (idx_t i = from; i < to; i++) {
100
+ auto source_idx = format.sel->get_index(i);
101
+ // append the validity mask
102
+ if (!format.validity.RowIsValid(source_idx)) {
103
+ SetNull(append_data, validity_data, current_byte, current_bit);
104
+ }
105
+ NextBit(current_byte, current_bit);
106
+ }
107
+ }
108
+
109
+ } // namespace duckdb
@@ -0,0 +1,15 @@
1
+ #pragma once
2
+
3
+ #include "duckdb/common/arrow/appender/append_data.hpp"
4
+ #include "duckdb/common/types/vector.hpp"
5
+
6
+ namespace duckdb {
7
+
8
+ struct ArrowBoolData {
9
+ public:
10
+ static void Initialize(ArrowAppendData &result, const LogicalType &type, idx_t capacity);
11
+ static void Append(ArrowAppendData &append_data, Vector &input, idx_t from, idx_t to, idx_t input_size);
12
+ static void Finalize(ArrowAppendData &append_data, const LogicalType &type, ArrowArray *result);
13
+ };
14
+
15
+ } // namespace duckdb
@@ -0,0 +1,69 @@
1
+ #pragma once
2
+
3
+ #include "duckdb/common/arrow/appender/append_data.hpp"
4
+ #include "duckdb/common/arrow/appender/scalar_data.hpp"
5
+
6
+ namespace duckdb {
7
+
8
+ //===--------------------------------------------------------------------===//
9
+ // Enums
10
+ //===--------------------------------------------------------------------===//
11
+ template <class TGT>
12
+ struct ArrowEnumData : public ArrowScalarBaseData<TGT> {
13
+ static idx_t GetLength(string_t input) {
14
+ return input.GetSize();
15
+ }
16
+ static void WriteData(data_ptr_t target, string_t input) {
17
+ memcpy(target, input.GetData(), input.GetSize());
18
+ }
19
+ static void EnumAppendVector(ArrowAppendData &append_data, const Vector &input, idx_t size) {
20
+ D_ASSERT(input.GetVectorType() == VectorType::FLAT_VECTOR);
21
+
22
+ // resize the validity mask and set up the validity buffer for iteration
23
+ ResizeValidity(append_data.validity, append_data.row_count + size);
24
+
25
+ // resize the offset buffer - the offset buffer holds the offsets into the child array
26
+ append_data.main_buffer.resize(append_data.main_buffer.size() + sizeof(uint32_t) * (size + 1));
27
+ auto data = FlatVector::GetData<string_t>(input);
28
+ auto offset_data = append_data.main_buffer.GetData<uint32_t>();
29
+ if (append_data.row_count == 0) {
30
+ // first entry
31
+ offset_data[0] = 0;
32
+ }
33
+ // now append the string data to the auxiliary buffer
34
+ // the auxiliary buffer's length depends on the string lengths, so we resize as required
35
+ auto last_offset = offset_data[append_data.row_count];
36
+ for (idx_t i = 0; i < size; i++) {
37
+ auto offset_idx = append_data.row_count + i + 1;
38
+
39
+ auto string_length = GetLength(data[i]);
40
+
41
+ // append the offset data
42
+ auto current_offset = last_offset + string_length;
43
+ offset_data[offset_idx] = current_offset;
44
+
45
+ // resize the string buffer if required, and write the string data
46
+ append_data.aux_buffer.resize(current_offset);
47
+ WriteData(append_data.aux_buffer.data() + last_offset, data[i]);
48
+
49
+ last_offset = current_offset;
50
+ }
51
+ append_data.row_count += size;
52
+ }
53
+ static void Initialize(ArrowAppendData &result, const LogicalType &type, idx_t capacity) {
54
+ result.main_buffer.reserve(capacity * sizeof(TGT));
55
+ // construct the enum child data
56
+ auto enum_data = ArrowAppender::InitializeChild(LogicalType::VARCHAR, EnumType::GetSize(type), result.options);
57
+ EnumAppendVector(*enum_data, EnumType::GetValuesInsertOrder(type), EnumType::GetSize(type));
58
+ result.child_data.push_back(std::move(enum_data));
59
+ }
60
+
61
+ static void Finalize(ArrowAppendData &append_data, const LogicalType &type, ArrowArray *result) {
62
+ result->n_buffers = 2;
63
+ result->buffers[1] = append_data.main_buffer.data();
64
+ // finalize the enum child data, and assign it to the dictionary
65
+ result->dictionary = ArrowAppender::FinalizeChild(LogicalType::VARCHAR, *append_data.child_data[0]);
66
+ }
67
+ };
68
+
69
+ } // namespace duckdb
@@ -0,0 +1,8 @@
1
+ #include "duckdb/common/arrow/appender/bool_data.hpp"
2
+ #include "duckdb/common/arrow/appender/enum_data.hpp"
3
+ #include "duckdb/common/arrow/appender/list_data.hpp"
4
+ #include "duckdb/common/arrow/appender/map_data.hpp"
5
+ #include "duckdb/common/arrow/appender/scalar_data.hpp"
6
+ #include "duckdb/common/arrow/appender/struct_data.hpp"
7
+ #include "duckdb/common/arrow/appender/union_data.hpp"
8
+ #include "duckdb/common/arrow/appender/varchar_data.hpp"
@@ -0,0 +1,18 @@
1
+ #pragma once
2
+
3
+ #include "duckdb/common/arrow/appender/append_data.hpp"
4
+
5
+ namespace duckdb {
6
+
7
+ struct ArrowListData {
8
+ public:
9
+ static void Initialize(ArrowAppendData &result, const LogicalType &type, idx_t capacity);
10
+ static void Append(ArrowAppendData &append_data, Vector &input, idx_t from, idx_t to, idx_t input_size);
11
+ static void Finalize(ArrowAppendData &append_data, const LogicalType &type, ArrowArray *result);
12
+
13
+ public:
14
+ static void AppendOffsets(ArrowAppendData &append_data, UnifiedVectorFormat &format, idx_t from, idx_t to,
15
+ vector<sel_t> &child_sel);
16
+ };
17
+
18
+ } // namespace duckdb
@@ -0,0 +1,18 @@
1
+ #pragma once
2
+
3
+ #include "duckdb/common/arrow/arrow_appender.hpp"
4
+ #include "duckdb/common/arrow/appender/append_data.hpp"
5
+
6
+ namespace duckdb {
7
+
8
+ //===--------------------------------------------------------------------===//
9
+ // Maps
10
+ //===--------------------------------------------------------------------===//
11
+ struct ArrowMapData {
12
+ public:
13
+ static void Initialize(ArrowAppendData &result, const LogicalType &type, idx_t capacity);
14
+ static void Append(ArrowAppendData &append_data, Vector &input, idx_t from, idx_t to, idx_t input_size);
15
+ static void Finalize(ArrowAppendData &append_data, const LogicalType &type, ArrowArray *result);
16
+ };
17
+
18
+ } // namespace duckdb
@@ -0,0 +1,88 @@
1
+ #pragma once
2
+
3
+ #include "duckdb/common/arrow/appender/append_data.hpp"
4
+ #include "duckdb/function/table/arrow.hpp"
5
+
6
+ namespace duckdb {
7
+
8
+ //===--------------------------------------------------------------------===//
9
+ // Scalar Types
10
+ //===--------------------------------------------------------------------===//
11
//! Default conversion policy for scalar appends: values are passed through unchanged
struct ArrowScalarConverter {
	//! Returns the input converted to TGT by implicit conversion
	template <class TGT, class SRC>
	static TGT Operation(SRC input) {
		return input;
	}

	//! NULL rows are not skipped: Operation is applied to every row
	static bool SkipNulls() {
		return false;
	}

	//! Intentionally a no-op
	template <class TGT>
	static void SetNull(TGT &value) {
	}
};
25
+
26
+ struct ArrowIntervalConverter {
27
+ template <class TGT, class SRC>
28
+ static TGT Operation(SRC input) {
29
+ ArrowInterval result;
30
+ result.months = input.months;
31
+ result.days = input.days;
32
+ result.nanoseconds = input.micros * Interval::NANOS_PER_MICRO;
33
+ return result;
34
+ }
35
+
36
+ static bool SkipNulls() {
37
+ return true;
38
+ }
39
+
40
+ template <class TGT>
41
+ static void SetNull(TGT &value) {
42
+ }
43
+ };
44
+
45
+ template <class TGT, class SRC = TGT, class OP = ArrowScalarConverter>
46
+ struct ArrowScalarBaseData {
47
+ static void Append(ArrowAppendData &append_data, Vector &input, idx_t from, idx_t to, idx_t input_size) {
48
+ D_ASSERT(to >= from);
49
+ idx_t size = to - from;
50
+ D_ASSERT(size <= input_size);
51
+ UnifiedVectorFormat format;
52
+ input.ToUnifiedFormat(input_size, format);
53
+
54
+ // append the validity mask
55
+ AppendValidity(append_data, format, from, to);
56
+
57
+ // append the main data
58
+ append_data.main_buffer.resize(append_data.main_buffer.size() + sizeof(TGT) * size);
59
+ auto data = UnifiedVectorFormat::GetData<SRC>(format);
60
+ auto result_data = append_data.main_buffer.GetData<TGT>();
61
+
62
+ for (idx_t i = from; i < to; i++) {
63
+ auto source_idx = format.sel->get_index(i);
64
+ auto result_idx = append_data.row_count + i - from;
65
+
66
+ if (OP::SkipNulls() && !format.validity.RowIsValid(source_idx)) {
67
+ OP::template SetNull<TGT>(result_data[result_idx]);
68
+ continue;
69
+ }
70
+ result_data[result_idx] = OP::template Operation<TGT, SRC>(data[source_idx]);
71
+ }
72
+ append_data.row_count += size;
73
+ }
74
+ };
75
+
76
+ template <class TGT, class SRC = TGT, class OP = ArrowScalarConverter>
77
+ struct ArrowScalarData : public ArrowScalarBaseData<TGT, SRC, OP> {
78
+ static void Initialize(ArrowAppendData &result, const LogicalType &type, idx_t capacity) {
79
+ result.main_buffer.reserve(capacity * sizeof(TGT));
80
+ }
81
+
82
+ static void Finalize(ArrowAppendData &append_data, const LogicalType &type, ArrowArray *result) {
83
+ result->n_buffers = 2;
84
+ result->buffers[1] = append_data.main_buffer.data();
85
+ }
86
+ };
87
+
88
+ } // namespace duckdb
@@ -0,0 +1,18 @@
1
+ #pragma once
2
+
3
+ #include "duckdb/common/arrow/appender/append_data.hpp"
4
+ #include "duckdb/common/arrow/appender/scalar_data.hpp"
5
+
6
+ namespace duckdb {
7
+
8
+ //===--------------------------------------------------------------------===//
9
+ // Structs
10
+ //===--------------------------------------------------------------------===//
11
+ struct ArrowStructData {
12
+ public:
13
+ static void Initialize(ArrowAppendData &result, const LogicalType &type, idx_t capacity);
14
+ static void Append(ArrowAppendData &append_data, Vector &input, idx_t from, idx_t to, idx_t input_size);
15
+ static void Finalize(ArrowAppendData &append_data, const LogicalType &type, ArrowArray *result);
16
+ };
17
+
18
+ } // namespace duckdb
@@ -0,0 +1,21 @@
1
+ #pragma once
2
+
3
+ #include "duckdb/common/arrow/appender/append_data.hpp"
4
+
5
+ namespace duckdb {
6
+
7
+ //===--------------------------------------------------------------------===//
8
+ // Unions
9
+ //===--------------------------------------------------------------------===//
10
+ /**
11
+ * Based on https://arrow.apache.org/docs/format/Columnar.html#union-layout &
12
+ * https://arrow.apache.org/docs/format/CDataInterface.html
13
+ */
14
+ struct ArrowUnionData {
15
+ public:
16
+ static void Initialize(ArrowAppendData &result, const LogicalType &type, idx_t capacity);
17
+ static void Append(ArrowAppendData &append_data, Vector &input, idx_t from, idx_t to, idx_t input_size);
18
+ static void Finalize(ArrowAppendData &append_data, const LogicalType &type, ArrowArray *result);
19
+ };
20
+
21
+ } // namespace duckdb
@@ -0,0 +1,105 @@
1
+ #pragma once
2
+
3
+ #include "duckdb/common/arrow/appender/append_data.hpp"
4
+ #include "duckdb/common/arrow/appender/scalar_data.hpp"
5
+
6
+ namespace duckdb {
7
+
8
+ //===--------------------------------------------------------------------===//
9
+ // Varchar
10
+ //===--------------------------------------------------------------------===//
11
+ struct ArrowVarcharConverter {
12
+ template <class SRC>
13
+ static idx_t GetLength(SRC input) {
14
+ return input.GetSize();
15
+ }
16
+
17
+ template <class SRC>
18
+ static void WriteData(data_ptr_t target, SRC input) {
19
+ memcpy(target, input.GetData(), input.GetSize());
20
+ }
21
+ };
22
+
23
+ struct ArrowUUIDConverter {
24
+ template <class SRC>
25
+ static idx_t GetLength(SRC input) {
26
+ return UUID::STRING_SIZE;
27
+ }
28
+
29
+ template <class SRC>
30
+ static void WriteData(data_ptr_t target, SRC input) {
31
+ UUID::ToString(input, char_ptr_cast(target));
32
+ }
33
+ };
34
+
35
+ template <class SRC = string_t, class OP = ArrowVarcharConverter, class BUFTYPE = uint64_t>
36
+ struct ArrowVarcharData {
37
+ static void Initialize(ArrowAppendData &result, const LogicalType &type, idx_t capacity) {
38
+ result.main_buffer.reserve((capacity + 1) * sizeof(BUFTYPE));
39
+
40
+ result.aux_buffer.reserve(capacity);
41
+ }
42
+
43
+ static void Append(ArrowAppendData &append_data, Vector &input, idx_t from, idx_t to, idx_t input_size) {
44
+ idx_t size = to - from;
45
+ UnifiedVectorFormat format;
46
+ input.ToUnifiedFormat(input_size, format);
47
+
48
+ // resize the validity mask and set up the validity buffer for iteration
49
+ ResizeValidity(append_data.validity, append_data.row_count + size);
50
+ auto validity_data = (uint8_t *)append_data.validity.data();
51
+
52
+ // resize the offset buffer - the offset buffer holds the offsets into the child array
53
+ append_data.main_buffer.resize(append_data.main_buffer.size() + sizeof(BUFTYPE) * (size + 1));
54
+ auto data = UnifiedVectorFormat::GetData<SRC>(format);
55
+ auto offset_data = append_data.main_buffer.GetData<BUFTYPE>();
56
+ if (append_data.row_count == 0) {
57
+ // first entry
58
+ offset_data[0] = 0;
59
+ }
60
+ // now append the string data to the auxiliary buffer
61
+ // the auxiliary buffer's length depends on the string lengths, so we resize as required
62
+ auto last_offset = offset_data[append_data.row_count];
63
+ idx_t max_offset = append_data.row_count + to - from;
64
+ if (max_offset > NumericLimits<uint32_t>::Maximum() &&
65
+ append_data.options.offset_size == ArrowOffsetSize::REGULAR) {
66
+ throw InvalidInputException("Arrow Appender: The maximum total string size for regular string buffers is "
67
+ "%u but the offset of %lu exceeds this.",
68
+ NumericLimits<uint32_t>::Maximum(), max_offset);
69
+ }
70
+ for (idx_t i = from; i < to; i++) {
71
+ auto source_idx = format.sel->get_index(i);
72
+ auto offset_idx = append_data.row_count + i + 1 - from;
73
+
74
+ if (!format.validity.RowIsValid(source_idx)) {
75
+ uint8_t current_bit;
76
+ idx_t current_byte;
77
+ GetBitPosition(append_data.row_count + i - from, current_byte, current_bit);
78
+ SetNull(append_data, validity_data, current_byte, current_bit);
79
+ offset_data[offset_idx] = last_offset;
80
+ continue;
81
+ }
82
+
83
+ auto string_length = OP::GetLength(data[source_idx]);
84
+
85
+ // append the offset data
86
+ auto current_offset = last_offset + string_length;
87
+ offset_data[offset_idx] = current_offset;
88
+
89
+ // resize the string buffer if required, and write the string data
90
+ append_data.aux_buffer.resize(current_offset);
91
+ OP::WriteData(append_data.aux_buffer.data() + last_offset, data[source_idx]);
92
+
93
+ last_offset = current_offset;
94
+ }
95
+ append_data.row_count += size;
96
+ }
97
+
98
+ static void Finalize(ArrowAppendData &append_data, const LogicalType &type, ArrowArray *result) {
99
+ result->n_buffers = 3;
100
+ result->buffers[1] = append_data.main_buffer.data();
101
+ result->buffers[2] = append_data.aux_buffer.data();
102
+ }
103
+ };
104
+
105
+ } // namespace duckdb
@@ -27,6 +27,11 @@ public:
27
27
  //! Returns the underlying arrow array
28
28
  DUCKDB_API ArrowArray Finalize();
29
29
 
30
+ public:
31
+ static void ReleaseArray(ArrowArray *array);
32
+ static ArrowArray *FinalizeChild(const LogicalType &type, ArrowAppendData &append_data);
33
+ static unique_ptr<ArrowAppendData> InitializeChild(const LogicalType &type, idx_t capacity, ArrowOptions &options);
34
+
30
35
  private:
31
36
  //! The types of the chunks that will be appended in
32
37
  vector<LogicalType> types;
@@ -8,7 +8,7 @@
8
8
 
9
9
  #pragma once
10
10
 
11
- #include "duckdb/common/types.hpp"
11
+ #include "duckdb/common/common.hpp"
12
12
  #include "duckdb/common/multi_file_reader_options.hpp"
13
13
  #include "duckdb/common/enums/file_glob_options.hpp"
14
14
  #include "duckdb/common/union_by_name.hpp"
@@ -32,6 +32,8 @@ struct HivePartitioningIndex {
32
32
 
33
33
  DUCKDB_API void Serialize(Serializer &serializer) const;
34
34
  DUCKDB_API static HivePartitioningIndex Deserialize(Deserializer &source);
35
+ DUCKDB_API void FormatSerialize(FormatSerializer &serializer) const;
36
+ DUCKDB_API static HivePartitioningIndex FormatDeserialize(FormatDeserializer &deserializer);
35
37
  };
36
38
 
37
39
  //! The bind data for the multi-file reader, obtained through MultiFileReader::BindReader
@@ -43,6 +45,8 @@ struct MultiFileReaderBindData {
43
45
 
44
46
  DUCKDB_API void Serialize(Serializer &serializer) const;
45
47
  DUCKDB_API static MultiFileReaderBindData Deserialize(Deserializer &source);
48
+ DUCKDB_API void FormatSerialize(FormatSerializer &serializer) const;
49
+ DUCKDB_API static MultiFileReaderBindData FormatDeserialize(FormatDeserializer &deserializer);
46
50
  };
47
51
 
48
52
  struct MultiFileFilterEntry {
@@ -28,6 +28,8 @@ struct MultiFileReaderOptions {
28
28
 
29
29
  DUCKDB_API void Serialize(Serializer &serializer) const;
30
30
  DUCKDB_API static MultiFileReaderOptions Deserialize(Deserializer &source);
31
+ DUCKDB_API void FormatSerialize(FormatSerializer &serializer) const;
32
+ DUCKDB_API static MultiFileReaderOptions FormatDeserialize(FormatDeserializer &source);
31
33
  DUCKDB_API void AddBatchInfo(BindInfo &bind_info) const;
32
34
  DUCKDB_API void AutoDetectHivePartitioning(const vector<string> &files, ClientContext &context);
33
35
  DUCKDB_API static bool AutoDetectHivePartitioningInternal(const vector<string> &files);
@@ -127,6 +127,16 @@ public:
127
127
  return data.Unset<T>();
128
128
  }
129
129
 
130
+ // Manually begin an object - should be followed by EndObject
131
+ void BeginObject(const char *tag) {
132
+ SetTag(tag);
133
+ OnObjectBegin();
134
+ }
135
+
136
+ void EndObject() {
137
+ OnObjectEnd();
138
+ }
139
+
130
140
  private:
131
141
  // Deserialize anything implementing a FormatDeserialize method
132
142
  template <typename T = void>
@@ -208,6 +218,28 @@ private:
208
218
  return map;
209
219
  }
210
220
 
221
+ template <typename T = void>
222
+ inline typename std::enable_if<is_map<T>::value, T>::type Read() {
223
+ using KEY_TYPE = typename is_map<T>::KEY_TYPE;
224
+ using VALUE_TYPE = typename is_map<T>::VALUE_TYPE;
225
+
226
+ T map;
227
+ auto size = OnMapBegin();
228
+ for (idx_t i = 0; i < size; i++) {
229
+ OnMapEntryBegin();
230
+ OnMapKeyBegin();
231
+ auto key = Read<KEY_TYPE>();
232
+ OnMapKeyEnd();
233
+ OnMapValueBegin();
234
+ auto value = Read<VALUE_TYPE>();
235
+ OnMapValueEnd();
236
+ OnMapEntryEnd();
237
+ map[std::move(key)] = std::move(value);
238
+ }
239
+ OnMapEnd();
240
+ return map;
241
+ }
242
+
211
243
  // Deserialize an unordered set
212
244
  template <typename T = void>
213
245
  inline typename std::enable_if<is_unordered_set<T>::value, T>::type Read() {