duckdb 1.4.2-dev4.0 → 1.4.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (146)
  1. package/package.json +2 -2
  2. package/src/duckdb/extension/icu/icu_extension.cpp +67 -6
  3. package/src/duckdb/extension/icu/third_party/icu/common/putil.cpp +9 -3
  4. package/src/duckdb/extension/json/include/json_serializer.hpp +12 -0
  5. package/src/duckdb/extension/json/json_functions/json_create.cpp +10 -10
  6. package/src/duckdb/extension/parquet/decoder/delta_length_byte_array_decoder.cpp +19 -5
  7. package/src/duckdb/extension/parquet/include/decoder/delta_length_byte_array_decoder.hpp +1 -1
  8. package/src/duckdb/extension/parquet/include/parquet_dbp_decoder.hpp +11 -2
  9. package/src/duckdb/extension/parquet/include/reader/string_column_reader.hpp +2 -1
  10. package/src/duckdb/extension/parquet/parquet_reader.cpp +3 -1
  11. package/src/duckdb/extension/parquet/parquet_writer.cpp +16 -1
  12. package/src/duckdb/extension/parquet/reader/string_column_reader.cpp +1 -1
  13. package/src/duckdb/extension/parquet/writer/primitive_column_writer.cpp +1 -1
  14. package/src/duckdb/src/catalog/default/default_table_functions.cpp +1 -1
  15. package/src/duckdb/src/common/adbc/adbc.cpp +8 -6
  16. package/src/duckdb/src/common/csv_writer.cpp +1 -13
  17. package/src/duckdb/src/common/encryption_key_manager.cpp +10 -9
  18. package/src/duckdb/src/common/enum_util.cpp +19 -0
  19. package/src/duckdb/src/common/enums/compression_type.cpp +51 -16
  20. package/src/duckdb/src/common/exception/binder_exception.cpp +7 -2
  21. package/src/duckdb/src/common/progress_bar/unscented_kalman_filter.cpp +2 -2
  22. package/src/duckdb/src/common/random_engine.cpp +10 -0
  23. package/src/duckdb/src/execution/expression_executor/execute_comparison.cpp +13 -2
  24. package/src/duckdb/src/execution/index/art/art.cpp +6 -3
  25. package/src/duckdb/src/execution/index/bound_index.cpp +32 -21
  26. package/src/duckdb/src/execution/index/unbound_index.cpp +20 -9
  27. package/src/duckdb/src/execution/join_hashtable.cpp +9 -3
  28. package/src/duckdb/src/execution/operator/helper/physical_buffered_batch_collector.cpp +1 -1
  29. package/src/duckdb/src/execution/operator/helper/physical_buffered_collector.cpp +1 -1
  30. package/src/duckdb/src/execution/operator/join/physical_hash_join.cpp +5 -0
  31. package/src/duckdb/src/function/cast/cast_function_set.cpp +3 -1
  32. package/src/duckdb/src/function/macro_function.cpp +1 -1
  33. package/src/duckdb/src/function/scalar/compressed_materialization/compress_string.cpp +1 -1
  34. package/src/duckdb/src/function/scalar/create_sort_key.cpp +5 -3
  35. package/src/duckdb/src/function/scalar/operator/arithmetic.cpp +1 -1
  36. package/src/duckdb/src/function/scalar/system/parse_log_message.cpp +4 -2
  37. package/src/duckdb/src/function/table/copy_csv.cpp +28 -4
  38. package/src/duckdb/src/function/table/direct_file_reader.cpp +10 -0
  39. package/src/duckdb/src/function/table/read_file.cpp +65 -1
  40. package/src/duckdb/src/function/table/version/pragma_version.cpp +3 -3
  41. package/src/duckdb/src/include/duckdb/common/csv_writer.hpp +0 -3
  42. package/src/duckdb/src/include/duckdb/common/encryption_key_manager.hpp +2 -0
  43. package/src/duckdb/src/include/duckdb/common/encryption_state.hpp +5 -0
  44. package/src/duckdb/src/include/duckdb/common/enum_util.hpp +8 -0
  45. package/src/duckdb/src/include/duckdb/common/enums/compression_type.hpp +42 -2
  46. package/src/duckdb/src/include/duckdb/common/http_util.hpp +7 -0
  47. package/src/duckdb/src/include/duckdb/common/hugeint.hpp +1 -1
  48. package/src/duckdb/src/include/duckdb/common/operator/comparison_operators.hpp +0 -11
  49. package/src/duckdb/src/include/duckdb/common/random_engine.hpp +2 -0
  50. package/src/duckdb/src/include/duckdb/common/sort/duckdb_pdqsort.hpp +1 -0
  51. package/src/duckdb/src/include/duckdb/common/types/hugeint.hpp +6 -6
  52. package/src/duckdb/src/include/duckdb/common/types/row/block_iterator.hpp +115 -97
  53. package/src/duckdb/src/include/duckdb/execution/index/art/art_operator.hpp +54 -0
  54. package/src/duckdb/src/include/duckdb/execution/index/bound_index.hpp +21 -2
  55. package/src/duckdb/src/include/duckdb/execution/index/unbound_index.hpp +26 -8
  56. package/src/duckdb/src/include/duckdb/execution/join_hashtable.hpp +2 -0
  57. package/src/duckdb/src/include/duckdb/function/table/read_file.hpp +0 -49
  58. package/src/duckdb/src/include/duckdb/logging/log_manager.hpp +1 -1
  59. package/src/duckdb/src/include/duckdb/logging/log_type.hpp +14 -0
  60. package/src/duckdb/src/include/duckdb/main/attached_database.hpp +2 -1
  61. package/src/duckdb/src/include/duckdb/main/buffered_data/batched_buffered_data.hpp +1 -1
  62. package/src/duckdb/src/include/duckdb/main/buffered_data/buffered_data.hpp +1 -1
  63. package/src/duckdb/src/include/duckdb/main/buffered_data/simple_buffered_data.hpp +1 -1
  64. package/src/duckdb/src/include/duckdb/main/capi/capi_internal.hpp +2 -0
  65. package/src/duckdb/src/include/duckdb/main/database.hpp +2 -2
  66. package/src/duckdb/src/include/duckdb/main/database_file_path_manager.hpp +10 -6
  67. package/src/duckdb/src/include/duckdb/main/extension_entries.hpp +4 -0
  68. package/src/duckdb/src/include/duckdb/main/profiling_info.hpp +1 -0
  69. package/src/duckdb/src/include/duckdb/main/query_profiler.hpp +1 -0
  70. package/src/duckdb/src/include/duckdb/main/relation/create_table_relation.hpp +3 -0
  71. package/src/duckdb/src/include/duckdb/main/relation/insert_relation.hpp +2 -0
  72. package/src/duckdb/src/include/duckdb/main/relation/table_relation.hpp +2 -0
  73. package/src/duckdb/src/include/duckdb/main/relation.hpp +10 -2
  74. package/src/duckdb/src/include/duckdb/main/settings.hpp +9 -0
  75. package/src/duckdb/src/include/duckdb/optimizer/filter_pullup.hpp +10 -14
  76. package/src/duckdb/src/include/duckdb/optimizer/join_order/relation_manager.hpp +5 -1
  77. package/src/duckdb/src/include/duckdb/parser/query_node.hpp +3 -0
  78. package/src/duckdb/src/include/duckdb/planner/bound_statement.hpp +1 -0
  79. package/src/duckdb/src/include/duckdb/storage/block.hpp +9 -0
  80. package/src/duckdb/src/include/duckdb/storage/block_manager.hpp +9 -2
  81. package/src/duckdb/src/include/duckdb/storage/index.hpp +8 -2
  82. package/src/duckdb/src/include/duckdb/storage/metadata/metadata_manager.hpp +2 -0
  83. package/src/duckdb/src/include/duckdb/storage/metadata/metadata_reader.hpp +1 -1
  84. package/src/duckdb/src/include/duckdb/storage/storage_options.hpp +0 -7
  85. package/src/duckdb/src/include/duckdb/storage/table/row_group.hpp +6 -2
  86. package/src/duckdb/src/include/duckdb/verification/deserialized_statement_verifier.hpp +6 -0
  87. package/src/duckdb/src/logging/log_manager.cpp +2 -1
  88. package/src/duckdb/src/logging/log_types.cpp +30 -1
  89. package/src/duckdb/src/main/attached_database.cpp +4 -7
  90. package/src/duckdb/src/main/buffered_data/batched_buffered_data.cpp +2 -3
  91. package/src/duckdb/src/main/buffered_data/buffered_data.cpp +2 -3
  92. package/src/duckdb/src/main/buffered_data/simple_buffered_data.cpp +1 -2
  93. package/src/duckdb/src/main/capi/prepared-c.cpp +9 -2
  94. package/src/duckdb/src/main/config.cpp +6 -5
  95. package/src/duckdb/src/main/database.cpp +9 -3
  96. package/src/duckdb/src/main/database_file_path_manager.cpp +43 -14
  97. package/src/duckdb/src/main/database_manager.cpp +1 -1
  98. package/src/duckdb/src/main/http/http_util.cpp +19 -1
  99. package/src/duckdb/src/main/profiling_info.cpp +11 -0
  100. package/src/duckdb/src/main/query_profiler.cpp +16 -0
  101. package/src/duckdb/src/main/relation/create_table_relation.cpp +9 -0
  102. package/src/duckdb/src/main/relation/insert_relation.cpp +7 -0
  103. package/src/duckdb/src/main/relation/table_relation.cpp +14 -0
  104. package/src/duckdb/src/main/relation.cpp +28 -12
  105. package/src/duckdb/src/main/settings/custom_settings.cpp +9 -3
  106. package/src/duckdb/src/optimizer/filter_pullup.cpp +14 -0
  107. package/src/duckdb/src/optimizer/join_order/relation_manager.cpp +29 -10
  108. package/src/duckdb/src/optimizer/rule/regex_optimizations.cpp +7 -0
  109. package/src/duckdb/src/parallel/task_executor.cpp +4 -2
  110. package/src/duckdb/src/parser/query_node/cte_node.cpp +79 -0
  111. package/src/duckdb/src/parser/transform/expression/transform_cast.cpp +3 -1
  112. package/src/duckdb/src/planner/binder/expression/bind_macro_expression.cpp +1 -0
  113. package/src/duckdb/src/planner/binder/statement/bind_create_table.cpp +12 -4
  114. package/src/duckdb/src/planner/binder/statement/bind_insert.cpp +16 -12
  115. package/src/duckdb/src/planner/binder/statement/bind_merge_into.cpp +42 -5
  116. package/src/duckdb/src/planner/binder/tableref/bind_basetableref.cpp +0 -24
  117. package/src/duckdb/src/planner/binder/tableref/bind_table_function.cpp +1 -1
  118. package/src/duckdb/src/planner/binder.cpp +0 -1
  119. package/src/duckdb/src/planner/expression_binder/having_binder.cpp +1 -2
  120. package/src/duckdb/src/storage/buffer/block_manager.cpp +20 -6
  121. package/src/duckdb/src/storage/checkpoint/table_data_writer.cpp +8 -6
  122. package/src/duckdb/src/storage/checkpoint_manager.cpp +24 -22
  123. package/src/duckdb/src/storage/compression/validity_uncompressed.cpp +7 -0
  124. package/src/duckdb/src/storage/compression/zstd.cpp +34 -12
  125. package/src/duckdb/src/storage/data_table.cpp +1 -1
  126. package/src/duckdb/src/storage/local_storage.cpp +15 -2
  127. package/src/duckdb/src/storage/metadata/metadata_manager.cpp +29 -6
  128. package/src/duckdb/src/storage/metadata/metadata_reader.cpp +11 -15
  129. package/src/duckdb/src/storage/metadata/metadata_writer.cpp +1 -1
  130. package/src/duckdb/src/storage/serialization/serialize_query_node.cpp +1 -19
  131. package/src/duckdb/src/storage/single_file_block_manager.cpp +33 -3
  132. package/src/duckdb/src/storage/standard_buffer_manager.cpp +3 -1
  133. package/src/duckdb/src/storage/storage_info.cpp +4 -0
  134. package/src/duckdb/src/storage/storage_manager.cpp +8 -0
  135. package/src/duckdb/src/storage/table/array_column_data.cpp +1 -1
  136. package/src/duckdb/src/storage/table/column_data.cpp +3 -2
  137. package/src/duckdb/src/storage/table/column_data_checkpointer.cpp +3 -2
  138. package/src/duckdb/src/storage/table/row_group.cpp +41 -24
  139. package/src/duckdb/src/storage/table/row_group_collection.cpp +114 -11
  140. package/src/duckdb/src/storage/table_index_list.cpp +18 -5
  141. package/src/duckdb/src/transaction/cleanup_state.cpp +7 -2
  142. package/src/duckdb/third_party/mbedtls/include/mbedtls_wrapper.hpp +5 -0
  143. package/src/duckdb/third_party/mbedtls/mbedtls_wrapper.cpp +8 -21
  144. package/src/duckdb/third_party/parquet/parquet_types.cpp +57 -35
  145. package/src/duckdb/third_party/parquet/parquet_types.h +9 -2
  146. package/src/duckdb/ub_src_common_types_row.cpp +0 -2
@@ -26,14 +26,14 @@
26
26
  namespace duckdb {
27
27
 
28
28
  RowGroup::RowGroup(RowGroupCollection &collection_p, idx_t start, idx_t count)
29
- : SegmentBase<RowGroup>(start, count), collection(collection_p), version_info(nullptr), allocation_size(0),
30
- row_id_is_loaded(false), has_changes(false) {
29
+ : SegmentBase<RowGroup>(start, count), collection(collection_p), version_info(nullptr), deletes_is_loaded(false),
30
+ allocation_size(0), row_id_is_loaded(false), has_changes(false) {
31
31
  Verify();
32
32
  }
33
33
 
34
34
  RowGroup::RowGroup(RowGroupCollection &collection_p, RowGroupPointer pointer)
35
35
  : SegmentBase<RowGroup>(pointer.row_start, pointer.tuple_count), collection(collection_p), version_info(nullptr),
36
- allocation_size(0), row_id_is_loaded(false), has_changes(false) {
36
+ deletes_is_loaded(false), allocation_size(0), row_id_is_loaded(false), has_changes(false) {
37
37
  // deserialize the columns
38
38
  if (pointer.data_pointers.size() != collection_p.GetTypes().size()) {
39
39
  throw IOException("Row group column count is unaligned with table column count. Corrupt file?");
@@ -45,7 +45,6 @@ RowGroup::RowGroup(RowGroupCollection &collection_p, RowGroupPointer pointer)
45
45
  this->is_loaded[c] = false;
46
46
  }
47
47
  this->deletes_pointers = std::move(pointer.deletes_pointers);
48
- this->deletes_is_loaded = false;
49
48
  this->has_metadata_blocks = pointer.has_metadata_blocks;
50
49
  this->extra_metadata_blocks = std::move(pointer.extra_metadata_blocks);
51
50
 
@@ -54,7 +53,7 @@ RowGroup::RowGroup(RowGroupCollection &collection_p, RowGroupPointer pointer)
54
53
 
55
54
  RowGroup::RowGroup(RowGroupCollection &collection_p, PersistentRowGroupData &data)
56
55
  : SegmentBase<RowGroup>(data.start, data.count), collection(collection_p), version_info(nullptr),
57
- allocation_size(0), row_id_is_loaded(false), has_changes(false) {
56
+ deletes_is_loaded(false), allocation_size(0), row_id_is_loaded(false), has_changes(false) {
58
57
  auto &block_manager = GetBlockManager();
59
58
  auto &info = GetTableInfo();
60
59
  auto &types = collection.get().GetTypes();
@@ -974,21 +973,15 @@ bool RowGroup::HasUnloadedDeletes() const {
974
973
  return !deletes_is_loaded;
975
974
  }
976
975
 
977
- vector<MetaBlockPointer> RowGroup::GetColumnPointers() {
978
- if (has_metadata_blocks) {
979
- // we have the column metadata from the file itself - no need to deserialize metadata to fetch it
980
- // read if from "column_pointers" and "extra_metadata_blocks"
981
- auto result = column_pointers;
982
- for (auto &block_pointer : extra_metadata_blocks) {
983
- result.emplace_back(block_pointer, 0);
984
- }
985
- return result;
976
+ vector<idx_t> RowGroup::GetOrComputeExtraMetadataBlocks(bool force_compute) {
977
+ if (has_metadata_blocks && !force_compute) {
978
+ return extra_metadata_blocks;
986
979
  }
987
- vector<MetaBlockPointer> result;
988
980
  if (column_pointers.empty()) {
989
981
  // no pointers
990
- return result;
982
+ return {};
991
983
  }
984
+ vector<MetaBlockPointer> read_pointers;
992
985
  // column_pointers stores the beginning of each column
993
986
  // if columns are big - they may span multiple metadata blocks
994
987
  // we need to figure out all blocks that this row group points to
@@ -999,13 +992,25 @@ vector<MetaBlockPointer> RowGroup::GetColumnPointers() {
999
992
  // for all but the last column pointer - we can just follow the linked list until we reach the last column
1000
993
  MetadataReader reader(metadata_manager, column_pointers[0]);
1001
994
  auto last_pointer = column_pointers[last_idx];
1002
- result = reader.GetRemainingBlocks(last_pointer);
995
+ read_pointers = reader.GetRemainingBlocks(last_pointer);
1003
996
  }
1004
997
  // for the last column we need to deserialize the column - because we don't know where it stops
1005
998
  auto &types = GetCollection().GetTypes();
1006
- MetadataReader reader(metadata_manager, column_pointers[last_idx], &result);
999
+ MetadataReader reader(metadata_manager, column_pointers[last_idx], &read_pointers);
1007
1000
  ColumnData::Deserialize(GetBlockManager(), GetTableInfo(), last_idx, start, reader, types[last_idx]);
1008
- return result;
1001
+
1002
+ unordered_set<idx_t> result_as_set;
1003
+ for (auto &ptr : read_pointers) {
1004
+ result_as_set.emplace(ptr.block_pointer);
1005
+ }
1006
+ for (auto &ptr : column_pointers) {
1007
+ result_as_set.erase(ptr.block_pointer);
1008
+ }
1009
+ return {result_as_set.begin(), result_as_set.end()};
1010
+ }
1011
+
1012
+ const vector<MetaBlockPointer> &RowGroup::GetColumnStartPointers() const {
1013
+ return column_pointers;
1009
1014
  }
1010
1015
 
1011
1016
  RowGroupWriteData RowGroup::WriteToDisk(RowGroupWriter &writer) {
@@ -1014,7 +1019,8 @@ RowGroupWriteData RowGroup::WriteToDisk(RowGroupWriter &writer) {
1014
1019
  // we have existing metadata and the row group has not been changed
1015
1020
  // re-use previous metadata
1016
1021
  RowGroupWriteData result;
1017
- result.existing_pointers = GetColumnPointers();
1022
+ result.reuse_existing_metadata_blocks = true;
1023
+ result.existing_extra_metadata_blocks = GetOrComputeExtraMetadataBlocks();
1018
1024
  return result;
1019
1025
  }
1020
1026
  auto &compression_types = writer.GetCompressionTypes();
@@ -1042,14 +1048,23 @@ RowGroupPointer RowGroup::Checkpoint(RowGroupWriteData write_data, RowGroupWrite
1042
1048
  // construct the row group pointer and write the column meta data to disk
1043
1049
  row_group_pointer.row_start = start;
1044
1050
  row_group_pointer.tuple_count = count;
1045
- if (!write_data.existing_pointers.empty()) {
1051
+ if (write_data.reuse_existing_metadata_blocks) {
1046
1052
  // we are re-using the previous metadata
1047
1053
  row_group_pointer.data_pointers = column_pointers;
1048
- row_group_pointer.has_metadata_blocks = has_metadata_blocks;
1049
- row_group_pointer.extra_metadata_blocks = extra_metadata_blocks;
1054
+ row_group_pointer.has_metadata_blocks = true;
1055
+ row_group_pointer.extra_metadata_blocks = write_data.existing_extra_metadata_blocks;
1050
1056
  row_group_pointer.deletes_pointers = deletes_pointers;
1051
- metadata_manager->ClearModifiedBlocks(write_data.existing_pointers);
1057
+ vector<MetaBlockPointer> extra_metadata_block_pointers;
1058
+ extra_metadata_block_pointers.reserve(write_data.existing_extra_metadata_blocks.size());
1059
+ for (auto &block_pointer : write_data.existing_extra_metadata_blocks) {
1060
+ extra_metadata_block_pointers.emplace_back(block_pointer, 0);
1061
+ }
1062
+ metadata_manager->ClearModifiedBlocks(column_pointers);
1063
+ metadata_manager->ClearModifiedBlocks(extra_metadata_block_pointers);
1052
1064
  metadata_manager->ClearModifiedBlocks(deletes_pointers);
1065
+ // remember metadata_blocks to avoid loading them on future checkpoints
1066
+ has_metadata_blocks = true;
1067
+ extra_metadata_blocks = row_group_pointer.extra_metadata_blocks;
1053
1068
  return row_group_pointer;
1054
1069
  }
1055
1070
  D_ASSERT(write_data.states.size() == columns.size());
@@ -1092,6 +1107,7 @@ RowGroupPointer RowGroup::Checkpoint(RowGroupWriteData write_data, RowGroupWrite
1092
1107
  }
1093
1108
  // this metadata block is not stored - add it to the extra metadata blocks
1094
1109
  row_group_pointer.extra_metadata_blocks.push_back(column_pointer.block_pointer);
1110
+ metadata_blocks.insert(column_pointer.block_pointer);
1095
1111
  }
1096
1112
  // set up the pointers correctly within this row group for future operations
1097
1113
  column_pointers = row_group_pointer.data_pointers;
@@ -1113,6 +1129,7 @@ bool RowGroup::HasChanges() const {
1113
1129
  // we have deletes
1114
1130
  return true;
1115
1131
  }
1132
+ D_ASSERT(!deletes_is_loaded.load());
1116
1133
  // check if any of the columns have changes
1117
1134
  // avoid loading unloaded columns - unloaded columns can never have changes
1118
1135
  for (idx_t c = 0; c < columns.size(); c++) {
@@ -665,14 +665,16 @@ void RowGroupCollection::Update(TransactionData transaction, DataTable &data_tab
665
665
  void RowGroupCollection::RemoveFromIndexes(TableIndexList &indexes, Vector &row_identifiers, idx_t count) {
666
666
  auto row_ids = FlatVector::GetData<row_t>(row_identifiers);
667
667
 
668
- // Collect all indexed columns.
668
+ // Collect all Indexed columns on the table.
669
669
  unordered_set<column_t> indexed_column_id_set;
670
670
  indexes.Scan([&](Index &index) {
671
- D_ASSERT(index.IsBound());
672
671
  auto &set = index.GetColumnIdSet();
673
672
  indexed_column_id_set.insert(set.begin(), set.end());
674
673
  return false;
675
674
  });
675
+
676
+ // If we are in WAL replay, delete data will be buffered, and so we sort the column_ids
677
+ // since the sorted form will be the mapping used to get back physical IDs from the buffered index chunk.
676
678
  vector<StorageIndex> column_ids;
677
679
  for (auto &col : indexed_column_id_set) {
678
680
  column_ids.emplace_back(col);
@@ -686,10 +688,10 @@ void RowGroupCollection::RemoveFromIndexes(TableIndexList &indexes, Vector &row_
686
688
 
687
689
  // Initialize the fetch state. Only use indexed columns.
688
690
  TableScanState state;
689
- state.Initialize(std::move(column_ids));
691
+ auto column_ids_copy = column_ids;
692
+ state.Initialize(std::move(column_ids_copy));
690
693
  state.table_state.max_row = row_start + total_rows;
691
694
 
692
- // Used for scanning data. Only contains the indexed columns.
693
695
  DataChunk fetch_chunk;
694
696
  fetch_chunk.Initialize(GetAllocator(), column_types);
695
697
 
@@ -749,17 +751,24 @@ void RowGroupCollection::RemoveFromIndexes(TableIndexList &indexes, Vector &row_
749
751
  result_chunk.SetCardinality(fetch_chunk);
750
752
 
751
753
  // Slice the vector with all rows that are present in this vector.
752
- // Then, erase all values from the indexes.
754
+ // If the index is bound, delete the data. If unbound, buffer into unbound_index.
753
755
  result_chunk.Slice(sel, sel_count);
754
756
  indexes.Scan([&](Index &index) {
755
757
  if (index.IsBound()) {
756
758
  index.Cast<BoundIndex>().Delete(result_chunk, row_identifiers);
757
759
  return false;
758
760
  }
759
- throw MissingExtensionException(
760
- "Cannot delete from index '%s', unknown index type '%s'. You need to load the "
761
- "extension that provides this index type before table '%s' can be modified.",
762
- index.GetIndexName(), index.GetIndexType(), info->GetTableName());
761
+ // Buffering takes only the indexed columns in ordering of the column_ids mapping.
762
+ DataChunk index_column_chunk;
763
+ index_column_chunk.InitializeEmpty(column_types);
764
+ for (idx_t i = 0; i < column_types.size(); i++) {
765
+ auto col_id = column_ids[i].GetPrimaryIndex();
766
+ index_column_chunk.data[i].Reference(result_chunk.data[col_id]);
767
+ }
768
+ index_column_chunk.SetCardinality(result_chunk.size());
769
+ auto &unbound_index = index.Cast<UnboundIndex>();
770
+ unbound_index.BufferChunk(index_column_chunk, row_identifiers, column_ids, BufferedIndexReplay::DEL_ENTRY);
771
+ return false;
763
772
  });
764
773
  }
765
774
  }
@@ -1136,7 +1145,7 @@ void RowGroupCollection::Checkpoint(TableDataWriter &writer, TableStatistics &gl
1136
1145
  break;
1137
1146
  }
1138
1147
  auto &write_state = checkpoint_state.write_data[segment_idx];
1139
- if (write_state.existing_pointers.empty()) {
1148
+ if (!write_state.reuse_existing_metadata_blocks) {
1140
1149
  table_has_changes = true;
1141
1150
  break;
1142
1151
  }
@@ -1150,7 +1159,14 @@ void RowGroupCollection::Checkpoint(TableDataWriter &writer, TableStatistics &gl
1150
1159
  auto &entry = segments[segment_idx];
1151
1160
  auto &row_group = *entry.node;
1152
1161
  auto &write_state = checkpoint_state.write_data[segment_idx];
1153
- metadata_manager.ClearModifiedBlocks(write_state.existing_pointers);
1162
+ metadata_manager.ClearModifiedBlocks(row_group.GetColumnStartPointers());
1163
+ D_ASSERT(write_state.reuse_existing_metadata_blocks);
1164
+ vector<MetaBlockPointer> extra_metadata_block_pointers;
1165
+ extra_metadata_block_pointers.reserve(write_state.existing_extra_metadata_blocks.size());
1166
+ for (auto &block_pointer : write_state.existing_extra_metadata_blocks) {
1167
+ extra_metadata_block_pointers.emplace_back(block_pointer, 0);
1168
+ }
1169
+ metadata_manager.ClearModifiedBlocks(extra_metadata_block_pointers);
1154
1170
  metadata_manager.ClearModifiedBlocks(row_group.GetDeletesPointers());
1155
1171
  row_groups->AppendSegment(l, std::move(entry.node));
1156
1172
  }
@@ -1178,11 +1194,98 @@ void RowGroupCollection::Checkpoint(TableDataWriter &writer, TableStatistics &gl
1178
1194
  if (!row_group_writer) {
1179
1195
  throw InternalException("Missing row group writer for index %llu", segment_idx);
1180
1196
  }
1197
+ bool metadata_reuse = checkpoint_state.write_data[segment_idx].reuse_existing_metadata_blocks;
1181
1198
  auto pointer =
1182
1199
  row_group.Checkpoint(std::move(checkpoint_state.write_data[segment_idx]), *row_group_writer, global_stats);
1200
+
1201
+ auto debug_verify_blocks = DBConfig::GetSetting<DebugVerifyBlocksSetting>(GetAttached().GetDatabase()) &&
1202
+ dynamic_cast<SingleFileTableDataWriter *>(&checkpoint_state.writer) != nullptr;
1203
+ RowGroupPointer pointer_copy;
1204
+ if (debug_verify_blocks) {
1205
+ pointer_copy = pointer;
1206
+ }
1183
1207
  writer.AddRowGroup(std::move(pointer), std::move(row_group_writer));
1184
1208
  row_groups->AppendSegment(l, std::move(entry.node));
1185
1209
  new_total_rows += row_group.count;
1210
+
1211
+ if (debug_verify_blocks) {
1212
+ if (!pointer_copy.has_metadata_blocks) {
1213
+ throw InternalException("Checkpointing should always remember metadata blocks");
1214
+ }
1215
+ if (metadata_reuse && pointer_copy.data_pointers != row_group.GetColumnStartPointers()) {
1216
+ throw InternalException("Colum start pointers changed during metadata reuse");
1217
+ }
1218
+
1219
+ // Capture blocks that have been written
1220
+ vector<MetaBlockPointer> all_written_blocks = pointer_copy.data_pointers;
1221
+ vector<MetaBlockPointer> all_metadata_blocks;
1222
+ for (auto &block : pointer_copy.extra_metadata_blocks) {
1223
+ all_written_blocks.emplace_back(block, 0);
1224
+ all_metadata_blocks.emplace_back(block, 0);
1225
+ }
1226
+
1227
+ // Verify that we can load the metadata correctly again
1228
+ vector<MetaBlockPointer> all_quick_read_blocks;
1229
+ for (auto &ptr : row_group.GetColumnStartPointers()) {
1230
+ all_quick_read_blocks.emplace_back(ptr);
1231
+ if (metadata_reuse && !block_manager.GetMetadataManager().BlockHasBeenCleared(ptr)) {
1232
+ throw InternalException("Found column start block that was not cleared");
1233
+ }
1234
+ }
1235
+ auto extra_metadata_blocks = row_group.GetOrComputeExtraMetadataBlocks(/* force_compute: */ true);
1236
+ for (auto &ptr : extra_metadata_blocks) {
1237
+ auto block_pointer = MetaBlockPointer(ptr, 0);
1238
+ all_quick_read_blocks.emplace_back(block_pointer);
1239
+ if (metadata_reuse && !block_manager.GetMetadataManager().BlockHasBeenCleared(block_pointer)) {
1240
+ throw InternalException("Found extra metadata block that was not cleared");
1241
+ }
1242
+ }
1243
+
1244
+ // Deserialize all columns to check if the quick read via GetOrComputeExtraMetadataBlocks was correct
1245
+ vector<MetaBlockPointer> all_full_read_blocks;
1246
+ auto column_start_pointers = row_group.GetColumnStartPointers();
1247
+ auto &types = row_group.GetCollection().GetTypes();
1248
+ auto &metadata_manager = row_group.GetCollection().GetMetadataManager();
1249
+ for (idx_t i = 0; i < column_start_pointers.size(); i++) {
1250
+ MetadataReader reader(metadata_manager, column_start_pointers[i], &all_full_read_blocks);
1251
+ ColumnData::Deserialize(GetBlockManager(), GetTableInfo(), i, row_group.start, reader, types[i]);
1252
+ }
1253
+
1254
+ // Derive sets of blocks to compare
1255
+ set<idx_t> all_written_block_ids;
1256
+ for (auto &ptr : all_written_blocks) {
1257
+ all_written_block_ids.insert(ptr.block_pointer);
1258
+ }
1259
+ set<idx_t> all_quick_read_block_ids;
1260
+ for (auto &ptr : all_quick_read_blocks) {
1261
+ all_quick_read_block_ids.insert(ptr.block_pointer);
1262
+ }
1263
+ set<idx_t> all_full_read_block_ids;
1264
+ for (auto &ptr : all_full_read_blocks) {
1265
+ all_full_read_block_ids.insert(ptr.block_pointer);
1266
+ }
1267
+ if (all_written_block_ids != all_quick_read_block_ids ||
1268
+ all_quick_read_block_ids != all_full_read_block_ids) {
1269
+ std::stringstream oss;
1270
+ oss << "Written: ";
1271
+ for (auto &block : all_written_blocks) {
1272
+ oss << block << ", ";
1273
+ }
1274
+ oss << "\n";
1275
+ oss << "Quick read: ";
1276
+ for (auto &block : all_quick_read_blocks) {
1277
+ oss << block << ", ";
1278
+ }
1279
+ oss << "\n";
1280
+ oss << "Full read: ";
1281
+ for (auto &block : all_full_read_blocks) {
1282
+ oss << block << ", ";
1283
+ }
1284
+ oss << "\n";
1285
+
1286
+ throw InternalException("Reloading blocks just written does not yield same blocks: " + oss.str());
1287
+ }
1288
+ }
1186
1289
  }
1187
1290
  total_rows = new_total_rows;
1188
1291
  l.Release();
@@ -147,11 +147,17 @@ void TableIndexList::Bind(ClientContext &context, DataTableInfo &table_info, con
147
147
  // Create an IndexBinder to bind the index
148
148
  IndexBinder idx_binder(*binder, context);
149
149
 
150
- // Apply any outstanding appends and replace the unbound index with a bound index.
150
+ // Apply any outstanding buffered replays and replace the unbound index with a bound index.
151
151
  auto &unbound_index = index_entry->index->Cast<UnboundIndex>();
152
152
  auto bound_idx = idx_binder.BindIndex(unbound_index);
153
- if (unbound_index.HasBufferedAppends()) {
154
- bound_idx->ApplyBufferedAppends(column_types, unbound_index.GetBufferedAppends(),
153
+ if (unbound_index.HasBufferedReplays()) {
154
+ // For replaying buffered index operations, we only want the physical column types (skip over
155
+ // generated column types).
156
+ vector<LogicalType> physical_column_types;
157
+ for (auto &col : table.GetColumns().Physical()) {
158
+ physical_column_types.push_back(col.Type());
159
+ }
160
+ bound_idx->ApplyBufferedReplays(physical_column_types, unbound_index.GetBufferedReplays(),
155
161
  unbound_index.GetMappedColumnIds());
156
162
  }
157
163
 
@@ -255,11 +261,18 @@ void TableIndexList::InitializeIndexChunk(DataChunk &index_chunk, const vector<L
255
261
  auto &index_list = data_table_info.GetIndexes();
256
262
  auto indexed_columns = index_list.GetRequiredColumns();
257
263
 
258
- vector<LogicalType> index_types;
264
+ // Store the mapped_column_ids and index_types in sorted canonical form, needed for
265
+ // buffering WAL index operations during replay (see notes in unbound_index.hpp).
266
+ // First sort mapped_column_ids, then populate index_types according to the sorted order.
259
267
  for (auto &col : indexed_columns) {
260
- index_types.push_back(table_types[col]);
261
268
  mapped_column_ids.emplace_back(col);
262
269
  }
270
+ std::sort(mapped_column_ids.begin(), mapped_column_ids.end());
271
+
272
+ vector<LogicalType> index_types;
273
+ for (auto &col : mapped_column_ids) {
274
+ index_types.push_back(table_types[col.GetPrimaryIndex()]);
275
+ }
263
276
 
264
277
  index_chunk.InitializeEmpty(index_types);
265
278
  }
@@ -95,10 +95,15 @@ void CleanupState::Flush() {
95
95
  // set up the row identifiers vector
96
96
  Vector row_identifiers(LogicalType::ROW_TYPE, data_ptr_cast(row_numbers));
97
97
 
98
- // delete the tuples from all the indexes
98
+ // delete the tuples from all the indexes.
99
+ // If there is any issue with removal, a FatalException must be thrown since there may be a corruption of
100
+ // data, hence the transaction cannot be guaranteed.
99
101
  try {
100
102
  current_table->RemoveFromIndexes(row_identifiers, count);
101
- } catch (...) { // NOLINT: ignore errors here
103
+ } catch (std::exception &ex) {
104
+ throw FatalException(ErrorData(ex).Message());
105
+ } catch (...) {
106
+ throw FatalException("unknown failure in CleanupState::Flush");
102
107
  }
103
108
 
104
109
  count = 0;
@@ -81,6 +81,7 @@ class AESStateMBEDTLS : public duckdb::EncryptionState {
81
81
  DUCKDB_API void GenerateRandomData(duckdb::data_ptr_t data, duckdb::idx_t len) override;
82
82
  DUCKDB_API void FinalizeGCM(duckdb::data_ptr_t tag, duckdb::idx_t tag_len);
83
83
  DUCKDB_API const mbedtls_cipher_info_t *GetCipher(size_t key_len);
84
+ DUCKDB_API static void SecureClearData(duckdb::data_ptr_t data, duckdb::idx_t len);
84
85
 
85
86
  private:
86
87
  DUCKDB_API void InitializeInternal(duckdb::const_data_ptr_t iv, duckdb::idx_t iv_len, duckdb::const_data_ptr_t aad, duckdb::idx_t aad_len);
@@ -98,6 +99,10 @@ class AESStateMBEDTLS : public duckdb::EncryptionState {
98
99
  }
99
100
 
100
101
  ~AESStateMBEDTLSFactory() override {} //
102
+
103
+ DUCKDB_API bool SupportsEncryption() override {
104
+ return false;
105
+ }
101
106
  };
102
107
  };
103
108
 
@@ -271,6 +271,10 @@ const mbedtls_cipher_info_t *MbedTlsWrapper::AESStateMBEDTLS::GetCipher(size_t k
271
271
  }
272
272
  }
273
273
 
274
+ void MbedTlsWrapper::AESStateMBEDTLS::SecureClearData(duckdb::data_ptr_t data, duckdb::idx_t len) {
275
+ mbedtls_platform_zeroize(data, len);
276
+ }
277
+
274
278
  MbedTlsWrapper::AESStateMBEDTLS::AESStateMBEDTLS(duckdb::EncryptionTypes::CipherType cipher_p, duckdb::idx_t key_len) : EncryptionState(cipher_p, key_len), context(duckdb::make_uniq<mbedtls_cipher_context_t>()) {
275
279
  mbedtls_cipher_init(context.get());
276
280
 
@@ -296,20 +300,12 @@ MbedTlsWrapper::AESStateMBEDTLS::~AESStateMBEDTLS() {
296
300
  }
297
301
  }
298
302
 
299
- void MbedTlsWrapper::AESStateMBEDTLS::GenerateRandomDataStatic(duckdb::data_ptr_t data, duckdb::idx_t len) {
300
- duckdb::RandomEngine random_engine;
301
-
302
- while (len) {
303
- const auto random_integer = random_engine.NextRandomInteger();
304
- const auto next = duckdb::MinValue<duckdb::idx_t>(len, sizeof(random_integer));
305
- memcpy(data, duckdb::const_data_ptr_cast(&random_integer), next);
306
- data += next;
307
- len -= next;
308
- }
303
+ static void ThrowInsecureRNG() {
304
+ throw duckdb::InvalidConfigurationException("DuckDB requires a secure random engine to be loaded to enable secure crypto. Normally, this will be handled automatically by DuckDB by autoloading the `httpfs` Extension, but that seems to have failed. Please ensure the httpfs extension is loaded manually using `LOAD httpfs`.");
309
305
  }
310
306
 
311
307
  void MbedTlsWrapper::AESStateMBEDTLS::GenerateRandomData(duckdb::data_ptr_t data, duckdb::idx_t len) {
312
- GenerateRandomDataStatic(data, len);
308
+ ThrowInsecureRNG();
313
309
  }
314
310
 
315
311
  void MbedTlsWrapper::AESStateMBEDTLS::InitializeInternal(duckdb::const_data_ptr_t iv, duckdb::idx_t iv_len, duckdb::const_data_ptr_t aad, duckdb::idx_t aad_len){
@@ -325,16 +321,7 @@ void MbedTlsWrapper::AESStateMBEDTLS::InitializeInternal(duckdb::const_data_ptr_
325
321
  }
326
322
 
327
323
  void MbedTlsWrapper::AESStateMBEDTLS::InitializeEncryption(duckdb::const_data_ptr_t iv, duckdb::idx_t iv_len, duckdb::const_data_ptr_t key, duckdb::idx_t key_len_p, duckdb::const_data_ptr_t aad, duckdb::idx_t aad_len) {
328
- mode = duckdb::EncryptionTypes::ENCRYPT;
329
-
330
- if (key_len_p != key_len) {
331
- throw duckdb::InternalException("Invalid encryption key length, expected %llu, got %llu", key_len, key_len_p);
332
- }
333
- if (mbedtls_cipher_setkey(context.get(), key, key_len * 8, MBEDTLS_ENCRYPT)) {
334
- throw runtime_error("Failed to set AES key for encryption");
335
- }
336
-
337
- InitializeInternal(iv, iv_len, aad, aad_len);
324
+ ThrowInsecureRNG();
338
325
  }
339
326
 
340
327
  void MbedTlsWrapper::AESStateMBEDTLS::InitializeDecryption(duckdb::const_data_ptr_t iv, duckdb::idx_t iv_len, duckdb::const_data_ptr_t key, duckdb::idx_t key_len_p, duckdb::const_data_ptr_t aad, duckdb::idx_t aad_len) {