duckdb 1.2.1-dev4.0 → 1.2.1-dev8.0

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (118)
  1. package/package.json +1 -1
  2. package/src/connection.cpp +57 -35
  3. package/src/duckdb/extension/core_functions/aggregate/distributive/string_agg.cpp +14 -22
  4. package/src/duckdb/extension/core_functions/aggregate/nested/list.cpp +0 -1
  5. package/src/duckdb/extension/core_functions/lambda_functions.cpp +0 -11
  6. package/src/duckdb/extension/core_functions/scalar/list/list_aggregates.cpp +18 -6
  7. package/src/duckdb/extension/icu/icu-datefunc.cpp +9 -2
  8. package/src/duckdb/extension/icu/icu-strptime.cpp +7 -11
  9. package/src/duckdb/extension/icu/include/icu-datefunc.hpp +3 -1
  10. package/src/duckdb/extension/json/buffered_json_reader.cpp +18 -31
  11. package/src/duckdb/extension/json/json_extension.cpp +8 -3
  12. package/src/duckdb/extension/parquet/column_reader.cpp +4 -6
  13. package/src/duckdb/extension/parquet/column_writer.cpp +33 -12
  14. package/src/duckdb/extension/parquet/include/column_reader.hpp +0 -2
  15. package/src/duckdb/extension/parquet/include/parquet_bss_encoder.hpp +0 -1
  16. package/src/duckdb/extension/parquet/include/parquet_dlba_encoder.hpp +1 -2
  17. package/src/duckdb/src/catalog/catalog.cpp +12 -0
  18. package/src/duckdb/src/catalog/catalog_entry/duck_table_entry.cpp +1 -1
  19. package/src/duckdb/src/catalog/catalog_entry_retriever.cpp +1 -1
  20. package/src/duckdb/src/catalog/catalog_search_path.cpp +8 -8
  21. package/src/duckdb/src/common/bind_helpers.cpp +3 -0
  22. package/src/duckdb/src/common/compressed_file_system.cpp +2 -0
  23. package/src/duckdb/src/common/hive_partitioning.cpp +1 -1
  24. package/src/duckdb/src/common/multi_file_reader.cpp +3 -3
  25. package/src/duckdb/src/execution/aggregate_hashtable.cpp +1 -1
  26. package/src/duckdb/src/execution/index/art/art.cpp +19 -6
  27. package/src/duckdb/src/execution/index/art/iterator.cpp +7 -3
  28. package/src/duckdb/src/execution/operator/aggregate/physical_window.cpp +11 -4
  29. package/src/duckdb/src/execution/operator/csv_scanner/buffer_manager/csv_buffer.cpp +2 -2
  30. package/src/duckdb/src/execution/operator/csv_scanner/encode/csv_encoder.cpp +5 -1
  31. package/src/duckdb/src/execution/operator/csv_scanner/scanner/base_scanner.cpp +3 -2
  32. package/src/duckdb/src/execution/operator/csv_scanner/scanner/csv_schema.cpp +2 -2
  33. package/src/duckdb/src/execution/operator/csv_scanner/scanner/scanner_boundary.cpp +1 -1
  34. package/src/duckdb/src/execution/operator/csv_scanner/scanner/string_value_scanner.cpp +20 -12
  35. package/src/duckdb/src/execution/operator/csv_scanner/sniffer/dialect_detection.cpp +19 -22
  36. package/src/duckdb/src/execution/operator/csv_scanner/sniffer/type_refinement.cpp +1 -1
  37. package/src/duckdb/src/execution/operator/csv_scanner/util/csv_error.cpp +1 -0
  38. package/src/duckdb/src/execution/operator/csv_scanner/util/csv_reader_options.cpp +16 -0
  39. package/src/duckdb/src/execution/operator/helper/physical_reservoir_sample.cpp +1 -0
  40. package/src/duckdb/src/execution/operator/helper/physical_streaming_sample.cpp +16 -7
  41. package/src/duckdb/src/execution/operator/persistent/physical_batch_insert.cpp +3 -1
  42. package/src/duckdb/src/execution/operator/scan/physical_table_scan.cpp +11 -1
  43. package/src/duckdb/src/execution/operator/schema/physical_create_art_index.cpp +5 -7
  44. package/src/duckdb/src/execution/physical_plan/plan_create_index.cpp +11 -0
  45. package/src/duckdb/src/execution/physical_plan/plan_sample.cpp +1 -3
  46. package/src/duckdb/src/execution/radix_partitioned_hashtable.cpp +14 -5
  47. package/src/duckdb/src/execution/sample/reservoir_sample.cpp +24 -12
  48. package/src/duckdb/src/function/scalar/generic/getvariable.cpp +3 -3
  49. package/src/duckdb/src/function/table/version/pragma_version.cpp +3 -3
  50. package/src/duckdb/src/function/window/window_aggregate_states.cpp +3 -0
  51. package/src/duckdb/src/function/window/window_boundaries_state.cpp +108 -48
  52. package/src/duckdb/src/function/window/window_constant_aggregator.cpp +5 -5
  53. package/src/duckdb/src/function/window/window_distinct_aggregator.cpp +6 -0
  54. package/src/duckdb/src/include/duckdb/catalog/catalog_entry_retriever.hpp +1 -1
  55. package/src/duckdb/src/include/duckdb/catalog/catalog_search_path.hpp +10 -9
  56. package/src/duckdb/src/include/duckdb/common/adbc/adbc-init.hpp +1 -1
  57. package/src/duckdb/src/include/duckdb/common/multi_file_reader.hpp +2 -2
  58. package/src/duckdb/src/include/duckdb/execution/index/art/iterator.hpp +2 -0
  59. package/src/duckdb/src/include/duckdb/execution/operator/csv_scanner/base_scanner.hpp +1 -1
  60. package/src/duckdb/src/include/duckdb/execution/operator/csv_scanner/csv_buffer.hpp +5 -4
  61. package/src/duckdb/src/include/duckdb/execution/operator/csv_scanner/csv_option.hpp +1 -1
  62. package/src/duckdb/src/include/duckdb/execution/operator/csv_scanner/csv_schema.hpp +2 -2
  63. package/src/duckdb/src/include/duckdb/execution/operator/csv_scanner/encode/csv_encoder.hpp +1 -1
  64. package/src/duckdb/src/include/duckdb/execution/operator/csv_scanner/sniffer/csv_sniffer.hpp +1 -1
  65. package/src/duckdb/src/include/duckdb/execution/operator/csv_scanner/string_value_scanner.hpp +2 -2
  66. package/src/duckdb/src/include/duckdb/execution/operator/helper/physical_streaming_sample.hpp +3 -7
  67. package/src/duckdb/src/include/duckdb/execution/reservoir_sample.hpp +2 -1
  68. package/src/duckdb/src/include/duckdb/function/lambda_functions.hpp +11 -3
  69. package/src/duckdb/src/include/duckdb/function/window/window_boundaries_state.hpp +4 -0
  70. package/src/duckdb/src/include/duckdb/main/client_context_state.hpp +4 -0
  71. package/src/duckdb/src/include/duckdb/main/extension_entries.hpp +25 -7
  72. package/src/duckdb/src/include/duckdb/main/pending_query_result.hpp +2 -0
  73. package/src/duckdb/src/include/duckdb/main/query_profiler.hpp +7 -0
  74. package/src/duckdb/src/include/duckdb/optimizer/filter_combiner.hpp +2 -2
  75. package/src/duckdb/src/include/duckdb/optimizer/late_materialization.hpp +2 -1
  76. package/src/duckdb/src/include/duckdb/optimizer/optimizer_extension.hpp +11 -5
  77. package/src/duckdb/src/include/duckdb/parallel/executor_task.hpp +4 -1
  78. package/src/duckdb/src/include/duckdb/parallel/pipeline.hpp +0 -1
  79. package/src/duckdb/src/include/duckdb/parallel/task_executor.hpp +3 -0
  80. package/src/duckdb/src/include/duckdb/parallel/task_notifier.hpp +27 -0
  81. package/src/duckdb/src/include/duckdb/parallel/task_scheduler.hpp +4 -0
  82. package/src/duckdb/src/include/duckdb/planner/expression/bound_subquery_expression.hpp +1 -1
  83. package/src/duckdb/src/include/duckdb/planner/tableref/bound_cteref.hpp +1 -0
  84. package/src/duckdb/src/include/duckdb/storage/checkpoint/table_data_writer.hpp +3 -1
  85. package/src/duckdb/src/include/duckdb/storage/checkpoint_manager.hpp +7 -1
  86. package/src/duckdb/src/include/duckdb/storage/storage_manager.hpp +3 -2
  87. package/src/duckdb/src/include/duckdb.h +495 -480
  88. package/src/duckdb/src/main/attached_database.cpp +1 -1
  89. package/src/duckdb/src/main/capi/duckdb-c.cpp +5 -1
  90. package/src/duckdb/src/main/capi/helper-c.cpp +8 -0
  91. package/src/duckdb/src/main/config.cpp +7 -1
  92. package/src/duckdb/src/main/database.cpp +8 -8
  93. package/src/duckdb/src/main/extension/extension_helper.cpp +3 -1
  94. package/src/duckdb/src/main/extension/extension_load.cpp +12 -12
  95. package/src/duckdb/src/optimizer/column_lifetime_analyzer.cpp +1 -0
  96. package/src/duckdb/src/optimizer/join_order/query_graph_manager.cpp +2 -2
  97. package/src/duckdb/src/optimizer/late_materialization.cpp +26 -5
  98. package/src/duckdb/src/optimizer/optimizer.cpp +12 -1
  99. package/src/duckdb/src/parallel/executor_task.cpp +10 -6
  100. package/src/duckdb/src/parallel/task_executor.cpp +4 -1
  101. package/src/duckdb/src/parallel/task_notifier.cpp +23 -0
  102. package/src/duckdb/src/parallel/task_scheduler.cpp +33 -0
  103. package/src/duckdb/src/parser/transform/expression/transform_subquery.cpp +4 -1
  104. package/src/duckdb/src/planner/binder/expression/bind_subquery_expression.cpp +1 -1
  105. package/src/duckdb/src/planner/binder/query_node/plan_subquery.cpp +4 -2
  106. package/src/duckdb/src/planner/binder/statement/bind_create.cpp +7 -2
  107. package/src/duckdb/src/planner/binder/statement/bind_create_table.cpp +6 -5
  108. package/src/duckdb/src/storage/checkpoint/table_data_writer.cpp +4 -2
  109. package/src/duckdb/src/storage/checkpoint_manager.cpp +4 -3
  110. package/src/duckdb/src/storage/compression/string_uncompressed.cpp +21 -10
  111. package/src/duckdb/src/storage/storage_info.cpp +2 -0
  112. package/src/duckdb/src/storage/storage_manager.cpp +2 -2
  113. package/src/duckdb/src/storage/table/row_group.cpp +5 -6
  114. package/src/duckdb/src/storage/table/scan_state.cpp +6 -0
  115. package/src/duckdb/src/transaction/duck_transaction.cpp +11 -3
  116. package/src/duckdb/src/transaction/duck_transaction_manager.cpp +2 -2
  117. package/src/duckdb/third_party/concurrentqueue/concurrentqueue.h +17 -0
  118. package/src/duckdb/ub_src_parallel.cpp +2 -0
@@ -309,6 +309,7 @@ struct PageInformation {
309
309
  idx_t offset = 0;
310
310
  idx_t row_count = 0;
311
311
  idx_t empty_count = 0;
312
+ idx_t null_count = 0;
312
313
  idx_t estimated_page_size = 0;
313
314
  };
314
315
 
@@ -388,7 +389,7 @@ protected:
388
389
  virtual unique_ptr<ColumnWriterStatistics> InitializeStatsState();
389
390
 
390
391
  //! Initialize the writer for a specific page. Only used for scalar types.
391
- virtual unique_ptr<ColumnWriterPageState> InitializePageState(BasicColumnWriterState &state);
392
+ virtual unique_ptr<ColumnWriterPageState> InitializePageState(BasicColumnWriterState &state, idx_t page_idx);
392
393
 
393
394
  //! Flushes the writer for a specific page. Only used for scalar types.
394
395
  virtual void FlushPageState(WriteStream &temp_writer, ColumnWriterPageState *state);
@@ -427,7 +428,8 @@ void BasicColumnWriter::RegisterToRowGroup(duckdb_parquet::RowGroup &row_group)
427
428
  row_group.columns.push_back(std::move(column_chunk));
428
429
  }
429
430
 
430
- unique_ptr<ColumnWriterPageState> BasicColumnWriter::InitializePageState(BasicColumnWriterState &state) {
431
+ unique_ptr<ColumnWriterPageState> BasicColumnWriter::InitializePageState(BasicColumnWriterState &state,
432
+ idx_t page_idx) {
431
433
  return nullptr;
432
434
  }
433
435
 
@@ -463,6 +465,8 @@ void BasicColumnWriter::Prepare(ColumnWriterState &state_p, ColumnWriterState *p
463
465
  state.page_info.push_back(new_info);
464
466
  page_info_ref = state.page_info.back();
465
467
  }
468
+ } else {
469
+ page_info.null_count++;
466
470
  }
467
471
  vector_index++;
468
472
  }
@@ -502,7 +506,7 @@ void BasicColumnWriter::BeginWrite(ColumnWriterState &state_p) {
502
506
  MaxValue<idx_t>(NextPowerOfTwo(page_info.estimated_page_size), MemoryStream::DEFAULT_INITIAL_CAPACITY));
503
507
  write_info.write_count = page_info.empty_count;
504
508
  write_info.max_write_count = page_info.row_count;
505
- write_info.page_state = InitializePageState(state);
509
+ write_info.page_state = InitializePageState(state, page_idx);
506
510
 
507
511
  write_info.compressed_size = 0;
508
512
  write_info.compressed_data = nullptr;
@@ -796,7 +800,6 @@ public:
796
800
  };
797
801
 
798
802
  struct BaseParquetOperator {
799
-
800
803
  template <class SRC, class TGT>
801
804
  static void WriteToStream(const TGT &input, WriteStream &ser) {
802
805
  ser.WriteData(const_data_ptr_cast(&input), sizeof(TGT));
@@ -815,6 +818,11 @@ struct BaseParquetOperator {
815
818
  template <class SRC, class TGT>
816
819
  static void HandleStats(ColumnWriterStatistics *stats, TGT target_value) {
817
820
  }
821
+
822
+ template <class SRC, class TGT>
823
+ static idx_t GetRowSize(const Vector &, idx_t) {
824
+ return sizeof(TGT);
825
+ }
818
826
  };
819
827
 
820
828
  struct ParquetCastOperator : public BaseParquetOperator {
@@ -936,6 +944,11 @@ struct ParquetStringOperator : public BaseParquetOperator {
936
944
  static uint64_t XXHash64(const TGT &target_value) {
937
945
  return duckdb_zstd::XXH64(target_value.GetData(), target_value.GetSize(), 0);
938
946
  }
947
+
948
+ template <class SRC, class TGT>
949
+ static idx_t GetRowSize(const Vector &vector, idx_t index) {
950
+ return FlatVector::GetData<string_t>(vector)[index].GetSize();
951
+ }
939
952
  };
940
953
 
941
954
  struct ParquetIntervalTargetType {
@@ -1066,6 +1079,7 @@ public:
1066
1079
  // analysis state for integer values for DELTA_BINARY_PACKED/DELTA_LENGTH_BYTE_ARRAY
1067
1080
  idx_t total_value_count = 0;
1068
1081
  idx_t total_string_size = 0;
1082
+ uint32_t key_bit_width = 0;
1069
1083
 
1070
1084
  unordered_map<T, uint32_t> dictionary;
1071
1085
  duckdb_parquet::Encoding::type encoding;
@@ -1222,11 +1236,12 @@ public:
1222
1236
  return std::move(result);
1223
1237
  }
1224
1238
 
1225
- unique_ptr<ColumnWriterPageState> InitializePageState(BasicColumnWriterState &state_p) override {
1239
+ unique_ptr<ColumnWriterPageState> InitializePageState(BasicColumnWriterState &state_p, idx_t page_idx) override {
1226
1240
  auto &state = state_p.Cast<StandardColumnWriterState<SRC>>();
1227
-
1228
- auto result = make_uniq<StandardWriterPageState<SRC, TGT>>(state.total_value_count, state.total_string_size,
1229
- state.encoding, state.dictionary);
1241
+ const auto &page_info = state_p.page_info[page_idx];
1242
+ auto result = make_uniq<StandardWriterPageState<SRC, TGT>>(
1243
+ page_info.row_count - (page_info.empty_count + page_info.null_count), state.total_string_size,
1244
+ state.encoding, state.dictionary);
1230
1245
  return std::move(result);
1231
1246
  }
1232
1247
 
@@ -1335,6 +1350,8 @@ public:
1335
1350
  }
1336
1351
  }
1337
1352
  state.dictionary.clear();
1353
+ } else {
1354
+ state.key_bit_width = RleBpDecoder::ComputeBitWidth(state.dictionary.size());
1338
1355
  }
1339
1356
  }
1340
1357
 
@@ -1488,9 +1505,13 @@ public:
1488
1505
  // bloom filter will be queued for writing in ParquetWriter::BufferBloomFilter one level up
1489
1506
  }
1490
1507
 
1491
- // TODO this now vastly over-estimates the page size
1492
1508
  idx_t GetRowSize(const Vector &vector, const idx_t index, const BasicColumnWriterState &state_p) const override {
1493
- return sizeof(TGT);
1509
+ auto &state = state_p.Cast<StandardColumnWriterState<SRC>>();
1510
+ if (state.encoding == Encoding::RLE_DICTIONARY) {
1511
+ return (state.key_bit_width + 7) / 8;
1512
+ } else {
1513
+ return OP::template GetRowSize<SRC, TGT>(vector, index);
1514
+ }
1494
1515
  }
1495
1516
  };
1496
1517
 
@@ -1570,7 +1591,7 @@ public:
1570
1591
  }
1571
1592
  }
1572
1593
 
1573
- unique_ptr<ColumnWriterPageState> InitializePageState(BasicColumnWriterState &state) override {
1594
+ unique_ptr<ColumnWriterPageState> InitializePageState(BasicColumnWriterState &state, idx_t page_idx) override {
1574
1595
  return make_uniq<BooleanWriterPageState>();
1575
1596
  }
1576
1597
 
@@ -1812,7 +1833,7 @@ public:
1812
1833
  }
1813
1834
  }
1814
1835
 
1815
- unique_ptr<ColumnWriterPageState> InitializePageState(BasicColumnWriterState &state) override {
1836
+ unique_ptr<ColumnWriterPageState> InitializePageState(BasicColumnWriterState &state, idx_t page_idx) override {
1816
1837
  return make_uniq<EnumWriterPageState>(bit_width);
1817
1838
  }
1818
1839
 
@@ -160,7 +160,6 @@ protected:
160
160
 
161
161
  private:
162
162
  void AllocateBlock(idx_t size);
163
- void AllocateCompressed(idx_t size);
164
163
  void PrepareRead(parquet_filter_t &filter);
165
164
  void PreparePage(PageHeader &page_hdr);
166
165
  void PrepareDataPage(PageHeader &page_hdr);
@@ -178,7 +177,6 @@ private:
178
177
 
179
178
  shared_ptr<ResizeableBuffer> block;
180
179
 
181
- ResizeableBuffer compressed_buffer;
182
180
  ResizeableBuffer offset_buffer;
183
181
 
184
182
  unique_ptr<RleBpDecoder> dict_decoder;
@@ -30,7 +30,6 @@ public:
30
30
  }
31
31
 
32
32
  void FinishWrite(WriteStream &writer) {
33
- D_ASSERT(count == total_value_count);
34
33
  writer.WriteData(buffer.get(), total_value_count * bit_width);
35
34
  }
36
35
 
@@ -33,9 +33,8 @@ public:
33
33
  }
34
34
 
35
35
  void FinishWrite(WriteStream &writer) {
36
- D_ASSERT(stream->GetPosition() == total_string_size);
37
36
  dbp_encoder.FinishWrite(writer);
38
- writer.WriteData(buffer.get(), total_string_size);
37
+ writer.WriteData(buffer.get(), stream->GetPosition());
39
38
  }
40
39
 
41
40
  private:
@@ -769,6 +769,12 @@ CatalogEntryLookup Catalog::TryLookupEntry(CatalogEntryRetriever &retriever, Cat
769
769
 
770
770
  if (if_not_found == OnEntryNotFound::RETURN_NULL) {
771
771
  return {nullptr, nullptr, ErrorData()};
772
+ }
773
+ // Check if the default database is actually attached. CreateMissingEntryException will throw binder exception
774
+ // otherwise.
775
+ if (!GetCatalogEntry(context, GetDefaultCatalog(retriever))) {
776
+ auto except = CatalogException("%s with name %s does not exist!", CatalogTypeToString(type), name);
777
+ return {nullptr, nullptr, ErrorData(except)};
772
778
  } else {
773
779
  auto except = CreateMissingEntryException(retriever, name, type, schemas, error_context);
774
780
  return {nullptr, nullptr, ErrorData(except)};
@@ -805,6 +811,12 @@ CatalogEntryLookup Catalog::TryLookupEntry(CatalogEntryRetriever &retriever, vec
805
811
 
806
812
  if (if_not_found == OnEntryNotFound::RETURN_NULL) {
807
813
  return {nullptr, nullptr, ErrorData()};
814
+ }
815
+ // Check if the default database is actually attached. CreateMissingEntryException will throw binder exception
816
+ // otherwise.
817
+ if (!GetCatalogEntry(context, GetDefaultCatalog(retriever))) {
818
+ auto except = CatalogException("%s with name %s does not exist!", CatalogTypeToString(type), name);
819
+ return {nullptr, nullptr, ErrorData(except)};
808
820
  } else {
809
821
  auto except = CreateMissingEntryException(retriever, name, type, schemas, error_context);
810
822
  return {nullptr, nullptr, ErrorData(except)};
@@ -863,7 +863,7 @@ unique_ptr<CatalogEntry> DuckTableEntry::Copy(ClientContext &context) const {
863
863
  }
864
864
 
865
865
  auto binder = Binder::CreateBinder(context);
866
- auto bound_create_info = binder->BindCreateTableInfo(std::move(create_info), schema);
866
+ auto bound_create_info = binder->BindCreateTableCheckpoint(std::move(create_info), schema);
867
867
  return make_uniq<DuckTableEntry>(catalog, schema, *bound_create_info, storage);
868
868
  }
869
869
 
@@ -76,7 +76,7 @@ void CatalogEntryRetriever::Inherit(const CatalogEntryRetriever &parent) {
76
76
  this->search_path = parent.search_path;
77
77
  }
78
78
 
79
- CatalogSearchPath &CatalogEntryRetriever::GetSearchPath() {
79
+ const CatalogSearchPath &CatalogEntryRetriever::GetSearchPath() const {
80
80
  if (search_path) {
81
81
  return *search_path;
82
82
  }
@@ -189,11 +189,11 @@ void CatalogSearchPath::Set(CatalogSearchEntry new_value, CatalogSetPathType set
189
189
  Set(std::move(new_paths), set_type);
190
190
  }
191
191
 
192
- const vector<CatalogSearchEntry> &CatalogSearchPath::Get() {
192
+ const vector<CatalogSearchEntry> &CatalogSearchPath::Get() const {
193
193
  return paths;
194
194
  }
195
195
 
196
- string CatalogSearchPath::GetDefaultSchema(const string &catalog) {
196
+ string CatalogSearchPath::GetDefaultSchema(const string &catalog) const {
197
197
  for (auto &path : paths) {
198
198
  if (path.catalog == TEMP_CATALOG) {
199
199
  continue;
@@ -205,7 +205,7 @@ string CatalogSearchPath::GetDefaultSchema(const string &catalog) {
205
205
  return DEFAULT_SCHEMA;
206
206
  }
207
207
 
208
- string CatalogSearchPath::GetDefaultSchema(ClientContext &context, const string &catalog) {
208
+ string CatalogSearchPath::GetDefaultSchema(ClientContext &context, const string &catalog) const {
209
209
  for (auto &path : paths) {
210
210
  if (path.catalog == TEMP_CATALOG) {
211
211
  continue;
@@ -221,7 +221,7 @@ string CatalogSearchPath::GetDefaultSchema(ClientContext &context, const string
221
221
  return DEFAULT_SCHEMA;
222
222
  }
223
223
 
224
- string CatalogSearchPath::GetDefaultCatalog(const string &schema) {
224
+ string CatalogSearchPath::GetDefaultCatalog(const string &schema) const {
225
225
  if (DefaultSchemaGenerator::IsDefaultSchema(schema)) {
226
226
  return SYSTEM_CATALOG;
227
227
  }
@@ -236,7 +236,7 @@ string CatalogSearchPath::GetDefaultCatalog(const string &schema) {
236
236
  return INVALID_CATALOG;
237
237
  }
238
238
 
239
- vector<string> CatalogSearchPath::GetCatalogsForSchema(const string &schema) {
239
+ vector<string> CatalogSearchPath::GetCatalogsForSchema(const string &schema) const {
240
240
  vector<string> catalogs;
241
241
  if (DefaultSchemaGenerator::IsDefaultSchema(schema)) {
242
242
  catalogs.push_back(SYSTEM_CATALOG);
@@ -250,7 +250,7 @@ vector<string> CatalogSearchPath::GetCatalogsForSchema(const string &schema) {
250
250
  return catalogs;
251
251
  }
252
252
 
253
- vector<string> CatalogSearchPath::GetSchemasForCatalog(const string &catalog) {
253
+ vector<string> CatalogSearchPath::GetSchemasForCatalog(const string &catalog) const {
254
254
  vector<string> schemas;
255
255
  for (auto &path : paths) {
256
256
  if (StringUtil::CIEquals(path.catalog, catalog)) {
@@ -260,7 +260,7 @@ vector<string> CatalogSearchPath::GetSchemasForCatalog(const string &catalog) {
260
260
  return schemas;
261
261
  }
262
262
 
263
- const CatalogSearchEntry &CatalogSearchPath::GetDefault() {
263
+ const CatalogSearchEntry &CatalogSearchPath::GetDefault() const {
264
264
  const auto &paths = Get();
265
265
  D_ASSERT(paths.size() >= 2);
266
266
  return paths[1];
@@ -281,7 +281,7 @@ void CatalogSearchPath::SetPathsInternal(vector<CatalogSearchEntry> new_paths) {
281
281
  }
282
282
 
283
283
  bool CatalogSearchPath::SchemaInSearchPath(ClientContext &context, const string &catalog_name,
284
- const string &schema_name) {
284
+ const string &schema_name) const {
285
285
  for (auto &path : paths) {
286
286
  if (!StringUtil::CIEquals(path.schema, schema_name)) {
287
287
  continue;
@@ -56,6 +56,9 @@ vector<bool> ParseColumnList(const Value &value, vector<string> &names, const st
56
56
  }
57
57
  throw BinderException("\"%s\" expects a column list or * as parameter", loption);
58
58
  }
59
+ if (value.IsNull()) {
60
+ throw BinderException("\"%s\" expects a column list or * as parameter, it can't be a NULL value", loption);
61
+ }
59
62
  auto &children = ListValue::GetChildren(value);
60
63
  // accept '*' as single argument
61
64
  if (children.size() == 1 && children[0].type().id() == LogicalTypeId::VARCHAR &&
@@ -31,6 +31,8 @@ void CompressedFile::Initialize(bool write) {
31
31
  stream_data.out_buff_start = stream_data.out_buff.get();
32
32
  stream_data.out_buff_end = stream_data.out_buff.get();
33
33
 
34
+ current_position = 0;
35
+
34
36
  stream_wrapper = compressed_fs.CreateStream();
35
37
  stream_wrapper->Initialize(*this, write);
36
38
  }
@@ -245,7 +245,7 @@ static void TemplatedGetHivePartitionValues(Vector &input, vector<HivePartitionK
245
245
 
246
246
  const auto &type = input.GetType();
247
247
 
248
- const auto reinterpret = Value::CreateValue<T>(data[0]).GetTypeMutable() != type;
248
+ const auto reinterpret = Value::CreateValue<T>(data[sel.get_index(0)]).GetTypeMutable() != type;
249
249
  if (reinterpret) {
250
250
  for (idx_t i = 0; i < count; i++) {
251
251
  auto &key = keys[i];
@@ -508,14 +508,14 @@ void MultiFileReader::CreateMapping(const string &file_name,
508
508
  // copy global columns and inject any different defaults
509
509
  CreateColumnMapping(file_name, local_columns, global_columns, global_column_ids, reader_data, bind_data,
510
510
  initial_file, global_state);
511
- CreateFilterMap(global_columns, filters, reader_data, global_state);
511
+ CreateFilterMap(global_column_ids, filters, reader_data, global_state);
512
512
  }
513
513
 
514
- void MultiFileReader::CreateFilterMap(const vector<MultiFileReaderColumnDefinition> &global_columns,
514
+ void MultiFileReader::CreateFilterMap(const vector<ColumnIndex> &global_column_ids,
515
515
  optional_ptr<TableFilterSet> filters, MultiFileReaderData &reader_data,
516
516
  optional_ptr<MultiFileReaderGlobalState> global_state) {
517
517
  if (filters) {
518
- auto filter_map_size = global_columns.size();
518
+ auto filter_map_size = global_column_ids.size();
519
519
  if (global_state) {
520
520
  filter_map_size += global_state->extra_columns.size();
521
521
  }
@@ -329,7 +329,7 @@ optional_idx GroupedAggregateHashTable::TryAddDictionaryGroups(DataChunk &groups
329
329
  if (dictionary_id.empty()) {
330
330
  // dictionary has no id, we can't cache across vectors
331
331
  // only use dictionary compression if there are fewer entries than groups
332
- if (dict_size >= groups.size() * DICTIONARY_THRESHOLD) {
332
+ if (dict_size * DICTIONARY_THRESHOLD >= groups.size()) {
333
333
  // dictionary is too large - use regular aggregation
334
334
  return optional_idx();
335
335
  }
@@ -1038,9 +1038,11 @@ string ART::GenerateConstraintErrorMessage(VerifyExistenceType verify_type, cons
1038
1038
  }
1039
1039
  case VerifyExistenceType::DELETE_FK: {
1040
1040
  // DELETE_FK that still exists in a FK table, i.e., not a valid delete.
1041
- return StringUtil::Format("Violates foreign key constraint because key \"%s\" is still referenced by a foreign "
1042
- "key in a different table",
1043
- key_name);
1041
+ return StringUtil::Format(
1042
+ "Violates foreign key constraint because key \"%s\" is still referenced by a foreign "
1043
+ "key in a different table. If this is an unexpected constraint violation, please refer to our "
1044
+ "foreign key limitations in the documentation",
1045
+ key_name);
1044
1046
  }
1045
1047
  default:
1046
1048
  throw NotImplementedException("Type not implemented for VerifyExistenceType");
@@ -1091,16 +1093,27 @@ void ART::VerifyLeaf(const Node &leaf, const ARTKey &key, optional_ptr<ART> dele
1091
1093
  return;
1092
1094
  }
1093
1095
 
1096
+ // Fast path for FOREIGN KEY constraints.
1097
+ // Up to here, the above code paths work implicitly for FKs, as the leaf is inlined.
1094
1098
  // FIXME: proper foreign key + delete ART support.
1095
- // This implicitly works for foreign keys, as we do not have to consider the actual row IDs.
1096
- // We only need to know that there are conflicts (for now), as we still perform over-eager constraint checking.
1099
+ if (index_constraint_type == IndexConstraintType::FOREIGN) {
1100
+ D_ASSERT(!deleted_leaf);
1101
+ // We don't handle FK conflicts in UPSERT, so the row ID should not matter.
1102
+ if (manager.AddHit(i, MAX_ROW_ID)) {
1103
+ conflict_idx = i;
1104
+ }
1105
+ return;
1106
+ }
1097
1107
 
1098
1108
  // Scan the two row IDs in the leaf.
1099
1109
  Iterator it(*this);
1100
1110
  it.FindMinimum(leaf);
1101
1111
  ARTKey empty_key = ARTKey();
1102
1112
  unsafe_vector<row_t> row_ids;
1103
- it.Scan(empty_key, 2, row_ids, false);
1113
+ auto success = it.Scan(empty_key, 2, row_ids, false);
1114
+ if (!success || row_ids.size() != 2) {
1115
+ throw InternalException("VerifyLeaf expects exactly two row IDs to be scanned");
1116
+ }
1104
1117
 
1105
1118
  if (!deleted_leaf) {
1106
1119
  if (manager.AddHit(i, row_ids[0]) || manager.AddHit(i, row_ids[1])) {
@@ -46,9 +46,11 @@ bool Iterator::Scan(const ARTKey &upper_bound, const idx_t max_count, unsafe_vec
46
46
  bool has_next;
47
47
  do {
48
48
  // An empty upper bound indicates that no upper bound exists.
49
- if (!upper_bound.Empty() && status == GateStatus::GATE_NOT_SET) {
50
- if (current_key.GreaterThan(upper_bound, equal, nested_depth)) {
51
- return true;
49
+ if (!upper_bound.Empty()) {
50
+ if (status == GateStatus::GATE_NOT_SET || entered_nested_leaf) {
51
+ if (current_key.GreaterThan(upper_bound, equal, nested_depth)) {
52
+ return true;
53
+ }
52
54
  }
53
55
  }
54
56
 
@@ -86,6 +88,7 @@ bool Iterator::Scan(const ARTKey &upper_bound, const idx_t max_count, unsafe_vec
86
88
  throw InternalException("Invalid leaf type for index scan.");
87
89
  }
88
90
 
91
+ entered_nested_leaf = false;
89
92
  has_next = Next();
90
93
  } while (has_next);
91
94
  return true;
@@ -104,6 +107,7 @@ void Iterator::FindMinimum(const Node &node) {
104
107
  if (node.GetGateStatus() == GateStatus::GATE_SET) {
105
108
  D_ASSERT(status == GateStatus::GATE_NOT_SET);
106
109
  status = GateStatus::GATE_SET;
110
+ entered_nested_leaf = true;
107
111
  nested_depth = 0;
108
112
  }
109
113
 
@@ -575,6 +575,11 @@ public:
575
575
 
576
576
  explicit WindowLocalSourceState(WindowGlobalSourceState &gsource);
577
577
 
578
+ void ReleaseLocalStates() {
579
+ auto &local_states = window_hash_group->thread_states.at(task->thread_idx);
580
+ local_states.clear();
581
+ }
582
+
578
583
  //! Does the task have more work to do?
579
584
  bool TaskFinished() const {
580
585
  return !task || task->begin_idx == task->end_idx;
@@ -792,6 +797,12 @@ void WindowGlobalSourceState::FinishTask(TaskPtr task) {
792
797
  }
793
798
 
794
799
  bool WindowLocalSourceState::TryAssignTask() {
800
+ D_ASSERT(TaskFinished());
801
+ if (task && task->stage == WindowGroupStage::GETDATA) {
802
+ // If this state completed the last block in the previous iteration,
803
+ // release out local state memory.
804
+ ReleaseLocalStates();
805
+ }
795
806
  // Because downstream operators may be using our internal buffers,
796
807
  // we can't "finish" a task until we are about to get the next one.
797
808
 
@@ -888,10 +899,6 @@ void WindowLocalSourceState::GetData(DataChunk &result) {
888
899
  ++task->begin_idx;
889
900
  }
890
901
 
891
- // If that was the last block, release out local state memory.
892
- if (TaskFinished()) {
893
- local_states.clear();
894
- }
895
902
  result.Verify();
896
903
  }
897
904
 
@@ -4,7 +4,7 @@
4
4
  namespace duckdb {
5
5
 
6
6
  CSVBuffer::CSVBuffer(ClientContext &context, idx_t buffer_size_p, CSVFileHandle &file_handle,
7
- idx_t &global_csv_current_position, idx_t file_number_p)
7
+ const idx_t &global_csv_current_position, idx_t file_number_p)
8
8
  : context(context), requested_size(buffer_size_p), file_number(file_number_p), can_seek(file_handle.CanSeek()),
9
9
  is_pipe(file_handle.IsPipe()) {
10
10
  AllocateBuffer(buffer_size_p);
@@ -34,7 +34,7 @@ CSVBuffer::CSVBuffer(CSVFileHandle &file_handle, ClientContext &context, idx_t b
34
34
  }
35
35
 
36
36
  shared_ptr<CSVBuffer> CSVBuffer::Next(CSVFileHandle &file_handle, idx_t buffer_size, idx_t file_number_p,
37
- bool &has_seaked) {
37
+ bool &has_seaked) const {
38
38
  if (has_seaked) {
39
39
  // This means that at some point a reload was done, and we are currently on the incorrect position in our file
40
40
  // handle
@@ -36,7 +36,7 @@ void CSVEncoderBuffer::Reset() {
36
36
  actual_encoded_buffer_size = 0;
37
37
  }
38
38
 
39
- CSVEncoder::CSVEncoder(DBConfig &config, const string &encoding_name_to_find, idx_t buffer_size) {
39
+ CSVEncoder::CSVEncoder(const DBConfig &config, const string &encoding_name_to_find, idx_t buffer_size) {
40
40
  encoding_name = StringUtil::Lower(encoding_name_to_find);
41
41
  auto function = config.GetEncodeFunction(encoding_name_to_find);
42
42
  if (!function) {
@@ -51,6 +51,10 @@ CSVEncoder::CSVEncoder(DBConfig &config, const string &encoding_name_to_find, id
51
51
  }
52
52
  // We ensure that the encoded buffer size is an even number to make the two byte lookup on utf-16 work
53
53
  idx_t encoded_buffer_size = buffer_size % 2 != 0 ? buffer_size - 1 : buffer_size;
54
+ if (encoded_buffer_size == 0) {
55
+ // This might happen if buffer size = 1
56
+ encoded_buffer_size = 2;
57
+ }
54
58
  D_ASSERT(encoded_buffer_size > 0);
55
59
  encoded_buffer.Initialize(encoded_buffer_size);
56
60
  remaining_bytes_buffer.Initialize(function->GetBytesPerIteration());
@@ -11,9 +11,10 @@ ScannerResult::ScannerResult(CSVStates &states_p, CSVStateMachine &state_machine
11
11
 
12
12
  BaseScanner::BaseScanner(shared_ptr<CSVBufferManager> buffer_manager_p, shared_ptr<CSVStateMachine> state_machine_p,
13
13
  shared_ptr<CSVErrorHandler> error_handler_p, bool sniffing_p,
14
- shared_ptr<CSVFileScan> csv_file_scan_p, CSVIterator iterator_p)
14
+ shared_ptr<CSVFileScan> csv_file_scan_p, const CSVIterator &iterator_p)
15
15
  : csv_file_scan(std::move(csv_file_scan_p)), sniffing(sniffing_p), error_handler(std::move(error_handler_p)),
16
- state_machine(std::move(state_machine_p)), buffer_manager(std::move(buffer_manager_p)), iterator(iterator_p) {
16
+ state_machine(std::move(state_machine_p)), states(), buffer_manager(std::move(buffer_manager_p)),
17
+ iterator(iterator_p) {
17
18
  D_ASSERT(buffer_manager);
18
19
  D_ASSERT(state_machine);
19
20
  // Initialize current buffer handle
@@ -76,8 +76,8 @@ void CSVSchema::MergeSchemas(CSVSchema &other, bool null_padding) {
76
76
  }
77
77
  }
78
78
 
79
- CSVSchema::CSVSchema(vector<string> &names, vector<LogicalType> &types, const string &file_path, idx_t rows_read_p,
80
- const bool empty_p)
79
+ CSVSchema::CSVSchema(const vector<string> &names, const vector<LogicalType> &types, const string &file_path,
80
+ idx_t rows_read_p, const bool empty_p)
81
81
  : rows_read(rows_read_p), empty(empty_p) {
82
82
  Initialize(names, types, file_path);
83
83
  }
@@ -13,7 +13,7 @@ CSVBoundary::CSVBoundary(idx_t buffer_idx_p, idx_t buffer_pos_p, idx_t boundary_
13
13
  CSVBoundary::CSVBoundary() : buffer_idx(0), buffer_pos(0), boundary_idx(0), end_pos(NumericLimits<idx_t>::Maximum()) {
14
14
  }
15
15
 
16
- CSVIterator::CSVIterator() : is_set(false) {
16
+ CSVIterator::CSVIterator() : buffer_size(0), is_set(false) {
17
17
  }
18
18
 
19
19
  void CSVBoundary::Print() const {
@@ -688,23 +688,29 @@ bool LineError::HandleErrors(StringValueResult &result) {
688
688
  line_pos.GetGlobalPosition(result.requested_size), result.path);
689
689
  }
690
690
  break;
691
- case CAST_ERROR:
691
+ case CAST_ERROR: {
692
+ string column_name;
693
+ LogicalTypeId type_id;
694
+ if (cur_error.col_idx < result.names.size()) {
695
+ column_name = result.names[cur_error.col_idx];
696
+ }
697
+ if (cur_error.col_idx < result.number_of_columns) {
698
+ type_id = result.parse_types[cur_error.chunk_idx].type_id;
699
+ }
692
700
  if (result.current_line_position.begin == line_pos) {
693
701
  csv_error = CSVError::CastError(
694
- result.state_machine.options, result.names[cur_error.col_idx], cur_error.error_message,
695
- cur_error.col_idx, borked_line, lines_per_batch,
702
+ result.state_machine.options, column_name, cur_error.error_message, cur_error.col_idx, borked_line,
703
+ lines_per_batch,
696
704
  result.current_line_position.begin.GetGlobalPosition(result.requested_size, first_nl),
697
- line_pos.GetGlobalPosition(result.requested_size, first_nl),
698
- result.parse_types[cur_error.chunk_idx].type_id, result.path);
705
+ line_pos.GetGlobalPosition(result.requested_size, first_nl), type_id, result.path);
699
706
  } else {
700
707
  csv_error = CSVError::CastError(
701
- result.state_machine.options, result.names[cur_error.col_idx], cur_error.error_message,
702
- cur_error.col_idx, borked_line, lines_per_batch,
708
+ result.state_machine.options, column_name, cur_error.error_message, cur_error.col_idx, borked_line,
709
+ lines_per_batch,
703
710
  result.current_line_position.begin.GetGlobalPosition(result.requested_size, first_nl),
704
- line_pos.GetGlobalPosition(result.requested_size), result.parse_types[cur_error.chunk_idx].type_id,
705
- result.path);
711
+ line_pos.GetGlobalPosition(result.requested_size), type_id, result.path);
706
712
  }
707
- break;
713
+ } break;
708
714
  case MAXIMUM_LINE_SIZE:
709
715
  csv_error = CSVError::LineSizeError(
710
716
  result.state_machine.options, lines_per_batch, borked_line,
@@ -964,7 +970,8 @@ StringValueScanner::StringValueScanner(idx_t scanner_idx_p, const shared_ptr<CSV
964
970
  result(states, *state_machine, cur_buffer_handle, BufferAllocator::Get(buffer_manager->context), result_size,
965
971
  iterator.pos.buffer_pos, *error_handler, iterator,
966
972
  buffer_manager->context.client_data->debug_set_max_line_length, csv_file_scan, lines_read, sniffing,
967
- buffer_manager->GetFilePath(), scanner_idx_p) {
973
+ buffer_manager->GetFilePath(), scanner_idx_p),
974
+ start_pos(0) {
968
975
  iterator.buffer_size = state_machine->options.buffer_size_option.GetValue();
969
976
  }
970
977
 
@@ -976,7 +983,8 @@ StringValueScanner::StringValueScanner(const shared_ptr<CSVBufferManager> &buffe
976
983
  result(states, *state_machine, cur_buffer_handle, Allocator::DefaultAllocator(), result_size,
977
984
  iterator.pos.buffer_pos, *error_handler, iterator,
978
985
  buffer_manager->context.client_data->debug_set_max_line_length, csv_file_scan, lines_read, sniffing,
979
- buffer_manager->GetFilePath(), 0) {
986
+ buffer_manager->GetFilePath(), 0),
987
+ start_pos(0) {
980
988
  iterator.buffer_size = state_machine->options.buffer_size_option.GetValue();
981
989
  }
982
990