duckdb 1.4.3-dev0.0 → 1.4.3

This diff shows the changes between publicly available package versions as released to their respective public registries. It is provided for informational purposes only.
Files changed (92)
  1. package/package.json +1 -1
  2. package/src/duckdb/extension/core_functions/aggregate/holistic/approximate_quantile.cpp +1 -1
  3. package/src/duckdb/extension/icu/icu_extension.cpp +14 -5
  4. package/src/duckdb/extension/parquet/column_writer.cpp +4 -4
  5. package/src/duckdb/extension/parquet/include/writer/templated_column_writer.hpp +12 -4
  6. package/src/duckdb/src/common/encryption_key_manager.cpp +4 -0
  7. package/src/duckdb/src/common/local_file_system.cpp +23 -0
  8. package/src/duckdb/src/common/types/column/column_data_collection.cpp +6 -0
  9. package/src/duckdb/src/common/types/conflict_manager.cpp +1 -1
  10. package/src/duckdb/src/execution/index/art/base_node.cpp +3 -1
  11. package/src/duckdb/src/execution/index/art/prefix.cpp +5 -8
  12. package/src/duckdb/src/execution/index/bound_index.cpp +68 -25
  13. package/src/duckdb/src/execution/index/unbound_index.cpp +21 -10
  14. package/src/duckdb/src/execution/operator/csv_scanner/scanner/base_scanner.cpp +4 -0
  15. package/src/duckdb/src/execution/operator/csv_scanner/scanner/string_value_scanner.cpp +36 -28
  16. package/src/duckdb/src/execution/operator/csv_scanner/sniffer/csv_sniffer.cpp +3 -2
  17. package/src/duckdb/src/execution/operator/csv_scanner/sniffer/type_detection.cpp +12 -6
  18. package/src/duckdb/src/execution/operator/scan/physical_positional_scan.cpp +8 -4
  19. package/src/duckdb/src/execution/operator/scan/physical_table_scan.cpp +1 -1
  20. package/src/duckdb/src/execution/physical_plan/plan_aggregate.cpp +4 -3
  21. package/src/duckdb/src/execution/physical_plan/plan_distinct.cpp +3 -2
  22. package/src/duckdb/src/execution/physical_plan/plan_filter.cpp +0 -1
  23. package/src/duckdb/src/execution/physical_plan/plan_window.cpp +6 -8
  24. package/src/duckdb/src/function/aggregate/sorted_aggregate_function.cpp +4 -3
  25. package/src/duckdb/src/function/macro_function.cpp +20 -2
  26. package/src/duckdb/src/function/table/system/duckdb_log.cpp +3 -0
  27. package/src/duckdb/src/function/table/system/test_all_types.cpp +26 -13
  28. package/src/duckdb/src/function/table/table_scan.cpp +72 -38
  29. package/src/duckdb/src/function/table/version/pragma_version.cpp +3 -3
  30. package/src/duckdb/src/function/table_function.cpp +24 -0
  31. package/src/duckdb/src/include/duckdb/common/encryption_key_manager.hpp +1 -0
  32. package/src/duckdb/src/include/duckdb/common/limits.hpp +4 -2
  33. package/src/duckdb/src/include/duckdb/common/local_file_system.hpp +2 -0
  34. package/src/duckdb/src/include/duckdb/common/types/row/block_iterator.hpp +2 -0
  35. package/src/duckdb/src/include/duckdb/execution/index/art/art_operator.hpp +2 -0
  36. package/src/duckdb/src/include/duckdb/execution/index/art/prefix.hpp +2 -2
  37. package/src/duckdb/src/include/duckdb/execution/index/bound_index.hpp +1 -1
  38. package/src/duckdb/src/include/duckdb/execution/index/unbound_index.hpp +41 -7
  39. package/src/duckdb/src/include/duckdb/execution/operator/csv_scanner/base_scanner.hpp +15 -1
  40. package/src/duckdb/src/include/duckdb/execution/operator/csv_scanner/sniffer/csv_sniffer.hpp +1 -0
  41. package/src/duckdb/src/include/duckdb/execution/operator/csv_scanner/string_value_scanner.hpp +2 -1
  42. package/src/duckdb/src/include/duckdb/execution/physical_plan_generator.hpp +3 -1
  43. package/src/duckdb/src/include/duckdb/function/function_binder.hpp +2 -1
  44. package/src/duckdb/src/include/duckdb/function/table_function.hpp +2 -0
  45. package/src/duckdb/src/include/duckdb/main/db_instance_cache.hpp +5 -0
  46. package/src/duckdb/src/include/duckdb/main/extension_entries.hpp +2 -0
  47. package/src/duckdb/src/include/duckdb/optimizer/filter_combiner.hpp +1 -0
  48. package/src/duckdb/src/include/duckdb/optimizer/join_order/relation_manager.hpp +4 -4
  49. package/src/duckdb/src/include/duckdb/optimizer/rule/ordered_aggregate_optimizer.hpp +3 -1
  50. package/src/duckdb/src/include/duckdb/parser/parsed_data/sample_options.hpp +3 -0
  51. package/src/duckdb/src/include/duckdb/planner/binder.hpp +1 -1
  52. package/src/duckdb/src/include/duckdb/planner/bound_result_modifier.hpp +4 -2
  53. package/src/duckdb/src/include/duckdb/planner/expression_binder.hpp +1 -2
  54. package/src/duckdb/src/include/duckdb/planner/subquery/flatten_dependent_join.hpp +1 -1
  55. package/src/duckdb/src/include/duckdb/planner/subquery/rewrite_cte_scan.hpp +3 -1
  56. package/src/duckdb/src/include/duckdb/storage/table/chunk_info.hpp +3 -3
  57. package/src/duckdb/src/include/duckdb/storage/table/row_group.hpp +2 -6
  58. package/src/duckdb/src/include/duckdb/storage/table/row_version_manager.hpp +4 -1
  59. package/src/duckdb/src/include/duckdb/storage/table/validity_column_data.hpp +2 -0
  60. package/src/duckdb/src/logging/log_storage.cpp +17 -23
  61. package/src/duckdb/src/main/capi/duckdb-c.cpp +1 -1
  62. package/src/duckdb/src/main/connection.cpp +0 -5
  63. package/src/duckdb/src/main/database_manager.cpp +12 -9
  64. package/src/duckdb/src/main/db_instance_cache.cpp +15 -1
  65. package/src/duckdb/src/main/extension/extension_alias.cpp +1 -0
  66. package/src/duckdb/src/optimizer/filter_combiner.cpp +38 -4
  67. package/src/duckdb/src/optimizer/join_order/relation_manager.cpp +15 -15
  68. package/src/duckdb/src/optimizer/late_materialization.cpp +5 -0
  69. package/src/duckdb/src/optimizer/rule/ordered_aggregate_optimizer.cpp +6 -3
  70. package/src/duckdb/src/parser/transform/helpers/transform_sample.cpp +3 -2
  71. package/src/duckdb/src/planner/binder/expression/bind_star_expression.cpp +1 -1
  72. package/src/duckdb/src/planner/binder/query_node/plan_select_node.cpp +1 -1
  73. package/src/duckdb/src/planner/binder/statement/bind_copy.cpp +4 -1
  74. package/src/duckdb/src/planner/binder/statement/bind_insert.cpp +17 -10
  75. package/src/duckdb/src/planner/binder.cpp +3 -3
  76. package/src/duckdb/src/planner/bound_result_modifier.cpp +22 -5
  77. package/src/duckdb/src/planner/expression/bound_function_expression.cpp +4 -1
  78. package/src/duckdb/src/planner/expression_binder/constant_binder.cpp +1 -1
  79. package/src/duckdb/src/planner/expression_binder.cpp +1 -2
  80. package/src/duckdb/src/planner/subquery/flatten_dependent_join.cpp +57 -24
  81. package/src/duckdb/src/planner/subquery/rewrite_cte_scan.cpp +5 -3
  82. package/src/duckdb/src/storage/checkpoint/table_data_writer.cpp +9 -0
  83. package/src/duckdb/src/storage/storage_info.cpp +2 -0
  84. package/src/duckdb/src/storage/table/chunk_info.cpp +3 -3
  85. package/src/duckdb/src/storage/table/column_data.cpp +5 -1
  86. package/src/duckdb/src/storage/table/column_data_checkpointer.cpp +1 -1
  87. package/src/duckdb/src/storage/table/column_segment.cpp +3 -1
  88. package/src/duckdb/src/storage/table/row_group.cpp +6 -8
  89. package/src/duckdb/src/storage/table/row_group_collection.cpp +41 -1
  90. package/src/duckdb/src/storage/table/row_version_manager.cpp +37 -23
  91. package/src/duckdb/src/storage/table/standard_column_data.cpp +5 -5
  92. package/src/duckdb/src/storage/table/validity_column_data.cpp +17 -0
package/package.json CHANGED
@@ -2,7 +2,7 @@
   "name": "duckdb",
   "main": "./lib/duckdb.js",
   "types": "./lib/duckdb.d.ts",
-  "version": "1.4.3-dev0.0",
+  "version": "1.4.3",
   "description": "DuckDB node.js API",
   "gypfile": true,
   "dependencies": {
package/src/duckdb/extension/core_functions/aggregate/holistic/approximate_quantile.cpp CHANGED
@@ -355,11 +355,11 @@ AggregateFunction GetApproxQuantileListAggregateFunction(const LogicalType &type
         return GetTypedApproxQuantileListAggregateFunction<int16_t, int16_t>(type);
     case LogicalTypeId::INTEGER:
     case LogicalTypeId::DATE:
-    case LogicalTypeId::TIME:
         return GetTypedApproxQuantileListAggregateFunction<int32_t, int32_t>(type);
     case LogicalTypeId::BIGINT:
     case LogicalTypeId::TIMESTAMP:
     case LogicalTypeId::TIMESTAMP_TZ:
+    case LogicalTypeId::TIME:
         return GetTypedApproxQuantileListAggregateFunction<int64_t, int64_t>(type);
     case LogicalTypeId::TIME_TZ:
         // Not binary comparable
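
Note: this reorder routes TIME through the 64-bit template instantiation. DuckDB represents TIME internally as a 64-bit count of microseconds since midnight, so the old placement in the int32_t branch truncated the value. A minimal stand-alone illustration of that truncation (not DuckDB code):

    #include <cstdint>
    #include <iostream>

    int main() {
        // 13:00:00 as microseconds since midnight, the representation TIME uses internally.
        const int64_t time_micros = 13LL * 60 * 60 * 1000000; // 46,800,000,000
        // Forcing the value through a 32-bit path, as the old int32_t
        // instantiation effectively did, silently wraps it.
        const int32_t truncated = static_cast<int32_t>(time_micros);
        std::cout << time_micros << " != " << truncated << "\n";
    }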
package/src/duckdb/extension/icu/icu_extension.cpp CHANGED
@@ -230,8 +230,16 @@ static string NormalizeTimeZone(const string &tz_str) {
     }
 
     idx_t pos = 3;
-    const auto sign = tz_str[pos++];
-    if (sign != '+' && sign != '-') {
+    const auto utc = tz_str[pos++];
+    // Invert the sign (UTC and Etc use opposite sign conventions)
+    // https://en.wikipedia.org/wiki/Tz_database#Area
+    auto sign = utc;
+    if (utc == '+') {
+        sign = '-';
+        ;
+    } else if (utc == '-') {
+        sign = '+';
+    } else {
         break;
     }
 
@@ -424,12 +432,13 @@ static void LoadInternal(ExtensionLoader &loader) {
     auto locales = icu::Collator::getAvailableLocales(count);
     for (int32_t i = 0; i < count; i++) {
         string collation;
-        if (string(locales[i].getCountry()).empty()) {
+        const auto &locale = locales[i]; // NOLINT
+        if (string(locale.getCountry()).empty()) {
             // language only
-            collation = locales[i].getLanguage();
+            collation = locale.getLanguage();
         } else {
             // language + country
-            collation = locales[i].getLanguage() + string("_") + locales[i].getCountry();
+            collation = locale.getLanguage() + string("_") + locale.getCountry();
         }
         collation = StringUtil::Lower(collation);
 
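
Note: the NormalizeTimeZone change maps aliases such as "UTC+5" onto the tz database's "Etc/GMT-5". The Etc area uses POSIX sign conventions, the opposite of everyday usage, hence the sign flip. A stand-alone sketch of the same mapping (hypothetical helper, not the extension's API):

    #include <string>

    // "UTC+5" -> "Etc/GMT-5": Etc/GMT zones negate east-of-Greenwich offsets.
    std::string NormalizeUtcAlias(const std::string &tz) {
        if (tz.rfind("UTC", 0) != 0 || tz.size() < 4) {
            return tz; // not a UTC+n/UTC-n alias; leave unchanged
        }
        const char utc_sign = tz[3];
        if (utc_sign != '+' && utc_sign != '-') {
            return tz;
        }
        const char etc_sign = utc_sign == '+' ? '-' : '+';
        return "Etc/GMT" + std::string(1, etc_sign) + tz.substr(4);
    }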
package/src/duckdb/extension/parquet/column_writer.cpp CHANGED
@@ -534,10 +534,10 @@ ColumnWriter::CreateWriterRecursive(ClientContext &context, ParquetWriter &write
 template <>
 struct NumericLimits<float_na_equal> {
     static constexpr float Minimum() {
-        return std::numeric_limits<float>::lowest();
+        return NumericLimits<float>::Minimum();
     };
     static constexpr float Maximum() {
-        return std::numeric_limits<float>::max();
+        return NumericLimits<float>::Maximum();
     };
     static constexpr bool IsSigned() {
         return std::is_signed<float>::value;
@@ -550,10 +550,10 @@ struct NumericLimits<float_na_equal> {
 template <>
 struct NumericLimits<double_na_equal> {
     static constexpr double Minimum() {
-        return std::numeric_limits<double>::lowest();
+        return NumericLimits<double>::Minimum();
     };
     static constexpr double Maximum() {
-        return std::numeric_limits<double>::max();
+        return NumericLimits<double>::Maximum();
     };
     static constexpr bool IsSigned() {
         return std::is_signed<double>::value;
package/src/duckdb/extension/parquet/include/writer/templated_column_writer.hpp CHANGED
@@ -126,7 +126,8 @@ public:
 public:
     unique_ptr<ColumnWriterState> InitializeWriteState(duckdb_parquet::RowGroup &row_group) override {
         auto result = make_uniq<StandardColumnWriterState<SRC, TGT, OP>>(writer, row_group, row_group.columns.size());
-        result->encoding = duckdb_parquet::Encoding::RLE_DICTIONARY;
+        result->encoding = writer.GetParquetVersion() == ParquetVersion::V1 ? duckdb_parquet::Encoding::PLAIN_DICTIONARY
+                                                                            : duckdb_parquet::Encoding::RLE_DICTIONARY;
         RegisterToRowGroup(row_group);
         return std::move(result);
     }
@@ -150,6 +151,8 @@ public:
         }
         page_state.dbp_encoder.FinishWrite(temp_writer);
         break;
+    case duckdb_parquet::Encoding::PLAIN_DICTIONARY:
+        // PLAIN_DICTIONARY can be treated the same as RLE_DICTIONARY
     case duckdb_parquet::Encoding::RLE_DICTIONARY:
         D_ASSERT(page_state.dict_bit_width != 0);
         if (!page_state.dict_written_value) {
@@ -265,7 +268,8 @@ public:
 
     bool HasDictionary(PrimitiveColumnWriterState &state_p) override {
         auto &state = state_p.Cast<StandardColumnWriterState<SRC, TGT, OP>>();
-        return state.encoding == duckdb_parquet::Encoding::RLE_DICTIONARY;
+        return state.encoding == duckdb_parquet::Encoding::RLE_DICTIONARY ||
+               state.encoding == duckdb_parquet::Encoding::PLAIN_DICTIONARY;
     }
 
     idx_t DictionarySize(PrimitiveColumnWriterState &state_p) override {
@@ -285,7 +289,8 @@ public:
 
     void FlushDictionary(PrimitiveColumnWriterState &state_p, ColumnWriterStatistics *stats) override {
         auto &state = state_p.Cast<StandardColumnWriterState<SRC, TGT, OP>>();
-        D_ASSERT(state.encoding == duckdb_parquet::Encoding::RLE_DICTIONARY);
+        D_ASSERT(state.encoding == duckdb_parquet::Encoding::RLE_DICTIONARY ||
+                 state.encoding == duckdb_parquet::Encoding::PLAIN_DICTIONARY);
 
         if (writer.EnableBloomFilters()) {
             state.bloom_filter =
@@ -310,7 +315,8 @@ public:
     idx_t GetRowSize(const Vector &vector, const idx_t index,
                      const PrimitiveColumnWriterState &state_p) const override {
         auto &state = state_p.Cast<StandardColumnWriterState<SRC, TGT, OP>>();
-        if (state.encoding == duckdb_parquet::Encoding::RLE_DICTIONARY) {
+        if (state.encoding == duckdb_parquet::Encoding::RLE_DICTIONARY ||
+            state.encoding == duckdb_parquet::Encoding::PLAIN_DICTIONARY) {
             return (state.key_bit_width + 7) / 8;
         } else {
             return OP::template GetRowSize<SRC, TGT>(vector, index);
@@ -328,6 +334,8 @@ private:
         const auto *data_ptr = FlatVector::GetData<SRC>(input_column);
 
         switch (page_state.encoding) {
+        case duckdb_parquet::Encoding::PLAIN_DICTIONARY:
+            // PLAIN_DICTIONARY can be treated the same as RLE_DICTIONARY
         case duckdb_parquet::Encoding::RLE_DICTIONARY: {
             idx_t r = chunk_start;
             if (!page_state.dict_written_value) {
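
Note: Parquet format v1 predates the RLE_DICTIONARY label; v1 readers expect dictionary pages to be tagged PLAIN_DICTIONARY, while v2 uses RLE_DICTIONARY. As the comments in this diff state, the page payload is written the same either way; only the encoding tag differs. A minimal sketch of the selection logic (stand-alone enums mirroring the ternary above):

    #include <cstdint>

    enum class ParquetVersion : uint8_t { V1, V2 };
    enum class Encoding : uint8_t { PLAIN_DICTIONARY, RLE_DICTIONARY };

    // Same dictionary pages, different label depending on the target format version.
    Encoding SelectDictionaryEncoding(ParquetVersion version) {
        return version == ParquetVersion::V1 ? Encoding::PLAIN_DICTIONARY : Encoding::RLE_DICTIONARY;
    }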
package/src/duckdb/src/common/encryption_key_manager.cpp CHANGED
@@ -72,21 +72,25 @@ string EncryptionKeyManager::GenerateRandomKeyID() {
 }
 
 void EncryptionKeyManager::AddKey(const string &key_name, data_ptr_t key) {
+    lock_guard<mutex> guard(lock);
     derived_keys.emplace(key_name, EncryptionKey(key));
     // Zero-out the encryption key
     duckdb_mbedtls::MbedTlsWrapper::AESStateMBEDTLS::SecureClearData(key, DERIVED_KEY_LENGTH);
 }
 
 bool EncryptionKeyManager::HasKey(const string &key_name) const {
+    lock_guard<mutex> guard(lock);
     return derived_keys.find(key_name) != derived_keys.end();
 }
 
 const_data_ptr_t EncryptionKeyManager::GetKey(const string &key_name) const {
     D_ASSERT(HasKey(key_name));
+    lock_guard<mutex> guard(lock);
     return derived_keys.at(key_name).GetPtr();
 }
 
 void EncryptionKeyManager::DeleteKey(const string &key_name) {
+    lock_guard<mutex> guard(lock);
     derived_keys.erase(key_name);
 }
 
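
Note: HasKey and GetKey are const member functions, so guarding them with lock_guard requires the lock member to be declared mutable (the matching header change adds one line to encryption_key_manager.hpp, presumably that member). A minimal sketch of the pattern using standard-library types:

    #include <mutex>
    #include <string>
    #include <unordered_map>

    class KeyRegistry {
    public:
        void Add(const std::string &name, std::string key) {
            std::lock_guard<std::mutex> guard(lock);
            keys.emplace(name, std::move(key));
        }
        bool Has(const std::string &name) const {
            std::lock_guard<std::mutex> guard(lock); // allowed because lock is mutable
            return keys.find(name) != keys.end();
        }

    private:
        mutable std::mutex lock;
        std::unordered_map<std::string, std::string> keys;
    };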
package/src/duckdb/src/common/local_file_system.cpp CHANGED
@@ -1283,6 +1283,29 @@ bool LocalFileSystem::OnDiskFile(FileHandle &handle) {
     return true;
 }
 
+string LocalFileSystem::GetVersionTag(FileHandle &handle) {
+    // TODO: Fix using FileSystem::Stats for v1.5, which should also fix it for Windows
+#ifdef _WIN32
+    return "";
+#else
+    int fd = handle.Cast<UnixFileHandle>().fd;
+    struct stat s;
+    if (fstat(fd, &s) == -1) {
+        throw IOException("Failed to get file size for file \"%s\": %s", {{"errno", std::to_string(errno)}},
+                          handle.path, strerror(errno));
+    }
+
+    // dev/ino should be enough, but to guard against in-place writes we also add file size and modification time
+    uint64_t version_tag[4];
+    Store(NumericCast<uint64_t>(s.st_dev), data_ptr_cast(&version_tag[0]));
+    Store(NumericCast<uint64_t>(s.st_ino), data_ptr_cast(&version_tag[1]));
+    Store(NumericCast<uint64_t>(s.st_size), data_ptr_cast(&version_tag[2]));
+    Store(Timestamp::FromEpochSeconds(s.st_mtime).value, data_ptr_cast(&version_tag[3]));
+
+    return string(char_ptr_cast(version_tag), sizeof(uint64_t) * 4);
+#endif
+}
+
 void LocalFileSystem::Seek(FileHandle &handle, idx_t location) {
     if (!CanSeek()) {
         throw IOException("Cannot seek in files of this type");
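
Note: the version tag packs four fixed-width fields into a 32-byte opaque string, so two handles compare equal only if device, inode, size, and mtime all match: dev/ino identify the file, while size and mtime guard against in-place rewrites of the same inode. A self-contained POSIX sketch of the same idea, using memcpy instead of DuckDB's Store helpers:

    #include <cstdint>
    #include <cstring>
    #include <string>
    #include <sys/stat.h>

    std::string VersionTagFromFd(int fd) {
        struct stat s;
        if (fstat(fd, &s) == -1) {
            return ""; // caller treats an empty tag as "unknown version"
        }
        const uint64_t fields[4] = {
            static_cast<uint64_t>(s.st_dev),  // which filesystem
            static_cast<uint64_t>(s.st_ino),  // which file on it
            static_cast<uint64_t>(s.st_size), // size and mtime guard against
            static_cast<uint64_t>(s.st_mtime) // in-place rewrites of the inode
        };
        std::string tag(sizeof(fields), '\0');
        std::memcpy(&tag[0], fields, sizeof(fields));
        return tag;
    }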
package/src/duckdb/src/common/types/column/column_data_collection.cpp CHANGED
@@ -1036,6 +1036,7 @@ void ColumnDataCollection::InitializeScan(ColumnDataParallelScanState &state, ve
 
 bool ColumnDataCollection::Scan(ColumnDataParallelScanState &state, ColumnDataLocalScanState &lstate,
                                 DataChunk &result) const {
+    D_ASSERT(result.GetTypes() == types);
     result.Reset();
 
     idx_t chunk_index;
@@ -1129,6 +1130,10 @@ void ColumnDataCollection::ScanAtIndex(ColumnDataParallelScanState &state, Colum
 }
 
 bool ColumnDataCollection::Scan(ColumnDataScanState &state, DataChunk &result) const {
+    for (idx_t i = 0; i < state.column_ids.size(); i++) {
+        D_ASSERT(result.GetTypes()[i] == types[state.column_ids[i]]);
+    }
+
     result.Reset();
 
     idx_t chunk_index;
@@ -1213,6 +1218,7 @@ idx_t ColumnDataCollection::ChunkCount() const {
 }
 
 void ColumnDataCollection::FetchChunk(idx_t chunk_idx, DataChunk &result) const {
+    D_ASSERT(result.GetTypes() == types);
     D_ASSERT(chunk_idx < ChunkCount());
     for (auto &segment : segments) {
         if (chunk_idx >= segment->ChunkCount()) {
package/src/duckdb/src/common/types/conflict_manager.cpp CHANGED
@@ -87,7 +87,7 @@ optional_idx ConflictManager::GetFirstInvalidIndex(const idx_t count, const bool
     for (idx_t i = 0; i < count; i++) {
         if (negate && !validity.RowIsValid(i)) {
             return i;
-        } else if (validity.RowIsValid(i)) {
+        } else if (!negate && validity.RowIsValid(i)) {
             return i;
         }
     }
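
Note: before this fix, when negate was true a valid row still fell into the bare `else if (validity.RowIsValid(i))` branch, so the function returned valid rows it was asked to skip. The corrected predicate returns the first row whose validity matches the requested polarity. A stand-alone version of the fixed logic:

    #include <cstddef>
    #include <optional>
    #include <vector>

    // Return the first index whose validity matches the requested polarity:
    // invalid rows when negate is set, valid rows otherwise.
    std::optional<size_t> FirstMatch(const std::vector<bool> &valid, bool negate) {
        for (size_t i = 0; i < valid.size(); i++) {
            if (negate ? !valid[i] : valid[i]) {
                return i;
            }
        }
        return std::nullopt;
    }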
package/src/duckdb/src/execution/index/art/base_node.cpp CHANGED
@@ -95,7 +95,9 @@ void Node4::DeleteChild(ART &art, Node &node, Node &parent, const uint8_t byte,
 
     auto prev_node4_status = node.GetGateStatus();
     Node::FreeNode(art, node);
-    Prefix::Concat(art, parent, node, child, remaining_byte, prev_node4_status);
+    // Propagate both the prev_node_4 status and the general gate status (if the gate was earlier on),
+    // since the concatenation logic depends on both.
+    Prefix::Concat(art, parent, node, child, remaining_byte, prev_node4_status, status);
 }
 
 void Node4::ShrinkNode16(ART &art, Node &node4, Node &node16) {
package/src/duckdb/src/execution/index/art/prefix.cpp CHANGED
@@ -65,8 +65,8 @@ void Prefix::New(ART &art, reference<Node> &ref, const ARTKey &key, const idx_t
     }
 }
 
-void Prefix::Concat(ART &art, Node &parent, Node &node4, const Node child, uint8_t byte,
-                    const GateStatus node4_status) {
+void Prefix::Concat(ART &art, Node &parent, Node &node4, const Node child, uint8_t byte, const GateStatus node4_status,
+                    const GateStatus status) {
     // We have four situations from which we enter here:
     // 1: PREFIX (parent) - Node4 (prev_node4) - PREFIX (child) - INLINED_LEAF, or
     // 2: PREFIX (parent) - Node4 (prev_node4) - INLINED_LEAF (child), or
@@ -90,10 +90,7 @@ void Prefix::Concat(ART &art, Node &parent, Node &node4, const Node child, uint8
         ConcatChildIsGate(art, parent, node4, child, byte);
         return;
     }
-
-    auto inside_gate = parent.GetGateStatus() == GateStatus::GATE_SET;
-    ConcatInternal(art, parent, node4, child, byte, inside_gate);
-    return;
+    ConcatInternal(art, parent, node4, child, byte, status);
 }
 
 void Prefix::Reduce(ART &art, Node &node, const idx_t pos) {
@@ -286,9 +283,9 @@ Prefix Prefix::GetTail(ART &art, const Node &node) {
 }
 
 void Prefix::ConcatInternal(ART &art, Node &parent, Node &node4, const Node child, uint8_t byte,
-                            const bool inside_gate) {
+                            const GateStatus status) {
     if (child.GetType() == NType::LEAF_INLINED) {
-        if (inside_gate) {
+        if (status == GateStatus::GATE_SET) {
             if (parent.GetType() == NType::PREFIX) {
                 // The parent only contained the Node4, so we can now inline 'all the way up',
                 // and the gate is no longer nested.
package/src/duckdb/src/execution/index/bound_index.cpp CHANGED
@@ -1,11 +1,13 @@
 #include "duckdb/execution/index/bound_index.hpp"
 
+#include "duckdb/common/array.hpp"
 #include "duckdb/common/radix.hpp"
 #include "duckdb/common/serializer/serializer.hpp"
 #include "duckdb/planner/expression/bound_columnref_expression.hpp"
 #include "duckdb/planner/expression/bound_reference_expression.hpp"
 #include "duckdb/planner/expression_iterator.hpp"
 #include "duckdb/storage/table/append_state.hpp"
+#include "duckdb/common/types/selection_vector.hpp"
 
 namespace duckdb {
 
@@ -154,39 +156,80 @@ string BoundIndex::AppendRowError(DataChunk &input, idx_t index) {
     return error;
 }
 
-void BoundIndex::ApplyBufferedReplays(const vector<LogicalType> &table_types,
-                                      vector<BufferedIndexData> &buffered_replays,
+namespace {
+
+struct BufferedReplayState {
+    optional_ptr<ColumnDataCollection> buffer = nullptr;
+    ColumnDataScanState scan_state;
+    DataChunk current_chunk;
+    bool scan_initialized = false;
+};
+} // namespace
+
+void BoundIndex::ApplyBufferedReplays(const vector<LogicalType> &table_types, BufferedIndexReplays &buffered_replays,
                                       const vector<StorageIndex> &mapped_column_ids) {
-    for (auto &replay : buffered_replays) {
-        ColumnDataScanState state;
-        auto &buffered_data = *replay.data;
-        buffered_data.InitializeScan(state);
-
-        DataChunk scan_chunk;
-        buffered_data.InitializeScanChunk(scan_chunk);
-        DataChunk table_chunk;
-        table_chunk.InitializeEmpty(table_types);
-
-        while (buffered_data.Scan(state, scan_chunk)) {
-            for (idx_t i = 0; i < scan_chunk.ColumnCount() - 1; i++) {
-                auto col_id = mapped_column_ids[i].GetPrimaryIndex();
-                table_chunk.data[col_id].Reference(scan_chunk.data[i]);
+    if (!buffered_replays.HasBufferedReplays()) {
+        return;
+    }
+
+    // We have two replay states: one for inserts and one for deletes. These are indexed into using the
+    // replay_type. Both scans are interleaved, so the state maintains the position of each scan.
+    array<BufferedReplayState, 2> replay_states;
+    DataChunk table_chunk;
+    table_chunk.InitializeEmpty(table_types);
+
+    for (const auto &replay_range : buffered_replays.ranges) {
+        const auto type_idx = static_cast<idx_t>(replay_range.type);
+        auto &state = replay_states[type_idx];
+
+        // Initialize the scan state if necessary. Take ownership of buffered operations, since we won't need
+        // them after replaying anyways.
+        if (!state.scan_initialized) {
+            state.buffer = buffered_replays.GetBuffer(replay_range.type);
+            state.buffer->InitializeScan(state.scan_state);
+            state.buffer->InitializeScanChunk(state.current_chunk);
+            state.scan_initialized = true;
+        }
+
+        idx_t current_row = replay_range.start;
+        while (current_row < replay_range.end) {
+            // Scan the next DataChunk from the ColumnDataCollection buffer if the current row is on or after
+            // that chunk's starting row index.
+            if (current_row >= state.scan_state.next_row_index) {
+                if (!state.buffer->Scan(state.scan_state, state.current_chunk)) {
+                    throw InternalException("Buffered index data exhausted during replay");
+                }
             }
-            table_chunk.SetCardinality(scan_chunk.size());
 
-            switch (replay.type) {
-            case BufferedIndexReplay::INSERT_ENTRY: {
-                IndexAppendInfo index_append_info(IndexAppendMode::INSERT_DUPLICATES, nullptr);
-                auto error = Append(table_chunk, scan_chunk.data.back(), index_append_info);
+            // We need to process the remaining rows in the current chunk, which is the minimum of the available
+            // rows in the chunk and the remaining rows in the current range.
+            const auto offset_in_chunk = current_row - state.scan_state.current_row_index;
+            const auto available_in_chunk = state.current_chunk.size() - offset_in_chunk;
+            // [start, end) in ReplayRange is [inclusive, exclusive).
+            const auto range_remaining = replay_range.end - current_row;
+            const auto rows_to_process = MinValue<idx_t>(available_in_chunk, range_remaining);
+
+            SelectionVector sel(offset_in_chunk, rows_to_process);
+
+            for (idx_t col_idx = 0; col_idx < state.current_chunk.ColumnCount() - 1; col_idx++) {
+                const auto col_id = mapped_column_ids[col_idx].GetPrimaryIndex();
+                table_chunk.data[col_id].Reference(state.current_chunk.data[col_idx]);
+                table_chunk.data[col_id].Slice(sel, rows_to_process);
+            }
+            table_chunk.SetCardinality(rows_to_process);
+            Vector row_ids(state.current_chunk.data.back(), sel, rows_to_process);
+
+            if (replay_range.type == BufferedIndexReplay::INSERT_ENTRY) {
+                IndexAppendInfo append_info(IndexAppendMode::INSERT_DUPLICATES, nullptr);
+                const auto error = Append(table_chunk, row_ids, append_info);
                 if (error.HasError()) {
                     throw InternalException("error while applying buffered appends: " + error.Message());
                 }
+                current_row += rows_to_process;
                 continue;
             }
-            case BufferedIndexReplay::DEL_ENTRY: {
-                Delete(table_chunk, scan_chunk.data.back());
-            }
-            }
+            Delete(table_chunk, row_ids);
+            current_row += rows_to_process;
         }
     }
 }
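
Note: the rewrite replaces one ColumnDataCollection per buffered chunk with two shared buffers (inserts and deletes) plus a list of ranges recording the original arrival order; the SelectionVector then slices each scanned chunk down to the rows belonging to the current range. A toy model of the replay ordering under those assumptions (simplified data layout, not DuckDB's types):

    #include <algorithm>
    #include <cstddef>
    #include <vector>

    enum class ReplayType { INSERT_ENTRY, DEL_ENTRY };

    struct ReplayRange {
        ReplayType type;
        size_t start; // inclusive offset into that type's buffer
        size_t end;   // exclusive
    };

    // Replaying range by range reproduces the original interleaving of
    // inserts and deletes, even though each buffer is append-only.
    void Replay(const std::vector<ReplayRange> &ranges, const std::vector<int> &inserts,
                const std::vector<int> &deletes, std::vector<int> &index) {
        for (const auto &range : ranges) {
            for (size_t row = range.start; row < range.end; row++) {
                if (range.type == ReplayType::INSERT_ENTRY) {
                    index.push_back(inserts[row]);
                } else {
                    index.erase(std::remove(index.begin(), index.end(), deletes[row]), index.end());
                }
            }
        }
    }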
package/src/duckdb/src/execution/index/unbound_index.cpp CHANGED
@@ -8,10 +8,6 @@
 
 namespace duckdb {
 
-BufferedIndexData::BufferedIndexData(BufferedIndexReplay replay_type, unique_ptr<ColumnDataCollection> data_p)
-    : type(replay_type), data(std::move(data_p)) {
-}
-
 UnboundIndex::UnboundIndex(unique_ptr<CreateInfo> create_info, IndexStorageInfo storage_info_p,
                            TableIOManager &table_io_manager, AttachedDatabase &db)
     : Index(create_info->Cast<CreateIndexInfo>().column_ids, table_io_manager, db), create_info(std::move(create_info)),
@@ -40,15 +36,13 @@ void UnboundIndex::CommitDrop() {
 }
 
 void UnboundIndex::BufferChunk(DataChunk &index_column_chunk, Vector &row_ids,
-                               const vector<StorageIndex> &mapped_column_ids_p, BufferedIndexReplay replay_type) {
+                               const vector<StorageIndex> &mapped_column_ids_p, const BufferedIndexReplay replay_type) {
     D_ASSERT(!column_ids.empty());
     auto types = index_column_chunk.GetTypes(); // column types
    types.push_back(LogicalType::ROW_TYPE);
 
     auto &allocator = Allocator::Get(db);
 
-    BufferedIndexData buffered_data(replay_type, make_uniq<ColumnDataCollection>(allocator, types));
-
     //! First time we are buffering data, canonical column_id mapping is stored.
     //! This should be a sorted list of all the physical offsets of Indexed columns on this table.
     if (mapped_column_ids.empty()) {
@@ -56,7 +50,7 @@ void UnboundIndex::BufferChunk(DataChunk &index_column_chunk, Vector &row_ids,
     }
     D_ASSERT(mapped_column_ids == mapped_column_ids_p);
 
-    // Combined chunk has all the indexed columns and rowids.
+    // combined_chunk has all the indexed columns according to mapped_column_ids ordering, as well as a rowid column.
     DataChunk combined_chunk;
     combined_chunk.InitializeEmpty(types);
     for (idx_t i = 0; i < index_column_chunk.ColumnCount(); i++) {
@@ -64,8 +58,25 @@ void UnboundIndex::BufferChunk(DataChunk &index_column_chunk, Vector &row_ids,
     }
     combined_chunk.data.back().Reference(row_ids);
     combined_chunk.SetCardinality(index_column_chunk.size());
-    buffered_data.data->Append(combined_chunk);
-    buffered_replays.emplace_back(std::move(buffered_data));
+
+    auto &buffer = buffered_replays.GetBuffer(replay_type);
+    if (buffer == nullptr) {
+        buffer = make_uniq<ColumnDataCollection>(allocator, types);
+    }
+    // The starting index of the buffer range is the size of the buffer.
+    const idx_t start = buffer->Count();
+    const idx_t end = start + combined_chunk.size();
+    auto &ranges = buffered_replays.ranges;
+
+    if (ranges.empty() || ranges.back().type != replay_type) {
+        // If there are no buffered ranges, or the replay types don't match, append a new range.
+        ranges.emplace_back(replay_type, start, end);
+        buffer->Append(combined_chunk);
+        return;
+    }
+    // Otherwise merge the range with the previous one.
+    ranges.back().end = end;
+    buffer->Append(combined_chunk);
 }
 
 } // namespace duckdb
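
Note: BufferChunk now appends into a per-type buffer and merges the new [start, end) range into the previous one whenever the replay type repeats, so the range list grows with the number of insert/delete alternations rather than the number of buffered chunks. A sketch of that bookkeeping (stand-alone types, same shape as the code above):

    #include <cstddef>
    #include <vector>

    enum class ReplayType { INSERT_ENTRY, DEL_ENTRY };

    struct ReplayRange {
        ReplayType type;
        size_t start;
        size_t end;
    };

    void RecordChunk(std::vector<ReplayRange> &ranges, ReplayType type, size_t buffer_count, size_t chunk_size) {
        const size_t start = buffer_count; // next free offset in that type's buffer
        const size_t end = start + chunk_size;
        if (ranges.empty() || ranges.back().type != type) {
            ranges.push_back({type, start, end}); // new alternation: open a range
            return;
        }
        ranges.back().end = end; // same type as the last chunk: extend the range
    }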
package/src/duckdb/src/execution/operator/csv_scanner/scanner/base_scanner.cpp CHANGED
@@ -26,6 +26,10 @@ BaseScanner::BaseScanner(shared_ptr<CSVBufferManager> buffer_manager_p, shared_p
     }
 }
 
+void BaseScanner::Print() const {
+    state_machine->Print();
+}
+
 string BaseScanner::RemoveSeparator(const char *value_ptr, const idx_t size, char thousands_separator) {
     string result;
     result.reserve(size);
package/src/duckdb/src/execution/operator/csv_scanner/scanner/string_value_scanner.cpp CHANGED
@@ -22,7 +22,7 @@ StringValueResult::StringValueResult(CSVStates &states, CSVStateMachine &state_m
                                      idx_t result_size_p, idx_t buffer_position, CSVErrorHandler &error_hander_p,
                                      CSVIterator &iterator_p, bool store_line_size_p,
                                      shared_ptr<CSVFileScan> csv_file_scan_p, idx_t &lines_read_p, bool sniffing_p,
-                                     string path_p, idx_t scan_id)
+                                     const string &path_p, idx_t scan_id, bool &used_unstrictness)
     : ScannerResult(states, state_machine, result_size_p),
       number_of_columns(NumericCast<uint32_t>(state_machine.dialect_options.num_cols)),
       null_padding(state_machine.options.null_padding), ignore_errors(state_machine.options.ignore_errors.GetValue()),
@@ -30,8 +30,8 @@ StringValueResult::StringValueResult(CSVStates &states, CSVStateMachine &state_m
           ? 0
           : state_machine.dialect_options.state_machine_options.delimiter.GetValue().size() - 1),
       error_handler(error_hander_p), iterator(iterator_p), store_line_size(store_line_size_p),
-      csv_file_scan(std::move(csv_file_scan_p)), lines_read(lines_read_p),
-      current_errors(scan_id, state_machine.options.IgnoreErrors()), sniffing(sniffing_p), path(std::move(path_p)) {
+      csv_file_scan(std::move(csv_file_scan_p)), lines_read(lines_read_p), used_unstrictness(used_unstrictness),
+      current_errors(scan_id, state_machine.options.IgnoreErrors()), sniffing(sniffing_p), path(path_p) {
     // Vector information
     D_ASSERT(number_of_columns > 0);
     if (!buffer_handle) {
@@ -154,23 +154,26 @@ inline bool IsValueNull(const char *null_str_ptr, const char *value_ptr, const i
 }
 
 bool StringValueResult::HandleTooManyColumnsError(const char *value_ptr, const idx_t size) {
-    if (cur_col_id >= number_of_columns && state_machine.state_machine_options.strict_mode.GetValue()) {
-        bool error = true;
-        if (cur_col_id == number_of_columns && ((quoted && state_machine.options.allow_quoted_nulls) || !quoted)) {
-            // we make an exception if the first over-value is null
-            bool is_value_null = false;
-            for (idx_t i = 0; i < null_str_count; i++) {
-                is_value_null = is_value_null || IsValueNull(null_str_ptr[i], value_ptr, size);
+    if (cur_col_id >= number_of_columns) {
+        if (state_machine.state_machine_options.strict_mode.GetValue()) {
+            bool error = true;
+            if (cur_col_id == number_of_columns && ((quoted && state_machine.options.allow_quoted_nulls) || !quoted)) {
+                // we make an exception if the first over-value is null
+                bool is_value_null = false;
+                for (idx_t i = 0; i < null_str_count; i++) {
+                    is_value_null = is_value_null || IsValueNull(null_str_ptr[i], value_ptr, size);
+                }
+                error = !is_value_null;
             }
-            error = !is_value_null;
-        }
-        if (error) {
-            // We error pointing to the current value error.
-            current_errors.Insert(TOO_MANY_COLUMNS, cur_col_id, chunk_col_id, last_position);
-            cur_col_id++;
+            if (error) {
+                // We error pointing to the current value error.
+                current_errors.Insert(TOO_MANY_COLUMNS, cur_col_id, chunk_col_id, last_position);
+                cur_col_id++;
+            }
+            // We had an error
+            return true;
         }
-        // We had an error
-        return true;
+        used_unstrictness = true;
     }
     return false;
 }
@@ -231,6 +234,7 @@ void StringValueResult::AddValueToVector(const char *value_ptr, idx_t size, bool
     }
     if (cur_col_id >= number_of_columns) {
        if (!state_machine.state_machine_options.strict_mode.GetValue()) {
+            used_unstrictness = true;
             return;
         }
         bool error = true;
@@ -549,6 +553,7 @@ void StringValueResult::AddPossiblyEscapedValue(StringValueResult &result, const
     }
     if (result.cur_col_id >= result.number_of_columns &&
         !result.state_machine.state_machine_options.strict_mode.GetValue()) {
+        result.used_unstrictness = true;
         return;
     }
     if (!result.HandleTooManyColumnsError(value_ptr, length)) {
@@ -980,7 +985,7 @@ StringValueScanner::StringValueScanner(idx_t scanner_idx_p, const shared_ptr<CSV
       result(states, *state_machine, cur_buffer_handle, BufferAllocator::Get(buffer_manager->context), result_size,
              iterator.pos.buffer_pos, *error_handler, iterator,
              buffer_manager->context.client_data->debug_set_max_line_length, csv_file_scan, lines_read, sniffing,
-             buffer_manager->GetFilePath(), scanner_idx_p),
+             buffer_manager->GetFilePath(), scanner_idx_p, used_unstrictness),
       start_pos(0) {
     if (scanner_idx == 0 && csv_file_scan) {
         lines_read += csv_file_scan->skipped_rows;
@@ -997,7 +1002,7 @@ StringValueScanner::StringValueScanner(const shared_ptr<CSVBufferManager> &buffe
       result(states, *state_machine, cur_buffer_handle, Allocator::DefaultAllocator(), result_size,
             iterator.pos.buffer_pos, *error_handler, iterator,
             buffer_manager->context.client_data->debug_set_max_line_length, csv_file_scan, lines_read, sniffing,
-            buffer_manager->GetFilePath(), 0),
+            buffer_manager->GetFilePath(), 0, used_unstrictness),
       start_pos(0) {
     if (scanner_idx == 0 && csv_file_scan) {
         lines_read += csv_file_scan->skipped_rows;
@@ -1939,14 +1944,17 @@ void StringValueScanner::FinalizeChunkProcess() {
         if (result.current_errors.HandleErrors(result)) {
             result.number_of_rows++;
         }
-        if (states.IsQuotedCurrent() && !found_error &&
-            state_machine->dialect_options.state_machine_options.strict_mode.GetValue()) {
-            type = UNTERMINATED_QUOTES;
-            // If we finish the execution of a buffer, and we end in a quoted state, it means we have unterminated
-            // quotes
-            result.current_errors.Insert(type, result.cur_col_id, result.chunk_col_id, result.last_position);
-            if (result.current_errors.HandleErrors(result)) {
-                result.number_of_rows++;
+        if (states.IsQuotedCurrent() && !found_error) {
+            if (state_machine->dialect_options.state_machine_options.strict_mode.GetValue()) {
+                type = UNTERMINATED_QUOTES;
+                // If we finish the execution of a buffer, and we end in a quoted state, it means we have unterminated
+                // quotes
+                result.current_errors.Insert(type, result.cur_col_id, result.chunk_col_id, result.last_position);
+                if (result.current_errors.HandleErrors(result)) {
+                    result.number_of_rows++;
+                }
+            } else {
+                used_unstrictness = true;
             }
         }
         if (!iterator.done) {
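
Note: used_unstrictness threads a reference from the scanner into StringValueResult, so every code path that tolerates a violation in non-strict mode (extra columns dropped, unterminated quotes accepted) can record that the relaxed behavior was actually exercised. A minimal sketch of the reference-flag pattern:

    struct ScanResult {
        // Reference to a flag owned by the scanner, set whenever a non-strict
        // code path silently accepts malformed input.
        bool &used_unstrictness;

        void DropExtraColumn() {
            used_unstrictness = true; // value discarded instead of raising an error
        }
    };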
package/src/duckdb/src/execution/operator/csv_scanner/sniffer/csv_sniffer.cpp CHANGED
@@ -14,7 +14,7 @@ CSVSniffer::CSVSniffer(CSVReaderOptions &options_p, const MultiFileOptions &file
         auto &logical_type = format_template.first;
         best_format_candidates[logical_type].clear();
     }
-    // Initialize max columns found to either 0 or however many were set
+    // Initialize max columns found to either 0, or however many were set
     max_columns_found = set_columns.Size();
     error_handler = make_shared_ptr<CSVErrorHandler>(options.ignore_errors.GetValue());
     detection_error_handler = make_shared_ptr<CSVErrorHandler>(true);
@@ -193,7 +193,8 @@ SnifferResult CSVSniffer::SniffCSV(const bool force_match) {
         buffer_manager->ResetBufferManager();
     }
     buffer_manager->sniffing = false;
-    if (best_candidate->error_handler->AnyErrors() && !options.ignore_errors.GetValue()) {
+    if (best_candidate->error_handler->AnyErrors() && !options.ignore_errors.GetValue() &&
+        best_candidate->state_machine->dialect_options.state_machine_options.strict_mode.GetValue()) {
         best_candidate->error_handler->ErrorIfTypeExists(MAXIMUM_LINE_SIZE);
     }
     D_ASSERT(best_sql_types_candidates_per_column_idx.size() == names.size());