duckdb 1.4.2-dev4.0 → 1.4.3-dev0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (146)
  1. package/package.json +2 -2
  2. package/src/duckdb/extension/icu/icu_extension.cpp +67 -6
  3. package/src/duckdb/extension/icu/third_party/icu/common/putil.cpp +9 -3
  4. package/src/duckdb/extension/json/include/json_serializer.hpp +12 -0
  5. package/src/duckdb/extension/json/json_functions/json_create.cpp +10 -10
  6. package/src/duckdb/extension/parquet/decoder/delta_length_byte_array_decoder.cpp +19 -5
  7. package/src/duckdb/extension/parquet/include/decoder/delta_length_byte_array_decoder.hpp +1 -1
  8. package/src/duckdb/extension/parquet/include/parquet_dbp_decoder.hpp +11 -2
  9. package/src/duckdb/extension/parquet/include/reader/string_column_reader.hpp +2 -1
  10. package/src/duckdb/extension/parquet/parquet_reader.cpp +3 -1
  11. package/src/duckdb/extension/parquet/parquet_writer.cpp +16 -1
  12. package/src/duckdb/extension/parquet/reader/string_column_reader.cpp +1 -1
  13. package/src/duckdb/extension/parquet/writer/primitive_column_writer.cpp +1 -1
  14. package/src/duckdb/src/catalog/default/default_table_functions.cpp +1 -1
  15. package/src/duckdb/src/common/adbc/adbc.cpp +8 -6
  16. package/src/duckdb/src/common/csv_writer.cpp +1 -13
  17. package/src/duckdb/src/common/encryption_key_manager.cpp +10 -9
  18. package/src/duckdb/src/common/enum_util.cpp +19 -0
  19. package/src/duckdb/src/common/enums/compression_type.cpp +51 -16
  20. package/src/duckdb/src/common/exception/binder_exception.cpp +7 -2
  21. package/src/duckdb/src/common/progress_bar/unscented_kalman_filter.cpp +2 -2
  22. package/src/duckdb/src/common/random_engine.cpp +10 -0
  23. package/src/duckdb/src/execution/expression_executor/execute_comparison.cpp +13 -2
  24. package/src/duckdb/src/execution/index/art/art.cpp +6 -3
  25. package/src/duckdb/src/execution/index/bound_index.cpp +32 -21
  26. package/src/duckdb/src/execution/index/unbound_index.cpp +20 -9
  27. package/src/duckdb/src/execution/join_hashtable.cpp +9 -3
  28. package/src/duckdb/src/execution/operator/helper/physical_buffered_batch_collector.cpp +1 -1
  29. package/src/duckdb/src/execution/operator/helper/physical_buffered_collector.cpp +1 -1
  30. package/src/duckdb/src/execution/operator/join/physical_hash_join.cpp +5 -0
  31. package/src/duckdb/src/function/cast/cast_function_set.cpp +3 -1
  32. package/src/duckdb/src/function/macro_function.cpp +1 -1
  33. package/src/duckdb/src/function/scalar/compressed_materialization/compress_string.cpp +1 -1
  34. package/src/duckdb/src/function/scalar/create_sort_key.cpp +5 -3
  35. package/src/duckdb/src/function/scalar/operator/arithmetic.cpp +1 -1
  36. package/src/duckdb/src/function/scalar/system/parse_log_message.cpp +4 -2
  37. package/src/duckdb/src/function/table/copy_csv.cpp +28 -4
  38. package/src/duckdb/src/function/table/direct_file_reader.cpp +10 -0
  39. package/src/duckdb/src/function/table/read_file.cpp +65 -1
  40. package/src/duckdb/src/function/table/version/pragma_version.cpp +3 -3
  41. package/src/duckdb/src/include/duckdb/common/csv_writer.hpp +0 -3
  42. package/src/duckdb/src/include/duckdb/common/encryption_key_manager.hpp +2 -0
  43. package/src/duckdb/src/include/duckdb/common/encryption_state.hpp +5 -0
  44. package/src/duckdb/src/include/duckdb/common/enum_util.hpp +8 -0
  45. package/src/duckdb/src/include/duckdb/common/enums/compression_type.hpp +42 -2
  46. package/src/duckdb/src/include/duckdb/common/http_util.hpp +7 -0
  47. package/src/duckdb/src/include/duckdb/common/hugeint.hpp +1 -1
  48. package/src/duckdb/src/include/duckdb/common/operator/comparison_operators.hpp +0 -11
  49. package/src/duckdb/src/include/duckdb/common/random_engine.hpp +2 -0
  50. package/src/duckdb/src/include/duckdb/common/sort/duckdb_pdqsort.hpp +1 -0
  51. package/src/duckdb/src/include/duckdb/common/types/hugeint.hpp +6 -6
  52. package/src/duckdb/src/include/duckdb/common/types/row/block_iterator.hpp +115 -97
  53. package/src/duckdb/src/include/duckdb/execution/index/art/art_operator.hpp +54 -0
  54. package/src/duckdb/src/include/duckdb/execution/index/bound_index.hpp +21 -2
  55. package/src/duckdb/src/include/duckdb/execution/index/unbound_index.hpp +26 -8
  56. package/src/duckdb/src/include/duckdb/execution/join_hashtable.hpp +2 -0
  57. package/src/duckdb/src/include/duckdb/function/table/read_file.hpp +0 -49
  58. package/src/duckdb/src/include/duckdb/logging/log_manager.hpp +1 -1
  59. package/src/duckdb/src/include/duckdb/logging/log_type.hpp +14 -0
  60. package/src/duckdb/src/include/duckdb/main/attached_database.hpp +2 -1
  61. package/src/duckdb/src/include/duckdb/main/buffered_data/batched_buffered_data.hpp +1 -1
  62. package/src/duckdb/src/include/duckdb/main/buffered_data/buffered_data.hpp +1 -1
  63. package/src/duckdb/src/include/duckdb/main/buffered_data/simple_buffered_data.hpp +1 -1
  64. package/src/duckdb/src/include/duckdb/main/capi/capi_internal.hpp +2 -0
  65. package/src/duckdb/src/include/duckdb/main/database.hpp +2 -2
  66. package/src/duckdb/src/include/duckdb/main/database_file_path_manager.hpp +10 -6
  67. package/src/duckdb/src/include/duckdb/main/extension_entries.hpp +4 -0
  68. package/src/duckdb/src/include/duckdb/main/profiling_info.hpp +1 -0
  69. package/src/duckdb/src/include/duckdb/main/query_profiler.hpp +1 -0
  70. package/src/duckdb/src/include/duckdb/main/relation/create_table_relation.hpp +3 -0
  71. package/src/duckdb/src/include/duckdb/main/relation/insert_relation.hpp +2 -0
  72. package/src/duckdb/src/include/duckdb/main/relation/table_relation.hpp +2 -0
  73. package/src/duckdb/src/include/duckdb/main/relation.hpp +10 -2
  74. package/src/duckdb/src/include/duckdb/main/settings.hpp +9 -0
  75. package/src/duckdb/src/include/duckdb/optimizer/filter_pullup.hpp +10 -14
  76. package/src/duckdb/src/include/duckdb/optimizer/join_order/relation_manager.hpp +5 -1
  77. package/src/duckdb/src/include/duckdb/parser/query_node.hpp +3 -0
  78. package/src/duckdb/src/include/duckdb/planner/bound_statement.hpp +1 -0
  79. package/src/duckdb/src/include/duckdb/storage/block.hpp +9 -0
  80. package/src/duckdb/src/include/duckdb/storage/block_manager.hpp +9 -2
  81. package/src/duckdb/src/include/duckdb/storage/index.hpp +8 -2
  82. package/src/duckdb/src/include/duckdb/storage/metadata/metadata_manager.hpp +2 -0
  83. package/src/duckdb/src/include/duckdb/storage/metadata/metadata_reader.hpp +1 -1
  84. package/src/duckdb/src/include/duckdb/storage/storage_options.hpp +0 -7
  85. package/src/duckdb/src/include/duckdb/storage/table/row_group.hpp +6 -2
  86. package/src/duckdb/src/include/duckdb/verification/deserialized_statement_verifier.hpp +6 -0
  87. package/src/duckdb/src/logging/log_manager.cpp +2 -1
  88. package/src/duckdb/src/logging/log_types.cpp +30 -1
  89. package/src/duckdb/src/main/attached_database.cpp +4 -7
  90. package/src/duckdb/src/main/buffered_data/batched_buffered_data.cpp +2 -3
  91. package/src/duckdb/src/main/buffered_data/buffered_data.cpp +2 -3
  92. package/src/duckdb/src/main/buffered_data/simple_buffered_data.cpp +1 -2
  93. package/src/duckdb/src/main/capi/prepared-c.cpp +9 -2
  94. package/src/duckdb/src/main/config.cpp +6 -5
  95. package/src/duckdb/src/main/database.cpp +9 -3
  96. package/src/duckdb/src/main/database_file_path_manager.cpp +43 -14
  97. package/src/duckdb/src/main/database_manager.cpp +1 -1
  98. package/src/duckdb/src/main/http/http_util.cpp +19 -1
  99. package/src/duckdb/src/main/profiling_info.cpp +11 -0
  100. package/src/duckdb/src/main/query_profiler.cpp +16 -0
  101. package/src/duckdb/src/main/relation/create_table_relation.cpp +9 -0
  102. package/src/duckdb/src/main/relation/insert_relation.cpp +7 -0
  103. package/src/duckdb/src/main/relation/table_relation.cpp +14 -0
  104. package/src/duckdb/src/main/relation.cpp +28 -12
  105. package/src/duckdb/src/main/settings/custom_settings.cpp +9 -3
  106. package/src/duckdb/src/optimizer/filter_pullup.cpp +14 -0
  107. package/src/duckdb/src/optimizer/join_order/relation_manager.cpp +29 -10
  108. package/src/duckdb/src/optimizer/rule/regex_optimizations.cpp +7 -0
  109. package/src/duckdb/src/parallel/task_executor.cpp +4 -2
  110. package/src/duckdb/src/parser/query_node/cte_node.cpp +79 -0
  111. package/src/duckdb/src/parser/transform/expression/transform_cast.cpp +3 -1
  112. package/src/duckdb/src/planner/binder/expression/bind_macro_expression.cpp +1 -0
  113. package/src/duckdb/src/planner/binder/statement/bind_create_table.cpp +12 -4
  114. package/src/duckdb/src/planner/binder/statement/bind_insert.cpp +16 -12
  115. package/src/duckdb/src/planner/binder/statement/bind_merge_into.cpp +42 -5
  116. package/src/duckdb/src/planner/binder/tableref/bind_basetableref.cpp +0 -24
  117. package/src/duckdb/src/planner/binder/tableref/bind_table_function.cpp +1 -1
  118. package/src/duckdb/src/planner/binder.cpp +0 -1
  119. package/src/duckdb/src/planner/expression_binder/having_binder.cpp +1 -2
  120. package/src/duckdb/src/storage/buffer/block_manager.cpp +20 -6
  121. package/src/duckdb/src/storage/checkpoint/table_data_writer.cpp +8 -6
  122. package/src/duckdb/src/storage/checkpoint_manager.cpp +24 -22
  123. package/src/duckdb/src/storage/compression/validity_uncompressed.cpp +7 -0
  124. package/src/duckdb/src/storage/compression/zstd.cpp +34 -12
  125. package/src/duckdb/src/storage/data_table.cpp +1 -1
  126. package/src/duckdb/src/storage/local_storage.cpp +15 -2
  127. package/src/duckdb/src/storage/metadata/metadata_manager.cpp +29 -6
  128. package/src/duckdb/src/storage/metadata/metadata_reader.cpp +11 -15
  129. package/src/duckdb/src/storage/metadata/metadata_writer.cpp +1 -1
  130. package/src/duckdb/src/storage/serialization/serialize_query_node.cpp +1 -19
  131. package/src/duckdb/src/storage/single_file_block_manager.cpp +33 -3
  132. package/src/duckdb/src/storage/standard_buffer_manager.cpp +3 -1
  133. package/src/duckdb/src/storage/storage_info.cpp +4 -0
  134. package/src/duckdb/src/storage/storage_manager.cpp +8 -0
  135. package/src/duckdb/src/storage/table/array_column_data.cpp +1 -1
  136. package/src/duckdb/src/storage/table/column_data.cpp +3 -2
  137. package/src/duckdb/src/storage/table/column_data_checkpointer.cpp +3 -2
  138. package/src/duckdb/src/storage/table/row_group.cpp +41 -24
  139. package/src/duckdb/src/storage/table/row_group_collection.cpp +114 -11
  140. package/src/duckdb/src/storage/table_index_list.cpp +18 -5
  141. package/src/duckdb/src/transaction/cleanup_state.cpp +7 -2
  142. package/src/duckdb/third_party/mbedtls/include/mbedtls_wrapper.hpp +5 -0
  143. package/src/duckdb/third_party/mbedtls/mbedtls_wrapper.cpp +8 -21
  144. package/src/duckdb/third_party/parquet/parquet_types.cpp +57 -35
  145. package/src/duckdb/third_party/parquet/parquet_types.h +9 -2
  146. package/src/duckdb/ub_src_common_types_row.cpp +0 -2
@@ -17,25 +17,60 @@ vector<string> ListCompressionTypes(void) {
17
17
  return compression_types;
18
18
  }
19
19
 
20
- bool CompressionTypeIsDeprecated(CompressionType compression_type, optional_ptr<StorageManager> storage_manager) {
21
- vector<CompressionType> types({CompressionType::COMPRESSION_PATAS, CompressionType::COMPRESSION_CHIMP});
22
- if (storage_manager) {
23
- if (storage_manager->GetStorageVersion() >= 5) {
24
- //! NOTE: storage_manager is an optional_ptr because it's called from ForceCompressionSetting, which doesn't
25
- //! have guaranteed access to a StorageManager The introduction of DICT_FSST deprecates Dictionary and FSST
26
- //! compression methods
27
- types.emplace_back(CompressionType::COMPRESSION_DICTIONARY);
28
- types.emplace_back(CompressionType::COMPRESSION_FSST);
29
- } else {
30
- types.emplace_back(CompressionType::COMPRESSION_DICT_FSST);
31
- }
20
+ namespace {
21
+ struct CompressionMethodRequirements {
22
+ CompressionType type;
23
+ optional_idx minimum_storage_version;
24
+ optional_idx maximum_storage_version;
25
+ };
26
+ } // namespace
27
+
28
+ CompressionAvailabilityResult CompressionTypeIsAvailable(CompressionType compression_type,
29
+ optional_ptr<StorageManager> storage_manager) {
30
+ //! Max storage compatibility
31
+ vector<CompressionMethodRequirements> candidates({{CompressionType::COMPRESSION_PATAS, optional_idx(), 0},
32
+ {CompressionType::COMPRESSION_CHIMP, optional_idx(), 0},
33
+ {CompressionType::COMPRESSION_DICTIONARY, 0, 4},
34
+ {CompressionType::COMPRESSION_FSST, 0, 4},
35
+ {CompressionType::COMPRESSION_DICT_FSST, 5, optional_idx()}});
36
+
37
+ optional_idx current_storage_version;
38
+ if (storage_manager && storage_manager->HasStorageVersion()) {
39
+ current_storage_version = storage_manager->GetStorageVersion();
32
40
  }
33
- for (auto &type : types) {
34
- if (type == compression_type) {
35
- return true;
41
+ for (auto &candidate : candidates) {
42
+ auto &type = candidate.type;
43
+ if (type != compression_type) {
44
+ continue;
45
+ }
46
+ auto &min = candidate.minimum_storage_version;
47
+ auto &max = candidate.maximum_storage_version;
48
+
49
+ if (!min.IsValid()) {
50
+ //! Used to signal: always deprecated
51
+ return CompressionAvailabilityResult::Deprecated();
52
+ }
53
+
54
+ if (!current_storage_version.IsValid()) {
55
+ //! Can't determine in this call whether it's available or not, default to available
56
+ return CompressionAvailabilityResult();
57
+ }
58
+
59
+ auto current_version = current_storage_version.GetIndex();
60
+ D_ASSERT(min.IsValid());
61
+ if (min.GetIndex() > current_version) {
62
+ //! Minimum required storage version is higher than the current storage version, this method isn't available
63
+ //! yet
64
+ return CompressionAvailabilityResult::NotAvailableYet();
65
+ }
66
+ if (max.IsValid() && max.GetIndex() < current_version) {
67
+ //! Maximum supported storage version is lower than the current storage version, this method is no longer
68
+ //! available
69
+ return CompressionAvailabilityResult::Deprecated();
36
70
  }
71
+ return CompressionAvailabilityResult();
37
72
  }
38
- return false;
73
+ return CompressionAvailabilityResult();
39
74
  }
40
75
 
41
76
  CompressionType CompressionTypeFromString(const string &str) {
@@ -18,9 +18,14 @@ BinderException BinderException::ColumnNotFound(const string &name, const vector
18
18
  extra_info["name"] = name;
19
19
  if (!similar_bindings.empty()) {
20
20
  extra_info["candidates"] = StringUtil::Join(similar_bindings, ",");
21
+ return BinderException(
22
+ StringUtil::Format("Referenced column \"%s\" not found in FROM clause!%s", name, candidate_str),
23
+ extra_info);
24
+ } else {
25
+ return BinderException(
26
+ StringUtil::Format("Referenced column \"%s\" was not found because the FROM clause is missing", name),
27
+ extra_info);
21
28
  }
22
- return BinderException(
23
- StringUtil::Format("Referenced column \"%s\" not found in FROM clause!%s", name, candidate_str), extra_info);
24
29
  }
25
30
 
26
31
  BinderException BinderException::NoMatchingFunction(const string &catalog_name, const string &schema_name,
@@ -254,11 +254,11 @@ void UnscentedKalmanFilter::UpdateInternal(double measured_progress) {
254
254
  }
255
255
 
256
256
  // Ensure progress stays in bounds
257
- x[0] = std::max(0.0, std::min(1.0, x[0]));
257
+ x[0] = std::max(0.0, std::min(scale_factor, x[0]));
258
258
  }
259
259
 
260
260
  double UnscentedKalmanFilter::GetProgress() const {
261
- return x[0];
261
+ return x[0] / scale_factor;
262
262
  }
263
263
 
264
264
  double UnscentedKalmanFilter::GetVelocity() const {
@@ -82,4 +82,14 @@ void RandomEngine::SetSeed(uint64_t seed) {
82
82
  random_state->pcg.seed(seed);
83
83
  }
84
84
 
85
+ void RandomEngine::RandomData(duckdb::data_ptr_t data, duckdb::idx_t len) {
86
+ while (len) {
87
+ const auto random_integer = NextRandomInteger();
88
+ const auto next = duckdb::MinValue<duckdb::idx_t>(len, sizeof(random_integer));
89
+ memcpy(data, duckdb::const_data_ptr_cast(&random_integer), next);
90
+ data += next;
91
+ len -= next;
92
+ }
93
+ }
94
+
85
95
  } // namespace duckdb
@@ -138,8 +138,19 @@ static idx_t TemplatedSelectOperation(Vector &left, Vector &right, optional_ptr<
138
138
  false_sel.get());
139
139
  case PhysicalType::LIST:
140
140
  case PhysicalType::STRUCT:
141
- case PhysicalType::ARRAY:
142
- return NestedSelectOperation<OP>(left, right, sel, count, true_sel, false_sel, null_mask);
141
+ case PhysicalType::ARRAY: {
142
+ auto result_count = NestedSelectOperation<OP>(left, right, sel, count, true_sel, false_sel, null_mask);
143
+ if (true_sel && result_count > 0) {
144
+ std::sort(true_sel->data(), true_sel->data() + result_count);
145
+ }
146
+ if (false_sel) {
147
+ idx_t false_count = count - result_count;
148
+ if (false_count > 0) {
149
+ std::sort(false_sel->data(), false_sel->data() + false_count);
150
+ }
151
+ }
152
+ return result_count;
153
+ }
143
154
  default:
144
155
  throw InternalException("Invalid type for comparison");
145
156
  }
@@ -522,7 +522,9 @@ ErrorData ART::Insert(IndexLock &l, DataChunk &chunk, Vector &row_ids, IndexAppe
522
522
  if (keys[i].Empty()) {
523
523
  continue;
524
524
  }
525
- D_ASSERT(ARTOperator::Lookup(*this, tree, keys[i], 0));
525
+ auto leaf = ARTOperator::Lookup(*this, tree, keys[i], 0);
526
+ D_ASSERT(leaf);
527
+ D_ASSERT(ARTOperator::LookupInLeaf(*this, *leaf, row_id_keys[i]));
526
528
  }
527
529
  #endif
528
530
  return ErrorData();
@@ -602,8 +604,9 @@ void ART::Delete(IndexLock &state, DataChunk &input, Vector &row_ids) {
602
604
  continue;
603
605
  }
604
606
  auto leaf = ARTOperator::Lookup(*this, tree, keys[i], 0);
605
- if (leaf && leaf->GetType() == NType::LEAF_INLINED) {
606
- D_ASSERT(leaf->GetRowId() != row_id_keys[i].GetRowId());
607
+ if (leaf) {
608
+ auto contains_row_id = ARTOperator::LookupInLeaf(*this, *leaf, row_id_keys[i]);
609
+ D_ASSERT(!contains_row_id);
607
610
  }
608
611
  }
609
612
  #endif
@@ -154,28 +154,39 @@ string BoundIndex::AppendRowError(DataChunk &input, idx_t index) {
154
154
  return error;
155
155
  }
156
156
 
157
- void BoundIndex::ApplyBufferedAppends(const vector<LogicalType> &table_types, ColumnDataCollection &buffered_appends,
157
+ void BoundIndex::ApplyBufferedReplays(const vector<LogicalType> &table_types,
158
+ vector<BufferedIndexData> &buffered_replays,
158
159
  const vector<StorageIndex> &mapped_column_ids) {
159
- IndexAppendInfo index_append_info(IndexAppendMode::INSERT_DUPLICATES, nullptr);
160
-
161
- ColumnDataScanState state;
162
- buffered_appends.InitializeScan(state);
163
-
164
- DataChunk scan_chunk;
165
- buffered_appends.InitializeScanChunk(scan_chunk);
166
- DataChunk table_chunk;
167
- table_chunk.InitializeEmpty(table_types);
168
-
169
- while (buffered_appends.Scan(state, scan_chunk)) {
170
- for (idx_t i = 0; i < scan_chunk.ColumnCount() - 1; i++) {
171
- auto col_id = mapped_column_ids[i].GetPrimaryIndex();
172
- table_chunk.data[col_id].Reference(scan_chunk.data[i]);
173
- }
174
- table_chunk.SetCardinality(scan_chunk.size());
175
-
176
- auto error = Append(table_chunk, scan_chunk.data.back(), index_append_info);
177
- if (error.HasError()) {
178
- throw InternalException("error while applying buffered appends: " + error.Message());
160
+ for (auto &replay : buffered_replays) {
161
+ ColumnDataScanState state;
162
+ auto &buffered_data = *replay.data;
163
+ buffered_data.InitializeScan(state);
164
+
165
+ DataChunk scan_chunk;
166
+ buffered_data.InitializeScanChunk(scan_chunk);
167
+ DataChunk table_chunk;
168
+ table_chunk.InitializeEmpty(table_types);
169
+
170
+ while (buffered_data.Scan(state, scan_chunk)) {
171
+ for (idx_t i = 0; i < scan_chunk.ColumnCount() - 1; i++) {
172
+ auto col_id = mapped_column_ids[i].GetPrimaryIndex();
173
+ table_chunk.data[col_id].Reference(scan_chunk.data[i]);
174
+ }
175
+ table_chunk.SetCardinality(scan_chunk.size());
176
+
177
+ switch (replay.type) {
178
+ case BufferedIndexReplay::INSERT_ENTRY: {
179
+ IndexAppendInfo index_append_info(IndexAppendMode::INSERT_DUPLICATES, nullptr);
180
+ auto error = Append(table_chunk, scan_chunk.data.back(), index_append_info);
181
+ if (error.HasError()) {
182
+ throw InternalException("error while applying buffered appends: " + error.Message());
183
+ }
184
+ continue;
185
+ }
186
+ case BufferedIndexReplay::DEL_ENTRY: {
187
+ Delete(table_chunk, scan_chunk.data.back());
188
+ }
189
+ }
179
190
  }
180
191
  }
181
192
  }
@@ -8,6 +8,10 @@
8
8
 
9
9
  namespace duckdb {
10
10
 
11
+ BufferedIndexData::BufferedIndexData(BufferedIndexReplay replay_type, unique_ptr<ColumnDataCollection> data_p)
12
+ : type(replay_type), data(std::move(data_p)) {
13
+ }
14
+
11
15
  UnboundIndex::UnboundIndex(unique_ptr<CreateInfo> create_info, IndexStorageInfo storage_info_p,
12
16
  TableIOManager &table_io_manager, AttachedDatabase &db)
13
17
  : Index(create_info->Cast<CreateIndexInfo>().column_ids, table_io_manager, db), create_info(std::move(create_info)),
@@ -35,26 +39,33 @@ void UnboundIndex::CommitDrop() {
35
39
  }
36
40
  }
37
41
 
38
- void UnboundIndex::BufferChunk(DataChunk &chunk, Vector &row_ids, const vector<StorageIndex> &mapped_column_ids_p) {
42
+ void UnboundIndex::BufferChunk(DataChunk &index_column_chunk, Vector &row_ids,
43
+ const vector<StorageIndex> &mapped_column_ids_p, BufferedIndexReplay replay_type) {
39
44
  D_ASSERT(!column_ids.empty());
40
- auto types = chunk.GetTypes();
45
+ auto types = index_column_chunk.GetTypes(); // column types
41
46
  types.push_back(LogicalType::ROW_TYPE);
42
47
 
43
- if (!buffered_appends) {
44
- auto &allocator = Allocator::Get(db);
45
- buffered_appends = make_uniq<ColumnDataCollection>(allocator, types);
48
+ auto &allocator = Allocator::Get(db);
49
+
50
+ BufferedIndexData buffered_data(replay_type, make_uniq<ColumnDataCollection>(allocator, types));
51
+
52
+ //! First time we are buffering data, canonical column_id mapping is stored.
53
+ //! This should be a sorted list of all the physical offsets of Indexed columns on this table.
54
+ if (mapped_column_ids.empty()) {
46
55
  mapped_column_ids = mapped_column_ids_p;
47
56
  }
48
57
  D_ASSERT(mapped_column_ids == mapped_column_ids_p);
49
58
 
59
+ // Combined chunk has all the indexed columns and rowids.
50
60
  DataChunk combined_chunk;
51
61
  combined_chunk.InitializeEmpty(types);
52
- for (idx_t i = 0; i < chunk.ColumnCount(); i++) {
53
- combined_chunk.data[i].Reference(chunk.data[i]);
62
+ for (idx_t i = 0; i < index_column_chunk.ColumnCount(); i++) {
63
+ combined_chunk.data[i].Reference(index_column_chunk.data[i]);
54
64
  }
55
65
  combined_chunk.data.back().Reference(row_ids);
56
- combined_chunk.SetCardinality(chunk.size());
57
- buffered_appends->Append(combined_chunk);
66
+ combined_chunk.SetCardinality(index_column_chunk.size());
67
+ buffered_data.data->Append(combined_chunk);
68
+ buffered_replays.emplace_back(std::move(buffered_data));
58
69
  }
59
70
 
60
71
  } // namespace duckdb
@@ -888,6 +888,7 @@ idx_t ScanStructure::ResolvePredicates(DataChunk &keys, SelectionVector &match_s
888
888
  }
889
889
 
890
890
  // If there is a matcher for the probing side because of non-equality predicates, use it
891
+ idx_t result_count;
891
892
  if (ht.needs_chain_matcher) {
892
893
  idx_t no_match_count = 0;
893
894
  auto &matcher = no_match_sel ? ht.row_matcher_probe_no_match_sel : ht.row_matcher_probe;
@@ -895,12 +896,17 @@ idx_t ScanStructure::ResolvePredicates(DataChunk &keys, SelectionVector &match_s
895
896
 
896
897
  // we need to only use the vectors with the indices of the columns that are used in the probe phase, namely
897
898
  // the non-equality columns
898
- return matcher->Match(keys, key_state.vector_data, match_sel, this->count, pointers, no_match_sel,
899
- no_match_count);
899
+ result_count =
900
+ matcher->Match(keys, key_state.vector_data, match_sel, this->count, pointers, no_match_sel, no_match_count);
900
901
  } else {
901
902
  // no match sel is the opposite of match sel
902
- return this->count;
903
+ result_count = this->count;
903
904
  }
905
+
906
+ // Update total probe match count
907
+ ht.total_probe_matches.fetch_add(result_count, std::memory_order_relaxed);
908
+
909
+ return result_count;
904
910
  }
905
911
 
906
912
  idx_t ScanStructure::ScanInnerJoin(DataChunk &keys, SelectionVector &result_vector) {
@@ -94,7 +94,7 @@ unique_ptr<LocalSinkState> PhysicalBufferedBatchCollector::GetLocalSinkState(Exe
94
94
  unique_ptr<GlobalSinkState> PhysicalBufferedBatchCollector::GetGlobalSinkState(ClientContext &context) const {
95
95
  auto state = make_uniq<BufferedBatchCollectorGlobalState>();
96
96
  state->context = context.shared_from_this();
97
- state->buffered_data = make_shared_ptr<BatchedBufferedData>(state->context);
97
+ state->buffered_data = make_shared_ptr<BatchedBufferedData>(context);
98
98
  return std::move(state);
99
99
  }
100
100
 
@@ -48,7 +48,7 @@ SinkCombineResultType PhysicalBufferedCollector::Combine(ExecutionContext &conte
48
48
  unique_ptr<GlobalSinkState> PhysicalBufferedCollector::GetGlobalSinkState(ClientContext &context) const {
49
49
  auto state = make_uniq<BufferedCollectorGlobalState>();
50
50
  state->context = context.shared_from_this();
51
- state->buffered_data = make_shared_ptr<SimpleBufferedData>(state->context);
51
+ state->buffered_data = make_shared_ptr<SimpleBufferedData>(context);
52
52
  return std::move(state);
53
53
  }
54
54
 
@@ -164,6 +164,11 @@ public:
164
164
  }
165
165
  }
166
166
 
167
+ ~HashJoinGlobalSinkState() override {
168
+ DUCKDB_LOG(context, PhysicalOperatorLogType, op, "PhysicalHashJoin", "GetData",
169
+ {{"total_probe_matches", to_string(hash_table->total_probe_matches)}});
170
+ }
171
+
167
172
  void ScheduleFinalize(Pipeline &pipeline, Event &event);
168
173
  void InitializeProbeSpill();
169
174
 
@@ -184,7 +184,9 @@ int64_t CastFunctionSet::ImplicitCastCost(optional_ptr<ClientContext> context, c
184
184
  old_implicit_casting = DBConfig::GetSetting<OldImplicitCastingSetting>(*config);
185
185
  }
186
186
  if (old_implicit_casting) {
187
- score = 149;
187
+ // very high cost to avoid choosing this cast if any other option is available
188
+ // (it should be more costly than casting to TEMPLATE if that is available)
189
+ score = 10000000000;
188
190
  }
189
191
  }
190
192
  return score;
@@ -47,7 +47,7 @@ MacroBindResult MacroFunction::BindMacroFunction(
47
47
  InsertionOrderPreservingMap<unique_ptr<ParsedExpression>> &named_arguments, idx_t depth) {
48
48
 
49
49
  ExpressionBinder expr_binder(binder, binder.context);
50
-
50
+ expr_binder.lambda_bindings = binder.lambda_bindings;
51
51
  // Find argument types and separate positional and default arguments
52
52
  vector<LogicalType> positional_arg_types;
53
53
  InsertionOrderPreservingMap<LogicalType> named_arg_types;
@@ -198,7 +198,7 @@ scalar_function_t GetStringDecompressFunctionSwitch(const LogicalType &input_typ
198
198
  case LogicalTypeId::UHUGEINT:
199
199
  return GetStringDecompressFunction<uhugeint_t>(input_type);
200
200
  case LogicalTypeId::HUGEINT:
201
- return GetStringCompressFunction<hugeint_t>(input_type);
201
+ return GetStringDecompressFunction<hugeint_t>(input_type);
202
202
  default:
203
203
  throw InternalException("Unexpected type in GetStringDecompressFunctionSwitch");
204
204
  }
@@ -696,13 +696,15 @@ void PrepareSortData(Vector &result, idx_t size, SortKeyLengthInfo &key_lengths,
696
696
  }
697
697
  }
698
698
 
699
- void FinalizeSortData(Vector &result, idx_t size) {
699
+ void FinalizeSortData(Vector &result, idx_t size, const SortKeyLengthInfo &key_lengths,
700
+ const unsafe_vector<idx_t> &offsets) {
700
701
  switch (result.GetType().id()) {
701
702
  case LogicalTypeId::BLOB: {
702
703
  auto result_data = FlatVector::GetData<string_t>(result);
703
704
  // call Finalize on the result
704
705
  for (idx_t r = 0; r < size; r++) {
705
- result_data[r].Finalize();
706
+ result_data[r].SetSizeAndFinalize(offsets[r],
707
+ key_lengths.variable_lengths[r] + key_lengths.constant_length);
706
708
  }
707
709
  break;
708
710
  }
@@ -739,7 +741,7 @@ void CreateSortKeyInternal(vector<unique_ptr<SortKeyVectorData>> &sort_key_data,
739
741
  SortKeyConstructInfo info(modifiers[c], offsets, data_pointers.get());
740
742
  ConstructSortKey(*sort_key_data[c], info);
741
743
  }
742
- FinalizeSortData(result, row_count);
744
+ FinalizeSortData(result, row_count, key_lengths, offsets);
743
745
  }
744
746
 
745
747
  } // namespace
@@ -1220,7 +1220,7 @@ hugeint_t InterpolateOperator::Operation(const hugeint_t &lo, const double d, co
1220
1220
 
1221
1221
  template <>
1222
1222
  uhugeint_t InterpolateOperator::Operation(const uhugeint_t &lo, const double d, const uhugeint_t &hi) {
1223
- return Hugeint::Convert(Operation(Uhugeint::Cast<double>(lo), d, Uhugeint::Cast<double>(hi)));
1223
+ return Uhugeint::Convert(Operation(Uhugeint::Cast<double>(lo), d, Uhugeint::Cast<double>(hi)));
1224
1224
  }
1225
1225
 
1226
1226
  static interval_t MultiplyByDouble(const interval_t &i, const double &d) { // NOLINT
@@ -77,8 +77,10 @@ void ParseLogMessageFunction(DataChunk &args, ExpressionState &state, Vector &re
77
77
  } // namespace
78
78
 
79
79
  ScalarFunction ParseLogMessage::GetFunction() {
80
- return ScalarFunction({LogicalType::VARCHAR, LogicalType::VARCHAR}, LogicalType::ANY, ParseLogMessageFunction,
81
- ParseLogMessageBind, nullptr, nullptr, nullptr, LogicalType(LogicalTypeId::INVALID));
80
+ auto fun = ScalarFunction({LogicalType::VARCHAR, LogicalType::VARCHAR}, LogicalType::ANY, ParseLogMessageFunction,
81
+ ParseLogMessageBind, nullptr, nullptr, nullptr, LogicalType(LogicalTypeId::INVALID));
82
+ fun.errors = FunctionErrors::CAN_THROW_RUNTIME_ERROR;
83
+ return fun;
82
84
  }
83
85
 
84
86
  } // namespace duckdb
@@ -280,7 +280,31 @@ struct GlobalWriteCSVData : public GlobalFunctionData {
280
280
  return writer.FileSize();
281
281
  }
282
282
 
283
+ unique_ptr<CSVWriterState> GetLocalState(ClientContext &context, const idx_t flush_size) {
284
+ {
285
+ lock_guard<mutex> guard(local_state_lock);
286
+ if (!local_states.empty()) {
287
+ auto result = std::move(local_states.back());
288
+ local_states.pop_back();
289
+ return result;
290
+ }
291
+ }
292
+ auto result = make_uniq<CSVWriterState>(context, flush_size);
293
+ result->require_manual_flush = true;
294
+ return result;
295
+ }
296
+
297
+ void StoreLocalState(unique_ptr<CSVWriterState> lstate) {
298
+ lock_guard<mutex> guard(local_state_lock);
299
+ lstate->Reset();
300
+ local_states.push_back(std::move(lstate));
301
+ }
302
+
283
303
  CSVWriter writer;
304
+
305
+ private:
306
+ mutex local_state_lock;
307
+ vector<unique_ptr<CSVWriterState>> local_states;
284
308
  };
285
309
 
286
310
  static unique_ptr<LocalFunctionData> WriteCSVInitializeLocal(ExecutionContext &context, FunctionData &bind_data) {
@@ -371,9 +395,7 @@ CopyFunctionExecutionMode WriteCSVExecutionMode(bool preserve_insertion_order, b
371
395
  // Prepare Batch
372
396
  //===--------------------------------------------------------------------===//
373
397
  struct WriteCSVBatchData : public PreparedBatchData {
374
- explicit WriteCSVBatchData(ClientContext &context, const idx_t flush_size)
375
- : writer_local_state(make_uniq<CSVWriterState>(context, flush_size)) {
376
- writer_local_state->require_manual_flush = true;
398
+ explicit WriteCSVBatchData(unique_ptr<CSVWriterState> writer_state) : writer_local_state(std::move(writer_state)) {
377
399
  }
378
400
 
379
401
  //! The thread-local buffer to write data into
@@ -397,7 +419,8 @@ unique_ptr<PreparedBatchData> WriteCSVPrepareBatch(ClientContext &context, Funct
397
419
  auto &global_state = gstate.Cast<GlobalWriteCSVData>();
398
420
 
399
421
  // write CSV chunks to the batch data
400
- auto batch = make_uniq<WriteCSVBatchData>(context, NextPowerOfTwo(collection->SizeInBytes()));
422
+ auto local_writer_state = global_state.GetLocalState(context, NextPowerOfTwo(collection->SizeInBytes()));
423
+ auto batch = make_uniq<WriteCSVBatchData>(std::move(local_writer_state));
401
424
  for (auto &chunk : collection->Chunks()) {
402
425
  WriteCSVChunkInternal(global_state.writer, *batch->writer_local_state, cast_chunk, chunk, executor);
403
426
  }
@@ -412,6 +435,7 @@ void WriteCSVFlushBatch(ClientContext &context, FunctionData &bind_data, GlobalF
412
435
  auto &csv_batch = batch.Cast<WriteCSVBatchData>();
413
436
  auto &global_state = gstate.Cast<GlobalWriteCSVData>();
414
437
  global_state.writer.Flush(*csv_batch.writer_local_state);
438
+ global_state.StoreLocalState(std::move(csv_batch.writer_local_state));
415
439
  }
416
440
 
417
441
  //===--------------------------------------------------------------------===//
@@ -52,6 +52,8 @@ void DirectFileReader::Scan(ClientContext &context, GlobalTableFunctionState &gl
52
52
  }
53
53
 
54
54
  auto files = state.file_list;
55
+
56
+ auto &regular_fs = FileSystem::GetFileSystem(context);
55
57
  auto fs = CachingFileSystem::Get(context);
56
58
  idx_t out_idx = 0;
57
59
 
@@ -65,6 +67,14 @@ void DirectFileReader::Scan(ClientContext &context, GlobalTableFunctionState &gl
65
67
  flags |= FileFlags::FILE_FLAGS_DIRECT_IO;
66
68
  }
67
69
  file_handle = fs.OpenFile(QueryContext(context), file, flags);
70
+ } else {
71
+ // At least verify that the file exist
72
+ // The globbing behavior in remote filesystems can lead to files being listed that do not actually exist
73
+ if (FileSystem::IsRemoteFile(file.path) && !regular_fs.FileExists(file.path)) {
74
+ output.SetCardinality(0);
75
+ done = true;
76
+ return;
77
+ }
68
78
  }
69
79
 
70
80
  for (idx_t col_idx = 0; col_idx < state.column_ids.size(); col_idx++) {
@@ -10,10 +10,43 @@
10
10
 
11
11
  namespace duckdb {
12
12
 
13
+ namespace {
14
+
13
15
  //------------------------------------------------------------------------------
14
16
  // DirectMultiFileInfo
15
17
  //------------------------------------------------------------------------------
16
18
 
19
+ template <class OP>
20
+ struct DirectMultiFileInfo : MultiFileReaderInterface {
21
+ static unique_ptr<MultiFileReaderInterface> CreateInterface(ClientContext &context);
22
+ unique_ptr<BaseFileReaderOptions> InitializeOptions(ClientContext &context,
23
+ optional_ptr<TableFunctionInfo> info) override;
24
+ bool ParseCopyOption(ClientContext &context, const string &key, const vector<Value> &values,
25
+ BaseFileReaderOptions &options, vector<string> &expected_names,
26
+ vector<LogicalType> &expected_types) override;
27
+ bool ParseOption(ClientContext &context, const string &key, const Value &val, MultiFileOptions &file_options,
28
+ BaseFileReaderOptions &options) override;
29
+ unique_ptr<TableFunctionData> InitializeBindData(MultiFileBindData &multi_file_data,
30
+ unique_ptr<BaseFileReaderOptions> options) override;
31
+ void BindReader(ClientContext &context, vector<LogicalType> &return_types, vector<string> &names,
32
+ MultiFileBindData &bind_data) override;
33
+ optional_idx MaxThreads(const MultiFileBindData &bind_data_p, const MultiFileGlobalState &global_state,
34
+ FileExpandResult expand_result) override;
35
+ unique_ptr<GlobalTableFunctionState> InitializeGlobalState(ClientContext &context, MultiFileBindData &bind_data,
36
+ MultiFileGlobalState &global_state) override;
37
+ unique_ptr<LocalTableFunctionState> InitializeLocalState(ExecutionContext &, GlobalTableFunctionState &) override;
38
+ shared_ptr<BaseFileReader> CreateReader(ClientContext &context, GlobalTableFunctionState &gstate,
39
+ BaseUnionData &union_data, const MultiFileBindData &bind_data_p) override;
40
+ shared_ptr<BaseFileReader> CreateReader(ClientContext &context, GlobalTableFunctionState &gstate,
41
+ const OpenFileInfo &file, idx_t file_idx,
42
+ const MultiFileBindData &bind_data) override;
43
+ shared_ptr<BaseFileReader> CreateReader(ClientContext &context, const OpenFileInfo &file,
44
+ BaseFileReaderOptions &options,
45
+ const MultiFileOptions &file_options) override;
46
+ unique_ptr<NodeStatistics> GetCardinality(const MultiFileBindData &bind_data, idx_t file_count) override;
47
+ FileGlobInput GetGlobInput() override;
48
+ };
49
+
17
50
  template <class OP>
18
51
  unique_ptr<MultiFileReaderInterface> DirectMultiFileInfo<OP>::CreateInterface(ClientContext &context) {
19
52
  return make_uniq<DirectMultiFileInfo>();
@@ -132,14 +165,45 @@ FileGlobInput DirectMultiFileInfo<OP>::GetGlobInput() {
132
165
  }
133
166
 
134
167
  //------------------------------------------------------------------------------
135
- // Register
168
+ // Operations
136
169
  //------------------------------------------------------------------------------
170
+
171
+ struct ReadBlobOperation {
172
+ static constexpr const char *NAME = "read_blob";
173
+ static constexpr const char *FILE_TYPE = "blob";
174
+
175
+ static inline LogicalType TYPE() {
176
+ return LogicalType::BLOB;
177
+ }
178
+ };
179
+
180
+ struct ReadTextOperation {
181
+ static constexpr const char *NAME = "read_text";
182
+ static constexpr const char *FILE_TYPE = "text";
183
+
184
+ static inline LogicalType TYPE() {
185
+ return LogicalType::VARCHAR;
186
+ }
187
+ };
188
+
137
189
  template <class OP>
138
190
  static TableFunction GetFunction() {
139
191
  MultiFileFunction<DirectMultiFileInfo<OP>> table_function(OP::NAME);
192
+ // Erase extra multi file reader options
193
+ table_function.named_parameters.erase("filename");
194
+ table_function.named_parameters.erase("hive_partitioning");
195
+ table_function.named_parameters.erase("union_by_name");
196
+ table_function.named_parameters.erase("hive_types");
197
+ table_function.named_parameters.erase("hive_types_autocast");
140
198
  return table_function;
141
199
  }
142
200
 
201
+ } // namespace
202
+
203
+ //------------------------------------------------------------------------------
204
+ // Register
205
+ //------------------------------------------------------------------------------
206
+
143
207
  void ReadBlobFunction::RegisterFunction(BuiltinFunctions &set) {
144
208
  auto scan_fun = GetFunction<ReadBlobOperation>();
145
209
  set.AddFunction(MultiFileReader::CreateFunctionSet(scan_fun));
@@ -1,5 +1,5 @@
1
1
  #ifndef DUCKDB_PATCH_VERSION
2
- #define DUCKDB_PATCH_VERSION "1"
2
+ #define DUCKDB_PATCH_VERSION "2"
3
3
  #endif
4
4
  #ifndef DUCKDB_MINOR_VERSION
5
5
  #define DUCKDB_MINOR_VERSION 4
@@ -8,10 +8,10 @@
8
8
  #define DUCKDB_MAJOR_VERSION 1
9
9
  #endif
10
10
  #ifndef DUCKDB_VERSION
11
- #define DUCKDB_VERSION "v1.4.1"
11
+ #define DUCKDB_VERSION "v1.4.2"
12
12
  #endif
13
13
  #ifndef DUCKDB_SOURCE_ID
14
- #define DUCKDB_SOURCE_ID "b390a7c376"
14
+ #define DUCKDB_SOURCE_ID "68d7555f68"
15
15
  #endif
16
16
  #include "duckdb/function/table/system_functions.hpp"
17
17
  #include "duckdb/main/database.hpp"
@@ -90,9 +90,6 @@ public:
90
90
  //! Closes the writer, optionally writes a postfix
91
91
  void Close();
92
92
 
93
- unique_ptr<CSVWriterState> InitializeLocalWriteState(ClientContext &context, idx_t flush_size);
94
- unique_ptr<CSVWriterState> InitializeLocalWriteState(DatabaseInstance &db, idx_t flush_size);
95
-
96
93
  vector<unique_ptr<Expression>> string_casts;
97
94
 
98
95
  idx_t BytesWritten();