duckdb 1.1.4-dev11.0 → 1.1.4-dev14.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (220):
  1. package/LICENSE +1 -1
  2. package/binding.gyp +1 -0
  3. package/package.json +1 -1
  4. package/src/duckdb/extension/core_functions/function_list.cpp +1 -0
  5. package/src/duckdb/extension/core_functions/include/core_functions/scalar/map_functions.hpp +9 -0
  6. package/src/duckdb/extension/core_functions/scalar/date/current.cpp +1 -0
  7. package/src/duckdb/extension/core_functions/scalar/generic/can_implicitly_cast.cpp +2 -2
  8. package/src/duckdb/extension/core_functions/scalar/generic/typeof.cpp +1 -1
  9. package/src/duckdb/extension/core_functions/scalar/list/flatten.cpp +91 -61
  10. package/src/duckdb/extension/core_functions/scalar/map/map_extract.cpp +89 -8
  11. package/src/duckdb/extension/icu/icu-current.cpp +63 -0
  12. package/src/duckdb/extension/icu/icu-makedate.cpp +43 -39
  13. package/src/duckdb/extension/icu/icu-timezone.cpp +63 -63
  14. package/src/duckdb/extension/icu/icu_extension.cpp +2 -0
  15. package/src/duckdb/extension/icu/include/icu-casts.hpp +39 -0
  16. package/src/duckdb/extension/icu/include/icu-current.hpp +17 -0
  17. package/src/duckdb/extension/icu/third_party/icu/stubdata/stubdata.cpp +1 -1
  18. package/src/duckdb/extension/json/json_functions/json_structure.cpp +3 -1
  19. package/src/duckdb/extension/parquet/column_writer.cpp +26 -18
  20. package/src/duckdb/extension/parquet/include/parquet_reader.hpp +0 -6
  21. package/src/duckdb/extension/parquet/include/parquet_writer.hpp +15 -1
  22. package/src/duckdb/extension/parquet/include/resizable_buffer.hpp +1 -0
  23. package/src/duckdb/extension/parquet/parquet_extension.cpp +67 -15
  24. package/src/duckdb/extension/parquet/parquet_reader.cpp +5 -3
  25. package/src/duckdb/extension/parquet/parquet_writer.cpp +5 -6
  26. package/src/duckdb/src/catalog/catalog.cpp +21 -8
  27. package/src/duckdb/src/catalog/catalog_search_path.cpp +17 -1
  28. package/src/duckdb/src/catalog/catalog_set.cpp +1 -1
  29. package/src/duckdb/src/catalog/default/default_functions.cpp +0 -3
  30. package/src/duckdb/src/catalog/dependency_list.cpp +7 -0
  31. package/src/duckdb/src/common/adbc/adbc.cpp +1 -56
  32. package/src/duckdb/src/common/arrow/arrow_converter.cpp +3 -2
  33. package/src/duckdb/src/common/arrow/arrow_type_extension.cpp +58 -28
  34. package/src/duckdb/src/common/arrow/schema_metadata.cpp +1 -1
  35. package/src/duckdb/src/common/compressed_file_system.cpp +6 -2
  36. package/src/duckdb/src/common/enum_util.cpp +26 -22
  37. package/src/duckdb/src/common/error_data.cpp +3 -2
  38. package/src/duckdb/src/common/gzip_file_system.cpp +8 -8
  39. package/src/duckdb/src/common/local_file_system.cpp +2 -2
  40. package/src/duckdb/src/common/multi_file_reader.cpp +1 -1
  41. package/src/duckdb/src/common/random_engine.cpp +4 -1
  42. package/src/duckdb/src/common/serializer/memory_stream.cpp +23 -19
  43. package/src/duckdb/src/common/serializer/serializer.cpp +1 -1
  44. package/src/duckdb/src/common/types/bit.cpp +1 -1
  45. package/src/duckdb/src/common/types/column/column_data_allocator.cpp +0 -5
  46. package/src/duckdb/src/common/types/column/column_data_collection.cpp +4 -1
  47. package/src/duckdb/src/common/types/data_chunk.cpp +2 -1
  48. package/src/duckdb/src/common/types/row/tuple_data_segment.cpp +0 -4
  49. package/src/duckdb/src/common/types.cpp +1 -1
  50. package/src/duckdb/src/execution/index/art/art.cpp +52 -42
  51. package/src/duckdb/src/execution/index/art/leaf.cpp +4 -9
  52. package/src/duckdb/src/execution/index/art/node.cpp +13 -13
  53. package/src/duckdb/src/execution/index/art/prefix.cpp +21 -16
  54. package/src/duckdb/src/execution/index/bound_index.cpp +6 -8
  55. package/src/duckdb/src/execution/index/fixed_size_allocator.cpp +39 -34
  56. package/src/duckdb/src/execution/index/fixed_size_buffer.cpp +2 -1
  57. package/src/duckdb/src/execution/index/unbound_index.cpp +10 -0
  58. package/src/duckdb/src/execution/operator/aggregate/physical_streaming_window.cpp +62 -44
  59. package/src/duckdb/src/execution/operator/csv_scanner/scanner/column_count_scanner.cpp +26 -0
  60. package/src/duckdb/src/execution/operator/csv_scanner/scanner/string_value_scanner.cpp +69 -40
  61. package/src/duckdb/src/execution/operator/csv_scanner/sniffer/dialect_detection.cpp +3 -7
  62. package/src/duckdb/src/execution/operator/csv_scanner/sniffer/header_detection.cpp +11 -5
  63. package/src/duckdb/src/execution/operator/csv_scanner/sniffer/type_detection.cpp +4 -0
  64. package/src/duckdb/src/execution/operator/csv_scanner/state_machine/csv_state_machine_cache.cpp +8 -8
  65. package/src/duckdb/src/execution/operator/csv_scanner/util/csv_error.cpp +36 -12
  66. package/src/duckdb/src/execution/operator/csv_scanner/util/csv_reader_options.cpp +12 -9
  67. package/src/duckdb/src/execution/operator/join/physical_hash_join.cpp +0 -1
  68. package/src/duckdb/src/execution/operator/persistent/physical_copy_database.cpp +29 -1
  69. package/src/duckdb/src/execution/operator/persistent/physical_delete.cpp +58 -10
  70. package/src/duckdb/src/execution/operator/persistent/physical_insert.cpp +58 -35
  71. package/src/duckdb/src/execution/operator/schema/physical_create_art_index.cpp +2 -1
  72. package/src/duckdb/src/execution/radix_partitioned_hashtable.cpp +9 -4
  73. package/src/duckdb/src/execution/sample/reservoir_sample.cpp +7 -6
  74. package/src/duckdb/src/function/compression_config.cpp +4 -0
  75. package/src/duckdb/src/function/function_binder.cpp +1 -1
  76. package/src/duckdb/src/function/scalar/system/write_log.cpp +2 -2
  77. package/src/duckdb/src/function/table/arrow/arrow_duck_schema.cpp +15 -2
  78. package/src/duckdb/src/function/table/arrow_conversion.cpp +10 -10
  79. package/src/duckdb/src/function/table/copy_csv.cpp +8 -5
  80. package/src/duckdb/src/function/table/read_csv.cpp +21 -4
  81. package/src/duckdb/src/function/table/sniff_csv.cpp +7 -0
  82. package/src/duckdb/src/function/table/system/duckdb_extensions.cpp +4 -0
  83. package/src/duckdb/src/function/table/system/duckdb_secret_types.cpp +71 -0
  84. package/src/duckdb/src/function/table/system_functions.cpp +1 -0
  85. package/src/duckdb/src/function/table/table_scan.cpp +120 -36
  86. package/src/duckdb/src/function/table/version/pragma_version.cpp +4 -4
  87. package/src/duckdb/src/function/window/window_aggregate_function.cpp +6 -1
  88. package/src/duckdb/src/function/window/window_boundaries_state.cpp +135 -11
  89. package/src/duckdb/src/function/window/window_segment_tree.cpp +50 -22
  90. package/src/duckdb/src/function/window/window_token_tree.cpp +4 -3
  91. package/src/duckdb/src/include/duckdb/catalog/catalog.hpp +4 -0
  92. package/src/duckdb/src/include/duckdb/catalog/catalog_search_path.hpp +2 -0
  93. package/src/duckdb/src/include/duckdb/catalog/dependency_list.hpp +1 -0
  94. package/src/duckdb/src/include/duckdb/common/arrow/arrow_type_extension.hpp +4 -2
  95. package/src/duckdb/src/include/duckdb/common/enum_util.hpp +8 -8
  96. package/src/duckdb/src/include/duckdb/common/multi_file_reader.hpp +0 -2
  97. package/src/duckdb/src/include/duckdb/common/serializer/deserializer.hpp +8 -3
  98. package/src/duckdb/src/include/duckdb/common/serializer/memory_stream.hpp +6 -1
  99. package/src/duckdb/src/include/duckdb/common/serializer/serialization_data.hpp +25 -0
  100. package/src/duckdb/src/include/duckdb/common/serializer/serializer.hpp +9 -3
  101. package/src/duckdb/src/include/duckdb/common/types/selection_vector.hpp +1 -1
  102. package/src/duckdb/src/include/duckdb/execution/index/art/art.hpp +11 -14
  103. package/src/duckdb/src/include/duckdb/execution/index/art/prefix.hpp +5 -4
  104. package/src/duckdb/src/include/duckdb/execution/index/bound_index.hpp +21 -10
  105. package/src/duckdb/src/include/duckdb/execution/index/fixed_size_allocator.hpp +6 -5
  106. package/src/duckdb/src/include/duckdb/execution/index/fixed_size_buffer.hpp +37 -32
  107. package/src/duckdb/src/include/duckdb/execution/operator/csv_scanner/base_scanner.hpp +36 -1
  108. package/src/duckdb/src/include/duckdb/execution/operator/csv_scanner/column_count_scanner.hpp +3 -0
  109. package/src/duckdb/src/include/duckdb/execution/operator/csv_scanner/sniffer/csv_sniffer.hpp +2 -0
  110. package/src/duckdb/src/include/duckdb/execution/operator/csv_scanner/state_machine_options.hpp +5 -5
  111. package/src/duckdb/src/include/duckdb/execution/operator/csv_scanner/string_value_scanner.hpp +5 -30
  112. package/src/duckdb/src/include/duckdb/execution/reservoir_sample.hpp +7 -1
  113. package/src/duckdb/src/include/duckdb/function/scalar_function.hpp +3 -3
  114. package/src/duckdb/src/include/duckdb/function/table/arrow/arrow_duck_schema.hpp +1 -0
  115. package/src/duckdb/src/include/duckdb/function/table/system_functions.hpp +4 -0
  116. package/src/duckdb/src/include/duckdb/function/window/window_boundaries_state.hpp +2 -2
  117. package/src/duckdb/src/include/duckdb/logging/logger.hpp +40 -119
  118. package/src/duckdb/src/include/duckdb/logging/logging.hpp +0 -2
  119. package/src/duckdb/src/include/duckdb/main/config.hpp +5 -0
  120. package/src/duckdb/src/include/duckdb/main/connection.hpp +0 -8
  121. package/src/duckdb/src/include/duckdb/main/connection_manager.hpp +2 -1
  122. package/src/duckdb/src/include/duckdb/main/extension.hpp +1 -0
  123. package/src/duckdb/src/include/duckdb/main/extension_entries.hpp +11 -7
  124. package/src/duckdb/src/include/duckdb/main/extension_helper.hpp +1 -0
  125. package/src/duckdb/src/include/duckdb/main/secret/secret.hpp +2 -0
  126. package/src/duckdb/src/include/duckdb/main/secret/secret_manager.hpp +3 -0
  127. package/src/duckdb/src/include/duckdb/main/settings.hpp +10 -0
  128. package/src/duckdb/src/include/duckdb/parser/constraint.hpp +9 -0
  129. package/src/duckdb/src/include/duckdb/parser/expression/window_expression.hpp +36 -9
  130. package/src/duckdb/src/include/duckdb/parser/parsed_data/create_view_info.hpp +2 -1
  131. package/src/duckdb/src/include/duckdb/parser/query_node/set_operation_node.hpp +8 -2
  132. package/src/duckdb/src/include/duckdb/planner/binder.hpp +4 -0
  133. package/src/duckdb/src/include/duckdb/planner/expression/bound_parameter_data.hpp +9 -1
  134. package/src/duckdb/src/include/duckdb/planner/filter/constant_filter.hpp +1 -0
  135. package/src/duckdb/src/include/duckdb/planner/filter/in_filter.hpp +0 -2
  136. package/src/duckdb/src/include/duckdb/planner/filter/optional_filter.hpp +4 -4
  137. package/src/duckdb/src/include/duckdb/planner/table_filter.hpp +1 -1
  138. package/src/duckdb/src/include/duckdb/storage/data_table.hpp +14 -10
  139. package/src/duckdb/src/include/duckdb/storage/index_storage_info.hpp +4 -0
  140. package/src/duckdb/src/include/duckdb/storage/single_file_block_manager.hpp +6 -1
  141. package/src/duckdb/src/include/duckdb/storage/storage_info.hpp +7 -2
  142. package/src/duckdb/src/include/duckdb/storage/storage_manager.hpp +9 -0
  143. package/src/duckdb/src/include/duckdb/storage/storage_options.hpp +2 -0
  144. package/src/duckdb/src/include/duckdb/storage/string_uncompressed.hpp +4 -3
  145. package/src/duckdb/src/include/duckdb/storage/table/column_data.hpp +2 -0
  146. package/src/duckdb/src/include/duckdb/storage/table/table_index_list.hpp +6 -4
  147. package/src/duckdb/src/include/duckdb/storage/table/table_statistics.hpp +1 -1
  148. package/src/duckdb/src/include/duckdb/storage/write_ahead_log.hpp +2 -0
  149. package/src/duckdb/src/include/duckdb/transaction/local_storage.hpp +2 -0
  150. package/src/duckdb/src/include/duckdb/transaction/meta_transaction.hpp +1 -1
  151. package/src/duckdb/src/logging/logger.cpp +8 -66
  152. package/src/duckdb/src/main/attached_database.cpp +3 -1
  153. package/src/duckdb/src/main/client_context.cpp +4 -2
  154. package/src/duckdb/src/main/config.cpp +20 -2
  155. package/src/duckdb/src/main/connection.cpp +2 -29
  156. package/src/duckdb/src/main/connection_manager.cpp +5 -3
  157. package/src/duckdb/src/main/database.cpp +2 -2
  158. package/src/duckdb/src/main/extension/extension_helper.cpp +4 -5
  159. package/src/duckdb/src/main/extension/extension_install.cpp +23 -10
  160. package/src/duckdb/src/main/extension/extension_load.cpp +6 -7
  161. package/src/duckdb/src/main/extension.cpp +27 -9
  162. package/src/duckdb/src/main/secret/secret_manager.cpp +11 -0
  163. package/src/duckdb/src/main/settings/custom_settings.cpp +44 -0
  164. package/src/duckdb/src/optimizer/column_lifetime_analyzer.cpp +6 -0
  165. package/src/duckdb/src/optimizer/filter_combiner.cpp +13 -3
  166. package/src/duckdb/src/optimizer/filter_pushdown.cpp +33 -6
  167. package/src/duckdb/src/optimizer/late_materialization.cpp +14 -3
  168. package/src/duckdb/src/optimizer/remove_unused_columns.cpp +0 -3
  169. package/src/duckdb/src/parser/parsed_data/attach_info.cpp +5 -1
  170. package/src/duckdb/src/parser/parsed_data/create_view_info.cpp +6 -3
  171. package/src/duckdb/src/parser/query_node/set_operation_node.cpp +49 -0
  172. package/src/duckdb/src/parser/transform/expression/transform_columnref.cpp +1 -0
  173. package/src/duckdb/src/parser/transform/expression/transform_function.cpp +50 -12
  174. package/src/duckdb/src/planner/binder/expression/bind_columnref_expression.cpp +7 -5
  175. package/src/duckdb/src/planner/binder/expression/bind_comparison_expression.cpp +1 -0
  176. package/src/duckdb/src/planner/binder/expression/bind_operator_expression.cpp +2 -2
  177. package/src/duckdb/src/planner/binder/expression/bind_star_expression.cpp +12 -2
  178. package/src/duckdb/src/planner/binder/statement/bind_copy_database.cpp +0 -1
  179. package/src/duckdb/src/planner/binder/statement/bind_create.cpp +55 -39
  180. package/src/duckdb/src/planner/binder/statement/bind_execute.cpp +2 -1
  181. package/src/duckdb/src/planner/binder/tableref/bind_basetableref.cpp +15 -7
  182. package/src/duckdb/src/planner/binder/tableref/bind_showref.cpp +13 -8
  183. package/src/duckdb/src/planner/binder/tableref/bind_table_function.cpp +8 -3
  184. package/src/duckdb/src/planner/expression/bound_function_expression.cpp +17 -1
  185. package/src/duckdb/src/planner/expression_binder/index_binder.cpp +1 -0
  186. package/src/duckdb/src/planner/filter/conjunction_filter.cpp +1 -0
  187. package/src/duckdb/src/planner/filter/constant_filter.cpp +21 -0
  188. package/src/duckdb/src/planner/filter/in_filter.cpp +4 -7
  189. package/src/duckdb/src/planner/logical_operator.cpp +5 -3
  190. package/src/duckdb/src/planner/planner.cpp +1 -1
  191. package/src/duckdb/src/planner/subquery/flatten_dependent_join.cpp +2 -0
  192. package/src/duckdb/src/storage/checkpoint/table_data_writer.cpp +3 -4
  193. package/src/duckdb/src/storage/checkpoint_manager.cpp +3 -5
  194. package/src/duckdb/src/storage/compression/dictionary/decompression.cpp +4 -4
  195. package/src/duckdb/src/storage/compression/fsst.cpp +2 -2
  196. package/src/duckdb/src/storage/compression/roaring/common.cpp +10 -1
  197. package/src/duckdb/src/storage/compression/string_uncompressed.cpp +11 -6
  198. package/src/duckdb/src/storage/compression/validity_uncompressed.cpp +4 -0
  199. package/src/duckdb/src/storage/compression/zstd.cpp +6 -0
  200. package/src/duckdb/src/storage/data_table.cpp +104 -109
  201. package/src/duckdb/src/storage/local_storage.cpp +8 -6
  202. package/src/duckdb/src/storage/magic_bytes.cpp +1 -1
  203. package/src/duckdb/src/storage/serialization/serialize_dependency.cpp +3 -3
  204. package/src/duckdb/src/storage/serialization/serialize_nodes.cpp +3 -3
  205. package/src/duckdb/src/storage/serialization/serialize_query_node.cpp +7 -5
  206. package/src/duckdb/src/storage/single_file_block_manager.cpp +95 -28
  207. package/src/duckdb/src/storage/storage_info.cpp +38 -0
  208. package/src/duckdb/src/storage/storage_manager.cpp +11 -0
  209. package/src/duckdb/src/storage/table/column_data.cpp +4 -0
  210. package/src/duckdb/src/storage/table/column_data_checkpointer.cpp +3 -3
  211. package/src/duckdb/src/storage/table/row_group_collection.cpp +67 -68
  212. package/src/duckdb/src/storage/table/table_statistics.cpp +4 -4
  213. package/src/duckdb/src/storage/table_index_list.cpp +41 -15
  214. package/src/duckdb/src/storage/wal_replay.cpp +3 -1
  215. package/src/duckdb/src/storage/write_ahead_log.cpp +11 -4
  216. package/src/duckdb/src/transaction/meta_transaction.cpp +1 -1
  217. package/src/duckdb/src/verification/deserialized_statement_verifier.cpp +2 -1
  218. package/src/duckdb/third_party/httplib/httplib.hpp +0 -1
  219. package/src/duckdb/third_party/re2/util/logging.h +10 -10
  220. package/src/duckdb/ub_src_function_table_system.cpp +2 -0
@@ -47,7 +47,10 @@ class DeleteLocalState : public LocalSinkState {
47
47
  public:
48
48
  DeleteLocalState(ClientContext &context, TableCatalogEntry &table,
49
49
  const vector<unique_ptr<BoundConstraint>> &bound_constraints) {
50
- delete_chunk.Initialize(Allocator::Get(context), table.GetTypes());
50
+ const auto &types = table.GetTypes();
51
+ auto initialize = vector<bool>(types.size(), false);
52
+ delete_chunk.Initialize(Allocator::Get(context), types, initialize);
53
+
51
54
  auto &storage = table.GetStorage();
52
55
  delete_state = storage.InitializeDelete(table, context, bound_constraints);
53
56
  }
@@ -64,34 +67,79 @@ SinkResultType PhysicalDelete::Sink(ExecutionContext &context, DataChunk &chunk,
64
67
  auto &transaction = DuckTransaction::Get(context.client, table.db);
65
68
  auto &row_ids = chunk.data[row_id_index];
66
69
 
67
- vector<StorageIndex> column_ids;
68
- for (idx_t i = 0; i < table.ColumnCount(); i++) {
69
- column_ids.emplace_back(i);
70
- };
71
- auto fetch_state = ColumnFetchState();
72
-
73
70
  lock_guard<mutex> delete_guard(g_state.delete_lock);
74
71
  if (!return_chunk && !g_state.has_unique_indexes) {
75
72
  g_state.deleted_count += table.Delete(*l_state.delete_state, context.client, row_ids, chunk.size());
76
73
  return SinkResultType::NEED_MORE_INPUT;
77
74
  }
78
75
 
79
- // Fetch the to-be-deleted chunk.
76
+ auto types = table.GetTypes();
77
+ auto to_be_fetched = vector<bool>(types.size(), return_chunk);
78
+ vector<StorageIndex> column_ids;
79
+ vector<LogicalType> column_types;
80
+ if (return_chunk) {
81
+ // Fetch all columns.
82
+ column_types = types;
83
+ for (idx_t i = 0; i < table.ColumnCount(); i++) {
84
+ column_ids.emplace_back(i);
85
+ }
86
+
87
+ } else {
88
+ // Fetch only the required columns for updating the delete indexes.
89
+ auto &local_storage = LocalStorage::Get(context.client, table.db);
90
+ auto storage = local_storage.GetStorage(table);
91
+ unordered_set<column_t> indexed_column_id_set;
92
+ storage->delete_indexes.Scan([&](Index &index) {
93
+ if (!index.IsBound() || !index.IsUnique()) {
94
+ return false;
95
+ }
96
+ auto &set = index.GetColumnIdSet();
97
+ indexed_column_id_set.insert(set.begin(), set.end());
98
+ return false;
99
+ });
100
+ for (auto &col : indexed_column_id_set) {
101
+ column_ids.emplace_back(col);
102
+ }
103
+ sort(column_ids.begin(), column_ids.end());
104
+ for (auto &col : column_ids) {
105
+ auto i = col.GetPrimaryIndex();
106
+ to_be_fetched[i] = true;
107
+ column_types.push_back(types[i]);
108
+ }
109
+ }
110
+
80
111
  l_state.delete_chunk.Reset();
81
112
  row_ids.Flatten(chunk.size());
82
- table.Fetch(transaction, l_state.delete_chunk, column_ids, row_ids, chunk.size(), fetch_state);
113
+
114
+ // Fetch the to-be-deleted chunk.
115
+ DataChunk fetch_chunk;
116
+ fetch_chunk.Initialize(Allocator::Get(context.client), column_types, chunk.size());
117
+ auto fetch_state = ColumnFetchState();
118
+ table.Fetch(transaction, fetch_chunk, column_ids, row_ids, chunk.size(), fetch_state);
119
+
120
+ // Reference the necessary columns of the fetch_chunk.
121
+ idx_t fetch_idx = 0;
122
+ for (idx_t i = 0; i < table.ColumnCount(); i++) {
123
+ if (to_be_fetched[i]) {
124
+ l_state.delete_chunk.data[i].Reference(fetch_chunk.data[fetch_idx++]);
125
+ continue;
126
+ }
127
+ l_state.delete_chunk.data[i].Reference(Value(types[i]));
128
+ }
129
+ l_state.delete_chunk.SetCardinality(fetch_chunk);
83
130
 
84
131
  // Append the deleted row IDs to the delete indexes.
85
132
  // If we only delete local row IDs, then the delete_chunk is empty.
86
133
  if (g_state.has_unique_indexes && l_state.delete_chunk.size() != 0) {
87
134
  auto &local_storage = LocalStorage::Get(context.client, table.db);
88
135
  auto storage = local_storage.GetStorage(table);
136
+ IndexAppendInfo index_append_info(IndexAppendMode::IGNORE_DUPLICATES, nullptr);
89
137
  storage->delete_indexes.Scan([&](Index &index) {
90
138
  if (!index.IsBound() || !index.IsUnique()) {
91
139
  return false;
92
140
  }
93
141
  auto &bound_index = index.Cast<BoundIndex>();
94
- auto error = bound_index.Append(l_state.delete_chunk, row_ids);
142
+ auto error = bound_index.Append(l_state.delete_chunk, row_ids, index_append_info);
95
143
  if (error.HasError()) {
96
144
  throw InternalException("failed to update delete ART in physical delete: ", error.Message());
97
145
  }
@@ -229,6 +229,7 @@ static void CreateUpdateChunk(ExecutionContext &context, DataChunk &chunk, Table
229
229
  auto &do_update_condition = op.do_update_condition;
230
230
  auto &set_types = op.set_types;
231
231
  auto &set_expressions = op.set_expressions;
232
+
232
233
  // Check the optional condition for the DO UPDATE clause, to filter which rows will be updated
233
234
  if (do_update_condition) {
234
235
  DataChunk do_update_filter_result;
@@ -252,19 +253,28 @@ static void CreateUpdateChunk(ExecutionContext &context, DataChunk &chunk, Table
252
253
  chunk.SetCardinality(selection.Count());
253
254
  // Also apply this Slice to the to-update row_ids
254
255
  row_ids.Slice(selection.Selection(), selection.Count());
256
+ row_ids.Flatten(selection.Count());
255
257
  }
256
258
  }
257
259
 
258
- // Execute the SET expressions
259
- update_chunk.Initialize(context.client, set_types);
260
+ if (chunk.size() == 0) {
261
+ auto initialize = vector<bool>(set_types.size(), false);
262
+ update_chunk.Initialize(context.client, set_types, initialize, chunk.size());
263
+ update_chunk.SetCardinality(chunk);
264
+ return;
265
+ }
266
+
267
+ // Execute the SET expressions.
268
+ update_chunk.Initialize(context.client, set_types, chunk.size());
260
269
  ExpressionExecutor executor(context.client, set_expressions);
261
270
  executor.Execute(chunk, update_chunk);
262
271
  update_chunk.SetCardinality(chunk);
263
272
  }
264
273
 
265
274
  template <bool GLOBAL>
266
- static idx_t PerformOnConflictAction(InsertLocalState &lstate, ExecutionContext &context, DataChunk &chunk,
267
- TableCatalogEntry &table, Vector &row_ids, const PhysicalInsert &op) {
275
+ static idx_t PerformOnConflictAction(InsertLocalState &lstate, InsertGlobalState &gstate, ExecutionContext &context,
276
+ DataChunk &chunk, TableCatalogEntry &table, Vector &row_ids,
277
+ const PhysicalInsert &op) {
268
278
  // Early-out, if we do nothing on conflicting rows.
269
279
  if (op.action_type == OnConflictAction::NOTHING) {
270
280
  return 0;
@@ -275,15 +285,8 @@ static idx_t PerformOnConflictAction(InsertLocalState &lstate, ExecutionContext
275
285
  CreateUpdateChunk(context, chunk, table, row_ids, update_chunk, op);
276
286
  auto &data_table = table.GetStorage();
277
287
 
278
- // Perform the UPDATE on the (global) storage.
279
- if (!op.update_is_del_and_insert) {
280
- if (GLOBAL) {
281
- auto update_state = data_table.InitializeUpdate(table, context.client, op.bound_constraints);
282
- data_table.Update(*update_state, context.client, row_ids, set_columns, update_chunk);
283
- return update_chunk.size();
284
- }
285
- auto &local_storage = LocalStorage::Get(context.client, data_table.db);
286
- local_storage.Update(data_table, row_ids, set_columns, update_chunk);
288
+ if (update_chunk.size() == 0) {
289
+ // Nothing to do
287
290
  return update_chunk.size();
288
291
  }
289
292
 
@@ -297,6 +300,27 @@ static idx_t PerformOnConflictAction(InsertLocalState &lstate, ExecutionContext
297
300
  append_chunk.data[set_columns[i].index].Reference(update_chunk.data[i]);
298
301
  }
299
302
 
303
+ // Perform the UPDATE on the (global) storage.
304
+ if (!op.update_is_del_and_insert) {
305
+ if (!op.parallel && op.return_chunk) {
306
+ gstate.return_collection.Append(append_chunk);
307
+ }
308
+
309
+ if (GLOBAL) {
310
+ auto update_state = data_table.InitializeUpdate(table, context.client, op.bound_constraints);
311
+ data_table.Update(*update_state, context.client, row_ids, set_columns, update_chunk);
312
+ return update_chunk.size();
313
+ }
314
+ auto &local_storage = LocalStorage::Get(context.client, data_table.db);
315
+ if (gstate.initialized) {
316
+ // Flush the data first, it might be referenced by the Update
317
+ data_table.FinalizeLocalAppend(gstate.append_state);
318
+ gstate.initialized = false;
319
+ }
320
+ local_storage.Update(data_table, row_ids, set_columns, update_chunk);
321
+ return update_chunk.size();
322
+ }
323
+
300
324
  if (GLOBAL) {
301
325
  auto &delete_state = lstate.GetDeleteState(data_table, table, context.client);
302
326
  data_table.Delete(delete_state, context.client, row_ids, update_chunk.size());
@@ -305,6 +329,9 @@ static idx_t PerformOnConflictAction(InsertLocalState &lstate, ExecutionContext
305
329
  local_storage.Delete(data_table, row_ids, update_chunk.size());
306
330
  }
307
331
 
332
+ if (!op.parallel && op.return_chunk) {
333
+ gstate.return_collection.Append(append_chunk);
334
+ }
308
335
  data_table.LocalAppend(table, context.client, append_chunk, op.bound_constraints, row_ids, append_chunk);
309
336
  return update_chunk.size();
310
337
  }
@@ -357,8 +384,8 @@ static void CheckDistinctnessInternal(ValidityMask &valid, vector<reference<Vect
357
384
  }
358
385
  }
359
386
 
360
- void PrepareSortKeys(DataChunk &input, unordered_map<column_t, unique_ptr<Vector>> &sort_keys,
361
- const unordered_set<column_t> &column_ids) {
387
+ static void PrepareSortKeys(DataChunk &input, unordered_map<column_t, unique_ptr<Vector>> &sort_keys,
388
+ const unordered_set<column_t> &column_ids) {
362
389
  OrderModifiers order_modifiers(OrderType::ASCENDING, OrderByNullType::NULLS_LAST);
363
390
  for (auto &it : column_ids) {
364
391
  auto &sort_key = sort_keys[it];
@@ -440,7 +467,7 @@ static void VerifyOnConflictCondition(ExecutionContext &context, DataChunk &comb
440
467
 
441
468
  template <bool GLOBAL>
442
469
  static idx_t HandleInsertConflicts(TableCatalogEntry &table, ExecutionContext &context, InsertLocalState &lstate,
443
- DataChunk &tuples, const PhysicalInsert &op) {
470
+ InsertGlobalState &gstate, DataChunk &tuples, const PhysicalInsert &op) {
444
471
  auto &types_to_fetch = op.types_to_fetch;
445
472
  auto &on_conflict_condition = op.on_conflict_condition;
446
473
  auto &conflict_target = op.conflict_target;
@@ -510,7 +537,7 @@ static idx_t HandleInsertConflicts(TableCatalogEntry &table, ExecutionContext &c
510
537
  RegisterUpdatedRows(lstate, row_ids, combined_chunk.size());
511
538
  }
512
539
 
513
- affected_tuples += PerformOnConflictAction<GLOBAL>(lstate, context, combined_chunk, table, row_ids, op);
540
+ affected_tuples += PerformOnConflictAction<GLOBAL>(lstate, gstate, context, combined_chunk, table, row_ids, op);
514
541
 
515
542
  // Remove the conflicting tuples from the insert chunk
516
543
  SelectionVector sel_vec(tuples.size());
@@ -590,6 +617,11 @@ idx_t PhysicalInsert::OnConflictHandling(TableCatalogEntry &table, ExecutionCont
590
617
  }
591
618
  }
592
619
  if (action_type == OnConflictAction::UPDATE) {
620
+ if (do_update_condition) {
621
+ //! See https://github.com/duckdblabs/duckdb-internal/issues/4090 for context
622
+ throw NotImplementedException("Inner conflicts detected with a conditional DO UPDATE on-conflict "
623
+ "action, not fully implemented yet");
624
+ }
593
625
  ManagedSelection last_occurrences(last_occurrences_of_conflict.size());
594
626
  for (auto &idx : last_occurrences_of_conflict) {
595
627
  last_occurrences.Append(idx);
@@ -607,9 +639,9 @@ idx_t PhysicalInsert::OnConflictHandling(TableCatalogEntry &table, ExecutionCont
607
639
  // Check whether any conflicts arise, and if they all meet the conflict_target + condition
608
640
  // If that's not the case - We throw the first error
609
641
  idx_t updated_tuples = 0;
610
- updated_tuples += HandleInsertConflicts<true>(table, context, lstate, lstate.insert_chunk, *this);
642
+ updated_tuples += HandleInsertConflicts<true>(table, context, lstate, gstate, lstate.insert_chunk, *this);
611
643
  // Also check the transaction-local storage+ART so we can detect conflicts within this transaction
612
- updated_tuples += HandleInsertConflicts<false>(table, context, lstate, lstate.insert_chunk, *this);
644
+ updated_tuples += HandleInsertConflicts<false>(table, context, lstate, gstate, lstate.insert_chunk, *this);
613
645
 
614
646
  return updated_tuples;
615
647
  }
@@ -628,31 +660,22 @@ SinkResultType PhysicalInsert::Sink(ExecutionContext &context, DataChunk &chunk,
628
660
  gstate.initialized = true;
629
661
  }
630
662
 
631
- if (action_type != OnConflictAction::NOTHING && return_chunk) {
632
- // If the action is UPDATE or REPLACE, we will always create either an APPEND or an INSERT
633
- // for NOTHING we don't create either an APPEND or an INSERT for the tuple
634
- // so it should not be added to the RETURNING chunk
635
- gstate.return_collection.Append(lstate.insert_chunk);
636
- }
637
663
  idx_t updated_tuples = OnConflictHandling(table, context, gstate, lstate);
638
- if (action_type == OnConflictAction::NOTHING && return_chunk) {
639
- // Because we didn't add to the RETURNING chunk yet
640
- // we add the tuples that did not get filtered out now
641
- gstate.return_collection.Append(lstate.insert_chunk);
642
- }
664
+
643
665
  gstate.insert_count += lstate.insert_chunk.size();
644
666
  gstate.insert_count += updated_tuples;
667
+ if (!parallel && return_chunk) {
668
+ gstate.return_collection.Append(lstate.insert_chunk);
669
+ }
645
670
  storage.LocalAppend(gstate.append_state, context.client, lstate.insert_chunk, true);
646
671
  if (action_type == OnConflictAction::UPDATE && lstate.update_chunk.size() != 0) {
647
- // Flush the append so we can target the data we just appended with the update
648
- storage.FinalizeLocalAppend(gstate.append_state);
649
- gstate.initialized = false;
650
- (void)HandleInsertConflicts<true>(table, context, lstate, lstate.update_chunk, *this);
651
- (void)HandleInsertConflicts<false>(table, context, lstate, lstate.update_chunk, *this);
672
+ (void)HandleInsertConflicts<true>(table, context, lstate, gstate, lstate.update_chunk, *this);
673
+ (void)HandleInsertConflicts<false>(table, context, lstate, gstate, lstate.update_chunk, *this);
652
674
  // All of the tuples should have been turned into an update, leaving the chunk empty afterwards
653
675
  D_ASSERT(lstate.update_chunk.size() == 0);
654
676
  }
655
677
  } else {
678
+ //! FIXME: can't we enable this by using a BatchedDataCollection ?
656
679
  D_ASSERT(!return_chunk);
657
680
  // parallel append
658
681
  if (!lstate.local_collection) {
@@ -88,7 +88,8 @@ SinkResultType PhysicalCreateARTIndex::SinkUnsorted(OperatorSinkInput &input) co
88
88
  // Insert each key and its corresponding row ID.
89
89
  for (idx_t i = 0; i < row_count; i++) {
90
90
  auto status = art.tree.GetGateStatus();
91
- auto conflict_type = art.Insert(art.tree, l_state.keys[i], 0, l_state.row_ids[i], status, nullptr);
91
+ auto conflict_type =
92
+ art.Insert(art.tree, l_state.keys[i], 0, l_state.row_ids[i], status, nullptr, IndexAppendMode::DEFAULT);
92
93
  D_ASSERT(conflict_type != ARTConflictType::TRANSACTION);
93
94
  if (conflict_type == ARTConflictType::CONSTRAINT) {
94
95
  throw ConstraintException("Data contains duplicates on indexed column(s)");
@@ -97,6 +97,7 @@ public:
97
97
  void SetRadixBits(const idx_t &radix_bits_p);
98
98
  bool SetRadixBitsToExternal();
99
99
  idx_t GetRadixBits() const;
100
+ idx_t GetExternalRadixBits() const;
100
101
 
101
102
  private:
102
103
  void SetRadixBitsInternal(idx_t radix_bits_p, bool external);
@@ -210,13 +211,13 @@ RadixHTGlobalSinkState::RadixHTGlobalSinkState(ClientContext &context_p, const R
210
211
  auto tuples_per_block = block_alloc_size / radix_ht.GetLayout().GetRowWidth();
211
212
  idx_t ht_count =
212
213
  LossyNumericCast<idx_t>(static_cast<double>(config.sink_capacity) / GroupedAggregateHashTable::LOAD_FACTOR);
213
- auto num_partitions = RadixPartitioning::NumberOfPartitions(config.GetRadixBits());
214
+ auto num_partitions = RadixPartitioning::NumberOfPartitions(config.GetExternalRadixBits());
214
215
  auto count_per_partition = ht_count / num_partitions;
215
216
  auto blocks_per_partition = (count_per_partition + tuples_per_block) / tuples_per_block + 1;
216
217
  if (!radix_ht.GetLayout().AllConstant()) {
217
218
  blocks_per_partition += 2;
218
219
  }
219
- auto ht_size = blocks_per_partition * block_alloc_size + config.sink_capacity * sizeof(ht_entry_t);
220
+ auto ht_size = num_partitions * blocks_per_partition * block_alloc_size + config.sink_capacity * sizeof(ht_entry_t);
220
221
 
221
222
  // This really is the minimum reservation that we can do
222
223
  auto num_threads = NumericCast<idx_t>(TaskScheduler::GetScheduler(context).NumberOfThreads());
@@ -280,13 +281,17 @@ idx_t RadixHTConfig::GetRadixBits() const {
280
281
  return sink_radix_bits;
281
282
  }
282
283
 
284
+ idx_t RadixHTConfig::GetExternalRadixBits() const {
285
+ return MAXIMUM_FINAL_SINK_RADIX_BITS;
286
+ }
287
+
283
288
  void RadixHTConfig::SetRadixBitsInternal(const idx_t radix_bits_p, bool external) {
284
- if (sink_radix_bits >= radix_bits_p || sink.any_combined) {
289
+ if (sink_radix_bits > radix_bits_p || sink.any_combined) {
285
290
  return;
286
291
  }
287
292
 
288
293
  auto guard = sink.Lock();
289
- if (sink_radix_bits >= radix_bits_p || sink.any_combined) {
294
+ if (sink_radix_bits > radix_bits_p || sink.any_combined) {
290
295
  return;
291
296
  }
292
297
 
@@ -50,7 +50,10 @@ ReservoirSample::ReservoirSample(idx_t sample_count, unique_ptr<ReservoirChunk>
50
50
  if (reservoir_chunk) {
51
51
  this->reservoir_chunk = std::move(reservoir_chunk);
52
52
  sel_size = this->reservoir_chunk->chunk.size();
53
- sel = SelectionVector(0, sel_size);
53
+ sel = SelectionVector(FIXED_SAMPLE_SIZE);
54
+ for (idx_t i = 0; i < sel_size; i++) {
55
+ sel.set_index(i, i);
56
+ }
54
57
  ExpandSerializedSample();
55
58
  }
56
59
  stats_sample = true;
@@ -225,10 +228,6 @@ vector<uint32_t> ReservoirSample::GetRandomizedVector(uint32_t range, uint32_t s
225
228
  for (uint32_t i = 0; i < range; i++) {
226
229
  ret.push_back(i);
227
230
  }
228
- if (size == FIXED_SAMPLE_SIZE) {
229
- std::shuffle(ret.begin(), ret.end(), base_reservoir_sample->random);
230
- return ret;
231
- }
232
231
  for (uint32_t i = 0; i < size; i++) {
233
232
  uint32_t random_shuffle = base_reservoir_sample->random.NextRandomInteger32(i, range);
234
233
  if (random_shuffle == i) {
@@ -305,6 +304,7 @@ void ReservoirSample::SimpleMerge(ReservoirSample &other) {
305
304
  auto offset = reservoir_chunk->chunk.size();
306
305
  for (idx_t i = keep_from_this; i < size_after_merge; i++) {
307
306
  if (i >= GetActiveSampleCount()) {
307
+ D_ASSERT(sel_size >= GetActiveSampleCount());
308
308
  sel.set_index(GetActiveSampleCount(), offset);
309
309
  sel_size += 1;
310
310
  } else {
@@ -551,7 +551,7 @@ void ReservoirSample::ExpandSerializedSample() {
551
551
  }
552
552
 
553
553
  idx_t ReservoirSample::GetReservoirChunkCapacity() const {
554
- return sample_count + (FIXED_SAMPLE_SIZE_MULTIPLIER * FIXED_SAMPLE_SIZE);
554
+ return sample_count + (FIXED_SAMPLE_SIZE_MULTIPLIER * MinValue<idx_t>(sample_count, FIXED_SAMPLE_SIZE));
555
555
  }
556
556
 
557
557
  idx_t ReservoirSample::FillReservoir(DataChunk &chunk) {
@@ -749,6 +749,7 @@ void ReservoirSample::AddToReservoir(DataChunk &chunk) {
749
749
 
750
750
  if (chunk_sel.size == 0) {
751
751
  // not adding any samples
752
+ base_reservoir_sample->num_entries_seen_total += chunk.size();
752
753
  return;
753
754
  }
754
755
  idx_t size = chunk_sel.size;
@@ -65,6 +65,10 @@ static optional_ptr<CompressionFunction> LoadCompressionFunction(CompressionFunc
65
65
 
66
66
  static void TryLoadCompression(DBConfig &config, vector<reference<CompressionFunction>> &result, CompressionType type,
67
67
  const PhysicalType physical_type) {
68
+ if (config.options.disabled_compression_methods.find(type) != config.options.disabled_compression_methods.end()) {
69
+ // explicitly disabled
70
+ return;
71
+ }
68
72
  auto function = config.GetCompressionFunction(type, physical_type);
69
73
  if (!function) {
70
74
  return;
@@ -457,7 +457,7 @@ unique_ptr<Expression> FunctionBinder::BindScalarFunction(ScalarFunction bound_f
457
457
  std::move(children), std::move(bind_info), is_operator);
458
458
  if (result_func->function.bind_expression) {
459
459
  // if a bind_expression callback is registered - call it and emit the resulting expression
460
- FunctionBindExpressionInput input(context, result_func->bind_info.get(), *result_func);
460
+ FunctionBindExpressionInput input(context, result_func->bind_info.get(), result_func->children);
461
461
  result = result_func->function.bind_expression(input);
462
462
  }
463
463
  if (!result) {
@@ -114,11 +114,11 @@ static void WriteLogValues(T &LogSource, LogLevel level, const string_t *data, c
114
114
  const string &type) {
115
115
  if (!type.empty()) {
116
116
  for (idx_t i = 0; i < size; i++) {
117
- Logger::Log(type.c_str(), LogSource, level, data[sel->get_index(i)]);
117
+ DUCKDB_LOG(LogSource, type.c_str(), level, data[sel->get_index(i)]);
118
118
  }
119
119
  } else {
120
120
  for (idx_t i = 0; i < size; i++) {
121
- Logger::Log(LogSource, level, data[sel->get_index(i)]);
121
+ DUCKDB_LOG(LogSource, type.c_str(), level, data[sel->get_index(i)]);
122
122
  }
123
123
  }
124
124
  }
@@ -56,7 +56,7 @@ void ArrowType::ThrowIfInvalid() const {
56
56
  }
57
57
  }
58
58
 
59
- unique_ptr<ArrowType> ArrowType::GetTypeFromFormat(DBConfig &config, ArrowSchema &schema, string &format) {
59
+ unique_ptr<ArrowType> ArrowType::GetTypeFromFormat(string &format) {
60
60
  if (format == "n") {
61
61
  return make_uniq<ArrowType>(LogicalType::SQLNULL);
62
62
  } else if (format == "b") {
@@ -179,6 +179,14 @@ unique_ptr<ArrowType> ArrowType::GetTypeFromFormat(DBConfig &config, ArrowSchema
179
179
  }
180
180
  return make_uniq<ArrowType>(LogicalType::TIMESTAMP_TZ, std::move(type_info));
181
181
  }
182
+ return nullptr;
183
+ }
184
+
185
+ unique_ptr<ArrowType> ArrowType::GetTypeFromFormat(DBConfig &config, ArrowSchema &schema, string &format) {
186
+ auto type = GetTypeFromFormat(format);
187
+ if (type) {
188
+ return type;
189
+ }
182
190
  if (format == "+l") {
183
191
  return CreateListType(config, *schema.children[0], ArrowVariableSizeType::NORMAL, false);
184
192
  } else if (format == "+L") {
@@ -361,8 +369,13 @@ unique_ptr<ArrowType> ArrowType::GetTypeFromSchema(DBConfig &config, ArrowSchema
361
369
  auto arrow_type = GetTypeFromFormat(config, schema, format);
362
370
  if (schema_metadata.HasExtension()) {
363
371
  auto extension_info = schema_metadata.GetExtensionInfo(string(format));
364
- arrow_type->extension_data = config.GetArrowExtension(extension_info).GetTypeExtension();
372
+ if (config.HasArrowExtension(extension_info)) {
373
+ auto extension = config.GetArrowExtension(extension_info);
374
+ arrow_type = extension.GetType(schema, schema_metadata);
375
+ arrow_type->extension_data = extension.GetTypeExtension();
376
+ }
365
377
  }
378
+
366
379
  return arrow_type;
367
380
  }
368
381
 
@@ -118,7 +118,8 @@ static void ColumnArrowToDuckDBRunEndEncoded(Vector &vector, const ArrowArray &a
118
118
 
119
119
  static void ColumnArrowToDuckDB(Vector &vector, ArrowArray &array, ArrowArrayScanState &array_state, idx_t size,
120
120
  const ArrowType &arrow_type, int64_t nested_offset = -1,
121
- ValidityMask *parent_mask = nullptr, uint64_t parent_offset = 0);
121
+ ValidityMask *parent_mask = nullptr, uint64_t parent_offset = 0,
122
+ bool ignore_extensions = false);
122
123
 
123
124
  static void ColumnArrowToDuckDBDictionary(Vector &vector, ArrowArray &array, ArrowArrayScanState &array_state,
124
125
  idx_t size, const ArrowType &arrow_type, int64_t nested_offset = -1,
@@ -765,17 +766,15 @@ static void ColumnArrowToDuckDBRunEndEncoded(Vector &vector, const ArrowArray &a
765
766
 
766
767
  static void ColumnArrowToDuckDB(Vector &vector, ArrowArray &array, ArrowArrayScanState &array_state, idx_t size,
767
768
  const ArrowType &arrow_type, int64_t nested_offset, ValidityMask *parent_mask,
768
- uint64_t parent_offset) {
769
+ uint64_t parent_offset, bool ignore_extensions) {
769
770
  auto &scan_state = array_state.state;
770
771
  D_ASSERT(!array.dictionary);
771
- if (arrow_type.HasExtension()) {
772
+ if (!ignore_extensions && arrow_type.HasExtension()) {
772
773
  if (arrow_type.extension_data->arrow_to_duckdb) {
773
- // We allocate with the internal type, and cast to the end result
774
+ // Convert the storage and then call the cast function
774
775
  Vector input_data(arrow_type.extension_data->GetInternalType());
775
- // FIXME do we need this?
776
- auto input_arrow_type = ArrowType(arrow_type.extension_data->GetInternalType());
777
- ColumnArrowToDuckDB(input_data, array, array_state, size, input_arrow_type, nested_offset, parent_mask,
778
- parent_offset);
776
+ ColumnArrowToDuckDB(input_data, array, array_state, size, arrow_type, nested_offset, parent_mask,
777
+ parent_offset, /*ignore_extensions*/ true);
779
778
  arrow_type.extension_data->arrow_to_duckdb(array_state.context, input_data, vector, size);
780
779
  return;
781
780
  }
@@ -1105,7 +1104,7 @@ static void ColumnArrowToDuckDB(Vector &vector, ArrowArray &array, ArrowArraySca
1105
1104
  break;
1106
1105
  case ArrowArrayPhysicalType::DEFAULT:
1107
1106
  ColumnArrowToDuckDB(child_entry, child_array, child_state, size, child_type, nested_offset,
1108
- &struct_validity_mask, NumericCast<uint64_t>(array.offset));
1107
+ &struct_validity_mask, NumericCast<uint64_t>(array.offset), false);
1109
1108
  break;
1110
1109
  default:
1111
1110
  throw NotImplementedException("ArrowArrayPhysicalType not recognized");
@@ -1138,7 +1137,8 @@ static void ColumnArrowToDuckDB(Vector &vector, ArrowArray &array, ArrowArraySca
1138
1137
  ColumnArrowToDuckDBRunEndEncoded(child, child_array, child_state, size, child_type);
1139
1138
  break;
1140
1139
  case ArrowArrayPhysicalType::DEFAULT:
1141
- ColumnArrowToDuckDB(child, child_array, child_state, size, child_type, nested_offset, &validity_mask);
1140
+ ColumnArrowToDuckDB(child, child_array, child_state, size, child_type, nested_offset, &validity_mask,
1141
+ false);
1142
1142
  break;
1143
1143
  default:
1144
1144
  throw NotImplementedException("ArrowArrayPhysicalType not recognized");
@@ -97,7 +97,7 @@ void BaseCSVData::Finalize() {
97
97
  const char escape = options.dialect_options.state_machine_options.escape.GetValue();
98
98
  // Allow nullstr to be escape character + some non-special character, e.g., "\N" (MySQL default).
99
99
  // In this case, only unquoted occurrences of the nullstr will be recognized as null values.
100
- if (options.dialect_options.state_machine_options.rfc_4180 == false && null_str.size() == 2 &&
100
+ if (options.dialect_options.state_machine_options.strict_mode == false && null_str.size() == 2 &&
101
101
  null_str[0] == escape && null_str[1] != '\0') {
102
102
  continue;
103
103
  }
@@ -371,7 +371,7 @@ static void WriteQuotedString(WriteStream &writer, WriteCSVData &csv_data, const
371
371
  struct LocalWriteCSVData : public LocalFunctionData {
372
372
  public:
373
373
  LocalWriteCSVData(ClientContext &context, vector<unique_ptr<Expression>> &expressions)
374
- : executor(context, expressions) {
374
+ : executor(context, expressions), stream(Allocator::Get(context)) {
375
375
  }
376
376
 
377
377
  public:
@@ -451,7 +451,7 @@ static unique_ptr<GlobalFunctionData> WriteCSVInitializeGlobal(ClientContext &co
451
451
  }
452
452
 
453
453
  if (!(options.dialect_options.header.IsSetByUser() && !options.dialect_options.header.GetValue())) {
454
- MemoryStream stream;
454
+ MemoryStream stream(Allocator::Get(context));
455
455
  // write the header line to the file
456
456
  for (idx_t i = 0; i < csv_data.options.name_list.size(); i++) {
457
457
  if (i != 0) {
@@ -554,7 +554,7 @@ void WriteCSVFinalize(ClientContext &context, FunctionData &bind_data, GlobalFun
554
554
  auto &csv_data = bind_data.Cast<WriteCSVData>();
555
555
  auto &options = csv_data.options;
556
556
 
557
- MemoryStream stream;
557
+ MemoryStream stream(Allocator::Get(context));
558
558
  if (!options.suffix.empty()) {
559
559
  stream.WriteData(const_data_ptr_cast(options.suffix.c_str()), options.suffix.size());
560
560
  } else if (global_state.written_anything) {
@@ -582,6 +582,9 @@ CopyFunctionExecutionMode WriteCSVExecutionMode(bool preserve_insertion_order, b
582
582
  // Prepare Batch
583
583
  //===--------------------------------------------------------------------===//
584
584
  struct WriteCSVBatchData : public PreparedBatchData {
585
+ explicit WriteCSVBatchData(Allocator &allocator) : stream(allocator) {
586
+ }
587
+
585
588
  //! The thread-local buffer to write data into
586
589
  MemoryStream stream;
587
590
  };
@@ -603,7 +606,7 @@ unique_ptr<PreparedBatchData> WriteCSVPrepareBatch(ClientContext &context, Funct
603
606
 
604
607
  // write CSV chunks to the batch data
605
608
  bool written_anything = false;
606
- auto batch = make_uniq<WriteCSVBatchData>();
609
+ auto batch = make_uniq<WriteCSVBatchData>(Allocator::Get(context));
607
610
  for (auto &chunk : collection->Chunks()) {
608
611
  WriteCSVChunkInternal(context, bind_data, cast_chunk, batch->stream, chunk, written_anything, executor);
609
612
  }
@@ -62,6 +62,8 @@ void SchemaDiscovery(ClientContext &context, ReadCSVData &result, CSVReaderOptio
62
62
  options.file_path = file_paths[current_file];
63
63
 
64
64
  result.buffer_manager = make_shared_ptr<CSVBufferManager>(context, options, options.file_path, 0, false);
65
+ idx_t only_header_or_empty_files = 0;
66
+
65
67
  {
66
68
  CSVSniffer sniffer(options, result.buffer_manager, CSVStateMachineCache::Get(context));
67
69
  auto sniffer_result = sniffer.SniffCSV();
@@ -71,14 +73,17 @@ void SchemaDiscovery(ClientContext &context, ReadCSVData &result, CSVReaderOptio
71
73
  schemas.emplace_back(sniffer_result.names, sniffer_result.return_types, file_paths[0], rows_read,
72
74
  result.buffer_manager->GetBuffer(0)->actual_size == 0);
73
75
  total_number_of_rows += sniffer.LinesSniffed();
76
+ current_file++;
77
+ if (sniffer.EmptyOrOnlyHeader()) {
78
+ only_header_or_empty_files++;
79
+ }
74
80
  }
75
81
 
76
82
  // We do a copy of the options to not pollute the options of the first file.
77
83
  constexpr idx_t max_files_to_sniff = 10;
78
84
  idx_t files_to_sniff = file_paths.size() > max_files_to_sniff ? max_files_to_sniff : file_paths.size();
79
- while (total_number_of_rows < required_number_of_lines && current_file + 1 < files_to_sniff) {
85
+ while (total_number_of_rows < required_number_of_lines && current_file < files_to_sniff) {
80
86
  auto option_copy = option_og;
81
- current_file++;
82
87
  option_copy.file_path = file_paths[current_file];
83
88
  auto buffer_manager =
84
89
  make_shared_ptr<CSVBufferManager>(context, option_copy, option_copy.file_path, current_file, false);
@@ -94,6 +99,10 @@ void SchemaDiscovery(ClientContext &context, ReadCSVData &result, CSVReaderOptio
94
99
  schemas.emplace_back(sniffer_result.names, sniffer_result.return_types, option_copy.file_path, rows_read);
95
100
  }
96
101
  total_number_of_rows += sniffer.LinesSniffed();
102
+ if (sniffer.EmptyOrOnlyHeader()) {
103
+ only_header_or_empty_files++;
104
+ }
105
+ current_file++;
97
106
  }
98
107
 
99
108
  // We might now have multiple schemas, we need to go through them to define the one true schema
@@ -115,6 +124,13 @@ void SchemaDiscovery(ClientContext &context, ReadCSVData &result, CSVReaderOptio
115
124
  names = best_schema.GetNames();
116
125
  return_types = best_schema.GetTypes();
117
126
  }
127
+ if (only_header_or_empty_files == current_file && !options.columns_set) {
128
+ for (auto &type : return_types) {
129
+ D_ASSERT(type.id() == LogicalTypeId::BOOLEAN);
130
+ // we default to varchar if all files are empty or only have a header after all the sniffing
131
+ type = LogicalType::VARCHAR;
132
+ }
133
+ }
118
134
  result.csv_types = return_types;
119
135
  result.csv_names = names;
120
136
  }
@@ -334,7 +350,7 @@ void ReadCSVTableFunction::ReadCSVAddNamedParameters(TableFunction &table_functi
334
350
  table_function.named_parameters["column_names"] = LogicalType::LIST(LogicalType::VARCHAR);
335
351
  table_function.named_parameters["comment"] = LogicalType::VARCHAR;
336
352
  table_function.named_parameters["encoding"] = LogicalType::VARCHAR;
337
- table_function.named_parameters["rfc_4180"] = LogicalType::BOOLEAN;
353
+ table_function.named_parameters["strict_mode"] = LogicalType::BOOLEAN;
338
354
 
339
355
  MultiFileReader::AddParameters(table_function);
340
356
  }
@@ -358,7 +374,8 @@ void CSVComplexFilterPushdown(ClientContext &context, LogicalGet &get, FunctionD
358
374
  MultiFileReader().ComplexFilterPushdown(context, file_list, data.options.file_options, info, filters);
359
375
  if (filtered_list) {
360
376
  data.files = filtered_list->GetAllFiles();
361
- MultiFileReader::PruneReaders(data, file_list);
377
+ SimpleMultiFileList simple_filtered_list(data.files);
378
+ MultiFileReader::PruneReaders(data, simple_filtered_list);
362
379
  } else {
363
380
  data.files = file_list.GetAllFiles();
364
381
  }
@@ -152,6 +152,13 @@ static void CSVSniffFunction(ClientContext &context, TableFunctionInput &data_p,
152
152
  }
153
153
  CSVSniffer sniffer(sniffer_options, buffer_manager, CSVStateMachineCache::Get(context));
154
154
  auto sniffer_result = sniffer.SniffCSV(data.force_match);
155
+ if (sniffer.EmptyOrOnlyHeader()) {
156
+ for (auto &type : sniffer_result.return_types) {
157
+ D_ASSERT(type.id() == LogicalTypeId::BOOLEAN);
158
+ // we default to varchar if all files are empty or only have a header after all the sniffing
159
+ type = LogicalType::VARCHAR;
160
+ }
161
+ }
155
162
  string str_opt;
156
163
  string separator = ", ";
157
164
  // Set output