duckdb 1.2.1-dev6.0 → 1.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (117)
  1. package/package.json +1 -1
  2. package/src/duckdb/extension/core_functions/aggregate/distributive/string_agg.cpp +14 -22
  3. package/src/duckdb/extension/core_functions/aggregate/nested/list.cpp +0 -1
  4. package/src/duckdb/extension/core_functions/lambda_functions.cpp +0 -11
  5. package/src/duckdb/extension/core_functions/scalar/list/list_aggregates.cpp +18 -6
  6. package/src/duckdb/extension/icu/icu-datefunc.cpp +9 -2
  7. package/src/duckdb/extension/icu/icu-strptime.cpp +7 -11
  8. package/src/duckdb/extension/icu/include/icu-datefunc.hpp +3 -1
  9. package/src/duckdb/extension/json/buffered_json_reader.cpp +18 -31
  10. package/src/duckdb/extension/json/json_extension.cpp +8 -3
  11. package/src/duckdb/extension/parquet/column_reader.cpp +4 -6
  12. package/src/duckdb/extension/parquet/column_writer.cpp +33 -12
  13. package/src/duckdb/extension/parquet/include/column_reader.hpp +0 -2
  14. package/src/duckdb/extension/parquet/include/parquet_bss_encoder.hpp +0 -1
  15. package/src/duckdb/extension/parquet/include/parquet_dlba_encoder.hpp +1 -2
  16. package/src/duckdb/src/catalog/catalog.cpp +12 -0
  17. package/src/duckdb/src/catalog/catalog_entry/duck_table_entry.cpp +1 -1
  18. package/src/duckdb/src/catalog/catalog_entry_retriever.cpp +1 -1
  19. package/src/duckdb/src/catalog/catalog_search_path.cpp +8 -8
  20. package/src/duckdb/src/common/bind_helpers.cpp +3 -0
  21. package/src/duckdb/src/common/compressed_file_system.cpp +2 -0
  22. package/src/duckdb/src/common/hive_partitioning.cpp +1 -1
  23. package/src/duckdb/src/common/multi_file_reader.cpp +3 -3
  24. package/src/duckdb/src/execution/aggregate_hashtable.cpp +1 -1
  25. package/src/duckdb/src/execution/index/art/art.cpp +19 -6
  26. package/src/duckdb/src/execution/index/art/iterator.cpp +7 -3
  27. package/src/duckdb/src/execution/operator/aggregate/physical_window.cpp +11 -4
  28. package/src/duckdb/src/execution/operator/csv_scanner/buffer_manager/csv_buffer.cpp +2 -2
  29. package/src/duckdb/src/execution/operator/csv_scanner/encode/csv_encoder.cpp +5 -1
  30. package/src/duckdb/src/execution/operator/csv_scanner/scanner/base_scanner.cpp +3 -2
  31. package/src/duckdb/src/execution/operator/csv_scanner/scanner/csv_schema.cpp +2 -2
  32. package/src/duckdb/src/execution/operator/csv_scanner/scanner/scanner_boundary.cpp +1 -1
  33. package/src/duckdb/src/execution/operator/csv_scanner/scanner/string_value_scanner.cpp +20 -12
  34. package/src/duckdb/src/execution/operator/csv_scanner/sniffer/dialect_detection.cpp +19 -22
  35. package/src/duckdb/src/execution/operator/csv_scanner/sniffer/type_refinement.cpp +1 -1
  36. package/src/duckdb/src/execution/operator/csv_scanner/util/csv_error.cpp +1 -0
  37. package/src/duckdb/src/execution/operator/csv_scanner/util/csv_reader_options.cpp +16 -0
  38. package/src/duckdb/src/execution/operator/helper/physical_reservoir_sample.cpp +1 -0
  39. package/src/duckdb/src/execution/operator/helper/physical_streaming_sample.cpp +16 -7
  40. package/src/duckdb/src/execution/operator/persistent/physical_batch_insert.cpp +3 -1
  41. package/src/duckdb/src/execution/operator/scan/physical_table_scan.cpp +11 -1
  42. package/src/duckdb/src/execution/operator/schema/physical_create_art_index.cpp +5 -7
  43. package/src/duckdb/src/execution/physical_plan/plan_create_index.cpp +11 -0
  44. package/src/duckdb/src/execution/physical_plan/plan_sample.cpp +1 -3
  45. package/src/duckdb/src/execution/radix_partitioned_hashtable.cpp +14 -5
  46. package/src/duckdb/src/execution/sample/reservoir_sample.cpp +24 -12
  47. package/src/duckdb/src/function/scalar/generic/getvariable.cpp +3 -3
  48. package/src/duckdb/src/function/table/version/pragma_version.cpp +3 -3
  49. package/src/duckdb/src/function/window/window_aggregate_states.cpp +3 -0
  50. package/src/duckdb/src/function/window/window_boundaries_state.cpp +108 -48
  51. package/src/duckdb/src/function/window/window_constant_aggregator.cpp +5 -5
  52. package/src/duckdb/src/function/window/window_distinct_aggregator.cpp +6 -0
  53. package/src/duckdb/src/include/duckdb/catalog/catalog_entry_retriever.hpp +1 -1
  54. package/src/duckdb/src/include/duckdb/catalog/catalog_search_path.hpp +10 -9
  55. package/src/duckdb/src/include/duckdb/common/adbc/adbc-init.hpp +1 -1
  56. package/src/duckdb/src/include/duckdb/common/multi_file_reader.hpp +2 -2
  57. package/src/duckdb/src/include/duckdb/execution/index/art/iterator.hpp +2 -0
  58. package/src/duckdb/src/include/duckdb/execution/operator/csv_scanner/base_scanner.hpp +1 -1
  59. package/src/duckdb/src/include/duckdb/execution/operator/csv_scanner/csv_buffer.hpp +5 -4
  60. package/src/duckdb/src/include/duckdb/execution/operator/csv_scanner/csv_option.hpp +1 -1
  61. package/src/duckdb/src/include/duckdb/execution/operator/csv_scanner/csv_schema.hpp +2 -2
  62. package/src/duckdb/src/include/duckdb/execution/operator/csv_scanner/encode/csv_encoder.hpp +1 -1
  63. package/src/duckdb/src/include/duckdb/execution/operator/csv_scanner/sniffer/csv_sniffer.hpp +1 -1
  64. package/src/duckdb/src/include/duckdb/execution/operator/csv_scanner/string_value_scanner.hpp +2 -2
  65. package/src/duckdb/src/include/duckdb/execution/operator/helper/physical_streaming_sample.hpp +3 -7
  66. package/src/duckdb/src/include/duckdb/execution/reservoir_sample.hpp +2 -1
  67. package/src/duckdb/src/include/duckdb/function/lambda_functions.hpp +11 -3
  68. package/src/duckdb/src/include/duckdb/function/window/window_boundaries_state.hpp +4 -0
  69. package/src/duckdb/src/include/duckdb/main/client_context_state.hpp +4 -0
  70. package/src/duckdb/src/include/duckdb/main/extension_entries.hpp +25 -7
  71. package/src/duckdb/src/include/duckdb/main/pending_query_result.hpp +2 -0
  72. package/src/duckdb/src/include/duckdb/main/query_profiler.hpp +7 -0
  73. package/src/duckdb/src/include/duckdb/optimizer/filter_combiner.hpp +2 -2
  74. package/src/duckdb/src/include/duckdb/optimizer/late_materialization.hpp +2 -1
  75. package/src/duckdb/src/include/duckdb/optimizer/optimizer_extension.hpp +11 -5
  76. package/src/duckdb/src/include/duckdb/parallel/executor_task.hpp +4 -1
  77. package/src/duckdb/src/include/duckdb/parallel/pipeline.hpp +0 -1
  78. package/src/duckdb/src/include/duckdb/parallel/task_executor.hpp +3 -0
  79. package/src/duckdb/src/include/duckdb/parallel/task_notifier.hpp +27 -0
  80. package/src/duckdb/src/include/duckdb/parallel/task_scheduler.hpp +4 -0
  81. package/src/duckdb/src/include/duckdb/planner/expression/bound_subquery_expression.hpp +1 -1
  82. package/src/duckdb/src/include/duckdb/planner/tableref/bound_cteref.hpp +1 -0
  83. package/src/duckdb/src/include/duckdb/storage/checkpoint/table_data_writer.hpp +3 -1
  84. package/src/duckdb/src/include/duckdb/storage/checkpoint_manager.hpp +7 -1
  85. package/src/duckdb/src/include/duckdb/storage/storage_manager.hpp +3 -2
  86. package/src/duckdb/src/include/duckdb.h +495 -480
  87. package/src/duckdb/src/main/attached_database.cpp +1 -1
  88. package/src/duckdb/src/main/capi/duckdb-c.cpp +5 -1
  89. package/src/duckdb/src/main/capi/helper-c.cpp +8 -0
  90. package/src/duckdb/src/main/config.cpp +7 -1
  91. package/src/duckdb/src/main/database.cpp +8 -8
  92. package/src/duckdb/src/main/extension/extension_helper.cpp +3 -1
  93. package/src/duckdb/src/main/extension/extension_load.cpp +12 -12
  94. package/src/duckdb/src/optimizer/column_lifetime_analyzer.cpp +1 -0
  95. package/src/duckdb/src/optimizer/join_order/query_graph_manager.cpp +2 -2
  96. package/src/duckdb/src/optimizer/late_materialization.cpp +26 -5
  97. package/src/duckdb/src/optimizer/optimizer.cpp +12 -1
  98. package/src/duckdb/src/parallel/executor_task.cpp +10 -6
  99. package/src/duckdb/src/parallel/task_executor.cpp +4 -1
  100. package/src/duckdb/src/parallel/task_notifier.cpp +23 -0
  101. package/src/duckdb/src/parallel/task_scheduler.cpp +33 -0
  102. package/src/duckdb/src/parser/transform/expression/transform_subquery.cpp +4 -1
  103. package/src/duckdb/src/planner/binder/expression/bind_subquery_expression.cpp +1 -1
  104. package/src/duckdb/src/planner/binder/query_node/plan_subquery.cpp +4 -2
  105. package/src/duckdb/src/planner/binder/statement/bind_create.cpp +7 -2
  106. package/src/duckdb/src/planner/binder/statement/bind_create_table.cpp +6 -5
  107. package/src/duckdb/src/storage/checkpoint/table_data_writer.cpp +4 -2
  108. package/src/duckdb/src/storage/checkpoint_manager.cpp +4 -3
  109. package/src/duckdb/src/storage/compression/string_uncompressed.cpp +21 -10
  110. package/src/duckdb/src/storage/storage_info.cpp +2 -0
  111. package/src/duckdb/src/storage/storage_manager.cpp +2 -2
  112. package/src/duckdb/src/storage/table/row_group.cpp +5 -6
  113. package/src/duckdb/src/storage/table/scan_state.cpp +6 -0
  114. package/src/duckdb/src/transaction/duck_transaction.cpp +11 -3
  115. package/src/duckdb/src/transaction/duck_transaction_manager.cpp +2 -2
  116. package/src/duckdb/third_party/concurrentqueue/concurrentqueue.h +17 -0
  117. package/src/duckdb/ub_src_parallel.cpp +2 -0

package/src/duckdb/src/catalog/catalog_search_path.cpp
@@ -189,11 +189,11 @@ void CatalogSearchPath::Set(CatalogSearchEntry new_value, CatalogSetPathType set
 	Set(std::move(new_paths), set_type);
 }
 
-const vector<CatalogSearchEntry> &CatalogSearchPath::Get() {
+const vector<CatalogSearchEntry> &CatalogSearchPath::Get() const {
 	return paths;
 }
 
-string CatalogSearchPath::GetDefaultSchema(const string &catalog) {
+string CatalogSearchPath::GetDefaultSchema(const string &catalog) const {
 	for (auto &path : paths) {
 		if (path.catalog == TEMP_CATALOG) {
 			continue;
@@ -205,7 +205,7 @@ string CatalogSearchPath::GetDefaultSchema(const string &catalog) {
 	return DEFAULT_SCHEMA;
 }
 
-string CatalogSearchPath::GetDefaultSchema(ClientContext &context, const string &catalog) {
+string CatalogSearchPath::GetDefaultSchema(ClientContext &context, const string &catalog) const {
 	for (auto &path : paths) {
 		if (path.catalog == TEMP_CATALOG) {
 			continue;
@@ -221,7 +221,7 @@ string CatalogSearchPath::GetDefaultSchema(ClientContext &context, const string
 	return DEFAULT_SCHEMA;
 }
 
-string CatalogSearchPath::GetDefaultCatalog(const string &schema) {
+string CatalogSearchPath::GetDefaultCatalog(const string &schema) const {
 	if (DefaultSchemaGenerator::IsDefaultSchema(schema)) {
 		return SYSTEM_CATALOG;
 	}
@@ -236,7 +236,7 @@ string CatalogSearchPath::GetDefaultCatalog(const string &schema) {
 	return INVALID_CATALOG;
 }
 
-vector<string> CatalogSearchPath::GetCatalogsForSchema(const string &schema) {
+vector<string> CatalogSearchPath::GetCatalogsForSchema(const string &schema) const {
 	vector<string> catalogs;
 	if (DefaultSchemaGenerator::IsDefaultSchema(schema)) {
 		catalogs.push_back(SYSTEM_CATALOG);
@@ -250,7 +250,7 @@ vector<string> CatalogSearchPath::GetCatalogsForSchema(const string &schema) {
 	return catalogs;
 }
 
-vector<string> CatalogSearchPath::GetSchemasForCatalog(const string &catalog) {
+vector<string> CatalogSearchPath::GetSchemasForCatalog(const string &catalog) const {
 	vector<string> schemas;
 	for (auto &path : paths) {
 		if (StringUtil::CIEquals(path.catalog, catalog)) {
@@ -260,7 +260,7 @@ vector<string> CatalogSearchPath::GetSchemasForCatalog(const string &catalog) {
 	return schemas;
 }
 
-const CatalogSearchEntry &CatalogSearchPath::GetDefault() {
+const CatalogSearchEntry &CatalogSearchPath::GetDefault() const {
 	const auto &paths = Get();
 	D_ASSERT(paths.size() >= 2);
 	return paths[1];
@@ -281,7 +281,7 @@ void CatalogSearchPath::SetPathsInternal(vector<CatalogSearchEntry> new_paths) {
 }
 
 bool CatalogSearchPath::SchemaInSearchPath(ClientContext &context, const string &catalog_name,
-                                           const string &schema_name) {
+                                           const string &schema_name) const {
 	for (auto &path : paths) {
 		if (!StringUtil::CIEquals(path.schema, schema_name)) {
			continue;
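
The const qualifiers above make the read-only accessors callable through a const CatalogSearchPath. A minimal standalone sketch of what the qualifier buys (not DuckDB code, names hypothetical):

    #include <string>
    #include <vector>

    struct SearchPath {
        std::vector<std::string> paths;
        // Without the trailing const, this accessor could not be called on a
        // const SearchPath, even though it never mutates state.
        const std::vector<std::string> &Get() const {
            return paths;
        }
    };

    // Compiles only because Get() is const-qualified.
    void Inspect(const SearchPath &search_path) {
        for (const auto &entry : search_path.Get()) {
            (void)entry;
        }
    }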

package/src/duckdb/src/common/bind_helpers.cpp
@@ -56,6 +56,9 @@ vector<bool> ParseColumnList(const Value &value, vector<string> &names, const st
 		}
 		throw BinderException("\"%s\" expects a column list or * as parameter", loption);
 	}
+	if (value.IsNull()) {
+		throw BinderException("\"%s\" expects a column list or * as parameter, it can't be a NULL value", loption);
+	}
 	auto &children = ListValue::GetChildren(value);
 	// accept '*' as single argument
 	if (children.size() == 1 && children[0].type().id() == LogicalTypeId::VARCHAR &&
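
The added guard rejects NULL before ListValue::GetChildren touches the value. A hedged sketch of the same validate-before-use pattern with hypothetical types (DuckDB's real Value API is richer than this):

    #include <stdexcept>
    #include <string>
    #include <vector>

    struct OptionValue {
        bool is_null = false;
        std::vector<std::string> children;
    };

    // Reject NULL up front so the child list is never read from a NULL value.
    std::vector<std::string> ParseColumns(const OptionValue &value, const std::string &option) {
        if (value.is_null) {
            throw std::invalid_argument("\"" + option + "\" expects a column list or *, not NULL");
        }
        return value.children;
    }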

package/src/duckdb/src/common/compressed_file_system.cpp
@@ -31,6 +31,8 @@ void CompressedFile::Initialize(bool write) {
 	stream_data.out_buff_start = stream_data.out_buff.get();
 	stream_data.out_buff_end = stream_data.out_buff.get();
 
+	current_position = 0;
+
 	stream_wrapper = compressed_fs.CreateStream();
 	stream_wrapper->Initialize(*this, write);
 }

package/src/duckdb/src/common/hive_partitioning.cpp
@@ -245,7 +245,7 @@ static void TemplatedGetHivePartitionValues(Vector &input, vector<HivePartitionK
 
 	const auto &type = input.GetType();
 
-	const auto reinterpret = Value::CreateValue<T>(data[0]).GetTypeMutable() != type;
+	const auto reinterpret = Value::CreateValue<T>(data[sel.get_index(0)]).GetTypeMutable() != type;
 	if (reinterpret) {
 		for (idx_t i = 0; i < count; i++) {
 			auto &key = keys[i];
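
The fix reads the first value through the selection vector instead of assuming it sits in physical slot 0: for dictionary or filtered vectors, logical row i lives at physical position sel.get_index(i). A minimal sketch of that indirection (hypothetical types, not DuckDB's Vector):

    #include <cstdint>
    #include <vector>

    // A selection vector maps logical row positions to physical slots.
    struct Selection {
        std::vector<uint32_t> sel;
        uint32_t get_index(uint32_t logical) const {
            return sel[logical];
        }
    };

    int FirstLogicalValue(const std::vector<int> &data, const Selection &sel) {
        // data[0] may belong to a filtered-out row; the first row the operator
        // actually sees is data[sel.get_index(0)].
        return data[sel.get_index(0)];
    }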

package/src/duckdb/src/common/multi_file_reader.cpp
@@ -508,14 +508,14 @@ void MultiFileReader::CreateMapping(const string &file_name,
 	// copy global columns and inject any different defaults
 	CreateColumnMapping(file_name, local_columns, global_columns, global_column_ids, reader_data, bind_data,
 	                    initial_file, global_state);
-	CreateFilterMap(global_columns, filters, reader_data, global_state);
+	CreateFilterMap(global_column_ids, filters, reader_data, global_state);
 }
 
-void MultiFileReader::CreateFilterMap(const vector<MultiFileReaderColumnDefinition> &global_columns,
+void MultiFileReader::CreateFilterMap(const vector<ColumnIndex> &global_column_ids,
                                       optional_ptr<TableFilterSet> filters, MultiFileReaderData &reader_data,
                                       optional_ptr<MultiFileReaderGlobalState> global_state) {
 	if (filters) {
-		auto filter_map_size = global_columns.size();
+		auto filter_map_size = global_column_ids.size();
 		if (global_state) {
 			filter_map_size += global_state->extra_columns.size();
 		}

package/src/duckdb/src/execution/aggregate_hashtable.cpp
@@ -329,7 +329,7 @@ optional_idx GroupedAggregateHashTable::TryAddDictionaryGroups(DataChunk &groups
 	if (dictionary_id.empty()) {
 		// dictionary has no id, we can't cache across vectors
 		// only use dictionary compression if there are fewer entries than groups
-		if (dict_size >= groups.size() * DICTIONARY_THRESHOLD) {
+		if (dict_size * DICTIONARY_THRESHOLD >= groups.size()) {
 			// dictionary is too large - use regular aggregation
 			return optional_idx();
 		}
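
The two inequalities are not equivalent, so this changes when the dictionary path is taken. A worked comparison (the threshold value below is hypothetical; the real DICTIONARY_THRESHOLD constant is defined elsewhere in DuckDB):

    #include <cstdio>

    int main() {
        const double kThreshold = 2.0; // hypothetical stand-in for DICTIONARY_THRESHOLD
        const double dict_size = 700;
        const double group_count = 1024;

        // Old predicate: rejects the dictionary path only when the dictionary
        // dwarfs the group count.
        bool old_reject = dict_size >= group_count * kThreshold; // 700 >= 2048 -> false

        // New predicate: rejects much earlier for the same inputs.
        bool new_reject = dict_size * kThreshold >= group_count; // 1400 >= 1024 -> true

        std::printf("old=%d new=%d\n", old_reject, new_reject);
    }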

package/src/duckdb/src/execution/index/art/art.cpp
@@ -1038,9 +1038,11 @@ string ART::GenerateConstraintErrorMessage(VerifyExistenceType verify_type, cons
 	}
 	case VerifyExistenceType::DELETE_FK: {
 		// DELETE_FK that still exists in a FK table, i.e., not a valid delete.
-		return StringUtil::Format("Violates foreign key constraint because key \"%s\" is still referenced by a foreign "
-		                          "key in a different table",
-		                          key_name);
+		return StringUtil::Format(
+		    "Violates foreign key constraint because key \"%s\" is still referenced by a foreign "
+		    "key in a different table. If this is an unexpected constraint violation, please refer to our "
+		    "foreign key limitations in the documentation",
+		    key_name);
 	}
 	default:
 		throw NotImplementedException("Type not implemented for VerifyExistenceType");
@@ -1091,16 +1093,27 @@ void ART::VerifyLeaf(const Node &leaf, const ARTKey &key, optional_ptr<ART> dele
 		return;
 	}
 
+	// Fast path for FOREIGN KEY constraints.
+	// Up to here, the above code paths work implicitly for FKs, as the leaf is inlined.
 	// FIXME: proper foreign key + delete ART support.
-	// This implicitly works for foreign keys, as we do not have to consider the actual row IDs.
-	// We only need to know that there are conflicts (for now), as we still perform over-eager constraint checking.
+	if (index_constraint_type == IndexConstraintType::FOREIGN) {
+		D_ASSERT(!deleted_leaf);
+		// We don't handle FK conflicts in UPSERT, so the row ID should not matter.
+		if (manager.AddHit(i, MAX_ROW_ID)) {
+			conflict_idx = i;
+		}
+		return;
+	}
 
 	// Scan the two row IDs in the leaf.
 	Iterator it(*this);
 	it.FindMinimum(leaf);
 	ARTKey empty_key = ARTKey();
 	unsafe_vector<row_t> row_ids;
-	it.Scan(empty_key, 2, row_ids, false);
+	auto success = it.Scan(empty_key, 2, row_ids, false);
+	if (!success || row_ids.size() != 2) {
+		throw InternalException("VerifyLeaf expects exactly two row IDs to be scanned");
+	}
 
 	if (!deleted_leaf) {
		if (manager.AddHit(i, row_ids[0]) || manager.AddHit(i, row_ids[1])) {
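
For FOREIGN KEY constraints the verifier only needs to know that some conflict exists, not which row caused it, hence the MAX_ROW_ID sentinel in the fast path above. A hedged standalone sketch of that idea (all names hypothetical):

    #include <cstddef>
    #include <cstdint>
    #include <limits>
    #include <utility>
    #include <vector>

    // Stand-in for MAX_ROW_ID: a row ID no real row can have.
    constexpr int64_t kSentinelRowId = std::numeric_limits<int64_t>::max();

    struct ConflictLog {
        std::vector<std::pair<std::size_t, int64_t>> hits;
        bool AddHit(std::size_t chunk_idx, int64_t row_id) {
            hits.emplace_back(chunk_idx, row_id);
            return true; // pretend every hit is a reportable conflict
        }
    };

    // Existence of a conflict is all that matters for FK verification.
    void VerifyForeignKey(ConflictLog &log, std::size_t i, std::size_t &conflict_idx) {
        if (log.AddHit(i, kSentinelRowId)) {
            conflict_idx = i;
        }
    }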

package/src/duckdb/src/execution/index/art/iterator.cpp
@@ -46,9 +46,11 @@ bool Iterator::Scan(const ARTKey &upper_bound, const idx_t max_count, unsafe_vec
 	bool has_next;
 	do {
 		// An empty upper bound indicates that no upper bound exists.
-		if (!upper_bound.Empty() && status == GateStatus::GATE_NOT_SET) {
-			if (current_key.GreaterThan(upper_bound, equal, nested_depth)) {
-				return true;
+		if (!upper_bound.Empty()) {
+			if (status == GateStatus::GATE_NOT_SET || entered_nested_leaf) {
+				if (current_key.GreaterThan(upper_bound, equal, nested_depth)) {
+					return true;
+				}
 			}
 		}
 
@@ -86,6 +88,7 @@ bool Iterator::Scan(const ARTKey &upper_bound, const idx_t max_count, unsafe_vec
 			throw InternalException("Invalid leaf type for index scan.");
 		}
 
+		entered_nested_leaf = false;
 		has_next = Next();
 	} while (has_next);
 	return true;
@@ -104,6 +107,7 @@ void Iterator::FindMinimum(const Node &node) {
 	if (node.GetGateStatus() == GateStatus::GATE_SET) {
 		D_ASSERT(status == GateStatus::GATE_NOT_SET);
 		status = GateStatus::GATE_SET;
+		entered_nested_leaf = true;
 		nested_depth = 0;
 	}
 

package/src/duckdb/src/execution/operator/aggregate/physical_window.cpp
@@ -575,6 +575,11 @@ public:
 
 	explicit WindowLocalSourceState(WindowGlobalSourceState &gsource);
 
+	void ReleaseLocalStates() {
+		auto &local_states = window_hash_group->thread_states.at(task->thread_idx);
+		local_states.clear();
+	}
+
 	//! Does the task have more work to do?
 	bool TaskFinished() const {
 		return !task || task->begin_idx == task->end_idx;
@@ -792,6 +797,12 @@ void WindowGlobalSourceState::FinishTask(TaskPtr task) {
 }
 
 bool WindowLocalSourceState::TryAssignTask() {
+	D_ASSERT(TaskFinished());
+	if (task && task->stage == WindowGroupStage::GETDATA) {
+		// If this state completed the last block in the previous iteration,
+		// release out local state memory.
+		ReleaseLocalStates();
+	}
 	// Because downstream operators may be using our internal buffers,
 	// we can't "finish" a task until we are about to get the next one.
 
@@ -888,10 +899,6 @@ void WindowLocalSourceState::GetData(DataChunk &result) {
 		++task->begin_idx;
 	}
 
-	// If that was the last block, release out local state memory.
-	if (TaskFinished()) {
-		local_states.clear();
-	}
 	result.Verify();
 }
 

package/src/duckdb/src/execution/operator/csv_scanner/buffer_manager/csv_buffer.cpp
@@ -4,7 +4,7 @@
 namespace duckdb {
 
 CSVBuffer::CSVBuffer(ClientContext &context, idx_t buffer_size_p, CSVFileHandle &file_handle,
-                     idx_t &global_csv_current_position, idx_t file_number_p)
+                     const idx_t &global_csv_current_position, idx_t file_number_p)
     : context(context), requested_size(buffer_size_p), file_number(file_number_p), can_seek(file_handle.CanSeek()),
       is_pipe(file_handle.IsPipe()) {
 	AllocateBuffer(buffer_size_p);
@@ -34,7 +34,7 @@ CSVBuffer::CSVBuffer(CSVFileHandle &file_handle, ClientContext &context, idx_t b
 }
 
 shared_ptr<CSVBuffer> CSVBuffer::Next(CSVFileHandle &file_handle, idx_t buffer_size, idx_t file_number_p,
-                                      bool &has_seaked) {
+                                      bool &has_seaked) const {
 	if (has_seaked) {
 		// This means that at some point a reload was done, and we are currently on the incorrect position in our file
 		// handle

package/src/duckdb/src/execution/operator/csv_scanner/encode/csv_encoder.cpp
@@ -36,7 +36,7 @@ void CSVEncoderBuffer::Reset() {
 	actual_encoded_buffer_size = 0;
 }
 
-CSVEncoder::CSVEncoder(DBConfig &config, const string &encoding_name_to_find, idx_t buffer_size) {
+CSVEncoder::CSVEncoder(const DBConfig &config, const string &encoding_name_to_find, idx_t buffer_size) {
 	encoding_name = StringUtil::Lower(encoding_name_to_find);
 	auto function = config.GetEncodeFunction(encoding_name_to_find);
 	if (!function) {
@@ -51,6 +51,10 @@ CSVEncoder::CSVEncoder(const DBConfig &config, const string &encoding_name_to_find, id
 	}
 	// We ensure that the encoded buffer size is an even number to make the two byte lookup on utf-16 work
 	idx_t encoded_buffer_size = buffer_size % 2 != 0 ? buffer_size - 1 : buffer_size;
+	if (encoded_buffer_size == 0) {
+		// This might happen if buffer size = 1
+		encoded_buffer_size = 2;
+	}
 	D_ASSERT(encoded_buffer_size > 0);
 	encoded_buffer.Initialize(encoded_buffer_size);
 	remaining_bytes_buffer.Initialize(function->GetBytesPerIteration());
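
The sizing arithmetic: a requested size of 1 rounds down to 0 (1 % 2 != 0, so 1 - 1), which previously tripped the D_ASSERT; the new branch bumps it to the smallest usable even size. A standalone sketch of the computation:

    #include <cassert>
    #include <cstddef>

    // Mirrors the sizing logic above: the UTF-16 two-byte lookup needs an even
    // buffer, and a requested size of 1 must not collapse to 0.
    std::size_t EncodedBufferSize(std::size_t requested) {
        std::size_t size = requested % 2 != 0 ? requested - 1 : requested;
        if (size == 0) {
            size = 2; // smallest even buffer that can hold one UTF-16 code unit
        }
        assert(size > 0 && size % 2 == 0);
        return size;
    }

    int main() {
        assert(EncodedBufferSize(1) == 2); // the previously failing edge case
        assert(EncodedBufferSize(7) == 6);
        assert(EncodedBufferSize(8) == 8);
    }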

package/src/duckdb/src/execution/operator/csv_scanner/scanner/base_scanner.cpp
@@ -11,9 +11,10 @@ ScannerResult::ScannerResult(CSVStates &states_p, CSVStateMachine &state_machine
 
 BaseScanner::BaseScanner(shared_ptr<CSVBufferManager> buffer_manager_p, shared_ptr<CSVStateMachine> state_machine_p,
                          shared_ptr<CSVErrorHandler> error_handler_p, bool sniffing_p,
-                         shared_ptr<CSVFileScan> csv_file_scan_p, CSVIterator iterator_p)
+                         shared_ptr<CSVFileScan> csv_file_scan_p, const CSVIterator &iterator_p)
     : csv_file_scan(std::move(csv_file_scan_p)), sniffing(sniffing_p), error_handler(std::move(error_handler_p)),
-      state_machine(std::move(state_machine_p)), buffer_manager(std::move(buffer_manager_p)), iterator(iterator_p) {
+      state_machine(std::move(state_machine_p)), states(), buffer_manager(std::move(buffer_manager_p)),
+      iterator(iterator_p) {
 	D_ASSERT(buffer_manager);
 	D_ASSERT(state_machine);
 	// Initialize current buffer handle

package/src/duckdb/src/execution/operator/csv_scanner/scanner/csv_schema.cpp
@@ -76,8 +76,8 @@ void CSVSchema::MergeSchemas(CSVSchema &other, bool null_padding) {
 	}
 }
 
-CSVSchema::CSVSchema(vector<string> &names, vector<LogicalType> &types, const string &file_path, idx_t rows_read_p,
-                     const bool empty_p)
+CSVSchema::CSVSchema(const vector<string> &names, const vector<LogicalType> &types, const string &file_path,
+                     idx_t rows_read_p, const bool empty_p)
     : rows_read(rows_read_p), empty(empty_p) {
 	Initialize(names, types, file_path);
 }

package/src/duckdb/src/execution/operator/csv_scanner/scanner/scanner_boundary.cpp
@@ -13,7 +13,7 @@ CSVBoundary::CSVBoundary(idx_t buffer_idx_p, idx_t buffer_pos_p, idx_t boundary_
 CSVBoundary::CSVBoundary() : buffer_idx(0), buffer_pos(0), boundary_idx(0), end_pos(NumericLimits<idx_t>::Maximum()) {
 }
 
-CSVIterator::CSVIterator() : is_set(false) {
+CSVIterator::CSVIterator() : buffer_size(0), is_set(false) {
 }
 
 void CSVBoundary::Print() const {

package/src/duckdb/src/execution/operator/csv_scanner/scanner/string_value_scanner.cpp
@@ -688,23 +688,29 @@ bool LineError::HandleErrors(StringValueResult &result) {
 			                                  line_pos.GetGlobalPosition(result.requested_size), result.path);
 			}
 			break;
-		case CAST_ERROR:
+		case CAST_ERROR: {
+			string column_name;
+			LogicalTypeId type_id;
+			if (cur_error.col_idx < result.names.size()) {
+				column_name = result.names[cur_error.col_idx];
+			}
+			if (cur_error.col_idx < result.number_of_columns) {
+				type_id = result.parse_types[cur_error.chunk_idx].type_id;
+			}
 			if (result.current_line_position.begin == line_pos) {
 				csv_error = CSVError::CastError(
-				    result.state_machine.options, result.names[cur_error.col_idx], cur_error.error_message,
-				    cur_error.col_idx, borked_line, lines_per_batch,
+				    result.state_machine.options, column_name, cur_error.error_message, cur_error.col_idx, borked_line,
+				    lines_per_batch,
 				    result.current_line_position.begin.GetGlobalPosition(result.requested_size, first_nl),
-				    line_pos.GetGlobalPosition(result.requested_size, first_nl),
-				    result.parse_types[cur_error.chunk_idx].type_id, result.path);
+				    line_pos.GetGlobalPosition(result.requested_size, first_nl), type_id, result.path);
 			} else {
 				csv_error = CSVError::CastError(
-				    result.state_machine.options, result.names[cur_error.col_idx], cur_error.error_message,
-				    cur_error.col_idx, borked_line, lines_per_batch,
+				    result.state_machine.options, column_name, cur_error.error_message, cur_error.col_idx, borked_line,
+				    lines_per_batch,
 				    result.current_line_position.begin.GetGlobalPosition(result.requested_size, first_nl),
-				    line_pos.GetGlobalPosition(result.requested_size), result.parse_types[cur_error.chunk_idx].type_id,
-				    result.path);
+				    line_pos.GetGlobalPosition(result.requested_size), type_id, result.path);
 			}
-			break;
+		} break;
 		case MAXIMUM_LINE_SIZE:
 			csv_error = CSVError::LineSizeError(
 			    result.state_machine.options, lines_per_batch, borked_line,
@@ -964,7 +970,8 @@ StringValueScanner::StringValueScanner(idx_t scanner_idx_p, const shared_ptr<CSV
       result(states, *state_machine, cur_buffer_handle, BufferAllocator::Get(buffer_manager->context), result_size,
             iterator.pos.buffer_pos, *error_handler, iterator,
             buffer_manager->context.client_data->debug_set_max_line_length, csv_file_scan, lines_read, sniffing,
-             buffer_manager->GetFilePath(), scanner_idx_p) {
+             buffer_manager->GetFilePath(), scanner_idx_p),
+      start_pos(0) {
 	iterator.buffer_size = state_machine->options.buffer_size_option.GetValue();
 }
 
@@ -976,7 +983,8 @@ StringValueScanner::StringValueScanner(const shared_ptr<CSVBufferManager> &buffe
       result(states, *state_machine, cur_buffer_handle, Allocator::DefaultAllocator(), result_size,
             iterator.pos.buffer_pos, *error_handler, iterator,
            buffer_manager->context.client_data->debug_set_max_line_length, csv_file_scan, lines_read, sniffing,
-             buffer_manager->GetFilePath(), 0) {
+             buffer_manager->GetFilePath(), 0),
+      start_pos(0) {
 	iterator.buffer_size = state_machine->options.buffer_size_option.GetValue();
 }
 

package/src/duckdb/src/execution/operator/csv_scanner/sniffer/dialect_detection.cpp
@@ -397,7 +397,13 @@ void CSVSniffer::AnalyzeDialectCandidate(unique_ptr<ColumnCountScanner> scanner,
 			}
 		}
 	}
-	if (max_columns_found == num_cols && ignored_rows > min_ignored_rows) {
+	if (max_columns_found == num_cols && (ignored_rows > min_ignored_rows)) {
+		return;
+	}
+	if (max_columns_found > 1 && num_cols > max_columns_found && consistent_rows < best_consistent_rows / 2 &&
+	    options.null_padding) {
+		// When null_padding is true, we only give preference to a max number of columns if null padding is at least
+		// 50% as consistent as the best case scenario
 		return;
 	}
 	if (quoted && num_cols < max_columns_found) {
@@ -436,28 +442,19 @@ void CSVSniffer::AnalyzeDialectCandidate(unique_ptr<ColumnCountScanner> scanner,
 	    !require_more_padding && !invalid_padding && num_cols == max_columns_found && comments_are_acceptable) {
 		auto &sniffing_state_machine = scanner->GetStateMachine();
 
-		bool same_quote_is_candidate = false;
-		for (const auto &candidate : candidates) {
-			if (sniffing_state_machine.dialect_options.state_machine_options.quote ==
-			    candidate->GetStateMachine().dialect_options.state_machine_options.quote) {
-				same_quote_is_candidate = true;
-			}
-		}
-		if (!same_quote_is_candidate) {
-			if (options.dialect_options.skip_rows.IsSetByUser()) {
-				// If skip rows is set by user, and we found dirty notes, we only accept it if either null_padding or
-				// ignore_errors is set
-				if (dirty_notes != 0 && !options.null_padding && !options.ignore_errors.GetValue()) {
-					return;
-				}
-				sniffing_state_machine.dialect_options.skip_rows = options.dialect_options.skip_rows.GetValue();
-			} else if (!options.null_padding) {
-				sniffing_state_machine.dialect_options.skip_rows = dirty_notes;
+		if (options.dialect_options.skip_rows.IsSetByUser()) {
+			// If skip rows is set by user, and we found dirty notes, we only accept it if either null_padding or
+			// ignore_errors is set
+			if (dirty_notes != 0 && !options.null_padding && !options.ignore_errors.GetValue()) {
+				return;
 			}
-			sniffing_state_machine.dialect_options.num_cols = num_cols;
-			lines_sniffed = sniffed_column_counts.result_position;
-			candidates.emplace_back(std::move(scanner));
+			sniffing_state_machine.dialect_options.skip_rows = options.dialect_options.skip_rows.GetValue();
+		} else if (!options.null_padding) {
+			sniffing_state_machine.dialect_options.skip_rows = dirty_notes;
 		}
+		sniffing_state_machine.dialect_options.num_cols = num_cols;
+		lines_sniffed = sniffed_column_counts.result_position;
+		candidates.emplace_back(std::move(scanner));
 	}
 }
 
@@ -491,7 +488,7 @@ void CSVSniffer::RefineCandidates() {
 
 	for (idx_t i = 1; i <= options.sample_size_chunks; i++) {
 		vector<unique_ptr<ColumnCountScanner>> successful_candidates;
-		bool done = false;
+		bool done = candidates.empty();
 		for (auto &cur_candidate : candidates) {
			const bool finished_file = cur_candidate->FinishedFile();
			if (successful_candidates.empty()) {
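
The new null_padding rule in the first hunk, with illustrative numbers: a candidate that found more columns is kept only if it is at least half as consistent as the best candidate so far.

    #include <cstddef>
    #include <cstdio>

    int main() {
        // Illustrative values only; the sniffer derives these from the sample.
        const std::size_t best_consistent_rows = 100;
        const bool null_padding = true;

        for (std::size_t consistent_rows : {40u, 60u}) {
            // Mirrors: consistent_rows < best_consistent_rows / 2 -> reject.
            bool reject = null_padding && consistent_rows < best_consistent_rows / 2;
            std::printf("consistent_rows=%zu -> %s\n", consistent_rows,
                        reject ? "rejected" : "kept");
        }
    }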

package/src/duckdb/src/execution/operator/csv_scanner/sniffer/type_refinement.cpp
@@ -2,7 +2,7 @@
 #include "duckdb/execution/operator/csv_scanner/csv_casting.hpp"
 
 namespace duckdb {
-bool CSVSniffer::TryCastVector(Vector &parse_chunk_col, idx_t size, const LogicalType &sql_type) {
+bool CSVSniffer::TryCastVector(Vector &parse_chunk_col, idx_t size, const LogicalType &sql_type) const {
 	auto &sniffing_state_machine = best_candidate->GetStateMachine();
 	// try vector-cast from string to sql_type
 	Vector dummy_result(sql_type, size);

package/src/duckdb/src/execution/operator/csv_scanner/util/csv_error.cpp
@@ -303,6 +303,7 @@ CSVError CSVError::CastError(const CSVReaderOptions &options, const string &colu
 		           "correctly parse this column."
 		        << '\n';
 	}
+	how_to_fix_it << "* Check whether the null string value is set correctly (e.g., nullstr = 'N/A')" << '\n';
 
 	return CSVError(error.str(), CAST_ERROR, column_idx, csv_row, error_info, row_byte_position, byte_position, options,
	                how_to_fix_it.str(), current_path);
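
The new suggestion points at read_csv's nullstr option. A hedged usage example through DuckDB's C++ API (data.csv is a hypothetical file whose missing values are written as N/A):

    #include "duckdb.hpp"

    int main() {
        duckdb::DuckDB db(nullptr);
        duckdb::Connection con(db);

        // Without nullstr, an 'N/A' in an INTEGER column raises the cast error
        // that the hint above is attached to; with it, the value becomes NULL.
        auto result = con.Query("SELECT * FROM read_csv('data.csv', nullstr = 'N/A')");
        if (result->HasError()) {
            result->Print();
        }
    }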

package/src/duckdb/src/execution/operator/csv_scanner/util/csv_reader_options.cpp
@@ -251,6 +251,10 @@ void CSVReaderOptions::SetReadOption(const string &loption, const Value &value,
 			throw BinderException("Invalid value for MAX_LINE_SIZE parameter: it cannot be smaller than 0");
 		}
 		maximum_line_size.Set(NumericCast<idx_t>(line_size));
+		if (buffer_size_option.IsSetByUser() && maximum_line_size.GetValue() > buffer_size_option.GetValue()) {
+			throw InvalidInputException("Buffer Size of %d must be a higher value than the maximum line size %d",
+			                            buffer_size_option.GetValue(), maximum_line_size.GetValue());
+		}
 	} else if (loption == "date_format" || loption == "dateformat") {
 		string format = ParseString(value, loption);
 		SetDateFormat(LogicalTypeId::DATE, format, true);
@@ -264,6 +268,12 @@ void CSVReaderOptions::SetReadOption(const string &loption, const Value &value,
 		if (buffer_size_option == 0) {
 			throw InvalidInputException("Buffer Size option must be higher than 0");
 		}
+		if (maximum_line_size.IsSetByUser() && maximum_line_size.GetValue() > buffer_size_option.GetValue()) {
+			throw InvalidInputException("Buffer Size of %d must be a higher value than the maximum line size %d",
+			                            buffer_size_option.GetValue(), maximum_line_size.GetValue());
+		} else {
+			maximum_line_size.Set(buffer_size_option.GetValue(), false);
+		}
 	} else if (loption == "decimal_separator") {
 		decimal_separator = ParseString(value, loption);
 		if (decimal_separator != "." && decimal_separator != ",") {
@@ -298,6 +308,9 @@ void CSVReaderOptions::SetReadOption(const string &loption, const Value &value,
 		if (table_name.empty()) {
 			throw BinderException("REJECTS_TABLE option cannot be empty");
 		}
+		if (KeywordHelper::RequiresQuotes(table_name)) {
+			throw BinderException("rejects_scan option: %s requires quotes to be used as an identifier", table_name);
+		}
 		rejects_table_name.Set(table_name);
 	} else if (loption == "rejects_scan") {
 		// skip, handled in SetRejectsOptions
@@ -305,6 +318,9 @@ void CSVReaderOptions::SetReadOption(const string &loption, const Value &value,
 		if (table_name.empty()) {
 			throw BinderException("rejects_scan option cannot be empty");
 		}
+		if (KeywordHelper::RequiresQuotes(table_name)) {
+			throw BinderException("rejects_scan option: %s requires quotes to be used as an identifier", table_name);
+		}
 		rejects_scan_name.Set(table_name);
 	} else if (loption == "rejects_limit") {
		auto limit = ParseInteger(value, loption);
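
With this cross-check, contradictory options now fail at bind time rather than partway through a scan. A hedged example through the C++ API (file name hypothetical):

    #include "duckdb.hpp"

    int main() {
        duckdb::DuckDB db(nullptr);
        duckdb::Connection con(db);

        // buffer_size (100 bytes) is smaller than max_line_size (1000 bytes),
        // so this is rejected up front with an InvalidInputException.
        auto result = con.Query(
            "SELECT * FROM read_csv('data.csv', buffer_size = 100, max_line_size = 1000)");
        if (result->HasError()) {
            result->Print(); // "Buffer Size of 100 must be a higher value than ..."
        }
    }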

package/src/duckdb/src/execution/operator/helper/physical_reservoir_sample.cpp
@@ -86,6 +86,7 @@ SourceResultType PhysicalReservoirSample::GetData(ExecutionContext &context, Dat
 		return SourceResultType::FINISHED;
 	}
 	auto sample_chunk = sink.sample->GetChunk();
+
 	if (!sample_chunk) {
 		return SourceResultType::FINISHED;
 	}

package/src/duckdb/src/execution/operator/helper/physical_streaming_sample.cpp
@@ -5,10 +5,11 @@
 
 namespace duckdb {
 
-PhysicalStreamingSample::PhysicalStreamingSample(vector<LogicalType> types, SampleMethod method, double percentage,
-                                                 int64_t seed, idx_t estimated_cardinality)
-    : PhysicalOperator(PhysicalOperatorType::STREAMING_SAMPLE, std::move(types), estimated_cardinality), method(method),
-      percentage(percentage / 100), seed(seed) {
+PhysicalStreamingSample::PhysicalStreamingSample(vector<LogicalType> types, unique_ptr<SampleOptions> options,
+                                                 idx_t estimated_cardinality)
+    : PhysicalOperator(PhysicalOperatorType::STREAMING_SAMPLE, std::move(types), estimated_cardinality),
+      sample_options(std::move(options)) {
+	percentage = sample_options->sample_size.GetValue<double>() / 100;
 }
 
 //===--------------------------------------------------------------------===//
@@ -49,13 +50,21 @@ void PhysicalStreamingSample::BernoulliSample(DataChunk &input, DataChunk &resul
 	}
 }
 
+bool PhysicalStreamingSample::ParallelOperator() const {
+	return !(sample_options->repeatable || sample_options->seed.IsValid());
+}
+
 unique_ptr<OperatorState> PhysicalStreamingSample::GetOperatorState(ExecutionContext &context) const {
-	return make_uniq<StreamingSampleOperatorState>(seed);
+	if (!ParallelOperator()) {
+		return make_uniq<StreamingSampleOperatorState>(static_cast<int64_t>(sample_options->seed.GetIndex()));
+	}
+	RandomEngine random;
+	return make_uniq<StreamingSampleOperatorState>(static_cast<int64_t>(random.NextRandomInteger64()));
 }
 
 OperatorResultType PhysicalStreamingSample::Execute(ExecutionContext &context, DataChunk &input, DataChunk &chunk,
                                                     GlobalOperatorState &gstate, OperatorState &state) const {
-	switch (method) {
+	switch (sample_options->method) {
 	case SampleMethod::BERNOULLI_SAMPLE:
 		BernoulliSample(input, chunk, state);
 		break;
@@ -70,7 +79,7 @@ OperatorResultType PhysicalStreamingSample::Execute(ExecutionContext &context, D
 
 InsertionOrderPreservingMap<string> PhysicalStreamingSample::ParamsToString() const {
 	InsertionOrderPreservingMap<string> result;
-	result["Sample Method"] = EnumUtil::ToString(method) + ": " + to_string(100 * percentage) + "%";
+	result["Sample Method"] = EnumUtil::ToString(sample_options->method) + ": " + to_string(100 * percentage) + "%";
 	return result;
 }
 
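The operator now runs in parallel only when no seed was requested: a repeatable or explicitly seeded sample reuses the given seed per operator state, while an unseeded one draws a fresh random seed per state. An illustrative SQL-level example through the C++ API (assuming USING SAMPLE's optional second argument as the seed):

    #include "duckdb.hpp"

    int main() {
        duckdb::DuckDB db(nullptr);
        duckdb::Connection con(db);
        con.Query("CREATE TABLE t AS SELECT range AS i FROM range(100000)");

        // Unseeded: eligible for parallel execution, rows differ run to run.
        con.Query("SELECT count(*) FROM t USING SAMPLE 10% (bernoulli)")->Print();

        // Seeded: forces the single-seed path, so the sample is reproducible.
        con.Query("SELECT count(*) FROM t USING SAMPLE 10% (bernoulli, 42)")->Print();
    }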

package/src/duckdb/src/execution/operator/persistent/physical_batch_insert.cpp
@@ -215,7 +215,9 @@ public:
 		auto &gstate = gstate_p.Cast<BatchInsertGlobalState>();
 		auto &lstate = lstate_p.Cast<BatchInsertLocalState>();
 		// merge together the collections
-		D_ASSERT(lstate.writer);
+		if (!lstate.writer) {
+			lstate.writer = &gstate.table.GetStorage().CreateOptimisticWriter(context);
+		}
 		auto final_collection = gstate.MergeCollections(context, std::move(merge_collections), *lstate.writer);
 		// add the merged-together collection to the set of batch indexes
 		lock_guard<mutex> l(gstate.lock);

package/src/duckdb/src/execution/operator/scan/physical_table_scan.cpp
@@ -108,7 +108,17 @@ SourceResultType PhysicalTableScan::GetData(ExecutionContext &context, DataChunk
 	if (g_state.in_out_final) {
 		function.in_out_function_final(context, data, chunk);
 	}
-	function.in_out_function(context, data, g_state.input_chunk, chunk);
+	switch (function.in_out_function(context, data, g_state.input_chunk, chunk)) {
+	case OperatorResultType::BLOCKED: {
+		auto guard = g_state.Lock();
+		return g_state.BlockSource(guard, input.interrupt_state);
+	}
+	default:
+		// FIXME: Handling for other cases (such as NEED_MORE_INPUT) breaks current functionality and extensions that
+		// might be relying on current behaviour. Needs a rework that is not in scope
+		break;
+	}
+
 	if (chunk.size() == 0 && function.in_out_function_final) {
 		function.in_out_function_final(context, data, chunk);
 		g_state.in_out_final = true;