duckdb 1.2.1-dev6.0 → 1.2.1-dev8.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/src/duckdb/extension/core_functions/aggregate/distributive/string_agg.cpp +14 -22
- package/src/duckdb/extension/core_functions/aggregate/nested/list.cpp +0 -1
- package/src/duckdb/extension/core_functions/lambda_functions.cpp +0 -11
- package/src/duckdb/extension/core_functions/scalar/list/list_aggregates.cpp +18 -6
- package/src/duckdb/extension/icu/icu-datefunc.cpp +9 -2
- package/src/duckdb/extension/icu/icu-strptime.cpp +7 -11
- package/src/duckdb/extension/icu/include/icu-datefunc.hpp +3 -1
- package/src/duckdb/extension/json/buffered_json_reader.cpp +18 -31
- package/src/duckdb/extension/json/json_extension.cpp +8 -3
- package/src/duckdb/extension/parquet/column_reader.cpp +4 -6
- package/src/duckdb/extension/parquet/column_writer.cpp +33 -12
- package/src/duckdb/extension/parquet/include/column_reader.hpp +0 -2
- package/src/duckdb/extension/parquet/include/parquet_bss_encoder.hpp +0 -1
- package/src/duckdb/extension/parquet/include/parquet_dlba_encoder.hpp +1 -2
- package/src/duckdb/src/catalog/catalog.cpp +12 -0
- package/src/duckdb/src/catalog/catalog_entry/duck_table_entry.cpp +1 -1
- package/src/duckdb/src/catalog/catalog_entry_retriever.cpp +1 -1
- package/src/duckdb/src/catalog/catalog_search_path.cpp +8 -8
- package/src/duckdb/src/common/bind_helpers.cpp +3 -0
- package/src/duckdb/src/common/compressed_file_system.cpp +2 -0
- package/src/duckdb/src/common/hive_partitioning.cpp +1 -1
- package/src/duckdb/src/common/multi_file_reader.cpp +3 -3
- package/src/duckdb/src/execution/aggregate_hashtable.cpp +1 -1
- package/src/duckdb/src/execution/index/art/art.cpp +19 -6
- package/src/duckdb/src/execution/index/art/iterator.cpp +7 -3
- package/src/duckdb/src/execution/operator/aggregate/physical_window.cpp +11 -4
- package/src/duckdb/src/execution/operator/csv_scanner/buffer_manager/csv_buffer.cpp +2 -2
- package/src/duckdb/src/execution/operator/csv_scanner/encode/csv_encoder.cpp +5 -1
- package/src/duckdb/src/execution/operator/csv_scanner/scanner/base_scanner.cpp +3 -2
- package/src/duckdb/src/execution/operator/csv_scanner/scanner/csv_schema.cpp +2 -2
- package/src/duckdb/src/execution/operator/csv_scanner/scanner/scanner_boundary.cpp +1 -1
- package/src/duckdb/src/execution/operator/csv_scanner/scanner/string_value_scanner.cpp +20 -12
- package/src/duckdb/src/execution/operator/csv_scanner/sniffer/dialect_detection.cpp +19 -22
- package/src/duckdb/src/execution/operator/csv_scanner/sniffer/type_refinement.cpp +1 -1
- package/src/duckdb/src/execution/operator/csv_scanner/util/csv_error.cpp +1 -0
- package/src/duckdb/src/execution/operator/csv_scanner/util/csv_reader_options.cpp +16 -0
- package/src/duckdb/src/execution/operator/helper/physical_reservoir_sample.cpp +1 -0
- package/src/duckdb/src/execution/operator/helper/physical_streaming_sample.cpp +16 -7
- package/src/duckdb/src/execution/operator/persistent/physical_batch_insert.cpp +3 -1
- package/src/duckdb/src/execution/operator/scan/physical_table_scan.cpp +11 -1
- package/src/duckdb/src/execution/operator/schema/physical_create_art_index.cpp +5 -7
- package/src/duckdb/src/execution/physical_plan/plan_create_index.cpp +11 -0
- package/src/duckdb/src/execution/physical_plan/plan_sample.cpp +1 -3
- package/src/duckdb/src/execution/radix_partitioned_hashtable.cpp +14 -5
- package/src/duckdb/src/execution/sample/reservoir_sample.cpp +24 -12
- package/src/duckdb/src/function/scalar/generic/getvariable.cpp +3 -3
- package/src/duckdb/src/function/table/version/pragma_version.cpp +3 -3
- package/src/duckdb/src/function/window/window_aggregate_states.cpp +3 -0
- package/src/duckdb/src/function/window/window_boundaries_state.cpp +108 -48
- package/src/duckdb/src/function/window/window_constant_aggregator.cpp +5 -5
- package/src/duckdb/src/function/window/window_distinct_aggregator.cpp +6 -0
- package/src/duckdb/src/include/duckdb/catalog/catalog_entry_retriever.hpp +1 -1
- package/src/duckdb/src/include/duckdb/catalog/catalog_search_path.hpp +10 -9
- package/src/duckdb/src/include/duckdb/common/adbc/adbc-init.hpp +1 -1
- package/src/duckdb/src/include/duckdb/common/multi_file_reader.hpp +2 -2
- package/src/duckdb/src/include/duckdb/execution/index/art/iterator.hpp +2 -0
- package/src/duckdb/src/include/duckdb/execution/operator/csv_scanner/base_scanner.hpp +1 -1
- package/src/duckdb/src/include/duckdb/execution/operator/csv_scanner/csv_buffer.hpp +5 -4
- package/src/duckdb/src/include/duckdb/execution/operator/csv_scanner/csv_option.hpp +1 -1
- package/src/duckdb/src/include/duckdb/execution/operator/csv_scanner/csv_schema.hpp +2 -2
- package/src/duckdb/src/include/duckdb/execution/operator/csv_scanner/encode/csv_encoder.hpp +1 -1
- package/src/duckdb/src/include/duckdb/execution/operator/csv_scanner/sniffer/csv_sniffer.hpp +1 -1
- package/src/duckdb/src/include/duckdb/execution/operator/csv_scanner/string_value_scanner.hpp +2 -2
- package/src/duckdb/src/include/duckdb/execution/operator/helper/physical_streaming_sample.hpp +3 -7
- package/src/duckdb/src/include/duckdb/execution/reservoir_sample.hpp +2 -1
- package/src/duckdb/src/include/duckdb/function/lambda_functions.hpp +11 -3
- package/src/duckdb/src/include/duckdb/function/window/window_boundaries_state.hpp +4 -0
- package/src/duckdb/src/include/duckdb/main/client_context_state.hpp +4 -0
- package/src/duckdb/src/include/duckdb/main/extension_entries.hpp +25 -7
- package/src/duckdb/src/include/duckdb/main/pending_query_result.hpp +2 -0
- package/src/duckdb/src/include/duckdb/main/query_profiler.hpp +7 -0
- package/src/duckdb/src/include/duckdb/optimizer/filter_combiner.hpp +2 -2
- package/src/duckdb/src/include/duckdb/optimizer/late_materialization.hpp +2 -1
- package/src/duckdb/src/include/duckdb/optimizer/optimizer_extension.hpp +11 -5
- package/src/duckdb/src/include/duckdb/parallel/executor_task.hpp +4 -1
- package/src/duckdb/src/include/duckdb/parallel/pipeline.hpp +0 -1
- package/src/duckdb/src/include/duckdb/parallel/task_executor.hpp +3 -0
- package/src/duckdb/src/include/duckdb/parallel/task_notifier.hpp +27 -0
- package/src/duckdb/src/include/duckdb/parallel/task_scheduler.hpp +4 -0
- package/src/duckdb/src/include/duckdb/planner/expression/bound_subquery_expression.hpp +1 -1
- package/src/duckdb/src/include/duckdb/planner/tableref/bound_cteref.hpp +1 -0
- package/src/duckdb/src/include/duckdb/storage/checkpoint/table_data_writer.hpp +3 -1
- package/src/duckdb/src/include/duckdb/storage/checkpoint_manager.hpp +7 -1
- package/src/duckdb/src/include/duckdb/storage/storage_manager.hpp +3 -2
- package/src/duckdb/src/include/duckdb.h +495 -480
- package/src/duckdb/src/main/attached_database.cpp +1 -1
- package/src/duckdb/src/main/capi/duckdb-c.cpp +5 -1
- package/src/duckdb/src/main/capi/helper-c.cpp +8 -0
- package/src/duckdb/src/main/config.cpp +7 -1
- package/src/duckdb/src/main/database.cpp +8 -8
- package/src/duckdb/src/main/extension/extension_helper.cpp +3 -1
- package/src/duckdb/src/main/extension/extension_load.cpp +12 -12
- package/src/duckdb/src/optimizer/column_lifetime_analyzer.cpp +1 -0
- package/src/duckdb/src/optimizer/join_order/query_graph_manager.cpp +2 -2
- package/src/duckdb/src/optimizer/late_materialization.cpp +26 -5
- package/src/duckdb/src/optimizer/optimizer.cpp +12 -1
- package/src/duckdb/src/parallel/executor_task.cpp +10 -6
- package/src/duckdb/src/parallel/task_executor.cpp +4 -1
- package/src/duckdb/src/parallel/task_notifier.cpp +23 -0
- package/src/duckdb/src/parallel/task_scheduler.cpp +33 -0
- package/src/duckdb/src/parser/transform/expression/transform_subquery.cpp +4 -1
- package/src/duckdb/src/planner/binder/expression/bind_subquery_expression.cpp +1 -1
- package/src/duckdb/src/planner/binder/query_node/plan_subquery.cpp +4 -2
- package/src/duckdb/src/planner/binder/statement/bind_create.cpp +7 -2
- package/src/duckdb/src/planner/binder/statement/bind_create_table.cpp +6 -5
- package/src/duckdb/src/storage/checkpoint/table_data_writer.cpp +4 -2
- package/src/duckdb/src/storage/checkpoint_manager.cpp +4 -3
- package/src/duckdb/src/storage/compression/string_uncompressed.cpp +21 -10
- package/src/duckdb/src/storage/storage_info.cpp +2 -0
- package/src/duckdb/src/storage/storage_manager.cpp +2 -2
- package/src/duckdb/src/storage/table/row_group.cpp +5 -6
- package/src/duckdb/src/storage/table/scan_state.cpp +6 -0
- package/src/duckdb/src/transaction/duck_transaction.cpp +11 -3
- package/src/duckdb/src/transaction/duck_transaction_manager.cpp +2 -2
- package/src/duckdb/third_party/concurrentqueue/concurrentqueue.h +17 -0
- package/src/duckdb/ub_src_parallel.cpp +2 -0
package/src/duckdb/src/catalog/catalog_search_path.cpp

@@ -189,11 +189,11 @@ void CatalogSearchPath::Set(CatalogSearchEntry new_value, CatalogSetPathType set
 	Set(std::move(new_paths), set_type);
 }

-const vector<CatalogSearchEntry> &CatalogSearchPath::Get() {
+const vector<CatalogSearchEntry> &CatalogSearchPath::Get() const {
 	return paths;
 }

-string CatalogSearchPath::GetDefaultSchema(const string &catalog) {
+string CatalogSearchPath::GetDefaultSchema(const string &catalog) const {
 	for (auto &path : paths) {
 		if (path.catalog == TEMP_CATALOG) {
 			continue;
@@ -205,7 +205,7 @@ string CatalogSearchPath::GetDefaultSchema(const string &catalog) {
 	return DEFAULT_SCHEMA;
 }

-string CatalogSearchPath::GetDefaultSchema(ClientContext &context, const string &catalog) {
+string CatalogSearchPath::GetDefaultSchema(ClientContext &context, const string &catalog) const {
 	for (auto &path : paths) {
 		if (path.catalog == TEMP_CATALOG) {
 			continue;
@@ -221,7 +221,7 @@ string CatalogSearchPath::GetDefaultSchema(ClientContext &context, const string
 	return DEFAULT_SCHEMA;
 }

-string CatalogSearchPath::GetDefaultCatalog(const string &schema) {
+string CatalogSearchPath::GetDefaultCatalog(const string &schema) const {
 	if (DefaultSchemaGenerator::IsDefaultSchema(schema)) {
 		return SYSTEM_CATALOG;
 	}
@@ -236,7 +236,7 @@ string CatalogSearchPath::GetDefaultCatalog(const string &schema) {
 	return INVALID_CATALOG;
 }

-vector<string> CatalogSearchPath::GetCatalogsForSchema(const string &schema) {
+vector<string> CatalogSearchPath::GetCatalogsForSchema(const string &schema) const {
 	vector<string> catalogs;
 	if (DefaultSchemaGenerator::IsDefaultSchema(schema)) {
 		catalogs.push_back(SYSTEM_CATALOG);
@@ -250,7 +250,7 @@ vector<string> CatalogSearchPath::GetCatalogsForSchema(const string &schema) {
 	return catalogs;
 }

-vector<string> CatalogSearchPath::GetSchemasForCatalog(const string &catalog) {
+vector<string> CatalogSearchPath::GetSchemasForCatalog(const string &catalog) const {
 	vector<string> schemas;
 	for (auto &path : paths) {
 		if (StringUtil::CIEquals(path.catalog, catalog)) {
@@ -260,7 +260,7 @@ vector<string> CatalogSearchPath::GetSchemasForCatalog(const string &catalog) {
 	return schemas;
 }

-const CatalogSearchEntry &CatalogSearchPath::GetDefault() {
+const CatalogSearchEntry &CatalogSearchPath::GetDefault() const {
 	const auto &paths = Get();
 	D_ASSERT(paths.size() >= 2);
 	return paths[1];
@@ -281,7 +281,7 @@ void CatalogSearchPath::SetPathsInternal(vector<CatalogSearchEntry> new_paths) {
 }

 bool CatalogSearchPath::SchemaInSearchPath(ClientContext &context, const string &catalog_name,
-                                           const string &schema_name) {
+                                           const string &schema_name) const {
 	for (auto &path : paths) {
 		if (!StringUtil::CIEquals(path.schema, schema_name)) {
 			continue;
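Note on the hunks above: every read-only accessor of CatalogSearchPath gains a const qualifier. A minimal stand-alone C++ sketch (hypothetical SearchPath class, not DuckDB's API) of what the change buys: callers that only hold a const reference can now use the accessors.

#include <cstddef>
#include <string>
#include <vector>

class SearchPath {
public:
	// const-qualified accessor: callable through a const reference
	const std::vector<std::string> &Get() const {
		return paths;
	}

private:
	std::vector<std::string> paths {"main", "temp"};
};

// Compiles only because Get() is const; with the pre-patch non-const
// signature this function would be rejected by the compiler.
std::size_t CountEntries(const SearchPath &path) {
	return path.Get().size();
}

int main() {
	SearchPath p;
	return CountEntries(p) == 2 ? 0 : 1;
}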
package/src/duckdb/src/common/bind_helpers.cpp

@@ -56,6 +56,9 @@ vector<bool> ParseColumnList(const Value &value, vector<string> &names, const st
 		}
 		throw BinderException("\"%s\" expects a column list or * as parameter", loption);
 	}
+	if (value.IsNull()) {
+		throw BinderException("\"%s\" expects a column list or * as parameter, it can't be a NULL value", loption);
+	}
 	auto &children = ListValue::GetChildren(value);
 	// accept '*' as single argument
 	if (children.size() == 1 && children[0].type().id() == LogicalTypeId::VARCHAR &&
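The added guard rejects a NULL value before ListValue::GetChildren dereferences it. A stand-alone sketch of the same guard pattern, using hypothetical stand-in types rather than DuckDB's Value API:

#include <stdexcept>
#include <string>
#include <vector>

// Hypothetical stand-in for a nullable list value.
struct ListValue {
	bool is_null = false;
	std::vector<std::string> children;
};

const std::vector<std::string> &GetChildren(const ListValue &value, const std::string &loption) {
	if (value.is_null) {
		// Without this check the caller would iterate the children of a NULL value.
		throw std::invalid_argument("\"" + loption + "\" expects a column list or *, not NULL");
	}
	return value.children;
}

int main() {
	ListValue null_value;
	null_value.is_null = true;
	try {
		GetChildren(null_value, "force_quote");
	} catch (const std::invalid_argument &) {
		return 0; // expected: the guard fired
	}
	return 1;
}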
package/src/duckdb/src/common/compressed_file_system.cpp

@@ -31,6 +31,8 @@ void CompressedFile::Initialize(bool write) {
 	stream_data.out_buff_start = stream_data.out_buff.get();
 	stream_data.out_buff_end = stream_data.out_buff.get();

+	current_position = 0;
+
 	stream_wrapper = compressed_fs.CreateStream();
 	stream_wrapper->Initialize(*this, write);
 }
package/src/duckdb/src/common/hive_partitioning.cpp

@@ -245,7 +245,7 @@ static void TemplatedGetHivePartitionValues(Vector &input, vector<HivePartitionK

 	const auto &type = input.GetType();

-	const auto reinterpret = Value::CreateValue<T>(data[0]).GetTypeMutable() != type;
+	const auto reinterpret = Value::CreateValue<T>(data[sel.get_index(0)]).GetTypeMutable() != type;
 	if (reinterpret) {
 		for (idx_t i = 0; i < count; i++) {
 			auto &key = keys[i];
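The one-line fix above addresses a classic selection-vector bug: when a selection vector is active, logical row 0 lives at data[sel.get_index(0)], not data[0]. A minimal sketch of the indirection (hypothetical types):

#include <cstddef>
#include <vector>

int main() {
	// Physical data; suppose a filter selected only the last two values.
	std::vector<int> data = {-1, 42, 7};
	std::vector<std::size_t> sel = {1, 2}; // selection vector

	int wrong = data[0];      // ignores the selection: reads a filtered-out value
	int right = data[sel[0]]; // honors it: logical row 0 is physical row 1

	return (wrong == -1 && right == 42) ? 0 : 1;
}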
package/src/duckdb/src/common/multi_file_reader.cpp

@@ -508,14 +508,14 @@ void MultiFileReader::CreateMapping(const string &file_name,
 	// copy global columns and inject any different defaults
 	CreateColumnMapping(file_name, local_columns, global_columns, global_column_ids, reader_data, bind_data,
 	                    initial_file, global_state);
-	CreateFilterMap(
+	CreateFilterMap(global_column_ids, filters, reader_data, global_state);
 }

-void MultiFileReader::CreateFilterMap(const vector<
+void MultiFileReader::CreateFilterMap(const vector<ColumnIndex> &global_column_ids,
                                       optional_ptr<TableFilterSet> filters, MultiFileReaderData &reader_data,
                                       optional_ptr<MultiFileReaderGlobalState> global_state) {
 	if (filters) {
-		auto filter_map_size =
+		auto filter_map_size = global_column_ids.size();
 		if (global_state) {
 			filter_map_size += global_state->extra_columns.size();
 		}
package/src/duckdb/src/execution/aggregate_hashtable.cpp

@@ -329,7 +329,7 @@ optional_idx GroupedAggregateHashTable::TryAddDictionaryGroups(DataChunk &groups
 	if (dictionary_id.empty()) {
 		// dictionary has no id, we can't cache across vectors
 		// only use dictionary compression if there are fewer entries than groups
-		if (dict_size >= groups.size()
+		if (dict_size * DICTIONARY_THRESHOLD >= groups.size()) {
 			// dictionary is too large - use regular aggregation
 			return optional_idx();
 		}
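The updated condition scales the dictionary size by DICTIONARY_THRESHOLD before comparing against the group count, so the dictionary fast path is only taken when the dictionary is a small fraction of the incoming rows. A sketch of the heuristic; the threshold value below is an assumption for illustration, not taken from the diff:

#include <cstddef>

// Assumed factor; the diff does not show the actual constant.
static constexpr std::size_t DICTIONARY_THRESHOLD = 2;

bool UseDictionaryPath(std::size_t dict_size, std::size_t group_count) {
	// Pre-patch check: dict_size >= group_count rejected the fast path only
	// when the dictionary had at least as many entries as rows.
	// Post-patch: the dictionary must be smaller by a factor of
	// DICTIONARY_THRESHOLD before the fast path pays off.
	return dict_size * DICTIONARY_THRESHOLD < group_count;
}

int main() {
	bool large = UseDictionaryPath(100, 150);  // false: 200 >= 150
	bool small = UseDictionaryPath(100, 2048); // true: 200 < 2048
	return (!large && small) ? 0 : 1;
}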
package/src/duckdb/src/execution/index/art/art.cpp

@@ -1038,9 +1038,11 @@ string ART::GenerateConstraintErrorMessage(VerifyExistenceType verify_type, cons
 	}
 	case VerifyExistenceType::DELETE_FK: {
 		// DELETE_FK that still exists in a FK table, i.e., not a valid delete.
-		return StringUtil::Format(
-
-
+		return StringUtil::Format(
+		    "Violates foreign key constraint because key \"%s\" is still referenced by a foreign "
+		    "key in a different table. If this is an unexpected constraint violation, please refer to our "
+		    "foreign key limitations in the documentation",
+		    key_name);
 	}
 	default:
 		throw NotImplementedException("Type not implemented for VerifyExistenceType");
@@ -1091,16 +1093,27 @@ void ART::VerifyLeaf(const Node &leaf, const ARTKey &key, optional_ptr<ART> dele
 		return;
 	}

+	// Fast path for FOREIGN KEY constraints.
+	// Up to here, the above code paths work implicitly for FKs, as the leaf is inlined.
 	// FIXME: proper foreign key + delete ART support.
-
-
+	if (index_constraint_type == IndexConstraintType::FOREIGN) {
+		D_ASSERT(!deleted_leaf);
+		// We don't handle FK conflicts in UPSERT, so the row ID should not matter.
+		if (manager.AddHit(i, MAX_ROW_ID)) {
+			conflict_idx = i;
+		}
+		return;
+	}

 	// Scan the two row IDs in the leaf.
 	Iterator it(*this);
 	it.FindMinimum(leaf);
 	ARTKey empty_key = ARTKey();
 	unsafe_vector<row_t> row_ids;
-	it.Scan(empty_key, 2, row_ids, false);
+	auto success = it.Scan(empty_key, 2, row_ids, false);
+	if (!success || row_ids.size() != 2) {
+		throw InternalException("VerifyLeaf expects exactly two row IDs to be scanned");
+	}

 	if (!deleted_leaf) {
 		if (manager.AddHit(i, row_ids[0]) || manager.AddHit(i, row_ids[1])) {
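Besides the FK fast path, VerifyLeaf now checks the result of Iterator::Scan instead of discarding it. A sketch of the hardening pattern (hypothetical scan function, not the ART API):

#include <cstddef>
#include <stdexcept>
#include <vector>

// Hypothetical scan: fills row_ids and reports whether it completed.
bool Scan(std::vector<long> &row_ids, std::size_t max_count) {
	row_ids.push_back(1);
	row_ids.push_back(2);
	return row_ids.size() <= max_count;
}

int main() {
	std::vector<long> row_ids;
	auto success = Scan(row_ids, 2);
	// Post-patch behaviour: a partial or failed scan is an internal error,
	// not a silent read of a short vector.
	if (!success || row_ids.size() != 2) {
		throw std::runtime_error("expected exactly two row IDs to be scanned");
	}
	return 0;
}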
package/src/duckdb/src/execution/index/art/iterator.cpp

@@ -46,9 +46,11 @@ bool Iterator::Scan(const ARTKey &upper_bound, const idx_t max_count, unsafe_vec
 	bool has_next;
 	do {
 		// An empty upper bound indicates that no upper bound exists.
-		if (!upper_bound.Empty()
-			if (
-
+		if (!upper_bound.Empty()) {
+			if (status == GateStatus::GATE_NOT_SET || entered_nested_leaf) {
+				if (current_key.GreaterThan(upper_bound, equal, nested_depth)) {
+					return true;
+				}
 			}
 		}

@@ -86,6 +88,7 @@ bool Iterator::Scan(const ARTKey &upper_bound, const idx_t max_count, unsafe_vec
 			throw InternalException("Invalid leaf type for index scan.");
 		}

+		entered_nested_leaf = false;
 		has_next = Next();
 	} while (has_next);
 	return true;
@@ -104,6 +107,7 @@ void Iterator::FindMinimum(const Node &node) {
 	if (node.GetGateStatus() == GateStatus::GATE_SET) {
 		D_ASSERT(status == GateStatus::GATE_NOT_SET);
 		status = GateStatus::GATE_SET;
+		entered_nested_leaf = true;
 		nested_depth = 0;
 	}
package/src/duckdb/src/execution/operator/aggregate/physical_window.cpp

@@ -575,6 +575,11 @@ public:

 	explicit WindowLocalSourceState(WindowGlobalSourceState &gsource);

+	void ReleaseLocalStates() {
+		auto &local_states = window_hash_group->thread_states.at(task->thread_idx);
+		local_states.clear();
+	}
+
 	//! Does the task have more work to do?
 	bool TaskFinished() const {
 		return !task || task->begin_idx == task->end_idx;
@@ -792,6 +797,12 @@ void WindowGlobalSourceState::FinishTask(TaskPtr task) {
 }

 bool WindowLocalSourceState::TryAssignTask() {
+	D_ASSERT(TaskFinished());
+	if (task && task->stage == WindowGroupStage::GETDATA) {
+		// If this state completed the last block in the previous iteration,
+		// release out local state memory.
+		ReleaseLocalStates();
+	}
 	// Because downstream operators may be using our internal buffers,
 	// we can't "finish" a task until we are about to get the next one.

@@ -888,10 +899,6 @@ void WindowLocalSourceState::GetData(DataChunk &result) {
 		++task->begin_idx;
 	}

-	// If that was the last block, release out local state memory.
-	if (TaskFinished()) {
-		local_states.clear();
-	}
 	result.Verify();
 }

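The window changes move the release of thread-local state from the end of GetData to the start of the next TryAssignTask, because downstream operators may still be reading the buffers the finished task produced. A sketch of the deferred-release pattern (hypothetical types):

#include <memory>
#include <vector>

struct LocalStates {
	std::vector<std::unique_ptr<int>> buffers;
};

struct Task {
	bool last_block_done = false;
};

void TryAssignTask(LocalStates &states, const Task &previous) {
	// Safe point: whoever consumed the previous task's output is done with it
	// by the time this state asks for new work, so clearing the buffers here
	// cannot pull memory out from under a downstream operator.
	if (previous.last_block_done) {
		states.buffers.clear();
	}
	// ... fetch and start the next task ...
}

int main() {
	LocalStates states;
	states.buffers.emplace_back(std::make_unique<int>(7));
	Task previous{true};
	TryAssignTask(states, previous);
	return states.buffers.empty() ? 0 : 1;
}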
package/src/duckdb/src/execution/operator/csv_scanner/buffer_manager/csv_buffer.cpp

@@ -4,7 +4,7 @@
 namespace duckdb {

 CSVBuffer::CSVBuffer(ClientContext &context, idx_t buffer_size_p, CSVFileHandle &file_handle,
-                     idx_t &global_csv_current_position, idx_t file_number_p)
+                     const idx_t &global_csv_current_position, idx_t file_number_p)
     : context(context), requested_size(buffer_size_p), file_number(file_number_p), can_seek(file_handle.CanSeek()),
       is_pipe(file_handle.IsPipe()) {
 	AllocateBuffer(buffer_size_p);
@@ -34,7 +34,7 @@ CSVBuffer::CSVBuffer(CSVFileHandle &file_handle, ClientContext &context, idx_t b
 }

 shared_ptr<CSVBuffer> CSVBuffer::Next(CSVFileHandle &file_handle, idx_t buffer_size, idx_t file_number_p,
-                                      bool &has_seaked) {
+                                      bool &has_seaked) const {
 	if (has_seaked) {
 		// This means that at some point a reload was done, and we are currently on the incorrect position in our file
 		// handle
package/src/duckdb/src/execution/operator/csv_scanner/encode/csv_encoder.cpp

@@ -36,7 +36,7 @@ void CSVEncoderBuffer::Reset() {
 	actual_encoded_buffer_size = 0;
 }

-CSVEncoder::CSVEncoder(DBConfig &config, const string &encoding_name_to_find, idx_t buffer_size) {
+CSVEncoder::CSVEncoder(const DBConfig &config, const string &encoding_name_to_find, idx_t buffer_size) {
 	encoding_name = StringUtil::Lower(encoding_name_to_find);
 	auto function = config.GetEncodeFunction(encoding_name_to_find);
 	if (!function) {
@@ -51,6 +51,10 @@ CSVEncoder::CSVEncoder(DBConfig &config, const string &encoding_name_to_find, id
 	}
 	// We ensure that the encoded buffer size is an even number to make the two byte lookup on utf-16 work
 	idx_t encoded_buffer_size = buffer_size % 2 != 0 ? buffer_size - 1 : buffer_size;
+	if (encoded_buffer_size == 0) {
+		// This might happen if buffer size = 1
+		encoded_buffer_size = 2;
+	}
 	D_ASSERT(encoded_buffer_size > 0);
 	encoded_buffer.Initialize(encoded_buffer_size);
 	remaining_bytes_buffer.Initialize(function->GetBytesPerIteration());
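The new branch handles a degenerate buffer size: rounding an odd size down to keep UTF-16 two-byte lookups aligned can produce zero when the requested size is 1. A sketch of the sizing rule:

#include <cassert>
#include <cstddef>

std::size_t EncodedBufferSize(std::size_t buffer_size) {
	// Round odd sizes down so two-byte UTF-16 lookups stay aligned.
	std::size_t size = buffer_size % 2 != 0 ? buffer_size - 1 : buffer_size;
	if (size == 0) {
		size = 2; // a requested size of 1 would otherwise yield an empty buffer
	}
	return size;
}

int main() {
	assert(EncodedBufferSize(8) == 8);
	assert(EncodedBufferSize(9) == 8);
	assert(EncodedBufferSize(1) == 2);
	return 0;
}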
package/src/duckdb/src/execution/operator/csv_scanner/scanner/base_scanner.cpp

@@ -11,9 +11,10 @@ ScannerResult::ScannerResult(CSVStates &states_p, CSVStateMachine &state_machine

 BaseScanner::BaseScanner(shared_ptr<CSVBufferManager> buffer_manager_p, shared_ptr<CSVStateMachine> state_machine_p,
                          shared_ptr<CSVErrorHandler> error_handler_p, bool sniffing_p,
-                         shared_ptr<CSVFileScan> csv_file_scan_p, CSVIterator iterator_p)
+                         shared_ptr<CSVFileScan> csv_file_scan_p, const CSVIterator &iterator_p)
     : csv_file_scan(std::move(csv_file_scan_p)), sniffing(sniffing_p), error_handler(std::move(error_handler_p)),
-      state_machine(std::move(state_machine_p)), buffer_manager(std::move(buffer_manager_p)),
+      state_machine(std::move(state_machine_p)), states(), buffer_manager(std::move(buffer_manager_p)),
+      iterator(iterator_p) {
 	D_ASSERT(buffer_manager);
 	D_ASSERT(state_machine);
 	// Initialize current buffer handle
package/src/duckdb/src/execution/operator/csv_scanner/scanner/csv_schema.cpp

@@ -76,8 +76,8 @@ void CSVSchema::MergeSchemas(CSVSchema &other, bool null_padding) {
 	}
 }

-CSVSchema::CSVSchema(vector<string> &names, vector<LogicalType> &types, const string &file_path,
-                     const bool empty_p)
+CSVSchema::CSVSchema(const vector<string> &names, const vector<LogicalType> &types, const string &file_path,
+                     idx_t rows_read_p, const bool empty_p)
     : rows_read(rows_read_p), empty(empty_p) {
 	Initialize(names, types, file_path);
 }
package/src/duckdb/src/execution/operator/csv_scanner/scanner/scanner_boundary.cpp

@@ -13,7 +13,7 @@ CSVBoundary::CSVBoundary(idx_t buffer_idx_p, idx_t buffer_pos_p, idx_t boundary_
 CSVBoundary::CSVBoundary() : buffer_idx(0), buffer_pos(0), boundary_idx(0), end_pos(NumericLimits<idx_t>::Maximum()) {
 }

-CSVIterator::CSVIterator() : is_set(false) {
+CSVIterator::CSVIterator() : buffer_size(0), is_set(false) {
 }

 void CSVBoundary::Print() const {
package/src/duckdb/src/execution/operator/csv_scanner/scanner/string_value_scanner.cpp

@@ -688,23 +688,29 @@ bool LineError::HandleErrors(StringValueResult &result) {
 			                          line_pos.GetGlobalPosition(result.requested_size), result.path);
 			}
 			break;
-		case CAST_ERROR:
+		case CAST_ERROR: {
+			string column_name;
+			LogicalTypeId type_id;
+			if (cur_error.col_idx < result.names.size()) {
+				column_name = result.names[cur_error.col_idx];
+			}
+			if (cur_error.col_idx < result.number_of_columns) {
+				type_id = result.parse_types[cur_error.chunk_idx].type_id;
+			}
 			if (result.current_line_position.begin == line_pos) {
 				csv_error = CSVError::CastError(
-				    result.state_machine.options,
-
+				    result.state_machine.options, column_name, cur_error.error_message, cur_error.col_idx, borked_line,
+				    lines_per_batch,
 				    result.current_line_position.begin.GetGlobalPosition(result.requested_size, first_nl),
-				    line_pos.GetGlobalPosition(result.requested_size, first_nl),
-				    result.parse_types[cur_error.chunk_idx].type_id, result.path);
+				    line_pos.GetGlobalPosition(result.requested_size, first_nl), type_id, result.path);
 			} else {
 				csv_error = CSVError::CastError(
-				    result.state_machine.options,
-
+				    result.state_machine.options, column_name, cur_error.error_message, cur_error.col_idx, borked_line,
+				    lines_per_batch,
 				    result.current_line_position.begin.GetGlobalPosition(result.requested_size, first_nl),
-				    line_pos.GetGlobalPosition(result.requested_size), result.
-				    result.path);
+				    line_pos.GetGlobalPosition(result.requested_size), type_id, result.path);
 			}
-
+		} break;
 		case MAXIMUM_LINE_SIZE:
 			csv_error = CSVError::LineSizeError(
 			    result.state_machine.options, lines_per_batch, borked_line,
@@ -964,7 +970,8 @@ StringValueScanner::StringValueScanner(idx_t scanner_idx_p, const shared_ptr<CSV
       result(states, *state_machine, cur_buffer_handle, BufferAllocator::Get(buffer_manager->context), result_size,
              iterator.pos.buffer_pos, *error_handler, iterator,
              buffer_manager->context.client_data->debug_set_max_line_length, csv_file_scan, lines_read, sniffing,
-             buffer_manager->GetFilePath(), scanner_idx_p)
+             buffer_manager->GetFilePath(), scanner_idx_p),
+      start_pos(0) {
 	iterator.buffer_size = state_machine->options.buffer_size_option.GetValue();
 }
@@ -976,7 +983,8 @@ StringValueScanner::StringValueScanner(const shared_ptr<CSVBufferManager> &buffe
       result(states, *state_machine, cur_buffer_handle, Allocator::DefaultAllocator(), result_size,
             iterator.pos.buffer_pos, *error_handler, iterator,
             buffer_manager->context.client_data->debug_set_max_line_length, csv_file_scan, lines_read, sniffing,
-            buffer_manager->GetFilePath(), 0)
+            buffer_manager->GetFilePath(), 0),
+      start_pos(0) {
 	iterator.buffer_size = state_machine->options.buffer_size_option.GetValue();
 }
package/src/duckdb/src/execution/operator/csv_scanner/sniffer/dialect_detection.cpp

@@ -397,7 +397,13 @@ void CSVSniffer::AnalyzeDialectCandidate(unique_ptr<ColumnCountScanner> scanner,
 			}
 		}
 	}
-	if (max_columns_found == num_cols && ignored_rows > min_ignored_rows) {
+	if (max_columns_found == num_cols && (ignored_rows > min_ignored_rows)) {
+		return;
+	}
+	if (max_columns_found > 1 && num_cols > max_columns_found && consistent_rows < best_consistent_rows / 2 &&
+	    options.null_padding) {
+		// When null_padding is true, we only give preference to a max number of columns if null padding is at least
+		// 50% as consistent as the best case scenario
 		return;
 	}
 	if (quoted && num_cols < max_columns_found) {
@@ -436,28 +442,19 @@ void CSVSniffer::AnalyzeDialectCandidate(unique_ptr<ColumnCountScanner> scanner,
 	    !require_more_padding && !invalid_padding && num_cols == max_columns_found && comments_are_acceptable) {
 		auto &sniffing_state_machine = scanner->GetStateMachine();

-
-
-
-
-
-		}
-	}
-	if (!same_quote_is_candidate) {
-		if (options.dialect_options.skip_rows.IsSetByUser()) {
-			// If skip rows is set by user, and we found dirty notes, we only accept it if either null_padding or
-			// ignore_errors is set
-			if (dirty_notes != 0 && !options.null_padding && !options.ignore_errors.GetValue()) {
-				return;
-			}
-			sniffing_state_machine.dialect_options.skip_rows = options.dialect_options.skip_rows.GetValue();
-		} else if (!options.null_padding) {
-			sniffing_state_machine.dialect_options.skip_rows = dirty_notes;
+		if (options.dialect_options.skip_rows.IsSetByUser()) {
+			// If skip rows is set by user, and we found dirty notes, we only accept it if either null_padding or
+			// ignore_errors is set
+			if (dirty_notes != 0 && !options.null_padding && !options.ignore_errors.GetValue()) {
+				return;
 			}
-			sniffing_state_machine.dialect_options.
-
-
+			sniffing_state_machine.dialect_options.skip_rows = options.dialect_options.skip_rows.GetValue();
+		} else if (!options.null_padding) {
+			sniffing_state_machine.dialect_options.skip_rows = dirty_notes;
 		}
+		sniffing_state_machine.dialect_options.num_cols = num_cols;
+		lines_sniffed = sniffed_column_counts.result_position;
+		candidates.emplace_back(std::move(scanner));
 	}
 }
@@ -491,7 +488,7 @@ void CSVSniffer::RefineCandidates() {

 	for (idx_t i = 1; i <= options.sample_size_chunks; i++) {
 		vector<unique_ptr<ColumnCountScanner>> successful_candidates;
-		bool done =
+		bool done = candidates.empty();
 		for (auto &cur_candidate : candidates) {
 			const bool finished_file = cur_candidate->FinishedFile();
 			if (successful_candidates.empty()) {
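The first hunk adds a tie-break for null_padding: a candidate dialect that finds more columns is rejected once its consistent-row count falls below half of the best candidate's. A sketch of that predicate:

#include <cstddef>

// Mirrors the rejection predicate added in the first hunk above.
bool RejectWiderCandidate(std::size_t num_cols, std::size_t max_columns_found,
                          std::size_t consistent_rows, std::size_t best_consistent_rows,
                          bool null_padding) {
	// With null_padding, a wider dialect only wins if it stays at least
	// 50% as consistent as the best candidate seen so far.
	return max_columns_found > 1 && num_cols > max_columns_found &&
	       consistent_rows < best_consistent_rows / 2 && null_padding;
}

int main() {
	// A 3-column candidate with 40 consistent rows vs. a best of 100: rejected.
	bool rejected = RejectWiderCandidate(3, 2, 40, 100, true);
	// At 60 consistent rows it clears the 50% bar and is kept.
	bool kept = !RejectWiderCandidate(3, 2, 60, 100, true);
	return (rejected && kept) ? 0 : 1;
}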
package/src/duckdb/src/execution/operator/csv_scanner/sniffer/type_refinement.cpp

@@ -2,7 +2,7 @@
 #include "duckdb/execution/operator/csv_scanner/csv_casting.hpp"

 namespace duckdb {
-bool CSVSniffer::TryCastVector(Vector &parse_chunk_col, idx_t size, const LogicalType &sql_type) {
+bool CSVSniffer::TryCastVector(Vector &parse_chunk_col, idx_t size, const LogicalType &sql_type) const {
 	auto &sniffing_state_machine = best_candidate->GetStateMachine();
 	// try vector-cast from string to sql_type
 	Vector dummy_result(sql_type, size);
package/src/duckdb/src/execution/operator/csv_scanner/util/csv_error.cpp

@@ -303,6 +303,7 @@ CSVError CSVError::CastError(const CSVReaderOptions &options, const string &colu
 		              "correctly parse this column."
 		           << '\n';
 	}
+	how_to_fix_it << "* Check whether the null string value is set correctly (e.g., nullstr = 'N/A')" << '\n';

 	return CSVError(error.str(), CAST_ERROR, column_idx, csv_row, error_info, row_byte_position, byte_position, options,
 	                how_to_fix_it.str(), current_path);
package/src/duckdb/src/execution/operator/csv_scanner/util/csv_reader_options.cpp

@@ -251,6 +251,10 @@ void CSVReaderOptions::SetReadOption(const string &loption, const Value &value,
 			throw BinderException("Invalid value for MAX_LINE_SIZE parameter: it cannot be smaller than 0");
 		}
 		maximum_line_size.Set(NumericCast<idx_t>(line_size));
+		if (buffer_size_option.IsSetByUser() && maximum_line_size.GetValue() > buffer_size_option.GetValue()) {
+			throw InvalidInputException("Buffer Size of %d must be a higher value than the maximum line size %d",
+			                            buffer_size_option.GetValue(), maximum_line_size.GetValue());
+		}
 	} else if (loption == "date_format" || loption == "dateformat") {
 		string format = ParseString(value, loption);
 		SetDateFormat(LogicalTypeId::DATE, format, true);
@@ -264,6 +268,12 @@ void CSVReaderOptions::SetReadOption(const string &loption, const Value &value,
 		if (buffer_size_option == 0) {
 			throw InvalidInputException("Buffer Size option must be higher than 0");
 		}
+		if (maximum_line_size.IsSetByUser() && maximum_line_size.GetValue() > buffer_size_option.GetValue()) {
+			throw InvalidInputException("Buffer Size of %d must be a higher value than the maximum line size %d",
+			                            buffer_size_option.GetValue(), maximum_line_size.GetValue());
+		} else {
+			maximum_line_size.Set(buffer_size_option.GetValue(), false);
+		}
 	} else if (loption == "decimal_separator") {
 		decimal_separator = ParseString(value, loption);
 		if (decimal_separator != "." && decimal_separator != ",") {
@@ -298,6 +308,9 @@ void CSVReaderOptions::SetReadOption(const string &loption, const Value &value,
 		if (table_name.empty()) {
 			throw BinderException("REJECTS_TABLE option cannot be empty");
 		}
+		if (KeywordHelper::RequiresQuotes(table_name)) {
+			throw BinderException("rejects_scan option: %s requires quotes to be used as an identifier", table_name);
+		}
 		rejects_table_name.Set(table_name);
 	} else if (loption == "rejects_scan") {
 		// skip, handled in SetRejectsOptions
@@ -305,6 +318,9 @@ void CSVReaderOptions::SetReadOption(const string &loption, const Value &value,
 		if (table_name.empty()) {
 			throw BinderException("rejects_scan option cannot be empty");
 		}
+		if (KeywordHelper::RequiresQuotes(table_name)) {
+			throw BinderException("rejects_scan option: %s requires quotes to be used as an identifier", table_name);
+		}
 		rejects_scan_name.Set(table_name);
 	} else if (loption == "rejects_limit") {
 		auto limit = ParseInteger(value, loption);
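The max_line_size and buffer_size hunks validate the pair in both directions, so whichever option is supplied second triggers the check. A sketch of the symmetric validation (hypothetical option holders, not DuckDB's CSVOption type):

#include <stdexcept>

// Hypothetical stand-in for an option that remembers whether the user set it.
struct Option {
	bool set_by_user = false;
	long value = 0;
};

void ValidatePair(const Option &buffer_size, const Option &max_line_size) {
	// Called from both option setters, so the error fires no matter which
	// option the user supplies second.
	if (buffer_size.set_by_user && max_line_size.set_by_user &&
	    max_line_size.value > buffer_size.value) {
		throw std::invalid_argument("buffer size must be at least the maximum line size");
	}
}

int main() {
	Option buffer{true, 1024};
	Option line{true, 4096};
	try {
		ValidatePair(buffer, line); // line limit exceeds buffer: throws
	} catch (const std::invalid_argument &) {
		return 0;
	}
	return 1;
}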
package/src/duckdb/src/execution/operator/helper/physical_streaming_sample.cpp

@@ -5,10 +5,11 @@

 namespace duckdb {

-PhysicalStreamingSample::PhysicalStreamingSample(vector<LogicalType> types,
-
-    : PhysicalOperator(PhysicalOperatorType::STREAMING_SAMPLE, std::move(types), estimated_cardinality),
-
+PhysicalStreamingSample::PhysicalStreamingSample(vector<LogicalType> types, unique_ptr<SampleOptions> options,
+                                                 idx_t estimated_cardinality)
+    : PhysicalOperator(PhysicalOperatorType::STREAMING_SAMPLE, std::move(types), estimated_cardinality),
+      sample_options(std::move(options)) {
+	percentage = sample_options->sample_size.GetValue<double>() / 100;
 }

 //===--------------------------------------------------------------------===//
@@ -49,13 +50,21 @@ void PhysicalStreamingSample::BernoulliSample(DataChunk &input, DataChunk &resul
 	}
 }

+bool PhysicalStreamingSample::ParallelOperator() const {
+	return !(sample_options->repeatable || sample_options->seed.IsValid());
+}
+
 unique_ptr<OperatorState> PhysicalStreamingSample::GetOperatorState(ExecutionContext &context) const {
-
+	if (!ParallelOperator()) {
+		return make_uniq<StreamingSampleOperatorState>(static_cast<int64_t>(sample_options->seed.GetIndex()));
+	}
+	RandomEngine random;
+	return make_uniq<StreamingSampleOperatorState>(static_cast<int64_t>(random.NextRandomInteger64()));
 }

 OperatorResultType PhysicalStreamingSample::Execute(ExecutionContext &context, DataChunk &input, DataChunk &chunk,
                                                     GlobalOperatorState &gstate, OperatorState &state) const {
-	switch (method) {
+	switch (sample_options->method) {
 	case SampleMethod::BERNOULLI_SAMPLE:
 		BernoulliSample(input, chunk, state);
 		break;
@@ -70,7 +79,7 @@ OperatorResultType PhysicalStreamingSample::Execute(ExecutionContext &context, D

 InsertionOrderPreservingMap<string> PhysicalStreamingSample::ParamsToString() const {
 	InsertionOrderPreservingMap<string> result;
-	result["Sample Method"] = EnumUtil::ToString(method) + ": " + to_string(100 * percentage) + "%";
+	result["Sample Method"] = EnumUtil::ToString(sample_options->method) + ": " + to_string(100 * percentage) + "%";
 	return result;
 }
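The streaming-sample changes tie parallelism to determinism: a repeatable or user-seeded sample must run with one fixed seed, while an unseeded sample lets each operator state draw a fresh seed. A sketch of the rule (hypothetical option struct, not DuckDB's SampleOptions):

#include <optional>
#include <random>

struct SampleOptions {
	bool repeatable = false;
	std::optional<long> seed;
};

// Mirrors PhysicalStreamingSample::ParallelOperator above: only an unseeded,
// non-repeatable sample may run in parallel.
bool ParallelOperator(const SampleOptions &options) {
	return !(options.repeatable || options.seed.has_value());
}

long OperatorSeed(const SampleOptions &options) {
	if (options.seed.has_value()) {
		return *options.seed; // deterministic across runs
	}
	std::random_device rd;
	return static_cast<long>(rd()); // fresh seed per operator state
}

int main() {
	SampleOptions seeded;
	seeded.seed = 42;
	SampleOptions unseeded;
	bool ok = !ParallelOperator(seeded) && ParallelOperator(unseeded) &&
	          OperatorSeed(seeded) == 42;
	return ok ? 0 : 1;
}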
package/src/duckdb/src/execution/operator/persistent/physical_batch_insert.cpp

@@ -215,7 +215,9 @@ public:
 		auto &gstate = gstate_p.Cast<BatchInsertGlobalState>();
 		auto &lstate = lstate_p.Cast<BatchInsertLocalState>();
 		// merge together the collections
-
+		if (!lstate.writer) {
+			lstate.writer = &gstate.table.GetStorage().CreateOptimisticWriter(context);
+		}
 		auto final_collection = gstate.MergeCollections(context, std::move(merge_collections), *lstate.writer);
 		// add the merged-together collection to the set of batch indexes
 		lock_guard<mutex> l(gstate.lock);
package/src/duckdb/src/execution/operator/scan/physical_table_scan.cpp

@@ -108,7 +108,17 @@ SourceResultType PhysicalTableScan::GetData(ExecutionContext &context, DataChunk
 	if (g_state.in_out_final) {
 		function.in_out_function_final(context, data, chunk);
 	}
-	function.in_out_function(context, data, g_state.input_chunk, chunk)
+	switch (function.in_out_function(context, data, g_state.input_chunk, chunk)) {
+	case OperatorResultType::BLOCKED: {
+		auto guard = g_state.Lock();
+		return g_state.BlockSource(guard, input.interrupt_state);
+	}
+	default:
+		// FIXME: Handling for other cases (such as NEED_MORE_INPUT) breaks current functionality and extensions that
+		// might be relying on current behaviour. Needs a rework that is not in scope
+		break;
+	}
+
 	if (chunk.size() == 0 && function.in_out_function_final) {
 		function.in_out_function_final(context, data, chunk);
 		g_state.in_out_final = true;