duckdb 1.4.3-dev0.0 → 1.4.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/src/duckdb/extension/core_functions/aggregate/holistic/approximate_quantile.cpp +1 -1
- package/src/duckdb/extension/icu/icu_extension.cpp +14 -5
- package/src/duckdb/extension/parquet/column_writer.cpp +4 -4
- package/src/duckdb/extension/parquet/include/writer/templated_column_writer.hpp +12 -4
- package/src/duckdb/src/common/encryption_key_manager.cpp +4 -0
- package/src/duckdb/src/common/local_file_system.cpp +23 -0
- package/src/duckdb/src/common/types/column/column_data_collection.cpp +6 -0
- package/src/duckdb/src/common/types/conflict_manager.cpp +1 -1
- package/src/duckdb/src/execution/index/art/base_node.cpp +3 -1
- package/src/duckdb/src/execution/index/art/prefix.cpp +5 -8
- package/src/duckdb/src/execution/index/bound_index.cpp +68 -25
- package/src/duckdb/src/execution/index/unbound_index.cpp +21 -10
- package/src/duckdb/src/execution/operator/csv_scanner/scanner/base_scanner.cpp +4 -0
- package/src/duckdb/src/execution/operator/csv_scanner/scanner/string_value_scanner.cpp +36 -28
- package/src/duckdb/src/execution/operator/csv_scanner/sniffer/csv_sniffer.cpp +3 -2
- package/src/duckdb/src/execution/operator/csv_scanner/sniffer/type_detection.cpp +12 -6
- package/src/duckdb/src/execution/operator/scan/physical_positional_scan.cpp +8 -4
- package/src/duckdb/src/execution/operator/scan/physical_table_scan.cpp +1 -1
- package/src/duckdb/src/execution/physical_plan/plan_aggregate.cpp +4 -3
- package/src/duckdb/src/execution/physical_plan/plan_distinct.cpp +3 -2
- package/src/duckdb/src/execution/physical_plan/plan_filter.cpp +0 -1
- package/src/duckdb/src/execution/physical_plan/plan_window.cpp +6 -8
- package/src/duckdb/src/function/aggregate/sorted_aggregate_function.cpp +4 -3
- package/src/duckdb/src/function/macro_function.cpp +20 -2
- package/src/duckdb/src/function/table/system/duckdb_log.cpp +3 -0
- package/src/duckdb/src/function/table/system/test_all_types.cpp +26 -13
- package/src/duckdb/src/function/table/table_scan.cpp +72 -38
- package/src/duckdb/src/function/table/version/pragma_version.cpp +3 -3
- package/src/duckdb/src/function/table_function.cpp +24 -0
- package/src/duckdb/src/include/duckdb/common/encryption_key_manager.hpp +1 -0
- package/src/duckdb/src/include/duckdb/common/limits.hpp +4 -2
- package/src/duckdb/src/include/duckdb/common/local_file_system.hpp +2 -0
- package/src/duckdb/src/include/duckdb/common/types/row/block_iterator.hpp +2 -0
- package/src/duckdb/src/include/duckdb/execution/index/art/art_operator.hpp +2 -0
- package/src/duckdb/src/include/duckdb/execution/index/art/prefix.hpp +2 -2
- package/src/duckdb/src/include/duckdb/execution/index/bound_index.hpp +1 -1
- package/src/duckdb/src/include/duckdb/execution/index/unbound_index.hpp +41 -7
- package/src/duckdb/src/include/duckdb/execution/operator/csv_scanner/base_scanner.hpp +15 -1
- package/src/duckdb/src/include/duckdb/execution/operator/csv_scanner/sniffer/csv_sniffer.hpp +1 -0
- package/src/duckdb/src/include/duckdb/execution/operator/csv_scanner/string_value_scanner.hpp +2 -1
- package/src/duckdb/src/include/duckdb/execution/physical_plan_generator.hpp +3 -1
- package/src/duckdb/src/include/duckdb/function/function_binder.hpp +2 -1
- package/src/duckdb/src/include/duckdb/function/table_function.hpp +2 -0
- package/src/duckdb/src/include/duckdb/main/db_instance_cache.hpp +5 -0
- package/src/duckdb/src/include/duckdb/main/extension_entries.hpp +2 -0
- package/src/duckdb/src/include/duckdb/optimizer/filter_combiner.hpp +1 -0
- package/src/duckdb/src/include/duckdb/optimizer/join_order/relation_manager.hpp +4 -4
- package/src/duckdb/src/include/duckdb/optimizer/rule/ordered_aggregate_optimizer.hpp +3 -1
- package/src/duckdb/src/include/duckdb/parser/parsed_data/sample_options.hpp +3 -0
- package/src/duckdb/src/include/duckdb/planner/binder.hpp +1 -1
- package/src/duckdb/src/include/duckdb/planner/bound_result_modifier.hpp +4 -2
- package/src/duckdb/src/include/duckdb/planner/expression_binder.hpp +1 -2
- package/src/duckdb/src/include/duckdb/planner/subquery/flatten_dependent_join.hpp +1 -1
- package/src/duckdb/src/include/duckdb/planner/subquery/rewrite_cte_scan.hpp +3 -1
- package/src/duckdb/src/include/duckdb/storage/table/chunk_info.hpp +3 -3
- package/src/duckdb/src/include/duckdb/storage/table/row_group.hpp +2 -6
- package/src/duckdb/src/include/duckdb/storage/table/row_version_manager.hpp +4 -1
- package/src/duckdb/src/include/duckdb/storage/table/validity_column_data.hpp +2 -0
- package/src/duckdb/src/logging/log_storage.cpp +17 -23
- package/src/duckdb/src/main/capi/duckdb-c.cpp +1 -1
- package/src/duckdb/src/main/connection.cpp +0 -5
- package/src/duckdb/src/main/database_manager.cpp +12 -9
- package/src/duckdb/src/main/db_instance_cache.cpp +15 -1
- package/src/duckdb/src/main/extension/extension_alias.cpp +1 -0
- package/src/duckdb/src/optimizer/filter_combiner.cpp +38 -4
- package/src/duckdb/src/optimizer/join_order/relation_manager.cpp +15 -15
- package/src/duckdb/src/optimizer/late_materialization.cpp +5 -0
- package/src/duckdb/src/optimizer/rule/ordered_aggregate_optimizer.cpp +6 -3
- package/src/duckdb/src/parser/transform/helpers/transform_sample.cpp +3 -2
- package/src/duckdb/src/planner/binder/expression/bind_star_expression.cpp +1 -1
- package/src/duckdb/src/planner/binder/query_node/plan_select_node.cpp +1 -1
- package/src/duckdb/src/planner/binder/statement/bind_copy.cpp +4 -1
- package/src/duckdb/src/planner/binder/statement/bind_insert.cpp +17 -10
- package/src/duckdb/src/planner/binder.cpp +3 -3
- package/src/duckdb/src/planner/bound_result_modifier.cpp +22 -5
- package/src/duckdb/src/planner/expression/bound_function_expression.cpp +4 -1
- package/src/duckdb/src/planner/expression_binder/constant_binder.cpp +1 -1
- package/src/duckdb/src/planner/expression_binder.cpp +1 -2
- package/src/duckdb/src/planner/subquery/flatten_dependent_join.cpp +57 -24
- package/src/duckdb/src/planner/subquery/rewrite_cte_scan.cpp +5 -3
- package/src/duckdb/src/storage/checkpoint/table_data_writer.cpp +9 -0
- package/src/duckdb/src/storage/storage_info.cpp +2 -0
- package/src/duckdb/src/storage/table/chunk_info.cpp +3 -3
- package/src/duckdb/src/storage/table/column_data.cpp +5 -1
- package/src/duckdb/src/storage/table/column_data_checkpointer.cpp +1 -1
- package/src/duckdb/src/storage/table/column_segment.cpp +3 -1
- package/src/duckdb/src/storage/table/row_group.cpp +6 -8
- package/src/duckdb/src/storage/table/row_group_collection.cpp +41 -1
- package/src/duckdb/src/storage/table/row_version_manager.cpp +37 -23
- package/src/duckdb/src/storage/table/standard_column_data.cpp +5 -5
- package/src/duckdb/src/storage/table/validity_column_data.cpp +17 -0
package/package.json
CHANGED
package/src/duckdb/extension/core_functions/aggregate/holistic/approximate_quantile.cpp
CHANGED

@@ -355,11 +355,11 @@ AggregateFunction GetApproxQuantileListAggregateFunction(const LogicalType &type
 		return GetTypedApproxQuantileListAggregateFunction<int16_t, int16_t>(type);
 	case LogicalTypeId::INTEGER:
 	case LogicalTypeId::DATE:
-	case LogicalTypeId::TIME:
 		return GetTypedApproxQuantileListAggregateFunction<int32_t, int32_t>(type);
 	case LogicalTypeId::BIGINT:
 	case LogicalTypeId::TIMESTAMP:
 	case LogicalTypeId::TIMESTAMP_TZ:
+	case LogicalTypeId::TIME:
 		return GetTypedApproxQuantileListAggregateFunction<int64_t, int64_t>(type);
 	case LogicalTypeId::TIME_TZ:
 		// Not binary comparable
package/src/duckdb/extension/icu/icu_extension.cpp
CHANGED

@@ -230,8 +230,16 @@ static string NormalizeTimeZone(const string &tz_str) {
 		}
 
 		idx_t pos = 3;
-		const auto
-
+		const auto utc = tz_str[pos++];
+		// Invert the sign (UTC and Etc use opposite sign conventions)
+		// https://en.wikipedia.org/wiki/Tz_database#Area
+		auto sign = utc;
+		if (utc == '+') {
+			sign = '-';
+			;
+		} else if (utc == '-') {
+			sign = '+';
+		} else {
 			break;
 		}
 
@@ -424,12 +432,13 @@ static void LoadInternal(ExtensionLoader &loader) {
 	auto locales = icu::Collator::getAvailableLocales(count);
 	for (int32_t i = 0; i < count; i++) {
 		string collation;
-
+		const auto &locale = locales[i]; // NOLINT
+		if (string(locale.getCountry()).empty()) {
 			// language only
-			collation =
+			collation = locale.getLanguage();
 		} else {
 			// language + country
-			collation =
+			collation = locale.getLanguage() + string("_") + locale.getCountry();
 		}
 		collation = StringUtil::Lower(collation);
 
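The NormalizeTimeZone hunk maps an offset spelled in UTC notation onto IANA's Etc area, whose zone names use the opposite sign convention ("UTC+8" corresponds to "Etc/GMT-8"). A self-contained sketch of that sign inversion, not DuckDB's actual helper; the function name and string handling here are illustrative:

```cpp
#include <iostream>
#include <string>

// Map "UTC+N" / "UTC-N" onto the equivalent IANA "Etc/GMT" name with the
// sign flipped; IANA's Etc area deliberately inverts the sign convention:
// https://en.wikipedia.org/wiki/Tz_database#Area
std::string NormalizeUtcOffset(const std::string &tz) {
	if (tz.size() < 5 || tz.compare(0, 3, "UTC") != 0) {
		return tz; // not an offset-style name, leave unchanged
	}
	const char sign = tz[3];
	if (sign != '+' && sign != '-') {
		return tz;
	}
	const char inverted = sign == '+' ? '-' : '+';
	return std::string("Etc/GMT") + inverted + tz.substr(4);
}

int main() {
	std::cout << NormalizeUtcOffset("UTC+8") << '\n';  // prints Etc/GMT-8
	std::cout << NormalizeUtcOffset("UTC-05") << '\n'; // prints Etc/GMT+05
}
```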
package/src/duckdb/extension/parquet/column_writer.cpp
CHANGED

@@ -534,10 +534,10 @@ ColumnWriter::CreateWriterRecursive(ClientContext &context, ParquetWriter &write
 template <>
 struct NumericLimits<float_na_equal> {
 	static constexpr float Minimum() {
-		return
+		return NumericLimits<float>::Minimum();
 	};
 	static constexpr float Maximum() {
-		return
+		return NumericLimits<float>::Maximum();
 	};
 	static constexpr bool IsSigned() {
 		return std::is_signed<float>::value;
@@ -550,10 +550,10 @@ struct NumericLimits<float_na_equal> {
 template <>
 struct NumericLimits<double_na_equal> {
 	static constexpr double Minimum() {
-		return
+		return NumericLimits<double>::Minimum();
 	};
 	static constexpr double Maximum() {
-		return
+		return NumericLimits<double>::Maximum();
 	};
 	static constexpr bool IsSigned() {
 		return std::is_signed<double>::value;
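Both repaired bodies now delegate to the limits of the underlying type rather than restating the constants. A generic sketch of that traits-delegation pattern; FloatNaEqualSketch and LimitsSketch are hypothetical stand-ins for DuckDB's float_na_equal and NumericLimits:

```cpp
#include <limits>

// Hypothetical wrapper type: same representation as float, different equality.
struct FloatNaEqualSketch {
	float val;
};

template <class T>
struct LimitsSketch {
	static constexpr T Minimum() {
		return std::numeric_limits<T>::lowest();
	}
	static constexpr T Maximum() {
		return std::numeric_limits<T>::max();
	}
};

// The specialization for the wrapper delegates to the wrapped type's limits,
// so the two can never drift apart.
template <>
struct LimitsSketch<FloatNaEqualSketch> {
	static constexpr float Minimum() {
		return LimitsSketch<float>::Minimum();
	}
	static constexpr float Maximum() {
		return LimitsSketch<float>::Maximum();
	}
};

static_assert(LimitsSketch<FloatNaEqualSketch>::Maximum() == LimitsSketch<float>::Maximum(), "limits must agree");
```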
package/src/duckdb/extension/parquet/include/writer/templated_column_writer.hpp
CHANGED

@@ -126,7 +126,8 @@ public:
 public:
 	unique_ptr<ColumnWriterState> InitializeWriteState(duckdb_parquet::RowGroup &row_group) override {
 		auto result = make_uniq<StandardColumnWriterState<SRC, TGT, OP>>(writer, row_group, row_group.columns.size());
-		result->encoding = duckdb_parquet::Encoding::
+		result->encoding = writer.GetParquetVersion() == ParquetVersion::V1 ? duckdb_parquet::Encoding::PLAIN_DICTIONARY
+		                                                                    : duckdb_parquet::Encoding::RLE_DICTIONARY;
 		RegisterToRowGroup(row_group);
 		return std::move(result);
 	}
@@ -150,6 +151,8 @@ public:
 		}
 		page_state.dbp_encoder.FinishWrite(temp_writer);
 		break;
+	case duckdb_parquet::Encoding::PLAIN_DICTIONARY:
+		// PLAIN_DICTIONARY can be treated the same as RLE_DICTIONARY
 	case duckdb_parquet::Encoding::RLE_DICTIONARY:
 		D_ASSERT(page_state.dict_bit_width != 0);
 		if (!page_state.dict_written_value) {
@@ -265,7 +268,8 @@ public:
 
 	bool HasDictionary(PrimitiveColumnWriterState &state_p) override {
 		auto &state = state_p.Cast<StandardColumnWriterState<SRC, TGT, OP>>();
-		return state.encoding == duckdb_parquet::Encoding::RLE_DICTIONARY
+		return state.encoding == duckdb_parquet::Encoding::RLE_DICTIONARY ||
+		       state.encoding == duckdb_parquet::Encoding::PLAIN_DICTIONARY;
 	}
 
 	idx_t DictionarySize(PrimitiveColumnWriterState &state_p) override {
@@ -285,7 +289,8 @@ public:
 
 	void FlushDictionary(PrimitiveColumnWriterState &state_p, ColumnWriterStatistics *stats) override {
 		auto &state = state_p.Cast<StandardColumnWriterState<SRC, TGT, OP>>();
-		D_ASSERT(state.encoding == duckdb_parquet::Encoding::RLE_DICTIONARY
+		D_ASSERT(state.encoding == duckdb_parquet::Encoding::RLE_DICTIONARY ||
+		         state.encoding == duckdb_parquet::Encoding::PLAIN_DICTIONARY);
 
 		if (writer.EnableBloomFilters()) {
 			state.bloom_filter =
@@ -310,7 +315,8 @@ public:
 	idx_t GetRowSize(const Vector &vector, const idx_t index,
 	                 const PrimitiveColumnWriterState &state_p) const override {
 		auto &state = state_p.Cast<StandardColumnWriterState<SRC, TGT, OP>>();
-		if (state.encoding == duckdb_parquet::Encoding::RLE_DICTIONARY
+		if (state.encoding == duckdb_parquet::Encoding::RLE_DICTIONARY ||
+		    state.encoding == duckdb_parquet::Encoding::PLAIN_DICTIONARY) {
 			return (state.key_bit_width + 7) / 8;
 		} else {
 			return OP::template GetRowSize<SRC, TGT>(vector, index);
@@ -328,6 +334,8 @@ private:
 		const auto *data_ptr = FlatVector::GetData<SRC>(input_column);
 
 		switch (page_state.encoding) {
+		case duckdb_parquet::Encoding::PLAIN_DICTIONARY:
+			// PLAIN_DICTIONARY can be treated the same as RLE_DICTIONARY
 		case duckdb_parquet::Encoding::RLE_DICTIONARY: {
 			idx_t r = chunk_start;
 			if (!page_state.dict_written_value) {
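The theme repeated across these hunks: PLAIN_DICTIONARY is the Parquet V1 name for dictionary pages and RLE_DICTIONARY the V2 name, and every site that previously tested for the V2 label now accepts either, usually via a bare case fallthrough. A minimal sketch of that dispatch with an illustrative enum (not duckdb_parquet's generated one):

```cpp
// Illustrative subset of Parquet encodings.
enum class Encoding { PLAIN, DELTA_BINARY_PACKED, PLAIN_DICTIONARY, RLE_DICTIONARY };

bool UsesDictionary(Encoding encoding) {
	switch (encoding) {
	case Encoding::PLAIN_DICTIONARY: // V1 spelling; falls through to the V2 case
	case Encoding::RLE_DICTIONARY:
		return true;
	default:
		return false;
	}
}
```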
package/src/duckdb/src/common/encryption_key_manager.cpp
CHANGED

@@ -72,21 +72,25 @@ string EncryptionKeyManager::GenerateRandomKeyID() {
 }
 
 void EncryptionKeyManager::AddKey(const string &key_name, data_ptr_t key) {
+	lock_guard<mutex> guard(lock);
 	derived_keys.emplace(key_name, EncryptionKey(key));
 	// Zero-out the encryption key
 	duckdb_mbedtls::MbedTlsWrapper::AESStateMBEDTLS::SecureClearData(key, DERIVED_KEY_LENGTH);
 }
 
 bool EncryptionKeyManager::HasKey(const string &key_name) const {
+	lock_guard<mutex> guard(lock);
 	return derived_keys.find(key_name) != derived_keys.end();
 }
 
 const_data_ptr_t EncryptionKeyManager::GetKey(const string &key_name) const {
 	D_ASSERT(HasKey(key_name));
+	lock_guard<mutex> guard(lock);
 	return derived_keys.at(key_name).GetPtr();
 }
 
 void EncryptionKeyManager::DeleteKey(const string &key_name) {
+	lock_guard<mutex> guard(lock);
 	derived_keys.erase(key_name);
 }
 
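Every accessor now serializes on the same lock, including the const ones, which only compiles if the mutex member is mutable (the companion header change adds exactly one line). A self-contained sketch of the pattern; KeyStoreSketch and its members are stand-ins, and the member name lock is assumed from the diff:

```cpp
#include <mutex>
#include <string>
#include <unordered_map>

class KeyStoreSketch {
public:
	void AddKey(const std::string &name, std::string key) {
		std::lock_guard<std::mutex> guard(lock);
		keys.emplace(name, std::move(key));
	}
	bool HasKey(const std::string &name) const {
		// locking inside a const method requires the mutex to be mutable
		std::lock_guard<std::mutex> guard(lock);
		return keys.find(name) != keys.end();
	}
	void DeleteKey(const std::string &name) {
		std::lock_guard<std::mutex> guard(lock);
		keys.erase(name);
	}

private:
	mutable std::mutex lock;
	std::unordered_map<std::string, std::string> keys;
};
```

Note that in the real GetKey the D_ASSERT(HasKey(...)) runs before the guard is taken; since HasKey locks the same non-recursive mutex, asserting after acquiring would self-deadlock in debug builds.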
package/src/duckdb/src/common/local_file_system.cpp
CHANGED

@@ -1283,6 +1283,29 @@ bool LocalFileSystem::OnDiskFile(FileHandle &handle) {
 	return true;
 }
 
+string LocalFileSystem::GetVersionTag(FileHandle &handle) {
+	// TODO: Fix using FileSystem::Stats for v1.5, which should also fix it for Windows
+#ifdef _WIN32
+	return "";
+#else
+	int fd = handle.Cast<UnixFileHandle>().fd;
+	struct stat s;
+	if (fstat(fd, &s) == -1) {
+		throw IOException("Failed to get file size for file \"%s\": %s", {{"errno", std::to_string(errno)}},
+		                  handle.path, strerror(errno));
+	}
+
+	// dev/ino should be enough, but to guard against in-place writes we also add file size and modification time
+	uint64_t version_tag[4];
+	Store(NumericCast<uint64_t>(s.st_dev), data_ptr_cast(&version_tag[0]));
+	Store(NumericCast<uint64_t>(s.st_ino), data_ptr_cast(&version_tag[1]));
+	Store(NumericCast<uint64_t>(s.st_size), data_ptr_cast(&version_tag[2]));
+	Store(Timestamp::FromEpochSeconds(s.st_mtime).value, data_ptr_cast(&version_tag[3]));
+
+	return string(char_ptr_cast(version_tag), sizeof(uint64_t) * 4);
+#endif
+}
+
 void LocalFileSystem::Seek(FileHandle &handle, idx_t location) {
 	if (!CanSeek()) {
 		throw IOException("Cannot seek in files of this type");
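GetVersionTag packs (device, inode, size, mtime) into an opaque byte string so callers can cheaply detect that a file changed between opens. A POSIX-only sketch of the same idea, with an illustrative layout rather than DuckDB's exact Store-based one:

```cpp
#include <cstdint>
#include <string>
#include <sys/stat.h> // POSIX fstat

// Build an opaque version tag for an open file descriptor. Two tags for the
// same path compare equal only if device, inode, size, and mtime all match,
// which also catches in-place rewrites that keep the same inode.
std::string VersionTagForFd(int fd) {
	struct stat s;
	if (fstat(fd, &s) != 0) {
		return {}; // empty tag: versioning unavailable
	}
	const uint64_t parts[4] = {static_cast<uint64_t>(s.st_dev), static_cast<uint64_t>(s.st_ino),
	                           static_cast<uint64_t>(s.st_size), static_cast<uint64_t>(s.st_mtime)};
	return std::string(reinterpret_cast<const char *>(parts), sizeof(parts));
}
```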
package/src/duckdb/src/common/types/column/column_data_collection.cpp
CHANGED

@@ -1036,6 +1036,7 @@ void ColumnDataCollection::InitializeScan(ColumnDataParallelScanState &state, ve
 
 bool ColumnDataCollection::Scan(ColumnDataParallelScanState &state, ColumnDataLocalScanState &lstate,
                                 DataChunk &result) const {
+	D_ASSERT(result.GetTypes() == types);
 	result.Reset();
 
 	idx_t chunk_index;
@@ -1129,6 +1130,10 @@ void ColumnDataCollection::ScanAtIndex(ColumnDataParallelScanState &state, Colum
 }
 
 bool ColumnDataCollection::Scan(ColumnDataScanState &state, DataChunk &result) const {
+	for (idx_t i = 0; i < state.column_ids.size(); i++) {
+		D_ASSERT(result.GetTypes()[i] == types[state.column_ids[i]]);
+	}
+
 	result.Reset();
 
 	idx_t chunk_index;
@@ -1213,6 +1218,7 @@ idx_t ColumnDataCollection::ChunkCount() const {
 }
 
 void ColumnDataCollection::FetchChunk(idx_t chunk_idx, DataChunk &result) const {
+	D_ASSERT(result.GetTypes() == types);
 	D_ASSERT(chunk_idx < ChunkCount());
 	for (auto &segment : segments) {
 		if (chunk_idx >= segment->ChunkCount()) {
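These additions are debug-only shape checks; D_ASSERT compiles to nothing in release builds. A generic sketch of the same guard using plain assert (names here are illustrative, not DuckDB's):

```cpp
#include <cassert>
#include <cstddef>
#include <vector>

// For a projected scan, the i-th output column must have the type of the
// collection column it reads from.
template <class Type>
void CheckScanTypes(const std::vector<Type> &result_types, const std::vector<Type> &collection_types,
                    const std::vector<std::size_t> &column_ids) {
	assert(result_types.size() == column_ids.size());
	for (std::size_t i = 0; i < column_ids.size(); i++) {
		assert(result_types[i] == collection_types[column_ids[i]]);
	}
}
```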
package/src/duckdb/src/common/types/conflict_manager.cpp
CHANGED

@@ -87,7 +87,7 @@ optional_idx ConflictManager::GetFirstInvalidIndex(const idx_t count, const bool
 	for (idx_t i = 0; i < count; i++) {
 		if (negate && !validity.RowIsValid(i)) {
 			return i;
-		} else if (validity.RowIsValid(i)) {
+		} else if (!negate && validity.RowIsValid(i)) {
 			return i;
 		}
 	}
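A minimal reproduction of the bug fixed here: without the added !negate guard, the second branch fires on the first valid row even when the caller asked for the first invalid one. Standalone sketch using a plain bool vector instead of DuckDB's ValidityMask:

```cpp
#include <cstddef>
#include <cstdint>
#include <vector>

// negate == true: find the first invalid row; negate == false: first valid row.
int64_t FirstMatch(const std::vector<bool> &valid, bool negate) {
	for (std::size_t i = 0; i < valid.size(); i++) {
		if (negate && !valid[i]) {
			return static_cast<int64_t>(i);
		} else if (!negate && valid[i]) { // the fix: without !negate, this fires anyway
			return static_cast<int64_t>(i);
		}
	}
	return -1;
}

// With valid = {true, false} and negate = true, the fixed code returns 1 (the
// first invalid row); the old code returned 0, the first *valid* row.
```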
package/src/duckdb/src/execution/index/art/base_node.cpp
CHANGED

@@ -95,7 +95,9 @@ void Node4::DeleteChild(ART &art, Node &node, Node &parent, const uint8_t byte,
 
 	auto prev_node4_status = node.GetGateStatus();
 	Node::FreeNode(art, node);
-
+	// Propagate both the prev_node_4 status and the general gate status (if the gate was earlier on),
+	// since the concatenation logic depends on both.
+	Prefix::Concat(art, parent, node, child, remaining_byte, prev_node4_status, status);
 }
 
 void Node4::ShrinkNode16(ART &art, Node &node4, Node &node16) {
package/src/duckdb/src/execution/index/art/prefix.cpp
CHANGED

@@ -65,8 +65,8 @@ void Prefix::New(ART &art, reference<Node> &ref, const ARTKey &key, const idx_t
 	}
 }
 
-void Prefix::Concat(ART &art, Node &parent, Node &node4, const Node child, uint8_t byte,
-                    const GateStatus
+void Prefix::Concat(ART &art, Node &parent, Node &node4, const Node child, uint8_t byte, const GateStatus node4_status,
+                    const GateStatus status) {
 	// We have four situations from which we enter here:
 	// 1: PREFIX (parent) - Node4 (prev_node4) - PREFIX (child) - INLINED_LEAF, or
 	// 2: PREFIX (parent) - Node4 (prev_node4) - INLINED_LEAF (child), or
@@ -90,10 +90,7 @@ void Prefix::Concat(ART &art, Node &parent, Node &node4, const Node child, uint8
 		ConcatChildIsGate(art, parent, node4, child, byte);
 		return;
 	}
-
-	auto inside_gate = parent.GetGateStatus() == GateStatus::GATE_SET;
-	ConcatInternal(art, parent, node4, child, byte, inside_gate);
-	return;
+	ConcatInternal(art, parent, node4, child, byte, status);
 }
 
 void Prefix::Reduce(ART &art, Node &node, const idx_t pos) {
@@ -286,9 +283,9 @@ Prefix Prefix::GetTail(ART &art, const Node &node) {
 }
 
 void Prefix::ConcatInternal(ART &art, Node &parent, Node &node4, const Node child, uint8_t byte,
-                            const
+                            const GateStatus status) {
 	if (child.GetType() == NType::LEAF_INLINED) {
-		if (
+		if (status == GateStatus::GATE_SET) {
 			if (parent.GetType() == NType::PREFIX) {
 				// The parent only contained the Node4, so we can now inline 'all the way up',
 				// and the gate is no longer nested.
package/src/duckdb/src/execution/index/bound_index.cpp
CHANGED

@@ -1,11 +1,13 @@
 #include "duckdb/execution/index/bound_index.hpp"
 
+#include "duckdb/common/array.hpp"
 #include "duckdb/common/radix.hpp"
 #include "duckdb/common/serializer/serializer.hpp"
 #include "duckdb/planner/expression/bound_columnref_expression.hpp"
 #include "duckdb/planner/expression/bound_reference_expression.hpp"
 #include "duckdb/planner/expression_iterator.hpp"
 #include "duckdb/storage/table/append_state.hpp"
+#include "duckdb/common/types/selection_vector.hpp"
 
 namespace duckdb {
 
@@ -154,39 +156,80 @@ string BoundIndex::AppendRowError(DataChunk &input, idx_t index) {
 	return error;
 }
 
-
-
+namespace {
+
+struct BufferedReplayState {
+	optional_ptr<ColumnDataCollection> buffer = nullptr;
+	ColumnDataScanState scan_state;
+	DataChunk current_chunk;
+	bool scan_initialized = false;
+};
+
+} // namespace
+
+void BoundIndex::ApplyBufferedReplays(const vector<LogicalType> &table_types, BufferedIndexReplays &buffered_replays,
                                       const vector<StorageIndex> &mapped_column_ids) {
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+	if (!buffered_replays.HasBufferedReplays()) {
+		return;
+	}
+
+	// We have two replay states: one for inserts and one for deletes. These are indexed into using the
+	// replay_type. Both scans are interleaved, so the state maintains the position of each scan.
+	array<BufferedReplayState, 2> replay_states;
+	DataChunk table_chunk;
+	table_chunk.InitializeEmpty(table_types);
+
+	for (const auto &replay_range : buffered_replays.ranges) {
+		const auto type_idx = static_cast<idx_t>(replay_range.type);
+		auto &state = replay_states[type_idx];
+
+		// Initialize the scan state if necessary. Take ownership of buffered operations, since we won't need
+		// them after replaying anyways.
+		if (!state.scan_initialized) {
+			state.buffer = buffered_replays.GetBuffer(replay_range.type);
+			state.buffer->InitializeScan(state.scan_state);
+			state.buffer->InitializeScanChunk(state.current_chunk);
+			state.scan_initialized = true;
+		}
+
+		idx_t current_row = replay_range.start;
+		while (current_row < replay_range.end) {
+			// Scan the next DataChunk from the ColumnDataCollection buffer if the current row is on or after
+			// that chunk's starting row index.
+			if (current_row >= state.scan_state.next_row_index) {
+				if (!state.buffer->Scan(state.scan_state, state.current_chunk)) {
+					throw InternalException("Buffered index data exhausted during replay");
+				}
 			}
-			table_chunk.SetCardinality(scan_chunk.size());
 
-
-
-
-
+			// We need to process the remaining rows in the current chunk, which is the minimum of the available
+			// rows in the chunk and the remaining rows in the current range.
+			const auto offset_in_chunk = current_row - state.scan_state.current_row_index;
+			const auto available_in_chunk = state.current_chunk.size() - offset_in_chunk;
+			// [start, end) in ReplayRange is [inclusive, exclusive).
+			const auto range_remaining = replay_range.end - current_row;
+			const auto rows_to_process = MinValue<idx_t>(available_in_chunk, range_remaining);
+
+			SelectionVector sel(offset_in_chunk, rows_to_process);
+
+			for (idx_t col_idx = 0; col_idx < state.current_chunk.ColumnCount() - 1; col_idx++) {
+				const auto col_id = mapped_column_ids[col_idx].GetPrimaryIndex();
+				table_chunk.data[col_id].Reference(state.current_chunk.data[col_idx]);
+				table_chunk.data[col_id].Slice(sel, rows_to_process);
+			}
+			table_chunk.SetCardinality(rows_to_process);
+			Vector row_ids(state.current_chunk.data.back(), sel, rows_to_process);
+
+			if (replay_range.type == BufferedIndexReplay::INSERT_ENTRY) {
+				IndexAppendInfo append_info(IndexAppendMode::INSERT_DUPLICATES, nullptr);
+				const auto error = Append(table_chunk, row_ids, append_info);
 				if (error.HasError()) {
 					throw InternalException("error while applying buffered appends: " + error.Message());
 				}
+				current_row += rows_to_process;
 				continue;
 			}
-
-
-		}
-	}
+			Delete(table_chunk, row_ids);
+			current_row += rows_to_process;
 		}
 	}
 }
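The heart of the replay loop is its chunking arithmetic: each slice is bounded both by how many rows remain in the chunk the scan just produced and by how many rows remain in the current range. A sketch of just that arithmetic, free of DuckDB types:

```cpp
#include <algorithm>
#include <cstdint>

// A replay slice is limited by two things at once: the rows still available
// in the scanned chunk, and the rows still owed to the current range
// (range_end is exclusive, matching the [start, end) convention above).
uint64_t RowsToProcess(uint64_t current_row, uint64_t chunk_row_index, uint64_t chunk_size,
                       uint64_t range_end) {
	const uint64_t offset_in_chunk = current_row - chunk_row_index;
	const uint64_t available_in_chunk = chunk_size - offset_in_chunk;
	const uint64_t range_remaining = range_end - current_row;
	return std::min(available_in_chunk, range_remaining);
}
```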
package/src/duckdb/src/execution/index/unbound_index.cpp
CHANGED

@@ -8,10 +8,6 @@
 
 namespace duckdb {
 
-BufferedIndexData::BufferedIndexData(BufferedIndexReplay replay_type, unique_ptr<ColumnDataCollection> data_p)
-    : type(replay_type), data(std::move(data_p)) {
-}
-
 UnboundIndex::UnboundIndex(unique_ptr<CreateInfo> create_info, IndexStorageInfo storage_info_p,
                            TableIOManager &table_io_manager, AttachedDatabase &db)
     : Index(create_info->Cast<CreateIndexInfo>().column_ids, table_io_manager, db), create_info(std::move(create_info)),
@@ -40,15 +36,13 @@ void UnboundIndex::CommitDrop() {
 }
 
 void UnboundIndex::BufferChunk(DataChunk &index_column_chunk, Vector &row_ids,
-                               const vector<StorageIndex> &mapped_column_ids_p, BufferedIndexReplay replay_type) {
+                               const vector<StorageIndex> &mapped_column_ids_p, const BufferedIndexReplay replay_type) {
 	D_ASSERT(!column_ids.empty());
 	auto types = index_column_chunk.GetTypes(); // column types
 	types.push_back(LogicalType::ROW_TYPE);
 
 	auto &allocator = Allocator::Get(db);
 
-	BufferedIndexData buffered_data(replay_type, make_uniq<ColumnDataCollection>(allocator, types));
-
 	//! First time we are buffering data, canonical column_id mapping is stored.
 	//! This should be a sorted list of all the physical offsets of Indexed columns on this table.
 	if (mapped_column_ids.empty()) {
@@ -56,7 +50,7 @@ void UnboundIndex::BufferChunk(DataChunk &index_column_chunk, Vector &row_ids,
 	}
 	D_ASSERT(mapped_column_ids == mapped_column_ids_p);
 
-	//
+	// combined_chunk has all the indexed columns according to mapped_column_ids ordering, as well as a rowid column.
 	DataChunk combined_chunk;
 	combined_chunk.InitializeEmpty(types);
 	for (idx_t i = 0; i < index_column_chunk.ColumnCount(); i++) {
@@ -64,8 +58,25 @@ void UnboundIndex::BufferChunk(DataChunk &index_column_chunk, Vector &row_ids,
 	}
 	combined_chunk.data.back().Reference(row_ids);
 	combined_chunk.SetCardinality(index_column_chunk.size());
-
-	buffered_replays.
+
+	auto &buffer = buffered_replays.GetBuffer(replay_type);
+	if (buffer == nullptr) {
+		buffer = make_uniq<ColumnDataCollection>(allocator, types);
+	}
+	// The starting index of the buffer range is the size of the buffer.
+	const idx_t start = buffer->Count();
+	const idx_t end = start + combined_chunk.size();
+	auto &ranges = buffered_replays.ranges;
+
+	if (ranges.empty() || ranges.back().type != replay_type) {
+		// If there are no buffered ranges, or the replay types don't match, append a new range.
+		ranges.emplace_back(replay_type, start, end);
+		buffer->Append(combined_chunk);
+		return;
+	}
+	// Otherwise merge the range with the previous one.
+	ranges.back().end = end;
+	buffer->Append(combined_chunk);
 }
 
 } // namespace duckdb
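BufferChunk's range bookkeeping merges consecutive chunks of the same replay type into one contiguous range, so the later replay can alternate between inserts and deletes in the original operation order. A simplified sketch with stand-in types (not DuckDB's BufferedIndexReplays):

```cpp
#include <cstdint>
#include <vector>

enum class ReplayTypeSketch { INSERT_ENTRY, DELETE_ENTRY };

struct ReplayRangeSketch {
	ReplayTypeSketch type;
	uint64_t start; // inclusive row offset into the per-type buffer
	uint64_t end;   // exclusive
};

// Record one buffered chunk: open a new range when the replay type changes,
// otherwise extend the previous range so it stays contiguous.
void RecordChunk(std::vector<ReplayRangeSketch> &ranges, ReplayTypeSketch type,
                 uint64_t rows_already_buffered, uint64_t chunk_size) {
	const uint64_t start = rows_already_buffered;
	const uint64_t end = start + chunk_size;
	if (ranges.empty() || ranges.back().type != type) {
		ranges.push_back({type, start, end});
		return;
	}
	ranges.back().end = end;
}
```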
package/src/duckdb/src/execution/operator/csv_scanner/scanner/base_scanner.cpp
CHANGED

@@ -26,6 +26,10 @@ BaseScanner::BaseScanner(shared_ptr<CSVBufferManager> buffer_manager_p, shared_p
 	}
 }
 
+void BaseScanner::Print() const {
+	state_machine->Print();
+}
+
 string BaseScanner::RemoveSeparator(const char *value_ptr, const idx_t size, char thousands_separator) {
 	string result;
 	result.reserve(size);
package/src/duckdb/src/execution/operator/csv_scanner/scanner/string_value_scanner.cpp
CHANGED

@@ -22,7 +22,7 @@ StringValueResult::StringValueResult(CSVStates &states, CSVStateMachine &state_m
                                      idx_t result_size_p, idx_t buffer_position, CSVErrorHandler &error_hander_p,
                                      CSVIterator &iterator_p, bool store_line_size_p,
                                      shared_ptr<CSVFileScan> csv_file_scan_p, idx_t &lines_read_p, bool sniffing_p,
-                                     string path_p, idx_t scan_id)
+                                     const string &path_p, idx_t scan_id, bool &used_unstrictness)
     : ScannerResult(states, state_machine, result_size_p),
      number_of_columns(NumericCast<uint32_t>(state_machine.dialect_options.num_cols)),
      null_padding(state_machine.options.null_padding), ignore_errors(state_machine.options.ignore_errors.GetValue()),
@@ -30,8 +30,8 @@ StringValueResult::StringValueResult(CSVStates &states, CSVStateMachine &state_m
          ? 0
          : state_machine.dialect_options.state_machine_options.delimiter.GetValue().size() - 1),
      error_handler(error_hander_p), iterator(iterator_p), store_line_size(store_line_size_p),
-      csv_file_scan(std::move(csv_file_scan_p)), lines_read(lines_read_p),
-      current_errors(scan_id, state_machine.options.IgnoreErrors()), sniffing(sniffing_p), path(
+      csv_file_scan(std::move(csv_file_scan_p)), lines_read(lines_read_p), used_unstrictness(used_unstrictness),
+      current_errors(scan_id, state_machine.options.IgnoreErrors()), sniffing(sniffing_p), path(path_p) {
 	// Vector information
 	D_ASSERT(number_of_columns > 0);
 	if (!buffer_handle) {
@@ -154,23 +154,26 @@ inline bool IsValueNull(const char *null_str_ptr, const char *value_ptr, const i
 }
 
 bool StringValueResult::HandleTooManyColumnsError(const char *value_ptr, const idx_t size) {
-	if (cur_col_id >= number_of_columns
-
-
-
-
-
-
+	if (cur_col_id >= number_of_columns) {
+		if (state_machine.state_machine_options.strict_mode.GetValue()) {
+			bool error = true;
+			if (cur_col_id == number_of_columns && ((quoted && state_machine.options.allow_quoted_nulls) || !quoted)) {
+				// we make an exception if the first over-value is null
+				bool is_value_null = false;
+				for (idx_t i = 0; i < null_str_count; i++) {
+					is_value_null = is_value_null || IsValueNull(null_str_ptr[i], value_ptr, size);
+				}
+				error = !is_value_null;
 			}
-		error
-
-
-
-
-
+			if (error) {
+				// We error pointing to the current value error.
+				current_errors.Insert(TOO_MANY_COLUMNS, cur_col_id, chunk_col_id, last_position);
+				cur_col_id++;
+			}
+			// We had an error
+			return true;
 		}
-
-		return true;
+		used_unstrictness = true;
 	}
 	return false;
 }
@@ -231,6 +234,7 @@ void StringValueResult::AddValueToVector(const char *value_ptr, idx_t size, bool
 	}
 	if (cur_col_id >= number_of_columns) {
 		if (!state_machine.state_machine_options.strict_mode.GetValue()) {
+			used_unstrictness = true;
 			return;
 		}
 		bool error = true;
@@ -549,6 +553,7 @@ void StringValueResult::AddPossiblyEscapedValue(StringValueResult &result, const
 	}
 	if (result.cur_col_id >= result.number_of_columns &&
 	    !result.state_machine.state_machine_options.strict_mode.GetValue()) {
+		result.used_unstrictness = true;
 		return;
 	}
 	if (!result.HandleTooManyColumnsError(value_ptr, length)) {
@@ -980,7 +985,7 @@ StringValueScanner::StringValueScanner(idx_t scanner_idx_p, const shared_ptr<CSV
       result(states, *state_machine, cur_buffer_handle, BufferAllocator::Get(buffer_manager->context), result_size,
             iterator.pos.buffer_pos, *error_handler, iterator,
             buffer_manager->context.client_data->debug_set_max_line_length, csv_file_scan, lines_read, sniffing,
-             buffer_manager->GetFilePath(), scanner_idx_p),
+             buffer_manager->GetFilePath(), scanner_idx_p, used_unstrictness),
       start_pos(0) {
 	if (scanner_idx == 0 && csv_file_scan) {
 		lines_read += csv_file_scan->skipped_rows;
@@ -997,7 +1002,7 @@ StringValueScanner::StringValueScanner(const shared_ptr<CSVBufferManager> &buffe
       result(states, *state_machine, cur_buffer_handle, Allocator::DefaultAllocator(), result_size,
            iterator.pos.buffer_pos, *error_handler, iterator,
           buffer_manager->context.client_data->debug_set_max_line_length, csv_file_scan, lines_read, sniffing,
-             buffer_manager->GetFilePath(), 0),
+             buffer_manager->GetFilePath(), 0, used_unstrictness),
       start_pos(0) {
 	if (scanner_idx == 0 && csv_file_scan) {
 		lines_read += csv_file_scan->skipped_rows;
@@ -1939,14 +1944,17 @@ void StringValueScanner::FinalizeChunkProcess() {
 	if (result.current_errors.HandleErrors(result)) {
 		result.number_of_rows++;
 	}
-	if (states.IsQuotedCurrent() && !found_error
-
-
-
-
-
-
-
+	if (states.IsQuotedCurrent() && !found_error) {
+		if (state_machine->dialect_options.state_machine_options.strict_mode.GetValue()) {
+			type = UNTERMINATED_QUOTES;
+			// If we finish the execution of a buffer, and we end in a quoted state, it means we have unterminated
+			// quotes
+			result.current_errors.Insert(type, result.cur_col_id, result.chunk_col_id, result.last_position);
+			if (result.current_errors.HandleErrors(result)) {
+				result.number_of_rows++;
+			}
+		} else {
+			used_unstrictness = true;
 		}
 	}
 	if (!iterator.done) {
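The thread running through these hunks is a used_unstrictness flag: the result object holds a reference to the scanner's flag and sets it whenever a lenient code path (one that is only legal when strict_mode is off) is actually taken. A compact sketch of that reference-threading pattern, with illustrative types:

```cpp
// The result object borrows a reference to the scanner's flag; every lenient
// code path marks that the relaxed behavior was actually relied upon.
struct ScanResultSketch {
	explicit ScanResultSketch(bool &used_unstrictness_p) : used_unstrictness(used_unstrictness_p) {
	}
	void OnExtraColumn(bool strict_mode) {
		if (!strict_mode) {
			used_unstrictness = true; // tolerated silently; remember we needed leniency
		}
	}
	bool &used_unstrictness;
};
```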
package/src/duckdb/src/execution/operator/csv_scanner/sniffer/csv_sniffer.cpp
CHANGED

@@ -14,7 +14,7 @@ CSVSniffer::CSVSniffer(CSVReaderOptions &options_p, const MultiFileOptions &file
 		auto &logical_type = format_template.first;
 		best_format_candidates[logical_type].clear();
 	}
-	// Initialize max columns found to either 0 or however many were set
+	// Initialize max columns found to either 0, or however many were set
 	max_columns_found = set_columns.Size();
 	error_handler = make_shared_ptr<CSVErrorHandler>(options.ignore_errors.GetValue());
 	detection_error_handler = make_shared_ptr<CSVErrorHandler>(true);
@@ -193,7 +193,8 @@ SnifferResult CSVSniffer::SniffCSV(const bool force_match) {
 		buffer_manager->ResetBufferManager();
 	}
 	buffer_manager->sniffing = false;
-	if (best_candidate->error_handler->AnyErrors() && !options.ignore_errors.GetValue()
+	if (best_candidate->error_handler->AnyErrors() && !options.ignore_errors.GetValue() &&
+	    best_candidate->state_machine->dialect_options.state_machine_options.strict_mode.GetValue()) {
 		best_candidate->error_handler->ErrorIfTypeExists(MAXIMUM_LINE_SIZE);
 	}
 	D_ASSERT(best_sql_types_candidates_per_column_idx.size() == names.size());