duckdb 0.8.2-dev4376.0 → 0.8.2-dev4424.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/src/duckdb/src/common/types/date.cpp +1 -1
- package/src/duckdb/src/execution/index/fixed_size_buffer.cpp +3 -10
- package/src/duckdb/src/execution/operator/csv_scanner/parallel_csv_reader.cpp +6 -3
- package/src/duckdb/src/function/table/read_csv.cpp +5 -22
- package/src/duckdb/src/function/table/version/pragma_version.cpp +2 -2
- package/src/duckdb/src/include/duckdb/main/query_result.hpp +1 -1
- package/src/duckdb/src/storage/checkpoint_manager.cpp +37 -36
- package/src/duckdb/src/storage/local_storage.cpp +8 -1
- package/test/prepare.test.ts +10 -1
- package/test/test_all_types.test.ts +4 -4
package/package.json
CHANGED
@@ … @@
-  "version": "0.8.2-dev4376.0",
+  "version": "0.8.2-dev4424.0",

package/src/duckdb/src/common/types/date.cpp
CHANGED
@@ -492,7 +492,7 @@ int32_t Date::ExtractDayOfTheYear(date_t date) {
 
 int64_t Date::ExtractJulianDay(date_t date) {
 	// Julian Day 0 is (-4713, 11, 24) in the proleptic Gregorian calendar.
-	static const …
+	static const int64_t JULIAN_EPOCH = -2440588;
 	return date.days - JULIAN_EPOCH;
 }
 
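Note on the hunk above: the removed line is truncated in this view, but the replacement pins JULIAN_EPOCH to int64_t, which suggests the constant previously had a narrower type and the subtraction could be carried out in 32-bit arithmetic. A minimal TypeScript sketch of that failure mode, using | 0 to imitate 32-bit truncation; the day count is hypothetical, chosen near INT32_MAX:

// julian_wrap.ts - illustrative only
const JULIAN_EPOCH = -2440588;
const days = 2147400000; // hypothetical extreme day count near INT32_MAX
console.log((days - JULIAN_EPOCH) | 0); // -2145126708: wrapped negative in 32 bits
console.log(days - JULIAN_EPOCH);       // 2149840588: the correct wide result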
package/src/duckdb/src/execution/index/fixed_size_buffer.cpp
CHANGED
@@ -148,9 +148,6 @@ void FixedSizeBuffer::Pin() {
 
 uint32_t FixedSizeBuffer::GetOffset(const idx_t bitmask_count) {
 
-	// this function calls Get() on the buffer, so the buffer must already be in memory
-	D_ASSERT(InMemory());
-
 	// get the bitmask data
 	auto bitmask_ptr = reinterpret_cast<validity_t *>(Get());
 	ValidityMask mask(bitmask_ptr);
@@ -200,7 +197,7 @@ uint32_t FixedSizeBuffer::GetOffset(const idx_t bitmask_count) {
 
 uint32_t FixedSizeBuffer::GetMaxOffset(const idx_t available_segments) {
 
-	// this function calls Get() on the buffer…
+	// this function calls Get() on the buffer
 	D_ASSERT(InMemory());
 
 	// finds the maximum zero bit in a bitmask, and adds one to it,
@@ -259,17 +256,13 @@ uint32_t FixedSizeBuffer::GetMaxOffset(const idx_t available_segments) {
 	}
 
 	// there are no allocations in this buffer
-
-	// FIXME: test_index_large_aborted_append.test with force_restart
-	// FIXME: test if we still have non-dirty buffer to serialize after fixing this
-	// throw InternalException("tried to serialize empty buffer");
-	return 0;
+	throw InternalException("tried to serialize empty buffer");
 }
 
 void FixedSizeBuffer::SetUninitializedRegions(PartialBlockForIndex &p_block_for_index, const idx_t segment_size,
                                               const idx_t offset, const idx_t bitmask_offset) {
 
-	// this function calls Get() on the buffer…
+	// this function calls Get() on the buffer
 	D_ASSERT(InMemory());
 
 	auto bitmask_ptr = reinterpret_cast<validity_t *>(Get());
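A sketch of the behavioral change in the last hunk above: serializing a buffer with no allocations used to return 0 (with FIXME notes attached), handing callers a valid-looking offset for a buffer that was never written; the restored throw fails loudly instead. Hypothetical stand-in function, not DuckDB's API:

// serialize_empty_check.ts - illustrative only
function serializeBuffer(allocationCount: number): number {
  if (allocationCount === 0) {
    // was: return 0; (silently produced a plausible result for a bug case)
    throw new Error("tried to serialize empty buffer");
  }
  return allocationCount; // stand-in for the real serialization work
}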
package/src/duckdb/src/execution/operator/csv_scanner/parallel_csv_reader.cpp
CHANGED
@@ -89,17 +89,19 @@ bool ParallelCSVReader::SetPosition() {
 				position_buffer++;
 			}
 			if (position_buffer > end_buffer) {
+				VerifyLineLength(position_buffer, buffer->batch_index);
 				return false;
 			}
 			SkipEmptyLines();
 			if (verification_positions.beginning_of_first_line == 0) {
 				verification_positions.beginning_of_first_line = position_buffer;
 			}
-
+			VerifyLineLength(position_buffer, buffer->batch_index);
 			verification_positions.end_of_last_line = position_buffer;
 			return true;
 		}
 	}
+	VerifyLineLength(position_buffer, buffer->batch_index);
 	return false;
 }
 	SkipEmptyLines();
@@ -143,12 +145,13 @@ bool ParallelCSVReader::SetPosition() {
 			break;
 		}
 
-		if (position_buffer >= end_buffer && !StringUtil::CharacterIsNewline((*buffer)[position_buffer - 1])) {
+		auto pos_check = position_buffer == 0 ? position_buffer : position_buffer - 1;
+		if (position_buffer >= end_buffer && !StringUtil::CharacterIsNewline((*buffer)[pos_check])) {
 			break;
 		}
 
 		if (position_buffer > end_buffer && options.dialect_options.new_line == NewLineIdentifier::CARRY_ON &&
-		    (*buffer)[position_buffer - 1] == '\n') {
+		    (*buffer)[pos_check] == '\n') {
 			break;
 		}
 		idx_t position_set = position_buffer;
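The pos_check change above guards against unsigned underflow: DuckDB's idx_t is (assumed here) an unsigned 64-bit type, so position_buffer - 1 wraps to a huge value when position_buffer is 0 instead of going negative, and the buffer would be indexed far out of bounds. A minimal sketch using BigInt to model uint64 wraparound:

// unsigned_underflow.ts - models uint64 arithmetic with BigInt
const positionBuffer = 0n;
const wrapped = BigInt.asUintN(64, positionBuffer - 1n);
console.log(wrapped); // 18446744073709551615n: a wildly out-of-bounds index
const posCheck = positionBuffer === 0n ? positionBuffer : positionBuffer - 1n;
console.log(posCheck); // 0n: safe to use as (*buffer)[pos_check]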
package/src/duckdb/src/function/table/read_csv.cpp
CHANGED
@@ -300,7 +300,7 @@ public:
 	                     const CSVReaderOptions &options, idx_t system_threads_p, const vector<string> &files_path_p,
 	                     bool force_parallelism_p, vector<column_t> column_ids_p)
 	    : buffer_manager(std::move(buffer_manager_p)), system_threads(system_threads_p),
-	      …
+	      force_parallelism(force_parallelism_p), column_ids(std::move(column_ids_p)),
 	      line_info(main_mutex, batch_to_tuple_end, tuple_start, tuple_end) {
 		current_file_path = files_path_p[0];
 		CSVFileHandle *file_handle_ptr;
@@ -316,16 +316,6 @@ public:
 		first_file_size = file_size;
 		on_disk_file = file_handle_ptr->OnDiskFile();
 		bytes_read = 0;
-		if (buffer_size < file_size || file_size == 0) {
-			bytes_per_local_state = buffer_size / ParallelCSVGlobalState::MaxThreads();
-		} else {
-			bytes_per_local_state = file_size / MaxThreads();
-		}
-		if (bytes_per_local_state == 0) {
-			// In practice, I think this won't happen, it only happens because we are mocking up test scenarios
-			// this boy needs to be at least one.
-			bytes_per_local_state = 1;
-		}
 		running_threads = MaxThreads();
 
 		// Initialize all the book-keeping variables
@@ -368,8 +358,6 @@ public:
 
 	void UpdateLinesRead(CSVBufferRead &buffer_read, idx_t file_idx);
 
-	void IncrementThread();
-
 	void DecrementThread();
 
 	bool Finished();
@@ -402,16 +390,12 @@ private:
 	mutex main_mutex;
 	//! Byte set from for last thread
 	idx_t next_byte = 0;
-	//! How many bytes we should execute per local state
-	idx_t bytes_per_local_state;
 	//! Size of first file
 	idx_t first_file_size = 0;
 	//! Whether or not this is an on-disk file
 	bool on_disk_file = true;
 	//! Basically max number of threads in DuckDB
 	idx_t system_threads;
-	//! Size of the buffers
-	idx_t buffer_size;
 	//! Current batch index
 	idx_t batch_index = 0;
 	idx_t local_batch_index = 0;
@@ -454,11 +438,6 @@ idx_t ParallelCSVGlobalState::MaxThreads() const {
 	return system_threads;
 }
 
-void ParallelCSVGlobalState::IncrementThread() {
-	lock_guard<mutex> parallel_lock(main_mutex);
-	running_threads++;
-}
-
void ParallelCSVGlobalState::DecrementThread() {
 	lock_guard<mutex> parallel_lock(main_mutex);
 	D_ASSERT(running_threads > 0);
@@ -572,6 +551,7 @@ bool ParallelCSVGlobalState::Next(ClientContext &context, const ReadCSVData &bin…
 	}
 	// set up the current buffer
 	line_info.current_batches[file_index - 1].insert(local_batch_index);
+	idx_t bytes_per_local_state = current_buffer->actual_size / MaxThreads() + 1;
 	auto result = make_uniq<CSVBufferRead>(
 	    buffer_manager->GetBuffer(cur_buffer_idx), buffer_manager->GetBuffer(cur_buffer_idx + 1), next_byte,
 	    next_byte + bytes_per_local_state, batch_index++, local_batch_index++, &line_info);
@@ -1135,6 +1115,9 @@ unique_ptr<TableRef> ReadCSVReplacement(ClientContext &context, const string &ta…
 	if (StringUtil::EndsWith(lower_name, ".gz")) {
 		lower_name = lower_name.substr(0, lower_name.size() - 3);
 	} else if (StringUtil::EndsWith(lower_name, ".zst")) {
+		if (!Catalog::TryAutoLoad(context, "parquet")) {
+			throw MissingExtensionException("parquet extension is required for reading zst compressed file");
+		}
 		lower_name = lower_name.substr(0, lower_name.size() - 4);
 	}
 	if (!StringUtil::EndsWith(lower_name, ".csv") && !StringUtil::Contains(lower_name, ".csv?") &&
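Across these hunks, bytes_per_local_state moves from a constructor-time field (derived from a global buffer_size/file_size guess) to a per-call value in Next(), derived from the current buffer's actual_size. The trailing + 1 keeps each thread's slice nonzero, which replaces the old bytes_per_local_state == 0 special case. A sketch of the arithmetic (integer division modeled with Math.floor):

// bytes_per_local_state.ts - illustrative only
const bytesPerLocalState = (bufferActualSize: number, maxThreads: number) =>
  Math.floor(bufferActualSize / maxThreads) + 1;

console.log(bytesPerLocalState(3, 8));    // 1: a tiny buffer still makes progress
console.log(bytesPerLocalState(4096, 8)); // 513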
package/src/duckdb/src/function/table/version/pragma_version.cpp
CHANGED
@@ -1,8 +1,8 @@
 #ifndef DUCKDB_VERSION
-#define DUCKDB_VERSION "0.8.2-dev4376"
+#define DUCKDB_VERSION "0.8.2-dev4424"
 #endif
 #ifndef DUCKDB_SOURCE_ID
-#define DUCKDB_SOURCE_ID "…
+#define DUCKDB_SOURCE_ID "b78b24ad26"
 #endif
 #include "duckdb/function/table/system_functions.hpp"
 #include "duckdb/main/database.hpp"
package/src/duckdb/src/include/duckdb/main/query_result.hpp
CHANGED
@@ -40,7 +40,7 @@ public:
 	vector<string> names;
 
 public:
-	DUCKDB_API void ThrowError(const string &prepended_message = "") const;
+	[[noreturn]] DUCKDB_API void ThrowError(const string &prepended_message = "") const;
 	DUCKDB_API void SetError(PreservedError error);
 	DUCKDB_API bool HasError() const;
 	DUCKDB_API const ExceptionType &GetErrorType() const;
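[[noreturn]] declares that ThrowError never returns normally, so the compiler can treat anything after a call to it as unreachable and stop demanding dead return statements at call sites. TypeScript's never return type is the closest analogue; this sketch is an analogy, not the C++ change itself:

// never_analogy.ts - TypeScript analogue of [[noreturn]]
function throwError(prependedMessage = ""): never {
  throw new Error(prependedMessage + "query failed");
}

function rowCount(ok: boolean): number {
  if (ok) {
    return 1;
  }
  throwError(); // tsc knows control never comes back: no trailing return needed
}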
package/src/duckdb/src/storage/checkpoint_manager.cpp
CHANGED
@@ -363,63 +363,64 @@ void CheckpointWriter::WriteIndex(IndexCatalogEntry &index_catalog, Serializer &…
 
 void CheckpointReader::ReadIndex(ClientContext &context, Deserializer &deserializer) {
 
-	// …
-	auto …
-	auto &…
-
-	// …
-	auto &…
-	auto &…
-	catalog.GetEntry(context, CatalogType::TABLE_ENTRY, …
-	…
-
-
-
+	// deserialize the index create info
+	auto create_info = deserializer.ReadProperty<unique_ptr<CreateInfo>>(100, "index");
+	auto &info = create_info->Cast<CreateIndexInfo>();
+
+	// create the index in the catalog
+	auto &schema = catalog.GetSchema(context, create_info->schema);
+	auto &table =
+	    catalog.GetEntry(context, CatalogType::TABLE_ENTRY, create_info->schema, info.table).Cast<DuckTableEntry>();
+
+	auto &index = schema.CreateIndex(context, info, table)->Cast<DuckIndexEntry>();
+
+	index.info = table.GetStorage().info;
+	// insert the parsed expressions into the stored index so that we correctly (de)serialize it during consecutive
+	// checkpoints
+	for (auto &parsed_expr : info.parsed_expressions) {
+		index.parsed_expressions.push_back(parsed_expr->Copy());
+	}
+
+	// we deserialize the index lazily, i.e., we do not need to load any node information
 	// except the root block pointer
-	auto …
+	auto root_block_pointer = deserializer.ReadProperty<BlockPointer>(101, "root_block_pointer");
 
-	// obtain the expressions of the ART from the index metadata
-	vector<unique_ptr<Expression>> unbound_expressions;
+	// obtain the parsed expressions of the ART from the index metadata
 	vector<unique_ptr<ParsedExpression>> parsed_expressions;
-	for (auto &…
-	parsed_expressions.push_back(…
+	for (auto &parsed_expr : info.parsed_expressions) {
+		parsed_expressions.push_back(parsed_expr->Copy());
 	}
+	D_ASSERT(!parsed_expressions.empty());
 
-	// bind the parsed expressions
-	// add the table to the bind context
+	// add the table to the bind context to bind the parsed expressions
 	auto binder = Binder::CreateBinder(context);
 	vector<LogicalType> column_types;
 	vector<string> column_names;
-	for (auto &col : …
+	for (auto &col : table.GetColumns().Logical()) {
 		column_types.push_back(col.Type());
 		column_names.push_back(col.Name());
 	}
+
+	// create a binder to bind the parsed expressions
 	vector<column_t> column_ids;
-	binder->bind_context.AddBaseTable(0, …
+	binder->bind_context.AddBaseTable(0, info.table, column_names, column_types, column_ids, &table);
 	IndexBinder idx_binder(*binder, context);
+
+	// bind the parsed expressions to create unbound expressions
+	vector<unique_ptr<Expression>> unbound_expressions;
 	unbound_expressions.reserve(parsed_expressions.size());
 	for (auto &expr : parsed_expressions) {
 		unbound_expressions.push_back(idx_binder.Bind(expr));
 	}
 
-	if (parsed_expressions.empty()) {
-		// this is a PK/FK index: we create the necessary bound column ref expressions
-		unbound_expressions.reserve(index_info.column_ids.size());
-		for (idx_t key_nr = 0; key_nr < index_info.column_ids.size(); key_nr++) {
-			auto &col = table_catalog.GetColumn(LogicalIndex(index_info.column_ids[key_nr]));
-			unbound_expressions.push_back(
-			    make_uniq<BoundColumnRefExpression>(col.GetName(), col.GetType(), ColumnBinding(0, key_nr)));
-		}
-	}
-
 	// create the index and add it to the storage
-	switch (…
+	switch (info.index_type) {
 	case IndexType::ART: {
-		auto &storage = …
-		auto art = make_uniq<ART>(…
-		…
+		auto &storage = table.GetStorage();
+		auto art = make_uniq<ART>(info.column_ids, TableIOManager::Get(storage), std::move(unbound_expressions),
+		                          info.constraint_type, storage.db, nullptr, root_block_pointer);
 
-	…
+		index.index = art.get();
 		storage.info->indexes.AddIndex(std::move(art));
 	} break;
 	default:
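The new code copies info.parsed_expressions twice: one deep copy lands in the catalog's index entry so the next checkpoint can re-serialize it, and one feeds the binder that produces the ART's unbound expressions. A minimal sketch of that ownership split, with a hypothetical ParsedExpression stand-in:

// expression_copies.ts - illustrative only
interface ParsedExpression {
  copy(): ParsedExpression;
}

function distribute(parsed: ParsedExpression[],
                    catalogEntry: ParsedExpression[],
                    toBind: ParsedExpression[]) {
  for (const expr of parsed) {
    catalogEntry.push(expr.copy()); // survives for the next checkpoint
    toBind.push(expr.copy());       // consumed when binding the ART's expressions
  }
}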
package/src/duckdb/src/storage/local_storage.cpp
CHANGED
@@ -159,7 +159,7 @@ void LocalTableStorage::AppendToIndexes(DuckTransaction &transaction, TableAppen…
 		AppendToIndexes(transaction, *row_groups, table.info->indexes, table.GetTypes(), append_state.current_row);
 	}
 	if (error) {
-		// need to revert
+		// need to revert all appended row ids
 		row_t current_row = append_state.row_start;
 		// remove the data from the indexes, if there are any indexes
 		row_groups->Scan(transaction, [&](DataChunk &chunk) -> bool {
@@ -184,6 +184,13 @@ void LocalTableStorage::AppendToIndexes(DuckTransaction &transaction, TableAppen…
 		if (append_to_table) {
 			table.RevertAppendInternal(append_state.row_start, append_count);
 		}
+
+		// we need to vacuum the indexes to remove any buffers that are now empty
+		// due to reverting the appends
+		table.info->indexes.Scan([&](Index &index) {
+			index.Vacuum();
+			return false;
+		});
 		error.Throw();
 	}
 }
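The Scan callback above returns false after each Vacuum. Based on the pattern in this diff, the assumed convention is that a true return stops the scan early, so returning false visits every index. A sketch of that convention (hypothetical types, not DuckDB's API):

// scan_convention.ts - illustrative only
interface Index {
  vacuum(): void;
}

function scan(indexes: Index[], callback: (index: Index) => boolean) {
  for (const index of indexes) {
    if (callback(index)) {
      break; // callback requested an early exit
    }
  }
}

// Mirrors the diff: vacuum all indexes after reverting an append.
const vacuumAll = (indexes: Index[]) =>
  scan(indexes, (index) => {
    index.vacuum();
    return false; // never stop early
  });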
package/test/prepare.test.ts
CHANGED
@@ -652,7 +652,16 @@ describe('prepare', function() {
     });
     it("should aggregate kurtosis(num)", function (done) {
         db.all("SELECT kurtosis(num) as kurtosis FROM foo", function (err: null | Error, res: TableData) {
-            …
+            // The `num` column of table `foo` contains each integer from 0 to 999,999 exactly once.
+            // This is a uniform distribution. The excess kurtosis for a uniform distribution is exactly -1.2.
+            // See https://en.wikipedia.org/wiki/Kurtosis#Other_well-known_distributions
+            const expected = -1.2;
+
+            // The calculated value can differ from the exact answer by small amounts on different platforms due
+            // to floating-point errors. This tolerance was determined experimentally.
+            const tolerance = Number.EPSILON * 10;
+
+            assert.ok(Math.abs(res[0].kurtosis - expected) < tolerance);
             done(err);
         });
     });
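The -1.2 constant in the new test body is the excess kurtosis of a uniform distribution, per the comment's Wikipedia reference. For the continuous uniform on [a, b] the derivation is one line:

\mu_4 = \frac{(b-a)^4}{80}, \qquad
\sigma^2 = \frac{(b-a)^2}{12} \Rightarrow \sigma^4 = \frac{(b-a)^4}{144}, \qquad
\gamma_2 = \frac{\mu_4}{\sigma^4} - 3 = \frac{144}{80} - 3 = \frac{9}{5} - 3 = -\frac{6}{5} = -1.2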
package/test/test_all_types.test.ts
CHANGED
@@ -90,7 +90,7 @@ const correct_answer_map: Record<string, any[]> = {
   date_array: [
     [],
     [
-      new Date(1970, 0, 1),
+      new Date(Date.UTC(1970, 0, 1)),
       null,
       new Date("0001-01-01T00:00:00.000Z"),
       new Date("9999-12-31T00:00:00.000Z"),
@@ -100,7 +100,7 @@ const correct_answer_map: Record<string, any[]> = {
   timestamp_array: [
     [],
     [
-      new Date(1970, 0, 1),
+      new Date(Date.UTC(1970, 0, 1)),
       null,
       new Date("0001-01-01T00:00:00.000Z"),
       new Date("9999-12-31T23:59:59.999Z"),
@@ -111,7 +111,7 @@ const correct_answer_map: Record<string, any[]> = {
   timestamptz_array: [
     [],
     [
-      new Date(1970, 0, 1),
+      new Date(Date.UTC(1970, 0, 1)),
       null,
       new Date("0001-01-01T00:00:00.000Z"),
       new Date("9999-12-31T23:59:59.999Z"),
@@ -171,7 +171,7 @@ const correct_answer_map: Record<string, any[]> = {
   ],
 
   timestamp: [
-    new Date(1990, 0, 1),
+    new Date(Date.UTC(1990, 0, 1)),
     new Date("9999-12-31T23:59:59.000Z"),
     null,
   ],
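All four test-data hunks above make the same fix: new Date(1970, 0, 1) is midnight in the local timezone of whatever machine runs the tests, so the expected values drifted by the local UTC offset on any non-UTC machine, while Date.UTC pins the exact instant. A minimal demonstration:

// date_utc.ts - why the expected values changed
const local = new Date(1970, 0, 1).getTime();          // timezone-dependent offset
const utc = new Date(Date.UTC(1970, 0, 1)).getTime();  // always 0 (the epoch)
console.log(utc === 0);     // true everywhere
console.log(local === utc); // true only when the local timezone is UTC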