duckdb 0.8.2-dev4314.0 → 0.8.2-dev4424.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/src/duckdb/extension/parquet/parquet_extension.cpp +1 -1
- package/src/duckdb/src/common/enum_util.cpp +5 -0
- package/src/duckdb/src/common/file_buffer.cpp +1 -1
- package/src/duckdb/src/common/types/date.cpp +1 -1
- package/src/duckdb/src/common/types/validity_mask.cpp +56 -0
- package/src/duckdb/src/execution/index/fixed_size_buffer.cpp +3 -10
- package/src/duckdb/src/execution/operator/csv_scanner/parallel_csv_reader.cpp +6 -3
- package/src/duckdb/src/execution/operator/persistent/physical_batch_insert.cpp +1 -1
- package/src/duckdb/src/execution/operator/persistent/physical_insert.cpp +1 -1
- package/src/duckdb/src/function/table/arrow_conversion.cpp +9 -1
- package/src/duckdb/src/function/table/read_csv.cpp +5 -22
- package/src/duckdb/src/function/table/version/pragma_version.cpp +2 -2
- package/src/duckdb/src/include/duckdb/common/constants.hpp +0 -15
- package/src/duckdb/src/include/duckdb/common/serializer/memory_stream.hpp +1 -1
- package/src/duckdb/src/include/duckdb/common/types/validity_mask.hpp +3 -0
- package/src/duckdb/src/include/duckdb/function/table/arrow.hpp +3 -0
- package/src/duckdb/src/include/duckdb/main/query_result.hpp +1 -1
- package/src/duckdb/src/include/duckdb/storage/block.hpp +3 -3
- package/src/duckdb/src/include/duckdb/storage/compression/bitpacking.hpp +1 -8
- package/src/duckdb/src/include/duckdb/storage/data_pointer.hpp +2 -2
- package/src/duckdb/src/include/duckdb/storage/metadata/metadata_manager.hpp +2 -0
- package/src/duckdb/src/include/duckdb/storage/metadata/metadata_reader.hpp +2 -0
- package/src/duckdb/src/include/duckdb/storage/metadata/metadata_writer.hpp +6 -2
- package/src/duckdb/src/include/duckdb/storage/storage_info.hpp +19 -0
- package/src/duckdb/src/include/duckdb/storage/table/chunk_info.hpp +19 -13
- package/src/duckdb/src/include/duckdb/storage/table/column_data.hpp +1 -1
- package/src/duckdb/src/include/duckdb/storage/table/row_group.hpp +15 -15
- package/src/duckdb/src/include/duckdb/storage/table/row_version_manager.hpp +59 -0
- package/src/duckdb/src/include/duckdb/storage/table/update_segment.hpp +1 -1
- package/src/duckdb/src/include/duckdb/transaction/commit_state.hpp +1 -6
- package/src/duckdb/src/include/duckdb/transaction/delete_info.hpp +3 -2
- package/src/duckdb/src/include/duckdb/transaction/duck_transaction.hpp +4 -2
- package/src/duckdb/src/include/duckdb/transaction/local_storage.hpp +1 -1
- package/src/duckdb/src/include/duckdb/transaction/undo_buffer.hpp +0 -1
- package/src/duckdb/src/main/settings/settings.cpp +5 -10
- package/src/duckdb/src/optimizer/statistics/expression/propagate_cast.cpp +14 -0
- package/src/duckdb/src/storage/checkpoint/table_data_writer.cpp +0 -1
- package/src/duckdb/src/storage/checkpoint_manager.cpp +37 -36
- package/src/duckdb/src/storage/compression/bitpacking.cpp +55 -48
- package/src/duckdb/src/storage/data_table.cpp +1 -1
- package/src/duckdb/src/storage/local_storage.cpp +9 -2
- package/src/duckdb/src/storage/metadata/metadata_manager.cpp +41 -2
- package/src/duckdb/src/storage/metadata/metadata_reader.cpp +12 -3
- package/src/duckdb/src/storage/metadata/metadata_writer.cpp +8 -2
- package/src/duckdb/src/storage/single_file_block_manager.cpp +1 -2
- package/src/duckdb/src/storage/storage_info.cpp +1 -1
- package/src/duckdb/src/storage/table/chunk_info.cpp +39 -33
- package/src/duckdb/src/storage/table/column_data.cpp +14 -9
- package/src/duckdb/src/storage/table/list_column_data.cpp +2 -2
- package/src/duckdb/src/storage/table/row_group.cpp +102 -192
- package/src/duckdb/src/storage/table/row_group_collection.cpp +2 -2
- package/src/duckdb/src/storage/table/row_version_manager.cpp +228 -0
- package/src/duckdb/src/storage/table/update_segment.cpp +2 -2
- package/src/duckdb/src/transaction/cleanup_state.cpp +2 -1
- package/src/duckdb/src/transaction/commit_state.cpp +5 -4
- package/src/duckdb/src/transaction/duck_transaction.cpp +4 -2
- package/src/duckdb/src/transaction/rollback_state.cpp +2 -1
- package/src/duckdb/src/transaction/undo_buffer.cpp +3 -5
- package/src/duckdb/ub_src_storage_table.cpp +2 -0
- package/test/prepare.test.ts +10 -1
- package/test/test_all_types.test.ts +4 -4
package/package.json
CHANGED

package/src/duckdb/extension/parquet/parquet_extension.cpp
CHANGED
@@ -118,7 +118,7 @@ struct ParquetWriteBindData : public TableFunctionData {
 	vector<LogicalType> sql_types;
 	vector<string> column_names;
 	duckdb_parquet::format::CompressionCodec::type codec = duckdb_parquet::format::CompressionCodec::SNAPPY;
-	idx_t row_group_size =
+	idx_t row_group_size = Storage::ROW_GROUP_SIZE;
 
 	//! If row_group_size_bytes is not set, we default to row_group_size * BYTES_PER_ROW
 	static constexpr const idx_t BYTES_PER_ROW = 1024;
package/src/duckdb/src/common/enum_util.cpp
CHANGED
@@ -551,6 +551,8 @@ BindingMode EnumUtil::FromString<BindingMode>(const char *value) {
 template<>
 const char* EnumUtil::ToChars<BitpackingMode>(BitpackingMode value) {
 	switch(value) {
+	case BitpackingMode::INVALID:
+		return "INVALID";
 	case BitpackingMode::AUTO:
 		return "AUTO";
 	case BitpackingMode::CONSTANT:
@@ -568,6 +570,9 @@ const char* EnumUtil::ToChars<BitpackingMode>(BitpackingMode value) {
 
 template<>
 BitpackingMode EnumUtil::FromString<BitpackingMode>(const char *value) {
+	if (StringUtil::Equals(value, "INVALID")) {
+		return BitpackingMode::INVALID;
+	}
 	if (StringUtil::Equals(value, "AUTO")) {
 		return BitpackingMode::AUTO;
 	}
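Note: the new INVALID entry makes the generated string round-trip total over the extended BitpackingMode enum. A minimal standalone sketch of the same pattern (plain C++, no DuckDB headers; the free-function names here are illustrative, not DuckDB's EnumUtil API):

#include <cstdint>
#include <cstring>
#include <iostream>
#include <stdexcept>

enum class BitpackingMode : uint8_t { INVALID, AUTO, CONSTANT, CONSTANT_DELTA, DELTA_FOR, FOR };

static const BitpackingMode kAllModes[] = {BitpackingMode::INVALID,  BitpackingMode::AUTO,
                                           BitpackingMode::CONSTANT, BitpackingMode::CONSTANT_DELTA,
                                           BitpackingMode::DELTA_FOR, BitpackingMode::FOR};

// every enumerator maps to a stable string (mirrors ToChars above)
const char *ToChars(BitpackingMode value) {
	switch (value) {
	case BitpackingMode::INVALID:        return "INVALID";
	case BitpackingMode::AUTO:           return "AUTO";
	case BitpackingMode::CONSTANT:       return "CONSTANT";
	case BitpackingMode::CONSTANT_DELTA: return "CONSTANT_DELTA";
	case BitpackingMode::DELTA_FOR:      return "DELTA_FOR";
	case BitpackingMode::FOR:            return "FOR";
	}
	throw std::runtime_error("unrecognized BitpackingMode value");
}

// unknown strings throw instead of silently aliasing a real mode (mirrors FromString above)
BitpackingMode FromString(const char *value) {
	for (auto mode : kAllModes) {
		if (std::strcmp(value, ToChars(mode)) == 0) {
			return mode;
		}
	}
	throw std::runtime_error("unrecognized BitpackingMode string");
}

int main() {
	std::cout << ToChars(FromString("DELTA_FOR")) << "\n"; // prints DELTA_FOR
}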
package/src/duckdb/src/common/types/date.cpp
CHANGED
@@ -492,7 +492,7 @@ int32_t Date::ExtractDayOfTheYear(date_t date) {
 
 int64_t Date::ExtractJulianDay(date_t date) {
 	// Julian Day 0 is (-4713, 11, 24) in the proleptic Gregorian calendar.
-	static const
+	static const int64_t JULIAN_EPOCH = -2440588;
 	return date.days - JULIAN_EPOCH;
 }
 
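The constant is easy to sanity-check: date.days counts days since the Unix epoch (1970-01-01), whose Julian Day Number is 2440588, so subtracting JULIAN_EPOCH = -2440588 yields exactly that at day zero. A standalone sketch of the arithmetic:

#include <cstdint>
#include <cstdio>

// date.days in DuckDB counts days since 1970-01-01
static const int64_t JULIAN_EPOCH = -2440588;

int64_t ExtractJulianDay(int32_t days_since_unix_epoch) {
	return days_since_unix_epoch - JULIAN_EPOCH;
}

int main() {
	printf("%lld\n", (long long)ExtractJulianDay(0)); // 2440588, the Julian Day of 1970-01-01
	printf("%lld\n", (long long)ExtractJulianDay(1)); // 2440589, one day later
}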
package/src/duckdb/src/common/types/validity_mask.cpp
CHANGED
@@ -1,4 +1,7 @@
 #include "duckdb/common/types/validity_mask.hpp"
+#include "duckdb/common/limits.hpp"
+#include "duckdb/common/serializer/write_stream.hpp"
+#include "duckdb/common/serializer/read_stream.hpp"
 
 namespace duckdb {
 
@@ -173,4 +176,57 @@ void ValidityMask::SliceInPlace(const ValidityMask &other, idx_t target_offset,
 #endif
 }
 
+enum class ValiditySerialization : uint8_t { BITMASK = 0, VALID_VALUES = 1, INVALID_VALUES = 2 };
+
+void ValidityMask::Write(WriteStream &writer, idx_t count) {
+	auto valid_values = CountValid(count);
+	auto invalid_values = count - valid_values;
+	auto bitmask_bytes = ValidityMask::ValidityMaskSize(count);
+	auto need_u32 = count >= NumericLimits<uint16_t>::Maximum();
+	auto bytes_per_value = need_u32 ? sizeof(uint32_t) : sizeof(uint16_t);
+	auto valid_value_size = bytes_per_value * valid_values + sizeof(uint32_t);
+	auto invalid_value_size = bytes_per_value * invalid_values + sizeof(uint32_t);
+	if (valid_value_size < bitmask_bytes || invalid_value_size < bitmask_bytes) {
+		auto serialize_valid = valid_value_size < invalid_value_size;
+		// serialize (in)valid value indexes as [COUNT][V0][V1][...][VN]
+		auto flag = serialize_valid ? ValiditySerialization::VALID_VALUES : ValiditySerialization::INVALID_VALUES;
+		writer.Write(flag);
+		writer.Write<uint32_t>(MinValue<uint32_t>(valid_values, invalid_values));
+		for (idx_t i = 0; i < count; i++) {
+			if (RowIsValid(i) == serialize_valid) {
+				if (need_u32) {
+					writer.Write<uint32_t>(i);
+				} else {
+					writer.Write<uint16_t>(i);
+				}
+			}
+		}
+	} else {
+		// serialize the entire bitmask
+		writer.Write(ValiditySerialization::BITMASK);
+		writer.WriteData(const_data_ptr_cast(GetData()), bitmask_bytes);
+	}
+}
+
+void ValidityMask::Read(ReadStream &reader, idx_t count) {
+	Initialize(count);
+	// deserialize the storage type
+	auto flag = reader.Read<ValiditySerialization>();
+	if (flag == ValiditySerialization::BITMASK) {
+		// deserialize the bitmask
+		reader.ReadData(data_ptr_cast(GetData()), ValidityMask::ValidityMaskSize(count));
+		return;
+	}
+	auto is_u32 = count >= NumericLimits<uint16_t>::Maximum();
+	auto is_valid = flag == ValiditySerialization::VALID_VALUES;
+	auto serialize_count = reader.Read<uint32_t>();
+	if (is_valid) {
+		SetAllInvalid(count);
+	}
+	for (idx_t i = 0; i < serialize_count; i++) {
+		idx_t index = is_u32 ? reader.Read<uint32_t>() : reader.Read<uint16_t>();
+		Set(index, is_valid);
+	}
+}
+
 } // namespace duckdb
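The Write path above picks whichever encoding is smaller: the raw bitmask (one bit per row) or an explicit index list laid out as [FLAG][COUNT][V0]...[VN] with 16- or 32-bit indexes. A standalone sketch of just that size comparison (bitmask_bytes approximates ValidityMaskSize; the values in main are made up):

#include <cstdint>
#include <cstdio>

// 0 = bitmask, 1 = list of valid indexes, 2 = list of invalid indexes,
// mirroring the ValiditySerialization flag chosen in Write above
int ChooseEncoding(uint64_t count, uint64_t valid_values) {
	uint64_t invalid_values = count - valid_values;
	uint64_t bitmask_bytes = (count + 7) / 8; // one bit per row, rounded up
	uint64_t bytes_per_value = count >= UINT16_MAX ? sizeof(uint32_t) : sizeof(uint16_t);
	uint64_t valid_value_size = bytes_per_value * valid_values + sizeof(uint32_t);
	uint64_t invalid_value_size = bytes_per_value * invalid_values + sizeof(uint32_t);
	if (valid_value_size < bitmask_bytes || invalid_value_size < bitmask_bytes) {
		return valid_value_size < invalid_value_size ? 1 : 2;
	}
	return 0;
}

int main() {
	// 122880 rows with 3 deletes: three 4-byte indexes beat a ~15 KB bitmask
	printf("%d\n", ChooseEncoding(122880, 122877)); // prints 2
	// half valid, half invalid: the bitmask wins
	printf("%d\n", ChooseEncoding(2048, 1024));     // prints 0
}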
package/src/duckdb/src/execution/index/fixed_size_buffer.cpp
CHANGED
@@ -148,9 +148,6 @@ void FixedSizeBuffer::Pin() {
 
 uint32_t FixedSizeBuffer::GetOffset(const idx_t bitmask_count) {
 
-	// this function calls Get() on the buffer, so the buffer must already be in memory
-	D_ASSERT(InMemory());
-
 	// get the bitmask data
 	auto bitmask_ptr = reinterpret_cast<validity_t *>(Get());
 	ValidityMask mask(bitmask_ptr);
@@ -200,7 +197,7 @@ uint32_t FixedSizeBuffer::GetOffset(const idx_t bitmask_count) {
 
 uint32_t FixedSizeBuffer::GetMaxOffset(const idx_t available_segments) {
 
-	// this function calls Get() on the buffer
+	// this function calls Get() on the buffer
 	D_ASSERT(InMemory());
 
 	// finds the maximum zero bit in a bitmask, and adds one to it,
@@ -259,17 +256,13 @@ uint32_t FixedSizeBuffer::GetMaxOffset(const idx_t available_segments) {
 	}
 
 	// there are no allocations in this buffer
-
-	// FIXME: test_index_large_aborted_append.test with force_restart
-	// FIXME: test if we still have non-dirty buffer to serialize after fixing this
-	// throw InternalException("tried to serialize empty buffer");
-	return 0;
+	throw InternalException("tried to serialize empty buffer");
 }
 
 void FixedSizeBuffer::SetUninitializedRegions(PartialBlockForIndex &p_block_for_index, const idx_t segment_size,
                                               const idx_t offset, const idx_t bitmask_offset) {
 
-	// this function calls Get() on the buffer
+	// this function calls Get() on the buffer
 	D_ASSERT(InMemory());
 
 	auto bitmask_ptr = reinterpret_cast<validity_t *>(Get());
package/src/duckdb/src/execution/operator/csv_scanner/parallel_csv_reader.cpp
CHANGED
@@ -89,17 +89,19 @@ bool ParallelCSVReader::SetPosition() {
 			position_buffer++;
 		}
 		if (position_buffer > end_buffer) {
+			VerifyLineLength(position_buffer, buffer->batch_index);
 			return false;
 		}
 		SkipEmptyLines();
 		if (verification_positions.beginning_of_first_line == 0) {
 			verification_positions.beginning_of_first_line = position_buffer;
 		}
-
+		VerifyLineLength(position_buffer, buffer->batch_index);
 		verification_positions.end_of_last_line = position_buffer;
 		return true;
 		}
 	}
+	VerifyLineLength(position_buffer, buffer->batch_index);
 	return false;
 }
 SkipEmptyLines();
@@ -143,12 +145,13 @@ bool ParallelCSVReader::SetPosition() {
 			break;
 		}
 
-
+		auto pos_check = position_buffer == 0 ? position_buffer : position_buffer - 1;
+		if (position_buffer >= end_buffer && !StringUtil::CharacterIsNewline((*buffer)[pos_check])) {
 			break;
 		}
 
 		if (position_buffer > end_buffer && options.dialect_options.new_line == NewLineIdentifier::CARRY_ON &&
-		    (*buffer)[
+		    (*buffer)[pos_check] == '\n') {
 			break;
 		}
 		idx_t position_set = position_buffer;
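The pos_check guard matters because position_buffer is unsigned: at offset 0, position_buffer - 1 would wrap around to a huge index before the lookbehind at (*buffer)[...]. A standalone sketch of the clamp (illustrative types; not the CSV reader's actual buffer class):

#include <cstddef>
#include <cstdio>
#include <string>

// peek at the character before pos without underflowing an unsigned index
char PeekPrevious(const std::string &buffer, size_t pos) {
	size_t pos_check = pos == 0 ? pos : pos - 1; // 0 - 1 would wrap to SIZE_MAX
	return buffer[pos_check];
}

int main() {
	std::string buf = "a,b\nc,d\n";
	printf("%d\n", PeekPrevious(buf, 4) == '\n'); // 1: the newline ending line one
	printf("%c\n", PeekPrevious(buf, 0));         // 'a': safe at the buffer start
}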
package/src/duckdb/src/execution/operator/persistent/physical_batch_insert.cpp
CHANGED
@@ -194,7 +194,7 @@ public:
 	}
 	auto new_count = current_collection->GetTotalRows();
 	auto batch_type =
-	    new_count <
+	    new_count < Storage::ROW_GROUP_SIZE ? RowGroupBatchType::NOT_FLUSHED : RowGroupBatchType::FLUSHED;
 	if (batch_type == RowGroupBatchType::FLUSHED && writer) {
 		writer->WriteLastRowGroup(*current_collection);
 	}
package/src/duckdb/src/execution/operator/persistent/physical_insert.cpp
CHANGED
@@ -482,7 +482,7 @@ SinkCombineResultType PhysicalInsert::Combine(ExecutionContext &context, Operato
 
 	lock_guard<mutex> lock(gstate.lock);
 	gstate.insert_count += append_count;
-	if (append_count <
+	if (append_count < Storage::ROW_GROUP_SIZE) {
 		// we have few rows - append to the local storage directly
 		auto &table = gstate.table;
 		auto &storage = table.GetStorage();
package/src/duckdb/src/function/table/arrow_conversion.cpp
CHANGED
@@ -837,7 +837,15 @@ void ArrowTableFunction::ArrowToDuckDB(ArrowScanLocalState &scan_state, const ar
 		throw InvalidInputException("arrow_scan: array length mismatch");
 	}
 	// Make sure this Vector keeps the Arrow chunk alive in case we can zero-copy the data
-
+	if (scan_state.arrow_owned_data.find(idx) == scan_state.arrow_owned_data.end()) {
+		auto arrow_data = make_shared<ArrowArrayWrapper>();
+		arrow_data->arrow_array = scan_state.chunk->arrow_array;
+		scan_state.chunk->arrow_array.release = nullptr;
+		scan_state.arrow_owned_data[idx] = arrow_data;
+	}
+
+	output.data[idx].GetBuffer()->SetAuxiliaryData(make_uniq<ArrowAuxiliaryData>(scan_state.arrow_owned_data[idx]));
+
 	D_ASSERT(arrow_convert_data.find(col_idx) != arrow_convert_data.end());
 	auto &arrow_type = *arrow_convert_data.at(col_idx);
 	if (array.dictionary) {
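Clearing release on the source ArrowArray is the Arrow C data interface convention for transferring ownership: the wrapper stored in arrow_owned_data now holds the only live release callback, so the chunk's buffers survive for zero-copy reads until DuckDB drops the wrapper. A standalone sketch of that move (hand-rolled ArrowArray stand-in; only the release field of the real interface is modeled):

#include <cstdio>

struct ArrowArray { // stand-in: a released array has release == nullptr
	void (*release)(ArrowArray *);
};

struct ArrowArrayWrapper { // whoever holds a non-null release owns the buffers
	ArrowArray arrow_array = {};
	~ArrowArrayWrapper() {
		if (arrow_array.release) {
			arrow_array.release(&arrow_array); // sole owner frees exactly once
		}
	}
};

static void ReleaseImpl(ArrowArray *array) {
	printf("buffers freed\n");
	array->release = nullptr;
}

int main() {
	ArrowArray chunk = {ReleaseImpl};
	{
		ArrowArrayWrapper wrapper;
		wrapper.arrow_array = chunk; // copy the struct, callback included
		chunk.release = nullptr;     // mark the source as moved-from
	} // wrapper's destructor prints "buffers freed" exactly once
	// chunk.release is now nullptr, so no double free is possible
}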
package/src/duckdb/src/function/table/read_csv.cpp
CHANGED
@@ -300,7 +300,7 @@ public:
 	              const CSVReaderOptions &options, idx_t system_threads_p, const vector<string> &files_path_p,
 	              bool force_parallelism_p, vector<column_t> column_ids_p)
 	    : buffer_manager(std::move(buffer_manager_p)), system_threads(system_threads_p),
-
+	      force_parallelism(force_parallelism_p), column_ids(std::move(column_ids_p)),
 	      line_info(main_mutex, batch_to_tuple_end, tuple_start, tuple_end) {
 		current_file_path = files_path_p[0];
 		CSVFileHandle *file_handle_ptr;
@@ -316,16 +316,6 @@ public:
 		first_file_size = file_size;
 		on_disk_file = file_handle_ptr->OnDiskFile();
 		bytes_read = 0;
-		if (buffer_size < file_size || file_size == 0) {
-			bytes_per_local_state = buffer_size / ParallelCSVGlobalState::MaxThreads();
-		} else {
-			bytes_per_local_state = file_size / MaxThreads();
-		}
-		if (bytes_per_local_state == 0) {
-			// In practice, I think this won't happen, it only happens because we are mocking up test scenarios
-			// this boy needs to be at least one.
-			bytes_per_local_state = 1;
-		}
 		running_threads = MaxThreads();
 
 		// Initialize all the book-keeping variables
@@ -368,8 +358,6 @@ public:
 
 	void UpdateLinesRead(CSVBufferRead &buffer_read, idx_t file_idx);
 
-	void IncrementThread();
-
 	void DecrementThread();
 
 	bool Finished();
@@ -402,16 +390,12 @@ private:
 	mutex main_mutex;
 	//! Byte set from for last thread
 	idx_t next_byte = 0;
-	//! How many bytes we should execute per local state
-	idx_t bytes_per_local_state;
 	//! Size of first file
 	idx_t first_file_size = 0;
 	//! Whether or not this is an on-disk file
 	bool on_disk_file = true;
	//! Basically max number of threads in DuckDB
 	idx_t system_threads;
-	//! Size of the buffers
-	idx_t buffer_size;
 	//! Current batch index
 	idx_t batch_index = 0;
 	idx_t local_batch_index = 0;
@@ -454,11 +438,6 @@ idx_t ParallelCSVGlobalState::MaxThreads() const {
 	return system_threads;
 }
 
-void ParallelCSVGlobalState::IncrementThread() {
-	lock_guard<mutex> parallel_lock(main_mutex);
-	running_threads++;
-}
-
 void ParallelCSVGlobalState::DecrementThread() {
 	lock_guard<mutex> parallel_lock(main_mutex);
 	D_ASSERT(running_threads > 0);
@@ -572,6 +551,7 @@ bool ParallelCSVGlobalState::Next(ClientContext &context, const ReadCSVData &bin
 	}
 	// set up the current buffer
 	line_info.current_batches[file_index - 1].insert(local_batch_index);
+	idx_t bytes_per_local_state = current_buffer->actual_size / MaxThreads() + 1;
 	auto result = make_uniq<CSVBufferRead>(
 	    buffer_manager->GetBuffer(cur_buffer_idx), buffer_manager->GetBuffer(cur_buffer_idx + 1), next_byte,
 	    next_byte + bytes_per_local_state, batch_index++, local_batch_index++, &line_info);
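Work splitting thus moves from a global bytes_per_local_state, computed once from the file size, to a per-buffer value computed in Next; the + 1 takes over the old == 0 fallback, keeping every thread's byte range non-empty even for tiny buffers. The arithmetic in isolation (values are made up):

#include <cstdint>
#include <cstdio>

// per-buffer split as in ParallelCSVGlobalState::Next above: integer division
// can yield 0 for buffers smaller than the thread count, so +1 keeps slices non-empty
uint64_t BytesPerLocalState(uint64_t buffer_actual_size, uint64_t max_threads) {
	return buffer_actual_size / max_threads + 1;
}

int main() {
	printf("%llu\n", (unsigned long long)BytesPerLocalState(32000000, 8)); // 4000001
	printf("%llu\n", (unsigned long long)BytesPerLocalState(5, 8));        // 1, not 0
}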
@@ -1135,6 +1115,9 @@ unique_ptr<TableRef> ReadCSVReplacement(ClientContext &context, const string &ta
 	if (StringUtil::EndsWith(lower_name, ".gz")) {
 		lower_name = lower_name.substr(0, lower_name.size() - 3);
 	} else if (StringUtil::EndsWith(lower_name, ".zst")) {
+		if (!Catalog::TryAutoLoad(context, "parquet")) {
+			throw MissingExtensionException("parquet extension is required for reading zst compressed file");
+		}
 		lower_name = lower_name.substr(0, lower_name.size() - 4);
 	}
 	if (!StringUtil::EndsWith(lower_name, ".csv") && !StringUtil::Contains(lower_name, ".csv?") &&
package/src/duckdb/src/function/table/version/pragma_version.cpp
CHANGED
@@ -1,8 +1,8 @@
 #ifndef DUCKDB_VERSION
-#define DUCKDB_VERSION "0.8.2-dev4314"
+#define DUCKDB_VERSION "0.8.2-dev4424"
 #endif
 #ifndef DUCKDB_SOURCE_ID
-#define DUCKDB_SOURCE_ID "
+#define DUCKDB_SOURCE_ID "b78b24ad26"
 #endif
 #include "duckdb/function/table/system_functions.hpp"
 #include "duckdb/main/database.hpp"
package/src/duckdb/src/include/duckdb/common/constants.hpp
CHANGED
@@ -58,21 +58,6 @@ struct DConstants {
 	static constexpr const idx_t INVALID_INDEX = idx_t(-1);
 };
 
-struct Storage {
-	//! The size of a hard disk sector, only really needed for Direct IO
-	constexpr static int SECTOR_SIZE = 4096;
-	//! Block header size for blocks written to the storage
-	constexpr static int BLOCK_HEADER_SIZE = sizeof(uint64_t);
-	// Size of a memory slot managed by the StorageManager. This is the quantum of allocation for Blocks on DuckDB. We
-	// default to 256KB. (1 << 18)
-	constexpr static int BLOCK_ALLOC_SIZE = 262144;
-	//! The actual memory space that is available within the blocks
-	constexpr static int BLOCK_SIZE = BLOCK_ALLOC_SIZE - BLOCK_HEADER_SIZE;
-	//! The size of the headers. This should be small and written more or less atomically by the hard disk. We default
-	//! to the page size, which is 4KB. (1 << 12)
-	constexpr static int FILE_HEADER_SIZE = 4096;
-};
-
 struct LogicalIndex {
 	explicit LogicalIndex(idx_t index) : index(index) {
 	}
package/src/duckdb/src/include/duckdb/common/serializer/memory_stream.hpp
CHANGED
@@ -1,7 +1,7 @@
 //===----------------------------------------------------------------------===//
 // DuckDB
 //
-// duckdb/common/serializer/
+// duckdb/common/serializer/memory_stream.hpp
 //
 //
 //===----------------------------------------------------------------------===//
package/src/duckdb/src/include/duckdb/common/types/validity_mask.hpp
CHANGED
@@ -332,6 +332,9 @@ public:
 	DUCKDB_API string ToString(idx_t count) const;
 
 	DUCKDB_API static bool IsAligned(idx_t count);
+
+	void Write(WriteStream &writer, idx_t count);
+	void Read(ReadStream &reader, idx_t count);
 };
 
 } // namespace duckdb
package/src/duckdb/src/include/duckdb/function/table/arrow.hpp
CHANGED
@@ -67,6 +67,9 @@ struct ArrowScanLocalState : public LocalTableFunctionState {
 
 	unique_ptr<ArrowArrayStreamWrapper> stream;
 	shared_ptr<ArrowArrayWrapper> chunk;
+	// This vector holds the Arrow Vectors owned by DuckDB to allow for zero-copy
+	// Note that only DuckDB can release these vectors
+	unordered_map<idx_t, shared_ptr<ArrowArrayWrapper>> arrow_owned_data;
 	idx_t chunk_offset = 0;
 	idx_t batch_index = 0;
 	vector<column_t> column_ids;
package/src/duckdb/src/include/duckdb/main/query_result.hpp
CHANGED
@@ -40,7 +40,7 @@ public:
 	vector<string> names;
 
 public:
-	DUCKDB_API void ThrowError(const string &prepended_message = "") const;
+	[[noreturn]] DUCKDB_API void ThrowError(const string &prepended_message = "") const;
 	DUCKDB_API void SetError(PreservedError error);
 	DUCKDB_API bool HasError() const;
 	DUCKDB_API const ExceptionType &GetErrorType() const;
package/src/duckdb/src/include/duckdb/storage/block.hpp
CHANGED
@@ -52,11 +52,11 @@ struct MetaBlockPointer {
 	idx_t block_pointer;
 	uint32_t offset;
 
-	bool IsValid() {
+	bool IsValid() const {
 		return block_pointer != DConstants::INVALID_INDEX;
 	}
-	block_id_t GetBlockId();
-	uint32_t GetBlockIndex();
+	block_id_t GetBlockId() const;
+	uint32_t GetBlockIndex() const;
 
 	void Serialize(Serializer &serializer) const;
 	static MetaBlockPointer Deserialize(Deserializer &source);
package/src/duckdb/src/include/duckdb/storage/compression/bitpacking.hpp
CHANGED
@@ -12,14 +12,7 @@
 
 namespace duckdb {
 
-enum class BitpackingMode : uint8_t {
-	AUTO,
-
-	CONSTANT,
-	CONSTANT_DELTA,
-	DELTA_FOR,
-	FOR
-};
+enum class BitpackingMode : uint8_t { INVALID, AUTO, CONSTANT, CONSTANT_DELTA, DELTA_FOR, FOR };
 
 BitpackingMode BitpackingModeFromString(const string &str);
 string BitpackingModeToString(const BitpackingMode &mode);
package/src/duckdb/src/include/duckdb/storage/data_pointer.hpp
CHANGED
@@ -40,8 +40,8 @@ struct RowGroupPointer {
 	uint64_t tuple_count;
 	//! The data pointers of the column segments stored in the row group
 	vector<MetaBlockPointer> data_pointers;
-	//!
-
+	//! Data pointers to the delete information of the row group (if any)
+	vector<MetaBlockPointer> deletes_pointers;
 };
 
 } // namespace duckdb
package/src/duckdb/src/include/duckdb/storage/metadata/metadata_manager.hpp
CHANGED
@@ -64,6 +64,7 @@ public:
 	void Flush();
 
 	void MarkBlocksAsModified();
+	void ClearModifiedBlocks(const vector<MetaBlockPointer> &pointers);
 
 	idx_t BlockCount();
 
@@ -82,6 +83,7 @@ protected:
 
 	void AddBlock(MetadataBlock new_block, bool if_exists = false);
 	void AddAndRegisterBlock(MetadataBlock block);
+	void ConvertToTransient(MetadataBlock &block);
 };
 
 } // namespace duckdb
package/src/duckdb/src/include/duckdb/storage/metadata/metadata_reader.hpp
CHANGED
@@ -18,6 +18,7 @@ enum class BlockReaderType { EXISTING_BLOCKS, REGISTER_BLOCKS };
 class MetadataReader : public ReadStream {
 public:
 	MetadataReader(MetadataManager &manager, MetaBlockPointer pointer,
+	               optional_ptr<vector<MetaBlockPointer>> read_pointers = nullptr,
 	               BlockReaderType type = BlockReaderType::EXISTING_BLOCKS);
 	MetadataReader(MetadataManager &manager, BlockPointer pointer);
 	~MetadataReader() override;
@@ -46,6 +47,7 @@ private:
 	MetadataHandle block;
 	MetadataPointer next_pointer;
 	bool has_next_block;
+	optional_ptr<vector<MetaBlockPointer>> read_pointers;
 	idx_t index;
 	idx_t offset;
 	idx_t next_offset;
package/src/duckdb/src/include/duckdb/storage/metadata/metadata_writer.hpp
CHANGED
@@ -15,10 +15,10 @@ namespace duckdb {
 
 class MetadataWriter : public WriteStream {
 public:
+	explicit MetadataWriter(MetadataManager &manager,
+	                        optional_ptr<vector<MetaBlockPointer>> written_pointers = nullptr);
 	MetadataWriter(const MetadataWriter &) = delete;
 	MetadataWriter &operator=(const MetadataWriter &) = delete;
-
-	explicit MetadataWriter(MetadataManager &manager);
 	~MetadataWriter() override;
 
 public:
@@ -27,6 +27,9 @@ public:
 
 	BlockPointer GetBlockPointer();
 	MetaBlockPointer GetMetaBlockPointer();
+	MetadataManager &GetManager() {
+		return manager;
+	}
 
 protected:
 	virtual MetadataHandle NextHandle();
@@ -41,6 +44,7 @@ private:
 	MetadataManager &manager;
 	MetadataHandle block;
 	MetadataPointer current_pointer;
+	optional_ptr<vector<MetaBlockPointer>> written_pointers;
 	idx_t capacity;
 	idx_t offset;
 };
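Reader and writer can now record every metadata block pointer they touch; combined with ClearModifiedBlocks above, this presumably lets a checkpoint reclaim exactly the blocks a row group's delete info occupied. A standalone sketch of the opt-in tracking pattern (a plain pointer stands in for DuckDB's optional_ptr; names are illustrative):

#include <cstdint>
#include <cstdio>
#include <vector>

struct MetaBlockPointer {
	uint64_t block_pointer;
	uint32_t offset;
};

// a writer that optionally reports every block it writes, so the caller can
// later clear/reuse exactly those blocks
class TrackingWriter {
public:
	explicit TrackingWriter(std::vector<MetaBlockPointer> *written_pointers = nullptr)
	    : written_pointers(written_pointers) {
	}
	void WriteBlock(uint64_t block_id) {
		if (written_pointers) {
			written_pointers->push_back({block_id, 0});
		}
		// ... payload bytes would be written here ...
	}

private:
	std::vector<MetaBlockPointer> *written_pointers; // null: tracking disabled
};

int main() {
	std::vector<MetaBlockPointer> deletes_pointers;
	TrackingWriter writer(&deletes_pointers);
	writer.WriteBlock(42);
	writer.WriteBlock(43);
	printf("%zu blocks recorded for later cleanup\n", deletes_pointers.size()); // 2
}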
package/src/duckdb/src/include/duckdb/storage/storage_info.hpp
CHANGED
@@ -23,6 +23,25 @@ struct FileHandle;
 #error Row group size should be cleanly divisible by vector size
 #endif
 
+struct Storage {
+	//! The size of a hard disk sector, only really needed for Direct IO
+	constexpr static int SECTOR_SIZE = 4096;
+	//! Block header size for blocks written to the storage
+	constexpr static int BLOCK_HEADER_SIZE = sizeof(uint64_t);
+	// Size of a memory slot managed by the StorageManager. This is the quantum of allocation for Blocks on DuckDB. We
+	// default to 256KB. (1 << 18)
+	constexpr static int BLOCK_ALLOC_SIZE = 262144;
+	//! The actual memory space that is available within the blocks
+	constexpr static int BLOCK_SIZE = BLOCK_ALLOC_SIZE - BLOCK_HEADER_SIZE;
+	//! The size of the headers. This should be small and written more or less atomically by the hard disk. We default
+	//! to the page size, which is 4KB. (1 << 12)
+	constexpr static int FILE_HEADER_SIZE = 4096;
+	//! The number of rows per row group (must be a multiple of the vector size)
+	constexpr static const idx_t ROW_GROUP_SIZE = STANDARD_ROW_GROUPS_SIZE;
+	//! The number of vectors per row group
+	constexpr static const idx_t ROW_GROUP_VECTOR_COUNT = ROW_GROUP_SIZE / STANDARD_VECTOR_SIZE;
+};
+
 //! The version number of the database storage format
 extern const uint64_t VERSION_NUMBER;
 
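With the struct relocated here, Storage also picks up the row-group constants, which is what the Storage::ROW_GROUP_SIZE references in the parquet and insert hunks above resolve to. A standalone sketch of the derived values, assuming DuckDB's default build constants (STANDARD_ROW_GROUPS_SIZE = 122880, STANDARD_VECTOR_SIZE = 2048):

#include <cstdint>
#include <cstdio>

constexpr uint64_t STANDARD_ROW_GROUPS_SIZE = 122880; // default, build-configurable
constexpr uint64_t STANDARD_VECTOR_SIZE = 2048;       // default, build-configurable

struct Storage {
	constexpr static int BLOCK_HEADER_SIZE = sizeof(uint64_t); // 8 bytes
	constexpr static int BLOCK_ALLOC_SIZE = 262144;            // 256 KB allocation quantum
	constexpr static int BLOCK_SIZE = BLOCK_ALLOC_SIZE - BLOCK_HEADER_SIZE;
	constexpr static uint64_t ROW_GROUP_SIZE = STANDARD_ROW_GROUPS_SIZE;
	constexpr static uint64_t ROW_GROUP_VECTOR_COUNT = ROW_GROUP_SIZE / STANDARD_VECTOR_SIZE;
};

static_assert(Storage::ROW_GROUP_SIZE % STANDARD_VECTOR_SIZE == 0,
              "row group size should be cleanly divisible by vector size");

int main() {
	printf("usable bytes per block: %d\n", Storage::BLOCK_SIZE); // 262136
	printf("vectors per row group: %llu\n", (unsigned long long)Storage::ROW_GROUP_VECTOR_COUNT); // 60
}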
package/src/duckdb/src/include/duckdb/storage/table/chunk_info.hpp
CHANGED
@@ -46,8 +46,10 @@ public:
 	virtual void CommitAppend(transaction_t commit_id, idx_t start, idx_t end) = 0;
 	virtual idx_t GetCommittedDeletedCount(idx_t max_count) = 0;
 
-	virtual
-
+	virtual bool HasDeletes() const = 0;
+
+	virtual void Write(WriteStream &writer) const;
+	static unique_ptr<ChunkInfo> Read(ReadStream &reader);
 
 public:
 	template <class TARGET>
@@ -74,8 +76,8 @@ public:
 public:
 	explicit ChunkConstantInfo(idx_t start);
 
-
-
+	transaction_t insert_id;
+	transaction_t delete_id;
 
 public:
 	idx_t GetSelVector(TransactionData transaction, SelectionVector &sel_vector, idx_t max_count) override;
@@ -85,8 +87,10 @@ public:
 	void CommitAppend(transaction_t commit_id, idx_t start, idx_t end) override;
 	idx_t GetCommittedDeletedCount(idx_t max_count) override;
 
-
-
+	bool HasDeletes() const override;
+
+	void Write(WriteStream &writer) const override;
+	static unique_ptr<ChunkInfo> Read(ReadStream &reader);
 
 private:
 	template <class OP>
@@ -102,13 +106,13 @@ public:
 	explicit ChunkVectorInfo(idx_t start);
 
 	//! The transaction ids of the transactions that inserted the tuples (if any)
-
-
-
+	transaction_t inserted[STANDARD_VECTOR_SIZE];
+	transaction_t insert_id;
+	bool same_inserted_id;
 
 	//! The transaction ids of the transactions that deleted the tuples (if any)
-
-
+	transaction_t deleted[STANDARD_VECTOR_SIZE];
+	bool any_deleted;
 
 public:
 	idx_t GetSelVector(transaction_t start_time, transaction_t transaction_id, SelectionVector &sel_vector,
@@ -130,8 +134,10 @@ public:
 	idx_t Delete(transaction_t transaction_id, row_t rows[], idx_t count);
 	void CommitDelete(transaction_t commit_id, row_t rows[], idx_t count);
 
-
-
+	bool HasDeletes() const override;
+
+	void Write(WriteStream &writer) const override;
+	static unique_ptr<ChunkInfo> Read(ReadStream &reader);
 
 private:
 	template <class OP>
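ChunkInfo gains a virtual Write plus a static Read on both subclasses, the usual pattern for streaming a class hierarchy: each subclass writes a leading type tag, and the static Read re-dispatches on it. A condensed standalone sketch (toy stream, illustrative tag values and payloads; not DuckDB's actual WriteStream/ReadStream API):

#include <cstdint>
#include <cstdio>
#include <memory>
#include <stdexcept>
#include <vector>

enum class ChunkInfoType : uint8_t { CONSTANT_INFO, VECTOR_INFO, EMPTY_INFO };

struct Stream { // toy in-memory stand-in for WriteStream/ReadStream
	std::vector<uint8_t> bytes;
	size_t pos = 0;
	void WriteByte(uint8_t b) { bytes.push_back(b); }
	uint8_t ReadByte() { return bytes.at(pos++); }
};

struct ChunkInfo {
	virtual ~ChunkInfo() = default;
	virtual void Write(Stream &writer) const = 0; // subclass writes its type tag first
	static std::unique_ptr<ChunkInfo> Read(Stream &reader);
};

struct ChunkConstantInfo : ChunkInfo {
	void Write(Stream &writer) const override {
		writer.WriteByte(uint8_t(ChunkInfoType::CONSTANT_INFO));
		// ... payload (insert_id / delete_id) would follow here ...
	}
};

std::unique_ptr<ChunkInfo> ChunkInfo::Read(Stream &reader) {
	switch (ChunkInfoType(reader.ReadByte())) { // dispatch on the leading tag
	case ChunkInfoType::CONSTANT_INFO:
		return std::make_unique<ChunkConstantInfo>(); // then read its payload
	default:
		throw std::runtime_error("unsupported chunk info type");
	}
}

int main() {
	Stream s;
	ChunkConstantInfo{}.Write(s);
	auto info = ChunkInfo::Read(s);
	printf("round-trip ok: %d\n", info != nullptr); // 1
}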
package/src/duckdb/src/include/duckdb/storage/table/column_data.hpp
CHANGED
@@ -151,7 +151,7 @@ protected:
 	void AppendTransientSegment(SegmentLock &l, idx_t start_row);
 
 	//! Scans a base vector from the column
-	idx_t ScanVector(ColumnScanState &state, Vector &result, idx_t remaining);
+	idx_t ScanVector(ColumnScanState &state, Vector &result, idx_t remaining, bool has_updates);
 	//! Scans a vector from the column merged with any potential updates
 	//! If ALLOW_UPDATES is set to false, the function will instead throw an exception if any updates are found
 	template <bool SCAN_COMMITTED, bool ALLOW_UPDATES>