duckdb 0.8.2-dev3989.0 → 0.8.2-dev4126.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/binding.gyp +8 -7
- package/package.json +1 -1
- package/src/duckdb/extension/json/buffered_json_reader.cpp +76 -74
- package/src/duckdb/extension/json/include/buffered_json_reader.hpp +35 -32
- package/src/duckdb/extension/json/include/json_scan.hpp +9 -6
- package/src/duckdb/extension/json/json_scan.cpp +124 -121
- package/src/duckdb/extension/parquet/parquet_extension.cpp +23 -13
- package/src/duckdb/src/catalog/catalog_entry/duck_index_entry.cpp +5 -0
- package/src/duckdb/src/common/crypto/md5.cpp +2 -12
- package/src/duckdb/src/common/radix_partitioning.cpp +1 -1
- package/src/duckdb/src/common/sort/partition_state.cpp +5 -1
- package/src/duckdb/src/core_functions/aggregate/holistic/mode.cpp +1 -1
- package/src/duckdb/src/core_functions/function_list.cpp +8 -0
- package/src/duckdb/src/core_functions/scalar/list/list_cosine_similarity.cpp +78 -0
- package/src/duckdb/src/core_functions/scalar/list/list_distance.cpp +72 -0
- package/src/duckdb/src/core_functions/scalar/list/list_inner_product.cpp +70 -0
- package/src/duckdb/src/core_functions/scalar/string/sha256.cpp +32 -0
- package/src/duckdb/src/execution/index/art/art.cpp +111 -92
- package/src/duckdb/src/execution/index/art/iterator.cpp +21 -27
- package/src/duckdb/src/execution/index/art/leaf.cpp +72 -153
- package/src/duckdb/src/execution/index/art/node.cpp +109 -203
- package/src/duckdb/src/execution/index/art/node16.cpp +32 -64
- package/src/duckdb/src/execution/index/art/node256.cpp +38 -53
- package/src/duckdb/src/execution/index/art/node4.cpp +31 -62
- package/src/duckdb/src/execution/index/art/node48.cpp +43 -65
- package/src/duckdb/src/execution/index/art/prefix.cpp +70 -141
- package/src/duckdb/src/execution/index/fixed_size_allocator.cpp +345 -0
- package/src/duckdb/src/execution/index/fixed_size_buffer.cpp +74 -0
- package/src/duckdb/src/execution/operator/aggregate/physical_window.cpp +1 -1
- package/src/duckdb/src/execution/operator/schema/physical_create_art_index.cpp +1 -1
- package/src/duckdb/src/function/scalar/string/suffix.cpp +1 -1
- package/src/duckdb/src/function/table/system/duckdb_columns.cpp +3 -1
- package/src/duckdb/src/function/table/version/pragma_version.cpp +2 -2
- package/src/duckdb/src/include/duckdb/catalog/catalog_entry/duck_index_entry.hpp +2 -0
- package/src/duckdb/src/include/duckdb/common/optional_idx.hpp +1 -1
- package/src/duckdb/src/include/duckdb/core_functions/scalar/list_functions.hpp +51 -0
- package/src/duckdb/src/include/duckdb/core_functions/scalar/string_functions.hpp +9 -0
- package/src/duckdb/src/include/duckdb/execution/index/art/art.hpp +17 -7
- package/src/duckdb/src/include/duckdb/execution/index/art/iterator.hpp +5 -5
- package/src/duckdb/src/include/duckdb/execution/index/art/leaf.hpp +10 -16
- package/src/duckdb/src/include/duckdb/execution/index/art/node.hpp +38 -116
- package/src/duckdb/src/include/duckdb/execution/index/art/node16.hpp +17 -18
- package/src/duckdb/src/include/duckdb/execution/index/art/node256.hpp +17 -23
- package/src/duckdb/src/include/duckdb/execution/index/art/node4.hpp +17 -18
- package/src/duckdb/src/include/duckdb/execution/index/art/node48.hpp +17 -24
- package/src/duckdb/src/include/duckdb/execution/index/art/prefix.hpp +16 -22
- package/src/duckdb/src/include/duckdb/execution/index/fixed_size_allocator.hpp +126 -0
- package/src/duckdb/src/include/duckdb/execution/index/fixed_size_buffer.hpp +79 -0
- package/src/duckdb/src/include/duckdb/execution/index/index_pointer.hpp +96 -0
- package/src/duckdb/src/include/duckdb/parallel/task_scheduler.hpp +1 -1
- package/src/duckdb/src/include/duckdb/planner/operator/logical_join.hpp +1 -1
- package/src/duckdb/src/include/duckdb/storage/block.hpp +1 -1
- package/src/duckdb/src/include/duckdb/storage/index.hpp +10 -8
- package/src/duckdb/src/include/duckdb/storage/metadata/metadata_writer.hpp +3 -0
- package/src/duckdb/src/main/extension/extension_helper.cpp +15 -1
- package/src/duckdb/src/planner/binder/expression/bind_function_expression.cpp +14 -5
- package/src/duckdb/src/storage/checkpoint/table_data_writer.cpp +2 -3
- package/src/duckdb/src/storage/checkpoint_manager.cpp +16 -21
- package/src/duckdb/src/storage/data_table.cpp +3 -3
- package/src/duckdb/src/storage/index.cpp +7 -1
- package/src/duckdb/src/storage/metadata/metadata_manager.cpp +21 -21
- package/src/duckdb/src/storage/standard_buffer_manager.cpp +10 -16
- package/src/duckdb/src/storage/storage_info.cpp +1 -1
- package/src/duckdb/src/storage/table_index_list.cpp +1 -1
- package/src/duckdb/src/transaction/commit_state.cpp +5 -1
- package/src/duckdb/third_party/mbedtls/include/mbedtls_wrapper.hpp +4 -1
- package/src/duckdb/third_party/mbedtls/mbedtls_wrapper.cpp +24 -2
- package/src/duckdb/ub_extension_icu_third_party_icu_i18n.cpp +5 -5
- package/src/duckdb/ub_src_core_functions_scalar_list.cpp +6 -0
- package/src/duckdb/ub_src_core_functions_scalar_string.cpp +2 -0
- package/src/duckdb/ub_src_execution_index.cpp +4 -0
- package/src/duckdb/ub_src_execution_index_art.cpp +0 -2
- package/src/duckdb/src/execution/index/art/fixed_size_allocator.cpp +0 -238
- package/src/duckdb/src/include/duckdb/execution/index/art/fixed_size_allocator.hpp +0 -115
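Highlights of this range: the ART index's fixed-size allocator moves from src/execution/index/art/ into a shared src/execution/index/ home (joined by new fixed_size_buffer and index_pointer files), the JSON and Parquet scanners get parallel-scan reworks (shown in the hunks below), and four new scalar functions land: list_cosine_similarity, list_distance, list_inner_product, and sha256. A minimal sketch of exercising the new functions through the embedded C++ API; the function names come from the file list above, while their availability and exact semantics in this dev build are assumptions:

// Sketch: querying the new scalar functions through the embedded C++ API.
// Assumes this package version is built and linked; duckdb.hpp ships in the tree.
#include "duckdb.hpp"

int main() {
	duckdb::DuckDB db(nullptr); // in-memory database
	duckdb::Connection con(db);

	// New list similarity / distance functions (names per the file list above)
	con.Query("SELECT list_cosine_similarity([1.0, 2.0, 3.0], [2.0, 4.0, 6.0]) AS cos_sim, "
	          "       list_distance([1.0, 2.0], [4.0, 6.0]) AS dist, "
	          "       list_inner_product([1.0, 2.0], [3.0, 4.0]) AS dot")
	    ->Print();

	// New sha256 scalar function
	con.Query("SELECT sha256('duckdb') AS digest")->Print();
	return 0;
}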
package/src/duckdb/extension/json/json_scan.cpp

@@ -2,11 +2,11 @@
 
 #include "duckdb/common/enum_util.hpp"
 #include "duckdb/common/multi_file_reader.hpp"
+#include "duckdb/common/serializer/format_deserializer.hpp"
+#include "duckdb/common/serializer/format_serializer.hpp"
 #include "duckdb/main/extension_helper.hpp"
 #include "duckdb/parallel/task_scheduler.hpp"
 #include "duckdb/storage/buffer_manager.hpp"
-#include "duckdb/common/serializer/format_serializer.hpp"
-#include "duckdb/common/serializer/format_deserializer.hpp"
 
 namespace duckdb {
 
@@ -39,16 +39,16 @@ void JSONScanData::Bind(ClientContext &context, TableFunctionBindInput &input) {
 			maximum_object_size = MaxValue<idx_t>(UIntegerValue::Get(kv.second), maximum_object_size);
 		} else if (loption == "format") {
 			auto arg = StringUtil::Lower(StringValue::Get(kv.second));
-			static const auto format_options =
+			static const auto FORMAT_OPTIONS =
 			    case_insensitive_map_t<JSONFormat> {{"auto", JSONFormat::AUTO_DETECT},
 			                                        {"unstructured", JSONFormat::UNSTRUCTURED},
 			                                        {"newline_delimited", JSONFormat::NEWLINE_DELIMITED},
 			                                        {"nd", JSONFormat::NEWLINE_DELIMITED},
 			                                        {"array", JSONFormat::ARRAY}};
-			auto lookup = format_options.find(arg);
-			if (lookup == format_options.end()) {
+			auto lookup = FORMAT_OPTIONS.find(arg);
+			if (lookup == FORMAT_OPTIONS.end()) {
 				vector<string> valid_options;
-				for (auto &pair : format_options) {
+				for (auto &pair : FORMAT_OPTIONS) {
 					valid_options.push_back(StringUtil::Format("'%s'", pair.first));
 				}
 				throw BinderException("format must be one of [%s], not '%s'", StringUtil::Join(valid_options, ", "),
@@ -198,7 +198,8 @@ JSONScanGlobalState::JSONScanGlobalState(ClientContext &context, const JSONScanData &bind_data_p)
     : bind_data(bind_data_p), transform_options(bind_data.transform_options),
       allocator(BufferManager::GetBufferManager(context).GetBufferAllocator()),
       buffer_capacity(bind_data.maximum_object_size * 2), file_index(0), batch_index(0),
-      system_threads(TaskScheduler::GetScheduler(context).NumberOfThreads()) {
+      system_threads(TaskScheduler::GetScheduler(context).NumberOfThreads()),
+      enable_parallel_scans(bind_data.files.size() < system_threads) {
 }
 
 JSONScanLocalState::JSONScanLocalState(ClientContext &context, JSONScanGlobalState &gstate)
@@ -275,7 +276,7 @@ idx_t JSONGlobalTableFunctionState::MaxThreads() const {
 		return state.system_threads;
 	}
 
-	if (!state.json_readers.empty() && state.json_readers[0]->IsOpen()) {
+	if (!state.json_readers.empty() && state.json_readers[0]->HasFileHandle()) {
 		auto &reader = *state.json_readers[0];
 		if (reader.GetFormat() == JSONFormat::NEWLINE_DELIMITED) { // Auto-detected NDJSON
 			return state.system_threads;
@@ -291,7 +292,7 @@ JSONLocalTableFunctionState::JSONLocalTableFunctionState(ClientContext &context,
 }
 
 unique_ptr<LocalTableFunctionState> JSONLocalTableFunctionState::Init(ExecutionContext &context,
-                                                                      TableFunctionInitInput &input,
+                                                                      TableFunctionInitInput &,
                                                                       GlobalTableFunctionState *global_state) {
 	auto &gstate = global_state->Cast<JSONGlobalTableFunctionState>();
 	auto result = make_uniq<JSONLocalTableFunctionState>(context.client, gstate.state);
@@ -318,19 +319,24 @@ static inline void SkipWhitespace(const char *buffer_ptr, idx_t &buffer_offset,
 
 idx_t JSONScanLocalState::ReadNext(JSONScanGlobalState &gstate) {
 	allocator.Reset();
-
 	scan_count = 0;
-	if (buffer_offset == buffer_size) {
-		if (!ReadNextBuffer(gstate)) {
-			return scan_count;
-		}
-		if (current_buffer_handle->buffer_index != 0 &&
-		    current_reader->GetFormat() == JSONFormat::NEWLINE_DELIMITED) {
-			ReconstructFirstObject(gstate);
-			scan_count++;
+
+	// We have to wrap this in a loop otherwise we stop scanning too early when there's an empty JSON file
+	while (scan_count == 0) {
+		if (buffer_offset == buffer_size) {
+			if (!ReadNextBuffer(gstate)) {
+				break;
+			}
+			D_ASSERT(buffer_size != 0);
+			if (current_buffer_handle->buffer_index != 0 &&
+			    current_reader->GetFormat() == JSONFormat::NEWLINE_DELIMITED) {
+				ReconstructFirstObject();
+				scan_count++;
+			}
 		}
+
+		ParseNextChunk();
 	}
-	ParseNextChunk();
 
 	return scan_count;
 }
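The rewritten ReadNext above wraps buffer handling and parsing in a while (scan_count == 0) loop, per the new in-code comment: previously a single pass could yield zero rows for an empty JSON file and end the scan too early. A standalone sketch of the retry pattern, with illustrative names only:

// Illustrative retry loop: keep pulling buffers until a chunk yields rows or the
// input is exhausted, so a zero-row (empty) file cannot end the scan early.
#include <cstddef>
#include <vector>

struct Source {
	std::vector<std::size_t> chunk_sizes; // rows produced per parsed chunk
	std::size_t next = 0;
	bool NextBuffer() { return next < chunk_sizes.size(); }
	std::size_t ParseChunk() { return chunk_sizes[next++]; }
};

static std::size_t ReadNext(Source &src) {
	std::size_t scan_count = 0;
	while (scan_count == 0) { // re-enter on empty chunks (e.g., an empty file)
		if (!src.NextBuffer()) {
			break; // input exhausted
		}
		scan_count = src.ParseChunk();
	}
	return scan_count;
}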
@@ -349,7 +355,7 @@ static inline const char *PreviousNewline(const char *ptr) {
 	return ptr;
 }
 
-static inline const char *NextJSONDefault(const char *ptr, const idx_t size, const char *const end) {
+static inline const char *NextJSONDefault(const char *ptr, const char *const end) {
 	idx_t parents = 0;
 	while (ptr != end) {
 		switch (*ptr++) {
@@ -393,7 +399,7 @@ static inline const char *NextJSON(const char *ptr, const idx_t size) {
 	case '{':
 	case '[':
 	case '"':
-		ptr = NextJSONDefault(ptr, size, end);
+		ptr = NextJSONDefault(ptr, end);
 		break;
 	default:
 		// Special case: JSON array containing JSON without clear "parents", i.e., not obj/arr/str
@@ -482,18 +488,21 @@ void JSONScanLocalState::ThrowInvalidAtEndError() {
 	throw InvalidInputException("Invalid JSON detected at the end of file \"%s\".", current_reader->GetFileName());
 }
 
-bool JSONScanLocalState::IsParallel(JSONScanGlobalState &gstate) const {
-	if (bind_data.files.size() >= gstate.system_threads) {
-		// More files than threads, just parallelize over the files
-		return false;
+void JSONScanLocalState::TryIncrementFileIndex(JSONScanGlobalState &gstate) const {
+	lock_guard<mutex> guard(gstate.lock);
+	if (gstate.file_index < gstate.json_readers.size() &&
+	    current_reader.get() == gstate.json_readers[gstate.file_index].get()) {
+		gstate.file_index++;
 	}
+}
 
-	if (current_reader->GetFormat() == JSONFormat::NEWLINE_DELIMITED) {
-		// NDJSON can be read in parallel
-		return true;
+bool JSONScanLocalState::IsParallel(JSONScanGlobalState &gstate) const {
+	if (bind_data.files.size() >= gstate.system_threads) {
+		return false; // More files than threads, just parallelize over the files
 	}
 
-	return false;
+	// NDJSON can be read in parallel
+	return current_reader->GetFormat() == JSONFormat::NEWLINE_DELIMITED;
 }
 
 static pair<JSONFormat, JSONRecordType> DetectFormatAndRecordType(char *const buffer_ptr, const idx_t buffer_size,
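Two helpers are factored out here. TryIncrementFileIndex advances the shared file cursor only when this thread's reader is still the one at gstate.file_index, and IsParallel gates intra-file parallelism: with at least as many files as threads, each thread takes a whole file; otherwise threads may share one newline-delimited (NDJSON) file, the only format that splits safely at newlines. The decision in isolation (illustrative names, not the extension's API):

// Illustrative decision for intra-file parallelism, mirroring the logic above:
// share a file between threads only when files are scarce and the format is NDJSON.
#include <cstddef>

enum class Format { NewlineDelimited, Array, Unstructured };

static bool CanParallelizeWithinFile(std::size_t file_count, std::size_t thread_count, Format format) {
	if (file_count >= thread_count) {
		return false; // enough files to keep every thread busy on its own file
	}
	return format == Format::NewlineDelimited; // only NDJSON splits safely at newlines
}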
@@ -578,104 +587,107 @@ static pair<JSONFormat, JSONRecordType> DetectFormatAndRecordType(char *const buffer_ptr, const idx_t buffer_size,
 }
 
 bool JSONScanLocalState::ReadNextBuffer(JSONScanGlobalState &gstate) {
+	// First we make sure we have a buffer to read into
 	AllocatedData buffer;
+
+	// Try to re-use a buffer that was used before
 	if (current_reader) {
-		// Keep track of this for accurate errors
 		current_reader->SetBufferLineOrObjectCount(current_buffer_handle->buffer_index, lines_or_objects_in_buffer);
-
-		// Try to re-use existing buffer
 		if (current_buffer_handle && --current_buffer_handle->readers == 0) {
 			buffer = current_reader->RemoveBuffer(current_buffer_handle->buffer_index);
-		} else {
-			buffer = gstate.allocator.Allocate(gstate.buffer_capacity);
 		}
+	}
 
-		if (!is_last) {
-			if (current_reader->GetFormat() != JSONFormat::NEWLINE_DELIMITED) {
-				memcpy(buffer.get(), reconstruct_buffer.get(),
-				       prev_buffer_remainder); // Copy last bit of previous buffer
-			}
-		} else {
-			if (gstate.bind_data.type != JSONScanType::SAMPLE) {
-				current_reader->CloseJSONFile(); // Close files that are done if we're not sampling
-			}
-			current_reader = nullptr;
-		}
-	} else {
+	// If we cannot re-use a buffer we create a new one
+	if (!buffer.IsSet()) {
 		buffer = gstate.allocator.Allocate(gstate.buffer_capacity);
 	}
+
 	buffer_ptr = char_ptr_cast(buffer.get());
 
-	idx_t buffer_index;
+	// Copy last bit of previous buffer
+	if (current_reader && current_reader->GetFormat() != JSONFormat::NEWLINE_DELIMITED && !is_last) {
+		memcpy(buffer_ptr, reconstruct_buffer.get(), prev_buffer_remainder);
+	}
+
+	optional_idx buffer_index;
 	while (true) {
+		// Now we finish the current reader
 		if (current_reader) {
-			ReadNextBufferInternal(gstate, buffer_index);
-			if (buffer_size == 0) {
-				if (gstate.bind_data.type != JSONScanType::SAMPLE) {
+			// If we performed the final read of this reader in the previous iteration, close it now
+			if (is_last) {
+				if (gstate.bind_data.type != JSONScanType::SAMPLE) {
+					TryIncrementFileIndex(gstate);
 					current_reader->CloseJSONFile();
 				}
-				if (IsParallel(gstate)) {
-					// If this threads' current reader is still the one at gstate.file_index,
-					// this thread can end the parallel scan
-					lock_guard<mutex> guard(gstate.lock);
-					if (gstate.file_index < gstate.json_readers.size() &&
-					    current_reader == gstate.json_readers[gstate.file_index].get()) {
-						gstate.file_index++; // End parallel scan
-					}
-				}
 				current_reader = nullptr;
+				continue;
+			}
+
+			// Try to read
+			ReadNextBufferInternal(gstate, buffer_index);
+
+			// If this is the last read, end the parallel scan now so threads can move on
+			if (is_last && IsParallel(gstate)) {
+				TryIncrementFileIndex(gstate);
+			}
+
+			if (buffer_size == 0) {
+				// We didn't read anything, re-enter the loop
+				continue;
 			} else {
-				break; // We read something!
+				// We read something!
+				break;
 			}
 		}
 
-		// This thread needs a new reader
+		// If we got here, we don't have a reader (anymore). Try to get one
+		is_last = false;
 		{
 			lock_guard<mutex> guard(gstate.lock);
 			if (gstate.file_index == gstate.json_readers.size()) {
				return false; // No more files left
 			}
 
-			// Try the next reader
+			// Assign the next reader to this thread
 			current_reader = gstate.json_readers[gstate.file_index].get();
-			if (current_reader->IsOpen()) {
-				// Can only be open from auto detection, so these should be known
-				if (!IsParallel(gstate)) {
-					batch_index = gstate.batch_index++;
-					gstate.file_index++;
-				}
-				continue; // Re-enter the loop to start scanning the assigned file
-			}
 
-			current_reader->OpenJSONFile();
-			batch_index = gstate.batch_index++;
-			if (current_reader->GetFormat() != JSONFormat::AUTO_DETECT) {
-				if (!IsParallel(gstate)) {
-					gstate.file_index++;
-				}
-				continue;
+			// Open the file if it is not yet open
+			if (!current_reader->IsOpen()) {
+				current_reader->OpenJSONFile();
 			}
+			batch_index = gstate.batch_index++;
 
-			// Auto-detect format / record type
-			if (bind_data.files.size() < gstate.system_threads) {
-				// Auto-detect within the lock, so threads may join a parallel NDJSON scan
-				if (ReadAndAutoDetect(gstate, buffer_index, false)) {
-					continue;
+			// Auto-detect format / record type
+			if (gstate.enable_parallel_scans) {
+				// Auto-detect within the lock, so threads may join a parallel NDJSON scan
+				if (current_reader->GetFormat() == JSONFormat::AUTO_DETECT) {
+					ReadAndAutoDetect(gstate, buffer_index);
 				}
-				break;
+			} else {
+				gstate.file_index++; // Increment the file index before dropping lock so other threads move on
 			}
+		}
 
-			// More files than threads: increment the file index, then auto-detect outside of the lock
-			gstate.file_index++;
+		// If we didn't auto-detect within the lock, do it now
+		if (current_reader->GetFormat() == JSONFormat::AUTO_DETECT) {
+			ReadAndAutoDetect(gstate, buffer_index);
 		}
 
-		// Auto-detect outside of the lock
-		if (ReadAndAutoDetect(gstate, buffer_index, true)) {
+		// If we haven't already, increment the file index if non-parallel scan
+		if (gstate.enable_parallel_scans && !IsParallel(gstate)) {
+			TryIncrementFileIndex(gstate);
+		}
+
+		if (!buffer_index.IsValid() || buffer_size == 0) {
+			// If we didn't get a buffer index (because not auto-detecting), or the file was empty, just re-enter loop
 			continue;
 		}
+
 		break;
 	}
 	D_ASSERT(buffer_size != 0); // We should have read something if we got here
+	D_ASSERT(buffer_index.IsValid());
 
 	idx_t readers = 1;
 	if (current_reader->GetFormat() == JSONFormat::NEWLINE_DELIMITED) {
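Note the type change threaded through this function: the buffer index becomes an optional_idx (the header duckdb/common/optional_idx.hpp is also touched above), so "no buffer assigned yet" is explicit and checkable via IsValid() rather than tracked out of band. A minimal sketch of the idea, not DuckDB's actual implementation:

// Minimal optional-index sketch: the invalid sentinel is encapsulated so callers
// must check IsValid() before GetIndex(), instead of passing magic values around.
#include <cassert>
#include <cstdint>

class OptionalIdx {
	static constexpr uint64_t INVALID = UINT64_MAX;
	uint64_t index = INVALID;

public:
	OptionalIdx() = default;
	explicit OptionalIdx(uint64_t idx) : index(idx) {}
	bool IsValid() const { return index != INVALID; }
	uint64_t GetIndex() const {
		assert(IsValid()); // mirrors D_ASSERT(buffer_index.IsValid()) above
		return index;
	}
};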
@@ -683,9 +695,10 @@ bool JSONScanLocalState::ReadNextBuffer(JSONScanGlobalState &gstate) {
 	}
 
 	// Create an entry and insert it into the map
-	auto json_buffer_handle = make_uniq<JSONBufferHandle>(buffer_index, readers, std::move(buffer), buffer_size);
+	auto json_buffer_handle =
+	    make_uniq<JSONBufferHandle>(buffer_index.GetIndex(), readers, std::move(buffer), buffer_size);
 	current_buffer_handle = json_buffer_handle.get();
-	current_reader->InsertBuffer(buffer_index, std::move(json_buffer_handle));
+	current_reader->InsertBuffer(buffer_index.GetIndex(), std::move(json_buffer_handle));
 
 	prev_buffer_remainder = 0;
 	lines_or_objects_in_buffer = 0;
@@ -696,15 +709,11 @@ bool JSONScanLocalState::ReadNextBuffer(JSONScanGlobalState &gstate) {
 	return true;
 }
 
-bool JSONScanLocalState::ReadAndAutoDetect(JSONScanGlobalState &gstate, idx_t &buffer_index,
-                                           const bool already_incremented_file_idx) {
+void JSONScanLocalState::ReadAndAutoDetect(JSONScanGlobalState &gstate, optional_idx &buffer_index) {
 	// We have to detect the JSON format - hold the gstate lock while we do this
 	ReadNextBufferInternal(gstate, buffer_index);
 	if (buffer_size == 0) {
-		if (!already_incremented_file_idx) {
-			gstate.file_index++; // Empty file, move to the next one
-		}
-		return true;
+		return;
 	}
 
 	auto format_and_record_type = DetectFormatAndRecordType(buffer_ptr, buffer_size, allocator.GetYYAlc());
@@ -721,13 +730,9 @@ bool JSONScanLocalState::ReadAndAutoDetect(JSONScanGlobalState &gstate, idx_t &buffer_index,
 		throw InvalidInputException("Expected file \"%s\" to contain records, detected non-record JSON instead.",
 		                            current_reader->GetFileName());
 	}
-	if (!already_incremented_file_idx && !IsParallel(gstate)) {
-		gstate.file_index++;
-	}
-	return false;
 }
 
-void JSONScanLocalState::ReadNextBufferInternal(JSONScanGlobalState &gstate, idx_t &buffer_index) {
+void JSONScanLocalState::ReadNextBufferInternal(JSONScanGlobalState &gstate, optional_idx &buffer_index) {
 	if (current_reader->GetFileHandle().CanSeek()) {
 		ReadNextBufferSeek(gstate, buffer_index);
 	} else {
@@ -735,12 +740,12 @@ void JSONScanLocalState::ReadNextBufferInternal(JSONScanGlobalState &gstate, idx_t &buffer_index) {
 	}
 
 	buffer_offset = 0;
-	if (buffer_index == 0 && current_reader->GetFormat() == JSONFormat::ARRAY) {
+	if (buffer_index.GetIndex() == 0 && current_reader->GetFormat() == JSONFormat::ARRAY) {
 		SkipOverArrayStart();
 	}
 }
 
-void JSONScanLocalState::ReadNextBufferSeek(JSONScanGlobalState &gstate, idx_t &buffer_index) {
+void JSONScanLocalState::ReadNextBufferSeek(JSONScanGlobalState &gstate, optional_idx &buffer_index) {
 	auto &file_handle = current_reader->GetFileHandle();
 
 	idx_t request_size = gstate.buffer_capacity - prev_buffer_remainder - YYJSON_PADDING_SIZE;
@@ -758,13 +763,13 @@ void JSONScanLocalState::ReadNextBufferSeek(JSONScanGlobalState &gstate, idx_t &buffer_index) {
 			ThrowInvalidAtEndError();
 		}
 
-		if (current_reader->GetFormat() == JSONFormat::NEWLINE_DELIMITED) {
+		if (read_size != 0 && current_reader->GetFormat() == JSONFormat::NEWLINE_DELIMITED) {
 			batch_index = gstate.batch_index++;
 		}
 	}
 	buffer_size = prev_buffer_remainder + read_size;
 	if (buffer_size == 0) {
-		current_reader->SetBufferLineOrObjectCount(buffer_index, 0);
+		current_reader->SetBufferLineOrObjectCount(buffer_index.GetIndex(), 0);
 		return;
 	}
 
@@ -773,33 +778,33 @@ void JSONScanLocalState::ReadNextBufferNoSeek(JSONScanGlobalState &gstate, idx_t &buffer_index) {
 	                                                 gstate.bind_data.type == JSONScanType::SAMPLE);
 }
 
-void JSONScanLocalState::ReadNextBufferNoSeek(JSONScanGlobalState &gstate, idx_t &buffer_index) {
+void JSONScanLocalState::ReadNextBufferNoSeek(JSONScanGlobalState &gstate, optional_idx &buffer_index) {
 	idx_t request_size = gstate.buffer_capacity - prev_buffer_remainder - YYJSON_PADDING_SIZE;
 	idx_t read_size;
 	{
 		lock_guard<mutex> reader_guard(current_reader->lock);
 		buffer_index = current_reader->GetBufferIndex();
 
-		if (current_reader->IsOpen()) {
+		if (current_reader->HasFileHandle() && current_reader->IsOpen()) {
 			read_size = current_reader->GetFileHandle().Read(buffer_ptr + prev_buffer_remainder, request_size,
 			                                                 gstate.bind_data.type == JSONScanType::SAMPLE);
 			is_last = read_size < request_size;
 		} else {
 			read_size = 0;
-			is_last =
+			is_last = true;
 		}
 
 		if (!gstate.bind_data.ignore_errors && read_size == 0 && prev_buffer_remainder != 0) {
 			ThrowInvalidAtEndError();
 		}
 
-		if (current_reader->GetFormat() == JSONFormat::NEWLINE_DELIMITED) {
+		if (read_size != 0 && current_reader->GetFormat() == JSONFormat::NEWLINE_DELIMITED) {
 			batch_index = gstate.batch_index++;
 		}
 	}
 	buffer_size = prev_buffer_remainder + read_size;
 	if (buffer_size == 0) {
-		current_reader->SetBufferLineOrObjectCount(buffer_index, 0);
+		current_reader->SetBufferLineOrObjectCount(buffer_index.GetIndex(), 0);
 		return;
 	}
 }
@@ -833,7 +838,7 @@ void JSONScanLocalState::SkipOverArrayStart() {
 	}
 }
 
-void JSONScanLocalState::ReconstructFirstObject(JSONScanGlobalState &gstate) {
+void JSONScanLocalState::ReconstructFirstObject() {
 	D_ASSERT(current_buffer_handle->buffer_index != 0);
 	D_ASSERT(current_reader->GetFormat() == JSONFormat::NEWLINE_DELIMITED);
 
@@ -947,8 +952,7 @@ void JSONScanLocalState::ThrowTransformError(idx_t object_index, const string &error_message) {
 	current_reader->ThrowTransformError(current_buffer_handle->buffer_index, line_or_object_in_buffer, error_message);
 }
 
-double JSONScan::ScanProgress(ClientContext &context, const FunctionData *bind_data_p,
-                              const GlobalTableFunctionState *global_state) {
+double JSONScan::ScanProgress(ClientContext &, const FunctionData *, const GlobalTableFunctionState *global_state) {
 	auto &gstate = global_state->Cast<JSONGlobalTableFunctionState>().state;
 	double progress = 0;
 	for (auto &reader : gstate.json_readers) {
@@ -957,16 +961,16 @@ double JSONScan::ScanProgress(ClientContext &context, const FunctionData *bind_data_p,
 	return progress / double(gstate.json_readers.size());
 }
 
-idx_t JSONScan::GetBatchIndex(ClientContext &context, const FunctionData *bind_data_p,
-                              LocalTableFunctionState *local_state, GlobalTableFunctionState *global_state) {
+idx_t JSONScan::GetBatchIndex(ClientContext &, const FunctionData *, LocalTableFunctionState *local_state,
+                              GlobalTableFunctionState *) {
 	auto &lstate = local_state->Cast<JSONLocalTableFunctionState>();
 	return lstate.GetBatchIndex();
 }
 
-unique_ptr<NodeStatistics> JSONScan::Cardinality(ClientContext &context, const FunctionData *bind_data) {
+unique_ptr<NodeStatistics> JSONScan::Cardinality(ClientContext &, const FunctionData *bind_data) {
 	auto &data = bind_data->Cast<JSONScanData>();
 	idx_t per_file_cardinality;
-	if (data.initial_reader && data.initial_reader->IsOpen()) {
+	if (data.initial_reader && data.initial_reader->HasFileHandle()) {
 		per_file_cardinality = data.initial_reader->GetFileHandle().FileSize() / data.avg_tuple_size;
 	} else {
 		per_file_cardinality = 42; // The cardinality of an unknown JSON file is the almighty number 42
@@ -984,25 +988,24 @@ void JSONScan::ComplexFilterPushdown(ClientContext &context, LogicalGet &get, FunctionData *bind_data_p,
 	}
 }
 
-void JSONScan::Serialize(FieldWriter &writer, const FunctionData *bind_data_p, const TableFunction &function) {
+void JSONScan::Serialize(FieldWriter &writer, const FunctionData *bind_data_p, const TableFunction &) {
 	auto &bind_data = bind_data_p->Cast<JSONScanData>();
 	bind_data.Serialize(writer);
 }
 
-unique_ptr<FunctionData> JSONScan::Deserialize(PlanDeserializationState &state, FieldReader &reader,
-                                               TableFunction &function) {
+unique_ptr<FunctionData> JSONScan::Deserialize(PlanDeserializationState &state, FieldReader &reader, TableFunction &) {
 	auto result = make_uniq<JSONScanData>();
 	result->Deserialize(state.context, reader);
 	return std::move(result);
 }
 
 void JSONScan::FormatSerialize(FormatSerializer &serializer, const optional_ptr<FunctionData> bind_data_p,
-                               const TableFunction &function) {
+                               const TableFunction &) {
 	auto &bind_data = bind_data_p->Cast<JSONScanData>();
 	serializer.WriteProperty(100, "scan_data", &bind_data);
 }
 
-unique_ptr<FunctionData> JSONScan::FormatDeserialize(FormatDeserializer &deserializer, TableFunction &function) {
+unique_ptr<FunctionData> JSONScan::FormatDeserialize(FormatDeserializer &deserializer, TableFunction &) {
 	unique_ptr<JSONScanData> result;
 	deserializer.ReadProperty(100, "scan_data", result);
 	return std::move(result);
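The signature churn in the JSON scan's serialization and statistics entry points above follows one pattern: parameters that are never read lose their names, the standard C++ way to suppress unused-parameter warnings while keeping the signature intact. In isolation (hypothetical stand-in types, not DuckDB's):

// Before: 'function' is named but unused, so -Wunused-parameter fires on it.
// After: unnamed parameter, same signature and override contract, no warning.
struct TableFunctionStub {}; // illustrative type

static int BatchIndexBefore(TableFunctionStub &function, int local) {
	return local;
}

static int BatchIndexAfter(TableFunctionStub &, int local) {
	return local;
}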
package/src/duckdb/extension/parquet/parquet_extension.cpp

@@ -21,6 +21,8 @@
 #include "duckdb/common/field_writer.hpp"
 #include "duckdb/common/file_system.hpp"
 #include "duckdb/common/multi_file_reader.hpp"
+#include "duckdb/common/serializer/format_deserializer.hpp"
+#include "duckdb/common/serializer/format_serializer.hpp"
 #include "duckdb/common/types/chunk_collection.hpp"
 #include "duckdb/function/copy_function.hpp"
 #include "duckdb/function/table_function.hpp"
@@ -35,8 +37,6 @@
 #include "duckdb/planner/operator/logical_get.hpp"
 #include "duckdb/storage/statistics/base_statistics.hpp"
 #include "duckdb/storage/table/row_group.hpp"
-#include "duckdb/common/serializer/format_serializer.hpp"
-#include "duckdb/common/serializer/format_deserializer.hpp"
 
 #endif
 
@@ -78,6 +78,8 @@ struct ParquetReadLocalState : public LocalTableFunctionState {
 	DataChunk all_columns;
 };
 
+enum class ParquetFileState : uint8_t { UNOPENED, OPENING, OPEN, CLOSED };
+
 struct ParquetReadGlobalState : public GlobalTableFunctionState {
 	mutex lock;
 
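ParquetFileState replaces the old boolean "file is being opened" flag (see the following hunks): each file now carries an explicit lifecycle state, so threads can tell apart "nobody claimed this file yet", "another thread is opening it", "ready to scan", and "fully scanned and released". A sketch of the implied transitions (illustrative helper, not the extension's code):

// Lifecycle sketch: only forward transitions are legal.
#include <cstdint>

enum class ParquetFileState : uint8_t { UNOPENED, OPENING, OPEN, CLOSED };

static bool IsLegalTransition(ParquetFileState from, ParquetFileState to) {
	switch (from) {
	case ParquetFileState::UNOPENED:
		return to == ParquetFileState::OPENING; // a thread claims the file
	case ParquetFileState::OPENING:
		return to == ParquetFileState::OPEN;    // reader constructed successfully
	case ParquetFileState::OPEN:
		return to == ParquetFileState::CLOSED;  // all row groups handed out
	case ParquetFileState::CLOSED:
		return false;
	}
	return false;
}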
@@ -86,7 +88,7 @@ struct ParquetReadGlobalState : public GlobalTableFunctionState {
 	//! Currently opened readers
 	vector<shared_ptr<ParquetReader>> readers;
 	//! Flag to indicate a file is being opened
-	vector<bool> file_opening;
+	vector<ParquetFileState> file_states;
 	//! Mutexes to wait for a file that is currently being opened
 	unique_ptr<mutex[]> file_mutexes;
 	//! Signal to other threads that a file failed to open, letting every thread abort.
@@ -359,7 +361,7 @@ public:
 		auto &bind_data = input.bind_data->CastNoConst<ParquetReadBindData>();
 		auto result = make_uniq<ParquetReadGlobalState>();
 
-		result->file_opening = vector<bool>(bind_data.files.size(), false);
+		result->file_states = vector<ParquetFileState>(bind_data.files.size(), ParquetFileState::UNOPENED);
 		result->file_mutexes = unique_ptr<mutex[]>(new mutex[bind_data.files.size()]);
 		if (bind_data.files.empty()) {
 			result->initial_reader = nullptr;
@@ -367,6 +369,8 @@ public:
 			result->readers = std::move(bind_data.union_readers);
 			if (result->readers.size() != bind_data.files.size()) {
 				result->readers = vector<shared_ptr<ParquetReader>>(bind_data.files.size(), nullptr);
+			} else {
+				std::fill(result->file_states.begin(), result->file_states.end(), ParquetFileState::OPEN);
 			}
 			if (bind_data.initial_reader) {
 				result->initial_reader = std::move(bind_data.initial_reader);
@@ -378,6 +382,7 @@ public:
 				    make_shared<ParquetReader>(context, bind_data.files[0], bind_data.parquet_options);
 				result->readers[0] = result->initial_reader;
 			}
+			result->file_states[0] = ParquetFileState::OPEN;
 		}
 		for (auto &reader : result->readers) {
 			if (!reader) {
@@ -511,7 +516,7 @@ public:
 
 		D_ASSERT(parallel_state.initial_reader);
 
-		if (parallel_state.readers[parallel_state.file_index]) {
+		if (parallel_state.file_states[parallel_state.file_index] == ParquetFileState::OPEN) {
 			if (parallel_state.row_group_index <
 			    parallel_state.readers[parallel_state.file_index]->NumRowGroups()) {
 				// The current reader has rowgroups left to be scanned
@@ -523,12 +528,14 @@ public:
 				parallel_state.row_group_index++;
 				return true;
 			} else {
+				// Close current file
+				parallel_state.file_states[parallel_state.file_index] = ParquetFileState::CLOSED;
+				parallel_state.readers[parallel_state.file_index] = nullptr;
+
 				// Set state to the next file
 				parallel_state.file_index++;
 				parallel_state.row_group_index = 0;
 
-				parallel_state.readers[parallel_state.file_index - 1] = nullptr;
-
 				if (parallel_state.file_index >= bind_data.files.size()) {
 					return false;
 				}
@@ -541,8 +548,7 @@ public:
 		}
 
 		// Check if the current file is being opened, in that case we need to wait for it.
-		if (!parallel_state.readers[parallel_state.file_index] &&
-		    parallel_state.file_opening[parallel_state.file_index]) {
+		if (parallel_state.file_states[parallel_state.file_index] == ParquetFileState::OPENING) {
 			WaitForFile(parallel_state.file_index, parallel_state, parallel_lock);
 		}
 	}
@@ -573,7 +579,8 @@ public:
 			// - the thread opening the file has failed
 			// - the file was somehow scanned till the end while we were waiting
 			if (parallel_state.file_index >= parallel_state.readers.size() ||
-			    parallel_state.readers[parallel_state.file_index] || parallel_state.error_opening_file) {
+			    parallel_state.file_states[parallel_state.file_index] != ParquetFileState::OPENING ||
+			    parallel_state.error_opening_file) {
 				return;
 			}
 		}
@@ -583,10 +590,12 @@ public:
 	static bool TryOpenNextFile(ClientContext &context, const ParquetReadBindData &bind_data,
 	                            ParquetReadLocalState &scan_data, ParquetReadGlobalState &parallel_state,
 	                            unique_lock<mutex> &parallel_lock) {
-		for (idx_t i = parallel_state.file_index; i < bind_data.files.size(); i++) {
-			if (!parallel_state.readers[i] && !parallel_state.file_opening[i]) {
+		const auto num_threads = TaskScheduler::GetScheduler(context).NumberOfThreads();
+		const auto file_index_limit = MinValue<idx_t>(parallel_state.file_index + num_threads, bind_data.files.size());
+		for (idx_t i = parallel_state.file_index; i < file_index_limit; i++) {
+			if (parallel_state.file_states[i] == ParquetFileState::UNOPENED) {
 				string file = bind_data.files[i];
-				parallel_state.file_opening[i] = true;
+				parallel_state.file_states[i] = ParquetFileState::OPENING;
 				auto pq_options = parallel_state.initial_reader->parquet_options;
 
 				// Now we switch which lock we are holding, instead of locking the global state, we grab the lock on
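TryOpenNextFile previously could probe every remaining file; it now bounds its look-ahead to file_index + num_threads, so at most roughly one speculatively opened reader per thread sits ahead of the scan front. The bound in isolation (illustrative):

// The look-ahead bound: with T threads and scan front f over N files, only
// files in [f, min(f + T, N)) may be opened early.
#include <algorithm>
#include <cstddef>

static std::size_t FileIndexLimit(std::size_t file_index, std::size_t num_threads, std::size_t num_files) {
	return std::min(file_index + num_threads, num_files);
}
// e.g. f = 3, T = 8, N = 100 -> only files 3..10 are eligible, not all 97 remaining.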
@@ -611,6 +620,7 @@ public:
 			// Now re-lock the state and add the reader
 			parallel_lock.lock();
 			parallel_state.readers[i] = reader;
+			parallel_state.file_states[i] = ParquetFileState::OPEN;
 
 			return true;
 		}
package/src/duckdb/src/common/crypto/md5.cpp

@@ -20,6 +20,7 @@
 * will fill a supplied 16-byte array with the digest.
 */
 #include "duckdb/common/crypto/md5.hpp"
+#include "mbedtls_wrapper.hpp"
 
 namespace duckdb {
 
@@ -236,21 +237,10 @@ void MD5Context::Finish(data_ptr_t out_digest) {
 	memcpy(out_digest, buf, 16);
 }
 
-void MD5Context::DigestToBase16(const_data_ptr_t digest, char *zbuf) {
-	static char const HEX_CODES[] = "0123456789abcdef";
-	int i, j;
-
-	for (j = i = 0; i < 16; i++) {
-		int a = digest[i];
-		zbuf[j++] = HEX_CODES[(a >> 4) & 0xf];
-		zbuf[j++] = HEX_CODES[a & 0xf];
-	}
-}
-
 void MD5Context::FinishHex(char *out_digest) {
 	data_t digest[MD5_HASH_LENGTH_BINARY];
 	Finish(digest);
-	DigestToBase16(digest, out_digest);
+	duckdb_mbedtls::MbedTlsWrapper::ToBase16(reinterpret_cast<char *>(digest), out_digest, MD5_HASH_LENGTH_BINARY);
 }
 
 string MD5Context::FinishHex() {
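The hand-rolled DigestToBase16 is dropped in favor of a hex encoder shared through the mbedtls wrapper (see the mbedtls_wrapper.hpp/.cpp entries in the file list), so MD5 and the new sha256 format digests the same way. For reference, a minimal stand-in with the removed function's behavior; the wrapper call's signature is as it appears in the diff:

// Minimal equivalent of the removed DigestToBase16: lowercase hex encoding of
// `len` bytes into a caller-provided buffer of at least 2 * len chars.
#include <cstddef>

static void ToBase16(const unsigned char *in, char *out, std::size_t len) {
	static const char HEX_CODES[] = "0123456789abcdef";
	for (std::size_t i = 0; i < len; i++) {
		out[2 * i] = HEX_CODES[(in[i] >> 4) & 0xF];
		out[2 * i + 1] = HEX_CODES[in[i] & 0xF];
	}
}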
package/src/duckdb/src/common/radix_partitioning.cpp

@@ -26,7 +26,7 @@ public:
 };
 
 template <class OP, class RETURN_TYPE, typename... ARGS>
-RETURN_TYPE RadixBitsSwitch(idx_t radix_bits, ARGS &&...args) {
+RETURN_TYPE RadixBitsSwitch(idx_t radix_bits, ARGS &&... args) {
 	D_ASSERT(radix_bits <= RadixPartitioning::MAX_RADIX_BITS);
 	switch (radix_bits) {
 	case 0: