duckdb 0.8.2-dev3989.0 → 0.8.2-dev4126.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (74)
  1. package/binding.gyp +8 -7
  2. package/package.json +1 -1
  3. package/src/duckdb/extension/json/buffered_json_reader.cpp +76 -74
  4. package/src/duckdb/extension/json/include/buffered_json_reader.hpp +35 -32
  5. package/src/duckdb/extension/json/include/json_scan.hpp +9 -6
  6. package/src/duckdb/extension/json/json_scan.cpp +124 -121
  7. package/src/duckdb/extension/parquet/parquet_extension.cpp +23 -13
  8. package/src/duckdb/src/catalog/catalog_entry/duck_index_entry.cpp +5 -0
  9. package/src/duckdb/src/common/crypto/md5.cpp +2 -12
  10. package/src/duckdb/src/common/radix_partitioning.cpp +1 -1
  11. package/src/duckdb/src/common/sort/partition_state.cpp +5 -1
  12. package/src/duckdb/src/core_functions/aggregate/holistic/mode.cpp +1 -1
  13. package/src/duckdb/src/core_functions/function_list.cpp +8 -0
  14. package/src/duckdb/src/core_functions/scalar/list/list_cosine_similarity.cpp +78 -0
  15. package/src/duckdb/src/core_functions/scalar/list/list_distance.cpp +72 -0
  16. package/src/duckdb/src/core_functions/scalar/list/list_inner_product.cpp +70 -0
  17. package/src/duckdb/src/core_functions/scalar/string/sha256.cpp +32 -0
  18. package/src/duckdb/src/execution/index/art/art.cpp +111 -92
  19. package/src/duckdb/src/execution/index/art/iterator.cpp +21 -27
  20. package/src/duckdb/src/execution/index/art/leaf.cpp +72 -153
  21. package/src/duckdb/src/execution/index/art/node.cpp +109 -203
  22. package/src/duckdb/src/execution/index/art/node16.cpp +32 -64
  23. package/src/duckdb/src/execution/index/art/node256.cpp +38 -53
  24. package/src/duckdb/src/execution/index/art/node4.cpp +31 -62
  25. package/src/duckdb/src/execution/index/art/node48.cpp +43 -65
  26. package/src/duckdb/src/execution/index/art/prefix.cpp +70 -141
  27. package/src/duckdb/src/execution/index/fixed_size_allocator.cpp +345 -0
  28. package/src/duckdb/src/execution/index/fixed_size_buffer.cpp +74 -0
  29. package/src/duckdb/src/execution/operator/aggregate/physical_window.cpp +1 -1
  30. package/src/duckdb/src/execution/operator/schema/physical_create_art_index.cpp +1 -1
  31. package/src/duckdb/src/function/scalar/string/suffix.cpp +1 -1
  32. package/src/duckdb/src/function/table/system/duckdb_columns.cpp +3 -1
  33. package/src/duckdb/src/function/table/version/pragma_version.cpp +2 -2
  34. package/src/duckdb/src/include/duckdb/catalog/catalog_entry/duck_index_entry.hpp +2 -0
  35. package/src/duckdb/src/include/duckdb/common/optional_idx.hpp +1 -1
  36. package/src/duckdb/src/include/duckdb/core_functions/scalar/list_functions.hpp +51 -0
  37. package/src/duckdb/src/include/duckdb/core_functions/scalar/string_functions.hpp +9 -0
  38. package/src/duckdb/src/include/duckdb/execution/index/art/art.hpp +17 -7
  39. package/src/duckdb/src/include/duckdb/execution/index/art/iterator.hpp +5 -5
  40. package/src/duckdb/src/include/duckdb/execution/index/art/leaf.hpp +10 -16
  41. package/src/duckdb/src/include/duckdb/execution/index/art/node.hpp +38 -116
  42. package/src/duckdb/src/include/duckdb/execution/index/art/node16.hpp +17 -18
  43. package/src/duckdb/src/include/duckdb/execution/index/art/node256.hpp +17 -23
  44. package/src/duckdb/src/include/duckdb/execution/index/art/node4.hpp +17 -18
  45. package/src/duckdb/src/include/duckdb/execution/index/art/node48.hpp +17 -24
  46. package/src/duckdb/src/include/duckdb/execution/index/art/prefix.hpp +16 -22
  47. package/src/duckdb/src/include/duckdb/execution/index/fixed_size_allocator.hpp +126 -0
  48. package/src/duckdb/src/include/duckdb/execution/index/fixed_size_buffer.hpp +79 -0
  49. package/src/duckdb/src/include/duckdb/execution/index/index_pointer.hpp +96 -0
  50. package/src/duckdb/src/include/duckdb/parallel/task_scheduler.hpp +1 -1
  51. package/src/duckdb/src/include/duckdb/planner/operator/logical_join.hpp +1 -1
  52. package/src/duckdb/src/include/duckdb/storage/block.hpp +1 -1
  53. package/src/duckdb/src/include/duckdb/storage/index.hpp +10 -8
  54. package/src/duckdb/src/include/duckdb/storage/metadata/metadata_writer.hpp +3 -0
  55. package/src/duckdb/src/main/extension/extension_helper.cpp +15 -1
  56. package/src/duckdb/src/planner/binder/expression/bind_function_expression.cpp +14 -5
  57. package/src/duckdb/src/storage/checkpoint/table_data_writer.cpp +2 -3
  58. package/src/duckdb/src/storage/checkpoint_manager.cpp +16 -21
  59. package/src/duckdb/src/storage/data_table.cpp +3 -3
  60. package/src/duckdb/src/storage/index.cpp +7 -1
  61. package/src/duckdb/src/storage/metadata/metadata_manager.cpp +21 -21
  62. package/src/duckdb/src/storage/standard_buffer_manager.cpp +10 -16
  63. package/src/duckdb/src/storage/storage_info.cpp +1 -1
  64. package/src/duckdb/src/storage/table_index_list.cpp +1 -1
  65. package/src/duckdb/src/transaction/commit_state.cpp +5 -1
  66. package/src/duckdb/third_party/mbedtls/include/mbedtls_wrapper.hpp +4 -1
  67. package/src/duckdb/third_party/mbedtls/mbedtls_wrapper.cpp +24 -2
  68. package/src/duckdb/ub_extension_icu_third_party_icu_i18n.cpp +5 -5
  69. package/src/duckdb/ub_src_core_functions_scalar_list.cpp +6 -0
  70. package/src/duckdb/ub_src_core_functions_scalar_string.cpp +2 -0
  71. package/src/duckdb/ub_src_execution_index.cpp +4 -0
  72. package/src/duckdb/ub_src_execution_index_art.cpp +0 -2
  73. package/src/duckdb/src/execution/index/art/fixed_size_allocator.cpp +0 -238
  74. package/src/duckdb/src/include/duckdb/execution/index/art/fixed_size_allocator.hpp +0 -115
package/src/duckdb/extension/json/json_scan.cpp

@@ -2,11 +2,11 @@
 
 #include "duckdb/common/enum_util.hpp"
 #include "duckdb/common/multi_file_reader.hpp"
+#include "duckdb/common/serializer/format_deserializer.hpp"
+#include "duckdb/common/serializer/format_serializer.hpp"
 #include "duckdb/main/extension_helper.hpp"
 #include "duckdb/parallel/task_scheduler.hpp"
 #include "duckdb/storage/buffer_manager.hpp"
-#include "duckdb/common/serializer/format_serializer.hpp"
-#include "duckdb/common/serializer/format_deserializer.hpp"
 
 namespace duckdb {
 
@@ -39,16 +39,16 @@ void JSONScanData::Bind(ClientContext &context, TableFunctionBindInput &input) {
         maximum_object_size = MaxValue<idx_t>(UIntegerValue::Get(kv.second), maximum_object_size);
     } else if (loption == "format") {
         auto arg = StringUtil::Lower(StringValue::Get(kv.second));
-        static const auto format_options =
+        static const auto FORMAT_OPTIONS =
             case_insensitive_map_t<JSONFormat> {{"auto", JSONFormat::AUTO_DETECT},
                                                 {"unstructured", JSONFormat::UNSTRUCTURED},
                                                 {"newline_delimited", JSONFormat::NEWLINE_DELIMITED},
                                                 {"nd", JSONFormat::NEWLINE_DELIMITED},
                                                 {"array", JSONFormat::ARRAY}};
-        auto lookup = format_options.find(arg);
-        if (lookup == format_options.end()) {
+        auto lookup = FORMAT_OPTIONS.find(arg);
+        if (lookup == FORMAT_OPTIONS.end()) {
             vector<string> valid_options;
-            for (auto &pair : format_options) {
+            for (auto &pair : FORMAT_OPTIONS) {
                 valid_options.push_back(StringUtil::Format("'%s'", pair.first));
             }
             throw BinderException("format must be one of [%s], not '%s'", StringUtil::Join(valid_options, ", "),
@@ -198,7 +198,8 @@ JSONScanGlobalState::JSONScanGlobalState(ClientContext &context, const JSONScanD
     : bind_data(bind_data_p), transform_options(bind_data.transform_options),
       allocator(BufferManager::GetBufferManager(context).GetBufferAllocator()),
       buffer_capacity(bind_data.maximum_object_size * 2), file_index(0), batch_index(0),
-      system_threads(TaskScheduler::GetScheduler(context).NumberOfThreads()) {
+      system_threads(TaskScheduler::GetScheduler(context).NumberOfThreads()),
+      enable_parallel_scans(bind_data.files.size() < system_threads) {
 }
 
 JSONScanLocalState::JSONScanLocalState(ClientContext &context, JSONScanGlobalState &gstate)
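The new enable_parallel_scans member bakes the scheduling heuristic into the global state at construction time: parallelizing within a file only pays off when there are fewer files than worker threads, since otherwise each thread can own a file outright. A minimal sketch of the decision, with illustrative names rather than the real members:

    #include <cstddef>

    // Illustrative only: fewer files than threads means threads must share
    // files, so intra-file (parallel NDJSON) scanning is worth enabling.
    static bool EnableParallelScans(std::size_t file_count, std::size_t thread_count) {
        return file_count < thread_count;
    }

Computing this once up front also means every thread sees a consistent answer instead of re-deriving it from mutable state.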
@@ -275,7 +276,7 @@ idx_t JSONGlobalTableFunctionState::MaxThreads() const {
         return state.system_threads;
     }
 
-    if (!state.json_readers.empty() && state.json_readers[0]->IsOpen()) {
+    if (!state.json_readers.empty() && state.json_readers[0]->HasFileHandle()) {
         auto &reader = *state.json_readers[0];
         if (reader.GetFormat() == JSONFormat::NEWLINE_DELIMITED) { // Auto-detected NDJSON
             return state.system_threads;
@@ -291,7 +292,7 @@ JSONLocalTableFunctionState::JSONLocalTableFunctionState(ClientContext &context,
 }
 
 unique_ptr<LocalTableFunctionState> JSONLocalTableFunctionState::Init(ExecutionContext &context,
-                                                                      TableFunctionInitInput &input,
+                                                                      TableFunctionInitInput &,
                                                                       GlobalTableFunctionState *global_state) {
     auto &gstate = global_state->Cast<JSONGlobalTableFunctionState>();
     auto result = make_uniq<JSONLocalTableFunctionState>(context.client, gstate.state);
@@ -318,19 +319,24 @@ static inline void SkipWhitespace(const char *buffer_ptr, idx_t &buffer_offset,
 
 idx_t JSONScanLocalState::ReadNext(JSONScanGlobalState &gstate) {
     allocator.Reset();
-
     scan_count = 0;
-    if (buffer_offset == buffer_size) {
-        if (!ReadNextBuffer(gstate)) {
-            return scan_count;
-        }
-        D_ASSERT(buffer_size != 0);
-        if (current_buffer_handle->buffer_index != 0 && current_reader->GetFormat() == JSONFormat::NEWLINE_DELIMITED) {
-            ReconstructFirstObject(gstate);
-            scan_count++;
+
+    // We have to wrap this in a loop otherwise we stop scanning too early when there's an empty JSON file
+    while (scan_count == 0) {
+        if (buffer_offset == buffer_size) {
+            if (!ReadNextBuffer(gstate)) {
+                break;
+            }
+            D_ASSERT(buffer_size != 0);
+            if (current_buffer_handle->buffer_index != 0 &&
+                current_reader->GetFormat() == JSONFormat::NEWLINE_DELIMITED) {
+                ReconstructFirstObject();
+                scan_count++;
+            }
         }
+
+        ParseNextChunk();
     }
-    ParseNextChunk();
 
     return scan_count;
 }
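The rewritten ReadNext wraps the single-shot read in a loop, so an empty JSON file triggers another fetch instead of ending the scan early. A stripped-down sketch of the pattern, using stand-in types rather than the actual scanner classes:

    #include <cstddef>
    #include <vector>

    // Stand-in source for illustration; not the DuckDB scanner classes.
    struct Source {
        std::vector<std::size_t> buffer_yields; // tuples per buffer; 0 models an empty file
        std::size_t next = 0;
        bool FetchNextBuffer() { return next < buffer_yields.size(); }
        std::size_t ParseBuffer() { return buffer_yields[next++]; }
    };

    // Keep fetching until we produce output or truly run out of input;
    // a buffer that yields zero items must not be mistaken for end-of-scan.
    std::size_t ScanUntilProgress(Source &src) {
        std::size_t produced = 0;
        while (produced == 0) {
            if (!src.FetchNextBuffer()) {
                break;
            }
            produced = src.ParseBuffer();
        }
        return produced;
    }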
@@ -349,7 +355,7 @@ static inline const char *PreviousNewline(const char *ptr) {
     return ptr;
 }
 
-static inline const char *NextJSONDefault(const char *ptr, const idx_t size, const char *const end) {
+static inline const char *NextJSONDefault(const char *ptr, const char *const end) {
     idx_t parents = 0;
     while (ptr != end) {
         switch (*ptr++) {
@@ -393,7 +399,7 @@ static inline const char *NextJSON(const char *ptr, const idx_t size) {
     case '{':
     case '[':
     case '"':
-        ptr = NextJSONDefault(ptr, size, end);
+        ptr = NextJSONDefault(ptr, end);
         break;
     default:
         // Special case: JSON array containing JSON without clear "parents", i.e., not obj/arr/str
@@ -482,18 +488,21 @@ void JSONScanLocalState::ThrowInvalidAtEndError() {
     throw InvalidInputException("Invalid JSON detected at the end of file \"%s\".", current_reader->GetFileName());
 }
 
-bool JSONScanLocalState::IsParallel(JSONScanGlobalState &gstate) const {
-    if (bind_data.files.size() >= gstate.system_threads) {
-        // More files than threads, just parallelize over the files
-        return false;
+void JSONScanLocalState::TryIncrementFileIndex(JSONScanGlobalState &gstate) const {
+    lock_guard<mutex> guard(gstate.lock);
+    if (gstate.file_index < gstate.json_readers.size() &&
+        current_reader.get() == gstate.json_readers[gstate.file_index].get()) {
+        gstate.file_index++;
     }
+}
 
-    if (current_reader->GetFormat() == JSONFormat::NEWLINE_DELIMITED) {
-        // NDJSON can be read in parallel
-        return true;
+bool JSONScanLocalState::IsParallel(JSONScanGlobalState &gstate) const {
+    if (bind_data.files.size() >= gstate.system_threads) {
+        return false; // More files than threads, just parallelize over the files
     }
 
-    return false;
+    // NDJSON can be read in parallel
+    return current_reader->GetFormat() == JSONFormat::NEWLINE_DELIMITED;
 }
 
 static pair<JSONFormat, JSONRecordType> DetectFormatAndRecordType(char *const buffer_ptr, const idx_t buffer_size,
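TryIncrementFileIndex factors the end-of-scan bookkeeping into one guarded check: the shared file index only advances if this thread's finished reader is still the current one, so two threads finishing the same file cannot advance it twice. A condensed sketch of the idiom, with illustrative types:

    #include <cstddef>
    #include <memory>
    #include <mutex>
    #include <vector>

    // Illustrative shared state; the real code guards gstate.file_index.
    struct SharedState {
        std::mutex lock;
        std::size_t file_index = 0;
        std::vector<std::shared_ptr<int>> readers;
    };

    // Advance the shared index only if `mine` is still the current reader; a
    // second thread that finished the same file sees a mismatch and does nothing.
    void TryIncrementFileIndex(SharedState &state, const int *mine) {
        std::lock_guard<std::mutex> guard(state.lock);
        if (state.file_index < state.readers.size() &&
            state.readers[state.file_index].get() == mine) {
            state.file_index++;
        }
    }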
@@ -578,104 +587,107 @@ static pair<JSONFormat, JSONRecordType> DetectFormatAndRecordType(char *const bu
 }
 
 bool JSONScanLocalState::ReadNextBuffer(JSONScanGlobalState &gstate) {
+    // First we make sure we have a buffer to read into
     AllocatedData buffer;
+
+    // Try to re-use a buffer that was used before
     if (current_reader) {
-        // Keep track of this for accurate errors
         current_reader->SetBufferLineOrObjectCount(current_buffer_handle->buffer_index, lines_or_objects_in_buffer);
-
-        // Try to re-use existing buffer
         if (current_buffer_handle && --current_buffer_handle->readers == 0) {
             buffer = current_reader->RemoveBuffer(current_buffer_handle->buffer_index);
-        } else {
-            buffer = gstate.allocator.Allocate(gstate.buffer_capacity);
         }
+    }
 
-        if (!is_last) {
-            if (current_reader->GetFormat() != JSONFormat::NEWLINE_DELIMITED) {
-                memcpy(buffer.get(), reconstruct_buffer.get(),
-                       prev_buffer_remainder); // Copy last bit of previous buffer
-            }
-        } else {
-            if (gstate.bind_data.type != JSONScanType::SAMPLE) {
-                current_reader->CloseJSONFile(); // Close files that are done if we're not sampling
-            }
-            current_reader = nullptr;
-        }
-    } else {
+    // If we cannot re-use a buffer we create a new one
+    if (!buffer.IsSet()) {
         buffer = gstate.allocator.Allocate(gstate.buffer_capacity);
     }
+
     buffer_ptr = char_ptr_cast(buffer.get());
 
-    idx_t buffer_index;
+    // Copy last bit of previous buffer
+    if (current_reader && current_reader->GetFormat() != JSONFormat::NEWLINE_DELIMITED && !is_last) {
+        memcpy(buffer_ptr, reconstruct_buffer.get(), prev_buffer_remainder);
+    }
+
+    optional_idx buffer_index;
     while (true) {
+        // Now we finish the current reader
         if (current_reader) {
-            ReadNextBufferInternal(gstate, buffer_index);
-            if (buffer_size == 0) {
-                if (is_last && gstate.bind_data.type != JSONScanType::SAMPLE) {
+            // If we performed the final read of this reader in the previous iteration, close it now
+            if (is_last) {
+                if (gstate.bind_data.type != JSONScanType::SAMPLE) {
+                    TryIncrementFileIndex(gstate);
                     current_reader->CloseJSONFile();
                 }
-                if (IsParallel(gstate)) {
-                    // If this threads' current reader is still the one at gstate.file_index,
-                    // this thread can end the parallel scan
-                    lock_guard<mutex> guard(gstate.lock);
-                    if (gstate.file_index < gstate.json_readers.size() &&
-                        current_reader == gstate.json_readers[gstate.file_index].get()) {
-                        gstate.file_index++; // End parallel scan
-                    }
-                }
                 current_reader = nullptr;
+                continue;
+            }
+
+            // Try to read
+            ReadNextBufferInternal(gstate, buffer_index);
+
+            // If this is the last read, end the parallel scan now so threads can move on
+            if (is_last && IsParallel(gstate)) {
+                TryIncrementFileIndex(gstate);
+            }
+
+            if (buffer_size == 0) {
+                // We didn't read anything, re-enter the loop
+                continue;
             } else {
-                break; // We read something!
+                // We read something!
+                break;
             }
         }
 
-        // This thread needs a new reader
+        // If we got here, we don't have a reader (anymore). Try to get one
+        is_last = false;
        {
             lock_guard<mutex> guard(gstate.lock);
             if (gstate.file_index == gstate.json_readers.size()) {
                 return false; // No more files left
             }
 
-            // Try the next reader
+            // Assign the next reader to this thread
             current_reader = gstate.json_readers[gstate.file_index].get();
-            if (current_reader->IsOpen()) {
-                // Can only be open from auto detection, so these should be known
-                if (!IsParallel(gstate)) {
-                    batch_index = gstate.batch_index++;
-                    gstate.file_index++;
-                }
-                continue; // Re-enter the loop to start scanning the assigned file
-            }
 
-            current_reader->OpenJSONFile();
-            batch_index = gstate.batch_index++;
-            if (current_reader->GetFormat() != JSONFormat::AUTO_DETECT) {
-                if (!IsParallel(gstate)) {
-                    gstate.file_index++;
-                }
-                continue;
+            // Open the file if it is not yet open
+            if (!current_reader->IsOpen()) {
+                current_reader->OpenJSONFile();
             }
+            batch_index = gstate.batch_index++;
 
-            // If we have less files than threads, we auto-detect within the lock,
-            // so other threads may join a parallel NDJSON scan
-            if (gstate.json_readers.size() < gstate.system_threads) {
-                if (ReadAndAutoDetect(gstate, buffer_index, false)) {
-                    continue;
+            // Auto-detect format / record type
+            if (gstate.enable_parallel_scans) {
+                // Auto-detect within the lock, so threads may join a parallel NDJSON scan
+                if (current_reader->GetFormat() == JSONFormat::AUTO_DETECT) {
+                    ReadAndAutoDetect(gstate, buffer_index);
                 }
-                break;
+            } else {
+                gstate.file_index++; // Increment the file index before dropping lock so other threads move on
             }
+        }
 
-            // Increment the file index within the lock, then read/auto-detect outside of the lock
-            gstate.file_index++;
+        // If we didn't auto-detect within the lock, do it now
+        if (current_reader->GetFormat() == JSONFormat::AUTO_DETECT) {
+            ReadAndAutoDetect(gstate, buffer_index);
        }
 
-        // High amount of files, just do 1 thread per file
-        if (ReadAndAutoDetect(gstate, buffer_index, true)) {
+        // If we haven't already, increment the file index if non-parallel scan
+        if (gstate.enable_parallel_scans && !IsParallel(gstate)) {
+            TryIncrementFileIndex(gstate);
+        }
+
+        if (!buffer_index.IsValid() || buffer_size == 0) {
+            // If we didn't get a buffer index (because not auto-detecting), or the file was empty, just re-enter loop
             continue;
         }
+
         break;
     }
     D_ASSERT(buffer_size != 0); // We should have read something if we got here
+    D_ASSERT(buffer_index.IsValid());
 
     idx_t readers = 1;
     if (current_reader->GetFormat() == JSONFormat::NEWLINE_DELIMITED) {
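Switching buffer_index from a raw idx_t to optional_idx makes "no buffer was assigned yet" a representable, assertable state; the D_ASSERT(buffer_index.IsValid()) above replaces reads of a possibly uninitialized integer. A simplified illustration of such a wrapper (DuckDB's real optional_idx is declared in duckdb/common/optional_idx.hpp; this sketch is not its exact implementation):

    #include <cassert>
    #include <cstdint>

    // Simplified sketch: a sentinel bit pattern encodes "unset".
    class OptionalIdx {
        static constexpr uint64_t INVALID = ~uint64_t(0);
        uint64_t index = INVALID;

    public:
        OptionalIdx() = default;
        explicit OptionalIdx(uint64_t idx) : index(idx) {}
        bool IsValid() const { return index != INVALID; }
        uint64_t GetIndex() const {
            assert(IsValid()); // catches reads of an index that was never set
            return index;
        }
    };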
@@ -683,9 +695,10 @@ bool JSONScanLocalState::ReadNextBuffer(JSONScanGlobalState &gstate) {
     }
 
     // Create an entry and insert it into the map
-    auto json_buffer_handle = make_uniq<JSONBufferHandle>(buffer_index, readers, std::move(buffer), buffer_size);
+    auto json_buffer_handle =
+        make_uniq<JSONBufferHandle>(buffer_index.GetIndex(), readers, std::move(buffer), buffer_size);
     current_buffer_handle = json_buffer_handle.get();
-    current_reader->InsertBuffer(buffer_index, std::move(json_buffer_handle));
+    current_reader->InsertBuffer(buffer_index.GetIndex(), std::move(json_buffer_handle));
 
     prev_buffer_remainder = 0;
     lines_or_objects_in_buffer = 0;
@@ -696,15 +709,11 @@
     return true;
 }
 
-bool JSONScanLocalState::ReadAndAutoDetect(JSONScanGlobalState &gstate, idx_t &buffer_index,
-                                           const bool already_incremented_file_idx) {
+void JSONScanLocalState::ReadAndAutoDetect(JSONScanGlobalState &gstate, optional_idx &buffer_index) {
     // We have to detect the JSON format - hold the gstate lock while we do this
     ReadNextBufferInternal(gstate, buffer_index);
     if (buffer_size == 0) {
-        if (!already_incremented_file_idx) {
-            gstate.file_index++; // Empty file, move to the next one
-        }
-        return true;
+        return;
     }
 
     auto format_and_record_type = DetectFormatAndRecordType(buffer_ptr, buffer_size, allocator.GetYYAlc());
@@ -721,13 +730,9 @@
         throw InvalidInputException("Expected file \"%s\" to contain records, detected non-record JSON instead.",
                                     current_reader->GetFileName());
     }
-    if (!already_incremented_file_idx && !IsParallel(gstate)) {
-        gstate.file_index++;
-    }
-    return false;
 }
 
-void JSONScanLocalState::ReadNextBufferInternal(JSONScanGlobalState &gstate, idx_t &buffer_index) {
+void JSONScanLocalState::ReadNextBufferInternal(JSONScanGlobalState &gstate, optional_idx &buffer_index) {
     if (current_reader->GetFileHandle().CanSeek()) {
         ReadNextBufferSeek(gstate, buffer_index);
     } else {
@@ -735,12 +740,12 @@
     }
 
     buffer_offset = 0;
-    if (buffer_index == 0 && current_reader->GetFormat() == JSONFormat::ARRAY) {
+    if (buffer_index.GetIndex() == 0 && current_reader->GetFormat() == JSONFormat::ARRAY) {
         SkipOverArrayStart();
     }
 }
 
-void JSONScanLocalState::ReadNextBufferSeek(JSONScanGlobalState &gstate, idx_t &buffer_index) {
+void JSONScanLocalState::ReadNextBufferSeek(JSONScanGlobalState &gstate, optional_idx &buffer_index) {
     auto &file_handle = current_reader->GetFileHandle();
 
     idx_t request_size = gstate.buffer_capacity - prev_buffer_remainder - YYJSON_PADDING_SIZE;
@@ -758,13 +763,13 @@
             ThrowInvalidAtEndError();
         }
 
-        if (current_reader->GetFormat() == JSONFormat::NEWLINE_DELIMITED) {
+        if (read_size != 0 && current_reader->GetFormat() == JSONFormat::NEWLINE_DELIMITED) {
             batch_index = gstate.batch_index++;
         }
     }
     buffer_size = prev_buffer_remainder + read_size;
     if (buffer_size == 0) {
-        current_reader->SetBufferLineOrObjectCount(buffer_index, 0);
+        current_reader->SetBufferLineOrObjectCount(buffer_index.GetIndex(), 0);
         return;
     }
 
@@ -773,33 +778,33 @@
                                                      gstate.bind_data.type == JSONScanType::SAMPLE);
 }
 
-void JSONScanLocalState::ReadNextBufferNoSeek(JSONScanGlobalState &gstate, idx_t &buffer_index) {
+void JSONScanLocalState::ReadNextBufferNoSeek(JSONScanGlobalState &gstate, optional_idx &buffer_index) {
     idx_t request_size = gstate.buffer_capacity - prev_buffer_remainder - YYJSON_PADDING_SIZE;
     idx_t read_size;
     {
         lock_guard<mutex> reader_guard(current_reader->lock);
         buffer_index = current_reader->GetBufferIndex();
 
-        if (current_reader->IsOpen() && !current_reader->IsDone()) {
+        if (current_reader->HasFileHandle() && current_reader->IsOpen()) {
             read_size = current_reader->GetFileHandle().Read(buffer_ptr + prev_buffer_remainder, request_size,
                                                              gstate.bind_data.type == JSONScanType::SAMPLE);
             is_last = read_size < request_size;
         } else {
             read_size = 0;
-            is_last = false;
+            is_last = true;
         }
 
         if (!gstate.bind_data.ignore_errors && read_size == 0 && prev_buffer_remainder != 0) {
            ThrowInvalidAtEndError();
        }
 
-        if (current_reader->GetFormat() == JSONFormat::NEWLINE_DELIMITED) {
+        if (read_size != 0 && current_reader->GetFormat() == JSONFormat::NEWLINE_DELIMITED) {
            batch_index = gstate.batch_index++;
        }
     }
     buffer_size = prev_buffer_remainder + read_size;
     if (buffer_size == 0) {
-        current_reader->SetBufferLineOrObjectCount(buffer_index, 0);
+        current_reader->SetBufferLineOrObjectCount(buffer_index.GetIndex(), 0);
         return;
     }
 }
@@ -833,7 +838,7 @@ void JSONScanLocalState::SkipOverArrayStart() {
     }
 }
 
-void JSONScanLocalState::ReconstructFirstObject(JSONScanGlobalState &gstate) {
+void JSONScanLocalState::ReconstructFirstObject() {
     D_ASSERT(current_buffer_handle->buffer_index != 0);
     D_ASSERT(current_reader->GetFormat() == JSONFormat::NEWLINE_DELIMITED);
 
@@ -947,8 +952,7 @@ void JSONScanLocalState::ThrowTransformError(idx_t object_index, const string &e
     current_reader->ThrowTransformError(current_buffer_handle->buffer_index, line_or_object_in_buffer, error_message);
 }
 
-double JSONScan::ScanProgress(ClientContext &context, const FunctionData *bind_data_p,
-                              const GlobalTableFunctionState *global_state) {
+double JSONScan::ScanProgress(ClientContext &, const FunctionData *, const GlobalTableFunctionState *global_state) {
     auto &gstate = global_state->Cast<JSONGlobalTableFunctionState>().state;
     double progress = 0;
     for (auto &reader : gstate.json_readers) {
@@ -957,16 +961,16 @@
     return progress / double(gstate.json_readers.size());
 }
 
-idx_t JSONScan::GetBatchIndex(ClientContext &context, const FunctionData *bind_data_p,
-                              LocalTableFunctionState *local_state, GlobalTableFunctionState *global_state) {
+idx_t JSONScan::GetBatchIndex(ClientContext &, const FunctionData *, LocalTableFunctionState *local_state,
+                              GlobalTableFunctionState *) {
     auto &lstate = local_state->Cast<JSONLocalTableFunctionState>();
     return lstate.GetBatchIndex();
 }
 
-unique_ptr<NodeStatistics> JSONScan::Cardinality(ClientContext &context, const FunctionData *bind_data) {
+unique_ptr<NodeStatistics> JSONScan::Cardinality(ClientContext &, const FunctionData *bind_data) {
     auto &data = bind_data->Cast<JSONScanData>();
     idx_t per_file_cardinality;
-    if (data.initial_reader && data.initial_reader->IsOpen()) {
+    if (data.initial_reader && data.initial_reader->HasFileHandle()) {
        per_file_cardinality = data.initial_reader->GetFileHandle().FileSize() / data.avg_tuple_size;
     } else {
        per_file_cardinality = 42; // The cardinality of an unknown JSON file is the almighty number 42
@@ -984,25 +988,24 @@ void JSONScan::ComplexFilterPushdown(ClientContext &context, LogicalGet &get, Fu
     }
 }
 
-void JSONScan::Serialize(FieldWriter &writer, const FunctionData *bind_data_p, const TableFunction &function) {
+void JSONScan::Serialize(FieldWriter &writer, const FunctionData *bind_data_p, const TableFunction &) {
     auto &bind_data = bind_data_p->Cast<JSONScanData>();
     bind_data.Serialize(writer);
 }
 
-unique_ptr<FunctionData> JSONScan::Deserialize(PlanDeserializationState &state, FieldReader &reader,
-                                               TableFunction &function) {
+unique_ptr<FunctionData> JSONScan::Deserialize(PlanDeserializationState &state, FieldReader &reader, TableFunction &) {
     auto result = make_uniq<JSONScanData>();
     result->Deserialize(state.context, reader);
     return std::move(result);
 }
 
 void JSONScan::FormatSerialize(FormatSerializer &serializer, const optional_ptr<FunctionData> bind_data_p,
-                               const TableFunction &function) {
+                               const TableFunction &) {
     auto &bind_data = bind_data_p->Cast<JSONScanData>();
     serializer.WriteProperty(100, "scan_data", &bind_data);
 }
 
-unique_ptr<FunctionData> JSONScan::FormatDeserialize(FormatDeserializer &deserializer, TableFunction &function) {
+unique_ptr<FunctionData> JSONScan::FormatDeserialize(FormatDeserializer &deserializer, TableFunction &) {
     unique_ptr<JSONScanData> result;
     deserializer.ReadProperty(100, "scan_data", result);
     return std::move(result);
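A pattern repeated through these hunks is dropping the names of parameters a callback never touches (ClientContext &, const TableFunction &). In C++ an unnamed parameter keeps the signature required by the function-pointer type while suppressing unused-parameter warnings, as in this small illustration (hypothetical names):

    // The callback type requires two parameters; only `value` is used.
    // Leaving the second parameter unnamed keeps the signature compatible
    // while avoiding -Wunused-parameter noise.
    using Callback = int (*)(int, int);

    int Scale(int value, int /*unused*/) {
        return value * 2;
    }

    Callback cb = Scale; // still a valid Callback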
package/src/duckdb/extension/parquet/parquet_extension.cpp

@@ -21,6 +21,8 @@
 #include "duckdb/common/field_writer.hpp"
 #include "duckdb/common/file_system.hpp"
 #include "duckdb/common/multi_file_reader.hpp"
+#include "duckdb/common/serializer/format_deserializer.hpp"
+#include "duckdb/common/serializer/format_serializer.hpp"
 #include "duckdb/common/types/chunk_collection.hpp"
 #include "duckdb/function/copy_function.hpp"
 #include "duckdb/function/table_function.hpp"
@@ -35,8 +37,6 @@
 #include "duckdb/planner/operator/logical_get.hpp"
 #include "duckdb/storage/statistics/base_statistics.hpp"
 #include "duckdb/storage/table/row_group.hpp"
-#include "duckdb/common/serializer/format_serializer.hpp"
-#include "duckdb/common/serializer/format_deserializer.hpp"
 
 #endif
 
@@ -78,6 +78,8 @@ struct ParquetReadLocalState : public LocalTableFunctionState {
     DataChunk all_columns;
 };
 
+enum class ParquetFileState : uint8_t { UNOPENED, OPENING, OPEN, CLOSED };
+
 struct ParquetReadGlobalState : public GlobalTableFunctionState {
     mutex lock;
 
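Replacing vector<bool> file_opening with a per-file ParquetFileState turns an ambiguous boolean into an explicit lifecycle (UNOPENED to OPENING to OPEN to CLOSED), so "never opened" and "already closed" are no longer indistinguishable. A sketch of claiming a file under a lock using such a state vector (hypothetical helper, not the extension's code):

    #include <cstddef>
    #include <cstdint>
    #include <mutex>
    #include <vector>

    enum class FileState : uint8_t { UNOPENED, OPENING, OPEN, CLOSED };

    struct ScanState {
        std::mutex lock;
        std::vector<FileState> file_states;
    };

    // Claim the next unopened file, marking it OPENING under the lock so other
    // threads either skip it or wait for it instead of opening it twice.
    bool TryClaimFile(ScanState &state, std::size_t &out_index) {
        std::lock_guard<std::mutex> guard(state.lock);
        for (std::size_t i = 0; i < state.file_states.size(); i++) {
            if (state.file_states[i] == FileState::UNOPENED) {
                state.file_states[i] = FileState::OPENING;
                out_index = i;
                return true;
            }
        }
        return false;
    }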
@@ -86,7 +88,7 @@ struct ParquetReadGlobalState : public GlobalTableFunctionState {
     //! Currently opened readers
     vector<shared_ptr<ParquetReader>> readers;
     //! Flag to indicate a file is being opened
-    vector<bool> file_opening;
+    vector<ParquetFileState> file_states;
     //! Mutexes to wait for a file that is currently being opened
     unique_ptr<mutex[]> file_mutexes;
     //! Signal to other threads that a file failed to open, letting every thread abort.
@@ -359,7 +361,7 @@ public:
         auto &bind_data = input.bind_data->CastNoConst<ParquetReadBindData>();
         auto result = make_uniq<ParquetReadGlobalState>();
 
-        result->file_opening = vector<bool>(bind_data.files.size(), false);
+        result->file_states = vector<ParquetFileState>(bind_data.files.size(), ParquetFileState::UNOPENED);
         result->file_mutexes = unique_ptr<mutex[]>(new mutex[bind_data.files.size()]);
         if (bind_data.files.empty()) {
             result->initial_reader = nullptr;
@@ -367,6 +369,8 @@ public:
             result->readers = std::move(bind_data.union_readers);
             if (result->readers.size() != bind_data.files.size()) {
                 result->readers = vector<shared_ptr<ParquetReader>>(bind_data.files.size(), nullptr);
+            } else {
+                std::fill(result->file_states.begin(), result->file_states.end(), ParquetFileState::OPEN);
             }
             if (bind_data.initial_reader) {
                 result->initial_reader = std::move(bind_data.initial_reader);
@@ -378,6 +382,7 @@ public:
                     make_shared<ParquetReader>(context, bind_data.files[0], bind_data.parquet_options);
                 result->readers[0] = result->initial_reader;
             }
+            result->file_states[0] = ParquetFileState::OPEN;
         }
         for (auto &reader : result->readers) {
             if (!reader) {
@@ -511,7 +516,7 @@ public:
 
         D_ASSERT(parallel_state.initial_reader);
 
-        if (parallel_state.readers[parallel_state.file_index]) {
+        if (parallel_state.file_states[parallel_state.file_index] == ParquetFileState::OPEN) {
             if (parallel_state.row_group_index <
                 parallel_state.readers[parallel_state.file_index]->NumRowGroups()) {
                 // The current reader has rowgroups left to be scanned
@@ -523,12 +528,14 @@ public:
                 parallel_state.row_group_index++;
                 return true;
             } else {
+                // Close current file
+                parallel_state.file_states[parallel_state.file_index] = ParquetFileState::CLOSED;
+                parallel_state.readers[parallel_state.file_index] = nullptr;
+
                 // Set state to the next file
                 parallel_state.file_index++;
                 parallel_state.row_group_index = 0;
 
-                parallel_state.readers[parallel_state.file_index - 1] = nullptr;
-
                 if (parallel_state.file_index >= bind_data.files.size()) {
                     return false;
                 }
@@ -541,8 +548,7 @@ public:
         }
 
         // Check if the current file is being opened, in that case we need to wait for it.
-        if (!parallel_state.readers[parallel_state.file_index] &&
-            parallel_state.file_opening[parallel_state.file_index]) {
+        if (parallel_state.file_states[parallel_state.file_index] == ParquetFileState::OPENING) {
             WaitForFile(parallel_state.file_index, parallel_state, parallel_lock);
         }
     }
@@ -573,7 +579,8 @@ public:
         // - the thread opening the file has failed
         // - the file was somehow scanned till the end while we were waiting
         if (parallel_state.file_index >= parallel_state.readers.size() ||
-            parallel_state.readers[parallel_state.file_index] || parallel_state.error_opening_file) {
+            parallel_state.file_states[parallel_state.file_index] != ParquetFileState::OPENING ||
+            parallel_state.error_opening_file) {
             return;
         }
     }
@@ -583,10 +590,12 @@ public:
     static bool TryOpenNextFile(ClientContext &context, const ParquetReadBindData &bind_data,
                                 ParquetReadLocalState &scan_data, ParquetReadGlobalState &parallel_state,
                                 unique_lock<mutex> &parallel_lock) {
-        for (idx_t i = parallel_state.file_index; i < bind_data.files.size(); i++) {
-            if (!parallel_state.readers[i] && parallel_state.file_opening[i] == false) {
+        const auto num_threads = TaskScheduler::GetScheduler(context).NumberOfThreads();
+        const auto file_index_limit = MinValue<idx_t>(parallel_state.file_index + num_threads, bind_data.files.size());
+        for (idx_t i = parallel_state.file_index; i < file_index_limit; i++) {
+            if (parallel_state.file_states[i] == ParquetFileState::UNOPENED) {
                 string file = bind_data.files[i];
-                parallel_state.file_opening[i] = true;
+                parallel_state.file_states[i] = ParquetFileState::OPENING;
                 auto pq_options = parallel_state.initial_reader->parquet_options;
 
                 // Now we switch which lock we are holding, instead of locking the global state, we grab the lock on
@@ -611,6 +620,7 @@ public:
                 // Now re-lock the state and add the reader
                 parallel_lock.lock();
                 parallel_state.readers[i] = reader;
+                parallel_state.file_states[i] = ParquetFileState::OPEN;
 
                 return true;
             }
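The new file_index_limit bounds look-ahead file opening to the thread count instead of letting one thread race ahead and open every remaining file; beyond one file per thread, extra opens only pin memory and file handles. The computation, with std::min standing in for DuckDB's MinValue:

    #include <algorithm>
    #include <cstddef>

    // With N worker threads, opening more than N files past the current scan
    // position cannot increase throughput, but it does pin memory and handles.
    std::size_t FileIndexLimit(std::size_t file_index, std::size_t num_threads, std::size_t num_files) {
        return std::min(file_index + num_threads, num_files);
    }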
package/src/duckdb/src/catalog/catalog_entry/duck_index_entry.cpp

@@ -24,4 +24,9 @@ string DuckIndexEntry::GetTableName() const {
     return info->table;
 }
 
+void DuckIndexEntry::CommitDrop() {
+    D_ASSERT(info && index);
+    index->CommitDrop();
+}
+
 } // namespace duckdb

package/src/duckdb/src/common/crypto/md5.cpp

@@ -20,6 +20,7 @@
  * will fill a supplied 16-byte array with the digest.
  */
 #include "duckdb/common/crypto/md5.hpp"
+#include "mbedtls_wrapper.hpp"
 
 namespace duckdb {
 
@@ -236,21 +237,10 @@ void MD5Context::Finish(data_ptr_t out_digest) {
     memcpy(out_digest, buf, 16);
 }
 
-void MD5Context::DigestToBase16(const_data_ptr_t digest, char *zbuf) {
-    static char const HEX_CODES[] = "0123456789abcdef";
-    int i, j;
-
-    for (j = i = 0; i < 16; i++) {
-        int a = digest[i];
-        zbuf[j++] = HEX_CODES[(a >> 4) & 0xf];
-        zbuf[j++] = HEX_CODES[a & 0xf];
-    }
-}
-
 void MD5Context::FinishHex(char *out_digest) {
     data_t digest[MD5_HASH_LENGTH_BINARY];
     Finish(digest);
-    DigestToBase16(digest, out_digest);
+    duckdb_mbedtls::MbedTlsWrapper::ToBase16(reinterpret_cast<char *>(digest), out_digest, MD5_HASH_LENGTH_BINARY);
 }
 
 string MD5Context::FinishHex() {
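The hand-rolled DigestToBase16 is deleted in favor of the shared MbedTlsWrapper::ToBase16 (extended in mbedtls_wrapper.cpp in this same release), so MD5 and the newly added sha256 scalar function hex-encode digests through one helper. The removed logic is the standard nibble-to-hex conversion, reproduced here as a self-contained equivalent:

    #include <cstddef>

    // Hex-encode `len` bytes: each byte becomes two lowercase hex digits.
    // `out` must have room for 2 * len characters.
    void ToBase16(const unsigned char *digest, char *out, std::size_t len) {
        static const char HEX[] = "0123456789abcdef";
        for (std::size_t i = 0; i < len; i++) {
            out[2 * i] = HEX[(digest[i] >> 4) & 0xF];
            out[2 * i + 1] = HEX[digest[i] & 0xF];
        }
    }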
package/src/duckdb/src/common/radix_partitioning.cpp

@@ -26,7 +26,7 @@ public:
 };
 
 template <class OP, class RETURN_TYPE, typename... ARGS>
-RETURN_TYPE RadixBitsSwitch(idx_t radix_bits, ARGS &&...args) {
+RETURN_TYPE RadixBitsSwitch(idx_t radix_bits, ARGS &&... args) {
     D_ASSERT(radix_bits <= RadixPartitioning::MAX_RADIX_BITS);
     switch (radix_bits) {
     case 0: