duckdb 0.8.2-dev4025.0 → 0.8.2-dev4126.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (63)
  1. package/binding.gyp +1 -0
  2. package/package.json +1 -1
  3. package/src/duckdb/extension/json/buffered_json_reader.cpp +76 -74
  4. package/src/duckdb/extension/json/include/buffered_json_reader.hpp +35 -32
  5. package/src/duckdb/extension/json/include/json_scan.hpp +9 -6
  6. package/src/duckdb/extension/json/json_scan.cpp +124 -121
  7. package/src/duckdb/src/catalog/catalog_entry/duck_index_entry.cpp +5 -0
  8. package/src/duckdb/src/common/radix_partitioning.cpp +1 -1
  9. package/src/duckdb/src/common/sort/partition_state.cpp +5 -1
  10. package/src/duckdb/src/core_functions/aggregate/holistic/mode.cpp +1 -1
  11. package/src/duckdb/src/core_functions/function_list.cpp +7 -0
  12. package/src/duckdb/src/core_functions/scalar/list/list_cosine_similarity.cpp +78 -0
  13. package/src/duckdb/src/core_functions/scalar/list/list_distance.cpp +72 -0
  14. package/src/duckdb/src/core_functions/scalar/list/list_inner_product.cpp +70 -0
  15. package/src/duckdb/src/execution/index/art/art.cpp +111 -92
  16. package/src/duckdb/src/execution/index/art/iterator.cpp +21 -27
  17. package/src/duckdb/src/execution/index/art/leaf.cpp +72 -153
  18. package/src/duckdb/src/execution/index/art/node.cpp +109 -203
  19. package/src/duckdb/src/execution/index/art/node16.cpp +32 -64
  20. package/src/duckdb/src/execution/index/art/node256.cpp +38 -53
  21. package/src/duckdb/src/execution/index/art/node4.cpp +31 -62
  22. package/src/duckdb/src/execution/index/art/node48.cpp +43 -65
  23. package/src/duckdb/src/execution/index/art/prefix.cpp +70 -141
  24. package/src/duckdb/src/execution/index/fixed_size_allocator.cpp +345 -0
  25. package/src/duckdb/src/execution/index/fixed_size_buffer.cpp +74 -0
  26. package/src/duckdb/src/execution/operator/aggregate/physical_window.cpp +1 -1
  27. package/src/duckdb/src/execution/operator/schema/physical_create_art_index.cpp +1 -1
  28. package/src/duckdb/src/function/table/system/duckdb_columns.cpp +3 -1
  29. package/src/duckdb/src/function/table/version/pragma_version.cpp +2 -2
  30. package/src/duckdb/src/include/duckdb/catalog/catalog_entry/duck_index_entry.hpp +2 -0
  31. package/src/duckdb/src/include/duckdb/common/optional_idx.hpp +1 -1
  32. package/src/duckdb/src/include/duckdb/core_functions/scalar/list_functions.hpp +51 -0
  33. package/src/duckdb/src/include/duckdb/execution/index/art/art.hpp +17 -7
  34. package/src/duckdb/src/include/duckdb/execution/index/art/iterator.hpp +5 -5
  35. package/src/duckdb/src/include/duckdb/execution/index/art/leaf.hpp +10 -16
  36. package/src/duckdb/src/include/duckdb/execution/index/art/node.hpp +38 -116
  37. package/src/duckdb/src/include/duckdb/execution/index/art/node16.hpp +17 -18
  38. package/src/duckdb/src/include/duckdb/execution/index/art/node256.hpp +17 -23
  39. package/src/duckdb/src/include/duckdb/execution/index/art/node4.hpp +17 -18
  40. package/src/duckdb/src/include/duckdb/execution/index/art/node48.hpp +17 -24
  41. package/src/duckdb/src/include/duckdb/execution/index/art/prefix.hpp +16 -22
  42. package/src/duckdb/src/include/duckdb/execution/index/fixed_size_allocator.hpp +126 -0
  43. package/src/duckdb/src/include/duckdb/execution/index/fixed_size_buffer.hpp +79 -0
  44. package/src/duckdb/src/include/duckdb/execution/index/index_pointer.hpp +96 -0
  45. package/src/duckdb/src/include/duckdb/parallel/task_scheduler.hpp +1 -1
  46. package/src/duckdb/src/include/duckdb/storage/block.hpp +1 -1
  47. package/src/duckdb/src/include/duckdb/storage/index.hpp +10 -8
  48. package/src/duckdb/src/include/duckdb/storage/metadata/metadata_writer.hpp +3 -0
  49. package/src/duckdb/src/planner/binder/expression/bind_function_expression.cpp +14 -5
  50. package/src/duckdb/src/storage/checkpoint/table_data_writer.cpp +2 -3
  51. package/src/duckdb/src/storage/checkpoint_manager.cpp +16 -21
  52. package/src/duckdb/src/storage/data_table.cpp +3 -3
  53. package/src/duckdb/src/storage/index.cpp +7 -1
  54. package/src/duckdb/src/storage/metadata/metadata_manager.cpp +21 -21
  55. package/src/duckdb/src/storage/standard_buffer_manager.cpp +0 -8
  56. package/src/duckdb/src/storage/storage_info.cpp +1 -1
  57. package/src/duckdb/src/storage/table_index_list.cpp +1 -1
  58. package/src/duckdb/src/transaction/commit_state.cpp +5 -1
  59. package/src/duckdb/ub_src_core_functions_scalar_list.cpp +6 -0
  60. package/src/duckdb/ub_src_execution_index.cpp +4 -0
  61. package/src/duckdb/ub_src_execution_index_art.cpp +0 -2
  62. package/src/duckdb/src/execution/index/art/fixed_size_allocator.cpp +0 -238
  63. package/src/duckdb/src/include/duckdb/execution/index/art/fixed_size_allocator.hpp +0 -115
--- a/package/src/duckdb/extension/json/json_scan.cpp
+++ b/package/src/duckdb/extension/json/json_scan.cpp
@@ -2,11 +2,11 @@
 
 #include "duckdb/common/enum_util.hpp"
 #include "duckdb/common/multi_file_reader.hpp"
+#include "duckdb/common/serializer/format_deserializer.hpp"
+#include "duckdb/common/serializer/format_serializer.hpp"
 #include "duckdb/main/extension_helper.hpp"
 #include "duckdb/parallel/task_scheduler.hpp"
 #include "duckdb/storage/buffer_manager.hpp"
-#include "duckdb/common/serializer/format_serializer.hpp"
-#include "duckdb/common/serializer/format_deserializer.hpp"
 
 namespace duckdb {
 
@@ -39,16 +39,16 @@ void JSONScanData::Bind(ClientContext &context, TableFunctionBindInput &input) {
             maximum_object_size = MaxValue<idx_t>(UIntegerValue::Get(kv.second), maximum_object_size);
         } else if (loption == "format") {
             auto arg = StringUtil::Lower(StringValue::Get(kv.second));
-            static const auto format_options =
+            static const auto FORMAT_OPTIONS =
                 case_insensitive_map_t<JSONFormat> {{"auto", JSONFormat::AUTO_DETECT},
                                                     {"unstructured", JSONFormat::UNSTRUCTURED},
                                                     {"newline_delimited", JSONFormat::NEWLINE_DELIMITED},
                                                     {"nd", JSONFormat::NEWLINE_DELIMITED},
                                                     {"array", JSONFormat::ARRAY}};
-            auto lookup = format_options.find(arg);
-            if (lookup == format_options.end()) {
+            auto lookup = FORMAT_OPTIONS.find(arg);
+            if (lookup == FORMAT_OPTIONS.end()) {
                 vector<string> valid_options;
-                for (auto &pair : format_options) {
+                for (auto &pair : FORMAT_OPTIONS) {
                     valid_options.push_back(StringUtil::Format("'%s'", pair.first));
                 }
                 throw BinderException("format must be one of [%s], not '%s'", StringUtil::Join(valid_options, ", "),
@@ -198,7 +198,8 @@ JSONScanGlobalState::JSONScanGlobalState(ClientContext &context, const JSONScanD
     : bind_data(bind_data_p), transform_options(bind_data.transform_options),
       allocator(BufferManager::GetBufferManager(context).GetBufferAllocator()),
       buffer_capacity(bind_data.maximum_object_size * 2), file_index(0), batch_index(0),
-      system_threads(TaskScheduler::GetScheduler(context).NumberOfThreads()) {
+      system_threads(TaskScheduler::GetScheduler(context).NumberOfThreads()),
+      enable_parallel_scans(bind_data.files.size() < system_threads) {
 }
 
 JSONScanLocalState::JSONScanLocalState(ClientContext &context, JSONScanGlobalState &gstate)
@@ -275,7 +276,7 @@ idx_t JSONGlobalTableFunctionState::MaxThreads() const {
         return state.system_threads;
     }
 
-    if (!state.json_readers.empty() && state.json_readers[0]->IsOpen()) {
+    if (!state.json_readers.empty() && state.json_readers[0]->HasFileHandle()) {
         auto &reader = *state.json_readers[0];
         if (reader.GetFormat() == JSONFormat::NEWLINE_DELIMITED) { // Auto-detected NDJSON
             return state.system_threads;
@@ -291,7 +292,7 @@ JSONLocalTableFunctionState::JSONLocalTableFunctionState(ClientContext &context,
 }
 
 unique_ptr<LocalTableFunctionState> JSONLocalTableFunctionState::Init(ExecutionContext &context,
-                                                                      TableFunctionInitInput &input,
+                                                                      TableFunctionInitInput &,
                                                                       GlobalTableFunctionState *global_state) {
     auto &gstate = global_state->Cast<JSONGlobalTableFunctionState>();
     auto result = make_uniq<JSONLocalTableFunctionState>(context.client, gstate.state);
@@ -318,19 +319,24 @@ static inline void SkipWhitespace(const char *buffer_ptr, idx_t &buffer_offset,
 
 idx_t JSONScanLocalState::ReadNext(JSONScanGlobalState &gstate) {
     allocator.Reset();
-
     scan_count = 0;
-    if (buffer_offset == buffer_size) {
-        if (!ReadNextBuffer(gstate)) {
-            return scan_count;
-        }
-        D_ASSERT(buffer_size != 0);
-        if (current_buffer_handle->buffer_index != 0 && current_reader->GetFormat() == JSONFormat::NEWLINE_DELIMITED) {
-            ReconstructFirstObject(gstate);
-            scan_count++;
+
+    // We have to wrap this in a loop otherwise we stop scanning too early when there's an empty JSON file
+    while (scan_count == 0) {
+        if (buffer_offset == buffer_size) {
+            if (!ReadNextBuffer(gstate)) {
+                break;
+            }
+            D_ASSERT(buffer_size != 0);
+            if (current_buffer_handle->buffer_index != 0 &&
+                current_reader->GetFormat() == JSONFormat::NEWLINE_DELIMITED) {
+                ReconstructFirstObject();
+                scan_count++;
+            }
         }
+
+        ParseNextChunk();
     }
-    ParseNextChunk();
 
     return scan_count;
 }
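
The loop introduced above keeps pulling buffers until the scan produces at least one record, so an empty JSON file in the middle of a file list no longer ends the scan early. A minimal standalone sketch of the same pattern (simplified, with illustrative names; not DuckDB's actual classes):

    #include <cstddef>
    #include <iostream>
    #include <vector>

    struct Scanner {
        std::vector<std::vector<int>> files; // each inner vector stands in for one "file"
        size_t file_index = 0;

        // Mimics ReadNext: keep pulling input until we scan something or run out of files.
        // A single pass would return 0 for an empty file and terminate the outer scan too early.
        size_t ReadNext() {
            size_t scan_count = 0;
            while (scan_count == 0) {
                if (file_index == files.size()) {
                    break; // no more files left
                }
                scan_count = files[file_index++].size(); // "parse" one file
            }
            return scan_count;
        }
    };

    int main() {
        Scanner scanner{{{1, 2, 3}, {}, {4, 5}}}; // the second file is empty
        size_t total = 0, n;
        while ((n = scanner.ReadNext()) != 0) {
            total += n;
        }
        std::cout << "scanned " << total << " records\n"; // scanned 5 records
    }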
@@ -349,7 +355,7 @@ static inline const char *PreviousNewline(const char *ptr) {
     return ptr;
 }
 
-static inline const char *NextJSONDefault(const char *ptr, const idx_t size, const char *const end) {
+static inline const char *NextJSONDefault(const char *ptr, const char *const end) {
     idx_t parents = 0;
     while (ptr != end) {
         switch (*ptr++) {
@@ -393,7 +399,7 @@ static inline const char *NextJSON(const char *ptr, const idx_t size) {
     case '{':
     case '[':
     case '"':
-        ptr = NextJSONDefault(ptr, size, end);
+        ptr = NextJSONDefault(ptr, end);
         break;
     default:
         // Special case: JSON array containing JSON without clear "parents", i.e., not obj/arr/str
@@ -482,18 +488,21 @@ void JSONScanLocalState::ThrowInvalidAtEndError() {
     throw InvalidInputException("Invalid JSON detected at the end of file \"%s\".", current_reader->GetFileName());
 }
 
-bool JSONScanLocalState::IsParallel(JSONScanGlobalState &gstate) const {
-    if (bind_data.files.size() >= gstate.system_threads) {
-        // More files than threads, just parallelize over the files
-        return false;
+void JSONScanLocalState::TryIncrementFileIndex(JSONScanGlobalState &gstate) const {
+    lock_guard<mutex> guard(gstate.lock);
+    if (gstate.file_index < gstate.json_readers.size() &&
+        current_reader.get() == gstate.json_readers[gstate.file_index].get()) {
+        gstate.file_index++;
     }
+}
 
-    if (current_reader->GetFormat() == JSONFormat::NEWLINE_DELIMITED) {
-        // NDJSON can be read in parallel
-        return true;
+bool JSONScanLocalState::IsParallel(JSONScanGlobalState &gstate) const {
+    if (bind_data.files.size() >= gstate.system_threads) {
+        return false; // More files than threads, just parallelize over the files
     }
 
-    return false;
+    // NDJSON can be read in parallel
+    return current_reader->GetFormat() == JSONFormat::NEWLINE_DELIMITED;
 }
 
 static pair<JSONFormat, JSONRecordType> DetectFormatAndRecordType(char *const buffer_ptr, const idx_t buffer_size,
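
TryIncrementFileIndex above advances the shared file index only when the calling thread's reader is still the one the index points at, so two threads finishing with the same file cannot advance it twice. A standalone sketch of the idiom, using simplified types that are not DuckDB's:

    #include <iostream>
    #include <mutex>
    #include <vector>

    struct Reader {
        int id;
    };

    struct GlobalState {
        std::mutex lock;
        std::vector<Reader> readers;
        size_t file_index = 0;
    };

    // Compare-then-increment under the lock: only the thread whose reader is
    // still "current" moves the shared index forward.
    void TryIncrementFileIndex(GlobalState &gstate, const Reader *current_reader) {
        std::lock_guard<std::mutex> guard(gstate.lock);
        if (gstate.file_index < gstate.readers.size() &&
            current_reader == &gstate.readers[gstate.file_index]) {
            gstate.file_index++;
        }
    }

    int main() {
        GlobalState gstate;
        gstate.readers = {{0}, {1}};
        Reader *first = &gstate.readers[0];
        TryIncrementFileIndex(gstate, first); // advances: 0 -> 1
        TryIncrementFileIndex(gstate, first); // no-op: the index already moved on
        std::cout << gstate.file_index << "\n"; // prints 1
    }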
@@ -578,104 +587,107 @@ static pair<JSONFormat, JSONRecordType> DetectFormatAndRecordType(char *const bu
 }
 
 bool JSONScanLocalState::ReadNextBuffer(JSONScanGlobalState &gstate) {
+    // First we make sure we have a buffer to read into
     AllocatedData buffer;
+
+    // Try to re-use a buffer that was used before
     if (current_reader) {
-        // Keep track of this for accurate errors
         current_reader->SetBufferLineOrObjectCount(current_buffer_handle->buffer_index, lines_or_objects_in_buffer);
-
-        // Try to re-use existing buffer
         if (current_buffer_handle && --current_buffer_handle->readers == 0) {
             buffer = current_reader->RemoveBuffer(current_buffer_handle->buffer_index);
-        } else {
-            buffer = gstate.allocator.Allocate(gstate.buffer_capacity);
         }
+    }
 
-        if (!is_last) {
-            if (current_reader->GetFormat() != JSONFormat::NEWLINE_DELIMITED) {
-                memcpy(buffer.get(), reconstruct_buffer.get(),
-                       prev_buffer_remainder); // Copy last bit of previous buffer
-            }
-        } else {
-            if (gstate.bind_data.type != JSONScanType::SAMPLE) {
-                current_reader->CloseJSONFile(); // Close files that are done if we're not sampling
-            }
-            current_reader = nullptr;
-        }
-    } else {
+    // If we cannot re-use a buffer we create a new one
+    if (!buffer.IsSet()) {
         buffer = gstate.allocator.Allocate(gstate.buffer_capacity);
     }
+
     buffer_ptr = char_ptr_cast(buffer.get());
 
-    idx_t buffer_index;
+    // Copy last bit of previous buffer
+    if (current_reader && current_reader->GetFormat() != JSONFormat::NEWLINE_DELIMITED && !is_last) {
+        memcpy(buffer_ptr, reconstruct_buffer.get(), prev_buffer_remainder);
+    }
+
+    optional_idx buffer_index;
     while (true) {
+        // Now we finish the current reader
         if (current_reader) {
-            ReadNextBufferInternal(gstate, buffer_index);
-            if (buffer_size == 0) {
-                if (is_last && gstate.bind_data.type != JSONScanType::SAMPLE) {
+            // If we performed the final read of this reader in the previous iteration, close it now
+            if (is_last) {
+                if (gstate.bind_data.type != JSONScanType::SAMPLE) {
+                    TryIncrementFileIndex(gstate);
                     current_reader->CloseJSONFile();
                 }
-                if (IsParallel(gstate)) {
-                    // If this threads' current reader is still the one at gstate.file_index,
-                    // this thread can end the parallel scan
-                    lock_guard<mutex> guard(gstate.lock);
-                    if (gstate.file_index < gstate.json_readers.size() &&
-                        current_reader == gstate.json_readers[gstate.file_index].get()) {
-                        gstate.file_index++; // End parallel scan
-                    }
-                }
                 current_reader = nullptr;
+                continue;
+            }
+
+            // Try to read
+            ReadNextBufferInternal(gstate, buffer_index);
+
+            // If this is the last read, end the parallel scan now so threads can move on
+            if (is_last && IsParallel(gstate)) {
+                TryIncrementFileIndex(gstate);
+            }
+
+            if (buffer_size == 0) {
+                // We didn't read anything, re-enter the loop
+                continue;
             } else {
-                break; // We read something!
+                // We read something!
+                break;
             }
         }
 
-        // This thread needs a new reader
+        // If we got here, we don't have a reader (anymore). Try to get one
+        is_last = false;
        {
             lock_guard<mutex> guard(gstate.lock);
             if (gstate.file_index == gstate.json_readers.size()) {
                 return false; // No more files left
             }
 
-            // Try the next reader
+            // Assign the next reader to this thread
             current_reader = gstate.json_readers[gstate.file_index].get();
-            if (current_reader->IsOpen()) {
-                // Can only be open from auto detection, so these should be known
-                if (!IsParallel(gstate)) {
-                    batch_index = gstate.batch_index++;
-                    gstate.file_index++;
-                }
-                continue; // Re-enter the loop to start scanning the assigned file
-            }
 
-            current_reader->OpenJSONFile();
-            batch_index = gstate.batch_index++;
-            if (current_reader->GetFormat() != JSONFormat::AUTO_DETECT) {
-                if (!IsParallel(gstate)) {
-                    gstate.file_index++;
-                }
-                continue;
+            // Open the file if it is not yet open
+            if (!current_reader->IsOpen()) {
+                current_reader->OpenJSONFile();
             }
+            batch_index = gstate.batch_index++;
 
-            // If we have less files than threads, we auto-detect within the lock,
-            // so other threads may join a parallel NDJSON scan
-            if (gstate.json_readers.size() < gstate.system_threads) {
-                if (ReadAndAutoDetect(gstate, buffer_index, false)) {
-                    continue;
+            // Auto-detect format / record type
+            if (gstate.enable_parallel_scans) {
+                // Auto-detect within the lock, so threads may join a parallel NDJSON scan
+                if (current_reader->GetFormat() == JSONFormat::AUTO_DETECT) {
+                    ReadAndAutoDetect(gstate, buffer_index);
                 }
-                break;
+            } else {
+                gstate.file_index++; // Increment the file index before dropping lock so other threads move on
             }
+        }
 
-            // Increment the file index within the lock, then read/auto-detect outside of the lock
-            gstate.file_index++;
+        // If we didn't auto-detect within the lock, do it now
+        if (current_reader->GetFormat() == JSONFormat::AUTO_DETECT) {
+            ReadAndAutoDetect(gstate, buffer_index);
         }
 
-        // High amount of files, just do 1 thread per file
-        if (ReadAndAutoDetect(gstate, buffer_index, true)) {
+        // If we haven't already, increment the file index if non-parallel scan
+        if (gstate.enable_parallel_scans && !IsParallel(gstate)) {
+            TryIncrementFileIndex(gstate);
+        }
+
+        if (!buffer_index.IsValid() || buffer_size == 0) {
+            // If we didn't get a buffer index (because not auto-detecting), or the file was empty, just re-enter loop
             continue;
         }
+
         break;
     }
     D_ASSERT(buffer_size != 0); // We should have read something if we got here
+    D_ASSERT(buffer_index.IsValid());
 
     idx_t readers = 1;
     if (current_reader->GetFormat() == JSONFormat::NEWLINE_DELIMITED) {
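
Throughout this hunk, the raw idx_t buffer index becomes an optional_idx that is explicitly unset until a read assigns it. A simplified sketch of such a wrapper (the real class lives in duckdb/common/optional_idx.hpp; this re-implementation is illustrative only):

    #include <cassert>
    #include <cstdint>
    #include <iostream>

    class OptionalIdx {
        static constexpr uint64_t INVALID = ~uint64_t(0);
        uint64_t index = INVALID;

    public:
        OptionalIdx() = default;
        OptionalIdx(uint64_t idx) : index(idx) {
        }
        bool IsValid() const {
            return index != INVALID;
        }
        uint64_t GetIndex() const {
            assert(IsValid()); // catches use-before-assignment, unlike a raw idx_t
            return index;
        }
    };

    int main() {
        OptionalIdx buffer_index;
        std::cout << buffer_index.IsValid() << "\n"; // 0: no buffer read yet
        buffer_index = OptionalIdx(3);
        std::cout << buffer_index.GetIndex() << "\n"; // 3
    }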
@@ -683,9 +695,10 @@ bool JSONScanLocalState::ReadNextBuffer(JSONScanGlobalState &gstate) {
     }
 
     // Create an entry and insert it into the map
-    auto json_buffer_handle = make_uniq<JSONBufferHandle>(buffer_index, readers, std::move(buffer), buffer_size);
+    auto json_buffer_handle =
+        make_uniq<JSONBufferHandle>(buffer_index.GetIndex(), readers, std::move(buffer), buffer_size);
     current_buffer_handle = json_buffer_handle.get();
-    current_reader->InsertBuffer(buffer_index, std::move(json_buffer_handle));
+    current_reader->InsertBuffer(buffer_index.GetIndex(), std::move(json_buffer_handle));
 
     prev_buffer_remainder = 0;
     lines_or_objects_in_buffer = 0;
@@ -696,15 +709,11 @@ bool JSONScanLocalState::ReadNextBuffer(JSONScanGlobalState &gstate) {
     return true;
 }
 
-bool JSONScanLocalState::ReadAndAutoDetect(JSONScanGlobalState &gstate, idx_t &buffer_index,
-                                           const bool already_incremented_file_idx) {
+void JSONScanLocalState::ReadAndAutoDetect(JSONScanGlobalState &gstate, optional_idx &buffer_index) {
     // We have to detect the JSON format - hold the gstate lock while we do this
     ReadNextBufferInternal(gstate, buffer_index);
     if (buffer_size == 0) {
-        if (!already_incremented_file_idx) {
-            gstate.file_index++; // Empty file, move to the next one
-        }
-        return true;
+        return;
     }
 
     auto format_and_record_type = DetectFormatAndRecordType(buffer_ptr, buffer_size, allocator.GetYYAlc());
@@ -721,13 +730,9 @@ bool JSONScanLocalState::ReadAndAutoDetect(JSONScanGlobalState &gstate, idx_t &b
         throw InvalidInputException("Expected file \"%s\" to contain records, detected non-record JSON instead.",
                                     current_reader->GetFileName());
     }
-    if (!already_incremented_file_idx && !IsParallel(gstate)) {
-        gstate.file_index++;
-    }
-    return false;
 }
 
-void JSONScanLocalState::ReadNextBufferInternal(JSONScanGlobalState &gstate, idx_t &buffer_index) {
+void JSONScanLocalState::ReadNextBufferInternal(JSONScanGlobalState &gstate, optional_idx &buffer_index) {
     if (current_reader->GetFileHandle().CanSeek()) {
         ReadNextBufferSeek(gstate, buffer_index);
     } else {
@@ -735,12 +740,12 @@ void JSONScanLocalState::ReadNextBufferInternal(JSONScanGlobalState &gstate, idx
     }
 
     buffer_offset = 0;
-    if (buffer_index == 0 && current_reader->GetFormat() == JSONFormat::ARRAY) {
+    if (buffer_index.GetIndex() == 0 && current_reader->GetFormat() == JSONFormat::ARRAY) {
         SkipOverArrayStart();
     }
 }
 
-void JSONScanLocalState::ReadNextBufferSeek(JSONScanGlobalState &gstate, idx_t &buffer_index) {
+void JSONScanLocalState::ReadNextBufferSeek(JSONScanGlobalState &gstate, optional_idx &buffer_index) {
     auto &file_handle = current_reader->GetFileHandle();
 
     idx_t request_size = gstate.buffer_capacity - prev_buffer_remainder - YYJSON_PADDING_SIZE;
@@ -758,13 +763,13 @@ void JSONScanLocalState::ReadNextBufferSeek(JSONScanGlobalState &gstate, idx_t &
             ThrowInvalidAtEndError();
         }
 
-        if (current_reader->GetFormat() == JSONFormat::NEWLINE_DELIMITED) {
+        if (read_size != 0 && current_reader->GetFormat() == JSONFormat::NEWLINE_DELIMITED) {
             batch_index = gstate.batch_index++;
         }
     }
     buffer_size = prev_buffer_remainder + read_size;
     if (buffer_size == 0) {
-        current_reader->SetBufferLineOrObjectCount(buffer_index, 0);
+        current_reader->SetBufferLineOrObjectCount(buffer_index.GetIndex(), 0);
         return;
     }
 
@@ -773,33 +778,33 @@ void JSONScanLocalState::ReadNextBufferSeek(JSONScanGlobalState &gstate, idx_t &
                                                      gstate.bind_data.type == JSONScanType::SAMPLE);
 }
 
-void JSONScanLocalState::ReadNextBufferNoSeek(JSONScanGlobalState &gstate, idx_t &buffer_index) {
+void JSONScanLocalState::ReadNextBufferNoSeek(JSONScanGlobalState &gstate, optional_idx &buffer_index) {
     idx_t request_size = gstate.buffer_capacity - prev_buffer_remainder - YYJSON_PADDING_SIZE;
     idx_t read_size;
     {
         lock_guard<mutex> reader_guard(current_reader->lock);
         buffer_index = current_reader->GetBufferIndex();
 
-        if (current_reader->IsOpen() && !current_reader->IsDone()) {
+        if (current_reader->HasFileHandle() && current_reader->IsOpen()) {
             read_size = current_reader->GetFileHandle().Read(buffer_ptr + prev_buffer_remainder, request_size,
                                                              gstate.bind_data.type == JSONScanType::SAMPLE);
             is_last = read_size < request_size;
         } else {
             read_size = 0;
-            is_last = false;
+            is_last = true;
         }
 
         if (!gstate.bind_data.ignore_errors && read_size == 0 && prev_buffer_remainder != 0) {
             ThrowInvalidAtEndError();
         }
 
-        if (current_reader->GetFormat() == JSONFormat::NEWLINE_DELIMITED) {
+        if (read_size != 0 && current_reader->GetFormat() == JSONFormat::NEWLINE_DELIMITED) {
             batch_index = gstate.batch_index++;
         }
     }
     buffer_size = prev_buffer_remainder + read_size;
     if (buffer_size == 0) {
-        current_reader->SetBufferLineOrObjectCount(buffer_index, 0);
+        current_reader->SetBufferLineOrObjectCount(buffer_index.GetIndex(), 0);
         return;
     }
 }
@@ -833,7 +838,7 @@ void JSONScanLocalState::SkipOverArrayStart() {
     }
 }
 
-void JSONScanLocalState::ReconstructFirstObject(JSONScanGlobalState &gstate) {
+void JSONScanLocalState::ReconstructFirstObject() {
     D_ASSERT(current_buffer_handle->buffer_index != 0);
     D_ASSERT(current_reader->GetFormat() == JSONFormat::NEWLINE_DELIMITED);
 
@@ -947,8 +952,7 @@ void JSONScanLocalState::ThrowTransformError(idx_t object_index, const string &e
     current_reader->ThrowTransformError(current_buffer_handle->buffer_index, line_or_object_in_buffer, error_message);
 }
 
-double JSONScan::ScanProgress(ClientContext &context, const FunctionData *bind_data_p,
-                              const GlobalTableFunctionState *global_state) {
+double JSONScan::ScanProgress(ClientContext &, const FunctionData *, const GlobalTableFunctionState *global_state) {
     auto &gstate = global_state->Cast<JSONGlobalTableFunctionState>().state;
     double progress = 0;
     for (auto &reader : gstate.json_readers) {
@@ -957,16 +961,16 @@ double JSONScan::ScanProgress(ClientContext &context, const FunctionData *bind_d
     }
     return progress / double(gstate.json_readers.size());
 }
-idx_t JSONScan::GetBatchIndex(ClientContext &context, const FunctionData *bind_data_p,
-                              LocalTableFunctionState *local_state, GlobalTableFunctionState *global_state) {
+idx_t JSONScan::GetBatchIndex(ClientContext &, const FunctionData *, LocalTableFunctionState *local_state,
+                              GlobalTableFunctionState *) {
     auto &lstate = local_state->Cast<JSONLocalTableFunctionState>();
     return lstate.GetBatchIndex();
 }
 
-unique_ptr<NodeStatistics> JSONScan::Cardinality(ClientContext &context, const FunctionData *bind_data) {
+unique_ptr<NodeStatistics> JSONScan::Cardinality(ClientContext &, const FunctionData *bind_data) {
    auto &data = bind_data->Cast<JSONScanData>();
    idx_t per_file_cardinality;
-    if (data.initial_reader && data.initial_reader->IsOpen()) {
+    if (data.initial_reader && data.initial_reader->HasFileHandle()) {
        per_file_cardinality = data.initial_reader->GetFileHandle().FileSize() / data.avg_tuple_size;
    } else {
        per_file_cardinality = 42; // The cardinality of an unknown JSON file is the almighty number 42
@@ -984,25 +988,24 @@ void JSONScan::ComplexFilterPushdown(ClientContext &context, LogicalGet &get, Fu
     }
 }
 
-void JSONScan::Serialize(FieldWriter &writer, const FunctionData *bind_data_p, const TableFunction &function) {
+void JSONScan::Serialize(FieldWriter &writer, const FunctionData *bind_data_p, const TableFunction &) {
     auto &bind_data = bind_data_p->Cast<JSONScanData>();
     bind_data.Serialize(writer);
 }
 
-unique_ptr<FunctionData> JSONScan::Deserialize(PlanDeserializationState &state, FieldReader &reader,
-                                               TableFunction &function) {
+unique_ptr<FunctionData> JSONScan::Deserialize(PlanDeserializationState &state, FieldReader &reader, TableFunction &) {
     auto result = make_uniq<JSONScanData>();
     result->Deserialize(state.context, reader);
     return std::move(result);
 }
 
 void JSONScan::FormatSerialize(FormatSerializer &serializer, const optional_ptr<FunctionData> bind_data_p,
-                               const TableFunction &function) {
+                               const TableFunction &) {
     auto &bind_data = bind_data_p->Cast<JSONScanData>();
     serializer.WriteProperty(100, "scan_data", &bind_data);
 }
 
-unique_ptr<FunctionData> JSONScan::FormatDeserialize(FormatDeserializer &deserializer, TableFunction &function) {
+unique_ptr<FunctionData> JSONScan::FormatDeserialize(FormatDeserializer &deserializer, TableFunction &) {
     unique_ptr<JSONScanData> result;
     deserializer.ReadProperty(100, "scan_data", result);
     return std::move(result);
--- a/package/src/duckdb/src/catalog/catalog_entry/duck_index_entry.cpp
+++ b/package/src/duckdb/src/catalog/catalog_entry/duck_index_entry.cpp
@@ -24,4 +24,9 @@ string DuckIndexEntry::GetTableName() const {
     return info->table;
 }
 
+void DuckIndexEntry::CommitDrop() {
+    D_ASSERT(info && index);
+    index->CommitDrop();
+}
+
 } // namespace duckdb
--- a/package/src/duckdb/src/common/radix_partitioning.cpp
+++ b/package/src/duckdb/src/common/radix_partitioning.cpp
@@ -26,7 +26,7 @@ public:
 };
 
 template <class OP, class RETURN_TYPE, typename... ARGS>
-RETURN_TYPE RadixBitsSwitch(idx_t radix_bits, ARGS &&...args) {
+RETURN_TYPE RadixBitsSwitch(idx_t radix_bits, ARGS &&... args) {
     D_ASSERT(radix_bits <= RadixPartitioning::MAX_RADIX_BITS);
     switch (radix_bits) {
     case 0:
--- a/package/src/duckdb/src/common/sort/partition_state.cpp
+++ b/package/src/duckdb/src/common/sort/partition_state.cpp
@@ -429,7 +429,11 @@ bool PartitionGlobalMergeState::TryPrepareNextStage() {
 
     switch (stage) {
     case PartitionSortStage::INIT:
-        total_tasks = num_threads;
+        // If the partitions are unordered, don't scan in parallel
+        // because it produces non-deterministic orderings.
+        // This can theoretically happen with ORDER BY,
+        // but that is something the query should be explicit about.
+        total_tasks = sink.orders.size() > sink.partitions.size() ? num_threads : 1;
         stage = PartitionSortStage::SCAN;
         return true;
 
--- a/package/src/duckdb/src/core_functions/aggregate/holistic/mode.cpp
+++ b/package/src/duckdb/src/core_functions/aggregate/holistic/mode.cpp
@@ -220,7 +220,7 @@ struct ModeFunction {
         state.frequency_map = new typename STATE::Counts;
     }
     const double tau = .25;
-    if (state.nonzero <= tau * state.frequency_map->size()) {
+    if (state.nonzero <= tau * state.frequency_map->size() || prev.end <= frame.start || frame.end <= prev.start) {
         state.Reset();
         // for f ∈ F do
         for (auto f = frame.start; f < frame.end; ++f) {
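
The added condition forces a full rebuild of the mode aggregate's frequency map whenever consecutive window frames share no rows, since the incremental add/remove update then has nothing to reuse. A minimal sketch of the predicate, assuming half-open [start, end) frames:

    #include <iostream>

    struct FrameBounds {
        int start;
        int end; // exclusive
    };

    // True when the frames have no rows in common, i.e. the previous state is useless.
    bool FramesDisjoint(const FrameBounds &prev, const FrameBounds &frame) {
        return prev.end <= frame.start || frame.end <= prev.start;
    }

    int main() {
        FrameBounds prev{0, 5};
        std::cout << FramesDisjoint(prev, {3, 8}) << "\n"; // 0: overlap, update incrementally
        std::cout << FramesDisjoint(prev, {5, 9}) << "\n"; // 1: disjoint, reset and rescan
    }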
--- a/package/src/duckdb/src/core_functions/function_list.cpp
+++ b/package/src/duckdb/src/core_functions/function_list.cpp
@@ -49,7 +49,10 @@ static StaticFunctionDefinition internal_functions[] = {
     DUCKDB_SCALAR_FUNCTION(FactorialOperatorFun),
     DUCKDB_SCALAR_FUNCTION_SET(BitwiseAndFun),
     DUCKDB_SCALAR_FUNCTION(PowOperatorFun),
+    DUCKDB_SCALAR_FUNCTION_SET_ALIAS(ListInnerProductFunAlias),
+    DUCKDB_SCALAR_FUNCTION_SET_ALIAS(ListDistanceFunAlias),
     DUCKDB_SCALAR_FUNCTION_SET(LeftShiftFun),
+    DUCKDB_SCALAR_FUNCTION_SET_ALIAS(ListCosineSimilarityFunAlias),
     DUCKDB_SCALAR_FUNCTION_SET(RightShiftFun),
     DUCKDB_SCALAR_FUNCTION_SET(AbsOperatorFun),
     DUCKDB_SCALAR_FUNCTION_ALIAS(PowOperatorFunAlias),
@@ -197,8 +200,12 @@ static StaticFunctionDefinition internal_functions[] = {
     DUCKDB_SCALAR_FUNCTION_ALIAS(ListAggrFun),
     DUCKDB_SCALAR_FUNCTION(ListAggregateFun),
     DUCKDB_SCALAR_FUNCTION_ALIAS(ListApplyFun),
+    DUCKDB_SCALAR_FUNCTION_SET(ListCosineSimilarityFun),
+    DUCKDB_SCALAR_FUNCTION_SET(ListDistanceFun),
     DUCKDB_SCALAR_FUNCTION(ListDistinctFun),
+    DUCKDB_SCALAR_FUNCTION_SET_ALIAS(ListDotProductFun),
     DUCKDB_SCALAR_FUNCTION(ListFilterFun),
+    DUCKDB_SCALAR_FUNCTION_SET(ListInnerProductFun),
     DUCKDB_SCALAR_FUNCTION_ALIAS(ListPackFun),
     DUCKDB_SCALAR_FUNCTION_SET(ListReverseSortFun),
     DUCKDB_SCALAR_FUNCTION_SET(ListSliceFun),
--- /dev/null
+++ b/package/src/duckdb/src/core_functions/scalar/list/list_cosine_similarity.cpp
@@ -0,0 +1,78 @@
+#include "duckdb/core_functions/scalar/list_functions.hpp"
+#include <cmath>
+#include <algorithm>
+
+namespace duckdb {
+
+template <class NUMERIC_TYPE>
+static void ListCosineSimilarity(DataChunk &args, ExpressionState &, Vector &result) {
+    D_ASSERT(args.ColumnCount() == 2);
+
+    auto count = args.size();
+    auto &left = args.data[0];
+    auto &right = args.data[1];
+    auto left_count = ListVector::GetListSize(left);
+    auto right_count = ListVector::GetListSize(right);
+
+    auto &left_child = ListVector::GetEntry(left);
+    auto &right_child = ListVector::GetEntry(right);
+
+    D_ASSERT(left_child.GetVectorType() == VectorType::FLAT_VECTOR);
+    D_ASSERT(right_child.GetVectorType() == VectorType::FLAT_VECTOR);
+
+    if (!FlatVector::Validity(left_child).CheckAllValid(left_count)) {
+        throw InvalidInputException("list_cosine_similarity: left argument can not contain NULL values");
+    }
+
+    if (!FlatVector::Validity(right_child).CheckAllValid(right_count)) {
+        throw InvalidInputException("list_cosine_similarity: right argument can not contain NULL values");
+    }
+
+    auto left_data = FlatVector::GetData<NUMERIC_TYPE>(left_child);
+    auto right_data = FlatVector::GetData<NUMERIC_TYPE>(right_child);
+
+    BinaryExecutor::Execute<list_entry_t, list_entry_t, NUMERIC_TYPE>(
+        left, right, result, count, [&](list_entry_t left, list_entry_t right) {
+            if (left.length != right.length) {
+                throw InvalidInputException(StringUtil::Format(
+                    "list_cosine_similarity: list dimensions must be equal, got left length %d and right length %d",
+                    left.length, right.length));
+            }
+
+            auto dimensions = left.length;
+
+            NUMERIC_TYPE distance = 0;
+            NUMERIC_TYPE norm_l = 0;
+            NUMERIC_TYPE norm_r = 0;
+
+            auto l_ptr = left_data + left.offset;
+            auto r_ptr = right_data + right.offset;
+            for (idx_t i = 0; i < dimensions; i++) {
+                auto x = *l_ptr++;
+                auto y = *r_ptr++;
+                distance += x * y;
+                norm_l += x * x;
+                norm_r += y * y;
+            }
+
+            auto similarity = distance / (std::sqrt(norm_l) * std::sqrt(norm_r));
+
+            // clamp to [-1, 1] to avoid floating point errors
+            return std::max(static_cast<NUMERIC_TYPE>(-1), std::min(similarity, static_cast<NUMERIC_TYPE>(1)));
+        });
+
+    if (args.AllConstant()) {
+        result.SetVectorType(VectorType::CONSTANT_VECTOR);
+    }
+}
+
+ScalarFunctionSet ListCosineSimilarityFun::GetFunctions() {
+    ScalarFunctionSet set("list_cosine_similarity");
+    set.AddFunction(ScalarFunction({LogicalType::LIST(LogicalType::FLOAT), LogicalType::LIST(LogicalType::FLOAT)},
+                                   LogicalType::FLOAT, ListCosineSimilarity<float>));
+    set.AddFunction(ScalarFunction({LogicalType::LIST(LogicalType::DOUBLE), LogicalType::LIST(LogicalType::DOUBLE)},
+                                   LogicalType::DOUBLE, ListCosineSimilarity<double>));
+    return set;
+}
+
+} // namespace duckdb
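
A hypothetical usage sketch of the new function through DuckDB's C++ API (not part of the diff, and it assumes the duckdb.hpp amalgamation header is available); the cosine similarity of two parallel vectors is 1.0:

    #include "duckdb.hpp"

    int main() {
        duckdb::DuckDB db(nullptr); // in-memory database
        duckdb::Connection con(db);
        // Explicit ::FLOAT[] casts select the float overload registered above.
        auto result =
            con.Query("SELECT list_cosine_similarity([1.0, 2.0, 3.0]::FLOAT[], [2.0, 4.0, 6.0]::FLOAT[])");
        result->Print(); // 1.0
    }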