duckdb 0.8.2-dev3989.0 → 0.8.2-dev4126.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (74)
  1. package/binding.gyp +8 -7
  2. package/package.json +1 -1
  3. package/src/duckdb/extension/json/buffered_json_reader.cpp +76 -74
  4. package/src/duckdb/extension/json/include/buffered_json_reader.hpp +35 -32
  5. package/src/duckdb/extension/json/include/json_scan.hpp +9 -6
  6. package/src/duckdb/extension/json/json_scan.cpp +124 -121
  7. package/src/duckdb/extension/parquet/parquet_extension.cpp +23 -13
  8. package/src/duckdb/src/catalog/catalog_entry/duck_index_entry.cpp +5 -0
  9. package/src/duckdb/src/common/crypto/md5.cpp +2 -12
  10. package/src/duckdb/src/common/radix_partitioning.cpp +1 -1
  11. package/src/duckdb/src/common/sort/partition_state.cpp +5 -1
  12. package/src/duckdb/src/core_functions/aggregate/holistic/mode.cpp +1 -1
  13. package/src/duckdb/src/core_functions/function_list.cpp +8 -0
  14. package/src/duckdb/src/core_functions/scalar/list/list_cosine_similarity.cpp +78 -0
  15. package/src/duckdb/src/core_functions/scalar/list/list_distance.cpp +72 -0
  16. package/src/duckdb/src/core_functions/scalar/list/list_inner_product.cpp +70 -0
  17. package/src/duckdb/src/core_functions/scalar/string/sha256.cpp +32 -0
  18. package/src/duckdb/src/execution/index/art/art.cpp +111 -92
  19. package/src/duckdb/src/execution/index/art/iterator.cpp +21 -27
  20. package/src/duckdb/src/execution/index/art/leaf.cpp +72 -153
  21. package/src/duckdb/src/execution/index/art/node.cpp +109 -203
  22. package/src/duckdb/src/execution/index/art/node16.cpp +32 -64
  23. package/src/duckdb/src/execution/index/art/node256.cpp +38 -53
  24. package/src/duckdb/src/execution/index/art/node4.cpp +31 -62
  25. package/src/duckdb/src/execution/index/art/node48.cpp +43 -65
  26. package/src/duckdb/src/execution/index/art/prefix.cpp +70 -141
  27. package/src/duckdb/src/execution/index/fixed_size_allocator.cpp +345 -0
  28. package/src/duckdb/src/execution/index/fixed_size_buffer.cpp +74 -0
  29. package/src/duckdb/src/execution/operator/aggregate/physical_window.cpp +1 -1
  30. package/src/duckdb/src/execution/operator/schema/physical_create_art_index.cpp +1 -1
  31. package/src/duckdb/src/function/scalar/string/suffix.cpp +1 -1
  32. package/src/duckdb/src/function/table/system/duckdb_columns.cpp +3 -1
  33. package/src/duckdb/src/function/table/version/pragma_version.cpp +2 -2
  34. package/src/duckdb/src/include/duckdb/catalog/catalog_entry/duck_index_entry.hpp +2 -0
  35. package/src/duckdb/src/include/duckdb/common/optional_idx.hpp +1 -1
  36. package/src/duckdb/src/include/duckdb/core_functions/scalar/list_functions.hpp +51 -0
  37. package/src/duckdb/src/include/duckdb/core_functions/scalar/string_functions.hpp +9 -0
  38. package/src/duckdb/src/include/duckdb/execution/index/art/art.hpp +17 -7
  39. package/src/duckdb/src/include/duckdb/execution/index/art/iterator.hpp +5 -5
  40. package/src/duckdb/src/include/duckdb/execution/index/art/leaf.hpp +10 -16
  41. package/src/duckdb/src/include/duckdb/execution/index/art/node.hpp +38 -116
  42. package/src/duckdb/src/include/duckdb/execution/index/art/node16.hpp +17 -18
  43. package/src/duckdb/src/include/duckdb/execution/index/art/node256.hpp +17 -23
  44. package/src/duckdb/src/include/duckdb/execution/index/art/node4.hpp +17 -18
  45. package/src/duckdb/src/include/duckdb/execution/index/art/node48.hpp +17 -24
  46. package/src/duckdb/src/include/duckdb/execution/index/art/prefix.hpp +16 -22
  47. package/src/duckdb/src/include/duckdb/execution/index/fixed_size_allocator.hpp +126 -0
  48. package/src/duckdb/src/include/duckdb/execution/index/fixed_size_buffer.hpp +79 -0
  49. package/src/duckdb/src/include/duckdb/execution/index/index_pointer.hpp +96 -0
  50. package/src/duckdb/src/include/duckdb/parallel/task_scheduler.hpp +1 -1
  51. package/src/duckdb/src/include/duckdb/planner/operator/logical_join.hpp +1 -1
  52. package/src/duckdb/src/include/duckdb/storage/block.hpp +1 -1
  53. package/src/duckdb/src/include/duckdb/storage/index.hpp +10 -8
  54. package/src/duckdb/src/include/duckdb/storage/metadata/metadata_writer.hpp +3 -0
  55. package/src/duckdb/src/main/extension/extension_helper.cpp +15 -1
  56. package/src/duckdb/src/planner/binder/expression/bind_function_expression.cpp +14 -5
  57. package/src/duckdb/src/storage/checkpoint/table_data_writer.cpp +2 -3
  58. package/src/duckdb/src/storage/checkpoint_manager.cpp +16 -21
  59. package/src/duckdb/src/storage/data_table.cpp +3 -3
  60. package/src/duckdb/src/storage/index.cpp +7 -1
  61. package/src/duckdb/src/storage/metadata/metadata_manager.cpp +21 -21
  62. package/src/duckdb/src/storage/standard_buffer_manager.cpp +10 -16
  63. package/src/duckdb/src/storage/storage_info.cpp +1 -1
  64. package/src/duckdb/src/storage/table_index_list.cpp +1 -1
  65. package/src/duckdb/src/transaction/commit_state.cpp +5 -1
  66. package/src/duckdb/third_party/mbedtls/include/mbedtls_wrapper.hpp +4 -1
  67. package/src/duckdb/third_party/mbedtls/mbedtls_wrapper.cpp +24 -2
  68. package/src/duckdb/ub_extension_icu_third_party_icu_i18n.cpp +5 -5
  69. package/src/duckdb/ub_src_core_functions_scalar_list.cpp +6 -0
  70. package/src/duckdb/ub_src_core_functions_scalar_string.cpp +2 -0
  71. package/src/duckdb/ub_src_execution_index.cpp +4 -0
  72. package/src/duckdb/ub_src_execution_index_art.cpp +0 -2
  73. package/src/duckdb/src/execution/index/art/fixed_size_allocator.cpp +0 -238
  74. package/src/duckdb/src/include/duckdb/execution/index/art/fixed_size_allocator.hpp +0 -115
@@ -429,7 +429,11 @@ bool PartitionGlobalMergeState::TryPrepareNextStage() {
429
429
 
430
430
  switch (stage) {
431
431
  case PartitionSortStage::INIT:
432
- total_tasks = num_threads;
432
+ // If the partitions are unordered, don't scan in parallel
433
+ // because it produces non-deterministic orderings.
434
+ // This can theoretically happen with ORDER BY,
435
+ // but that is something the query should be explicit about.
436
+ total_tasks = sink.orders.size() > sink.partitions.size() ? num_threads : 1;
433
437
  stage = PartitionSortStage::SCAN;
434
438
  return true;
435
439
 
@@ -220,7 +220,7 @@ struct ModeFunction {
220
220
  state.frequency_map = new typename STATE::Counts;
221
221
  }
222
222
  const double tau = .25;
223
- if (state.nonzero <= tau * state.frequency_map->size()) {
223
+ if (state.nonzero <= tau * state.frequency_map->size() || prev.end <= frame.start || frame.end <= prev.start) {
224
224
  state.Reset();
225
225
  // for f ∈ F do
226
226
  for (auto f = frame.start; f < frame.end; ++f) {
@@ -49,7 +49,10 @@ static StaticFunctionDefinition internal_functions[] = {
49
49
  DUCKDB_SCALAR_FUNCTION(FactorialOperatorFun),
50
50
  DUCKDB_SCALAR_FUNCTION_SET(BitwiseAndFun),
51
51
  DUCKDB_SCALAR_FUNCTION(PowOperatorFun),
52
+ DUCKDB_SCALAR_FUNCTION_SET_ALIAS(ListInnerProductFunAlias),
53
+ DUCKDB_SCALAR_FUNCTION_SET_ALIAS(ListDistanceFunAlias),
52
54
  DUCKDB_SCALAR_FUNCTION_SET(LeftShiftFun),
55
+ DUCKDB_SCALAR_FUNCTION_SET_ALIAS(ListCosineSimilarityFunAlias),
53
56
  DUCKDB_SCALAR_FUNCTION_SET(RightShiftFun),
54
57
  DUCKDB_SCALAR_FUNCTION_SET(AbsOperatorFun),
55
58
  DUCKDB_SCALAR_FUNCTION_ALIAS(PowOperatorFunAlias),
@@ -197,8 +200,12 @@ static StaticFunctionDefinition internal_functions[] = {
197
200
  DUCKDB_SCALAR_FUNCTION_ALIAS(ListAggrFun),
198
201
  DUCKDB_SCALAR_FUNCTION(ListAggregateFun),
199
202
  DUCKDB_SCALAR_FUNCTION_ALIAS(ListApplyFun),
203
+ DUCKDB_SCALAR_FUNCTION_SET(ListCosineSimilarityFun),
204
+ DUCKDB_SCALAR_FUNCTION_SET(ListDistanceFun),
200
205
  DUCKDB_SCALAR_FUNCTION(ListDistinctFun),
206
+ DUCKDB_SCALAR_FUNCTION_SET_ALIAS(ListDotProductFun),
201
207
  DUCKDB_SCALAR_FUNCTION(ListFilterFun),
208
+ DUCKDB_SCALAR_FUNCTION_SET(ListInnerProductFun),
202
209
  DUCKDB_SCALAR_FUNCTION_ALIAS(ListPackFun),
203
210
  DUCKDB_SCALAR_FUNCTION_SET(ListReverseSortFun),
204
211
  DUCKDB_SCALAR_FUNCTION_SET(ListSliceFun),
@@ -281,6 +288,7 @@ static StaticFunctionDefinition internal_functions[] = {
281
288
  DUCKDB_AGGREGATE_FUNCTION(StandardErrorOfTheMeanFun),
282
289
  DUCKDB_SCALAR_FUNCTION(SetBitFun),
283
290
  DUCKDB_SCALAR_FUNCTION(SetseedFun),
291
+ DUCKDB_SCALAR_FUNCTION(SHA256Fun),
284
292
  DUCKDB_SCALAR_FUNCTION_SET(SignFun),
285
293
  DUCKDB_SCALAR_FUNCTION_SET(SignBitFun),
286
294
  DUCKDB_SCALAR_FUNCTION(SinFun),
@@ -0,0 +1,78 @@
1
+ #include "duckdb/core_functions/scalar/list_functions.hpp"
2
+ #include <cmath>
3
+ #include <algorithm>
4
+
5
+ namespace duckdb {
6
+
7
// Scalar-function body for list_cosine_similarity over two LIST columns whose elements are NUMERIC_TYPE.
// For each row it accumulates the dot product and both squared norms of the paired lists, divides the
// dot product by the product of the norms' square roots, and clamps the quotient to [-1, 1].
//   args:   two-column chunk (asserted); column 0 is the left list, column 1 the right list.
//   result: receives one NUMERIC_TYPE per row; switched to CONSTANT_VECTOR when all args are constant.
// Throws InvalidInputException when either child vector has a NULL element within its list size,
// or when the two lists of a row differ in length.
+ template <class NUMERIC_TYPE>
8
+ static void ListCosineSimilarity(DataChunk &args, ExpressionState &, Vector &result) {
9
+ D_ASSERT(args.ColumnCount() == 2);
10
+
11
+ auto count = args.size(); // row count handed to the executor below
12
+ auto &left = args.data[0];
13
+ auto &right = args.data[1];
14
+ auto left_count = ListVector::GetListSize(left); // bounds the validity check on the left child
15
+ auto right_count = ListVector::GetListSize(right); // bounds the validity check on the right child
16
+
17
+ auto &left_child = ListVector::GetEntry(left);
18
+ auto &right_child = ListVector::GetEntry(right);
19
+
20
+ D_ASSERT(left_child.GetVectorType() == VectorType::FLAT_VECTOR);
21
+ D_ASSERT(right_child.GetVectorType() == VectorType::FLAT_VECTOR);
22
+ // NULL elements are rejected up front for the whole child vector, not row by row.
23
+ if (!FlatVector::Validity(left_child).CheckAllValid(left_count)) {
24
+ throw InvalidInputException("list_cosine_similarity: left argument can not contain NULL values");
25
+ }
26
+
27
+ if (!FlatVector::Validity(right_child).CheckAllValid(right_count)) {
28
+ throw InvalidInputException("list_cosine_similarity: right argument can not contain NULL values");
29
+ }
30
+ // Raw element arrays; each row addresses its slice through list_entry_t offset/length.
31
+ auto left_data = FlatVector::GetData<NUMERIC_TYPE>(left_child);
32
+ auto right_data = FlatVector::GetData<NUMERIC_TYPE>(right_child);
33
+ // The lambda parameters below shadow the outer `left`/`right` vectors with the per-row list entries.
34
+ BinaryExecutor::Execute<list_entry_t, list_entry_t, NUMERIC_TYPE>(
35
+ left, right, result, count, [&](list_entry_t left, list_entry_t right) {
36
+ if (left.length != right.length) {
37
+ throw InvalidInputException(StringUtil::Format(
38
+ "list_cosine_similarity: list dimensions must be equal, got left length %d and right length %d",
39
+ left.length, right.length));
40
+ }
41
+
42
+ auto dimensions = left.length;
43
+
44
+ NUMERIC_TYPE distance = 0; // accumulates the dot product of the two lists
45
+ NUMERIC_TYPE norm_l = 0; // accumulates the squared norm of the left list
46
+ NUMERIC_TYPE norm_r = 0; // accumulates the squared norm of the right list
47
+
48
+ auto l_ptr = left_data + left.offset;
49
+ auto r_ptr = right_data + right.offset;
50
+ for (idx_t i = 0; i < dimensions; i++) {
51
+ auto x = *l_ptr++;
52
+ auto y = *r_ptr++;
53
+ distance += x * y;
54
+ norm_l += x * x;
55
+ norm_r += y * y;
56
+ }
57
+ // NOTE(review): if either norm is 0 (empty or all-zero lists) this divides by zero - confirm the intended result.
58
+ auto similarity = distance / (std::sqrt(norm_l) * std::sqrt(norm_r));
59
+
60
+ // clamp to [-1, 1] to avoid floating point errors
61
+ return std::max(static_cast<NUMERIC_TYPE>(-1), std::min(similarity, static_cast<NUMERIC_TYPE>(1)));
62
+ });
63
+
64
+ if (args.AllConstant()) {
65
+ result.SetVectorType(VectorType::CONSTANT_VECTOR);
66
+ }
67
+ }
68
+
69
// Builds the "list_cosine_similarity" function set with two overloads:
//   (LIST(FLOAT), LIST(FLOAT)) -> FLOAT     bound to ListCosineSimilarity<float>
//   (LIST(DOUBLE), LIST(DOUBLE)) -> DOUBLE  bound to ListCosineSimilarity<double>
+ ScalarFunctionSet ListCosineSimilarityFun::GetFunctions() {
70
+ ScalarFunctionSet set("list_cosine_similarity");
71
+ set.AddFunction(ScalarFunction({LogicalType::LIST(LogicalType::FLOAT), LogicalType::LIST(LogicalType::FLOAT)},
72
+ LogicalType::FLOAT, ListCosineSimilarity<float>));
73
+ set.AddFunction(ScalarFunction({LogicalType::LIST(LogicalType::DOUBLE), LogicalType::LIST(LogicalType::DOUBLE)},
74
+ LogicalType::DOUBLE, ListCosineSimilarity<double>));
75
+ return set;
76
+ }
77
+
78
+ } // namespace duckdb
@@ -0,0 +1,72 @@
1
+ #include "duckdb/core_functions/scalar/list_functions.hpp"
2
+ #include <cmath>
3
+
4
+ namespace duckdb {
5
+
6
// Scalar-function body for list_distance over two LIST columns whose elements are NUMERIC_TYPE.
// For each row it sums the squared element-wise differences of the paired lists and returns the
// square root of that sum.
//   args:   two-column chunk (asserted); column 0 is the left list, column 1 the right list.
//   result: receives one NUMERIC_TYPE per row; switched to CONSTANT_VECTOR when all args are constant.
// Throws InvalidInputException when either child vector has a NULL element within its list size,
// or when the two lists of a row differ in length.
+ template <class NUMERIC_TYPE>
7
+ static void ListDistance(DataChunk &args, ExpressionState &, Vector &result) {
8
+ D_ASSERT(args.ColumnCount() == 2);
9
+
10
+ auto count = args.size(); // row count handed to the executor below
11
+ auto &left = args.data[0];
12
+ auto &right = args.data[1];
13
+ auto left_count = ListVector::GetListSize(left); // bounds the validity check on the left child
14
+ auto right_count = ListVector::GetListSize(right); // bounds the validity check on the right child
15
+
16
+ auto &left_child = ListVector::GetEntry(left);
17
+ auto &right_child = ListVector::GetEntry(right);
18
+
19
+ D_ASSERT(left_child.GetVectorType() == VectorType::FLAT_VECTOR);
20
+ D_ASSERT(right_child.GetVectorType() == VectorType::FLAT_VECTOR);
21
+ // NULL elements are rejected up front for the whole child vector, not row by row.
22
+ if (!FlatVector::Validity(left_child).CheckAllValid(left_count)) {
23
+ throw InvalidInputException("list_distance: left argument can not contain NULL values");
24
+ }
25
+
26
+ if (!FlatVector::Validity(right_child).CheckAllValid(right_count)) {
27
+ throw InvalidInputException("list_distance: right argument can not contain NULL values");
28
+ }
29
+ // Raw element arrays; each row addresses its slice through list_entry_t offset/length.
30
+ auto left_data = FlatVector::GetData<NUMERIC_TYPE>(left_child);
31
+ auto right_data = FlatVector::GetData<NUMERIC_TYPE>(right_child);
32
+ // The lambda parameters below shadow the outer `left`/`right` vectors with the per-row list entries.
33
+ BinaryExecutor::Execute<list_entry_t, list_entry_t, NUMERIC_TYPE>(
34
+ left, right, result, count, [&](list_entry_t left, list_entry_t right) {
35
+ if (left.length != right.length) {
36
+ throw InvalidInputException(StringUtil::Format(
37
+ "list_distance: list dimensions must be equal, got left length %d and right length %d", left.length,
38
+ right.length));
39
+ }
40
+
41
+ auto dimensions = left.length;
42
+
43
+ NUMERIC_TYPE distance = 0; // running sum of squared differences; square-rooted on return
44
+
45
+ auto l_ptr = left_data + left.offset;
46
+ auto r_ptr = right_data + right.offset;
47
+
48
+ for (idx_t i = 0; i < dimensions; i++) {
49
+ auto x = *l_ptr++;
50
+ auto y = *r_ptr++;
51
+ auto diff = x - y;
52
+ distance += diff * diff;
53
+ }
54
+
55
+ return std::sqrt(distance);
56
+ });
57
+
58
+ if (args.AllConstant()) {
59
+ result.SetVectorType(VectorType::CONSTANT_VECTOR);
60
+ }
61
+ }
62
+
63
// Builds the "list_distance" function set with two overloads:
//   (LIST(FLOAT), LIST(FLOAT)) -> FLOAT     bound to ListDistance<float>
//   (LIST(DOUBLE), LIST(DOUBLE)) -> DOUBLE  bound to ListDistance<double>
+ ScalarFunctionSet ListDistanceFun::GetFunctions() {
64
+ ScalarFunctionSet set("list_distance");
65
+ set.AddFunction(ScalarFunction({LogicalType::LIST(LogicalType::FLOAT), LogicalType::LIST(LogicalType::FLOAT)},
66
+ LogicalType::FLOAT, ListDistance<float>));
67
+ set.AddFunction(ScalarFunction({LogicalType::LIST(LogicalType::DOUBLE), LogicalType::LIST(LogicalType::DOUBLE)},
68
+ LogicalType::DOUBLE, ListDistance<double>));
69
+ return set;
70
+ }
71
+
72
+ } // namespace duckdb
@@ -0,0 +1,70 @@
1
+ #include "duckdb/core_functions/scalar/list_functions.hpp"
2
+
3
+ namespace duckdb {
4
+
5
// Scalar-function body for list_inner_product over two LIST columns whose elements are NUMERIC_TYPE.
// For each row it returns the sum of the element-wise products of the paired lists.
//   args:   two-column chunk (asserted); column 0 is the left list, column 1 the right list.
//   result: receives one NUMERIC_TYPE per row; switched to CONSTANT_VECTOR when all args are constant.
// Throws InvalidInputException when either child vector has a NULL element within its list size,
// or when the two lists of a row differ in length.
+ template <class NUMERIC_TYPE>
6
+ static void ListInnerProduct(DataChunk &args, ExpressionState &, Vector &result) {
7
+ D_ASSERT(args.ColumnCount() == 2);
8
+
9
+ auto count = args.size(); // row count handed to the executor below
10
+ auto &left = args.data[0];
11
+ auto &right = args.data[1];
12
+ auto left_count = ListVector::GetListSize(left); // bounds the validity check on the left child
13
+ auto right_count = ListVector::GetListSize(right); // bounds the validity check on the right child
14
+
15
+ auto &left_child = ListVector::GetEntry(left);
16
+ auto &right_child = ListVector::GetEntry(right);
17
+
18
+ D_ASSERT(left_child.GetVectorType() == VectorType::FLAT_VECTOR);
19
+ D_ASSERT(right_child.GetVectorType() == VectorType::FLAT_VECTOR);
20
+ // NULL elements are rejected up front for the whole child vector, not row by row.
21
+ if (!FlatVector::Validity(left_child).CheckAllValid(left_count)) {
22
+ throw InvalidInputException("list_inner_product: left argument can not contain NULL values");
23
+ }
24
+
25
+ if (!FlatVector::Validity(right_child).CheckAllValid(right_count)) {
26
+ throw InvalidInputException("list_inner_product: right argument can not contain NULL values");
27
+ }
28
+ // Raw element arrays; each row addresses its slice through list_entry_t offset/length.
29
+ auto left_data = FlatVector::GetData<NUMERIC_TYPE>(left_child);
30
+ auto right_data = FlatVector::GetData<NUMERIC_TYPE>(right_child);
31
+ // The lambda parameters below shadow the outer `left`/`right` vectors with the per-row list entries.
32
+ BinaryExecutor::Execute<list_entry_t, list_entry_t, NUMERIC_TYPE>(
33
+ left, right, result, count, [&](list_entry_t left, list_entry_t right) {
34
+ if (left.length != right.length) {
35
+ throw InvalidInputException(StringUtil::Format(
36
+ "list_inner_product: list dimensions must be equal, got left length %d and right length %d",
37
+ left.length, right.length));
38
+ }
39
+
40
+ auto dimensions = left.length;
41
+
42
+ NUMERIC_TYPE distance = 0; // despite the name, this accumulates the inner product
43
+
44
+ auto l_ptr = left_data + left.offset;
45
+ auto r_ptr = right_data + right.offset;
46
+
47
+ for (idx_t i = 0; i < dimensions; i++) {
48
+ auto x = *l_ptr++;
49
+ auto y = *r_ptr++;
50
+ distance += x * y;
51
+ }
52
+
53
+ return distance;
54
+ });
55
+
56
+ if (args.AllConstant()) {
57
+ result.SetVectorType(VectorType::CONSTANT_VECTOR);
58
+ }
59
+ }
60
+
61
// Builds the "list_inner_product" function set with two overloads:
//   (LIST(FLOAT), LIST(FLOAT)) -> FLOAT     bound to ListInnerProduct<float>
//   (LIST(DOUBLE), LIST(DOUBLE)) -> DOUBLE  bound to ListInnerProduct<double>
+ ScalarFunctionSet ListInnerProductFun::GetFunctions() {
62
+ ScalarFunctionSet set("list_inner_product");
63
+ set.AddFunction(ScalarFunction({LogicalType::LIST(LogicalType::FLOAT), LogicalType::LIST(LogicalType::FLOAT)},
64
+ LogicalType::FLOAT, ListInnerProduct<float>));
65
+ set.AddFunction(ScalarFunction({LogicalType::LIST(LogicalType::DOUBLE), LogicalType::LIST(LogicalType::DOUBLE)},
66
+ LogicalType::DOUBLE, ListInnerProduct<double>));
67
+ return set;
68
+ }
69
+
70
+ } // namespace duckdb
@@ -0,0 +1,32 @@
1
+ #include "duckdb/common/exception.hpp"
2
+ #include "duckdb/common/vector_operations/unary_executor.hpp"
3
+ #include "duckdb/core_functions/scalar/string_functions.hpp"
4
+ #include "mbedtls_wrapper.hpp"
5
+
6
+ namespace duckdb {
7
+
8
// Per-value operator used with UnaryExecutor::ExecuteString to hash a string with SHA-256.
// Operation reserves an empty string of SHA256_HASH_LENGTH_TEXT bytes inside `result`, feeds the
// input into a SHA256State, lets FinishHex write into that buffer, finalizes it and returns it.
+ struct SHA256Operator {
9
+ template <class INPUT_TYPE, class RESULT_TYPE>
10
+ static RESULT_TYPE Operation(INPUT_TYPE input, Vector &result) {
11
+ auto hash = StringVector::EmptyString(result, duckdb_mbedtls::MbedTlsWrapper::SHA256_HASH_LENGTH_TEXT); // presumably the hex digest length - confirm in mbedtls_wrapper.hpp
12
+
13
+ duckdb_mbedtls::MbedTlsWrapper::SHA256State state;
14
+ state.AddString(input.GetString());
15
+ state.FinishHex(hash.GetDataWriteable()); // writes directly into the buffer reserved above
16
+
17
+ hash.Finalize();
18
+ return hash;
19
+ }
20
+ };
21
+
22
// Scalar-function body for SHA-256: applies SHA256Operator to every value of the first argument
// column (args.size() rows) and writes the string results into `result`.
// The `state` parameter is not used here.
+ static void SHA256Function(DataChunk &args, ExpressionState &state, Vector &result) {
23
+ auto &input = args.data[0];
24
+
25
+ UnaryExecutor::ExecuteString<string_t, string_t, SHA256Operator>(input, result, args.size());
26
+ }
27
+
28
// Returns the scalar function definition: one VARCHAR argument, VARCHAR result, implemented by SHA256Function.
+ ScalarFunction SHA256Fun::GetFunction() {
29
+ return ScalarFunction({LogicalType::VARCHAR}, LogicalType::VARCHAR, SHA256Function);
30
+ }
31
+
32
+ } // namespace duckdb