duckdb 0.8.2-dev2399.0 → 0.8.2-dev2669.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (73)
  1. package/binding.gyp +1 -0
  2. package/package.json +1 -1
  3. package/src/duckdb/extension/icu/icu-datepart.cpp +3 -3
  4. package/src/duckdb/src/catalog/catalog_entry/duck_table_entry.cpp +1 -1
  5. package/src/duckdb/src/catalog/default/default_functions.cpp +5 -0
  6. package/src/duckdb/src/common/enum_util.cpp +35 -1
  7. package/src/duckdb/src/common/http_state.cpp +78 -0
  8. package/src/duckdb/src/core_functions/function_list.cpp +2 -2
  9. package/src/duckdb/src/core_functions/scalar/list/array_slice.cpp +314 -82
  10. package/src/duckdb/src/execution/expression_executor/execute_parameter.cpp +2 -2
  11. package/src/duckdb/src/execution/index/art/art.cpp +43 -31
  12. package/src/duckdb/src/execution/index/art/leaf.cpp +47 -33
  13. package/src/duckdb/src/execution/index/art/node.cpp +31 -24
  14. package/src/duckdb/src/execution/index/art/prefix.cpp +100 -16
  15. package/src/duckdb/src/execution/operator/schema/physical_create_index.cpp +54 -31
  16. package/src/duckdb/src/execution/physical_plan/plan_create_index.cpp +32 -15
  17. package/src/duckdb/src/function/table/arrow/arrow_duck_schema.cpp +57 -0
  18. package/src/duckdb/src/function/table/arrow.cpp +95 -92
  19. package/src/duckdb/src/function/table/arrow_conversion.cpp +45 -68
  20. package/src/duckdb/src/function/table/version/pragma_version.cpp +2 -2
  21. package/src/duckdb/src/include/duckdb/common/case_insensitive_map.hpp +1 -0
  22. package/src/duckdb/src/include/duckdb/common/enum_util.hpp +8 -0
  23. package/src/duckdb/src/include/duckdb/common/helper.hpp +8 -3
  24. package/src/duckdb/src/include/duckdb/common/http_state.hpp +61 -28
  25. package/src/duckdb/src/include/duckdb/common/types/value.hpp +4 -1
  26. package/src/duckdb/src/include/duckdb/core_functions/scalar/list_functions.hpp +4 -4
  27. package/src/duckdb/src/include/duckdb/execution/index/art/art.hpp +7 -5
  28. package/src/duckdb/src/include/duckdb/execution/index/art/leaf.hpp +6 -6
  29. package/src/duckdb/src/include/duckdb/execution/index/art/node.hpp +6 -0
  30. package/src/duckdb/src/include/duckdb/execution/index/art/prefix.hpp +9 -11
  31. package/src/duckdb/src/include/duckdb/execution/operator/schema/physical_create_index.hpp +8 -1
  32. package/src/duckdb/src/include/duckdb/function/table/arrow/arrow_duck_schema.hpp +99 -0
  33. package/src/duckdb/src/include/duckdb/function/table/arrow.hpp +6 -36
  34. package/src/duckdb/src/include/duckdb/main/capi/capi_internal.hpp +3 -1
  35. package/src/duckdb/src/include/duckdb/main/client_context.hpp +15 -14
  36. package/src/duckdb/src/include/duckdb/main/prepared_statement.hpp +73 -5
  37. package/src/duckdb/src/include/duckdb/main/prepared_statement_data.hpp +6 -6
  38. package/src/duckdb/src/include/duckdb/parser/expression/operator_expression.hpp +20 -3
  39. package/src/duckdb/src/include/duckdb/parser/expression/parameter_expression.hpp +17 -1
  40. package/src/duckdb/src/include/duckdb/parser/statement/execute_statement.hpp +1 -1
  41. package/src/duckdb/src/include/duckdb/parser/transformer.hpp +5 -3
  42. package/src/duckdb/src/include/duckdb/planner/bound_parameter_map.hpp +2 -1
  43. package/src/duckdb/src/include/duckdb/planner/expression/bound_parameter_data.hpp +20 -5
  44. package/src/duckdb/src/include/duckdb/planner/expression/bound_parameter_expression.hpp +3 -3
  45. package/src/duckdb/src/include/duckdb/planner/planner.hpp +4 -3
  46. package/src/duckdb/src/include/duckdb/storage/object_cache.hpp +1 -1
  47. package/src/duckdb/src/include/duckdb/verification/prepared_statement_verifier.hpp +1 -1
  48. package/src/duckdb/src/include/duckdb.h +16 -0
  49. package/src/duckdb/src/main/capi/pending-c.cpp +6 -0
  50. package/src/duckdb/src/main/capi/prepared-c.cpp +52 -4
  51. package/src/duckdb/src/main/client_context.cpp +27 -17
  52. package/src/duckdb/src/main/client_verify.cpp +17 -0
  53. package/src/duckdb/src/main/extension/extension_helper.cpp +2 -1
  54. package/src/duckdb/src/main/prepared_statement.cpp +38 -11
  55. package/src/duckdb/src/main/prepared_statement_data.cpp +23 -18
  56. package/src/duckdb/src/parser/expression/parameter_expression.cpp +7 -7
  57. package/src/duckdb/src/parser/statement/execute_statement.cpp +2 -2
  58. package/src/duckdb/src/parser/transform/expression/transform_array_access.cpp +13 -4
  59. package/src/duckdb/src/parser/transform/expression/transform_param_ref.cpp +45 -26
  60. package/src/duckdb/src/parser/transform/statement/transform_prepare.cpp +28 -6
  61. package/src/duckdb/src/parser/transformer.cpp +27 -9
  62. package/src/duckdb/src/planner/binder/expression/bind_parameter_expression.cpp +10 -10
  63. package/src/duckdb/src/planner/binder/statement/bind_execute.cpp +13 -7
  64. package/src/duckdb/src/planner/expression/bound_parameter_expression.cpp +13 -13
  65. package/src/duckdb/src/planner/planner.cpp +7 -6
  66. package/src/duckdb/src/storage/checkpoint_manager.cpp +1 -1
  67. package/src/duckdb/src/storage/serialization/serialize_expression.cpp +3 -3
  68. package/src/duckdb/src/storage/serialization/serialize_parsed_expression.cpp +2 -2
  69. package/src/duckdb/src/verification/prepared_statement_verifier.cpp +16 -11
  70. package/src/duckdb/third_party/libpg_query/include/nodes/parsenodes.hpp +1 -0
  71. package/src/duckdb/third_party/libpg_query/src_backend_parser_gram.cpp +12855 -12282
  72. package/src/duckdb/ub_src_common.cpp +2 -0
  73. package/src/duckdb/ub_src_function_table_arrow.cpp +2 -0
@@ -42,19 +42,30 @@ void Leaf::New(ART &art, reference<Node> &node, const row_t *row_ids, idx_t coun
 
 void Leaf::Free(ART &art, Node &node) {
 
-	D_ASSERT(node.IsSet() && !node.IsSerialized());
-	auto &child = Leaf::Get(art, node).ptr;
-	Node::Free(art, child);
+	Node current_node = node;
+	Node next_node;
+	while (current_node.IsSet() && !current_node.IsSerialized()) {
+		next_node = Leaf::Get(art, current_node).ptr;
+		Node::GetAllocator(art, NType::LEAF).Free(current_node);
+		current_node = next_node;
+	}
+
+	node.Reset();
 }
 
 void Leaf::InitializeMerge(ART &art, Node &node, const ARTFlags &flags) {
 
-	D_ASSERT(node.IsSet() && !node.IsSerialized());
-	D_ASSERT(node.GetType() == NType::LEAF);
+	auto merge_buffer_count = flags.merge_buffer_counts[(uint8_t)NType::LEAF - 1];
 
-	auto &leaf = Leaf::Get(art, node);
-	if (leaf.ptr.IsSet()) {
-		leaf.ptr.InitializeMerge(art, flags);
+	Node next_node = node;
+	node.AddToBufferID(merge_buffer_count);
+
+	while (next_node.IsSet()) {
+		auto &leaf = Leaf::Get(art, next_node);
+		next_node = leaf.ptr;
+		if (leaf.ptr.IsSet()) {
+			leaf.ptr.AddToBufferID(merge_buffer_count);
+		}
 	}
 }
 
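Note: Leaf::Free now walks the chained leaf nodes with two cursors instead of recursing, freeing each link through the LEAF allocator before resetting the handle. A minimal sketch of the same pattern on a plain singly linked list (the names here are illustrative, not DuckDB's):

```cpp
// Illustrative stand-in for a chained node.
struct ChainNode {
	ChainNode *next = nullptr;
};

// Iterative free: save the successor before releasing the current link,
// so the walk never touches freed memory and never grows the call stack.
void FreeChain(ChainNode *&head) {
	ChainNode *current = head;
	while (current != nullptr) {
		ChainNode *next = current->next; // save successor first
		delete current;                  // release current link
		current = next;
	}
	head = nullptr; // mirrors node.Reset(): leave the handle empty
}
```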
@@ -290,7 +301,6 @@ string Leaf::VerifyAndToString(ART &art, Node &node) {
 		return "Leaf [count: 1, row ID: " + to_string(node.GetRowId()) + "]";
 	}
 
-	// NOTE: we could do this recursively, but the function-call overhead can become kinda crazy
 	string str = "";
 
 	reference<Node> node_ref(node);
@@ -322,46 +332,51 @@ BlockPointer Leaf::Serialize(ART &art, Node &node, MetaBlockWriter &writer) {
 		return block_pointer;
 	}
 
-	// recurse into the child and retrieve its block pointer
-	auto &leaf = Leaf::Get(art, node);
-	auto child_block_pointer = leaf.ptr.Serialize(art, writer);
-
-	// get pointer and write fields
 	auto block_pointer = writer.GetBlockPointer();
 	writer.Write(NType::LEAF);
-	writer.Write<uint8_t>(leaf.count);
+	idx_t total_count = Leaf::TotalCount(art, node);
+	writer.Write<idx_t>(total_count);
 
-	// write row IDs
-	for (idx_t i = 0; i < leaf.count; i++) {
-		writer.Write(leaf.row_ids[i]);
-	}
+	// iterate all leaves and write their row IDs
+	reference<Node> ref_node(node);
+	while (ref_node.get().IsSet()) {
+		D_ASSERT(!ref_node.get().IsSerialized());
+		auto &leaf = Leaf::Get(art, ref_node);
 
-	// write child block pointer
-	writer.Write(child_block_pointer.block_id);
-	writer.Write(child_block_pointer.offset);
+		// write row IDs
+		for (idx_t i = 0; i < leaf.count; i++) {
+			writer.Write(leaf.row_ids[i]);
+		}
+		ref_node = leaf.ptr;
+	}
 
 	return block_pointer;
 }
 
 void Leaf::Deserialize(ART &art, Node &node, MetaBlockReader &reader) {
 
-	D_ASSERT(node.GetType() == NType::LEAF);
+	auto total_count = reader.Read<idx_t>();
+	reference<Node> ref_node(node);
 
-	auto &leaf = Leaf::Get(art, node);
-	leaf.count = reader.Read<uint8_t>();
+	while (total_count) {
+		ref_node.get() = Node::GetAllocator(art, NType::LEAF).New();
+		ref_node.get().SetType((uint8_t)NType::LEAF);
 
-	// read row IDs
-	for (idx_t i = 0; i < leaf.count; i++) {
-		leaf.row_ids[i] = reader.Read<row_t>();
-	}
+		auto &leaf = Leaf::Get(art, ref_node);
+
+		leaf.count = MinValue((idx_t)Node::LEAF_SIZE, total_count);
+		for (idx_t i = 0; i < leaf.count; i++) {
+			leaf.row_ids[i] = reader.Read<row_t>();
+		}
 
-	// read child block pointer
-	leaf.ptr = Node(reader);
+		total_count -= leaf.count;
+		ref_node = leaf.ptr;
+		leaf.ptr.Reset();
+	}
 }
 
 void Leaf::Vacuum(ART &art, Node &node) {
 
-	// NOTE: we could do this recursively, but the function-call overhead can become kinda crazy
 	auto &allocator = Node::GetAllocator(art, NType::LEAF);
 
 	reference<Node> node_ref(node);
@@ -373,7 +388,6 @@ void Leaf::Vacuum(ART &art, Node &node) {
 		auto &leaf = Leaf::Get(art, node_ref);
 		node_ref = leaf.ptr;
 	}
-	return;
 }
 
 void Leaf::MoveInlinedToLeaf(ART &art, Node &node) {
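Note: with these changes a serialized leaf no longer stores a per-node uint8_t count plus a child block pointer; it stores one idx_t total count followed by every row ID in the chain, and Deserialize rebuilds the chain in segments of at most Node::LEAF_SIZE row IDs. A minimal sketch of that segmenting logic, with kLeafSize standing in for Node::LEAF_SIZE (the real value differs):

```cpp
#include <cstdint>
#include <vector>

using row_t = int64_t;
using idx_t = uint64_t;

// Assumed segment capacity for illustration only.
constexpr idx_t kLeafSize = 4;

// New on-disk layout, flattened: all row IDs back to back, regardless of
// how many chained leaf segments held them in memory.
std::vector<row_t> SerializeChain(const std::vector<std::vector<row_t>> &segments) {
	std::vector<row_t> out;
	for (auto &segment : segments) {
		out.insert(out.end(), segment.begin(), segment.end());
	}
	return out;
}

// Deserialization rebuilds the chain greedily, kLeafSize row IDs per segment,
// mirroring leaf.count = MinValue(LEAF_SIZE, total_count) in the diff.
std::vector<std::vector<row_t>> DeserializeChain(const std::vector<row_t> &flat) {
	std::vector<std::vector<row_t>> segments;
	idx_t remaining = flat.size();
	idx_t offset = 0;
	while (remaining > 0) {
		idx_t count = remaining < kLeafSize ? remaining : kLeafSize;
		segments.emplace_back(flat.begin() + offset, flat.begin() + offset + count);
		offset += count;
		remaining -= count;
	}
	return segments;
}
```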
@@ -61,7 +61,6 @@ void Node::New(ART &art, Node &node, const NType type) {
 
 void Node::Free(ART &art, Node &node) {
 
-	// recursively free all nodes that are in-memory, and skip serialized and empty nodes
 	if (!node.IsSet()) {
 		return;
 	}
@@ -72,11 +71,11 @@ void Node::Free(ART &art, Node &node) {
 	auto type = node.GetType();
 	switch (type) {
 	case NType::PREFIX:
-		Prefix::Free(art, node);
-		break;
+		// iterative
+		return Prefix::Free(art, node);
 	case NType::LEAF:
-		Leaf::Free(art, node);
-		break;
+		// iterative
+		return Leaf::Free(art, node);
 	case NType::NODE_4:
 		Node4::Free(art, node);
 		break;
@@ -90,8 +89,7 @@ void Node::Free(ART &art, Node &node) {
 		Node256::Free(art, node);
 		break;
 	case NType::LEAF_INLINED:
-		node.Reset();
-		return;
+		return node.Reset();
 	}
 
 	Node::GetAllocator(art, type).Free(node);
@@ -236,8 +234,10 @@ BlockPointer Node::Serialize(ART &art, MetaBlockWriter &writer) {
 
 	switch (GetType()) {
 	case NType::PREFIX:
-		return Prefix::Get(art, *this).Serialize(art, writer);
+		// iterative
+		return Prefix::Serialize(art, *this, writer);
 	case NType::LEAF:
+		// iterative
 		return Leaf::Serialize(art, *this, writer);
 	case NType::NODE_4:
 		return Node4::Get(art, *this).Serialize(art, writer);
@@ -263,19 +263,23 @@ void Node::Deserialize(ART &art) {
 	SetType(reader.Read<uint8_t>());
 
 	auto decoded_type = GetType();
+
+	// iterative functions
+	if (decoded_type == NType::PREFIX) {
+		return Prefix::Deserialize(art, *this, reader);
+	}
 	if (decoded_type == NType::LEAF_INLINED) {
-		SetRowId(reader.Read<row_t>());
-		return;
+		return SetRowId(reader.Read<row_t>());
+	}
+	if (decoded_type == NType::LEAF) {
+		return Leaf::Deserialize(art, *this, reader);
 	}
 
 	*this = Node::GetAllocator(art, decoded_type).New();
 	SetType((uint8_t)decoded_type);
 
+	// recursive functions
 	switch (decoded_type) {
-	case NType::PREFIX:
-		return Prefix::Get(art, *this).Deserialize(reader);
-	case NType::LEAF:
-		return Leaf::Deserialize(art, *this, reader);
 	case NType::NODE_4:
 		return Node4::Get(art, *this).Deserialize(reader);
 	case NType::NODE_16:
@@ -363,7 +367,7 @@ NType Node::GetARTNodeTypeByCount(const idx_t count) {
 }
 
 FixedSizeAllocator &Node::GetAllocator(const ART &art, NType type) {
-	return *art.allocators[(uint8_t)type - 1];
+	return (*art.allocators)[(uint8_t)type - 1];
 }
 
 //===--------------------------------------------------------------------===//
@@ -377,11 +381,11 @@ void Node::InitializeMerge(ART &art, const ARTFlags &flags) {
 
 	switch (GetType()) {
 	case NType::PREFIX:
-		Prefix::Get(art, *this).InitializeMerge(art, flags);
-		break;
+		// iterative
+		return Prefix::InitializeMerge(art, *this, flags);
 	case NType::LEAF:
-		Leaf::InitializeMerge(art, *this, flags);
-		break;
+		// iterative
+		return Leaf::InitializeMerge(art, *this, flags);
 	case NType::NODE_4:
 		Node4::Get(art, *this).InitializeMerge(art, flags);
 		break;
@@ -398,8 +402,7 @@ void Node::InitializeMerge(ART &art, const ARTFlags &flags) {
 		return;
 	}
 
-	// NOTE: this works because the rightmost 32 bits contain the buffer ID
-	data += flags.merge_buffer_counts[(uint8_t)GetType() - 1];
+	AddToBufferID(flags.merge_buffer_counts[(uint8_t)GetType() - 1]);
 }
 
 bool Node::Merge(ART &art, Node &other) {
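Note: the raw `data += ...` is replaced by AddToBufferID, which encapsulates the invariant the deleted comment relied on: the rightmost 32 bits of a node pointer hold the buffer ID, so adding the per-type merge offset shifts only that field. A hedged sketch of such an encoding (the field widths here are an assumption for illustration, not DuckDB's exact layout):

```cpp
#include <cstdint>

// Hypothetical packed pointer: high bits carry type/offset metadata,
// low 32 bits carry the buffer ID.
struct PackedPointer {
	uint64_t data = 0;

	uint32_t GetBufferID() const {
		return static_cast<uint32_t>(data & 0xFFFFFFFFu);
	}

	// Equivalent of AddToBufferID: because the buffer ID occupies the low
	// bits and the shift is assumed not to overflow into the high bits,
	// a plain add on `data` adjusts only the buffer ID.
	void AddToBufferID(uint32_t merge_buffer_count) {
		data += merge_buffer_count;
	}
};
```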
@@ -572,11 +575,16 @@ void Node::Vacuum(ART &art, const ARTFlags &flags) {
 	}
 
 	auto node_type = GetType();
+
+	// iterative functions
+	if (node_type == NType::PREFIX) {
+		return Prefix::Vacuum(art, *this, flags);
+	}
 	if (node_type == NType::LEAF_INLINED) {
 		return;
 	}
 	if (node_type == NType::LEAF) {
-		if (flags.vacuum_flags[(uint8_t)GetType() - 1]) {
+		if (flags.vacuum_flags[(uint8_t)node_type - 1]) {
 			Leaf::Vacuum(art, *this);
 		}
 		return;
@@ -589,9 +597,8 @@ void Node::Vacuum(ART &art, const ARTFlags &flags) {
 		SetType((uint8_t)node_type);
 	}
 
+	// recursive functions
 	switch (node_type) {
-	case NType::PREFIX:
-		return Prefix::Get(art, *this).Vacuum(art, flags);
 	case NType::NODE_4:
 		return Node4::Get(art, *this).Vacuum(art, flags);
 	case NType::NODE_16:
@@ -55,9 +55,35 @@ void Prefix::New(ART &art, reference<Node> &node, const ARTKey &key, const uint3
 
 void Prefix::Free(ART &art, Node &node) {
 
-	D_ASSERT(node.IsSet() && !node.IsSerialized());
-	auto &child = Prefix::Get(art, node).ptr;
-	Node::Free(art, child);
+	Node current_node = node;
+	Node next_node;
+	while (current_node.IsSet() && !current_node.IsSerialized() && current_node.GetType() == NType::PREFIX) {
+		next_node = Prefix::Get(art, current_node).ptr;
+		Node::GetAllocator(art, NType::PREFIX).Free(current_node);
+		current_node = next_node;
+	}
+
+	Node::Free(art, current_node);
+	node.Reset();
+}
+
+void Prefix::InitializeMerge(ART &art, Node &node, const ARTFlags &flags) {
+
+	auto merge_buffer_count = flags.merge_buffer_counts[(uint8_t)NType::PREFIX - 1];
+
+	Node next_node = node;
+	reference<Prefix> prefix = Prefix::Get(art, next_node);
+
+	while (next_node.GetType() == NType::PREFIX) {
+		next_node = prefix.get().ptr;
+		if (prefix.get().ptr.GetType() == NType::PREFIX) {
+			prefix.get().ptr.AddToBufferID(merge_buffer_count);
+			prefix = Prefix::Get(art, next_node);
+		}
+	}
+
+	node.AddToBufferID(merge_buffer_count);
+	prefix.get().ptr.InitializeMerge(art, flags);
 }
 
 void Prefix::Concatenate(ART &art, Node &prefix_node, const uint8_t byte, Node &child_prefix_node) {
@@ -280,19 +306,28 @@ string Prefix::VerifyAndToString(ART &art, Node &node, const bool only_verify) {
 	return str + node_ref.get().VerifyAndToString(art, only_verify);
 }
 
-BlockPointer Prefix::Serialize(ART &art, MetaBlockWriter &writer) {
+BlockPointer Prefix::Serialize(ART &art, Node &node, MetaBlockWriter &writer) {
 
-	// recurse into the child and retrieve its block pointer
-	auto child_block_pointer = ptr.Serialize(art, writer);
+	reference<Node> first_non_prefix(node);
+	idx_t total_count = Prefix::TotalCount(art, first_non_prefix);
+	auto child_block_pointer = first_non_prefix.get().Serialize(art, writer);
 
 	// get pointer and write fields
 	auto block_pointer = writer.GetBlockPointer();
 	writer.Write(NType::PREFIX);
-	writer.Write<uint8_t>(data[Node::PREFIX_SIZE]);
+	writer.Write<idx_t>(total_count);
+
+	reference<Node> current_node(node);
+	while (current_node.get().GetType() == NType::PREFIX) {
 
-	// write prefix bytes
-	for (idx_t i = 0; i < data[Node::PREFIX_SIZE]; i++) {
-		writer.Write(data[i]);
+		// write prefix bytes
+		D_ASSERT(!current_node.get().IsSerialized());
+		auto &prefix = Prefix::Get(art, current_node);
+		for (idx_t i = 0; i < prefix.data[Node::PREFIX_SIZE]; i++) {
+			writer.Write(prefix.data[i]);
+		}
+
+		current_node = prefix.ptr;
 	}
 
 	// write child block pointer
@@ -302,17 +337,48 @@ BlockPointer Prefix::Serialize(ART &art, MetaBlockWriter &writer) {
 	return block_pointer;
 }
 
-void Prefix::Deserialize(MetaBlockReader &reader) {
+void Prefix::Deserialize(ART &art, Node &node, MetaBlockReader &reader) {
+
+	auto total_count = reader.Read<idx_t>();
+	reference<Node> current_node(node);
+
+	while (total_count) {
+		current_node.get() = Node::GetAllocator(art, NType::PREFIX).New();
+		current_node.get().SetType((uint8_t)NType::PREFIX);
 
-	data[Node::PREFIX_SIZE] = reader.Read<uint8_t>();
+		auto &prefix = Prefix::Get(art, current_node);
+		prefix.data[Node::PREFIX_SIZE] = MinValue((idx_t)Node::PREFIX_SIZE, total_count);
 
-	// read bytes
-	for (idx_t i = 0; i < data[Node::PREFIX_SIZE]; i++) {
-		data[i] = reader.Read<uint8_t>();
+		// read bytes
+		for (idx_t i = 0; i < prefix.data[Node::PREFIX_SIZE]; i++) {
+			prefix.data[i] = reader.Read<uint8_t>();
+		}
+
+		total_count -= prefix.data[Node::PREFIX_SIZE];
+		current_node = prefix.ptr;
+		prefix.ptr.Reset();
 	}
 
 	// read child block pointer
-	ptr = Node(reader);
+	current_node.get() = Node(reader);
+}
+
+void Prefix::Vacuum(ART &art, Node &node, const ARTFlags &flags) {
+
+	bool flag_set = flags.vacuum_flags[(uint8_t)NType::PREFIX - 1];
+	auto &allocator = Node::GetAllocator(art, NType::PREFIX);
+
+	reference<Node> node_ref(node);
+	while (!node_ref.get().IsSerialized() && node_ref.get().GetType() == NType::PREFIX) {
+		if (flag_set && allocator.NeedsVacuum(node_ref)) {
+			node_ref.get() = allocator.VacuumPointer(node_ref);
+			node_ref.get().SetType((uint8_t)NType::PREFIX);
+		}
+		auto &prefix = Prefix::Get(art, node_ref);
+		node_ref = prefix.ptr;
+	}
+
+	node_ref.get().Vacuum(art, flags);
 }
 
 Prefix &Prefix::Append(ART &art, const uint8_t byte) {
@@ -356,4 +422,22 @@ void Prefix::Append(ART &art, Node other_prefix) {
 	D_ASSERT(prefix.get().ptr.GetType() != NType::PREFIX);
 }
 
+idx_t Prefix::TotalCount(ART &art, reference<Node> &node) {
+
+	// NOTE: first prefix in the prefix chain is already deserialized
+	D_ASSERT(node.get().IsSet() && !node.get().IsSerialized());
+
+	idx_t count = 0;
+	while (node.get().GetType() == NType::PREFIX) {
+		auto &prefix = Prefix::Get(art, node);
+		count += prefix.data[Node::PREFIX_SIZE];
+
+		if (prefix.ptr.IsSerialized()) {
+			prefix.ptr.Deserialize(art);
+		}
+		node = prefix.ptr;
+	}
+	return count;
+}
+
 } // namespace duckdb
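Note: the new Prefix::TotalCount sums the byte counts along a chain of PREFIX nodes, where each node keeps its count in the slot just past its payload, data[Node::PREFIX_SIZE], deserializing links on demand. A minimal sketch of the counting over an in-memory chain, with kPrefixSize standing in for Node::PREFIX_SIZE (the real value differs):

```cpp
#include <cstddef>
#include <cstdint>
#include <vector>

// Assumed capacity for illustration only.
constexpr size_t kPrefixSize = 15;

// Each chained prefix segment stores up to kPrefixSize bytes and keeps its
// byte count at data[kPrefixSize], matching the layout used in the diff.
struct PrefixSegment {
	uint8_t data[kPrefixSize + 1] = {};
};

// Sketch of Prefix::TotalCount: sum the per-segment counts along the chain.
size_t TotalCount(const std::vector<PrefixSegment> &chain) {
	size_t count = 0;
	for (auto &segment : chain) {
		count += segment.data[kPrefixSize];
	}
	return count;
}
```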
@@ -4,7 +4,9 @@
 #include "duckdb/catalog/catalog_entry/table_catalog_entry.hpp"
 #include "duckdb/catalog/catalog_entry/duck_index_entry.hpp"
 #include "duckdb/main/client_context.hpp"
+#include "duckdb/storage/index.hpp"
 #include "duckdb/storage/storage_manager.hpp"
+#include "duckdb/storage/table/append_state.hpp"
 #include "duckdb/main/database_manager.hpp"
 #include "duckdb/execution/index/art/art_key.hpp"
 #include "duckdb/execution/index/art/node.hpp"
@@ -15,10 +17,10 @@ namespace duckdb {
 PhysicalCreateIndex::PhysicalCreateIndex(LogicalOperator &op, TableCatalogEntry &table_p,
                                          const vector<column_t> &column_ids, unique_ptr<CreateIndexInfo> info,
                                          vector<unique_ptr<Expression>> unbound_expressions,
-                                         idx_t estimated_cardinality)
+                                         idx_t estimated_cardinality, const bool sorted)
     : PhysicalOperator(PhysicalOperatorType::CREATE_INDEX, op.types, estimated_cardinality),
-      table(table_p.Cast<DuckTableEntry>()), info(std::move(info)),
-      unbound_expressions(std::move(unbound_expressions)) {
+      table(table_p.Cast<DuckTableEntry>()), info(std::move(info)), unbound_expressions(std::move(unbound_expressions)),
+      sorted(sorted) {
 	// convert virtual column ids to storage column ids
 	for (auto &column_id : column_ids) {
 		storage_ids.push_back(table.GetColumns().LogicalToPhysical(LogicalIndex(column_id)).index);
@@ -86,43 +88,65 @@ unique_ptr<LocalSinkState> PhysicalCreateIndex::GetLocalSinkState(ExecutionConte
 	return std::move(state);
 }
 
-SinkResultType PhysicalCreateIndex::Sink(ExecutionContext &context, DataChunk &chunk, OperatorSinkInput &input) const {
+SinkResultType PhysicalCreateIndex::SinkUnsorted(Vector &row_identifiers, OperatorSinkInput &input) const {
 
-	D_ASSERT(chunk.ColumnCount() >= 2);
-	auto &lstate = input.local_state.Cast<CreateIndexLocalSinkState>();
-	auto &row_identifiers = chunk.data[chunk.ColumnCount() - 1];
+	auto &l_state = input.local_state.Cast<CreateIndexLocalSinkState>();
+	auto count = l_state.key_chunk.size();
 
-	// generate the keys for the given input
-	lstate.key_chunk.ReferenceColumns(chunk, lstate.key_column_ids);
-	lstate.arena_allocator.Reset();
-	ART::GenerateKeys(lstate.arena_allocator, lstate.key_chunk, lstate.keys);
+	// get the corresponding row IDs
+	row_identifiers.Flatten(count);
+	auto row_ids = FlatVector::GetData<row_t>(row_identifiers);
+
+	// insert the row IDs
+	auto &art = l_state.local_index->Cast<ART>();
+	for (idx_t i = 0; i < count; i++) {
+		if (!art.Insert(*art.tree, l_state.keys[i], 0, row_ids[i])) {
+			throw ConstraintException("Data contains duplicates on indexed column(s)");
+		}
+	}
 
+	return SinkResultType::NEED_MORE_INPUT;
+}
+
+SinkResultType PhysicalCreateIndex::SinkSorted(Vector &row_identifiers, OperatorSinkInput &input) const {
+
+	auto &l_state = input.local_state.Cast<CreateIndexLocalSinkState>();
 	auto &storage = table.GetStorage();
-	auto art = make_uniq<ART>(lstate.local_index->column_ids, lstate.local_index->table_io_manager,
-	                          lstate.local_index->unbound_expressions, lstate.local_index->constraint_type, storage.db);
-	if (!art->ConstructFromSorted(lstate.key_chunk.size(), lstate.keys, row_identifiers)) {
+	auto &l_index = l_state.local_index;
+
+	// create an ART from the chunk
+	auto art = make_uniq<ART>(l_index->column_ids, l_index->table_io_manager, l_index->unbound_expressions,
+	                          l_index->constraint_type, storage.db, l_index->Cast<ART>().allocators);
+	if (!art->ConstructFromSorted(l_state.key_chunk.size(), l_state.keys, row_identifiers)) {
 		throw ConstraintException("Data contains duplicates on indexed column(s)");
 	}
 
 	// merge into the local ART
-	if (!lstate.local_index->MergeIndexes(*art)) {
+	if (!l_index->MergeIndexes(*art)) {
 		throw ConstraintException("Data contains duplicates on indexed column(s)");
 	}
 
-#ifdef DEBUG
-	// ensure that all row IDs of this chunk exist in the ART
-	auto &local_art = lstate.local_index->Cast<ART>();
-	auto row_ids = FlatVector::GetData<row_t>(row_identifiers);
-	for (idx_t i = 0; i < lstate.key_chunk.size(); i++) {
-		auto leaf = local_art.Lookup(*local_art.tree, lstate.keys[i], 0);
-		D_ASSERT(leaf.IsSet());
-		D_ASSERT(Leaf::ContainsRowId(local_art, leaf, row_ids[i]));
-	}
-#endif
-
 	return SinkResultType::NEED_MORE_INPUT;
 }
 
+SinkResultType PhysicalCreateIndex::Sink(ExecutionContext &context, DataChunk &chunk, OperatorSinkInput &input) const {
+
+	D_ASSERT(chunk.ColumnCount() >= 2);
+
+	// generate the keys for the given input
+	auto &l_state = input.local_state.Cast<CreateIndexLocalSinkState>();
+	l_state.key_chunk.ReferenceColumns(chunk, l_state.key_column_ids);
+	l_state.arena_allocator.Reset();
+	ART::GenerateKeys(l_state.arena_allocator, l_state.key_chunk, l_state.keys);
+
+	// insert the keys and their corresponding row IDs
+	auto &row_identifiers = chunk.data[chunk.ColumnCount() - 1];
+	if (sorted) {
+		return SinkSorted(row_identifiers, input);
+	}
+	return SinkUnsorted(row_identifiers, input);
+}
+
 SinkCombineResultType PhysicalCreateIndex::Combine(ExecutionContext &context, OperatorSinkCombineInput &input) const {
 
 	auto &gstate = input.global_state.Cast<CreateIndexGlobalSinkState>();
@@ -133,18 +157,17 @@ SinkCombineResultType PhysicalCreateIndex::Combine(ExecutionContext &context, Op
 		throw ConstraintException("Data contains duplicates on indexed column(s)");
 	}
 
-	// vacuum excess memory
-	gstate.global_index->Vacuum();
-
 	return SinkCombineResultType::FINISHED;
 }
 
 SinkFinalizeType PhysicalCreateIndex::Finalize(Pipeline &pipeline, Event &event, ClientContext &context,
                                                OperatorSinkFinalizeInput &input) const {
 
-	// here, we just set the resulting global index as the newly created index of the table
-
+	// here, we set the resulting global index as the newly created index of the table
 	auto &state = input.global_state.Cast<CreateIndexGlobalSinkState>();
+
+	// vacuum excess memory and verify
+	state.global_index->Vacuum();
 	D_ASSERT(!state.global_index->VerifyAndToString(true).empty());
 
 	auto &storage = table.GetStorage();
@@ -68,27 +68,44 @@ unique_ptr<PhysicalOperator> PhysicalPlanGenerator::CreatePlan(LogicalCreateInde
 	null_filter->types.emplace_back(LogicalType::ROW_TYPE);
 	null_filter->children.push_back(std::move(projection));
 
-	// order operator
-
-	vector<BoundOrderByNode> orders;
-	vector<idx_t> projections;
-	for (idx_t i = 0; i < new_column_types.size() - 1; i++) {
-		auto col_expr = make_uniq_base<Expression, BoundReferenceExpression>(new_column_types[i], i);
-		orders.emplace_back(OrderType::ASCENDING, OrderByNullType::NULLS_FIRST, std::move(col_expr));
-		projections.emplace_back(i);
+	// determine if we sort the data prior to index creation
+	// we don't sort, if either VARCHAR or compound key
+	auto perform_sorting = true;
+	if (op.unbound_expressions.size() > 1) {
+		perform_sorting = false;
+	} else if (op.unbound_expressions[0]->return_type.InternalType() == PhysicalType::VARCHAR) {
+		perform_sorting = false;
 	}
-	projections.emplace_back(new_column_types.size() - 1);
-
-	auto physical_order =
-	    make_uniq<PhysicalOrder>(new_column_types, std::move(orders), std::move(projections), op.estimated_cardinality);
-	physical_order->children.push_back(std::move(null_filter));
 
 	// actual physical create index operator
 
 	auto physical_create_index =
 	    make_uniq<PhysicalCreateIndex>(op, op.table, op.info->column_ids, std::move(op.info),
-	                                   std::move(op.unbound_expressions), op.estimated_cardinality);
-	physical_create_index->children.push_back(std::move(physical_order));
+	                                   std::move(op.unbound_expressions), op.estimated_cardinality, perform_sorting);
+
+	if (perform_sorting) {
+
+		// optional order operator
+		vector<BoundOrderByNode> orders;
+		vector<idx_t> projections;
+		for (idx_t i = 0; i < new_column_types.size() - 1; i++) {
+			auto col_expr = make_uniq_base<Expression, BoundReferenceExpression>(new_column_types[i], i);
+			orders.emplace_back(OrderType::ASCENDING, OrderByNullType::NULLS_FIRST, std::move(col_expr));
+			projections.emplace_back(i);
+		}
+		projections.emplace_back(new_column_types.size() - 1);
+
+		auto physical_order = make_uniq<PhysicalOrder>(new_column_types, std::move(orders), std::move(projections),
+		                                               op.estimated_cardinality);
+		physical_order->children.push_back(std::move(null_filter));
+
+		physical_create_index->children.push_back(std::move(physical_order));
+	} else {
+
+		// no ordering
+		physical_create_index->children.push_back(std::move(null_filter));
+	}
+
 	return std::move(physical_create_index);
 }
 
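Note: the planner now keys the plan shape on a single predicate: sort the input before index creation only for single-column keys with a fixed-size physical type. A standalone sketch of that decision, with the types reduced for illustration (not DuckDB's actual classes):

```cpp
#include <vector>

enum class PhysicalType { VARCHAR, INT64 /* ... */ };

struct Expression {
	PhysicalType return_type;
};

// Mirror of the diff's logic: skip sorting for compound keys and for
// variable-size (VARCHAR) keys, since the sorted bulk-construction path
// only pays off for fixed-size, single-column keys.
bool PerformSorting(const std::vector<Expression> &unbound_expressions) {
	if (unbound_expressions.size() > 1) {
		return false; // compound key
	}
	if (unbound_expressions[0].return_type == PhysicalType::VARCHAR) {
		return false; // variable-size key
	}
	return true;
}
```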
@@ -0,0 +1,57 @@
+#include "duckdb/function/table/arrow/arrow_duck_schema.hpp"
+#include "duckdb/common/arrow/arrow.hpp"
+#include "duckdb/common/exception.hpp"
+
+namespace duckdb {
+
+void ArrowTableType::AddColumn(idx_t index, unique_ptr<ArrowType> type) {
+	D_ASSERT(arrow_convert_data.find(index) == arrow_convert_data.end());
+	arrow_convert_data.emplace(std::make_pair(index, std::move(type)));
+}
+
+const arrow_column_map_t &ArrowTableType::GetColumns() const {
+	return arrow_convert_data;
+}
+
+void ArrowType::AddChild(unique_ptr<ArrowType> child) {
+	children.emplace_back(std::move(child));
+}
+
+void ArrowType::AssignChildren(vector<unique_ptr<ArrowType>> children) {
+	D_ASSERT(this->children.empty());
+	this->children = std::move(children);
+}
+
+void ArrowType::SetDictionary(unique_ptr<ArrowType> dictionary) {
+	D_ASSERT(!this->dictionary_type);
+	dictionary_type = std::move(dictionary);
+}
+
+const ArrowType &ArrowType::GetDictionary() const {
+	D_ASSERT(dictionary_type);
+	return *dictionary_type;
+}
+
+const LogicalType &ArrowType::GetDuckType() const {
+	return type;
+}
+
+ArrowVariableSizeType ArrowType::GetSizeType() const {
+	return size_type;
+}
+
+ArrowDateTimeType ArrowType::GetDateTimeType() const {
+	return date_time_precision;
+}
+
+const ArrowType &ArrowType::operator[](idx_t index) const {
+	D_ASSERT(index < children.size());
+	return *children[index];
+}
+
+idx_t ArrowType::FixedSize() const {
+	D_ASSERT(size_type == ArrowVariableSizeType::FIXED_SIZE);
+	return fixed_size;
+}
+
+} // namespace duckdb
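Note: this new file gives the Arrow scan a typed registry: an ArrowTableType maps column indices to ArrowType entries, each of which can carry children (for nested types) and an optional dictionary. A hedged usage sketch; the ArrowType constructor call is an assumption, since the class declaration lives in the +99-line header (arrow_duck_schema.hpp) not shown in this hunk, while the method calls are exactly those defined above:

```cpp
#include "duckdb/function/table/arrow/arrow_duck_schema.hpp"

using namespace duckdb;

int main() {
	// Describe column 0 as a LIST whose child is a BIGINT.
	// (Constructor signature assumed for illustration.)
	auto child = make_uniq<ArrowType>(LogicalType::BIGINT);

	vector<unique_ptr<ArrowType>> children;
	children.push_back(std::move(child));

	auto list_type = make_uniq<ArrowType>(LogicalType::LIST(LogicalType::BIGINT));
	list_type->AssignChildren(std::move(children)); // asserts children was empty

	ArrowTableType table_types;
	table_types.AddColumn(0, std::move(list_type)); // asserts index is unused

	// Conversion code can then look a column up and drill into its children.
	auto &col_type = *table_types.GetColumns().at(0);
	auto &child_type = col_type[0]; // operator[] asserts index < children.size()
	(void)child_type;
	return 0;
}
```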