duckdb 0.8.2-dev3458.0 → 0.8.2-dev3949.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (180) hide show
  1. package/binding.gyp +2 -0
  2. package/package.json +1 -1
  3. package/src/duckdb/extension/icu/icu_extension.cpp +5 -5
  4. package/src/duckdb/extension/json/include/json_deserializer.hpp +7 -16
  5. package/src/duckdb/extension/json/include/json_serializer.hpp +9 -15
  6. package/src/duckdb/extension/json/json_deserializer.cpp +29 -67
  7. package/src/duckdb/extension/json/json_scan.cpp +1 -1
  8. package/src/duckdb/extension/json/json_serializer.cpp +26 -69
  9. package/src/duckdb/src/common/enum_util.cpp +119 -7
  10. package/src/duckdb/src/common/extra_type_info.cpp +7 -3
  11. package/src/duckdb/src/common/radix_partitioning.cpp +8 -31
  12. package/src/duckdb/src/common/row_operations/row_aggregate.cpp +18 -3
  13. package/src/duckdb/src/common/serializer/binary_deserializer.cpp +62 -77
  14. package/src/duckdb/src/common/serializer/binary_serializer.cpp +84 -84
  15. package/src/duckdb/src/common/serializer/format_serializer.cpp +1 -1
  16. package/src/duckdb/src/common/sort/partition_state.cpp +41 -33
  17. package/src/duckdb/src/common/types/data_chunk.cpp +44 -8
  18. package/src/duckdb/src/common/types/hyperloglog.cpp +21 -0
  19. package/src/duckdb/src/common/types/interval.cpp +3 -0
  20. package/src/duckdb/src/common/types/row/partitioned_tuple_data.cpp +252 -126
  21. package/src/duckdb/src/common/types/row/row_layout.cpp +3 -31
  22. package/src/duckdb/src/common/types/row/tuple_data_allocator.cpp +40 -32
  23. package/src/duckdb/src/common/types/row/tuple_data_collection.cpp +39 -26
  24. package/src/duckdb/src/common/types/row/tuple_data_layout.cpp +11 -1
  25. package/src/duckdb/src/common/types/row/tuple_data_segment.cpp +21 -16
  26. package/src/duckdb/src/common/types/value.cpp +63 -42
  27. package/src/duckdb/src/common/types/vector.cpp +33 -67
  28. package/src/duckdb/src/core_functions/scalar/list/list_lambdas.cpp +3 -2
  29. package/src/duckdb/src/execution/aggregate_hashtable.cpp +222 -364
  30. package/src/duckdb/src/execution/join_hashtable.cpp +5 -6
  31. package/src/duckdb/src/execution/operator/aggregate/physical_hash_aggregate.cpp +240 -310
  32. package/src/duckdb/src/execution/operator/aggregate/physical_ungrouped_aggregate.cpp +202 -173
  33. package/src/duckdb/src/execution/operator/aggregate/physical_window.cpp +36 -2
  34. package/src/duckdb/src/execution/operator/{persistent → csv_scanner}/base_csv_reader.cpp +58 -162
  35. package/src/duckdb/src/execution/operator/csv_scanner/buffered_csv_reader.cpp +434 -0
  36. package/src/duckdb/src/execution/operator/csv_scanner/csv_buffer.cpp +80 -0
  37. package/src/duckdb/src/execution/operator/csv_scanner/csv_buffer_manager.cpp +90 -0
  38. package/src/duckdb/src/execution/operator/csv_scanner/csv_file_handle.cpp +95 -0
  39. package/src/duckdb/src/execution/operator/{persistent → csv_scanner}/csv_reader_options.cpp +47 -28
  40. package/src/duckdb/src/execution/operator/csv_scanner/csv_state_machine.cpp +35 -0
  41. package/src/duckdb/src/execution/operator/csv_scanner/csv_state_machine_cache.cpp +107 -0
  42. package/src/duckdb/src/execution/operator/{persistent → csv_scanner}/parallel_csv_reader.cpp +44 -44
  43. package/src/duckdb/src/execution/operator/csv_scanner/sniffer/csv_sniffer.cpp +52 -0
  44. package/src/duckdb/src/execution/operator/csv_scanner/sniffer/dialect_detection.cpp +336 -0
  45. package/src/duckdb/src/execution/operator/csv_scanner/sniffer/header_detection.cpp +165 -0
  46. package/src/duckdb/src/execution/operator/csv_scanner/sniffer/type_detection.cpp +398 -0
  47. package/src/duckdb/src/execution/operator/csv_scanner/sniffer/type_refinement.cpp +175 -0
  48. package/src/duckdb/src/execution/operator/csv_scanner/sniffer/type_replacement.cpp +39 -0
  49. package/src/duckdb/src/execution/operator/join/physical_asof_join.cpp +1 -1
  50. package/src/duckdb/src/execution/operator/set/physical_recursive_cte.cpp +1 -2
  51. package/src/duckdb/src/execution/radix_partitioned_hashtable.cpp +614 -574
  52. package/src/duckdb/src/execution/window_executor.cpp +6 -5
  53. package/src/duckdb/src/function/cast/cast_function_set.cpp +1 -0
  54. package/src/duckdb/src/function/scalar/strftime_format.cpp +4 -4
  55. package/src/duckdb/src/function/table/copy_csv.cpp +94 -96
  56. package/src/duckdb/src/function/table/read_csv.cpp +150 -136
  57. package/src/duckdb/src/function/table/table_scan.cpp +0 -2
  58. package/src/duckdb/src/function/table/version/pragma_version.cpp +2 -2
  59. package/src/duckdb/src/include/duckdb/common/enum_util.hpp +24 -0
  60. package/src/duckdb/src/include/duckdb/common/file_opener.hpp +9 -0
  61. package/src/duckdb/src/include/duckdb/common/fixed_size_map.hpp +208 -0
  62. package/src/duckdb/src/include/duckdb/common/optional_idx.hpp +3 -0
  63. package/src/duckdb/src/include/duckdb/common/perfect_map_set.hpp +2 -1
  64. package/src/duckdb/src/include/duckdb/common/printer.hpp +11 -0
  65. package/src/duckdb/src/include/duckdb/common/serializer/binary_deserializer.hpp +43 -30
  66. package/src/duckdb/src/include/duckdb/common/serializer/binary_serializer.hpp +36 -35
  67. package/src/duckdb/src/include/duckdb/common/serializer/deserialization_data.hpp +18 -0
  68. package/src/duckdb/src/include/duckdb/common/serializer/encoding_util.hpp +132 -0
  69. package/src/duckdb/src/include/duckdb/common/serializer/format_deserializer.hpp +125 -150
  70. package/src/duckdb/src/include/duckdb/common/serializer/format_serializer.hpp +119 -107
  71. package/src/duckdb/src/include/duckdb/common/serializer/serialization_traits.hpp +2 -1
  72. package/src/duckdb/src/include/duckdb/common/shared_ptr.hpp +8 -0
  73. package/src/duckdb/src/include/duckdb/common/sort/partition_state.hpp +13 -7
  74. package/src/duckdb/src/include/duckdb/common/types/data_chunk.hpp +5 -0
  75. package/src/duckdb/src/include/duckdb/common/types/hyperloglog.hpp +7 -1
  76. package/src/duckdb/src/include/duckdb/common/types/interval.hpp +7 -0
  77. package/src/duckdb/src/include/duckdb/common/types/row/partitioned_tuple_data.hpp +41 -9
  78. package/src/duckdb/src/include/duckdb/common/types/row/row_data_collection_scanner.hpp +5 -0
  79. package/src/duckdb/src/include/duckdb/common/types/row/row_layout.hpp +1 -23
  80. package/src/duckdb/src/include/duckdb/common/types/row/tuple_data_allocator.hpp +14 -8
  81. package/src/duckdb/src/include/duckdb/common/types/row/tuple_data_collection.hpp +6 -3
  82. package/src/duckdb/src/include/duckdb/common/types/row/tuple_data_layout.hpp +7 -0
  83. package/src/duckdb/src/include/duckdb/common/types/row/tuple_data_segment.hpp +13 -8
  84. package/src/duckdb/src/include/duckdb/common/types/row/tuple_data_states.hpp +3 -2
  85. package/src/duckdb/src/include/duckdb/common/types/vector.hpp +3 -3
  86. package/src/duckdb/src/include/duckdb/common/vector.hpp +2 -2
  87. package/src/duckdb/src/include/duckdb/execution/aggregate_hashtable.hpp +125 -146
  88. package/src/duckdb/src/include/duckdb/execution/operator/aggregate/physical_hash_aggregate.hpp +5 -4
  89. package/src/duckdb/src/include/duckdb/execution/operator/aggregate/physical_window.hpp +4 -3
  90. package/src/duckdb/src/include/duckdb/execution/operator/{persistent → scan/csv}/base_csv_reader.hpp +17 -17
  91. package/src/duckdb/src/include/duckdb/execution/operator/scan/csv/buffered_csv_reader.hpp +72 -0
  92. package/src/duckdb/src/include/duckdb/execution/operator/scan/csv/csv_buffer.hpp +110 -0
  93. package/src/duckdb/src/include/duckdb/execution/operator/scan/csv/csv_buffer_manager.hpp +103 -0
  94. package/src/duckdb/src/include/duckdb/execution/operator/{persistent → scan/csv}/csv_file_handle.hpp +8 -15
  95. package/src/duckdb/src/include/duckdb/execution/operator/{persistent → scan/csv}/csv_line_info.hpp +1 -1
  96. package/src/duckdb/src/include/duckdb/execution/operator/{persistent → scan/csv}/csv_reader_options.hpp +52 -28
  97. package/src/duckdb/src/include/duckdb/execution/operator/scan/csv/csv_sniffer.hpp +127 -0
  98. package/src/duckdb/src/include/duckdb/execution/operator/scan/csv/csv_state_machine.hpp +75 -0
  99. package/src/duckdb/src/include/duckdb/execution/operator/scan/csv/csv_state_machine_cache.hpp +51 -0
  100. package/src/duckdb/src/include/duckdb/execution/operator/{persistent → scan/csv}/parallel_csv_reader.hpp +21 -27
  101. package/src/duckdb/src/include/duckdb/execution/operator/scan/csv/quote_rules.hpp +21 -0
  102. package/src/duckdb/src/include/duckdb/execution/radix_partitioned_hashtable.hpp +18 -27
  103. package/src/duckdb/src/include/duckdb/function/function_serialization.hpp +5 -6
  104. package/src/duckdb/src/include/duckdb/function/scalar/strftime_format.hpp +4 -4
  105. package/src/duckdb/src/include/duckdb/function/table/read_csv.hpp +17 -12
  106. package/src/duckdb/src/include/duckdb/main/client_context_file_opener.hpp +1 -0
  107. package/src/duckdb/src/include/duckdb/main/client_data.hpp +2 -1
  108. package/src/duckdb/src/include/duckdb/main/config.hpp +1 -0
  109. package/src/duckdb/src/include/duckdb/main/connection.hpp +2 -2
  110. package/src/duckdb/src/include/duckdb/main/relation/read_csv_relation.hpp +6 -6
  111. package/src/duckdb/src/include/duckdb/parallel/event.hpp +12 -1
  112. package/src/duckdb/src/include/duckdb/storage/block.hpp +6 -0
  113. package/src/duckdb/src/include/duckdb/storage/buffer/block_handle.hpp +3 -0
  114. package/src/duckdb/src/include/duckdb/storage/statistics/base_statistics.hpp +7 -3
  115. package/src/duckdb/src/include/duckdb/storage/statistics/column_statistics.hpp +4 -0
  116. package/src/duckdb/src/include/duckdb/storage/statistics/distinct_statistics.hpp +5 -0
  117. package/src/duckdb/src/include/duckdb/storage/statistics/list_stats.hpp +3 -0
  118. package/src/duckdb/src/include/duckdb/storage/statistics/numeric_stats.hpp +3 -0
  119. package/src/duckdb/src/include/duckdb/storage/statistics/string_stats.hpp +3 -0
  120. package/src/duckdb/src/include/duckdb/storage/statistics/struct_stats.hpp +3 -0
  121. package/src/duckdb/src/include/duckdb/storage/table/chunk_info.hpp +15 -3
  122. package/src/duckdb/src/include/duckdb/storage/table/row_group.hpp +4 -0
  123. package/src/duckdb/src/include/duckdb/storage/table/table_statistics.hpp +5 -0
  124. package/src/duckdb/src/include/duckdb/verification/deserialized_statement_verifier_v2.hpp +6 -0
  125. package/src/duckdb/src/include/duckdb/verification/statement_verifier.hpp +1 -0
  126. package/src/duckdb/src/include/duckdb.h +12 -0
  127. package/src/duckdb/src/main/capi/logical_types-c.cpp +22 -0
  128. package/src/duckdb/src/main/client_context_file_opener.cpp +17 -0
  129. package/src/duckdb/src/main/client_verify.cpp +1 -0
  130. package/src/duckdb/src/main/config.cpp +2 -2
  131. package/src/duckdb/src/main/connection.cpp +3 -3
  132. package/src/duckdb/src/main/relation/read_csv_relation.cpp +19 -13
  133. package/src/duckdb/src/parallel/pipeline_finish_event.cpp +1 -1
  134. package/src/duckdb/src/parser/tableref/pivotref.cpp +0 -16
  135. package/src/duckdb/src/planner/binder/statement/bind_copy.cpp +1 -1
  136. package/src/duckdb/src/planner/binder/statement/bind_export.cpp +41 -25
  137. package/src/duckdb/src/planner/expression/bound_aggregate_expression.cpp +4 -4
  138. package/src/duckdb/src/planner/expression/bound_window_expression.cpp +10 -10
  139. package/src/duckdb/src/planner/logical_operator.cpp +1 -1
  140. package/src/duckdb/src/planner/planner.cpp +1 -1
  141. package/src/duckdb/src/storage/checkpoint_manager.cpp +4 -3
  142. package/src/duckdb/src/storage/serialization/serialize_constraint.cpp +1 -1
  143. package/src/duckdb/src/storage/serialization/serialize_create_info.cpp +5 -5
  144. package/src/duckdb/src/storage/serialization/serialize_expression.cpp +10 -10
  145. package/src/duckdb/src/storage/serialization/serialize_logical_operator.cpp +20 -20
  146. package/src/duckdb/src/storage/serialization/serialize_macro_function.cpp +2 -2
  147. package/src/duckdb/src/storage/serialization/serialize_nodes.cpp +118 -89
  148. package/src/duckdb/src/storage/serialization/serialize_parse_info.cpp +3 -3
  149. package/src/duckdb/src/storage/serialization/serialize_parsed_expression.cpp +27 -27
  150. package/src/duckdb/src/storage/serialization/serialize_query_node.cpp +16 -16
  151. package/src/duckdb/src/storage/serialization/serialize_result_modifier.cpp +8 -8
  152. package/src/duckdb/src/storage/serialization/serialize_statement.cpp +1 -1
  153. package/src/duckdb/src/storage/serialization/serialize_storage.cpp +39 -0
  154. package/src/duckdb/src/storage/serialization/serialize_tableref.cpp +9 -9
  155. package/src/duckdb/src/storage/statistics/base_statistics.cpp +67 -4
  156. package/src/duckdb/src/storage/statistics/column_statistics.cpp +16 -0
  157. package/src/duckdb/src/storage/statistics/list_stats.cpp +21 -0
  158. package/src/duckdb/src/storage/statistics/numeric_stats.cpp +126 -1
  159. package/src/duckdb/src/storage/statistics/string_stats.cpp +23 -0
  160. package/src/duckdb/src/storage/statistics/struct_stats.cpp +27 -0
  161. package/src/duckdb/src/storage/storage_info.cpp +1 -1
  162. package/src/duckdb/src/storage/table/chunk_info.cpp +82 -3
  163. package/src/duckdb/src/storage/table/row_group.cpp +68 -1
  164. package/src/duckdb/src/storage/table/table_statistics.cpp +21 -0
  165. package/src/duckdb/src/storage/wal_replay.cpp +2 -2
  166. package/src/duckdb/src/verification/deserialized_statement_verifier_v2.cpp +15 -1
  167. package/src/duckdb/src/verification/statement_verifier.cpp +2 -0
  168. package/src/duckdb/third_party/utf8proc/include/utf8proc_wrapper.hpp +8 -0
  169. package/src/duckdb/ub_src_execution.cpp +0 -2
  170. package/src/duckdb/ub_src_execution_operator_csv_scanner.cpp +18 -0
  171. package/src/duckdb/ub_src_execution_operator_csv_scanner_sniffer.cpp +12 -0
  172. package/src/duckdb/ub_src_execution_operator_persistent.cpp +0 -12
  173. package/src/duckdb/ub_src_storage_serialization.cpp +2 -0
  174. package/src/duckdb/src/execution/operator/persistent/buffered_csv_reader.cpp +0 -1487
  175. package/src/duckdb/src/execution/operator/persistent/csv_buffer.cpp +0 -72
  176. package/src/duckdb/src/execution/operator/persistent/csv_file_handle.cpp +0 -158
  177. package/src/duckdb/src/execution/partitionable_hashtable.cpp +0 -207
  178. package/src/duckdb/src/include/duckdb/execution/operator/persistent/buffered_csv_reader.hpp +0 -133
  179. package/src/duckdb/src/include/duckdb/execution/operator/persistent/csv_buffer.hpp +0 -74
  180. package/src/duckdb/src/include/duckdb/execution/partitionable_hashtable.hpp +0 -73
@@ -10,28 +10,22 @@
10
10
 
11
11
  #include "duckdb/common/common.hpp"
12
12
  #include "duckdb/common/types/validity_mask.hpp"
13
- #include "duckdb/planner/expression.hpp"
14
13
  #include "duckdb/execution/operator/aggregate/aggregate_object.hpp"
14
+ #include "duckdb/planner/expression.hpp"
15
15
 
16
16
  namespace duckdb {
17
17
 
18
18
  class RowLayout {
19
19
  public:
20
20
  friend class TupleDataLayout;
21
-
22
- using Aggregates = vector<AggregateObject>;
23
21
  using ValidityBytes = TemplatedValidityMask<uint8_t>;
24
22
 
25
23
  //! Creates an empty RowLayout
26
24
  RowLayout();
27
25
 
28
26
  public:
29
- //! Initializes the RowLayout with the specified types and aggregates to an empty RowLayout
30
- void Initialize(vector<LogicalType> types_p, Aggregates aggregates_p, bool align = true);
31
27
  //! Initializes the RowLayout with the specified types to an empty RowLayout
32
28
  void Initialize(vector<LogicalType> types, bool align = true);
33
- //! Initializes the RowLayout with the specified aggregates to an empty RowLayout
34
- void Initialize(Aggregates aggregates_p, bool align = true);
35
29
  //! Returns the number of data columns
36
30
  inline idx_t ColumnCount() const {
37
31
  return types.size();
@@ -40,14 +34,6 @@ public:
40
34
  inline const vector<LogicalType> &GetTypes() const {
41
35
  return types;
42
36
  }
43
- //! Returns the number of aggregates
44
- inline idx_t AggregateCount() const {
45
- return aggregates.size();
46
- }
47
- //! Returns a list of the aggregates for this data chunk
48
- inline Aggregates &GetAggregates() {
49
- return aggregates;
50
- }
51
37
  //! Returns the total width required for each row, including padding
52
38
  inline idx_t GetRowWidth() const {
53
39
  return row_width;
@@ -64,10 +50,6 @@ public:
64
50
  inline idx_t GetAggrOffset() const {
65
51
  return flag_width + data_width;
66
52
  }
67
- //! Returns the total width required for the aggregates, including padding
68
- inline idx_t GetAggrWidth() const {
69
- return aggr_width;
70
- }
71
53
  //! Returns the column offsets into each row
72
54
  inline const vector<idx_t> &GetOffsets() const {
73
55
  return offsets;
@@ -83,14 +65,10 @@ public:
83
65
  private:
84
66
  //! The types of the data columns
85
67
  vector<LogicalType> types;
86
- //! The aggregate functions
87
- Aggregates aggregates;
88
68
  //! The width of the validity header
89
69
  idx_t flag_width;
90
70
  //! The width of the data portion
91
71
  idx_t data_width;
92
- //! The width of the aggregate state portion
93
- idx_t aggr_width;
94
72
  //! The width of the entire row
95
73
  idx_t row_width;
96
74
  //! The offsets to the columns and aggregate data in each row
@@ -55,6 +55,8 @@ public:
55
55
  TupleDataAllocator(BufferManager &buffer_manager, const TupleDataLayout &layout);
56
56
  TupleDataAllocator(TupleDataAllocator &allocator);
57
57
 
58
+ //! Get the buffer manager
59
+ BufferManager &GetBufferManager();
58
60
  //! Get the buffer allocator
59
61
  Allocator &GetAllocator();
60
62
  //! Get the layout
@@ -83,16 +85,16 @@ public:
83
85
  private:
84
86
  //! Builds out a single part (grabs the lock)
85
87
  TupleDataChunkPart BuildChunkPart(TupleDataPinState &pin_state, TupleDataChunkState &chunk_state,
86
- const idx_t append_offset, const idx_t append_count);
88
+ const idx_t append_offset, const idx_t append_count, TupleDataChunk &chunk);
87
89
  //! Internal function for InitializeChunkState
88
90
  void InitializeChunkStateInternal(TupleDataPinState &pin_state, TupleDataChunkState &chunk_state, idx_t offset,
89
91
  bool recompute, bool init_heap_pointers, bool init_heap_sizes,
90
- vector<TupleDataChunkPart *> &parts);
92
+ unsafe_vector<reference<TupleDataChunkPart>> &parts);
91
93
  //! Internal function for ReleaseOrStoreHandles
92
- static void ReleaseOrStoreHandlesInternal(TupleDataSegment &segment, vector<BufferHandle> &pinned_row_handles,
93
- unordered_map<uint32_t, BufferHandle> &handles,
94
- const unordered_set<uint32_t> &block_ids, vector<TupleDataBlock> &blocks,
95
- TupleDataPinProperties properties);
94
+ static void ReleaseOrStoreHandlesInternal(TupleDataSegment &segment,
95
+ unsafe_vector<BufferHandle> &pinned_row_handles,
96
+ perfect_map_t<BufferHandle> &handles, const perfect_set_t &block_ids,
97
+ unsafe_vector<TupleDataBlock> &blocks, TupleDataPinProperties properties);
96
98
  //! Pins the given row block
97
99
  BufferHandle &PinRowBlock(TupleDataPinState &state, const TupleDataChunkPart &part);
98
100
  //! Pins the given heap block
@@ -108,9 +110,13 @@ private:
108
110
  //! The layout of the data
109
111
  const TupleDataLayout layout;
110
112
  //! Blocks storing the fixed-size rows
111
- vector<TupleDataBlock> row_blocks;
113
+ unsafe_vector<TupleDataBlock> row_blocks;
112
114
  //! Blocks storing the variable-size data of the fixed-size rows (e.g., string, list)
113
- vector<TupleDataBlock> heap_blocks;
115
+ unsafe_vector<TupleDataBlock> heap_blocks;
116
+
117
+ //! Re-usable arrays used while building buffer space
118
+ unsafe_vector<reference<TupleDataChunkPart>> chunk_parts;
119
+ unsafe_vector<pair<idx_t, idx_t>> chunk_part_indices;
114
120
  };
115
121
 
116
122
  } // namespace duckdb
@@ -45,6 +45,7 @@ struct TupleDataGatherFunction {
45
45
  //! FIXME: rename to RowDataCollection after we phase it out
46
46
  class TupleDataCollection {
47
47
  friend class TupleDataChunkIterator;
48
+ friend class PartitionedTupleData;
48
49
 
49
50
  public:
50
51
  //! Constructs a TupleDataCollection with the specified layout
@@ -63,8 +64,6 @@ public:
63
64
  idx_t ChunkCount() const;
64
65
  //! The size (in bytes) of the blocks held by this tuple data collection
65
66
  idx_t SizeInBytes() const;
66
- //! Get pointers to the pinned blocks
67
- void GetBlockPointers(vector<data_ptr_t> &block_pointers) const;
68
67
  //! Unpins all held pins
69
68
  void Unpin();
70
69
 
@@ -186,6 +185,8 @@ private:
186
185
  void Initialize();
187
186
  //! Gets all column ids
188
187
  void GetAllColumnIDs(vector<column_t> &column_ids);
188
+ //! Adds a segment to this TupleDataCollection
189
+ void AddSegment(TupleDataSegment &&segment);
189
190
 
190
191
  //! Computes the heap sizes for the specific Vector that will be appended
191
192
  static void ComputeHeapSizes(Vector &heap_sizes_v, const Vector &source_v, TupleDataVectorFormat &source,
@@ -219,7 +220,7 @@ private:
219
220
  void ScanAtIndex(TupleDataPinState &pin_state, TupleDataChunkState &chunk_state, const vector<column_t> &column_ids,
220
221
  idx_t segment_index, idx_t chunk_index, DataChunk &result);
221
222
 
222
- //! Verify counts of the segments in this collection
223
+ //! Verify count/data size of this collection
223
224
  void Verify() const;
224
225
 
225
226
  private:
@@ -229,6 +230,8 @@ private:
229
230
  shared_ptr<TupleDataAllocator> allocator;
230
231
  //! The number of entries stored in the TupleDataCollection
231
232
  idx_t count;
233
+ //! The size (in bytes) of this TupleDataCollection
234
+ idx_t data_size;
232
235
  //! The data segments of the TupleDataCollection
233
236
  unsafe_vector<TupleDataSegment> segments;
234
237
  //! The set of scatter functions
@@ -83,9 +83,14 @@ public:
83
83
  inline bool AllConstant() const {
84
84
  return all_constant;
85
85
  }
86
+ //! Gets offset to where heap size is stored
86
87
  inline idx_t GetHeapSizeOffset() const {
87
88
  return heap_size_offset;
88
89
  }
90
+ //! Returns whether any of the aggregates have a destructor
91
+ inline bool HasDestructor() const {
92
+ return has_destructor;
93
+ }
89
94
 
90
95
  private:
91
96
  //! The types of the data columns
@@ -108,6 +113,8 @@ private:
108
113
  bool all_constant;
109
114
  //! Offset to the heap size of every row
110
115
  idx_t heap_size_offset;
116
+ //! Whether any of the aggregates have a destructor
117
+ bool has_destructor;
111
118
  };
112
119
 
113
120
  } // namespace duckdb
@@ -10,6 +10,7 @@
10
10
 
11
11
  #include "duckdb/common/common.hpp"
12
12
  #include "duckdb/common/mutex.hpp"
13
+ #include "duckdb/common/perfect_map_set.hpp"
13
14
  #include "duckdb/common/unordered_set.hpp"
14
15
  #include "duckdb/common/vector.hpp"
15
16
  #include "duckdb/storage/buffer_manager.hpp"
@@ -21,7 +22,7 @@ class TupleDataLayout;
21
22
 
22
23
  struct TupleDataChunkPart {
23
24
  public:
24
- TupleDataChunkPart();
25
+ TupleDataChunkPart(mutex &lock);
25
26
 
26
27
  //! Disable copy constructors
27
28
  TupleDataChunkPart(const TupleDataChunkPart &other) = delete;
@@ -45,8 +46,8 @@ public:
45
46
  uint32_t total_heap_size;
46
47
  //! Tuple count for this chunk part
47
48
  uint32_t count;
48
- //! Lock for recomputing heap pointers
49
- mutex lock;
49
+ //! Lock for recomputing heap pointers (owned by TupleDataChunk)
50
+ reference<mutex> lock;
50
51
  };
51
52
 
52
53
  struct TupleDataChunk {
@@ -70,13 +71,15 @@ public:
70
71
 
71
72
  public:
72
73
  //! The parts of this chunk
73
- vector<TupleDataChunkPart> parts;
74
+ unsafe_vector<TupleDataChunkPart> parts;
74
75
  //! The row block ids referenced by the chunk
75
- unordered_set<uint32_t> row_block_ids;
76
+ perfect_set_t row_block_ids;
76
77
  //! The heap block ids referenced by the chunk
77
- unordered_set<uint32_t> heap_block_ids;
78
+ perfect_set_t heap_block_ids;
78
79
  //! Tuple count for this chunk
79
80
  idx_t count;
81
+ //! Lock for recomputing heap pointers
82
+ unsafe_unique_ptr<mutex> lock;
80
83
  };
81
84
 
82
85
  struct TupleDataSegment {
@@ -112,13 +115,15 @@ public:
112
115
  unsafe_vector<TupleDataChunk> chunks;
113
116
  //! The tuple count of this segment
114
117
  idx_t count;
118
+ //! The data size of this segment
119
+ idx_t data_size;
115
120
 
116
121
  //! Lock for modifying pinned_handles
117
122
  mutex pinned_handles_lock;
118
123
  //! Where handles to row blocks will be stored with TupleDataPinProperties::KEEP_EVERYTHING_PINNED
119
- vector<BufferHandle> pinned_row_handles;
124
+ unsafe_vector<BufferHandle> pinned_row_handles;
120
125
  //! Where handles to heap blocks will be stored with TupleDataPinProperties::KEEP_EVERYTHING_PINNED
121
- vector<BufferHandle> pinned_heap_handles;
126
+ unsafe_vector<BufferHandle> pinned_heap_handles;
122
127
  };
123
128
 
124
129
  } // namespace duckdb
@@ -9,6 +9,7 @@
9
9
  #pragma once
10
10
 
11
11
  #include "duckdb/common/mutex.hpp"
12
+ #include "duckdb/common/perfect_map_set.hpp"
12
13
  #include "duckdb/common/types.hpp"
13
14
 
14
15
  namespace duckdb {
@@ -26,8 +27,8 @@ enum class TupleDataPinProperties : uint8_t {
26
27
  };
27
28
 
28
29
  struct TupleDataPinState {
29
- unordered_map<uint32_t, BufferHandle> row_handles;
30
- unordered_map<uint32_t, BufferHandle> heap_handles;
30
+ perfect_map_t<BufferHandle> row_handles;
31
+ perfect_map_t<BufferHandle> heap_handles;
31
32
  TupleDataPinProperties properties = TupleDataPinProperties::INVALID;
32
33
  };
33
34
 
@@ -101,14 +101,14 @@ public:
101
101
  DUCKDB_API void Reference(const Value &value);
102
102
  //! Causes this vector to reference the data held by the other vector.
103
103
  //! The type of the "other" vector should match the type of this vector
104
- DUCKDB_API void Reference(Vector &other);
104
+ DUCKDB_API void Reference(const Vector &other);
105
105
  //! Reinterpret the data of the other vector as the type of this vector
106
106
  //! Note that this takes the data of the other vector as-is and places it in this vector
107
107
  //! Without changing the type of this vector
108
- DUCKDB_API void Reinterpret(Vector &other);
108
+ DUCKDB_API void Reinterpret(const Vector &other);
109
109
 
110
110
  //! Causes this vector to reference the data held by the other vector, changes the type if required.
111
- DUCKDB_API void ReferenceAndSetType(Vector &other);
111
+ DUCKDB_API void ReferenceAndSetType(const Vector &other);
112
112
 
113
113
  //! Resets a vector from a vector cache.
114
114
  //! This turns the vector back into an empty FlatVector with STANDARD_VECTOR_SIZE entries.
@@ -88,14 +88,14 @@ public:
88
88
  }
89
89
 
90
90
  typename original::reference back() {
91
- if (original::empty()) {
91
+ if (MemorySafety<SAFE>::enabled && original::empty()) {
92
92
  throw InternalException("'back' called on an empty vector!");
93
93
  }
94
94
  return get<SAFE>(original::size() - 1);
95
95
  }
96
96
 
97
97
  typename original::const_reference back() const {
98
- if (original::empty()) {
98
+ if (MemorySafety<SAFE>::enabled && original::empty()) {
99
99
  throw InternalException("'back' called on an empty vector!");
100
100
  }
101
101
  return get<SAFE>(original::size() - 1);
@@ -8,12 +8,13 @@
8
8
 
9
9
  #pragma once
10
10
 
11
- #include "duckdb/common/types/row/tuple_data_collection.hpp"
11
+ #include "duckdb/common/types/row/partitioned_tuple_data.hpp"
12
12
  #include "duckdb/execution/base_aggregate_hashtable.hpp"
13
13
  #include "duckdb/storage/arena_allocator.hpp"
14
14
  #include "duckdb/storage/buffer/buffer_handle.hpp"
15
15
 
16
16
  namespace duckdb {
17
+
17
18
  class BlockHandle;
18
19
  class BufferHandle;
19
20
 
@@ -27,91 +28,87 @@ struct FlushMoveState;
27
28
  stores them in the HT. It uses linear probing for collision resolution.
28
29
  */
29
30
 
30
- // two part hash table
31
- // hashes and payload
32
- // hashes layout:
33
- // [SALT][PAGE_NR][PAGE_OFFSET]
34
- // [SALT] are the high bits of the hash value, e.g. 16 for 64 bit hashes
35
- // [PAGE_NR] is the buffer managed payload page index
36
- // [PAGE_OFFSET] is the logical entry offset into said payload page
37
-
38
- // NOTE: PAGE_NR and PAGE_OFFSET are reversed for 64 bit HTs because struct packing
39
-
40
- // payload layout
41
- // [VALIDITY][GROUPS][HASH][PADDING][PAYLOAD]
42
- // [VALIDITY] is the validity bits of the data columns (including the HASH)
43
- // [GROUPS] is the group data, could be multiple values, fixed size, strings are elsewhere
44
- // [HASH] is the hash data of the groups
45
- // [PADDING] is gunk data to align payload properly
46
- // [PAYLOAD] is the payload (i.e. the aggregate states)
47
- struct aggr_ht_entry_64 {
48
- uint16_t salt;
49
- uint16_t page_offset;
50
- uint32_t page_nr; // this has to come last because alignment
51
- };
31
+ struct aggr_ht_entry_t {
32
+ public:
33
+ explicit aggr_ht_entry_t(hash_t value_p) : value(value_p) {
34
+ }
52
35
 
53
- struct aggr_ht_entry_32 {
54
- uint8_t salt;
55
- uint8_t page_nr;
56
- uint16_t page_offset;
57
- };
36
+ inline bool IsOccupied() const {
37
+ return value != 0;
38
+ }
39
+
40
+ inline data_ptr_t GetPointer() const {
41
+ D_ASSERT(IsOccupied());
42
+ return reinterpret_cast<data_ptr_t>(value & POINTER_MASK);
43
+ }
44
+ inline void SetPointer(const data_ptr_t &pointer) {
45
+ // Pointer shouldn't use upper bits
46
+ D_ASSERT((reinterpret_cast<uint64_t>(pointer) & SALT_MASK) == 0);
47
+ // Value should have all 1's in the pointer area
48
+ D_ASSERT((value & POINTER_MASK) == POINTER_MASK);
49
+ // Set upper bits to 1 in pointer so the salt stays intact
50
+ value &= reinterpret_cast<uint64_t>(pointer) | SALT_MASK;
51
+ }
58
52
 
59
- enum HtEntryType { HT_WIDTH_32, HT_WIDTH_64 };
53
+ static inline hash_t ExtractSalt(const hash_t &hash) {
54
+ // Leaves upper bits intact, sets lower bits to all 1's
55
+ return hash | POINTER_MASK;
56
+ }
57
+ inline hash_t GetSalt() const {
58
+ return ExtractSalt(value);
59
+ }
60
+ inline void SetSalt(const hash_t &salt) {
61
+ // Shouldn't be occupied when we set this
62
+ D_ASSERT(!IsOccupied());
63
+ // Salt should have all 1's in the pointer field
64
+ D_ASSERT((salt & POINTER_MASK) == POINTER_MASK);
65
+ // No need to mask, just put the whole thing there
66
+ value = salt;
67
+ }
60
68
 
61
- struct AggregateHTScanState {
62
- mutex lock;
63
- TupleDataScanState scan_state;
64
- };
69
+ private:
70
+ //! Upper 16 bits are salt
71
+ static constexpr const hash_t SALT_MASK = 0xFFFF000000000000;
72
+ //! Lower 48 bits are the pointer
73
+ static constexpr const hash_t POINTER_MASK = 0x0000FFFFFFFFFFFF;
65
74
 
66
- struct AggregateHTAppendState {
67
- AggregateHTAppendState();
68
-
69
- Vector ht_offsets;
70
- Vector hash_salts;
71
- SelectionVector group_compare_vector;
72
- SelectionVector no_match_vector;
73
- SelectionVector empty_vector;
74
- SelectionVector new_groups;
75
- Vector addresses;
76
- unsafe_unique_array<UnifiedVectorFormat> group_data;
77
- DataChunk group_chunk;
78
-
79
- TupleDataChunkState chunk_state;
80
- bool chunk_state_initialized;
75
+ hash_t value;
81
76
  };
82
77
 
83
78
  class GroupedAggregateHashTable : public BaseAggregateHashTable {
84
- public:
85
- //! The hash table load factor, when a resize is triggered
86
- constexpr static float LOAD_FACTOR = 1.5;
87
- constexpr static uint8_t HASH_WIDTH = sizeof(hash_t);
88
-
89
79
  public:
90
80
  GroupedAggregateHashTable(ClientContext &context, Allocator &allocator, vector<LogicalType> group_types,
91
81
  vector<LogicalType> payload_types, const vector<BoundAggregateExpression *> &aggregates,
92
- HtEntryType entry_type = HtEntryType::HT_WIDTH_64,
93
- idx_t initial_capacity = InitialCapacity());
82
+ idx_t initial_capacity = InitialCapacity(), idx_t radix_bits = 0);
94
83
  GroupedAggregateHashTable(ClientContext &context, Allocator &allocator, vector<LogicalType> group_types,
95
84
  vector<LogicalType> payload_types, vector<AggregateObject> aggregates,
96
- HtEntryType entry_type = HtEntryType::HT_WIDTH_64,
97
- idx_t initial_capacity = InitialCapacity());
85
+ idx_t initial_capacity = InitialCapacity(), idx_t radix_bits = 0);
98
86
  GroupedAggregateHashTable(ClientContext &context, Allocator &allocator, vector<LogicalType> group_types);
99
87
  ~GroupedAggregateHashTable() override;
100
88
 
101
89
  public:
90
+ //! The hash table load factor, when a resize is triggered
91
+ constexpr static float LOAD_FACTOR = 1.5;
92
+
93
+ //! Get the layout of this HT
94
+ const TupleDataLayout &GetLayout() const;
95
+ //! Number of groups in the HT
96
+ idx_t Count() const;
97
+ //! Initial capacity of the HT
98
+ static idx_t InitialCapacity();
99
+ //! Capacity that can hold 'count' entries without resizing
100
+ static idx_t GetCapacityForCount(idx_t count);
101
+ //! Current capacity of the HT
102
+ idx_t Capacity() const;
103
+ //! Threshold at which to resize the HT
104
+ idx_t ResizeThreshold() const;
105
+
102
106
  //! Add the given data to the HT, computing the aggregates grouped by the
103
107
  //! data in the group chunk. When resize = true, aggregates will not be
104
108
  //! computed but instead just assigned.
105
- idx_t AddChunk(AggregateHTAppendState &state, DataChunk &groups, DataChunk &payload,
106
- const unsafe_vector<idx_t> &filter);
107
- idx_t AddChunk(AggregateHTAppendState &state, DataChunk &groups, Vector &group_hashes, DataChunk &payload,
108
- const unsafe_vector<idx_t> &filter);
109
- idx_t AddChunk(AggregateHTAppendState &state, DataChunk &groups, DataChunk &payload, AggregateType filter);
110
-
111
- //! Scan the HT starting from the scan_position until the result and group
112
- //! chunks are filled. scan_position will be updated by this function.
113
- //! Returns the amount of elements found.
114
- idx_t Scan(TupleDataParallelScanState &gstate, TupleDataLocalScanState &lstate, DataChunk &result);
109
+ idx_t AddChunk(DataChunk &groups, DataChunk &payload, const unsafe_vector<idx_t> &filter);
110
+ idx_t AddChunk(DataChunk &groups, Vector &group_hashes, DataChunk &payload, const unsafe_vector<idx_t> &filter);
111
+ idx_t AddChunk(DataChunk &groups, DataChunk &payload, AggregateType filter);
115
112
 
116
113
  //! Fetch the aggregates for specific groups from the HT and place them in the result
117
114
  void FetchAggregates(DataChunk &groups, DataChunk &result);
@@ -119,108 +116,90 @@ public:
119
116
  //! Finds or creates groups in the hashtable using the specified group keys. The addresses vector will be filled
120
117
  //! with pointers to the groups in the hash table, and the new_groups selection vector will point to the newly
121
118
  //! created groups. The return value is the amount of newly created groups.
122
- idx_t FindOrCreateGroups(AggregateHTAppendState &state, DataChunk &groups, Vector &group_hashes,
123
- Vector &addresses_out, SelectionVector &new_groups_out);
124
- idx_t FindOrCreateGroups(AggregateHTAppendState &state, DataChunk &groups, Vector &addresses_out,
119
+ idx_t FindOrCreateGroups(DataChunk &groups, Vector &group_hashes, Vector &addresses_out,
125
120
  SelectionVector &new_groups_out);
126
- void FindOrCreateGroups(AggregateHTAppendState &state, DataChunk &groups, Vector &addresses_out);
127
-
128
- //! Executes the filter(if any) and update the aggregates
129
- void Combine(GroupedAggregateHashTable &other);
130
-
131
- //! Appends the data in the other HT to this one
132
- void Append(GroupedAggregateHashTable &other);
133
-
134
- TupleDataCollection &GetDataCollection() {
135
- return *data_collection;
136
- }
137
-
138
- idx_t Count() const {
139
- return data_collection->Count();
140
- }
141
-
142
- idx_t DataSize() const {
143
- return data_collection->SizeInBytes();
144
- }
145
-
146
- static idx_t InitialCapacity();
147
- idx_t Capacity() {
148
- return capacity;
149
- }
150
-
151
- static idx_t FirstPartSize(idx_t count, HtEntryType entry_type) {
152
- idx_t entry_size = entry_type == HT_WIDTH_32 ? sizeof(aggr_ht_entry_32) : sizeof(aggr_ht_entry_64);
153
- return NextPowerOfTwo(count * 2L) * entry_size;
154
- }
121
+ idx_t FindOrCreateGroups(DataChunk &groups, Vector &addresses_out, SelectionVector &new_groups_out);
122
+ void FindOrCreateGroups(DataChunk &groups, Vector &addresses_out);
155
123
 
156
- idx_t TotalSize() const {
157
- return DataSize() + FirstPartSize(Count(), entry_type);
158
- }
124
+ unique_ptr<PartitionedTupleData> &GetPartitionedData();
125
+ shared_ptr<ArenaAllocator> GetAggregateAllocator();
159
126
 
160
- idx_t ResizeThreshold();
161
- idx_t MaxCapacity();
162
- static idx_t GetMaxCapacity(HtEntryType entry_type, idx_t tuple_size);
127
+ //! Resize the HT to the specified size. Must be larger than the current size.
128
+ void Resize(idx_t size);
129
+ //! Resets the pointer table of the HT to all 0's
130
+ void ClearPointerTable();
131
+ //! Resets the group count to 0
132
+ void ResetCount();
133
+ //! Set the radix bits for this HT
134
+ void SetRadixBits(idx_t radix_bits);
135
+ //! Initializes the PartitionedTupleData
136
+ void InitializePartitionedData();
163
137
 
164
- void Partition(vector<GroupedAggregateHashTable *> &partition_hts, idx_t radix_bits, bool sink_done);
165
- void InitializeFirstPart();
138
+ //! Executes the filter(if any) and update the aggregates
139
+ void Combine(GroupedAggregateHashTable &other);
140
+ void Combine(TupleDataCollection &other_data);
166
141
 
167
- void Finalize();
142
+ //! Unpins the data blocks
143
+ void UnpinData();
168
144
 
169
145
  private:
170
- HtEntryType entry_type;
171
-
172
- //! The capacity of the HT. This can be increased using GroupedAggregateHashTable::Resize
173
- idx_t capacity;
174
- //! Tuple width
175
- idx_t tuple_size;
176
- //! Tuples per block
177
- idx_t tuples_per_block;
146
+ //! Append state
147
+ struct AggregateHTAppendState {
148
+ AggregateHTAppendState();
149
+
150
+ PartitionedTupleDataAppendState append_state;
151
+
152
+ Vector ht_offsets;
153
+ Vector hash_salts;
154
+ SelectionVector group_compare_vector;
155
+ SelectionVector no_match_vector;
156
+ SelectionVector empty_vector;
157
+ SelectionVector new_groups;
158
+ Vector addresses;
159
+ unsafe_unique_array<UnifiedVectorFormat> group_data;
160
+ DataChunk group_chunk;
161
+ } state;
162
+
163
+ //! The number of radix bits to partition by
164
+ idx_t radix_bits;
178
165
  //! The data of the HT
179
- unique_ptr<TupleDataCollection> data_collection;
180
- TupleDataPinState td_pin_state;
181
- vector<data_ptr_t> payload_hds_ptrs;
166
+ unique_ptr<PartitionedTupleData> partitioned_data;
182
167
 
183
- //! The hashes of the HT
184
- AllocatedData hashes_hdl;
185
- data_ptr_t hashes_hdl_ptr;
186
- idx_t hash_offset; // Offset into the layout of the hash column
187
-
188
- hash_t hash_prefix_shift;
168
+ //! Predicates for matching groups (always ExpressionType::COMPARE_EQUAL)
169
+ vector<ExpressionType> predicates;
189
170
 
171
+ //! The number of groups in the HT
172
+ idx_t count;
173
+ //! The capacity of the HT. This can be increased using GroupedAggregateHashTable::Resize
174
+ idx_t capacity;
175
+ //! The hash map (pointer table) of the HT: allocated data and pointer into it
176
+ AllocatedData hash_map;
177
+ aggr_ht_entry_t *entries;
178
+ //! Offset of the hash column in the rows
179
+ idx_t hash_offset;
190
180
  //! Bitmask for getting relevant bits from the hashes to determine the position
191
181
  hash_t bitmask;
192
182
 
193
- bool is_finalized;
194
-
195
- vector<ExpressionType> predicates;
196
-
197
183
  //! The active arena allocator used by the aggregates for their internal state
198
184
  shared_ptr<ArenaAllocator> aggregate_allocator;
199
185
  //! Owning arena allocators that this HT has data from
200
186
  vector<shared_ptr<ArenaAllocator>> stored_allocators;
201
187
 
202
188
  private:
189
+ //! Disabled the copy constructor
203
190
  GroupedAggregateHashTable(const GroupedAggregateHashTable &) = delete;
204
-
191
+ //! Destroy the HT
205
192
  void Destroy();
206
- void Verify();
207
- template <class ENTRY>
208
- void VerifyInternal();
209
- //! Resize the HT to the specified size. Must be larger than the current size.
210
- template <class ENTRY>
211
- void Resize(idx_t size);
212
- //! Initializes the first part of the HT
213
- template <class ENTRY>
214
- void InitializeHashes();
193
+
194
+ //! Apply bitmask to get the entry in the HT
195
+ inline idx_t ApplyBitMask(hash_t hash) const;
196
+
215
197
  //! Does the actual group matching / creation
216
- template <class ENTRY>
217
- idx_t FindOrCreateGroupsInternal(DataChunk &groups, Vector &group_hashes_v, Vector &addresses_v,
198
+ idx_t FindOrCreateGroupsInternal(DataChunk &groups, Vector &group_hashes, Vector &addresses,
218
199
  SelectionVector &new_groups);
219
- //! Updates payload_hds_ptrs with the new pointers (after appending to data_collection)
220
- void UpdateBlockPointers();
221
- template <class ENTRY>
222
- idx_t FindOrCreateGroupsInternal(AggregateHTAppendState &state, DataChunk &groups, Vector &group_hashes,
223
- Vector &addresses, SelectionVector &new_groups);
200
+
201
+ //! Verify the pointer table of the HT
202
+ void Verify();
224
203
  };
225
204
 
226
205
  } // namespace duckdb
@@ -8,17 +8,18 @@
8
8
 
9
9
  #pragma once
10
10
 
11
+ #include "duckdb/execution/operator/aggregate/distinct_aggregate_data.hpp"
12
+ #include "duckdb/execution/operator/aggregate/grouped_aggregate_data.hpp"
11
13
  #include "duckdb/execution/physical_operator.hpp"
12
- #include "duckdb/storage/data_table.hpp"
13
- #include "duckdb/parser/group_by_node.hpp"
14
14
  #include "duckdb/execution/radix_partitioned_hashtable.hpp"
15
- #include "duckdb/execution/operator/aggregate/grouped_aggregate_data.hpp"
16
- #include "duckdb/execution/operator/aggregate/distinct_aggregate_data.hpp"
15
+ #include "duckdb/parser/group_by_node.hpp"
16
+ #include "duckdb/storage/data_table.hpp"
17
17
 
18
18
  namespace duckdb {
19
19
 
20
20
  class ClientContext;
21
21
  class BufferManager;
22
+ class PhysicalHashAggregate;
22
23
 
23
24
  struct HashAggregateGroupingData {
24
25
  public: