duckdb 0.8.2-dev3458.0 → 0.8.2-dev3949.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (180):
  1. package/binding.gyp +2 -0
  2. package/package.json +1 -1
  3. package/src/duckdb/extension/icu/icu_extension.cpp +5 -5
  4. package/src/duckdb/extension/json/include/json_deserializer.hpp +7 -16
  5. package/src/duckdb/extension/json/include/json_serializer.hpp +9 -15
  6. package/src/duckdb/extension/json/json_deserializer.cpp +29 -67
  7. package/src/duckdb/extension/json/json_scan.cpp +1 -1
  8. package/src/duckdb/extension/json/json_serializer.cpp +26 -69
  9. package/src/duckdb/src/common/enum_util.cpp +119 -7
  10. package/src/duckdb/src/common/extra_type_info.cpp +7 -3
  11. package/src/duckdb/src/common/radix_partitioning.cpp +8 -31
  12. package/src/duckdb/src/common/row_operations/row_aggregate.cpp +18 -3
  13. package/src/duckdb/src/common/serializer/binary_deserializer.cpp +62 -77
  14. package/src/duckdb/src/common/serializer/binary_serializer.cpp +84 -84
  15. package/src/duckdb/src/common/serializer/format_serializer.cpp +1 -1
  16. package/src/duckdb/src/common/sort/partition_state.cpp +41 -33
  17. package/src/duckdb/src/common/types/data_chunk.cpp +44 -8
  18. package/src/duckdb/src/common/types/hyperloglog.cpp +21 -0
  19. package/src/duckdb/src/common/types/interval.cpp +3 -0
  20. package/src/duckdb/src/common/types/row/partitioned_tuple_data.cpp +252 -126
  21. package/src/duckdb/src/common/types/row/row_layout.cpp +3 -31
  22. package/src/duckdb/src/common/types/row/tuple_data_allocator.cpp +40 -32
  23. package/src/duckdb/src/common/types/row/tuple_data_collection.cpp +39 -26
  24. package/src/duckdb/src/common/types/row/tuple_data_layout.cpp +11 -1
  25. package/src/duckdb/src/common/types/row/tuple_data_segment.cpp +21 -16
  26. package/src/duckdb/src/common/types/value.cpp +63 -42
  27. package/src/duckdb/src/common/types/vector.cpp +33 -67
  28. package/src/duckdb/src/core_functions/scalar/list/list_lambdas.cpp +3 -2
  29. package/src/duckdb/src/execution/aggregate_hashtable.cpp +222 -364
  30. package/src/duckdb/src/execution/join_hashtable.cpp +5 -6
  31. package/src/duckdb/src/execution/operator/aggregate/physical_hash_aggregate.cpp +240 -310
  32. package/src/duckdb/src/execution/operator/aggregate/physical_ungrouped_aggregate.cpp +202 -173
  33. package/src/duckdb/src/execution/operator/aggregate/physical_window.cpp +36 -2
  34. package/src/duckdb/src/execution/operator/{persistent → csv_scanner}/base_csv_reader.cpp +58 -162
  35. package/src/duckdb/src/execution/operator/csv_scanner/buffered_csv_reader.cpp +434 -0
  36. package/src/duckdb/src/execution/operator/csv_scanner/csv_buffer.cpp +80 -0
  37. package/src/duckdb/src/execution/operator/csv_scanner/csv_buffer_manager.cpp +90 -0
  38. package/src/duckdb/src/execution/operator/csv_scanner/csv_file_handle.cpp +95 -0
  39. package/src/duckdb/src/execution/operator/{persistent → csv_scanner}/csv_reader_options.cpp +47 -28
  40. package/src/duckdb/src/execution/operator/csv_scanner/csv_state_machine.cpp +35 -0
  41. package/src/duckdb/src/execution/operator/csv_scanner/csv_state_machine_cache.cpp +107 -0
  42. package/src/duckdb/src/execution/operator/{persistent → csv_scanner}/parallel_csv_reader.cpp +44 -44
  43. package/src/duckdb/src/execution/operator/csv_scanner/sniffer/csv_sniffer.cpp +52 -0
  44. package/src/duckdb/src/execution/operator/csv_scanner/sniffer/dialect_detection.cpp +336 -0
  45. package/src/duckdb/src/execution/operator/csv_scanner/sniffer/header_detection.cpp +165 -0
  46. package/src/duckdb/src/execution/operator/csv_scanner/sniffer/type_detection.cpp +398 -0
  47. package/src/duckdb/src/execution/operator/csv_scanner/sniffer/type_refinement.cpp +175 -0
  48. package/src/duckdb/src/execution/operator/csv_scanner/sniffer/type_replacement.cpp +39 -0
  49. package/src/duckdb/src/execution/operator/join/physical_asof_join.cpp +1 -1
  50. package/src/duckdb/src/execution/operator/set/physical_recursive_cte.cpp +1 -2
  51. package/src/duckdb/src/execution/radix_partitioned_hashtable.cpp +614 -574
  52. package/src/duckdb/src/execution/window_executor.cpp +6 -5
  53. package/src/duckdb/src/function/cast/cast_function_set.cpp +1 -0
  54. package/src/duckdb/src/function/scalar/strftime_format.cpp +4 -4
  55. package/src/duckdb/src/function/table/copy_csv.cpp +94 -96
  56. package/src/duckdb/src/function/table/read_csv.cpp +150 -136
  57. package/src/duckdb/src/function/table/table_scan.cpp +0 -2
  58. package/src/duckdb/src/function/table/version/pragma_version.cpp +2 -2
  59. package/src/duckdb/src/include/duckdb/common/enum_util.hpp +24 -0
  60. package/src/duckdb/src/include/duckdb/common/file_opener.hpp +9 -0
  61. package/src/duckdb/src/include/duckdb/common/fixed_size_map.hpp +208 -0
  62. package/src/duckdb/src/include/duckdb/common/optional_idx.hpp +3 -0
  63. package/src/duckdb/src/include/duckdb/common/perfect_map_set.hpp +2 -1
  64. package/src/duckdb/src/include/duckdb/common/printer.hpp +11 -0
  65. package/src/duckdb/src/include/duckdb/common/serializer/binary_deserializer.hpp +43 -30
  66. package/src/duckdb/src/include/duckdb/common/serializer/binary_serializer.hpp +36 -35
  67. package/src/duckdb/src/include/duckdb/common/serializer/deserialization_data.hpp +18 -0
  68. package/src/duckdb/src/include/duckdb/common/serializer/encoding_util.hpp +132 -0
  69. package/src/duckdb/src/include/duckdb/common/serializer/format_deserializer.hpp +125 -150
  70. package/src/duckdb/src/include/duckdb/common/serializer/format_serializer.hpp +119 -107
  71. package/src/duckdb/src/include/duckdb/common/serializer/serialization_traits.hpp +2 -1
  72. package/src/duckdb/src/include/duckdb/common/shared_ptr.hpp +8 -0
  73. package/src/duckdb/src/include/duckdb/common/sort/partition_state.hpp +13 -7
  74. package/src/duckdb/src/include/duckdb/common/types/data_chunk.hpp +5 -0
  75. package/src/duckdb/src/include/duckdb/common/types/hyperloglog.hpp +7 -1
  76. package/src/duckdb/src/include/duckdb/common/types/interval.hpp +7 -0
  77. package/src/duckdb/src/include/duckdb/common/types/row/partitioned_tuple_data.hpp +41 -9
  78. package/src/duckdb/src/include/duckdb/common/types/row/row_data_collection_scanner.hpp +5 -0
  79. package/src/duckdb/src/include/duckdb/common/types/row/row_layout.hpp +1 -23
  80. package/src/duckdb/src/include/duckdb/common/types/row/tuple_data_allocator.hpp +14 -8
  81. package/src/duckdb/src/include/duckdb/common/types/row/tuple_data_collection.hpp +6 -3
  82. package/src/duckdb/src/include/duckdb/common/types/row/tuple_data_layout.hpp +7 -0
  83. package/src/duckdb/src/include/duckdb/common/types/row/tuple_data_segment.hpp +13 -8
  84. package/src/duckdb/src/include/duckdb/common/types/row/tuple_data_states.hpp +3 -2
  85. package/src/duckdb/src/include/duckdb/common/types/vector.hpp +3 -3
  86. package/src/duckdb/src/include/duckdb/common/vector.hpp +2 -2
  87. package/src/duckdb/src/include/duckdb/execution/aggregate_hashtable.hpp +125 -146
  88. package/src/duckdb/src/include/duckdb/execution/operator/aggregate/physical_hash_aggregate.hpp +5 -4
  89. package/src/duckdb/src/include/duckdb/execution/operator/aggregate/physical_window.hpp +4 -3
  90. package/src/duckdb/src/include/duckdb/execution/operator/{persistent → scan/csv}/base_csv_reader.hpp +17 -17
  91. package/src/duckdb/src/include/duckdb/execution/operator/scan/csv/buffered_csv_reader.hpp +72 -0
  92. package/src/duckdb/src/include/duckdb/execution/operator/scan/csv/csv_buffer.hpp +110 -0
  93. package/src/duckdb/src/include/duckdb/execution/operator/scan/csv/csv_buffer_manager.hpp +103 -0
  94. package/src/duckdb/src/include/duckdb/execution/operator/{persistent → scan/csv}/csv_file_handle.hpp +8 -15
  95. package/src/duckdb/src/include/duckdb/execution/operator/{persistent → scan/csv}/csv_line_info.hpp +1 -1
  96. package/src/duckdb/src/include/duckdb/execution/operator/{persistent → scan/csv}/csv_reader_options.hpp +52 -28
  97. package/src/duckdb/src/include/duckdb/execution/operator/scan/csv/csv_sniffer.hpp +127 -0
  98. package/src/duckdb/src/include/duckdb/execution/operator/scan/csv/csv_state_machine.hpp +75 -0
  99. package/src/duckdb/src/include/duckdb/execution/operator/scan/csv/csv_state_machine_cache.hpp +51 -0
  100. package/src/duckdb/src/include/duckdb/execution/operator/{persistent → scan/csv}/parallel_csv_reader.hpp +21 -27
  101. package/src/duckdb/src/include/duckdb/execution/operator/scan/csv/quote_rules.hpp +21 -0
  102. package/src/duckdb/src/include/duckdb/execution/radix_partitioned_hashtable.hpp +18 -27
  103. package/src/duckdb/src/include/duckdb/function/function_serialization.hpp +5 -6
  104. package/src/duckdb/src/include/duckdb/function/scalar/strftime_format.hpp +4 -4
  105. package/src/duckdb/src/include/duckdb/function/table/read_csv.hpp +17 -12
  106. package/src/duckdb/src/include/duckdb/main/client_context_file_opener.hpp +1 -0
  107. package/src/duckdb/src/include/duckdb/main/client_data.hpp +2 -1
  108. package/src/duckdb/src/include/duckdb/main/config.hpp +1 -0
  109. package/src/duckdb/src/include/duckdb/main/connection.hpp +2 -2
  110. package/src/duckdb/src/include/duckdb/main/relation/read_csv_relation.hpp +6 -6
  111. package/src/duckdb/src/include/duckdb/parallel/event.hpp +12 -1
  112. package/src/duckdb/src/include/duckdb/storage/block.hpp +6 -0
  113. package/src/duckdb/src/include/duckdb/storage/buffer/block_handle.hpp +3 -0
  114. package/src/duckdb/src/include/duckdb/storage/statistics/base_statistics.hpp +7 -3
  115. package/src/duckdb/src/include/duckdb/storage/statistics/column_statistics.hpp +4 -0
  116. package/src/duckdb/src/include/duckdb/storage/statistics/distinct_statistics.hpp +5 -0
  117. package/src/duckdb/src/include/duckdb/storage/statistics/list_stats.hpp +3 -0
  118. package/src/duckdb/src/include/duckdb/storage/statistics/numeric_stats.hpp +3 -0
  119. package/src/duckdb/src/include/duckdb/storage/statistics/string_stats.hpp +3 -0
  120. package/src/duckdb/src/include/duckdb/storage/statistics/struct_stats.hpp +3 -0
  121. package/src/duckdb/src/include/duckdb/storage/table/chunk_info.hpp +15 -3
  122. package/src/duckdb/src/include/duckdb/storage/table/row_group.hpp +4 -0
  123. package/src/duckdb/src/include/duckdb/storage/table/table_statistics.hpp +5 -0
  124. package/src/duckdb/src/include/duckdb/verification/deserialized_statement_verifier_v2.hpp +6 -0
  125. package/src/duckdb/src/include/duckdb/verification/statement_verifier.hpp +1 -0
  126. package/src/duckdb/src/include/duckdb.h +12 -0
  127. package/src/duckdb/src/main/capi/logical_types-c.cpp +22 -0
  128. package/src/duckdb/src/main/client_context_file_opener.cpp +17 -0
  129. package/src/duckdb/src/main/client_verify.cpp +1 -0
  130. package/src/duckdb/src/main/config.cpp +2 -2
  131. package/src/duckdb/src/main/connection.cpp +3 -3
  132. package/src/duckdb/src/main/relation/read_csv_relation.cpp +19 -13
  133. package/src/duckdb/src/parallel/pipeline_finish_event.cpp +1 -1
  134. package/src/duckdb/src/parser/tableref/pivotref.cpp +0 -16
  135. package/src/duckdb/src/planner/binder/statement/bind_copy.cpp +1 -1
  136. package/src/duckdb/src/planner/binder/statement/bind_export.cpp +41 -25
  137. package/src/duckdb/src/planner/expression/bound_aggregate_expression.cpp +4 -4
  138. package/src/duckdb/src/planner/expression/bound_window_expression.cpp +10 -10
  139. package/src/duckdb/src/planner/logical_operator.cpp +1 -1
  140. package/src/duckdb/src/planner/planner.cpp +1 -1
  141. package/src/duckdb/src/storage/checkpoint_manager.cpp +4 -3
  142. package/src/duckdb/src/storage/serialization/serialize_constraint.cpp +1 -1
  143. package/src/duckdb/src/storage/serialization/serialize_create_info.cpp +5 -5
  144. package/src/duckdb/src/storage/serialization/serialize_expression.cpp +10 -10
  145. package/src/duckdb/src/storage/serialization/serialize_logical_operator.cpp +20 -20
  146. package/src/duckdb/src/storage/serialization/serialize_macro_function.cpp +2 -2
  147. package/src/duckdb/src/storage/serialization/serialize_nodes.cpp +118 -89
  148. package/src/duckdb/src/storage/serialization/serialize_parse_info.cpp +3 -3
  149. package/src/duckdb/src/storage/serialization/serialize_parsed_expression.cpp +27 -27
  150. package/src/duckdb/src/storage/serialization/serialize_query_node.cpp +16 -16
  151. package/src/duckdb/src/storage/serialization/serialize_result_modifier.cpp +8 -8
  152. package/src/duckdb/src/storage/serialization/serialize_statement.cpp +1 -1
  153. package/src/duckdb/src/storage/serialization/serialize_storage.cpp +39 -0
  154. package/src/duckdb/src/storage/serialization/serialize_tableref.cpp +9 -9
  155. package/src/duckdb/src/storage/statistics/base_statistics.cpp +67 -4
  156. package/src/duckdb/src/storage/statistics/column_statistics.cpp +16 -0
  157. package/src/duckdb/src/storage/statistics/list_stats.cpp +21 -0
  158. package/src/duckdb/src/storage/statistics/numeric_stats.cpp +126 -1
  159. package/src/duckdb/src/storage/statistics/string_stats.cpp +23 -0
  160. package/src/duckdb/src/storage/statistics/struct_stats.cpp +27 -0
  161. package/src/duckdb/src/storage/storage_info.cpp +1 -1
  162. package/src/duckdb/src/storage/table/chunk_info.cpp +82 -3
  163. package/src/duckdb/src/storage/table/row_group.cpp +68 -1
  164. package/src/duckdb/src/storage/table/table_statistics.cpp +21 -0
  165. package/src/duckdb/src/storage/wal_replay.cpp +2 -2
  166. package/src/duckdb/src/verification/deserialized_statement_verifier_v2.cpp +15 -1
  167. package/src/duckdb/src/verification/statement_verifier.cpp +2 -0
  168. package/src/duckdb/third_party/utf8proc/include/utf8proc_wrapper.hpp +8 -0
  169. package/src/duckdb/ub_src_execution.cpp +0 -2
  170. package/src/duckdb/ub_src_execution_operator_csv_scanner.cpp +18 -0
  171. package/src/duckdb/ub_src_execution_operator_csv_scanner_sniffer.cpp +12 -0
  172. package/src/duckdb/ub_src_execution_operator_persistent.cpp +0 -12
  173. package/src/duckdb/ub_src_storage_serialization.cpp +2 -0
  174. package/src/duckdb/src/execution/operator/persistent/buffered_csv_reader.cpp +0 -1487
  175. package/src/duckdb/src/execution/operator/persistent/csv_buffer.cpp +0 -72
  176. package/src/duckdb/src/execution/operator/persistent/csv_file_handle.cpp +0 -158
  177. package/src/duckdb/src/execution/partitionable_hashtable.cpp +0 -207
  178. package/src/duckdb/src/include/duckdb/execution/operator/persistent/buffered_csv_reader.hpp +0 -133
  179. package/src/duckdb/src/include/duckdb/execution/operator/persistent/csv_buffer.hpp +0 -74
  180. package/src/duckdb/src/include/duckdb/execution/partitionable_hashtable.hpp +0 -73
@@ -8,7 +8,7 @@ namespace duckdb {
8
8
 
9
9
  PartitionedTupleData::PartitionedTupleData(PartitionedTupleDataType type_p, BufferManager &buffer_manager_p,
10
10
  const TupleDataLayout &layout_p)
11
- : type(type_p), buffer_manager(buffer_manager_p), layout(layout_p.Copy()),
11
+ : type(type_p), buffer_manager(buffer_manager_p), layout(layout_p.Copy()), count(0), data_size(0),
12
12
  allocators(make_shared<PartitionTupleDataAllocators>()) {
13
13
  }
14
14
 
@@ -19,6 +19,10 @@ PartitionedTupleData::PartitionedTupleData(const PartitionedTupleData &other)
19
19
  PartitionedTupleData::~PartitionedTupleData() {
20
20
  }
21
21
 
22
+ const TupleDataLayout &PartitionedTupleData::GetLayout() const {
23
+ return layout;
24
+ }
25
+
22
26
  PartitionedTupleDataType PartitionedTupleData::GetType() const {
23
27
  return type;
24
28
  }
@@ -26,6 +30,7 @@ PartitionedTupleDataType PartitionedTupleData::GetType() const {
26
30
  void PartitionedTupleData::InitializeAppendState(PartitionedTupleDataAppendState &state,
27
31
  TupleDataPinProperties properties) const {
28
32
  state.partition_sel.Initialize();
33
+ state.reverse_partition_sel.Initialize();
29
34
 
30
35
  vector<column_t> column_ids;
31
36
  column_ids.reserve(layout.ColumnCount());
@@ -36,97 +41,178 @@ void PartitionedTupleData::InitializeAppendState(PartitionedTupleDataAppendState
36
41
  InitializeAppendStateInternal(state, properties);
37
42
  }
38
43
 
39
- void PartitionedTupleData::Append(PartitionedTupleDataAppendState &state, DataChunk &input) {
44
+ void PartitionedTupleData::Append(PartitionedTupleDataAppendState &state, DataChunk &input,
45
+ const SelectionVector &append_sel, const idx_t append_count) {
46
+ TupleDataCollection::ToUnifiedFormat(state.chunk_state, input);
47
+ AppendUnified(state, input, append_sel, append_count);
48
+ }
49
+
50
+ bool PartitionedTupleData::UseFixedSizeMap() const {
51
+ return MaxPartitionIndex() < PartitionedTupleDataAppendState::MAP_THRESHOLD;
52
+ }
53
+
54
+ void PartitionedTupleData::AppendUnified(PartitionedTupleDataAppendState &state, DataChunk &input,
55
+ const SelectionVector &append_sel, const idx_t append_count) {
56
+ const idx_t actual_append_count = append_count == DConstants::INVALID_INDEX ? input.size() : append_count;
57
+
40
58
  // Compute partition indices and store them in state.partition_indices
41
59
  ComputePartitionIndices(state, input);
42
60
 
43
61
  // Build the selection vector for the partitions
44
- BuildPartitionSel(state, input.size());
62
+ BuildPartitionSel(state, append_sel, actual_append_count);
45
63
 
46
64
  // Early out: check if everything belongs to a single partition
47
- const auto &partition_entries = state.partition_entries;
48
- if (partition_entries.size() == 1) {
49
- const auto &partition_index = partition_entries.begin()->first;
50
- auto &partition = *partitions[partition_index];
51
- auto &partition_pin_state = *state.partition_pin_states[partition_index];
52
- partition.Append(partition_pin_state, state.chunk_state, input);
53
- return;
65
+ optional_idx partition_index;
66
+ if (UseFixedSizeMap()) {
67
+ if (state.fixed_partition_entries.size() == 1) {
68
+ partition_index = state.fixed_partition_entries.begin().GetKey();
69
+ }
70
+ } else {
71
+ if (state.partition_entries.size() == 1) {
72
+ partition_index = state.partition_entries.begin()->first;
73
+ }
54
74
  }
75
+ if (partition_index.IsValid()) {
76
+ auto &partition = *partitions[partition_index.GetIndex()];
77
+ auto &partition_pin_state = *state.partition_pin_states[partition_index.GetIndex()];
55
78
 
56
- TupleDataCollection::ToUnifiedFormat(state.chunk_state, input);
79
+ const auto size_before = partition.SizeInBytes();
80
+ partition.AppendUnified(partition_pin_state, state.chunk_state, input, append_sel, actual_append_count);
81
+ data_size += partition.SizeInBytes() - size_before;
82
+ } else {
83
+ // Compute the heap sizes for the whole chunk
84
+ if (!layout.AllConstant()) {
85
+ TupleDataCollection::ComputeHeapSizes(state.chunk_state, input, state.partition_sel, actual_append_count);
86
+ }
57
87
 
58
- // Compute the heap sizes for the whole chunk
59
- if (!layout.AllConstant()) {
60
- TupleDataCollection::ComputeHeapSizes(state.chunk_state, input, state.partition_sel, input.size());
61
- }
88
+ // Build the buffer space
89
+ BuildBufferSpace(state);
62
90
 
63
- // Build the buffer space
64
- BuildBufferSpace(state);
91
+ // Now scatter everything in one go
92
+ partitions[0]->Scatter(state.chunk_state, input, state.partition_sel, actual_append_count);
93
+ }
65
94
 
66
- // Now scatter everything in one go
67
- partitions[0]->Scatter(state.chunk_state, input, state.partition_sel, input.size());
95
+ count += actual_append_count;
96
+ Verify();
68
97
  }
69
98
 
70
- void PartitionedTupleData::Append(PartitionedTupleDataAppendState &state, TupleDataChunkState &input, idx_t count) {
99
+ void PartitionedTupleData::Append(PartitionedTupleDataAppendState &state, TupleDataChunkState &input,
100
+ const idx_t append_count) {
71
101
  // Compute partition indices and store them in state.partition_indices
72
- ComputePartitionIndices(input.row_locations, count, state.partition_indices);
102
+ ComputePartitionIndices(input.row_locations, append_count, state.partition_indices);
73
103
 
74
104
  // Build the selection vector for the partitions
75
- BuildPartitionSel(state, count);
105
+ BuildPartitionSel(state, *FlatVector::IncrementalSelectionVector(), append_count);
76
106
 
77
107
  // Early out: check if everything belongs to a single partition
78
- auto &partition_entries = state.partition_entries;
79
- if (partition_entries.size() == 1) {
80
- const auto &partition_index = partition_entries.begin()->first;
81
- auto &partition = *partitions[partition_index];
82
- auto &partition_pin_state = *state.partition_pin_states[partition_index];
108
+ optional_idx partition_index;
109
+ if (UseFixedSizeMap()) {
110
+ if (state.fixed_partition_entries.size() == 1) {
111
+ partition_index = state.fixed_partition_entries.begin().GetKey();
112
+ }
113
+ } else {
114
+ if (state.partition_entries.size() == 1) {
115
+ partition_index = state.partition_entries.begin()->first;
116
+ }
117
+ }
118
+
119
+ if (partition_index.IsValid()) {
120
+ auto &partition = *partitions[partition_index.GetIndex()];
121
+ auto &partition_pin_state = *state.partition_pin_states[partition_index.GetIndex()];
83
122
 
84
123
  state.chunk_state.heap_sizes.Reference(input.heap_sizes);
85
- partition.Build(partition_pin_state, state.chunk_state, 0, count);
86
- partition.CopyRows(state.chunk_state, input, *FlatVector::IncrementalSelectionVector(), count);
87
- return;
124
+
125
+ const auto size_before = partition.SizeInBytes();
126
+ partition.Build(partition_pin_state, state.chunk_state, 0, append_count);
127
+ data_size += partition.SizeInBytes() - size_before;
128
+
129
+ partition.CopyRows(state.chunk_state, input, *FlatVector::IncrementalSelectionVector(), append_count);
130
+ } else {
131
+ // Build the buffer space
132
+ state.chunk_state.heap_sizes.Slice(input.heap_sizes, state.partition_sel, append_count);
133
+ state.chunk_state.heap_sizes.Flatten(append_count);
134
+ BuildBufferSpace(state);
135
+
136
+ // Copy the rows
137
+ partitions[0]->CopyRows(state.chunk_state, input, state.partition_sel, append_count);
138
+ }
139
+
140
+ count += append_count;
141
+ Verify();
142
+ }
143
+
144
+ // LCOV_EXCL_START
145
+ template <class MAP_TYPE>
146
+ struct UnorderedMapGetter {
147
+ static inline const typename MAP_TYPE::key_type &GetKey(typename MAP_TYPE::iterator &iterator) {
148
+ return iterator->first;
88
149
  }
89
150
 
90
- // Build the buffer space
91
- state.chunk_state.heap_sizes.Slice(input.heap_sizes, state.partition_sel, count);
92
- state.chunk_state.heap_sizes.Flatten(count);
93
- BuildBufferSpace(state);
151
+ static inline const typename MAP_TYPE::key_type &GetKey(const typename MAP_TYPE::const_iterator &iterator) {
152
+ return iterator->first;
153
+ }
154
+
155
+ static inline typename MAP_TYPE::mapped_type &GetValue(typename MAP_TYPE::iterator &iterator) {
156
+ return iterator->second;
157
+ }
158
+
159
+ static inline const typename MAP_TYPE::mapped_type &GetValue(const typename MAP_TYPE::const_iterator &iterator) {
160
+ return iterator->second;
161
+ }
162
+ };
163
+
164
+ template <class T>
165
+ struct FixedSizeMapGetter {
166
+ static inline const idx_t &GetKey(fixed_size_map_iterator_t<T> &iterator) {
167
+ return iterator.GetKey();
168
+ }
94
169
 
95
- // Copy the rows
96
- partitions[0]->CopyRows(state.chunk_state, input, state.partition_sel, count);
170
+ static inline const idx_t &GetKey(const const_fixed_size_map_iterator_t<T> &iterator) {
171
+ return iterator.GetKey();
172
+ }
173
+
174
+ static inline T &GetValue(fixed_size_map_iterator_t<T> &iterator) {
175
+ return iterator.GetValue();
176
+ }
177
+
178
+ static inline const T &GetValue(const const_fixed_size_map_iterator_t<T> &iterator) {
179
+ return iterator.GetValue();
180
+ }
181
+ };
182
+ // LCOV_EXCL_STOP
183
+
184
+ void PartitionedTupleData::BuildPartitionSel(PartitionedTupleDataAppendState &state, const SelectionVector &append_sel,
185
+ const idx_t append_count) {
186
+ if (UseFixedSizeMap()) {
187
+ BuildPartitionSel<fixed_size_map_t<list_entry_t>, FixedSizeMapGetter<list_entry_t>>(
188
+ state, state.fixed_partition_entries, append_sel, append_count);
189
+ } else {
190
+ BuildPartitionSel<perfect_map_t<list_entry_t>, UnorderedMapGetter<perfect_map_t<list_entry_t>>>(
191
+ state, state.partition_entries, append_sel, append_count);
192
+ }
97
193
  }
98
194
 
99
- void PartitionedTupleData::BuildPartitionSel(PartitionedTupleDataAppendState &state, idx_t count) {
195
+ template <class MAP_TYPE, class GETTER>
196
+ void PartitionedTupleData::BuildPartitionSel(PartitionedTupleDataAppendState &state, MAP_TYPE &partition_entries,
197
+ const SelectionVector &append_sel, const idx_t append_count) {
100
198
  const auto partition_indices = FlatVector::GetData<idx_t>(state.partition_indices);
101
- auto &partition_entries = state.partition_entries;
102
- auto &partition_entries_arr = state.partition_entries_arr;
103
199
  partition_entries.clear();
104
200
 
105
- const auto max_partition_index = MaxPartitionIndex();
106
- const auto use_arr = max_partition_index < PartitionedTupleDataAppendState::MAP_THRESHOLD;
107
-
108
201
  switch (state.partition_indices.GetVectorType()) {
109
202
  case VectorType::FLAT_VECTOR:
110
- if (use_arr) {
111
- std::fill_n(partition_entries_arr, max_partition_index + 1, list_entry_t(0, 0));
112
- for (idx_t i = 0; i < count; i++) {
113
- const auto &partition_index = partition_indices[i];
114
- partition_entries_arr[partition_index].length++;
115
- }
116
- } else {
117
- for (idx_t i = 0; i < count; i++) {
118
- const auto &partition_index = partition_indices[i];
119
- auto partition_entry = partition_entries.find(partition_index);
120
- if (partition_entry == partition_entries.end()) {
121
- partition_entries.emplace(partition_index, list_entry_t(0, 1));
122
- } else {
123
- partition_entry->second.length++;
124
- }
203
+ for (idx_t i = 0; i < append_count; i++) {
204
+ const auto index = append_sel.get_index(i);
205
+ const auto &partition_index = partition_indices[index];
206
+ auto partition_entry = partition_entries.find(partition_index);
207
+ if (partition_entry == partition_entries.end()) {
208
+ partition_entries[partition_index] = list_entry_t(0, 1);
209
+ } else {
210
+ GETTER::GetValue(partition_entry).length++;
125
211
  }
126
212
  }
127
213
  break;
128
214
  case VectorType::CONSTANT_VECTOR:
129
- partition_entries[partition_indices[0]] = list_entry_t(0, count);
215
+ partition_entries[partition_indices[0]] = list_entry_t(0, append_count);
130
216
  break;
131
217
  default:
132
218
  throw InternalException("Unexpected VectorType in PartitionedTupleData::Append");
@@ -134,64 +220,62 @@ void PartitionedTupleData::BuildPartitionSel(PartitionedTupleDataAppendState &st
134
220
 
135
221
  // Early out: check if everything belongs to a single partition
136
222
  if (partition_entries.size() == 1) {
223
+ // This needs to be initialized, even if we go the short path here
224
+ for (idx_t i = 0; i < append_count; i++) {
225
+ const auto index = append_sel.get_index(i);
226
+ state.reverse_partition_sel[index] = i;
227
+ }
137
228
  return;
138
229
  }
139
230
 
140
231
  // Compute offsets from the counts
141
232
  idx_t offset = 0;
142
- if (use_arr) {
143
- for (idx_t partition_index = 0; partition_index <= max_partition_index; partition_index++) {
144
- auto &partition_entry = partition_entries_arr[partition_index];
145
- partition_entry.offset = offset;
146
- offset += partition_entry.length;
147
- }
148
- } else {
149
- for (auto &pc : partition_entries) {
150
- auto &partition_entry = pc.second;
151
- partition_entry.offset = offset;
152
- offset += partition_entry.length;
153
- }
233
+ for (auto it = partition_entries.begin(); it != partition_entries.end(); ++it) {
234
+ auto &partition_entry = GETTER::GetValue(it);
235
+ partition_entry.offset = offset;
236
+ offset += partition_entry.length;
154
237
  }
155
238
 
156
239
  // Now initialize a single selection vector that acts as a selection vector for every partition
157
- auto &all_partitions_sel = state.partition_sel;
158
- if (use_arr) {
159
- for (idx_t i = 0; i < count; i++) {
160
- const auto &partition_index = partition_indices[i];
161
- auto &partition_offset = partition_entries_arr[partition_index].offset;
162
- all_partitions_sel[partition_offset++] = i;
163
- }
164
- // Now just add it to the map anyway so the rest of the functionality is shared
165
- for (idx_t partition_index = 0; partition_index <= max_partition_index; partition_index++) {
166
- const auto &partition_entry = partition_entries_arr[partition_index];
167
- if (partition_entry.length != 0) {
168
- partition_entries.emplace(partition_index, partition_entry);
169
- }
170
- }
171
- } else {
172
- for (idx_t i = 0; i < count; i++) {
173
- const auto &partition_index = partition_indices[i];
174
- auto &partition_offset = partition_entries[partition_index].offset;
175
- all_partitions_sel[partition_offset++] = i;
176
- }
240
+ auto &partition_sel = state.partition_sel;
241
+ auto &reverse_partition_sel = state.reverse_partition_sel;
242
+ for (idx_t i = 0; i < append_count; i++) {
243
+ const auto index = append_sel.get_index(i);
244
+ const auto &partition_index = partition_indices[index];
245
+ auto &partition_offset = partition_entries[partition_index].offset;
246
+ reverse_partition_sel[index] = partition_offset;
247
+ partition_sel[partition_offset++] = index;
177
248
  }
178
249
  }
179
250
 
180
251
  void PartitionedTupleData::BuildBufferSpace(PartitionedTupleDataAppendState &state) {
181
- for (auto &pc : state.partition_entries) {
182
- const auto &partition_index = pc.first;
252
+ if (UseFixedSizeMap()) {
253
+ BuildBufferSpace<fixed_size_map_t<list_entry_t>, FixedSizeMapGetter<list_entry_t>>(
254
+ state, state.fixed_partition_entries);
255
+ } else {
256
+ BuildBufferSpace<perfect_map_t<list_entry_t>, UnorderedMapGetter<perfect_map_t<list_entry_t>>>(
257
+ state, state.partition_entries);
258
+ }
259
+ }
260
+
261
+ template <class MAP_TYPE, class GETTER>
262
+ void PartitionedTupleData::BuildBufferSpace(PartitionedTupleDataAppendState &state, const MAP_TYPE &partition_entries) {
263
+ for (auto it = partition_entries.begin(); it != partition_entries.end(); ++it) {
264
+ const auto &partition_index = GETTER::GetKey(it);
183
265
 
184
266
  // Partition, pin state for this partition index
185
267
  auto &partition = *partitions[partition_index];
186
268
  auto &partition_pin_state = *state.partition_pin_states[partition_index];
187
269
 
188
270
  // Length and offset for this partition
189
- const auto &partition_entry = pc.second;
271
+ const auto &partition_entry = GETTER::GetValue(it);
190
272
  const auto &partition_length = partition_entry.length;
191
273
  const auto partition_offset = partition_entry.offset - partition_length;
192
274
 
193
275
  // Build out the buffer space for this partition
276
+ const auto size_before = partition.SizeInBytes();
194
277
  partition.Build(partition_pin_state, state.chunk_state, partition_offset, partition_length);
278
+ data_size += partition.SizeInBytes() - size_before;
195
279
  }
196
280
  }
197
281
 
@@ -210,7 +294,6 @@ void PartitionedTupleData::Combine(PartitionedTupleData &other) {
210
294
 
211
295
  // Now combine the state's partitions into this
212
296
  lock_guard<mutex> guard(lock);
213
-
214
297
  if (partitions.empty()) {
215
298
  // This is the first merge, we just copy them over
216
299
  partitions = std::move(other.partitions);
@@ -221,40 +304,28 @@ void PartitionedTupleData::Combine(PartitionedTupleData &other) {
221
304
  partitions[i]->Combine(*other.partitions[i]);
222
305
  }
223
306
  }
307
+ this->count += other.count;
308
+ this->data_size += other.data_size;
309
+ Verify();
224
310
  }
225
311
 
226
- void PartitionedTupleData::Partition(TupleDataCollection &source, TupleDataPinProperties properties) {
227
- if (source.Count() == 0) {
228
- return;
229
- }
230
- #ifdef DEBUG
231
- const auto count_before = source.Count();
232
- #endif
233
-
234
- PartitionedTupleDataAppendState append_state;
235
- InitializeAppendState(append_state, properties);
236
-
237
- TupleDataChunkIterator iterator(source, TupleDataPinProperties::DESTROY_AFTER_DONE, true);
238
- auto &chunk_state = iterator.GetChunkState();
239
- do {
240
- Append(append_state, chunk_state, iterator.GetCurrentChunkCount());
241
- } while (iterator.Next());
242
-
243
- FlushAppendState(append_state);
244
- source.Reset();
245
-
246
- #ifdef DEBUG
247
- idx_t count_after = 0;
248
- for (const auto &partition : partitions) {
249
- count_after += partition->Count();
312
+ void PartitionedTupleData::Reset() {
313
+ for (auto &partition : partitions) {
314
+ partition->Reset();
250
315
  }
251
- D_ASSERT(count_before == count_after);
252
- #endif
316
+ this->count = 0;
317
+ this->data_size = 0;
318
+ Verify();
253
319
  }
254
320
 
255
321
  void PartitionedTupleData::Repartition(PartitionedTupleData &new_partitioned_data) {
256
322
  D_ASSERT(layout.GetTypes() == new_partitioned_data.layout.GetTypes());
257
323
 
324
+ if (partitions.size() == new_partitioned_data.partitions.size()) {
325
+ new_partitioned_data.Combine(*this);
326
+ return;
327
+ }
328
+
258
329
  PartitionedTupleDataAppendState append_state;
259
330
  new_partitioned_data.InitializeAppendState(append_state);
260
331
 
@@ -279,20 +350,42 @@ void PartitionedTupleData::Repartition(PartitionedTupleData &new_partitioned_dat
279
350
  }
280
351
  partitions[actual_partition_idx]->Reset();
281
352
  }
282
-
283
353
  new_partitioned_data.FlushAppendState(append_state);
354
+
355
+ count = 0;
356
+ data_size = 0;
357
+
358
+ Verify();
359
+ }
360
+
361
+ void PartitionedTupleData::Unpin() {
362
+ for (auto &partition : partitions) {
363
+ partition->Unpin();
364
+ }
284
365
  }
285
366
 
286
367
  vector<unique_ptr<TupleDataCollection>> &PartitionedTupleData::GetPartitions() {
287
368
  return partitions;
288
369
  }
289
370
 
290
- idx_t PartitionedTupleData::Count() const {
291
- idx_t total_count = 0;
292
- for (auto &partition : partitions) {
293
- total_count += partition->Count();
371
+ unique_ptr<TupleDataCollection> PartitionedTupleData::GetUnpartitioned() {
372
+ auto data_collection = std::move(partitions[0]);
373
+ partitions[0] = make_uniq<TupleDataCollection>(buffer_manager, layout);
374
+
375
+ for (idx_t i = 1; i < partitions.size(); i++) {
376
+ data_collection->Combine(*partitions[i]);
294
377
  }
295
- return total_count;
378
+ count = 0;
379
+ data_size = 0;
380
+
381
+ data_collection->Verify();
382
+ Verify();
383
+
384
+ return data_collection;
385
+ }
386
+
387
+ idx_t PartitionedTupleData::Count() const {
388
+ return count;
296
389
  }
297
390
 
298
391
  idx_t PartitionedTupleData::SizeInBytes() const {
@@ -303,6 +396,39 @@ idx_t PartitionedTupleData::SizeInBytes() const {
303
396
  return total_size;
304
397
  }
305
398
 
399
+ idx_t PartitionedTupleData::PartitionCount() const {
400
+ return partitions.size();
401
+ }
402
+
403
+ void PartitionedTupleData::Verify() const {
404
+ #ifdef DEBUG
405
+ idx_t total_count = 0;
406
+ idx_t total_size = 0;
407
+ for (auto &partition : partitions) {
408
+ partition->Verify();
409
+ total_count += partition->Count();
410
+ total_size += partition->SizeInBytes();
411
+ }
412
+ D_ASSERT(total_count == this->count);
413
+ D_ASSERT(total_size == this->data_size);
414
+ #endif
415
+ }
416
+
417
+ // LCOV_EXCL_START
418
+ string PartitionedTupleData::ToString() {
419
+ string result =
420
+ StringUtil::Format("PartitionedTupleData - [%llu Partitions, %llu Rows]\n", partitions.size(), Count());
421
+ for (idx_t partition_idx = 0; partition_idx < partitions.size(); partition_idx++) {
422
+ result += StringUtil::Format("Partition %llu: ", partition_idx) + partitions[partition_idx]->ToString();
423
+ }
424
+ return result;
425
+ }
426
+
427
+ void PartitionedTupleData::Print() {
428
+ Printer::Print(ToString());
429
+ }
430
+ // LCOV_EXCL_STOP
431
+
306
432
  void PartitionedTupleData::CreateAllocator() {
307
433
  allocators->allocators.emplace_back(make_shared<TupleDataAllocator>(buffer_manager, layout));
308
434
  }
@@ -12,11 +12,10 @@
12
12
 
13
13
  namespace duckdb {
14
14
 
15
- RowLayout::RowLayout()
16
- : flag_width(0), data_width(0), aggr_width(0), row_width(0), all_constant(true), heap_pointer_offset(0) {
15
+ RowLayout::RowLayout() : flag_width(0), data_width(0), row_width(0), all_constant(true), heap_pointer_offset(0) {
17
16
  }
18
17
 
19
- void RowLayout::Initialize(vector<LogicalType> types_p, Aggregates aggregates_p, bool align) {
18
+ void RowLayout::Initialize(vector<LogicalType> types_p, bool align) {
20
19
  offsets.clear();
21
20
  types = std::move(types_p);
22
21
 
@@ -31,7 +30,7 @@ void RowLayout::Initialize(vector<LogicalType> types_p, Aggregates aggregates_p,
31
30
 
32
31
  // This enables pointer swizzling for out-of-core computation.
33
32
  if (!all_constant) {
34
- // When unswizzled the pointer lives here.
33
+ // When unswizzled, the pointer lives here.
35
34
  // When swizzled, the pointer is replaced by an offset.
36
35
  heap_pointer_offset = row_width;
37
36
  // The 8 byte pointer will be replaced with an 8 byte idx_t when swizzled.
@@ -52,39 +51,12 @@ void RowLayout::Initialize(vector<LogicalType> types_p, Aggregates aggregates_p,
52
51
  }
53
52
  }
54
53
 
55
- // Alignment padding for aggregates
56
- #ifndef DUCKDB_ALLOW_UNDEFINED
57
- if (align) {
58
- row_width = AlignValue(row_width);
59
- }
60
- #endif
61
54
  data_width = row_width - flag_width;
62
55
 
63
- // Aggregate fields.
64
- aggregates = std::move(aggregates_p);
65
- for (auto &aggregate : aggregates) {
66
- offsets.push_back(row_width);
67
- row_width += aggregate.payload_size;
68
- #ifndef DUCKDB_ALLOW_UNDEFINED
69
- D_ASSERT(aggregate.payload_size == AlignValue(aggregate.payload_size));
70
- #endif
71
- }
72
- aggr_width = row_width - data_width - flag_width;
73
-
74
56
  // Alignment padding for the next row
75
- #ifndef DUCKDB_ALLOW_UNDEFINED
76
57
  if (align) {
77
58
  row_width = AlignValue(row_width);
78
59
  }
79
- #endif
80
- }
81
-
82
- void RowLayout::Initialize(vector<LogicalType> types_p, bool align) {
83
- Initialize(std::move(types_p), Aggregates(), align);
84
- }
85
-
86
- void RowLayout::Initialize(Aggregates aggregates_p, bool align) {
87
- Initialize(vector<LogicalType>(), std::move(aggregates_p), align);
88
60
  }
89
61
 
90
62
  } // namespace duckdb