duckdb 0.8.2-dev3458.0 → 0.8.2-dev3949.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (180) hide show
  1. package/binding.gyp +2 -0
  2. package/package.json +1 -1
  3. package/src/duckdb/extension/icu/icu_extension.cpp +5 -5
  4. package/src/duckdb/extension/json/include/json_deserializer.hpp +7 -16
  5. package/src/duckdb/extension/json/include/json_serializer.hpp +9 -15
  6. package/src/duckdb/extension/json/json_deserializer.cpp +29 -67
  7. package/src/duckdb/extension/json/json_scan.cpp +1 -1
  8. package/src/duckdb/extension/json/json_serializer.cpp +26 -69
  9. package/src/duckdb/src/common/enum_util.cpp +119 -7
  10. package/src/duckdb/src/common/extra_type_info.cpp +7 -3
  11. package/src/duckdb/src/common/radix_partitioning.cpp +8 -31
  12. package/src/duckdb/src/common/row_operations/row_aggregate.cpp +18 -3
  13. package/src/duckdb/src/common/serializer/binary_deserializer.cpp +62 -77
  14. package/src/duckdb/src/common/serializer/binary_serializer.cpp +84 -84
  15. package/src/duckdb/src/common/serializer/format_serializer.cpp +1 -1
  16. package/src/duckdb/src/common/sort/partition_state.cpp +41 -33
  17. package/src/duckdb/src/common/types/data_chunk.cpp +44 -8
  18. package/src/duckdb/src/common/types/hyperloglog.cpp +21 -0
  19. package/src/duckdb/src/common/types/interval.cpp +3 -0
  20. package/src/duckdb/src/common/types/row/partitioned_tuple_data.cpp +252 -126
  21. package/src/duckdb/src/common/types/row/row_layout.cpp +3 -31
  22. package/src/duckdb/src/common/types/row/tuple_data_allocator.cpp +40 -32
  23. package/src/duckdb/src/common/types/row/tuple_data_collection.cpp +39 -26
  24. package/src/duckdb/src/common/types/row/tuple_data_layout.cpp +11 -1
  25. package/src/duckdb/src/common/types/row/tuple_data_segment.cpp +21 -16
  26. package/src/duckdb/src/common/types/value.cpp +63 -42
  27. package/src/duckdb/src/common/types/vector.cpp +33 -67
  28. package/src/duckdb/src/core_functions/scalar/list/list_lambdas.cpp +3 -2
  29. package/src/duckdb/src/execution/aggregate_hashtable.cpp +222 -364
  30. package/src/duckdb/src/execution/join_hashtable.cpp +5 -6
  31. package/src/duckdb/src/execution/operator/aggregate/physical_hash_aggregate.cpp +240 -310
  32. package/src/duckdb/src/execution/operator/aggregate/physical_ungrouped_aggregate.cpp +202 -173
  33. package/src/duckdb/src/execution/operator/aggregate/physical_window.cpp +36 -2
  34. package/src/duckdb/src/execution/operator/{persistent → csv_scanner}/base_csv_reader.cpp +58 -162
  35. package/src/duckdb/src/execution/operator/csv_scanner/buffered_csv_reader.cpp +434 -0
  36. package/src/duckdb/src/execution/operator/csv_scanner/csv_buffer.cpp +80 -0
  37. package/src/duckdb/src/execution/operator/csv_scanner/csv_buffer_manager.cpp +90 -0
  38. package/src/duckdb/src/execution/operator/csv_scanner/csv_file_handle.cpp +95 -0
  39. package/src/duckdb/src/execution/operator/{persistent → csv_scanner}/csv_reader_options.cpp +47 -28
  40. package/src/duckdb/src/execution/operator/csv_scanner/csv_state_machine.cpp +35 -0
  41. package/src/duckdb/src/execution/operator/csv_scanner/csv_state_machine_cache.cpp +107 -0
  42. package/src/duckdb/src/execution/operator/{persistent → csv_scanner}/parallel_csv_reader.cpp +44 -44
  43. package/src/duckdb/src/execution/operator/csv_scanner/sniffer/csv_sniffer.cpp +52 -0
  44. package/src/duckdb/src/execution/operator/csv_scanner/sniffer/dialect_detection.cpp +336 -0
  45. package/src/duckdb/src/execution/operator/csv_scanner/sniffer/header_detection.cpp +165 -0
  46. package/src/duckdb/src/execution/operator/csv_scanner/sniffer/type_detection.cpp +398 -0
  47. package/src/duckdb/src/execution/operator/csv_scanner/sniffer/type_refinement.cpp +175 -0
  48. package/src/duckdb/src/execution/operator/csv_scanner/sniffer/type_replacement.cpp +39 -0
  49. package/src/duckdb/src/execution/operator/join/physical_asof_join.cpp +1 -1
  50. package/src/duckdb/src/execution/operator/set/physical_recursive_cte.cpp +1 -2
  51. package/src/duckdb/src/execution/radix_partitioned_hashtable.cpp +614 -574
  52. package/src/duckdb/src/execution/window_executor.cpp +6 -5
  53. package/src/duckdb/src/function/cast/cast_function_set.cpp +1 -0
  54. package/src/duckdb/src/function/scalar/strftime_format.cpp +4 -4
  55. package/src/duckdb/src/function/table/copy_csv.cpp +94 -96
  56. package/src/duckdb/src/function/table/read_csv.cpp +150 -136
  57. package/src/duckdb/src/function/table/table_scan.cpp +0 -2
  58. package/src/duckdb/src/function/table/version/pragma_version.cpp +2 -2
  59. package/src/duckdb/src/include/duckdb/common/enum_util.hpp +24 -0
  60. package/src/duckdb/src/include/duckdb/common/file_opener.hpp +9 -0
  61. package/src/duckdb/src/include/duckdb/common/fixed_size_map.hpp +208 -0
  62. package/src/duckdb/src/include/duckdb/common/optional_idx.hpp +3 -0
  63. package/src/duckdb/src/include/duckdb/common/perfect_map_set.hpp +2 -1
  64. package/src/duckdb/src/include/duckdb/common/printer.hpp +11 -0
  65. package/src/duckdb/src/include/duckdb/common/serializer/binary_deserializer.hpp +43 -30
  66. package/src/duckdb/src/include/duckdb/common/serializer/binary_serializer.hpp +36 -35
  67. package/src/duckdb/src/include/duckdb/common/serializer/deserialization_data.hpp +18 -0
  68. package/src/duckdb/src/include/duckdb/common/serializer/encoding_util.hpp +132 -0
  69. package/src/duckdb/src/include/duckdb/common/serializer/format_deserializer.hpp +125 -150
  70. package/src/duckdb/src/include/duckdb/common/serializer/format_serializer.hpp +119 -107
  71. package/src/duckdb/src/include/duckdb/common/serializer/serialization_traits.hpp +2 -1
  72. package/src/duckdb/src/include/duckdb/common/shared_ptr.hpp +8 -0
  73. package/src/duckdb/src/include/duckdb/common/sort/partition_state.hpp +13 -7
  74. package/src/duckdb/src/include/duckdb/common/types/data_chunk.hpp +5 -0
  75. package/src/duckdb/src/include/duckdb/common/types/hyperloglog.hpp +7 -1
  76. package/src/duckdb/src/include/duckdb/common/types/interval.hpp +7 -0
  77. package/src/duckdb/src/include/duckdb/common/types/row/partitioned_tuple_data.hpp +41 -9
  78. package/src/duckdb/src/include/duckdb/common/types/row/row_data_collection_scanner.hpp +5 -0
  79. package/src/duckdb/src/include/duckdb/common/types/row/row_layout.hpp +1 -23
  80. package/src/duckdb/src/include/duckdb/common/types/row/tuple_data_allocator.hpp +14 -8
  81. package/src/duckdb/src/include/duckdb/common/types/row/tuple_data_collection.hpp +6 -3
  82. package/src/duckdb/src/include/duckdb/common/types/row/tuple_data_layout.hpp +7 -0
  83. package/src/duckdb/src/include/duckdb/common/types/row/tuple_data_segment.hpp +13 -8
  84. package/src/duckdb/src/include/duckdb/common/types/row/tuple_data_states.hpp +3 -2
  85. package/src/duckdb/src/include/duckdb/common/types/vector.hpp +3 -3
  86. package/src/duckdb/src/include/duckdb/common/vector.hpp +2 -2
  87. package/src/duckdb/src/include/duckdb/execution/aggregate_hashtable.hpp +125 -146
  88. package/src/duckdb/src/include/duckdb/execution/operator/aggregate/physical_hash_aggregate.hpp +5 -4
  89. package/src/duckdb/src/include/duckdb/execution/operator/aggregate/physical_window.hpp +4 -3
  90. package/src/duckdb/src/include/duckdb/execution/operator/{persistent → scan/csv}/base_csv_reader.hpp +17 -17
  91. package/src/duckdb/src/include/duckdb/execution/operator/scan/csv/buffered_csv_reader.hpp +72 -0
  92. package/src/duckdb/src/include/duckdb/execution/operator/scan/csv/csv_buffer.hpp +110 -0
  93. package/src/duckdb/src/include/duckdb/execution/operator/scan/csv/csv_buffer_manager.hpp +103 -0
  94. package/src/duckdb/src/include/duckdb/execution/operator/{persistent → scan/csv}/csv_file_handle.hpp +8 -15
  95. package/src/duckdb/src/include/duckdb/execution/operator/{persistent → scan/csv}/csv_line_info.hpp +1 -1
  96. package/src/duckdb/src/include/duckdb/execution/operator/{persistent → scan/csv}/csv_reader_options.hpp +52 -28
  97. package/src/duckdb/src/include/duckdb/execution/operator/scan/csv/csv_sniffer.hpp +127 -0
  98. package/src/duckdb/src/include/duckdb/execution/operator/scan/csv/csv_state_machine.hpp +75 -0
  99. package/src/duckdb/src/include/duckdb/execution/operator/scan/csv/csv_state_machine_cache.hpp +51 -0
  100. package/src/duckdb/src/include/duckdb/execution/operator/{persistent → scan/csv}/parallel_csv_reader.hpp +21 -27
  101. package/src/duckdb/src/include/duckdb/execution/operator/scan/csv/quote_rules.hpp +21 -0
  102. package/src/duckdb/src/include/duckdb/execution/radix_partitioned_hashtable.hpp +18 -27
  103. package/src/duckdb/src/include/duckdb/function/function_serialization.hpp +5 -6
  104. package/src/duckdb/src/include/duckdb/function/scalar/strftime_format.hpp +4 -4
  105. package/src/duckdb/src/include/duckdb/function/table/read_csv.hpp +17 -12
  106. package/src/duckdb/src/include/duckdb/main/client_context_file_opener.hpp +1 -0
  107. package/src/duckdb/src/include/duckdb/main/client_data.hpp +2 -1
  108. package/src/duckdb/src/include/duckdb/main/config.hpp +1 -0
  109. package/src/duckdb/src/include/duckdb/main/connection.hpp +2 -2
  110. package/src/duckdb/src/include/duckdb/main/relation/read_csv_relation.hpp +6 -6
  111. package/src/duckdb/src/include/duckdb/parallel/event.hpp +12 -1
  112. package/src/duckdb/src/include/duckdb/storage/block.hpp +6 -0
  113. package/src/duckdb/src/include/duckdb/storage/buffer/block_handle.hpp +3 -0
  114. package/src/duckdb/src/include/duckdb/storage/statistics/base_statistics.hpp +7 -3
  115. package/src/duckdb/src/include/duckdb/storage/statistics/column_statistics.hpp +4 -0
  116. package/src/duckdb/src/include/duckdb/storage/statistics/distinct_statistics.hpp +5 -0
  117. package/src/duckdb/src/include/duckdb/storage/statistics/list_stats.hpp +3 -0
  118. package/src/duckdb/src/include/duckdb/storage/statistics/numeric_stats.hpp +3 -0
  119. package/src/duckdb/src/include/duckdb/storage/statistics/string_stats.hpp +3 -0
  120. package/src/duckdb/src/include/duckdb/storage/statistics/struct_stats.hpp +3 -0
  121. package/src/duckdb/src/include/duckdb/storage/table/chunk_info.hpp +15 -3
  122. package/src/duckdb/src/include/duckdb/storage/table/row_group.hpp +4 -0
  123. package/src/duckdb/src/include/duckdb/storage/table/table_statistics.hpp +5 -0
  124. package/src/duckdb/src/include/duckdb/verification/deserialized_statement_verifier_v2.hpp +6 -0
  125. package/src/duckdb/src/include/duckdb/verification/statement_verifier.hpp +1 -0
  126. package/src/duckdb/src/include/duckdb.h +12 -0
  127. package/src/duckdb/src/main/capi/logical_types-c.cpp +22 -0
  128. package/src/duckdb/src/main/client_context_file_opener.cpp +17 -0
  129. package/src/duckdb/src/main/client_verify.cpp +1 -0
  130. package/src/duckdb/src/main/config.cpp +2 -2
  131. package/src/duckdb/src/main/connection.cpp +3 -3
  132. package/src/duckdb/src/main/relation/read_csv_relation.cpp +19 -13
  133. package/src/duckdb/src/parallel/pipeline_finish_event.cpp +1 -1
  134. package/src/duckdb/src/parser/tableref/pivotref.cpp +0 -16
  135. package/src/duckdb/src/planner/binder/statement/bind_copy.cpp +1 -1
  136. package/src/duckdb/src/planner/binder/statement/bind_export.cpp +41 -25
  137. package/src/duckdb/src/planner/expression/bound_aggregate_expression.cpp +4 -4
  138. package/src/duckdb/src/planner/expression/bound_window_expression.cpp +10 -10
  139. package/src/duckdb/src/planner/logical_operator.cpp +1 -1
  140. package/src/duckdb/src/planner/planner.cpp +1 -1
  141. package/src/duckdb/src/storage/checkpoint_manager.cpp +4 -3
  142. package/src/duckdb/src/storage/serialization/serialize_constraint.cpp +1 -1
  143. package/src/duckdb/src/storage/serialization/serialize_create_info.cpp +5 -5
  144. package/src/duckdb/src/storage/serialization/serialize_expression.cpp +10 -10
  145. package/src/duckdb/src/storage/serialization/serialize_logical_operator.cpp +20 -20
  146. package/src/duckdb/src/storage/serialization/serialize_macro_function.cpp +2 -2
  147. package/src/duckdb/src/storage/serialization/serialize_nodes.cpp +118 -89
  148. package/src/duckdb/src/storage/serialization/serialize_parse_info.cpp +3 -3
  149. package/src/duckdb/src/storage/serialization/serialize_parsed_expression.cpp +27 -27
  150. package/src/duckdb/src/storage/serialization/serialize_query_node.cpp +16 -16
  151. package/src/duckdb/src/storage/serialization/serialize_result_modifier.cpp +8 -8
  152. package/src/duckdb/src/storage/serialization/serialize_statement.cpp +1 -1
  153. package/src/duckdb/src/storage/serialization/serialize_storage.cpp +39 -0
  154. package/src/duckdb/src/storage/serialization/serialize_tableref.cpp +9 -9
  155. package/src/duckdb/src/storage/statistics/base_statistics.cpp +67 -4
  156. package/src/duckdb/src/storage/statistics/column_statistics.cpp +16 -0
  157. package/src/duckdb/src/storage/statistics/list_stats.cpp +21 -0
  158. package/src/duckdb/src/storage/statistics/numeric_stats.cpp +126 -1
  159. package/src/duckdb/src/storage/statistics/string_stats.cpp +23 -0
  160. package/src/duckdb/src/storage/statistics/struct_stats.cpp +27 -0
  161. package/src/duckdb/src/storage/storage_info.cpp +1 -1
  162. package/src/duckdb/src/storage/table/chunk_info.cpp +82 -3
  163. package/src/duckdb/src/storage/table/row_group.cpp +68 -1
  164. package/src/duckdb/src/storage/table/table_statistics.cpp +21 -0
  165. package/src/duckdb/src/storage/wal_replay.cpp +2 -2
  166. package/src/duckdb/src/verification/deserialized_statement_verifier_v2.cpp +15 -1
  167. package/src/duckdb/src/verification/statement_verifier.cpp +2 -0
  168. package/src/duckdb/third_party/utf8proc/include/utf8proc_wrapper.hpp +8 -0
  169. package/src/duckdb/ub_src_execution.cpp +0 -2
  170. package/src/duckdb/ub_src_execution_operator_csv_scanner.cpp +18 -0
  171. package/src/duckdb/ub_src_execution_operator_csv_scanner_sniffer.cpp +12 -0
  172. package/src/duckdb/ub_src_execution_operator_persistent.cpp +0 -12
  173. package/src/duckdb/ub_src_storage_serialization.cpp +2 -0
  174. package/src/duckdb/src/execution/operator/persistent/buffered_csv_reader.cpp +0 -1487
  175. package/src/duckdb/src/execution/operator/persistent/csv_buffer.cpp +0 -72
  176. package/src/duckdb/src/execution/operator/persistent/csv_file_handle.cpp +0 -158
  177. package/src/duckdb/src/execution/partitionable_hashtable.cpp +0 -207
  178. package/src/duckdb/src/include/duckdb/execution/operator/persistent/buffered_csv_reader.hpp +0 -133
  179. package/src/duckdb/src/include/duckdb/execution/operator/persistent/csv_buffer.hpp +0 -74
  180. package/src/duckdb/src/include/duckdb/execution/partitionable_hashtable.hpp +0 -73
@@ -8,9 +8,9 @@
8
8
 
9
9
  #pragma once
10
10
 
11
+ #include "duckdb/common/enum_util.hpp"
11
12
  #include "duckdb/common/field_writer.hpp"
12
13
  #include "duckdb/common/serializer.hpp"
13
- #include "duckdb/common/enum_util.hpp"
14
14
  #include "duckdb/common/serializer/serialization_traits.hpp"
15
15
  #include "duckdb/common/types/interval.hpp"
16
16
  #include "duckdb/common/types/string_type.hpp"
@@ -20,47 +20,79 @@
20
20
  namespace duckdb {
21
21
 
22
22
  class FormatSerializer {
23
- friend Vector;
24
-
25
23
  protected:
26
24
  bool serialize_enum_as_string = false;
25
+ bool serialize_default_values = false;
26
+
27
+ public:
28
+ class List {
29
+ friend FormatSerializer;
30
+
31
+ private:
32
+ FormatSerializer &serializer;
33
+ explicit List(FormatSerializer &serializer) : serializer(serializer) {
34
+ }
35
+
36
+ public:
37
+ // Serialize an element
38
+ template <class T>
39
+ void WriteElement(const T &value);
40
+
41
+ // Serialize an object
42
+ template <class FUNC>
43
+ void WriteObject(FUNC f);
44
+ };
27
45
 
28
46
  public:
29
47
  // Serialize a value
30
48
  template <class T>
31
49
  void WriteProperty(const field_id_t field_id, const char *tag, const T &value) {
32
- SetTag(field_id, tag);
50
+ OnPropertyBegin(field_id, tag);
33
51
  WriteValue(value);
52
+ OnPropertyEnd();
34
53
  }
35
54
 
36
- // Optional pointer
37
- template <class POINTER>
38
- void WriteOptionalProperty(const field_id_t field_id, const char *tag, POINTER &&ptr) {
39
- SetTag(field_id, tag);
40
- if (ptr == nullptr) {
41
- OnOptionalBegin(false);
42
- OnOptionalEnd(false);
43
- } else {
44
- OnOptionalBegin(true);
45
- WriteValue(*ptr);
46
- OnOptionalEnd(true);
55
+ // Default value
56
+ template <class T>
57
+ void WritePropertyWithDefault(const field_id_t field_id, const char *tag, const T &value, const T &&default_value) {
58
+ // If current value is default, don't write it
59
+ if (!serialize_default_values && (value == default_value)) {
60
+ OnOptionalPropertyBegin(field_id, tag, false);
61
+ OnOptionalPropertyEnd(false);
62
+ return;
47
63
  }
64
+ OnOptionalPropertyBegin(field_id, tag, true);
65
+ WriteValue(value);
66
+ OnOptionalPropertyEnd(true);
48
67
  }
49
68
 
50
69
  // Special case: data_ptr_T
51
70
  void WriteProperty(const field_id_t field_id, const char *tag, const_data_ptr_t ptr, idx_t count) {
52
- SetTag(field_id, tag);
71
+ OnPropertyBegin(field_id, tag);
53
72
  WriteDataPtr(ptr, count);
73
+ OnPropertyEnd();
54
74
  }
55
75
 
56
- // Manually begin an object - should be followed by EndObject
57
- void BeginObject(const field_id_t field_id, const char *tag) {
58
- SetTag(field_id, tag);
76
+ // Manually begin an object
77
+ template <class FUNC>
78
+ void WriteObject(const field_id_t field_id, const char *tag, FUNC f) {
79
+ OnPropertyBegin(field_id, tag);
59
80
  OnObjectBegin();
81
+ f(*this);
82
+ OnObjectEnd();
83
+ OnPropertyEnd();
60
84
  }
61
85
 
62
- void EndObject() {
63
- OnObjectEnd();
86
+ template <class FUNC>
87
+ void WriteList(const field_id_t field_id, const char *tag, idx_t count, FUNC func) {
88
+ OnPropertyBegin(field_id, tag);
89
+ OnListBegin(count);
90
+ List list {*this};
91
+ for (idx_t i = 0; i < count; i++) {
92
+ func(list, i);
93
+ }
94
+ OnListEnd();
95
+ OnPropertyEnd();
64
96
  }
65
97
 
66
98
  protected:
@@ -82,27 +114,38 @@ protected:
82
114
  WriteValue(ptr.get());
83
115
  }
84
116
 
117
+ // Shared Pointer Ref
118
+ template <typename T>
119
+ void WriteValue(const shared_ptr<T> &ptr) {
120
+ WriteValue(ptr.get());
121
+ }
122
+
85
123
  // Pointer
86
124
  template <typename T>
87
- typename std::enable_if<std::is_pointer<T>::value, void>::type WriteValue(const T ptr) {
125
+ void WriteValue(const T *ptr) {
88
126
  if (ptr == nullptr) {
89
- WriteNull();
127
+ OnNullableBegin(false);
128
+ OnNullableEnd();
90
129
  } else {
130
+ OnNullableBegin(true);
91
131
  WriteValue(*ptr);
132
+ OnNullableEnd();
92
133
  }
93
134
  }
94
135
 
95
136
  // Pair
96
137
  template <class K, class V>
97
138
  void WriteValue(const std::pair<K, V> &pair) {
98
- OnPairBegin();
99
- OnPairKeyBegin();
100
- WriteValue(pair.first);
101
- OnPairKeyEnd();
102
- OnPairValueBegin();
103
- WriteValue(pair.second);
104
- OnPairValueEnd();
105
- OnPairEnd();
139
+ OnObjectBegin();
140
+ WriteProperty(0, "first", pair.first);
141
+ WriteProperty(1, "second", pair.second);
142
+ OnObjectEnd();
143
+ }
144
+
145
+ // Reference Wrapper
146
+ template <class T>
147
+ void WriteValue(const reference<T> ref) {
148
+ WriteValue(ref.get());
106
149
  }
107
150
 
108
151
  // Vector
@@ -113,7 +156,7 @@ protected:
113
156
  for (auto &item : vec) {
114
157
  WriteValue(item);
115
158
  }
116
- OnListEnd(count);
159
+ OnListEnd();
117
160
  }
118
161
 
119
162
  template <class T>
@@ -123,7 +166,7 @@ protected:
123
166
  for (auto &item : vec) {
124
167
  WriteValue(item);
125
168
  }
126
- OnListEnd(count);
169
+ OnListEnd();
127
170
  }
128
171
 
129
172
  // UnorderedSet
@@ -135,7 +178,7 @@ protected:
135
178
  for (auto &item : set) {
136
179
  WriteValue(item);
137
180
  }
138
- OnListEnd(count);
181
+ OnListEnd();
139
182
  }
140
183
 
141
184
  // Set
@@ -147,108 +190,65 @@ protected:
147
190
  for (auto &item : set) {
148
191
  WriteValue(item);
149
192
  }
150
- OnListEnd(count);
193
+ OnListEnd();
151
194
  }
152
195
 
153
196
  // Map
197
+ // serialized as a list of pairs
154
198
  template <class K, class V, class HASH, class CMP>
155
199
  void WriteValue(const duckdb::unordered_map<K, V, HASH, CMP> &map) {
156
200
  auto count = map.size();
157
- OnMapBegin(count);
201
+ OnListBegin(count);
158
202
  for (auto &item : map) {
159
- OnMapEntryBegin();
160
- OnMapKeyBegin();
161
- WriteValue(item.first);
162
- OnMapKeyEnd();
163
- OnMapValueBegin();
164
- WriteValue(item.second);
165
- OnMapValueEnd();
166
- OnMapEntryEnd();
203
+ OnObjectBegin();
204
+ WriteProperty(0, "key", item.first);
205
+ WriteProperty(1, "value", item.second);
206
+ OnObjectEnd();
167
207
  }
168
- OnMapEnd(count);
208
+ OnListEnd();
169
209
  }
170
210
 
171
211
  // Map
212
+ // serialized as a list of pairs
172
213
  template <class K, class V, class HASH, class CMP>
173
214
  void WriteValue(const duckdb::map<K, V, HASH, CMP> &map) {
174
215
  auto count = map.size();
175
- OnMapBegin(count);
216
+ OnListBegin(count);
176
217
  for (auto &item : map) {
177
- OnMapEntryBegin();
178
- OnMapKeyBegin();
179
- WriteValue(item.first);
180
- OnMapKeyEnd();
181
- OnMapValueBegin();
182
- WriteValue(item.second);
183
- OnMapValueEnd();
184
- OnMapEntryEnd();
218
+ OnObjectBegin();
219
+ WriteProperty(0, "key", item.first);
220
+ WriteProperty(1, "value", item.second);
221
+ OnObjectEnd();
185
222
  }
186
- OnMapEnd(count);
223
+ OnListEnd();
187
224
  }
188
225
 
189
226
  // class or struct implementing `FormatSerialize(FormatSerializer& FormatSerializer)`;
190
227
  template <typename T>
191
228
  typename std::enable_if<has_serialize<T>::value>::type WriteValue(const T &value) {
192
- // Else, we defer to the .FormatSerialize method
193
229
  OnObjectBegin();
194
230
  value.FormatSerialize(*this);
195
231
  OnObjectEnd();
196
232
  }
197
233
 
198
- // Handle setting a "tag" (optional)
199
- virtual void SetTag(const field_id_t field_id, const char *tag) {
200
- (void)field_id;
201
- (void)tag;
202
- }
203
-
234
+ protected:
204
235
  // Hooks for subclasses to override to implement custom behavior
205
- virtual void OnListBegin(idx_t count) {
206
- (void)count;
207
- }
208
- virtual void OnListEnd(idx_t count) {
209
- (void)count;
210
- }
211
- virtual void OnMapBegin(idx_t count) {
212
- (void)count;
213
- }
214
- virtual void OnMapEnd(idx_t count) {
215
- (void)count;
216
- }
217
- virtual void OnMapEntryBegin() {
218
- }
219
- virtual void OnMapEntryEnd() {
220
- }
221
- virtual void OnMapKeyBegin() {
222
- }
223
- virtual void OnMapKeyEnd() {
224
- }
225
- virtual void OnMapValueBegin() {
226
- }
227
- virtual void OnMapValueEnd() {
228
- }
229
- virtual void OnOptionalBegin(bool present) {
230
- }
231
- virtual void OnOptionalEnd(bool present) {
232
- }
233
- virtual void OnObjectBegin() {
234
- }
235
- virtual void OnObjectEnd() {
236
- }
237
- virtual void OnPairBegin() {
238
- }
239
- virtual void OnPairKeyBegin() {
240
- }
241
- virtual void OnPairKeyEnd() {
242
- }
243
- virtual void OnPairValueBegin() {
244
- }
245
- virtual void OnPairValueEnd() {
246
- }
247
- virtual void OnPairEnd() {
248
- }
236
+ virtual void OnPropertyBegin(const field_id_t field_id, const char *tag) = 0;
237
+ virtual void OnPropertyEnd() = 0;
238
+ virtual void OnOptionalPropertyBegin(const field_id_t field_id, const char *tag, bool present) = 0;
239
+ virtual void OnOptionalPropertyEnd(bool present) = 0;
240
+ virtual void OnObjectBegin() = 0;
241
+ virtual void OnObjectEnd() = 0;
242
+ virtual void OnListBegin(idx_t count) = 0;
243
+ virtual void OnListEnd() = 0;
244
+ virtual void OnNullableBegin(bool present) = 0;
245
+ virtual void OnNullableEnd() = 0;
249
246
 
250
247
  // Handle primitive types, a serializer needs to implement these.
251
248
  virtual void WriteNull() = 0;
249
+ virtual void WriteValue(char value) {
250
+ throw NotImplementedException("Write char value not implemented");
251
+ }
252
252
  virtual void WriteValue(bool value) = 0;
253
253
  virtual void WriteValue(uint8_t value) = 0;
254
254
  virtual void WriteValue(int8_t value) = 0;
@@ -264,7 +264,6 @@ protected:
264
264
  virtual void WriteValue(const string_t value) = 0;
265
265
  virtual void WriteValue(const string &value) = 0;
266
266
  virtual void WriteValue(const char *str) = 0;
267
- virtual void WriteValue(interval_t value) = 0;
268
267
  virtual void WriteDataPtr(const_data_ptr_t ptr, idx_t count) = 0;
269
268
  void WriteValue(LogicalIndex value) {
270
269
  WriteValue(value.index);
@@ -278,4 +277,17 @@ protected:
278
277
  template <>
279
278
  void FormatSerializer::WriteValue(const vector<bool> &vec);
280
279
 
280
+ // List Impl
281
+ template <class FUNC>
282
+ void FormatSerializer::List::WriteObject(FUNC f) {
283
+ serializer.OnObjectBegin();
284
+ f(serializer);
285
+ serializer.OnObjectEnd();
286
+ }
287
+
288
+ template <class T>
289
+ void FormatSerializer::List::WriteElement(const T &value) {
290
+ serializer.WriteValue(value);
291
+ }
292
+
281
293
  } // namespace duckdb
@@ -12,7 +12,8 @@ namespace duckdb {
12
12
  class FormatSerializer; // Forward declare
13
13
  class FormatDeserializer; // Forward declare
14
14
 
15
- typedef uint32_t field_id_t;
15
+ typedef uint16_t field_id_t;
16
+ const field_id_t MESSAGE_TERMINATOR_FIELD_ID = 0xFFFF;
16
17
 
17
18
  // Backport to c++11
18
19
  template <class...>
@@ -1,3 +1,11 @@
1
+ //===----------------------------------------------------------------------===//
2
+ // DuckDB
3
+ //
4
+ // duckdb/common/shared_ptr.hpp
5
+ //
6
+ //
7
+ //===----------------------------------------------------------------------===//
8
+
1
9
  #pragma once
2
10
 
3
11
  #include <memory>
@@ -31,6 +31,7 @@ public:
31
31
 
32
32
  GlobalSortStatePtr global_sort;
33
33
  atomic<idx_t> count;
34
+ idx_t batch_base;
34
35
 
35
36
  // Mask computation
36
37
  SortLayout partition_layout;
@@ -59,9 +60,6 @@ public:
59
60
  void UpdateLocalPartition(GroupingPartition &local_partition, GroupingAppend &local_append);
60
61
  void CombineLocalPartition(GroupingPartition &local_partition, GroupingAppend &local_append);
61
62
 
62
- void BuildSortState(TupleDataCollection &group_data, GlobalSortState &global_sort) const;
63
- void BuildSortState(TupleDataCollection &group_data, PartitionGlobalHashGroup &global_sort);
64
-
65
63
  ClientContext &context;
66
64
  BufferManager &buffer_manager;
67
65
  Allocator &allocator;
@@ -128,7 +126,7 @@ public:
128
126
  void Combine();
129
127
  };
130
128
 
131
- enum class PartitionSortStage : uint8_t { INIT, PREPARE, MERGE, SORTED };
129
+ enum class PartitionSortStage : uint8_t { INIT, SCAN, PREPARE, MERGE, SORTED };
132
130
 
133
131
  class PartitionLocalMergeState;
134
132
 
@@ -150,7 +148,11 @@ public:
150
148
  PartitionGlobalSinkState &sink;
151
149
  GroupDataPtr group_data;
152
150
  PartitionGlobalHashGroup *hash_group;
151
+ vector<column_t> column_ids;
152
+ TupleDataParallelScanState chunk_state;
153
153
  GlobalSortState *global_sort;
154
+ const idx_t memory_per_thread;
155
+ const idx_t num_threads;
154
156
 
155
157
  private:
156
158
  mutable mutex lock;
@@ -162,15 +164,14 @@ private:
162
164
 
163
165
  class PartitionLocalMergeState {
164
166
  public:
165
- PartitionLocalMergeState() : merge_state(nullptr), stage(PartitionSortStage::INIT) {
166
- finished = true;
167
- }
167
+ explicit PartitionLocalMergeState(PartitionGlobalSinkState &gstate);
168
168
 
169
169
  bool TaskFinished() {
170
170
  return finished;
171
171
  }
172
172
 
173
173
  void Prepare();
174
+ void Scan();
174
175
  void Merge();
175
176
 
176
177
  void ExecuteTask();
@@ -178,6 +179,11 @@ public:
178
179
  PartitionGlobalMergeState *merge_state;
179
180
  PartitionSortStage stage;
180
181
  atomic<bool> finished;
182
+
183
+ // Sorting buffers
184
+ ExpressionExecutor executor;
185
+ DataChunk sort_chunk;
186
+ DataChunk payload_chunk;
181
187
  };
182
188
 
183
189
  class PartitionGlobalMergeStates {
@@ -19,6 +19,8 @@ class Allocator;
19
19
  class ClientContext;
20
20
  class ExecutionContext;
21
21
  class VectorCache;
22
+ class FormatSerializer;
23
+ class FormatDeserializer;
22
24
 
23
25
  //! A Data Chunk represents a set of vectors.
24
26
  /*!
@@ -141,6 +143,9 @@ public:
141
143
  //! Deserializes a blob back into a DataChunk
142
144
  DUCKDB_API void Deserialize(Deserializer &source);
143
145
 
146
+ DUCKDB_API void FormatSerialize(FormatSerializer &serializer) const;
147
+ DUCKDB_API void FormatDeserialize(FormatDeserializer &source);
148
+
144
149
  //! Hashes the DataChunk to the target vector
145
150
  DUCKDB_API void Hash(Vector &result);
146
151
  //! Hashes specific vectors of the DataChunk to the target vector
@@ -18,11 +18,14 @@ struct robj;
18
18
 
19
19
  namespace duckdb {
20
20
 
21
- enum class HLLStorageType { UNCOMPRESSED = 1 };
21
+ enum class HLLStorageType : uint8_t { UNCOMPRESSED = 1 };
22
22
 
23
23
  class FieldWriter;
24
24
  class FieldReader;
25
25
 
26
+ class FormatSerializer;
27
+ class FormatDeserializer;
28
+
26
29
  //! The HyperLogLog class holds a HyperLogLog counter for approximate cardinality counting
27
30
  class HyperLogLog {
28
31
  public:
@@ -50,6 +53,9 @@ public:
50
53
  void Serialize(FieldWriter &writer) const;
51
54
  static unique_ptr<HyperLogLog> Deserialize(FieldReader &reader);
52
55
 
56
+ void FormatSerialize(FormatSerializer &serializer) const;
57
+ static unique_ptr<HyperLogLog> FormatDeserialize(FormatDeserializer &deserializer);
58
+
53
59
  public:
54
60
  //! Compute HLL hashes over vdata, and store them in 'hashes'
55
61
  //! Then, compute register indices and prefix lengths, and also store them in 'hashes' as a pair of uint32_t
@@ -16,6 +16,9 @@ struct dtime_t;
16
16
  struct date_t;
17
17
  struct timestamp_t;
18
18
 
19
+ class FormatSerializer;
20
+ class FormatDeserializer;
21
+
19
22
  struct interval_t {
20
23
  int32_t months;
21
24
  int32_t days;
@@ -24,6 +27,10 @@ struct interval_t {
24
27
  inline bool operator==(const interval_t &rhs) const {
25
28
  return this->days == rhs.days && this->months == rhs.months && this->micros == rhs.micros;
26
29
  }
30
+
31
+ // Serialization
32
+ void FormatSerialize(FormatSerializer &serializer) const;
33
+ static interval_t FormatDeserialize(FormatDeserializer &source);
27
34
  };
28
35
 
29
36
  //! The Interval class is a static class that holds helper functions for the Interval
@@ -8,6 +8,7 @@
8
8
 
9
9
  #pragma once
10
10
 
11
+ #include "duckdb/common/fixed_size_map.hpp"
11
12
  #include "duckdb/common/perfect_map_set.hpp"
12
13
  #include "duckdb/common/types/row/tuple_data_allocator.hpp"
13
14
  #include "duckdb/common/types/row/tuple_data_collection.hpp"
@@ -23,10 +24,11 @@ public:
23
24
  public:
24
25
  Vector partition_indices;
25
26
  SelectionVector partition_sel;
27
+ SelectionVector reverse_partition_sel;
26
28
 
27
- static constexpr idx_t MAP_THRESHOLD = 32;
29
+ static constexpr idx_t MAP_THRESHOLD = 256;
28
30
  perfect_map_t<list_entry_t> partition_entries;
29
- list_entry_t partition_entries_arr[MAP_THRESHOLD];
31
+ fixed_size_map_t<list_entry_t> fixed_partition_entries;
30
32
 
31
33
  vector<unique_ptr<TupleDataPinState>> partition_pin_states;
32
34
  TupleDataChunkState chunk_state;
@@ -51,30 +53,48 @@ public:
51
53
  virtual ~PartitionedTupleData();
52
54
 
53
55
  public:
56
+ //! Get the layout of this PartitionedTupleData
57
+ const TupleDataLayout &GetLayout() const;
54
58
  //! Get the partitioning type of this PartitionedTupleData
55
59
  PartitionedTupleDataType GetType() const;
56
60
  //! Initializes a local state for parallel partitioning that can be merged into this PartitionedTupleData
57
61
  void InitializeAppendState(PartitionedTupleDataAppendState &state,
58
62
  TupleDataPinProperties properties = TupleDataPinProperties::UNPIN_AFTER_DONE) const;
59
63
  //! Appends a DataChunk to this PartitionedTupleData
60
- void Append(PartitionedTupleDataAppendState &state, DataChunk &input);
64
+ void Append(PartitionedTupleDataAppendState &state, DataChunk &input,
65
+ const SelectionVector &append_sel = *FlatVector::IncrementalSelectionVector(),
66
+ const idx_t append_count = DConstants::INVALID_INDEX);
67
+ //! Appends a DataChunk to this PartitionedTupleData
68
+ //! - ToUnifiedFormat has already been called
69
+ void AppendUnified(PartitionedTupleDataAppendState &state, DataChunk &input,
70
+ const SelectionVector &append_sel = *FlatVector::IncrementalSelectionVector(),
71
+ const idx_t append_count = DConstants::INVALID_INDEX);
61
72
  //! Appends rows to this PartitionedTupleData
62
- void Append(PartitionedTupleDataAppendState &state, TupleDataChunkState &input, idx_t count);
73
+ void Append(PartitionedTupleDataAppendState &state, TupleDataChunkState &input, const idx_t count);
63
74
  //! Flushes any remaining data in the append state into this PartitionedTupleData
64
75
  void FlushAppendState(PartitionedTupleDataAppendState &state);
65
76
  //! Combine another PartitionedTupleData into this PartitionedTupleData
66
77
  void Combine(PartitionedTupleData &other);
67
- //! Partition a TupleDataCollection
68
- void Partition(TupleDataCollection &source,
69
- TupleDataPinProperties properties = TupleDataPinProperties::UNPIN_AFTER_DONE);
78
+ //! Resets this PartitionedTupleData
79
+ void Reset();
70
80
  //! Repartition this PartitionedTupleData into the new PartitionedTupleData
71
81
  void Repartition(PartitionedTupleData &new_partitioned_data);
82
+ //! Unpins the data
83
+ void Unpin();
72
84
  //! Get the partitions in this PartitionedTupleData
73
85
  vector<unique_ptr<TupleDataCollection>> &GetPartitions();
86
+ //! Get the data of this PartitionedTupleData as a single unpartitioned TupleDataCollection
87
+ unique_ptr<TupleDataCollection> GetUnpartitioned();
74
88
  //! Get the count of this PartitionedTupleData
75
89
  idx_t Count() const;
76
90
  //! Get the size (in bytes) of this PartitionedTupleData
77
91
  idx_t SizeInBytes() const;
92
+ //! Get the number of partitions of this PartitionedTupleData
93
+ idx_t PartitionCount() const;
94
+ //! Converts this PartitionedTupleData to a string representation
95
+ string ToString();
96
+ //! Prints the string representation of this PartitionedTupleData
97
+ void Print();
78
98
 
79
99
  protected:
80
100
  //===--------------------------------------------------------------------===//
@@ -91,7 +111,7 @@ protected:
91
111
  throw NotImplementedException("ComputePartitionIndices for this type of PartitionedTupleData");
92
112
  }
93
113
  //! Compute partition indices from rows (similar to function above)
94
- virtual void ComputePartitionIndices(Vector &row_locations, idx_t count, Vector &partition_indices) const {
114
+ virtual void ComputePartitionIndices(Vector &row_locations, idx_t append_count, Vector &partition_indices) const {
95
115
  throw NotImplementedException("ComputePartitionIndices for this type of PartitionedTupleData");
96
116
  }
97
117
  //! Maximum partition index (optional)
@@ -116,11 +136,19 @@ protected:
116
136
 
117
137
  //! Create a new shared allocator
118
138
  void CreateAllocator();
139
+ //! Whether to use fixed size map or regular map
140
+ bool UseFixedSizeMap() const;
119
141
  //! Builds a selection vector in the Append state for the partitions
120
142
  //! - returns true if everything belongs to the same partition - stores partition index in single_partition_idx
121
- void BuildPartitionSel(PartitionedTupleDataAppendState &state, idx_t count);
143
+ void BuildPartitionSel(PartitionedTupleDataAppendState &state, const SelectionVector &append_sel,
144
+ const idx_t append_count);
145
+ template <class MAP_TYPE, class GETTER>
146
+ void BuildPartitionSel(PartitionedTupleDataAppendState &state, MAP_TYPE &partition_entries,
147
+ const SelectionVector &append_sel, const idx_t append_count);
122
148
  //! Builds out the buffer space in the partitions
123
149
  void BuildBufferSpace(PartitionedTupleDataAppendState &state);
150
+ template <class MAP_TYPE, class GETTER>
151
+ void BuildBufferSpace(PartitionedTupleDataAppendState &state, const MAP_TYPE &partition_entries);
124
152
  //! Create a collection for a specific partition
125
153
  unique_ptr<TupleDataCollection> CreatePartitionCollection(idx_t partition_index) const {
126
154
  if (allocators) {
@@ -129,11 +157,15 @@ protected:
129
157
  return make_uniq<TupleDataCollection>(buffer_manager, layout);
130
158
  }
131
159
  }
160
+ //! Verify count/data size of this PartitionedTupleData
161
+ void Verify() const;
132
162
 
133
163
  protected:
134
164
  PartitionedTupleDataType type;
135
165
  BufferManager &buffer_manager;
136
166
  const TupleDataLayout layout;
167
+ idx_t count;
168
+ idx_t data_size;
137
169
 
138
170
  mutex lock;
139
171
  shared_ptr<PartitionTupleDataAllocators> allocators;
@@ -74,6 +74,11 @@ public:
74
74
  return total_count - total_scanned;
75
75
  }
76
76
 
77
+ //! The current block index of the read state
78
+ inline idx_t BlockIndex() const {
79
+ return read_state.block_idx;
80
+ }
81
+
77
82
  //! Swizzle the blocks for external scanning
78
83
  //! Swizzling is all or nothing, so if we have scanned previously,
79
84
  //! we need to re-swizzle.