duckdb 0.7.2-dev225.0 → 0.7.2-dev314.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (131) hide show
  1. package/package.json +1 -1
  2. package/src/duckdb/extension/parquet/column_reader.cpp +5 -6
  3. package/src/duckdb/extension/parquet/include/column_reader.hpp +1 -2
  4. package/src/duckdb/extension/parquet/include/generated_column_reader.hpp +1 -11
  5. package/src/duckdb/extension/parquet/parquet_statistics.cpp +26 -32
  6. package/src/duckdb/src/common/sort/sort_state.cpp +5 -7
  7. package/src/duckdb/src/execution/column_binding_resolver.cpp +6 -0
  8. package/src/duckdb/src/execution/operator/aggregate/physical_perfecthash_aggregate.cpp +4 -5
  9. package/src/duckdb/src/execution/operator/aggregate/physical_window.cpp +1 -1
  10. package/src/duckdb/src/execution/operator/helper/physical_vacuum.cpp +2 -3
  11. package/src/duckdb/src/execution/operator/join/physical_blockwise_nl_join.cpp +32 -6
  12. package/src/duckdb/src/execution/physical_plan/plan_aggregate.cpp +15 -15
  13. package/src/duckdb/src/execution/physical_plan/plan_comparison_join.cpp +18 -12
  14. package/src/duckdb/src/function/aggregate/distributive/bitstring_agg.cpp +6 -13
  15. package/src/duckdb/src/function/aggregate/distributive/count.cpp +2 -4
  16. package/src/duckdb/src/function/aggregate/distributive/sum.cpp +11 -13
  17. package/src/duckdb/src/function/scalar/date/date_diff.cpp +0 -1
  18. package/src/duckdb/src/function/scalar/date/date_part.cpp +17 -25
  19. package/src/duckdb/src/function/scalar/date/date_sub.cpp +0 -1
  20. package/src/duckdb/src/function/scalar/date/date_trunc.cpp +10 -14
  21. package/src/duckdb/src/function/scalar/generic/stats.cpp +2 -4
  22. package/src/duckdb/src/function/scalar/list/flatten.cpp +5 -12
  23. package/src/duckdb/src/function/scalar/list/list_concat.cpp +3 -8
  24. package/src/duckdb/src/function/scalar/list/list_extract.cpp +5 -12
  25. package/src/duckdb/src/function/scalar/list/list_value.cpp +5 -9
  26. package/src/duckdb/src/function/scalar/math/numeric.cpp +14 -17
  27. package/src/duckdb/src/function/scalar/operators/arithmetic.cpp +27 -34
  28. package/src/duckdb/src/function/scalar/string/caseconvert.cpp +2 -6
  29. package/src/duckdb/src/function/scalar/string/instr.cpp +2 -6
  30. package/src/duckdb/src/function/scalar/string/length.cpp +2 -6
  31. package/src/duckdb/src/function/scalar/string/like.cpp +2 -6
  32. package/src/duckdb/src/function/scalar/string/substring.cpp +2 -6
  33. package/src/duckdb/src/function/scalar/struct/struct_extract.cpp +4 -9
  34. package/src/duckdb/src/function/scalar/struct/struct_insert.cpp +10 -13
  35. package/src/duckdb/src/function/scalar/struct/struct_pack.cpp +5 -6
  36. package/src/duckdb/src/function/table/version/pragma_version.cpp +2 -2
  37. package/src/duckdb/src/include/duckdb/execution/operator/aggregate/physical_perfecthash_aggregate.hpp +1 -1
  38. package/src/duckdb/src/include/duckdb/function/aggregate_function.hpp +12 -3
  39. package/src/duckdb/src/include/duckdb/function/scalar_function.hpp +2 -2
  40. package/src/duckdb/src/include/duckdb/planner/bind_context.hpp +2 -0
  41. package/src/duckdb/src/include/duckdb/storage/checkpoint/table_data_writer.hpp +3 -2
  42. package/src/duckdb/src/include/duckdb/storage/compression/chimp/chimp_compress.hpp +2 -2
  43. package/src/duckdb/src/include/duckdb/storage/compression/chimp/chimp_fetch.hpp +1 -1
  44. package/src/duckdb/src/include/duckdb/storage/compression/chimp/chimp_scan.hpp +1 -1
  45. package/src/duckdb/src/include/duckdb/storage/compression/patas/patas_compress.hpp +2 -2
  46. package/src/duckdb/src/include/duckdb/storage/compression/patas/patas_fetch.hpp +1 -1
  47. package/src/duckdb/src/include/duckdb/storage/compression/patas/patas_scan.hpp +1 -1
  48. package/src/duckdb/src/include/duckdb/storage/data_pointer.hpp +5 -2
  49. package/src/duckdb/src/include/duckdb/storage/data_table.hpp +1 -1
  50. package/src/duckdb/src/include/duckdb/storage/statistics/base_statistics.hpp +93 -31
  51. package/src/duckdb/src/include/duckdb/storage/statistics/column_statistics.hpp +22 -3
  52. package/src/duckdb/src/include/duckdb/storage/statistics/distinct_statistics.hpp +6 -6
  53. package/src/duckdb/src/include/duckdb/storage/statistics/list_stats.hpp +41 -0
  54. package/src/duckdb/src/include/duckdb/storage/statistics/numeric_stats.hpp +157 -0
  55. package/src/duckdb/src/include/duckdb/storage/statistics/segment_statistics.hpp +2 -7
  56. package/src/duckdb/src/include/duckdb/storage/statistics/string_stats.hpp +74 -0
  57. package/src/duckdb/src/include/duckdb/storage/statistics/struct_stats.hpp +42 -0
  58. package/src/duckdb/src/include/duckdb/storage/string_uncompressed.hpp +2 -3
  59. package/src/duckdb/src/include/duckdb/storage/table/column_segment.hpp +2 -2
  60. package/src/duckdb/src/include/duckdb/storage/table/persistent_table_data.hpp +2 -1
  61. package/src/duckdb/src/include/duckdb/storage/table/row_group.hpp +4 -3
  62. package/src/duckdb/src/include/duckdb/storage/table/row_group_collection.hpp +3 -2
  63. package/src/duckdb/src/include/duckdb/storage/table/table_statistics.hpp +5 -0
  64. package/src/duckdb/src/main/config.cpp +66 -1
  65. package/src/duckdb/src/optimizer/join_order/cardinality_estimator.cpp +0 -1
  66. package/src/duckdb/src/optimizer/statistics/expression/propagate_aggregate.cpp +9 -3
  67. package/src/duckdb/src/optimizer/statistics/expression/propagate_and_compress.cpp +6 -7
  68. package/src/duckdb/src/optimizer/statistics/expression/propagate_cast.cpp +14 -11
  69. package/src/duckdb/src/optimizer/statistics/expression/propagate_columnref.cpp +1 -1
  70. package/src/duckdb/src/optimizer/statistics/expression/propagate_comparison.cpp +13 -15
  71. package/src/duckdb/src/optimizer/statistics/expression/propagate_conjunction.cpp +0 -1
  72. package/src/duckdb/src/optimizer/statistics/expression/propagate_constant.cpp +3 -75
  73. package/src/duckdb/src/optimizer/statistics/expression/propagate_function.cpp +7 -2
  74. package/src/duckdb/src/optimizer/statistics/expression/propagate_operator.cpp +10 -0
  75. package/src/duckdb/src/optimizer/statistics/operator/propagate_aggregate.cpp +2 -3
  76. package/src/duckdb/src/optimizer/statistics/operator/propagate_filter.cpp +28 -31
  77. package/src/duckdb/src/optimizer/statistics/operator/propagate_join.cpp +4 -5
  78. package/src/duckdb/src/optimizer/statistics/operator/propagate_set_operation.cpp +3 -3
  79. package/src/duckdb/src/optimizer/statistics_propagator.cpp +1 -1
  80. package/src/duckdb/src/parser/transform/tableref/transform_join.cpp +4 -0
  81. package/src/duckdb/src/planner/bind_context.cpp +16 -0
  82. package/src/duckdb/src/planner/binder/query_node/plan_select_node.cpp +0 -1
  83. package/src/duckdb/src/planner/binder/tableref/bind_joinref.cpp +9 -0
  84. package/src/duckdb/src/planner/binder.cpp +2 -1
  85. package/src/duckdb/src/planner/bound_result_modifier.cpp +1 -1
  86. package/src/duckdb/src/planner/expression/bound_window_expression.cpp +1 -1
  87. package/src/duckdb/src/planner/filter/constant_filter.cpp +4 -6
  88. package/src/duckdb/src/storage/checkpoint/row_group_writer.cpp +1 -1
  89. package/src/duckdb/src/storage/checkpoint/table_data_reader.cpp +1 -4
  90. package/src/duckdb/src/storage/checkpoint/table_data_writer.cpp +4 -4
  91. package/src/duckdb/src/storage/compression/bitpacking.cpp +3 -3
  92. package/src/duckdb/src/storage/compression/fixed_size_uncompressed.cpp +3 -3
  93. package/src/duckdb/src/storage/compression/numeric_constant.cpp +9 -10
  94. package/src/duckdb/src/storage/compression/patas.cpp +1 -1
  95. package/src/duckdb/src/storage/compression/rle.cpp +2 -2
  96. package/src/duckdb/src/storage/compression/validity_uncompressed.cpp +5 -5
  97. package/src/duckdb/src/storage/data_table.cpp +4 -6
  98. package/src/duckdb/src/storage/statistics/base_statistics.cpp +373 -128
  99. package/src/duckdb/src/storage/statistics/column_statistics.cpp +58 -3
  100. package/src/duckdb/src/storage/statistics/distinct_statistics.cpp +4 -9
  101. package/src/duckdb/src/storage/statistics/list_stats.cpp +117 -0
  102. package/src/duckdb/src/storage/statistics/numeric_stats.cpp +529 -0
  103. package/src/duckdb/src/storage/statistics/segment_statistics.cpp +2 -11
  104. package/src/duckdb/src/storage/statistics/string_stats.cpp +273 -0
  105. package/src/duckdb/src/storage/statistics/struct_stats.cpp +131 -0
  106. package/src/duckdb/src/storage/storage_info.cpp +1 -1
  107. package/src/duckdb/src/storage/table/column_checkpoint_state.cpp +3 -4
  108. package/src/duckdb/src/storage/table/column_data.cpp +16 -14
  109. package/src/duckdb/src/storage/table/column_data_checkpointer.cpp +2 -3
  110. package/src/duckdb/src/storage/table/column_segment.cpp +6 -8
  111. package/src/duckdb/src/storage/table/list_column_data.cpp +7 -11
  112. package/src/duckdb/src/storage/table/row_group.cpp +24 -23
  113. package/src/duckdb/src/storage/table/row_group_collection.cpp +12 -12
  114. package/src/duckdb/src/storage/table/standard_column_data.cpp +6 -6
  115. package/src/duckdb/src/storage/table/struct_column_data.cpp +15 -16
  116. package/src/duckdb/src/storage/table/table_statistics.cpp +27 -7
  117. package/src/duckdb/src/storage/table/update_segment.cpp +10 -12
  118. package/src/duckdb/third_party/libpg_query/include/parser/gram.hpp +923 -919
  119. package/src/duckdb/third_party/libpg_query/include/parser/kwlist.hpp +2 -0
  120. package/src/duckdb/third_party/libpg_query/src_backend_parser_gram.cpp +15684 -15571
  121. package/src/duckdb/ub_src_storage_statistics.cpp +4 -6
  122. package/src/duckdb/src/include/duckdb/storage/statistics/list_statistics.hpp +0 -36
  123. package/src/duckdb/src/include/duckdb/storage/statistics/numeric_statistics.hpp +0 -75
  124. package/src/duckdb/src/include/duckdb/storage/statistics/string_statistics.hpp +0 -49
  125. package/src/duckdb/src/include/duckdb/storage/statistics/struct_statistics.hpp +0 -36
  126. package/src/duckdb/src/include/duckdb/storage/statistics/validity_statistics.hpp +0 -45
  127. package/src/duckdb/src/storage/statistics/list_statistics.cpp +0 -94
  128. package/src/duckdb/src/storage/statistics/numeric_statistics.cpp +0 -307
  129. package/src/duckdb/src/storage/statistics/string_statistics.cpp +0 -220
  130. package/src/duckdb/src/storage/statistics/struct_statistics.cpp +0 -108
  131. package/src/duckdb/src/storage/statistics/validity_statistics.cpp +0 -91
@@ -0,0 +1,273 @@
1
+ #include "duckdb/storage/statistics/string_stats.hpp"
2
+ #include "duckdb/storage/statistics/base_statistics.hpp"
3
+ #include "duckdb/common/field_writer.hpp"
4
+ #include "utf8proc_wrapper.hpp"
5
+ #include "duckdb/common/string_util.hpp"
6
+ #include "duckdb/common/types/vector.hpp"
7
+ #include "duckdb/main/error_manager.hpp"
8
+
9
+ namespace duckdb {
10
+
11
+ BaseStatistics StringStats::CreateUnknown(LogicalType type) {
12
+ BaseStatistics result(std::move(type));
13
+ result.InitializeUnknown();
14
+ auto &string_data = StringStats::GetDataUnsafe(result);
15
+ for (idx_t i = 0; i < StringStatsData::MAX_STRING_MINMAX_SIZE; i++) {
16
+ string_data.min[i] = 0;
17
+ string_data.max[i] = 0xFF;
18
+ }
19
+ string_data.max_string_length = 0;
20
+ string_data.has_max_string_length = false;
21
+ string_data.has_unicode = true;
22
+ return result;
23
+ }
24
+
25
+ BaseStatistics StringStats::CreateEmpty(LogicalType type) {
26
+ BaseStatistics result(std::move(type));
27
+ result.InitializeEmpty();
28
+ auto &string_data = StringStats::GetDataUnsafe(result);
29
+ for (idx_t i = 0; i < StringStatsData::MAX_STRING_MINMAX_SIZE; i++) {
30
+ string_data.min[i] = 0xFF;
31
+ string_data.max[i] = 0;
32
+ }
33
+ string_data.max_string_length = 0;
34
+ string_data.has_max_string_length = true;
35
+ string_data.has_unicode = false;
36
+ return result;
37
+ }
38
+
39
+ StringStatsData &StringStats::GetDataUnsafe(BaseStatistics &stats) {
40
+ D_ASSERT(stats.GetStatsType() == StatisticsType::STRING_STATS);
41
+ return stats.stats_union.string_data;
42
+ }
43
+
44
+ const StringStatsData &StringStats::GetDataUnsafe(const BaseStatistics &stats) {
45
+ D_ASSERT(stats.GetStatsType() == StatisticsType::STRING_STATS);
46
+ return stats.stats_union.string_data;
47
+ }
48
+
49
+ bool StringStats::HasMaxStringLength(const BaseStatistics &stats) {
50
+ if (stats.GetType().id() == LogicalTypeId::SQLNULL) {
51
+ return false;
52
+ }
53
+ return StringStats::GetDataUnsafe(stats).has_max_string_length;
54
+ }
55
+
56
+ uint32_t StringStats::MaxStringLength(const BaseStatistics &stats) {
57
+ if (!HasMaxStringLength(stats)) {
58
+ throw InternalException("MaxStringLength called on statistics that does not have a max string length");
59
+ }
60
+ return StringStats::GetDataUnsafe(stats).max_string_length;
61
+ }
62
+
63
+ bool StringStats::CanContainUnicode(const BaseStatistics &stats) {
64
+ if (stats.GetType().id() == LogicalTypeId::SQLNULL) {
65
+ return true;
66
+ }
67
+ return StringStats::GetDataUnsafe(stats).has_unicode;
68
+ }
69
+
70
+ void StringStats::ResetMaxStringLength(BaseStatistics &stats) {
71
+ StringStats::GetDataUnsafe(stats).has_max_string_length = false;
72
+ }
73
+
74
+ void StringStats::SetContainsUnicode(BaseStatistics &stats) {
75
+ StringStats::GetDataUnsafe(stats).has_unicode = true;
76
+ }
77
+
78
+ void StringStats::Serialize(const BaseStatistics &stats, FieldWriter &writer) {
79
+ auto &string_data = StringStats::GetDataUnsafe(stats);
80
+ writer.WriteBlob(string_data.min, StringStatsData::MAX_STRING_MINMAX_SIZE);
81
+ writer.WriteBlob(string_data.max, StringStatsData::MAX_STRING_MINMAX_SIZE);
82
+ writer.WriteField<bool>(string_data.has_unicode);
83
+ writer.WriteField<bool>(string_data.has_max_string_length);
84
+ writer.WriteField<uint32_t>(string_data.max_string_length);
85
+ }
86
+
87
+ BaseStatistics StringStats::Deserialize(FieldReader &reader, LogicalType type) {
88
+ BaseStatistics result(std::move(type));
89
+ auto &string_data = StringStats::GetDataUnsafe(result);
90
+ reader.ReadBlob(string_data.min, StringStatsData::MAX_STRING_MINMAX_SIZE);
91
+ reader.ReadBlob(string_data.max, StringStatsData::MAX_STRING_MINMAX_SIZE);
92
+ string_data.has_unicode = reader.ReadRequired<bool>();
93
+ string_data.has_max_string_length = reader.ReadRequired<bool>();
94
+ string_data.max_string_length = reader.ReadRequired<uint32_t>();
95
+ return result;
96
+ }
97
+
98
+ static int StringValueComparison(const_data_ptr_t data, idx_t len, const_data_ptr_t comparison) {
99
+ D_ASSERT(len <= StringStatsData::MAX_STRING_MINMAX_SIZE);
100
+ for (idx_t i = 0; i < len; i++) {
101
+ if (data[i] < comparison[i]) {
102
+ return -1;
103
+ } else if (data[i] > comparison[i]) {
104
+ return 1;
105
+ }
106
+ }
107
+ return 0;
108
+ }
109
+
110
+ static void ConstructValue(const_data_ptr_t data, idx_t size, data_t target[]) {
111
+ idx_t value_size = size > StringStatsData::MAX_STRING_MINMAX_SIZE ? StringStatsData::MAX_STRING_MINMAX_SIZE : size;
112
+ memcpy(target, data, value_size);
113
+ for (idx_t i = value_size; i < StringStatsData::MAX_STRING_MINMAX_SIZE; i++) {
114
+ target[i] = '\0';
115
+ }
116
+ }
117
+
118
+ void StringStats::Update(BaseStatistics &stats, const string_t &value) {
119
+ auto data = (const_data_ptr_t)value.GetDataUnsafe();
120
+ auto size = value.GetSize();
121
+
122
+ //! we can only fit 8 bytes, so we might need to trim our string
123
+ // construct the value
124
+ data_t target[StringStatsData::MAX_STRING_MINMAX_SIZE];
125
+ ConstructValue(data, size, target);
126
+
127
+ // update the min and max
128
+ auto &string_data = StringStats::GetDataUnsafe(stats);
129
+ if (StringValueComparison(target, StringStatsData::MAX_STRING_MINMAX_SIZE, string_data.min) < 0) {
130
+ memcpy(string_data.min, target, StringStatsData::MAX_STRING_MINMAX_SIZE);
131
+ }
132
+ if (StringValueComparison(target, StringStatsData::MAX_STRING_MINMAX_SIZE, string_data.max) > 0) {
133
+ memcpy(string_data.max, target, StringStatsData::MAX_STRING_MINMAX_SIZE);
134
+ }
135
+ if (size > string_data.max_string_length) {
136
+ string_data.max_string_length = size;
137
+ }
138
+ if (stats.GetType().id() == LogicalTypeId::VARCHAR && !string_data.has_unicode) {
139
+ auto unicode = Utf8Proc::Analyze((const char *)data, size);
140
+ if (unicode == UnicodeType::UNICODE) {
141
+ string_data.has_unicode = true;
142
+ } else if (unicode == UnicodeType::INVALID) {
143
+ throw InternalException(
144
+ ErrorManager::InvalidUnicodeError(string((char *)data, size), "segment statistics update"));
145
+ }
146
+ }
147
+ }
148
+
149
+ void StringStats::Merge(BaseStatistics &stats, const BaseStatistics &other) {
150
+ if (other.GetType().id() == LogicalTypeId::VALIDITY) {
151
+ return;
152
+ }
153
+ auto &string_data = StringStats::GetDataUnsafe(stats);
154
+ auto &other_data = StringStats::GetDataUnsafe(other);
155
+ if (StringValueComparison(other_data.min, StringStatsData::MAX_STRING_MINMAX_SIZE, string_data.min) < 0) {
156
+ memcpy(string_data.min, other_data.min, StringStatsData::MAX_STRING_MINMAX_SIZE);
157
+ }
158
+ if (StringValueComparison(other_data.max, StringStatsData::MAX_STRING_MINMAX_SIZE, string_data.max) > 0) {
159
+ memcpy(string_data.max, other_data.max, StringStatsData::MAX_STRING_MINMAX_SIZE);
160
+ }
161
+ string_data.has_unicode = string_data.has_unicode || other_data.has_unicode;
162
+ string_data.has_max_string_length = string_data.has_max_string_length && other_data.has_max_string_length;
163
+ string_data.max_string_length = MaxValue<uint32_t>(string_data.max_string_length, other_data.max_string_length);
164
+ }
165
+
166
+ FilterPropagateResult StringStats::CheckZonemap(const BaseStatistics &stats, ExpressionType comparison_type,
167
+ const string &constant) {
168
+ auto &string_data = StringStats::GetDataUnsafe(stats);
169
+ auto data = (const_data_ptr_t)constant.c_str();
170
+ auto size = constant.size();
171
+
172
+ idx_t value_size = size > StringStatsData::MAX_STRING_MINMAX_SIZE ? StringStatsData::MAX_STRING_MINMAX_SIZE : size;
173
+ int min_comp = StringValueComparison(data, value_size, string_data.min);
174
+ int max_comp = StringValueComparison(data, value_size, string_data.max);
175
+ switch (comparison_type) {
176
+ case ExpressionType::COMPARE_EQUAL:
177
+ if (min_comp >= 0 && max_comp <= 0) {
178
+ return FilterPropagateResult::NO_PRUNING_POSSIBLE;
179
+ } else {
180
+ return FilterPropagateResult::FILTER_ALWAYS_FALSE;
181
+ }
182
+ case ExpressionType::COMPARE_NOTEQUAL:
183
+ if (min_comp < 0 || max_comp > 0) {
184
+ return FilterPropagateResult::FILTER_ALWAYS_TRUE;
185
+ }
186
+ return FilterPropagateResult::NO_PRUNING_POSSIBLE;
187
+ case ExpressionType::COMPARE_GREATERTHANOREQUALTO:
188
+ case ExpressionType::COMPARE_GREATERTHAN:
189
+ if (max_comp <= 0) {
190
+ return FilterPropagateResult::NO_PRUNING_POSSIBLE;
191
+ } else {
192
+ return FilterPropagateResult::FILTER_ALWAYS_FALSE;
193
+ }
194
+ case ExpressionType::COMPARE_LESSTHAN:
195
+ case ExpressionType::COMPARE_LESSTHANOREQUALTO:
196
+ if (min_comp >= 0) {
197
+ return FilterPropagateResult::NO_PRUNING_POSSIBLE;
198
+ } else {
199
+ return FilterPropagateResult::FILTER_ALWAYS_FALSE;
200
+ }
201
+ default:
202
+ throw InternalException("Expression type not implemented for string statistics zone map");
203
+ }
204
+ }
205
+
206
+ static idx_t GetValidMinMaxSubstring(const_data_ptr_t data) {
207
+ for (idx_t i = 0; i < StringStatsData::MAX_STRING_MINMAX_SIZE; i++) {
208
+ if (data[i] == '\0') {
209
+ return i;
210
+ }
211
+ if ((data[i] & 0x80) != 0) {
212
+ return i;
213
+ }
214
+ }
215
+ return StringStatsData::MAX_STRING_MINMAX_SIZE;
216
+ }
217
+
218
+ string StringStats::ToString(const BaseStatistics &stats) {
219
+ auto &string_data = StringStats::GetDataUnsafe(stats);
220
+ idx_t min_len = GetValidMinMaxSubstring(string_data.min);
221
+ idx_t max_len = GetValidMinMaxSubstring(string_data.max);
222
+ return StringUtil::Format(
223
+ "[Min: %s, Max: %s, Has Unicode: %s, Max String Length: %s]", string((const char *)string_data.min, min_len),
224
+ string((const char *)string_data.max, max_len), string_data.has_unicode ? "true" : "false",
225
+ string_data.has_max_string_length ? to_string(string_data.max_string_length) : "?");
226
+ }
227
+
228
+ void StringStats::Verify(const BaseStatistics &stats, Vector &vector, const SelectionVector &sel, idx_t count) {
229
+ auto &string_data = StringStats::GetDataUnsafe(stats);
230
+
231
+ UnifiedVectorFormat vdata;
232
+ vector.ToUnifiedFormat(count, vdata);
233
+ auto data = (string_t *)vdata.data;
234
+ for (idx_t i = 0; i < count; i++) {
235
+ auto idx = sel.get_index(i);
236
+ auto index = vdata.sel->get_index(idx);
237
+ if (!vdata.validity.RowIsValid(index)) {
238
+ continue;
239
+ }
240
+ auto value = data[index];
241
+ auto data = value.GetDataUnsafe();
242
+ auto len = value.GetSize();
243
+ // LCOV_EXCL_START
244
+ if (string_data.has_max_string_length && len > string_data.max_string_length) {
245
+ throw InternalException(
246
+ "Statistics mismatch: string value exceeds maximum string length.\nStatistics: %s\nVector: %s",
247
+ stats.ToString(), vector.ToString(count));
248
+ }
249
+ if (stats.GetType().id() == LogicalTypeId::VARCHAR && !string_data.has_unicode) {
250
+ auto unicode = Utf8Proc::Analyze(data, len);
251
+ if (unicode == UnicodeType::UNICODE) {
252
+ throw InternalException("Statistics mismatch: string value contains unicode, but statistics says it "
253
+ "shouldn't.\nStatistics: %s\nVector: %s",
254
+ stats.ToString(), vector.ToString(count));
255
+ } else if (unicode == UnicodeType::INVALID) {
256
+ throw InternalException("Invalid unicode detected in vector: %s", vector.ToString(count));
257
+ }
258
+ }
259
+ if (StringValueComparison((const_data_ptr_t)data, MinValue<idx_t>(len, StringStatsData::MAX_STRING_MINMAX_SIZE),
260
+ string_data.min) < 0) {
261
+ throw InternalException("Statistics mismatch: value is smaller than min.\nStatistics: %s\nVector: %s",
262
+ stats.ToString(), vector.ToString(count));
263
+ }
264
+ if (StringValueComparison((const_data_ptr_t)data, MinValue<idx_t>(len, StringStatsData::MAX_STRING_MINMAX_SIZE),
265
+ string_data.max) > 0) {
266
+ throw InternalException("Statistics mismatch: value is bigger than max.\nStatistics: %s\nVector: %s",
267
+ stats.ToString(), vector.ToString(count));
268
+ }
269
+ // LCOV_EXCL_STOP
270
+ }
271
+ }
272
+
273
+ } // namespace duckdb
@@ -0,0 +1,131 @@
1
+ #include "duckdb/storage/statistics/struct_stats.hpp"
2
+ #include "duckdb/storage/statistics/base_statistics.hpp"
3
+ #include "duckdb/common/field_writer.hpp"
4
+ #include "duckdb/common/types/vector.hpp"
5
+
6
+ namespace duckdb {
7
+
8
+ void StructStats::Construct(BaseStatistics &stats) {
9
+ auto &child_types = StructType::GetChildTypes(stats.GetType());
10
+ stats.child_stats = unique_ptr<BaseStatistics[]>(new BaseStatistics[child_types.size()]);
11
+ for (idx_t i = 0; i < child_types.size(); i++) {
12
+ BaseStatistics::Construct(stats.child_stats[i], child_types[i].second);
13
+ }
14
+ }
15
+
16
+ BaseStatistics StructStats::CreateUnknown(LogicalType type) {
17
+ auto &child_types = StructType::GetChildTypes(type);
18
+ BaseStatistics result(std::move(type));
19
+ result.InitializeUnknown();
20
+ for (idx_t i = 0; i < child_types.size(); i++) {
21
+ result.child_stats[i].Copy(BaseStatistics::CreateUnknown(child_types[i].second));
22
+ }
23
+ return result;
24
+ }
25
+
26
+ BaseStatistics StructStats::CreateEmpty(LogicalType type) {
27
+ auto &child_types = StructType::GetChildTypes(type);
28
+ BaseStatistics result(std::move(type));
29
+ result.InitializeEmpty();
30
+ for (idx_t i = 0; i < child_types.size(); i++) {
31
+ result.child_stats[i].Copy(BaseStatistics::CreateEmpty(child_types[i].second));
32
+ }
33
+ return result;
34
+ }
35
+
36
+ const BaseStatistics *StructStats::GetChildStats(const BaseStatistics &stats) {
37
+ D_ASSERT(stats.GetStatsType() == StatisticsType::STRUCT_STATS);
38
+ return stats.child_stats.get();
39
+ }
40
+
41
+ const BaseStatistics &StructStats::GetChildStats(const BaseStatistics &stats, idx_t i) {
42
+ D_ASSERT(stats.GetStatsType() == StatisticsType::STRUCT_STATS);
43
+ if (i >= StructType::GetChildCount(stats.GetType())) {
44
+ throw InternalException("Calling StructStats::GetChildStats but there are no stats for this index");
45
+ }
46
+ return stats.child_stats[i];
47
+ }
48
+
49
+ BaseStatistics &StructStats::GetChildStats(BaseStatistics &stats, idx_t i) {
50
+ D_ASSERT(stats.GetStatsType() == StatisticsType::STRUCT_STATS);
51
+ if (i >= StructType::GetChildCount(stats.GetType())) {
52
+ throw InternalException("Calling StructStats::GetChildStats but there are no stats for this index");
53
+ }
54
+ return stats.child_stats[i];
55
+ }
56
+
57
+ void StructStats::SetChildStats(BaseStatistics &stats, idx_t i, const BaseStatistics &new_stats) {
58
+ D_ASSERT(stats.GetStatsType() == StatisticsType::STRUCT_STATS);
59
+ D_ASSERT(i < StructType::GetChildCount(stats.GetType()));
60
+ stats.child_stats[i].Copy(new_stats);
61
+ }
62
+
63
+ void StructStats::SetChildStats(BaseStatistics &stats, idx_t i, unique_ptr<BaseStatistics> new_stats) {
64
+ D_ASSERT(stats.GetStatsType() == StatisticsType::STRUCT_STATS);
65
+ if (!new_stats) {
66
+ StructStats::SetChildStats(stats, i,
67
+ BaseStatistics::CreateUnknown(StructType::GetChildType(stats.GetType(), i)));
68
+ } else {
69
+ StructStats::SetChildStats(stats, i, *new_stats);
70
+ }
71
+ }
72
+
73
+ void StructStats::Copy(BaseStatistics &stats, const BaseStatistics &other) {
74
+ auto count = StructType::GetChildCount(stats.GetType());
75
+ for (idx_t i = 0; i < count; i++) {
76
+ stats.child_stats[i].Copy(other.child_stats[i]);
77
+ }
78
+ }
79
+
80
+ void StructStats::Merge(BaseStatistics &stats, const BaseStatistics &other) {
81
+ if (other.GetType().id() == LogicalTypeId::VALIDITY) {
82
+ return;
83
+ }
84
+ D_ASSERT(stats.GetType() == other.GetType());
85
+ auto child_count = StructType::GetChildCount(stats.GetType());
86
+ for (idx_t i = 0; i < child_count; i++) {
87
+ stats.child_stats[i].Merge(other.child_stats[i]);
88
+ }
89
+ }
90
+
91
+ void StructStats::Serialize(const BaseStatistics &stats, FieldWriter &writer) {
92
+ auto child_stats = StructStats::GetChildStats(stats);
93
+ auto child_count = StructType::GetChildCount(stats.GetType());
94
+ for (idx_t i = 0; i < child_count; i++) {
95
+ writer.WriteSerializable(child_stats[i]);
96
+ }
97
+ }
98
+
99
+ BaseStatistics StructStats::Deserialize(FieldReader &reader, LogicalType type) {
100
+ D_ASSERT(type.InternalType() == PhysicalType::STRUCT);
101
+ auto &child_types = StructType::GetChildTypes(type);
102
+ BaseStatistics result(std::move(type));
103
+ for (idx_t i = 0; i < child_types.size(); i++) {
104
+ result.child_stats[i].Copy(
105
+ reader.ReadRequiredSerializable<BaseStatistics, BaseStatistics>(child_types[i].second));
106
+ }
107
+ return result;
108
+ }
109
+
110
+ string StructStats::ToString(const BaseStatistics &stats) {
111
+ string result;
112
+ result += " {";
113
+ auto &child_types = StructType::GetChildTypes(stats.GetType());
114
+ for (idx_t i = 0; i < child_types.size(); i++) {
115
+ if (i > 0) {
116
+ result += ", ";
117
+ }
118
+ result += child_types[i].first + ": " + stats.child_stats[i].ToString();
119
+ }
120
+ result += "}";
121
+ return result;
122
+ }
123
+
124
+ void StructStats::Verify(const BaseStatistics &stats, Vector &vector, const SelectionVector &sel, idx_t count) {
125
+ auto &child_entries = StructVector::GetEntries(vector);
126
+ for (idx_t i = 0; i < child_entries.size(); i++) {
127
+ stats.child_stats[i].Verify(*child_entries[i], sel, count);
128
+ }
129
+ }
130
+
131
+ } // namespace duckdb
@@ -2,7 +2,7 @@
2
2
 
3
3
  namespace duckdb {
4
4
 
5
- const uint64_t VERSION_NUMBER = 44;
5
+ const uint64_t VERSION_NUMBER = 45;
6
6
 
7
7
  struct StorageVersionInfo {
8
8
  const char *version_name;
@@ -96,7 +96,7 @@ void ColumnCheckpointState::FlushSegment(unique_ptr<ColumnSegment> segment, idx_
96
96
  } // LCOV_EXCL_STOP
97
97
 
98
98
  // merge the segment stats into the global stats
99
- global_stats->Merge(*segment->stats.statistics);
99
+ global_stats->Merge(segment->stats.statistics);
100
100
 
101
101
  // get the buffer of the segment and pin it
102
102
  auto &db = column_data.GetDatabase();
@@ -104,7 +104,7 @@ void ColumnCheckpointState::FlushSegment(unique_ptr<ColumnSegment> segment, idx_
104
104
  block_id_t block_id = INVALID_BLOCK;
105
105
  uint32_t offset_in_block = 0;
106
106
 
107
- if (!segment->stats.statistics->IsConstant()) {
107
+ if (!segment->stats.statistics.IsConstant()) {
108
108
  // non-constant block
109
109
  PartialBlockAllocation allocation = partial_block_manager.GetBlockAllocation(segment_size);
110
110
  block_id = allocation.state.block_id;
@@ -145,7 +145,7 @@ void ColumnCheckpointState::FlushSegment(unique_ptr<ColumnSegment> segment, idx_
145
145
  }
146
146
 
147
147
  // construct the data pointer
148
- DataPointer data_pointer;
148
+ DataPointer data_pointer(segment->stats.statistics.Copy());
149
149
  data_pointer.block_pointer.block_id = block_id;
150
150
  data_pointer.block_pointer.offset = offset_in_block;
151
151
  data_pointer.row_start = row_group.start;
@@ -155,7 +155,6 @@ void ColumnCheckpointState::FlushSegment(unique_ptr<ColumnSegment> segment, idx_
155
155
  }
156
156
  data_pointer.tuple_count = tuple_count;
157
157
  data_pointer.compression_type = segment->function->type;
158
- data_pointer.statistics = segment->stats.statistics->Copy();
159
158
 
160
159
  // append the segment to the new segment tree
161
160
  new_tree.AppendSegment(std::move(segment));
@@ -254,7 +254,7 @@ void ColumnData::AppendData(BaseStatistics &stats, ColumnAppendState &state, Uni
254
254
  while (true) {
255
255
  // append the data from the vector
256
256
  idx_t copied_elements = state.current->Append(state, vdata, offset, count);
257
- stats.Merge(*state.current->stats.statistics);
257
+ stats.Merge(state.current->stats.statistics);
258
258
  if (copied_elements == count) {
259
259
  // finished copying everything
260
260
  break;
@@ -389,7 +389,7 @@ unique_ptr<ColumnCheckpointState> ColumnData::Checkpoint(RowGroup &row_group,
389
389
  // scan the segments of the column data
390
390
  // set up the checkpoint state
391
391
  auto checkpoint_state = CreateCheckpointState(row_group, partial_block_manager);
392
- checkpoint_state->global_stats = BaseStatistics::CreateEmpty(type, StatisticsType::LOCAL_STATS);
392
+ checkpoint_state->global_stats = BaseStatistics::CreateEmpty(type).ToUnique();
393
393
 
394
394
  auto l = data.Lock();
395
395
  auto nodes = data.MoveSegments(l);
@@ -414,13 +414,19 @@ void ColumnData::DeserializeColumn(Deserializer &source) {
414
414
  idx_t data_pointer_count = source.Read<idx_t>();
415
415
  for (idx_t data_ptr = 0; data_ptr < data_pointer_count; data_ptr++) {
416
416
  // read the data pointer
417
- DataPointer data_pointer;
418
- data_pointer.row_start = source.Read<idx_t>();
419
- data_pointer.tuple_count = source.Read<idx_t>();
420
- data_pointer.block_pointer.block_id = source.Read<block_id_t>();
421
- data_pointer.block_pointer.offset = source.Read<uint32_t>();
422
- data_pointer.compression_type = source.Read<CompressionType>();
423
- data_pointer.statistics = BaseStatistics::Deserialize(source, type);
417
+ auto row_start = source.Read<idx_t>();
418
+ auto tuple_count = source.Read<idx_t>();
419
+ auto block_pointer_block_id = source.Read<block_id_t>();
420
+ auto block_pointer_offset = source.Read<uint32_t>();
421
+ auto compression_type = source.Read<CompressionType>();
422
+ auto stats = BaseStatistics::Deserialize(source, type);
423
+
424
+ DataPointer data_pointer(std::move(stats));
425
+ data_pointer.row_start = row_start;
426
+ data_pointer.tuple_count = tuple_count;
427
+ data_pointer.block_pointer.block_id = block_pointer_block_id;
428
+ data_pointer.block_pointer.offset = block_pointer_offset;
429
+ data_pointer.compression_type = compression_type;
424
430
 
425
431
  // create a persistent segment
426
432
  auto segment = ColumnSegment::CreatePersistentSegment(
@@ -466,11 +472,7 @@ void ColumnData::GetStorageInfo(idx_t row_group_index, vector<idx_t> col_path, T
466
472
  column_info.segment_start = segment->start;
467
473
  column_info.segment_count = segment->count;
468
474
  column_info.compression_type = CompressionTypeToString(segment->function->type);
469
- if (!segment->stats.statistics || type.id() == LogicalTypeId::LIST) {
470
- column_info.segment_stats = string("No Stats");
471
- } else {
472
- column_info.segment_stats = segment->stats.statistics->ToString();
473
- }
475
+ column_info.segment_stats = segment->stats.statistics.ToString();
474
476
  column_info.has_updates = updates ? true : false;
475
477
  // persistent
476
478
  // block_id
@@ -218,16 +218,15 @@ void ColumnDataCheckpointer::WritePersistentSegments() {
218
218
  D_ASSERT(segment->segment_type == ColumnSegmentType::PERSISTENT);
219
219
 
220
220
  // set up the data pointer directly using the data from the persistent segment
221
- DataPointer pointer;
221
+ DataPointer pointer(segment->stats.statistics.Copy());
222
222
  pointer.block_pointer.block_id = segment->GetBlockId();
223
223
  pointer.block_pointer.offset = segment->GetBlockOffset();
224
224
  pointer.row_start = segment->start;
225
225
  pointer.tuple_count = segment->count;
226
226
  pointer.compression_type = segment->function->type;
227
- pointer.statistics = segment->stats.statistics->Copy();
228
227
 
229
228
  // merge the persistent stats into the global column stats
230
- state.global_stats->Merge(*segment->stats.statistics);
229
+ state.global_stats->Merge(segment->stats.statistics);
231
230
 
232
231
  // directly append the current segment to the new tree
233
232
  state.new_tree.AppendSegment(std::move(nodes[segment_idx].node));
@@ -19,7 +19,7 @@ unique_ptr<ColumnSegment> ColumnSegment::CreatePersistentSegment(DatabaseInstanc
19
19
  block_id_t block_id, idx_t offset,
20
20
  const LogicalType &type, idx_t start, idx_t count,
21
21
  CompressionType compression_type,
22
- unique_ptr<BaseStatistics> statistics) {
22
+ BaseStatistics statistics) {
23
23
  auto &config = DBConfig::GetConfig(db);
24
24
  CompressionFunction *function;
25
25
  shared_ptr<BlockHandle> block;
@@ -48,7 +48,7 @@ unique_ptr<ColumnSegment> ColumnSegment::CreateTransientSegment(DatabaseInstance
48
48
  buffer_manager.Allocate(segment_size, false, &block);
49
49
  }
50
50
  return make_unique<ColumnSegment>(db, std::move(block), type, ColumnSegmentType::TRANSIENT, start, 0, function,
51
- nullptr, INVALID_BLOCK, 0, segment_size);
51
+ BaseStatistics::CreateEmpty(type), INVALID_BLOCK, 0, segment_size);
52
52
  }
53
53
 
54
54
  unique_ptr<ColumnSegment> ColumnSegment::CreateSegment(ColumnSegment &other, idx_t start) {
@@ -57,10 +57,9 @@ unique_ptr<ColumnSegment> ColumnSegment::CreateSegment(ColumnSegment &other, idx
57
57
 
58
58
  ColumnSegment::ColumnSegment(DatabaseInstance &db, shared_ptr<BlockHandle> block, LogicalType type_p,
59
59
  ColumnSegmentType segment_type, idx_t start, idx_t count, CompressionFunction *function_p,
60
- unique_ptr<BaseStatistics> statistics, block_id_t block_id_p, idx_t offset_p,
61
- idx_t segment_size_p)
60
+ BaseStatistics statistics, block_id_t block_id_p, idx_t offset_p, idx_t segment_size_p)
62
61
  : SegmentBase(start, count), db(db), type(std::move(type_p)), type_size(GetTypeIdSize(type.InternalType())),
63
- segment_type(segment_type), function(function_p), stats(type, std::move(statistics)), block(std::move(block)),
62
+ segment_type(segment_type), function(function_p), stats(std::move(statistics)), block(std::move(block)),
64
63
  block_id(block_id_p), offset(offset_p), segment_size(segment_size_p) {
65
64
  D_ASSERT(function);
66
65
  if (function->init_segment) {
@@ -181,13 +180,12 @@ void ColumnSegment::ConvertToPersistent(BlockManager *block_manager, block_id_t
181
180
  block_id = block_id_p;
182
181
  offset = 0;
183
182
 
184
- D_ASSERT(stats.statistics);
185
183
  if (block_id == INVALID_BLOCK) {
186
184
  // constant block: reset the block buffer
187
- D_ASSERT(stats.statistics->IsConstant());
185
+ D_ASSERT(stats.statistics.IsConstant());
188
186
  block.reset();
189
187
  } else {
190
- D_ASSERT(!stats.statistics->IsConstant());
188
+ D_ASSERT(!stats.statistics.IsConstant());
191
189
  // non-constant block: write the block to disk
192
190
  // the data for the block already exists in-memory of our block
193
191
  // instead of copying the data we alter some metadata so the buffer points to an on-disk block
@@ -1,5 +1,5 @@
1
1
  #include "duckdb/storage/table/list_column_data.hpp"
2
- #include "duckdb/storage/statistics/list_statistics.hpp"
2
+ #include "duckdb/storage/statistics/list_stats.hpp"
3
3
  #include "duckdb/transaction/transaction.hpp"
4
4
 
5
5
  namespace duckdb {
@@ -162,10 +162,8 @@ void ListColumnData::InitializeAppend(ColumnAppendState &state) {
162
162
  state.child_appends.push_back(std::move(child_append_state));
163
163
  }
164
164
 
165
- void ListColumnData::Append(BaseStatistics &stats_p, ColumnAppendState &state, Vector &vector, idx_t count) {
165
+ void ListColumnData::Append(BaseStatistics &stats, ColumnAppendState &state, Vector &vector, idx_t count) {
166
166
  D_ASSERT(count > 0);
167
- auto &stats = (ListStatistics &)stats_p;
168
-
169
167
  UnifiedVectorFormat list_data;
170
168
  vector.ToUnifiedFormat(count, list_data);
171
169
  auto &list_validity = list_data.validity;
@@ -220,10 +218,10 @@ void ListColumnData::Append(BaseStatistics &stats_p, ColumnAppendState &state, V
220
218
  ColumnData::AppendData(stats, state, vdata, count);
221
219
  // append the validity data
222
220
  vdata.validity = append_mask;
223
- validity.AppendData(*stats.validity_stats, state.child_appends[0], vdata, count);
221
+ validity.AppendData(stats, state.child_appends[0], vdata, count);
224
222
  // append the child vector
225
223
  if (child_count > 0) {
226
- child_column->Append(*stats.child_stats, state.child_appends[1], child_vector, child_count);
224
+ child_column->Append(ListStats::GetChildStats(stats), state.child_appends[1], child_vector, child_count);
227
225
  }
228
226
  }
229
227
 
@@ -308,7 +306,7 @@ void ListColumnData::CommitDropColumn() {
308
306
  struct ListColumnCheckpointState : public ColumnCheckpointState {
309
307
  ListColumnCheckpointState(RowGroup &row_group, ColumnData &column_data, PartialBlockManager &partial_block_manager)
310
308
  : ColumnCheckpointState(row_group, column_data, partial_block_manager) {
311
- global_stats = make_unique<ListStatistics>(column_data.type);
309
+ global_stats = ListStats::CreateEmpty(column_data.type).ToUnique();
312
310
  }
313
311
 
314
312
  unique_ptr<ColumnCheckpointState> validity_state;
@@ -317,10 +315,8 @@ struct ListColumnCheckpointState : public ColumnCheckpointState {
317
315
  public:
318
316
  unique_ptr<BaseStatistics> GetStatistics() override {
319
317
  auto stats = global_stats->Copy();
320
- auto &list_stats = (ListStatistics &)*stats;
321
- stats->validity_stats = validity_state->GetStatistics();
322
- list_stats.child_stats = child_state->GetStatistics();
323
- return stats;
318
+ ListStats::SetChildStats(stats, child_state->GetStatistics());
319
+ return stats.ToUnique();
324
320
  }
325
321
 
326
322
  void WriteDataPointers(RowGroupWriter &writer) override {