duckdb 0.7.2-dev225.0 → 0.7.2-dev314.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/src/duckdb/extension/parquet/column_reader.cpp +5 -6
- package/src/duckdb/extension/parquet/include/column_reader.hpp +1 -2
- package/src/duckdb/extension/parquet/include/generated_column_reader.hpp +1 -11
- package/src/duckdb/extension/parquet/parquet_statistics.cpp +26 -32
- package/src/duckdb/src/common/sort/sort_state.cpp +5 -7
- package/src/duckdb/src/execution/column_binding_resolver.cpp +6 -0
- package/src/duckdb/src/execution/operator/aggregate/physical_perfecthash_aggregate.cpp +4 -5
- package/src/duckdb/src/execution/operator/aggregate/physical_window.cpp +1 -1
- package/src/duckdb/src/execution/operator/helper/physical_vacuum.cpp +2 -3
- package/src/duckdb/src/execution/operator/join/physical_blockwise_nl_join.cpp +32 -6
- package/src/duckdb/src/execution/physical_plan/plan_aggregate.cpp +15 -15
- package/src/duckdb/src/execution/physical_plan/plan_comparison_join.cpp +18 -12
- package/src/duckdb/src/function/aggregate/distributive/bitstring_agg.cpp +6 -13
- package/src/duckdb/src/function/aggregate/distributive/count.cpp +2 -4
- package/src/duckdb/src/function/aggregate/distributive/sum.cpp +11 -13
- package/src/duckdb/src/function/scalar/date/date_diff.cpp +0 -1
- package/src/duckdb/src/function/scalar/date/date_part.cpp +17 -25
- package/src/duckdb/src/function/scalar/date/date_sub.cpp +0 -1
- package/src/duckdb/src/function/scalar/date/date_trunc.cpp +10 -14
- package/src/duckdb/src/function/scalar/generic/stats.cpp +2 -4
- package/src/duckdb/src/function/scalar/list/flatten.cpp +5 -12
- package/src/duckdb/src/function/scalar/list/list_concat.cpp +3 -8
- package/src/duckdb/src/function/scalar/list/list_extract.cpp +5 -12
- package/src/duckdb/src/function/scalar/list/list_value.cpp +5 -9
- package/src/duckdb/src/function/scalar/math/numeric.cpp +14 -17
- package/src/duckdb/src/function/scalar/operators/arithmetic.cpp +27 -34
- package/src/duckdb/src/function/scalar/string/caseconvert.cpp +2 -6
- package/src/duckdb/src/function/scalar/string/instr.cpp +2 -6
- package/src/duckdb/src/function/scalar/string/length.cpp +2 -6
- package/src/duckdb/src/function/scalar/string/like.cpp +2 -6
- package/src/duckdb/src/function/scalar/string/substring.cpp +2 -6
- package/src/duckdb/src/function/scalar/struct/struct_extract.cpp +4 -9
- package/src/duckdb/src/function/scalar/struct/struct_insert.cpp +10 -13
- package/src/duckdb/src/function/scalar/struct/struct_pack.cpp +5 -6
- package/src/duckdb/src/function/table/version/pragma_version.cpp +2 -2
- package/src/duckdb/src/include/duckdb/execution/operator/aggregate/physical_perfecthash_aggregate.hpp +1 -1
- package/src/duckdb/src/include/duckdb/function/aggregate_function.hpp +12 -3
- package/src/duckdb/src/include/duckdb/function/scalar_function.hpp +2 -2
- package/src/duckdb/src/include/duckdb/planner/bind_context.hpp +2 -0
- package/src/duckdb/src/include/duckdb/storage/checkpoint/table_data_writer.hpp +3 -2
- package/src/duckdb/src/include/duckdb/storage/compression/chimp/chimp_compress.hpp +2 -2
- package/src/duckdb/src/include/duckdb/storage/compression/chimp/chimp_fetch.hpp +1 -1
- package/src/duckdb/src/include/duckdb/storage/compression/chimp/chimp_scan.hpp +1 -1
- package/src/duckdb/src/include/duckdb/storage/compression/patas/patas_compress.hpp +2 -2
- package/src/duckdb/src/include/duckdb/storage/compression/patas/patas_fetch.hpp +1 -1
- package/src/duckdb/src/include/duckdb/storage/compression/patas/patas_scan.hpp +1 -1
- package/src/duckdb/src/include/duckdb/storage/data_pointer.hpp +5 -2
- package/src/duckdb/src/include/duckdb/storage/data_table.hpp +1 -1
- package/src/duckdb/src/include/duckdb/storage/statistics/base_statistics.hpp +93 -31
- package/src/duckdb/src/include/duckdb/storage/statistics/column_statistics.hpp +22 -3
- package/src/duckdb/src/include/duckdb/storage/statistics/distinct_statistics.hpp +6 -6
- package/src/duckdb/src/include/duckdb/storage/statistics/list_stats.hpp +41 -0
- package/src/duckdb/src/include/duckdb/storage/statistics/numeric_stats.hpp +157 -0
- package/src/duckdb/src/include/duckdb/storage/statistics/segment_statistics.hpp +2 -7
- package/src/duckdb/src/include/duckdb/storage/statistics/string_stats.hpp +74 -0
- package/src/duckdb/src/include/duckdb/storage/statistics/struct_stats.hpp +42 -0
- package/src/duckdb/src/include/duckdb/storage/string_uncompressed.hpp +2 -3
- package/src/duckdb/src/include/duckdb/storage/table/column_segment.hpp +2 -2
- package/src/duckdb/src/include/duckdb/storage/table/persistent_table_data.hpp +2 -1
- package/src/duckdb/src/include/duckdb/storage/table/row_group.hpp +4 -3
- package/src/duckdb/src/include/duckdb/storage/table/row_group_collection.hpp +3 -2
- package/src/duckdb/src/include/duckdb/storage/table/table_statistics.hpp +5 -0
- package/src/duckdb/src/main/config.cpp +66 -1
- package/src/duckdb/src/optimizer/join_order/cardinality_estimator.cpp +0 -1
- package/src/duckdb/src/optimizer/statistics/expression/propagate_aggregate.cpp +9 -3
- package/src/duckdb/src/optimizer/statistics/expression/propagate_and_compress.cpp +6 -7
- package/src/duckdb/src/optimizer/statistics/expression/propagate_cast.cpp +14 -11
- package/src/duckdb/src/optimizer/statistics/expression/propagate_columnref.cpp +1 -1
- package/src/duckdb/src/optimizer/statistics/expression/propagate_comparison.cpp +13 -15
- package/src/duckdb/src/optimizer/statistics/expression/propagate_conjunction.cpp +0 -1
- package/src/duckdb/src/optimizer/statistics/expression/propagate_constant.cpp +3 -75
- package/src/duckdb/src/optimizer/statistics/expression/propagate_function.cpp +7 -2
- package/src/duckdb/src/optimizer/statistics/expression/propagate_operator.cpp +10 -0
- package/src/duckdb/src/optimizer/statistics/operator/propagate_aggregate.cpp +2 -3
- package/src/duckdb/src/optimizer/statistics/operator/propagate_filter.cpp +28 -31
- package/src/duckdb/src/optimizer/statistics/operator/propagate_join.cpp +4 -5
- package/src/duckdb/src/optimizer/statistics/operator/propagate_set_operation.cpp +3 -3
- package/src/duckdb/src/optimizer/statistics_propagator.cpp +1 -1
- package/src/duckdb/src/parser/transform/tableref/transform_join.cpp +4 -0
- package/src/duckdb/src/planner/bind_context.cpp +16 -0
- package/src/duckdb/src/planner/binder/query_node/plan_select_node.cpp +0 -1
- package/src/duckdb/src/planner/binder/tableref/bind_joinref.cpp +9 -0
- package/src/duckdb/src/planner/binder.cpp +2 -1
- package/src/duckdb/src/planner/bound_result_modifier.cpp +1 -1
- package/src/duckdb/src/planner/expression/bound_window_expression.cpp +1 -1
- package/src/duckdb/src/planner/filter/constant_filter.cpp +4 -6
- package/src/duckdb/src/storage/checkpoint/row_group_writer.cpp +1 -1
- package/src/duckdb/src/storage/checkpoint/table_data_reader.cpp +1 -4
- package/src/duckdb/src/storage/checkpoint/table_data_writer.cpp +4 -4
- package/src/duckdb/src/storage/compression/bitpacking.cpp +3 -3
- package/src/duckdb/src/storage/compression/fixed_size_uncompressed.cpp +3 -3
- package/src/duckdb/src/storage/compression/numeric_constant.cpp +9 -10
- package/src/duckdb/src/storage/compression/patas.cpp +1 -1
- package/src/duckdb/src/storage/compression/rle.cpp +2 -2
- package/src/duckdb/src/storage/compression/validity_uncompressed.cpp +5 -5
- package/src/duckdb/src/storage/data_table.cpp +4 -6
- package/src/duckdb/src/storage/statistics/base_statistics.cpp +373 -128
- package/src/duckdb/src/storage/statistics/column_statistics.cpp +58 -3
- package/src/duckdb/src/storage/statistics/distinct_statistics.cpp +4 -9
- package/src/duckdb/src/storage/statistics/list_stats.cpp +117 -0
- package/src/duckdb/src/storage/statistics/numeric_stats.cpp +529 -0
- package/src/duckdb/src/storage/statistics/segment_statistics.cpp +2 -11
- package/src/duckdb/src/storage/statistics/string_stats.cpp +273 -0
- package/src/duckdb/src/storage/statistics/struct_stats.cpp +131 -0
- package/src/duckdb/src/storage/storage_info.cpp +1 -1
- package/src/duckdb/src/storage/table/column_checkpoint_state.cpp +3 -4
- package/src/duckdb/src/storage/table/column_data.cpp +16 -14
- package/src/duckdb/src/storage/table/column_data_checkpointer.cpp +2 -3
- package/src/duckdb/src/storage/table/column_segment.cpp +6 -8
- package/src/duckdb/src/storage/table/list_column_data.cpp +7 -11
- package/src/duckdb/src/storage/table/row_group.cpp +24 -23
- package/src/duckdb/src/storage/table/row_group_collection.cpp +12 -12
- package/src/duckdb/src/storage/table/standard_column_data.cpp +6 -6
- package/src/duckdb/src/storage/table/struct_column_data.cpp +15 -16
- package/src/duckdb/src/storage/table/table_statistics.cpp +27 -7
- package/src/duckdb/src/storage/table/update_segment.cpp +10 -12
- package/src/duckdb/third_party/libpg_query/include/parser/gram.hpp +923 -919
- package/src/duckdb/third_party/libpg_query/include/parser/kwlist.hpp +2 -0
- package/src/duckdb/third_party/libpg_query/src_backend_parser_gram.cpp +15684 -15571
- package/src/duckdb/ub_src_storage_statistics.cpp +4 -6
- package/src/duckdb/src/include/duckdb/storage/statistics/list_statistics.hpp +0 -36
- package/src/duckdb/src/include/duckdb/storage/statistics/numeric_statistics.hpp +0 -75
- package/src/duckdb/src/include/duckdb/storage/statistics/string_statistics.hpp +0 -49
- package/src/duckdb/src/include/duckdb/storage/statistics/struct_statistics.hpp +0 -36
- package/src/duckdb/src/include/duckdb/storage/statistics/validity_statistics.hpp +0 -45
- package/src/duckdb/src/storage/statistics/list_statistics.cpp +0 -94
- package/src/duckdb/src/storage/statistics/numeric_statistics.cpp +0 -307
- package/src/duckdb/src/storage/statistics/string_statistics.cpp +0 -220
- package/src/duckdb/src/storage/statistics/struct_statistics.cpp +0 -108
- package/src/duckdb/src/storage/statistics/validity_statistics.cpp +0 -91
package/package.json
CHANGED
@@ -865,7 +865,7 @@ RowNumberColumnReader::RowNumberColumnReader(ParquetReader &reader, LogicalType
|
|
865
865
|
|
866
866
|
unique_ptr<BaseStatistics> RowNumberColumnReader::Stats(idx_t row_group_idx_p,
|
867
867
|
const std::vector<ColumnChunk> &columns) {
|
868
|
-
auto stats =
|
868
|
+
auto stats = NumericStats::CreateUnknown(type);
|
869
869
|
auto &row_groups = reader.GetFileMetadata()->row_groups;
|
870
870
|
D_ASSERT(row_group_idx_p < row_groups.size());
|
871
871
|
idx_t row_group_offset_min = 0;
|
@@ -873,11 +873,10 @@ unique_ptr<BaseStatistics> RowNumberColumnReader::Stats(idx_t row_group_idx_p,
|
|
873
873
|
row_group_offset_min += row_groups[i].num_rows;
|
874
874
|
}
|
875
875
|
|
876
|
-
stats
|
877
|
-
stats
|
878
|
-
|
879
|
-
|
880
|
-
return std::move(stats);
|
876
|
+
NumericStats::SetMin(stats, Value::BIGINT(row_group_offset_min));
|
877
|
+
NumericStats::SetMax(stats, Value::BIGINT(row_group_offset_min + row_groups[row_group_idx_p].num_rows));
|
878
|
+
stats.Set(StatsInfo::CANNOT_HAVE_NULL_VALUES);
|
879
|
+
return stats.ToUnique();
|
881
880
|
}
|
882
881
|
|
883
882
|
void RowNumberColumnReader::InitializeRead(idx_t row_group_idx_p, const std::vector<ColumnChunk> &columns,
|
@@ -18,8 +18,7 @@
|
|
18
18
|
|
19
19
|
#include "duckdb.hpp"
|
20
20
|
#ifndef DUCKDB_AMALGAMATION
|
21
|
-
|
22
|
-
#include "duckdb/storage/statistics/numeric_statistics.hpp"
|
21
|
+
|
23
22
|
#include "duckdb/common/types/vector.hpp"
|
24
23
|
#include "duckdb/common/types/string_type.hpp"
|
25
24
|
#include "duckdb/common/types/chunk_collection.hpp"
|
@@ -29,17 +29,7 @@ public:
|
|
29
29
|
Vector &result) override;
|
30
30
|
|
31
31
|
unique_ptr<BaseStatistics> Stats(idx_t row_group_idx_p, const std::vector<ColumnChunk> &columns) override {
|
32
|
-
|
33
|
-
case LogicalTypeId::VARCHAR: {
|
34
|
-
auto string_stats = make_unique<StringStatistics>(type, StatisticsType::LOCAL_STATS);
|
35
|
-
string string = constant.ToString();
|
36
|
-
string_stats->Update(string);
|
37
|
-
string_stats->max_string_length = string.length();
|
38
|
-
return std::move(string_stats);
|
39
|
-
}
|
40
|
-
default:
|
41
|
-
return nullptr;
|
42
|
-
}
|
32
|
+
return BaseStatistics::FromConstant(constant).ToUnique();
|
43
33
|
};
|
44
34
|
|
45
35
|
void InitializeRead(idx_t row_group_idx_p, const std::vector<ColumnChunk> &columns,
|
@@ -6,8 +6,7 @@
|
|
6
6
|
#ifndef DUCKDB_AMALGAMATION
|
7
7
|
#include "duckdb/common/types/blob.hpp"
|
8
8
|
#include "duckdb/common/types/value.hpp"
|
9
|
-
|
10
|
-
#include "duckdb/storage/statistics/string_statistics.hpp"
|
9
|
+
|
11
10
|
#endif
|
12
11
|
|
13
12
|
namespace duckdb {
|
@@ -18,27 +17,29 @@ using duckdb_parquet::format::Type;
|
|
18
17
|
static unique_ptr<BaseStatistics> CreateNumericStats(const LogicalType &type,
|
19
18
|
const duckdb_parquet::format::SchemaElement &schema_ele,
|
20
19
|
const duckdb_parquet::format::Statistics &parquet_stats) {
|
21
|
-
auto stats =
|
20
|
+
auto stats = NumericStats::CreateUnknown(type);
|
22
21
|
|
23
22
|
// for reasons unknown to science, Parquet defines *both* `min` and `min_value` as well as `max` and
|
24
23
|
// `max_value`. All are optional. such elegance.
|
24
|
+
Value min;
|
25
|
+
Value max;
|
25
26
|
if (parquet_stats.__isset.min) {
|
26
|
-
|
27
|
+
min = ParquetStatisticsUtils::ConvertValue(type, schema_ele, parquet_stats.min).DefaultCastAs(type);
|
27
28
|
} else if (parquet_stats.__isset.min_value) {
|
28
|
-
|
29
|
-
ParquetStatisticsUtils::ConvertValue(type, schema_ele, parquet_stats.min_value).DefaultCastAs(type);
|
29
|
+
min = ParquetStatisticsUtils::ConvertValue(type, schema_ele, parquet_stats.min_value).DefaultCastAs(type);
|
30
30
|
} else {
|
31
|
-
|
31
|
+
min = Value(type);
|
32
32
|
}
|
33
33
|
if (parquet_stats.__isset.max) {
|
34
|
-
|
34
|
+
max = ParquetStatisticsUtils::ConvertValue(type, schema_ele, parquet_stats.max).DefaultCastAs(type);
|
35
35
|
} else if (parquet_stats.__isset.max_value) {
|
36
|
-
|
37
|
-
ParquetStatisticsUtils::ConvertValue(type, schema_ele, parquet_stats.max_value).DefaultCastAs(type);
|
36
|
+
max = ParquetStatisticsUtils::ConvertValue(type, schema_ele, parquet_stats.max_value).DefaultCastAs(type);
|
38
37
|
} else {
|
39
|
-
|
38
|
+
max = Value(type);
|
40
39
|
}
|
41
|
-
|
40
|
+
NumericStats::SetMin(stats, min);
|
41
|
+
NumericStats::SetMax(stats, max);
|
42
|
+
return stats.ToUnique();
|
42
43
|
}
|
43
44
|
|
44
45
|
Value ParquetStatisticsUtils::ConvertValue(const LogicalType &type,
|
@@ -228,24 +229,24 @@ unique_ptr<BaseStatistics> ParquetStatisticsUtils::TransformColumnStatistics(con
|
|
228
229
|
row_group_stats = CreateNumericStats(type, s_ele, parquet_stats);
|
229
230
|
break;
|
230
231
|
case LogicalTypeId::VARCHAR: {
|
231
|
-
auto string_stats =
|
232
|
+
auto string_stats = StringStats::CreateEmpty(type);
|
232
233
|
if (parquet_stats.__isset.min) {
|
233
|
-
|
234
|
+
StringStats::Update(string_stats, parquet_stats.min);
|
234
235
|
} else if (parquet_stats.__isset.min_value) {
|
235
|
-
|
236
|
+
StringStats::Update(string_stats, parquet_stats.min_value);
|
236
237
|
} else {
|
237
238
|
return nullptr;
|
238
239
|
}
|
239
240
|
if (parquet_stats.__isset.max) {
|
240
|
-
|
241
|
+
StringStats::Update(string_stats, parquet_stats.max);
|
241
242
|
} else if (parquet_stats.__isset.max_value) {
|
242
|
-
|
243
|
+
StringStats::Update(string_stats, parquet_stats.max_value);
|
243
244
|
} else {
|
244
245
|
return nullptr;
|
245
246
|
}
|
246
|
-
string_stats
|
247
|
-
string_stats
|
248
|
-
row_group_stats =
|
247
|
+
StringStats::SetContainsUnicode(string_stats);
|
248
|
+
StringStats::ResetMaxStringLength(string_stats);
|
249
|
+
row_group_stats = string_stats.ToUnique();
|
249
250
|
break;
|
250
251
|
}
|
251
252
|
default:
|
@@ -254,21 +255,14 @@ unique_ptr<BaseStatistics> ParquetStatisticsUtils::TransformColumnStatistics(con
|
|
254
255
|
} // end of type switch
|
255
256
|
|
256
257
|
// null count is generic
|
257
|
-
if (row_group_stats) {
|
258
|
-
if (column_chunk.meta_data.type == duckdb_parquet::format::Type::FLOAT ||
|
259
|
-
column_chunk.meta_data.type == duckdb_parquet::format::Type::DOUBLE) {
|
260
|
-
// floats/doubles can have infinity, which can become NULL
|
261
|
-
row_group_stats->validity_stats = make_unique<ValidityStatistics>(true);
|
262
|
-
} else if (parquet_stats.__isset.null_count) {
|
263
|
-
row_group_stats->validity_stats = make_unique<ValidityStatistics>(parquet_stats.null_count != 0);
|
264
|
-
} else {
|
265
|
-
row_group_stats->validity_stats = make_unique<ValidityStatistics>(true);
|
266
|
-
}
|
267
|
-
} else {
|
258
|
+
if (!row_group_stats) {
|
268
259
|
// if stats are missing from any row group we know squat
|
269
260
|
return nullptr;
|
270
261
|
}
|
271
|
-
|
262
|
+
row_group_stats->Set(StatsInfo::CAN_HAVE_NULL_AND_VALID_VALUES);
|
263
|
+
if (parquet_stats.__isset.null_count && parquet_stats.null_count == 0) {
|
264
|
+
row_group_stats->Set(StatsInfo::CANNOT_HAVE_NULL_VALUES);
|
265
|
+
}
|
272
266
|
return row_group_stats;
|
273
267
|
}
|
274
268
|
|
@@ -3,7 +3,6 @@
|
|
3
3
|
#include "duckdb/common/row_operations/row_operations.hpp"
|
4
4
|
#include "duckdb/common/sort/sort.hpp"
|
5
5
|
#include "duckdb/common/sort/sorted_block.hpp"
|
6
|
-
#include "duckdb/storage/statistics/string_statistics.hpp"
|
7
6
|
|
8
7
|
#include <algorithm>
|
9
8
|
#include <numeric>
|
@@ -66,9 +65,8 @@ SortLayout::SortLayout(const vector<BoundOrderByNode> &orders)
|
|
66
65
|
prefix_lengths.back() = GetNestedSortingColSize(col_size, expr.return_type);
|
67
66
|
} else if (physical_type == PhysicalType::VARCHAR) {
|
68
67
|
idx_t size_before = col_size;
|
69
|
-
if (stats.back()) {
|
70
|
-
|
71
|
-
col_size += str_stats.max_string_length;
|
68
|
+
if (stats.back() && StringStats::HasMaxStringLength(*stats.back())) {
|
69
|
+
col_size += StringStats::MaxStringLength(*stats.back());
|
72
70
|
if (col_size > 12) {
|
73
71
|
col_size = 12;
|
74
72
|
} else {
|
@@ -95,9 +93,9 @@ SortLayout::SortLayout(const vector<BoundOrderByNode> &orders)
|
|
95
93
|
if (bytes_to_fill == 0) {
|
96
94
|
break;
|
97
95
|
}
|
98
|
-
if (logical_types[col_idx].InternalType() == PhysicalType::VARCHAR && stats[col_idx]
|
99
|
-
|
100
|
-
idx_t diff =
|
96
|
+
if (logical_types[col_idx].InternalType() == PhysicalType::VARCHAR && stats[col_idx] &&
|
97
|
+
StringStats::HasMaxStringLength(*stats[col_idx])) {
|
98
|
+
idx_t diff = StringStats::MaxStringLength(*stats[col_idx]) - prefix_lengths[col_idx];
|
101
99
|
if (diff > 0) {
|
102
100
|
// Increase all sizes accordingly
|
103
101
|
idx_t increase = MinValue(bytes_to_fill, diff);
|
@@ -1,6 +1,7 @@
|
|
1
1
|
#include "duckdb/execution/column_binding_resolver.hpp"
|
2
2
|
|
3
3
|
#include "duckdb/planner/operator/logical_comparison_join.hpp"
|
4
|
+
#include "duckdb/planner/operator/logical_any_join.hpp"
|
4
5
|
#include "duckdb/planner/operator/logical_create_index.hpp"
|
5
6
|
#include "duckdb/planner/operator/logical_delim_join.hpp"
|
6
7
|
#include "duckdb/planner/operator/logical_insert.hpp"
|
@@ -46,6 +47,11 @@ void ColumnBindingResolver::VisitOperator(LogicalOperator &op) {
|
|
46
47
|
// this operator
|
47
48
|
VisitOperatorChildren(op);
|
48
49
|
bindings = op.GetColumnBindings();
|
50
|
+
auto &any_join = (LogicalAnyJoin &)op;
|
51
|
+
if (any_join.join_type == JoinType::SEMI || any_join.join_type == JoinType::ANTI) {
|
52
|
+
auto right_bindings = op.children[1]->GetColumnBindings();
|
53
|
+
bindings.insert(bindings.end(), right_bindings.begin(), right_bindings.end());
|
54
|
+
}
|
49
55
|
VisitOperatorExpressions(op);
|
50
56
|
return;
|
51
57
|
} else if (op.type == LogicalOperatorType::LOGICAL_CREATE_INDEX) {
|
@@ -4,14 +4,13 @@
|
|
4
4
|
#include "duckdb/planner/expression/bound_aggregate_expression.hpp"
|
5
5
|
#include "duckdb/planner/expression/bound_reference_expression.hpp"
|
6
6
|
#include "duckdb/storage/buffer_manager.hpp"
|
7
|
-
#include "duckdb/storage/statistics/numeric_statistics.hpp"
|
8
7
|
|
9
8
|
namespace duckdb {
|
10
9
|
|
11
10
|
PhysicalPerfectHashAggregate::PhysicalPerfectHashAggregate(ClientContext &context, vector<LogicalType> types_p,
|
12
11
|
vector<unique_ptr<Expression>> aggregates_p,
|
13
12
|
vector<unique_ptr<Expression>> groups_p,
|
14
|
-
vector<unique_ptr<BaseStatistics>> group_stats,
|
13
|
+
const vector<unique_ptr<BaseStatistics>> &group_stats,
|
15
14
|
vector<idx_t> required_bits_p, idx_t estimated_cardinality)
|
16
15
|
: PhysicalOperator(PhysicalOperatorType::PERFECT_HASH_GROUP_BY, std::move(types_p), estimated_cardinality),
|
17
16
|
groups(std::move(groups_p)), aggregates(std::move(aggregates_p)), required_bits(std::move(required_bits_p)) {
|
@@ -19,9 +18,9 @@ PhysicalPerfectHashAggregate::PhysicalPerfectHashAggregate(ClientContext &contex
|
|
19
18
|
group_minima.reserve(group_stats.size());
|
20
19
|
for (auto &stats : group_stats) {
|
21
20
|
D_ASSERT(stats);
|
22
|
-
auto &nstats =
|
23
|
-
D_ASSERT(
|
24
|
-
group_minima.push_back(
|
21
|
+
auto &nstats = *stats;
|
22
|
+
D_ASSERT(NumericStats::HasMin(nstats));
|
23
|
+
group_minima.push_back(NumericStats::Min(nstats));
|
25
24
|
}
|
26
25
|
for (auto &expr : groups) {
|
27
26
|
group_types.push_back(expr->return_type);
|
@@ -111,7 +111,7 @@ public:
|
|
111
111
|
orders.emplace_back(OrderType::ASCENDING, OrderByNullType::NULLS_FIRST, pexpr->Copy(), nullptr);
|
112
112
|
} else {
|
113
113
|
orders.emplace_back(OrderType::ASCENDING, OrderByNullType::NULLS_FIRST, pexpr->Copy(),
|
114
|
-
wexpr->partitions_stats[prt_idx]->
|
114
|
+
wexpr->partitions_stats[prt_idx]->ToUnique());
|
115
115
|
}
|
116
116
|
partitions.emplace_back(orders.back().Copy());
|
117
117
|
}
|
@@ -72,9 +72,8 @@ SinkFinalizeType PhysicalVacuum::Finalize(Pipeline &pipeline, Event &event, Clie
|
|
72
72
|
|
73
73
|
auto table = info->table;
|
74
74
|
for (idx_t col_idx = 0; col_idx < sink.column_distinct_stats.size(); col_idx++) {
|
75
|
-
table->GetStorage().
|
76
|
-
|
77
|
-
});
|
75
|
+
table->GetStorage().SetDistinct(info->column_id_map.at(col_idx),
|
76
|
+
std::move(sink.column_distinct_stats[col_idx]));
|
78
77
|
}
|
79
78
|
|
80
79
|
return SinkFinalizeType::READY;
|
@@ -87,11 +87,23 @@ public:
|
|
87
87
|
OuterJoinMarker left_outer;
|
88
88
|
SelectionVector match_sel;
|
89
89
|
ExpressionExecutor executor;
|
90
|
+
DataChunk intermediate_chunk;
|
90
91
|
};
|
91
92
|
|
92
93
|
unique_ptr<OperatorState> PhysicalBlockwiseNLJoin::GetOperatorState(ExecutionContext &context) const {
|
93
94
|
auto &gstate = (BlockwiseNLJoinGlobalState &)*sink_state;
|
94
|
-
|
95
|
+
auto result = make_unique<BlockwiseNLJoinState>(context, gstate.right_chunks, *this);
|
96
|
+
if (join_type == JoinType::SEMI || join_type == JoinType::ANTI) {
|
97
|
+
vector<LogicalType> intermediate_types;
|
98
|
+
for (auto &type : children[0]->types) {
|
99
|
+
intermediate_types.emplace_back(type);
|
100
|
+
}
|
101
|
+
for (auto &type : children[1]->types) {
|
102
|
+
intermediate_types.emplace_back(type);
|
103
|
+
}
|
104
|
+
result->intermediate_chunk.Initialize(Allocator::DefaultAllocator(), intermediate_types);
|
105
|
+
}
|
106
|
+
return std::move(result);
|
95
107
|
}
|
96
108
|
|
97
109
|
OperatorResultType PhysicalBlockwiseNLJoin::ExecuteInternal(ExecutionContext &context, DataChunk &input,
|
@@ -111,24 +123,30 @@ OperatorResultType PhysicalBlockwiseNLJoin::ExecuteInternal(ExecutionContext &co
|
|
111
123
|
}
|
112
124
|
}
|
113
125
|
|
126
|
+
DataChunk *intermediate_chunk = &chunk;
|
127
|
+
if (join_type == JoinType::SEMI || join_type == JoinType::ANTI) {
|
128
|
+
intermediate_chunk = &state.intermediate_chunk;
|
129
|
+
intermediate_chunk->Reset();
|
130
|
+
}
|
131
|
+
|
114
132
|
// now perform the actual join
|
115
133
|
// we perform a cross product, then execute the expression directly on the cross product' result
|
116
134
|
idx_t result_count = 0;
|
117
135
|
do {
|
118
|
-
auto result = state.cross_product.Execute(input,
|
136
|
+
auto result = state.cross_product.Execute(input, *intermediate_chunk);
|
119
137
|
if (result == OperatorResultType::NEED_MORE_INPUT) {
|
120
138
|
// exhausted input, have to pull new LHS chunk
|
121
139
|
if (state.left_outer.Enabled()) {
|
122
140
|
// left join: before we move to the next chunk, see if we need to output any vectors that didn't
|
123
141
|
// have a match found
|
124
|
-
state.left_outer.ConstructLeftJoinResult(input,
|
142
|
+
state.left_outer.ConstructLeftJoinResult(input, *intermediate_chunk);
|
125
143
|
state.left_outer.Reset();
|
126
144
|
}
|
127
145
|
return OperatorResultType::NEED_MORE_INPUT;
|
128
146
|
}
|
129
147
|
|
130
148
|
// now perform the computation
|
131
|
-
result_count = state.executor.SelectExpression(
|
149
|
+
result_count = state.executor.SelectExpression(*intermediate_chunk, state.match_sel);
|
132
150
|
if (result_count > 0) {
|
133
151
|
// found a match!
|
134
152
|
// check if the cross product is scanning the LHS or the RHS in its entirety
|
@@ -143,12 +161,20 @@ OperatorResultType PhysicalBlockwiseNLJoin::ExecuteInternal(ExecutionContext &co
|
|
143
161
|
// set the match flags in the RHS
|
144
162
|
gstate.right_outer.SetMatches(state.match_sel, result_count, state.cross_product.ScanPosition());
|
145
163
|
}
|
146
|
-
|
164
|
+
intermediate_chunk->Slice(state.match_sel, result_count);
|
147
165
|
} else {
|
148
166
|
// no result: reset the chunk
|
149
|
-
|
167
|
+
intermediate_chunk->Reset();
|
150
168
|
}
|
151
169
|
} while (result_count == 0);
|
170
|
+
|
171
|
+
if (join_type == JoinType::SEMI || join_type == JoinType::ANTI) {
|
172
|
+
for (idx_t col_idx = 0; col_idx < chunk.ColumnCount(); col_idx++) {
|
173
|
+
chunk.data[col_idx].Reference(intermediate_chunk->data[col_idx]);
|
174
|
+
}
|
175
|
+
chunk.SetCardinality(*intermediate_chunk);
|
176
|
+
}
|
177
|
+
|
152
178
|
return OperatorResultType::HAVE_MORE_OUTPUT;
|
153
179
|
}
|
154
180
|
|
@@ -9,7 +9,7 @@
|
|
9
9
|
#include "duckdb/parser/expression/comparison_expression.hpp"
|
10
10
|
#include "duckdb/planner/expression/bound_aggregate_expression.hpp"
|
11
11
|
#include "duckdb/planner/operator/logical_aggregate.hpp"
|
12
|
-
|
12
|
+
|
13
13
|
namespace duckdb {
|
14
14
|
|
15
15
|
static uint32_t RequiredBitsForValue(uint32_t n) {
|
@@ -50,23 +50,20 @@ static bool CanUsePerfectHashAggregate(ClientContext &context, LogicalAggregate
|
|
50
50
|
// for small types we can just set the stats to [type_min, type_max]
|
51
51
|
switch (group_type.InternalType()) {
|
52
52
|
case PhysicalType::INT8:
|
53
|
-
stats = make_unique<NumericStatistics>(group_type, Value::MinimumValue(group_type),
|
54
|
-
Value::MaximumValue(group_type), StatisticsType::LOCAL_STATS);
|
55
|
-
break;
|
56
53
|
case PhysicalType::INT16:
|
57
|
-
stats = make_unique<NumericStatistics>(group_type, Value::MinimumValue(group_type),
|
58
|
-
Value::MaximumValue(group_type), StatisticsType::LOCAL_STATS);
|
59
54
|
break;
|
60
55
|
default:
|
61
56
|
// type is too large and there are no stats: skip perfect hashing
|
62
57
|
return false;
|
63
58
|
}
|
64
|
-
//
|
65
|
-
stats
|
59
|
+
// construct stats with the min and max value of the type
|
60
|
+
stats = NumericStats::CreateUnknown(group_type).ToUnique();
|
61
|
+
NumericStats::SetMin(*stats, Value::MinimumValue(group_type));
|
62
|
+
NumericStats::SetMax(*stats, Value::MaximumValue(group_type));
|
66
63
|
}
|
67
|
-
auto &nstats =
|
64
|
+
auto &nstats = *stats;
|
68
65
|
|
69
|
-
if (
|
66
|
+
if (!NumericStats::HasMinMax(nstats)) {
|
70
67
|
return false;
|
71
68
|
}
|
72
69
|
// we have a min and a max value for the stats: use that to figure out how many bits we have
|
@@ -75,17 +72,20 @@ static bool CanUsePerfectHashAggregate(ClientContext &context, LogicalAggregate
|
|
75
72
|
int64_t range;
|
76
73
|
switch (group_type.InternalType()) {
|
77
74
|
case PhysicalType::INT8:
|
78
|
-
range = int64_t(
|
75
|
+
range = int64_t(NumericStats::GetMaxUnsafe<int8_t>(nstats)) -
|
76
|
+
int64_t(NumericStats::GetMinUnsafe<int8_t>(nstats));
|
79
77
|
break;
|
80
78
|
case PhysicalType::INT16:
|
81
|
-
range = int64_t(
|
79
|
+
range = int64_t(NumericStats::GetMaxUnsafe<int16_t>(nstats)) -
|
80
|
+
int64_t(NumericStats::GetMinUnsafe<int16_t>(nstats));
|
82
81
|
break;
|
83
82
|
case PhysicalType::INT32:
|
84
|
-
range = int64_t(
|
83
|
+
range = int64_t(NumericStats::GetMaxUnsafe<int32_t>(nstats)) -
|
84
|
+
int64_t(NumericStats::GetMinUnsafe<int32_t>(nstats));
|
85
85
|
break;
|
86
86
|
case PhysicalType::INT64:
|
87
|
-
if (!TrySubtractOperator::Operation(
|
88
|
-
|
87
|
+
if (!TrySubtractOperator::Operation(NumericStats::GetMaxUnsafe<int64_t>(nstats),
|
88
|
+
NumericStats::GetMinUnsafe<int64_t>(nstats), range)) {
|
89
89
|
return false;
|
90
90
|
}
|
91
91
|
break;
|
@@ -10,7 +10,7 @@
|
|
10
10
|
#include "duckdb/function/table/table_scan.hpp"
|
11
11
|
#include "duckdb/main/client_context.hpp"
|
12
12
|
#include "duckdb/planner/operator/logical_comparison_join.hpp"
|
13
|
-
|
13
|
+
|
14
14
|
#include "duckdb/transaction/duck_transaction.hpp"
|
15
15
|
#include "duckdb/common/operator/subtract.hpp"
|
16
16
|
#include "duckdb/execution/operator/join/physical_blockwise_nl_join.hpp"
|
@@ -92,19 +92,21 @@ void CheckForPerfectJoinOpt(LogicalComparisonJoin &op, PerfectHashJoinStats &joi
|
|
92
92
|
}
|
93
93
|
// with integral internal types
|
94
94
|
for (auto &&join_stat : op.join_stats) {
|
95
|
-
if (!TypeIsInteger(join_stat->
|
95
|
+
if (!TypeIsInteger(join_stat->GetType().InternalType()) ||
|
96
|
+
join_stat->GetType().InternalType() == PhysicalType::INT128) {
|
96
97
|
// perfect join not possible for non-integral types or hugeint
|
97
98
|
return;
|
98
99
|
}
|
99
100
|
}
|
100
101
|
|
101
102
|
// and when the build range is smaller than the threshold
|
102
|
-
auto stats_build =
|
103
|
-
if (
|
103
|
+
auto &stats_build = *op.join_stats[0].get(); // lhs stats
|
104
|
+
if (!NumericStats::HasMinMax(stats_build)) {
|
104
105
|
return;
|
105
106
|
}
|
106
107
|
int64_t min_value, max_value;
|
107
|
-
if (!ExtractNumericValue(stats_build
|
108
|
+
if (!ExtractNumericValue(NumericStats::Min(stats_build), min_value) ||
|
109
|
+
!ExtractNumericValue(NumericStats::Max(stats_build), max_value)) {
|
108
110
|
return;
|
109
111
|
}
|
110
112
|
int64_t build_range;
|
@@ -113,20 +115,24 @@ void CheckForPerfectJoinOpt(LogicalComparisonJoin &op, PerfectHashJoinStats &joi
|
|
113
115
|
}
|
114
116
|
|
115
117
|
// Fill join_stats for invisible join
|
116
|
-
auto stats_probe =
|
118
|
+
auto &stats_probe = *op.join_stats[1].get(); // rhs stats
|
119
|
+
if (!NumericStats::HasMinMax(stats_probe)) {
|
120
|
+
return;
|
121
|
+
}
|
117
122
|
|
118
123
|
// The max size our build must have to run the perfect HJ
|
119
124
|
const idx_t MAX_BUILD_SIZE = 1000000;
|
120
|
-
join_state.probe_min = stats_probe
|
121
|
-
join_state.probe_max = stats_probe
|
122
|
-
join_state.build_min = stats_build
|
123
|
-
join_state.build_max = stats_build
|
125
|
+
join_state.probe_min = NumericStats::Min(stats_probe);
|
126
|
+
join_state.probe_max = NumericStats::Max(stats_probe);
|
127
|
+
join_state.build_min = NumericStats::Min(stats_build);
|
128
|
+
join_state.build_max = NumericStats::Max(stats_build);
|
124
129
|
join_state.estimated_cardinality = op.estimated_cardinality;
|
125
130
|
join_state.build_range = build_range;
|
126
|
-
if (join_state.build_range > MAX_BUILD_SIZE
|
131
|
+
if (join_state.build_range > MAX_BUILD_SIZE) {
|
127
132
|
return;
|
128
133
|
}
|
129
|
-
if (stats_build
|
134
|
+
if (NumericStats::Min(stats_build) <= NumericStats::Min(stats_probe) &&
|
135
|
+
NumericStats::Max(stats_probe) <= NumericStats::Max(stats_build)) {
|
130
136
|
join_state.is_probe_in_domain = true;
|
131
137
|
}
|
132
138
|
join_state.is_build_small = true;
|
@@ -3,7 +3,7 @@
|
|
3
3
|
#include "duckdb/common/types/null_value.hpp"
|
4
4
|
#include "duckdb/common/vector_operations/aggregate_executor.hpp"
|
5
5
|
#include "duckdb/common/types/bit.hpp"
|
6
|
-
#include "duckdb/storage/statistics/
|
6
|
+
#include "duckdb/storage/statistics/base_statistics.hpp"
|
7
7
|
#include "duckdb/execution/expression_executor.hpp"
|
8
8
|
#include "duckdb/common/types/cast_helpers.hpp"
|
9
9
|
|
@@ -171,22 +171,15 @@ idx_t BitStringAggOperation::GetRange(hugeint_t min, hugeint_t max) {
|
|
171
171
|
}
|
172
172
|
|
173
173
|
unique_ptr<BaseStatistics> BitstringPropagateStats(ClientContext &context, BoundAggregateExpression &expr,
|
174
|
-
|
175
|
-
vector<unique_ptr<BaseStatistics>> &child_stats,
|
176
|
-
NodeStatistics *node_stats) {
|
174
|
+
AggregateStatisticsInput &input) {
|
177
175
|
|
178
|
-
if (child_stats[0]) {
|
179
|
-
auto &numeric_stats = (NumericStatistics &)*child_stats[0];
|
180
|
-
if (numeric_stats.min.IsNull() || numeric_stats.max.IsNull()) {
|
181
|
-
return nullptr;
|
182
|
-
}
|
183
|
-
auto bind_agg_data = (BitstringAggBindData *)bind_data;
|
184
|
-
bind_agg_data->min = numeric_stats.min;
|
185
|
-
bind_agg_data->max = numeric_stats.max;
|
186
|
-
} else {
|
176
|
+
if (!NumericStats::HasMinMax(input.child_stats[0])) {
|
187
177
|
throw BinderException("Could not retrieve required statistics. Alternatively, try by providing the statistics "
|
188
178
|
"explicitly: BITSTRING_AGG(col, min, max) ");
|
189
179
|
}
|
180
|
+
auto bind_agg_data = (BitstringAggBindData *)input.bind_data;
|
181
|
+
bind_agg_data->min = NumericStats::Min(input.child_stats[0]);
|
182
|
+
bind_agg_data->max = NumericStats::Max(input.child_stats[0]);
|
190
183
|
return nullptr;
|
191
184
|
}
|
192
185
|
|
@@ -2,7 +2,6 @@
|
|
2
2
|
#include "duckdb/common/vector_operations/vector_operations.hpp"
|
3
3
|
#include "duckdb/function/aggregate/distributive_functions.hpp"
|
4
4
|
#include "duckdb/planner/expression/bound_aggregate_expression.hpp"
|
5
|
-
#include "duckdb/storage/statistics/validity_statistics.hpp"
|
6
5
|
|
7
6
|
namespace duckdb {
|
8
7
|
|
@@ -100,9 +99,8 @@ AggregateFunction CountStarFun::GetFunction() {
|
|
100
99
|
}
|
101
100
|
|
102
101
|
unique_ptr<BaseStatistics> CountPropagateStats(ClientContext &context, BoundAggregateExpression &expr,
|
103
|
-
|
104
|
-
|
105
|
-
if (!expr.IsDistinct() && child_stats[0] && !child_stats[0]->CanHaveNull()) {
|
102
|
+
AggregateStatisticsInput &input) {
|
103
|
+
if (!expr.IsDistinct() && !input.child_stats[0].CanHaveNull()) {
|
106
104
|
// count on a column without null values: use count star
|
107
105
|
expr.function = CountStarFun::GetFunction();
|
108
106
|
expr.function.name = "count_star";
|
@@ -2,7 +2,6 @@
|
|
2
2
|
#include "duckdb/function/aggregate/sum_helpers.hpp"
|
3
3
|
#include "duckdb/common/exception.hpp"
|
4
4
|
#include "duckdb/common/types/decimal.hpp"
|
5
|
-
#include "duckdb/storage/statistics/numeric_statistics.hpp"
|
6
5
|
#include "duckdb/planner/expression/bound_aggregate_expression.hpp"
|
7
6
|
#include "duckdb/function/aggregate/algebraic_functions.hpp"
|
8
7
|
|
@@ -72,30 +71,29 @@ struct HugeintSumOperation : public BaseSumOperation<SumSetOperation, RegularAdd
|
|
72
71
|
};
|
73
72
|
|
74
73
|
unique_ptr<BaseStatistics> SumPropagateStats(ClientContext &context, BoundAggregateExpression &expr,
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
if (numeric_stats.min.IsNull() || numeric_stats.max.IsNull()) {
|
74
|
+
AggregateStatisticsInput &input) {
|
75
|
+
if (input.node_stats && input.node_stats->has_max_cardinality) {
|
76
|
+
auto &numeric_stats = input.child_stats[0];
|
77
|
+
if (!NumericStats::HasMinMax(numeric_stats)) {
|
80
78
|
return nullptr;
|
81
79
|
}
|
82
|
-
auto internal_type = numeric_stats.
|
80
|
+
auto internal_type = numeric_stats.GetType().InternalType();
|
83
81
|
hugeint_t max_negative;
|
84
82
|
hugeint_t max_positive;
|
85
83
|
switch (internal_type) {
|
86
84
|
case PhysicalType::INT32:
|
87
|
-
max_negative = numeric_stats.
|
88
|
-
max_positive = numeric_stats.
|
85
|
+
max_negative = NumericStats::Min(numeric_stats).GetValueUnsafe<int32_t>();
|
86
|
+
max_positive = NumericStats::Max(numeric_stats).GetValueUnsafe<int32_t>();
|
89
87
|
break;
|
90
88
|
case PhysicalType::INT64:
|
91
|
-
max_negative = numeric_stats.
|
92
|
-
max_positive = numeric_stats.
|
89
|
+
max_negative = NumericStats::Min(numeric_stats).GetValueUnsafe<int64_t>();
|
90
|
+
max_positive = NumericStats::Max(numeric_stats).GetValueUnsafe<int64_t>();
|
93
91
|
break;
|
94
92
|
default:
|
95
93
|
throw InternalException("Unsupported type for propagate sum stats");
|
96
94
|
}
|
97
|
-
auto max_sum_negative = max_negative * hugeint_t(node_stats->max_cardinality);
|
98
|
-
auto max_sum_positive = max_positive * hugeint_t(node_stats->max_cardinality);
|
95
|
+
auto max_sum_negative = max_negative * hugeint_t(input.node_stats->max_cardinality);
|
96
|
+
auto max_sum_positive = max_positive * hugeint_t(input.node_stats->max_cardinality);
|
99
97
|
if (max_sum_positive >= NumericLimits<int64_t>::Maximum() ||
|
100
98
|
max_sum_negative <= NumericLimits<int64_t>::Minimum()) {
|
101
99
|
// sum can potentially exceed int64_t bounds: use hugeint sum
|
@@ -9,7 +9,6 @@
|
|
9
9
|
#include "duckdb/common/vector_operations/ternary_executor.hpp"
|
10
10
|
#include "duckdb/common/vector_operations/vector_operations.hpp"
|
11
11
|
#include "duckdb/common/string_util.hpp"
|
12
|
-
#include "duckdb/storage/statistics/numeric_statistics.hpp"
|
13
12
|
|
14
13
|
namespace duckdb {
|
15
14
|
|