duckdb 0.7.2-dev225.0 → 0.7.2-dev314.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (131)
  1. package/package.json +1 -1
  2. package/src/duckdb/extension/parquet/column_reader.cpp +5 -6
  3. package/src/duckdb/extension/parquet/include/column_reader.hpp +1 -2
  4. package/src/duckdb/extension/parquet/include/generated_column_reader.hpp +1 -11
  5. package/src/duckdb/extension/parquet/parquet_statistics.cpp +26 -32
  6. package/src/duckdb/src/common/sort/sort_state.cpp +5 -7
  7. package/src/duckdb/src/execution/column_binding_resolver.cpp +6 -0
  8. package/src/duckdb/src/execution/operator/aggregate/physical_perfecthash_aggregate.cpp +4 -5
  9. package/src/duckdb/src/execution/operator/aggregate/physical_window.cpp +1 -1
  10. package/src/duckdb/src/execution/operator/helper/physical_vacuum.cpp +2 -3
  11. package/src/duckdb/src/execution/operator/join/physical_blockwise_nl_join.cpp +32 -6
  12. package/src/duckdb/src/execution/physical_plan/plan_aggregate.cpp +15 -15
  13. package/src/duckdb/src/execution/physical_plan/plan_comparison_join.cpp +18 -12
  14. package/src/duckdb/src/function/aggregate/distributive/bitstring_agg.cpp +6 -13
  15. package/src/duckdb/src/function/aggregate/distributive/count.cpp +2 -4
  16. package/src/duckdb/src/function/aggregate/distributive/sum.cpp +11 -13
  17. package/src/duckdb/src/function/scalar/date/date_diff.cpp +0 -1
  18. package/src/duckdb/src/function/scalar/date/date_part.cpp +17 -25
  19. package/src/duckdb/src/function/scalar/date/date_sub.cpp +0 -1
  20. package/src/duckdb/src/function/scalar/date/date_trunc.cpp +10 -14
  21. package/src/duckdb/src/function/scalar/generic/stats.cpp +2 -4
  22. package/src/duckdb/src/function/scalar/list/flatten.cpp +5 -12
  23. package/src/duckdb/src/function/scalar/list/list_concat.cpp +3 -8
  24. package/src/duckdb/src/function/scalar/list/list_extract.cpp +5 -12
  25. package/src/duckdb/src/function/scalar/list/list_value.cpp +5 -9
  26. package/src/duckdb/src/function/scalar/math/numeric.cpp +14 -17
  27. package/src/duckdb/src/function/scalar/operators/arithmetic.cpp +27 -34
  28. package/src/duckdb/src/function/scalar/string/caseconvert.cpp +2 -6
  29. package/src/duckdb/src/function/scalar/string/instr.cpp +2 -6
  30. package/src/duckdb/src/function/scalar/string/length.cpp +2 -6
  31. package/src/duckdb/src/function/scalar/string/like.cpp +2 -6
  32. package/src/duckdb/src/function/scalar/string/substring.cpp +2 -6
  33. package/src/duckdb/src/function/scalar/struct/struct_extract.cpp +4 -9
  34. package/src/duckdb/src/function/scalar/struct/struct_insert.cpp +10 -13
  35. package/src/duckdb/src/function/scalar/struct/struct_pack.cpp +5 -6
  36. package/src/duckdb/src/function/table/version/pragma_version.cpp +2 -2
  37. package/src/duckdb/src/include/duckdb/execution/operator/aggregate/physical_perfecthash_aggregate.hpp +1 -1
  38. package/src/duckdb/src/include/duckdb/function/aggregate_function.hpp +12 -3
  39. package/src/duckdb/src/include/duckdb/function/scalar_function.hpp +2 -2
  40. package/src/duckdb/src/include/duckdb/planner/bind_context.hpp +2 -0
  41. package/src/duckdb/src/include/duckdb/storage/checkpoint/table_data_writer.hpp +3 -2
  42. package/src/duckdb/src/include/duckdb/storage/compression/chimp/chimp_compress.hpp +2 -2
  43. package/src/duckdb/src/include/duckdb/storage/compression/chimp/chimp_fetch.hpp +1 -1
  44. package/src/duckdb/src/include/duckdb/storage/compression/chimp/chimp_scan.hpp +1 -1
  45. package/src/duckdb/src/include/duckdb/storage/compression/patas/patas_compress.hpp +2 -2
  46. package/src/duckdb/src/include/duckdb/storage/compression/patas/patas_fetch.hpp +1 -1
  47. package/src/duckdb/src/include/duckdb/storage/compression/patas/patas_scan.hpp +1 -1
  48. package/src/duckdb/src/include/duckdb/storage/data_pointer.hpp +5 -2
  49. package/src/duckdb/src/include/duckdb/storage/data_table.hpp +1 -1
  50. package/src/duckdb/src/include/duckdb/storage/statistics/base_statistics.hpp +93 -31
  51. package/src/duckdb/src/include/duckdb/storage/statistics/column_statistics.hpp +22 -3
  52. package/src/duckdb/src/include/duckdb/storage/statistics/distinct_statistics.hpp +6 -6
  53. package/src/duckdb/src/include/duckdb/storage/statistics/list_stats.hpp +41 -0
  54. package/src/duckdb/src/include/duckdb/storage/statistics/numeric_stats.hpp +157 -0
  55. package/src/duckdb/src/include/duckdb/storage/statistics/segment_statistics.hpp +2 -7
  56. package/src/duckdb/src/include/duckdb/storage/statistics/string_stats.hpp +74 -0
  57. package/src/duckdb/src/include/duckdb/storage/statistics/struct_stats.hpp +42 -0
  58. package/src/duckdb/src/include/duckdb/storage/string_uncompressed.hpp +2 -3
  59. package/src/duckdb/src/include/duckdb/storage/table/column_segment.hpp +2 -2
  60. package/src/duckdb/src/include/duckdb/storage/table/persistent_table_data.hpp +2 -1
  61. package/src/duckdb/src/include/duckdb/storage/table/row_group.hpp +4 -3
  62. package/src/duckdb/src/include/duckdb/storage/table/row_group_collection.hpp +3 -2
  63. package/src/duckdb/src/include/duckdb/storage/table/table_statistics.hpp +5 -0
  64. package/src/duckdb/src/main/config.cpp +66 -1
  65. package/src/duckdb/src/optimizer/join_order/cardinality_estimator.cpp +0 -1
  66. package/src/duckdb/src/optimizer/statistics/expression/propagate_aggregate.cpp +9 -3
  67. package/src/duckdb/src/optimizer/statistics/expression/propagate_and_compress.cpp +6 -7
  68. package/src/duckdb/src/optimizer/statistics/expression/propagate_cast.cpp +14 -11
  69. package/src/duckdb/src/optimizer/statistics/expression/propagate_columnref.cpp +1 -1
  70. package/src/duckdb/src/optimizer/statistics/expression/propagate_comparison.cpp +13 -15
  71. package/src/duckdb/src/optimizer/statistics/expression/propagate_conjunction.cpp +0 -1
  72. package/src/duckdb/src/optimizer/statistics/expression/propagate_constant.cpp +3 -75
  73. package/src/duckdb/src/optimizer/statistics/expression/propagate_function.cpp +7 -2
  74. package/src/duckdb/src/optimizer/statistics/expression/propagate_operator.cpp +10 -0
  75. package/src/duckdb/src/optimizer/statistics/operator/propagate_aggregate.cpp +2 -3
  76. package/src/duckdb/src/optimizer/statistics/operator/propagate_filter.cpp +28 -31
  77. package/src/duckdb/src/optimizer/statistics/operator/propagate_join.cpp +4 -5
  78. package/src/duckdb/src/optimizer/statistics/operator/propagate_set_operation.cpp +3 -3
  79. package/src/duckdb/src/optimizer/statistics_propagator.cpp +1 -1
  80. package/src/duckdb/src/parser/transform/tableref/transform_join.cpp +4 -0
  81. package/src/duckdb/src/planner/bind_context.cpp +16 -0
  82. package/src/duckdb/src/planner/binder/query_node/plan_select_node.cpp +0 -1
  83. package/src/duckdb/src/planner/binder/tableref/bind_joinref.cpp +9 -0
  84. package/src/duckdb/src/planner/binder.cpp +2 -1
  85. package/src/duckdb/src/planner/bound_result_modifier.cpp +1 -1
  86. package/src/duckdb/src/planner/expression/bound_window_expression.cpp +1 -1
  87. package/src/duckdb/src/planner/filter/constant_filter.cpp +4 -6
  88. package/src/duckdb/src/storage/checkpoint/row_group_writer.cpp +1 -1
  89. package/src/duckdb/src/storage/checkpoint/table_data_reader.cpp +1 -4
  90. package/src/duckdb/src/storage/checkpoint/table_data_writer.cpp +4 -4
  91. package/src/duckdb/src/storage/compression/bitpacking.cpp +3 -3
  92. package/src/duckdb/src/storage/compression/fixed_size_uncompressed.cpp +3 -3
  93. package/src/duckdb/src/storage/compression/numeric_constant.cpp +9 -10
  94. package/src/duckdb/src/storage/compression/patas.cpp +1 -1
  95. package/src/duckdb/src/storage/compression/rle.cpp +2 -2
  96. package/src/duckdb/src/storage/compression/validity_uncompressed.cpp +5 -5
  97. package/src/duckdb/src/storage/data_table.cpp +4 -6
  98. package/src/duckdb/src/storage/statistics/base_statistics.cpp +373 -128
  99. package/src/duckdb/src/storage/statistics/column_statistics.cpp +58 -3
  100. package/src/duckdb/src/storage/statistics/distinct_statistics.cpp +4 -9
  101. package/src/duckdb/src/storage/statistics/list_stats.cpp +117 -0
  102. package/src/duckdb/src/storage/statistics/numeric_stats.cpp +529 -0
  103. package/src/duckdb/src/storage/statistics/segment_statistics.cpp +2 -11
  104. package/src/duckdb/src/storage/statistics/string_stats.cpp +273 -0
  105. package/src/duckdb/src/storage/statistics/struct_stats.cpp +131 -0
  106. package/src/duckdb/src/storage/storage_info.cpp +1 -1
  107. package/src/duckdb/src/storage/table/column_checkpoint_state.cpp +3 -4
  108. package/src/duckdb/src/storage/table/column_data.cpp +16 -14
  109. package/src/duckdb/src/storage/table/column_data_checkpointer.cpp +2 -3
  110. package/src/duckdb/src/storage/table/column_segment.cpp +6 -8
  111. package/src/duckdb/src/storage/table/list_column_data.cpp +7 -11
  112. package/src/duckdb/src/storage/table/row_group.cpp +24 -23
  113. package/src/duckdb/src/storage/table/row_group_collection.cpp +12 -12
  114. package/src/duckdb/src/storage/table/standard_column_data.cpp +6 -6
  115. package/src/duckdb/src/storage/table/struct_column_data.cpp +15 -16
  116. package/src/duckdb/src/storage/table/table_statistics.cpp +27 -7
  117. package/src/duckdb/src/storage/table/update_segment.cpp +10 -12
  118. package/src/duckdb/third_party/libpg_query/include/parser/gram.hpp +923 -919
  119. package/src/duckdb/third_party/libpg_query/include/parser/kwlist.hpp +2 -0
  120. package/src/duckdb/third_party/libpg_query/src_backend_parser_gram.cpp +15684 -15571
  121. package/src/duckdb/ub_src_storage_statistics.cpp +4 -6
  122. package/src/duckdb/src/include/duckdb/storage/statistics/list_statistics.hpp +0 -36
  123. package/src/duckdb/src/include/duckdb/storage/statistics/numeric_statistics.hpp +0 -75
  124. package/src/duckdb/src/include/duckdb/storage/statistics/string_statistics.hpp +0 -49
  125. package/src/duckdb/src/include/duckdb/storage/statistics/struct_statistics.hpp +0 -36
  126. package/src/duckdb/src/include/duckdb/storage/statistics/validity_statistics.hpp +0 -45
  127. package/src/duckdb/src/storage/statistics/list_statistics.cpp +0 -94
  128. package/src/duckdb/src/storage/statistics/numeric_statistics.cpp +0 -307
  129. package/src/duckdb/src/storage/statistics/string_statistics.cpp +0 -220
  130. package/src/duckdb/src/storage/statistics/struct_statistics.cpp +0 -108
  131. package/src/duckdb/src/storage/statistics/validity_statistics.cpp +0 -91
package/package.json CHANGED
@@ -2,7 +2,7 @@
  "name": "duckdb",
  "main": "./lib/duckdb.js",
  "types": "./lib/duckdb.d.ts",
- "version": "0.7.2-dev225.0",
+ "version": "0.7.2-dev314.0",
  "description": "DuckDB node.js API",
  "gypfile": true,
  "dependencies": {
@@ -865,7 +865,7 @@ RowNumberColumnReader::RowNumberColumnReader(ParquetReader &reader, LogicalType

  unique_ptr<BaseStatistics> RowNumberColumnReader::Stats(idx_t row_group_idx_p,
  const std::vector<ColumnChunk> &columns) {
- auto stats = make_unique<NumericStatistics>(type, StatisticsType::LOCAL_STATS);
+ auto stats = NumericStats::CreateUnknown(type);
  auto &row_groups = reader.GetFileMetadata()->row_groups;
  D_ASSERT(row_group_idx_p < row_groups.size());
  idx_t row_group_offset_min = 0;
@@ -873,11 +873,10 @@ unique_ptr<BaseStatistics> RowNumberColumnReader::Stats(idx_t row_group_idx_p,
  row_group_offset_min += row_groups[i].num_rows;
  }

- stats->min = Value::BIGINT(row_group_offset_min);
- stats->max = Value::BIGINT(row_group_offset_min + row_groups[row_group_idx_p].num_rows);
-
- D_ASSERT(!stats->CanHaveNull() && stats->CanHaveNoNull());
- return std::move(stats);
+ NumericStats::SetMin(stats, Value::BIGINT(row_group_offset_min));
+ NumericStats::SetMax(stats, Value::BIGINT(row_group_offset_min + row_groups[row_group_idx_p].num_rows));
+ stats.Set(StatsInfo::CANNOT_HAVE_NULL_VALUES);
+ return stats.ToUnique();
  }

  void RowNumberColumnReader::InitializeRead(idx_t row_group_idx_p, const std::vector<ColumnChunk> &columns,
@@ -18,8 +18,7 @@

  #include "duckdb.hpp"
  #ifndef DUCKDB_AMALGAMATION
- #include "duckdb/storage/statistics/string_statistics.hpp"
- #include "duckdb/storage/statistics/numeric_statistics.hpp"
+
  #include "duckdb/common/types/vector.hpp"
  #include "duckdb/common/types/string_type.hpp"
  #include "duckdb/common/types/chunk_collection.hpp"
@@ -29,17 +29,7 @@ public:
  Vector &result) override;

  unique_ptr<BaseStatistics> Stats(idx_t row_group_idx_p, const std::vector<ColumnChunk> &columns) override {
- switch (type.id()) {
- case LogicalTypeId::VARCHAR: {
- auto string_stats = make_unique<StringStatistics>(type, StatisticsType::LOCAL_STATS);
- string string = constant.ToString();
- string_stats->Update(string);
- string_stats->max_string_length = string.length();
- return std::move(string_stats);
- }
- default:
- return nullptr;
- }
+ return BaseStatistics::FromConstant(constant).ToUnique();
  };

  void InitializeRead(idx_t row_group_idx_p, const std::vector<ColumnChunk> &columns,
@@ -6,8 +6,7 @@
  #ifndef DUCKDB_AMALGAMATION
  #include "duckdb/common/types/blob.hpp"
  #include "duckdb/common/types/value.hpp"
- #include "duckdb/storage/statistics/numeric_statistics.hpp"
- #include "duckdb/storage/statistics/string_statistics.hpp"
+
  #endif

  namespace duckdb {
@@ -18,27 +17,29 @@ using duckdb_parquet::format::Type;
  static unique_ptr<BaseStatistics> CreateNumericStats(const LogicalType &type,
  const duckdb_parquet::format::SchemaElement &schema_ele,
  const duckdb_parquet::format::Statistics &parquet_stats) {
- auto stats = make_unique<NumericStatistics>(type, StatisticsType::LOCAL_STATS);
+ auto stats = NumericStats::CreateUnknown(type);

  // for reasons unknown to science, Parquet defines *both* `min` and `min_value` as well as `max` and
  // `max_value`. All are optional. such elegance.
+ Value min;
+ Value max;
  if (parquet_stats.__isset.min) {
- stats->min = ParquetStatisticsUtils::ConvertValue(type, schema_ele, parquet_stats.min).DefaultCastAs(type);
+ min = ParquetStatisticsUtils::ConvertValue(type, schema_ele, parquet_stats.min).DefaultCastAs(type);
  } else if (parquet_stats.__isset.min_value) {
- stats->min =
- ParquetStatisticsUtils::ConvertValue(type, schema_ele, parquet_stats.min_value).DefaultCastAs(type);
+ min = ParquetStatisticsUtils::ConvertValue(type, schema_ele, parquet_stats.min_value).DefaultCastAs(type);
  } else {
- stats->min = Value(type);
+ min = Value(type);
  }
  if (parquet_stats.__isset.max) {
- stats->max = ParquetStatisticsUtils::ConvertValue(type, schema_ele, parquet_stats.max).DefaultCastAs(type);
+ max = ParquetStatisticsUtils::ConvertValue(type, schema_ele, parquet_stats.max).DefaultCastAs(type);
  } else if (parquet_stats.__isset.max_value) {
- stats->max =
- ParquetStatisticsUtils::ConvertValue(type, schema_ele, parquet_stats.max_value).DefaultCastAs(type);
+ max = ParquetStatisticsUtils::ConvertValue(type, schema_ele, parquet_stats.max_value).DefaultCastAs(type);
  } else {
- stats->max = Value(type);
+ max = Value(type);
  }
- return std::move(stats);
+ NumericStats::SetMin(stats, min);
+ NumericStats::SetMax(stats, max);
+ return stats.ToUnique();
  }

  Value ParquetStatisticsUtils::ConvertValue(const LogicalType &type,
@@ -228,24 +229,24 @@ unique_ptr<BaseStatistics> ParquetStatisticsUtils::TransformColumnStatistics(con
  row_group_stats = CreateNumericStats(type, s_ele, parquet_stats);
  break;
  case LogicalTypeId::VARCHAR: {
- auto string_stats = make_unique<StringStatistics>(type, StatisticsType::LOCAL_STATS);
+ auto string_stats = StringStats::CreateEmpty(type);
  if (parquet_stats.__isset.min) {
- string_stats->Update(parquet_stats.min);
+ StringStats::Update(string_stats, parquet_stats.min);
  } else if (parquet_stats.__isset.min_value) {
- string_stats->Update(parquet_stats.min_value);
+ StringStats::Update(string_stats, parquet_stats.min_value);
  } else {
  return nullptr;
  }
  if (parquet_stats.__isset.max) {
- string_stats->Update(parquet_stats.max);
+ StringStats::Update(string_stats, parquet_stats.max);
  } else if (parquet_stats.__isset.max_value) {
- string_stats->Update(parquet_stats.max_value);
+ StringStats::Update(string_stats, parquet_stats.max_value);
  } else {
  return nullptr;
  }
- string_stats->has_unicode = true; // we dont know better
- string_stats->max_string_length = NumericLimits<uint32_t>::Maximum();
- row_group_stats = std::move(string_stats);
+ StringStats::SetContainsUnicode(string_stats);
+ StringStats::ResetMaxStringLength(string_stats);
+ row_group_stats = string_stats.ToUnique();
  break;
  }
  default:
@@ -254,21 +255,14 @@ unique_ptr<BaseStatistics> ParquetStatisticsUtils::TransformColumnStatistics(con
  } // end of type switch

  // null count is generic
- if (row_group_stats) {
- if (column_chunk.meta_data.type == duckdb_parquet::format::Type::FLOAT ||
- column_chunk.meta_data.type == duckdb_parquet::format::Type::DOUBLE) {
- // floats/doubles can have infinity, which can become NULL
- row_group_stats->validity_stats = make_unique<ValidityStatistics>(true);
- } else if (parquet_stats.__isset.null_count) {
- row_group_stats->validity_stats = make_unique<ValidityStatistics>(parquet_stats.null_count != 0);
- } else {
- row_group_stats->validity_stats = make_unique<ValidityStatistics>(true);
- }
- } else {
+ if (!row_group_stats) {
  // if stats are missing from any row group we know squat
  return nullptr;
  }
-
+ row_group_stats->Set(StatsInfo::CAN_HAVE_NULL_AND_VALID_VALUES);
+ if (parquet_stats.__isset.null_count && parquet_stats.null_count == 0) {
+ row_group_stats->Set(StatsInfo::CANNOT_HAVE_NULL_VALUES);
+ }
  return row_group_stats;
  }
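Every hunk in this file follows the same migration: the old NumericStatistics/StringStatistics/ValidityStatistics subclasses are folded into a single BaseStatistics value that is manipulated through the NumericStats and StringStats helpers (see the new numeric_stats.hpp and string_stats.hpp headers in the file list). A minimal construction sketch of that pattern, using only calls that appear in the hunks above; the helper name MakeRowRangeStats and the exact includes are illustrative assumptions, not part of this diff:

// Sketch: build numeric statistics for a BIGINT-typed column the new way.
// MakeRowRangeStats is a hypothetical helper used only for illustration.
#include "duckdb/storage/statistics/base_statistics.hpp"
#include "duckdb/storage/statistics/numeric_stats.hpp"

namespace duckdb {

unique_ptr<BaseStatistics> MakeRowRangeStats(const LogicalType &type, int64_t min_row, int64_t max_row) {
	auto stats = NumericStats::CreateUnknown(type);      // start from "unknown" stats for the type
	NumericStats::SetMin(stats, Value::BIGINT(min_row)); // narrow to the known range
	NumericStats::SetMax(stats, Value::BIGINT(max_row));
	stats.Set(StatsInfo::CANNOT_HAVE_NULL_VALUES);       // null-ness now lives on BaseStatistics itself
	return stats.ToUnique();                             // hand back an owned heap copy
}

} // namespace duckdb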
@@ -3,7 +3,6 @@
  #include "duckdb/common/row_operations/row_operations.hpp"
  #include "duckdb/common/sort/sort.hpp"
  #include "duckdb/common/sort/sorted_block.hpp"
- #include "duckdb/storage/statistics/string_statistics.hpp"

  #include <algorithm>
  #include <numeric>
@@ -66,9 +65,8 @@ SortLayout::SortLayout(const vector<BoundOrderByNode> &orders)
  prefix_lengths.back() = GetNestedSortingColSize(col_size, expr.return_type);
  } else if (physical_type == PhysicalType::VARCHAR) {
  idx_t size_before = col_size;
- if (stats.back()) {
- auto &str_stats = (StringStatistics &)*stats.back();
- col_size += str_stats.max_string_length;
+ if (stats.back() && StringStats::HasMaxStringLength(*stats.back())) {
+ col_size += StringStats::MaxStringLength(*stats.back());
  if (col_size > 12) {
  col_size = 12;
  } else {
@@ -95,9 +93,9 @@ SortLayout::SortLayout(const vector<BoundOrderByNode> &orders)
  if (bytes_to_fill == 0) {
  break;
  }
- if (logical_types[col_idx].InternalType() == PhysicalType::VARCHAR && stats[col_idx]) {
- auto &str_stats = (StringStatistics &)*stats[col_idx];
- idx_t diff = str_stats.max_string_length - prefix_lengths[col_idx];
+ if (logical_types[col_idx].InternalType() == PhysicalType::VARCHAR && stats[col_idx] &&
+ StringStats::HasMaxStringLength(*stats[col_idx])) {
+ idx_t diff = StringStats::MaxStringLength(*stats[col_idx]) - prefix_lengths[col_idx];
  if (diff > 0) {
  // Increase all sizes accordingly
  idx_t increase = MinValue(bytes_to_fill, diff);
@@ -1,6 +1,7 @@
  #include "duckdb/execution/column_binding_resolver.hpp"

  #include "duckdb/planner/operator/logical_comparison_join.hpp"
+ #include "duckdb/planner/operator/logical_any_join.hpp"
  #include "duckdb/planner/operator/logical_create_index.hpp"
  #include "duckdb/planner/operator/logical_delim_join.hpp"
  #include "duckdb/planner/operator/logical_insert.hpp"
@@ -46,6 +47,11 @@ void ColumnBindingResolver::VisitOperator(LogicalOperator &op) {
  // this operator
  VisitOperatorChildren(op);
  bindings = op.GetColumnBindings();
+ auto &any_join = (LogicalAnyJoin &)op;
+ if (any_join.join_type == JoinType::SEMI || any_join.join_type == JoinType::ANTI) {
+ auto right_bindings = op.children[1]->GetColumnBindings();
+ bindings.insert(bindings.end(), right_bindings.begin(), right_bindings.end());
+ }
  VisitOperatorExpressions(op);
  return;
  } else if (op.type == LogicalOperatorType::LOGICAL_CREATE_INDEX) {
@@ -4,14 +4,13 @@
  #include "duckdb/planner/expression/bound_aggregate_expression.hpp"
  #include "duckdb/planner/expression/bound_reference_expression.hpp"
  #include "duckdb/storage/buffer_manager.hpp"
- #include "duckdb/storage/statistics/numeric_statistics.hpp"

  namespace duckdb {

  PhysicalPerfectHashAggregate::PhysicalPerfectHashAggregate(ClientContext &context, vector<LogicalType> types_p,
  vector<unique_ptr<Expression>> aggregates_p,
  vector<unique_ptr<Expression>> groups_p,
- vector<unique_ptr<BaseStatistics>> group_stats,
+ const vector<unique_ptr<BaseStatistics>> &group_stats,
  vector<idx_t> required_bits_p, idx_t estimated_cardinality)
  : PhysicalOperator(PhysicalOperatorType::PERFECT_HASH_GROUP_BY, std::move(types_p), estimated_cardinality),
  groups(std::move(groups_p)), aggregates(std::move(aggregates_p)), required_bits(std::move(required_bits_p)) {
@@ -19,9 +18,9 @@ PhysicalPerfectHashAggregate::PhysicalPerfectHashAggregate(ClientContext &contex
  group_minima.reserve(group_stats.size());
  for (auto &stats : group_stats) {
  D_ASSERT(stats);
- auto &nstats = (NumericStatistics &)*stats;
- D_ASSERT(!nstats.min.IsNull());
- group_minima.push_back(std::move(nstats.min));
+ auto &nstats = *stats;
+ D_ASSERT(NumericStats::HasMin(nstats));
+ group_minima.push_back(NumericStats::Min(nstats));
  }
  for (auto &expr : groups) {
  group_types.push_back(expr->return_type);
@@ -111,7 +111,7 @@ public:
  orders.emplace_back(OrderType::ASCENDING, OrderByNullType::NULLS_FIRST, pexpr->Copy(), nullptr);
  } else {
  orders.emplace_back(OrderType::ASCENDING, OrderByNullType::NULLS_FIRST, pexpr->Copy(),
- wexpr->partitions_stats[prt_idx]->Copy());
+ wexpr->partitions_stats[prt_idx]->ToUnique());
  }
  partitions.emplace_back(orders.back().Copy());
  }
@@ -72,9 +72,8 @@ SinkFinalizeType PhysicalVacuum::Finalize(Pipeline &pipeline, Event &event, Clie

  auto table = info->table;
  for (idx_t col_idx = 0; col_idx < sink.column_distinct_stats.size(); col_idx++) {
- table->GetStorage().SetStatistics(info->column_id_map.at(col_idx), [&](BaseStatistics &stats) {
- stats.distinct_stats = std::move(sink.column_distinct_stats[col_idx]);
- });
+ table->GetStorage().SetDistinct(info->column_id_map.at(col_idx),
+ std::move(sink.column_distinct_stats[col_idx]));
  }

  return SinkFinalizeType::READY;
@@ -87,11 +87,23 @@ public:
  OuterJoinMarker left_outer;
  SelectionVector match_sel;
  ExpressionExecutor executor;
+ DataChunk intermediate_chunk;
  };

  unique_ptr<OperatorState> PhysicalBlockwiseNLJoin::GetOperatorState(ExecutionContext &context) const {
  auto &gstate = (BlockwiseNLJoinGlobalState &)*sink_state;
- return make_unique<BlockwiseNLJoinState>(context, gstate.right_chunks, *this);
+ auto result = make_unique<BlockwiseNLJoinState>(context, gstate.right_chunks, *this);
+ if (join_type == JoinType::SEMI || join_type == JoinType::ANTI) {
+ vector<LogicalType> intermediate_types;
+ for (auto &type : children[0]->types) {
+ intermediate_types.emplace_back(type);
+ }
+ for (auto &type : children[1]->types) {
+ intermediate_types.emplace_back(type);
+ }
+ result->intermediate_chunk.Initialize(Allocator::DefaultAllocator(), intermediate_types);
+ }
+ return std::move(result);
  }

  OperatorResultType PhysicalBlockwiseNLJoin::ExecuteInternal(ExecutionContext &context, DataChunk &input,
@@ -111,24 +123,30 @@ OperatorResultType PhysicalBlockwiseNLJoin::ExecuteInternal(ExecutionContext &co
  }
  }

+ DataChunk *intermediate_chunk = &chunk;
+ if (join_type == JoinType::SEMI || join_type == JoinType::ANTI) {
+ intermediate_chunk = &state.intermediate_chunk;
+ intermediate_chunk->Reset();
+ }
+
  // now perform the actual join
  // we perform a cross product, then execute the expression directly on the cross product' result
  idx_t result_count = 0;
  do {
- auto result = state.cross_product.Execute(input, chunk);
+ auto result = state.cross_product.Execute(input, *intermediate_chunk);
  if (result == OperatorResultType::NEED_MORE_INPUT) {
  // exhausted input, have to pull new LHS chunk
  if (state.left_outer.Enabled()) {
  // left join: before we move to the next chunk, see if we need to output any vectors that didn't
  // have a match found
- state.left_outer.ConstructLeftJoinResult(input, chunk);
+ state.left_outer.ConstructLeftJoinResult(input, *intermediate_chunk);
  state.left_outer.Reset();
  }
  return OperatorResultType::NEED_MORE_INPUT;
  }

  // now perform the computation
- result_count = state.executor.SelectExpression(chunk, state.match_sel);
+ result_count = state.executor.SelectExpression(*intermediate_chunk, state.match_sel);
  if (result_count > 0) {
  // found a match!
  // check if the cross product is scanning the LHS or the RHS in its entirety
@@ -143,12 +161,20 @@ OperatorResultType PhysicalBlockwiseNLJoin::ExecuteInternal(ExecutionContext &co
  // set the match flags in the RHS
  gstate.right_outer.SetMatches(state.match_sel, result_count, state.cross_product.ScanPosition());
  }
- chunk.Slice(state.match_sel, result_count);
+ intermediate_chunk->Slice(state.match_sel, result_count);
  } else {
  // no result: reset the chunk
- chunk.Reset();
+ intermediate_chunk->Reset();
  }
  } while (result_count == 0);
+
+ if (join_type == JoinType::SEMI || join_type == JoinType::ANTI) {
+ for (idx_t col_idx = 0; col_idx < chunk.ColumnCount(); col_idx++) {
+ chunk.data[col_idx].Reference(intermediate_chunk->data[col_idx]);
+ }
+ chunk.SetCardinality(*intermediate_chunk);
+ }
+
  return OperatorResultType::HAVE_MORE_OUTPUT;
  }

@@ -9,7 +9,7 @@
  #include "duckdb/parser/expression/comparison_expression.hpp"
  #include "duckdb/planner/expression/bound_aggregate_expression.hpp"
  #include "duckdb/planner/operator/logical_aggregate.hpp"
- #include "duckdb/storage/statistics/numeric_statistics.hpp"
+
  namespace duckdb {

  static uint32_t RequiredBitsForValue(uint32_t n) {
@@ -50,23 +50,20 @@ static bool CanUsePerfectHashAggregate(ClientContext &context, LogicalAggregate
  // for small types we can just set the stats to [type_min, type_max]
  switch (group_type.InternalType()) {
  case PhysicalType::INT8:
- stats = make_unique<NumericStatistics>(group_type, Value::MinimumValue(group_type),
- Value::MaximumValue(group_type), StatisticsType::LOCAL_STATS);
- break;
  case PhysicalType::INT16:
- stats = make_unique<NumericStatistics>(group_type, Value::MinimumValue(group_type),
- Value::MaximumValue(group_type), StatisticsType::LOCAL_STATS);
  break;
  default:
  // type is too large and there are no stats: skip perfect hashing
  return false;
  }
- // we had no stats before, so we have no clue if there are null values or not
- stats->validity_stats = make_unique<ValidityStatistics>(true);
+ // construct stats with the min and max value of the type
+ stats = NumericStats::CreateUnknown(group_type).ToUnique();
+ NumericStats::SetMin(*stats, Value::MinimumValue(group_type));
+ NumericStats::SetMax(*stats, Value::MaximumValue(group_type));
  }
- auto &nstats = (NumericStatistics &)*stats;
+ auto &nstats = *stats;

- if (nstats.min.IsNull() || nstats.max.IsNull()) {
+ if (!NumericStats::HasMinMax(nstats)) {
  return false;
  }
  // we have a min and a max value for the stats: use that to figure out how many bits we have
@@ -75,17 +72,20 @@ static bool CanUsePerfectHashAggregate(ClientContext &context, LogicalAggregate
  int64_t range;
  switch (group_type.InternalType()) {
  case PhysicalType::INT8:
- range = int64_t(nstats.max.GetValueUnsafe<int8_t>()) - int64_t(nstats.min.GetValueUnsafe<int8_t>());
+ range = int64_t(NumericStats::GetMaxUnsafe<int8_t>(nstats)) -
+ int64_t(NumericStats::GetMinUnsafe<int8_t>(nstats));
  break;
  case PhysicalType::INT16:
- range = int64_t(nstats.max.GetValueUnsafe<int16_t>()) - int64_t(nstats.min.GetValueUnsafe<int16_t>());
+ range = int64_t(NumericStats::GetMaxUnsafe<int16_t>(nstats)) -
+ int64_t(NumericStats::GetMinUnsafe<int16_t>(nstats));
  break;
  case PhysicalType::INT32:
- range = int64_t(nstats.max.GetValueUnsafe<int32_t>()) - int64_t(nstats.min.GetValueUnsafe<int32_t>());
+ range = int64_t(NumericStats::GetMaxUnsafe<int32_t>(nstats)) -
+ int64_t(NumericStats::GetMinUnsafe<int32_t>(nstats));
  break;
  case PhysicalType::INT64:
- if (!TrySubtractOperator::Operation(nstats.max.GetValueUnsafe<int64_t>(),
- nstats.min.GetValueUnsafe<int64_t>(), range)) {
+ if (!TrySubtractOperator::Operation(NumericStats::GetMaxUnsafe<int64_t>(nstats),
+ NumericStats::GetMinUnsafe<int64_t>(nstats), range)) {
  return false;
  }
  break;
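The consuming side of the same refactor is visible in the hunk above: callers first check NumericStats::HasMinMax and then read the bounds either as Values (NumericStats::Min/Max) or through the typed unsafe getters. A small check-then-read sketch built only from those calls; GroupRange is a hypothetical helper and the includes are assumed:

// Sketch of the check-then-read pattern from CanUsePerfectHashAggregate above,
// specialized to an INT32 group column. GroupRange is hypothetical, for illustration only.
#include "duckdb/storage/statistics/base_statistics.hpp"
#include "duckdb/storage/statistics/numeric_stats.hpp"

namespace duckdb {

static bool GroupRange(const BaseStatistics &stats, int64_t &range) {
	if (!NumericStats::HasMinMax(stats)) {
		// no usable bounds: caller falls back to the regular hash aggregate
		return false;
	}
	// the typed getters bypass the Value wrapper once the physical type is known
	range = int64_t(NumericStats::GetMaxUnsafe<int32_t>(stats)) -
	        int64_t(NumericStats::GetMinUnsafe<int32_t>(stats));
	return true;
}

} // namespace duckdb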
@@ -10,7 +10,7 @@
  #include "duckdb/function/table/table_scan.hpp"
  #include "duckdb/main/client_context.hpp"
  #include "duckdb/planner/operator/logical_comparison_join.hpp"
- #include "duckdb/storage/statistics/numeric_statistics.hpp"
+
  #include "duckdb/transaction/duck_transaction.hpp"
  #include "duckdb/common/operator/subtract.hpp"
  #include "duckdb/execution/operator/join/physical_blockwise_nl_join.hpp"
@@ -92,19 +92,21 @@ void CheckForPerfectJoinOpt(LogicalComparisonJoin &op, PerfectHashJoinStats &joi
  }
  // with integral internal types
  for (auto &&join_stat : op.join_stats) {
- if (!TypeIsInteger(join_stat->type.InternalType()) || join_stat->type.InternalType() == PhysicalType::INT128) {
+ if (!TypeIsInteger(join_stat->GetType().InternalType()) ||
+ join_stat->GetType().InternalType() == PhysicalType::INT128) {
  // perfect join not possible for non-integral types or hugeint
  return;
  }
  }

  // and when the build range is smaller than the threshold
- auto stats_build = reinterpret_cast<NumericStatistics *>(op.join_stats[0].get()); // lhs stats
- if (stats_build->min.IsNull() || stats_build->max.IsNull()) {
+ auto &stats_build = *op.join_stats[0].get(); // lhs stats
+ if (!NumericStats::HasMinMax(stats_build)) {
  return;
  }
  int64_t min_value, max_value;
- if (!ExtractNumericValue(stats_build->min, min_value) || !ExtractNumericValue(stats_build->max, max_value)) {
+ if (!ExtractNumericValue(NumericStats::Min(stats_build), min_value) ||
+ !ExtractNumericValue(NumericStats::Max(stats_build), max_value)) {
  return;
  }
  int64_t build_range;
@@ -113,20 +115,24 @@ void CheckForPerfectJoinOpt(LogicalComparisonJoin &op, PerfectHashJoinStats &joi
  }

  // Fill join_stats for invisible join
- auto stats_probe = reinterpret_cast<NumericStatistics *>(op.join_stats[1].get()); // rhs stats
+ auto &stats_probe = *op.join_stats[1].get(); // rhs stats
+ if (!NumericStats::HasMinMax(stats_probe)) {
+ return;
+ }

  // The max size our build must have to run the perfect HJ
  const idx_t MAX_BUILD_SIZE = 1000000;
- join_state.probe_min = stats_probe->min;
- join_state.probe_max = stats_probe->max;
- join_state.build_min = stats_build->min;
- join_state.build_max = stats_build->max;
+ join_state.probe_min = NumericStats::Min(stats_probe);
+ join_state.probe_max = NumericStats::Max(stats_probe);
+ join_state.build_min = NumericStats::Min(stats_build);
+ join_state.build_max = NumericStats::Max(stats_build);
  join_state.estimated_cardinality = op.estimated_cardinality;
  join_state.build_range = build_range;
- if (join_state.build_range > MAX_BUILD_SIZE || stats_probe->max.IsNull() || stats_probe->min.IsNull()) {
+ if (join_state.build_range > MAX_BUILD_SIZE) {
  return;
  }
- if (stats_build->min <= stats_probe->min && stats_probe->max <= stats_build->max) {
+ if (NumericStats::Min(stats_build) <= NumericStats::Min(stats_probe) &&
+ NumericStats::Max(stats_probe) <= NumericStats::Max(stats_build)) {
  join_state.is_probe_in_domain = true;
  }
  join_state.is_build_small = true;
@@ -3,7 +3,7 @@
  #include "duckdb/common/types/null_value.hpp"
  #include "duckdb/common/vector_operations/aggregate_executor.hpp"
  #include "duckdb/common/types/bit.hpp"
- #include "duckdb/storage/statistics/numeric_statistics.hpp"
+ #include "duckdb/storage/statistics/base_statistics.hpp"
  #include "duckdb/execution/expression_executor.hpp"
  #include "duckdb/common/types/cast_helpers.hpp"

@@ -171,22 +171,15 @@ idx_t BitStringAggOperation::GetRange(hugeint_t min, hugeint_t max) {
  }

  unique_ptr<BaseStatistics> BitstringPropagateStats(ClientContext &context, BoundAggregateExpression &expr,
- FunctionData *bind_data,
- vector<unique_ptr<BaseStatistics>> &child_stats,
- NodeStatistics *node_stats) {
+ AggregateStatisticsInput &input) {

- if (child_stats[0]) {
- auto &numeric_stats = (NumericStatistics &)*child_stats[0];
- if (numeric_stats.min.IsNull() || numeric_stats.max.IsNull()) {
- return nullptr;
- }
- auto bind_agg_data = (BitstringAggBindData *)bind_data;
- bind_agg_data->min = numeric_stats.min;
- bind_agg_data->max = numeric_stats.max;
- } else {
+ if (!NumericStats::HasMinMax(input.child_stats[0])) {
  throw BinderException("Could not retrieve required statistics. Alternatively, try by providing the statistics "
  "explicitly: BITSTRING_AGG(col, min, max) ");
  }
+ auto bind_agg_data = (BitstringAggBindData *)input.bind_data;
+ bind_agg_data->min = NumericStats::Min(input.child_stats[0]);
+ bind_agg_data->max = NumericStats::Max(input.child_stats[0]);
  return nullptr;
  }
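The aggregate statistics callbacks in this and the following hunks also change shape: the separate bind_data, child_stats and node_stats parameters are bundled into a single AggregateStatisticsInput (the +12/-3 change to include/duckdb/function/aggregate_function.hpp in the file list). A sketch of a propagate callback against the new signature, using only the members visible in these hunks; the function name MyPropagateStats and the exact includes are assumptions for illustration:

// Hypothetical stats-propagation callback using the bundled AggregateStatisticsInput.
// Assumes the struct is declared in aggregate_function.hpp, as the file list suggests.
#include "duckdb/function/aggregate_function.hpp"
#include "duckdb/planner/expression/bound_aggregate_expression.hpp"
#include "duckdb/storage/statistics/numeric_stats.hpp"

namespace duckdb {

static unique_ptr<BaseStatistics> MyPropagateStats(ClientContext &context, BoundAggregateExpression &expr,
                                                   AggregateStatisticsInput &input) {
	// child_stats entries are now plain BaseStatistics values, not possibly-null unique_ptrs
	if (!NumericStats::HasMinMax(input.child_stats[0])) {
		return nullptr;
	}
	// node_stats is still optional and must be checked before use
	if (input.node_stats && input.node_stats->has_max_cardinality) {
		// e.g. bound the aggregate's output range from min/max and max_cardinality
	}
	// bind_data is reached through the input bundle as well, e.g. (MyBindData *)input.bind_data
	return nullptr;
}

} // namespace duckdb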
@@ -2,7 +2,6 @@
  #include "duckdb/common/vector_operations/vector_operations.hpp"
  #include "duckdb/function/aggregate/distributive_functions.hpp"
  #include "duckdb/planner/expression/bound_aggregate_expression.hpp"
- #include "duckdb/storage/statistics/validity_statistics.hpp"

  namespace duckdb {

@@ -100,9 +99,8 @@ AggregateFunction CountStarFun::GetFunction() {
  }

  unique_ptr<BaseStatistics> CountPropagateStats(ClientContext &context, BoundAggregateExpression &expr,
- FunctionData *bind_data, vector<unique_ptr<BaseStatistics>> &child_stats,
- NodeStatistics *node_stats) {
- if (!expr.IsDistinct() && child_stats[0] && !child_stats[0]->CanHaveNull()) {
+ AggregateStatisticsInput &input) {
+ if (!expr.IsDistinct() && !input.child_stats[0].CanHaveNull()) {
  // count on a column without null values: use count star
  expr.function = CountStarFun::GetFunction();
  expr.function.name = "count_star";
@@ -2,7 +2,6 @@
  #include "duckdb/function/aggregate/sum_helpers.hpp"
  #include "duckdb/common/exception.hpp"
  #include "duckdb/common/types/decimal.hpp"
- #include "duckdb/storage/statistics/numeric_statistics.hpp"
  #include "duckdb/planner/expression/bound_aggregate_expression.hpp"
  #include "duckdb/function/aggregate/algebraic_functions.hpp"

@@ -72,30 +71,29 @@ struct HugeintSumOperation : public BaseSumOperation<SumSetOperation, RegularAdd
  };

  unique_ptr<BaseStatistics> SumPropagateStats(ClientContext &context, BoundAggregateExpression &expr,
- FunctionData *bind_data, vector<unique_ptr<BaseStatistics>> &child_stats,
- NodeStatistics *node_stats) {
- if (child_stats[0] && node_stats && node_stats->has_max_cardinality) {
- auto &numeric_stats = (NumericStatistics &)*child_stats[0];
- if (numeric_stats.min.IsNull() || numeric_stats.max.IsNull()) {
+ AggregateStatisticsInput &input) {
+ if (input.node_stats && input.node_stats->has_max_cardinality) {
+ auto &numeric_stats = input.child_stats[0];
+ if (!NumericStats::HasMinMax(numeric_stats)) {
  return nullptr;
  }
- auto internal_type = numeric_stats.min.type().InternalType();
+ auto internal_type = numeric_stats.GetType().InternalType();
  hugeint_t max_negative;
  hugeint_t max_positive;
  switch (internal_type) {
  case PhysicalType::INT32:
- max_negative = numeric_stats.min.GetValueUnsafe<int32_t>();
- max_positive = numeric_stats.max.GetValueUnsafe<int32_t>();
+ max_negative = NumericStats::Min(numeric_stats).GetValueUnsafe<int32_t>();
+ max_positive = NumericStats::Max(numeric_stats).GetValueUnsafe<int32_t>();
  break;
  case PhysicalType::INT64:
- max_negative = numeric_stats.min.GetValueUnsafe<int64_t>();
- max_positive = numeric_stats.max.GetValueUnsafe<int64_t>();
+ max_negative = NumericStats::Min(numeric_stats).GetValueUnsafe<int64_t>();
+ max_positive = NumericStats::Max(numeric_stats).GetValueUnsafe<int64_t>();
  break;
  default:
  throw InternalException("Unsupported type for propagate sum stats");
  }
- auto max_sum_negative = max_negative * hugeint_t(node_stats->max_cardinality);
- auto max_sum_positive = max_positive * hugeint_t(node_stats->max_cardinality);
+ auto max_sum_negative = max_negative * hugeint_t(input.node_stats->max_cardinality);
+ auto max_sum_positive = max_positive * hugeint_t(input.node_stats->max_cardinality);
  if (max_sum_positive >= NumericLimits<int64_t>::Maximum() ||
  max_sum_negative <= NumericLimits<int64_t>::Minimum()) {
  // sum can potentially exceed int64_t bounds: use hugeint sum
@@ -9,7 +9,6 @@
  #include "duckdb/common/vector_operations/ternary_executor.hpp"
  #include "duckdb/common/vector_operations/vector_operations.hpp"
  #include "duckdb/common/string_util.hpp"
- #include "duckdb/storage/statistics/numeric_statistics.hpp"

  namespace duckdb {